2 *******************************************************************************
3 * Copyright (C) 2012-2013, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
7 package com.ibm.icu.text;
9 import static com.ibm.icu.impl.CharacterIteration.DONE32;
10 import static com.ibm.icu.impl.CharacterIteration.current32;
11 import static com.ibm.icu.impl.CharacterIteration.next32;
13 import java.io.IOException;
14 import java.text.CharacterIterator;
15 import java.util.Stack;
17 import com.ibm.icu.impl.Assert;
19 class CjkBreakEngine implements LanguageBreakEngine {
20 private static final UnicodeSet fHangulWordSet = new UnicodeSet();
21 private static final UnicodeSet fHanWordSet = new UnicodeSet();
22 private static final UnicodeSet fKatakanaWordSet = new UnicodeSet();
23 private static final UnicodeSet fHiraganaWordSet = new UnicodeSet();
25 fHangulWordSet.applyPattern("[\\uac00-\\ud7a3]");
26 fHanWordSet.applyPattern("[:Han:]");
27 fKatakanaWordSet.applyPattern("[[:Katakana:]\\uff9e\\uff9f]");
28 fHiraganaWordSet.applyPattern("[:Hiragana:]");
31 fHangulWordSet.freeze();
33 fKatakanaWordSet.freeze();
34 fHiraganaWordSet.freeze();
37 private final UnicodeSet fWordSet;
38 private DictionaryMatcher fDictionary = null;
40 public CjkBreakEngine(boolean korean) throws IOException {
41 fDictionary = DictionaryData.loadDictionaryFor("Hira");
43 fWordSet = fHangulWordSet;
45 fWordSet = new UnicodeSet();
46 fWordSet.addAll(fHanWordSet);
47 fWordSet.addAll(fKatakanaWordSet);
48 fWordSet.addAll(fHiraganaWordSet);
49 fWordSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
50 fWordSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
54 public boolean handles(int c, int breakType) {
55 return (breakType == BreakIterator.KIND_WORD) &&
56 (fWordSet.contains(c));
59 private static final int kMaxKatakanaLength = 8;
60 private static final int kMaxKatakanaGroupLength = 20;
61 private static final int maxSnlp = 255;
62 private static final int kint32max = Integer.MAX_VALUE;
63 private static int getKatakanaCost(int wordlength) {
64 int katakanaCost[] = new int[] { 8192, 984, 408, 240, 204, 252, 300, 372, 480 };
65 return (wordlength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordlength];
68 private static boolean isKatakana(int value) {
69 return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
70 (value >= 0xFF66 && value <= 0xFF9F);
73 public int findBreaks(CharacterIterator inText, int startPos, int endPos,
74 boolean reverse, int breakType, Stack<Integer> foundBreaks) {
75 if (startPos >= endPos) {
79 inText.setIndex(startPos);
81 int inputLength = endPos - startPos;
82 int[] charPositions = new int[inputLength + 1];
83 StringBuffer s = new StringBuffer("");
84 inText.setIndex(startPos);
85 while (inText.getIndex() < endPos) {
86 s.append(inText.current());
89 String prenormstr = s.toString();
90 boolean isNormalized = Normalizer.quickCheck(prenormstr, Normalizer.NFKC) == Normalizer.YES ||
91 Normalizer.isNormalized(prenormstr, Normalizer.NFKC, 0);
92 CharacterIterator text = inText;
97 while (index < prenormstr.length()) {
98 int codepoint = prenormstr.codePointAt(index);
99 index += Character.charCount(codepoint);
101 charPositions[numChars] = index;
104 String normStr = Normalizer.normalize(prenormstr, Normalizer.NFKC);
105 text = new java.text.StringCharacterIterator(normStr);
106 charPositions = new int[normStr.length() + 1];
107 Normalizer normalizer = new Normalizer(prenormstr, Normalizer.NFKC, 0);
109 charPositions[0] = 0;
110 while (index < normalizer.endIndex()) {
113 index = normalizer.getIndex();
114 charPositions[numChars] = index;
118 // From here on out, do the algorithm. Note that our indices
119 // refer to indices within the normalized string.
120 int[] bestSnlp = new int[numChars + 1];
122 for (int i = 1; i <= numChars; i++) {
123 bestSnlp[i] = kint32max;
126 int[] prev = new int[numChars + 1];
127 for (int i = 0; i <= numChars; i++) {
131 final int maxWordSize = 20;
132 int values[] = new int[numChars];
133 int lengths[] = new int[numChars];
134 // dynamic programming to find the best segmentation
135 boolean is_prev_katakana = false;
136 for (int i = 0; i < numChars; i++) {
138 if (bestSnlp[i] == kint32max) {
142 int maxSearchLength = (i + maxWordSize < numChars) ? maxWordSize : (numChars - i);
143 int[] count_ = new int[1];
144 fDictionary.matches(text, maxSearchLength, lengths, count_, maxSearchLength, values);
145 int count = count_[0];
147 // if there are no single character matches found in the dictionary
148 // starting with this character, treat character as a 1-character word
149 // with the highest value possible (i.e. the least likely to occur).
150 // Exclude Korean characters from this treatment, as they should be
151 // left together by default.
152 if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) {
153 values[count] = maxSnlp;
158 for (int j = 0; j < count; j++) {
159 int newSnlp = bestSnlp[i] + values[j];
160 if (newSnlp < bestSnlp[lengths[j] + i]) {
161 bestSnlp[lengths[j] + i] = newSnlp;
162 prev[lengths[j] + i] = i;
166 // In Japanese, single-character Katakana words are pretty rare.
167 // So we apply the following heuristic to Katakana: any continuous
168 // run of Katakana characters is considered a candidate word with
169 // a default cost specified in the katakanaCost table according
172 boolean is_katakana = isKatakana(current32(text));
173 if (!is_prev_katakana && is_katakana) {
176 while (j < numChars && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) {
181 if ((j - i) < kMaxKatakanaGroupLength) {
182 int newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
183 if (newSnlp < bestSnlp[j]) {
184 bestSnlp[j] = newSnlp;
189 is_prev_katakana = is_katakana;
192 int t_boundary[] = new int[numChars + 1];
194 if (bestSnlp[numChars] == kint32max) {
195 t_boundary[numBreaks] = numChars;
198 for (int i = numChars; i > 0; i = prev[i]) {
199 t_boundary[numBreaks] = i;
202 Assert.assrt(prev[t_boundary[numBreaks - 1]] == 0);
205 if (foundBreaks.size() == 0 || foundBreaks.peek() < startPos) {
206 t_boundary[numBreaks++] = 0;
209 for (int i = numBreaks - 1; i >= 0; i--) {
210 int pos = charPositions[t_boundary[i]] + startPos;
211 if (!(foundBreaks.contains(pos) || pos == startPos))
212 foundBreaks.push(charPositions[t_boundary[i]] + startPos);
215 if (!foundBreaks.empty() && foundBreaks.peek() == endPos)
217 if (!foundBreaks.empty())
218 inText.setIndex(foundBreaks.peek());