2 *******************************************************************************
\r
3 * Copyright (C) 1996-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
8 package com.ibm.icu.text;
\r
10 import java.io.IOException;
\r
11 import java.io.InputStream;
\r
12 import java.text.CharacterIterator;
\r
13 import java.util.Stack;
\r
14 import java.util.Vector;
\r
16 import com.ibm.icu.impl.Assert;
\r
20 * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
\r
21 * to further subdivide ranges of text beyond what is possible using just the
\r
22 * state-table-based algorithm. This is necessary, for example, to handle
\r
23 * word and line breaking in Thai, which doesn't use spaces between words. The
\r
24 * state-table-based algorithm used by RuleBasedBreakIterator_Old is used to divide
\r
25 * up text as far as possible, and then contiguous ranges of letters are
\r
26 * repeatedly compared against a list of known words (i.e., the dictionary)
\r
27 * to divide them up into words.
\r
29 * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator_Old,
\r
30 * but adds one more special substitution name: _dictionary_. This substitution
\r
31 * name is used to identify characters in words in the dictionary. The idea is that
\r
32 * if the iterator passes over a chunk of text that includes two or more characters
\r
33 * in a row that are included in _dictionary_, it goes back through that range and
\r
34 * derives additional break positions (if possible) using the dictionary.
\r
36 * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
\r
37 * file. It uses Class.getResource() to locate the dictionary file. The
\r
38 * dictionary file is in a serialized binary format. We have a very primitive (and
\r
39 * slow) BuildDictionaryFile utility for creating dictionary files, but aren't
\r
40 * currently making it public. Contact us for help.
\r
44 public class DictionaryBasedBreakIterator extends RuleBasedBreakIterator {
\r
47 * Keeps track of if we are using the compact trie dictionary.
\r
49 private boolean usingCTDictionary = false;
\r
51 * a list of known words that is used to divide up contiguous ranges of letters,
\r
52 * stored in a compressed, indexed, format that offers fast access
\r
54 private BreakDictionary dictionary;
\r
57 * a list of flags indicating which character categories are contained in
\r
58 * the dictionary file (this is used to determine which ranges of characters
\r
59 * to apply the dictionary to)
\r
61 //private boolean[] categoryFlags;
\r
65 * when a range of characters is divided up using the dictionary, the break
\r
66 * positions that are discovered are stored here, preventing us from having
\r
67 * to use either the dictionary or the state table again until the iterator
\r
68 * leaves this range of text
\r
70 int[] cachedBreakPositions;
\r
73 * if cachedBreakPositions is not null, this indicates which item in the
\r
74 * cache the current iteration position refers to
\r
76 int positionInCache;
\r
79 * Special variable name for characters in words in dictionary
\r
83 * Construct a DictionarBasedBreakIterator from precompiled rules. Use by ThaiBreakEngine
\r
84 * uses the BreakCTDictionary.
\r
85 * @param compiledRules an input stream containing the binary (flattened) compiled rules.
\r
87 * @deprecated This API is ICU internal only.
\r
89 protected DictionaryBasedBreakIterator(InputStream compiledRules) throws IOException {
\r
90 fRData = RBBIDataWrapper.get(compiledRules); // Init the RBBI part of this iterator.
\r
92 usingCTDictionary = true;
\r
95 * Constructs a DictionaryBasedBreakIterator.
\r
96 * @param rules Same as the rules parameter on RuleBasedBreakIterator,
\r
97 * except for the special meaning of "_dictionary_". This parameter is just
\r
98 * passed through to RuleBasedBreakIterator constructor.
\r
99 * @param dictionaryStream the stream containing the dictionary data
\r
102 public DictionaryBasedBreakIterator(String rules,
\r
103 InputStream dictionaryStream) throws IOException {
\r
105 dictionary = new BreakDictionary(dictionaryStream);
\r
110 * Construct a DictionarBasedBreakIterator from precompiled rules.
\r
111 * @param compiledRules an input stream containing the binary (flattened) compiled rules.
\r
112 * @param dictionaryStream an input stream containing the dictionary data
\r
114 * @deprecated This API is ICU internal only.
\r
116 public DictionaryBasedBreakIterator(InputStream compiledRules,
\r
117 InputStream dictionaryStream) throws IOException {
\r
118 fRData = RBBIDataWrapper.get(compiledRules); // Init the RBBI part of this iterator.
\r
119 dictionary = new BreakDictionary(dictionaryStream);
\r
123 /** @stable ICU 2.0 */
\r
124 public void setText(CharacterIterator newText) {
\r
125 super.setText(newText);
\r
126 cachedBreakPositions = null;
\r
127 fDictionaryCharCount = 0;
\r
128 positionInCache = 0;
\r
132 * Sets the current iteration position to the beginning of the text.
\r
133 * (i.e., the CharacterIterator's starting offset).
\r
134 * @return The offset of the beginning of the text.
\r
137 public int first() {
\r
138 cachedBreakPositions = null;
\r
139 fDictionaryCharCount = 0;
\r
140 positionInCache = 0;
\r
141 return super.first();
\r
145 * Sets the current iteration position to the end of the text.
\r
146 * (i.e., the CharacterIterator's ending offset).
\r
147 * @return The text's past-the-end offset.
\r
150 public int last() {
\r
151 cachedBreakPositions = null;
\r
152 fDictionaryCharCount = 0;
\r
153 positionInCache = 0;
\r
154 return super.last();
\r
158 * Advances the iterator one step backwards.
\r
159 * @return The position of the last boundary position before the
\r
160 * current iteration position
\r
163 public int previous() {
\r
164 CharacterIterator text = getText();
\r
166 // if we have cached break positions and we're still in the range
\r
167 // covered by them, just move one step backward in the cache
\r
168 if (cachedBreakPositions != null && positionInCache > 0) {
\r
170 text.setIndex(cachedBreakPositions[positionInCache]);
\r
171 return cachedBreakPositions[positionInCache];
\r
174 // otherwise, dump the cache and use the inherited previous() method to move
\r
175 // backward. This may fill up the cache with new break positions, in which
\r
176 // case we have to mark our position in the cache. If it doesn't, use next()
\r
177 // to move forward until we hit or pass the current position. This *will* fill
\r
180 cachedBreakPositions = null;
\r
181 int offset = current();
\r
182 int result = super.previous();
\r
184 if (cachedBreakPositions != null) {
\r
185 positionInCache = cachedBreakPositions.length - 2;
\r
189 while (result < offset) {
\r
190 int nextResult = next();
\r
192 if (nextResult >= offset) {
\r
196 result = nextResult;
\r
199 if (cachedBreakPositions != null) {
\r
200 positionInCache = cachedBreakPositions.length - 2;
\r
203 if (result != BreakIterator.DONE) {
\r
204 text.setIndex(result);
\r
212 * Sets the current iteration position to the last boundary position
\r
213 * before the specified position.
\r
214 * @param offset The position to begin searching from
\r
215 * @return The position of the last boundary before "offset"
\r
218 public int preceding(int offset) {
\r
219 CharacterIterator text = getText();
\r
220 checkOffset(offset, text);
\r
222 // if we have no cached break positions, or "offset" is outside the
\r
223 // range covered by the cache, we can just call the inherited routine
\r
224 // (which will eventually call other routines in this class that may
\r
225 // refresh the cache)
\r
226 if (cachedBreakPositions == null || offset <= cachedBreakPositions[0] ||
\r
227 offset > cachedBreakPositions[cachedBreakPositions.length - 1]) {
\r
228 cachedBreakPositions = null;
\r
229 return super.preceding(offset);
\r
232 // on the other hand, if "offset" is within the range covered by the cache,
\r
233 // then all we have to do is search the cache for the last break position
\r
236 positionInCache = 0;
\r
237 while (positionInCache < cachedBreakPositions.length
\r
238 && offset > cachedBreakPositions[positionInCache])
\r
241 text.setIndex(cachedBreakPositions[positionInCache]);
\r
242 return text.getIndex();
\r
247 * Sets the current iteration position to the first boundary position after
\r
248 * the specified position.
\r
249 * @param offset The position to begin searching forward from
\r
250 * @return The position of the first boundary after "offset"
\r
253 public int following(int offset) {
\r
254 CharacterIterator text = getText();
\r
255 checkOffset(offset, text);
\r
257 // if we have no cached break positions, or if "offset" is outside the
\r
258 // range covered by the cache, then dump the cache and call our
\r
259 // inherited following() method. This will call other methods in this
\r
260 // class that may refresh the cache.
\r
261 if (cachedBreakPositions == null || offset < cachedBreakPositions[0] ||
\r
262 offset >= cachedBreakPositions[cachedBreakPositions.length - 1]) {
\r
263 cachedBreakPositions = null;
\r
264 return super.following(offset);
\r
267 // on the other hand, if "offset" is within the range covered by the
\r
268 // cache, then just search the cache for the first break position
\r
271 positionInCache = 0;
\r
272 while (positionInCache < cachedBreakPositions.length
\r
273 && offset >= cachedBreakPositions[positionInCache])
\r
275 text.setIndex(cachedBreakPositions[positionInCache]);
\r
276 return text.getIndex();
\r
282 * Return the status tag from the break rule that determined the most recently
\r
283 * returned break position.
\r
285 * TODO: not supported with dictionary based break iterators.
\r
287 * @return the status from the break rule that determined the most recently
\r
288 * returned break position.
\r
290 * @provisional This API might change or be removed in a future release.
\r
292 public int getRuleStatus() {
\r
298 * Get the status (tag) values from the break rule(s) that determined the most
\r
299 * recently returned break position. The values appear in the rule source
\r
300 * within brackets, {123}, for example. The default status value for rules
\r
301 * that do not explicitly provide one is zero.
\r
303 * TODO: not supported for dictionary based break iterator.
\r
305 * @param fillInArray an array to be filled in with the status values.
\r
306 * @return The number of rule status values from rules that determined
\r
307 * the most recent boundary returned by the break iterator.
\r
308 * In the event that the array is too small, the return value
\r
309 * is the total number of status values that were available,
\r
310 * not the reduced number that were actually returned.
\r
312 * @provisional This API might change or be removed in a future release.
\r
314 public int getRuleStatusVec(int[] fillInArray) {
\r
315 if (fillInArray != null && fillInArray.length>=1) {
\r
316 fillInArray[0] = 0;
\r
321 * This is the implementation function for next().
\r
323 * @deprecated This API is ICU internal only.
\r
325 protected int handleNext() {
\r
326 CharacterIterator text = getText();
\r
328 // if there are no cached break positions, or if we've just moved
\r
329 // off the end of the range covered by the cache, we have to dump
\r
330 // and possibly regenerate the cache
\r
331 if (cachedBreakPositions == null || positionInCache == cachedBreakPositions.length - 1) {
\r
333 // start by using the inherited handleNext() to find a tentative return
\r
334 // value. dictionaryCharCount tells us how many dictionary characters
\r
335 // we passed over on our way to the tentative return value
\r
336 int startPos = text.getIndex();
\r
337 fDictionaryCharCount = 0;
\r
338 int result = super.handleNext();
\r
340 // if we passed over more than one dictionary character, then we use
\r
341 // divideUpDictionaryRange() to regenerate the cached break positions
\r
342 // for the new range.
\r
343 if (!usingCTDictionary && fDictionaryCharCount > 1 && result - startPos > 1) {
\r
344 divideUpDictionaryRange(startPos, result);
\r
347 // otherwise, the value we got back from the inherited fuction
\r
348 // is our return value, and we can dump the cache
\r
350 cachedBreakPositions = null;
\r
355 // if the cache of break positions has been regenerated (or existed all
\r
356 // along), then just advance to the next break position in the cache
\r
358 if (cachedBreakPositions != null) {
\r
360 text.setIndex(cachedBreakPositions[positionInCache]);
\r
361 return cachedBreakPositions[positionInCache];
\r
364 Assert.assrt(false);
\r
365 return -9999; // SHOULD NEVER GET HERE!
\r
370 * This is the function that actually implements the dictionary-based
\r
371 * algorithm. Given the endpoints of a range of text, it uses the
\r
372 * dictionary to determine the positions of any boundaries in this
\r
373 * range. It stores all the boundary positions it discovers in
\r
374 * cachedBreakPositions so that we only have to do this work once
\r
375 * for each time we enter the range.
\r
377 @SuppressWarnings("unchecked")
\r
378 private void divideUpDictionaryRange(int startPos, int endPos) {
\r
379 CharacterIterator text = getText();
\r
381 // the range we're dividing may begin or end with non-dictionary characters
\r
382 // (i.e., for line breaking, we may have leading or trailing punctuation
\r
383 // that needs to be kept with the word). Seek from the beginning of the
\r
384 // range to the first dictionary character
\r
385 text.setIndex(startPos);
\r
386 int c = CICurrent32(text);
\r
387 while (isDictionaryChar(c) == false) {
\r
388 c = CINext32(text);
\r
391 //System.out.println("\nDividing up range from " + (text.getIndex() + 1) + " to " + endPos);
\r
393 // initialize. We maintain two stacks: currentBreakPositions contains
\r
394 // the list of break positions that will be returned if we successfully
\r
395 // finish traversing the whole range now. possibleBreakPositions lists
\r
396 // all other possible word ends we've passed along the way. (Whenever
\r
397 // we reach an error [a sequence of characters that can't begin any word
\r
398 // in the dictionary], we back up, possibly delete some breaks from
\r
399 // currentBreakPositions, move a break from possibleBreakPositions
\r
400 // to currentBreakPositions, and start over from there. This process
\r
401 // continues in this way until we either successfully make it all the way
\r
402 // across the range, or exhaust all of our combinations of break
\r
404 Stack<Integer> currentBreakPositions = new Stack<Integer>();
\r
405 Stack<Integer> possibleBreakPositions = new Stack<Integer>();
\r
406 Vector<Integer> wrongBreakPositions = new Vector<Integer>();
\r
408 // the dictionary is implemented as a trie, which is treated as a state
\r
409 // machine. -1 represents the end of a legal word. Every word in the
\r
410 // dictionary is represented by a path from the root node to -1. A path
\r
411 // that ends in state 0 is an illegal combination of characters.
\r
414 // these two variables are used for error handling. We keep track of the
\r
415 // farthest we've gotten through the range being divided, and the combination
\r
416 // of breaks that got us that far. If we use up all possible break
\r
417 // combinations, the text contains an error or a word that's not in the
\r
418 // dictionary. In this case, we "bless" the break positions that got us the
\r
419 // farthest as real break positions, and then start over from scratch with
\r
420 // the character where the error occurred.
\r
421 int farthestEndPoint = text.getIndex();
\r
422 Stack<Integer> bestBreakPositions = null;
\r
424 // initialize (we always exit the loop with a break statement)
\r
425 c = CICurrent32(text);
\r
427 //System.out.print("c = " + Integer.toString(c, 16) + ", pos = " + text.getIndex());
\r
429 // if we can transition to state "-1" from our current state, we're
\r
430 // on the last character of a legal word. Push that position onto
\r
431 // the possible-break-positions stack
\r
432 if (dictionary.at(state, 0) == -1) {
\r
433 possibleBreakPositions.push(Integer.valueOf(text.getIndex()));
\r
436 // look up the new state to transition to in the dictionary
\r
437 // There will be no supplementaries here because the Thai dictionary
\r
438 // does not include any. This code is going away soon, not worth
\r
440 state = (dictionary.at(state, (char)c)) & 0xFFFF; // TODO: fix supplementaries
\r
441 //System.out.print(", state = " + state);
\r
443 // if the character we're sitting on causes us to transition to
\r
444 // the "end of word" state, then it was a non-dictionary character
\r
445 // and we've successfully traversed the whole range. Drop out
\r
447 if (state == /*-1*/ 0xFFFF) {
\r
448 currentBreakPositions.push(Integer.valueOf(text.getIndex()));
\r
452 // if the character we're sitting on causes us to transition to
\r
453 // the error state, or if we've gone off the end of the range
\r
454 // without transitioning to the "end of word" state, we've hit
\r
456 else if (state == 0 || text.getIndex() >= endPos) {
\r
458 // if this is the farthest we've gotten, take note of it in
\r
459 // case there's an error in the text
\r
460 if (text.getIndex() > farthestEndPoint) {
\r
461 farthestEndPoint = text.getIndex();
\r
462 bestBreakPositions = (Stack<Integer>)(currentBreakPositions.clone());
\r
465 // wrongBreakPositions is a list of all break positions we've tried starting
\r
466 // that didn't allow us to traverse all the way through the text. Every time
\r
467 // we pop a break position off of currentBreakPositions, we put it into
\r
468 // wrongBreakPositions to avoid trying it again later. If we make it to this
\r
469 // spot, we're either going to back up to a break in possibleBreakPositions
\r
470 // and try starting over from there, or we've exhausted all possible break
\r
471 // positions and are going to do the fallback procedure. This loop prevents
\r
472 // us from messing with anything in possibleBreakPositions that didn't work as
\r
473 // a starting point the last time we tried it (this is to prevent a bunch of
\r
474 // repetitive checks from slowing down some extreme cases)
\r
475 // variable not used Integer newStartingSpot = null;
\r
476 while (!possibleBreakPositions.isEmpty() && wrongBreakPositions.contains(
\r
477 possibleBreakPositions.peek())) {
\r
478 possibleBreakPositions.pop();
\r
481 // if we've used up all possible break-position combinations, there's
\r
482 // an error or an unknown word in the text. In this case, we start
\r
483 // over, treating the farthest character we've reached as the beginning
\r
484 // of the range, and "blessing" the break positions that got us that
\r
485 // far as real break positions
\r
486 if (possibleBreakPositions.isEmpty()) {
\r
487 if (bestBreakPositions != null) {
\r
488 currentBreakPositions = bestBreakPositions;
\r
489 if (farthestEndPoint < endPos) {
\r
490 text.setIndex(farthestEndPoint + 1);
\r
497 if ((currentBreakPositions.size() == 0
\r
498 || currentBreakPositions.peek().intValue() != text.getIndex())
\r
499 && text.getIndex() != startPos) {
\r
500 currentBreakPositions.push(Integer.valueOf(text.getIndex()));
\r
503 currentBreakPositions.push(Integer.valueOf(text.getIndex()));
\r
507 // if we still have more break positions we can try, then promote the
\r
508 // last break in possibleBreakPositions into currentBreakPositions,
\r
509 // and get rid of all entries in currentBreakPositions that come after
\r
510 // it. Then back up to that position and start over from there (i.e.,
\r
511 // treat that position as the beginning of a new word)
\r
513 Integer temp = possibleBreakPositions.pop();
\r
514 Integer temp2 = null;
\r
515 while (!currentBreakPositions.isEmpty() && temp.intValue() <
\r
516 currentBreakPositions.peek().intValue()) {
\r
517 temp2 = currentBreakPositions.pop();
\r
518 wrongBreakPositions.addElement(temp2);
\r
520 currentBreakPositions.push(temp);
\r
521 text.setIndex(currentBreakPositions.peek().intValue());
\r
524 // re-sync "c" for the next go-round, and drop out of the loop if
\r
525 // we've made it off the end of the range
\r
526 c = CICurrent32(text);
\r
528 if (text.getIndex() >= endPos) {
\r
533 // if we didn't hit any exceptional conditions on this last iteration,
\r
534 // just advance to the next character and loop
\r
536 c = CINext32(text);
\r
538 //System.out.print(", possibleBreakPositions = { "); for (int i = 0; i < possibleBreakPositions.size(); i++) System.out.print(possibleBreakPositions.elementAt(i) + " "); System.out.print("}");
\r
539 //System.out.print(", currentBreakPositions = { "); for (int i = 0; i < currentBreakPositions.size(); i++) System.out.print(currentBreakPositions.elementAt(i) + " "); System.out.println("}");
\r
542 // dump the last break position in the list, and replace it with the actual
\r
543 // end of the range (which may be the same character, or may be further on
\r
544 // because the range actually ended with non-dictionary characters we want to
\r
545 // keep with the word)
\r
546 if (!currentBreakPositions.isEmpty()) {
\r
547 currentBreakPositions.pop();
\r
549 currentBreakPositions.push(Integer.valueOf(endPos));
\r
551 // create a regular array to hold the break positions and copy
\r
552 // the break positions from the stack to the array (in addition,
\r
553 // our starting position goes into this array as a break position).
\r
554 // This array becomes the cache of break positions used by next()
\r
555 // and previous(), so this is where we actually refresh the cache.
\r
556 cachedBreakPositions = new int[currentBreakPositions.size() + 1];
\r
557 cachedBreakPositions[0] = startPos;
\r
559 for (int i = 0; i < currentBreakPositions.size(); i++) {
\r
560 cachedBreakPositions[i + 1] = currentBreakPositions.elementAt(i).intValue();
\r
562 positionInCache = 0;
\r