jars/icu4j-4_4_2-src/main/classes/core/src/com/ibm/icu/text/ThaiBreakIterator.java

   1 /*\r
   2  *******************************************************************************\r
   3  * Copyright (C) 1996-2010, International Business Machines Corporation and    *\r
   4  * others. All Rights Reserved.                                                *\r
   5  *******************************************************************************\r
   6  */\r
   7 package com.ibm.icu.text;\r
   8 \r
   9 import java.io.IOException;\r
  10 import java.io.InputStream;\r
  11 import java.text.CharacterIterator;\r
  12 import java.util.Stack;\r
  13 \r
  14 import com.ibm.icu.impl.Assert;\r
  15 \r
  16 class ThaiBreakIterator extends DictionaryBasedBreakIterator {\r
  17 \r
  18     /* Helper class for improving readability of the Thai word break\r
  19      * algorithm.\r
  20      */\r
  21     static class PossibleWord {\r
  22         // List size, limited by the maximum number of words in the dictionary\r
  23         // that form a nested sequence.\r
  24         private final int POSSIBLE_WORD_LIST_MAX = 20;\r
  25         //list of word candidate lengths, in increasing length order\r
  26         private int lengths[];\r
  27         private int count[];    // Count of candidates\r
  28         private int prefix;     // The longeset match with a dictionary word\r
  29         private int offset;     // Offset in the text of these candidates\r
  30         private int mark;       // The preferred candidate's offset\r
  31         private int current;    // The candidate we're currently looking at\r
  32 \r
  33         // Default constructor\r
  34         public PossibleWord() {\r
  35             lengths = new int[POSSIBLE_WORD_LIST_MAX];\r
  36             count = new int[1]; // count needs to be an array of 1 so that it can be pass as reference\r
  37             offset = -1;\r
  38         }\r
  39 \r
  40         // Fill the list of candidates if needed, select the longest, and return the number found\r
  41         public int candidates(CharacterIterator fIter, BreakCTDictionary dict, int rangeEnd) {\r
  42             int start = fIter.getIndex();\r
  43             if (start != offset) {\r
  44                 offset = start;\r
  45                 prefix = dict.matches(fIter, rangeEnd - start, lengths, count, lengths.length);\r
  46                 // Dictionary leaves text after longest prefix, not longest word. Back up.\r
  47                 if (count[0] <= 0) {\r
  48                     fIter.setIndex(start);\r
  49                 }\r
  50             }\r
  51             if (count[0] > 0) {\r
  52                 fIter.setIndex(start + lengths[count[0]-1]);\r
  53             }\r
  54             current = count[0] - 1;\r
  55             mark = current;\r
  56             return count[0];\r
  57         }\r
  58 \r
  59         // Select the currently marked candidate, point after it in the text, and invalidate self\r
  60         public int acceptMarked(CharacterIterator fIter) {\r
  61             fIter.setIndex(offset + lengths[mark]);\r
  62             return lengths[mark];\r
  63         }\r
  64 \r
  65         // Backup from the current candidate to the next shorter one; rreturn true if that exists\r
  66         // and point the text after it\r
  67         public boolean backUp(CharacterIterator fIter) {\r
  68             if (current > 0) {\r
  69                 fIter.setIndex(offset + lengths[--current]);\r
  70                 return true;\r
  71             }\r
  72             return false;\r
  73         }\r
  74 \r
  75         // Return the longest prefix this candidate location shares with a dictionary word\r
  76         public int longestPrefix() {\r
  77             return prefix;\r
  78         }\r
  79 \r
  80         // Mark the current candidate as the one we like\r
  81         public void markCurrent() {\r
  82             mark = current;\r
  83         }\r
  84     }\r
  85 \r
  86     private static UnicodeSet fThaiWordSet;\r
  87     private static UnicodeSet fEndWordSet;\r
  88     private static UnicodeSet fBeginWordSet;\r
  89     private static UnicodeSet fSuffixSet;\r
  90     private static UnicodeSet fMarkSet;\r
  91     private BreakCTDictionary fDictionary;\r
  92 \r
  93     // Constants for ThaiBreakIterator\r
  94     // How many words in a row are "good enough"?\r
  95     private static final byte THAI_LOOKAHEAD = 3;\r
  96     // Will not combine a non-word with a preceding dictionary word longer than this\r
  97     private static final byte THAI_ROOT_COMBINE_THRESHOLD = 3;\r
  98     // Will not combine a non-word that shares at least this much prefix with a\r
  99     // dictionary word with a preceding word\r
 100     private static final byte THAI_PREFIX_COMBINE_THRESHOLD = 3;\r
 101     // Ellision character\r
 102     private static final char THAI_PAIYANNOI = 0x0E2F;\r
 103     // Repeat character\r
 104     private static final char THAI_MAIYAMOK = 0x0E46;\r
 105     // Minimum word size\r
 106     private static final byte THAI_MIN_WORD = 2;\r
 107     // Minimum number of characters for two words\r
 108     //private final int THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;\r
 109 \r
 110     static {\r
 111         // Initialize UnicodeSets\r
 112         fThaiWordSet = new UnicodeSet();\r
 113         fMarkSet = new UnicodeSet();\r
 114         fEndWordSet = new UnicodeSet();\r
 115         fBeginWordSet = new UnicodeSet();\r
 116         fSuffixSet = new UnicodeSet();\r
 117 \r
 118         fThaiWordSet.applyPattern("[[:Thai:]&[:LineBreak=SA:]]");\r
 119         fThaiWordSet.compact();\r
 120 \r
 121         fMarkSet.applyPattern("[[:Thai:]&[:LineBreak=SA:]&[:M:]]");\r
 122         fMarkSet.add(0x0020);\r
 123         fEndWordSet = fThaiWordSet;\r
 124         fEndWordSet.remove(0x0E31); // MAI HAN-AKAT\r
 125         fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI\r
 126         fBeginWordSet.add(0x0E01, 0x0E2E); //KO KAI through HO NOKHUK\r
 127         fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI\r
 128         fSuffixSet.add(THAI_PAIYANNOI);\r
 129         fSuffixSet.add(THAI_MAIYAMOK);\r
 130 \r
 131         // Compact for caching\r
 132         fMarkSet.compact();\r
 133         fEndWordSet.compact();\r
 134         fBeginWordSet.compact();\r
 135         fSuffixSet.compact();\r
 136         \r
 137         // Freeze the static UnicodeSet\r
 138         fThaiWordSet.freeze();\r
 139         fMarkSet.freeze();\r
 140         fEndWordSet.freeze();\r
 141         fBeginWordSet.freeze();\r
 142         fSuffixSet.freeze();\r
 143     }\r
 144 \r
 145     public ThaiBreakIterator(InputStream ruleStream, InputStream dictionaryStream) throws IOException {\r
 146         super(ruleStream);\r
 147         // Initialize diciontary\r
 148         fDictionary = new BreakCTDictionary(dictionaryStream);\r
 149     }\r
 150 \r
 151     /**\r
 152      * This is the implementation function for next().\r
 153      */\r
 154     protected int handleNext() {\r
 155         CharacterIterator text = getText();\r
 156 \r
 157         // if there are no cached break positions, or if we've just moved\r
 158         // off the end of the range covered by the cache, we have to dump\r
 159         // and possibly regenerate the cache\r
 160         if (cachedBreakPositions == null || positionInCache == cachedBreakPositions.length - 1) {\r
 161 \r
 162             // start by using the inherited handleNext() to find a tentative return\r
 163             // value.   dictionaryCharCount tells us how many dictionary characters\r
 164             // we passed over on our way to the tentative return value\r
 165             int startPos = text.getIndex();\r
 166             fDictionaryCharCount = 0;\r
 167             int result = super.handleNext();\r
 168 \r
 169             // if we passed over more than one dictionary character, then we use\r
 170             // divideUpDictionaryRange() to regenerate the cached break positions\r
 171             // for the new range\r
 172             if (fDictionaryCharCount > 1 && result - startPos > 1) {\r
 173                 divideUpDictionaryRange(startPos, result);\r
 174             }\r
 175 \r
 176             // otherwise, the value we got back from the inherited fuction\r
 177             // is our return value, and we can dump the cache\r
 178             else {\r
 179                 cachedBreakPositions = null;\r
 180                 return result;\r
 181             }\r
 182         }\r
 183         // if the cache of break positions has been regenerated (or existed all\r
 184         // along), then just advance to the next break position in the cache\r
 185         // and return it\r
 186         if (cachedBreakPositions != null) {\r
 187             ++positionInCache;\r
 188             text.setIndex(cachedBreakPositions[positionInCache]);\r
 189             return cachedBreakPositions[positionInCache];\r
 190         }\r
 191         Assert.assrt(false);\r
 192         return -9999;   // SHOULD NEVER GET HERE!\r
 193     }\r
 194 \r
 195     /**\r
 196      * Divide up a range of known dictionary characters.\r
 197      *\r
 198      * @param rangeStart The start of the range of dictionary characters\r
 199      * @param rangeEnd The end of the range of dictionary characters\r
 200      * @return The number of breaks found\r
 201      */\r
 202     private int divideUpDictionaryRange(int rangeStart, int rangeEnd) {\r
 203         if ((rangeEnd - rangeStart) < THAI_MIN_WORD) {\r
 204             return 0;  // Not enough chacters for word\r
 205         }\r
 206         CharacterIterator fIter = getText();\r
 207         int wordsFound = 0;\r
 208         int wordLength;\r
 209         int current;\r
 210         Stack<Integer> foundBreaks = new Stack<Integer>();\r
 211         PossibleWord words[] = new PossibleWord[THAI_LOOKAHEAD];\r
 212         for (int i = 0; i < THAI_LOOKAHEAD; i++) {\r
 213             words[i] = new PossibleWord();\r
 214         }\r
 215         int uc;\r
 216 \r
 217         fIter.setIndex(rangeStart);\r
 218 \r
 219         while ((current = fIter.getIndex()) < rangeEnd) {\r
 220             wordLength = 0;\r
 221 \r
 222             //Look for candidate words at the current position\r
 223             int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);\r
 224 \r
 225             // If we found exactly one, use that\r
 226             if (candidates == 1) {\r
 227                 wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(fIter);\r
 228                 wordsFound += 1;\r
 229             }\r
 230 \r
 231             // If there was more than one, see which one can take use forward the most words\r
 232             else if (candidates > 1) {\r
 233                 boolean foundBest = false;\r
 234                 // If we're already at the end of the range, we're done\r
 235                 if (fIter.getIndex() < rangeEnd) {\r
 236                     do {\r
 237                         int wordsMatched = 1;\r
 238                         if (words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {\r
 239                             if (wordsMatched < 2) {\r
 240                                 // Followed by another dictionary word; mark first word as a good candidate\r
 241                                 words[wordsFound%THAI_LOOKAHEAD].markCurrent();\r
 242                                 wordsMatched = 2;\r
 243                             }\r
 244 \r
 245                             // If we're already at the end of the range, we're done\r
 246                             if (fIter.getIndex() >= rangeEnd) {\r
 247                                 break;\r
 248                             }\r
 249 \r
 250                             // See if any of the possible second words is followed by a third word\r
 251                             do {\r
 252                                 // If we find a third word, stop right away\r
 253                                 if (words[(wordsFound+2)%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {\r
 254                                     words[wordsFound%THAI_LOOKAHEAD].markCurrent();\r
 255                                     foundBest = true;\r
 256                                     break;\r
 257                                 }\r
 258                             } while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(fIter));\r
 259                         }\r
 260                     } while (words[wordsFound%THAI_LOOKAHEAD].backUp(fIter) && !foundBest);\r
 261                 }\r
 262                 /* foundBest: */wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(fIter);\r
 263                 wordsFound += 1;\r
 264             }\r
 265             // We come here after having either found a word or not. We look ahead to the\r
 266             // next word. If it's not a dictionary word, we will combine it with the word we\r
 267             // just found (if there is one), but only if the preceding word does not exceed\r
 268             // the threshold.\r
 269             // The text iterator should now be positioned at the end of the word we found.\r
 270             if (fIter.getIndex() < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {\r
 271                 // If it is a dictionary word, do nothing. If it isn't, then if there is\r
 272                 // no preceding word, or the non-word shares less than the minimum threshold\r
 273                 // of characters with a dictionary word, then scan to resynchronize\r
 274                 if (words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&\r
 275                         (wordLength == 0 || \r
 276                                 words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {\r
 277                     // Look for a plausible word boundary\r
 278                     int remaining = rangeEnd - (current + wordLength);\r
 279                     int pc = fIter.current();\r
 280                     int chars = 0;\r
 281                     for (;;) {\r
 282                         fIter.next();\r
 283                         uc = fIter.current();\r
 284                         chars += 1;\r
 285                         if (--remaining <= 0) {\r
 286                             break;\r
 287                         }\r
 288                         if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {\r
 289                             // Maybe. See if it's in the dictionary.\r
 290                             // Note: In the original Apple code, checked that the next\r
 291                             // two characters after uc were not 0x0E4C THANTHAKHAT before\r
 292                             // checking the dictionary. That is just a performance filter,\r
 293                             // but it's not clear it's faster than checking the trie\r
 294                             int candidate = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);\r
 295                             fIter.setIndex(current+wordLength+chars);\r
 296                             if (candidate > 0) {\r
 297                                 break;\r
 298                             }\r
 299                         }\r
 300                         pc = uc;\r
 301                     }\r
 302 \r
 303                     // Bump the word cound if there wasn't already one\r
 304                     if (wordLength <= 0) {\r
 305                         wordsFound += 1;\r
 306                     }\r
 307 \r
 308                     // Update the length with the passed-over characters\r
 309                     wordLength += chars;\r
 310                 } else {\r
 311                     // Backup to where we were for next iteration\r
 312                     fIter.setIndex(current+wordLength);\r
 313                 }\r
 314             }\r
 315 \r
 316             // Never stop before a combining mark.\r
 317             int currPos;\r
 318             while ((currPos = fIter.getIndex()) < rangeEnd && fMarkSet.contains(fIter.current())) {\r
 319                 fIter.next();\r
 320                 wordLength += fIter.getIndex() - currPos;\r
 321             }\r
 322 \r
 323             // Look ahead for possible suffixes if a dictionary word does not follow.\r
 324             // We do this in code rather than using a rule so that the heuristic\r
 325             // resynch continues to function. For example, one of the suffix characters \r
 326             // could be a typo in the middle of a word.\r
 327             if (fIter.getIndex() < rangeEnd && wordLength > 0) {\r
 328                 if (words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&\r
 329                         fSuffixSet.contains(uc = fIter.current())) {\r
 330                     if (uc == THAI_PAIYANNOI) {\r
 331                         if (!fSuffixSet.contains(fIter.previous())) {\r
 332                             // Skip over previous end and PAIYANNOI\r
 333                             fIter.next();\r
 334                             fIter.next();\r
 335                             wordLength += 1;\r
 336                             uc = fIter.current();\r
 337                         } else {\r
 338                             // Restore prior position\r
 339                             fIter.next();\r
 340                         }\r
 341                     }\r
 342                     if (uc == THAI_MAIYAMOK) {\r
 343                         if (fIter.previous() != THAI_MAIYAMOK) {\r
 344                             // Skip over previous end and MAIYAMOK\r
 345                             fIter.next();\r
 346                             fIter.next();\r
 347                             wordLength += 1;\r
 348                         } else {\r
 349                             // restore prior position\r
 350                             fIter.next();\r
 351                         }\r
 352                     }\r
 353                 } else {\r
 354                     fIter.setIndex(current+wordLength);\r
 355                 }\r
 356             }\r
 357 \r
 358             // Did we find a word on this iteration? If so, push it on the break stack\r
 359             if (wordLength > 0) {\r
 360                 foundBreaks.push(Integer.valueOf(current+wordLength));\r
 361             }\r
 362         }\r
 363 \r
 364         // Don't return a break for the end of the dictionary range if there is one there\r
 365         if (foundBreaks.peek().intValue() >= rangeEnd) {\r
 366             foundBreaks.pop();\r
 367             wordsFound -= 1;\r
 368         }\r
 369 \r
 370         // Store the break points in cachedBreakPositions.\r
 371         cachedBreakPositions = new int[foundBreaks.size() + 2];\r
 372         cachedBreakPositions[0] = rangeStart;\r
 373         int i;\r
 374         for (i = 0; i < foundBreaks.size(); i++) {\r
 375             cachedBreakPositions[i + 1] = foundBreaks.elementAt(i).intValue();\r
 376         }\r
 377         cachedBreakPositions[i + 1] = rangeEnd;\r
 378         positionInCache = 0;\r
 379 \r
 380         return wordsFound;\r
 381     }\r
 382 }\r