]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-4_2_1-src/src/com/ibm/icu/dev/test/rbbi/RBBITest.java
icu4jsrc
[Dictionary.git] / jars / icu4j-4_2_1-src / src / com / ibm / icu / dev / test / rbbi / RBBITest.java
1 /*\r
2  *******************************************************************************\r
3  * Copyright (C) 1996-2009, International Business Machines Corporation and    *\r
4  * others. All Rights Reserved.                                                *\r
5  *******************************************************************************\r
6  */\r
7 package com.ibm.icu.dev.test.rbbi;\r
8  \r
9 //Regression testing of RuleBasedBreakIterator\r
10 //\r
11 //  TODO:  These tests should be mostly retired.\r
12 //          Much of the test data that was originally here was removed when the RBBI rules\r
13 //            were updated to match the Unicode boundary TRs, and the data was found to be invalid.\r
14 //          Much of the remaining data has been moved into the rbbitst.txt test data file,\r
15 //            which is common between ICU4C and ICU4J.  The remaining test data should also be moved,\r
16 //            or simply retired if it is no longer interesting.\r
17 import com.ibm.icu.dev.test.*;\r
18 import com.ibm.icu.text.RuleBasedBreakIterator;\r
19 import com.ibm.icu.text.BreakIterator;\r
20 import com.ibm.icu.util.ULocale;\r
21 \r
22 import java.util.Vector;\r
23 \r
24 public class RBBITest extends TestFmwk \r
25 {  \r
26     \r
27   public static void main(String[] args) throws Exception {\r
28     new RBBITest().run(args);\r
29   }\r
30 \r
31   public RBBITest() { \r
32   }\r
33   \r
/** Half-form NA: Devanagari NA + virama (suppresses the inherent vowel) + zero width joiner. */
private static final String halfNA = "\u0928\u094d\u200d";
35 \r
36 \r
37   // tests default rules based character iteration.\r
38   // Builds a new iterator from the source rules in the default (prebuilt) iterator.\r
39   //\r
40   public void TestDefaultRuleBasedCharacterIteration(){\r
41       RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)BreakIterator.getCharacterInstance();\r
42       logln("Testing the RBBI for character iteration by using default rules");\r
43 \r
44       //fetch the rules used to create the above RuleBasedBreakIterator\r
45       String defaultRules=rbbi.toString();\r
46       \r
47       RuleBasedBreakIterator charIterDefault=null;\r
48       try{\r
49           charIterDefault   = new RuleBasedBreakIterator(defaultRules); \r
50       }catch(IllegalArgumentException iae){\r
51           errln("ERROR: failed construction in TestDefaultRuleBasedCharacterIteration()"+ iae.toString());\r
52       }\r
53 \r
54       Vector chardata = new Vector();\r
55       chardata.addElement("H");\r
56       chardata.addElement("e");\r
57       chardata.addElement("l");\r
58       chardata.addElement("l");\r
59       chardata.addElement("o");\r
60       chardata.addElement("e\u0301");                   //acuteE\r
61       chardata.addElement("&");\r
62       chardata.addElement("e\u0303");                   //tildaE\r
63       //devanagiri characters for Hindi support\r
64       chardata.addElement("\u0906");                    //devanagiri AA\r
65       //chardata.addElement("\u093e\u0901");              //devanagiri vowelsign AA+ chandrabindhu\r
66       chardata.addElement("\u0916\u0947");              //devanagiri KHA+vowelsign E\r
67       chardata.addElement("\u0938\u0941\u0902");        //devanagiri SA+vowelsign U + anusvara(bindu)\r
68       chardata.addElement("\u0926");                    //devanagiri consonant DA\r
69       chardata.addElement("\u0930");                    //devanagiri consonant RA\r
70       // chardata.addElement("\u0939\u094c");              //devanagiri HA+vowel sign AI\r
71       chardata.addElement("\u0964");                    //devanagiri danda\r
72       //end hindi characters      \r
73       chardata.addElement("A\u0302");                   // circumflexA \r
74       chardata.addElement("i\u0301");                   // acuteBelowI   \r
75       // conjoining jamo... \r
76       chardata.addElement("\u1109\u1161\u11bc");\r
77       chardata.addElement("\u1112\u1161\u11bc");\r
78       chardata.addElement("\n");\r
79       chardata.addElement("\r\n");                      // keep CRLF sequences together  \r
80       chardata.addElement("S\u0300");                   //graveS\r
81       chardata.addElement("i\u0301");                   // acuteBelowI\r
82       chardata.addElement("!");\r
83 \r
84        // What follows is a string of Korean characters (I found it in the Yellow Pages\r
85       // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed\r
86       // it correctly), first as precomposed syllables, and then as conjoining jamo.\r
87       // Both sequences should be semantically identical and break the same way.\r
88       // precomposed syllables...\r
89       chardata.addElement("\uc0c1");\r
90       chardata.addElement("\ud56d");\r
91       chardata.addElement(" ");\r
92       chardata.addElement("\ud55c");\r
93       chardata.addElement("\uc778");\r
94       chardata.addElement(" ");\r
95       chardata.addElement("\uc5f0");\r
96       chardata.addElement("\ud569");\r
97       chardata.addElement(" ");\r
98       chardata.addElement("\uc7a5");\r
99       chardata.addElement("\ub85c");\r
100       chardata.addElement("\uad50");\r
101       chardata.addElement("\ud68c");\r
102       chardata.addElement(" ");\r
103        // conjoining jamo...\r
104       chardata.addElement("\u1109\u1161\u11bc");\r
105       chardata.addElement("\u1112\u1161\u11bc");\r
106       chardata.addElement(" ");\r
107       chardata.addElement("\u1112\u1161\u11ab");\r
108       chardata.addElement("\u110b\u1175\u11ab");\r
109       chardata.addElement(" ");\r
110       chardata.addElement("\u110b\u1167\u11ab");\r
111       chardata.addElement("\u1112\u1161\u11b8");\r
112       chardata.addElement(" ");\r
113       chardata.addElement("\u110c\u1161\u11bc");\r
114       chardata.addElement("\u1105\u1169");\r
115       chardata.addElement("\u1100\u116d");\r
116       chardata.addElement("\u1112\u116c");\r
117 \r
118 \r
119       generalIteratorTest(charIterDefault, chardata);\r
120 \r
121   }\r
122 \r
123   public void TestDefaultRuleBasedWordIteration(){\r
124       logln("Testing the RBBI for word iteration using default rules");\r
125       RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)BreakIterator.getWordInstance();\r
126       //fetch the rules used to create the above RuleBasedBreakIterator\r
127       String defaultRules=rbbi.toString();\r
128       \r
129       RuleBasedBreakIterator wordIterDefault=null;\r
130       try{\r
131       wordIterDefault   = new RuleBasedBreakIterator(defaultRules); \r
132       }catch(IllegalArgumentException iae){\r
133           errln("ERROR: failed construction in TestDefaultRuleBasedWordIteration() -- custom rules"+ iae.toString());\r
134       }\r
135 \r
136       Vector worddata = new Vector();\r
137       worddata.addElement ("Write");\r
138       worddata.addElement (" ");\r
139       worddata.addElement ("wordrules");\r
140       worddata.addElement (".");\r
141       worddata.addElement(" ");\r
142       //worddata.addElement("alpha-beta-gamma");\r
143       worddata.addElement(" ");      \r
144       worddata.addElement("\u092f\u0939");\r
145       worddata.addElement(" ");\r
146       worddata.addElement("\u0939\u093f" + halfNA + "\u0926\u0940");\r
147       worddata.addElement(" ");\r
148       worddata.addElement("\u0939\u0948");\r
149       //  worddata.addElement("\u0964");   //danda followed by a space\r
150       worddata.addElement(" ");\r
151       worddata.addElement("\u0905\u093e\u092a");\r
152       worddata.addElement(" ");\r
153       worddata.addElement("\u0938\u093f\u0916\u094b\u0917\u0947");\r
154       worddata.addElement("?");\r
155       worddata.addElement(" ");\r
156        worddata.addElement("\r");\r
157       worddata.addElement("It's");\r
158       worddata.addElement(" ");\r
159      // worddata.addElement("$30.10");\r
160       worddata.addElement(" ");  \r
161        worddata.addElement(" ");\r
162       worddata.addElement("Badges");\r
163       worddata.addElement("?");\r
164       worddata.addElement(" ");\r
165       worddata.addElement("BADGES");\r
166       worddata.addElement("!");\r
167       worddata.addElement("1000,233,456.000");\r
168       worddata.addElement(" ");\r
169 \r
170       generalIteratorTest(wordIterDefault, worddata);\r
171   }\r
172 //  private static final String kParagraphSeparator = "\u2029";\r
/** U+2028 LINE SEPARATOR, appended to one of the line-iteration test fragments. */
private static final String kLineSeparator      = "\u2028";
174 \r
175   public void TestDefaultRuleBasedSentenceIteration(){\r
176       logln("Testing the RBBI for sentence iteration using default rules");\r
177       RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)BreakIterator.getSentenceInstance();\r
178       \r
179       //fetch the rules used to create the above RuleBasedBreakIterator\r
180       String defaultRules=rbbi.toString();\r
181       RuleBasedBreakIterator sentIterDefault=null;\r
182       try{\r
183           sentIterDefault   = new RuleBasedBreakIterator(defaultRules); \r
184       }catch(IllegalArgumentException iae){\r
185           errln("ERROR: failed construction in TestDefaultRuleBasedSentenceIteration()" + iae.toString());\r
186       }\r
187       \r
188       Vector sentdata = new Vector();\r
189       sentdata.addElement("(This is it.) ");\r
190       sentdata.addElement("Testing the sentence iterator. ");\r
191       sentdata.addElement("\"This isn\'t it.\" ");\r
192       sentdata.addElement("Hi! ");\r
193       sentdata.addElement("This is a simple sample sentence. ");\r
194       sentdata.addElement("(This is it.) ");\r
195       sentdata.addElement("This is a simple sample sentence. ");\r
196       sentdata.addElement("\"This isn\'t it.\" ");\r
197       sentdata.addElement("Hi! ");\r
198       sentdata.addElement("This is a simple sample sentence. ");\r
199       sentdata.addElement("It does not have to make any sense as you can see. ");\r
200       sentdata.addElement("Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ");\r
201       sentdata.addElement("Che la dritta via aveo smarrita. ");\r
202        generalIteratorTest(sentIterDefault, sentdata);\r
203   }\r
204    \r
205   public void TestDefaultRuleBasedLineIteration(){\r
206       logln("Testing the RBBI for line iteration using default rules");\r
207       RuleBasedBreakIterator rbbi=(RuleBasedBreakIterator)RuleBasedBreakIterator.getLineInstance();\r
208       //fetch the rules used to create the above RuleBasedBreakIterator\r
209       String defaultRules=rbbi.toString();\r
210       RuleBasedBreakIterator lineIterDefault=null;\r
211       try{\r
212           lineIterDefault   = new RuleBasedBreakIterator(defaultRules); \r
213       }catch(IllegalArgumentException iae){\r
214           errln("ERROR: failed construction in TestDefaultRuleBasedLineIteration()" + iae.toString());\r
215       }\r
216 \r
217      Vector linedata = new Vector();\r
218      linedata.addElement("Multi-");\r
219      linedata.addElement("Level ");\r
220      linedata.addElement("example ");\r
221      linedata.addElement("of ");\r
222      linedata.addElement("a ");\r
223      linedata.addElement("semi-");\r
224      linedata.addElement("idiotic ");\r
225      linedata.addElement("non-");\r
226      linedata.addElement("sensical ");\r
227      linedata.addElement("(non-");\r
228      linedata.addElement("important) ");\r
229      linedata.addElement("sentence. ");\r
230 \r
231      linedata.addElement("Hi  ");\r
232      linedata.addElement("Hello ");\r
233      linedata.addElement("How\n");\r
234      linedata.addElement("are\r");\r
235      linedata.addElement("you" + kLineSeparator);\r
236      linedata.addElement("fine.\t");\r
237      linedata.addElement("good.  ");\r
238 \r
239      linedata.addElement("Now\r");\r
240      linedata.addElement("is\n");\r
241      linedata.addElement("the\r\n");\r
242      linedata.addElement("time\n");\r
243      linedata.addElement("\r");\r
244      linedata.addElement("for\r");\r
245      linedata.addElement("\r");\r
246      linedata.addElement("all");\r
247 \r
248      generalIteratorTest(lineIterDefault, linedata);\r
249 \r
250 \r
251   }\r
252  \r
253       //=========================================================================\r
254      // general test subroutines\r
255      //=========================================================================\r
256 \r
257      private void generalIteratorTest(RuleBasedBreakIterator rbbi, Vector expectedResult){\r
258          StringBuffer buffer = new StringBuffer();\r
259          String text;\r
260          for (int i = 0; i < expectedResult.size(); i++) {\r
261              text = (String)expectedResult.elementAt(i);\r
262              buffer.append(text);\r
263          }\r
264          text = buffer.toString();\r
265          if (rbbi == null) {\r
266              errln("null iterator, test skipped.");\r
267              return;\r
268          }\r
269 \r
270          rbbi.setText(text);\r
271 \r
272          Vector nextResults = _testFirstAndNext(rbbi, text);\r
273          Vector previousResults = _testLastAndPrevious(rbbi, text);\r
274 \r
275          logln("comparing forward and backward...");\r
276          int errs = getErrorCount();\r
277          compareFragmentLists("forward iteration", "backward iteration", nextResults,\r
278                          previousResults);\r
279          if (getErrorCount() == errs) {\r
280              logln("comparing expected and actual...");\r
281              compareFragmentLists("expected result", "actual result", expectedResult,\r
282                              nextResults);\r
283          }\r
284 \r
285         int[] boundaries = new int[expectedResult.size() + 3];\r
286         boundaries[0] = RuleBasedBreakIterator.DONE;\r
287         boundaries[1] = 0;\r
288         for (int i = 0; i < expectedResult.size(); i++)\r
289          boundaries[i + 2] = boundaries[i + 1] + ((String)expectedResult.elementAt(i)).length();\r
290       \r
291         boundaries[boundaries.length - 1] = RuleBasedBreakIterator.DONE;\r
292       \r
293         _testFollowing(rbbi, text, boundaries);\r
294         _testPreceding(rbbi, text, boundaries);\r
295         _testIsBoundary(rbbi, text, boundaries);\r
296 \r
297         doMultipleSelectionTest(rbbi, text);\r
298      }\r
299 \r
300      private Vector _testFirstAndNext(RuleBasedBreakIterator rbbi, String text) {\r
301          int p = rbbi.first();\r
302          int lastP = p;\r
303          Vector result = new Vector();\r
304 \r
305          if (p != 0)\r
306              errln("first() returned " + p + " instead of 0");\r
307          while (p != RuleBasedBreakIterator.DONE) {\r
308              p = rbbi.next();\r
309              if (p != RuleBasedBreakIterator.DONE) {\r
310                  if (p <= lastP)\r
311                      errln("next() failed to move forward: next() on position "\r
312                                      + lastP + " yielded " + p);\r
313 \r
314                  result.addElement(text.substring(lastP, p));\r
315              }\r
316              else {\r
317                  if (lastP != text.length())\r
318                      errln("next() returned DONE prematurely: offset was "\r
319                                      + lastP + " instead of " + text.length());\r
320              }\r
321              lastP = p;\r
322          }\r
323          return result;\r
324      }\r
325 \r
326      private Vector _testLastAndPrevious(RuleBasedBreakIterator rbbi, String text) {\r
327          int p = rbbi.last();\r
328          int lastP = p;\r
329          Vector result = new Vector();\r
330 \r
331          if (p != text.length())\r
332              errln("last() returned " + p + " instead of " + text.length());\r
333          while (p != RuleBasedBreakIterator.DONE) {\r
334              p = rbbi.previous();\r
335              if (p != RuleBasedBreakIterator.DONE) {\r
336                  if (p >= lastP)\r
337                      errln("previous() failed to move backward: previous() on position "\r
338                                      + lastP + " yielded " + p);\r
339 \r
340                  result.insertElementAt(text.substring(p, lastP), 0);\r
341              }\r
342              else {\r
343                  if (lastP != 0)\r
344                      errln("previous() returned DONE prematurely: offset was "\r
345                                      + lastP + " instead of 0");\r
346              }\r
347              lastP = p;\r
348          }\r
349          return result;\r
350      }\r
351 \r
352      private void compareFragmentLists(String f1Name, String f2Name, Vector f1, Vector f2) {\r
353          int p1 = 0;\r
354          int p2 = 0;\r
355          String s1;\r
356          String s2;\r
357          int t1 = 0;\r
358          int t2 = 0;\r
359 \r
360          while (p1 < f1.size() && p2 < f2.size()) {\r
361              s1 = (String)f1.elementAt(p1);\r
362              s2 = (String)f2.elementAt(p2);\r
363              t1 += s1.length();\r
364              t2 += s2.length();\r
365 \r
366              if (s1.equals(s2)) {\r
367                  debugLogln("   >" + s1 + "<");\r
368                  ++p1;\r
369                  ++p2;\r
370              }\r
371              else {\r
372                  int tempT1 = t1;\r
373                  int tempT2 = t2;\r
374                  int tempP1 = p1;\r
375                  int tempP2 = p2;\r
376 \r
377                  while (tempT1 != tempT2 && tempP1 < f1.size() && tempP2 < f2.size()) {\r
378                      while (tempT1 < tempT2 && tempP1 < f1.size()) {\r
379                          tempT1 += ((String)f1.elementAt(tempP1)).length();\r
380                          ++tempP1;\r
381                      }\r
382                      while (tempT2 < tempT1 && tempP2 < f2.size()) {\r
383                          tempT2 += ((String)f2.elementAt(tempP2)).length();\r
384                          ++tempP2;\r
385                      }\r
386                  }\r
387                  logln("*** " + f1Name + " has:");\r
388                  while (p1 <= tempP1 && p1 < f1.size()) {\r
389                      s1 = (String)f1.elementAt(p1);\r
390                      t1 += s1.length();\r
391                      debugLogln(" *** >" + s1 + "<");\r
392                      ++p1;\r
393                  }\r
394                  logln("***** " + f2Name + " has:");\r
395                  while (p2 <= tempP2 && p2 < f2.size()) {\r
396                      s2 = (String)f2.elementAt(p2);\r
397                      t2 += s2.length();\r
398                      debugLogln(" ***** >" + s2 + "<");\r
399                      ++p2;\r
400                  }\r
401                  errln("Discrepancy between " + f1Name + " and " + f2Name);\r
402              }\r
403          }\r
404      }\r
405 \r
406     private void _testFollowing(RuleBasedBreakIterator rbbi, String text, int[] boundaries) {\r
407        logln("testFollowing():");\r
408        int p = 2;\r
409        for(int i = 0; i <= text.length(); i++) {\r
410            if (i == boundaries[p])\r
411                ++p;\r
412            int b = rbbi.following(i);\r
413            logln("rbbi.following(" + i + ") -> " + b);\r
414            if (b != boundaries[p])\r
415                errln("Wrong result from following() for " + i + ": expected " + boundaries[p]\r
416                                + ", got " + b);\r
417        }\r
418    }\r
419 \r
420    private void _testPreceding(RuleBasedBreakIterator rbbi, String text, int[] boundaries) {\r
421        logln("testPreceding():");\r
422        int p = 0;\r
423        for(int i = 0; i <= text.length(); i++) {\r
424            int b = rbbi.preceding(i);\r
425            logln("rbbi.preceding(" + i + ") -> " + b);\r
426            if (b != boundaries[p])\r
427                errln("Wrong result from preceding() for " + i + ": expected " + boundaries[p]\r
428                               + ", got " + b);\r
429            if (i == boundaries[p + 1])\r
430                ++p;\r
431        }\r
432    }\r
433 \r
434    private void _testIsBoundary(RuleBasedBreakIterator rbbi, String text, int[] boundaries) {\r
435        logln("testIsBoundary():");\r
436        int p = 1;\r
437        boolean isB;\r
438        for(int i = 0; i <= text.length(); i++) {\r
439            isB = rbbi.isBoundary(i);\r
440            logln("rbbi.isBoundary(" + i + ") -> " + isB);\r
441            if(i == boundaries[p]) {\r
442                if (!isB)\r
443                    errln("Wrong result from isBoundary() for " + i + ": expected true, got false");\r
444                ++p;\r
445            }\r
446            else {\r
447                if(isB)\r
448                    errln("Wrong result from isBoundary() for " + i + ": expected false, got true");\r
449            }\r
450        }\r
451    }\r
452    private void doMultipleSelectionTest(RuleBasedBreakIterator iterator, String testText)\r
453    {\r
454        logln("Multiple selection test...");\r
455        RuleBasedBreakIterator testIterator = (RuleBasedBreakIterator)iterator.clone();\r
456        int offset = iterator.first();\r
457        int testOffset;\r
458        int count = 0;\r
459 \r
460        do {\r
461            testOffset = testIterator.first();\r
462            testOffset = testIterator.next(count);\r
463            logln("next(" + count + ") -> " + testOffset);\r
464            if (offset != testOffset)\r
465                errln("next(n) and next() not returning consistent results: for step " + count + ", next(n) returned " + testOffset + " and next() had " + offset);\r
466 \r
467            if (offset != RuleBasedBreakIterator.DONE) {\r
468                count++;\r
469                offset = iterator.next();\r
470            }\r
471        } while (offset != RuleBasedBreakIterator.DONE);\r
472 \r
473        // now do it backwards...\r
474        offset = iterator.last();\r
475        count = 0;\r
476 \r
477        do {\r
478            testOffset = testIterator.last();\r
479            testOffset = testIterator.next(count);\r
480            logln("next(" + count + ") -> " + testOffset);\r
481            if (offset != testOffset)\r
482                errln("next(n) and next() not returning consistent results: for step " + count + ", next(n) returned " + testOffset + " and next() had " + offset);\r
483 \r
484            if (offset != RuleBasedBreakIterator.DONE) {\r
485                count--;\r
486                offset = iterator.previous();\r
487            }\r
488        } while (offset != RuleBasedBreakIterator.DONE);\r
489    }\r
490 \r
491    private void debugLogln(String s) {\r
492         final String zeros = "0000";\r
493         String temp;\r
494         StringBuffer out = new StringBuffer();\r
495         for (int i = 0; i < s.length(); i++) {\r
496             char c = s.charAt(i);\r
497             if (c >= ' ' && c < '\u007f')\r
498                 out.append(c);\r
499             else {\r
500                 out.append("\\u");\r
501                 temp = Integer.toHexString((int)c);\r
502                 out.append(zeros.substring(0, 4 - temp.length()));\r
503                 out.append(temp);\r
504             }\r
505         }\r
506          logln(out.toString());\r
507     }\r
508    \r
509    public void TestThaiDictionaryBreakIterator() {\r
510        int position;\r
511        int index;\r
512        int result[] = { 1, 2, 5, 10, 11, 12, 11, 10, 5, 2, 1, 0 };\r
513        char ctext[] = { \r
514                0x0041, 0x0020,\r
515                0x0E01, 0x0E32, 0x0E23, 0x0E17, 0x0E14, 0x0E25, 0x0E2D, 0x0E07,\r
516                0x0020, 0x0041\r
517                };\r
518        String text = new String(ctext);\r
519        \r
520        ULocale locale = ULocale.createCanonical("th");\r
521        BreakIterator b = BreakIterator.getWordInstance(locale);\r
522        \r
523        b.setText(text);\r
524        \r
525        index = 0;\r
526        // Test forward iteration\r
527        while ((position = b.next())!= BreakIterator.DONE) {\r
528            if (position != result[index++]) {\r
529                errln("Error with ThaiDictionaryBreakIterator forward iteration test at " + position + ".\nShould have been " + result[index-1]);\r
530            }\r
531        }\r
532        \r
533        // Test backward iteration\r
534        while ((position = b.previous())!= BreakIterator.DONE) {\r
535            if (position != result[index++]) {\r
536                errln("Error with ThaiDictionaryBreakIterator backward iteration test at " + position + ".\nShould have been " + result[index-1]);\r
537            }\r
538        }\r
539        \r
540        //Test invalid sequence and spaces\r
541        char text2[] = {\r
542                0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B, \r
543                0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19, \r
544                0x0E16, 0x0E49, 0x0E33\r
545        };\r
546        int expectedWordResult[] = {\r
547                2, 3, 6, 10, 11, 15, 17, 20, 22\r
548        };\r
549        int expectedLineResult[] = {\r
550                3, 6, 11, 15, 17, 20, 22\r
551        };\r
552        BreakIterator brk = BreakIterator.getWordInstance(new ULocale("th"));\r
553        brk.setText(new String(text2));\r
554        position = index = 0;\r
555        while ((position = brk.next()) != BreakIterator.DONE && position < text2.length) {\r
556            if (position != expectedWordResult[index++]) {\r
557                errln("Incorrect break given by thai word break iterator. Expected: " + expectedWordResult[index-1] + " Got: " + position);\r
558            }\r
559        }\r
560       \r
561        brk = BreakIterator.getLineInstance(new ULocale("th"));\r
562        brk.setText(new String(text2));\r
563        position = index = 0;\r
564        while ((position = brk.next()) != BreakIterator.DONE && position < text2.length) {\r
565            if (position != expectedLineResult[index++]) {\r
566                errln("Incorrect break given by thai line break iterator. Expected: " + expectedLineResult[index-1] + " Got: " + position);\r
567            }\r
568        }\r
569    }\r
570   \r
571     public void TestTailoredBreaks() {\r
572         class TBItem {\r
573             private int     type;\r
574             private ULocale locale;\r
575             private String  text;\r
576             private int[]   expectOffsets;\r
577             TBItem(int typ, ULocale loc, String txt, int[] eOffs) {\r
578                 type          = typ;\r
579                 locale        = loc;\r
580                 text          = txt;\r
581                 expectOffsets = eOffs;\r
582             }\r
583             private static final int maxOffsetCount = 128;\r
584             private boolean offsetsMatchExpected(int[] foundOffsets, int foundOffsetsLength) {\r
585                 if ( foundOffsetsLength != expectOffsets.length ) {\r
586                     return false;\r
587                 }\r
588                 for (int i = 0; i < foundOffsetsLength; i++) {\r
589                     if ( foundOffsets[i] != expectOffsets[i] ) {\r
590                         return false;\r
591                     }\r
592                 }\r
593                 return true;\r
594             }\r
595             private String formatOffsets(int[] offsets, int length) {\r
596                 StringBuffer buildString = new StringBuffer(4*maxOffsetCount);\r
597                 for (int i = 0; i < length; i++) {\r
598                     buildString.append(" " + offsets[i]);\r
599                 }\r
600                 return buildString.toString();\r
601             }\r
602             public void doTest() {\r
603                 BreakIterator brkIter;\r
604                 switch( type ) {\r
605                     case BreakIterator.KIND_CHARACTER: brkIter = BreakIterator.getCharacterInstance(locale); break;\r
606                     case BreakIterator.KIND_WORD:      brkIter = BreakIterator.getWordInstance(locale); break;\r
607                     case BreakIterator.KIND_LINE:      brkIter = BreakIterator.getLineInstance(locale); break;\r
608                     case BreakIterator.KIND_SENTENCE:  brkIter = BreakIterator.getSentenceInstance(locale); break;\r
609                     default: errln("Unsupported break iterator type " + type); return;\r
610                 }\r
611                 brkIter.setText(text);\r
612                 int[] foundOffsets = new int[maxOffsetCount];\r
613                 int offset, foundOffsetsCount = 0;\r
614                 // do forwards iteration test\r
615                 while ( foundOffsetsCount < maxOffsetCount && (offset = brkIter.next()) != BreakIterator.DONE ) {\r
616                     foundOffsets[foundOffsetsCount++] = offset;\r
617                 }\r
618                 if ( !offsetsMatchExpected(foundOffsets, foundOffsetsCount) ) {\r
619                     // log error for forwards test\r
620                     String textToDisplay = (text.length() <= 16)? text: text.substring(0,16);\r
621                     errln("For type " + type + " " + locale + ", text \"" + textToDisplay + "...\"" +\r
622                             "; expect " + expectOffsets.length + " offsets:" + formatOffsets(expectOffsets, expectOffsets.length) +\r
623                             "; found " + foundOffsetsCount + " offsets fwd:" + formatOffsets(foundOffsets, foundOffsetsCount) );\r
624                 } else {\r
625                     // do backwards iteration test\r
626                     --foundOffsetsCount; // back off one from the end offset\r
627                     while ( foundOffsetsCount > 0 ) {\r
628                         offset = brkIter.previous();\r
629                         if ( offset != foundOffsets[--foundOffsetsCount] ) {\r
630                             // log error for backwards test\r
631                             String textToDisplay = (text.length() <= 16)? text: text.substring(0,16);\r
632                             errln("For type " + type + " " + locale + ", text \"" + textToDisplay + "...\"" +\r
633                                     "; expect " + expectOffsets.length + " offsets:" + formatOffsets(expectOffsets, expectOffsets.length) +\r
634                                     "; found rev offset " + offset + " where expect " + foundOffsets[foundOffsetsCount] );\r
635                             break;\r
636                         }\r
637                     }\r
638                 }\r
639             }\r
640         }\r
641         // KIND_WORD "en_US_POSIX"\r
642         final String posxWordText     = "Can't have breaks in xx:yy or struct.field for CS-types.";\r
643         final int[]  posxWordTOffsets = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 };\r
644         final int[]  posxWordROffsets = { 5, 6, 10, 11, 17, 18, 20, 21,         26, 27, 29, 30,         42, 43, 46, 47, 49, 50, 55, 56 };\r
645         // KIND_WORD "ja"\r
646         final String jaWordText     = "\u79C1\u9054\u306B\u4E00\u3007\u3007\u3007\u306E\u30B3\u30F3\u30D4\u30E5\u30FC\u30BF" +\r
647                                       "\u304C\u3042\u308B\u3002\u5948\u3005\u306F\u30EF\u30FC\u30C9\u3067\u3042\u308B\u3002";\r
648         final int[]  jaWordTOffsets = {    2, 3,          7, 8, 14,         17, 18,     20, 21, 24,         27, 28 };\r
649         final int[]  jaWordROffsets = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 };\r
650         // KIND_SENTENCE "el"\r
651         final String elSentText     = "\u0391\u03B2, \u03B3\u03B4; \u0395 \u03B6\u03B7\u037E \u0398 \u03B9\u03BA. " +\r
652                                       "\u039B\u03BC \u03BD\u03BE! \u039F\u03C0, \u03A1\u03C2? \u03A3";\r
653         final int[]  elSentTOffsets = { 8, 14, 20, 27, 35, 36 };\r
654         final int[]  elSentROffsets = {        20, 27, 35, 36 };\r
655         // KIND_CHARACTER "th"\r
656         final String thCharText     = "\u0E01\u0E23\u0E30\u0E17\u0E48\u0E2D\u0E21\u0E23\u0E08\u0E19\u0E32 " +\r
657                                       "(\u0E2A\u0E38\u0E0A\u0E32\u0E15\u0E34-\u0E08\u0E38\u0E11\u0E32\u0E21\u0E32\u0E28) " +\r
658                                       "\u0E40\u0E14\u0E47\u0E01\u0E21\u0E35\u0E1B\u0E31\u0E0D\u0E2B\u0E32 ";\r
659         final int[]  thCharTOffsets = { 1, 2, 3, 5, 6, 7, 8, 9, 10, 11,\r
660                                         12, 13, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28,\r
661                                         29, 30, 32, 33, 35, 37, 38, 39, 40, 41 };\r
662         final int[]  thCharROffsets = { 1,    3, 5, 6, 7, 8, 9,     11,\r
663                                         12, 13, 15,     17, 19, 20, 22,     24,     26, 27, 28,\r
664                                         29,     32, 33, 35, 37, 38,     40, 41 };\r
665         \r
666         final TBItem[] tests = {\r
667             new TBItem( BreakIterator.KIND_WORD,      new ULocale("en_US_POSIX"), posxWordText, posxWordTOffsets ),\r
668             new TBItem( BreakIterator.KIND_WORD,      ULocale.ROOT,               posxWordText, posxWordROffsets ),\r
669             new TBItem( BreakIterator.KIND_WORD,      new ULocale("ja"),          jaWordText,   jaWordTOffsets   ),\r
670             new TBItem( BreakIterator.KIND_WORD,      ULocale.ROOT,               jaWordText,   jaWordROffsets   ),\r
671             new TBItem( BreakIterator.KIND_SENTENCE,  new ULocale("el"),          elSentText,   elSentTOffsets   ),\r
672             new TBItem( BreakIterator.KIND_SENTENCE,  ULocale.ROOT,               elSentText,   elSentROffsets   ),\r
673             new TBItem( BreakIterator.KIND_CHARACTER, new ULocale("th"),          thCharText,   thCharTOffsets   ),\r
674             new TBItem( BreakIterator.KIND_CHARACTER, ULocale.ROOT,               thCharText,   thCharROffsets   ),\r
675         };\r
676         for (int iTest = 0; iTest < tests.length; iTest++) {\r
677             tests[iTest].doTest();\r
678         }\r
679     }\r
680 \r
681 }\r