2 * Created on May 5, 2004
4 * Copyright (C) 2004-2013 International Business Machines Corporation and others.
8 package com.ibm.icu.dev.test.rbbi;
10 import java.io.IOException;
11 import java.io.InputStream;
12 import java.io.InputStreamReader;
13 import java.util.Arrays;
15 import com.ibm.icu.dev.test.TestFmwk;
16 import com.ibm.icu.impl.Utility;
17 import com.ibm.icu.lang.UCharacter;
18 import com.ibm.icu.text.BreakIterator;
19 import com.ibm.icu.text.UTF16;
20 import com.ibm.icu.util.ULocale;
24 * Rule based break iterator data driven test.
25 * Perform the tests from the file rbbitst.txt.
26 * The test data file is common to both ICU4C and ICU4J.
27 * See the data file for a description of the tests.
30 public class RBBITestExtended extends TestFmwk {
/**
 * Command-line entry point: creates a new RBBITestExtended and passes the
 * command-line arguments to its run() method.
 *
 * @param args command-line arguments, handed unchanged to run()
 * @throws Exception declared by this method; nothing is caught here
 */
32 public static void main(String[] args)throws Exception {
33 new RBBITestExtended().run(args);
/** Public no-argument constructor (used by main()); its body is not visible in this listing. */
37 public RBBITestExtended() {
/**
 * Mutable holder for the state of one break-iterator check: the text to break,
 * the expected break positions, and the test-file position each text offset came from.
 * The three int arrays are indexed by offset into dataToBreak and have a fixed
 * capacity of 1000 entries.
 */
42 static class TestParams {
// Text to run the break iterator over.
44 StringBuffer dataToBreak = new StringBuffer();
// Per-offset expectation: 0 = no break expected, -1 = break marked by a bullet or "<>",
// any other value = the numeric tag value recorded for the break at that offset.
45 int[] expectedBreaks = new int[1000];
// Line and column in the test data file for each offset; used in error messages.
46 int[] srcLine = new int[1000];
47 int[] srcCol = new int[1000];
// Locale passed to the BreakIterator factory methods; starts out as en_US.
48 ULocale currentLocale = new ULocale("en_US");
/**
 * Data-driven break iterator test. Reads the resource rbbitst.txt as UTF-8 and walks
 * its text with a small state machine (PARSE_COMMENT, PARSE_TAG, PARSE_DATA,
 * PARSE_NUM), filling a TestParams with the data text, the expected breaks and the
 * source line/column of every data offset. Problems in the test file are reported
 * through errln().
 *
 * NOTE(review): the number at the start of each line looks like an original line
 * number, and the numbering skips values, so some statements of this method are
 * probably missing from this listing -- confirm against the original file.
 */
52 public void TestExtended() {
53 TestParams tp = new TestParams();
57 // Open and read the test data file.
59 InputStreamReader isr = null;
60 StringBuffer testFileBuf = new StringBuffer();
// The resource name has no leading slash, so it is resolved relative to this class's package.
62 InputStream is = RBBITestExtended.class.getResourceAsStream("rbbitst.txt");
64 errln("Could not open test data file rbbitst.txt");
// The test data is decoded as UTF-8.
67 isr = new InputStreamReader(is, "UTF-8");
76 if (c==0xFEFF && count==1) {
77 // BOM in the test data file. Discard it.
// Append c to the buffer that accumulates the file contents.
81 UTF16.append(testFileBuf, c);
84 } catch (IOException e) {
// From here on the parser works on the accumulated contents as a single String.
89 String testString = testFileBuf.toString();
// Parser states.
92 final int PARSE_COMMENT = 1;
93 final int PARSE_TAG = 2;
94 final int PARSE_DATA = 3;
95 final int PARSE_NUM = 4;
// Parsing starts out expecting a tag.
97 int parseState = PARSE_TAG;
// State to return to when a comment ends at a line break.
99 int savedState = PARSE_TAG;
// Characters with special meaning in the test file.
101 final char CH_LF = 0x0a;
102 final char CH_CR = 0x0d;
103 final char CH_HASH = 0x23;
104 /*static const UChar CH_PERIOD = 0x2e;*/
105 final char CH_LT = 0x3c;
106 final char CH_GT = 0x3e;
107 final char CH_BACKSLASH = 0x5c;
108 final char CH_BULLET = 0x2022;
116 int tagValue = 0; // The numeric value of a <nnn> tag.
117 int len = testString.length();
// Main scan loop. The for statement has no increment clause, so charIdx is advanced in the body.
119 for (charIdx = 0; charIdx < len; ) {
120 int c = UTF16.charAt(testString, charIdx);
122 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
123 // treat CRLF as a unit
127 if (c == CH_LF || c == CH_CR) {
// Column of the current position, measured from colStart.
131 column = charIdx - colStart + 1;
133 switch (parseState) {
// Comment state: a line break ends the comment and restores the saved state.
135 if (c == 0x0a || c == 0x0d) {
136 parseState = savedState;
// Tag state: enter a comment and come back to tag parsing afterwards.
143 parseState = PARSE_COMMENT;
144 savedState = PARSE_TAG;
147 if (UCharacter.isWhitespace(c)) {
// Break-type tags: each one replaces tp.bi with a new iterator for the current locale.
// The tags are matched at charIdx-1; presumably charIdx has already moved past c -- confirm.
150 if (testString.startsWith("<word>", charIdx-1)) {
151 tp.bi = BreakIterator.getWordInstance(tp.currentLocale);
155 if (testString.startsWith("<char>", charIdx-1)) {
156 tp.bi = BreakIterator.getCharacterInstance(tp.currentLocale);
160 if (testString.startsWith("<line>", charIdx-1)) {
161 tp.bi = BreakIterator.getLineInstance(tp.currentLocale);
165 if (testString.startsWith("<sent>", charIdx-1)) {
166 tp.bi = BreakIterator.getSentenceInstance(tp.currentLocale);
170 if (testString.startsWith("<title>", charIdx-1)) {
171 tp.bi = BreakIterator.getTitleInstance(tp.currentLocale);
// <locale NAME>: the text up to the closing '>' is trimmed and becomes the current locale.
175 if (testString.startsWith("<locale ", charIdx-1)) {
176 int closeIndex = testString.indexOf(">", charIdx);
177 if (closeIndex < 0) {
// NOTE(review): "line" and the line number are concatenated without a separating space.
178 errln("line" + lineNum + ": missing close on <locale tag.");
181 String localeName = testString.substring(charIdx+6, closeIndex);
182 localeName = localeName.trim();
183 tp.currentLocale = new ULocale(localeName);
// Resume scanning just after the closing '>'.
184 charIdx = closeIndex+1;
// <data>: switch to data parsing and reset the accumulated text and all per-offset tables.
187 if (testString.startsWith("<data>", charIdx-1)) {
188 parseState = PARSE_DATA;
190 tp.dataToBreak.setLength(0);
191 Arrays.fill(tp.expectedBreaks, 0);
192 Arrays.fill(tp.srcCol, 0);
193 Arrays.fill(tp.srcLine, 0);
// Nothing above matched while a tag was expected.
197 errln("line" + lineNum + ": Tag expected in test file.");
199 //parseState = PARSE_COMMENT;
200 //savedState = PARSE_DATA;
// Data state: a bullet marks an expected break at the current end of the data.
// -1 is stored as its value, together with the source position of the marker.
204 if (c == CH_BULLET) {
205 int breakIdx = tp.dataToBreak.length();
206 tp.expectedBreaks[breakIdx] = -1;
207 tp.srcLine[breakIdx] = lineNum;
208 tp.srcCol[breakIdx] = column;
212 if (testString.startsWith("</data>", charIdx-1)) {
213 // Add final entry to mappings from break location to source file position.
214 // Need one extra because last break position returned is after the
215 // last char in the data, not at the last char.
216 int idx = tp.dataToBreak.length();
217 tp.srcLine[idx] = lineNum;
218 tp.srcCol[idx] = column;
// End of the data section: go back to expecting tags.
220 parseState = PARSE_TAG;
228 if (testString.startsWith("\\N{", charIdx-1)) {
229 int nameEndIdx = testString.indexOf('}', charIdx);
230 if (nameEndIdx == -1) {
231 errln("Error in named character in test file at line " + lineNum +
234 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
235 // Get the code point from the name and insert it into the test data.
236 String charName = testString.substring(charIdx+2, nameEndIdx);
237 c = UCharacter.getCharFromName(charName);
239 errln("Error in named character in test file at line " + lineNum +
242 // Named code point was recognized. Insert it
243 // into the test data.
244 UTF16.append(tp.dataToBreak, c);
// Walk back from the last data offset, filling in source positions that are still 0 (unset).
245 for (i = tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
246 tp.srcLine[i] = lineNum;
247 tp.srcCol[i] = column;
// Continue scanning after the closing brace of the name.
251 if (nameEndIdx > charIdx) {
252 charIdx = nameEndIdx+1;
// "<>" marks an expected break and is recorded as -1, the same as a bullet.
257 if (testString.startsWith("<>", charIdx-1)) {
259 int breakIdx = tp.dataToBreak.length();
260 tp.expectedBreaks[breakIdx] = -1;
261 tp.srcLine[breakIdx] = lineNum;
262 tp.srcCol[breakIdx] = column;
// Switch to collecting a numeric tag value.
268 parseState = PARSE_NUM;
// A '#' inside data only starts a comment when column is 3; afterwards data parsing resumes.
272 if (c == CH_HASH && column==3) { // TODO: why is column off so far?
273 parseState = PARSE_COMMENT;
274 savedState = PARSE_DATA;
278 if (c == CH_BACKSLASH) {
279 // Check for \ at end of line, a line continuation.
280 // Advance over (discard) the newline
281 int cp = UTF16.charAt(testString, charIdx);
// NOTE(review): charIdx<len does not guarantee that charIdx+1 is a valid index for the
// charAt() call below (e.g. a CR as the very last character) -- confirm.
282 if (cp == CH_CR && charIdx<len && UTF16.charAt(testString, charIdx+1) == CH_LF) {
284 // Need an extra increment of the input ptr to move over both of them
287 if (cp == CH_LF || cp == CH_CR) {
295 // Let unescape handle the back slash.
// unescapeAt() takes the offset in a one-element array; the element is read back below
// as the new charIdx.
296 int charIdxAr[] = new int[1];
297 charIdxAr[0] = charIdx;
298 cp = Utility.unescapeAt(testString, charIdxAr);
300 // Escape sequence was recognized. Insert the char
301 // into the test data.
302 charIdx = charIdxAr[0];
303 UTF16.append(tp.dataToBreak, cp);
// Fill in source positions that are still 0 (unset), walking back from the last offset.
304 for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
305 tp.srcLine[i] = lineNum;
306 tp.srcCol[i] = column;
313 // Not a recognized backslash escape sequence.
314 // Take the next char as a literal.
315 // TODO: Should this be an error?
316 c = UTF16.charAt(testString,charIdx);
317 charIdx = UTF16.moveCodePointOffset(testString, charIdx, 1);
320 // Normal, non-escaped data char.
321 UTF16.append(tp.dataToBreak, c);
323 // Save the mapping from offset in the data to line/column numbers in
324 // the original input file. Will be used for better error messages only.
325 // If there's an expected break before this char, the slot in the mapping
326 // vector will already be set for this char; don't overwrite it.
327 for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
328 tp.srcLine[i] = lineNum;
329 tp.srcCol[i] = column;
335 // We are parsing an expected numeric tag value, like <1234>,
336 // within a chunk of data.
337 if (UCharacter.isWhitespace(c)) {
342 // Finished the number. Add the info to the expected break data,
343 // and switch parse state back to doing plain data.
344 parseState = PARSE_DATA;
// The break is recorded at the current end of the data with the parsed tag value.
348 int breakIdx = tp.dataToBreak.length();
349 tp.expectedBreaks[breakIdx] = tagValue;
350 tp.srcLine[breakIdx] = lineNum;
351 tp.srcCol[breakIdx] = column;
// Accumulate the tag value as a decimal number, one digit at a time.
355 if (UCharacter.isDigit(c)) {
356 tagValue = tagValue*10 + UCharacter.digit(c);
// NOTE(review): the "%d" in this message is printed literally in front of the column value;
// it looks like a leftover printf-style placeholder.
360 errln("Syntax Error in test file at line "+ lineNum +", col %d" + column);
363 // parseState = PARSE_COMMENT; // TODO: unreachable. Don't stop on errors.
/**
 * Checks the break iterator t.bi against the expectations recorded in t.
 * Sets the iterator's text to t.dataToBreak and then compares, in turn, forward
 * iteration (first()/next()), reverse iteration (last()/previous()), isBoundary(),
 * following() and preceding() with t.expectedBreaks, where a nonzero entry marks an
 * expected break. Every mismatch is reported through errln() together with the
 * test-file line and column taken from t.srcLine and t.srcCol.
 *
 * NOTE(review): the number at the start of each line looks like an original line
 * number, and the numbering skips values, so some statements of this method are
 * probably missing from this listing -- confirm against the original file.
 *
 * @param t the break iterator, text to break, expected breaks and source positions to check
 */
372 void executeTest(TestParams t) {
381 t.bi.setText(t.dataToBreak.toString());
383 // Run the iterator forward
// The loop ends when the iterator returns BreakIterator.DONE.
386 for (bp = t.bi.first(); bp != BreakIterator.DONE; bp = t.bi.next()) {
388 // Fail for lack of forward progress.
389 errln("Forward Iteration, no forward progress. Break Pos=" + bp +
390 " File line,col=" + t.srcLine[bp] + ", " + t.srcCol[bp]);
394 // Check that we didn't miss an expected break between the last one
// Only the offsets strictly between the previous break and this one are examined.
396 for (i=prevBP+1; i<bp; i++) {
397 if (t.expectedBreaks[i] != 0) {
398 errln("Forward Iteration, break expected, but not found. Pos=" + i +
399 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
403 // Check that the break we did find was expected
404 if (t.expectedBreaks[bp] == 0) {
405 errln("Forward Iteration, break found, but not expected. Pos=" + bp +
406 " File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
408 // The break was expected.
409 // Check that the {nnn} tag value is correct.
410 int expectedTagVal = t.expectedBreaks[bp];
// -1 is a distinct expectation value; what is done for it is not visible in this listing.
411 if (expectedTagVal == -1) {
414 int line = t.srcLine[bp];
415 int rs = t.bi.getRuleStatus();
416 if (rs != expectedTagVal) {
417 errln("Incorrect status for forward break. Pos = " + bp +
418 ". File line,col = " + line + ", " + t.srcCol[bp] + "\n" +
419 " Actual, Expected status = " + rs + ", " + expectedTagVal);
// The status vector must contain at least one value, and its first entry must equal the
// expected tag value.
421 int[] fillInArray = new int[4];
422 int numStatusVals = t.bi.getRuleStatusVec(fillInArray);
423 assertTrue("", numStatusVals >= 1);
424 assertEquals("", expectedTagVal, fillInArray[0]);
431 // Verify that there were no missed expected breaks after the last one found
// Covers offsets up to and including dataToBreak.length().
432 for (i=prevBP+1; i<t.dataToBreak.length()+1; i++) {
433 if (t.expectedBreaks[i] != 0) {
434 errln("Forward Iteration, break expected, but not found. Pos=" + i +
435 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
441 // Run the iterator backwards, verify that the same breaks are found.
443 prevBP = t.dataToBreak.length()+2; // start with a phony value for the last break pos seen.
444 for (bp = t.bi.last(); bp != BreakIterator.DONE; bp = t.bi.previous()) {
446 // Fail for lack of progress.
447 errln("Reverse Iteration, no progress. Break Pos=" + bp +
448 "File line,col=" + t.srcLine[bp] + " " + t.srcCol[bp]);
452 // Check that we didn't miss an expected break between the last one
453 // and this one. (UVector returns zeros for index out of bounds.)
// NOTE(review): the UVector remark above does not describe a Java array; expectedBreaks is
// an int[] here, so the indexes used below have to stay within its length.
// NOTE(review): "Itertion" in the reverse-iteration messages is a typo for "Iteration".
454 for (i=prevBP-1; i>bp; i--) {
455 if (t.expectedBreaks[i] != 0) {
456 errln("Reverse Itertion, break expected, but not found. Pos=" + i +
457 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
461 // Check that the break we did find was expected
462 if (t.expectedBreaks[bp] == 0) {
463 errln("Reverse Itertion, break found, but not expected. Pos=" + bp +
464 " File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
466 // The break was expected.
467 // Check that the {nnn} tag value is correct.
468 int expectedTagVal = t.expectedBreaks[bp];
469 if (expectedTagVal == -1) {
472 int line = t.srcLine[bp];
473 int rs = t.bi.getRuleStatus();
474 if (rs != expectedTagVal) {
475 errln("Incorrect status for reverse break. Pos= " + bp +
476 "File line,col= " + line + ", " + t.srcCol[bp] + "\n" +
477 " Actual, Expected status = " + rs + ", " + expectedTagVal);
484 // Verify that there were no missed breaks prior to the last one found
485 for (i=prevBP-1; i>=0; i--) {
486 if (t.expectedBreaks[i] != 0) {
487 errln("Reverse Itertion, break expected, but not found. Pos=" + i +
488 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
491 // Check isBoundary()
// Every offset from 0 through dataToBreak.length() is queried; a nonzero expectedBreaks entry
// means a boundary is expected there.
492 for (i=0; i<=t.dataToBreak.length(); i++) {
493 boolean boundaryExpected = (t.expectedBreaks[i] != 0);
494 boolean boundaryFound = t.bi.isBoundary(i);
495 if (boundaryExpected != boundaryFound) {
496 errln("isBoundary(" + i + ") incorrect.\n" +
497 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
498 " Expected, Actual= " + boundaryExpected + ", " + boundaryFound);
// Check following(): the expected value starts as DONE and the search looks at offsets
// after i; presumably the first nonzero entry found replaces it (assignment not shown).
503 for (i=0; i<=t.dataToBreak.length(); i++) {
504 int actualBreak = t.bi.following(i);
505 int expectedBreak = BreakIterator.DONE;
506 for (int j=i+1; j < t.expectedBreaks.length; j++) {
507 if (t.expectedBreaks[j] != 0) {
512 if (expectedBreak != actualBreak) {
513 errln("following(" + i + ") incorrect.\n" +
514 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
515 " Expected, Actual= " + expectedBreak + ", " + actualBreak);
// Check preceding(): same idea, walking i down from the end of the data and searching
// the offsets before i.
520 for (i=t.dataToBreak.length(); i>=0; i--) {
521 int actualBreak = t.bi.preceding(i);
522 int expectedBreak = BreakIterator.DONE;
524 for (int j=i-1; j >= 0; j--) {
525 if (t.expectedBreaks[j] != 0) {
530 if (expectedBreak != actualBreak) {
531 errln("preceding(" + i + ") incorrect.\n" +
532 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
533 " Expected, Actual= " + expectedBreak + ", " + actualBreak);