2 * Created on May 5, 2004
\r
4 * Copyright (C) 2004-2010 International Business Machines Corporation and others.
\r
5 * All Rights Reserved.
\r
8 package com.ibm.icu.dev.test.rbbi;
\r
10 import java.io.IOException;
\r
11 import java.io.InputStream;
\r
12 import java.io.InputStreamReader;
\r
13 import java.util.Arrays;
\r
15 import com.ibm.icu.dev.test.TestFmwk;
\r
16 import com.ibm.icu.impl.Utility;
\r
17 import com.ibm.icu.lang.UCharacter;
\r
18 import com.ibm.icu.text.BreakIterator;
\r
19 import com.ibm.icu.text.RuleBasedBreakIterator;
\r
20 import com.ibm.icu.text.UTF16;
\r
21 import com.ibm.icu.util.ULocale;
\r
25 * Rule based break iterator data driven test.
\r
26 * Perform the tests from the file rbbitst.txt.
\r
27 * The test data file is common to both ICU4C and ICU4J.
\r
28 * See the data file for a description of the tests.
\r
31 public class RBBITestExtended extends TestFmwk {
\r
// Command-line entry point: creates an RBBITestExtended and passes the arguments to its run() method.
33 public static void main(String[] args)throws Exception {
\r
34 new RBBITestExtended().run(args);
\r
// No-argument constructor. Its body is not visible in this excerpt.
38 public RBBITestExtended() {
\r
// Holds the state of one data-driven test case while the test file is being parsed.
// The parser also reads and writes a break iterator through "tp.bi"; that field's
// declaration is not visible in this excerpt.
43 static class TestParams {
\r
// Text of the current <data> section, with markup removed; this is the string handed to the iterator.
45 StringBuffer dataToBreak = new StringBuffer();
\r
// Indexed by offset into dataToBreak: 0 = no break expected, -1 = break expected with
// rule status 0, any other value = break expected with that rule status.
// NOTE(review): fixed size of 1000; the parser indexes it with dataToBreak.length(), so a
// test case longer than 999 chars would overflow - confirm the data file never exceeds this.
46 int[] expectedBreaks = new int[1000];
\r
// Line number in the test data file for each offset in dataToBreak (0 = not yet recorded).
47 int[] srcLine = new int[1000];
\r
// Column in the test data file for each offset in dataToBreak; used only in error messages.
48 int[] srcCol = new int[1000];
\r
// Locale used when creating break iterators; replaced when a <locale ...> tag is parsed.
49 ULocale currentLocale = new ULocale("en_US");
\r
// Data-driven test: loads rbbitst.txt from the class path, then walks it one code point at a
// time with a small state machine (PARSE_TAG / PARSE_DATA / PARSE_NUM / PARSE_COMMENT) that
// fills in a TestParams with the text to break and the expected break positions.
// NOTE(review): many lines of this method (braces, declarations, some conditions, and the
// call that runs each collected test case) are missing from this excerpt.
53 public void TestExtended() {
\r
55 TestParams tp = new TestParams();
\r
59 // Open and read the test data file.
\r
61 InputStreamReader isr = null;
\r
62 StringBuffer testFileBuf = new StringBuffer();
\r
64 InputStream is = RBBITestExtended.class.getResourceAsStream("rbbitst.txt");
\r
// The condition guarding this error report is not visible here (presumably a null check on "is").
66 errln("Could not open test data file rbbitst.txt");
\r
// The data file is decoded as UTF-8.
// NOTE(review): no close() of isr/is is visible in this excerpt - confirm the stream is released.
69 isr = new InputStreamReader(is, "UTF-8");
\r
// U+FEFF as the very first character read is a byte order mark, not test data.
78 if (c==0xFEFF && count==1) {
\r
79 // BOM in the test data file. Discard it.
\r
// Append the code point (as one or two UTF-16 units) to the in-memory copy of the file.
83 UTF16.append(testFileBuf, c);
\r
86 } catch (IOException e) {
\r
87 errln(e.toString());
\r
91 String testString = testFileBuf.toString();
\r
// Parser states. PARSE_COMMENT skips to end of line and then resumes savedState.
94 final int PARSE_COMMENT = 1;
\r
95 final int PARSE_TAG = 2;
\r
96 final int PARSE_DATA = 3;
\r
97 final int PARSE_NUM = 4;
\r
99 int parseState = PARSE_TAG;
\r
// State to return to when a comment ends.
101 int savedState = PARSE_TAG;
\r
// Characters with special meaning in the test data file.
103 final char CH_LF = 0x0a;
\r
104 final char CH_CR = 0x0d;
\r
105 final char CH_HASH = 0x23;
\r
106 /*static const UChar CH_PERIOD = 0x2e;*/
\r
107 final char CH_LT = 0x3c;
\r
108 final char CH_GT = 0x3e;
\r
109 final char CH_BACKSLASH = 0x5c;
\r
// U+2022 BULLET marks an expected break position inside a <data> section.
110 final char CH_BULLET = 0x2022;
\r
118 int tagValue = 0; // The numeric value of a <nnn> tag.
\r
119 int len = testString.length();
\r
// Main scan loop. The increment of charIdx is not in the for-header; the statement that
// advances it is not visible in this excerpt. The startsWith(..., charIdx-1) calls below
// imply charIdx already points one past the current character c when the switch runs.
121 for (charIdx = 0; charIdx < len; ) {
\r
122 int c = UTF16.charAt(testString, charIdx);
\r
124 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
\r
125 // treat CRLF as a unit
\r
// A line ending resets the column origin used for error reporting.
129 if (c == CH_LF || c == CH_CR) {
\r
131 colStart = charIdx;
\r
// 1-based column of the current position within its line.
133 column = charIdx - colStart + 1;
\r
135 switch (parseState) {
\r
136 case PARSE_COMMENT:
\r
// End of line ends the comment; go back to whatever state we were in before it.
137 if (c == 0x0a || c == 0x0d) {
\r
138 parseState = savedState;
\r
// '#' starts a comment while looking for a tag.
144 if (c == CH_HASH) {
\r
145 parseState = PARSE_COMMENT;
\r
146 savedState = PARSE_TAG;
\r
149 if (UCharacter.isWhitespace(c)) {
\r
// Each of the following tags selects the kind of break iterator under test,
// created for the current locale.
152 if (testString.startsWith("<word>", charIdx-1)) {
\r
153 tp.bi = BreakIterator.getWordInstance(tp.currentLocale);
\r
157 if (testString.startsWith("<char>", charIdx-1)) {
\r
158 tp.bi = BreakIterator.getCharacterInstance(tp.currentLocale);
\r
162 if (testString.startsWith("<line>", charIdx-1)) {
\r
163 tp.bi = BreakIterator.getLineInstance(tp.currentLocale);
\r
167 if (testString.startsWith("<sent>", charIdx-1)) {
\r
168 tp.bi = BreakIterator.getSentenceInstance(tp.currentLocale);
\r
172 if (testString.startsWith("<title>", charIdx-1)) {
\r
173 tp.bi = BreakIterator.getTitleInstance(tp.currentLocale);
\r
// <locale name> changes the locale used for break iterators created afterwards.
177 if (testString.startsWith("<locale ", charIdx-1)) {
\r
178 int closeIndex = testString.indexOf(">", charIdx);
\r
179 if (closeIndex < 0) {
\r
180 errln("line" + lineNum + ": missing close on <locale tag.");
\r
// The tag starts at charIdx-1, so charIdx+6 is the space after "<locale";
// the trim() below removes it along with any other surrounding whitespace.
183 String localeName = testString.substring(charIdx+6, closeIndex);
\r
184 localeName = localeName.trim();
\r
185 tp.currentLocale = new ULocale(localeName);
\r
// Resume scanning just after the closing '>'.
186 charIdx = closeIndex+1;
\r
// <data> begins a new test case: switch state and clear everything left from the previous one.
189 if (testString.startsWith("<data>", charIdx-1)) {
\r
190 parseState = PARSE_DATA;
\r
192 tp.dataToBreak.setLength(0);
\r
193 Arrays.fill(tp.expectedBreaks, 0);
\r
194 Arrays.fill(tp.srcCol, 0);
\r
195 Arrays.fill(tp.srcLine, 0);
\r
199 errln("line" + lineNum + ": Tag expected in test file.");
\r
201 //parseState = PARSE_COMMENT;
\r
202 //savedState = PARSE_DATA;
\r
// A bullet marks an expected break (status 0, stored as -1) at the current data offset.
206 if (c == CH_BULLET) {
\r
207 int breakIdx = tp.dataToBreak.length();
\r
208 tp.expectedBreaks[breakIdx] = -1;
\r
209 tp.srcLine[breakIdx] = lineNum;
\r
210 tp.srcCol[breakIdx] = column;
\r
214 if (testString.startsWith("</data>", charIdx-1)) {
\r
215 // Add final entry to mappings from break location to source file position.
\r
216 // Need one extra because last break position returned is after the
\r
217 // last char in the data, not at the last char.
\r
218 int idx = tp.dataToBreak.length();
\r
219 tp.srcLine[idx] = lineNum;
\r
220 tp.srcCol[idx] = column;
\r
222 parseState = PARSE_TAG;
\r
230 if (testString.startsWith("\\N{", charIdx-1)) {
\r
231 int nameEndIdx = testString.indexOf('}', charIdx);
\r
232 if (nameEndIdx == -1) {
\r
233 errln("Error in named character in test file at line " + lineNum +
\r
234 ", col " + column);
\r
236 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
\r
237 // Get the code point from the name and insert it into the test data.
\r
// The escape starts at charIdx-1, so the name itself begins at charIdx+2.
238 String charName = testString.substring(charIdx+2, nameEndIdx);
\r
239 c = UCharacter.getCharFromName(charName);
\r
241 errln("Error in named character in test file at line " + lineNum +
\r
242 ", col " + column);
\r
244 // Named code point was recognized. Insert it
\r
245 // into the test data.
\r
246 UTF16.append(tp.dataToBreak, c);
\r
// Record the source position for every newly added data offset that has none yet
// (walks backwards until it reaches an offset whose line is already set).
247 for (i = tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
\r
248 tp.srcLine[i] = lineNum;
\r
249 tp.srcCol[i] = column;
\r
// Skip past the closing '}' of the named-character escape.
253 if (nameEndIdx > charIdx) {
\r
254 charIdx = nameEndIdx+1;
\r
// "<>" is an alternative way to mark an expected break with status 0.
259 if (testString.startsWith("<>", charIdx-1)) {
\r
261 int breakIdx = tp.dataToBreak.length();
\r
262 tp.expectedBreaks[breakIdx] = -1;
\r
263 tp.srcLine[breakIdx] = lineNum;
\r
264 tp.srcCol[breakIdx] = column;
\r
// The condition that switches to number parsing is not visible here
// (presumably a '<' that starts a <nnn> tag).
270 parseState = PARSE_NUM;
\r
274 if (c == CH_HASH && column==3) { // TODO: why is column off so far?
\r
275 parseState = PARSE_COMMENT;
\r
276 savedState = PARSE_DATA;
\r
280 if (c == CH_BACKSLASH) {
\r
281 // Check for \ at end of line, a line continuation.
\r
282 // Advance over (discard) the newline
\r
// NOTE(review): no bounds check is visible before this read; if the backslash is the
// last character of the file, charIdx may equal len here - confirm.
283 int cp = UTF16.charAt(testString, charIdx);
\r
// NOTE(review): the guard tests charIdx<len but the read is at charIdx+1; a CR as the
// final character would make charIdx+1 == len. Looks like it should test charIdx+1<len - confirm.
284 if (cp == CH_CR && charIdx<len && UTF16.charAt(testString, charIdx+1) == CH_LF) {
\r
286 // Need an extra increment of the input ptr to move over both of them
\r
289 if (cp == CH_LF || cp == CH_CR) {
\r
293 colStart = charIdx;
\r
297 // Let unescape handle the back slash.
\r
// unescapeAt takes the position in a one-element array so it can report how far it advanced.
298 int charIdxAr[] = new int[1];
\r
299 charIdxAr[0] = charIdx;
\r
300 cp = Utility.unescapeAt(testString, charIdxAr);
\r
302 // Escape sequence was recognized. Insert the char
\r
303 // into the test data.
\r
304 charIdx = charIdxAr[0];
\r
305 UTF16.append(tp.dataToBreak, cp);
\r
306 for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
\r
307 tp.srcLine[i] = lineNum;
\r
308 tp.srcCol[i] = column;
\r
315 // Not a recognized backslash escape sequence.
\r
316 // Take the next char as a literal.
\r
317 // TODO: Should this be an error?
\r
318 c = UTF16.charAt(testString,charIdx);
\r
319 charIdx = UTF16.moveCodePointOffset(testString, charIdx, 1);
\r
322 // Normal, non-escaped data char.
\r
323 UTF16.append(tp.dataToBreak, c);
\r
325 // Save the mapping from offset in the data to line/column numbers in
\r
326 // the original input file. Will be used for better error messages only.
\r
327 // If there's an expected break before this char, the slot in the mapping
\r
328 // vector will already be set for this char; don't overwrite it.
\r
329 for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
\r
330 tp.srcLine[i] = lineNum;
\r
331 tp.srcCol[i] = column;
\r
337 // We are parsing an expected numeric tag value, like <1234>,
\r
338 // within a chunk of data.
\r
339 if (UCharacter.isWhitespace(c)) {
\r
344 // Finished the number. Add the info to the expected break data,
\r
345 // and switch parse state back to doing plain data.
\r
346 parseState = PARSE_DATA;
\r
// The body of this zero check is not visible in this excerpt.
347 if (tagValue == 0) {
\r
350 int breakIdx = tp.dataToBreak.length();
\r
351 tp.expectedBreaks[breakIdx] = tagValue;
\r
352 tp.srcLine[breakIdx] = lineNum;
\r
353 tp.srcCol[breakIdx] = column;
\r
// Accumulate the decimal value of the tag one digit at a time.
357 if (UCharacter.isDigit(c)) {
\r
358 tagValue = tagValue*10 + UCharacter.digit(c);
\r
// NOTE(review): the literal ", col %d" looks like a leftover printf placeholder; the
// emitted message will contain "%d" immediately followed by the column number.
362 errln("Syntax Error in test file at line "+ lineNum +", col %d" + column);
\r
365 // parseState = PARSE_COMMENT; // TODO: unreachable. Don't stop on errors.
\r
// Runs one parsed test case: sets the collected text on t.bi, iterates forward and then
// backward, and reports (via errln) breaks that were found but not expected, expected but
// not found, or found with a rule status different from the expected tag value.
// NOTE(review): several lines of this method (local declarations, braces, else branches,
// the prevBP updates) are missing from this excerpt.
374 void executeTest(TestParams t) {
\r
// The body of this null check is not visible here (presumably an early return when no iterator is set).
379 if (t.bi == null) {
\r
383 t.bi.setText(t.dataToBreak.toString());
\r
385 // Run the iterator forward
\r
388 for (bp = t.bi.first(); bp != BreakIterator.DONE; bp = t.bi.next()) {
\r
389 if (prevBP == bp) {
\r
390 // Fail for lack of forward progress.
\r
391 errln("Forward Iteration, no forward progress. Break Pos=" + bp +
\r
392 " File line,col=" + t.srcLine[bp] + ", " + t.srcCol[bp]);
\r
396 // Check that we didn't miss an expected break between the previous one and this one.
\r
398 for (i=prevBP+1; i<bp; i++) {
\r
399 if (t.expectedBreaks[i] != 0) {
\r
400 errln("Forward Iteration, break expected, but not found. Pos=" + i +
\r
401 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
\r
405 // Check that the break we did find was expected
\r
406 if (t.expectedBreaks[bp] == 0) {
\r
407 errln("Forward Iteration, break found, but not expected. Pos=" + bp +
\r
408 " File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
\r
410 // The break was expected.
\r
411 // Check that the {nnn} tag value is correct.
\r
412 int expectedTagVal = t.expectedBreaks[bp];
\r
// -1 is the marker for "break expected, status 0".
413 if (expectedTagVal == -1) {
\r
414 expectedTagVal = 0;
\r
416 int line = t.srcLine[bp];
\r
// NOTE(review): unchecked cast; this assumes every iterator stored in t.bi is a
// RuleBasedBreakIterator - confirm for all locales/kinds used by the data file.
417 int rs = ((RuleBasedBreakIterator)t.bi).getRuleStatus();
\r
418 if (rs != expectedTagVal) {
\r
419 errln("Incorrect status for forward break. Pos = " + bp +
\r
420 ". File line,col = " + line + ", " + t.srcCol[bp] + "\n" +
\r
421 " Actual, Expected status = " + rs + ", " + expectedTagVal);
\r
429 // Verify that there were no missed expected breaks after the last one found
\r
// The upper bound is length()+1 because a break may be expected at offset length(),
// i.e. after the last character of the data.
430 for (i=prevBP+1; i<t.dataToBreak.length()+1; i++) {
\r
431 if (t.expectedBreaks[i] != 0) {
\r
432 errln("Forward Iteration, break expected, but not found. Pos=" + i +
\r
433 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
\r
438 // Run the iterator backwards, verify that the same breaks are found.
\r
440 prevBP = t.dataToBreak.length()+2; // start with a phony value for the last break pos seen.
\r
441 for (bp = t.bi.last(); bp != BreakIterator.DONE; bp = t.bi.previous()) {
\r
442 if (prevBP == bp) {
\r
443 // Fail for lack of progress.
\r
444 errln("Reverse Iteration, no progress. Break Pos=" + bp +
\r
445 "File line,col=" + t.srcLine[bp] + " " + t.srcCol[bp]);
\r
449 // Check that we didn't miss an expected break between the previous one
\r
450 // and this one. (The original remark here referred to UVector returning zeros for an out-of-bounds index, presumably carried over from the C++ version; expectedBreaks is a fixed int[1000] here.)
\r
// On the first pass i starts at length()+1, one past the last slot the parser can have set.
451 for (i=prevBP-1; i>bp; i--) {
\r
452 if (t.expectedBreaks[i] != 0) {
\r
// NOTE(review): "Itertion" is misspelled in this and the following reverse-pass messages.
453 errln("Reverse Itertion, break expected, but not found. Pos=" + i +
\r
454 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
\r
458 // Check that the break we did find was expected
\r
459 if (t.expectedBreaks[bp] == 0) {
\r
460 errln("Reverse Itertion, break found, but not expected. Pos=" + bp +
\r
461 " File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
\r
463 // The break was expected.
\r
464 // Check that the {nnn} tag value is correct.
\r
465 int expectedTagVal = t.expectedBreaks[bp];
\r
466 if (expectedTagVal == -1) {
\r
467 expectedTagVal = 0;
\r
469 int line = t.srcLine[bp];
\r
470 int rs = ((RuleBasedBreakIterator)t.bi).getRuleStatus();
\r
471 if (rs != expectedTagVal) {
\r
472 errln("Incorrect status for reverse break. Pos= " + bp +
\r
473 "File line,col= " + line + ", " + t.srcCol[bp] + "\n" +
\r
474 " Actual, Expected status = " + rs + ", " + expectedTagVal);
\r
481 // Verify that there were no missed breaks prior to the last one found
\r
482 for (i=prevBP-1; i>=0; i--) {
\r
483 if (t.expectedBreaks[i] != 0) {
\r
// NOTE(review): this check belongs to the reverse pass, yet the message says "Forward";
// looks like a copy/paste slip - confirm and correct the runtime text.
484 errln("Forward Itertion, break expected, but not found. Pos=" + i +
\r
485 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
\r