2 * Created on May 5, 2004
\r
4 * Copyright (C) 2004-2006 International Business Machines Corporation and others.
\r
5 * All Rights Reserved.
\r
8 package com.ibm.icu.dev.test.rbbi;
\r
10 import com.ibm.icu.dev.test.TestFmwk;
\r
11 import com.ibm.icu.impl.Utility;
\r
12 import com.ibm.icu.text.BreakIterator;
\r
13 import com.ibm.icu.text.RuleBasedBreakIterator;
\r
14 import com.ibm.icu.lang.UCharacter;
\r
15 import com.ibm.icu.text.UTF16;
\r
16 import com.ibm.icu.util.ULocale;
\r
17 import java.io.InputStream;
\r
18 import java.io.InputStreamReader;
\r
19 import java.io.IOException;
\r
20 import java.util.Arrays;
\r
24 * Rule based break iterator data driven test.
\r
25 * Perform the tests from the file rbbitst.txt.
\r
26 * The test data file is common to both ICU4C and ICU4J.
\r
27 * See the data file for a description of the tests.
\r
30 public class RBBITestExtended extends TestFmwk {
\r
32 public static void main(String[] args)throws Exception {
\r
33 new RBBITestExtended().run(args);
\r
37 public RBBITestExtended() {
\r
42 static class TestParams {
\r
44 StringBuffer dataToBreak = new StringBuffer();
\r
45 int[] expectedBreaks = new int[1000];
\r
46 int[] srcLine = new int[1000];
\r
47 int[] srcCol = new int[1000];
\r
48 ULocale currentLocale = new ULocale("en_US");
\r
52 public void TestExtended() {
\r
54 TestParams tp = new TestParams();
\r
58 // Open and read the test data file.
\r
60 InputStreamReader isr = null;
\r
61 StringBuffer testFileBuf = new StringBuffer();
\r
63 InputStream is = RBBITestExtended.class.getResourceAsStream("rbbitst.txt");
\r
65 errln("Could not open test data file rbbitst.txt");
\r
68 isr = new InputStreamReader(is, "UTF-8");
\r
77 if (c==0xFEFF && count==1) {
\r
78 // BOM in the test data file. Discard it.
\r
82 UTF16.append(testFileBuf, c);
\r
85 } catch (IOException e) {
\r
86 errln(e.toString());
\r
90 String testString = testFileBuf.toString();
\r
93 final int PARSE_COMMENT = 1;
\r
94 final int PARSE_TAG = 2;
\r
95 final int PARSE_DATA = 3;
\r
96 final int PARSE_NUM = 4;
\r
98 int parseState = PARSE_TAG;
\r
100 int savedState = PARSE_TAG;
\r
102 final char CH_LF = 0x0a;
\r
103 final char CH_CR = 0x0d;
\r
104 final char CH_HASH = 0x23;
\r
105 /*static const UChar CH_PERIOD = 0x2e;*/
\r
106 final char CH_LT = 0x3c;
\r
107 final char CH_GT = 0x3e;
\r
108 final char CH_BACKSLASH = 0x5c;
\r
109 final char CH_BULLET = 0x2022;
\r
117 int tagValue = 0; // The numeric value of a <nnn> tag.
\r
118 int len = testString.length();
\r
120 for (charIdx = 0; charIdx < len; ) {
\r
121 int c = UTF16.charAt(testString, charIdx);
\r
123 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
\r
124 // treat CRLF as a unit
\r
128 if (c == CH_LF || c == CH_CR) {
\r
130 colStart = charIdx;
\r
132 column = charIdx - colStart + 1;
\r
134 switch (parseState) {
\r
135 case PARSE_COMMENT:
\r
136 if (c == 0x0a || c == 0x0d) {
\r
137 parseState = savedState;
\r
143 if (c == CH_HASH) {
\r
144 parseState = PARSE_COMMENT;
\r
145 savedState = PARSE_TAG;
\r
148 if (UCharacter.isWhitespace(c)) {
\r
151 if (testString.startsWith("<word>", charIdx-1)) {
\r
152 tp.bi = BreakIterator.getWordInstance(tp.currentLocale);
\r
156 if (testString.startsWith("<char>", charIdx-1)) {
\r
157 tp.bi = BreakIterator.getCharacterInstance(tp.currentLocale);
\r
161 if (testString.startsWith("<line>", charIdx-1)) {
\r
162 tp.bi = BreakIterator.getLineInstance(tp.currentLocale);
\r
166 if (testString.startsWith("<sent>", charIdx-1)) {
\r
167 tp.bi = BreakIterator.getSentenceInstance(tp.currentLocale);
\r
171 if (testString.startsWith("<title>", charIdx-1)) {
\r
172 tp.bi = BreakIterator.getTitleInstance(tp.currentLocale);
\r
176 if (testString.startsWith("<locale ", charIdx-1)) {
\r
177 int closeIndex = testString.indexOf(">", charIdx);
\r
178 if (closeIndex < 0) {
\r
179 errln("line" + lineNum + ": missing close on <locale tag.");
\r
182 String localeName = testString.substring(charIdx+6, closeIndex);
\r
183 localeName = localeName.trim();
\r
184 tp.currentLocale = new ULocale(localeName);
\r
185 charIdx = closeIndex+1;
\r
188 if (testString.startsWith("<data>", charIdx-1)) {
\r
189 parseState = PARSE_DATA;
\r
191 tp.dataToBreak.setLength(0);
\r
192 Arrays.fill(tp.expectedBreaks, 0);
\r
193 Arrays.fill(tp.srcCol, 0);
\r
194 Arrays.fill(tp.srcLine, 0);
\r
198 errln("line" + lineNum + ": Tag expected in test file.");
\r
200 //parseState = PARSE_COMMENT;
\r
201 //savedState = PARSE_DATA;
\r
205 if (c == CH_BULLET) {
\r
206 int breakIdx = tp.dataToBreak.length();
\r
207 tp.expectedBreaks[breakIdx] = -1;
\r
208 tp.srcLine[breakIdx] = lineNum;
\r
209 tp.srcCol[breakIdx] = column;
\r
213 if (testString.startsWith("</data>", charIdx-1)) {
\r
214 // Add final entry to mappings from break location to source file position.
\r
215 // Need one extra because last break position returned is after the
\r
216 // last char in the data, not at the last char.
\r
217 int idx = tp.dataToBreak.length();
\r
218 tp.srcLine[idx] = lineNum;
\r
219 tp.srcCol[idx] = column;
\r
221 parseState = PARSE_TAG;
\r
229 if (testString.startsWith("\\N{", charIdx-1)) {
\r
230 int nameEndIdx = testString.indexOf('}', charIdx);
\r
231 if (nameEndIdx == -1) {
\r
232 errln("Error in named character in test file at line " + lineNum +
\r
233 ", col " + column);
\r
235 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
\r
236 // Get the code point from the name and insert it into the test data.
\r
237 String charName = testString.substring(charIdx+2, nameEndIdx);
\r
238 c = UCharacter.getCharFromName(charName);
\r
240 errln("Error in named character in test file at line " + lineNum +
\r
241 ", col " + column);
\r
243 // Named code point was recognized. Insert it
\r
244 // into the test data.
\r
245 UTF16.append(tp.dataToBreak, c);
\r
246 for (i = tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
\r
247 tp.srcLine[i] = lineNum;
\r
248 tp.srcCol[i] = column;
\r
252 if (nameEndIdx > charIdx) {
\r
253 charIdx = nameEndIdx+1;
\r
258 if (testString.startsWith("<>", charIdx-1)) {
\r
260 int breakIdx = tp.dataToBreak.length();
\r
261 tp.expectedBreaks[breakIdx] = -1;
\r
262 tp.srcLine[breakIdx] = lineNum;
\r
263 tp.srcCol[breakIdx] = column;
\r
269 parseState = PARSE_NUM;
\r
273 if (c == CH_HASH && column==3) { // TODO: why is column off so far?
\r
274 parseState = PARSE_COMMENT;
\r
275 savedState = PARSE_DATA;
\r
279 if (c == CH_BACKSLASH) {
\r
280 // Check for \ at end of line, a line continuation.
\r
281 // Advance over (discard) the newline
\r
282 int cp = UTF16.charAt(testString, charIdx);
\r
283 if (cp == CH_CR && charIdx<len && UTF16.charAt(testString, charIdx+1) == CH_LF) {
\r
285 // Need an extra increment of the input ptr to move over both of them
\r
288 if (cp == CH_LF || cp == CH_CR) {
\r
292 colStart = charIdx;
\r
296 // Let unescape handle the back slash.
\r
297 int charIdxAr[] = new int[1];
\r
298 charIdxAr[0] = charIdx;
\r
299 cp = Utility.unescapeAt(testString, charIdxAr);
\r
301 // Escape sequence was recognized. Insert the char
\r
302 // into the test data.
\r
303 charIdx = charIdxAr[0];
\r
304 UTF16.append(tp.dataToBreak, cp);
\r
305 for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
\r
306 tp.srcLine[i] = lineNum;
\r
307 tp.srcCol[i] = column;
\r
314 // Not a recognized backslash escape sequence.
\r
315 // Take the next char as a literal.
\r
316 // TODO: Should this be an error?
\r
317 c = UTF16.charAt(testString,charIdx);
\r
318 charIdx = UTF16.moveCodePointOffset(testString, charIdx, 1);
\r
321 // Normal, non-escaped data char.
\r
322 UTF16.append(tp.dataToBreak, c);
\r
324 // Save the mapping from offset in the data to line/column numbers in
\r
325 // the original input file. Will be used for better error messages only.
\r
326 // If there's an expected break before this char, the slot in the mapping
\r
327 // vector will already be set for this char; don't overwrite it.
\r
328 for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
\r
329 tp.srcLine[i] = lineNum;
\r
330 tp.srcCol[i] = column;
\r
336 // We are parsing an expected numeric tag value, like <1234>,
\r
337 // within a chunk of data.
\r
338 if (UCharacter.isWhitespace(c)) {
\r
343 // Finished the number. Add the info to the expected break data,
\r
344 // and switch parse state back to doing plain data.
\r
345 parseState = PARSE_DATA;
\r
346 if (tagValue == 0) {
\r
349 int breakIdx = tp.dataToBreak.length();
\r
350 tp.expectedBreaks[breakIdx] = tagValue;
\r
351 tp.srcLine[breakIdx] = lineNum;
\r
352 tp.srcCol[breakIdx] = column;
\r
356 if (UCharacter.isDigit(c)) {
\r
357 tagValue = tagValue*10 + UCharacter.digit(c);
\r
361 errln("Syntax Error in test file at line "+ lineNum +", col %d" + column);
\r
364 // parseState = PARSE_COMMENT; // TODO: unreachable. Don't stop on errors.
\r
373 void executeTest(TestParams t) {
\r
378 if (t.bi == null) {
\r
382 t.bi.setText(t.dataToBreak.toString());
\r
384 // Run the iterator forward
\r
387 for (bp = t.bi.first(); bp != BreakIterator.DONE; bp = t.bi.next()) {
\r
388 if (prevBP == bp) {
\r
389 // Fail for lack of forward progress.
\r
390 errln("Forward Iteration, no forward progress. Break Pos=" + bp +
\r
391 " File line,col=" + t.srcLine[bp] + ", " + t.srcCol[bp]);
\r
// Check that we didn't miss an expected break between the last one and this one.
\r
397 for (i=prevBP+1; i<bp; i++) {
\r
398 if (t.expectedBreaks[i] != 0) {
\r
399 errln("Forward Iteration, break expected, but not found. Pos=" + i +
\r
400 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
\r
404 // Check that the break we did find was expected
\r
405 if (t.expectedBreaks[bp] == 0) {
\r
406 errln("Forward Iteration, break found, but not expected. Pos=" + bp +
\r
407 " File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
\r
409 // The break was expected.
\r
410 // Check that the {nnn} tag value is correct.
\r
411 int expectedTagVal = t.expectedBreaks[bp];
\r
412 if (expectedTagVal == -1) {
\r
413 expectedTagVal = 0;
\r
415 int line = t.srcLine[bp];
\r
416 int rs = ((RuleBasedBreakIterator)t.bi).getRuleStatus();
\r
417 if (rs != expectedTagVal) {
\r
418 errln("Incorrect status for forward break. Pos = " + bp +
\r
419 ". File line,col = " + line + ", " + t.srcCol[bp] + "\n" +
\r
420 " Actual, Expected status = " + rs + ", " + expectedTagVal);
\r
428 // Verify that there were no missed expected breaks after the last one found
\r
429 for (i=prevBP+1; i<t.dataToBreak.length()+1; i++) {
\r
430 if (t.expectedBreaks[i] != 0) {
\r
431 errln("Forward Iteration, break expected, but not found. Pos=" + i +
\r
432 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
\r
437 // Run the iterator backwards, verify that the same breaks are found.
\r
439 prevBP = t.dataToBreak.length()+2; // start with a phony value for the last break pos seen.
\r
440 for (bp = t.bi.last(); bp != BreakIterator.DONE; bp = t.bi.previous()) {
\r
441 if (prevBP == bp) {
\r
442 // Fail for lack of progress.
\r
443 errln("Reverse Iteration, no progress. Break Pos=" + bp +
\r
444 "File line,col=" + t.srcLine[bp] + " " + t.srcCol[bp]);
\r
// Check that we didn't miss an expected break between the last one
// and this one. (expectedBreaks is a zero-filled int[], so unused slots read
// as 0; unlike the C++ UVector, an out-of-range index would throw.)
\r
450 for (i=prevBP-1; i>bp; i--) {
\r
451 if (t.expectedBreaks[i] != 0) {
\r
452 errln("Reverse Itertion, break expected, but not found. Pos=" + i +
\r
453 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
\r
457 // Check that the break we did find was expected
\r
458 if (t.expectedBreaks[bp] == 0) {
\r
459 errln("Reverse Itertion, break found, but not expected. Pos=" + bp +
\r
460 " File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
\r
462 // The break was expected.
\r
463 // Check that the {nnn} tag value is correct.
\r
464 int expectedTagVal = t.expectedBreaks[bp];
\r
465 if (expectedTagVal == -1) {
\r
466 expectedTagVal = 0;
\r
468 int line = t.srcLine[bp];
\r
469 int rs = ((RuleBasedBreakIterator)t.bi).getRuleStatus();
\r
470 if (rs != expectedTagVal) {
\r
471 errln("Incorrect status for reverse break. Pos= " + bp +
\r
472 "File line,col= " + line + ", " + t.srcCol[bp] + "\n" +
\r
473 " Actual, Expected status = " + rs + ", " + expectedTagVal);
\r
480 // Verify that there were no missed breaks prior to the last one found
\r
481 for (i=prevBP-1; i>=0; i--) {
\r
482 if (t.expectedBreaks[i] != 0) {
\r
483 errln("Forward Itertion, break expected, but not found. Pos=" + i +
\r
484 " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
\r