2 *******************************************************************************
3 * Copyright (C) 2010-2013, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
7 package com.ibm.icu.dev.test.bidi;
9 import java.io.BufferedReader;
10 import java.io.IOException;
12 import com.ibm.icu.dev.test.TestFmwk;
13 import com.ibm.icu.dev.test.TestUtil;
14 import com.ibm.icu.lang.UCharacterDirection;
15 import com.ibm.icu.text.Bidi;
16 import com.ibm.icu.text.BidiClassifier;
19 * @author Markus W. Scherer
20 * BiDi conformance test, using the Unicode BidiTest.txt and BidiCharacterTest.txt files.
21 * Ported from ICU4C intltest/bidiconf.cpp .
23 public class BiDiConformanceTest extends TestFmwk {
24 public static void main(String[] args) throws Exception {
25 new BiDiConformanceTest().run(args);
/** Creates the test object; the constructor body is empty. */
27 public BiDiConformanceTest() {}
29 public void TestBidiTest() throws IOException {
30 BufferedReader bidiTestFile=TestUtil.getDataReader("unicode/BidiTest.txt");
31 Bidi ubidi=new Bidi();
32 ubidi.setCustomClassifier(new ConfTestBidiClassifier());
38 while(errorCount<10 && (line=bidiTestFile.readLine())!=null) {
41 // Remove trailing comments and whitespace.
42 int commentStart=line.indexOf('#');
44 line=line.substring(0, commentStart);
46 if(!skipWhitespace()) {
47 continue; // Skip empty and comment-only lines.
49 if(line.charAt(lineIndex)=='@') {
51 if(line.startsWith("Levels:", lineIndex)) {
53 if(!parseLevels(line.substring(lineIndex))) { break; }
54 } else if(line.startsWith("Reorder:", lineIndex)) {
56 if(!parseOrdering(line.substring(lineIndex))) { break; }
58 // Skip unknown @Xyz: ...
60 parseInputStringFromBiDiClasses();
61 if(!skipWhitespace() || line.charAt(lineIndex++)!=';') {
62 errln("missing ; separator on input line "+line);
65 int bitset=Integer.parseInt(line.substring(lineIndex).trim(), 16);
66 // Loop over the bitset.
67 for(int i=0; i<=3; ++i) {
68 if((bitset&(1<<i))!=0) {
69 ubidi.setPara(inputString, paraLevels[i], null);
70 byte actualLevels[]=ubidi.getLevels();
71 paraLevelName=paraLevelNames[i];
72 if(!checkLevels(actualLevels)) {
75 if(!checkOrdering(ubidi)) {
85 *******************************************************************************
87 * created on: 2013jul01
88 * created by: Matitiahu Allouche
90 This function performs a conformance test for implementations of the
91 Unicode Bidirectional Algorithm, specified in UAX #9: Unicode
92 Bidirectional Algorithm, at http://www.unicode.org/unicode/reports/tr9/
94 Each test case is represented in a single line which is read from a file
95 named BidiCharacterTest.txt. Empty, blank and comment lines may also appear
98 The format of the test data is specified below. Note that each test
99 case constitutes a single line of text; reordering is applied within a
100 single line and independently of a rendering engine, and rules L3 and L4
103 The number sign '#' is the comment character: everything is ignored from
104 the occurrence of '#' until the end of the line.
105 Empty lines and lines containing only spaces and/or comments are ignored.
107 Lines which represent test cases consist of 4 or 5 fields separated by a
108 semicolon. Each field consists of tokens separated by whitespace (space
109 or Tab). Whitespace before and after semicolons is optional.
111 Field 0: A sequence of hexadecimal code point values separated by space
113 Field 1: A value representing the paragraph direction, as follows:
114 - 0 represents left-to-right
115 - 1 represents right-to-left
116 - 2 represents auto-LTR according to rules P2 and P3 of the algorithm
117 - 3 represents auto-RTL according to rules P2 and P3 of the algorithm
118 - a negative number whose absolute value is taken as paragraph level;
119 this may be useful to test cases where the embedding level approaches
120 or exceeds the maximum embedding level.
122 Field 2: The resolved paragraph embedding level. If the input (field 0)
123 includes more than one paragraph, this field represents the
124 resolved level of the first paragraph.
126 Field 3: An ordered list of resulting levels for each token in field 0
127 (each token represents one source character).
128 The UBA does not assign levels to certain characters (e.g. LRO);
129 characters removed in rule X9 are indicated with an 'x'.
131 Field 4: An ordered list of indices showing the resulting visual ordering
132 from left to right; characters with a resolved level of 'x' are
133 skipped. The numbers are zero-based. Each index corresponds to
134 a character in the reordered (visual) string. It represents the
135 index of the source character in the input (field 0).
136 This field is optional. When it is absent, the visual ordering
141 # This is a comment line.
142 L L ON R ; 0 ; 0 ; 0 0 0 1 ; 0 1 2 3
143 L L ON R;0;0;0 0 0 1;0 1 2 3
145 # Note: in the next line, 'B' represents a block separator, not the letter 'B'.
146 LRE A B C PDF;2;0;x 2 0 0 x;1 2 3
147 # Note: in the next line, 'b' represents the letter 'b', not a block separator.
148 a b c 05d0 05d1 x ; 0 ; 0 ; 0 0 0 1 1 0 ; 0 1 2 4 3 5
150 a R R x ; 1 ; 1 ; 2 1 1 2
151 L L R R R B R R L L L B ON ON ; 3 ; 0 ; 0 0 1 1 1 0 1 1 2 2 2 1 1 1
154 *******************************************************************************
156 public void TestBidiCharacterTest() throws IOException {
157 BufferedReader bidiTestFile=TestUtil.getDataReader("unicode/BidiCharacterTest.txt");
158 Bidi ubidi=new Bidi();
164 while(errorCount<20 && (line=bidiTestFile.readLine())!=null) {
169 // Remove trailing comments and whitespace.
170 int commentStart=line.indexOf('#');
171 if(commentStart>=0) {
172 line=line.substring(0, commentStart);
174 if(!skipWhitespace()) {
175 continue; // Skip empty and comment-only lines.
177 String[] parts=line.split(";");
180 errln(" on line " + lineNumber + ": Missing ; separator on line: " + line);
183 // Parse the code point string in field 0.
185 inputStringBuilder.delete(0, inputStringBuilder.length());
186 for(String cp : parts[0].trim().split("[ \t]+")) {
187 inputStringBuilder.appendCodePoint(Integer.parseInt(cp, 16));
189 inputString=inputStringBuilder.toString();
190 } catch(Exception e) {
191 errln(" ------------ Invalid string in field 0 on line '"+line+"'");
195 int paraDirection=intFromString(parts[1].trim());
197 if(paraDirection==0) {
201 else if(paraDirection==1) {
205 else if(paraDirection==2) {
206 paraLevel=Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT;
207 paraLevelName="Auto/LTR";
209 else if(paraDirection==3) {
210 paraLevel=Bidi.DIRECTION_DEFAULT_RIGHT_TO_LEFT;
211 paraLevelName="Auto/RTL";
213 else if(paraDirection<0 && -paraDirection<=(Bidi.MAX_EXPLICIT_LEVEL+1)) {
214 paraLevel=(byte)(-paraDirection);
215 paraLevelName=Byte.toString(paraLevel);
219 errln(" on line " + lineNumber + ": Input paragraph direction incorrect at " + line);
222 int resolvedParaLevel=intFromString(parts[2].trim());
223 if(resolvedParaLevel<0 || resolvedParaLevel>(Bidi.MAX_EXPLICIT_LEVEL+1)) {
225 errln(" on line " + lineNumber + ": Resolved paragraph level incorrect at " + line);
228 if(!parseLevels(parts[3])) {
232 if(!parseOrdering(parts[4]))
238 ubidi.setPara(inputString, paraLevel, null);
239 byte actualParaLevel=ubidi.getParaLevel();
240 if(actualParaLevel!=resolvedParaLevel) {
241 errln(" ------------ Wrong resolved paragraph level; expected "
242 +resolvedParaLevel+" actual "
246 byte[] actualLevels=ubidi.getLevels();
247 if(!checkLevels(actualLevels)) {
250 if(!checkOrdering(ubidi)) {
256 private static final byte paraLevels[]={
257 Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT,
260 Bidi.DIRECTION_DEFAULT_RIGHT_TO_LEFT
// Human-readable names for the four paragraph-level settings. TestBidiTest reads
// paraLevelNames[i] alongside paraLevels[i] and stores it in paraLevelName, which
// printErrorLine() includes in its "Para level:" output.
262 private static final String paraLevelNames[]={ "auto/LTR", "LTR", "RTL", "auto/RTL" };
264 private int intFromString(String str) {
266 return Integer.parseInt(str);
267 } catch (Exception e) {
272 private boolean parseLevels(String s) {
275 String[] levelStrings=s.trim().split("[ \t]+");
276 for(String levelString: levelStrings) {
277 if(levelString.length()==0) { continue; }
278 if(levelString.equals("x")) {
279 levels[levelsCount++]=-1;
282 int value=Integer.parseInt(levelString);
283 if(0<=value && value<=(Bidi.MAX_EXPLICIT_LEVEL+1)) {
284 levels[levelsCount++]=(byte)value;
285 directionBits|=(1<<(value&1));
288 } catch(Exception e) {
290 errln(" ------------ Levels parse error at '"+levelString+"'");
297 private boolean parseOrdering(String s) {
299 String[] orderingStrings=s.trim().split("[ \t]+");
300 for(String orderingString: orderingStrings) {
301 if(orderingString.length()==0) { continue; }
303 int value=Integer.parseInt(orderingString);
305 ordering[orderingCount++]=value;
308 } catch(Exception e) {
310 errln(" ------------ Reorder parse error at '"+orderingString+"'");
316 private static char charFromBiDiClass[]={
336 // new in Unicode 6.3/ICU 52
342 private class ConfTestBidiClassifier extends BidiClassifier {
343 public ConfTestBidiClassifier() {
347 public int classify(int c) {
348 for(int i=0; i<charFromBiDiClass.length; ++i) {
349 if(c==charFromBiDiClass[i]) {
353 // Character not in our hardcoded table.
354 // Should not occur during testing.
355 return Bidi.CLASS_DEFAULT;
// Length in characters of each short BiDi class name, indexed by the biDiClass value
// resolved in parseInputStringFromBiDiClasses(); the parser uses the entry both to
// check that the name is properly terminated and to advance lineIndex past it.
// NOTE(review): the trailing 0 presumably corresponds to CHAR_DIRECTION_COUNT
// (no class name matched) - confirm against UCharacterDirection.
358 private static final int biDiClassNameLengths[]={
359 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 3, 3, 0
361 private void parseInputStringFromBiDiClasses() {
362 inputStringBuilder.delete(0, 0x7fffffff);
364 * Lengthy but fast BiDi class parser.
365 * A simple parser could terminate or extract the name string and use
366 * int32_t biDiClassInt=u_getPropertyValueEnum(UCHAR_BIDI_CLASS, bidiClassString);
367 * but that makes this test take significantly more time.
370 while(skipWhitespace() && (c0=line.charAt(lineIndex))!=';') {
371 int biDiClass=UCharacterDirection.CHAR_DIRECTION_COUNT;
372 // Compare each character once until we have a match on
373 // a complete, short BiDi class name.
375 if((lineIndex+2)<line.length() && line.charAt(lineIndex+1)=='R') {
376 c2=line.charAt(lineIndex+2);
378 biDiClass=UCharacterDirection.LEFT_TO_RIGHT_EMBEDDING;
380 biDiClass=UCharacterDirection.LEFT_TO_RIGHT_ISOLATE;
382 biDiClass=UCharacterDirection.LEFT_TO_RIGHT_OVERRIDE;
385 biDiClass=UCharacterDirection.LEFT_TO_RIGHT;
388 if((lineIndex+2)<line.length() && line.charAt(lineIndex+1)=='L') {
389 c2=line.charAt(lineIndex+2);
391 biDiClass=UCharacterDirection.RIGHT_TO_LEFT_EMBEDDING;
393 biDiClass=UCharacterDirection.RIGHT_TO_LEFT_ISOLATE;
395 biDiClass=UCharacterDirection.RIGHT_TO_LEFT_OVERRIDE;
398 biDiClass=UCharacterDirection.RIGHT_TO_LEFT;
401 if((lineIndex+1)>=line.length()) {
403 } else if((c1=line.charAt(lineIndex+1))=='N') {
404 biDiClass=UCharacterDirection.EUROPEAN_NUMBER;
406 biDiClass=UCharacterDirection.EUROPEAN_NUMBER_SEPARATOR;
408 biDiClass=UCharacterDirection.EUROPEAN_NUMBER_TERMINATOR;
411 if((lineIndex+1)>=line.length()) {
413 } else if((c1=line.charAt(lineIndex+1))=='L') {
414 biDiClass=UCharacterDirection.RIGHT_TO_LEFT_ARABIC;
416 biDiClass=UCharacterDirection.ARABIC_NUMBER;
418 } else if(c0=='C' && (lineIndex+1)<line.length() && line.charAt(lineIndex+1)=='S') {
419 biDiClass=UCharacterDirection.COMMON_NUMBER_SEPARATOR;
421 if((lineIndex+1)<line.length() && line.charAt(lineIndex+1)=='N') {
422 biDiClass=UCharacterDirection.BOUNDARY_NEUTRAL;
424 biDiClass=UCharacterDirection.BLOCK_SEPARATOR;
427 biDiClass=UCharacterDirection.SEGMENT_SEPARATOR;
428 } else if(c0=='W' && (lineIndex+1)<line.length() && line.charAt(lineIndex+1)=='S') {
429 biDiClass=UCharacterDirection.WHITE_SPACE_NEUTRAL;
430 } else if(c0=='O' && (lineIndex+1)<line.length() && line.charAt(lineIndex+1)=='N') {
431 biDiClass=UCharacterDirection.OTHER_NEUTRAL;
432 } else if(c0=='P' && (lineIndex+2)<line.length() && line.charAt(lineIndex+1)=='D') {
433 if(line.charAt(lineIndex+2)=='F') {
434 biDiClass=UCharacterDirection.POP_DIRECTIONAL_FORMAT;
435 } else if(line.charAt(lineIndex+2)=='I') {
436 biDiClass=UCharacterDirection.POP_DIRECTIONAL_ISOLATE;
438 } else if(c0=='N' && (lineIndex+2)<line.length() &&
439 line.charAt(lineIndex+1)=='S' && line.charAt(lineIndex+2)=='M') {
440 biDiClass=UCharacterDirection.DIR_NON_SPACING_MARK;
441 } else if(c0=='F' && (lineIndex+2)<line.length() &&
442 line.charAt(lineIndex+1)=='S' && line.charAt(lineIndex+2)=='I') {
443 biDiClass=UCharacterDirection.FIRST_STRONG_ISOLATE;
445 // Now we verify that the class name is terminated properly,
446 // and not just the start of a longer word.
447 int biDiClassNameLength=biDiClassNameLengths[biDiClass];
449 if( biDiClass==UCharacterDirection.CHAR_DIRECTION_COUNT ||
450 ((lineIndex+biDiClassNameLength)<line.length() &&
451 !isInvWhitespace(c=line.charAt(lineIndex+biDiClassNameLength)) &&
454 throw new IllegalArgumentException(
455 "BiDi class string not recognized at "+line.substring(lineIndex)+" in "+line);
457 inputStringBuilder.append(charFromBiDiClass[biDiClass]);
458 lineIndex+=biDiClassNameLength;
460 inputString=inputStringBuilder.toString();
463 private static char printLevel(byte level) {
467 return (char)('0'+level);
471 private static int getDirectionBits(byte actualLevels[]) {
472 int actualDirectionBits=0;
473 for(int i=0; i<actualLevels.length; ++i) {
474 actualDirectionBits|=(1<<(actualLevels[i]&1));
476 return actualDirectionBits;
478 private boolean checkLevels(byte actualLevels[]) {
480 if(levelsCount!=actualLevels.length) {
481 errln(" ------------ Wrong number of level values; expected "+levelsCount+" actual "+actualLevels.length);
484 for(int i=0; i<actualLevels.length; ++i) {
485 if(levels[i]!=actualLevels[i] && levels[i]>=0) {
486 if(directionBits!=3 && directionBits==getDirectionBits(actualLevels)) {
487 // ICU used a shortcut:
488 // Since the text is unidirectional, it did not store the resolved
489 // levels but just returns all levels as the paragraph level 0 or 1.
490 // The reordering result is the same, so this is fine.
493 errln(" ------------ Wrong level value at index "+i+"; expected "+levels[i]+" actual "+actualLevels[i]);
502 StringBuilder els=new StringBuilder("Expected levels: ");
504 for(i=0; i<levelsCount; ++i) {
505 els.append(' ').append(printLevel(levels[i]));
507 StringBuilder als=new StringBuilder("Actual levels: ");
508 for(i=0; i<actualLevels.length; ++i) {
509 als.append(' ').append(printLevel(actualLevels[i]));
511 errln(els.toString());
512 errln(als.toString());
517 // Note: ubidi_setReorderingOptions(ubidi, UBIDI_OPTION_REMOVE_CONTROLS);
518 // does not work for custom BiDi class assignments
519 // and anyway also removes LRM/RLM/ZWJ/ZWNJ which is not desirable here.
520 // Therefore we just skip the indexes for BiDi controls while comparing
521 // with the expected ordering that has them omitted.
522 private boolean checkOrdering(Bidi ubidi) {
526 int resultLength=ubidi.getResultLength(); // visual length including BiDi controls
528 // Note: It should be faster to call ubidi_countRuns()/ubidi_getVisualRun()
529 // and loop over each run's indexes, but that seems unnecessary for this test code.
530 for(i=visualIndex=0; i<resultLength; ++i) {
531 int logicalIndex=ubidi.getLogicalIndex(i);
532 if(levels[logicalIndex]<0) {
533 continue; // BiDi control, omitted from expected ordering.
535 if(visualIndex<orderingCount && logicalIndex!=ordering[visualIndex]) {
536 errln(" ------------ Wrong ordering value at visual index "+visualIndex+"; expected "+
537 ordering[visualIndex]+" actual "+logicalIndex);
543 // visualIndex is now the visual length minus the BiDi controls,
544 // which should match the length of the BidiTest.txt ordering.
545 if(isOk && orderingCount!=visualIndex) {
546 errln(" ------------ Wrong number of ordering values; expected "+orderingCount+" actual "+visualIndex);
551 StringBuilder eord=new StringBuilder("Expected ordering: ");
552 for(i=0; i<orderingCount; ++i) {
553 eord.append(' ').append((char)('0'+ordering[i]));
555 StringBuilder aord=new StringBuilder("Actual ordering: ");
556 for(i=0; i<resultLength; ++i) {
557 int logicalIndex=ubidi.getLogicalIndex(i);
558 if(levels[logicalIndex]<Bidi.LEVEL_DEFAULT_LTR) {
559 aord.append(' ').append((char)('0'+logicalIndex));
562 errln(eord.toString());
563 errln(aord.toString());
568 private void printErrorLine() {
570 errln(String.format("Input line %5d: %s", lineNumber, line));
571 errln("Input string: "+inputString);
572 errln("Para level: "+paraLevelName);
575 private static boolean isInvWhitespace(char c) {
576 return ((c)==' ' || (c)=='\t' || (c)=='\r' || (c)=='\n');
579 * Skip isInvWhitespace() characters.
580 * @return true if line.charAt(lineIndex) is a non-whitespace character, false if lineIndex>=line.length()
582 private boolean skipWhitespace() {
583 while(lineIndex<line.length()) {
584 if(!isInvWhitespace(line.charAt(lineIndex))) {
// Parsing and checking state shared by the test methods and their helpers.

// Current parse position within the input line; advanced by skipWhitespace() and the parsers.
593 private int lineIndex;
// Expected levels (UBiDiLevel) filled in by parseLevels(); -1 marks an 'x' entry.
594 private byte levels[]=new byte[1000]; // UBiDiLevel
// parseLevels() ORs in (1<<(value&1)) for each numeric expected level; checkLevels()
// compares it with getDirectionBits() of the actual levels.
595 private int directionBits;
// Expected visual ordering filled in by parseOrdering().
596 private int ordering[]=new int[1000];
// Number of the input line, reported in error messages.
597 private int lineNumber;
// Number of valid entries in levels[].
598 private int levelsCount;
// Number of valid entries in ordering[].
599 private int orderingCount;
// The test loops stop reading input once this reaches their limit (10 or 20).
600 private int errorCount;
// Text handed to Bidi.setPara() for the current test case.
601 private String inputString;
// Name of the paragraph level under test, printed by printErrorLine().
602 private String paraLevelName;
// Reusable buffer from which inputString is built.
603 private StringBuilder inputStringBuilder=new StringBuilder();