jars/icu4j-4_8_1_1/tools/misc/src/com/ibm/icu/dev/tool/translit/indic.pl

   1 #/**
   2 # *******************************************************************************
   3 # * Copyright (C) 2000-2004, International Business Machines Corporation and    *
   4 # * others. All Rights Reserved.                                                *
   5 # *******************************************************************************
   6 # */
   7
   8 #!perl
   9
  10 # Usage - $0 <remap file>
  11 #  e.g. - indic indic.txt
  12 # The input file should be a subset of the Unicode data file containing
  13 # the blocks of interest.
  14 #
  15 # The remap file should have lines of the form
  16 # "\u0D01>\u0D02;"
  17 # including the quotes.  These will be interpreted as saying that the
  18 # undefined code point U+D01 (derived via mapping from InterIndic)
  19 # can be remapped to U+D02.
  20 #
  21 # The purpose of this script is to process the Indic script data into
  22 # a form usable by the IndicTransliterator, that is, the Indic-Indic
  23 # transliterator.  The transliterator needs two things: A mapping of
  24 # the code points in common, and a list of the exceptions.
  25
  26 # Assume we are located in icu4j/src/com/ibm/tools/translit/.
  27 # We want the Unicode DB in icu4j/src/data/unicode/.
  28 $UNICODE_DB = "../../../../data/unicode/UnicodeData.txt";
  29 $EXCEPTIONS_FILE = shift;
  30
  31 # Assume we are located in icu4j/src/com/ibm/tools/translit/.
  32 # We want to output files to icu4j/src/com/ibm/text/resources/.
  33 # Output directory
  34 $OUTDIR = "../../text/resources";
  35
  36 # The template file should contain java code that can be used
  37 # to generate RuleBasedTransliterator resource files.  The template
  38 # should contain the following embedded symbols, which this script
  39 # will replace:
  40 # $TOOL - name of generating tool
  41 # $DATE - date of generation
  42 # $SCRIPTFROM - name of source script
  43 # $SCRIPTTO - name of target script
  44 # $RULES - rules
  45 $RBT_TEMPLATE = 'rbtTemplate.txt';
  46
  47 # Name of this tool in generated RBT files
  48 $RBT_GEN_TOOL = 'icu4j/src/com/ibm/tools/translit/indic.pl';
  49
  50 $DUMP = 0; # If 1, dump out internal data
  51
  52 $DO_HEURISTIC_REMAP = 0; # If 1, do automatic heuristic remapping
  53 $DO_DECOMP_REMAP = 0; # If 1, do decomp remapping
  54
  55 open(UNICODE_DB);
  56 while (<UNICODE_DB>) {
  57     next if (m|^0[0-8]|); # Skip up to Devanagari block (0900)
  58     last if (m|^0D[8-F]|i); # Bail out after Malayam block (0D00)
  59     # 0D39;MALAYALAM LETTER HA;Lo;0;L;;;;;N;;;;;
  60     my @data = split(/;/);
  61     my $fullCode = hex($data[0]); # e.g., 0x093F
  62     my $code = $fullCode & 0x7F; # e.g., 0x3F
  63     my ($script, $name) = ($data[1] =~ /(\w+)\s+(.+)/);
  64     die "Can't parse $_" unless ($name);
  65     # e.g., $code/$script/$name = 3F/MALAYALAM/VOWEL SIGN I
  66
  67     # Titlecase the script
  68     $script = ucfirst(lc($script));
  69
  70     # Fix a couple inconsistencies in the 3.0 data
  71     # REVISIT: Is this okay to do?
  72     if ($DO_HEURISTIC_REMAP) {
  73         if ($script eq 'Gujarati' && $code >= 5 && $code <= 0x14) {
  74             $name =~ s/^VOWEL/LETTER/;
  75         }
  76     }
  77
  78     # Keep track of all script names we encounter.  We also note the
  79     # base of the block.
  80     my $base = $fullCode & ~0x7F; # e.g., 0x900;
  81     if (exists $SCRIPT_TO_BASE{$script}) {
  82         die "Script base mismatch for $script: $base vs. $SCRIPT_TO_BASE{$script}"
  83             if ($SCRIPT_TO_BASE{$script} ne $base);
  84     } else {
  85         $SCRIPT_TO_BASE{$script} = $base;
  86     }
  87
  88     # Build up a mapping by name.  For each name, keep a hash keyed by
  89     # code point.  For each code point, keep an array of script names.
  90     # Also keep a total use count for each name.
  91     push @{$NAME_CODE_TO_SCRIPTS{$name}{$code}}, $script;
  92     ++$NAME_CODE_TO_SCRIPTS{$name}{count};
  93
  94     # Build a map that looks like this:
  95     # $SCRIPT_NAME_TO_CODE{<script>}{<name>} = <code>
  96     # or undef if there is no mapping.
  97     $SCRIPT_NAME_TO_CODE{$script}{$name} = $code;
  98
  99     # Build a map that looks like this:
 100     $SCRIPT_CODE_TO_NAME{$script}{$code} = $name;
 101
 102     # And a map from the fullCode point to the name
 103     $FULLCODE_TO_NAME{$fullCode} = $name;
 104
 105     # Map code (0..7F) to name.  This is usually a 1-1 mapping, but
 106     # is 1-n in a few cases.
 107     if (exists $CODE_TO_NAME{$code}) {
 108         if ($name ne $CODE_TO_NAME{$code}) {
 109             # For multiple names on a code offset, use the format
 110             # (a/b), (a/b/c), etc.
 111             local $_ = $CODE_TO_NAME{$code};
 112             if (m|^\(|) {
 113                 if (!m|[\(\)/]$name[\(\)/]|) {
 114                     s|\)$|/$name\)|;
 115                 }
 116             } else {
 117                 $_ = "($_/$name)";
 118             }
 119             $CODE_TO_NAME{$code} = $_;
 120         }
 121     } else {
 122         $CODE_TO_NAME{$code} = $name;
 123     }
 124 }
 125 close(UNICODE_DB);
 126
 127 # Read and parse the manual remapping file.  This contains lines
 128 # of the form:
 129
 130 # |"\u0956>\u0948;"  // AI Length Mark -> Devanagari Vowel Sign AI
 131
 132 # The left hand side contains a non-existent full code value.  It
 133 # should be a single value.  The right hand side contains one or more
 134 # real full code values.  The idea is that when a mapping from another
 135 # script ends up at the non-existent code point on the left, the
 136 # sequence on the right should be substituted.  In this example,
 137 # Devanagari has no AI Length Mark.  So, if transliterating from
 138 # Oriya, then the character 0B56 (Oriya AI Length Mark) will remap to
 139 # the non-existent 0956, and that remaps to 0948, our chosen
 140 # Devanagari equivalent.  For our purposes, the left hand side should
 141 # be taken to mean its equivalent point in the InterIndic range.  In
 142 # this example, what it really says is E056>0948 in the
 143 # InterIndic-Devanagari transliterator.
 144
 145 if ($EXCEPTIONS_FILE) {
 146     open(EXCEPTIONS_FILE) or die;
 147     while (<EXCEPTIONS_FILE>) {
 148         if (m|^\s*\"([^\"]*?)\"|) {
 149             my $line = $_;
 150             $_ = $1;
 151             if (/^(.*)>(.*);$/) {
 152                 my ($rawFrom, $rawTo) = ($1, $2);
 153                 my @from = parseUnicodeEscape($rawFrom);
 154                 my @to = parseUnicodeEscape($rawTo);
 155                 my $from = hexArray(@from);
 156                 # Some entries look like this:
 157                 # |"\u0955>\u0955;"
 158                 # these do nothing; ignore them.
 159                 if (intArraysEqual(\@from, \@to)) {
 160                     #print STDERR "Ignoring NOOP remap of $from\n";
 161                 } elsif (exists $EXCEPTIONS{$from}) {
 162                     print STDERR "ERROR in $EXCEPTIONS_FILE - Duplicate remap entries for $from\n";
 163                 } elsif (scalar @from > 1) {
 164                     print STDERR "ERROR in $EXCEPTIONS_FILE - Ignoring multichar remap: ", hexArray(@from), "->", hexArray(@to), "\n";
 165                 } else {
 166                     # Check this for validity.  Full code on the left
 167                     # should NOT exist.  Full code seq on the right should.
 168                     if (exists $FULLCODE_TO_NAME{$from[0]}) {
 169                         print STDERR "ERROR in $EXCEPTIONS_FILE - Invalid remap; left side defined: ", hexArray(@from), "->", hexArray(@to), "\n";
 170                     } elsif (grep(! exists $FULLCODE_TO_NAME{$_}, @to)) {
 171                         print STDERR "ERROR in $EXCEPTIONS_FILE - Invalid remap; right side undefined: ", hexArray(@from), "->", hexArray(@to), "\n";
 172                     } else {
 173                         $EXCEPTIONS{$from[0]} = \@to;
 174                     }
 175                 }
 176             } else { die "ERROR in $EXCEPTIONS_FILE - Can't parse \"$_\" in line $line"; }
 177         }
 178     }
 179     close(EXCEPTIONS_FILE);
 180     print STDERR "$EXCEPTIONS_FILE: Loaded ", scalar keys %EXCEPTIONS, " remappings\n";
 181 }
 182
 183 if ($DO_DECOMP_REMAP) {
 184     # Read the NamesList.txt file.  This contains decomposition data.
 185     # Gather these into %DECOMP, which maps a name to n1.n2..., where n1
 186     # etc. are decomposed names.  E.g. $DECOMP{'LETTER RRA'} -> 'LETTER
 187     # RA.SIGN NUKTA'.  There may be different mappings in different script
 188     # blocks (LETTER RRA is mapped differently in Devanagari and Bengali),
 189     # in which case the name goes into %DECOMP_MISMATCH, and is removed
 190     # from %DECOMP.
 191     $NAMES = "NamesList.txt";
 192     open(NAMES);
 193     while (<NAMES>) {
 194         # Skip to start of DEVANAGARI block
 195         last if (/^\@\@\s+0900/);
 196     }
 197     while (<NAMES>) {
 198         # Continue until start of SINHALA block
 199         last if (/^\@\@\s+0D80/);
 200         if (/^([0-9A-Z]{4})/i) {
 201             $code = $1;
 202         } elsif (/^\s+:\s*(.+)/) {
 203             # We've found a mapping of the form:
 204             # 0929    DEVANAGARI LETTER NNNA
 205             #     * for transcribing Dravidian alveolar n
 206             #     : 0928 093C
 207             my $from = $FULLCODE_TO_NAME{hex($code)};
 208             my @to = map($FULLCODE_TO_NAME{hex($_)}, split(/\s+/, $1));
 209             if (exists $DECOMP{$from}) {
 210                 my $aref = $DECOMP{$from};
 211                 if (join(".", @$aref) ne join(".", @to)) {
 212                     print STDERR "ERROR: Decomp mismatch for $from\n";
 213                     print STDERR "     : $from = ", join(".", @$aref), "\n";
 214                     print STDERR "     : $from = ", join(".", @to), "\n";
 215                     $DECOMP_MISMATCH{$from} = 1;
 216                 }
 217             } else {
 218                 $DECOMP{$from} = \@to;
 219             }
 220         }
 221     }
 222     close(NAMES);
 223     # Remove mismatches
 224     foreach (keys %DECOMP_MISMATCH) {
 225         delete $DECOMP{$_};
 226     }
 227     if ($DUMP) {
 228         foreach (keys %DECOMP) {
 229             print "$_ = ", join(" + ", @{$DECOMP{$_}}), "\n";
 230         }
 231     }
 232 }
 233
 234 # Count the total number of scripts
 235
 236 $SCRIPT_COUNT = scalar keys %SCRIPT_TO_BASE;
 237 #print join("\n", sort keys %SCRIPT_TO_BASE), "\n";
 238
 239 # Dump out the %NAME_CODE_TO_SCRIPTS map.
 240
 241 if ($DUMP) {
 242     print "\nBY NAME:\n";
 243     foreach my $pass ((1, 2)) {
 244         print "\nBY NAME - SINGLETONS:\n" if ($pass eq 2);
 245         foreach my $name (sort keys %NAME_CODE_TO_SCRIPTS) {
 246             if ($pass eq 1) {
 247                 next if (1 >= $NAME_CODE_TO_SCRIPTS{$name}{count});
 248             } else {
 249                 next if (1 < $NAME_CODE_TO_SCRIPTS{$name}{count});
 250             }
 251             print "$name:";
 252             my $href = $NAME_CODE_TO_SCRIPTS{$name};
 253             foreach my $code (sort {$a <=> $b} keys %$href) {
 254                 next if ($code eq 'count');
 255                 my $aref = $href->{$code};
 256                 print " ", hex2($code), " (", formatScriptList($aref), ")";
 257             }
 258             print "\n";
 259         }
 260     }
 261 }
 262
 263 # Create some transliterators, based on the scripts and the %NAME_CODE_TO_SCRIPTS
 264 # map.  Only use %NAME_CODE_TO_SCRIPTS entries with a count of 2 or more, that is,
 265 # names that occur in two or more scripts.  For those scripts where
 266 # the names occur, map both up to the InterIndic range, and down to
 267 # the target script.
 268
 269 $INTERINDIC = 0xE000;
 270 $INTERINDIC_EXTRA = 0xE080;
 271 $INTERINDIC_EXTRA_NEXT = $INTERINDIC_EXTRA;
 272
 273 # For each script, create a hash.  The hash has a key for each
 274 # code point, either within its block, or in the InterIndic block.
 275 # the value of the key is the mapping.
 276
 277 # The script hashes are named %DEVANAGARI, etc., and referenced
 278 # with symbolic refs.
 279
 280 @REMAP = ('s/\bSHORT\s+//i',
 281           's/\bCANDRA\s+//i',
 282           's/\bQA$/KA/i',
 283           's/\bKHHA$/KHA/i',
 284           's/\bGHHA$/GA/i',
 285           's/\bZA$/JA/i',
 286           's/\bFA$/PHA/i',
 287           's/\bVA$/BA/i',
 288           's/\bNNNA$/NA/i',
 289           's/\bRRA$/RA/i',
 290           's/\bLLLA$/LLA/i',
 291           's/\bLLLA$/LA/i',
 292           's/\bLLA$/LA/i',
 293           's/^A(.) LENGTH MARK$/VOWEL SIGN A$1/i',
 294           's/CANDRABINDU/BINDI/i',
 295           's/BINDI/CANDRABINDU/i',
 296           );
 297
 298 # Do this so we see zero counts:
 299 foreach my $remap (@REMAP) { $REMAP{$remap} = 0; }
 300
 301 # This loop iterates over the names in the NAME_CODE_TO_SCRIPTS hash.
 302 # These names are things like "LETTER NNNA".  For each name, it then
 303 # creates script mappings up to the InterIndic area, and back down
 304 # to the script areas.  If a name maps to more than one offset,
 305 # then it uses the InterIndic extra range.  Either way, it picks
 306 # a single InterIndic point, either an offset point or something in
 307 # the extra range, and maps up and down from that point.
 308 foreach my $name (sort keys %NAME_CODE_TO_SCRIPTS) {
 309     next if (1 >= $NAME_CODE_TO_SCRIPTS{$name}{count});
 310     my $href = $NAME_CODE_TO_SCRIPTS{$name};
 311     # Count the number of different codes assigned to this name.
 312     # Usually 1, but 2 for a handful of names.
 313     my $codeCount = (keys %{$NAME_CODE_TO_SCRIPTS{$name}}) - 1; # less 1: {count}
 314     # If $codeCount is 1, then map directly up to the $INTERINDIC
 315     # base.  If $codeCount is 2, then map into unused spots starting
 316     # at $INTERINDIC_EXTRA.
 317     my $interIndicCode;
 318     if ($codeCount > 1) {
 319         # Map into the InterIndic extra range
 320         $interIndicCode = $INTERINDIC_EXTRA_NEXT++;
 321     }
 322     my %seen;
 323     foreach my $code (sort {$a ne 'count' && $b ne 'count' && $a <=> $b} keys %$href) {
 324         next if ($code eq 'count');
 325         my $aref = $href->{$code}; # Ref to array of scripts
 326         if ($codeCount == 1) {
 327             # Map directly
 328             $interIndicCode = $INTERINDIC + $code;
 329         }
 330         # Keep track of the names of the extra InterIndic points
 331         $INTERINDIC_NAME_TO_FULLCODE{$name} = $interIndicCode;
 332
 333         foreach my $scr (@$aref) {
 334             $seen{$scr} = 1;
 335             my $fullCode = $SCRIPT_TO_BASE{$scr} + $code;
 336             $ {$scr}{$fullCode} = hex4($interIndicCode) . "; // $name";
 337             $ {$scr}{$interIndicCode} = hex4($fullCode) . "; // $name";
 338         }
 339     }
 340     # Now handle InterIndic->Script unmapped points.  For each name,
 341     # some of the scripts will be left out -- will have no mappings
 342     # to that name.  For these scripts, we can either leave them
 343     # unmapped (so the InterIndic->Local mapping is empty), or
 344     # try to remap.
 345  unmappedScript:
 346     foreach my $scr (keys %SCRIPT_TO_BASE) {
 347         next if ($seen{$scr});
 348
 349         if ($DO_HEURISTIC_REMAP) {
 350             # Try to remap through the known equivalences in our
 351             # remapping table
 352             foreach my $remapRE (@REMAP) {
 353                 local $_ = $name;
 354                 if (eval($remapRE)) {
 355                     if (exists $SCRIPT_NAME_TO_CODE{$scr}{$_}) {
 356                         $ {$scr}{$interIndicCode} =
 357                             hex4($SCRIPT_TO_BASE{$scr} + $SCRIPT_NAME_TO_CODE{$scr}{$_}) .
 358                                 "; // REMAP: $name -> $_";
 359                         ++$REMAP{$remapRE};
 360                         next unmappedScript;
 361                     }
 362                 }
 363             }
 364         }
 365
 366         # Try to remap through the file.  This contains remappings of
 367         # the form 0991->0993.  That is, it contains local remappings
 368         # that we can substitute and try again with.
 369         #|GURMUKHI-InterIndic ------------------------------
 370         #|// 0A02>; // UNMAPPED INTERNAL: SIGN BINDI
 371         #|InterIndic-GURMUKHI ------------------------------
 372         #|// E001>; // UNMAPPED EXTERNAL: SIGN CANDRABINDU
 373         #|"\u0A01>\u0A02;"
 374         # In this example, the remapping tells us that the non-existent
 375         # character A01 should be considered equivalent to the real
 376         # character A02.
 377         # We implement this by adding two mappings; one from
 378         # the InterIndic equivalent of A01, that is, E001, to A02,
 379         # and one from A02, which otherwise has no mapping, to E001.
 380         if ($EXCEPTIONS_FILE && $interIndicCode < $INTERINDIC_EXTRA) {
 381             # Try to map this InterIndic character back to a the spot
 382             # it would occupy in this script if it had a mapping.
 383             my $code = $interIndicCode & 0x7F;
 384             my $pseudoFullCode = $SCRIPT_TO_BASE{$scr} + $code;
 385             if (exists $EXCEPTIONS{$pseudoFullCode}) {
 386                 my $fullCodeArray = $EXCEPTIONS{$pseudoFullCode};
 387                 my $comment;
 388                 foreach my $c (@$fullCodeArray) {
 389                     $comment .= "." if ($comment);
 390                     $comment .= $FULLCODE_TO_NAME{$c};
 391                 }
 392                 $comment = "; // REMAP ($EXCEPTIONS_FILE): " .
 393                     hex4($pseudoFullCode) . ">" . hexArray(@$fullCodeArray) . " = " .
 394                     $CODE_TO_NAME{$code} . ">" . $comment;
 395                 $ {$scr}{$interIndicCode} = hexArray(@$fullCodeArray) . $comment;
 396                 if (scalar @$fullCodeArray == 1) {
 397                     if (exists $ {$scr}{$fullCodeArray->[0]}) {
 398                         # There's already a proper mapping; no need to fill
 399                         # in reverse
 400                     } else {
 401                         $ {$scr}{$fullCodeArray->[0]} = hex4($interIndicCode) . $comment;
 402                     }
 403                 }
 404                 next unmappedScript;
 405             }
 406         }
 407
 408         $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$interIndicCode} = 1;
 409         local $_ = "; // UNMAPPED InterIndic-$scr: $name";
 410         if (exists $SCRIPT_CODE_TO_NAME{$scr}{$interIndicCode & 0x7F}) {
 411             my $fullCode = $SCRIPT_TO_BASE{$scr} + ($interIndicCode & 0x7F);
 412             $_ .= " (" . hex4($fullCode) . " = " . $FULLCODE_TO_NAME{$fullCode} . ")";
 413         }
 414         $ {$scr}{$interIndicCode} = $_;
 415     }
 416 }
 417
 418 # Add in unmapped entries for each script
 419 foreach my $scr (keys %SCRIPT_TO_BASE) {
 420     my $base = $SCRIPT_TO_BASE{$scr};
 421  unmappedInt:
 422     foreach my $code (keys %{$SCRIPT_CODE_TO_NAME{$scr}}) {
 423         my $fullCode = $code + $base;
 424         next if (exists $ {$scr}{$fullCode});
 425         my $name = $SCRIPT_CODE_TO_NAME{$scr}{$code};
 426
 427         if ($DO_HEURISTIC_REMAP) {
 428             foreach my $remapRE (@REMAP) {
 429                 local $_ = $name;
 430                 if (eval($remapRE)) {
 431                     if (exists $INTERINDIC_NAME_TO_FULLCODE{$_}) {
 432                         $ {$scr}{$fullCode} =
 433                             hex4($INTERINDIC_NAME_TO_FULLCODE{$_}) .
 434                                 "; // REMAP: $name -> $_";
 435                         ++$REMAP{$remapRE};
 436                         next unmappedInt;
 437                     }
 438                 }
 439             }
 440         }
 441
 442         # Now try the decomp table
 443         if ($DO_DECOMP_REMAP && exists $DECOMP{$name}) {
 444             my $x;
 445             my $cmt = "; // DECOMP: $name -> ";
 446             foreach my $n (@{$DECOMP{$name}}) {
 447                 if (exists $SCRIPT_NAME_TO_CODE{$scr}{$n}) {
 448                     $x .= hex4($SCRIPT_TO_BASE{$scr} + $SCRIPT_NAME_TO_CODE{$scr}{$n});
 449                     $cmt .= $n . " + ";
 450                 } else {
 451                     $cmt = 0;
 452                     last;
 453                 }
 454             }
 455             if ($cmt) {
 456                 $ {$scr}{$fullCode} = $x . $cmt;
 457                 next unmappedInt;
 458             }
 459         }
 460
 461         $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$fullCode} = 1;
 462         $ {$scr}{$fullCode} = "; // UNMAPPED $scr-InterIndic: $name";
 463     }
 464 }
 465
 466 # GUR
 467 # E00B>; // UNMAPPED EXTERNAL: LETTER VOCALIC R "\u0A0B>\u0A30\u0A3F;"
 468 # E00C>; // UNMAPPED EXTERNAL: LETTER VOCALIC L "\u0A0C>\u0A07;"
 469 # E00D>; // UNMAPPED EXTERNAL: LETTER CANDRA E "\u0A0D>\u0A10;"
 470 # E011>; // UNMAPPED EXTERNAL: LETTER CANDRA O "\u0A11>\u0A14;"
 471 # E037>; // UNMAPPED EXTERNAL: LETTER SSA "\u0A37>\u0A36;"
 472 # E045>; // UNMAPPED EXTERNAL: VOWEL SIGN CANDRA E "\u0A45>\u0A48;"
 473 # E049>; // UNMAPPED EXTERNAL: VOWEL SIGN CANDRA O "\u0A49>\u0A4C;"
 474 # Fix QA too
 475
 476 # Dump out script maps
 477 foreach my $scr (sort keys %SCRIPT_TO_BASE) {
 478     ## next unless ($scr eq 'TELUGU'); # Debugging
 479     my @rules;
 480     my $flag = 1;
 481     foreach my $fullCode (sort {$a <=> $b} keys %{$scr}) {
 482         if ($flag && $fullCode >= $INTERINDIC) {
 483             # We have the complete <scr>-InterIndic rules; dump
 484             # them out.
 485             generateRBT($scr, "InterIndic", \@rules, $OUTDIR);
 486             @rules = ();
 487             $flag = 0;
 488         }
 489         if (exists $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$fullCode}) {
 490             push @rules, "// " . hex4($fullCode) . ">" . $ {$scr}{$fullCode};
 491         } else {
 492             push @rules, hex4($fullCode) . ">" . $ {$scr}{$fullCode};
 493         }
 494     }
 495     # Now generate the InterIndic-<scr> rules.
 496     generateRBT("InterIndic", $scr, \@rules, $OUTDIR);
 497
 498 #    print "$scr-InterIndic ------------------------------\n";
 499 #    my $flag = 1;
 500 #    foreach my $fullCode (sort {$a <=> $b} keys %{$scr}) {
 501 #        if ($flag && $fullCode >= $INTERINDIC) {
 502 #            print "InterIndic-$scr ------------------------------\n";
 503 #            $flag = 0;
 504 #        }
 505 #        if (exists $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$fullCode}) {
 506 #            print "// ", hex4($fullCode), ">", $ {$scr}{$fullCode}, "\n";
 507 #        } else {
 508 #            print hex4($fullCode), ">", $ {$scr}{$fullCode}, "\n";
 509 #        }
 510 #    }
 511 }
 512
 513 # List successful remappings
 514 if ($DO_HEURISTIC_REMAP) {
 515     foreach my $remap (sort keys %REMAP) {
 516         print STDERR "REMAP ", $REMAP{$remap}, " x $remap\n";
 517     }
 518 }
 519
 520 #----------------------------------------------------------------------
 521 # SUBROUTINES
 522
 523 # Return a listing of an array of scripts
 524 # Param: array ref
 525 sub formatScriptList {
 526     my $aref = shift;
 527     if ($SCRIPT_COUNT == @$aref) {
 528         return "all";
 529     } elsif (($SCRIPT_COUNT - 3) <= @$aref) {
 530         my $s = "all but";
 531         my %temp;
 532         foreach (@$aref) { $temp{$_} = 1; }
 533         foreach (sort keys %SCRIPT_TO_BASE) {
 534             $s .= " $_" unless exists $temp{$_};
 535         }
 536         return $s;
 537     } else {
 538         return join(" ", @$aref);
 539     }
 540 }
 541
 542 # Format as %02X hex
 543 sub hex2 {
 544     sprintf("%02X", $_[0]);
 545 }
 546
 547 # Format as %04X hex
 548 sub hex4 {
 549     sprintf("\\u%04X", $_[0]);
 550 }
 551
 552 # Format an array as %04X hex, delimited by "."s
 553 sub hexArray {
 554     join("", map { hex4($_); } @_);
 555 }
 556
 557 # Parse a string of the form "\u0D01" to an array of integers.
 558 # Must ONLY contain escapes.
 559 # Return the array.
 560 sub parseUnicodeEscape {
 561     local $_ = shift;
 562     my $orig = $_;
 563     my @result;
 564     while (length($_)) {
 565         if (/^\\u([0-9a-f]{4})(.*)/i) {
 566             push @result, hex($1);
 567             $_ = $2;
 568         } else {
 569             die "Can't parse Unicode escape $orig\n";
 570         }
 571     }
 572     if (0 == @result) {
 573         die "Can't parse Unicode escape $orig\n";
 574     }
 575     @result;
 576 }
 577
 578 # Return 1 if the two arrays of ints are equal.
 579 # Param: ref to array of ints
 580 # Param: ref to array of ints
 581 sub intArraysEqual {
 582     my $a = shift;
 583     my $b = shift;
 584     if (scalar @$a == scalar @$b) {
 585         for (my $i=0; $i<@$a; ++$i) {
 586             if ($a->[$i] != $b->[$i]) {
 587                 return 0;
 588             }
 589         }
 590         return 1;
 591     }
 592     return 0;
 593 }
 594
 595 # Given a rule, possibly with trailing // comment,
 596 # quote the rule part and add a trailing "+" after
 597 # it.
 598 sub quoteRule {
 599     my $cmt;
 600     $cmt = $1 if (s|(\s*//.*)||); # isolate trailing // comment
 601     s/^(.*;)/\"$1\"+/;
 602     s/$/$cmt/;
 603     $_;
 604 }
 605
 606 # Given the name of the source script, name of the target script,
 607 # and array of rule strings, return a string containing the source
 608 # for a RuleBasedTransliterator file.
 609 # Param: source script name
 610 # Param: target script name
 611 # Param: ref to array of rules.  These rules are unquoted, without
 612 #  concatenators between them, but do have trailing ';' separators.
 613 # Param: name of output directory
 614 sub generateRBT {
 615     # $TOOL - name of generating tool
 616     # $DATE - date of generation
 617     # $SCRIPTFROM - name of source script
 618     # $SCRIPTTO - name of target script
 619     # $RULES - rules
 620     my ($source, $target, $rules, $outdir) = @_;
 621     my $text;
 622     $outdir =~ s|[/\\]$||; # Delete trailing / or \
 623     my $OUT = "$outdir/TransliterationRule_${source}_$target.java";
 624     open(RBT_TEMPLATE) or die;
 625     open(OUT, ">$OUT") or die;
 626     while (<RBT_TEMPLATE>) {
 627         while (/\$([A-Za-z0-9]+)/) {
 628             my $tag = $1;
 629             my $sub;
 630             if ($tag eq 'TOOL') {
 631                 $sub = $RBT_GEN_TOOL;
 632             } elsif ($tag eq 'DATE') {
 633                 $sub = localtime;
 634             } elsif ($tag eq 'SCRIPTFROM') {
 635                 $sub = $source;
 636             } elsif ($tag eq 'SCRIPTTO') {
 637                 $sub = $target;
 638             } elsif ($tag eq 'RULES') {
 639                 # Get any whitespace-only indent off the front of this tag
 640                 my $indent;
 641                 $indent = $1 if (/^(\s+)\$$tag/);
 642
 643                 # The rules in the array are not quoted.  We need to quote
 644                 # them and add '+' operators between them.  We do NOT need
 645                 # to add ';' separators.  We DO need to separate trailing
 646                 # // comments and handle them.
 647                 $sub = join("\n$indent", map(&quoteRule, @$rules)) .
 648                     "\n$indent\"\"";
 649             } else {
 650                 print STDERR "ERROR in $RBT_TEMPLATE: Unknown tag $tag\n";
 651                 $sub = "[ERROR:Unknown tag \$$tag]";
 652             }
 653             s/\$$tag/$sub/;
 654         }
 655         print OUT;
 656     }
 657     close(OUT);
 658     close(RBT_TEMPLATE);
 659     print STDERR "Written: $OUT\n";
 660 }
 661
 662 __END__