jars/icu4j-4_4_2-src/tools/misc/src/com/ibm/icu/dev/tool/translit/indic.pl

   1 #/**\r
   2 # *******************************************************************************\r
   3 # * Copyright (C) 2000-2004, International Business Machines Corporation and    *\r
   4 # * others. All Rights Reserved.                                                *\r
   5 # *******************************************************************************\r
   6 # */\r
   7 \r
   8 #!perl\r
   9 \r
  10 # Usage - $0 <remap file>\r
  11 #  e.g. - indic indic.txt\r
  12 # The input file should be a subset of the Unicode data file containing\r
  13 # the blocks of interest.\r
  14 #\r
  15 # The remap file should have lines of the form\r
  16 # "\u0D01>\u0D02;"\r
  17 # including the quotes.  These will be interpreted as saying that the\r
  18 # undefined code point U+D01 (derived via mapping from InterIndic)\r
  19 # can be remapped to U+D02.\r
  20 # \r
  21 # The purpose of this script is to process the Indic script data into\r
  22 # a form usable by the IndicTransliterator, that is, the Indic-Indic\r
  23 # transliterator.  The transliterator needs two things: A mapping of\r
  24 # the code points in common, and a list of the exceptions.\r
  25 \r
  26 # Assume we are located in icu4j/src/com/ibm/tools/translit/.\r
  27 # We want the Unicode DB in icu4j/src/data/unicode/.\r
  28 $UNICODE_DB = "../../../../data/unicode/UnicodeData.txt";\r
  29 $EXCEPTIONS_FILE = shift;\r
  30 \r
  31 # Assume we are located in icu4j/src/com/ibm/tools/translit/.\r
  32 # We want to output files to icu4j/src/com/ibm/text/resources/.\r
  33 # Output directory\r
  34 $OUTDIR = "../../text/resources";\r
  35 \r
  36 # The template file should contain java code that can be used\r
  37 # to generate RuleBasedTransliterator resource files.  The template\r
  38 # should contain the following embedded symbols, which this script\r
  39 # will replace:\r
  40 # $TOOL - name of generating tool\r
  41 # $DATE - date of generation\r
  42 # $SCRIPTFROM - name of source script\r
  43 # $SCRIPTTO - name of target script\r
  44 # $RULES - rules\r
  45 $RBT_TEMPLATE = 'rbtTemplate.txt';\r
  46 \r
  47 # Name of this tool in generated RBT files\r
  48 $RBT_GEN_TOOL = 'icu4j/src/com/ibm/tools/translit/indic.pl';\r
  49 \r
  50 $DUMP = 0; # If 1, dump out internal data\r
  51 \r
  52 $DO_HEURISTIC_REMAP = 0; # If 1, do automatic heuristic remapping\r
  53 $DO_DECOMP_REMAP = 0; # If 1, do decomp remapping\r
  54 \r
  55 open(UNICODE_DB);\r
  56 while (<UNICODE_DB>) {\r
  57     next if (m|^0[0-8]|); # Skip up to Devanagari block (0900)\r
  58     last if (m|^0D[8-F]|i); # Bail out after Malayam block (0D00)\r
  59     # 0D39;MALAYALAM LETTER HA;Lo;0;L;;;;;N;;;;;\r
  60     my @data = split(/;/);\r
  61     my $fullCode = hex($data[0]); # e.g., 0x093F\r
  62     my $code = $fullCode & 0x7F; # e.g., 0x3F\r
  63     my ($script, $name) = ($data[1] =~ /(\w+)\s+(.+)/);\r
  64     die "Can't parse $_" unless ($name);\r
  65     # e.g., $code/$script/$name = 3F/MALAYALAM/VOWEL SIGN I\r
  66 \r
  67     # Titlecase the script\r
  68     $script = ucfirst(lc($script));\r
  69 \r
  70     # Fix a couple inconsistencies in the 3.0 data\r
  71     # REVISIT: Is this okay to do?\r
  72     if ($DO_HEURISTIC_REMAP) {\r
  73         if ($script eq 'Gujarati' && $code >= 5 && $code <= 0x14) {\r
  74             $name =~ s/^VOWEL/LETTER/;\r
  75         }\r
  76     }\r
  77 \r
  78     # Keep track of all script names we encounter.  We also note the\r
  79     # base of the block.\r
  80     my $base = $fullCode & ~0x7F; # e.g., 0x900;\r
  81     if (exists $SCRIPT_TO_BASE{$script}) {\r
  82         die "Script base mismatch for $script: $base vs. $SCRIPT_TO_BASE{$script}"\r
  83             if ($SCRIPT_TO_BASE{$script} ne $base);\r
  84     } else {\r
  85         $SCRIPT_TO_BASE{$script} = $base;\r
  86     }\r
  87 \r
  88     # Build up a mapping by name.  For each name, keep a hash keyed by\r
  89     # code point.  For each code point, keep an array of script names.\r
  90     # Also keep a total use count for each name.\r
  91     push @{$NAME_CODE_TO_SCRIPTS{$name}{$code}}, $script;\r
  92     ++$NAME_CODE_TO_SCRIPTS{$name}{count};\r
  93 \r
  94     # Build a map that looks like this:\r
  95     # $SCRIPT_NAME_TO_CODE{<script>}{<name>} = <code>\r
  96     # or undef if there is no mapping.\r
  97     $SCRIPT_NAME_TO_CODE{$script}{$name} = $code;\r
  98 \r
  99     # Build a map that looks like this:\r
 100     $SCRIPT_CODE_TO_NAME{$script}{$code} = $name;\r
 101 \r
 102     # And a map from the fullCode point to the name\r
 103     $FULLCODE_TO_NAME{$fullCode} = $name;\r
 104 \r
 105     # Map code (0..7F) to name.  This is usually a 1-1 mapping, but\r
 106     # is 1-n in a few cases.\r
 107     if (exists $CODE_TO_NAME{$code}) {\r
 108         if ($name ne $CODE_TO_NAME{$code}) {\r
 109             # For multiple names on a code offset, use the format\r
 110             # (a/b), (a/b/c), etc.\r
 111             local $_ = $CODE_TO_NAME{$code};\r
 112             if (m|^\(|) {\r
 113                 if (!m|[\(\)/]$name[\(\)/]|) {\r
 114                     s|\)$|/$name\)|;\r
 115                 }\r
 116             } else {\r
 117                 $_ = "($_/$name)";\r
 118             }\r
 119             $CODE_TO_NAME{$code} = $_;\r
 120         }\r
 121     } else {\r
 122         $CODE_TO_NAME{$code} = $name;\r
 123     }\r
 124 }\r
 125 close(UNICODE_DB);\r
 126 \r
 127 # Read and parse the manual remapping file.  This contains lines\r
 128 # of the form:\r
 129 \r
 130 # |"\u0956>\u0948;"  // AI Length Mark -> Devanagari Vowel Sign AI \r
 131 \r
 132 # The left hand side contains a non-existent full code value.  It\r
 133 # should be a single value.  The right hand side contains one or more\r
 134 # real full code values.  The idea is that when a mapping from another\r
 135 # script ends up at the non-existent code point on the left, the\r
 136 # sequence on the right should be substituted.  In this example,\r
 137 # Devanagari has no AI Length Mark.  So, if transliterating from\r
 138 # Oriya, then the character 0B56 (Oriya AI Length Mark) will remap to\r
 139 # the non-existent 0956, and that remaps to 0948, our chosen\r
 140 # Devanagari equivalent.  For our purposes, the left hand side should\r
 141 # be taken to mean its equivalent point in the InterIndic range.  In\r
 142 # this example, what it really says is E056>0948 in the\r
 143 # InterIndic-Devanagari transliterator.\r
 144 \r
 145 if ($EXCEPTIONS_FILE) {\r
 146     open(EXCEPTIONS_FILE) or die;\r
 147     while (<EXCEPTIONS_FILE>) {\r
 148         if (m|^\s*\"([^\"]*?)\"|) {\r
 149             my $line = $_;\r
 150             $_ = $1;\r
 151             if (/^(.*)>(.*);$/) {\r
 152                 my ($rawFrom, $rawTo) = ($1, $2);\r
 153                 my @from = parseUnicodeEscape($rawFrom);\r
 154                 my @to = parseUnicodeEscape($rawTo);\r
 155                 my $from = hexArray(@from);\r
 156                 # Some entries look like this:\r
 157                 # |"\u0955>\u0955;"\r
 158                 # these do nothing; ignore them.\r
 159                 if (intArraysEqual(\@from, \@to)) {\r
 160                     #print STDERR "Ignoring NOOP remap of $from\n";\r
 161                 } elsif (exists $EXCEPTIONS{$from}) {\r
 162                     print STDERR "ERROR in $EXCEPTIONS_FILE - Duplicate remap entries for $from\n";\r
 163                 } elsif (scalar @from > 1) {\r
 164                     print STDERR "ERROR in $EXCEPTIONS_FILE - Ignoring multichar remap: ", hexArray(@from), "->", hexArray(@to), "\n";                    \r
 165                 } else {\r
 166                     # Check this for validity.  Full code on the left\r
 167                     # should NOT exist.  Full code seq on the right should.\r
 168                     if (exists $FULLCODE_TO_NAME{$from[0]}) {\r
 169                         print STDERR "ERROR in $EXCEPTIONS_FILE - Invalid remap; left side defined: ", hexArray(@from), "->", hexArray(@to), "\n";\r
 170                     } elsif (grep(! exists $FULLCODE_TO_NAME{$_}, @to)) {\r
 171                         print STDERR "ERROR in $EXCEPTIONS_FILE - Invalid remap; right side undefined: ", hexArray(@from), "->", hexArray(@to), "\n";\r
 172                     } else {\r
 173                         $EXCEPTIONS{$from[0]} = \@to;\r
 174                     }\r
 175                 }\r
 176             } else { die "ERROR in $EXCEPTIONS_FILE - Can't parse \"$_\" in line $line"; }\r
 177         }\r
 178     }\r
 179     close(EXCEPTIONS_FILE);\r
 180     print STDERR "$EXCEPTIONS_FILE: Loaded ", scalar keys %EXCEPTIONS, " remappings\n";\r
 181 }\r
 182 \r
 183 if ($DO_DECOMP_REMAP) {\r
 184     # Read the NamesList.txt file.  This contains decomposition data.\r
 185     # Gather these into %DECOMP, which maps a name to n1.n2..., where n1\r
 186     # etc. are decomposed names.  E.g. $DECOMP{'LETTER RRA'} -> 'LETTER\r
 187     # RA.SIGN NUKTA'.  There may be different mappings in different script\r
 188     # blocks (LETTER RRA is mapped differently in Devanagari and Bengali),\r
 189     # in which case the name goes into %DECOMP_MISMATCH, and is removed\r
 190     # from %DECOMP.\r
 191     $NAMES = "NamesList.txt";\r
 192     open(NAMES);\r
 193     while (<NAMES>) {\r
 194         # Skip to start of DEVANAGARI block\r
 195         last if (/^\@\@\s+0900/);\r
 196     }\r
 197     while (<NAMES>) {\r
 198         # Continue until start of SINHALA block\r
 199         last if (/^\@\@\s+0D80/);\r
 200         if (/^([0-9A-Z]{4})/i) {\r
 201             $code = $1;\r
 202         } elsif (/^\s+:\s*(.+)/) {\r
 203             # We've found a mapping of the form:\r
 204             # 0929    DEVANAGARI LETTER NNNA\r
 205             #     * for transcribing Dravidian alveolar n\r
 206             #     : 0928 093C\r
 207             my $from = $FULLCODE_TO_NAME{hex($code)};\r
 208             my @to = map($FULLCODE_TO_NAME{hex($_)}, split(/\s+/, $1));\r
 209             if (exists $DECOMP{$from}) {\r
 210                 my $aref = $DECOMP{$from};\r
 211                 if (join(".", @$aref) ne join(".", @to)) {\r
 212                     print STDERR "ERROR: Decomp mismatch for $from\n";\r
 213                     print STDERR "     : $from = ", join(".", @$aref), "\n";\r
 214                     print STDERR "     : $from = ", join(".", @to), "\n";\r
 215                     $DECOMP_MISMATCH{$from} = 1;\r
 216                 }\r
 217             } else {\r
 218                 $DECOMP{$from} = \@to;\r
 219             }\r
 220         }\r
 221     }\r
 222     close(NAMES);\r
 223     # Remove mismatches\r
 224     foreach (keys %DECOMP_MISMATCH) {\r
 225         delete $DECOMP{$_};\r
 226     }\r
 227     if ($DUMP) {\r
 228         foreach (keys %DECOMP) {\r
 229             print "$_ = ", join(" + ", @{$DECOMP{$_}}), "\n";\r
 230         }\r
 231     }\r
 232 }\r
 233 \r
 234 # Count the total number of scripts\r
 235 \r
 236 $SCRIPT_COUNT = scalar keys %SCRIPT_TO_BASE;\r
 237 #print join("\n", sort keys %SCRIPT_TO_BASE), "\n";\r
 238 \r
 239 # Dump out the %NAME_CODE_TO_SCRIPTS map.\r
 240 \r
 241 if ($DUMP) {\r
 242     print "\nBY NAME:\n";\r
 243     foreach my $pass ((1, 2)) {\r
 244         print "\nBY NAME - SINGLETONS:\n" if ($pass eq 2);\r
 245         foreach my $name (sort keys %NAME_CODE_TO_SCRIPTS) {\r
 246             if ($pass eq 1) {\r
 247                 next if (1 >= $NAME_CODE_TO_SCRIPTS{$name}{count});\r
 248             } else {\r
 249                 next if (1 < $NAME_CODE_TO_SCRIPTS{$name}{count});\r
 250             }\r
 251             print "$name:";\r
 252             my $href = $NAME_CODE_TO_SCRIPTS{$name};\r
 253             foreach my $code (sort {$a <=> $b} keys %$href) {\r
 254                 next if ($code eq 'count');\r
 255                 my $aref = $href->{$code};\r
 256                 print " ", hex2($code), " (", formatScriptList($aref), ")";\r
 257             }\r
 258             print "\n";\r
 259         }\r
 260     }\r
 261 }\r
 262 \r
 263 # Create some transliterators, based on the scripts and the %NAME_CODE_TO_SCRIPTS\r
 264 # map.  Only use %NAME_CODE_TO_SCRIPTS entries with a count of 2 or more, that is,\r
 265 # names that occur in two or more scripts.  For those scripts where\r
 266 # the names occur, map both up to the InterIndic range, and down to\r
 267 # the target script.\r
 268 \r
 269 $INTERINDIC = 0xE000;\r
 270 $INTERINDIC_EXTRA = 0xE080;\r
 271 $INTERINDIC_EXTRA_NEXT = $INTERINDIC_EXTRA;\r
 272 \r
 273 # For each script, create a hash.  The hash has a key for each\r
 274 # code point, either within its block, or in the InterIndic block.\r
 275 # the value of the key is the mapping.\r
 276 \r
 277 # The script hashes are named %DEVANAGARI, etc., and referenced\r
 278 # with symbolic refs.\r
 279 \r
 280 @REMAP = ('s/\bSHORT\s+//i',\r
 281           's/\bCANDRA\s+//i',\r
 282           's/\bQA$/KA/i',\r
 283           's/\bKHHA$/KHA/i',\r
 284           's/\bGHHA$/GA/i',\r
 285           's/\bZA$/JA/i',\r
 286           's/\bFA$/PHA/i',\r
 287           's/\bVA$/BA/i',\r
 288           's/\bNNNA$/NA/i',\r
 289           's/\bRRA$/RA/i',\r
 290           's/\bLLLA$/LLA/i',\r
 291           's/\bLLLA$/LA/i',\r
 292           's/\bLLA$/LA/i',\r
 293           's/^A(.) LENGTH MARK$/VOWEL SIGN A$1/i',\r
 294           's/CANDRABINDU/BINDI/i',\r
 295           's/BINDI/CANDRABINDU/i',\r
 296           );\r
 297 \r
 298 # Do this so we see zero counts:\r
 299 foreach my $remap (@REMAP) { $REMAP{$remap} = 0; }\r
 300 \r
 301 # This loop iterates over the names in the NAME_CODE_TO_SCRIPTS hash.\r
 302 # These names are things like "LETTER NNNA".  For each name, it then\r
 303 # creates script mappings up to the InterIndic area, and back down\r
 304 # to the script areas.  If a name maps to more than one offset,\r
 305 # then it uses the InterIndic extra range.  Either way, it picks\r
 306 # a single InterIndic point, either an offset point or something in\r
 307 # the extra range, and maps up and down from that point.\r
 308 foreach my $name (sort keys %NAME_CODE_TO_SCRIPTS) {\r
 309     next if (1 >= $NAME_CODE_TO_SCRIPTS{$name}{count});\r
 310     my $href = $NAME_CODE_TO_SCRIPTS{$name};\r
 311     # Count the number of different codes assigned to this name.\r
 312     # Usually 1, but 2 for a handful of names.\r
 313     my $codeCount = (keys %{$NAME_CODE_TO_SCRIPTS{$name}}) - 1; # less 1: {count}\r
 314     # If $codeCount is 1, then map directly up to the $INTERINDIC\r
 315     # base.  If $codeCount is 2, then map into unused spots starting\r
 316     # at $INTERINDIC_EXTRA.\r
 317     my $interIndicCode;\r
 318     if ($codeCount > 1) {\r
 319         # Map into the InterIndic extra range\r
 320         $interIndicCode = $INTERINDIC_EXTRA_NEXT++;\r
 321     }\r
 322     my %seen;\r
 323     foreach my $code (sort {$a ne 'count' && $b ne 'count' && $a <=> $b} keys %$href) {\r
 324         next if ($code eq 'count');\r
 325         my $aref = $href->{$code}; # Ref to array of scripts\r
 326         if ($codeCount == 1) {\r
 327             # Map directly\r
 328             $interIndicCode = $INTERINDIC + $code;\r
 329         }\r
 330         # Keep track of the names of the extra InterIndic points\r
 331         $INTERINDIC_NAME_TO_FULLCODE{$name} = $interIndicCode;\r
 332 \r
 333         foreach my $scr (@$aref) {\r
 334             $seen{$scr} = 1;\r
 335             my $fullCode = $SCRIPT_TO_BASE{$scr} + $code;\r
 336             $ {$scr}{$fullCode} = hex4($interIndicCode) . "; // $name";\r
 337             $ {$scr}{$interIndicCode} = hex4($fullCode) . "; // $name";\r
 338         }\r
 339     }\r
 340     # Now handle InterIndic->Script unmapped points.  For each name,\r
 341     # some of the scripts will be left out -- will have no mappings\r
 342     # to that name.  For these scripts, we can either leave them\r
 343     # unmapped (so the InterIndic->Local mapping is empty), or\r
 344     # try to remap.\r
 345  unmappedScript:\r
 346     foreach my $scr (keys %SCRIPT_TO_BASE) {\r
 347         next if ($seen{$scr});\r
 348 \r
 349         if ($DO_HEURISTIC_REMAP) {\r
 350             # Try to remap through the known equivalences in our\r
 351             # remapping table\r
 352             foreach my $remapRE (@REMAP) {\r
 353                 local $_ = $name;\r
 354                 if (eval($remapRE)) {\r
 355                     if (exists $SCRIPT_NAME_TO_CODE{$scr}{$_}) {\r
 356                         $ {$scr}{$interIndicCode} =\r
 357                             hex4($SCRIPT_TO_BASE{$scr} + $SCRIPT_NAME_TO_CODE{$scr}{$_}) .\r
 358                                 "; // REMAP: $name -> $_";\r
 359                         ++$REMAP{$remapRE};\r
 360                         next unmappedScript;\r
 361                     }\r
 362                 }\r
 363             }\r
 364         }\r
 365 \r
 366         # Try to remap through the file.  This contains remappings of\r
 367         # the form 0991->0993.  That is, it contains local remappings\r
 368         # that we can substitute and try again with.\r
 369         #|GURMUKHI-InterIndic ------------------------------\r
 370         #|// 0A02>; // UNMAPPED INTERNAL: SIGN BINDI\r
 371         #|InterIndic-GURMUKHI ------------------------------\r
 372         #|// E001>; // UNMAPPED EXTERNAL: SIGN CANDRABINDU\r
 373         #|"\u0A01>\u0A02;"\r
 374         # In this example, the remapping tells us that the non-existent\r
 375         # character A01 should be considered equivalent to the real\r
 376         # character A02.\r
 377         # We implement this by adding two mappings; one from\r
 378         # the InterIndic equivalent of A01, that is, E001, to A02,\r
 379         # and one from A02, which otherwise has no mapping, to E001.\r
 380         if ($EXCEPTIONS_FILE && $interIndicCode < $INTERINDIC_EXTRA) {\r
 381             # Try to map this InterIndic character back to a the spot\r
 382             # it would occupy in this script if it had a mapping.\r
 383             my $code = $interIndicCode & 0x7F;\r
 384             my $pseudoFullCode = $SCRIPT_TO_BASE{$scr} + $code;\r
 385             if (exists $EXCEPTIONS{$pseudoFullCode}) {\r
 386                 my $fullCodeArray = $EXCEPTIONS{$pseudoFullCode};\r
 387                 my $comment;\r
 388                 foreach my $c (@$fullCodeArray) {\r
 389                     $comment .= "." if ($comment);\r
 390                     $comment .= $FULLCODE_TO_NAME{$c};\r
 391                 }\r
 392                 $comment = "; // REMAP ($EXCEPTIONS_FILE): " .\r
 393                     hex4($pseudoFullCode) . ">" . hexArray(@$fullCodeArray) . " = " .\r
 394                     $CODE_TO_NAME{$code} . ">" . $comment;\r
 395                 $ {$scr}{$interIndicCode} = hexArray(@$fullCodeArray) . $comment;\r
 396                 if (scalar @$fullCodeArray == 1) {\r
 397                     if (exists $ {$scr}{$fullCodeArray->[0]}) {\r
 398                         # There's already a proper mapping; no need to fill\r
 399                         # in reverse\r
 400                     } else {\r
 401                         $ {$scr}{$fullCodeArray->[0]} = hex4($interIndicCode) . $comment;\r
 402                     }\r
 403                 }\r
 404                 next unmappedScript;\r
 405             }\r
 406         }\r
 407 \r
 408         $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$interIndicCode} = 1;\r
 409         local $_ = "; // UNMAPPED InterIndic-$scr: $name";\r
 410         if (exists $SCRIPT_CODE_TO_NAME{$scr}{$interIndicCode & 0x7F}) {\r
 411             my $fullCode = $SCRIPT_TO_BASE{$scr} + ($interIndicCode & 0x7F);\r
 412             $_ .= " (" . hex4($fullCode) . " = " . $FULLCODE_TO_NAME{$fullCode} . ")";\r
 413         }\r
 414         $ {$scr}{$interIndicCode} = $_;\r
 415     }\r
 416 }\r
 417 \r
 418 # Add in unmapped entries for each script\r
 419 foreach my $scr (keys %SCRIPT_TO_BASE) {\r
 420     my $base = $SCRIPT_TO_BASE{$scr};\r
 421  unmappedInt:\r
 422     foreach my $code (keys %{$SCRIPT_CODE_TO_NAME{$scr}}) {\r
 423         my $fullCode = $code + $base;\r
 424         next if (exists $ {$scr}{$fullCode});\r
 425         my $name = $SCRIPT_CODE_TO_NAME{$scr}{$code};\r
 426 \r
 427         if ($DO_HEURISTIC_REMAP) {\r
 428             foreach my $remapRE (@REMAP) {\r
 429                 local $_ = $name;\r
 430                 if (eval($remapRE)) {\r
 431                     if (exists $INTERINDIC_NAME_TO_FULLCODE{$_}) {\r
 432                         $ {$scr}{$fullCode} =\r
 433                             hex4($INTERINDIC_NAME_TO_FULLCODE{$_}) .\r
 434                                 "; // REMAP: $name -> $_";\r
 435                         ++$REMAP{$remapRE};\r
 436                         next unmappedInt;\r
 437                     }\r
 438                 }\r
 439             }\r
 440         }\r
 441 \r
 442         # Now try the decomp table\r
 443         if ($DO_DECOMP_REMAP && exists $DECOMP{$name}) {\r
 444             my $x;\r
 445             my $cmt = "; // DECOMP: $name -> ";\r
 446             foreach my $n (@{$DECOMP{$name}}) {\r
 447                 if (exists $SCRIPT_NAME_TO_CODE{$scr}{$n}) {\r
 448                     $x .= hex4($SCRIPT_TO_BASE{$scr} + $SCRIPT_NAME_TO_CODE{$scr}{$n});\r
 449                     $cmt .= $n . " + ";\r
 450                 } else {\r
 451                     $cmt = 0;\r
 452                     last;\r
 453                 }\r
 454             }\r
 455             if ($cmt) {\r
 456                 $ {$scr}{$fullCode} = $x . $cmt;\r
 457                 next unmappedInt;\r
 458             }\r
 459         }\r
 460 \r
 461         $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$fullCode} = 1;\r
 462         $ {$scr}{$fullCode} = "; // UNMAPPED $scr-InterIndic: $name";\r
 463     }\r
 464 }\r
 465 \r
 466 # GUR\r
 467 # E00B>; // UNMAPPED EXTERNAL: LETTER VOCALIC R "\u0A0B>\u0A30\u0A3F;"\r
 468 # E00C>; // UNMAPPED EXTERNAL: LETTER VOCALIC L "\u0A0C>\u0A07;"\r
 469 # E00D>; // UNMAPPED EXTERNAL: LETTER CANDRA E "\u0A0D>\u0A10;"\r
 470 # E011>; // UNMAPPED EXTERNAL: LETTER CANDRA O "\u0A11>\u0A14;"\r
 471 # E037>; // UNMAPPED EXTERNAL: LETTER SSA "\u0A37>\u0A36;"\r
 472 # E045>; // UNMAPPED EXTERNAL: VOWEL SIGN CANDRA E "\u0A45>\u0A48;"\r
 473 # E049>; // UNMAPPED EXTERNAL: VOWEL SIGN CANDRA O "\u0A49>\u0A4C;"\r
 474 # Fix QA too\r
 475 \r
 476 # Dump out script maps\r
 477 foreach my $scr (sort keys %SCRIPT_TO_BASE) {\r
 478     ## next unless ($scr eq 'TELUGU'); # Debugging\r
 479     my @rules;\r
 480     my $flag = 1;\r
 481     foreach my $fullCode (sort {$a <=> $b} keys %{$scr}) {\r
 482         if ($flag && $fullCode >= $INTERINDIC) {\r
 483             # We have the complete <scr>-InterIndic rules; dump\r
 484             # them out.\r
 485             generateRBT($scr, "InterIndic", \@rules, $OUTDIR);\r
 486             @rules = ();\r
 487             $flag = 0;\r
 488         }\r
 489         if (exists $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$fullCode}) {\r
 490             push @rules, "// " . hex4($fullCode) . ">" . $ {$scr}{$fullCode};\r
 491         } else {\r
 492             push @rules, hex4($fullCode) . ">" . $ {$scr}{$fullCode};\r
 493         }\r
 494     }\r
 495     # Now generate the InterIndic-<scr> rules.\r
 496     generateRBT("InterIndic", $scr, \@rules, $OUTDIR);\r
 497 \r
 498 #    print "$scr-InterIndic ------------------------------\n";\r
 499 #    my $flag = 1;\r
 500 #    foreach my $fullCode (sort {$a <=> $b} keys %{$scr}) {\r
 501 #        if ($flag && $fullCode >= $INTERINDIC) {\r
 502 #            print "InterIndic-$scr ------------------------------\n";\r
 503 #            $flag = 0;\r
 504 #        }\r
 505 #        if (exists $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$fullCode}) {\r
 506 #            print "// ", hex4($fullCode), ">", $ {$scr}{$fullCode}, "\n";\r
 507 #        } else {\r
 508 #            print hex4($fullCode), ">", $ {$scr}{$fullCode}, "\n";\r
 509 #        }\r
 510 #    }\r
 511 }\r
 512 \r
 513 # List successful remappings\r
 514 if ($DO_HEURISTIC_REMAP) {\r
 515     foreach my $remap (sort keys %REMAP) {\r
 516         print STDERR "REMAP ", $REMAP{$remap}, " x $remap\n";\r
 517     }\r
 518 }\r
 519 \r
 520 #----------------------------------------------------------------------\r
 521 # SUBROUTINES\r
 522 \r
 523 # Return a listing of an array of scripts\r
 524 # Param: array ref\r
 525 sub formatScriptList {\r
 526     my $aref = shift;\r
 527     if ($SCRIPT_COUNT == @$aref) {\r
 528         return "all";\r
 529     } elsif (($SCRIPT_COUNT - 3) <= @$aref) {\r
 530         my $s = "all but";\r
 531         my %temp;\r
 532         foreach (@$aref) { $temp{$_} = 1; }\r
 533         foreach (sort keys %SCRIPT_TO_BASE) {\r
 534             $s .= " $_" unless exists $temp{$_};\r
 535         }\r
 536         return $s;\r
 537     } else {\r
 538         return join(" ", @$aref);\r
 539     }\r
 540 }\r
 541 \r
 542 # Format as %02X hex\r
 543 sub hex2 {\r
 544     sprintf("%02X", $_[0]);\r
 545 }\r
 546 \r
 547 # Format as %04X hex\r
 548 sub hex4 {\r
 549     sprintf("\\u%04X", $_[0]);\r
 550 }\r
 551 \r
 552 # Format an array as %04X hex, delimited by "."s\r
 553 sub hexArray {\r
 554     join("", map { hex4($_); } @_);\r
 555 }\r
 556 \r
 557 # Parse a string of the form "\u0D01" to an array of integers.\r
 558 # Must ONLY contain escapes.\r
 559 # Return the array.\r
 560 sub parseUnicodeEscape {\r
 561     local $_ = shift;\r
 562     my $orig = $_;\r
 563     my @result;\r
 564     while (length($_)) {\r
 565         if (/^\\u([0-9a-f]{4})(.*)/i) {\r
 566             push @result, hex($1);\r
 567             $_ = $2;\r
 568         } else {\r
 569             die "Can't parse Unicode escape $orig\n";\r
 570         }\r
 571     }\r
 572     if (0 == @result) {\r
 573         die "Can't parse Unicode escape $orig\n";        \r
 574     }\r
 575     @result;\r
 576 }\r
 577 \r
 578 # Return 1 if the two arrays of ints are equal.\r
 579 # Param: ref to array of ints\r
 580 # Param: ref to array of ints\r
 581 sub intArraysEqual {\r
 582     my $a = shift;\r
 583     my $b = shift;\r
 584     if (scalar @$a == scalar @$b) {\r
 585         for (my $i=0; $i<@$a; ++$i) {\r
 586             if ($a->[$i] != $b->[$i]) {\r
 587                 return 0;\r
 588             }\r
 589         }\r
 590         return 1;\r
 591     }\r
 592     return 0;\r
 593 }\r
 594 \r
 595 # Given a rule, possibly with trailing // comment,\r
 596 # quote the rule part and add a trailing "+" after\r
 597 # it.\r
 598 sub quoteRule {\r
 599     my $cmt;\r
 600     $cmt = $1 if (s|(\s*//.*)||); # isolate trailing // comment\r
 601     s/^(.*;)/\"$1\"+/;\r
 602     s/$/$cmt/;\r
 603     $_;\r
 604 }\r
 605 \r
 606 # Given the name of the source script, name of the target script,\r
 607 # and array of rule strings, return a string containing the source\r
 608 # for a RuleBasedTransliterator file.\r
 609 # Param: source script name\r
 610 # Param: target script name\r
 611 # Param: ref to array of rules.  These rules are unquoted, without\r
 612 #  concatenators between them, but do have trailing ';' separators.\r
 613 # Param: name of output directory\r
 614 sub generateRBT {\r
 615     # $TOOL - name of generating tool\r
 616     # $DATE - date of generation\r
 617     # $SCRIPTFROM - name of source script\r
 618     # $SCRIPTTO - name of target script\r
 619     # $RULES - rules\r
 620     my ($source, $target, $rules, $outdir) = @_;\r
 621     my $text;\r
 622     $outdir =~ s|[/\\]$||; # Delete trailing / or \\r
 623     my $OUT = "$outdir/TransliterationRule_${source}_$target.java";\r
 624     open(RBT_TEMPLATE) or die;\r
 625     open(OUT, ">$OUT") or die;\r
 626     while (<RBT_TEMPLATE>) {\r
 627         while (/\$([A-Za-z0-9]+)/) {\r
 628             my $tag = $1;\r
 629             my $sub;\r
 630             if ($tag eq 'TOOL') {\r
 631                 $sub = $RBT_GEN_TOOL;\r
 632             } elsif ($tag eq 'DATE') {\r
 633                 $sub = localtime;\r
 634             } elsif ($tag eq 'SCRIPTFROM') {\r
 635                 $sub = $source;\r
 636             } elsif ($tag eq 'SCRIPTTO') {\r
 637                 $sub = $target;\r
 638             } elsif ($tag eq 'RULES') {\r
 639                 # Get any whitespace-only indent off the front of this tag\r
 640                 my $indent;\r
 641                 $indent = $1 if (/^(\s+)\$$tag/);\r
 642 \r
 643                 # The rules in the array are not quoted.  We need to quote\r
 644                 # them and add '+' operators between them.  We do NOT need\r
 645                 # to add ';' separators.  We DO need to separate trailing\r
 646                 # // comments and handle them.\r
 647                 $sub = join("\n$indent", map(&quoteRule, @$rules)) .\r
 648                     "\n$indent\"\"";\r
 649             } else {\r
 650                 print STDERR "ERROR in $RBT_TEMPLATE: Unknown tag $tag\n";\r
 651                 $sub = "[ERROR:Unknown tag \$$tag]";\r
 652             }\r
 653             s/\$$tag/$sub/;\r
 654         }\r
 655         print OUT;\r
 656     }\r
 657     close(OUT);\r
 658     close(RBT_TEMPLATE);\r
 659     print STDERR "Written: $OUT\n";\r
 660 }\r
 661 \r
 662 __END__\r