2 # *******************************************************************************
3 # * Copyright (C) 2000-2004, International Business Machines Corporation and *
4 # * others. All Rights Reserved. *
5 # *******************************************************************************
10 # Usage - $0 <remap file>
11 # e.g. - indic indic.txt
12 # The input file should be a subset of the Unicode data file containing
13 # the blocks of interest.
15 # The remap file should have lines of the form "\u0D01>\u0D02;",
17 # including the quotes. These will be interpreted as saying that the
18 # undefined code point U+D01 (derived via mapping from InterIndic)
19 # can be remapped to U+D02.
21 # The purpose of this script is to process the Indic script data into
22 # a form usable by the IndicTransliterator, that is, the Indic-Indic
23 # transliterator. The transliterator needs two things: A mapping of
24 # the code points in common, and a list of the exceptions.
26 # Assume we are located in icu4j/src/com/ibm/tools/translit/.
27 # We want the Unicode DB in icu4j/src/data/unicode/.
28 $UNICODE_DB = "../../../../data/unicode/UnicodeData.txt";
29 $EXCEPTIONS_FILE = shift;
31 # Assume we are located in icu4j/src/com/ibm/tools/translit/.
32 # We want to output files to icu4j/src/com/ibm/text/resources/.
34 $OUTDIR = "../../text/resources";
36 # The template file should contain java code that can be used
37 # to generate RuleBasedTransliterator resource files. The template
38 # should contain the following embedded symbols, which this script replaces:
40 # $TOOL - name of generating tool
41 # $DATE - date of generation
42 # $SCRIPTFROM - name of source script
43 # $SCRIPTTO - name of target script
45 $RBT_TEMPLATE = 'rbtTemplate.txt';
47 # Name of this tool in generated RBT files
48 $RBT_GEN_TOOL = 'icu4j/src/com/ibm/tools/translit/indic.pl';
50 $DUMP = 0; # If 1, dump out internal data
52 $DO_HEURISTIC_REMAP = 0; # If 1, do automatic heuristic remapping
53 $DO_DECOMP_REMAP = 0; # If 1, do decomp remapping
56 while (<UNICODE_DB>) {
57 next if (m|^0[0-8]|); # Skip everything below the Devanagari block (0900)
58 last if (m|^0D[8-F]|i); # Bail out at 0D80, just past the Malayalam block (0D00)
59 # 0D39;MALAYALAM LETTER HA;Lo;0;L;;;;;N;;;;;
60 my @data = split(/;/); # Semicolon-separated fields of the current line ($_)
61 my $fullCode = hex($data[0]); # e.g., 0x0D3F
62 my $code = $fullCode & 0x7F; # Low 7 bits only, e.g., 0x3F
63 my ($script, $name) = ($data[1] =~ /(\w+)\s+(.+)/); # First word is the script; the rest is the name
64 die "Can't parse $_" unless ($name);
65 # e.g., $code/$script/$name = 3F/MALAYALAM/VOWEL SIGN I
67 # Titlecase the script
68 $script = ucfirst(lc($script));
70 # Fix a couple inconsistencies in the 3.0 data
71 # REVISIT: Is this okay to do?
72 if ($DO_HEURISTIC_REMAP) {
73 if ($script eq 'Gujarati' && $code >= 5 && $code <= 0x14) {
74 $name =~ s/^VOWEL/LETTER/;
78 # Keep track of all script names we encounter, and note each script's block base.
80 my $base = $fullCode & ~0x7F; # e.g., 0x900;
81 if (exists $SCRIPT_TO_BASE{$script}) {
82 die "Script base mismatch for $script: $base vs. $SCRIPT_TO_BASE{$script}"
83 if ($SCRIPT_TO_BASE{$script} ne $base);
85 $SCRIPT_TO_BASE{$script} = $base;
88 # Build up a mapping by name. For each name, keep a hash keyed by
89 # code point. For each code point, keep an array of script names.
90 # Also keep a total use count for each name.
91 push @{$NAME_CODE_TO_SCRIPTS{$name}{$code}}, $script;
92 ++$NAME_CODE_TO_SCRIPTS{$name}{count};
94 # Build a map that looks like this:
95 # $SCRIPT_NAME_TO_CODE{<script>}{<name>} = <code>
96 # or undef if there is no mapping.
97 $SCRIPT_NAME_TO_CODE{$script}{$name} = $code;
99 # Build a map that looks like this:
100 $SCRIPT_CODE_TO_NAME{$script}{$code} = $name;
102 # And a map from the fullCode point to the name
103 $FULLCODE_TO_NAME{$fullCode} = $name;
105 # Map code (0..7F) to name. This is usually a 1-1 mapping, but
106 # is 1-n in a few cases.
107 if (exists $CODE_TO_NAME{$code}) {
108 if ($name ne $CODE_TO_NAME{$code}) {
109 # For multiple names on a code offset, use the format
110 # (a/b), (a/b/c), etc.
111 local $_ = $CODE_TO_NAME{$code};
113 if (!m|[\(\)/]$name[\(\)/]|) {
119 $CODE_TO_NAME{$code} = $_;
122 $CODE_TO_NAME{$code} = $name;
127 # Read and parse the manual remapping file. This contains lines like:
130 # |"\u0956>\u0948;" // AI Length Mark -> Devanagari Vowel Sign AI
132 # The left hand side contains a non-existent full code value. It
133 # should be a single value. The right hand side contains one or more
134 # real full code values. The idea is that when a mapping from another
135 # script ends up at the non-existent code point on the left, the
136 # sequence on the right should be substituted. In this example,
137 # Devanagari has no AI Length Mark. So, if transliterating from
138 # Oriya, then the character 0B56 (Oriya AI Length Mark) will remap to
139 # the non-existent 0956, and that remaps to 0948, our chosen
140 # Devanagari equivalent. For our purposes, the left hand side should
141 # be taken to mean its equivalent point in the InterIndic range. In
142 # this example, what it really says is E056>0948 in the
143 # InterIndic-Devanagari transliterator.
145 if ($EXCEPTIONS_FILE) {
146 open(EXCEPTIONS_FILE) or die;
147 while (<EXCEPTIONS_FILE>) {
148 if (m|^\s*\"([^\"]*?)\"|) {
151 if (/^(.*)>(.*);$/) {
152 my ($rawFrom, $rawTo) = ($1, $2);
153 my @from = parseUnicodeEscape($rawFrom);
154 my @to = parseUnicodeEscape($rawTo);
155 my $from = hexArray(@from);
156 # Some entries map a code point sequence to itself (left side equals right side);
158 # these do nothing; ignore them.
159 if (intArraysEqual(\@from, \@to)) {
160 #print STDERR "Ignoring NOOP remap of $from\n";
161 } elsif (exists $EXCEPTIONS{$from}) {
162 print STDERR "ERROR in $EXCEPTIONS_FILE - Duplicate remap entries for $from\n";
163 } elsif (scalar @from > 1) {
164 print STDERR "ERROR in $EXCEPTIONS_FILE - Ignoring multichar remap: ", hexArray(@from), "->", hexArray(@to), "\n";
166 # Check this for validity. Full code on the left
167 # should NOT exist. Full code seq on the right should.
168 if (exists $FULLCODE_TO_NAME{$from[0]}) {
169 print STDERR "ERROR in $EXCEPTIONS_FILE - Invalid remap; left side defined: ", hexArray(@from), "->", hexArray(@to), "\n";
170 } elsif (grep(! exists $FULLCODE_TO_NAME{$_}, @to)) {
171 print STDERR "ERROR in $EXCEPTIONS_FILE - Invalid remap; right side undefined: ", hexArray(@from), "->", hexArray(@to), "\n";
173 $EXCEPTIONS{$from[0]} = \@to;
176 } else { die "ERROR in $EXCEPTIONS_FILE - Can't parse \"$_\" in line $line"; }
179 close(EXCEPTIONS_FILE);
180 print STDERR "$EXCEPTIONS_FILE: Loaded ", scalar keys %EXCEPTIONS, " remappings\n";
183 if ($DO_DECOMP_REMAP) {
184 # Read the NamesList.txt file. This contains decomposition data.
185 # Gather these into %DECOMP, which maps a name to n1.n2..., where n1
186 # etc. are decomposed names. E.g. $DECOMP{'LETTER RRA'} -> 'LETTER
187 # RA.SIGN NUKTA'. There may be different mappings in different script
188 # blocks (LETTER RRA is mapped differently in Devanagari and Bengali),
189 # in which case the name goes into %DECOMP_MISMATCH, and is removed
191 $NAMES = "NamesList.txt";
194 # Skip to start of DEVANAGARI block
195 last if (/^\@\@\s+0900/);
198 # Continue until start of SINHALA block
199 last if (/^\@\@\s+0D80/);
200 if (/^([0-9A-Z]{4})/i) {
202 } elsif (/^\s+:\s*(.+)/) {
203 # We've found a mapping of the form:
204 # 0929 DEVANAGARI LETTER NNNA
205 # * for transcribing Dravidian alveolar n
207 my $from = $FULLCODE_TO_NAME{hex($code)};
208 my @to = map($FULLCODE_TO_NAME{hex($_)}, split(/\s+/, $1));
209 if (exists $DECOMP{$from}) {
210 my $aref = $DECOMP{$from};
211 if (join(".", @$aref) ne join(".", @to)) {
212 print STDERR "ERROR: Decomp mismatch for $from\n";
213 print STDERR " : $from = ", join(".", @$aref), "\n";
214 print STDERR " : $from = ", join(".", @to), "\n";
215 $DECOMP_MISMATCH{$from} = 1;
218 $DECOMP{$from} = \@to;
224 foreach (keys %DECOMP_MISMATCH) {
228 foreach (keys %DECOMP) {
229 print "$_ = ", join(" + ", @{$DECOMP{$_}}), "\n";
234 # Count the total number of scripts
236 $SCRIPT_COUNT = scalar keys %SCRIPT_TO_BASE;
237 #print join("\n", sort keys %SCRIPT_TO_BASE), "\n";
239 # Dump out the %NAME_CODE_TO_SCRIPTS map.
242 print "\nBY NAME:\n";
243 foreach my $pass ((1, 2)) {
244 print "\nBY NAME - SINGLETONS:\n" if ($pass eq 2);
245 foreach my $name (sort keys %NAME_CODE_TO_SCRIPTS) {
247 next if (1 >= $NAME_CODE_TO_SCRIPTS{$name}{count});
249 next if (1 < $NAME_CODE_TO_SCRIPTS{$name}{count});
252 my $href = $NAME_CODE_TO_SCRIPTS{$name};
253 foreach my $code (sort {$a <=> $b} keys %$href) {
254 next if ($code eq 'count');
255 my $aref = $href->{$code};
256 print " ", hex2($code), " (", formatScriptList($aref), ")";
263 # Create some transliterators, based on the scripts and the %NAME_CODE_TO_SCRIPTS
264 # map. Only use %NAME_CODE_TO_SCRIPTS entries with a count of 2 or more, that is,
265 # names that occur in two or more scripts. For those scripts where
266 # the names occur, map both up to the InterIndic range, and back down to the script ranges.
269 $INTERINDIC = 0xE000;
270 $INTERINDIC_EXTRA = 0xE080;
271 $INTERINDIC_EXTRA_NEXT = $INTERINDIC_EXTRA;
273 # For each script, create a hash. The hash has a key for each
274 # code point, either within its block, or in the InterIndic block.
275 # the value of the key is the mapping.
277 # The script hashes are named %DEVANAGARI, etc., and referenced
278 # with symbolic refs.
280 @REMAP = ('s/\bSHORT\s+//i',
293 's/^A(.) LENGTH MARK$/VOWEL SIGN A$1/i',
294 's/CANDRABINDU/BINDI/i',
295 's/BINDI/CANDRABINDU/i',
298 # Do this so we see zero counts:
299 foreach my $remap (@REMAP) { $REMAP{$remap} = 0; }
301 # This loop iterates over the names in the NAME_CODE_TO_SCRIPTS hash.
302 # These names are things like "LETTER NNNA". For each name, it then
303 # creates script mappings up to the InterIndic area, and back down
304 # to the script areas. If a name maps to more than one offset,
305 # then it uses the InterIndic extra range. Either way, it picks
306 # a single InterIndic point, either an offset point or something in
307 # the extra range, and maps up and down from that point.
308 foreach my $name (sort keys %NAME_CODE_TO_SCRIPTS) {
309 next if (1 >= $NAME_CODE_TO_SCRIPTS{$name}{count});
310 my $href = $NAME_CODE_TO_SCRIPTS{$name};
311 # Count the number of different codes assigned to this name.
312 # Usually 1, but 2 for a handful of names.
313 my $codeCount = (keys %{$NAME_CODE_TO_SCRIPTS{$name}}) - 1; # less 1: {count}
314 # If $codeCount is 1, then map directly up to the $INTERINDIC
315 # base. If $codeCount is 2, then map into unused spots starting
316 # at $INTERINDIC_EXTRA.
318 if ($codeCount > 1) {
319 # Map into the InterIndic extra range
320 $interIndicCode = $INTERINDIC_EXTRA_NEXT++;
323 foreach my $code (sort {$a ne 'count' && $b ne 'count' && $a <=> $b} keys %$href) {
324 next if ($code eq 'count');
325 my $aref = $href->{$code}; # Ref to array of scripts
326 if ($codeCount == 1) {
328 $interIndicCode = $INTERINDIC + $code;
330 # Keep track of the names of the extra InterIndic points
331 $INTERINDIC_NAME_TO_FULLCODE{$name} = $interIndicCode;
333 foreach my $scr (@$aref) {
335 my $fullCode = $SCRIPT_TO_BASE{$scr} + $code;
336 $ {$scr}{$fullCode} = hex4($interIndicCode) . "; // $name";
337 $ {$scr}{$interIndicCode} = hex4($fullCode) . "; // $name";
340 # Now handle InterIndic->Script unmapped points. For each name,
341 # some of the scripts will be left out -- will have no mappings
342 # to that name. For these scripts, we can either leave them
343 # unmapped (so the InterIndic->Local mapping is empty), or try to remap them as below.
346 foreach my $scr (keys %SCRIPT_TO_BASE) {
347 next if ($seen{$scr});
349 if ($DO_HEURISTIC_REMAP) {
350 # Try to remap through the known equivalences in our @REMAP list.
352 foreach my $remapRE (@REMAP) {
354 if (eval($remapRE)) {
355 if (exists $SCRIPT_NAME_TO_CODE{$scr}{$_}) {
356 $ {$scr}{$interIndicCode} =
357 hex4($SCRIPT_TO_BASE{$scr} + $SCRIPT_NAME_TO_CODE{$scr}{$_}) .
358 "; // REMAP: $name -> $_";
366 # Try to remap through the file. This contains remappings of
367 # the form 0991->0993. That is, it contains local remappings
368 # that we can substitute and try again with.
369 #|GURMUKHI-InterIndic ------------------------------
370 #|// 0A02>; // UNMAPPED INTERNAL: SIGN BINDI
371 #|InterIndic-GURMUKHI ------------------------------
372 #|// E001>; // UNMAPPED EXTERNAL: SIGN CANDRABINDU
374 # In this example, the remapping tells us that the non-existent
375 # character A01 should be considered equivalent to the real character A02.
377 # We implement this by adding two mappings; one from
378 # the InterIndic equivalent of A01, that is, E001, to A02,
379 # and one from A02, which otherwise has no mapping, to E001.
380 if ($EXCEPTIONS_FILE && $interIndicCode < $INTERINDIC_EXTRA) {
381 # Try to map this InterIndic character back to the spot
382 # it would occupy in this script if it had a mapping.
383 my $code = $interIndicCode & 0x7F;
384 my $pseudoFullCode = $SCRIPT_TO_BASE{$scr} + $code;
385 if (exists $EXCEPTIONS{$pseudoFullCode}) {
386 my $fullCodeArray = $EXCEPTIONS{$pseudoFullCode};
388 foreach my $c (@$fullCodeArray) {
389 $comment .= "." if ($comment);
390 $comment .= $FULLCODE_TO_NAME{$c};
392 $comment = "; // REMAP ($EXCEPTIONS_FILE): " .
393 hex4($pseudoFullCode) . ">" . hexArray(@$fullCodeArray) . " = " .
394 $CODE_TO_NAME{$code} . ">" . $comment;
395 $ {$scr}{$interIndicCode} = hexArray(@$fullCodeArray) . $comment;
396 if (scalar @$fullCodeArray == 1) {
397 if (exists $ {$scr}{$fullCodeArray->[0]}) {
398 # There's already a proper mapping; no need to fill it in.
401 $ {$scr}{$fullCodeArray->[0]} = hex4($interIndicCode) . $comment;
408 $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$interIndicCode} = 1;
409 local $_ = "; // UNMAPPED InterIndic-$scr: $name";
410 if (exists $SCRIPT_CODE_TO_NAME{$scr}{$interIndicCode & 0x7F}) {
411 my $fullCode = $SCRIPT_TO_BASE{$scr} + ($interIndicCode & 0x7F);
412 $_ .= " (" . hex4($fullCode) . " = " . $FULLCODE_TO_NAME{$fullCode} . ")";
414 $ {$scr}{$interIndicCode} = $_;
418 # Add in unmapped entries for each script
419 foreach my $scr (keys %SCRIPT_TO_BASE) {
420 my $base = $SCRIPT_TO_BASE{$scr};
422 foreach my $code (keys %{$SCRIPT_CODE_TO_NAME{$scr}}) {
423 my $fullCode = $code + $base;
424 next if (exists $ {$scr}{$fullCode});
425 my $name = $SCRIPT_CODE_TO_NAME{$scr}{$code};
427 if ($DO_HEURISTIC_REMAP) {
428 foreach my $remapRE (@REMAP) {
430 if (eval($remapRE)) {
431 if (exists $INTERINDIC_NAME_TO_FULLCODE{$_}) {
432 $ {$scr}{$fullCode} =
433 hex4($INTERINDIC_NAME_TO_FULLCODE{$_}) .
434 "; // REMAP: $name -> $_";
442 # Now try the decomp table
443 if ($DO_DECOMP_REMAP && exists $DECOMP{$name}) {
445 my $cmt = "; // DECOMP: $name -> ";
446 foreach my $n (@{$DECOMP{$name}}) {
447 if (exists $SCRIPT_NAME_TO_CODE{$scr}{$n}) {
448 $x .= hex4($SCRIPT_TO_BASE{$scr} + $SCRIPT_NAME_TO_CODE{$scr}{$n});
456 $ {$scr}{$fullCode} = $x . $cmt;
461 $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$fullCode} = 1;
462 $ {$scr}{$fullCode} = "; // UNMAPPED $scr-InterIndic: $name";
467 # E00B>; // UNMAPPED EXTERNAL: LETTER VOCALIC R "\u0A0B>\u0A30\u0A3F;"
468 # E00C>; // UNMAPPED EXTERNAL: LETTER VOCALIC L "\u0A0C>\u0A07;"
469 # E00D>; // UNMAPPED EXTERNAL: LETTER CANDRA E "\u0A0D>\u0A10;"
470 # E011>; // UNMAPPED EXTERNAL: LETTER CANDRA O "\u0A11>\u0A14;"
471 # E037>; // UNMAPPED EXTERNAL: LETTER SSA "\u0A37>\u0A36;"
472 # E045>; // UNMAPPED EXTERNAL: VOWEL SIGN CANDRA E "\u0A45>\u0A48;"
473 # E049>; // UNMAPPED EXTERNAL: VOWEL SIGN CANDRA O "\u0A49>\u0A4C;"
476 # Dump out script maps
477 foreach my $scr (sort keys %SCRIPT_TO_BASE) {
478 ## next unless ($scr eq 'TELUGU'); # Debugging
481 foreach my $fullCode (sort {$a <=> $b} keys %{$scr}) {
482 if ($flag && $fullCode >= $INTERINDIC) {
483 # We have the complete <scr>-InterIndic rules; dump them.
485 generateRBT($scr, "InterIndic", \@rules, $OUTDIR);
489 if (exists $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$fullCode}) {
490 push @rules, "// " . hex4($fullCode) . ">" . $ {$scr}{$fullCode};
492 push @rules, hex4($fullCode) . ">" . $ {$scr}{$fullCode};
495 # Now generate the InterIndic-<scr> rules.
496 generateRBT("InterIndic", $scr, \@rules, $OUTDIR);
498 # print "$scr-InterIndic ------------------------------\n";
500 # foreach my $fullCode (sort {$a <=> $b} keys %{$scr}) {
501 # if ($flag && $fullCode >= $INTERINDIC) {
502 # print "InterIndic-$scr ------------------------------\n";
505 # if (exists $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$fullCode}) {
506 # print "// ", hex4($fullCode), ">", $ {$scr}{$fullCode}, "\n";
508 # print hex4($fullCode), ">", $ {$scr}{$fullCode}, "\n";
513 # List successful remappings
514 if ($DO_HEURISTIC_REMAP) {
515 foreach my $remap (sort keys %REMAP) {
516 print STDERR "REMAP ", $REMAP{$remap}, " x $remap\n";
520 #----------------------------------------------------------------------
523 # Return a listing of an array of scripts
525 sub formatScriptList {
527 if ($SCRIPT_COUNT == @$aref) {
529 } elsif (($SCRIPT_COUNT - 3) <= @$aref) {
532 foreach (@$aref) { $temp{$_} = 1; }
533 foreach (sort keys %SCRIPT_TO_BASE) {
534 $s .= " $_" unless exists $temp{$_};
538 return join(" ", @$aref);
544 sprintf("%02X", $_[0]);
549 sprintf("\\u%04X", $_[0]);
552 # Format an array by applying hex4() to each element and concatenating the results (no delimiter)
554 join("", map { hex4($_); } @_);
557 # Parse a string of the form "\u0D01" to an array of integers.
558 # Must ONLY contain escapes.
560 sub parseUnicodeEscape {
565 if (/^\\u([0-9a-f]{4})(.*)/i) {
566 push @result, hex($1);
569 die "Can't parse Unicode escape $orig\n";
573 die "Can't parse Unicode escape $orig\n";
578 # Return 1 if the two arrays of ints are equal.
579 # Param: ref to array of ints
580 # Param: ref to array of ints
584 if (scalar @$a == scalar @$b) {
585 for (my $i=0; $i<@$a; ++$i) {
586 if ($a->[$i] != $b->[$i]) {
595 # Given a rule, possibly with trailing // comment,
596 # quote the rule part and add a trailing "+" after it.
600 $cmt = $1 if (s|(\s*//.*)||); # isolate trailing // comment
606 # Given the name of the source script, name of the target script,
607 # and array of rule strings, return a string containing the source
608 # for a RuleBasedTransliterator file.
609 # Param: source script name
610 # Param: target script name
611 # Param: ref to array of rules. These rules are unquoted, without
612 # concatenators between them, but do have trailing ';' separators.
613 # Param: name of output directory
615 # $TOOL - name of generating tool
616 # $DATE - date of generation
617 # $SCRIPTFROM - name of source script
618 # $SCRIPTTO - name of target script
620 my ($source, $target, $rules, $outdir) = @_;
622 $outdir =~ s|[/\\]$||; # Delete trailing / or \
623 my $OUT = "$outdir/TransliterationRule_${source}_$target.java";
624 open(RBT_TEMPLATE) or die;
625 open(OUT, ">$OUT") or die;
626 while (<RBT_TEMPLATE>) {
627 while (/\$([A-Za-z0-9]+)/) {
630 if ($tag eq 'TOOL') {
631 $sub = $RBT_GEN_TOOL;
632 } elsif ($tag eq 'DATE') {
634 } elsif ($tag eq 'SCRIPTFROM') {
636 } elsif ($tag eq 'SCRIPTTO') {
638 } elsif ($tag eq 'RULES') {
639 # Get any whitespace-only indent off the front of this tag
641 $indent = $1 if (/^(\s+)\$$tag/);
643 # The rules in the array are not quoted. We need to quote
644 # them and add '+' operators between them. We do NOT need
645 # to add ';' separators. We DO need to separate trailing
646 # // comments and handle them.
# map() sets $_ to each rule in turn and evaluates the helper call once per
# rule; join() then places each converted rule on its own line, indented to
# match the $RULES tag.
# NOTE(review): this call previously read `"eRule`, which looks like an
# HTML-entity mangling of `&quoteRule` (`&quot` decoded to a double quote)
# and left an unterminated string literal. Confirm the helper's name.
647 $sub = join("\n$indent", map(&quoteRule, @$rules)) .
650 print STDERR "ERROR in $RBT_TEMPLATE: Unknown tag $tag\n";
651 $sub = "[ERROR:Unknown tag \$$tag]";
659 print STDERR "Written: $OUT\n";