2 # *******************************************************************************
\r
3 # * Copyright (C) 2000-2004, International Business Machines Corporation and *
\r
4 # * others. All Rights Reserved. *
\r
5 # *******************************************************************************
\r
10 # Usage - $0 <remap file>
\r
11 # e.g. - indic indic.txt
\r
12 # The input file should be a subset of the Unicode data file containing
\r
13 # the blocks of interest.
\r
15 # The remap file should have lines of the form
\r
17 # including the quotes. These will be interpreted as saying that the
\r
18 # undefined code point U+D01 (derived via mapping from InterIndic)
\r
19 # can be remapped to U+D02.
\r
21 # The purpose of this script is to process the Indic script data into
\r
22 # a form usable by the IndicTransliterator, that is, the Indic-Indic
\r
23 # transliterator. The transliterator needs two things: A mapping of
\r
24 # the code points in common, and a list of the exceptions.
\r
26 # Assume we are located in icu4j/src/com/ibm/tools/translit/.
\r
27 # We want the Unicode DB in icu4j/src/data/unicode/.
\r
28 $UNICODE_DB = "../../../../data/unicode/UnicodeData.txt";
\r
29 $EXCEPTIONS_FILE = shift;
\r
31 # Assume we are located in icu4j/src/com/ibm/tools/translit/.
\r
32 # We want to output files to icu4j/src/com/ibm/text/resources/.
\r
34 $OUTDIR = "../../text/resources";
\r
36 # The template file should contain java code that can be used
\r
37 # to generate RuleBasedTransliterator resource files. The template
\r
38 # should contain the following embedded symbols, which this script
\r
40 # $TOOL - name of generating tool
\r
41 # $DATE - date of generation
\r
42 # $SCRIPTFROM - name of source script
\r
43 # $SCRIPTTO - name of target script
\r
45 $RBT_TEMPLATE = 'rbtTemplate.txt';
\r
47 # Name of this tool in generated RBT files
\r
48 $RBT_GEN_TOOL = 'icu4j/src/com/ibm/tools/translit/indic.pl';
\r
50 $DUMP = 0; # If 1, dump out internal data
\r
52 $DO_HEURISTIC_REMAP = 0; # If 1, do automatic heuristic remapping
\r
53 $DO_DECOMP_REMAP = 0; # If 1, do decomp remapping
\r
56 while (<UNICODE_DB>) {
\r
57 next if (m|^0[0-8]|); # Skip up to Devanagari block (0900)
\r
58 last if (m|^0D[8-F]|i); # Bail out after Malayalam block (0D00)
\r
59 # 0D39;MALAYALAM LETTER HA;Lo;0;L;;;;;N;;;;;
\r
60 my @data = split(/;/);
\r
61 my $fullCode = hex($data[0]); # e.g., 0x093F
\r
62 my $code = $fullCode & 0x7F; # e.g., 0x3F
\r
63 my ($script, $name) = ($data[1] =~ /(\w+)\s+(.+)/);
\r
64 die "Can't parse $_" unless ($name);
\r
65 # e.g., $code/$script/$name = 3F/MALAYALAM/VOWEL SIGN I
\r
67 # Titlecase the script
\r
68 $script = ucfirst(lc($script));
\r
70 # Fix a couple of inconsistencies in the 3.0 data
\r
71 # REVISIT: Is this okay to do?
\r
72 if ($DO_HEURISTIC_REMAP) {
\r
73 if ($script eq 'Gujarati' && $code >= 5 && $code <= 0x14) {
\r
74 $name =~ s/^VOWEL/LETTER/;
\r
78 # Keep track of all script names we encounter. We also note the
\r
79 # base of the block.
\r
80 my $base = $fullCode & ~0x7F; # e.g., 0x900;
\r
81 if (exists $SCRIPT_TO_BASE{$script}) {
\r
82 die "Script base mismatch for $script: $base vs. $SCRIPT_TO_BASE{$script}"
\r
83 if ($SCRIPT_TO_BASE{$script} ne $base);
\r
85 $SCRIPT_TO_BASE{$script} = $base;
\r
88 # Build up a mapping by name. For each name, keep a hash keyed by
\r
89 # code point. For each code point, keep an array of script names.
\r
90 # Also keep a total use count for each name.
\r
91 push @{$NAME_CODE_TO_SCRIPTS{$name}{$code}}, $script;
\r
92 ++$NAME_CODE_TO_SCRIPTS{$name}{count};
\r
94 # Build a map that looks like this:
\r
95 # $SCRIPT_NAME_TO_CODE{<script>}{<name>} = <code>
\r
96 # or undef if there is no mapping.
\r
97 $SCRIPT_NAME_TO_CODE{$script}{$name} = $code;
\r
99 # Build a map that looks like this:
\r
100 $SCRIPT_CODE_TO_NAME{$script}{$code} = $name;
\r
102 # And a map from the fullCode point to the name
\r
103 $FULLCODE_TO_NAME{$fullCode} = $name;
\r
105 # Map code (0..7F) to name. This is usually a 1-1 mapping, but
\r
106 # is 1-n in a few cases.
\r
107 if (exists $CODE_TO_NAME{$code}) {
\r
108 if ($name ne $CODE_TO_NAME{$code}) {
\r
109 # For multiple names on a code offset, use the format
\r
110 # (a/b), (a/b/c), etc.
\r
111 local $_ = $CODE_TO_NAME{$code};
\r
113 if (!m|[\(\)/]$name[\(\)/]|) {
\r
119 $CODE_TO_NAME{$code} = $_;
\r
122 $CODE_TO_NAME{$code} = $name;
\r
127 # Read and parse the manual remapping file. This contains lines
\r
130 # |"\u0956>\u0948;" // AI Length Mark -> Devanagari Vowel Sign AI
\r
132 # The left hand side contains a non-existent full code value. It
\r
133 # should be a single value. The right hand side contains one or more
\r
134 # real full code values. The idea is that when a mapping from another
\r
135 # script ends up at the non-existent code point on the left, the
\r
136 # sequence on the right should be substituted. In this example,
\r
137 # Devanagari has no AI Length Mark. So, if transliterating from
\r
138 # Oriya, then the character 0B56 (Oriya AI Length Mark) will remap to
\r
139 # the non-existent 0956, and that remaps to 0948, our chosen
\r
140 # Devanagari equivalent. For our purposes, the left hand side should
\r
141 # be taken to mean its equivalent point in the InterIndic range. In
\r
142 # this example, what it really says is E056>0948 in the
\r
143 # InterIndic-Devanagari transliterator.
\r
145 if ($EXCEPTIONS_FILE) {
\r
146 open(EXCEPTIONS_FILE) or die;
\r
147 while (<EXCEPTIONS_FILE>) {
\r
148 if (m|^\s*\"([^\"]*?)\"|) {
\r
151 if (/^(.*)>(.*);$/) {
\r
152 my ($rawFrom, $rawTo) = ($1, $2);
\r
153 my @from = parseUnicodeEscape($rawFrom);
\r
154 my @to = parseUnicodeEscape($rawTo);
\r
155 my $from = hexArray(@from);
\r
156 # Some entries look like this:
\r
157 # |"\u0955>\u0955;"
\r
158 # these do nothing; ignore them.
\r
159 if (intArraysEqual(\@from, \@to)) {
\r
160 #print STDERR "Ignoring NOOP remap of $from\n";
\r
161 } elsif (exists $EXCEPTIONS{$from}) {
\r
162 print STDERR "ERROR in $EXCEPTIONS_FILE - Duplicate remap entries for $from\n";
\r
163 } elsif (scalar @from > 1) {
\r
164 print STDERR "ERROR in $EXCEPTIONS_FILE - Ignoring multichar remap: ", hexArray(@from), "->", hexArray(@to), "\n";
\r
166 # Check this for validity. Full code on the left
\r
167 # should NOT exist. Full code seq on the right should.
\r
168 if (exists $FULLCODE_TO_NAME{$from[0]}) {
\r
169 print STDERR "ERROR in $EXCEPTIONS_FILE - Invalid remap; left side defined: ", hexArray(@from), "->", hexArray(@to), "\n";
\r
170 } elsif (grep(! exists $FULLCODE_TO_NAME{$_}, @to)) {
\r
171 print STDERR "ERROR in $EXCEPTIONS_FILE - Invalid remap; right side undefined: ", hexArray(@from), "->", hexArray(@to), "\n";
\r
173 $EXCEPTIONS{$from[0]} = \@to;
\r
176 } else { die "ERROR in $EXCEPTIONS_FILE - Can't parse \"$_\" in line $line"; }
\r
179 close(EXCEPTIONS_FILE);
\r
180 print STDERR "$EXCEPTIONS_FILE: Loaded ", scalar keys %EXCEPTIONS, " remappings\n";
\r
183 if ($DO_DECOMP_REMAP) {
\r
184 # Read the NamesList.txt file. This contains decomposition data.
\r
185 # Gather these into %DECOMP, which maps a name to n1.n2..., where n1
\r
186 # etc. are decomposed names. E.g. $DECOMP{'LETTER RRA'} -> 'LETTER
\r
187 # RA.SIGN NUKTA'. There may be different mappings in different script
\r
188 # blocks (LETTER RRA is mapped differently in Devanagari and Bengali),
\r
189 # in which case the name goes into %DECOMP_MISMATCH, and is removed
\r
191 $NAMES = "NamesList.txt";
\r
194 # Skip to start of DEVANAGARI block
\r
195 last if (/^\@\@\s+0900/);
\r
198 # Continue until start of SINHALA block
\r
199 last if (/^\@\@\s+0D80/);
\r
200 if (/^([0-9A-Z]{4})/i) {
\r
202 } elsif (/^\s+:\s*(.+)/) {
\r
203 # We've found a mapping of the form:
\r
204 # 0929 DEVANAGARI LETTER NNNA
\r
205 # * for transcribing Dravidian alveolar n
\r
207 my $from = $FULLCODE_TO_NAME{hex($code)};
\r
208 my @to = map($FULLCODE_TO_NAME{hex($_)}, split(/\s+/, $1));
\r
209 if (exists $DECOMP{$from}) {
\r
210 my $aref = $DECOMP{$from};
\r
211 if (join(".", @$aref) ne join(".", @to)) {
\r
212 print STDERR "ERROR: Decomp mismatch for $from\n";
\r
213 print STDERR " : $from = ", join(".", @$aref), "\n";
\r
214 print STDERR " : $from = ", join(".", @to), "\n";
\r
215 $DECOMP_MISMATCH{$from} = 1;
\r
218 $DECOMP{$from} = \@to;
\r
223 # Remove mismatches
\r
224 foreach (keys %DECOMP_MISMATCH) {
\r
225 delete $DECOMP{$_};
\r
228 foreach (keys %DECOMP) {
\r
229 print "$_ = ", join(" + ", @{$DECOMP{$_}}), "\n";
\r
234 # Count the total number of scripts
\r
236 $SCRIPT_COUNT = scalar keys %SCRIPT_TO_BASE;
\r
237 #print join("\n", sort keys %SCRIPT_TO_BASE), "\n";
\r
239 # Dump out the %NAME_CODE_TO_SCRIPTS map.
\r
242 print "\nBY NAME:\n";
\r
243 foreach my $pass ((1, 2)) {
\r
244 print "\nBY NAME - SINGLETONS:\n" if ($pass eq 2);
\r
245 foreach my $name (sort keys %NAME_CODE_TO_SCRIPTS) {
\r
247 next if (1 >= $NAME_CODE_TO_SCRIPTS{$name}{count});
\r
249 next if (1 < $NAME_CODE_TO_SCRIPTS{$name}{count});
\r
252 my $href = $NAME_CODE_TO_SCRIPTS{$name};
\r
253 foreach my $code (sort {$a <=> $b} keys %$href) {
\r
254 next if ($code eq 'count');
\r
255 my $aref = $href->{$code};
\r
256 print " ", hex2($code), " (", formatScriptList($aref), ")";
\r
263 # Create some transliterators, based on the scripts and the %NAME_CODE_TO_SCRIPTS
\r
264 # map. Only use %NAME_CODE_TO_SCRIPTS entries with a count of 2 or more, that is,
\r
265 # names that occur in two or more scripts. For those scripts where
\r
266 # the names occur, map both up to the InterIndic range, and down to
\r
267 # the target script.
\r
269 $INTERINDIC = 0xE000;
\r
270 $INTERINDIC_EXTRA = 0xE080;
\r
271 $INTERINDIC_EXTRA_NEXT = $INTERINDIC_EXTRA;
\r
273 # For each script, create a hash. The hash has a key for each
\r
274 # code point, either within its block, or in the InterIndic block.
\r
275 # the value of the key is the mapping.
\r
277 # The script hashes are named %DEVANAGARI, etc., and referenced
\r
278 # with symbolic refs.
\r
280 @REMAP = ('s/\bSHORT\s+//i',
\r
281 's/\bCANDRA\s+//i',
\r
293 's/^A(.) LENGTH MARK$/VOWEL SIGN A$1/i',
\r
294 's/CANDRABINDU/BINDI/i',
\r
295 's/BINDI/CANDRABINDU/i',
\r
298 # Do this so we see zero counts:
\r
299 foreach my $remap (@REMAP) { $REMAP{$remap} = 0; }
\r
301 # This loop iterates over the names in the NAME_CODE_TO_SCRIPTS hash.
\r
302 # These names are things like "LETTER NNNA". For each name, it then
\r
303 # creates script mappings up to the InterIndic area, and back down
\r
304 # to the script areas. If a name maps to more than one offset,
\r
305 # then it uses the InterIndic extra range. Either way, it picks
\r
306 # a single InterIndic point, either an offset point or something in
\r
307 # the extra range, and maps up and down from that point.
\r
308 foreach my $name (sort keys %NAME_CODE_TO_SCRIPTS) {
\r
309 next if (1 >= $NAME_CODE_TO_SCRIPTS{$name}{count});
\r
310 my $href = $NAME_CODE_TO_SCRIPTS{$name};
\r
311 # Count the number of different codes assigned to this name.
\r
312 # Usually 1, but 2 for a handful of names.
\r
313 my $codeCount = (keys %{$NAME_CODE_TO_SCRIPTS{$name}}) - 1; # less 1: {count}
\r
314 # If $codeCount is 1, then map directly up to the $INTERINDIC
\r
315 # base. If $codeCount is greater than 1, then map into unused spots
\r
316 # starting at $INTERINDIC_EXTRA.
\r
317 my $interIndicCode;
\r
318 if ($codeCount > 1) {
\r
319 # Map into the InterIndic extra range
\r
320 $interIndicCode = $INTERINDIC_EXTRA_NEXT++;
\r
323 foreach my $code (sort {$a ne 'count' && $b ne 'count' && $a <=> $b} keys %$href) {
\r
324 next if ($code eq 'count');
\r
325 my $aref = $href->{$code}; # Ref to array of scripts
\r
326 if ($codeCount == 1) {
\r
328 $interIndicCode = $INTERINDIC + $code;
\r
330 # Keep track of the names of the extra InterIndic points
\r
331 $INTERINDIC_NAME_TO_FULLCODE{$name} = $interIndicCode;
\r
333 foreach my $scr (@$aref) {
\r
335 my $fullCode = $SCRIPT_TO_BASE{$scr} + $code;
\r
336 $ {$scr}{$fullCode} = hex4($interIndicCode) . "; // $name";
\r
337 $ {$scr}{$interIndicCode} = hex4($fullCode) . "; // $name";
\r
340 # Now handle InterIndic->Script unmapped points. For each name,
\r
341 # some of the scripts will be left out -- will have no mappings
\r
342 # to that name. For these scripts, we can either leave them
\r
343 # unmapped (so the InterIndic->Local mapping is empty), or
\r
346 foreach my $scr (keys %SCRIPT_TO_BASE) {
\r
347 next if ($seen{$scr});
\r
349 if ($DO_HEURISTIC_REMAP) {
\r
350 # Try to remap through the known equivalences in our
\r
352 foreach my $remapRE (@REMAP) {
\r
354 if (eval($remapRE)) {
\r
355 if (exists $SCRIPT_NAME_TO_CODE{$scr}{$_}) {
\r
356 $ {$scr}{$interIndicCode} =
\r
357 hex4($SCRIPT_TO_BASE{$scr} + $SCRIPT_NAME_TO_CODE{$scr}{$_}) .
\r
358 "; // REMAP: $name -> $_";
\r
359 ++$REMAP{$remapRE};
\r
360 next unmappedScript;
\r
366 # Try to remap through the file. This contains remappings of
\r
367 # the form 0991->0993. That is, it contains local remappings
\r
368 # that we can substitute and try again with.
\r
369 #|GURMUKHI-InterIndic ------------------------------
\r
370 #|// 0A02>; // UNMAPPED INTERNAL: SIGN BINDI
\r
371 #|InterIndic-GURMUKHI ------------------------------
\r
372 #|// E001>; // UNMAPPED EXTERNAL: SIGN CANDRABINDU
\r
374 # In this example, the remapping tells us that the non-existent
\r
375 # character A01 should be considered equivalent to the real
\r
377 # We implement this by adding two mappings; one from
\r
378 # the InterIndic equivalent of A01, that is, E001, to A02,
\r
379 # and one from A02, which otherwise has no mapping, to E001.
\r
380 if ($EXCEPTIONS_FILE && $interIndicCode < $INTERINDIC_EXTRA) {
\r
381 # Try to map this InterIndic character back to the spot
\r
382 # it would occupy in this script if it had a mapping.
\r
383 my $code = $interIndicCode & 0x7F;
\r
384 my $pseudoFullCode = $SCRIPT_TO_BASE{$scr} + $code;
\r
385 if (exists $EXCEPTIONS{$pseudoFullCode}) {
\r
386 my $fullCodeArray = $EXCEPTIONS{$pseudoFullCode};
\r
388 foreach my $c (@$fullCodeArray) {
\r
389 $comment .= "." if ($comment);
\r
390 $comment .= $FULLCODE_TO_NAME{$c};
\r
392 $comment = "; // REMAP ($EXCEPTIONS_FILE): " .
\r
393 hex4($pseudoFullCode) . ">" . hexArray(@$fullCodeArray) . " = " .
\r
394 $CODE_TO_NAME{$code} . ">" . $comment;
\r
395 $ {$scr}{$interIndicCode} = hexArray(@$fullCodeArray) . $comment;
\r
396 if (scalar @$fullCodeArray == 1) {
\r
397 if (exists $ {$scr}{$fullCodeArray->[0]}) {
\r
398 # There's already a proper mapping; no need to fill
\r
401 $ {$scr}{$fullCodeArray->[0]} = hex4($interIndicCode) . $comment;
\r
404 next unmappedScript;
\r
408 $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$interIndicCode} = 1;
\r
409 local $_ = "; // UNMAPPED InterIndic-$scr: $name";
\r
410 if (exists $SCRIPT_CODE_TO_NAME{$scr}{$interIndicCode & 0x7F}) {
\r
411 my $fullCode = $SCRIPT_TO_BASE{$scr} + ($interIndicCode & 0x7F);
\r
412 $_ .= " (" . hex4($fullCode) . " = " . $FULLCODE_TO_NAME{$fullCode} . ")";
\r
414 $ {$scr}{$interIndicCode} = $_;
\r
418 # Add in unmapped entries for each script
\r
419 foreach my $scr (keys %SCRIPT_TO_BASE) {
\r
420 my $base = $SCRIPT_TO_BASE{$scr};
\r
422 foreach my $code (keys %{$SCRIPT_CODE_TO_NAME{$scr}}) {
\r
423 my $fullCode = $code + $base;
\r
424 next if (exists $ {$scr}{$fullCode});
\r
425 my $name = $SCRIPT_CODE_TO_NAME{$scr}{$code};
\r
427 if ($DO_HEURISTIC_REMAP) {
\r
428 foreach my $remapRE (@REMAP) {
\r
430 if (eval($remapRE)) {
\r
431 if (exists $INTERINDIC_NAME_TO_FULLCODE{$_}) {
\r
432 $ {$scr}{$fullCode} =
\r
433 hex4($INTERINDIC_NAME_TO_FULLCODE{$_}) .
\r
434 "; // REMAP: $name -> $_";
\r
435 ++$REMAP{$remapRE};
\r
442 # Now try the decomp table
\r
443 if ($DO_DECOMP_REMAP && exists $DECOMP{$name}) {
\r
445 my $cmt = "; // DECOMP: $name -> ";
\r
446 foreach my $n (@{$DECOMP{$name}}) {
\r
447 if (exists $SCRIPT_NAME_TO_CODE{$scr}{$n}) {
\r
448 $x .= hex4($SCRIPT_TO_BASE{$scr} + $SCRIPT_NAME_TO_CODE{$scr}{$n});
\r
449 $cmt .= $n . " + ";
\r
456 $ {$scr}{$fullCode} = $x . $cmt;
\r
461 $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$fullCode} = 1;
\r
462 $ {$scr}{$fullCode} = "; // UNMAPPED $scr-InterIndic: $name";
\r
467 # E00B>; // UNMAPPED EXTERNAL: LETTER VOCALIC R "\u0A0B>\u0A30\u0A3F;"
\r
468 # E00C>; // UNMAPPED EXTERNAL: LETTER VOCALIC L "\u0A0C>\u0A07;"
\r
469 # E00D>; // UNMAPPED EXTERNAL: LETTER CANDRA E "\u0A0D>\u0A10;"
\r
470 # E011>; // UNMAPPED EXTERNAL: LETTER CANDRA O "\u0A11>\u0A14;"
\r
471 # E037>; // UNMAPPED EXTERNAL: LETTER SSA "\u0A37>\u0A36;"
\r
472 # E045>; // UNMAPPED EXTERNAL: VOWEL SIGN CANDRA E "\u0A45>\u0A48;"
\r
473 # E049>; // UNMAPPED EXTERNAL: VOWEL SIGN CANDRA O "\u0A49>\u0A4C;"
\r
476 # Dump out script maps
\r
477 foreach my $scr (sort keys %SCRIPT_TO_BASE) {
\r
478 ## next unless ($scr eq 'TELUGU'); # Debugging
\r
481 foreach my $fullCode (sort {$a <=> $b} keys %{$scr}) {
\r
482 if ($flag && $fullCode >= $INTERINDIC) {
\r
483 # We have the complete <scr>-InterIndic rules; dump
\r
485 generateRBT($scr, "InterIndic", \@rules, $OUTDIR);
\r
489 if (exists $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$fullCode}) {
\r
490 push @rules, "// " . hex4($fullCode) . ">" . $ {$scr}{$fullCode};
\r
492 push @rules, hex4($fullCode) . ">" . $ {$scr}{$fullCode};
\r
495 # Now generate the InterIndic-<scr> rules.
\r
496 generateRBT("InterIndic", $scr, \@rules, $OUTDIR);
\r
498 # print "$scr-InterIndic ------------------------------\n";
\r
500 # foreach my $fullCode (sort {$a <=> $b} keys %{$scr}) {
\r
501 # if ($flag && $fullCode >= $INTERINDIC) {
\r
502 # print "InterIndic-$scr ------------------------------\n";
\r
505 # if (exists $SCRIPT_FULLCODE_TO_IS_UNMAPPED{$scr}{$fullCode}) {
\r
506 # print "// ", hex4($fullCode), ">", $ {$scr}{$fullCode}, "\n";
\r
508 # print hex4($fullCode), ">", $ {$scr}{$fullCode}, "\n";
\r
513 # List successful remappings
\r
514 if ($DO_HEURISTIC_REMAP) {
\r
515 foreach my $remap (sort keys %REMAP) {
\r
516 print STDERR "REMAP ", $REMAP{$remap}, " x $remap\n";
\r
520 #----------------------------------------------------------------------
\r
523 # Return a listing of an array of scripts
\r
525 sub formatScriptList {
\r
527 if ($SCRIPT_COUNT == @$aref) {
\r
529 } elsif (($SCRIPT_COUNT - 3) <= @$aref) {
\r
532 foreach (@$aref) { $temp{$_} = 1; }
\r
533 foreach (sort keys %SCRIPT_TO_BASE) {
\r
534 $s .= " $_" unless exists $temp{$_};
\r
538 return join(" ", @$aref);
\r
542 # Format as %02X hex
\r
544 sprintf("%02X", $_[0]);
\r
547 # Format as a \uXXXX escape (literal \u followed by %04X hex)
\r
549 sprintf("\\u%04X", $_[0]);
\r
552 # Format an array as a run of \uXXXX escapes, concatenated with no delimiter
\r
554 join("", map { hex4($_); } @_);
\r
557 # Parse a string of the form "\u0D01" to an array of integers.
\r
558 # Must ONLY contain escapes.
\r
559 # Return the array.
\r
560 sub parseUnicodeEscape {
\r
564 while (length($_)) {
\r
565 if (/^\\u([0-9a-f]{4})(.*)/i) {
\r
566 push @result, hex($1);
\r
569 die "Can't parse Unicode escape $orig\n";
\r
572 if (0 == @result) {
\r
573 die "Can't parse Unicode escape $orig\n";
\r
578 # Return 1 if the two arrays of ints are equal.
\r
579 # Param: ref to array of ints
\r
580 # Param: ref to array of ints
\r
581 sub intArraysEqual {
\r
584 if (scalar @$a == scalar @$b) {
\r
585 for (my $i=0; $i<@$a; ++$i) {
\r
586 if ($a->[$i] != $b->[$i]) {
\r
595 # Given a rule, possibly with trailing // comment,
\r
596 # quote the rule part and add a trailing "+" after
\r
600 $cmt = $1 if (s|(\s*//.*)||); # isolate trailing // comment
\r
606 # Given the name of the source script, name of the target script,
\r
607 # and array of rule strings, return a string containing the source
\r
608 # for a RuleBasedTransliterator file.
\r
609 # Param: source script name
\r
610 # Param: target script name
\r
611 # Param: ref to array of rules. These rules are unquoted, without
\r
612 # concatenators between them, but do have trailing ';' separators.
\r
613 # Param: name of output directory
\r
615 # $TOOL - name of generating tool
\r
616 # $DATE - date of generation
\r
617 # $SCRIPTFROM - name of source script
\r
618 # $SCRIPTTO - name of target script
\r
620 my ($source, $target, $rules, $outdir) = @_;
\r
622 $outdir =~ s|[/\\]$||; # Delete trailing / or \
\r
623 my $OUT = "$outdir/TransliterationRule_${source}_$target.java";
\r
624 open(RBT_TEMPLATE) or die;
\r
625 open(OUT, ">$OUT") or die;
\r
626 while (<RBT_TEMPLATE>) {
\r
627 while (/\$([A-Za-z0-9]+)/) {
\r
630 if ($tag eq 'TOOL') {
\r
631 $sub = $RBT_GEN_TOOL;
\r
632 } elsif ($tag eq 'DATE') {
\r
634 } elsif ($tag eq 'SCRIPTFROM') {
\r
636 } elsif ($tag eq 'SCRIPTTO') {
\r
638 } elsif ($tag eq 'RULES') {
\r
639 # Get any whitespace-only indent off the front of this tag
\r
641 $indent = $1 if (/^(\s+)\$$tag/);
\r
643 # The rules in the array are not quoted. We need to quote
\r
644 # them and add '+' operators between them. We do NOT need
\r
645 # to add ';' separators. We DO need to separate trailing
\r
646 # // comments and handle them.
\r
647 $sub = join("\n$indent", map("eRule, @$rules)) .
\r
650 print STDERR "ERROR in $RBT_TEMPLATE: Unknown tag $tag\n";
\r
651 $sub = "[ERROR:Unknown tag \$$tag]";
\r
658 close(RBT_TEMPLATE);
\r
659 print STDERR "Written: $OUT\n";
\r