#/**
# *******************************************************************************
# * Copyright (C) 2000-2004, International Business Machines Corporation and    *
# * others. All Rights Reserved.                                                *
# *******************************************************************************
# */
#!perl

# Usage - $0 <remap file>
#  e.g. - indic indic.txt
#
# The input file should be a subset of the Unicode data file containing
# the blocks of interest.
#
# The remap file should have lines of the form
#   "\u0D01>\u0D02;"
# including the quotes.  These will be interpreted as saying that the
# undefined code point U+D01 (derived via mapping from InterIndic)
# can be remapped to U+D02.
#
# The purpose of this script is to process the Indic script data into
# a form usable by the IndicTransliterator, that is, the Indic-Indic
# transliterator.  The transliterator needs two things: A mapping of
# the code points in common, and a list of the exceptions.

# NOTE: the configuration variables below are intentionally package
# globals (no `my`).  The script later calls the one-argument form
# open(UNICODE_DB), which takes its filename from the package scalar
# $UNICODE_DB; a lexical would not be seen by that call.

# Assume we are located in icu4j/src/com/ibm/tools/translit/.
# We want the Unicode DB in icu4j/src/data/unicode/.
$UNICODE_DB = "../../../../data/unicode/UnicodeData.txt";

# Remap (exceptions) file: first command-line argument.  This is undef
# when the script is run without arguments.
$EXCEPTIONS_FILE = shift;

# Assume we are located in icu4j/src/com/ibm/tools/translit/.
# We want to output files to icu4j/src/com/ibm/text/resources/.
# Output directory
$OUTDIR = "../../text/resources";

# The template file should contain java code that can be used
# to generate RuleBasedTransliterator resource files.
The template # should contain the following embedded symbols, which this script # will replace: # $TOOL - name of generating tool # $DATE - date of generation # $SCRIPTFROM - name of source script # $SCRIPTTO - name of target script # $RULES - rules $RBT_TEMPLATE = 'rbtTemplate.txt'; # Name of this tool in generated RBT files $RBT_GEN_TOOL = 'icu4j/src/com/ibm/tools/translit/indic.pl'; $DUMP = 0; # If 1, dump out internal data $DO_HEURISTIC_REMAP = 0; # If 1, do automatic heuristic remapping $DO_DECOMP_REMAP = 0; # If 1, do decomp remapping open(UNICODE_DB); while () { next if (m|^0[0-8]|); # Skip up to Devanagari block (0900) last if (m|^0D[8-F]|i); # Bail out after Malayalam block (0D00) # 0D39;MALAYALAM LETTER HA;Lo;0;L;;;;;N;;;;; my @data = split(/;/); my $fullCode = hex($data[0]); # e.g., 0x093F my $code = $fullCode & 0x7F; # e.g., 0x3F my ($script, $name) = ($data[1] =~ /(\w+)\s+(.+)/); die "Can't parse $_" unless ($name); # e.g., $code/$script/$name = 3F/MALAYALAM/VOWEL SIGN I # Titlecase the script $script = ucfirst(lc($script)); # Fix a couple inconsistencies in the 3.0 data # REVISIT: Is this okay to do? if ($DO_HEURISTIC_REMAP) { if ($script eq 'Gujarati' && $code >= 5 && $code <= 0x14) { $name =~ s/^VOWEL/LETTER/; } } # Keep track of all script names we encounter. We also note the # base of the block. my $base = $fullCode & ~0x7F; # e.g., 0x900; if (exists $SCRIPT_TO_BASE{$script}) { die "Script base mismatch for $script: $base vs. $SCRIPT_TO_BASE{$script}" if ($SCRIPT_TO_BASE{$script} ne $base); } else { $SCRIPT_TO_BASE{$script} = $base; } # Build up a mapping by name. For each name, keep a hash keyed by # code point. For each code point, keep an array of script names. # Also keep a total use count for each name. push @{$NAME_CODE_TO_SCRIPTS{$name}{$code}}, $script; ++$NAME_CODE_TO_SCRIPTS{$name}{count}; # Build a map that looks like this: # $SCRIPT_NAME_TO_CODE{