@rem = '--*-Perl-*-- @echo off if "%OS%" == "Windows_NT" goto WinNT perl -x -S "%0" %1 %2 %3 %4 %5 %6 %7 %8 %9 goto endofperl :WinNT perl -x -S "%0" %* if NOT "%COMSPEC%" == "%SystemRoot%\system32\cmd.exe" goto endofperl if %errorlevel% == 9009 echo You do not have Perl in your PATH. goto endofperl @rem '; #!perl #line 14 # ************************************************************************ # Copyright (C) 2000-2004, International Business Machines Corporation and # others. All Rights Reserved. # ************************************************************************ # This perl script creates ICU transliterator data files, that live # in icu/data, from ICU4J UTF8 transliterator data files, in # icu4j/src/com/ibm/icu/impl/data/. # # The transformation that is done is very minimal. The script assumes # that the input files use only # comments # and that they follow a rigid format. # # The output files are named according to ICU conventions (see NAME_MAP # below) and created in the current directory. They should be manually # checked and then copied into the icu/data/tranlit directory. # An ICU build must # then be initiated, and the standard suite of ICU transliterator tests # should be run after that. # # Alan Liu 5/19/00 2/27/01 use Getopt::Long; use strict; use vars qw(%USED_FILES); my $DIR = "../../../impl/data"; my $ID = ''; GetOptions('dir=s' => \$DIR, 'id=s' => \$ID, '<>' => \&usage) || die; usage() if (@ARGV); my $ID =~ s/-/_/; if (! -d $DIR) { print STDERR "$DIR is not a directory\n"; usage(); } sub usage { my $me = $0; $me =~ s|.+[/\\]||; print "Usage: $me [-dir ] [-id ]\n"; print " --dir Specify the directory containing the\n"; print " Transliterator_*.txt files\n"; print " --id Specify a single ID to transform, e.g.\n"; print " Fullwidth-Halfwidth\n"; exit(1); } my $JAVA_ONLY = '-'; my $OUTDIR = "icu4c"; mkdir($OUTDIR,0777); # Mapping from Java file names to ICU file names # Constraints on ICU4C file name: icudt20b_ # |--9 (prefix)---|---18(name with distinguisher,e.g. "t_" )----| # --4 ("."+extn)--| = 31 characters total. # That is, must have length(%NAME_MAP{x}) <= 16 my $MAX_ICU4C_FILENAME_LEN = 18; # -- HISTORY -- If not marked, then pre 2.2. # All InterIndic are pre 2.2. # Any_Accents # Any_Publishing # Arabic_Latin * 2.2 # Cyrillic_Latin # Fullwidth_Halfwidth # Greek_Latin # Greek_Latin_UNGEGN * 2.2 (moved from el.txt) # Han_Latin * 2.2 # Han_Latin_Definition * 2.2 # Han_Latin_EDICT * 2.2 J only # Hebrew_Latin * 2.2 # Hiragana_Katakana # Hiragana_Latin # Latin_Jamo # Latin_Katakana # ThaiLogical_Latin * 2.2 J only # Thai_ThaiLogical * 2.2 J only # Thai_ThaiSemi * 2.2 J only my %NAME_MAP = ( # An ICU name of "" means the ICU name == the ID # We filter names based on what is in use in the index file. # Flag a rule as JAVA_ONLY if it exists and we use it in Java, # but we don't use it in C. # Use official script abbreviations where possible. # |..............| |..............| # 1234567890123456 1234567890123456 Any_Accents => "", Any_Publishing => "", Cyrillic_Latin => "Cyrl_Latn", Fullwidth_Halfwidth => "FWidth_HWidth", Greek_Latin => "Grek_Latn", Hiragana_Katakana => "Hira_Kana", Hiragana_Latin => "Hira_Latn", Latin_Jamo => "Latn_Jamo", Latin_Katakana => "Latn_Kana", Arabic_Latin => "Arab_Latn", Greek_Latin_UNGEGN => "Grek_Latn_UNGEGN", Han_Latin => "Hani_Latn", Han_Latin_Definition => "Hani_Latn_Def", Han_Latin_EDICT => "Hani_Latn_EDICT", Hebrew_Latin => "Hebr_Latn", ThaiLogical_Latin => $JAVA_ONLY, # "ThaiLog_Latn", Thai_ThaiLogical => $JAVA_ONLY, # "Thai_ThaiLog", Thai_ThaiSemi => $JAVA_ONLY, # "Thai_ThaiSemi", InterIndic_Bengali => "InterIndic_Beng", InterIndic_Devanagari => "InterIndic_Deva", InterIndic_Gujarati => "InterIndic_Gujr", InterIndic_Gurmukhi => "InterIndic_Guru", InterIndic_Kannada => "InterIndic_Knda", InterIndic_Latin => "InterIndic_Latn", InterIndic_Malayalam => "InterIndic_Mlym", InterIndic_Oriya => "InterIndic_Orya", InterIndic_Tamil => "InterIndic_Taml", InterIndic_Telugu => "InterIndic_Telu", Bengali_InterIndic => "Beng_InterIndic", Devanagari_InterIndic => "Deva_InterIndic", Gujarati_InterIndic => "Gujr_InterIndic", Gurmukhi_InterIndic => "Guru_InterIndic", Kannada_InterIndic => "Knda_InterIndic", Latin_InterIndic => "Latn_InterIndic", Malayalam_InterIndic => "Mlym_InterIndic", Oriya_InterIndic => "Orya_InterIndic", Tamil_InterIndic => "Taml_InterIndic", Telugu_InterIndic => "Telu_InterIndic", Han_Pinyin => $JAVA_ONLY, Kanji_English => $JAVA_ONLY, Kanji_OnRomaji => $JAVA_ONLY, Latin_NumericPinyin => "Latn_NPinyn", Tone_Digit => "Tone_Digit", Han_Spacedhan => "Hani_SpHan", ); my ($x,$x,$x,$x,$x,$THIS_YEAR) = localtime(); $THIS_YEAR += 1900; # Header blocks of text written at start of ICU output files my $HEADER1 = <) { next if (/~$/); # Ignore emacs backups next if (/_index\.txt$/); # The index file was processed above # Select either the command-line arg, if there was one, or # any files mentioned in the index. my $leaf = $_; $leaf =~ s|.+[/\\]||; if (($ID && $leaf =~ /$ID/) || exists $USED_FILES{$leaf}) { my ($out, $id) = convertFileName($_); if ($out) { if ($out eq $JAVA_ONLY) { print STDERR "*** $id skipped: Java only ***\n"; next; } file($id, $_, $out); } } elsif (!$ID) { print "*** $leaf skipped: not in use ***\n"; } } ###################################################################### # Convert a Java file name to C # Param: Java file name of the form m|Transliterator_(.+)\.utf8\.txt$| # Return: A C file name (e.g., ldevan.txt) or the empty string, # if there is no mapping, or $JAVA_ONLY if the given file isn't # intended to be incorporated into C. sub convertFileName { local $_ = shift; my $id; if (m|Transliterator_(.+)\.utf8\.txt$| || m|Transliterator_(.+)\.txt$|) { $id = $1; } else { die "Can't parse Java file name $_"; } if (!exists $NAME_MAP{$id}) { print STDERR "ERROR: $id not in map; please update $0\n"; return ''; } my $out = $NAME_MAP{$id}; if ($out eq '') { $out = $id; } if ($out ne $JAVA_ONLY) { $out = 't_' . $out; } if (length($out) > $MAX_ICU4C_FILENAME_LEN) { print STDERR "ERROR: ICU4C file name \"$out\" too long; please update $0\n"; return ''; } return ($out, $id); } ###################################################################### # Convert the index file from Java to C format sub convertIndex { my $JAVA_INDEX = "Transliterator_index.txt"; my $C_INDEX = "translit_index.txt"; open(JAVA_INDEX, "$DIR/$JAVA_INDEX") or die; open(C_INDEX, ">$OUTDIR/$C_INDEX") or die; header(\*C_INDEX, $JAVA_INDEX); print C_INDEX <) { # ignore CVS keyword substitutions next if /\$(Source|Revision|Date)/; # we have printed out the copyright info ... ignore one in Java version next if /Copyright/ ; next if /Corporation/; # Comments; change # to // if (s|^(\s*)\#|$1//|) { print C_INDEX; next; } # Blank lines if (!/\S/) { print C_INDEX; next; } # Content lines chomp; my $prefix = ''; #replace \p with \\p $_=~ s/\\p/\\\\p/g; my @a = split(':', $_); if ($a[1] eq 'file' || $a[1] eq 'internal') { # Convert the file name my $id; # Record file names in use $USED_FILES{$a[2]} = 1; ($a[2], $id) = convertFileName($a[2]); if ($a[2] eq $JAVA_ONLY) { $prefix = '// Java only: '; } # Delete the encoding field splice(@a, 3, 1); } elsif ($a[1] eq 'alias') { # Pad out with extra blank fields to make the # 2-d array square push @a, ""; } else { die "Can't parse $_"; } print C_INDEX $prefix, "{ ", join(", ", map("\"$_\"", @a)), " },\n"; } print C_INDEX < $C_INDEX\n"; } ###################################################################### # Output a header # Param: Filehandle sub header { my $out = shift; my $in = shift; print $out $HEADER1; print $out "// Tool: $TOOL\n// Source: $in\n"; print $out "// Date: ", scalar localtime, "\n"; print $out $HEADER2; print $out "\n"; } ###################################################################### # Process one file # Param: ID, e.g. Fullwidth-Halfwidth # Param: Java input file name, e.g. # f:/icu4j/src/com/ibm/text/resources/Transliterator_Fullwidth_Halfwidth.utf8.txt # Param: ICU output file name, e.g. fullhalf sub file { my $id = shift; my $IN = shift; my $out = shift; my $OUT = "$out.txt"; # Show input size. Show output size later -- useful for quick sanity check. print "$id (", -s $IN, ") -> $OUT ("; # Open file, write UTF8 marker, close it, and reopen in text mode open(OUT, ">$OUTDIR/$OUT") or die; binmode OUT; # Must do this so we can write our UTF8 marker print OUT pack("C3", 0xEF, 0xBB, 0xBF); # Write UTF8 marker close(OUT); open(OUT, ">>$OUTDIR/$OUT") or die; print OUT " // -*- Coding: utf-8; -*-\n"; header(\*OUT, $IN); print OUT "// $id\n"; print OUT "\n"; print OUT "$out {\n"; print OUT " Rule {\n"; open(IN, $IN) or die; binmode IN; # IN is a UTF8 file my $first = 1; my $BOM = pack("C3", 239, 187, 191); # a UTF8 byte order mark # Process each line by changing # comments to // comments # and taking other text and enclosing it in double quotes while () { my $raw = $_; # ignore CVS keyword substitutions next if /\$(Source|Revision|Date)/; # we have printed out the copyright info ... ignore one in Java version next if /Copyright/ ; next if /Corporation/; # Look for and delete BOM if ($first) { s/^$BOM//; $first = 0; } # Clean the eol junk up s/[\x0D\x0A]+$//; # If there is a trailing backslash, then delete it -- we don't # need line continuation in C, since adjacent strings are # concatenated. Count trailing backslashes; if they are odd, # one is trailing. if (m|(\\+)$|) { if ((length($1) % 2) == 1) { s|\\$||; } } # Transform escaped characters hideEscapes(); if (/^(\s*)(\#.*)$/) { # Comment-only line my ($white, $cmt) = ($1, $2); $cmt =~ s|\#|//|; $_ = $white . $cmt; } elsif (!/\S/) { # Blank line -- leave as-is } else { # Remove single-quoted matter my @quotes; my $nquotes = 0; my $x = $_; while (s/^([^\']*)(\'[^\']*\')/$1<>/) { push @quotes, $2; ++$nquotes; } # Extract comment my $cmt = ''; if (s|\#(.*)||) { $cmt = '//' . $1; } # Add quotes s|^(\s*)(\S.*?)(\s*)$|$1\"$2\"$3|; # Restore single-quoted matter for (my $i=0; $i<$nquotes; ++$i) { s|<>|$quotes[$i]|; } # Restore comment $_ .= $cmt; } # Restore escaped characters restoreEscapes(); print OUT $_, "\n"; } # Finish up close(IN); print OUT " }\n"; print OUT "}\n"; close(OUT); # Write output file size for sanity check print -s "$OUTDIR/$OUT", ")\n"; } ###################################################################### sub hideEscapes { # Transform escaped characters s|\\\\|<>|g; # DO THIS FIRST Transform backslashes s|\\u([a-zA-Z0-9]{4})|<>|g; # Transform Unicode escapes s|\\\"|<>|g; # Transform backslash double quote s|\\\'|<>|g; # Transform backslash single quote s|\\\#|<>|g; # Transform backslash pound s|\\(.)|<>|g; # Transform backslash escapes } ###################################################################### sub restoreEscapes { # Restore escaped characters s|<>|\\\\|g; s|<>|\\\\\\\"|g; s|<>|\\\\\\\'|g; s|<>|\\\\\\\#|g; s|<>|\\\\\\$1|g; s|<>|\\\\u0000|g; # Double escape U+0000 s|<>|\\u$1|g; } __END__ :endofperl