jars/icu4j-52_1/tools/misc/src/com/ibm/icu/dev/tool/translit/dumpICUrules.bat

   1 @rem = '--*-Perl-*--
   2 @echo off
   3 if "%OS%" == "Windows_NT" goto WinNT
   4 perl -x -S "%0" %1 %2 %3 %4 %5 %6 %7 %8 %9
   5 goto endofperl
   6 :WinNT
   7 perl -x -S "%0" %*
   8 if NOT "%COMSPEC%" == "%SystemRoot%\system32\cmd.exe" goto endofperl
   9 if %errorlevel% == 9009 echo You do not have Perl in your PATH.
  10 goto endofperl
  11 @rem ';
  12 #!perl
  13 #line 14
  14 # ************************************************************************
  15 # Copyright (C) 2000-2004, International Business Machines Corporation and
  16 # others. All Rights Reserved.
  17 # ************************************************************************
  18
  19 # This perl script creates ICU transliterator data files, that live
  20 # in icu/data, from ICU4J UTF8 transliterator data files, in
  21 # icu4j/src/com/ibm/icu/impl/data/.
  22 #
  23 # The transformation that is done is very minimal.  The script assumes
  24 # that the input files use only # comments
  25 # and that they follow a rigid format.
  26 #
  27 # The output files are named according to ICU conventions (see NAME_MAP
  28 # below) and created in the current directory.  They should be manually
  29 # checked and then copied into the icu/data/translit directory.
  30 # An ICU build must
  31 # then be initiated, and the standard suite of ICU transliterator tests
  32 # should be run after that.
  33 #
  34 # Alan Liu 5/19/00 2/27/01
  35
  36 use Getopt::Long;
  37 use strict;
  38
  39 use vars qw(%USED_FILES);
  40
  # Directory that holds the ICU4J Transliterator_*.txt rule files
  # (overridden with --dir).
  my $DIR = "../../../impl/data";
  # Optional single ID to convert (--id).  When it is empty, the files
  # named in the index file are converted instead.
  my $ID = '';

  GetOptions('dir=s' => \$DIR,
             'id=s' => \$ID,
             '<>' => \&usage) or die "Invalid command line options\n";

  # Anything left over after option parsing is unexpected.
  usage() if (@ARGV);

  # IDs are written with hyphens (Fullwidth-Halfwidth) but the rule file
  # names use underscores.  This must modify the $ID filled in above:
  # writing "my $ID =~ ..." declares a second, undefined $ID and silently
  # drops the --id option.  /g covers IDs with more than one hyphen.
  $ID =~ s/-/_/g;
  if (! -d $DIR) {
      print STDERR "$DIR is not a directory\n";
      usage();
  }
  55
  # Print the command-line synopsis to STDOUT and terminate the script
  # with exit status 1.  Never returns; any arguments are ignored.
  sub usage {
      # Show only the leaf name of the script, whichever path separator
      # was used to invoke it.
      (my $script = $0) =~ s|.+[/\\]||;
      my @help = (
          "Usage: $script [-dir <dir>] [-id <id>]\n",
          " --dir <dir> Specify the directory containing the\n",
          "             Transliterator_*.txt files\n",
          " --id <id>   Specify a single ID to transform, e.g.\n",
          "             Fullwidth-Halfwidth\n",
      );
      print join('', @help);
      exit(1);
  }
  66
  # Marker used as a %NAME_MAP value for rule files that exist in Java
  # but are not converted for C.
  my $JAVA_ONLY = '-';

  # Directory (relative to the current directory) that receives all
  # generated files.  Create it if needed and fail early with the system
  # error when that is impossible, rather than on the first output open().
  my $OUTDIR = "icu4c";
  if (! -d $OUTDIR) {
      mkdir($OUTDIR, 0777)
          or die "Cannot create output directory $OUTDIR: $!\n";
  }

  # Mapping from Java file names to ICU file names
  # Constraints on the ICU4C file name:
  #    9 characters of prefix, e.g. "icudt20b_"
  # + 18 characters of name, including the distinguisher, e.g. "t_"
  # +  4 characters of "." + extension
  # = 31 characters total.
  # Since the "t_" distinguisher takes 2 of the 18 name characters, every
  # value in %NAME_MAP must be at most 16 characters long.

  my $MAX_ICU4C_FILENAME_LEN = 18;
  79
  80 # -- HISTORY -- If not marked, then pre 2.2.
  81 #               All InterIndic are pre 2.2.
  82 # Any_Accents
  83 # Any_Publishing
  84 # Arabic_Latin           * 2.2
  85 # Cyrillic_Latin
  86 # Fullwidth_Halfwidth
  87 # Greek_Latin
  88 # Greek_Latin_UNGEGN     * 2.2 (moved from el.txt)
  89 # Han_Latin              * 2.2
  90 # Han_Latin_Definition   * 2.2
  91 # Han_Latin_EDICT        * 2.2 J only
  92 # Hebrew_Latin           * 2.2
  93 # Hiragana_Katakana
  94 # Hiragana_Latin
  95 # Latin_Jamo
  96 # Latin_Katakana
  97 # ThaiLogical_Latin      * 2.2 J only
  98 # Thai_ThaiLogical       * 2.2 J only
  99 # Thai_ThaiSemi          * 2.2 J only
 100
 # Java transliterator ID => ICU4C name (before the "t_" distinguisher
 # is added).
 #
 #  * A value of '' means the ICU name is the same as the ID.
 #  * A value of $JAVA_ONLY flags a rule file that exists and is used in
 #    Java but is not used in C.
 #  * Names are filtered based on what is in use in the index file.
 #  * Official script abbreviations are used where possible.
 #  * Values are limited to 16 characters.
 my %NAME_MAP = (
     # Pre-2.2 rule sets
     'Any_Accents'          => '',
     'Any_Publishing'       => '',
     'Cyrillic_Latin'       => 'Cyrl_Latn',
     'Fullwidth_Halfwidth'  => 'FWidth_HWidth',
     'Greek_Latin'          => 'Grek_Latn',
     'Hiragana_Katakana'    => 'Hira_Kana',
     'Hiragana_Latin'       => 'Hira_Latn',
     'Latin_Jamo'           => 'Latn_Jamo',
     'Latin_Katakana'       => 'Latn_Kana',

     # Added in 2.2
     'Arabic_Latin'         => 'Arab_Latn',
     'Greek_Latin_UNGEGN'   => 'Grek_Latn_UNGEGN',
     'Han_Latin'            => 'Hani_Latn',
     'Han_Latin_Definition' => 'Hani_Latn_Def',
     'Han_Latin_EDICT'      => 'Hani_Latn_EDICT',
     'Hebrew_Latin'         => 'Hebr_Latn',
     'ThaiLogical_Latin'    => $JAVA_ONLY, # "ThaiLog_Latn",
     'Thai_ThaiLogical'     => $JAVA_ONLY, # "Thai_ThaiLog",
     'Thai_ThaiSemi'        => $JAVA_ONLY, # "Thai_ThaiSemi",

     # InterIndic rule sets: each script has an InterIndic_<script> and
     # a <script>_InterIndic entry, abbreviated the same way.
     (map {
          my ($script, $abbr) = @$_;
          ("InterIndic_$script"   => "InterIndic_$abbr",
           "${script}_InterIndic" => "${abbr}_InterIndic");
      } (
          [ 'Bengali',    'Beng' ],
          [ 'Devanagari', 'Deva' ],
          [ 'Gujarati',   'Gujr' ],
          [ 'Gurmukhi',   'Guru' ],
          [ 'Kannada',    'Knda' ],
          [ 'Latin',      'Latn' ],
          [ 'Malayalam',  'Mlym' ],
          [ 'Oriya',      'Orya' ],
          [ 'Tamil',      'Taml' ],
          [ 'Telugu',     'Telu' ],
      )),

     'Han_Pinyin'           => $JAVA_ONLY,
     'Kanji_English'        => $JAVA_ONLY,
     'Kanji_OnRomaji'       => $JAVA_ONLY,

     'Latin_NumericPinyin'  => 'Latn_NPinyn',
     'Tone_Digit'           => 'Tone_Digit',
     'Han_Spacedhan'        => 'Hani_SpHan',
 );
 163
 # Current four-digit year for the generated copyright line.  Element 5
 # of localtime() is the year minus 1900; a list slice picks it out
 # without declaring throw-away variables.
 my $THIS_YEAR = (localtime)[5] + 1900;

 # Header blocks of text written at start of ICU output files.
 # $HEADER1 opens the banner (copyright and machine-generated notice);
 # $HEADER2 is the rule line that closes it.
 my $HEADER1 = <<END;
//--------------------------------------------------------------------
// Copyright (c) 1999-$THIS_YEAR, International Business Machines
// Corporation and others.  All Rights Reserved.
//--------------------------------------------------------------------
// THIS IS A MACHINE-GENERATED FILE
END
 my $HEADER2 = <<END;
//--------------------------------------------------------------------
END
 178
 # Name of this script as invoked; header() records it in each generated
 # file.
 my $TOOL = $0;

 # Convert the index first; this tells us which rule files are in use.
 convertIndex();

 # print "In use:\n", join("\n", sort keys(%USED_FILES)), "\n";

 # Iterate over all Java RBT rule files.  A named loop variable is used
 # because file() reads its input through $_ and would otherwise
 # overwrite the loop's current element.
 foreach my $path (<$DIR/Transliterator_*.txt>) {
     next if ($path =~ /~$/); # Ignore emacs backups
     next if ($path =~ /_index\.txt$/); # The index file was processed above

     # Select either the command-line arg, if there was one, or
     # any files mentioned in the index.
     (my $leaf = $path) =~ s|.+[/\\]||;

     # The ID is matched as literal text, not as a regular expression.
     if (($ID && $leaf =~ /\Q$ID\E/) || exists $USED_FILES{$leaf}) {
         my ($out, $id) = convertFileName($path);
         # An empty name means convertFileName() already reported an error.
         next if (!$out);
         if ($out eq $JAVA_ONLY) {
             print STDERR "*** $id skipped: Java only ***\n";
             next;
         }
         file($id, $path, $out);
     } elsif (!$ID) {
         print "*** $leaf skipped: not in use ***\n";
     }
 }
 207
 ######################################################################
 # Map a Java rule file name to its ICU4C base name.
 # Param: Java file name (a leading path is allowed) ending in
 #  Transliterator_<ID>.utf8.txt or Transliterator_<ID>.txt
 # Return: the list (C base name, ID).  The C base name is "t_" followed
 #  by the %NAME_MAP entry for the ID (or by the ID itself when that
 #  entry is ''), with no extension; it is $JAVA_ONLY when the given
 #  file isn't intended to be incorporated into C.  If the ID has no
 #  mapping, or the resulting name is too long, an error is printed to
 #  STDERR and the empty string alone is returned.
 # Dies when the file name has neither of the expected forms.
 sub convertFileName {
     my $java_name = shift;

     # Try the ".utf8.txt" form first so that ".utf8" never ends up in the ID.
     my $id;
     if ($java_name =~ m|Transliterator_(.+)\.utf8\.txt$|) {
         $id = $1;
     }
     elsif ($java_name =~ m|Transliterator_(.+)\.txt$|) {
         $id = $1;
     }
     else {
         die "Can't parse Java file name $java_name";
     }

     unless (exists $NAME_MAP{$id}) {
         print STDERR "ERROR: $id not in map; please update $0\n";
         return '';
     }

     # An empty map entry means "use the ID as the ICU name".
     my $mapped = $NAME_MAP{$id};
     my $c_name = ($mapped eq '') ? $id : $mapped;

     # Everything except the Java-only marker gets the "t_" distinguisher.
     $c_name = 't_' . $c_name unless $c_name eq $JAVA_ONLY;

     if (length($c_name) > $MAX_ICU4C_FILENAME_LEN) {
         print STDERR "ERROR: ICU4C file name \"$c_name\" too long; please update $0\n";
         return '';
     }
     return ($c_name, $id);
 }
 238
 ######################################################################
 # Convert the index file from Java to C format.
 # Reads $DIR/Transliterator_index.txt and writes
 # $OUTDIR/translit_index.txt.  As a side effect, every rule file named
 # on a 'file' or 'internal' line is recorded in %USED_FILES.
 # Dies if either file cannot be opened, if the output cannot be
 # completely written, or if a content line is of an unknown kind.
 sub convertIndex {
     my $JAVA_INDEX = "Transliterator_index.txt";
     my $C_INDEX = "translit_index.txt";
     my $java_path = "$DIR/$JAVA_INDEX";
     my $c_path = "$OUTDIR/$C_INDEX";

     # Three-argument open on lexical handles; report the file and the
     # system error on failure.
     open(my $java_fh, '<', $java_path)
         or die "Can't open $java_path for reading: $!\n";
     open(my $c_fh, '>', $c_path)
         or die "Can't open $c_path for writing: $!\n";

     header($c_fh, $JAVA_INDEX);

     print {$c_fh} <<END;
//--------------------------------------------------------------------
// N.B.: This file has been generated mechanically from the
// corresponding ICU4J file, which is the master file that receives
// primary updates.  The colon-delimited fields have been split into
// separate strings.  For 'file' and 'internal' lines, the encoding
// field has been deleted, since the encoding is processed at build
// time in ICU4C.  Certain large rule sets not intended for general
// use have been commented out with the notation "Java only".
//--------------------------------------------------------------------

translit_index {
  RuleBasedTransliteratorIDs {
END

     while (my $line = <$java_fh>) {
         # ignore CVS keyword substitutions
         next if $line =~ /\$(Source|Revision|Date)/;

         # we have printed out the copyright info ... ignore one in Java version
         next if $line =~ /Copyright/;
         next if $line =~ /Corporation/;

         # Comments; change # to //
         if ($line =~ s|^(\s*)\#|$1//|) {
             print {$c_fh} $line;
             next;
         }
         # Blank lines
         if ($line !~ /\S/) {
             print {$c_fh} $line;
             next;
         }
         # Content lines
         chomp $line;
         my $prefix = '';
         # replace \p with \\p
         $line =~ s/\\p/\\\\p/g;
         my @fields = split(':', $line);
         if ($fields[1] eq 'file' || $fields[1] eq 'internal') {
             # Record file names in use
             $USED_FILES{$fields[2]} = 1;
             # Convert the file name.  When the ID is unmapped,
             # convertFileName() reports it and yields '', which is
             # written out as an empty name.
             my $id;
             ($fields[2], $id) = convertFileName($fields[2]);
             if ($fields[2] eq $JAVA_ONLY) {
                 $prefix = '// Java only: ';
             }

             # Delete the encoding field
             splice(@fields, 3, 1);
         } elsif ($fields[1] eq 'alias') {
             # Pad out with extra blank fields to make the
             # 2-d array square
             push @fields, "";
         } else {
             die "Can't parse $line";
         }
         print {$c_fh}
             $prefix, "{ ",
             join(", ", map("\"$_\"", @fields)),
             " },\n";
     }

     print {$c_fh} <<END;
  }
}
END

     # Buffered write errors only show up at close time.
     close($c_fh) or die "Error writing $c_path: $!\n";
     close($java_fh);
     print STDERR "$JAVA_INDEX -> $C_INDEX\n";
 }
 322
 ######################################################################
 # Output a header: the copyright banner, the tool name, the source
 # file name, the current date and the closing rule, followed by a
 # blank line.
 # Param: Filehandle to write to
 # Param: Name of the source file, recorded on the "// Source:" line
 sub header {
     my ($fh, $source) = @_;

     # localtime in scalar context yields a printable date string.
     my $stamp = localtime;

     my $text = $HEADER1
              . "// Tool: $TOOL\n// Source: $source\n"
              . "// Date: " . $stamp . "\n"
              . $HEADER2
              . "\n";
     print {$fh} $text;
 }
 335
 ######################################################################
 # Process one file: convert a Java rule file into the ICU4C resource
 # source file "$OUTDIR/<out>.txt".
 # Param: ID, e.g. Fullwidth-Halfwidth
 # Param: Java input file name, e.g.
 #  f:/icu4j/src/com/ibm/text/resources/Transliterator_Fullwidth_Halfwidth.utf8.txt
 # Param: ICU output file name without extension; it is also used as
 #  the name of the resource table that wraps the rules
 # Prints "<id> (<input size>) -> <out>.txt (<output size>)" to STDOUT.
 # Dies if a file cannot be opened or the output cannot be completely
 # written.
 sub file {
     my $id = shift;
     my $IN = shift;
     my $out = shift;

     my $OUT = "$out.txt";
     my $out_path = "$OUTDIR/$OUT";

     # Show input size. Show output size later -- useful for quick sanity check.
     print "$id (", -s $IN, ") -> $OUT (";

     # Open file, write UTF8 marker, close it, and reopen in text mode
     open(my $out_fh, '>', $out_path)
         or die "Can't open $out_path for writing: $!\n";
     binmode $out_fh;    # Must do this so we can write our UTF8 marker
     print {$out_fh} pack("C3", 0xEF, 0xBB, 0xBF); # Write UTF8 marker
     close($out_fh) or die "Error writing $out_path: $!\n";

     open($out_fh, '>>', $out_path)
         or die "Can't open $out_path for appending: $!\n";
     print {$out_fh} " // -*- Coding: utf-8; -*-\n";

     header($out_fh, $IN);
     print {$out_fh} "// $id\n";
     print {$out_fh} "\n";
     print {$out_fh} "$out {\n";
     print {$out_fh} "  Rule {\n";

     # Three-argument open: the name is used literally, whatever
     # characters it contains.
     open(my $in_fh, '<', $IN) or die "Can't open $IN for reading: $!\n";
     binmode $in_fh;             # IN is a UTF8 file

     my $first = 1;
     my $BOM = pack("C3", 239, 187, 191); # a UTF8 byte order mark

     # hideEscapes() and restoreEscapes() work on $_, so the loop reads
     # into $_; localize it so the caller's $_ is left alone.
     local $_;

     # Process each line by changing # comments to // comments
     # and taking other text and enclosing it in double quotes
     while (<$in_fh>) {
         # ignore CVS keyword substitutions
         next if /\$(Source|Revision|Date)/;

         # we have printed out the copyright info ... ignore one in Java version
         next if /Copyright/ ;
         next if /Corporation/;

         # Look for and delete BOM
         if ($first) {
             s/^$BOM//;
             $first = 0;
         }

         # Clean the eol junk up
         s/[\x0D\x0A]+$//;

         # If there is a trailing backslash, then delete it -- we don't
         # need line continuation in C, since adjacent strings are
         # concatenated.  Count trailing backslashes; if they are odd,
         # one is trailing.
         if (m|(\\+)$|) {
             if ((length($1) % 2) == 1) {
                 s|\\$||;
             }
         }

         # Transform escaped characters
         hideEscapes();

         if (/^(\s*)(\#.*)$/) {
             # Comment-only line
             my ($white, $cmt) = ($1, $2);
             $cmt =~ s|\#|//|;
             $_ = $white . $cmt;

         } elsif (!/\S/) {
             # Blank line -- leave as-is

         } else {
             # Remove single-quoted matter, replacing each quoted run
             # with a numbered <<xN>> placeholder so that a # inside
             # quotes is not taken for a comment.
             my @quotes;
             my $nquotes = 0;
             while (s/^([^\']*)(\'[^\']*\')/$1<<x$nquotes>>/) {
                 push @quotes, $2;
                 ++$nquotes;
             }

             # Extract comment
             my $cmt = '';
             if (s|\#(.*)||) {
                 $cmt = '//' . $1;
             }

             # Add quotes
             s|^(\s*)(\S.*?)(\s*)$|$1\"$2\"$3|;

             # Restore single-quoted matter
             for (my $i=0; $i<$nquotes; ++$i) {
                 s|<<x$i>>|$quotes[$i]|;
             }

             # Restore comment
             $_ .= $cmt;
         }

         # Restore escaped characters
         restoreEscapes();

         print {$out_fh} $_, "\n";
     }

     # Finish up
     close($in_fh);
     print {$out_fh} "  }\n";
     print {$out_fh} "}\n";
     # Buffered write errors only show up at close time.
     close($out_fh) or die "Error writing $out_path: $!\n";

     # Write output file size for sanity check
     print(-s $out_path, ")\n");
 }
 458
 ######################################################################
 # Replace each backslash escape in $_ with a <<...>> placeholder that
 # contains no backslash, quote or pound character.  file() calls this
 # before it looks for comments and quoted text; restoreEscapes() turns
 # the placeholders back into escapes.
 # Operates on $_ in place; takes no parameters.
 sub hideEscapes {
     # Escaped backslashes go first, so that the second backslash of a
     # pair is never taken as the start of another escape.
     $_ =~ s{\\\\}{<<bs>>}g;

     # Unicode escapes: backslash, u, four alphanumerics
     $_ =~ s{\\u([a-zA-Z0-9]{4})}{<<u$1>>}g;

     # Escaped double quote, single quote and pound sign
     $_ =~ s{\\\"}{<<dq>>}g;
     $_ =~ s{\\\'}{<<sq>>}g;
     $_ =~ s{\\\#}{<<lb>>}g;

     # Any remaining backslash followed by one character
     $_ =~ s{\\(.)}{<<q$1>>}g;
 }
 469
 ######################################################################
 # Turn the <<...>> placeholders left in $_ by hideEscapes() back into
 # backslash escapes, in the form needed inside the quoted output
 # strings.  Operates on $_ in place; takes no parameters.
 sub restoreEscapes {
     # Escaped backslash: comes back as two backslashes
     $_ =~ s{<<bs>>}{\\\\}g;

     # Escaped quote, pound sign or other single character: comes back
     # as three backslashes followed by the character
     $_ =~ s{<<dq>>}{\\\\\\\"}g;
     $_ =~ s{<<sq>>}{\\\\\\\'}g;
     $_ =~ s{<<lb>>}{\\\\\\\#}g;
     $_ =~ s{<<q(.)>>}{\\\\\\$1}g;

     # Unicode escapes come back unchanged, except U+0000, which is
     # double escaped and therefore has to be handled first
     $_ =~ s{<<u0000>>}{\\\\u0000}g;
     $_ =~ s{<<u(....)>>}{\\u$1}g;
 }
 481
 482 __END__
 483 :endofperl