3 if "%OS%" == "Windows_NT" goto WinNT
\r
4 perl -x -S "%0" %1 %2 %3 %4 %5 %6 %7 %8 %9
\r
8 if NOT "%COMSPEC%" == "%SystemRoot%\system32\cmd.exe" goto endofperl
\r
9 if %errorlevel% == 9009 echo You do not have Perl in your PATH.
\r
14 # ************************************************************************
\r
15 # Copyright (C) 2000-2004, International Business Machines Corporation and
\r
16 # others. All Rights Reserved.
\r
17 # ************************************************************************
\r
19 # This perl script creates ICU transliterator data files, that live
\r
20 # in icu/data, from ICU4J UTF8 transliterator data files, in
\r
21 # icu4j/src/com/ibm/icu/impl/data/.
\r
23 # The transformation that is done is very minimal. The script assumes
\r
24 # that the input files use only # comments
\r
25 # and that they follow a rigid format.
\r
27 # The output files are named according to ICU conventions (see NAME_MAP
\r
28 # below) and created in the current directory. They should be manually
\r
29 # checked and then copied into the icu/data/translit directory.
\r
31 # then be initiated, and the standard suite of ICU transliterator tests
\r
32 # should be run after that.
\r
34 # Alan Liu 5/19/00 2/27/01
\r
39 use vars qw(%USED_FILES);
\r
41 my $DIR = "../../../impl/data";
\r
44 GetOptions('dir=s' => \$DIR,
\r
46 '<>' => \&usage) || die;
\r
52 print STDERR "$DIR is not a directory\n";
\r
59 print "Usage: $me [-dir <dir>] [-id <id>]\n";
\r
60 print " --dir <dir> Specify the directory containing the\n";
\r
61 print " Transliterator_*.txt files\n";
\r
62 print " --id <id> Specify a single ID to transform, e.g.\n";
\r
63 print " Fullwidth-Halfwidth\n";
\r
67 my $JAVA_ONLY = '-';
\r
69 my $OUTDIR = "icu4c";
\r
70 mkdir($OUTDIR,0777);
\r
72 # Mapping from Java file names to ICU file names
\r
73 # Constraints on ICU4C file name: icudt20b_
\r
74 # |--9 (prefix)---|---18(name with distinguisher,e.g. "t_" )----|
\r
75 # --4 ("."+extn)--| = 31 characters total.
\r
76 # That is, must have length($NAME_MAP{x}) <= 16 (18 minus the 2-character "t_" prefix)
\r
78 my $MAX_ICU4C_FILENAME_LEN = 18;
\r
80 # -- HISTORY -- If not marked, then pre 2.2.
\r
81 # All InterIndic are pre 2.2.
\r
84 # Arabic_Latin * 2.2
\r
86 # Fullwidth_Halfwidth
\r
88 # Greek_Latin_UNGEGN * 2.2 (moved from el.txt)
\r
90 # Han_Latin_Definition * 2.2
\r
91 # Han_Latin_EDICT * 2.2 J only
\r
92 # Hebrew_Latin * 2.2
\r
97 # ThaiLogical_Latin * 2.2 J only
\r
98 # Thai_ThaiLogical * 2.2 J only
\r
99 # Thai_ThaiSemi * 2.2 J only
\r
102 # An ICU name of "" means the ICU name == the ID
\r
104 # We filter names based on what is in use in the index file.
\r
106 # Flag a rule as JAVA_ONLY if it exists and we use it in Java,
\r
107 # but we don't use it in C.
\r
109 # Use official script abbreviations where possible.
\r
111 # |..............| |..............|
\r
112 # 1234567890123456 1234567890123456
\r
114 Any_Publishing => "",
\r
115 Cyrillic_Latin => "Cyrl_Latn",
\r
116 Fullwidth_Halfwidth => "FWidth_HWidth",
\r
117 Greek_Latin => "Grek_Latn",
\r
118 Hiragana_Katakana => "Hira_Kana",
\r
119 Hiragana_Latin => "Hira_Latn",
\r
120 Latin_Jamo => "Latn_Jamo",
\r
121 Latin_Katakana => "Latn_Kana",
\r
123 Arabic_Latin => "Arab_Latn",
\r
124 Greek_Latin_UNGEGN => "Grek_Latn_UNGEGN",
\r
125 Han_Latin => "Hani_Latn",
\r
126 Han_Latin_Definition => "Hani_Latn_Def",
\r
127 Han_Latin_EDICT => "Hani_Latn_EDICT",
\r
128 Hebrew_Latin => "Hebr_Latn",
\r
129 ThaiLogical_Latin => $JAVA_ONLY, # "ThaiLog_Latn",
\r
130 Thai_ThaiLogical => $JAVA_ONLY, # "Thai_ThaiLog",
\r
131 Thai_ThaiSemi => $JAVA_ONLY, # "Thai_ThaiSemi",
\r
133 InterIndic_Bengali => "InterIndic_Beng",
\r
134 InterIndic_Devanagari => "InterIndic_Deva",
\r
135 InterIndic_Gujarati => "InterIndic_Gujr",
\r
136 InterIndic_Gurmukhi => "InterIndic_Guru",
\r
137 InterIndic_Kannada => "InterIndic_Knda",
\r
138 InterIndic_Latin => "InterIndic_Latn",
\r
139 InterIndic_Malayalam => "InterIndic_Mlym",
\r
140 InterIndic_Oriya => "InterIndic_Orya",
\r
141 InterIndic_Tamil => "InterIndic_Taml",
\r
142 InterIndic_Telugu => "InterIndic_Telu",
\r
144 Bengali_InterIndic => "Beng_InterIndic",
\r
145 Devanagari_InterIndic => "Deva_InterIndic",
\r
146 Gujarati_InterIndic => "Gujr_InterIndic",
\r
147 Gurmukhi_InterIndic => "Guru_InterIndic",
\r
148 Kannada_InterIndic => "Knda_InterIndic",
\r
149 Latin_InterIndic => "Latn_InterIndic",
\r
150 Malayalam_InterIndic => "Mlym_InterIndic",
\r
151 Oriya_InterIndic => "Orya_InterIndic",
\r
152 Tamil_InterIndic => "Taml_InterIndic",
\r
153 Telugu_InterIndic => "Telu_InterIndic",
\r
155 Han_Pinyin => $JAVA_ONLY,
\r
156 Kanji_English => $JAVA_ONLY,
\r
157 Kanji_OnRomaji => $JAVA_ONLY,
\r
159 Latin_NumericPinyin => "Latn_NPinyn",
\r
160 Tone_Digit => "Tone_Digit",
\r
161 Han_Spacedhan => "Hani_SpHan",
\r
164 my ($x,$x,$x,$x,$x,$THIS_YEAR) = localtime();
\r
165 $THIS_YEAR += 1900;
\r
167 # Header blocks of text written at start of ICU output files
\r
168 my $HEADER1 = <<END;
\r
169 //--------------------------------------------------------------------
\r
170 // Copyright (c) 1999-$THIS_YEAR, International Business Machines
\r
171 // Corporation and others. All Rights Reserved.
\r
172 //--------------------------------------------------------------------
\r
173 // THIS IS A MACHINE-GENERATED FILE
\r
175 my $HEADER2 = <<END;
\r
176 //--------------------------------------------------------------------
\r
181 # Convert the index first; this tells us which rule files are in use.
\r
184 # print "In use:\n", join("\n", sort keys(%USED_FILES)), "\n";
\r
186 # Iterate over all Java RBT rule files
\r
187 foreach (<$DIR/Transliterator_*.txt>) {
\r
188 next if (/~$/); # Ignore emacs backups
\r
189 next if (/_index\.txt$/); # The index file was processed above
\r
190 # Select either the command-line arg, if there was one, or
\r
191 # any files mentioned in the index.
\r
193 $leaf =~ s|.+[/\\]||;
\r
194 if (($ID && $leaf =~ /$ID/) || exists $USED_FILES{$leaf}) {
\r
195 my ($out, $id) = convertFileName($_);
\r
197 if ($out eq $JAVA_ONLY) {
\r
198 print STDERR "*** $id skipped: Java only ***\n";
\r
201 file($id, $_, $out);
\r
204 print "*** $leaf skipped: not in use ***\n";
\r
208 ######################################################################
\r
209 # Convert a Java file name to C
\r
210 # Param: Java file name of the form m|Transliterator_(.+)\.utf8\.txt$| or m|Transliterator_(.+)\.txt$|
\r
211 # Return: The list ($out, $id), where $out is a C file name without extension (e.g., t_Cyrl_Latn) or the empty string,
\r
212 # if there is no mapping, or $JAVA_ONLY if the given file isn't
\r
213 # intended to be incorporated into C.
\r
214 sub convertFileName {
\r
217 if (m|Transliterator_(.+)\.utf8\.txt$| ||
\r
218 m|Transliterator_(.+)\.txt$|) {
\r
220 } else { die "Can't parse Java file name $_"; }
\r
221 if (!exists $NAME_MAP{$id}) {
\r
222 print STDERR "ERROR: $id not in map; please update $0\n";
\r
225 my $out = $NAME_MAP{$id};
\r
229 if ($out ne $JAVA_ONLY) {
\r
230 $out = 't_' . $out;
\r
232 if (length($out) > $MAX_ICU4C_FILENAME_LEN) {
\r
233 print STDERR "ERROR: ICU4C file name \"$out\" too long; please update $0\n";
\r
236 return ($out, $id);
\r
239 ######################################################################
\r
240 # Convert the index file from Java to C format
\r
242 my $JAVA_INDEX = "Transliterator_index.txt";
\r
243 my $C_INDEX = "translit_index.txt";
\r
244 open(JAVA_INDEX, "$DIR/$JAVA_INDEX") or die;
\r
245 open(C_INDEX, ">$OUTDIR/$C_INDEX") or die;
\r
247 header(\*C_INDEX, $JAVA_INDEX);
\r
249 print C_INDEX <<END;
\r
250 //--------------------------------------------------------------------
\r
251 // N.B.: This file has been generated mechanically from the
\r
252 // corresponding ICU4J file, which is the master file that receives
\r
253 // primary updates. The colon-delimited fields have been split into
\r
254 // separate strings. For 'file' and 'internal' lines, the encoding
\r
255 // field has been deleted, since the encoding is processed at build
\r
256 // time in ICU4C. Certain large rule sets not intended for general
\r
257 // use have been commented out with the notation "Java only".
\r
258 //--------------------------------------------------------------------
\r
261 RuleBasedTransliteratorIDs {
\r
264 while (<JAVA_INDEX>) {
\r
265 # ignore CVS keyword substitutions
\r
266 next if /\$(Source|Revision|Date)/;
\r
268 # we have printed out the copyright info ... ignore one in Java version
\r
269 next if /Copyright/ ;
\r
270 next if /Corporation/;
\r
272 # Comments; change # to //
\r
273 if (s|^(\s*)\#|$1//|) {
\r
285 #replace \p with \\p
\r
286 $_=~ s/\\p/\\\\p/g;
\r
287 my @a = split(':', $_);
\r
288 if ($a[1] eq 'file' || $a[1] eq 'internal') {
\r
289 # Convert the file name
\r
291 # Record file names in use
\r
292 $USED_FILES{$a[2]} = 1;
\r
293 ($a[2], $id) = convertFileName($a[2]);
\r
294 if ($a[2] eq $JAVA_ONLY) {
\r
295 $prefix = '// Java only: ';
\r
298 # Delete the encoding field
\r
300 } elsif ($a[1] eq 'alias') {
\r
301 # Pad out with extra blank fields to make the
\r
305 die "Can't parse $_";
\r
309 join(", ", map("\"$_\"", @a)),
\r
313 print C_INDEX <<END;
\r
320 print STDERR "$JAVA_INDEX -> $C_INDEX\n";
\r
323 ######################################################################
\r
325 # Param: Filehandle
\r
329 print $out $HEADER1;
\r
330 print $out "// Tool: $TOOL\n// Source: $in\n";
\r
331 print $out "// Date: ", scalar localtime, "\n";
\r
332 print $out $HEADER2;
\r
336 ######################################################################
\r
338 # Param: ID, e.g. Fullwidth-Halfwidth
\r
339 # Param: Java input file name, e.g.
\r
340 # f:/icu4j/src/com/ibm/text/resources/Transliterator_Fullwidth_Halfwidth.utf8.txt
\r
341 # Param: ICU output file name without the .txt extension, e.g. t_FWidth_HWidth
\r
347 my $OUT = "$out.txt";
\r
349 # Show input size. Show output size later -- useful for quick sanity check.
\r
350 print "$id (", -s $IN, ") -> $OUT (";
\r
352 # Open file, write UTF8 marker, close it, and reopen in text mode
\r
353 open(OUT, ">$OUTDIR/$OUT") or die;
\r
354 binmode OUT; # Must do this so we can write our UTF8 marker
\r
355 print OUT pack("C3", 0xEF, 0xBB, 0xBF); # Write UTF8 marker
\r
358 open(OUT, ">>$OUTDIR/$OUT") or die;
\r
359 print OUT " // -*- Coding: utf-8; -*-\n";
\r
361 header(\*OUT, $IN);
\r
362 print OUT "// $id\n";
\r
364 print OUT "$out {\n";
\r
365 print OUT " Rule {\n";
\r
367 open(IN, $IN) or die;
\r
368 binmode IN; # IN is a UTF8 file
\r
371 my $BOM = pack("C3", 239, 187, 191); # a UTF8 byte order mark
\r
373 # Process each line by changing # comments to // comments
\r
374 # and taking other text and enclosing it in double quotes
\r
377 # ignore CVS keyword substitutions
\r
378 next if /\$(Source|Revision|Date)/;
\r
380 # we have printed out the copyright info ... ignore one in Java version
\r
381 next if /Copyright/ ;
\r
382 next if /Corporation/;
\r
384 # Look for and delete BOM
\r
390 # Clean the eol junk up
\r
393 # If there is a trailing backslash, then delete it -- we don't
\r
394 # need line continuation in C, since adjacent strings are
\r
395 # concatenated. Count trailing backslashes; if they are odd,
\r
398 if ((length($1) % 2) == 1) {
\r
403 # Transform escaped characters
\r
406 if (/^(\s*)(\#.*)$/) {
\r
407 # Comment-only line
\r
408 my ($white, $cmt) = ($1, $2);
\r
410 $_ = $white . $cmt;
\r
413 # Blank line -- leave as-is
\r
416 # Remove single-quoted matter
\r
420 while (s/^([^\']*)(\'[^\']*\')/$1<<x$nquotes>>/) {
\r
432 s|^(\s*)(\S.*?)(\s*)$|$1\"$2\"$3|;
\r
434 # Restore single-quoted matter
\r
435 for (my $i=0; $i<$nquotes; ++$i) {
\r
436 s|<<x$i>>|$quotes[$i]|;
\r
443 # Restore escaped characters
\r
446 print OUT $_, "\n";
\r
455 # Write output file size for sanity check
\r
456 print -s "$OUTDIR/$OUT", ")\n";
\r
459 ######################################################################
\r
461 # Transform escaped characters
\r
462 s|\\\\|<<bs>>|g; # DO THIS FIRST Transform backslashes
\r
463 s|\\u([a-zA-Z0-9]{4})|<<u$1>>|g; # Transform Unicode escapes
\r
464 s|\\\"|<<dq>>|g; # Transform backslash double quote
\r
465 s|\\\'|<<sq>>|g; # Transform backslash single quote
\r
466 s|\\\#|<<lb>>|g; # Transform backslash pound
\r
467 s|\\(.)|<<q$1>>|g; # Transform backslash escapes
\r
470 ######################################################################
\r
471 sub restoreEscapes {
\r
472 # Restore escaped characters
\r
474 s|<<dq>>|\\\\\\\"|g;
\r
475 s|<<sq>>|\\\\\\\'|g;
\r
476 s|<<lb>>|\\\\\\\#|g;
\r
477 s|<<q(.)>>|\\\\\\$1|g;
\r
478 s|<<u0000>>|\\\\u0000|g; # Double escape U+0000
\r
479 s|<<u(....)>>|\\u$1|g;
\r