X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fengine%2FDictionaryBuilder.java;h=5fd6fc10ed29a41044457c1c2bfced85445fd073;hb=253466ba45a33fcc3ba3a399cfa2f243392db0b0;hp=a3a08c95f1b8d0080ab38de1e994fc5e4fe40cdd;hpb=9a0850fd39f27b5cf08dbf63510f523d8bceff5d;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java index a3a08c9..5fd6fc1 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java @@ -1,3 +1,17 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package com.hughes.android.dictionary.engine; import java.io.File; @@ -6,8 +20,10 @@ import java.io.PrintStream; import java.io.RandomAccessFile; import java.nio.charset.Charset; import java.util.ArrayList; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.regex.Pattern; import javax.xml.parsers.ParserConfigurationException; @@ -15,42 +31,20 @@ import javax.xml.parsers.ParserConfigurationException; import org.xml.sax.SAXException; import com.hughes.android.dictionary.parser.DictFileParser; -import com.hughes.android.dictionary.parser.EnWiktionaryXmlParser; +import com.hughes.android.dictionary.parser.enwiktionary.EnWiktionaryXmlParser; import com.hughes.util.Args; import com.hughes.util.FileUtil; -/* - ---maxEntries=100 ---dictOut=de-en.dict ---lang1=DE ---lang2=EN ---dictInfo=@dictInfo.txt - ---input0=/Users/thadh/personal/quickDic/de-en-chemnitz.txt ---input0Name=chemnitz ---input0Charset=UTF8 ---input0Format=chemnitz - ---input1=/Users/thadh/personal/quickDic/dewiktionary-20100326-pages-articles.xml ---input1Name=wiktionary ---input1Format=wiktionary - ---input2=/Users/thadh/personal/quickDic/de-en-dictcc.txt ---input2Name=dictcc ---input2Charset=Cp1252 ---input2Format=dictcc - */ public class DictionaryBuilder { public final Dictionary dictionary; public final List indexBuilders = new ArrayList(); - public DictionaryBuilder(final String dictInfo, final Language lang0, final Language lang1, final String normalizerRules1, final String normalizerRules2) { - dictionary = new Dictionary(dictInfo); - indexBuilders.add(new IndexBuilder(this, lang0.getSymbol(), lang0.getSymbol() + "->" + lang1.getSymbol(), lang0, normalizerRules1, false)); - indexBuilders.add(new IndexBuilder(this, lang1.getSymbol(), lang1.getSymbol() + "->" + lang0.getSymbol(), lang1, normalizerRules2, true)); + public DictionaryBuilder(final String dictInfoString, final Language lang0, final Language lang1, final String normalizerRules1, final String normalizerRules2, final Set lang1Stoplist, final Set lang2Stoplist) { + dictionary = new Dictionary(dictInfoString); + indexBuilders.add(new IndexBuilder(this, lang0.getIsoCode(), lang0.getIsoCode() + "->" + lang1.getIsoCode(), lang0, normalizerRules1, lang1Stoplist, false)); + indexBuilders.add(new IndexBuilder(this, lang1.getIsoCode(), lang1.getIsoCode() + "->" + lang0.getIsoCode(), lang1, normalizerRules2, lang2Stoplist, true)); } void build() { @@ -61,14 +55,30 @@ public class DictionaryBuilder { } public static void main(final String[] args) throws IOException, ParserConfigurationException, SAXException { + System.out.println("Running with arguments:"); + for (final String arg : args) { + System.out.println(arg); + } + final Map keyValueArgs = Args.keyValueArgs(args); + if (!keyValueArgs.containsKey("lang1") || !keyValueArgs.containsKey("lang2")) { + fatalError("--lang1= and --lang2= must both be specified."); + } final Language lang1 = Language.lookup(keyValueArgs.remove("lang1")); final Language lang2 = Language.lookup(keyValueArgs.remove("lang2")); - if (lang1 == null || lang2 == null) { - fatalError("--lang1= and --lang2= must both be specified."); + + final Set lang1Stoplist = new LinkedHashSet(); + final Set lang2Stoplist = new LinkedHashSet(); + final String lang1StoplistFile = keyValueArgs.remove("lang1Stoplist"); + final String lang2StoplistFile = keyValueArgs.remove("lang2Stoplist"); + if (lang1StoplistFile != null) { + lang1Stoplist.addAll(FileUtil.readLines(new File(lang1StoplistFile))); } - + if (lang2StoplistFile != null) { + lang2Stoplist.addAll(FileUtil.readLines(new File(lang2StoplistFile))); + } + String normalizerRules1 = keyValueArgs.remove("normalizerRules1"); String normalizerRules2 = keyValueArgs.remove("normalizerRules2"); if (normalizerRules1 == null) { @@ -100,7 +110,7 @@ public class DictionaryBuilder { System.out.println("dictInfo=" + dictInfo); System.out.println("dictOut=" + dictOutFilename); - final DictionaryBuilder dictionaryBuilder = new DictionaryBuilder(dictInfo, lang1, lang2, normalizerRules1, normalizerRules2); + final DictionaryBuilder dictionaryBuilder = new DictionaryBuilder(dictInfo, lang1, lang2, normalizerRules1, normalizerRules2, lang1Stoplist, lang2Stoplist); for (int i = 0; i < 100; ++i) { final String prefix = "input" + i; @@ -117,23 +127,30 @@ public class DictionaryBuilder { fatalError("Must specify human readable name for: " + prefix + "Name"); } - final EntrySource entrySource = new EntrySource(dictionaryBuilder.dictionary.sources.size(), inputName, dictionaryBuilder.dictionary.pairEntries.size()); + final EntrySource entrySource = new EntrySource(dictionaryBuilder.dictionary.sources.size(), inputName, 0); System.out.println(""); String inputFormat = keyValueArgs.remove(prefix + "Format"); - if ("dictcc".equals(inputFormat)) { - new DictFileParser(charset, false, DictFileParser.TAB, null, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parseFile(file); + if ("tab_separated".equals(inputFormat)) { + final boolean flipColumns = "true".equals(keyValueArgs.remove(prefix + "FlipColumns")); + new DictFileParser(charset, flipColumns, DictFileParser.TAB, null, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parseFile(file, entrySource); } else if ("chemnitz".equals(inputFormat)) { - new DictFileParser(charset, false, DictFileParser.DOUBLE_COLON, DictFileParser.PIPE, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parseFile(file); + final boolean flipColumns = "true".equals(keyValueArgs.remove(prefix + "FlipColumns")); + new DictFileParser(charset, flipColumns, DictFileParser.DOUBLE_COLON, DictFileParser.PIPE, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parseFile(file, entrySource); } else if ("enwiktionary".equals(inputFormat)) { - final Pattern[] translationPatterns = new Pattern[2]; - translationPatterns[0] = Pattern.compile(keyValueArgs.remove(prefix + "TranslationPattern1")); - translationPatterns[1] = Pattern.compile(keyValueArgs.remove(prefix + "TranslationPattern2")); + final Pattern langPattern = Pattern.compile(keyValueArgs.remove(prefix + "LangPattern"), Pattern.CASE_INSENSITIVE); + final Pattern langCodePattern = Pattern.compile(keyValueArgs.remove(prefix + "LangCodePattern")); final int enIndex = Integer.parseInt(keyValueArgs.remove(prefix + "EnIndex")) - 1; + String pageLimit = keyValueArgs.remove(prefix + "PageLimit"); + if (pageLimit == null) { + pageLimit = "-1"; + } + if (enIndex < 0 || enIndex >= 2) { fatalError("Must be 1 or 2: " + prefix + "EnIndex"); } - new EnWiktionaryXmlParser(dictionaryBuilder, translationPatterns, enIndex).parse(file); + new EnWiktionaryXmlParser(dictionaryBuilder.indexBuilders.get(enIndex), dictionaryBuilder.indexBuilders.get(1-enIndex), + langPattern, langCodePattern, enIndex != 0).parse(file, entrySource, Integer.parseInt(pageLimit)); } else { fatalError("Invalid or missing input format: " + inputFormat); } @@ -166,6 +183,8 @@ public class DictionaryBuilder { private static void fatalError(String string) { System.err.println(string); + + System.exit(1); }