From: Thad Hughes Date: Fri, 16 Dec 2011 19:47:23 +0000 (-0800) Subject: Stoplists, fix location of wikisplits. X-Git-Url: http://gitweb.fperrin.net/?p=DictionaryPC.git;a=commitdiff_plain;h=5fab504f765ff1553c98096ba85b04ffc2ef1062 Stoplists, fix location of wikisplits. --- diff --git a/bugs b/bugs index b146b34..259529c 100644 --- a/bugs +++ b/bugs @@ -1,5 +1,5 @@ - Alfredo {{it-proper noun|g=m}} :: , equivalent to English Alfred. - +sub-levels in translations. +examples. in wiktionary futurismo :: futurism () (noun) diff --git a/src/com/hughes/android/dictionary/SerializeCollatorTest.java b/src/com/hughes/android/dictionary/SerializeCollatorTest.java new file mode 100644 index 0000000..d22ce5e --- /dev/null +++ b/src/com/hughes/android/dictionary/SerializeCollatorTest.java @@ -0,0 +1,35 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package com.hughes.android.dictionary; + +import java.io.File; +import java.io.IOException; + +import com.hughes.android.dictionary.engine.Language; +import com.ibm.icu.text.Collator; + +public class SerializeCollatorTest { + + /** + * @param args + * @throws IOException + */ + public static void main(String[] args) throws IOException { + File temp = File.createTempFile("temp", null); + final Collator c = Language.de.getCollator(); + //FileUtil.writeObject(c, temp); + } + +} diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java index 549d386..04b72b4 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java @@ -20,8 +20,10 @@ import java.io.PrintStream; import java.io.RandomAccessFile; import java.nio.charset.Charset; import java.util.ArrayList; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.regex.Pattern; import javax.xml.parsers.ParserConfigurationException; @@ -33,38 +35,16 @@ import com.hughes.android.dictionary.parser.EnWiktionaryXmlParser; import com.hughes.util.Args; import com.hughes.util.FileUtil; -/* - ---maxEntries=100 ---dictOut=de-en.dict ---lang1=DE ---lang2=EN ---dictInfo=@dictInfo.txt - ---input0=/Users/thadh/personal/quickDic/de-en-chemnitz.txt ---input0Name=chemnitz ---input0Charset=UTF8 ---input0Format=chemnitz - ---input1=/Users/thadh/personal/quickDic/dewiktionary-20100326-pages-articles.xml ---input1Name=wiktionary ---input1Format=wiktionary - ---input2=/Users/thadh/personal/quickDic/de-en-dictcc.txt ---input2Name=dictcc ---input2Charset=Cp1252 ---input2Format=dictcc - */ public class DictionaryBuilder { public final Dictionary dictionary; public final List indexBuilders = new ArrayList(); - public DictionaryBuilder(final String dictInfo, final Language lang0, final Language lang1, final String normalizerRules1, final String normalizerRules2) { + public DictionaryBuilder(final String dictInfo, final Language lang0, final Language lang1, final String normalizerRules1, final String normalizerRules2, final Set lang1Stoplist, final Set lang2Stoplist) { dictionary = new Dictionary(dictInfo); - indexBuilders.add(new IndexBuilder(this, lang0.getSymbol(), lang0.getSymbol() + "->" + lang1.getSymbol(), lang0, normalizerRules1, false)); - indexBuilders.add(new IndexBuilder(this, lang1.getSymbol(), lang1.getSymbol() + "->" + lang0.getSymbol(), lang1, normalizerRules2, true)); + indexBuilders.add(new IndexBuilder(this, lang0.getSymbol(), lang0.getSymbol() + "->" + lang1.getSymbol(), lang0, normalizerRules1, lang1Stoplist, false)); + indexBuilders.add(new IndexBuilder(this, lang1.getSymbol(), lang1.getSymbol() + "->" + lang0.getSymbol(), lang1, normalizerRules2, lang2Stoplist, true)); } void build() { @@ -82,7 +62,18 @@ public class DictionaryBuilder { if (lang1 == null || lang2 == null) { fatalError("--lang1= and --lang2= must both be specified."); } - + + final Set lang1Stoplist = new LinkedHashSet(); + final Set lang2Stoplist = new LinkedHashSet(); + final String lang1StoplistFile = keyValueArgs.remove("lang1Stoplist"); + final String lang2StoplistFile = keyValueArgs.remove("lang2Stoplist"); + if (lang1StoplistFile != null) { + lang1Stoplist.addAll(FileUtil.readLines(new File(lang1StoplistFile))); + } + if (lang2StoplistFile != null) { + lang2Stoplist.addAll(FileUtil.readLines(new File(lang2StoplistFile))); + } + String normalizerRules1 = keyValueArgs.remove("normalizerRules1"); String normalizerRules2 = keyValueArgs.remove("normalizerRules2"); if (normalizerRules1 == null) { @@ -114,7 +105,7 @@ public class DictionaryBuilder { System.out.println("dictInfo=" + dictInfo); System.out.println("dictOut=" + dictOutFilename); - final DictionaryBuilder dictionaryBuilder = new DictionaryBuilder(dictInfo, lang1, lang2, normalizerRules1, normalizerRules2); + final DictionaryBuilder dictionaryBuilder = new DictionaryBuilder(dictInfo, lang1, lang2, normalizerRules1, normalizerRules2, lang1Stoplist, lang2Stoplist); for (int i = 0; i < 100; ++i) { final String prefix = "input" + i; diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java index 2369915..ebf4ba7 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java @@ -26,16 +26,19 @@ import junit.framework.TestCase; public class DictionaryBuilderMain extends TestCase { static final String INPUTS = "../DictionaryData/inputs/"; + static final String STOPLISTS = "../DictionaryData/inputs/stoplists/"; static final String OUTPUTS = "../DictionaryData/outputs/"; static class Lang { final String nameRegex; final String isoCode; final String wikiSplit; - public Lang(String nameRegex, String code, final String wikiSplit) { + final String stoplistFile; + public Lang(String nameRegex, String code, final String wikiSplit, final String stoplistFile) { this.nameRegex = nameRegex; this.isoCode = code; this.wikiSplit = wikiSplit; + this.stoplistFile = stoplistFile; } } @@ -43,29 +46,29 @@ public class DictionaryBuilderMain extends TestCase { public static void main(final String[] args) throws Exception { Lang[] langs1 = new Lang[] { - new Lang("^English$", "EN", null), + new Lang("^English$", "EN", null, "en.txt"), }; Lang[] langs2 = new Lang[] { - new Lang("^.*Italian.*$", "IT", "italian.data"), - new Lang("^.*Greek.*$", "EL", "greek.data"), - new Lang("^.*Spanish.*$", "ES", "spanish.data"), + new Lang("^.*Italian.*$", "IT", "italian.data", "it.txt"), + new Lang("^.*French.*$", "FR", "french.data", "empty.txt"), + new Lang("^.*Spanish.*$", "ES", "spanish.data", "empty.txt"), + new Lang("^.*Greek.*$", "EL", "greek.data", "empty.txt"), + new Lang("^.*Japanese.*$", "JA", "japanese.data", "empty.txt"), + new Lang("^.*Chinese.*$|^.*Mandarin.*$", "ZH", "mandarin.data", "empty.txt"), /* new Lang("^German$", "DE"), new Lang("^Afrikaans$", "AF"), new Lang("^Armenian$", "HY"), new Lang("^Arabic$", "AR"), - new Lang("^Chinese$|^Mandarin$", "ZH"), new Lang("^Croation$", "HR"), new Lang("^Czech$", "CS"), new Lang("^Dutch$", "NL"), new Lang("^English$", "EN"), new Lang("^Finnish$", "FI"), - new Lang("^French$", "FR"), new Lang("^Hebrew$", "HE"), new Lang("^Hindi$", "HI"), new Lang("^Icelandic$", "IS"), new Lang("^Irish$", "GA"), - new Lang("^Japanese$", "JA"), new Lang("^Korean$", "KO"), new Lang("^Kurdish$", "KU"), new Lang("^Lithuanian$", "LT"), @@ -117,16 +120,18 @@ public class DictionaryBuilderMain extends TestCase { String.format("--dictOut=%s", dictFile), String.format("--lang1=%s", lang1.isoCode), String.format("--lang2=%s", lang2.isoCode), + String.format("--lang1Stoplist=%s", STOPLISTS + lang1.stoplistFile), + String.format("--lang2Stoplist=%s", STOPLISTS + lang2.stoplistFile), String.format("--dictInfo=(EN)Wikitionary-based %s-%s dictionary", lang1.isoCode, lang2.isoCode), - "--input2=" + INPUTS + "wikiSplit/" + nonEnglish.wikiSplit, + "--input2=" + INPUTS + "enWikiSplit/" + nonEnglish.wikiSplit, "--input2Name=enwiktionary." + nonEnglish.wikiSplit, "--input2Format=enwiktionary", "--input2LangPattern=" + nonEnglish.nameRegex, "--input2LangCodePattern=" + nonEnglish.isoCode.toLowerCase(), "--input2EnIndex=" + enIndex, - "--input3=" + INPUTS + "wikiSplit/english.data", + "--input3=" + INPUTS + "enWikiSplit/english.data", "--input3Name=enwiktionary.english", "--input3Format=enwiktionary", "--input3LangPattern=" + nonEnglish.nameRegex, @@ -173,7 +178,7 @@ public class DictionaryBuilderMain extends TestCase { "--input2Charset=UTF8", "--input2Format=chemnitz", - "--input3=" + INPUTS + "/copyrighted/de-en_dictcc.txt", + "--input3=" + INPUTS + "/NONFREE/de-en_dictcc.txt", "--input3Name=dictcc", "--input3Charset=UTF8", "--input3Format=dictcc", diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java index c6dcccb..71cb4c2 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java @@ -27,7 +27,8 @@ import junit.framework.TestCase; public class DictionaryBuilderTest extends TestCase { public static final String TEST_INPUTS = "../DictionaryData/testdata/inputs/"; - public static final String WIKISPLIT = "../DictionaryData/inputs/wikiSplit/"; + public static final String WIKISPLIT = "../DictionaryData/inputs/enWikiSplit/"; + public static final String STOPLISTS = "../DictionaryData/inputs/stoplists/"; public static final String GOLDENS = "../DictionaryData/testdata/goldens/"; public static final String TEST_OUTPUTS = "../DictionaryData/testdata/outputs/"; @@ -40,6 +41,8 @@ public class DictionaryBuilderTest extends TestCase { "--dictOut=" + result.getAbsolutePath(), "--lang1=IT", "--lang2=EN", + "--lang1Stoplist=" + STOPLISTS + "it.txt", + "--lang2Stoplist=" + STOPLISTS + "en.txt", "--dictInfo=SomeWikiData", "--input4=" + WIKISPLIT + "italian.data", @@ -64,6 +67,8 @@ public class DictionaryBuilderTest extends TestCase { "--dictOut=" + result.getAbsolutePath(), "--lang1=IT", "--lang2=EN", + "--lang1Stoplist=" + STOPLISTS + "it.txt", + "--lang2Stoplist=" + STOPLISTS + "en.txt", "--dictInfo=SomeWikiData", "--input3=" + WIKISPLIT + "english.data", diff --git a/src/com/hughes/android/dictionary/engine/DictionaryTest.java b/src/com/hughes/android/dictionary/engine/DictionaryTest.java index 0451c7e..a44b520 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryTest.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryTest.java @@ -45,7 +45,7 @@ public class DictionaryTest extends TestCase { final Index enIndex = dict.indices.get(0); final RowBase row = enIndex.rows.get(4); - assertEquals("carbonyl chloride (the compound COCl2) (noun)\tossicloruro di carbonio", row.getRawText(false)); + assertEquals("eagle (A gold coin with a face value of $10.00) (noun)\tmoneta di dieci dollari", row.getRawText(false)); raf.close(); } diff --git a/src/com/hughes/android/dictionary/engine/IndexBuilder.java b/src/com/hughes/android/dictionary/engine/IndexBuilder.java index 87ea308..98c24e5 100644 --- a/src/com/hughes/android/dictionary/engine/IndexBuilder.java +++ b/src/com/hughes/android/dictionary/engine/IndexBuilder.java @@ -32,13 +32,15 @@ public class IndexBuilder { final DictionaryBuilder dictionaryBuilder; public final Index index; + final Set stoplist; final SortedMap tokenToData; - IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language, final String normalizerRules, final boolean swapPairEntries) { + IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language, final String normalizerRules, final Set stoplist, final boolean swapPairEntries) { this.dictionaryBuilder = dictionaryBuilder; index = new Index(dictionaryBuilder.dictionary, shortName, longName, language, normalizerRules, swapPairEntries); tokenToData = new TreeMap(new NormalizeComparator(index.normalizer(), language.getCollator())); + this.stoplist = stoplist; } public void build() { @@ -96,7 +98,7 @@ public class IndexBuilder { } } - public TokenData getOrCreateTokenData(final String token) { + private TokenData getOrCreateTokenData(final String token) { TokenData tokenData = tokenToData.get(token); if (tokenData == null) { tokenData = new TokenData(token); @@ -105,7 +107,7 @@ public class IndexBuilder { return tokenData; } - public List getOrCreateEntries(final String token, final EntryTypeName entryTypeName) { + private List getOrCreateEntries(final String token, final EntryTypeName entryTypeName) { final TokenData tokenData = getOrCreateTokenData(token); List entries = tokenData.typeToEntries.get(entryTypeName); if (entries == null) { @@ -118,6 +120,7 @@ public class IndexBuilder { public void addEntryWithTokens(final IndexedEntry indexedEntry, final Set tokens, final EntryTypeName entryTypeName) { for (final String token : tokens) { + if (entryTypeName.overridesStopList || !stoplist.contains(token)) getOrCreateEntries(token, entryTypeName).add(indexedEntry); } } diff --git a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java index 83cb043..39addd5 100644 --- a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java +++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java @@ -79,27 +79,27 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { if (selectors.isEmpty()) { selectors.addAll(Arrays.asList( - new Selector("wikiSplit/arabic.data", ".*[Ar]rabic.*"), - new Selector("wikiSplit/croation.data", ".*[Cc]roation.*"), - new Selector("wikiSplit/czech.data", ".*[Cc]zech.*"), - new Selector("wikiSplit/mandarin.data", ".*[Mm]andarin|[Cc]hinese.*"), - new Selector("wikiSplit/dutch.data", ".*[Du]utch.*"), - new Selector("wikiSplit/english.data", ".*[Ee]nglish.*"), - new Selector("wikiSplit/french.data", ".*[Ff]rench.*"), - new Selector("wikiSplit/german.data", ".*[Gg]erman.*"), - new Selector("wikiSplit/greek.data", ".*[Gg]reek.*"), - new Selector("wikiSplit/hindi.data", ".*[Hh]indi.*"), - new Selector("wikiSplit/italian.data", ".*[Ii]talian.*"), - new Selector("wikiSplit/japanese.data", ".*[Jj]apanese.*"), - new Selector("wikiSplit/korean.data", ".*[Kk]orean.*"), - new Selector("wikiSplit/persian.data", ".*[Pp]ersian.*"), - new Selector("wikiSplit/portuguese.data", ".*[Pp]ortuguese.*"), - new Selector("wikiSplit/romanian.data", ".*[Rr]omanian.*"), - new Selector("wikiSplit/russian.data", ".*[Rr]ussian.*"), - new Selector("wikiSplit/spanish.data", ".*[Ss]panish.*"), - new Selector("wikiSplit/swedish.data", ".*[Ss]wedish.*"), - new Selector("wikiSplit/thai.data", ".*[Tt]hai.*"), - new Selector("wikiSplit/vietnamese.data", ".*[Vv]ietnamese.*") + new Selector("../DictionaryData/inputs/enWikiSplit/arabic.data", ".*[Ar]rabic.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/croation.data", ".*[Cc]roation.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/czech.data", ".*[Cc]zech.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/mandarin.data", ".*[Mm]andarin|[Cc]hinese.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/dutch.data", ".*[Du]utch.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/english.data", ".*[Ee]nglish.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/french.data", ".*[Ff]rench.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/german.data", ".*[Gg]erman.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/greek.data", ".*[Gg]reek.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/hindi.data", ".*[Hh]indi.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/italian.data", ".*[Ii]talian.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/japanese.data", ".*[Jj]apanese.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/korean.data", ".*[Kk]orean.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/persian.data", ".*[Pp]ersian.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/portuguese.data", ".*[Pp]ortuguese.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/romanian.data", ".*[Rr]omanian.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/russian.data", ".*[Rr]ussian.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/spanish.data", ".*[Ss]panish.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/swedish.data", ".*[Ss]wedish.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/thai.data", ".*[Tt]hai.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/vietnamese.data", ".*[Vv]ietnamese.*") )); } diff --git a/src/com/hughes/android/dictionary/parser/DictFileParser.java b/src/com/hughes/android/dictionary/parser/DictFileParser.java index cea4f90..67ca432 100644 --- a/src/com/hughes/android/dictionary/parser/DictFileParser.java +++ b/src/com/hughes/android/dictionary/parser/DictFileParser.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.Charset; import java.util.Arrays; +import java.util.Collections; import java.util.LinkedHashSet; import java.util.List; import java.util.Set; @@ -211,8 +212,7 @@ public class DictFileParser { for (String token : tokens) { token = TRIM_PUNC.matcher(token).replaceAll(""); if (/*!alreadyDone.contains(token) && */token.length() > 0) { - final List entries = indexBuilder.getOrCreateEntries(token, entryTypeName); - entries.add(entryData); + indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), entryTypeName); // alreadyDone.add(token); // also split words on dashes, do them, too. @@ -220,8 +220,7 @@ public class DictFileParser { final String[] dashed = token.split("-"); for (final String dashedToken : dashed) { if (/*!alreadyDone.contains(dashedToken) && */dashedToken.length() > 0) { - final List dashEntries = indexBuilder.getOrCreateEntries(dashedToken, EntryTypeName.PART_OF_HYPHENATED); - dashEntries.add(entryData); + indexBuilder.addEntryWithTokens(entryData, Collections.singleton(dashedToken), EntryTypeName.PART_OF_HYPHENATED); } } } @@ -234,8 +233,7 @@ public class DictFileParser { for (final String token : bracketedTokens) { assert !token.contains("-"); if (/*!alreadyDone.contains(token) && */token.length() > 0) { - final List entries = indexBuilder.getOrCreateEntries(token, EntryTypeName.BRACKETED); - entries.add(entryData); + indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), EntryTypeName.BRACKETED); } } @@ -244,8 +242,7 @@ public class DictFileParser { for (final String token : parenTokens) { assert !token.contains("-"); if (/*!alreadyDone.contains(token) && */token.length() > 0) { - final List entries = indexBuilder.getOrCreateEntries(token, EntryTypeName.PARENTHESIZED); - entries.add(entryData); + indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), EntryTypeName.PARENTHESIZED); } }