fatalError("Must specify human readable name for: " + prefix + "Name");
}
+ final EntrySource entrySource = new EntrySource(dictionaryBuilder.dictionary.sources.size(), inputName, dictionaryBuilder.dictionary.pairEntries.size());
+ System.out.println("");
+
String inputFormat = keyValueArgs.remove(prefix + "Format");
if ("dictcc".equals(inputFormat)) {
new DictFileParser(charset, false, DictFileParser.TAB, null, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parseFile(file);
fatalError("Invalid or missing input format: " + inputFormat);
}
- final EntrySource entrySource = new EntrySource(dictionaryBuilder.dictionary.sources.size(), inputName);
dictionaryBuilder.dictionary.sources.add(entrySource);
System.out.println("Done: " + file + "\n\n");
}
public static void main(final String[] args) throws Exception {
-
+
+ DictionaryBuilder.main(new String[] {
+ "--dictOut=dictOutputs/DE-EN_chemnitz.quickdic",
+ "--lang1=DE",
+ "--lang2=EN",
+ "--dictInfo=@dictInputs/de-en_chemnitz.info",
+
+ "--input1=dictInputs/de-en_chemnitz.txt",
+ "--input1Name=chemnitz",
+ "--input1Charset=UTF8",
+ "--input1Format=chemnitz",
+ });
+
Lang[] langs1 = new Lang[] {
new Lang("^English$", "EN"),
new Lang("^German$", "DE"),
} // langs2
} // langs1
-
+
DictionaryBuilder.main(new String[] {
"--dictOut=dictOutputs/de-en_all.quickdic",
"--lang1=DE",
});
- DictionaryBuilder.main(new String[] {
- "--dictOut=dictOutputs/de-en_chemnitz.quickdic",
- "--lang1=DE",
- "--lang2=EN",
- "--dictInfo=@dictInputs/de-en_chemnitz.info",
-
- "--input1=dictInputs/de-en_chemnitz.txt",
- "--input1Name=dictcc",
- "--input1Charset=UTF8",
- "--input1Format=chemnitz",
- });
-
}
}
"--dictInfo=@testdata/de-en_dictInfo.txt",
"--input1=testdata/de-en_chemnitz_100",
- "--input1Name=dictcc",
+ "--input1Name=chemnitz",
"--input1Charset=UTF8",
"--input1Format=chemnitz",
import java.io.IOException;
import java.io.RandomAccessFile;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;
import junit.framework.TestCase;
import com.hughes.android.dictionary.engine.Index.IndexEntry;
-import com.ibm.icu.text.Transliterator;
+import com.hughes.android.dictionary.engine.PairEntry.Row;
public class DictionaryTest extends TestCase {
+
+ /**
+  * Polls {@code TransliteratorManager.init(null)} every 10 ms until it returns
+  * {@code true}, so that no test body runs before that call has succeeded.
+  *
+  * <p>An interrupt received while sleeping does not abort the wait; it is
+  * remembered and the thread's interrupt status is restored before returning,
+  * so the caller can still observe it.
+  */
+ @Override
+ protected void setUp() {
+   boolean interrupted = false;
+   try {
+     // NOTE(review): init(...) presumably returns true once initialization has
+     // finished -- confirm against TransliteratorManager.
+     while (!TransliteratorManager.init(null)) {
+       try {
+         Thread.sleep(10);
+       } catch (InterruptedException e) {
+         // sleep() cleared the interrupt flag when it threw; keep waiting and
+         // re-assert the flag once the loop is done instead of swallowing it.
+         interrupted = true;
+       }
+     }
+   } finally {
+     if (interrupted) {
+       Thread.currentThread().interrupt();
+     }
+   }
+ }
+
+ /**
+  * Loads {@code dictOutputs/EN-IT_enwiktionary.quickdic} and checks that row 2
+  * of the dictionary's first index has the raw text {@code "z"}.
+  *
+  * @throws IOException if the dictionary file cannot be opened or closed, or
+  *     if an I/O error is propagated while it is being used
+  */
+ public void testEnItWiktionary() throws IOException {
+   final RandomAccessFile raf = new RandomAccessFile("dictOutputs/EN-IT_enwiktionary.quickdic", "r");
+   try {
+     final Dictionary dict = new Dictionary(raf);
+     final Index enIndex = dict.indices.get(0);
+     final PairEntry.Row row = (Row) enIndex.rows.get(2);
+     assertEquals("z", row.getRawText(false));
+   } finally {
+     // Release the file handle even when loading or the assertion fails.
+     raf.close();
+   }
+ }
+
public void testGermanMetadata() throws IOException {
final RandomAccessFile raf = new RandomAccessFile("testdata/de-en.quickdic", "r");
final Dictionary dict = new Dictionary(raf);
assertEquals("de", deIndex.shortName);
assertEquals("de->en", deIndex.longName);
+ assertEquals(2, dict.sources.size());
+ assertEquals("chemnitz", dict.sources.get(0).name);
+ assertEquals(0, dict.sources.get(0).pairEntryStart);
+ assertEquals("dictcc", dict.sources.get(1).name);
+ assertEquals(113, dict.sources.get(1).pairEntryStart);
+
raf.close();
}
System.out.println("testing: " + indexEntry.token);
final IndexEntry searchResult = deIndex.findInsertionPoint(indexEntry.token, new AtomicBoolean(
false));
- assertEquals(indexEntry.token.toLowerCase(), searchResult.token.toLowerCase());
+ assertEquals("Looked up: " + indexEntry.token, indexEntry.token.toLowerCase(), searchResult.token.toLowerCase());
}
// TODO: maybe if user types capitalization, use it.
raf.close();
}
- public void testGermanSort() {
- final Transliterator normalizer = Transliterator.createFromRules("", Language.de.getDefaultNormalizerRules(), Transliterator.FORWARD);
- assertEquals("aüääss", normalizer.transform("aueAeAEß"));
- final List<String> words = Arrays.asList(
- "er-ben",
- "erben",
- "Erben",
- "Erbse",
- "Erbsen",
- "essen",
- "Essen",
- "Grosformat",
- "Grosformats",
- "Grossformat",
- "Großformat",
- "Grossformats",
- "Großformats",
- "Großpoo",
- "Großpoos",
- "Hörvermögen",
- "Hörweite",
- "hos",
- "Höschen",
- "Hostel",
- "hulle",
- "Hulle",
- "huelle",
- "Huelle",
- "hülle",
- "Hülle",
- "Huellen",
- "Hüllen",
- "Hum"
- );
- final NormalizeComparator comparator = new NormalizeComparator(normalizer, Language.de.getCollator());
- assertEquals(1, comparator.compare("hülle", "huelle"));
- assertEquals(-1, comparator.compare("huelle", "hülle"));
-
- assertEquals(-1, comparator.compare("hülle", "Hülle"));
-
- assertEquals("hülle", normalizer.transform("Hülle"));
- assertEquals("hulle", normalizer.transform("Hulle"));
-
-
- final List<String> sorted = new ArrayList<String>(words);
-// Collections.shuffle(shuffled, new Random(0));
- Collections.sort(sorted, comparator);
- System.out.println(sorted.toString());
- for (int i = 0; i < words.size(); ++i) {
- System.out.println(words.get(i) + "\t" + sorted.get(i));
- assertEquals(words.get(i), sorted.get(i));
- }
- }
-
- public void testEnglishSort() {
- final Transliterator normalizer = Transliterator.createFromRules("", Language.en.getDefaultNormalizerRules(), Transliterator.FORWARD);
-
- final List<String> words = Arrays.asList(
- "pre-print",
- "preppie",
- "preppy",
- "preprocess");
-
- final List<String> sorted = new ArrayList<String>(words);
- final NormalizeComparator comparator = new NormalizeComparator(normalizer, Language.en.getCollator());
- Collections.sort(sorted, comparator);
- for (int i = 0; i < words.size(); ++i) {
- if (i > 0) {
- assertTrue(comparator.compare(words.get(i-1), words.get(i)) < 0);
- }
- System.out.println(words.get(i) + "\t" + sorted.get(i));
- assertEquals(words.get(i), sorted.get(i));
- }
-
- assertTrue(comparator.compare("pre-print", "preppy") < 0);
-
- }
-
- public void testLanguage() {
- assertEquals(Language.de, Language.lookup("de"));
- assertEquals(Language.en, Language.lookup("en"));
- assertEquals("es", Language.lookup("es").getSymbol());
- }
-
- public void testTextNorm() {
- //final Transliterator transliterator = Transliterator.getInstance("Any-Latin; Upper; Lower; 'oe' > 'o'; NFD; [:Nonspacing Mark:] Remove; NFC", Transliterator.FORWARD);
- final Transliterator transliterator = Transliterator.createFromRules("", ":: Any-Latin; :: Upper; :: Lower; 'oe' > 'o'; :: NFD; :: [:Nonspacing Mark:] Remove; :: NFC ;", Transliterator.FORWARD);
- assertEquals("hoschen", transliterator.transliterate("Höschen"));
- assertEquals("hoschen", transliterator.transliterate("Hoeschen"));
- assertEquals("grosspoo", transliterator.transliterate("Großpoo"));
-
- assertEquals("kyanpasu", transliterator.transliterate("キャンパス"));
- assertEquals("alphabetikos katalogos", transliterator.transliterate("Αλφαβητικός Κατάλογος"));
- assertEquals("biologiceskom", transliterator.transliterate("биологическом"));
- }
-
public void testChemnitz() throws IOException {
final RandomAccessFile raf = new RandomAccessFile("dictOutputs/de-en_chemnitz.quickdic", "r");
final Dictionary dict = new Dictionary(raf);
raf.close();
}
+
}
IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language, final String normalizerRules, final boolean swapPairEntries) {
this.dictionaryBuilder = dictionaryBuilder;
index = new Index(dictionaryBuilder.dictionary, shortName, longName, language, normalizerRules, swapPairEntries);
- tokenToData = new TreeMap<String, TokenData>(new NormalizeComparator(index.normalizer, language.collator));
+ tokenToData = new TreeMap<String, TokenData>(new NormalizeComparator(index.normalizer(), language.getCollator()));
}
public void build() {
}
}
}
- index.sortedIndexEntries.add(new Index.IndexEntry(tokenData.token, startRow, numRows));
+ index.sortedIndexEntries.add(new Index.IndexEntry(tokenData.token, index
+ .normalizer().transliterate(tokenData.token), startRow, numRows));
}
final List<IndexEntry> sortedEntries = new ArrayList<IndexEntry>(index.sortedIndexEntries);
fields[0] = fields[1];
fields[1] = temp;
}
-
+
final String[][] subfields = new String[2][];
if (subfieldSplit != null) {
subfields[0] = subfieldSplit.split(fields[0]);
subfields[1] = new String[] { fields[1] };
}
- final Pair[] pairs = new Pair[subfields[0].length];
- for (int i = 0; i < pairs.length; ++i) {
+ final PairEntry pairEntry = new PairEntry();
+ for (int i = 0; i < subfields[0].length; ++i) {
subfields[0][i] = subfields[0][i].trim();
subfields[1][i] = subfields[1][i].trim();
- pairs[i] = new Pair(subfields[0][i], subfields[1][i]);
+ pairEntry.pairs.add(new Pair(subfields[0][i], subfields[1][i]));
}
- final PairEntry pairEntry = new PairEntry(pairs);
final EntryData entryData = new EntryData(dictBuilder.dictionary.pairEntries.size(), pairEntry);
dictBuilder.dictionary.pairEntries.add(pairEntry);
namedArgs.remove("lang");
namedArgs.remove("nocat");
+ namedArgs.remove("nocap");
namedArgs.remove("sc");
// Pronunciation
return;
}
- if (name.equals("IPA") || name.equals("SAMPA") || name.equals("X-SAMPA")|| name.equals("enPR")) {
+ if (name.equals("IPA") || name.equals("SAMPA") || name.equals("X-SAMPA") || name.equals("enPR")) {
namedArgs.remove("lang");
for (int i = 0; i < 100 && !namedArgs.isEmpty(); ++i) {
final String pron = namedArgs.remove("" + i);
assert positionalArgs.size() >= 1 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs + title;
if (currentPartOfSpeech == null) {
- assert !currentWord.partsOfSpeech.isEmpty() : title;
+ assert currentWord != null && !currentWord.partsOfSpeech.isEmpty() : title;
System.err.println("Assuming last part of speech for non-nested translation section: " + title);
currentPartOfSpeech = ListUtil.getLast(currentWord.partsOfSpeech);
}
for (final WikiWord.PartOfSpeech partOfSpeech : partsOfSpeech) {
partOfSpeechToQuickDic(dictBuilder, enIndexBuilder, partOfSpeech);
} // PartOfSpeech
+
+ // Pronunciation.
+ if (index != -1) {
+ final PairEntry pronEntry = new PairEntry();
+ for (final Map.Entry<String, StringBuilder> accentToPron : accentToPronunciation.entrySet()) {
+ String accent = accentToPron.getKey();
+ if (accent.length() > 0) {
+ accent = accent + ": ";
+ }
+ pronEntry.pairs.add(new Pair(accent + accentToPron.getValue(), "", index != 0));
+ }
+ if (pronEntry.pairs.size() > 0) {
+ final EntryData entryData = new EntryData(dictBuilder.dictionary.pairEntries.size(), pronEntry);
+ dictBuilder.dictionary.pairEntries.add(pronEntry);
+ final Set<String> tokens = DictFileParser.tokenize(title, DictFileParser.NON_CHAR);
+ dictBuilder.indexBuilders.get(index).addEntryWithTokens(entryData, tokens, EntryTypeName.WIKTIONARY_PRONUNCIATION);
+ }
+ }
}
}
}
-
if (index != -1) {
final boolean formOfSwap = index != 0;
for (final FormOf formOf : partOfSpeech.formOfs) {
final Pair pair = new Pair(title + ": " + formOf.grammarForm + ": " + formOf.target, "", formOfSwap);
- final PairEntry pairEntry = new PairEntry(new Pair[] {pair});
+ final PairEntry pairEntry = new PairEntry();
+ pairEntry.pairs.add(pair);
final EntryData entryData = new EntryData(dictBuilder.dictionary.pairEntries.size(), pairEntry);
dictBuilder.dictionary.pairEntries.add(pairEntry);
// Meanings.
for (final Meaning meaning : partOfSpeech.meanings) {
- final List<Pair> pairs = new ArrayList<PairEntry.Pair>();
-
+ final PairEntry pairEntry = new PairEntry();
+ final List<Pair> pairs = pairEntry.pairs;
+
final List<Set<String>> exampleTokens = new ArrayList<Set<String>>();
exampleTokens.add(new LinkedHashSet<String>());
exampleTokens.add(new LinkedHashSet<String>());
}
// Create EntryData with the PairEntry.
- final PairEntry pairEntry = new PairEntry(pairs.toArray(new Pair[0]));
final EntryData entryData = new EntryData(dictBuilder.dictionary.pairEntries.size(), pairEntry);
dictBuilder.dictionary.pairEntries.add(pairEntry);
final String englishBase = title + " (" + partOfSpeech.name.toLowerCase() + "%s)";
- final StringBuilder englishPron = new StringBuilder();
- for (final Map.Entry<String, StringBuilder> accentToPron : accentToPronunciation.entrySet()) {
- englishPron.append("\n");
- if (accentToPron.getKey().length() > 0) {
- englishPron.append(accentToPron.getKey()).append(": ");
- }
- englishPron.append(accentToPron.getValue());
- }
-
for (final TranslationSense translationSense : partOfSpeech.translationSenses) {
//System.out.println(" sense: " + translationSense.sense);
if (translationSense.sense == null) {
//System.err.println(" null sense: " + title);
}
String englishSense = String.format(englishBase, translationSense.sense != null ? (": " + translationSense.sense) : "");
- englishSense += englishPron.toString();
final StringBuilder[] sideBuilders = new StringBuilder[2];
final List<Map<EntryTypeName, List<String>>> sideTokens = new ArrayList<Map<EntryTypeName,List<String>>>();
// Construct the Translations-based QuickDic entry for this TranslationSense.
if (sideBuilders[0].length() > 0 && sideBuilders[1].length() > 0) {
final Pair pair = new Pair(sideBuilders[0].toString(), sideBuilders[1].toString());
- final PairEntry pairEntry = new PairEntry(new Pair[] { pair });
+ final PairEntry pairEntry = new PairEntry();
+ pairEntry.pairs.add(pair);
final EntryData entryData = new EntryData(dictBuilder.dictionary.pairEntries.size(), pairEntry);
dictBuilder.dictionary.pairEntries.add(pairEntry);