gitweb.fperrin.net Git - DictionaryPC.git/commitdiff
Changing the way dictionaries are indexed (listed), new type of TokenRow
author Thad Hughes <thad.hughes@gmail.com>
Mon, 16 Jan 2012 00:08:07 +0000 (16:08 -0800)
committer Thad Hughes <thad.hughes@gmail.com>
Mon, 16 Jan 2012 00:08:07 +0000 (16:08 -0800)
(to distinguish major from minor entries).

src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java [new file with mode: 0644]
src/com/hughes/android/dictionary/engine/DictionaryBuilder.java
src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java
src/com/hughes/android/dictionary/engine/IndexBuilder.java
src/com/hughes/android/dictionary/engine/LanguageTest.java
src/com/hughes/android/dictionary/engine/WiktionarySplitter.java
src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryLangs.java [new file with mode: 0644]
todo.txt

diff --git a/src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java b/src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java
new file mode 100644 (file)
index 0000000..97cfeef
--- /dev/null
@@ -0,0 +1,73 @@
+package com.hughes.android.dictionary.engine;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.RandomAccessFile;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import com.hughes.android.dictionary.DictionaryInfo;
+import com.hughes.android.dictionary.engine.Index.IndexEntry;
+
+
+/**
+ * Command-line tool that walks the dictionary output directory
+ * ({@link DictionaryBuilderMain#OUTPUTS}) and, for every file whose name ends
+ * with "quickdic":
+ * <ul>
+ *   <li>writes the dictionary's pair entries, sorted, to a sibling ".text" file;</li>
+ *   <li>counts all tokens and main-entry tokens in the first two indices;</li>
+ *   <li>appends one tab-separated summary line to dictionary_info.txt.</li>
+ * </ul>
+ */
+public class CheckDictionariesMain {
+
+  /**
+   * Runs the check over every dictionary file in the output directory.
+   *
+   * @param args ignored
+   * @throws IOException if the output directory cannot be listed, or if any
+   *     dictionary, text dump or the summary file cannot be read or written
+   */
+  public static void main(String[] args) throws IOException {
+    final File dictDir = new File(DictionaryBuilderMain.OUTPUTS);
+
+    // File.listFiles() returns null (rather than an empty array) when the path
+    // is not a directory or cannot be read; fail with a clear message instead
+    // of a NullPointerException inside Arrays.sort(). Doing this before opening
+    // the summary file also avoids truncating it when there is nothing to scan.
+    final File[] files = dictDir.listFiles();
+    if (files == null) {
+      throw new IOException("Cannot list dictionary directory: " + dictDir.getPath());
+    }
+    Arrays.sort(files);
+
+    final PrintWriter dictionaryInfoOut = new PrintWriter(new File("../Dictionary/res/raw/dictionary_info.txt"));
+    try {
+      dictionaryInfoOut.println("# LANG_1\tLANG_2\tFILENAME\tFILESIZE\tNUM_MAIN_WORDS_1\tNUM_MAIN_WORDS_2\tNUM_ALL_WORDS_1\tNUM_ALL_WORDS_2");
+
+      for (final File dictFile : files) {
+        if (!dictFile.getName().endsWith("quickdic")) {
+          continue;
+        }
+        System.out.println(dictFile.getPath());
+
+        final DictionaryInfo dictionaryInfo = checkDictionary(dictFile);
+        final String summary = dictionaryInfo.toTabSeparatedString();
+
+        // Flush after every dictionary so partial results survive a later failure.
+        dictionaryInfoOut.println(summary);
+        dictionaryInfoOut.flush();
+        System.out.println(summary + "\n");
+      }
+    } finally {
+      dictionaryInfoOut.close();
+    }
+  }
+
+  /**
+   * Opens one dictionary file, dumps it to text and gathers its statistics.
+   * The underlying file handle is closed even if reading or dumping fails.
+   */
+  private static DictionaryInfo checkDictionary(final File dictFile) throws IOException {
+    final DictionaryInfo dictionaryInfo = new DictionaryInfo();
+
+    final RandomAccessFile raf = new RandomAccessFile(dictFile, "r");
+    try {
+      final Dictionary dict = new Dictionary(raf);
+
+      dictionaryInfo.uncompressedFilename = dictFile.getName();
+      dictionaryInfo.uncompressedSize = dictFile.length();
+
+      // Print it.
+      printToText(dict, new File(dictFile + ".text"));
+
+      // Find the stats.
+      System.out.println("Stats...");
+      for (int i = 0; i < 2; ++i) {
+        final Index index = dict.indices.get(i);
+        dictionaryInfo.langIsos[i] = index.sortLanguage.getIsoCode();
+        for (final IndexEntry indexEntry : index.sortedIndexEntries) {
+          // The row at an index entry's startRow is that token's TokenRow.
+          final TokenRow tokenRow = (TokenRow) index.rows.get(indexEntry.startRow);
+          dictionaryInfo.allTokenCounts[i]++;
+          if (tokenRow.hasMainEntry) {
+            dictionaryInfo.mainTokenCounts[i]++;
+          }
+        }
+      }
+    } finally {
+      raf.close();
+    }
+    return dictionaryInfo;
+  }
+
+  /**
+   * Writes the raw text of every pair entry, in sorted order and one per line,
+   * to {@code textFile}. The writer is closed even if an entry fails to render.
+   */
+  private static void printToText(final Dictionary dict, final File textFile) throws IOException {
+    final PrintWriter textOut = new PrintWriter(textFile);
+    try {
+      // Sort a copy so the dictionary's own entry order is left untouched.
+      final List<PairEntry> sorted = new ArrayList<PairEntry>(dict.pairEntries);
+      Collections.sort(sorted);
+      for (final PairEntry pairEntry : sorted) {
+        textOut.println(pairEntry.getRawText(false));
+      }
+    } finally {
+      textOut.close();
+    }
+  }
+
+}
index a3cc7c02cd5b7354a7d23b928eda7ae895cd66a2..2db5721c84b6867c2771dc6f68797d1661924fd2 100644 (file)
@@ -43,8 +43,8 @@ public class DictionaryBuilder {
   
   public DictionaryBuilder(final String dictInfo, final Language lang0, final Language lang1, final String normalizerRules1, final String normalizerRules2, final Set<String> lang1Stoplist, final Set<String> lang2Stoplist) {
     dictionary = new Dictionary(dictInfo);
-    indexBuilders.add(new IndexBuilder(this, lang0.getSymbol(), lang0.getSymbol() + "->" + lang1.getSymbol(), lang0, normalizerRules1, lang1Stoplist, false));
-    indexBuilders.add(new IndexBuilder(this, lang1.getSymbol(), lang1.getSymbol() + "->" + lang0.getSymbol(), lang1, normalizerRules2, lang2Stoplist, true));
+    indexBuilders.add(new IndexBuilder(this, lang0.getIsoCode(), lang0.getIsoCode() + "->" + lang1.getIsoCode(), lang0, normalizerRules1, lang1Stoplist, false));
+    indexBuilders.add(new IndexBuilder(this, lang1.getIsoCode(), lang1.getIsoCode() + "->" + lang0.getIsoCode(), lang1, normalizerRules2, lang2Stoplist, true));
   }
   
   void build() {
index 175b7a27839e76f24596f1c84b1aede6ec976fe6..72ea6aff9f8733ac85408be5ea768593d4af1202 100644 (file)
 
 package com.hughes.android.dictionary.engine;
 
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.PrintWriter;
-import java.io.RandomAccessFile;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
 import java.util.LinkedHashMap;
-import java.util.List;
 import java.util.Map;
 
 import junit.framework.TestCase;
 
+import com.hughes.android.dictionary.parser.enwiktionary.EnWiktionaryLangs;
+
 public class DictionaryBuilderMain extends TestCase {
   
   static final String INPUTS = "data/inputs/";
   static final String STOPLISTS = "data/inputs/stoplists/";
-  static final String OUTPUTS = "data/outputs/";
-    
+  static final String OUTPUTS = "data/outputs/";  
+  
+  static final String VERSION_SUFFIX = "v002";
+
+  
   public static void main(final String[] args) throws Exception {
     
-    final Map<String,String> isoToWikiName = new LinkedHashMap<String, String>(Language.isoCodeToWikiName);
+    // Builds all the dictionaries it can, outputs list to a text file.
+    
+    final Map<String,String> isoToWikiName = new LinkedHashMap<String, String>(EnWiktionaryLangs.isoCodeToWikiName);
     isoToWikiName.remove("EN");
     isoToWikiName.remove("DE");
 
     final Map<String,String>  isoToDedication = new LinkedHashMap<String, String>();
     isoToDedication.put("AF", "Afrikaans dictionary dedicated to Heiko and Mariëtte Horn.");
-    isoToDedication.put("HR", "Croation dictionary dedicated to Ines Viskic and Miro Kresonja.");
+    isoToDedication.put("HR", "Croatian dictionary dedicated to Ines Viskic and Miro Kresonja.");
     isoToDedication.put("NL", "Dutch dictionary dedicated to Mike LeBeau.");
     // German handled in file.
     isoToDedication.put("EL", "Greek dictionary dedicated to Noah Egge.");
@@ -77,7 +75,7 @@ public class DictionaryBuilderMain extends TestCase {
         continue;
       }
 
-        final String dictFile = String.format(OUTPUTS + "/EN-%s_enwiktionary.quickdic", foreignIso);
+        final String dictFile = String.format("%s/EN-%s_enwiktionary.%s.quickdic", OUTPUTS, foreignIso, VERSION_SUFFIX);
         System.out.println("building dictFile: " + dictFile);
         
         if (!isoToStoplist.containsKey(foreignIso)) {
@@ -114,12 +112,9 @@ public class DictionaryBuilderMain extends TestCase {
 
         });
         
-        // Print the entries for diffing.
-        printToText(dictFile);
-
     }  // foreignIso
 
-    final String dictFile = OUTPUTS + "DE-EN_chemnitz_enwiktionary.quickdic"; 
+    final String dictFile = String.format("%s/DE-EN_chemnitz_enwiktionary.%s.quickdic", OUTPUTS, VERSION_SUFFIX);
     DictionaryBuilder.main(new String[] {
         "--dictOut=" + dictFile,
         "--lang1=DE",
@@ -147,21 +142,7 @@ public class DictionaryBuilderMain extends TestCase {
         "--input3LangCodePattern=de",
         "--input3EnIndex=2",
     });
-    printToText(dictFile);
     
   }
-  
-  static void printToText(final String dictFile) throws IOException {
-    final RandomAccessFile raf = new RandomAccessFile(new File(dictFile), "r");
-    final Dictionary dict = new Dictionary(raf);
-    final PrintWriter textOut = new PrintWriter(new File(dictFile + ".text"));
-    final List<PairEntry> sorted = new ArrayList<PairEntry>(dict.pairEntries);
-    Collections.sort(sorted);
-    for (final PairEntry pairEntry : sorted) {
-      textOut.println(pairEntry.getRawText(false));
-    }
-    textOut.close();
-    raf.close();
-  }
-  
+    
 }
index 32a087f47390d48e1c9ae23d928c07c4449fe735..6f28d30ba0faee0131d2fcd533706c70e48805c9 100644 (file)
@@ -50,7 +50,8 @@ public class IndexBuilder {
       tokenEntryDatas.clear();
       final int indexIndex = index.sortedIndexEntries.size();
       final int startRow = rows.size();
-      rows.add(new TokenRow(indexIndex, rows.size(), index));
+      
+      rows.add(new TokenRow(indexIndex, rows.size(), index, tokenData.hasMainEntry));
 //      System.out.println("Added TokenRow: " + rows.get(rows.size() - 1));
       int numRows = 0;
 //      System.out.println("TOKEN: " + tokenData.token);
@@ -74,15 +75,15 @@ public class IndexBuilder {
           .normalizer().transliterate(tokenData.token), startRow, numRows));
     }
     
-    final List<IndexEntry> entriesSortedByRows = new ArrayList<IndexEntry>(index.sortedIndexEntries);
-    Collections.sort(entriesSortedByRows, new Comparator<IndexEntry>() {
+    final List<IndexEntry> entriesSortedByNumRows = new ArrayList<IndexEntry>(index.sortedIndexEntries);
+    Collections.sort(entriesSortedByNumRows, new Comparator<IndexEntry>() {
       @Override
       public int compare(IndexEntry object1, IndexEntry object2) {
         return object2.numRows - object1.numRows;
       }});
     System.out.println("Most common tokens:");
-    for (int i = 0; i < 50 && i < entriesSortedByRows.size(); ++i) {
-      System.out.println("  " + entriesSortedByRows.get(i));
+    for (int i = 0; i < 50 && i < entriesSortedByNumRows.size(); ++i) {
+      System.out.println("  " + entriesSortedByNumRows.get(i));
     }
   }
   
@@ -90,6 +91,7 @@ public class IndexBuilder {
     final String token;
         
     final Map<EntryTypeName, List<IndexedEntry>> typeToEntries = new EnumMap<EntryTypeName, List<IndexedEntry>>(EntryTypeName.class);
+    boolean hasMainEntry = false;
     
     TokenData(final String token) {
       assert token.equals(token.trim());
@@ -110,6 +112,9 @@ public class IndexBuilder {
   private List<IndexedEntry> getOrCreateEntries(final String token, final EntryTypeName entryTypeName) {
     final TokenData tokenData = getOrCreateTokenData(token);
     List<IndexedEntry> entries = tokenData.typeToEntries.get(entryTypeName);
+    if (entryTypeName.overridesStopList) {
+      tokenData.hasMainEntry = true;
+    }
     if (entries == null) {
       entries = new ArrayList<IndexedEntry>();
       tokenData.typeToEntries.put(entryTypeName, entries);
@@ -124,8 +129,9 @@ public class IndexBuilder {
     }
     assert indexedEntry != null;
     for (final String token : tokens) {
-      if (entryTypeName.overridesStopList || !stoplist.contains(token))
-      getOrCreateEntries(token, entryTypeName).add(indexedEntry);
+      if (entryTypeName.overridesStopList || !stoplist.contains(token)) {
+        getOrCreateEntries(token, entryTypeName).add(indexedEntry);
+      }
     }    
   }
 
index 2d9b6a0afd052fdaa439738a87724788821b8f07..0b7b0411b88e3dcb03a1281a5feab6a826b6ce6d 100644 (file)
@@ -26,8 +26,6 @@ import com.ibm.icu.text.Transliterator;
 public class LanguageTest extends TestCase {
   
   public void testGermanSort() {
-    System.out.println(Language.isoCodeToWikiName.values());
-    
     final Transliterator normalizer = Transliterator.createFromRules("", Language.de.getDefaultNormalizerRules(), Transliterator.FORWARD);
     assertEquals("aüääss", normalizer.transform("aueAeAEß"));
     final List<String> words = Arrays.asList(
@@ -108,7 +106,7 @@ public class LanguageTest extends TestCase {
   public void testLanguage() {
     assertEquals(Language.de, Language.lookup("de"));
     assertEquals(Language.en, Language.lookup("en"));
-    assertEquals("es", Language.lookup("es").getSymbol());
+    assertEquals("es", Language.lookup("es").getIsoCode());
   }
 
   public void testTextNorm() {
@@ -160,8 +158,8 @@ public class LanguageTest extends TestCase {
     // These don't seem quite right....
     assertEquals("haswb", transliterator.transliterate("حاسوب"));
     assertEquals("kmbywtr", transliterator.transliterate("كمبيوتر"));
-  }
-
 
+    assertEquals("{\u200eكمبيوتر\u200e}", Language.fixBidiText("{كمبيوتر}"));
+  }
 
 }
index 2e732f03e79628944c17d88867bf25c2429a49db..c05cbb02099cd45076adcb28a25cb73d40601c5e 100644 (file)
@@ -32,6 +32,8 @@ import javax.xml.parsers.SAXParserFactory;
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
 
+import com.hughes.android.dictionary.parser.enwiktionary.EnWiktionaryLangs;
+
 public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
   
   private static final String FILE_TO_SPLIT = "data/inputs/enwiktionary-20111224-pages-articles.xml";
@@ -80,7 +82,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
     }
 
     if (selectors.isEmpty()) {
-      for (final Map.Entry<String, String> entry : Language.isoCodeToWikiName.entrySet()) {
+      for (final Map.Entry<String, String> entry : EnWiktionaryLangs.isoCodeToWikiName.entrySet()) {
         selectors.add(new Selector(String.format("data/inputs/enWikiSplit/%s.data", entry.getKey()), entry.getValue()));
       }
     }
diff --git a/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryLangs.java b/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryLangs.java
new file mode 100644 (file)
index 0000000..80f47ed
--- /dev/null
@@ -0,0 +1,74 @@
+package com.hughes.android.dictionary.parser.enwiktionary;
+
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+/**
+ * Languages extracted from the English Wiktionary.
+ */
+public class EnWiktionaryLangs {
+
+  /**
+   * Maps a language code to the language name used by the English Wiktionary.
+   * Insertion order is preserved (alphabetical by English name, with a few
+   * exceptions). A value may list several alternative names separated by
+   * {@code |}, as for Chinese.
+   */
+  public static final Map<String,String> isoCodeToWikiName = new LinkedHashMap<String,String>();
+  static {
+    isoCodeToWikiName.put("AF", "Afrikaans");
+    isoCodeToWikiName.put("SQ", "Albanian");
+    isoCodeToWikiName.put("AR", "Arabic");
+    isoCodeToWikiName.put("HY", "Armenian");
+    isoCodeToWikiName.put("BE", "Belarusian");
+    isoCodeToWikiName.put("BN", "Bengali");
+    isoCodeToWikiName.put("BS", "Bosnian");
+    isoCodeToWikiName.put("BG", "Bulgarian");
+    isoCodeToWikiName.put("CA", "Catalan");
+    isoCodeToWikiName.put("HR", "Croatian");
+    isoCodeToWikiName.put("CS", "Czech");
+    isoCodeToWikiName.put("ZH", "Chinese|Mandarin|Cantonese");
+    isoCodeToWikiName.put("DA", "Danish");
+    isoCodeToWikiName.put("NL", "Dutch");
+    isoCodeToWikiName.put("EN", "English");
+    isoCodeToWikiName.put("EO", "Esperanto");
+    isoCodeToWikiName.put("ET", "Estonian");
+    isoCodeToWikiName.put("FI", "Finnish");
+    isoCodeToWikiName.put("FR", "French");
+    isoCodeToWikiName.put("DE", "German");
+    isoCodeToWikiName.put("EL", "Greek");
+    // Hawaiian has no two-letter ISO 639-1 code; this is its three-letter code.
+    isoCodeToWikiName.put("haw", "Hawaiian");
+    isoCodeToWikiName.put("HE", "Hebrew");
+    isoCodeToWikiName.put("HI", "Hindi");
+    isoCodeToWikiName.put("HU", "Hungarian");
+    isoCodeToWikiName.put("IS", "Icelandic");
+    isoCodeToWikiName.put("ID", "Indonesian");
+    isoCodeToWikiName.put("GA", "Irish");
+    isoCodeToWikiName.put("IT", "Italian");
+    isoCodeToWikiName.put("LA", "Latin");
+    isoCodeToWikiName.put("LV", "Latvian");
+    isoCodeToWikiName.put("LT", "Lithuanian");
+    isoCodeToWikiName.put("JA", "Japanese");
+    isoCodeToWikiName.put("KO", "Korean");
+    isoCodeToWikiName.put("KU", "Kurdish");
+    isoCodeToWikiName.put("MS", "Malay");
+    isoCodeToWikiName.put("MI", "Maori");
+    isoCodeToWikiName.put("MN", "Mongolian");
+    isoCodeToWikiName.put("NE", "Nepali");
+    isoCodeToWikiName.put("NO", "Norwegian");
+    isoCodeToWikiName.put("FA", "Persian");
+    isoCodeToWikiName.put("PL", "Polish");
+    isoCodeToWikiName.put("PT", "Portuguese");
+    isoCodeToWikiName.put("PA", "Punjabi");
+    isoCodeToWikiName.put("RO", "Romanian");
+    isoCodeToWikiName.put("RU", "Russian");
+    isoCodeToWikiName.put("SA", "Sanskrit");
+    isoCodeToWikiName.put("SR", "Serbian");
+    isoCodeToWikiName.put("SK", "Slovak");
+    isoCodeToWikiName.put("SO", "Somali");
+    isoCodeToWikiName.put("ES", "Spanish");
+    isoCodeToWikiName.put("SW", "Swahili");
+    isoCodeToWikiName.put("SV", "Swedish");
+    isoCodeToWikiName.put("TG", "Tajik");
+    isoCodeToWikiName.put("TH", "Thai");
+    isoCodeToWikiName.put("BO", "Tibetan");
+    isoCodeToWikiName.put("TR", "Turkish");
+    isoCodeToWikiName.put("UK", "Ukrainian");
+    isoCodeToWikiName.put("VI", "Vietnamese");
+    // ISO 639-1 for Welsh is "cy" (Cymraeg); "CI" is not a language code.
+    isoCodeToWikiName.put("CY", "Welsh");
+    isoCodeToWikiName.put("YI", "Yiddish");
+    isoCodeToWikiName.put("ZU", "Zulu");
+  }
+
+}
index 2016c550d52c516e8ce2e137985bd070e5bdfc20..5f1667cb9543c4966bc2405e55eebb5798876c3f 100644 (file)
--- a/todo.txt
+++ b/todo.txt
@@ -1,10 +1,12 @@
 For next release:
+arabic UI fix
 "form of" to bottom
 handle examples like "asdf (asdf)"
 random word jump
 multiword find.
 dictionary update.
 ???italian verbs
+dictionary builder generates text file with list of dictionaries built, sizes, timestamps, token counts, etc.
 
 pronunciation
 synonyms