gitweb.fperrin.net Git - DictionaryPC.git/commitdiff
Changing the way dictionaries are indexed (listed), new type of TokenRow
author Thad Hughes <thad.hughes@gmail.com>
Mon, 16 Jan 2012 00:08:07 +0000 (16:08 -0800)
committer Thad Hughes <thad.hughes@gmail.com>
Mon, 16 Jan 2012 00:08:07 +0000 (16:08 -0800)
(to distinguish major from minor entries).

src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java [new file with mode: 0644]
src/com/hughes/android/dictionary/engine/DictionaryBuilder.java
src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java
src/com/hughes/android/dictionary/engine/IndexBuilder.java
src/com/hughes/android/dictionary/engine/LanguageTest.java
src/com/hughes/android/dictionary/engine/WiktionarySplitter.java
src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryLangs.java [new file with mode: 0644]
todo.txt

diff --git a/src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java b/src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java
new file mode 100644 (file)
index 0000000..97cfeef
--- /dev/null
@@ -0,0 +1,73 @@
+package com.hughes.android.dictionary.engine;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.RandomAccessFile;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import com.hughes.android.dictionary.DictionaryInfo;
+import com.hughes.android.dictionary.engine.Index.IndexEntry;
+
+
+/**
+ * Command-line tool that walks the files in {@link DictionaryBuilderMain#OUTPUTS},
+ * and for each one whose name ends in "quickdic":
+ * <ul>
+ *   <li>writes its pair entries, sorted, to a sibling "&lt;name&gt;.text" file, and</li>
+ *   <li>appends one tab-separated line of statistics to dictionary_info.txt.</li>
+ * </ul>
+ */
+public class CheckDictionariesMain {
+
+  /**
+   * Checks every dictionary file in the output directory.
+   *
+   * @param args unused
+   * @throws IOException if the output directory cannot be listed, or a file
+   *     cannot be opened for reading or writing
+   */
+  public static void main(String[] args) throws IOException {
+    final File dictDir = new File(DictionaryBuilderMain.OUTPUTS);
+
+    // File.listFiles() returns null (not an empty array) when the path is not
+    // a directory or cannot be read. Check this before opening the info file
+    // so that a bad directory does not truncate an existing dictionary_info.txt.
+    final File[] files = dictDir.listFiles();
+    if (files == null) {
+      throw new IOException("Cannot list dictionary directory: " + dictDir.getPath());
+    }
+    Arrays.sort(files);
+
+    final PrintWriter dictionaryInfoOut = new PrintWriter(new File("../Dictionary/res/raw/dictionary_info.txt"));
+    try {
+      // NOTE(review): the '%' before LANG_2 looks like a typo; it is kept
+      // byte-for-byte in case a reader of this file depends on the header -- confirm.
+      dictionaryInfoOut.println("# LANG_1\t%LANG_2\tFILENAME\tFILESIZE\tNUM_MAIN_WORDS_1\tNUM_MAIN_WORDS_2\tNUM_ALL_WORDS_1\tNUM_ALL_WORDS_2");
+
+      for (final File dictFile : files) {
+        if (!dictFile.getName().endsWith("quickdic")) {
+          continue;
+        }
+        System.out.println(dictFile.getPath());
+
+        final DictionaryInfo dictionaryInfo = checkDictionary(dictFile);
+
+        final String infoLine = dictionaryInfo.toTabSeparatedString();
+        dictionaryInfoOut.println(infoLine);
+        // Flush per dictionary so completed lines survive a later failure.
+        dictionaryInfoOut.flush();
+        System.out.println(infoLine + "\n");
+      }
+    } finally {
+      dictionaryInfoOut.close();
+    }
+  }
+
+  /**
+   * Dumps one dictionary's sorted entries to "&lt;dictFile&gt;.text" and gathers its statistics.
+   *
+   * @param dictFile the dictionary file to read
+   * @return the file name, file size, language ISO codes and token counts of the dictionary
+   * @throws IOException if the dictionary cannot be read or the text dump cannot be created
+   */
+  private static DictionaryInfo checkDictionary(final File dictFile) throws IOException {
+    final DictionaryInfo dictionaryInfo = new DictionaryInfo();
+    dictionaryInfo.uncompressedFilename = dictFile.getName();
+    dictionaryInfo.uncompressedSize = dictFile.length();
+
+    final RandomAccessFile raf = new RandomAccessFile(dictFile, "r");
+    try {
+      final Dictionary dict = new Dictionary(raf);
+
+      // Print it.
+      final PrintWriter textOut = new PrintWriter(new File(dictFile + ".text"));
+      try {
+        final List<PairEntry> sorted = new ArrayList<PairEntry>(dict.pairEntries);
+        Collections.sort(sorted);
+        for (final PairEntry pairEntry : sorted) {
+          textOut.println(pairEntry.getRawText(false));
+        }
+      } finally {
+        textOut.close();
+      }
+
+      // Find the stats. The raf stays open until the stats are done, exactly
+      // as before, in case the dictionary reads rows from it on demand.
+      System.out.println("Stats...");
+      // Assumes every dictionary has exactly two indices (one per language) -- TODO confirm.
+      for (int i = 0; i < 2; ++i) {
+        final Index index = dict.indices.get(i);
+        dictionaryInfo.langIsos[i] = index.sortLanguage.getIsoCode();
+        for (final IndexEntry indexEntry : index.sortedIndexEntries) {
+          // The row at startRow is the TokenRow that heads this index entry.
+          final TokenRow tokenRow = (TokenRow) index.rows.get(indexEntry.startRow);
+          dictionaryInfo.allTokenCounts[i]++;
+          if (tokenRow.hasMainEntry) {
+            dictionaryInfo.mainTokenCounts[i]++;
+          }
+        }
+      }
+    } finally {
+      raf.close();
+    }
+    return dictionaryInfo;
+  }
+
+}
index a3cc7c02cd5b7354a7d23b928eda7ae895cd66a2..2db5721c84b6867c2771dc6f68797d1661924fd2 100644 (file)
@@ -43,8 +43,8 @@ public class DictionaryBuilder {
   
   public DictionaryBuilder(final String dictInfo, final Language lang0, final Language lang1, final String normalizerRules1, final String normalizerRules2, final Set<String> lang1Stoplist, final Set<String> lang2Stoplist) {
     dictionary = new Dictionary(dictInfo);
   
   public DictionaryBuilder(final String dictInfo, final Language lang0, final Language lang1, final String normalizerRules1, final String normalizerRules2, final Set<String> lang1Stoplist, final Set<String> lang2Stoplist) {
     dictionary = new Dictionary(dictInfo);
-    indexBuilders.add(new IndexBuilder(this, lang0.getSymbol(), lang0.getSymbol() + "->" + lang1.getSymbol(), lang0, normalizerRules1, lang1Stoplist, false));
-    indexBuilders.add(new IndexBuilder(this, lang1.getSymbol(), lang1.getSymbol() + "->" + lang0.getSymbol(), lang1, normalizerRules2, lang2Stoplist, true));
+    indexBuilders.add(new IndexBuilder(this, lang0.getIsoCode(), lang0.getIsoCode() + "->" + lang1.getIsoCode(), lang0, normalizerRules1, lang1Stoplist, false));
+    indexBuilders.add(new IndexBuilder(this, lang1.getIsoCode(), lang1.getIsoCode() + "->" + lang0.getIsoCode(), lang1, normalizerRules2, lang2Stoplist, true));
   }
   
   void build() {
   }
   
   void build() {
index 175b7a27839e76f24596f1c84b1aede6ec976fe6..72ea6aff9f8733ac85408be5ea768593d4af1202 100644 (file)
 
 package com.hughes.android.dictionary.engine;
 
 
 package com.hughes.android.dictionary.engine;
 
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.PrintWriter;
-import java.io.RandomAccessFile;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
 import java.util.LinkedHashMap;
 import java.util.LinkedHashMap;
-import java.util.List;
 import java.util.Map;
 
 import junit.framework.TestCase;
 
 import java.util.Map;
 
 import junit.framework.TestCase;
 
+import com.hughes.android.dictionary.parser.enwiktionary.EnWiktionaryLangs;
+
 public class DictionaryBuilderMain extends TestCase {
   
   static final String INPUTS = "data/inputs/";
   static final String STOPLISTS = "data/inputs/stoplists/";
 public class DictionaryBuilderMain extends TestCase {
   
   static final String INPUTS = "data/inputs/";
   static final String STOPLISTS = "data/inputs/stoplists/";
-  static final String OUTPUTS = "data/outputs/";
-    
+  static final String OUTPUTS = "data/outputs/";  
+  
+  static final String VERSION_SUFFIX = "v002";
+
+  
   public static void main(final String[] args) throws Exception {
     
   public static void main(final String[] args) throws Exception {
     
-    final Map<String,String> isoToWikiName = new LinkedHashMap<String, String>(Language.isoCodeToWikiName);
+    // Builds all the dictionaries it can, outputs list to a text file.
+    
+    final Map<String,String> isoToWikiName = new LinkedHashMap<String, String>(EnWiktionaryLangs.isoCodeToWikiName);
     isoToWikiName.remove("EN");
     isoToWikiName.remove("DE");
 
     final Map<String,String>  isoToDedication = new LinkedHashMap<String, String>();
     isoToDedication.put("AF", "Afrikaans dictionary dedicated to Heiko and Mariëtte Horn.");
     isoToWikiName.remove("EN");
     isoToWikiName.remove("DE");
 
     final Map<String,String>  isoToDedication = new LinkedHashMap<String, String>();
     isoToDedication.put("AF", "Afrikaans dictionary dedicated to Heiko and Mariëtte Horn.");
-    isoToDedication.put("HR", "Croation dictionary dedicated to Ines Viskic and Miro Kresonja.");
+    isoToDedication.put("HR", "Croatian dictionary dedicated to Ines Viskic and Miro Kresonja.");
     isoToDedication.put("NL", "Dutch dictionary dedicated to Mike LeBeau.");
     // German handled in file.
     isoToDedication.put("EL", "Greek dictionary dedicated to Noah Egge.");
     isoToDedication.put("NL", "Dutch dictionary dedicated to Mike LeBeau.");
     // German handled in file.
     isoToDedication.put("EL", "Greek dictionary dedicated to Noah Egge.");
@@ -77,7 +75,7 @@ public class DictionaryBuilderMain extends TestCase {
         continue;
       }
 
         continue;
       }
 
-        final String dictFile = String.format(OUTPUTS + "/EN-%s_enwiktionary.quickdic", foreignIso);
+        final String dictFile = String.format("%s/EN-%s_enwiktionary.%s.quickdic", OUTPUTS, foreignIso, VERSION_SUFFIX);
         System.out.println("building dictFile: " + dictFile);
         
         if (!isoToStoplist.containsKey(foreignIso)) {
         System.out.println("building dictFile: " + dictFile);
         
         if (!isoToStoplist.containsKey(foreignIso)) {
@@ -114,12 +112,9 @@ public class DictionaryBuilderMain extends TestCase {
 
         });
         
 
         });
         
-        // Print the entries for diffing.
-        printToText(dictFile);
-
     }  // foreignIso
 
     }  // foreignIso
 
-    final String dictFile = OUTPUTS + "DE-EN_chemnitz_enwiktionary.quickdic"; 
+    final String dictFile = String.format("%s/DE-EN_chemnitz_enwiktionary.%s.quickdic", OUTPUTS, VERSION_SUFFIX);
     DictionaryBuilder.main(new String[] {
         "--dictOut=" + dictFile,
         "--lang1=DE",
     DictionaryBuilder.main(new String[] {
         "--dictOut=" + dictFile,
         "--lang1=DE",
@@ -147,21 +142,7 @@ public class DictionaryBuilderMain extends TestCase {
         "--input3LangCodePattern=de",
         "--input3EnIndex=2",
     });
         "--input3LangCodePattern=de",
         "--input3EnIndex=2",
     });
-    printToText(dictFile);
     
   }
     
   }
-  
-  static void printToText(final String dictFile) throws IOException {
-    final RandomAccessFile raf = new RandomAccessFile(new File(dictFile), "r");
-    final Dictionary dict = new Dictionary(raf);
-    final PrintWriter textOut = new PrintWriter(new File(dictFile + ".text"));
-    final List<PairEntry> sorted = new ArrayList<PairEntry>(dict.pairEntries);
-    Collections.sort(sorted);
-    for (final PairEntry pairEntry : sorted) {
-      textOut.println(pairEntry.getRawText(false));
-    }
-    textOut.close();
-    raf.close();
-  }
-  
+    
 }
 }
index 32a087f47390d48e1c9ae23d928c07c4449fe735..6f28d30ba0faee0131d2fcd533706c70e48805c9 100644 (file)
@@ -50,7 +50,8 @@ public class IndexBuilder {
       tokenEntryDatas.clear();
       final int indexIndex = index.sortedIndexEntries.size();
       final int startRow = rows.size();
       tokenEntryDatas.clear();
       final int indexIndex = index.sortedIndexEntries.size();
       final int startRow = rows.size();
-      rows.add(new TokenRow(indexIndex, rows.size(), index));
+      
+      rows.add(new TokenRow(indexIndex, rows.size(), index, tokenData.hasMainEntry));
 //      System.out.println("Added TokenRow: " + rows.get(rows.size() - 1));
       int numRows = 0;
 //      System.out.println("TOKEN: " + tokenData.token);
 //      System.out.println("Added TokenRow: " + rows.get(rows.size() - 1));
       int numRows = 0;
 //      System.out.println("TOKEN: " + tokenData.token);
@@ -74,15 +75,15 @@ public class IndexBuilder {
           .normalizer().transliterate(tokenData.token), startRow, numRows));
     }
     
           .normalizer().transliterate(tokenData.token), startRow, numRows));
     }
     
-    final List<IndexEntry> entriesSortedByRows = new ArrayList<IndexEntry>(index.sortedIndexEntries);
-    Collections.sort(entriesSortedByRows, new Comparator<IndexEntry>() {
+    final List<IndexEntry> entriesSortedByNumRows = new ArrayList<IndexEntry>(index.sortedIndexEntries);
+    Collections.sort(entriesSortedByNumRows, new Comparator<IndexEntry>() {
       @Override
       public int compare(IndexEntry object1, IndexEntry object2) {
         return object2.numRows - object1.numRows;
       }});
     System.out.println("Most common tokens:");
       @Override
       public int compare(IndexEntry object1, IndexEntry object2) {
         return object2.numRows - object1.numRows;
       }});
     System.out.println("Most common tokens:");
-    for (int i = 0; i < 50 && i < entriesSortedByRows.size(); ++i) {
-      System.out.println("  " + entriesSortedByRows.get(i));
+    for (int i = 0; i < 50 && i < entriesSortedByNumRows.size(); ++i) {
+      System.out.println("  " + entriesSortedByNumRows.get(i));
     }
   }
   
     }
   }
   
@@ -90,6 +91,7 @@ public class IndexBuilder {
     final String token;
         
     final Map<EntryTypeName, List<IndexedEntry>> typeToEntries = new EnumMap<EntryTypeName, List<IndexedEntry>>(EntryTypeName.class);
     final String token;
         
     final Map<EntryTypeName, List<IndexedEntry>> typeToEntries = new EnumMap<EntryTypeName, List<IndexedEntry>>(EntryTypeName.class);
+    boolean hasMainEntry = false;
     
     TokenData(final String token) {
       assert token.equals(token.trim());
     
     TokenData(final String token) {
       assert token.equals(token.trim());
@@ -110,6 +112,9 @@ public class IndexBuilder {
   private List<IndexedEntry> getOrCreateEntries(final String token, final EntryTypeName entryTypeName) {
     final TokenData tokenData = getOrCreateTokenData(token);
     List<IndexedEntry> entries = tokenData.typeToEntries.get(entryTypeName);
   private List<IndexedEntry> getOrCreateEntries(final String token, final EntryTypeName entryTypeName) {
     final TokenData tokenData = getOrCreateTokenData(token);
     List<IndexedEntry> entries = tokenData.typeToEntries.get(entryTypeName);
+    if (entryTypeName.overridesStopList) {
+      tokenData.hasMainEntry = true;
+    }
     if (entries == null) {
       entries = new ArrayList<IndexedEntry>();
       tokenData.typeToEntries.put(entryTypeName, entries);
     if (entries == null) {
       entries = new ArrayList<IndexedEntry>();
       tokenData.typeToEntries.put(entryTypeName, entries);
@@ -124,8 +129,9 @@ public class IndexBuilder {
     }
     assert indexedEntry != null;
     for (final String token : tokens) {
     }
     assert indexedEntry != null;
     for (final String token : tokens) {
-      if (entryTypeName.overridesStopList || !stoplist.contains(token))
-      getOrCreateEntries(token, entryTypeName).add(indexedEntry);
+      if (entryTypeName.overridesStopList || !stoplist.contains(token)) {
+        getOrCreateEntries(token, entryTypeName).add(indexedEntry);
+      }
     }    
   }
 
     }    
   }
 
index 2d9b6a0afd052fdaa439738a87724788821b8f07..0b7b0411b88e3dcb03a1281a5feab6a826b6ce6d 100644 (file)
@@ -26,8 +26,6 @@ import com.ibm.icu.text.Transliterator;
 public class LanguageTest extends TestCase {
   
   public void testGermanSort() {
 public class LanguageTest extends TestCase {
   
   public void testGermanSort() {
-    System.out.println(Language.isoCodeToWikiName.values());
-    
     final Transliterator normalizer = Transliterator.createFromRules("", Language.de.getDefaultNormalizerRules(), Transliterator.FORWARD);
     assertEquals("aüääss", normalizer.transform("aueAeAEß"));
     final List<String> words = Arrays.asList(
     final Transliterator normalizer = Transliterator.createFromRules("", Language.de.getDefaultNormalizerRules(), Transliterator.FORWARD);
     assertEquals("aüääss", normalizer.transform("aueAeAEß"));
     final List<String> words = Arrays.asList(
@@ -108,7 +106,7 @@ public class LanguageTest extends TestCase {
   public void testLanguage() {
     assertEquals(Language.de, Language.lookup("de"));
     assertEquals(Language.en, Language.lookup("en"));
   public void testLanguage() {
     assertEquals(Language.de, Language.lookup("de"));
     assertEquals(Language.en, Language.lookup("en"));
-    assertEquals("es", Language.lookup("es").getSymbol());
+    assertEquals("es", Language.lookup("es").getIsoCode());
   }
 
   public void testTextNorm() {
   }
 
   public void testTextNorm() {
@@ -160,8 +158,8 @@ public class LanguageTest extends TestCase {
     // These don't seem quite right....
     assertEquals("haswb", transliterator.transliterate("حاسوب"));
     assertEquals("kmbywtr", transliterator.transliterate("كمبيوتر"));
     // These don't seem quite right....
     assertEquals("haswb", transliterator.transliterate("حاسوب"));
     assertEquals("kmbywtr", transliterator.transliterate("كمبيوتر"));
-  }
-
 
 
+    assertEquals("{\u200eكمبيوتر\u200e}", Language.fixBidiText("{كمبيوتر}"));
+  }
 
 }
 
 }
index 2e732f03e79628944c17d88867bf25c2429a49db..c05cbb02099cd45076adcb28a25cb73d40601c5e 100644 (file)
@@ -32,6 +32,8 @@ import javax.xml.parsers.SAXParserFactory;
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
 
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
 
+import com.hughes.android.dictionary.parser.enwiktionary.EnWiktionaryLangs;
+
 public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
   
   private static final String FILE_TO_SPLIT = "data/inputs/enwiktionary-20111224-pages-articles.xml";
 public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
   
   private static final String FILE_TO_SPLIT = "data/inputs/enwiktionary-20111224-pages-articles.xml";
@@ -80,7 +82,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
     }
 
     if (selectors.isEmpty()) {
     }
 
     if (selectors.isEmpty()) {
-      for (final Map.Entry<String, String> entry : Language.isoCodeToWikiName.entrySet()) {
+      for (final Map.Entry<String, String> entry : EnWiktionaryLangs.isoCodeToWikiName.entrySet()) {
         selectors.add(new Selector(String.format("data/inputs/enWikiSplit/%s.data", entry.getKey()), entry.getValue()));
       }
     }
         selectors.add(new Selector(String.format("data/inputs/enWikiSplit/%s.data", entry.getKey()), entry.getValue()));
       }
     }
diff --git a/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryLangs.java b/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryLangs.java
new file mode 100644 (file)
index 0000000..80f47ed
--- /dev/null
@@ -0,0 +1,74 @@
+package com.hughes.android.dictionary.parser.enwiktionary;
+
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+/**
+ * Table of the languages handled for the English Wiktionary.
+ */
+public class EnWiktionaryLangs {
+  
+  /**
+   * Maps a language code to the language name used on the English Wiktionary.
+   * Iteration order is insertion order. A value may list several alternative
+   * names separated by '|' (see "ZH").
+   */
+  public static final Map<String,String> isoCodeToWikiName = new LinkedHashMap<String,String>();
+  static {
+    isoCodeToWikiName.put("AF", "Afrikaans");
+    isoCodeToWikiName.put("SQ", "Albanian");
+    isoCodeToWikiName.put("AR", "Arabic");
+    isoCodeToWikiName.put("HY", "Armenian");
+    isoCodeToWikiName.put("BE", "Belarusian");
+    isoCodeToWikiName.put("BN", "Bengali");
+    isoCodeToWikiName.put("BS", "Bosnian");
+    isoCodeToWikiName.put("BG", "Bulgarian");
+    isoCodeToWikiName.put("CA", "Catalan");
+    isoCodeToWikiName.put("HR", "Croatian");
+    isoCodeToWikiName.put("CS", "Czech");
+    isoCodeToWikiName.put("ZH", "Chinese|Mandarin|Cantonese");
+    isoCodeToWikiName.put("DA", "Danish");
+    isoCodeToWikiName.put("NL", "Dutch");
+    isoCodeToWikiName.put("EN", "English");
+    isoCodeToWikiName.put("EO", "Esperanto");
+    isoCodeToWikiName.put("ET", "Estonian");
+    isoCodeToWikiName.put("FI", "Finnish");
+    isoCodeToWikiName.put("FR", "French");
+    isoCodeToWikiName.put("DE", "German");
+    isoCodeToWikiName.put("EL", "Greek");
+    // NOTE(review): the only lower-case, three-letter key in the table -- confirm
+    // that lookups use this exact spelling.
+    isoCodeToWikiName.put("haw", "Hawaiian");
+    isoCodeToWikiName.put("HE", "Hebrew");
+    isoCodeToWikiName.put("HI", "Hindi");
+    isoCodeToWikiName.put("HU", "Hungarian");
+    isoCodeToWikiName.put("IS", "Icelandic");
+    isoCodeToWikiName.put("ID", "Indonesian");
+    isoCodeToWikiName.put("GA", "Irish");
+    isoCodeToWikiName.put("IT", "Italian");
+    isoCodeToWikiName.put("LA", "Latin");
+    isoCodeToWikiName.put("LV", "Latvian");
+    isoCodeToWikiName.put("LT", "Lithuanian");
+    isoCodeToWikiName.put("JA", "Japanese");
+    isoCodeToWikiName.put("KO", "Korean");
+    isoCodeToWikiName.put("KU", "Kurdish");
+    isoCodeToWikiName.put("MS", "Malay");
+    isoCodeToWikiName.put("MI", "Maori");
+    isoCodeToWikiName.put("MN", "Mongolian");
+    isoCodeToWikiName.put("NE", "Nepali");
+    isoCodeToWikiName.put("NO", "Norwegian");
+    isoCodeToWikiName.put("FA", "Persian");
+    isoCodeToWikiName.put("PL", "Polish");
+    isoCodeToWikiName.put("PT", "Portuguese");
+    isoCodeToWikiName.put("PA", "Punjabi");
+    isoCodeToWikiName.put("RO", "Romanian");
+    isoCodeToWikiName.put("RU", "Russian");
+    isoCodeToWikiName.put("SA", "Sanskrit");
+    isoCodeToWikiName.put("SR", "Serbian");
+    isoCodeToWikiName.put("SK", "Slovak");
+    isoCodeToWikiName.put("SO", "Somali");
+    isoCodeToWikiName.put("ES", "Spanish");
+    isoCodeToWikiName.put("SW", "Swahili");
+    isoCodeToWikiName.put("SV", "Swedish");
+    isoCodeToWikiName.put("TG", "Tajik");
+    isoCodeToWikiName.put("TH", "Thai");
+    isoCodeToWikiName.put("BO", "Tibetan");
+    isoCodeToWikiName.put("TR", "Turkish");
+    isoCodeToWikiName.put("UK", "Ukrainian");
+    isoCodeToWikiName.put("VI", "Vietnamese");
+    // ISO 639-1 code for Welsh is "cy" (this was previously mis-keyed as "CI").
+    isoCodeToWikiName.put("CY", "Welsh");
+    isoCodeToWikiName.put("YI", "Yiddish");
+    isoCodeToWikiName.put("ZU", "Zulu");
+  }
+
+}
index 2016c550d52c516e8ce2e137985bd070e5bdfc20..5f1667cb9543c4966bc2405e55eebb5798876c3f 100644 (file)
--- a/todo.txt
+++ b/todo.txt
@@ -1,10 +1,12 @@
 For next release:
 For next release:
+arabic UI fix
 "form of" to bottom
 handle examples like "asdf (asdf)"
 random word jump
 multiword find.
 dictionary update.
 ???italian verbs
 "form of" to bottom
 handle examples like "asdf (asdf)"
 random word jump
 multiword find.
 dictionary update.
 ???italian verbs
+dictionary builder generates text file with list of dictionaries built, sizes, timestamps, token counts, etc.
 
 pronunciation
 synonyms
 
 pronunciation
 synonyms