]> gitweb.fperrin.net Git - DictionaryPC.git/commitdiff
go
authorThad Hughes <thad.hughes@gmail.com>
Fri, 8 Oct 2010 22:14:16 +0000 (15:14 -0700)
committerThad Hughes <thad.hughes@gmail.com>
Tue, 13 Dec 2011 01:27:17 +0000 (17:27 -0800)
.classpath
src/com/hughes/android/dictionary/DictionaryBuilder.java
src/com/hughes/android/dictionary/engine/DictFileParser.java [new file with mode: 0644]
src/com/hughes/android/dictionary/engine/DictionaryBuilder.java [new file with mode: 0644]
src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java [new file with mode: 0644]
src/com/hughes/android/dictionary/engine/EntryData.java [new file with mode: 0644]
src/com/hughes/android/dictionary/engine/IndexBuilder.java [new file with mode: 0644]

index 22e2c09af131149601e42543f2049bf24fa66f94..cc0189b582db139e2a007fb3d579419b35344a94 100755 (executable)
@@ -1,8 +1,8 @@
-<?xml version="1.0" encoding="UTF-8"?>\r
-<classpath>\r
-       <classpathentry kind="src" path="src"/>\r
-       <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>\r
-       <classpathentry combineaccessrules="false" kind="src" path="/Dictionary"/>\r
-       <classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/3"/>\r
-       <classpathentry kind="output" path="bin"/>\r
-</classpath>\r
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+       <classpathentry kind="src" path="src"/>
+       <classpathentry combineaccessrules="false" kind="src" path="/Dictionary"/>
+       <classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/3"/>
+       <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.launching.macosx.MacOSXType/JVM 1.6.0"/>
+       <classpathentry kind="output" path="bin"/>
+</classpath>
index 1e76822a49869fea1cbe9286d7396e75efe229b9..ba60c4c3ff7c1f99bf136333f744e641f01d3d3c 100755 (executable)
@@ -58,7 +58,7 @@ public class DictionaryBuilder {
     System.out.println("lang1=" + lang1);\r
     System.out.println("lang2=" + lang2);\r
     System.out.println("summaryText=" + summaryText);\r
-    System.out.println("dictOut=" + dictOutFilename);    \r
+    System.out.println("dictOut=" + dictOutFilename);\r
 \r
     final Dictionary dict = new Dictionary(summaryText, lang1, lang2);\r
 \r
@@ -167,7 +167,7 @@ public class DictionaryBuilder {
     final Map<String, TokenData> tokenToData = new TreeMap<String, TokenData>(dict.languageDatas[lang].language.sortComparator);\r
 \r
     for (int e = 0; e < dict.entries.size(); ++e) {\r
-      final SimpleEntry entry = dict.entries.get(e);\r
+      final SimpleEntry entry = null; //dict.entries.get(e);\r
       final Set<String> tokens = entry.getIndexableTokens(lang);\r
       for (final String token : tokens) {\r
         TokenData tokenData = tokenToData.get(token);\r
diff --git a/src/com/hughes/android/dictionary/engine/DictFileParser.java b/src/com/hughes/android/dictionary/engine/DictFileParser.java
new file mode 100644 (file)
index 0000000..9b4ac0a
--- /dev/null
@@ -0,0 +1,253 @@
+package com.hughes.android.dictionary.engine;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.hughes.android.dictionary.Language;
+import com.hughes.android.dictionary.engine.PairEntry.Pair;
+
+public class DictFileParser {
+  
+  static final Logger logger = Logger.getLogger(DictFileParser.class.getName());
+
+  // Dictcc
+  static final Pattern TAB = Pattern.compile("\\t");
+
+  // Chemnitz
+  static final Pattern DOUBLE_COLON = Pattern.compile(" :: ");
+  static final Pattern PIPE = Pattern.compile(" \\| ");
+  
+  // One or more whitespace characters.
+  static final Pattern SPACES = Pattern.compile("\\s+");
+  // A word followed by a gender/plural marker such as {m}; group 1 is the word.
+  static final Pattern DE_NOUN = Pattern.compile("([^ ]+) *\\{(m|f|n|pl)\\}");
+  // A leading "to " followed by a word; group 1 is the word.
+  static final Pattern EN_VERB = Pattern.compile("^to ([^ ]+)");
+  
+  // "[...]"; group 1 is the text between the square brackets.
+  static final Pattern BRACKETED = Pattern.compile("\\[([^]]+)\\]");
+  // "(...)"; group 1 is the text between the parentheses.
+  // The closing delimiter must be ")" -- it used to be "]", which never
+  // matched ordinary parenthesized text.
+  static final Pattern PARENTHESIZED = Pattern.compile("\\(([^)]+)\\)");
+  
+  // Runs of characters that are not a letter, digit, hyphen or apostrophe.
+  static final Pattern NON_CHAR_DASH = Pattern.compile("[^-'\\p{L}0-9]+");
+  // Runs of characters that are not a letter or digit.
+  static final Pattern NON_CHAR = Pattern.compile("[^\\p{L}0-9]+");
+
+  // Leading or trailing runs of characters that are not a letter or digit.
+  static final Pattern TRIM_PUNC = Pattern.compile("^[^\\p{L}0-9]+|[^\\p{L}0-9]+$");
+
+  final Charset charset;
+  final boolean flipCols;
+  
+  final Pattern fieldSplit;
+  final Pattern subfieldSplit;
+  
+  final DictionaryBuilder dictBuilder;
+  final IndexBuilder[] langIndexBuilders;
+  final IndexBuilder bothIndexBuilder;
+  
+  // Tokens already indexed for the line and language currently being parsed;
+  // cleared by parseLine before each language is processed.
+  final Set<String> alreadyDone = new HashSet<String>();
+    
+  /**
+   * Creates a parser for one two-column dictionary file format.
+   *
+   * @param charset character set used to decode the input file.
+   * @param flipCols if true, the two fields of every line are swapped after splitting.
+   * @param fieldSplit pattern that splits a line into its two fields.
+   * @param subfieldSplit pattern that splits each field into subfields, or
+   *     null if fields are not subdivided.
+   * @param dictBuilder builder that receives the parsed entries.
+   * @param langIndexBuilders per-language index builders, indexed by field position.
+   * @param bothIndexBuilder stored as given; not used by the parsing code in this class.
+   */
+  public DictFileParser(final Charset charset, boolean flipCols,
+      final Pattern fieldSplit, final Pattern subfieldSplit,
+      final DictionaryBuilder dictBuilder, final IndexBuilder[] langIndexBuilders,
+      final IndexBuilder bothIndexBuilder) {
+    // How the input is decoded and split.
+    this.charset = charset;
+    this.fieldSplit = fieldSplit;
+    this.subfieldSplit = subfieldSplit;
+    this.flipCols = flipCols;
+
+    // Where entries and tokens are collected.
+    this.dictBuilder = dictBuilder;
+    this.bothIndexBuilder = bothIndexBuilder;
+    this.langIndexBuilders = langIndexBuilders;
+  }
+
+  /**
+   * Reads the given file line by line, decoding it with this parser's
+   * charset, and passes every line to parseLine.
+   *
+   * @param file the dictionary source file to read.
+   * @throws IOException if the file cannot be opened or read.
+   */
+  public void parseFile(final File file) throws IOException {
+    final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset));
+    try {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        parseLine(line);
+      }
+    } finally {
+      // Release the file handle even when reading or parsing fails.
+      reader.close();
+    }
+  }
+  
+  /**
+   * Parses one line of a two-column dictionary file and indexes it.
+   *
+   * Lines that are empty or start with "#" are skipped, as are lines that do
+   * not split into exactly two fields or whose two fields split into a
+   * different number of subfields. Otherwise a PairEntry holding one Pair per
+   * subfield is added to the dictionary, and every subfield is tokenized into
+   * the index builder of its language.
+   *
+   * @param line a single line of the input file.
+   */
+  private void parseLine(final String line) {
+    if (line.startsWith("#") || line.length() == 0) {
+      logger.info("Skipping comment line: " + line);
+      return;
+    }
+    final String[] fields = fieldSplit.split(line);
+    if (fields.length != 2) {
+      logger.warning("Malformed line: " + line);
+      return;
+    }
+    
+    // Collapse runs of whitespace so that tokens and subfields are clean.
+    fields[0] = SPACES.matcher(fields[0]).replaceAll(" ").trim();
+    fields[1] = SPACES.matcher(fields[1]).replaceAll(" ").trim();
+    if (flipCols) {
+      final String temp = fields[0];
+      fields[0] = fields[1];
+      fields[1] = temp;
+    }
+    
+    final String[][] subfields = new String[2][];
+    if (subfieldSplit != null) {
+      subfields[0] = subfieldSplit.split(fields[0]);
+      subfields[1] = subfieldSplit.split(fields[1]);
+      if (subfields[0].length != subfields[1].length) {
+        logger.warning("Number of subfields doesn't match: " + line);
+        return;
+      }
+    } else {
+      subfields[0] = new String[] { fields[0] };
+      subfields[1] = new String[] { fields[1] };
+    }
+    
+    final Pair[] pairs = new Pair[subfields[0].length];
+    for (int i = 0; i < pairs.length; ++i) {
+      pairs[i] = new Pair(subfields[0][i], subfields[1][i]);
+    }
+    final PairEntry pairEntry = new PairEntry(pairs);
+    final EntryData entryData = new EntryData(dictBuilder.dictionary.pairEntries.size(), pairEntry);
+    dictBuilder.dictionary.pairEntries.add(pairEntry);
+    dictBuilder.entryDatas.add(entryData);  // TODO: delete me.
+    
+    for (int l = 0; l < 2; ++l) {
+      // Token de-duplication is per line and per language.
+      alreadyDone.clear();
+      
+      final IndexBuilder indexBuilder = langIndexBuilders[l];
+      for (int j = 0; j < subfields[l].length; ++j) {
+        String subfield = subfields[l][j];
+        if (indexBuilder.index.sortLanguage == Language.de) {
+          subfield = parseField_DE(indexBuilder, subfield, entryData, j);
+        } else if (indexBuilder.index.sortLanguage == Language.en) {
+          subfield = parseField_EN(indexBuilder, subfield, entryData, j);
+        }
+        // Pass the number of subfields of this field. subfields.length is the
+        // number of columns (always 2), which made every line look multi-row.
+        parseFieldGeneric(indexBuilder, subfield, entryData, j, subfields[l].length);
+      }
+    }
+  }
+
+  /**
+   * Tokenizes one subfield and registers the entry under every token found.
+   *
+   * Text in square brackets and in parentheses is removed from the field
+   * first. The remaining words are indexed under an EntryTypeName chosen from
+   * the word count and the subfield position; hyphenated words are also
+   * indexed by their parts. The bracketed and parenthesized words are then
+   * indexed as BRACKETED and PARENTHESIZED respectively.
+   *
+   * @param indexBuilder index builder of the language this field is written in.
+   * @param field the subfield text.
+   * @param entryData the entry to register under each token.
+   * @param subfieldIdx position of this subfield within its field.
+   * @param numSubFields number of subfields in the field.
+   */
+  private void parseFieldGeneric(final IndexBuilder indexBuilder, String field,
+      final EntryData entryData, final int subfieldIdx, final int numSubFields) {
+    // remove bracketed and parenthesized stuff.
+    final StringBuilder bracketed = new StringBuilder(); 
+    final StringBuilder parenthesized = new StringBuilder();
+    
+    // find(), not matches(): the bracketed text is normally only a part of
+    // the field, while matches() succeeds only if the whole field is bracketed.
+    // Each iteration removes one occurrence, so the loops terminate.
+    Matcher matcher;
+    while ((matcher = BRACKETED.matcher(field)).find()) {
+      bracketed.append(matcher.group(1)).append(" ");
+      field = matcher.replaceFirst(" ");
+    }
+
+    while ((matcher = PARENTHESIZED.matcher(field)).find()) {
+      parenthesized.append(matcher.group(1)).append(" ");
+      field = matcher.replaceFirst(" ");
+    }
+    
+    field = SPACES.matcher(field).replaceAll(" ").trim();
+
+    // split words on non -A-z0-9, do them.
+    final String[] tokens = NON_CHAR_DASH.split(field);
+
+    final EntryTypeName entryTypeName;
+    if (numSubFields == 1) {
+      assert subfieldIdx == 0;
+      if (tokens.length == 1) {
+        entryTypeName = EntryTypeName.ONE_WORD;
+      } else if (tokens.length == 2) {
+        entryTypeName = EntryTypeName.TWO_WORDS;
+      } else if (tokens.length == 3) {
+        entryTypeName = EntryTypeName.THREE_WORDS;
+      } else if (tokens.length == 4) {
+        entryTypeName = EntryTypeName.FOUR_WORDS;
+      } else {
+        entryTypeName = EntryTypeName.FIVE_OR_MORE_WORDS;
+      }
+    } else {
+      assert numSubFields > 1;
+      if (subfieldIdx == 0) {
+        if (tokens.length == 1) {
+          entryTypeName = EntryTypeName.MULTIROW_HEAD_ONE_WORD;
+        } else {
+          entryTypeName = EntryTypeName.MULTIROW_HEAD_MANY_WORDS;
+        }
+      } else {
+        assert subfieldIdx > 0;
+        if (tokens.length == 1) {
+          entryTypeName = EntryTypeName.MULTIROW_TAIL_ONE_WORD;
+        } else {
+          entryTypeName = EntryTypeName.MULTIROW_TAIL_MANY_WORDS;
+        }
+      }
+    }
+
+    for (String token : tokens) {
+      token = TRIM_PUNC.matcher(token).replaceAll("");
+      if (!alreadyDone.contains(token) && token.length() > 0) {
+        final List<EntryData> entries = indexBuilder.getOrCreateEntries(token, entryTypeName);
+        entries.add(entryData);
+        alreadyDone.add(token);
+        
+        // also split words on dashes, do them, too.
+        if (token.contains("-")) {
+          final String[] dashed = token.split("-");
+          for (final String dashedToken : dashed) {
+            if (!alreadyDone.contains(dashedToken) && dashedToken.length() > 0) {
+              final List<EntryData> dashEntries = indexBuilder.getOrCreateEntries(dashedToken, EntryTypeName.PART_OF_HYPHENATED);
+              dashEntries.add(entryData);
+            }
+          }
+        }
+
+      }  // if (!alreadyDone.contains(token)) {
+    }  // for (final String token : tokens) { 
+    
+    // process bracketed stuff (split on spaces and dashes always)
+    final String[] bracketedTokens = NON_CHAR.split(bracketed.toString());
+    for (final String token : bracketedTokens) {
+      assert !token.contains("-");
+      if (!alreadyDone.contains(token) && token.length() > 0) {
+        final List<EntryData> entries = indexBuilder.getOrCreateEntries(token, EntryTypeName.BRACKETED);
+        entries.add(entryData);
+      }
+    }
+    
+    // process paren stuff: split the parenthesized text, not the bracketed
+    // text, which was mistakenly re-used here before.
+    final String[] parenTokens = NON_CHAR.split(parenthesized.toString());
+    for (final String token : parenTokens) {
+      assert !token.contains("-");
+      if (!alreadyDone.contains(token) && token.length() > 0) {
+        final List<EntryData> entries = indexBuilder.getOrCreateEntries(token, EntryTypeName.PARENTHESIZED);
+        entries.add(entryData);
+      }
+    }
+    
+  }
+
+  /**
+   * Indexes every word in the field that is followed by a gender or plural
+   * marker (see DE_NOUN) as a NOUN, unless it is already in alreadyDone.
+   *
+   * @return the field, unchanged.
+   */
+  private String parseField_DE(final IndexBuilder indexBuilder, String field,
+      final EntryData entryData, final int subfieldIdx) {
+    for (final Matcher nounMatcher = DE_NOUN.matcher(field); nounMatcher.find(); ) {
+      final String noun = nounMatcher.group(1);
+      if (!alreadyDone.add(noun)) {
+        continue;  // already recorded for this line and language
+      }
+      indexBuilder.getOrCreateEntries(noun, EntryTypeName.NOUN).add(entryData);
+    }
+    return field;
+  }
+  
+  /**
+   * Drops a leading "to " from the field, if present.
+   *
+   * @return the field without the leading "to ", or the field unchanged.
+   */
+  private String parseField_EN(final IndexBuilder indexBuilder, String field,
+      final EntryData entryData, final int subfieldIdx) {
+    return field.startsWith("to ") ? field.substring(3) : field;
+  }
+
+
+}
diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java
new file mode 100644 (file)
index 0000000..bff164b
--- /dev/null
@@ -0,0 +1,150 @@
+package com.hughes.android.dictionary.engine;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.io.RandomAccessFile;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import com.hughes.android.dictionary.Language;
+import com.hughes.util.Args;
+import com.hughes.util.FileUtil;
+
+/*
+
+--maxEntries=100
+--dictOut=de-en.dict
+--lang1=DE
+--lang2=EN
+--dictInfo=@dictInfo.txt
+
+--input0=/Users/thadh/personal/quickDic/de-en-chemnitz.txt
+--input0Name=chemnitz
+--input0Charset=UTF8
+--input0Format=chemnitz
+
+--input1=/Users/thadh/personal/quickDic/dewiktionary-20100326-pages-articles.xml
+--input1Name=wiktionary
+--input1Format=wiktionary
+
+--input2=/Users/thadh/personal/quickDic/de-en-dictcc.txt
+--input2Name=dictcc
+--input2Charset=Cp1252
+--input2Format=dictcc
+ */
+
+public class DictionaryBuilder {
+  
+  /** The dictionary being assembled. */
+  final Dictionary dictionary;
+
+  /** One EntryData per entry added by the parsers. */
+  final List<EntryData> entryDatas = new ArrayList<EntryData>();
+
+  /** One index builder per language, in the order lang0, lang1. */
+  final List<IndexBuilder> indexBuilders = new ArrayList<IndexBuilder>();
+
+  /**
+   * Creates a builder with an empty dictionary and one index builder for each
+   * of the two languages, named "A->B" and "B->A" after the language symbols.
+   */
+  public DictionaryBuilder(final String dictInfo, final Language lang0, final Language lang1) {
+    dictionary = new Dictionary(dictInfo);
+    final String symbol0 = lang0.getSymbol();
+    final String symbol1 = lang1.getSymbol();
+    indexBuilders.add(new IndexBuilder(this, symbol0, symbol0 + "->" + symbol1, lang0));
+    indexBuilders.add(new IndexBuilder(this, symbol1, symbol1 + "->" + symbol0, lang1));
+  }
+  
+  /** Builds every index and appends it to the dictionary's list of indices. */
+  void build() {
+    for (int i = 0; i < indexBuilders.size(); ++i) {
+      final IndexBuilder builder = indexBuilders.get(i);
+      builder.build();
+      dictionary.indices.add(builder.index);
+    }
+  }
+  
+  /**
+   * Command-line entry point. Parses "--key=value" arguments, feeds every
+   * "--inputN" file (N from 0 to 99) to the parser selected by its
+   * "--inputNFormat", builds the indices, optionally prints the dictionary to
+   * the "--print" file, and writes the dictionary to the "--dictOut" file.
+   * Exits with status 1 on a missing required argument, an unknown input
+   * format, or leftover arguments.
+   *
+   * @param args command-line arguments of the form --key=value.
+   * @throws IOException if an input or output file cannot be read or written.
+   */
+  public static void main(final String[] args) throws IOException {
+    final Map<String,String> keyValueArgs = Args.keyValueArgs(args);
+    
+    final Language lang1 = Language.lookup(keyValueArgs.remove("lang1"));
+    final Language lang2 = Language.lookup(keyValueArgs.remove("lang2"));
+    if (lang1 == null || lang2 == null) {
+      fatalError("--lang1= and --lang2= must both be specified.");
+    }
+    
+    final String dictOutFilename = keyValueArgs.remove("dictOut");
+    if (dictOutFilename == null) {
+      fatalError("--dictOut= must be specified.");
+    }
+    
+    String dictInfo = keyValueArgs.remove("dictInfo");
+    if (dictInfo == null) {
+      fatalError("--dictInfo= must be specified.");
+    }
+    // A leading "@" means: read the info text from the named file.
+    if (dictInfo.startsWith("@")) {
+      dictInfo = FileUtil.readToString(new File(dictInfo.substring(1)));
+    }
+    
+    final String printFile = keyValueArgs.remove("print");
+    
+    System.out.println("lang1=" + lang1);
+    System.out.println("lang2=" + lang2);
+    System.out.println("dictInfo=" + dictInfo);
+    System.out.println("dictOut=" + dictOutFilename);
+    
+    final DictionaryBuilder dictionaryBuilder = new DictionaryBuilder(dictInfo, lang1, lang2);
+    
+    for (int i = 0; i < 100; ++i) {
+      final String prefix = "input" + i;
+      if (keyValueArgs.containsKey(prefix)) {
+        final File file = new File(keyValueArgs.remove(prefix));
+        System.out.println("Processing: " + file);
+        String charsetName = keyValueArgs.remove(prefix + "Charset");
+        if (charsetName == null) {
+          charsetName = "UTF8";
+        }
+        final Charset charset = Charset.forName(charsetName);
+        String inputName = keyValueArgs.remove(prefix + "Name");
+        if (inputName == null) {
+          fatalError("Must specify human readable name for: " + prefix + "Name");
+        }
+
+        String inputFormat = keyValueArgs.remove(prefix + "Format");
+        if ("dictcc".equals(inputFormat)) {
+          new DictFileParser(charset, false, DictFileParser.TAB, null, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parseFile(file);
+        } else if ("chemnitz".equals(inputFormat)) {
+          new DictFileParser(charset, false, DictFileParser.DOUBLE_COLON, DictFileParser.PIPE, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parseFile(file);
+        } else if ("wiktionary".equals(inputFormat)) {
+          // Say what failed instead of throwing a message-less exception.
+          throw new RuntimeException("Input format not supported yet: " + inputFormat + " (" + prefix + ")");
+//          new WiktionaryXmlParser(dict).parse(file);
+        } else {
+          fatalError("Invalid or missing input format: " + inputFormat);
+        }
+        
+        final EntrySource entrySource = new EntrySource(dictionaryBuilder.dictionary.sources.size(), inputName);
+        dictionaryBuilder.dictionary.sources.add(entrySource);
+        System.out.println("Done: " + file + "\n\n");
+      }
+    }
+   
+    dictionaryBuilder.build();
+    
+    if (printFile != null) {
+      final PrintStream out = new PrintStream(new File(printFile));
+      try {
+        dictionaryBuilder.dictionary.print(out);
+      } finally {
+        // Close the print file even if printing fails.
+        out.close();
+      }
+    }
+    
+    System.out.println("Writing dictionary to: " + dictOutFilename);
+    final RandomAccessFile dictOut = new RandomAccessFile(dictOutFilename, "rw");
+    try {
+      dictOut.setLength(0);
+      dictionaryBuilder.dictionary.write(dictOut);
+    } finally {
+      // Close the output file even if writing fails.
+      dictOut.close();
+    }
+    
+    // Every recognized argument was removed from the map as it was consumed;
+    // anything still present was not understood.
+    if (!keyValueArgs.isEmpty()) {
+      System.err.println("WARNING: couldn't parse arguments: " + keyValueArgs);
+      System.exit(1);
+    }
+  
+  }
+  
+  /** Prints the message to standard error and exits the JVM with status 1. */
+  private static void fatalError(final String string) {
+    final int exitStatus = 1;
+    System.err.println(string);
+    System.exit(exitStatus);
+  }
+  
+}
diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java
new file mode 100644 (file)
index 0000000..a2468f2
--- /dev/null
@@ -0,0 +1,56 @@
+package com.hughes.android.dictionary.engine;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.io.RandomAccessFile;
+
+import com.hughes.util.FileUtil;
+
+import junit.framework.TestCase;
+
+public class DictionaryBuilderTest extends TestCase {
+  
+  /**
+   * Builds a DE/EN dictionary from the chemnitz and dictcc samples under
+   * testdata and compares the printed dictionary with the golden file twice:
+   * once as printed by the builder itself, and once after reading the written
+   * dictionary file back in and printing it again.
+   */
+  public void testGermanCombined() throws IOException {
+    final String goldenPath = "testdata/de_en.golden";
+    final String printedPath = "testdata/de_en.test";
+
+    final File result = File.createTempFile("de_en", ".dict");
+    System.out.println("Writing to: " + result);
+    final String[] builderArgs = {
+        "--dictOut=" + result.getAbsolutePath(),
+        "--lang1=DE",
+        "--lang2=EN",
+        "--dictInfo=@testdata/de_en_dictInfo.txt",
+
+        "--input1=testdata/de-en-chemnitz_100",
+        "--input1Name=dictcc",
+        "--input1Charset=UTF8",
+        "--input1Format=chemnitz",
+
+        "--input2=testdata/de-en-dictcc_100",
+        "--input2Name=dictcc",
+        "--input2Charset=UTF8",
+        "--input2Format=dictcc",
+
+        "--print=" + printedPath,
+    };
+    DictionaryBuilder.main(builderArgs);
+
+    // First check: the output printed by the builder.
+    assertFilesEqual(goldenPath, printedPath);
+
+    // Second check: reload the written dictionary, print it over the same
+    // file, and compare again.
+    final RandomAccessFile written = new RandomAccessFile(result.getAbsolutePath(), "r");
+    final Dictionary reloaded = new Dictionary(written);
+    final PrintStream reprinted = new PrintStream(new File(printedPath));
+    reloaded.print(reprinted);
+    reprinted.close();
+
+    assertFilesEqual(goldenPath, printedPath);
+  }
+  
+
+  /**
+   * Asserts that the two files have the same content when read as strings.
+   *
+   * @param expected path of the file holding the expected content.
+   * @param actual path of the file holding the actual content.
+   */
+  void assertFilesEqual(final String expected, final String actual) throws IOException {
+    assertEquals(
+        FileUtil.readToString(new File(expected)),
+        FileUtil.readToString(new File(actual)));
+  }
+
+}
diff --git a/src/com/hughes/android/dictionary/engine/EntryData.java b/src/com/hughes/android/dictionary/engine/EntryData.java
new file mode 100644 (file)
index 0000000..7f6b9b5
--- /dev/null
@@ -0,0 +1,14 @@
+/**
+ * 
+ */
+package com.hughes.android.dictionary.engine;
+
+import com.hughes.util.IndexedObject;
+
+/** An Entry together with the index handed to IndexedObject. */
+class EntryData extends IndexedObject {
+
+  /** The entry this object wraps. */
+  Entry entry;
+
+  /**
+   * @param index the index passed on to IndexedObject.
+   * @param entry the entry to wrap.
+   */
+  EntryData(final int index, final Entry entry) {
+    super(index);
+    this.entry = entry;
+  }
+}
\ No newline at end of file
diff --git a/src/com/hughes/android/dictionary/engine/IndexBuilder.java b/src/com/hughes/android/dictionary/engine/IndexBuilder.java
new file mode 100644 (file)
index 0000000..44ff0d3
--- /dev/null
@@ -0,0 +1,81 @@
+package com.hughes.android.dictionary.engine;
+
+import java.util.ArrayList;
+import java.util.EnumMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+import com.hughes.android.dictionary.Language;
+
+public class IndexBuilder {
+  
+  /** The builder that owns this index builder. */
+  final DictionaryBuilder dictionaryBuilder;
+
+  /** The index being filled in by build(). */
+  final Index index;
+
+  /** Token data keyed by token, ordered by the language's sort collator. */
+  final SortedMap<String, TokenData> tokenToData;
+
+  /**
+   * Creates an empty index for the given language inside the dictionary of
+   * the given builder.
+   */
+  // NOTE(review): the unchecked suppression presumably covers the comparator
+  // returned by getSortCollator() -- confirm against Language.
+  @SuppressWarnings("unchecked")
+  IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language) {
+    this.dictionaryBuilder = dictionaryBuilder;
+    this.index = new Index(dictionaryBuilder.dictionary, shortName, longName, language);
+    this.tokenToData = new TreeMap<String, TokenData>(language.getSortCollator());
+  }
+  
+  /**
+   * Fills the index from the collected tokens, in the sort order of
+   * tokenToData. For every token it appends a sorted index entry and a
+   * TokenRow, followed by one PairEntry.Row per distinct entry registered
+   * under that token, and prints the number of entries for the token.
+   */
+  public void build() {
+    final List<RowBase> allRows = index.rows;
+    // Entries already emitted for the current token, across all entry types.
+    final Set<EntryData> seenForToken = new HashSet<EntryData>();
+    for (final TokenData data : tokenToData.values()) {
+      seenForToken.clear();
+      // Both positions are taken before anything is appended for this token.
+      final int sortedIdx = index.sortedIndexEntries.size();
+      final int tokenRowIdx = allRows.size();
+      index.sortedIndexEntries.add(new Index.IndexEntry(data.token, tokenRowIdx));
+      allRows.add(new TokenRow(sortedIdx, tokenRowIdx, index));
+      int entryCount = 0;
+      for (final List<EntryData> sameTypeEntries : data.typeToEntries.values()) {
+        for (final EntryData candidate : sameTypeEntries) {
+          if (!seenForToken.add(candidate)) {
+            continue;  // already has a row under this token
+          }
+          allRows.add(new PairEntry.Row(candidate.index(), allRows.size(), index));
+          ++entryCount;
+        }
+      }
+      System.out.println(entryCount + " ENTRIES FOR TOKEN " + data.token);
+    }
+  }
+  
+  /** A token together with the entries registered under it, grouped by entry type. */
+  static class TokenData {
+    /** Entries registered under the token, keyed by entry type. */
+    final Map<EntryTypeName, List<EntryData>> typeToEntries = new EnumMap<EntryTypeName, List<EntryData>>(EntryTypeName.class);
+
+    /** The token text; asserted to be non-empty and already trimmed. */
+    final String token;
+
+    TokenData(final String token) {
+      assert token.length() > 0;
+      assert token.equals(token.trim());
+      this.token = token;
+    }
+  }
+
+  /**
+   * Returns the TokenData stored for the token, creating and storing a new
+   * one if there is none yet.
+   */
+  public TokenData getOrCreateTokenData(final String token) {
+    final TokenData existing = tokenToData.get(token);
+    if (existing != null) {
+      return existing;
+    }
+    final TokenData created = new TokenData(token);
+    tokenToData.put(token, created);
+    return created;
+  }
+
+  /**
+   * Returns the list of entries registered under the token for the given
+   * entry type, creating the token data and an empty list as needed.
+   */
+  public List<EntryData> getOrCreateEntries(final String token, final EntryTypeName entryTypeName) {
+    final Map<EntryTypeName, List<EntryData>> byType = getOrCreateTokenData(token).typeToEntries;
+    final List<EntryData> existing = byType.get(entryTypeName);
+    if (existing != null) {
+      return existing;
+    }
+    final List<EntryData> created = new ArrayList<EntryData>();
+    byType.put(entryTypeName, created);
+    return created;
+  }
+  
+
+}