]> gitweb.fperrin.net Git - DictionaryPC.git/commitdiff
Apply astyle code formatting.
authorReimar Döffinger <Reimar.Doeffinger@gmx.de>
Tue, 8 Nov 2016 22:28:19 +0000 (23:28 +0100)
committerReimar Döffinger <Reimar.Doeffinger@gmx.de>
Tue, 8 Nov 2016 22:28:19 +0000 (23:28 +0100)
27 files changed:
src/com/hughes/android/dictionary/DateFormatTest.java
src/com/hughes/android/dictionary/SerializeCollatorTest.java
src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java
src/com/hughes/android/dictionary/engine/DictionaryBuilder.java
src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java
src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java
src/com/hughes/android/dictionary/engine/DictionaryTest.java
src/com/hughes/android/dictionary/engine/IndexBuilder.java
src/com/hughes/android/dictionary/engine/IndexedEntry.java
src/com/hughes/android/dictionary/engine/LanguageTest.java
src/com/hughes/android/dictionary/engine/WiktionarySplitter.java
src/com/hughes/android/dictionary/parser/DictFileParser.java
src/com/hughes/android/dictionary/parser/Parser.java
src/com/hughes/android/dictionary/parser/WikiTokenizer.java
src/com/hughes/android/dictionary/parser/WikiTokenizerTest.java
src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java
src/com/hughes/android/dictionary/parser/wiktionary/DeFunctionCallbacks.java
src/com/hughes/android/dictionary/parser/wiktionary/EnForeignParser.java
src/com/hughes/android/dictionary/parser/wiktionary/EnFunctionCallbacks.java
src/com/hughes/android/dictionary/parser/wiktionary/EnParser.java
src/com/hughes/android/dictionary/parser/wiktionary/EnToTranslationParser.java
src/com/hughes/android/dictionary/parser/wiktionary/EnTranslationToTranslationParser.java
src/com/hughes/android/dictionary/parser/wiktionary/FrFunctionCallbacks.java
src/com/hughes/android/dictionary/parser/wiktionary/FunctionCallback.java
src/com/hughes/android/dictionary/parser/wiktionary/ItFunctionCallbacks.java
src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java
src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java

index fd2c91094e29021d4213bfdd5efc0ca9ea8fbf32..fce209525089d281b22dfd22822d85260e0b8aae 100644 (file)
@@ -19,11 +19,11 @@ import java.util.Date;
 \r
 public class DateFormatTest {\r
 \r
-  /**\r
-   * @param args\r
-   */\r
-  public static void main(String[] args) {\r
-    System.out.println(new SimpleDateFormat("yyyy.MM.dd HH:mm:ss").format(new Date()));\r
-  }\r
+    /**\r
+     * @param args\r
+     */\r
+    public static void main(String[] args) {\r
+        System.out.println(new SimpleDateFormat("yyyy.MM.dd HH:mm:ss").format(new Date()));\r
+    }\r
 \r
 }\r
index bfc531e59f3f94847a338fc77f46f085960c114e..7a1e42e41b74703faf0a563a10eecbc4c5c94dfe 100644 (file)
@@ -23,14 +23,14 @@ import java.text.Collator;
 \r
 public class SerializeCollatorTest {\r
 \r
-  /**\r
-   * @param args\r
-   * @throws IOException \r
-   */\r
-  public static void main(String[] args) throws IOException {\r
-    File temp = File.createTempFile("temp", null);\r
-    final Comparator c = Language.de.getCollator();\r
-    //FileUtil.writeObject(c, temp);\r
-  }\r
+    /**\r
+     * @param args\r
+     * @throws IOException\r
+     */\r
+    public static void main(String[] args) throws IOException {\r
+        File temp = File.createTempFile("temp", null);\r
+        final Comparator c = Language.de.getCollator();\r
+        //FileUtil.writeObject(c, temp);\r
+    }\r
 \r
 }\r
index 6ad8fb298cdba440e80bcec259e51511bb7494f2..8be96fc4dd7de1d5eb89523bb9e79a8cad5d04c2 100644 (file)
@@ -14,45 +14,45 @@ import java.util.Collections;
 import java.util.List;
 
 public class CheckDictionariesMain {
-  
-  static final String BASE_URL = "http://github.com/rdoeffinger/Dictionary/releases/download/v0.2-dictionaries/";
-  static final String VERSION_CODE_OLD = "v006";
-  static final String VERSION_CODE = "v007";
-
-  public static void main(String[] args) throws IOException {
-    final File dictDir = new File(DictionaryBuilderMain.OUTPUTS);
-    
-    final PrintWriter dictionaryInfoOut = new PrintWriter(new File("../Dictionary/res/raw/dictionary_info.txt"));
+
+    static final String BASE_URL = "http://github.com/rdoeffinger/Dictionary/releases/download/v0.2-dictionaries/";
+    static final String VERSION_CODE_OLD = "v006";
+    static final String VERSION_CODE = "v007";
+
+    public static void main(String[] args) throws IOException {
+        final File dictDir = new File(DictionaryBuilderMain.OUTPUTS);
+
+        final PrintWriter dictionaryInfoOut = new PrintWriter(new File("../Dictionary/res/raw/dictionary_info.txt"));
 //    dictionaryInfoOut.println("# LANG_1\t%LANG_2\tFILENAME\tVERSION_CODE\tFILESIZE\tNUM_MAIN_WORDS_1\tNUM_MAIN_WORDS_2\tNUM_ALL_WORDS_1\tNUM_ALL_WORDS_2");
 
-    final File[] files = dictDir.listFiles();
-    final List<String> dictNames = new ArrayList<String>();
-    Arrays.sort(files);
-    for (final File dictFile : files) {
-      if (!dictFile.getName().endsWith("quickdic")) {
-        continue;
-      }
-      System.out.println(dictFile.getPath());
-      
-      
-      final RandomAccessFile raf = new RandomAccessFile(dictFile, "r");
-      final Dictionary dict = new Dictionary(raf);
-
-      final DictionaryInfo dictionaryInfo = dict.getDictionaryInfo();
-
-      String version_code = VERSION_CODE;
-      File zipFile = new File(dictFile.getPath() + "." + version_code + ".zip");
-      if (!zipFile.canRead()) {
-          version_code = VERSION_CODE_OLD;
-          zipFile = new File(dictFile.getPath() + "." + version_code + ".zip");
-      }
-      dictionaryInfo.uncompressedFilename = dictFile.getName();
-      dictionaryInfo.downloadUrl = BASE_URL + dictFile.getName() + "." + version_code + ".zip";
-      // TODO: zip it right here....
-      dictionaryInfo.uncompressedBytes = dictFile.length();
-      dictionaryInfo.zipBytes = zipFile.canRead() ? zipFile.length() : -1;
-
-      // Print it.
+        final File[] files = dictDir.listFiles();
+        final List<String> dictNames = new ArrayList<String>();
+        Arrays.sort(files);
+        for (final File dictFile : files) {
+            if (!dictFile.getName().endsWith("quickdic")) {
+                continue;
+            }
+            System.out.println(dictFile.getPath());
+
+
+            final RandomAccessFile raf = new RandomAccessFile(dictFile, "r");
+            final Dictionary dict = new Dictionary(raf);
+
+            final DictionaryInfo dictionaryInfo = dict.getDictionaryInfo();
+
+            String version_code = VERSION_CODE;
+            File zipFile = new File(dictFile.getPath() + "." + version_code + ".zip");
+            if (!zipFile.canRead()) {
+                version_code = VERSION_CODE_OLD;
+                zipFile = new File(dictFile.getPath() + "." + version_code + ".zip");
+            }
+            dictionaryInfo.uncompressedFilename = dictFile.getName();
+            dictionaryInfo.downloadUrl = BASE_URL + dictFile.getName() + "." + version_code + ".zip";
+            // TODO: zip it right here....
+            dictionaryInfo.uncompressedBytes = dictFile.length();
+            dictionaryInfo.zipBytes = zipFile.canRead() ? zipFile.length() : -1;
+
+            // Print it.
 //      final PrintWriter textOut = new PrintWriter(new BufferedWriter(new FileWriter(dictFile + ".text")));
 //      final List<PairEntry> sorted = new ArrayList<PairEntry>(dict.pairEntries);
 //      Collections.sort(sorted);
@@ -60,31 +60,31 @@ public class CheckDictionariesMain {
 //        textOut.println(pairEntry.getRawText(false));
 //      }
 //      textOut.close();
-      
-      // Find the stats.
-      System.out.println("Stats...");
-      final List<String> indexNames = new ArrayList<String>();
-      for (final IndexInfo indexInfo : dictionaryInfo.indexInfos) {
-          indexNames.add(indexInfo.shortName);
-      }
-      dictNames.add(CollectionUtil.join(indexNames, "-") + "\n");
-      final String row = dictionaryInfo.append(new StringBuilder()).toString();
-      if (!zipFile.canRead()) {
-        System.err.println("Couldn't read zipfile: " + zipFile);
-      }
-      System.out.println(row + "\n");
-      
-      
-      dictionaryInfoOut.println(row);
-      dictionaryInfoOut.flush();
-      
-      raf.close();
+
+            // Find the stats.
+            System.out.println("Stats...");
+            final List<String> indexNames = new ArrayList<String>();
+            for (final IndexInfo indexInfo : dictionaryInfo.indexInfos) {
+                indexNames.add(indexInfo.shortName);
+            }
+            dictNames.add(CollectionUtil.join(indexNames, "-") + "\n");
+            final String row = dictionaryInfo.append(new StringBuilder()).toString();
+            if (!zipFile.canRead()) {
+                System.err.println("Couldn't read zipfile: " + zipFile);
+            }
+            System.out.println(row + "\n");
+
+
+            dictionaryInfoOut.println(row);
+            dictionaryInfoOut.flush();
+
+            raf.close();
+        }
+
+        Collections.sort(dictNames);
+        System.out.println(dictNames.toString().replace(",", "  *"));
+
+        dictionaryInfoOut.close();
     }
-    
-    Collections.sort(dictNames);
-    System.out.println(dictNames.toString().replace(",", "  *"));
-    
-    dictionaryInfoOut.close();
-  }
 
 }
index 624ade9357b965f304cbc067d81def0eaa139137..d105af2d4764e45778b12b574c3eeeebe47c1401 100644 (file)
@@ -40,198 +40,198 @@ import com.hughes.util.Args;
 import com.hughes.util.FileUtil;
 
 public class DictionaryBuilder {
-  
-  public final Dictionary dictionary;
-  public final List<IndexBuilder> indexBuilders = new ArrayList<IndexBuilder>();
-  
-  public DictionaryBuilder(final String dictInfoString, final Language lang0, final Language lang1, final String normalizerRules1, final String normalizerRules2, final Set<String> lang1Stoplist, final Set<String> lang2Stoplist) {
-    dictionary = new Dictionary(dictInfoString);
-    if (lang1 != null) {
-        indexBuilders.add(new IndexBuilder(this, lang0.getIsoCode(), lang0.getIsoCode() + "->" + lang1.getIsoCode(), lang0, normalizerRules1, lang1Stoplist, false));
-        indexBuilders.add(new IndexBuilder(this, lang1.getIsoCode(), lang1.getIsoCode() + "->" + lang0.getIsoCode(), lang1, normalizerRules2, lang2Stoplist, true));
-    } else {
-        indexBuilders.add(new IndexBuilder(this, lang0.getIsoCode(), lang0.getIsoCode(), lang0, normalizerRules1, lang1Stoplist, false));
-    }
-  }
-  
-  void build() {
-    for (final IndexBuilder indexBuilder : indexBuilders) {
-      indexBuilder.build();
-      dictionary.indices.add(indexBuilder.index);
-    }
-  }
-  
-  public static void main(final String[] args) throws IOException, ParserConfigurationException, SAXException {
-    System.out.println("Running with arguments:");
-    for (final String arg : args) {
-      System.out.println(arg);
-    }
-    
-    final Map<String,String> keyValueArgs = Args.keyValueArgs(args);
-    
-    if (!keyValueArgs.containsKey("lang1")) {
-      fatalError("--lang1= must be specified.");
-    }
-    final Language lang1 = Language.lookup(keyValueArgs.remove("lang1"));
-    final Language lang2;
-    if (keyValueArgs.containsKey("lang2")) {
-        lang2 = Language.lookup(keyValueArgs.remove("lang2"));
-    } else {
-        lang2 = null;
-    }
 
-    final Set<String> lang1Stoplist = new LinkedHashSet<String>();
-    final Set<String> lang2Stoplist = new LinkedHashSet<String>();
-    final String lang1StoplistFile = keyValueArgs.remove("lang1Stoplist");
-    final String lang2StoplistFile = keyValueArgs.remove("lang2Stoplist");
-    if (lang1StoplistFile != null) {
-      lang1Stoplist.addAll(FileUtil.readLines(new File(lang1StoplistFile)));
-    }
-    if (lang2StoplistFile != null) {
-      lang2Stoplist.addAll(FileUtil.readLines(new File(lang2StoplistFile)));
-    }
+    public final Dictionary dictionary;
+    public final List<IndexBuilder> indexBuilders = new ArrayList<IndexBuilder>();
 
-    String normalizerRules1 = keyValueArgs.remove("normalizerRules1");
-    String normalizerRules2 = keyValueArgs.remove("normalizerRules2");
-    if (normalizerRules1 == null) {
-      normalizerRules1 = lang1.getDefaultNormalizerRules();
-    }
-    if (normalizerRules2 == null) {
-      normalizerRules2 = lang2 == null ? null : lang2.getDefaultNormalizerRules();
-    }
-    
-    final String dictOutFilename = keyValueArgs.remove("dictOut");
-    if (dictOutFilename == null) {
-      fatalError("--dictOut= must be specified.");
-    }
-    
-    String dictInfo = keyValueArgs.remove("dictInfo");
-    if (dictInfo == null) {
-      fatalError("--dictInfo= must be specified.");
-    }
-    if (dictInfo.startsWith("@")) {
-      dictInfo = FileUtil.readToString(new File(dictInfo.substring(1)));
+    public DictionaryBuilder(final String dictInfoString, final Language lang0, final Language lang1, final String normalizerRules1, final String normalizerRules2, final Set<String> lang1Stoplist, final Set<String> lang2Stoplist) {
+        dictionary = new Dictionary(dictInfoString);
+        if (lang1 != null) {
+            indexBuilders.add(new IndexBuilder(this, lang0.getIsoCode(), lang0.getIsoCode() + "->" + lang1.getIsoCode(), lang0, normalizerRules1, lang1Stoplist, false));
+            indexBuilders.add(new IndexBuilder(this, lang1.getIsoCode(), lang1.getIsoCode() + "->" + lang0.getIsoCode(), lang1, normalizerRules2, lang2Stoplist, true));
+        } else {
+            indexBuilders.add(new IndexBuilder(this, lang0.getIsoCode(), lang0.getIsoCode(), lang0, normalizerRules1, lang1Stoplist, false));
+        }
     }
-    
-    final String printFile = keyValueArgs.remove("print");
-    
-    System.out.println("lang1=" + lang1);
-    System.out.println("lang2=" + lang2);
-    System.out.println("normalizerRules1=" + normalizerRules1);
-    System.out.println("normalizerRules2=" + normalizerRules2);
-    System.out.println("dictInfo=" + dictInfo);
-    System.out.println("dictOut=" + dictOutFilename);    
-    
-    final DictionaryBuilder dictionaryBuilder = new DictionaryBuilder(dictInfo, lang1, lang2, normalizerRules1, normalizerRules2, lang1Stoplist, lang2Stoplist);
-    
-    for (int i = 0; i < 100; ++i) {
-      final String prefix = "input" + i;
-      if (keyValueArgs.containsKey(prefix)) {
-        final File file = new File(keyValueArgs.remove(prefix));
-        System.out.println("Processing: " + file);
-        String charsetName = keyValueArgs.remove(prefix + "Charset");
-        if (charsetName == null) {
-          charsetName = "UTF8";
+
+    void build() {
+        for (final IndexBuilder indexBuilder : indexBuilders) {
+            indexBuilder.build();
+            dictionary.indices.add(indexBuilder.index);
         }
-        final Charset charset = Charset.forName(charsetName);
-        String inputName = keyValueArgs.remove(prefix + "Name");
-        if (inputName == null) {
-          fatalError("Must specify human readable name for: " + prefix + "Name");
+    }
+
+    public static void main(final String[] args) throws IOException, ParserConfigurationException, SAXException {
+        System.out.println("Running with arguments:");
+        for (final String arg : args) {
+            System.out.println(arg);
         }
-        String pageLimitString = keyValueArgs.remove(prefix + "PageLimit");
-        if (pageLimitString == null) {
-          pageLimitString = "-1";
+
+        final Map<String,String> keyValueArgs = Args.keyValueArgs(args);
+
+        if (!keyValueArgs.containsKey("lang1")) {
+            fatalError("--lang1= must be specified.");
         }
-        final int pageLimit = Integer.parseInt(pageLimitString);
-
-        final EntrySource entrySource = new EntrySource(dictionaryBuilder.dictionary.sources.size(), inputName, 0);
-        System.out.println("");
-        
-        String inputFormat = keyValueArgs.remove(prefix + "Format");
-        if ("tab_separated".equals(inputFormat)) {
-          final boolean flipColumns = "true".equals(keyValueArgs.remove(prefix + "FlipColumns"));
-          new DictFileParser(charset, flipColumns, DictFileParser.TAB, null, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parse(file, entrySource, pageLimit);
-        } else if ("chemnitz".equals(inputFormat)) {
-          final boolean flipColumns = "true".equals(keyValueArgs.remove(prefix + "FlipColumns"));
-          new DictFileParser(charset, flipColumns, DictFileParser.DOUBLE_COLON, DictFileParser.PIPE, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parse(file, entrySource, pageLimit);
-        } else if ("enwiktionary".equals(inputFormat)) {
-          final String type = keyValueArgs.remove(prefix + "WiktionaryType");
-          final Pattern langPattern = Pattern.compile(keyValueArgs.remove(prefix + "LangPattern"), Pattern.CASE_INSENSITIVE);
-          final Pattern langCodePattern = Pattern.compile(keyValueArgs.remove(prefix + "LangCodePattern"));
-          final int enIndex = Integer.parseInt(keyValueArgs.remove(prefix + "EnIndex")) - 1;
-            
-          if (enIndex < 0 || enIndex >= 2) {
-            fatalError("Must be 1 or 2: " + prefix + "EnIndex");
-          }
-          final Parser parser;
-          if ("EnToTranslation".equals(type)) {
-            parser = new EnToTranslationParser(dictionaryBuilder.indexBuilders.get(enIndex), dictionaryBuilder.indexBuilders.get(1-enIndex),
-                langPattern, langCodePattern, enIndex != 0);
-          } else if ("EnForeign".equals(type)) {
-            parser = new EnForeignParser(dictionaryBuilder.indexBuilders.get(enIndex), dictionaryBuilder.indexBuilders.get(1-enIndex),
-                langPattern, langCodePattern, enIndex != 0);
-          } else if ("EnEnglish".equals(type)) {
-              parser = new EnForeignParser(dictionaryBuilder.indexBuilders.get(enIndex), dictionaryBuilder.indexBuilders.get(enIndex),
-                  langPattern, langCodePattern, true);
-          } else {
-            fatalError("Invalid WiktionaryType (use EnToTranslation or EnForeign or EnEnglish): " + type);
-            return;
-          }
-          parser.parse(file, entrySource, pageLimit);
-        } else if (EnTranslationToTranslationParser.NAME.equals(inputFormat)) {
-          final String code1 = keyValueArgs.remove(prefix + "LangPattern1");
-          final String code2 = keyValueArgs.remove(prefix + "LangPattern2");
-          if (code1 == null || code2 == null) {
-            fatalError("Must specify LangPattern1 and LangPattern2.");
-            return;
-          }
-          final Pattern codePattern1 = Pattern.compile(code1, Pattern.CASE_INSENSITIVE);
-          final Pattern codePattern2 = Pattern.compile(code2, Pattern.CASE_INSENSITIVE);
-          new EnTranslationToTranslationParser(dictionaryBuilder.indexBuilders, new Pattern[] {codePattern1, codePattern2}).parse(file, entrySource, pageLimit);
-        } else if (WholeSectionToHtmlParser.NAME.equals(inputFormat)) {
-          final int titleIndex = Integer.parseInt(keyValueArgs.remove(prefix + "TitleIndex")) - 1;
-          final String wiktionaryLang = keyValueArgs.remove(prefix + "WiktionaryLang");
-          final String webUrlTemplate = keyValueArgs.remove(prefix + "WebUrlTemplate");
-          String skipLang = keyValueArgs.remove(prefix + "SkipLang");
-          if (skipLang == null) skipLang = "";
-          new WholeSectionToHtmlParser(dictionaryBuilder.indexBuilders.get(titleIndex), null, wiktionaryLang, skipLang, webUrlTemplate).parse(file, entrySource, pageLimit);
+        final Language lang1 = Language.lookup(keyValueArgs.remove("lang1"));
+        final Language lang2;
+        if (keyValueArgs.containsKey("lang2")) {
+            lang2 = Language.lookup(keyValueArgs.remove("lang2"));
         } else {
-          fatalError("Invalid or missing input format: " + inputFormat);
+            lang2 = null;
         }
-        
-        dictionaryBuilder.dictionary.sources.add(entrySource);
-        System.out.println("Done: " + file + "\n\n");
-      }
-    }
-   
-    dictionaryBuilder.build();
-    // Drop indexBuilders to free RAM
-    dictionaryBuilder.indexBuilders.clear();
-    
-    if (printFile != null) {
-      final PrintStream out = new PrintStream(new File(printFile));
-      dictionaryBuilder.dictionary.print(out);
-      out.close();
+
+        final Set<String> lang1Stoplist = new LinkedHashSet<String>();
+        final Set<String> lang2Stoplist = new LinkedHashSet<String>();
+        final String lang1StoplistFile = keyValueArgs.remove("lang1Stoplist");
+        final String lang2StoplistFile = keyValueArgs.remove("lang2Stoplist");
+        if (lang1StoplistFile != null) {
+            lang1Stoplist.addAll(FileUtil.readLines(new File(lang1StoplistFile)));
+        }
+        if (lang2StoplistFile != null) {
+            lang2Stoplist.addAll(FileUtil.readLines(new File(lang2StoplistFile)));
+        }
+
+        String normalizerRules1 = keyValueArgs.remove("normalizerRules1");
+        String normalizerRules2 = keyValueArgs.remove("normalizerRules2");
+        if (normalizerRules1 == null) {
+            normalizerRules1 = lang1.getDefaultNormalizerRules();
+        }
+        if (normalizerRules2 == null) {
+            normalizerRules2 = lang2 == null ? null : lang2.getDefaultNormalizerRules();
+        }
+
+        final String dictOutFilename = keyValueArgs.remove("dictOut");
+        if (dictOutFilename == null) {
+            fatalError("--dictOut= must be specified.");
+        }
+
+        String dictInfo = keyValueArgs.remove("dictInfo");
+        if (dictInfo == null) {
+            fatalError("--dictInfo= must be specified.");
+        }
+        if (dictInfo.startsWith("@")) {
+            dictInfo = FileUtil.readToString(new File(dictInfo.substring(1)));
+        }
+
+        final String printFile = keyValueArgs.remove("print");
+
+        System.out.println("lang1=" + lang1);
+        System.out.println("lang2=" + lang2);
+        System.out.println("normalizerRules1=" + normalizerRules1);
+        System.out.println("normalizerRules2=" + normalizerRules2);
+        System.out.println("dictInfo=" + dictInfo);
+        System.out.println("dictOut=" + dictOutFilename);
+
+        final DictionaryBuilder dictionaryBuilder = new DictionaryBuilder(dictInfo, lang1, lang2, normalizerRules1, normalizerRules2, lang1Stoplist, lang2Stoplist);
+
+        for (int i = 0; i < 100; ++i) {
+            final String prefix = "input" + i;
+            if (keyValueArgs.containsKey(prefix)) {
+                final File file = new File(keyValueArgs.remove(prefix));
+                System.out.println("Processing: " + file);
+                String charsetName = keyValueArgs.remove(prefix + "Charset");
+                if (charsetName == null) {
+                    charsetName = "UTF8";
+                }
+                final Charset charset = Charset.forName(charsetName);
+                String inputName = keyValueArgs.remove(prefix + "Name");
+                if (inputName == null) {
+                    fatalError("Must specify human readable name for: " + prefix + "Name");
+                }
+                String pageLimitString = keyValueArgs.remove(prefix + "PageLimit");
+                if (pageLimitString == null) {
+                    pageLimitString = "-1";
+                }
+                final int pageLimit = Integer.parseInt(pageLimitString);
+
+                final EntrySource entrySource = new EntrySource(dictionaryBuilder.dictionary.sources.size(), inputName, 0);
+                System.out.println("");
+
+                String inputFormat = keyValueArgs.remove(prefix + "Format");
+                if ("tab_separated".equals(inputFormat)) {
+                    final boolean flipColumns = "true".equals(keyValueArgs.remove(prefix + "FlipColumns"));
+                    new DictFileParser(charset, flipColumns, DictFileParser.TAB, null, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parse(file, entrySource, pageLimit);
+                } else if ("chemnitz".equals(inputFormat)) {
+                    final boolean flipColumns = "true".equals(keyValueArgs.remove(prefix + "FlipColumns"));
+                    new DictFileParser(charset, flipColumns, DictFileParser.DOUBLE_COLON, DictFileParser.PIPE, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parse(file, entrySource, pageLimit);
+                } else if ("enwiktionary".equals(inputFormat)) {
+                    final String type = keyValueArgs.remove(prefix + "WiktionaryType");
+                    final Pattern langPattern = Pattern.compile(keyValueArgs.remove(prefix + "LangPattern"), Pattern.CASE_INSENSITIVE);
+                    final Pattern langCodePattern = Pattern.compile(keyValueArgs.remove(prefix + "LangCodePattern"));
+                    final int enIndex = Integer.parseInt(keyValueArgs.remove(prefix + "EnIndex")) - 1;
+
+                    if (enIndex < 0 || enIndex >= 2) {
+                        fatalError("Must be 1 or 2: " + prefix + "EnIndex");
+                    }
+                    final Parser parser;
+                    if ("EnToTranslation".equals(type)) {
+                        parser = new EnToTranslationParser(dictionaryBuilder.indexBuilders.get(enIndex), dictionaryBuilder.indexBuilders.get(1-enIndex),
+                                                           langPattern, langCodePattern, enIndex != 0);
+                    } else if ("EnForeign".equals(type)) {
+                        parser = new EnForeignParser(dictionaryBuilder.indexBuilders.get(enIndex), dictionaryBuilder.indexBuilders.get(1-enIndex),
+                                                     langPattern, langCodePattern, enIndex != 0);
+                    } else if ("EnEnglish".equals(type)) {
+                        parser = new EnForeignParser(dictionaryBuilder.indexBuilders.get(enIndex), dictionaryBuilder.indexBuilders.get(enIndex),
+                                                     langPattern, langCodePattern, true);
+                    } else {
+                        fatalError("Invalid WiktionaryType (use EnToTranslation or EnForeign or EnEnglish): " + type);
+                        return;
+                    }
+                    parser.parse(file, entrySource, pageLimit);
+                } else if (EnTranslationToTranslationParser.NAME.equals(inputFormat)) {
+                    final String code1 = keyValueArgs.remove(prefix + "LangPattern1");
+                    final String code2 = keyValueArgs.remove(prefix + "LangPattern2");
+                    if (code1 == null || code2 == null) {
+                        fatalError("Must specify LangPattern1 and LangPattern2.");
+                        return;
+                    }
+                    final Pattern codePattern1 = Pattern.compile(code1, Pattern.CASE_INSENSITIVE);
+                    final Pattern codePattern2 = Pattern.compile(code2, Pattern.CASE_INSENSITIVE);
+                    new EnTranslationToTranslationParser(dictionaryBuilder.indexBuilders, new Pattern[] {codePattern1, codePattern2}).parse(file, entrySource, pageLimit);
+                } else if (WholeSectionToHtmlParser.NAME.equals(inputFormat)) {
+                    final int titleIndex = Integer.parseInt(keyValueArgs.remove(prefix + "TitleIndex")) - 1;
+                    final String wiktionaryLang = keyValueArgs.remove(prefix + "WiktionaryLang");
+                    final String webUrlTemplate = keyValueArgs.remove(prefix + "WebUrlTemplate");
+                    String skipLang = keyValueArgs.remove(prefix + "SkipLang");
+                    if (skipLang == null) skipLang = "";
+                    new WholeSectionToHtmlParser(dictionaryBuilder.indexBuilders.get(titleIndex), null, wiktionaryLang, skipLang, webUrlTemplate).parse(file, entrySource, pageLimit);
+                } else {
+                    fatalError("Invalid or missing input format: " + inputFormat);
+                }
+
+                dictionaryBuilder.dictionary.sources.add(entrySource);
+                System.out.println("Done: " + file + "\n\n");
+            }
+        }
+
+        dictionaryBuilder.build();
+        // Drop indexBuilders to free RAM
+        dictionaryBuilder.indexBuilders.clear();
+
+        if (printFile != null) {
+            final PrintStream out = new PrintStream(new File(printFile));
+            dictionaryBuilder.dictionary.print(out);
+            out.close();
+        }
+
+        System.out.println("Writing dictionary to: " + dictOutFilename);
+        final RandomAccessFile dictOut = new RandomAccessFile(dictOutFilename, "rw");
+        dictOut.setLength(0);
+        dictionaryBuilder.dictionary.write(dictOut);
+        dictOut.close();
+
+        if (!keyValueArgs.isEmpty()) {
+            System.err.println("WARNING: couldn't parse arguments: " + keyValueArgs);
+            System.exit(1);
+        }
+
     }
-    
-    System.out.println("Writing dictionary to: " + dictOutFilename);
-    final RandomAccessFile dictOut = new RandomAccessFile(dictOutFilename, "rw");
-    dictOut.setLength(0);
-    dictionaryBuilder.dictionary.write(dictOut);
-    dictOut.close();
-    
-    if (!keyValueArgs.isEmpty()) {
-      System.err.println("WARNING: couldn't parse arguments: " + keyValueArgs);
-      System.exit(1);
+
+    private static void fatalError(String string) {
+        System.err.println(string);
+
+
+        System.exit(1);
     }
-  
-  }
-  
-  private static void fatalError(String string) {
-    System.err.println(string);
-    
-    
-    System.exit(1);
-  }
-  
+
 }
index 57e76cc7d435acc8999535fe097cc68661be2380..cf5fa96fc172acf6ee8fc4633b3180fcf6806c40 100644 (file)
@@ -30,320 +30,320 @@ import java.util.Map;
 import java.util.Set;
 
 public class DictionaryBuilderMain extends TestCase {
-  
-  static final String INPUTS = "data/inputs/";
-  static final String STOPLISTS = "data/inputs/stoplists/";
-  static final String OUTPUTS = "data/outputs/";  
-  
-  // Build the non EN ones.
-  static final String[][] nonEnPairs = new String[][] {
-      {"EN"},
-      {"DE"},
-      {"IT"},
-      // This one takes a really long time, and the result is too big for code.google.com
-      //{"FR"},
-          
-      // The 3 I use most:
-      {"IT", "EN" },
-      {"DE", "EN" },
-      {"DE", "IT" },
-
-      {"AR", "DE" },
-      {"AR", "ES" },
-      {"AR", "FR" },
-      {"AR", "HE" },
-      {"AR", "IT" },
-      {"AR", "JA" },
-      {"AR", "RU" },
-      {"AR", "TR" },  // Turkish
-      {"AR", "cmn" },
-      
-      {"DE", "AR" },
-      {"DE", "FR" },
-      {"DE", "CA" },  // Catalan
-      {"DE", "CS" },  // Czech
-      {"DE", "EO" },  // Esperanto
-      {"DE", "ES" },
-      {"DE", "FR" },
-      {"DE", "HE" },
-      {"DE", "HU" },  // Hungarian
-      {"DE", "IT" },
-      {"DE", "JA" },
-      {"DE", "LA" },  // Latin
-      {"DE", "NL" },  // Dutch
-      {"DE", "PL" },  // Polish
-      {"DE", "RU" },
-      {"DE", "SV" },  // Swedish
-      {"DE", "TR" },  // Turkish
-      {"DE", "cmn" },
-      {"DE", "TA" },  // Tamil
-      
-      {"ES", "RU" },  // Spanish-Russian
-      
-      {"FR", "BG" },  // Bulgarian
-      {"FR", "CS" },  // Czech
-      {"FR", "DE" },
-      {"FR", "ES" },
-      {"FR", "IT" },
-      {"FR", "JA" },
-      {"FR", "LA" },
-      {"FR", "NL" },  // Dutch
-      {"FR", "RU" },
-      {"FR", "TR" },  // Turkish
-      {"FR", "cmn" },
-      {"FR", "EL" },  
-
-      {"IT", "DE" },
-      {"IT", "EL" },  // Greek
-      {"IT", "ES" },
-      {"IT", "FR" },
-      {"IT", "HU" },
-      {"IT", "JA" },
-      {"IT", "LA" },  // Latin
-      {"IT", "LV" },  // Latvian
-      {"IT", "NL" },
-      {"IT", "PL" },
-      {"IT", "RU" },
-      {"IT", "SV" },
-      {"IT", "TR" },  // Turkish
-      {"IT", "cmn" },
-
-      {"JA", "cmn" },
-      {"JA", "AR" },
-      {"JA", "KO" },
-
-      {"cmn", "AR" },
-      {"cmn", "DE" },
-      {"cmn", "ES" },
-      {"cmn", "FR" },
-      {"cmn", "IT" },
-      {"cmn", "KO" },
-
-      {"NO", "SV" },
-      {"NO", "FI" },
-      {"FI", "SV" },
-      
-      {"PL", "FR" },  // Polish
-      {"PL", "RU" },  // Polish
-      {"PL", "HU" },  // Polish
-      {"PL", "ES" },  // Polish
-      
-      {"TR", "EL" },  // Turkish, Greek
-
-      {"FA", "HY" },  // Persian, Armenian, by request.
-      {"FA", "SV" },  // Persian, Swedish, by request.
-      {"NL", "PL" },  // Dutch, Polish, by request.
-      
-  };
-
-
-  
-  static final Map<String,String>  isoToDedication = new LinkedHashMap<String, String>();
-  static {
-  isoToDedication.put("AF", "Wiktionary-based Afrikaans dictionary dedicated to Heiko and Mariëtte Horn.");
-  isoToDedication.put("HR", "Wiktionary-based Croatian dictionary dedicated to Ines Viskic and Miro Kresonja.");
-  isoToDedication.put("NL", "Wiktionary-based Dutch dictionary dedicated to Mike LeBeau.");
-  isoToDedication.put("DE", "@data/inputs/de-en_dedication.txt");
-  isoToDedication.put("EL", "Wiktionary-based Greek dictionary dedicated to Noah Egge.");
-  isoToDedication.put("IT", "Wiktionary-based Italian dictionary dedicated to Carolina Tropini, my favorite stardust in the whole universe!  Ti amo!");
-  isoToDedication.put("KO", "Wiktionary-based Korean dictionary dedicated to Ande Elwood--fall fashion und Fernsehturms!");
-  isoToDedication.put("PT", "Wiktionary-based Portuguese dictionary dedicated to Carlos Melo, one Tough Mudder.");
-  isoToDedication.put("RO", "Wiktionary-based Romanian dictionary dedicated to Radu Teodorescu.");
-  isoToDedication.put("RU", "Wiktionary-based Russian dictionary dedicated to Maxim Aronin--best friend always!.");
-  isoToDedication.put("SR", "Wiktionary-based Serbian dictionary dedicated to Filip Crnogorac--thanks for the honey!");
-  isoToDedication.put("ES", "Wiktionary-based Spanish dictionary made especially for Carolina Tropini! <3 XoXoXXXXX!");
-  isoToDedication.put("SV", "Wiktionary-based Swedish dictionary dedicated to Kajsa Palmblad--björn kramar!");
-  }
-  private static String getEnDictionaryInfo(String iso) {
-    return isoToDedication.containsKey(iso) ? isoToDedication.get(iso) : String.format("Wiktionary-based %s dictionary.", iso);
-  }
-  
-  static final Map<String,String>  isoToStoplist = new LinkedHashMap<String, String>();
-  static {
-  isoToStoplist.put("DE", "de.txt");
-  isoToStoplist.put("EN", "en.txt");
-  isoToStoplist.put("ES", "es.txt");
-  isoToStoplist.put("IT", "it.txt");
-  isoToStoplist.put("FR", "fr.txt");
-  }
-  private static String getStoplist(String iso) {
-    return isoToStoplist.containsKey(iso) ? isoToStoplist.get(iso) : "empty.txt";
-  }
-  
-  static String getOtherLang(final String[] pair, final String first) {
-      assert Arrays.asList(pair).contains(first);
-      assert pair.length == 2;
-      return pair[0].equals(first) ? pair[1] : pair[0];
-  }
-  
-  static List<String> getMainArgs(final String[] pair) {
-      final List<String> result = new ArrayList<String>();
-      
-    int i = 1;
-    
-    if (pair.length == 1) {
+
+    static final String INPUTS = "data/inputs/";
+    static final String STOPLISTS = "data/inputs/stoplists/";
+    static final String OUTPUTS = "data/outputs/";
+
+    // Build the non EN ones.
+    static final String[][] nonEnPairs = new String[][] {
+        {"EN"},
+        {"DE"},
+        {"IT"},
+        // This one takes a really long time, and the result is too big for code.google.com
+        //{"FR"},
+
+        // The 3 I use most:
+        {"IT", "EN" },
+        {"DE", "EN" },
+        {"DE", "IT" },
+
+        {"AR", "DE" },
+        {"AR", "ES" },
+        {"AR", "FR" },
+        {"AR", "HE" },
+        {"AR", "IT" },
+        {"AR", "JA" },
+        {"AR", "RU" },
+        {"AR", "TR" },  // Turkish
+        {"AR", "cmn" },
+
+        {"DE", "AR" },
+        {"DE", "FR" },
+        {"DE", "CA" },  // Catalan
+        {"DE", "CS" },  // Czech
+        {"DE", "EO" },  // Esperanto
+        {"DE", "ES" },
+        {"DE", "FR" },
+        {"DE", "HE" },
+        {"DE", "HU" },  // Hungarian
+        {"DE", "IT" },
+        {"DE", "JA" },
+        {"DE", "LA" },  // Latin
+        {"DE", "NL" },  // Dutch
+        {"DE", "PL" },  // Polish
+        {"DE", "RU" },
+        {"DE", "SV" },  // Swedish
+        {"DE", "TR" },  // Turkish
+        {"DE", "cmn" },
+        {"DE", "TA" },  // Tamil
+
+        {"ES", "RU" },  // Spanish-Russian
+
+        {"FR", "BG" },  // Bulgarian
+        {"FR", "CS" },  // Czech
+        {"FR", "DE" },
+        {"FR", "ES" },
+        {"FR", "IT" },
+        {"FR", "JA" },
+        {"FR", "LA" },
+        {"FR", "NL" },  // Dutch
+        {"FR", "RU" },
+        {"FR", "TR" },  // Turkish
+        {"FR", "cmn" },
+        {"FR", "EL" },
+
+        {"IT", "DE" },
+        {"IT", "EL" },  // Greek
+        {"IT", "ES" },
+        {"IT", "FR" },
+        {"IT", "HU" },
+        {"IT", "JA" },
+        {"IT", "LA" },  // Latin
+        {"IT", "LV" },  // Latvian
+        {"IT", "NL" },
+        {"IT", "PL" },
+        {"IT", "RU" },
+        {"IT", "SV" },
+        {"IT", "TR" },  // Turkish
+        {"IT", "cmn" },
+
+        {"JA", "cmn" },
+        {"JA", "AR" },
+        {"JA", "KO" },
+
+        {"cmn", "AR" },
+        {"cmn", "DE" },
+        {"cmn", "ES" },
+        {"cmn", "FR" },
+        {"cmn", "IT" },
+        {"cmn", "KO" },
+
+        {"NO", "SV" },
+        {"NO", "FI" },
+        {"FI", "SV" },
+
+        {"PL", "FR" },  // Polish
+        {"PL", "RU" },  // Polish
+        {"PL", "HU" },  // Polish
+        {"PL", "ES" },  // Polish
+
+        {"TR", "EL" },  // Turkish, Greek
+
+        {"FA", "HY" },  // Persian, Armenian, by request.
+        {"FA", "SV" },  // Persian, Swedish, by request.
+        {"NL", "PL" },  // Dutch, Polish, by request.
+
+    };
+
+
+
+    static final Map<String,String>  isoToDedication = new LinkedHashMap<String, String>();
+    static {
+        isoToDedication.put("AF", "Wiktionary-based Afrikaans dictionary dedicated to Heiko and Mariëtte Horn.");
+        isoToDedication.put("HR", "Wiktionary-based Croatian dictionary dedicated to Ines Viskic and Miro Kresonja.");
+        isoToDedication.put("NL", "Wiktionary-based Dutch dictionary dedicated to Mike LeBeau.");
+        isoToDedication.put("DE", "@data/inputs/de-en_dedication.txt");
+        isoToDedication.put("EL", "Wiktionary-based Greek dictionary dedicated to Noah Egge.");
+        isoToDedication.put("IT", "Wiktionary-based Italian dictionary dedicated to Carolina Tropini, my favorite stardust in the whole universe!  Ti amo!");
+        isoToDedication.put("KO", "Wiktionary-based Korean dictionary dedicated to Ande Elwood--fall fashion und Fernsehturms!");
+        isoToDedication.put("PT", "Wiktionary-based Portuguese dictionary dedicated to Carlos Melo, one Tough Mudder.");
+        isoToDedication.put("RO", "Wiktionary-based Romanian dictionary dedicated to Radu Teodorescu.");
+        isoToDedication.put("RU", "Wiktionary-based Russian dictionary dedicated to Maxim Aronin--best friend always!.");
+        isoToDedication.put("SR", "Wiktionary-based Serbian dictionary dedicated to Filip Crnogorac--thanks for the honey!");
+        isoToDedication.put("ES", "Wiktionary-based Spanish dictionary made especially for Carolina Tropini! <3 XoXoXXXXX!");
+        isoToDedication.put("SV", "Wiktionary-based Swedish dictionary dedicated to Kajsa Palmblad--björn kramar!");
+    }
+    private static String getEnDictionaryInfo(String iso) {
+        return isoToDedication.containsKey(iso) ? isoToDedication.get(iso) : String.format("Wiktionary-based %s dictionary.", iso);
+    }
+
+    static final Map<String,String>  isoToStoplist = new LinkedHashMap<String, String>();
+    static {
+        isoToStoplist.put("DE", "de.txt");
+        isoToStoplist.put("EN", "en.txt");
+        isoToStoplist.put("ES", "es.txt");
+        isoToStoplist.put("IT", "it.txt");
+        isoToStoplist.put("FR", "fr.txt");
+    }
+    private static String getStoplist(String iso) {
+        return isoToStoplist.containsKey(iso) ? isoToStoplist.get(iso) : "empty.txt";
+    }
+
+    static String getOtherLang(final String[] pair, final String first) {
+        assert Arrays.asList(pair).contains(first);
+        assert pair.length == 2;
+        return pair[0].equals(first) ? pair[1] : pair[0];
+    }
+
+    static List<String> getMainArgs(final String[] pair) {
+        final List<String> result = new ArrayList<String>();
+
+        int i = 1;
+
+        if (pair.length == 1) {
+            final String lang1 = pair[0];
+            final String dictFile = String.format("%s/%s.quickdic", OUTPUTS, lang1);
+            result.add(String.format("--dictOut=%s", dictFile));
+            result.add(String.format("--lang1=%s", lang1));
+            result.add(String.format("--lang1Stoplist=%s", STOPLISTS + getStoplist(lang1)));
+            result.add(String.format("--dictInfo=Wiktionary-based %s dictionary.", lang1));
+
+
+            final String wikiSplitFile = String.format("%s/wikiSplit/%s/%s.data", INPUTS, lang1.toLowerCase(), lang1);
+            if (new File(wikiSplitFile).canRead()) {
+                result.add(String.format("--input%d=%s", i, wikiSplitFile));
+                result.add(String.format("--input%dName=%s.wiktionary.org", i, lang1.toLowerCase()));
+                result.add(String.format("--input%dFormat=%s", i, WholeSectionToHtmlParser.NAME));
+                result.add(String.format("--input%dTitleIndex=%d", i, 1));
+                result.add(String.format("--input%dWiktionaryLang=%s", i, lang1));
+                result.add(String.format("--input%dSkipLang=%s", i, lang1));
+                result.add(String.format("--input%dWebUrlTemplate=http://%s.wiktionary.org/wiki/%%s", i, lang1.toLowerCase()));
+                //result.add(String.format("--input%dPageLimit=100", i));
+                ++i;
+            } else {
+                System.err.println("Can't read file: " + wikiSplitFile);
+            }
+
+            if (lang1.equals("EN") && !lang1.equals("EN")) { // NOTE(review): always false — dead code; likely intended just lang1.equals("EN")
+                // Add a parser that tries to use the definitions.  This is
+                // not very pretty yet.
+                result.add(String.format("--input%d=%s/wikiSplit/en/%s.data", i, INPUTS, lang1));
+                result.add(String.format("--input%dName=ENWiktionary.%s", i, lang1)) ;
+                result.add(String.format("--input%dFormat=enwiktionary", i));
+                result.add(String.format("--input%dWiktionaryType=EnEnglish", i));
+                result.add(String.format("--input%dLangPattern=%s", i, "English"));
+                result.add(String.format("--input%dLangCodePattern=%s", i, lang1.toLowerCase()));
+                result.add(String.format("--input%dEnIndex=%d", i, 1));
+                //result.add(String.format("--input%dPageLimit=100", i));
+                ++i;
+            }
+
+            return result;
+        }  // Single-lang dictionaries.
+
         final String lang1 = pair[0];
-        final String dictFile = String.format("%s/%s.quickdic", OUTPUTS, lang1);
+        final String lang2 = pair[1];
+
+        final String dictFile = String.format("%s/%s-%s.quickdic",
+                                              OUTPUTS, lang1, lang2);
+
         result.add(String.format("--dictOut=%s", dictFile));
-        result.add(String.format("--lang1=%s", lang1));
         result.add(String.format("--lang1Stoplist=%s", STOPLISTS + getStoplist(lang1)));
-        result.add(String.format("--dictInfo=Wikitionary-based %s dictionary.", lang1));
+        result.add(String.format("--lang2Stoplist=%s", STOPLISTS + getStoplist(lang2)));
 
-        
-        final String wikiSplitFile = String.format("%s/wikiSplit/%s/%s.data", INPUTS, lang1.toLowerCase(), lang1);
-        if (new File(wikiSplitFile).canRead()) {
+        // For a few langs, put the defs of the other language in DE/IT/FR using WholeSection.
+        for (final String wikitionaryLang : Arrays.asList("EN", "DE", "IT", "FR")) {
+            if (!Arrays.asList(pair).contains(wikitionaryLang)) {
+                continue;
+            }
+            final String foreignIso = getOtherLang(pair, wikitionaryLang);
+            final String wikiSplitFile = String.format("%s/wikiSplit/%s/%s.data", INPUTS, wikitionaryLang.toLowerCase(), foreignIso);
+            if (!new File(wikiSplitFile).canRead()) {
+                System.err.println("WARNING: Can't read file: " + wikiSplitFile);
+                continue;
+            }
             result.add(String.format("--input%d=%s", i, wikiSplitFile));
-            result.add(String.format("--input%dName=%s.wiktionary.org", i, lang1.toLowerCase()));
+            result.add(String.format("--input%dName=%s.wiktionary.org", i, wikitionaryLang.toLowerCase()));
             result.add(String.format("--input%dFormat=%s", i, WholeSectionToHtmlParser.NAME));
-            result.add(String.format("--input%dTitleIndex=%d", i, 1));
-            result.add(String.format("--input%dWiktionaryLang=%s", i, lang1));
-            result.add(String.format("--input%dSkipLang=%s", i, lang1));
-            result.add(String.format("--input%dWebUrlTemplate=http://%s.wiktionary.org/wiki/%%s", i, lang1.toLowerCase()));
-            //result.add(String.format("--input%dPageLimit=100", i));
+            result.add(String.format("--input%dTitleIndex=%d", i, Arrays.asList(pair).indexOf(foreignIso) + 1));
+            result.add(String.format("--input%dWiktionaryLang=%s", i, wikitionaryLang));
+            result.add(String.format("--input%dSkipLang=%s", i, foreignIso));
+            result.add(String.format("--input%dWebUrlTemplate=http://%s.wiktionary.org/wiki/%%s", i, wikitionaryLang.toLowerCase()));
             ++i;
-        } else {
-            System.err.println("Can't read file: " + wikiSplitFile);
         }
-        
-        if (lang1.equals("EN") && !lang1.equals("EN")) {
-            // Add a parser that tries to use the definitions.  This is
-            // not very pretty yet.
-            result.add(String.format("--input%d=%s/wikiSplit/en/%s.data", i, INPUTS, lang1));
-            result.add(String.format("--input%dName=ENWiktionary.%s", i, lang1)) ;
+
+        // Deal with the pairs where one is English.
+        if (Arrays.asList(pair).contains("EN")) {
+            final String foreignIso = getOtherLang(pair, "EN");
+            String foreignRegex = WiktionaryLangs.isoCodeToEnWikiName.get(foreignIso);
+
+            result.add(String.format("--lang1=%s", lang1));
+            result.add(String.format("--lang2=%s",  lang2));
+            result.add(String.format("--dictInfo=%s", getEnDictionaryInfo(foreignIso)));
+
+            // Foreign section.
+            result.add(String.format("--input%d=%s/wikiSplit/en/%s.data", i, INPUTS, foreignIso));
+            result.add(String.format("--input%dName=ENWiktionary.%s", i, foreignIso)) ;
             result.add(String.format("--input%dFormat=enwiktionary", i));
-            result.add(String.format("--input%dWiktionaryType=EnEnglish", i));
-            result.add(String.format("--input%dLangPattern=%s", i, "English"));
-            result.add(String.format("--input%dLangCodePattern=%s", i, lang1.toLowerCase()));
-            result.add(String.format("--input%dEnIndex=%d", i, 1));
-            //result.add(String.format("--input%dPageLimit=100", i));
+            result.add(String.format("--input%dWiktionaryType=EnForeign", i));
+            result.add(String.format("--input%dLangPattern=%s", i, foreignRegex));
+            result.add(String.format("--input%dLangCodePattern=%s", i, foreignIso.toLowerCase()));
+            result.add(String.format("--input%dEnIndex=%d", i, Arrays.asList(pair).indexOf("EN") + 1));
+            ++i;
+
+            // Translation section.
+            result.add(String.format("--input%d=%swikiSplit/en/EN.data", i, INPUTS));
+            result.add(String.format("--input%dName=enwiktionary.english", i));
+            result.add(String.format("--input%dFormat=enwiktionary", i));
+            result.add(String.format("--input%dWiktionaryType=EnToTranslation", i));
+            result.add(String.format("--input%dLangPattern=%s", i, foreignRegex));
+            result.add(String.format("--input%dLangCodePattern=%s", i, foreignIso.toLowerCase()));
+            result.add(String.format("--input%dEnIndex=%d", i, Arrays.asList(pair).indexOf("EN") + 1));
+            ++i;
+
+            if (foreignIso.equals("DE")) {
+                result.add(String.format("--input%d=%sde-en_chemnitz.txt", i, INPUTS));
+                result.add(String.format("--input%dName=chemnitz", i));
+                result.add(String.format("--input%dCharset=UTF8", i));
+                result.add(String.format("--input%dFormat=chemnitz", i));
+                ++i;
+            }
+
+        } else {
+            // Pairs without English.
+            result.add(String.format("--lang1=%s", lang1));
+            result.add(String.format("--lang2=%s", lang2));
+            result.add(String.format("--dictInfo=Wiktionary-based %s-%s dictionary.", lang1, lang2));
+
+            result.add(String.format("--input%d=%swikiSplit/en/EN.data", i, INPUTS));
+            result.add(String.format("--input%dName=BETA!enwiktionary.%s-%s", i, lang1, lang2));
+            result.add(String.format("--input%dFormat=%s", i, EnTranslationToTranslationParser.NAME));
+            result.add(String.format("--input%dLangPattern1=%s", i, lang1));
+            result.add(String.format("--input%dLangPattern2=%s", i, lang2));
             ++i;
+
+            // TODO: Could use FR translation section here too.
         }
-        
+
         return result;
-    }  // Single-lang dictionaries.
-    
-    final String lang1 = pair[0];
-    final String lang2 = pair[1];
-    
-    final String dictFile = String.format("%s/%s-%s.quickdic", 
-        OUTPUTS, lang1, lang2);
-    
-    result.add(String.format("--dictOut=%s", dictFile));
-    result.add(String.format("--lang1Stoplist=%s", STOPLISTS + getStoplist(lang1)));
-    result.add(String.format("--lang2Stoplist=%s", STOPLISTS + getStoplist(lang2)));
-
-    // For a few langs, put the defs of the other language in DE/IT/FR using WholeSection.
-    for (final String wikitionaryLang : Arrays.asList("EN", "DE", "IT", "FR")) {
-        if (!Arrays.asList(pair).contains(wikitionaryLang)) {
-            continue;
+    }
+
+    public static void main(final String[] args) throws Exception {
+
+        final List<String[]> allPairs = new ArrayList<String[]>();
+
+        allPairs.addAll(Arrays.asList(nonEnPairs));
+        // Add all the EN-XX pairs.
+        for (final String isoCode : WiktionaryLangs.isoCodeToEnWikiName.keySet()) {
+            if (!isoCode.equals("EN")) {
+                allPairs.add(new String[] {"EN", isoCode});
+            }
         }
-        final String foreignIso = getOtherLang(pair, wikitionaryLang);
-        final String wikiSplitFile = String.format("%s/wikiSplit/%s/%s.data", INPUTS, wikitionaryLang.toLowerCase(), foreignIso);
-        if (!new File(wikiSplitFile).canRead()) {
-            System.err.println("WARNING: Can't read file: " + wikiSplitFile);
-            continue;
+
+
+        final Set<List<String>> done = new LinkedHashSet<List<String>>();
+        boolean go = true;
+        for (final String[] pair : allPairs) {
+            Arrays.sort(pair);
+            final List<String> pairList = Arrays.asList(pair);
+            if (done.contains(pairList)) {
+                continue;
+            }
+            done.add(pairList);
+
+            if (pairList.contains("EN") && pairList.contains("DE")) {
+                go = true;
+            } else {
+                go = false;
+            }
+
+            if (!go) {
+                continue;
+            }
+
+            DictionaryBuilder.main(getMainArgs(pair).toArray(new String[0]));
         }
-        result.add(String.format("--input%d=%s", i, wikiSplitFile));
-        result.add(String.format("--input%dName=%s.wiktionary.org", i, wikitionaryLang.toLowerCase()));
-        result.add(String.format("--input%dFormat=%s", i, WholeSectionToHtmlParser.NAME));
-        result.add(String.format("--input%dTitleIndex=%d", i, Arrays.asList(pair).indexOf(foreignIso) + 1));
-        result.add(String.format("--input%dWiktionaryLang=%s", i, wikitionaryLang));
-        result.add(String.format("--input%dSkipLang=%s", i, foreignIso));
-        result.add(String.format("--input%dWebUrlTemplate=http://%s.wiktionary.org/wiki/%%s", i, wikitionaryLang.toLowerCase()));
-        ++i;
-    }
-    
-    // Deal with the pairs where one is English.
-    if (Arrays.asList(pair).contains("EN")) {
-      final String foreignIso = getOtherLang(pair, "EN");
-      String foreignRegex = WiktionaryLangs.isoCodeToEnWikiName.get(foreignIso);
-      
-      result.add(String.format("--lang1=%s", lang1));
-      result.add(String.format("--lang2=%s",  lang2));
-      result.add(String.format("--dictInfo=%s", getEnDictionaryInfo(foreignIso)));
-      
-      // Foreign section.
-      result.add(String.format("--input%d=%s/wikiSplit/en/%s.data", i, INPUTS, foreignIso));
-      result.add(String.format("--input%dName=ENWiktionary.%s", i, foreignIso)) ;
-      result.add(String.format("--input%dFormat=enwiktionary", i));
-      result.add(String.format("--input%dWiktionaryType=EnForeign", i));
-      result.add(String.format("--input%dLangPattern=%s", i, foreignRegex));
-      result.add(String.format("--input%dLangCodePattern=%s", i, foreignIso.toLowerCase()));
-      result.add(String.format("--input%dEnIndex=%d", i, Arrays.asList(pair).indexOf("EN") + 1));
-      ++i;
-
-      // Translation section.
-      result.add(String.format("--input%d=%swikiSplit/en/EN.data", i, INPUTS));
-      result.add(String.format("--input%dName=enwiktionary.english", i));
-      result.add(String.format("--input%dFormat=enwiktionary", i));
-      result.add(String.format("--input%dWiktionaryType=EnToTranslation", i));
-      result.add(String.format("--input%dLangPattern=%s", i, foreignRegex));
-      result.add(String.format("--input%dLangCodePattern=%s", i, foreignIso.toLowerCase()));
-      result.add(String.format("--input%dEnIndex=%d", i, Arrays.asList(pair).indexOf("EN") + 1));
-      ++i;
-      
-      if (foreignIso.equals("DE")) {
-        result.add(String.format("--input%d=%sde-en_chemnitz.txt", i, INPUTS));
-        result.add(String.format("--input%dName=chemnitz", i));
-        result.add(String.format("--input%dCharset=UTF8", i));
-        result.add(String.format("--input%dFormat=chemnitz", i));
-        ++i;
-      }
-      
-    } else {
-      // Pairs without English.
-      result.add(String.format("--lang1=%s", lang1));
-      result.add(String.format("--lang2=%s", lang2));
-      result.add(String.format("--dictInfo=Wikitionary-based %s-%s dictionary.", lang1, lang2));
-      result.add(String.format("--input%d=%swikiSplit/en/EN.data", i, INPUTS));
-      result.add(String.format("--input%dName=BETA!enwiktionary.%s-%s", i, lang1, lang2));
-      result.add(String.format("--input%dFormat=%s", i, EnTranslationToTranslationParser.NAME));
-      result.add(String.format("--input%dLangPattern1=%s", i, lang1));
-      result.add(String.format("--input%dLangPattern2=%s", i, lang2));
-      ++i;
-      
-      // TODO: Could use FR translation section here too.
-    }
-    
-    return result;
-  }
-
-  public static void main(final String[] args) throws Exception {
-    
-    final List<String[]> allPairs = new ArrayList<String[]>();
-    
-    allPairs.addAll(Arrays.asList(nonEnPairs));
-    // Add all the EN-XX pairs.
-    for (final String isoCode : WiktionaryLangs.isoCodeToEnWikiName.keySet()) {
-      if (!isoCode.equals("EN")) {
-          allPairs.add(new String[] {"EN", isoCode});
-      }
-    }
-    
-        
-    final Set<List<String>> done = new LinkedHashSet<List<String>>();
-    boolean go = true;
-    for (final String[] pair : allPairs) {
-      Arrays.sort(pair);
-      final List<String> pairList = Arrays.asList(pair);
-      if (done.contains(pairList)) {
-        continue;
-      }
-      done.add(pairList);
-      
-      if (pairList.contains("EN") && pairList.contains("DE")) {
-          go = true;
-      } else {
-          go = false;
-      }
-      
-      if (!go) {
-          continue;
-      }
-      
-      DictionaryBuilder.main(getMainArgs(pair).toArray(new String[0]));
+
     }
-    
-  }    
 }
index 417df82d420112076e0423ca1a61abf70f675780..7669414a4e37235d68aed3ffc8004ff8395e97e6 100644 (file)
@@ -28,343 +28,343 @@ import com.hughes.util.FileUtil;
 import junit.framework.TestCase;
 
 public class DictionaryBuilderTest extends TestCase {
-  
-  public static final String TEST_INPUTS = "testdata/inputs/";
-  public static final String WIKISPLIT = "data/inputs/wikiSplit/";
-  public static final String WIKISPLIT_EN = "data/inputs/wikiSplit/en/";
-  public static final String STOPLISTS = "data/inputs/stoplists/";
-  public static final String GOLDENS = "testdata/goldens/";
-
-  public static final String TEST_OUTPUTS = "testdata/outputs/";
-
-  public void testItConj() throws Exception {
-      final String toParse = "{{it-conj-are|d|avere|pres2s=dai|pres3s=dà|pres3p=danno|prem1s=diedi|prem1s2=detti|prem2s=desti|prem3s=diede|prem3s2=dette|prem1p=demmo|prem2p=deste|prem3p=diedero|prem3p2=dettero|fut1s=darò|fut2s=darai|fut3s=darà|fut1p=daremo|fut2p=darete|fut3p=daranno|cond1s=darei|cond2s=daresti|cond3s=darebbe|cond1p=daremmo|cond2p=dareste|cond3p=darebbero|sub123s=dia|sub3p=diano|impsub12s=dessi|impsub3s=desse|impsub1p=dessimo|impsub2p=deste|impsub3p=dessero|imp2s=dà|imp2s2=dai|imp2s3=da'|imp3s=dia|imp3p=diano}}\n" +
-              "{{it-conj-are|accus|avere}}\n" +
-              "{{it-conj-care|pag|avere or essere}}\n" +
-              "{{it-conj-iare|studi|avere}}\n" +
-              "{{it-conj-iare-b|avvi|avere}}\n" +
-              "{{it-conj-ciare|pronunc|avere}}\n" +
-              "{{it-conj-ere|sed|essere|pres1s=siedo|pres1s2=seggo|pres2s=siedi|pres3s=siede|pres3p=siedono|pres3p2=seggono|fut1s2=siederò|fut2s2=siederai|fut3s2=siederà|fut1p2=siederemo|fut2p2=siederete|fut3p2=siederanno|cond1s2=siederei|cond2s2=siederesti|cond3s2=siederebbe|cond1p2=siederemmo|cond2p2=siedereste|cond3p2=siederebbero|sub123s=sieda|sub3p=siedano|imp2s=siedi|imp3s=sieda|imp3s2=segga|imp3p=siedano|imp3p2=seggano}}\n" +
-              "{{it-conj-ere|persuad|avere|pastp=persuaso|prem1s=persuasi|prem3s=persuase|prem3s2=''|prem3p=persuasero|prem3p2=''}}\n" +
-              "{{it-conj-ere|abbatt|avere}}\n" +
-              "{{it-conj-ire|copr|avere|pastp=coperto|prem1s2=copersi|prem3s2=coperse|prem3p2=copersero}}\n" +
-              "{{it-conj-ire-b|prefer|avere}}\n" +
-              "{{it-conj-urre|prod|avere}}\n" +
-              "{{it-conj-arsi|lav}}\n" +
-              "{{it-conj-ersi|abbatt}}\n" +
-              "{{it-conj-iarsi|annoi}}\n" +
-              "{{it-conj-carsi|coniug}}\n" +
-              "{{it-conj-ciarsi|affacc}}\n" +
-              "{{it-conj-irsi|vest}}\n" +
-              "{{it-conj-irsi-b|fer}}\n" +
-              "{{it-conj-ursi|rid|essere}}\n" +
-              "{{it-conj-cire|ricuc|avere}}\n" +
-              "{{it-conj-iarsi-b|riavvi|essere}}" +
-              "{{it-conj-fare|putre|avere}}\n" + 
-              "{{it-conj-cirsi|cuc|essere}}\n" +
-              "{{it-conj-ere|smett|avere|pastp=smesso|prem1s=smisi|prem3s=smise|prem3s2=''|prem3p=smisero|prem3p2=''}}\n" +
-              "{{term||[[cor#Latin|Cor]] [[Carolus#Latin|Carolī]]|Charles' heart}}\n" +
-              "{{term|sc=Grek|λόγος|tr=lógos||word}}\n" +
-              "{{term|verbo|verbō|for the word}}\n"
-              ;
-      final DictionaryBuilder db = new DictionaryBuilder("", Language.en, Language.it,  "", "", Collections.singleton("X"), Collections.singleton("X"));
-      WholeSectionToHtmlParser parser = new WholeSectionToHtmlParser(db.indexBuilders.get(0), null, "EN", "IT", "http://en.wiktionary.org/wiki/%s");
-      parser.title = "dummyTitle";
-      parser.entrySource = new EntrySource(0, "dummySource", 0);
-      parser.parseSection("dummyHeading", toParse);
-      db.build();
-      
-      final String dictName = "testItConj.html";
-      final PrintStream out = new PrintStream(new File(TEST_OUTPUTS, dictName));
-      db.dictionary.print(out);
-      out.close();
-      
-      assertFilesEqual(GOLDENS + dictName, TEST_OUTPUTS + dictName);
-  }
-  
-  public void doTestCustomDict(final String name, final String lang1,
-      final String lang2, final String inputFile) throws Exception {
-    final File result = new File(TEST_OUTPUTS + name);
-    System.out.println("Writing to: " + result);
-    DictionaryBuilder.main(new String[] {
-        "--dictOut=" + result.getAbsolutePath(),
-        "--lang1=" + lang1,
-        "--lang2=" + lang2,
-        "--lang1Stoplist=" + STOPLISTS + "empty.txt",
-        "--lang2Stoplist=" + STOPLISTS + "empty.txt",
-        "--dictInfo=bleh.",
-        
-        "--input1=testdata/inputs/" + inputFile,
-        "--input1Name=my_input_" + name,
-        "--input1Charset=ISO-8859-1",
-        "--input1Format=tab_separated",
-
-        "--print=" + result.getPath() + ".text",
-    });
-    
-    checkGolden(name, result); 
-  }
-  
-  public void test_FR_NL() throws Exception {
-    doTestCustomDict("QuickDic-FR-NL.quickdic", "FR", "NL", "QuickDic-FR-NL.txt");
-  }
-  
-  public void testWiktionary_en_de2fr() throws Exception {
-    wiktionaryTestWithEnTrans2Trans("wiktionary.de_fr.quickdic", "DE", "FR");
-  }
-
-  public void wiktionaryTestWithEnTrans2Trans(final String name, final String lang1,
-      final String lang2) throws Exception {
-    final File result = new File(TEST_OUTPUTS + name);
-    System.out.println("Writing to: " + result);
-    DictionaryBuilder.main(new String[] {
-        "--dictOut=" + result.getAbsolutePath(),
-        "--lang1=" + lang1,
-        "--lang2=" + lang2,
-        "--lang1Stoplist=" + STOPLISTS + "empty.txt",
-        "--lang2Stoplist=" + STOPLISTS + "empty.txt",
-        "--dictInfo=SomeWikiDataTrans2Trans",
-
-        "--input4=" + WIKISPLIT_EN + "EN.data",
-        "--input4Name=" + name,
-        "--input4Format=" + EnTranslationToTranslationParser.NAME,
-        "--input4LangPattern1=" + lang1,
-        "--input4LangPattern2=" + lang2,
-        "--input4PageLimit=1000",
-
-        "--print=" + result.getPath() + ".text",
-    });
-    
-    checkGolden(name, result); 
-  }
-
-  public void testWiktionary_WholeSection_DE() throws Exception {
-    enWiktionaryTestWithWholeSectionToHtml("enwiktionary.WholeSection.DE.quickdic", "DE", 100);
-  }
-
-  public void testWiktionary_WholeSection_EN() throws Exception {
-    enWiktionaryTestWithWholeSectionToHtml("enwiktionary.WholeSection.EN.quickdic", "EN", 100);
-  }
-
-  public void testWiktionary_WholeSection_IT() throws Exception {
-    // Have to run to 800 to get a few verb conjugations (including essere!)
-    enWiktionaryTestWithWholeSectionToHtml("enwiktionary.WholeSection.IT.quickdic", "IT", 800);
-  }
-
-  public void enWiktionaryTestWithWholeSectionToHtml(final String name, final String langCode, final int pageLimit) throws Exception {
-    final File result = new File(TEST_OUTPUTS + name);
-    System.out.println("Writing to: " + result);
-    DictionaryBuilder.main(new String[] {
-        "--dictOut=" + result.getAbsolutePath(),
-        "--lang1=" + langCode,
-        "--lang2=" + "EN",
-        "--lang1Stoplist=" + STOPLISTS + "empty.txt",
-        "--lang2Stoplist=" + STOPLISTS + "empty.txt",
-        "--dictInfo=SomeWikiDataWholeSection",
-
-        "--input4=" + WIKISPLIT_EN + langCode + ".data",
-        "--input4Name=" + name,
-        "--input4Format=" + WholeSectionToHtmlParser.NAME,
-        "--input4WiktionaryLang=EN",
-        "--input4SkipLang=" + langCode,
-        "--input4TitleIndex=" + "1",
-        "--input4PageLimit=" + pageLimit,
-
-        "--print=" + result.getPath() + ".text",
-    });
-    checkGolden(name, result); 
-  }
-  
-  //-----------------------------------------------------------------
-
-  public void testSingleLang_EN() throws Exception {
-      wiktionaryTestSingleLang("SingleLang_EN.quickdic", "EN", 100);
-  }
-
-  public void testSingleLang_DE() throws Exception {
-      wiktionaryTestSingleLang("SingleLang_DE.quickdic", "DE", 100);
-  }
-
-  public void testSingleLang_IT() throws Exception {
-      wiktionaryTestSingleLang("SingleLang_IT.quickdic", "IT", 100);
-  }
-
-  public void testSingleLang_FR() throws Exception {
-      wiktionaryTestSingleLang("SingleLang_FR.quickdic", "FR", 100);
-  }
-
-  public void wiktionaryTestSingleLang(final String name, final String langCode, final int pageLimit) throws Exception {
-      final File result = new File(TEST_OUTPUTS + name);
-      System.out.println("Writing to: " + result);
-      DictionaryBuilder.main(new String[] {
-          "--dictOut=" + result.getAbsolutePath(),
-          "--lang1=" + langCode,
-          "--lang1Stoplist=" + STOPLISTS + "empty.txt",
-          "--dictInfo=SomeWikiDataWholeSection",
-          "--input4=" + WIKISPLIT + langCode.toLowerCase() + "/" + langCode + ".data",
-          "--input4Name=" + name,
-          "--input4Format=" + WholeSectionToHtmlParser.NAME,
-          "--input4WiktionaryLang=" + langCode,
-          "--input4SkipLang=" + langCode,
-          "--input4TitleIndex=" + "1",
-          "--input4PageLimit=" + pageLimit,
-          "--print=" + result.getPath() + ".text",
-      });
-      checkGolden(name, result); 
+
+    public static final String TEST_INPUTS = "testdata/inputs/";
+    public static final String WIKISPLIT = "data/inputs/wikiSplit/";
+    public static final String WIKISPLIT_EN = "data/inputs/wikiSplit/en/";
+    public static final String STOPLISTS = "data/inputs/stoplists/";
+    public static final String GOLDENS = "testdata/goldens/";
+
+    public static final String TEST_OUTPUTS = "testdata/outputs/";
+
+    public void testItConj() throws Exception {
+        final String toParse = "{{it-conj-are|d|avere|pres2s=dai|pres3s=dà|pres3p=danno|prem1s=diedi|prem1s2=detti|prem2s=desti|prem3s=diede|prem3s2=dette|prem1p=demmo|prem2p=deste|prem3p=diedero|prem3p2=dettero|fut1s=darò|fut2s=darai|fut3s=darà|fut1p=daremo|fut2p=darete|fut3p=daranno|cond1s=darei|cond2s=daresti|cond3s=darebbe|cond1p=daremmo|cond2p=dareste|cond3p=darebbero|sub123s=dia|sub3p=diano|impsub12s=dessi|impsub3s=desse|impsub1p=dessimo|impsub2p=deste|impsub3p=dessero|imp2s=dà|imp2s2=dai|imp2s3=da'|imp3s=dia|imp3p=diano}}\n" +
+                               "{{it-conj-are|accus|avere}}\n" +
+                               "{{it-conj-care|pag|avere or essere}}\n" +
+                               "{{it-conj-iare|studi|avere}}\n" +
+                               "{{it-conj-iare-b|avvi|avere}}\n" +
+                               "{{it-conj-ciare|pronunc|avere}}\n" +
+                               "{{it-conj-ere|sed|essere|pres1s=siedo|pres1s2=seggo|pres2s=siedi|pres3s=siede|pres3p=siedono|pres3p2=seggono|fut1s2=siederò|fut2s2=siederai|fut3s2=siederà|fut1p2=siederemo|fut2p2=siederete|fut3p2=siederanno|cond1s2=siederei|cond2s2=siederesti|cond3s2=siederebbe|cond1p2=siederemmo|cond2p2=siedereste|cond3p2=siederebbero|sub123s=sieda|sub3p=siedano|imp2s=siedi|imp3s=sieda|imp3s2=segga|imp3p=siedano|imp3p2=seggano}}\n" +
+                               "{{it-conj-ere|persuad|avere|pastp=persuaso|prem1s=persuasi|prem3s=persuase|prem3s2=''|prem3p=persuasero|prem3p2=''}}\n" +
+                               "{{it-conj-ere|abbatt|avere}}\n" +
+                               "{{it-conj-ire|copr|avere|pastp=coperto|prem1s2=copersi|prem3s2=coperse|prem3p2=copersero}}\n" +
+                               "{{it-conj-ire-b|prefer|avere}}\n" +
+                               "{{it-conj-urre|prod|avere}}\n" +
+                               "{{it-conj-arsi|lav}}\n" +
+                               "{{it-conj-ersi|abbatt}}\n" +
+                               "{{it-conj-iarsi|annoi}}\n" +
+                               "{{it-conj-carsi|coniug}}\n" +
+                               "{{it-conj-ciarsi|affacc}}\n" +
+                               "{{it-conj-irsi|vest}}\n" +
+                               "{{it-conj-irsi-b|fer}}\n" +
+                               "{{it-conj-ursi|rid|essere}}\n" +
+                               "{{it-conj-cire|ricuc|avere}}\n" +
+                               "{{it-conj-iarsi-b|riavvi|essere}}" +
+                               "{{it-conj-fare|putre|avere}}\n" +
+                               "{{it-conj-cirsi|cuc|essere}}\n" +
+                               "{{it-conj-ere|smett|avere|pastp=smesso|prem1s=smisi|prem3s=smise|prem3s2=''|prem3p=smisero|prem3p2=''}}\n" +
+                               "{{term||[[cor#Latin|Cor]] [[Carolus#Latin|Carolī]]|Charles' heart}}\n" +
+                               "{{term|sc=Grek|λόγος|tr=lógos||word}}\n" +
+                               "{{term|verbo|verbō|for the word}}\n"
+                               ;
+        final DictionaryBuilder db = new DictionaryBuilder("", Language.en, Language.it,  "", "", Collections.singleton("X"), Collections.singleton("X"));
+        WholeSectionToHtmlParser parser = new WholeSectionToHtmlParser(db.indexBuilders.get(0), null, "EN", "IT", "http://en.wiktionary.org/wiki/%s");
+        parser.title = "dummyTitle";
+        parser.entrySource = new EntrySource(0, "dummySource", 0);
+        parser.parseSection("dummyHeading", toParse);
+        db.build();
+
+        final String dictName = "testItConj.html";
+        final PrintStream out = new PrintStream(new File(TEST_OUTPUTS, dictName));
+        db.dictionary.print(out);
+        out.close();
+
+        assertFilesEqual(GOLDENS + dictName, TEST_OUTPUTS + dictName);
+    }
+
+    public void doTestCustomDict(final String name, final String lang1,
+                                 final String lang2, final String inputFile) throws Exception {
+        final File result = new File(TEST_OUTPUTS + name);
+        System.out.println("Writing to: " + result);
+        DictionaryBuilder.main(new String[] {
+                                   "--dictOut=" + result.getAbsolutePath(),
+                                   "--lang1=" + lang1,
+                                   "--lang2=" + lang2,
+                                   "--lang1Stoplist=" + STOPLISTS + "empty.txt",
+                                   "--lang2Stoplist=" + STOPLISTS + "empty.txt",
+                                   "--dictInfo=bleh.",
+
+                                   "--input1=testdata/inputs/" + inputFile,
+                                   "--input1Name=my_input_" + name,
+                                   "--input1Charset=ISO-8859-1",
+                                   "--input1Format=tab_separated",
+
+                                   "--print=" + result.getPath() + ".text",
+                               });
+
+        checkGolden(name, result);
+    }
+
+    public void test_FR_NL() throws Exception {
+        doTestCustomDict("QuickDic-FR-NL.quickdic", "FR", "NL", "QuickDic-FR-NL.txt");
+    }
+
+    public void testWiktionary_en_de2fr() throws Exception {
+        wiktionaryTestWithEnTrans2Trans("wiktionary.de_fr.quickdic", "DE", "FR");
+    }
+
+    public void wiktionaryTestWithEnTrans2Trans(final String name, final String lang1,
+            final String lang2) throws Exception {
+        final File result = new File(TEST_OUTPUTS + name);
+        System.out.println("Writing to: " + result);
+        DictionaryBuilder.main(new String[] {
+                                   "--dictOut=" + result.getAbsolutePath(),
+                                   "--lang1=" + lang1,
+                                   "--lang2=" + lang2,
+                                   "--lang1Stoplist=" + STOPLISTS + "empty.txt",
+                                   "--lang2Stoplist=" + STOPLISTS + "empty.txt",
+                                   "--dictInfo=SomeWikiDataTrans2Trans",
+
+                                   "--input4=" + WIKISPLIT_EN + "EN.data",
+                                   "--input4Name=" + name,
+                                   "--input4Format=" + EnTranslationToTranslationParser.NAME,
+                                   "--input4LangPattern1=" + lang1,
+                                   "--input4LangPattern2=" + lang2,
+                                   "--input4PageLimit=1000",
+
+                                   "--print=" + result.getPath() + ".text",
+                               });
+
+        checkGolden(name, result);
+    }
+
+    public void testWiktionary_WholeSection_DE() throws Exception {
+        enWiktionaryTestWithWholeSectionToHtml("enwiktionary.WholeSection.DE.quickdic", "DE", 100);
+    }
+
+    public void testWiktionary_WholeSection_EN() throws Exception {
+        enWiktionaryTestWithWholeSectionToHtml("enwiktionary.WholeSection.EN.quickdic", "EN", 100);
+    }
+
+    public void testWiktionary_WholeSection_IT() throws Exception {
+        // Have to run to 800 to get a few verb conjugations (including essere!)
+        enWiktionaryTestWithWholeSectionToHtml("enwiktionary.WholeSection.IT.quickdic", "IT", 800);
+    }
+
+    public void enWiktionaryTestWithWholeSectionToHtml(final String name, final String langCode, final int pageLimit) throws Exception {
+        final File result = new File(TEST_OUTPUTS + name);
+        System.out.println("Writing to: " + result);
+        DictionaryBuilder.main(new String[] {
+                                   "--dictOut=" + result.getAbsolutePath(),
+                                   "--lang1=" + langCode,
+                                   "--lang2=" + "EN",
+                                   "--lang1Stoplist=" + STOPLISTS + "empty.txt",
+                                   "--lang2Stoplist=" + STOPLISTS + "empty.txt",
+                                   "--dictInfo=SomeWikiDataWholeSection",
+
+                                   "--input4=" + WIKISPLIT_EN + langCode + ".data",
+                                   "--input4Name=" + name,
+                                   "--input4Format=" + WholeSectionToHtmlParser.NAME,
+                                   "--input4WiktionaryLang=EN",
+                                   "--input4SkipLang=" + langCode,
+                                   "--input4TitleIndex=" + "1",
+                                   "--input4PageLimit=" + pageLimit,
+
+                                   "--print=" + result.getPath() + ".text",
+                               });
+        checkGolden(name, result);
+    }
+
+    //-----------------------------------------------------------------
+
+    public void testSingleLang_EN() throws Exception {
+        wiktionaryTestSingleLang("SingleLang_EN.quickdic", "EN", 100);
+    }
+
+    public void testSingleLang_DE() throws Exception {
+        wiktionaryTestSingleLang("SingleLang_DE.quickdic", "DE", 100);
+    }
+
+    public void testSingleLang_IT() throws Exception {
+        wiktionaryTestSingleLang("SingleLang_IT.quickdic", "IT", 100);
+    }
+
+    public void testSingleLang_FR() throws Exception {
+        wiktionaryTestSingleLang("SingleLang_FR.quickdic", "FR", 100);
+    }
+
+    public void wiktionaryTestSingleLang(final String name, final String langCode, final int pageLimit) throws Exception {
+        final File result = new File(TEST_OUTPUTS + name);
+        System.out.println("Writing to: " + result);
+        DictionaryBuilder.main(new String[] {
+                                   "--dictOut=" + result.getAbsolutePath(),
+                                   "--lang1=" + langCode,
+                                   "--lang1Stoplist=" + STOPLISTS + "empty.txt",
+                                   "--dictInfo=SomeWikiDataWholeSection",
+                                   "--input4=" + WIKISPLIT + langCode.toLowerCase() + "/" + langCode + ".data",
+                                   "--input4Name=" + name,
+                                   "--input4Format=" + WholeSectionToHtmlParser.NAME,
+                                   "--input4WiktionaryLang=" + langCode,
+                                   "--input4SkipLang=" + langCode,
+                                   "--input4TitleIndex=" + "1",
+                                   "--input4PageLimit=" + pageLimit,
+                                   "--print=" + result.getPath() + ".text",
+                               });
+        checkGolden(name, result);
+    }
+
+    //-----------------------------------------------------------------
+
+    public void testWiktionary_IT_EN() throws Exception {
+        wiktionaryTestWithLangToEn("wiktionary.it_en.quickdic", "IT", "it.txt",
+                                   "EN.data", "enwiktionary.english", "Italian", "it", 1000);
+    }
+
+    public void testWiktionary_cmn_EN() throws Exception {
+        wiktionaryTestWithLangToEn("wiktionary.cmn_en.quickdic", "cmn", "empty.txt",
+                                   // These missing "e" prevents a complete match, forcing the name to be printed
+                                   "EN.data", "enwiktionary.english", "Chinese|Mandarin", "cmn", 1000);
+    }
+
+    public void testWiktionary_DE_EN() throws Exception {
+        wiktionaryTestWithLangToEn("wiktionary.de_en.quickdic", "DE", "de.txt",
+                                   "EN.data", "enwiktionary.english", "German", "de", 1000);
+    }
+
+    public void testWiktionary_IT_IT() throws Exception {
+        wiktionaryTestWithLangToEn("wiktionary.it_it.quickdic", "IT", "it.txt",
+                                   "IT.data", "enwiktionary.italian", "Italian", "it", 1000);
     }
 
-  //-----------------------------------------------------------------
-
-  public void testWiktionary_IT_EN() throws Exception {
-    wiktionaryTestWithLangToEn("wiktionary.it_en.quickdic", "IT", "it.txt",
-        "EN.data", "enwiktionary.english", "Italian", "it", 1000);
-  }
-
-  public void testWiktionary_cmn_EN() throws Exception {
-    wiktionaryTestWithLangToEn("wiktionary.cmn_en.quickdic", "cmn", "empty.txt",
-        // These missing "e" prevents a complete match, forcing the name to be printed
-        "EN.data", "enwiktionary.english", "Chinese|Mandarin", "cmn", 1000);
-  }
-
-  public void testWiktionary_DE_EN() throws Exception {
-    wiktionaryTestWithLangToEn("wiktionary.de_en.quickdic", "DE", "de.txt",
-        "EN.data", "enwiktionary.english", "German", "de", 1000);
-  }
-
-  public void testWiktionary_IT_IT() throws Exception {
-    wiktionaryTestWithLangToEn("wiktionary.it_it.quickdic", "IT", "it.txt",
-        "IT.data", "enwiktionary.italian", "Italian", "it", 1000);
-  }
-
-  // French
-  public void testWiktionary_FR_FR() throws Exception {
-    wiktionaryTestWithLangToEn("wiktionary.fr_fr.quickdic", "FR", "fr.txt",
-        "FR.data", "enwiktionary.french", "French", "fr", 1000);
-  }
-
-  
-  // Arabic
-  public void testWiktionary_AR_AR() throws Exception {
-      // Arabic is really big for some reason, use fewer pages.
-    wiktionaryTestWithLangToEn("wiktionary.ar_ar.quickdic", "AR", "empty.txt",
-        "AR.data", "enwiktionary.arabic", "Arabic", "ar", 200);
-  }
-
-  // Chinese
-  public void testWiktionary_cmn_cmn() throws Exception {
-    wiktionaryTestWithLangToEn("wiktionary.cmn_cmn.quickdic", "cmn", "empty.txt",
-        // These missing "e" prevents a complete match, forcing the name to be printed.
-        "cmn.data", "enwiktionary.chinese", "Chinese|Mandarin", "cmn", 1000);
-  }
-
-  // German
-  public void testWiktionary_DE_DE() throws Exception {
-    wiktionaryTestWithLangToEn("wiktionary.de_de.quickdic", "DE", "de.txt",
-        "DE.data", "enwiktionary.german", "German", "de", 1000);
-  }
-
-  // Thai
-  public void testWiktionary_TH_TH() throws Exception {
-    wiktionaryTestWithLangToEn("wiktionary.th_th.quickdic", "TH", "empty.txt",
-        // These missing "e" prevents a complete match, forcing the name to be printed.
-        "TH.data", "enwiktionary.thai", "Thai", "th", 1000);
-  }
-
-  public void wiktionaryTestWithLangToEn(final String name, final String lang1,
-      final String stoplist, final String data, final String dictName,
-      final String langPattern, final String langCode, int pageLimit) throws Exception {
-    final File result = new File(TEST_OUTPUTS + name);
-    System.out.println("Writing to: " + result);
-    final String type = data.equals("EN.data") ? "EnToTranslation" : "EnForeign";
-    DictionaryBuilder.main(new String[] {
-        "--dictOut=" + result.getAbsolutePath(),
-        "--lang1=" + lang1,
-        "--lang2=EN",
-        "--lang1Stoplist=" + STOPLISTS + stoplist,
-        "--lang2Stoplist=" + STOPLISTS + "en.txt",
-        "--dictInfo=SomeWikiData",
-
-        "--input4=" + WIKISPLIT_EN + data,
-        "--input4Name=" + dictName,
-        "--input4Format=enwiktionary",
-        "--input4WiktionaryType=" + type,
-        "--input4LangPattern=" + langPattern,
-        "--input4LangCodePattern=" + langCode,
-        "--input4EnIndex=2",
-        "--input4PageLimit=" + pageLimit,
-
-        "--print=" + result.getPath() + ".text",
-    });
-    
-    checkGolden(name, result); 
-  }
-
-  public void testGermanCombined() throws Exception {
-    final String name = "de-en.quickdic";
-    final File result = new File(TEST_OUTPUTS + name);
-    System.out.println("Writing to: " + result);
-    DictionaryBuilder.main(new String[] {
-        "--dictOut=" + result.getAbsolutePath(),
-        "--lang1=DE",
-        "--lang2=EN",
-        "--dictInfo=@" + TEST_INPUTS + "de-en_dictInfo.txt",
-
-        "--input1=" + TEST_INPUTS + "de-en_chemnitz_100",
-        "--input1Name=chemnitz",
-        "--input1Charset=UTF8",
-        "--input1Format=chemnitz",
-
-        "--input2=" + TEST_INPUTS + "de-en_dictcc_simulated",
-        "--input2Name=dictcc",
-        "--input2Charset=UTF8",
-        "--input2Format=tab_separated",
-
-        "--print=" + result.getPath() + ".text",
-    });
-    
-    checkGolden(name, result); 
-  }
-
-  public void testItalianTurkish() throws Exception {
-      final String name = "it-tr_dictcc.quickdic";
-      final File result = new File(TEST_OUTPUTS + name);
-      System.out.println("Writing to: " + result);
-      DictionaryBuilder.main(new String[] {
-          "--dictOut=" + result.getAbsolutePath(),
-          "--lang1=IT",
-          "--lang2=TR",
-          "--dictInfo=it-tr_dictcc_simulated",
-
-          "--input1=" + TEST_INPUTS + "it-tr_dictcc_simulated.txt",
-          "--input1Name=dictcc",
-          "--input1Charset=UTF8",
-          "--input1Format=tab_separated",
-
-          "--print=" + result.getPath() + ".text",
-      });
-      
-      checkGolden(name, result); 
+    // French
+    public void testWiktionary_FR_FR() throws Exception {
+        wiktionaryTestWithLangToEn("wiktionary.fr_fr.quickdic", "FR", "fr.txt",
+                                   "FR.data", "enwiktionary.french", "French", "fr", 1000);
     }
 
-  private void checkGolden(final String dictName, final File dictFile)
-      throws IOException, FileNotFoundException {
-    // Check it once:
-    assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text");
 
-    // Check it again.
-    final Dictionary dict = new Dictionary(new RandomAccessFile(dictFile.getAbsolutePath(), "r"));
-    final PrintStream out = new PrintStream(new File(dictFile.getPath() + ".text"));
-    dict.print(out);
-    out.close();
-    assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text");
-  }
+    // Arabic
+    public void testWiktionary_AR_AR() throws Exception {
+        // Arabic is really big for some reason, use fewer pages.
+        wiktionaryTestWithLangToEn("wiktionary.ar_ar.quickdic", "AR", "empty.txt",
+                                   "AR.data", "enwiktionary.arabic", "Arabic", "ar", 200);
+    }
 
+    // Chinese
+    public void testWiktionary_cmn_cmn() throws Exception {
+        wiktionaryTestWithLangToEn("wiktionary.cmn_cmn.quickdic", "cmn", "empty.txt",
+                                   // These missing "e" prevents a complete match, forcing the name to be printed.
+                                   "cmn.data", "enwiktionary.chinese", "Chinese|Mandarin", "cmn", 1000);
+    }
+
+    // German
+    public void testWiktionary_DE_DE() throws Exception {
+        wiktionaryTestWithLangToEn("wiktionary.de_de.quickdic", "DE", "de.txt",
+                                   "DE.data", "enwiktionary.german", "German", "de", 1000);
+    }
+
+    // Thai
+    public void testWiktionary_TH_TH() throws Exception {
+        wiktionaryTestWithLangToEn("wiktionary.th_th.quickdic", "TH", "empty.txt",
+                                   // These missing "e" prevents a complete match, forcing the name to be printed.
+                                   "TH.data", "enwiktionary.thai", "Thai", "th", 1000);
+    }
+
+    public void wiktionaryTestWithLangToEn(final String name, final String lang1,
+                                           final String stoplist, final String data, final String dictName,
+                                           final String langPattern, final String langCode, int pageLimit) throws Exception {
+        final File result = new File(TEST_OUTPUTS + name);
+        System.out.println("Writing to: " + result);
+        final String type = data.equals("EN.data") ? "EnToTranslation" : "EnForeign";
+        DictionaryBuilder.main(new String[] {
+                                   "--dictOut=" + result.getAbsolutePath(),
+                                   "--lang1=" + lang1,
+                                   "--lang2=EN",
+                                   "--lang1Stoplist=" + STOPLISTS + stoplist,
+                                   "--lang2Stoplist=" + STOPLISTS + "en.txt",
+                                   "--dictInfo=SomeWikiData",
+
+                                   "--input4=" + WIKISPLIT_EN + data,
+                                   "--input4Name=" + dictName,
+                                   "--input4Format=enwiktionary",
+                                   "--input4WiktionaryType=" + type,
+                                   "--input4LangPattern=" + langPattern,
+                                   "--input4LangCodePattern=" + langCode,
+                                   "--input4EnIndex=2",
+                                   "--input4PageLimit=" + pageLimit,
+
+                                   "--print=" + result.getPath() + ".text",
+                               });
+
+        checkGolden(name, result);
+    }
+
+    public void testGermanCombined() throws Exception {
+        final String name = "de-en.quickdic";
+        final File result = new File(TEST_OUTPUTS + name);
+        System.out.println("Writing to: " + result);
+        DictionaryBuilder.main(new String[] {
+                                   "--dictOut=" + result.getAbsolutePath(),
+                                   "--lang1=DE",
+                                   "--lang2=EN",
+                                   "--dictInfo=@" + TEST_INPUTS + "de-en_dictInfo.txt",
+
+                                   "--input1=" + TEST_INPUTS + "de-en_chemnitz_100",
+                                   "--input1Name=chemnitz",
+                                   "--input1Charset=UTF8",
+                                   "--input1Format=chemnitz",
+
+                                   "--input2=" + TEST_INPUTS + "de-en_dictcc_simulated",
+                                   "--input2Name=dictcc",
+                                   "--input2Charset=UTF8",
+                                   "--input2Format=tab_separated",
+
+                                   "--print=" + result.getPath() + ".text",
+                               });
+
+        checkGolden(name, result);
+    }
+
+    public void testItalianTurkish() throws Exception {
+        final String name = "it-tr_dictcc.quickdic";
+        final File result = new File(TEST_OUTPUTS + name);
+        System.out.println("Writing to: " + result);
+        DictionaryBuilder.main(new String[] {
+                                   "--dictOut=" + result.getAbsolutePath(),
+                                   "--lang1=IT",
+                                   "--lang2=TR",
+                                   "--dictInfo=it-tr_dictcc_simulated",
+
+                                   "--input1=" + TEST_INPUTS + "it-tr_dictcc_simulated.txt",
+                                   "--input1Name=dictcc",
+                                   "--input1Charset=UTF8",
+                                   "--input1Format=tab_separated",
+
+                                   "--print=" + result.getPath() + ".text",
+                               });
+
+        checkGolden(name, result);
+    }
+
+    private void checkGolden(final String dictName, final File dictFile)
+    throws IOException, FileNotFoundException {
+        // Check it once:
+        assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text");
+
+        // Check it again.
+        final Dictionary dict = new Dictionary(new RandomAccessFile(dictFile.getAbsolutePath(), "r"));
+        final PrintStream out = new PrintStream(new File(dictFile.getPath() + ".text"));
+        dict.print(out);
+        out.close();
+        assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text");
+    }
+
+
+    void assertFilesEqual(final String expected, final String actual) throws IOException {
+        final String expectedString = FileUtil.readToString(new File(expected));
+        final String actualString = FileUtil.readToString(new File(actual));
+        assertEquals(expectedString, actualString);
+    }
 
-  void assertFilesEqual(final String expected, final String actual) throws IOException {
-    final String expectedString = FileUtil.readToString(new File(expected));
-    final String actualString = FileUtil.readToString(new File(actual));
-    assertEquals(expectedString, actualString);
-  }
 
-  
 }
index 23747e196dd8f8c13dea9da6f74093d5f580e72f..16db7237a3e7b762dfdd30fddda1cd30d7fd8825 100644 (file)
@@ -28,66 +28,66 @@ import com.hughes.util.CollectionUtil;
 
 
 public class DictionaryTest extends TestCase {
-  
-  static final String TEST_OUTPUTS = com.hughes.android.dictionary.engine.DictionaryBuilderTest.TEST_OUTPUTS;
-  public static final String OUTPUTS = "data/outputs/";
-
-  @Override
-  protected void setUp() {
-    while (!TransliteratorManager.init(null)) {
-      try {
-        Thread.sleep(10);
-      } catch (InterruptedException e) {
-        e.printStackTrace();
-      }
-    }
-  }
-  
-  public void testURLFormatting() {
-  }
-
-  public void testEnItWiktionary() throws IOException {
-    final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "EN-IT.quickdic", "r");
-    final Dictionary dict = new Dictionary(raf);
-    final Index enIndex = dict.indices.get(0);
-    
-    final RowBase row = enIndex.rows.get(4);
-    assertEquals("-ical", row.getRawText(false));
-    
-    final Index itIndex = dict.indices.get(1);
-    {
-    final List<RowBase> rows = itIndex.multiWordSearch("come mai", Arrays.asList("come", "mai"), new AtomicBoolean(false));
-    System.out.println(CollectionUtil.join(rows, "\n  "));
-    assertTrue(rows.toString(), rows.size() > 0);
-    assertTrue(rows.get(0).toString().startsWith("come mai@"));
-    assertTrue(rows.get(0) instanceof TokenRow);
-    assertTrue(!((TokenRow)rows.get(0)).getIndexEntry().htmlEntries.isEmpty());
-    }
 
-    {
-    final List<RowBase> rows = itIndex.multiWordSearch("buon g", Arrays.asList("buon", "g"), new AtomicBoolean(false));
-    System.out.println(CollectionUtil.join(rows, "\n  "));
-    assertTrue(rows.toString(), rows.size() > 0);
-    assertTrue(rows.get(0).toString().startsWith("buon giorno@"));
-    assertTrue(rows.get(0) instanceof TokenRow);
-    assertTrue(!((TokenRow)rows.get(0)).getIndexEntry().htmlEntries.isEmpty());
+    static final String TEST_OUTPUTS = com.hughes.android.dictionary.engine.DictionaryBuilderTest.TEST_OUTPUTS;
+    public static final String OUTPUTS = "data/outputs/";
+
+    @Override
+    protected void setUp() {
+        while (!TransliteratorManager.init(null)) {
+            try {
+                Thread.sleep(10);
+            } catch (InterruptedException e) {
+                e.printStackTrace();
+            }
+        }
     }
 
-    {
-        final IndexEntry searchResult = itIndex.findInsertionPoint("azzurro", new AtomicBoolean(
-                false));
-        HtmlEntry htmlEntry = searchResult.htmlEntries.get(0);
-        System.out.println("azzurro:\n" + htmlEntry.getHtml());
+    public void testURLFormatting() {
     }
 
-    raf.close();
-  }
+    public void testEnItWiktionary() throws IOException {
+        final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "EN-IT.quickdic", "r");
+        final Dictionary dict = new Dictionary(raf);
+        final Index enIndex = dict.indices.get(0);
+
+        final RowBase row = enIndex.rows.get(4);
+        assertEquals("-ical", row.getRawText(false));
+
+        final Index itIndex = dict.indices.get(1);
+        {
+            final List<RowBase> rows = itIndex.multiWordSearch("come mai", Arrays.asList("come", "mai"), new AtomicBoolean(false));
+            System.out.println(CollectionUtil.join(rows, "\n  "));
+            assertTrue(rows.toString(), rows.size() > 0);
+            assertTrue(rows.get(0).toString().startsWith("come mai@"));
+            assertTrue(rows.get(0) instanceof TokenRow);
+            assertTrue(!((TokenRow)rows.get(0)).getIndexEntry().htmlEntries.isEmpty());
+        }
+
+        {
+            final List<RowBase> rows = itIndex.multiWordSearch("buon g", Arrays.asList("buon", "g"), new AtomicBoolean(false));
+            System.out.println(CollectionUtil.join(rows, "\n  "));
+            assertTrue(rows.toString(), rows.size() > 0);
+            assertTrue(rows.get(0).toString().startsWith("buon giorno@"));
+            assertTrue(rows.get(0) instanceof TokenRow);
+            assertTrue(!((TokenRow)rows.get(0)).getIndexEntry().htmlEntries.isEmpty());
+        }
+
+        {
+            final IndexEntry searchResult = itIndex.findInsertionPoint("azzurro", new AtomicBoolean(
+                                                false));
+            HtmlEntry htmlEntry = searchResult.htmlEntries.get(0);
+            System.out.println("azzurro:\n" + htmlEntry.getHtml());
+        }
+
+        raf.close();
+    }
 
 //  public void testFr() throws IOException {
 //      final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "FR.quickdic", "r");
 //      final Dictionary dict = new Dictionary(raf);
 //      final Index frIndex = dict.indices.get(0);
-//      
+//
 //      // Now they're all cached, we shouldn't have to search.
 //      for (final IndexEntry indexEntry : frIndex.sortedIndexEntries) {
 //          System.out.println(indexEntry.token);
@@ -96,302 +96,302 @@ public class DictionaryTest extends TestCase {
 //      raf.close();
 //  }
 
-  
-  public void testDeEnWiktionary() throws IOException {
-      final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "DE-EN.quickdic", "r");
-      final Dictionary dict = new Dictionary(raf);
-            
-      final Index deIndex = dict.indices.get(0);
-
-      {
-          final IndexEntry searchResult = deIndex.findInsertionPoint("rot", new AtomicBoolean(
-                  false));
-          HtmlEntry htmlEntry = searchResult.htmlEntries.get(0);
-          System.out.println("rot:\n" + htmlEntry.getHtml());
-      }
-
-      raf.close();
-    }
 
-  public void testGermanMetadata() throws IOException {
-    final RandomAccessFile raf = new RandomAccessFile(TEST_OUTPUTS + "de-en.quickdic", "r");
-    final Dictionary dict = new Dictionary(raf);
-    final Index deIndex = dict.indices.get(0);
-    
-    assertEquals("DE", deIndex.shortName);
-    assertEquals("DE->EN", deIndex.longName);
-    
-    assertEquals(2, dict.sources.size());
-    assertEquals("chemnitz", dict.sources.get(0).name);
-    assertEquals("dictcc", dict.sources.get(1).name);
-    
-    assertEquals("dictcc", dict.pairEntries.get(0).entrySource.name);
-    assertEquals("chemnitz", dict.pairEntries.get(1).entrySource.name);
-    
-    raf.close();
-  }
-  
-  public void testGermanIndex() throws IOException {
-    final RandomAccessFile raf = new RandomAccessFile(TEST_OUTPUTS + "de-en.quickdic", "r");
-    final Dictionary dict = new Dictionary(raf);
-    final Index deIndex = dict.indices.get(0);
-    
-    for (final Index.IndexEntry indexEntry : deIndex.sortedIndexEntries) {
-      System.out.println("testing: " + indexEntry.token);
-      final IndexEntry searchResult = deIndex.findInsertionPoint(indexEntry.token, new AtomicBoolean(
-          false));
-      assertEquals("Looked up: " + indexEntry.token, indexEntry.token.toLowerCase(), searchResult.token.toLowerCase());
-    }
+    public void testDeEnWiktionary() throws IOException {
+        final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "DE-EN.quickdic", "r");
+        final Dictionary dict = new Dictionary(raf);
 
-    // TODO: maybe if user types capitalization, use it.
-    assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("aaac", new AtomicBoolean(false)));
-    assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("AAAC", new AtomicBoolean(false)));
-    assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("AAAc", new AtomicBoolean(false)));
-    assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("aAac", new AtomicBoolean(false)));
-
-    // Before the beginning.
-    assertSearchResult("40", "40" /* special case */, deIndex.findInsertionPoint("", new AtomicBoolean(false)));
-    assertSearchResult("40", "40" /* special case */, deIndex.findInsertionPoint("__", new AtomicBoolean(false)));
-    
-    // After the end.
-    assertSearchResult("Zweckorientiertheit", "zählen", deIndex.findInsertionPoint("ZZZZZ", new AtomicBoolean(false)));
-
-    assertSearchResult("ab", "aaac", deIndex.findInsertionPoint("aaaca", new AtomicBoolean(false)));
-    assertSearchResult("machen", "machen", deIndex.findInsertionPoint("m", new AtomicBoolean(false)));
-    assertSearchResult("machen", "machen", deIndex.findInsertionPoint("macdddd", new AtomicBoolean(false)));
-
-
-    assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("ueberprüfe", new AtomicBoolean(false)));
-    assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("ueberpruefe", new AtomicBoolean(false)));
-
-    assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("ueberpBLEH", new AtomicBoolean(false)));
-    assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("überprBLEH", new AtomicBoolean(false)));
-
-    assertSearchResult("überprüfen", "überprüfe", deIndex.findInsertionPoint("überprüfeBLEH", new AtomicBoolean(false)));
-
-    // Check that search in lowercase works.
-    assertSearchResult("Alibi", "Alibi", deIndex.findInsertionPoint("alib", new AtomicBoolean(false)));
-    System.out.println(deIndex.findInsertionPoint("alib", new AtomicBoolean(false)).toString());
-    
-    raf.close();
-  }
-  
-  private void assertSearchResult(final String insertionPoint, final String longestPrefix,
-      final IndexEntry actual) {
-    assertEquals(insertionPoint, actual.token);
-  }
-
-  public void testGermanTokenRows() throws IOException {
-    final RandomAccessFile raf = new RandomAccessFile(TEST_OUTPUTS + "de-en.quickdic", "r");
-    final Dictionary dict = new Dictionary(raf);
-    final Index deIndex = dict.indices.get(0);
-    
-    // Pre-cache a few of these, just to make sure that's working.
-    for (int i = 0; i < deIndex.rows.size(); i += 7) {
-      deIndex.rows.get(i).getTokenRow(true);
-    }
-    
-    // Do the exhaustive searching.
-    TokenRow lastTokenRow = null;
-    for (final RowBase row : deIndex.rows) {
-      if (row instanceof TokenRow) {
-        lastTokenRow = (TokenRow) row;
-      }
-      assertEquals(lastTokenRow, row.getTokenRow(true));
-    }
+        final Index deIndex = dict.indices.get(0);
 
-    // Now they're all cached, we shouldn't have to search.
-    for (final RowBase row : deIndex.rows) {
-      if (row instanceof TokenRow) {
-        lastTokenRow = (TokenRow) row;
-      }
-      // This will break if the Row cache isn't big enough.
-      assertEquals(lastTokenRow, row.getTokenRow(false));
+        {
+            final IndexEntry searchResult = deIndex.findInsertionPoint("rot", new AtomicBoolean(
+                                                false));
+            HtmlEntry htmlEntry = searchResult.htmlEntries.get(0);
+            System.out.println("rot:\n" + htmlEntry.getHtml());
+        }
+
+        raf.close();
     }
-    
-    raf.close();
-  }
-  
-  public void testChemnitz() throws IOException {
-    final RandomAccessFile raf = new RandomAccessFile(TEST_OUTPUTS + "de-en.quickdic", "r");
-    final Dictionary dict = new Dictionary(raf);
-    final Index deIndex = dict.indices.get(0);
-    
-    assertSearchResult("Höschen", "Hos", deIndex.findInsertionPoint("Hos", new AtomicBoolean(false)));
-    assertSearchResult("Höschen", "hos", deIndex.findInsertionPoint("hos", new AtomicBoolean(false)));
-    
-    raf.close();
-  }
-
-  public void testMultiSearch() throws IOException {
-    final RandomAccessFile raf = new RandomAccessFile(TEST_OUTPUTS + "de-en.quickdic", "r");
-    final Dictionary dict = new Dictionary(raf);
-    final Index deIndex = dict.indices.get(0);
-
-    {
-    final List<RowBase> rows = deIndex.multiWordSearch("aaa aaab", Arrays.asList("aaa", "aaab"), new AtomicBoolean(false));
-    System.out.println(CollectionUtil.join(rows, "\n  "));
-    assertTrue(rows.toString(), rows.size() > 0);
+
+    public void testGermanMetadata() throws IOException {
+        final RandomAccessFile raf = new RandomAccessFile(TEST_OUTPUTS + "de-en.quickdic", "r");
+        final Dictionary dict = new Dictionary(raf);
+        final Index deIndex = dict.indices.get(0);
+
+        assertEquals("DE", deIndex.shortName);
+        assertEquals("DE->EN", deIndex.longName);
+
+        assertEquals(2, dict.sources.size());
+        assertEquals("chemnitz", dict.sources.get(0).name);
+        assertEquals("dictcc", dict.sources.get(1).name);
+
+        assertEquals("dictcc", dict.pairEntries.get(0).entrySource.name);
+        assertEquals("chemnitz", dict.pairEntries.get(1).entrySource.name);
+
+        raf.close();
     }
-    
-    raf.close();
-  }
-  
-  public void testMultiSearchIt() throws IOException {
-      final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "IT.quickdic", "r");
-      final Dictionary dict = new Dictionary(raf);
-      final Index index = dict.indices.get(0);
-
-      {
-      final List<RowBase> rows = index.multiWordSearch("fare centro", 
-              Arrays.asList("fare", "centro"), new AtomicBoolean(false));
-      System.out.println(CollectionUtil.join(rows, "\n  "));
-      assertTrue(rows.toString(), rows.size() > 0);
-      assertTrue(rows.get(0).toString().startsWith("fare centro@"));
-      }
-  }
-
-  public void testMultiSearchDeBig() throws IOException {
-    final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "DE-EN.quickdic", "r");
-    final Dictionary dict = new Dictionary(raf);
-    final Index enIndex = dict.indices.get(1);
-
-    {
-    final List<RowBase> rows = enIndex.multiWordSearch("train station", Arrays.asList("train", "station"), new AtomicBoolean(false));
-    System.out.println(CollectionUtil.join(rows, "\n  "));
-    assertTrue(rows.toString(), rows.size() > 0);
-    assertTrue(rows.get(0).toString().startsWith("train station@"));
+
+    public void testGermanIndex() throws IOException {
+        final RandomAccessFile raf = new RandomAccessFile(TEST_OUTPUTS + "de-en.quickdic", "r");
+        final Dictionary dict = new Dictionary(raf);
+        final Index deIndex = dict.indices.get(0);
+
+        for (final Index.IndexEntry indexEntry : deIndex.sortedIndexEntries) {
+            System.out.println("testing: " + indexEntry.token);
+            final IndexEntry searchResult = deIndex.findInsertionPoint(indexEntry.token, new AtomicBoolean(
+                                                false));
+            assertEquals("Looked up: " + indexEntry.token, indexEntry.token.toLowerCase(), searchResult.token.toLowerCase());
+        }
+
+        // TODO: maybe if user types capitalization, use it.
+        assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("aaac", new AtomicBoolean(false)));
+        assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("AAAC", new AtomicBoolean(false)));
+        assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("AAAc", new AtomicBoolean(false)));
+        assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("aAac", new AtomicBoolean(false)));
+
+        // Before the beginning.
+        assertSearchResult("40", "40" /* special case */, deIndex.findInsertionPoint("", new AtomicBoolean(false)));
+        assertSearchResult("40", "40" /* special case */, deIndex.findInsertionPoint("__", new AtomicBoolean(false)));
+
+        // After the end.
+        assertSearchResult("Zweckorientiertheit", "zählen", deIndex.findInsertionPoint("ZZZZZ", new AtomicBoolean(false)));
+
+        assertSearchResult("ab", "aaac", deIndex.findInsertionPoint("aaaca", new AtomicBoolean(false)));
+        assertSearchResult("machen", "machen", deIndex.findInsertionPoint("m", new AtomicBoolean(false)));
+        assertSearchResult("machen", "machen", deIndex.findInsertionPoint("macdddd", new AtomicBoolean(false)));
+
+
+        assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("ueberprüfe", new AtomicBoolean(false)));
+        assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("ueberpruefe", new AtomicBoolean(false)));
+
+        assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("ueberpBLEH", new AtomicBoolean(false)));
+        assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("überprBLEH", new AtomicBoolean(false)));
+
+        assertSearchResult("überprüfen", "überprüfe", deIndex.findInsertionPoint("überprüfeBLEH", new AtomicBoolean(false)));
+
+        // Check that search in lowercase works.
+        assertSearchResult("Alibi", "Alibi", deIndex.findInsertionPoint("alib", new AtomicBoolean(false)));
+        System.out.println(deIndex.findInsertionPoint("alib", new AtomicBoolean(false)).toString());
+
+        raf.close();
     }
 
-    {
-    final List<RowBase> rows = enIndex.multiWordSearch("a train station", Arrays.asList("a", "train", "station"), new AtomicBoolean(false));
-    System.out.println(CollectionUtil.join(rows, "\n  "));
-    assertTrue(rows.toString(), rows.size() > 0);
-    assertEquals("Bahnhofsuhr {{de-noun|g=f|plural=Bahnhofsuhren}}\tstation clock (at a train station)", rows.get(0).toString());
+    private void assertSearchResult(final String insertionPoint, final String longestPrefix,
+                                    final IndexEntry actual) {
+        assertEquals(insertionPoint, actual.token);
     }
 
-    {
-    final List<RowBase> rows = enIndex.multiWordSearch("a station", Arrays.asList("a", "station"), new AtomicBoolean(false));
-    System.out.println(CollectionUtil.join(rows, "\n  "));
-    assertTrue(rows.toString(), rows.size() > 0);
-    assertEquals("Abfahrthalle {en-noun}\tDeparture room of a station.", rows.get(0).toString());
+    public void testGermanTokenRows() throws IOException {
+        final RandomAccessFile raf = new RandomAccessFile(TEST_OUTPUTS + "de-en.quickdic", "r");
+        final Dictionary dict = new Dictionary(raf);
+        final Index deIndex = dict.indices.get(0);
+
+        // Pre-cache a few of these, just to make sure that's working.
+        for (int i = 0; i < deIndex.rows.size(); i += 7) {
+            deIndex.rows.get(i).getTokenRow(true);
+        }
+
+        // Do the exhaustive searching.
+        TokenRow lastTokenRow = null;
+        for (final RowBase row : deIndex.rows) {
+            if (row instanceof TokenRow) {
+                lastTokenRow = (TokenRow) row;
+            }
+            assertEquals(lastTokenRow, row.getTokenRow(true));
+        }
+
+        // Now they're all cached, we shouldn't have to search.
+        for (final RowBase row : deIndex.rows) {
+            if (row instanceof TokenRow) {
+                lastTokenRow = (TokenRow) row;
+            }
+            // This will break if the Row cache isn't big enough.
+            assertEquals(lastTokenRow, row.getTokenRow(false));
+        }
+
+        raf.close();
     }
 
-    {
-    // Should print: Giving up, too many words with prefix: p
-    final List<RowBase> rows = enIndex.multiWordSearch("p eat", Arrays.asList("p", "eat"), new AtomicBoolean(false));
-    System.out.println(CollectionUtil.join(rows, "\n  "));
-    assertTrue(rows.toString(), rows.size() > 0);
-    assertTrue(rows.toString().contains("verschlingen; verputzen\tto dispatch (eat)"));
+    public void testChemnitz() throws IOException {
+        final RandomAccessFile raf = new RandomAccessFile(TEST_OUTPUTS + "de-en.quickdic", "r");
+        final Dictionary dict = new Dictionary(raf);
+        final Index deIndex = dict.indices.get(0);
+
+        assertSearchResult("Höschen", "Hos", deIndex.findInsertionPoint("Hos", new AtomicBoolean(false)));
+        assertSearchResult("Höschen", "hos", deIndex.findInsertionPoint("hos", new AtomicBoolean(false)));
+
+        raf.close();
     }
 
-    {
-    // Should print: Giving up, too many words with prefix: p
-    final List<RowBase> rows = enIndex.multiWordSearch("p p", Arrays.asList("p", "p"), new AtomicBoolean(false));
-    assertTrue(rows.size() >= 1000);
+    public void testMultiSearch() throws IOException {
+        final RandomAccessFile raf = new RandomAccessFile(TEST_OUTPUTS + "de-en.quickdic", "r");
+        final Dictionary dict = new Dictionary(raf);
+        final Index deIndex = dict.indices.get(0);
+
+        {
+            final List<RowBase> rows = deIndex.multiWordSearch("aaa aaab", Arrays.asList("aaa", "aaab"), new AtomicBoolean(false));
+            System.out.println(CollectionUtil.join(rows, "\n  "));
+            assertTrue(rows.toString(), rows.size() > 0);
+        }
+
+        raf.close();
     }
 
-    {
-    // Should print: Giving up, too many words with prefix: a
-    final List<RowBase> rows = enIndex.multiWordSearch("a a", Arrays.asList("a", "a"), new AtomicBoolean(false));
-    assertTrue(rows.size() >= 1000);
+    public void testMultiSearchIt() throws IOException {
+        final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "IT.quickdic", "r");
+        final Dictionary dict = new Dictionary(raf);
+        final Index index = dict.indices.get(0);
+
+        {
+            final List<RowBase> rows = index.multiWordSearch("fare centro",
+                                       Arrays.asList("fare", "centro"), new AtomicBoolean(false));
+            System.out.println(CollectionUtil.join(rows, "\n  "));
+            assertTrue(rows.toString(), rows.size() > 0);
+            assertTrue(rows.get(0).toString().startsWith("fare centro@"));
+        }
     }
 
-    {
-    // Should print: Giving up, too many words with prefix: a
-    final List<RowBase> rows = enIndex.multiWordSearch("b ba", Arrays.asList("b", "ba"), new AtomicBoolean(false));
-    assertTrue(rows.size() >= 1000);
+    public void testMultiSearchDeBig() throws IOException {
+        final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "DE-EN.quickdic", "r");
+        final Dictionary dict = new Dictionary(raf);
+        final Index enIndex = dict.indices.get(1);
+
+        {
+            final List<RowBase> rows = enIndex.multiWordSearch("train station", Arrays.asList("train", "station"), new AtomicBoolean(false));
+            System.out.println(CollectionUtil.join(rows, "\n  "));
+            assertTrue(rows.toString(), rows.size() > 0);
+            assertTrue(rows.get(0).toString().startsWith("train station@"));
+        }
+
+        {
+            final List<RowBase> rows = enIndex.multiWordSearch("a train station", Arrays.asList("a", "train", "station"), new AtomicBoolean(false));
+            System.out.println(CollectionUtil.join(rows, "\n  "));
+            assertTrue(rows.toString(), rows.size() > 0);
+            assertEquals("Bahnhofsuhr {{de-noun|g=f|plural=Bahnhofsuhren}}\tstation clock (at a train station)", rows.get(0).toString());
+        }
+
+        {
+            final List<RowBase> rows = enIndex.multiWordSearch("a station", Arrays.asList("a", "station"), new AtomicBoolean(false));
+            System.out.println(CollectionUtil.join(rows, "\n  "));
+            assertTrue(rows.toString(), rows.size() > 0);
+            assertEquals("Abfahrthalle {en-noun}\tDeparture room of a station.", rows.get(0).toString());
+        }
+
+        {
+            // Should print: Giving up, too many words with prefix: p
+            final List<RowBase> rows = enIndex.multiWordSearch("p eat", Arrays.asList("p", "eat"), new AtomicBoolean(false));
+            System.out.println(CollectionUtil.join(rows, "\n  "));
+            assertTrue(rows.toString(), rows.size() > 0);
+            assertTrue(rows.toString().contains("verschlingen; verputzen\tto dispatch (eat)"));
+        }
+
+        {
+            // Should print: Giving up, too many words with prefix: p
+            final List<RowBase> rows = enIndex.multiWordSearch("p p", Arrays.asList("p", "p"), new AtomicBoolean(false));
+            assertTrue(rows.size() >= 1000);
+        }
+
+        {
+            // Should print: Giving up, too many words with prefix: a
+            final List<RowBase> rows = enIndex.multiWordSearch("a a", Arrays.asList("a", "a"), new AtomicBoolean(false));
+            assertTrue(rows.size() >= 1000);
+        }
+
+        {
+            // Should print: Giving up, too many words with prefix: a
+            final List<RowBase> rows = enIndex.multiWordSearch("b ba", Arrays.asList("b", "ba"), new AtomicBoolean(false));
+            assertTrue(rows.size() >= 1000);
+        }
+
+        {
+            // Should print: Giving up, too many words with prefix: a
+            final List<RowBase> rows = enIndex.multiWordSearch("b ba", Arrays.asList("b", "ba"), new AtomicBoolean(false));
+            assertTrue(rows.size() >= 1000);
+        }
+
+        raf.close();
     }
 
-    {
-    // Should print: Giving up, too many words with prefix: a
-    final List<RowBase> rows = enIndex.multiWordSearch("b ba", Arrays.asList("b", "ba"), new AtomicBoolean(false));
-    assertTrue(rows.size() >= 1000);
+    public void testMultiSearchBigAF() throws IOException {
+        final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "AF-EN.quickdic", "r");
+        final Dictionary dict = new Dictionary(raf);
+        final Index enIndex = dict.indices.get(1);
+
+        {
+            final List<RowBase> rows = enIndex.multiWordSearch("pig eats", Arrays.asList("pig", "eats"), new AtomicBoolean(false));
+            System.out.println(CollectionUtil.join(rows, "\n  "));
+            assertTrue(rows.toString(), rows.size() > 0);
+            assertEquals("vark\tpig (someone who overeats or eats rapidly) (noun)", rows.get(0).toString());
+        }
+
+        {
+            final List<RowBase> rows = enIndex.multiWordSearch("pig eat", Arrays.asList("pig", "eat"), new AtomicBoolean(false));
+            System.out.println(CollectionUtil.join(rows, "\n  "));
+            assertTrue(rows.toString(), rows.size() > 0);
+            assertEquals("vark\tpig (someone who overeats or eats rapidly) (noun)", rows.get(0).toString());
+        }
+
+        {
+            final List<RowBase> rows = enIndex.multiWordSearch("pi ea", Arrays.asList("pi", "ea"), new AtomicBoolean(false));
+            System.out.println(CollectionUtil.join(rows, "\n  "));
+            assertTrue(rows.toString(), rows.size() > 0);
+            assertTrue(rows.toString().contains("vark\tpig (someone who overeats or eats rapidly) (noun)"));
+        }
+
+        {
+            final List<RowBase> rows = enIndex.multiWordSearch("p eat", Arrays.asList("p", "eat"), new AtomicBoolean(false));
+            System.out.println(CollectionUtil.join(rows, "\n  "));
+            assertTrue(rows.toString(), rows.size() > 0);
+            assertTrue(rows.toString().contains("vark\tpig (someone who overeats or eats rapidly) (noun)"));
+        }
+
+
+        raf.close();
     }
 
-    raf.close();
-  }
 
-  public void testMultiSearchBigAF() throws IOException {
-    final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "AF-EN.quickdic", "r");
-    final Dictionary dict = new Dictionary(raf);
-    final Index enIndex = dict.indices.get(1);
+    public void testExactSearch() throws IOException {
+        final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "EN-cmn.quickdic", "r");
+        final Dictionary dict = new Dictionary(raf);
+        final Index cmnIndex = dict.indices.get(1);
 
-    {
-    final List<RowBase> rows = enIndex.multiWordSearch("pig eats", Arrays.asList("pig", "eats"), new AtomicBoolean(false));
-    System.out.println(CollectionUtil.join(rows, "\n  "));
-    assertTrue(rows.toString(), rows.size() > 0);
-    assertEquals("vark\tpig (someone who overeats or eats rapidly) (noun)", rows.get(0).toString());
-    }
+        final Random random = new Random(10);
 
-    {
-    final List<RowBase> rows = enIndex.multiWordSearch("pig eat", Arrays.asList("pig", "eat"), new AtomicBoolean(false));
-    System.out.println(CollectionUtil.join(rows, "\n  "));
-    assertTrue(rows.toString(), rows.size() > 0);
-    assertEquals("vark\tpig (someone who overeats or eats rapidly) (noun)", rows.get(0).toString());
-    }
+        for (int i = 0; i < 1000; ++i) {
+            final int ii = random.nextInt(cmnIndex.sortedIndexEntries.size());
+            final IndexEntry indexEntry = cmnIndex.sortedIndexEntries.get(ii);
+            final IndexEntry found = cmnIndex.findExact(indexEntry.token);
+            assertNotNull(found);
+            assertEquals(indexEntry.token, found.token);
+            assertEquals(indexEntry, found);  // Test of caching....
+        }
 
-    {
-    final List<RowBase> rows = enIndex.multiWordSearch("pi ea", Arrays.asList("pi", "ea"), new AtomicBoolean(false));
-    System.out.println(CollectionUtil.join(rows, "\n  "));
-    assertTrue(rows.toString(), rows.size() > 0);
-    assertTrue(rows.toString().contains("vark\tpig (someone who overeats or eats rapidly) (noun)"));
+        raf.close();
     }
 
-    {
-    final List<RowBase> rows = enIndex.multiWordSearch("p eat", Arrays.asList("p", "eat"), new AtomicBoolean(false));
-    System.out.println(CollectionUtil.join(rows, "\n  "));
-    assertTrue(rows.toString(), rows.size() > 0);
-    assertTrue(rows.toString().contains("vark\tpig (someone who overeats or eats rapidly) (noun)"));
+    public void testThai() throws IOException {
+        final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "EN-TH.quickdic", "r");
+        final Dictionary dict = new Dictionary(raf);
+        final Index thIndex = dict.indices.get(1);
+
+        final IndexEntry entry = thIndex.findInsertionPoint("ดี", new AtomicBoolean(false));
+        assertEquals("di", entry.token);
+
+        raf.close();
     }
 
-    
-    raf.close();
-  }
-
-
-  public void testExactSearch() throws IOException {
-    final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "EN-cmn.quickdic", "r");
-    final Dictionary dict = new Dictionary(raf);
-    final Index cmnIndex = dict.indices.get(1);
-
-    final Random random = new Random(10);
-    
-    for (int i = 0; i < 1000; ++i) {
-      final int ii = random.nextInt(cmnIndex.sortedIndexEntries.size());
-      final IndexEntry indexEntry = cmnIndex.sortedIndexEntries.get(ii);
-      final IndexEntry found = cmnIndex.findExact(indexEntry.token);
-      assertNotNull(found);
-      assertEquals(indexEntry.token, found.token);
-      assertEquals(indexEntry, found);  // Test of caching....
+    public void testNorwegian() throws IOException {
+        final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "EN-NL.quickdic", "r");
+        final Dictionary dict = new Dictionary(raf);
+        final Index nlIndex = dict.indices.get(1);
+
+        IndexEntry entry = nlIndex.findInsertionPoint("Xhosa", new AtomicBoolean(false));
+        assertEquals("Xhosa", entry.token);
+
+        entry = nlIndex.findInsertionPoint("Zyne", new AtomicBoolean(false));
+        assertEquals("Zyne", entry.token);
+
+        raf.close();
     }
-    
-    raf.close();
-  }
-
-  public void testThai() throws IOException {
-    final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "EN-TH.quickdic", "r");
-    final Dictionary dict = new Dictionary(raf);
-    final Index thIndex = dict.indices.get(1);
-
-    final IndexEntry entry = thIndex.findInsertionPoint("ดี", new AtomicBoolean(false));
-    assertEquals("di", entry.token);
-    
-    raf.close();
-  }
-
-  public void testNorwegian() throws IOException {
-      final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "EN-NL.quickdic", "r");
-      final Dictionary dict = new Dictionary(raf);
-      final Index nlIndex = dict.indices.get(1);
-
-      IndexEntry entry = nlIndex.findInsertionPoint("Xhosa", new AtomicBoolean(false));
-      assertEquals("Xhosa", entry.token);
-
-      entry = nlIndex.findInsertionPoint("Zyne", new AtomicBoolean(false));
-      assertEquals("Zyne", entry.token);
-
-      raf.close();
-  }
 
 }
index 0c3fa13da2147e30c0bd9e9f0ab3c0ed2b925bb0..e7e1b43635627d146d263dbf0c5a6ea5d85b1570 100644 (file)
@@ -29,149 +29,150 @@ import com.hughes.android.dictionary.engine.Index.IndexEntry;
 import com.hughes.android.dictionary.parser.DictFileParser;
 
 public class IndexBuilder {
-  
-  final DictionaryBuilder dictionaryBuilder;
-  public final Index index;
-  final Set<String> stoplist;
-
-  final SortedMap<String, TokenData> tokenToData;
-
-  IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language, final String normalizerRules, final Set<String> stoplist, final boolean swapPairEntries) {
-    this.dictionaryBuilder = dictionaryBuilder;
-    index = new Index(dictionaryBuilder.dictionary, shortName, longName, language, normalizerRules, swapPairEntries, stoplist);
-    tokenToData = new TreeMap<String, TokenData>(index.getSortComparator());
-    this.stoplist = stoplist;
-  }
-  
-  public void build() {
-    final Set<IndexedEntry> tokenIndexedEntries = new HashSet<IndexedEntry>();
-    final List<RowBase> rows = index.rows;
-    index.mainTokenCount = 0;
-    for (final TokenData tokenData : tokenToData.values()) {
-      tokenIndexedEntries.clear();
-      final int indexIndex = index.sortedIndexEntries.size();
-      final int startRow = rows.size();
-      
-      TokenRow tokenRow = null;
-      if (!tokenData.htmlEntries.isEmpty()) {
-          tokenRow = new TokenRow(indexIndex, rows.size(), index, tokenData.hasMainEntry);
-          rows.add(tokenRow);
-      }
-      
+
+    final DictionaryBuilder dictionaryBuilder;
+    public final Index index;
+    final Set<String> stoplist;
+
+    final SortedMap<String, TokenData> tokenToData;
+
+    IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language, final String normalizerRules, final Set<String> stoplist, final boolean swapPairEntries) {
+        this.dictionaryBuilder = dictionaryBuilder;
+        index = new Index(dictionaryBuilder.dictionary, shortName, longName, language, normalizerRules, swapPairEntries, stoplist);
+        tokenToData = new TreeMap<String, TokenData>(index.getSortComparator());
+        this.stoplist = stoplist;
+    }
+
+    public void build() {
+        final Set<IndexedEntry> tokenIndexedEntries = new HashSet<IndexedEntry>();
+        final List<RowBase> rows = index.rows;
+        index.mainTokenCount = 0;
+        for (final TokenData tokenData : tokenToData.values()) {
+            tokenIndexedEntries.clear();
+            final int indexIndex = index.sortedIndexEntries.size();
+            final int startRow = rows.size();
+
+            TokenRow tokenRow = null;
+            if (!tokenData.htmlEntries.isEmpty()) {
+                tokenRow = new TokenRow(indexIndex, rows.size(), index, tokenData.hasMainEntry);
+                rows.add(tokenRow);
+            }
+
 //    System.out.println("Added TokenRow: " + rows.get(rows.size() - 1));
-      
-      int numRows = 0;  // off by one--doesn't count the token row!
+
+            int numRows = 0;  // off by one--doesn't count the token row!
 //      System.out.println("TOKEN: " + tokenData.token);
-      for (final Map.Entry<EntryTypeName, List<IndexedEntry>> typeToIndexedEntries : tokenData.typeToEntries.entrySet()) {
-        for (final IndexedEntry indexedEntry : typeToIndexedEntries.getValue()) {
-          if (!indexedEntry.isValid) {
-            continue;
-          }
-          
-          if (tokenRow == null) {
-              tokenRow = new TokenRow(indexIndex, rows.size(), index, tokenData.hasMainEntry);
-              rows.add(tokenRow);
-          }
-          
-          if (indexedEntry.entry.index() == -1) {
-            indexedEntry.entry.addToDictionary(dictionaryBuilder.dictionary);
-            assert indexedEntry.entry.index() >= 0;
-          }
-          if (tokenIndexedEntries.add(indexedEntry) && !tokenData.htmlEntries.contains(indexedEntry.entry)) {
-            rows.add(indexedEntry.entry.CreateRow(rows.size(), index));
-            ++indexedEntry.entry.entrySource.numEntries;
-            ++numRows;
-            
+            for (final Map.Entry<EntryTypeName, List<IndexedEntry>> typeToIndexedEntries : tokenData.typeToEntries.entrySet()) {
+                for (final IndexedEntry indexedEntry : typeToIndexedEntries.getValue()) {
+                    if (!indexedEntry.isValid) {
+                        continue;
+                    }
+
+                    if (tokenRow == null) {
+                        tokenRow = new TokenRow(indexIndex, rows.size(), index, tokenData.hasMainEntry);
+                        rows.add(tokenRow);
+                    }
+
+                    if (indexedEntry.entry.index() == -1) {
+                        indexedEntry.entry.addToDictionary(dictionaryBuilder.dictionary);
+                        assert indexedEntry.entry.index() >= 0;
+                    }
+                    if (tokenIndexedEntries.add(indexedEntry) && !tokenData.htmlEntries.contains(indexedEntry.entry)) {
+                        rows.add(indexedEntry.entry.CreateRow(rows.size(), index));
+                        ++indexedEntry.entry.entrySource.numEntries;
+                        ++numRows;
+
 //            System.out.print("  " + typeToEntry.getKey() + ": ");
-  //          rows.get(rows.size() - 1).print(System.out);
+                        //          rows.get(rows.size() - 1).print(System.out);
 //            System.out.println();
-          }
+                    }
+                }
+            }
+
+            if (tokenRow != null) {
+                if (tokenRow.hasMainEntry) {
+                    index.mainTokenCount++;
+                }
+
+                final Index.IndexEntry indexEntry = new Index.IndexEntry(index, tokenData.token, index
+                        .normalizer().transliterate(tokenData.token), startRow, numRows);
+                indexEntry.htmlEntries.addAll(tokenData.htmlEntries);
+                index.sortedIndexEntries.add(indexEntry);
+            }
+        }
+
+        final List<IndexEntry> entriesSortedByNumRows = new ArrayList<IndexEntry>(index.sortedIndexEntries);
+        Collections.sort(entriesSortedByNumRows, new Comparator<IndexEntry>() {
+            @Override
+            public int compare(IndexEntry object1, IndexEntry object2) {
+                return object2.numRows - object1.numRows;
+            }
+        });
+        System.out.println("Most common tokens:");
+        for (int i = 0; i < 50 && i < entriesSortedByNumRows.size(); ++i) {
+            System.out.println("  " + entriesSortedByNumRows.get(i));
         }
-      }
-      
-      if (tokenRow != null) {
-          if (tokenRow.hasMainEntry) {
-              index.mainTokenCount++;
-          }
-          
-          final Index.IndexEntry indexEntry = new Index.IndexEntry(index, tokenData.token, index
-                  .normalizer().transliterate(tokenData.token), startRow, numRows);
-          indexEntry.htmlEntries.addAll(tokenData.htmlEntries);
-          index.sortedIndexEntries.add(indexEntry);
-      }
     }
-    
-    final List<IndexEntry> entriesSortedByNumRows = new ArrayList<IndexEntry>(index.sortedIndexEntries);
-    Collections.sort(entriesSortedByNumRows, new Comparator<IndexEntry>() {
-      @Override
-      public int compare(IndexEntry object1, IndexEntry object2) {
-        return object2.numRows - object1.numRows;
-      }});
-    System.out.println("Most common tokens:");
-    for (int i = 0; i < 50 && i < entriesSortedByNumRows.size(); ++i) {
-      System.out.println("  " + entriesSortedByNumRows.get(i));
+
+    public static class TokenData {
+        final String token;
+
+        final Map<EntryTypeName, List<IndexedEntry>> typeToEntries = new EnumMap<EntryTypeName, List<IndexedEntry>>(EntryTypeName.class);
+        public boolean hasMainEntry = false;
+
+        public List<HtmlEntry> htmlEntries = new ArrayList<HtmlEntry>();
+
+        TokenData(final String token) {
+            assert token.equals(token.trim());
+            assert token.length() > 0;
+            this.token = token;
+        }
     }
-  }
-  
-  public static class TokenData {
-    final String token;
-        
-    final Map<EntryTypeName, List<IndexedEntry>> typeToEntries = new EnumMap<EntryTypeName, List<IndexedEntry>>(EntryTypeName.class);
-    public boolean hasMainEntry = false;
-    
-    public List<HtmlEntry> htmlEntries = new ArrayList<HtmlEntry>();
-    
-    TokenData(final String token) {
-      assert token.equals(token.trim());
-      assert token.length() > 0;
-      this.token = token;
+
+    public TokenData getOrCreateTokenData(final String token) {
+        TokenData tokenData = tokenToData.get(token);
+        if (tokenData == null) {
+            tokenData = new TokenData(token);
+            tokenToData.put(token, tokenData);
+        }
+        return tokenData;
     }
-  }
 
-  public TokenData getOrCreateTokenData(final String token) {
-    TokenData tokenData = tokenToData.get(token);
-    if (tokenData == null) {
-      tokenData = new TokenData(token);
-      tokenToData.put(token, tokenData);
+    private List<IndexedEntry> getOrCreateEntries(final String token, final EntryTypeName entryTypeName) {
+        final TokenData tokenData = getOrCreateTokenData(token);
+        List<IndexedEntry> entries = tokenData.typeToEntries.get(entryTypeName);
+        if (entryTypeName.mainWord) {
+            tokenData.hasMainEntry = true;
+        }
+        if (entries == null) {
+            entries = new ArrayList<IndexedEntry>();
+            tokenData.typeToEntries.put(entryTypeName, entries);
+        }
+        return entries;
     }
-    return tokenData;
-  }
-
-  private List<IndexedEntry> getOrCreateEntries(final String token, final EntryTypeName entryTypeName) {
-    final TokenData tokenData = getOrCreateTokenData(token);
-    List<IndexedEntry> entries = tokenData.typeToEntries.get(entryTypeName);
-    if (entryTypeName.mainWord) {
-      tokenData.hasMainEntry = true;
+
+    public void addEntryWithTokens(final IndexedEntry indexedEntry, final Set<String> tokens,
+                                   final EntryTypeName entryTypeName) {
+        if (indexedEntry == null) {
+            System.out.println("asdfasdf");
+        }
+        assert indexedEntry != null;
+        for (final String token : tokens) {
+            if (entryTypeName.overridesStopList || !stoplist.contains(token)) {
+                getOrCreateEntries(token, entryTypeName).add(indexedEntry);
+            }
+        }
     }
-    if (entries == null) {
-      entries = new ArrayList<IndexedEntry>();
-      tokenData.typeToEntries.put(entryTypeName, entries);
+
+    public void addEntryWithString(final IndexedEntry indexedEntry, final String untokenizedString,
+                                   final EntryTypeName entryTypeName) {
+        final Set<String> tokens = DictFileParser.tokenize(untokenizedString, DictFileParser.NON_CHAR);
+        addEntryWithTokens(indexedEntry, tokens, tokens.size() == 1 ? entryTypeName.singleWordInstance : entryTypeName);
     }
-    return entries;
-  }
 
-  public void addEntryWithTokens(final IndexedEntry indexedEntry, final Set<String> tokens,
-      final EntryTypeName entryTypeName) {
-    if (indexedEntry == null) {
-      System.out.println("asdfasdf");
+    public void addEntryWithStringNoSingle(final IndexedEntry indexedEntry, final String untokenizedString,
+                                           final EntryTypeName entryTypeName) {
+        final Set<String> tokens = DictFileParser.tokenize(untokenizedString, DictFileParser.NON_CHAR);
+        addEntryWithTokens(indexedEntry, tokens, entryTypeName);
     }
-    assert indexedEntry != null;
-    for (final String token : tokens) {
-      if (entryTypeName.overridesStopList || !stoplist.contains(token)) {
-        getOrCreateEntries(token, entryTypeName).add(indexedEntry);
-      }
-    }    
-  }
-
-  public void addEntryWithString(final IndexedEntry indexedEntry, final String untokenizedString,
-      final EntryTypeName entryTypeName) {
-    final Set<String> tokens = DictFileParser.tokenize(untokenizedString, DictFileParser.NON_CHAR);
-    addEntryWithTokens(indexedEntry, tokens, tokens.size() == 1 ? entryTypeName.singleWordInstance : entryTypeName);
-  }
-
-  public void addEntryWithStringNoSingle(final IndexedEntry indexedEntry, final String untokenizedString,
-      final EntryTypeName entryTypeName) {
-    final Set<String> tokens = DictFileParser.tokenize(untokenizedString, DictFileParser.NON_CHAR);
-    addEntryWithTokens(indexedEntry, tokens, entryTypeName);
-  }
 }
index faf11fd423697b45d24196600a38393ed603ab17..d7089511355b27f5b22995c7bc41eb000842fae9 100644 (file)
@@ -16,10 +16,10 @@ package com.hughes.android.dictionary.engine;
 
 
 public class IndexedEntry {
-  AbstractEntry entry;
-  public boolean isValid = false;
-  
-  public IndexedEntry(final AbstractEntry entry) {
-    this.entry = entry;
-  }
+    AbstractEntry entry;
+    public boolean isValid = false;
+
+    public IndexedEntry(final AbstractEntry entry) {
+        this.entry = entry;
+    }
 }
\ No newline at end of file
index d81ad873ff2f7d9a86b5d141b38fd4bb9dd4daff..24fe094ffe99563e7478ed36167ac1ac9b331a94 100644 (file)
@@ -28,170 +28,170 @@ import com.hughes.android.dictionary.parser.wiktionary.WiktionaryLangs;
 import com.ibm.icu.text.Transliterator;
 
 public class LanguageTest extends TestCase {
-  
-  public void testGermanSort() {
-    final Transliterator normalizer = Transliterator.createFromRules("", Language.de.getDefaultNormalizerRules(), Transliterator.FORWARD);
-    assertEquals("aüääss", normalizer.transform("aueAeAEß"));
-    final List<String> words = Arrays.asList(
-        "er-ben",
-        "erben",
-        "Erben",
-        "Erbse",
-        "Erbsen",
-        "essen",
-        "Essen",
-        "Grosformat",
-        "Grosformats",
-        "Grossformat",
-        "Großformat",
-        "Grossformats",
-        "Großformats",
-        "Großpoo",
-        "Großpoos",
-        "Hörvermögen",
-        "Hörweite",
-        "hos",
-        "Höschen",
-        "Hostel",
-        "hulle",
-        "Hulle",
-        "huelle",
-        "Huelle",
-        "hülle",
-        "Hülle",
-        "Huellen",
-        "Hüllen",
-        "Hum"
-        );
-    final NormalizeComparator comparator = new NormalizeComparator(normalizer, Language.de.getCollator(), 7);
-    assertEquals(1, comparator.compare("hülle", "huelle"));
-    assertEquals(-1, comparator.compare("huelle", "hülle"));
-    
-    assertEquals(-1, comparator.compare("hülle", "Hülle"));
-    
-    assertEquals("hülle", normalizer.transform("Hülle"));
-    assertEquals("hulle", normalizer.transform("Hulle"));
-
-    
-    final List<String> sorted = new ArrayList<String>(words);
+
+    public void testGermanSort() {
+        final Transliterator normalizer = Transliterator.createFromRules("", Language.de.getDefaultNormalizerRules(), Transliterator.FORWARD);
+        assertEquals("aüääss", normalizer.transform("aueAeAEß"));
+        final List<String> words = Arrays.asList(
+                                       "er-ben",
+                                       "erben",
+                                       "Erben",
+                                       "Erbse",
+                                       "Erbsen",
+                                       "essen",
+                                       "Essen",
+                                       "Grosformat",
+                                       "Grosformats",
+                                       "Grossformat",
+                                       "Großformat",
+                                       "Grossformats",
+                                       "Großformats",
+                                       "Großpoo",
+                                       "Großpoos",
+                                       "Hörvermögen",
+                                       "Hörweite",
+                                       "hos",
+                                       "Höschen",
+                                       "Hostel",
+                                       "hulle",
+                                       "Hulle",
+                                       "huelle",
+                                       "Huelle",
+                                       "hülle",
+                                       "Hülle",
+                                       "Huellen",
+                                       "Hüllen",
+                                       "Hum"
+                                   );
+        final NormalizeComparator comparator = new NormalizeComparator(normalizer, Language.de.getCollator(), 7);
+        assertEquals(1, comparator.compare("hülle", "huelle"));
+        assertEquals(-1, comparator.compare("huelle", "hülle"));
+
+        assertEquals(-1, comparator.compare("hülle", "Hülle"));
+
+        assertEquals("hülle", normalizer.transform("Hülle"));
+        assertEquals("hulle", normalizer.transform("Hulle"));
+
+
+        final List<String> sorted = new ArrayList<String>(words);
 //    Collections.shuffle(shuffled, new Random(0));
-    Collections.sort(sorted, comparator);
-    System.out.println(sorted.toString());
-    for (int i = 0; i < words.size(); ++i) {
-      System.out.println(words.get(i) + "\t" + sorted.get(i));
-      assertEquals(words.get(i), sorted.get(i));
+        Collections.sort(sorted, comparator);
+        System.out.println(sorted.toString());
+        for (int i = 0; i < words.size(); ++i) {
+            System.out.println(words.get(i) + "\t" + sorted.get(i));
+            assertEquals(words.get(i), sorted.get(i));
+        }
+    }
+
+    public void testEnglishSort() {
+        final Transliterator normalizer = Transliterator.createFromRules("", Language.en.getDefaultNormalizerRules(), Transliterator.FORWARD);
+
+        final List<String> words = Arrays.asList(
+                                       "pre-print",
+                                       "preppie",
+                                       "preppy",
+                                       "preprocess");
+
+        final List<String> sorted = new ArrayList<String>(words);
+        final NormalizeComparator comparator = new NormalizeComparator(normalizer, Language.en.getCollator(), 7);
+        Collections.sort(sorted, comparator);
+        for (int i = 0; i < words.size(); ++i) {
+            if (i > 0) {
+                assertTrue(comparator.compare(words.get(i-1), words.get(i)) < 0);
+            }
+            System.out.println(words.get(i) + "\t" + sorted.get(i));
+            assertEquals(words.get(i), sorted.get(i));
+        }
+
+        assertTrue(comparator.compare("pre-print", "preppy") < 0);
+
+    }
+
+    public void testLanguage() {
+        assertEquals(Language.de, Language.lookup("de"));
+        assertEquals(Language.en, Language.lookup("en"));
+        assertEquals("es", Language.lookup("es").getIsoCode());
+    }
+
+    public void testTextNorm() {
+        //final Transliterator transliterator = Transliterator.getInstance("Any-Latin; Upper; Lower; 'oe' > 'o'; NFD; [:Nonspacing Mark:] Remove; NFC", Transliterator.FORWARD);
+        final Transliterator transliterator = Transliterator.createFromRules("", ":: Any-Latin; :: Upper; :: Lower; 'oe' > 'o'; :: NFD; :: [:Nonspacing Mark:] Remove; :: NFC ;", Transliterator.FORWARD);
+        assertEquals("hoschen", transliterator.transliterate("Höschen"));
+        assertEquals("hoschen", transliterator.transliterate("Hoeschen"));
+        assertEquals("grosspoo", transliterator.transliterate("Großpoo"));
+
+        assertEquals("kyanpasu", transliterator.transliterate("キャンパス"));
+        assertEquals("alphabetikos katalogos", transliterator.transliterate("Αλφαβητικός Κατάλογος"));
+        assertEquals("biologiceskom", transliterator.transliterate("биологическом"));
+    }
+    public void testHalfTextNorm() {
+        final Transliterator transliterator = Transliterator.createFromRules("", ":: Any-Latin; ' ' > ; :: Lower; ", Transliterator.FORWARD);
+        assertEquals("kyanpasu", transliterator.transliterate("キャンパス"));
+        assertEquals("alphabētikóskatálogos", transliterator.transliterate("Αλφαβητικός Κατάλογος"));
+        assertEquals("biologičeskom", transliterator.transliterate("биологическом"));
+
+        assertEquals("xièxiè", transliterator.transliterate("謝謝"));
+        assertEquals("xièxiè", transliterator.transliterate("谢谢"));
+
+        assertEquals("diànnǎo", transliterator.transliterate("電腦"));
+        assertEquals("diànnǎo", transliterator.transliterate("电脑"));
+        assertEquals("jìsuànjī", transliterator.transliterate("計算機"));
+        assertEquals("jìsuànjī", transliterator.transliterate("计算机"));
+    }
+
+
+    public void testChinese() {
+        final Language cmn = Language.lookup("cmn");
+        final Transliterator transliterator = Transliterator.createFromRules("", cmn.getDefaultNormalizerRules(), Transliterator.FORWARD);
+
+        assertEquals("xiexie", transliterator.transliterate("謝謝"));
+        assertEquals("xiexie", transliterator.transliterate("谢谢"));
+
+        assertEquals("diannao", transliterator.transliterate("電腦"));
+        assertEquals("diannao", transliterator.transliterate("电脑"));
+        assertEquals("jisuanji", transliterator.transliterate("計算機"));
+        assertEquals("jisuanji", transliterator.transliterate("计算机"));
+
+        assertEquals("chengjiu", transliterator.transliterate("成就"));
+
+    }
+
+    public void testArabic() {
+        final Language ar = Language.lookup("ar");
+        final Transliterator transliterator = Transliterator.createFromRules("", ar.getDefaultNormalizerRules(), Transliterator.FORWARD);
+        // These don't seem quite right....
+        assertEquals("haswb", transliterator.transliterate("حاسوب"));
+        assertEquals("kmbywtr", transliterator.transliterate("كمبيوتر"));
+
+        assertEquals("{\u200e كمبيوتر \u200e}", Language.fixBidiText("{كمبيوتر}"));
+        assertEquals("{a=\u200e كمبيوتر \u200e}", Language.fixBidiText("{a=كمبيوتر}"));
+        assertEquals("(\u200e كمبيوتر \u200e)", Language.fixBidiText("(كمبيوتر)"));
+        assertEquals("أنثى أنْثَى (’únθā) {f}, إناث (’ināθ) {p}, اناثى (’anāθā) {p}", Language.fixBidiText("أنثى أنْثَى (’únθā) {f}, إناث (’ināθ) {p}, اناثى (’anāθā) {p}"));
+
     }
-  }
-
-  public void testEnglishSort() {
-    final Transliterator normalizer = Transliterator.createFromRules("", Language.en.getDefaultNormalizerRules(), Transliterator.FORWARD);
-
-    final List<String> words = Arrays.asList(
-        "pre-print", 
-        "preppie", 
-        "preppy",
-        "preprocess");
-    
-    final List<String> sorted = new ArrayList<String>(words);
-    final NormalizeComparator comparator = new NormalizeComparator(normalizer, Language.en.getCollator(), 7);
-    Collections.sort(sorted, comparator);
-    for (int i = 0; i < words.size(); ++i) {
-      if (i > 0) {
-        assertTrue(comparator.compare(words.get(i-1), words.get(i)) < 0);
-      }
-      System.out.println(words.get(i) + "\t" + sorted.get(i));
-      assertEquals(words.get(i), sorted.get(i));
+
+    public void testThai() {
+        final Language th = Language.lookup("TH");
+        final Transliterator transliterator = Transliterator.createFromRules("", th.getDefaultNormalizerRules(), Transliterator.FORWARD);
+        // Not sure these are right, just to know...
+        assertEquals("d", transliterator.transliterate("ด"));
+        assertEquals("di", transliterator.transliterate("ด ี"));
+        assertEquals("dii", transliterator.transliterate("ดีี"));
+
+        assertEquals(Collections.singleton("ดีี"), DictFileParser.tokenize("ดีี", DictFileParser.NON_CHAR));
     }
-    
-    assertTrue(comparator.compare("pre-print", "preppy") < 0);
-
-  }
-  
-  public void testLanguage() {
-    assertEquals(Language.de, Language.lookup("de"));
-    assertEquals(Language.en, Language.lookup("en"));
-    assertEquals("es", Language.lookup("es").getIsoCode());
-  }
-
-  public void testTextNorm() {
-    //final Transliterator transliterator = Transliterator.getInstance("Any-Latin; Upper; Lower; 'oe' > 'o'; NFD; [:Nonspacing Mark:] Remove; NFC", Transliterator.FORWARD);
-    final Transliterator transliterator = Transliterator.createFromRules("", ":: Any-Latin; :: Upper; :: Lower; 'oe' > 'o'; :: NFD; :: [:Nonspacing Mark:] Remove; :: NFC ;", Transliterator.FORWARD);
-    assertEquals("hoschen", transliterator.transliterate("Höschen"));
-    assertEquals("hoschen", transliterator.transliterate("Hoeschen"));
-    assertEquals("grosspoo", transliterator.transliterate("Großpoo"));
-
-    assertEquals("kyanpasu", transliterator.transliterate("キャンパス"));
-    assertEquals("alphabetikos katalogos", transliterator.transliterate("Αλφαβητικός Κατάλογος"));
-    assertEquals("biologiceskom", transliterator.transliterate("биологическом"));
-  }
-  public void testHalfTextNorm() {
-    final Transliterator transliterator = Transliterator.createFromRules("", ":: Any-Latin; ' ' > ; :: Lower; ", Transliterator.FORWARD);
-    assertEquals("kyanpasu", transliterator.transliterate("キャンパス"));
-    assertEquals("alphabētikóskatálogos", transliterator.transliterate("Αλφαβητικός Κατάλογος"));
-    assertEquals("biologičeskom", transliterator.transliterate("биологическом"));
-
-    assertEquals("xièxiè", transliterator.transliterate("謝謝"));
-    assertEquals("xièxiè", transliterator.transliterate("谢谢"));
-
-    assertEquals("diànnǎo", transliterator.transliterate("電腦"));
-    assertEquals("diànnǎo", transliterator.transliterate("电脑"));
-    assertEquals("jìsuànjī", transliterator.transliterate("計算機"));
-    assertEquals("jìsuànjī", transliterator.transliterate("计算机"));
-  }
-
-  
-  public void testChinese() {
-    final Language cmn = Language.lookup("cmn");
-    final Transliterator transliterator = Transliterator.createFromRules("", cmn.getDefaultNormalizerRules(), Transliterator.FORWARD);
-    
-    assertEquals("xiexie", transliterator.transliterate("謝謝"));
-    assertEquals("xiexie", transliterator.transliterate("谢谢"));
-
-    assertEquals("diannao", transliterator.transliterate("電腦"));
-    assertEquals("diannao", transliterator.transliterate("电脑"));
-    assertEquals("jisuanji", transliterator.transliterate("計算機"));
-    assertEquals("jisuanji", transliterator.transliterate("计算机"));
-    
-    assertEquals("chengjiu", transliterator.transliterate("成就"));
-    
-  }
-  
-  public void testArabic() {
-    final Language ar = Language.lookup("ar");
-    final Transliterator transliterator = Transliterator.createFromRules("", ar.getDefaultNormalizerRules(), Transliterator.FORWARD);
-    // These don't seem quite right....
-    assertEquals("haswb", transliterator.transliterate("حاسوب"));
-    assertEquals("kmbywtr", transliterator.transliterate("كمبيوتر"));
-
-    assertEquals("{\u200e كمبيوتر \u200e}", Language.fixBidiText("{كمبيوتر}"));
-    assertEquals("{a=\u200e كمبيوتر \u200e}", Language.fixBidiText("{a=كمبيوتر}"));
-    assertEquals("(\u200e كمبيوتر \u200e)", Language.fixBidiText("(كمبيوتر)"));
-    assertEquals("أنثى أنْثَى (’únθā) {f}, إناث (’ināθ) {p}, اناثى (’anāθā) {p}", Language.fixBidiText("أنثى أنْثَى (’únθā) {f}, إناث (’ināθ) {p}, اناثى (’anāθā) {p}"));
-       
-  }
-
-  public void testThai() {
-    final Language th = Language.lookup("TH");
-    final Transliterator transliterator = Transliterator.createFromRules("", th.getDefaultNormalizerRules(), Transliterator.FORWARD);
-    // Not sure these are right, just to know...
-    assertEquals("d", transliterator.transliterate("ด"));
-    assertEquals("di", transliterator.transliterate("ด ี"));
-    assertEquals("dii", transliterator.transliterate("ดีี"));
-    
-    assertEquals(Collections.singleton("ดีี"), DictFileParser.tokenize("ดีี", DictFileParser.NON_CHAR));
-  }
-
-  
-  public void testEnWiktionaryNames() {
-    final Set<String> enLangs = new LinkedHashSet<String>(WiktionaryLangs.isoCodeToEnWikiName.keySet());
-    final List<String> names = new ArrayList<String>();
-    for (final String code : WiktionaryLangs.isoCodeToEnWikiName.keySet()) {
-      names.add(WiktionaryLangs.isoCodeToEnWikiName.get(code));
-      enLangs.add(code.toLowerCase());
+
+
+    public void testEnWiktionaryNames() {
+        final Set<String> enLangs = new LinkedHashSet<String>(WiktionaryLangs.isoCodeToEnWikiName.keySet());
+        final List<String> names = new ArrayList<String>();
+        for (final String code : WiktionaryLangs.isoCodeToEnWikiName.keySet()) {
+            names.add(WiktionaryLangs.isoCodeToEnWikiName.get(code));
+            enLangs.add(code.toLowerCase());
+        }
+        Collections.sort(names);
+        System.out.println(names);
+        //assertEquals(enLangs, Language.isoCodeToResources.keySet());
     }
-    Collections.sort(names);
-    System.out.println(names);
-    //assertEquals(enLangs, Language.isoCodeToResources.keySet());
-  }
 
 }
index 12b0c5215e772f201f8d18b15f0adaff61f7b7ad..6839904516abd6293c9cd6f6dcedc546ed39ecc7 100644 (file)
@@ -37,277 +37,276 @@ import com.hughes.android.dictionary.parser.wiktionary.WiktionaryLangs;
 
 public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
 
-  // The matches the whole line, otherwise regexes don't work well on French:
-  // {{=uk=}}
-  // Spanish has no initial headings, tried to also detect {{ES as such
-  // with "^(\\{\\{ES|(=+)[^=]).*$" but that broke English.
-  static final Pattern headingStart = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE);
-  
-  final Map<String,List<Selector>> pathToSelectors = new LinkedHashMap<String, List<Selector>>();
-  List<Selector> currentSelectors = null;
-  
-  StringBuilder titleBuilder;
-  StringBuilder textBuilder;
-  StringBuilder currentBuilder = null;
-
-  public static void main(final String[] args) throws Exception {
-    final WiktionarySplitter wiktionarySplitter = new WiktionarySplitter();
-    wiktionarySplitter.go();
-  }
-  
-  private WiktionarySplitter() {
-    List<Selector> selectors;
-    for (final String code : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.keySet()) {
-      //if (!code.equals("fr")) {continue;}
-      selectors = new ArrayList<WiktionarySplitter.Selector>();
-      pathToSelectors.put(String.format("data/inputs/%swiktionary-pages-articles.xml", code), selectors);
-      for (final Map.Entry<String, String> entry : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.get(code).entrySet()) {
-        final String dir = String.format("data/inputs/wikiSplit/%s", code);
-        new File(dir).mkdirs();
-        selectors.add(new Selector(String.format("%s/%s.data", dir, entry.getKey()), entry.getValue()));
-      }
-    }
-  }
-
-  private void go() throws Exception {
-    final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser();
-
-    // Configure things.
-    for (final Map.Entry<String, List<Selector>> pathToSelectorsEntry : pathToSelectors.entrySet()) {
-      
-      currentSelectors = pathToSelectorsEntry.getValue();
-      
-      for (final Selector selector : currentSelectors) {
-        selector.out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(selector.outFilename)));
-      }
-  
-      // Do it.
-      try {
-        parser.parse(new File(pathToSelectorsEntry.getKey()), this);
-      } catch (Exception e) {
-        System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString());
-        throw e;
-      }
-      
-      // Shutdown.
-      for (final Selector selector : currentSelectors) {
-        selector.out.close();
-      }
-      
-    }
-  }
-
-  String lastPageTitle = null;
-  int pageCount = 0;
-  private void endPage() {
-    final String title = titleBuilder.toString();
-    lastPageTitle = title;
-    if (++pageCount % 1000 == 0) {
-      System.out.println("endPage: " + title + ", count=" + pageCount);
+    // The matches the whole line, otherwise regexes don't work well on French:
+    // {{=uk=}}
+    // Spanish has no initial headings, tried to also detect {{ES as such
+    // with "^(\\{\\{ES|(=+)[^=]).*$" but that broke English.
+    static final Pattern headingStart = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE);
+
+    final Map<String,List<Selector>> pathToSelectors = new LinkedHashMap<String, List<Selector>>();
+    List<Selector> currentSelectors = null;
+
+    StringBuilder titleBuilder;
+    StringBuilder textBuilder;
+    StringBuilder currentBuilder = null;
+
+    public static void main(final String[] args) throws Exception {
+        final WiktionarySplitter wiktionarySplitter = new WiktionarySplitter();
+        wiktionarySplitter.go();
     }
-    if (title.startsWith("Wiktionary:") || 
-            title.startsWith("Appendix:") || 
-            title.startsWith("Help:") ||
-            title.startsWith("Index:") ||
-            title.startsWith("MediaWiki:") || 
-            title.startsWith("Citations:") || 
-            title.startsWith("Concordance:") || 
-            title.startsWith("Glossary:") || 
-            title.startsWith("Rhymes:") || 
-            title.startsWith("Category:") || 
-            title.startsWith("Wikisaurus:") || 
-            title.startsWith("Unsupported titles/") || 
-            title.startsWith("Transwiki:") || 
-            title.startsWith("File:") || 
-            title.startsWith("Thread:") || 
-            title.startsWith("Template:") ||
-            title.startsWith("Summary:") ||
-            title.startsWith("Module:") ||
-            // DE
-            title.startsWith("Datei:") ||
-            title.startsWith("Verzeichnis:") ||
-            title.startsWith("Vorlage:") ||
-            title.startsWith("Thesaurus:") ||
-            title.startsWith("Kategorie:") ||
-            title.startsWith("Hilfe:") ||
-            title.startsWith("Reim:") ||
-            // FR:
-            title.startsWith("Annexe:") ||
-            title.startsWith("Catégori:") ||
-            title.startsWith("Modèle:") ||
-            title.startsWith("Thésaurus:") ||
-            title.startsWith("Projet:") ||
-            title.startsWith("Aide:") ||
-            title.startsWith("Fichier:") ||
-            title.startsWith("Wiktionnaire:") ||
-            title.startsWith("Catégorie:") ||
-            title.startsWith("Portail:") ||
-            title.startsWith("utiliusateur:") ||
-            title.startsWith("Kategorio:") ||
-            // IT
-            title.startsWith("Wikizionario:") ||
-            title.startsWith("Appendice:") ||
-            title.startsWith("Categoria:") ||
-            title.startsWith("Aiuto:") ||
-            title.startsWith("Portail:") ||
-            // ES
-            title.startsWith("Apéndice:") ||
-            title.startsWith("Archivo:") ||
-            title.startsWith("Ayuda:") ||
-            title.startsWith("Categoría:") ||
-            title.startsWith("Plantilla:") ||
-            title.startsWith("Wikcionario:") ||
-
-            // sentinel
-            false
-            ) {
-        return;
+
+    private WiktionarySplitter() {
+        List<Selector> selectors;
+        for (final String code : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.keySet()) {
+            //if (!code.equals("fr")) {continue;}
+            selectors = new ArrayList<WiktionarySplitter.Selector>();
+            pathToSelectors.put(String.format("data/inputs/%swiktionary-pages-articles.xml", code), selectors);
+            for (final Map.Entry<String, String> entry : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.get(code).entrySet()) {
+                final String dir = String.format("data/inputs/wikiSplit/%s", code);
+                new File(dir).mkdirs();
+                selectors.add(new Selector(String.format("%s/%s.data", dir, entry.getKey()), entry.getValue()));
+            }
+        }
     }
-    if (title.contains(":")) {
-        if (!title.startsWith("Sign gloss:")) {
-            System.err.println("title with colon: " + title);
+
+    private void go() throws Exception {
+        final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser();
+
+        // Configure things.
+        for (final Map.Entry<String, List<Selector>> pathToSelectorsEntry : pathToSelectors.entrySet()) {
+
+            currentSelectors = pathToSelectorsEntry.getValue();
+
+            for (final Selector selector : currentSelectors) {
+                selector.out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(selector.outFilename)));
+            }
+
+            // Do it.
+            try {
+                parser.parse(new File(pathToSelectorsEntry.getKey()), this);
+            } catch (Exception e) {
+                System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString());
+                throw e;
+            }
+
+            // Shutdown.
+            for (final Selector selector : currentSelectors) {
+                selector.out.close();
+            }
+
         }
     }
-    
-    String text = textBuilder.toString();
-    String translingual = "";
-    
-    while (text.length() > 0) {
-      // Find start.
-      final Matcher startMatcher = headingStart.matcher(text);
-      if (!startMatcher.find()) {
-        return;
-      }
-      text = text.substring(startMatcher.end());
-      
-      final String heading = startMatcher.group();
-      for (final Selector selector : currentSelectors) {
-        if (heading.indexOf("Translingual") != -1) {
-          // Find end.
-          final int depth = startMatcher.group(1).length();
-          final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE);
-
-          final Matcher endMatcher = endPattern.matcher(text);
-          if (endMatcher.find()) {
-            int end = endMatcher.start();
-            translingual = text.substring(0, endMatcher.start());
-            text = text.substring(end);
-            break;
-          }
+
+    String lastPageTitle = null;
+    int pageCount = 0;
+    private void endPage() {
+        final String title = titleBuilder.toString();
+        lastPageTitle = title;
+        if (++pageCount % 1000 == 0) {
+            System.out.println("endPage: " + title + ", count=" + pageCount);
+        }
+        if (title.startsWith("Wiktionary:") ||
+                title.startsWith("Appendix:") ||
+                title.startsWith("Help:") ||
+                title.startsWith("Index:") ||
+                title.startsWith("MediaWiki:") ||
+                title.startsWith("Citations:") ||
+                title.startsWith("Concordance:") ||
+                title.startsWith("Glossary:") ||
+                title.startsWith("Rhymes:") ||
+                title.startsWith("Category:") ||
+                title.startsWith("Wikisaurus:") ||
+                title.startsWith("Unsupported titles/") ||
+                title.startsWith("Transwiki:") ||
+                title.startsWith("File:") ||
+                title.startsWith("Thread:") ||
+                title.startsWith("Template:") ||
+                title.startsWith("Summary:") ||
+                title.startsWith("Module:") ||
+                // DE
+                title.startsWith("Datei:") ||
+                title.startsWith("Verzeichnis:") ||
+                title.startsWith("Vorlage:") ||
+                title.startsWith("Thesaurus:") ||
+                title.startsWith("Kategorie:") ||
+                title.startsWith("Hilfe:") ||
+                title.startsWith("Reim:") ||
+                // FR:
+                title.startsWith("Annexe:") ||
+                title.startsWith("Catégori:") ||
+                title.startsWith("Modèle:") ||
+                title.startsWith("Thésaurus:") ||
+                title.startsWith("Projet:") ||
+                title.startsWith("Aide:") ||
+                title.startsWith("Fichier:") ||
+                title.startsWith("Wiktionnaire:") ||
+                title.startsWith("Catégorie:") ||
+                title.startsWith("Portail:") ||
+                title.startsWith("utiliusateur:") ||
+                title.startsWith("Kategorio:") ||
+                // IT
+                title.startsWith("Wikizionario:") ||
+                title.startsWith("Appendice:") ||
+                title.startsWith("Categoria:") ||
+                title.startsWith("Aiuto:") ||
+                title.startsWith("Portail:") ||
+                // ES
+                title.startsWith("Apéndice:") ||
+                title.startsWith("Archivo:") ||
+                title.startsWith("Ayuda:") ||
+                title.startsWith("Categoría:") ||
+                title.startsWith("Plantilla:") ||
+                title.startsWith("Wikcionario:") ||
+
+                // sentinel
+                false
+           ) {
+            return;
         }
-        if (selector.pattern.matcher(heading).find()) {
-          
-          // Find end.
-          final int depth = startMatcher.group(1).length();
-          final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE);
-          
-          final Matcher endMatcher = endPattern.matcher(text);
-          final int end;
-          if (endMatcher.find()) {
-            end = endMatcher.start();
-          } else {
-            end = text.length();
-          }
-          
-          String sectionText = text.substring(0, end);
-          // Hack to remove empty dummy section from French
-          if (sectionText.startsWith("\n=== {{S|étymologie}} ===\n: {{ébauche-étym"))
-          {
-              int dummy_end = sectionText.indexOf("}}", 41) + 2;
-              while (dummy_end + 1 < sectionText.length() &&
-                     sectionText.charAt(dummy_end) == '\n' &&
-                     sectionText.charAt(dummy_end + 1) == '\n') ++dummy_end;
-              sectionText = sectionText.substring(dummy_end);
-          }
-          if (heading.indexOf("Japanese") == -1) sectionText += translingual;
-          final Section section = new Section(title, heading, sectionText);
-          
-          try {
-            selector.out.writeUTF(section.title);
-            selector.out.writeUTF(section.heading);
-            final byte[] bytes = section.text.getBytes("UTF8");
-            selector.out.writeInt(bytes.length);
-            selector.out.write(bytes);
-          } catch (IOException e) {
-            throw new RuntimeException(e);
-          }
-          
-          text = text.substring(end);
-          break;
+        if (title.contains(":")) {
+            if (!title.startsWith("Sign gloss:")) {
+                System.err.println("title with colon: " + title);
+            }
         }
-      }
+
+        String text = textBuilder.toString();
+        String translingual = "";
+
+        while (text.length() > 0) {
+            // Find start.
+            final Matcher startMatcher = headingStart.matcher(text);
+            if (!startMatcher.find()) {
+                return;
+            }
+            text = text.substring(startMatcher.end());
+
+            final String heading = startMatcher.group();
+            for (final Selector selector : currentSelectors) {
+                if (heading.indexOf("Translingual") != -1) {
+                    // Find end.
+                    final int depth = startMatcher.group(1).length();
+                    final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE);
+
+                    final Matcher endMatcher = endPattern.matcher(text);
+                    if (endMatcher.find()) {
+                        int end = endMatcher.start();
+                        translingual = text.substring(0, endMatcher.start());
+                        text = text.substring(end);
+                        break;
+                    }
+                }
+                if (selector.pattern.matcher(heading).find()) {
+
+                    // Find end.
+                    final int depth = startMatcher.group(1).length();
+                    final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE);
+
+                    final Matcher endMatcher = endPattern.matcher(text);
+                    final int end;
+                    if (endMatcher.find()) {
+                        end = endMatcher.start();
+                    } else {
+                        end = text.length();
+                    }
+
+                    String sectionText = text.substring(0, end);
+                    // Hack to remove empty dummy section from French
+                    if (sectionText.startsWith("\n=== {{S|étymologie}} ===\n: {{ébauche-étym")) {
+                        int dummy_end = sectionText.indexOf("}}", 41) + 2;
+                        while (dummy_end + 1 < sectionText.length() &&
+                                sectionText.charAt(dummy_end) == '\n' &&
+                                sectionText.charAt(dummy_end + 1) == '\n') ++dummy_end;
+                        sectionText = sectionText.substring(dummy_end);
+                    }
+                    if (heading.indexOf("Japanese") == -1) sectionText += translingual;
+                    final Section section = new Section(title, heading, sectionText);
+
+                    try {
+                        selector.out.writeUTF(section.title);
+                        selector.out.writeUTF(section.heading);
+                        final byte[] bytes = section.text.getBytes("UTF8");
+                        selector.out.writeInt(bytes.length);
+                        selector.out.write(bytes);
+                    } catch (IOException e) {
+                        throw new RuntimeException(e);
+                    }
+
+                    text = text.substring(end);
+                    break;
+                }
+            }
+        }
+
     }
-    
-  }
-
-  // -----------------------------------------------------------------------
-
-  static class Section implements java.io.Serializable {
-    private static final long serialVersionUID = -7676549898325856822L;
-
-    final String title;
-    final String heading;
-    final String text;
-    
-    public Section(final String title, final String heading, final String text) {
-      this.title = title;
-      this.heading = heading;
-      this.text = text;
-      
-      //System.out.printf("TITLE:%s\nHEADING:%s\nTEXT:%s\n\n\n\n\n\n", title, heading, text);
+
+    // -----------------------------------------------------------------------
+
+    static class Section implements java.io.Serializable {
+        private static final long serialVersionUID = -7676549898325856822L;
+
+        final String title;
+        final String heading;
+        final String text;
+
+        public Section(final String title, final String heading, final String text) {
+            this.title = title;
+            this.heading = heading;
+            this.text = text;
+
+            //System.out.printf("TITLE:%s\nHEADING:%s\nTEXT:%s\n\n\n\n\n\n", title, heading, text);
+        }
     }
-  }
-  
-  static class Selector {
-    final String outFilename;
-    final Pattern pattern;
 
-    DataOutputStream out;
+    static class Selector {
+        final String outFilename;
+        final Pattern pattern;
+
+        DataOutputStream out;
 
-    public Selector(final String filename, final String pattern) {
-      this.outFilename = filename;
-      this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE);
+        public Selector(final String filename, final String pattern) {
+            this.outFilename = filename;
+            this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE);
+        }
     }
-  }
 
-  // -----------------------------------------------------------------------
-  
+    // -----------------------------------------------------------------------
+
     @Override
     public void startElement(String uri, String localName, String qName,
-        Attributes attributes) {
-      currentBuilder = null;
-      if ("page".equals(qName)) {
-        titleBuilder = new StringBuilder();
-        
-        // Start with "\n" to better match certain strings.
-        textBuilder = new StringBuilder("\n");
-      } else if ("title".equals(qName)) {
-        currentBuilder = titleBuilder;
-      } else if ("text".equals(qName)) {
-        currentBuilder = textBuilder;
-      }
+                             Attributes attributes) {
+        currentBuilder = null;
+        if ("page".equals(qName)) {
+            titleBuilder = new StringBuilder();
+
+            // Start with "\n" to better match certain strings.
+            textBuilder = new StringBuilder("\n");
+        } else if ("title".equals(qName)) {
+            currentBuilder = titleBuilder;
+        } else if ("text".equals(qName)) {
+            currentBuilder = textBuilder;
+        }
     }
 
     @Override
     public void characters(char[] ch, int start, int length) throws SAXException {
-      if (currentBuilder != null) {
-        currentBuilder.append(ch, start, length);
-      }
+        if (currentBuilder != null) {
+            currentBuilder.append(ch, start, length);
+        }
     }
 
     @Override
     public void endElement(String uri, String localName, String qName)
-        throws SAXException {
-      currentBuilder = null;
-      if ("page".equals(qName)) {
-        endPage();
-      }
+    throws SAXException {
+        currentBuilder = null;
+        if ("page".equals(qName)) {
+            endPage();
+        }
     }
-    
+
     public void parse(final File file) throws ParserConfigurationException,
         SAXException, IOException {
-      final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser();
-      parser.parse(file, this);
+        final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser();
+        parser.parse(file, this);
     }
-    
+
 }
index 8015f9a04ae7fd65e5a233d73c03e9e02a594852..07d077562b8b106392d201123eb26aa753e48781 100644 (file)
@@ -39,241 +39,241 @@ import com.hughes.android.dictionary.engine.PairEntry;
 import com.hughes.android.dictionary.engine.PairEntry.Pair;
 
 public class DictFileParser implements Parser {
-  
-  static final Logger logger = Logger.getLogger(DictFileParser.class.getName());
-
-  // Dictcc
-  public static final Pattern TAB = Pattern.compile("\\t");
-
-  // Chemnitz
-  public static final Pattern DOUBLE_COLON = Pattern.compile(" :: ");
-  public static final Pattern PIPE = Pattern.compile("\\|");
-  
-  static final Pattern SPACES = Pattern.compile("\\s+");
-  
-  static final Pattern BRACKETED = Pattern.compile("\\[([^]]+)\\]");
-  static final Pattern PARENTHESIZED = Pattern.compile("\\(([^)]+)\\)");
-  static final Pattern CURLY_BRACED = Pattern.compile("\\{([^}]+)\\}");
-  
-  // http://www.regular-expressions.info/unicode.html
-  static final Pattern NON_CHAR_DASH = Pattern.compile("[^-'\\p{L}\\p{M}\\p{N}]+");
-  public static final Pattern NON_CHAR = Pattern.compile("[^\\p{L}\\p{M}\\p{N}]+");
-
-  static final Pattern TRIM_PUNC = Pattern.compile("^[^\\p{L}\\p{M}\\p{N}]+|[^\\p{L}\\p{M}\\p{N}]+$");
-
-  final Charset charset;
-  final boolean flipCols;
-  
-  final Pattern fieldSplit;
-  final Pattern subfieldSplit;
-  
-  final DictionaryBuilder dictBuilder;
-  final IndexBuilder[] langIndexBuilders;
-  final IndexBuilder bothIndexBuilder;
-  
-  EntrySource entrySource;
-  
-  // final Set<String> alreadyDone = new HashSet<String>();
-    
-  public DictFileParser(final Charset charset, boolean flipCols,
-      final Pattern fieldSplit, final Pattern subfieldSplit,
-      final DictionaryBuilder dictBuilder, final IndexBuilder[] langIndexBuilders,
-      final IndexBuilder bothIndexBuilder) {
-    this.charset = charset;
-    this.flipCols = flipCols;
-    this.fieldSplit = fieldSplit;
-    this.subfieldSplit = subfieldSplit;
-    this.dictBuilder = dictBuilder;
-    this.langIndexBuilders = langIndexBuilders;
-    this.bothIndexBuilder = bothIndexBuilder;
-  }
-
-  @Override
-  public void parse(final File file, final EntrySource entrySouce, final int pageLimit) throws IOException {
-    this.entrySource = entrySouce;
-    final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset));
-    String line;
-    int count = 0;
-    while ((line = reader.readLine()) != null) {
-      if (pageLimit >= 0 && count >= pageLimit) {
-        return;
-      }
-      if (count % 10000 == 0) {
-        logger.info("count=" + count + ", line=" + line);
-      }
-      parseLine(line);
-      ++count;
-    }
-  }
-  
-  private void parseLine(final String line) {
-    if (line.startsWith("#") || line.length() == 0) {
-      logger.info("Skipping comment line: " + line);
-      return;
-    }
-    final String[] fields = fieldSplit.split(line);
-    // dictcc now has a part of speech field as field #3.
-    if (fields.length < 2 || fields.length > 3) {
-      logger.warning("Malformed line: " + line);
-      return;
-    }
-    
-    fields[0] = SPACES.matcher(fields[0]).replaceAll(" ").trim();
-    fields[1] = SPACES.matcher(fields[1]).replaceAll(" ").trim();
-    if (flipCols) {
-      final String temp = fields[0];
-      fields[0] = fields[1];
-      fields[1] = temp;
-    }
 
-    final String[][] subfields = new String[2][];
-      if (subfieldSplit != null) {
-      subfields[0] = subfieldSplit.split(fields[0]);
-      subfields[1] = subfieldSplit.split(fields[1]);
-      if (subfields[0].length != subfields[1].length) {
-        logger.warning("Number of subfields doesn't match: " + line);
-        return;
-      }
-    } else {
-      subfields[0] = new String[] { fields[0] };
-      subfields[1] = new String[] { fields[1] };
-    }
-        
-    final PairEntry pairEntry = new PairEntry(entrySource);
-    for (int i = 0; i < subfields[0].length; ++i) {
-      subfields[0][i] = subfields[0][i].trim();
-      subfields[1][i] = subfields[1][i].trim();
-      if (subfields[0][i].length() == 0 && subfields[1][i].length() == 0) {
-        logger.warning("Empty pair: " + line);
-        continue;
-      }
-      if (subfields[0][i].length() == 0) {
-        subfields[0][i] = "__";
-      }
-      if (subfields[1][i].length() == 0) {
-        subfields[1][i] = "__";
-      }
-      pairEntry.pairs.add(new Pair(subfields[0][i], subfields[1][i]));
+    static final Logger logger = Logger.getLogger(DictFileParser.class.getName());
+
+    // Dictcc
+    public static final Pattern TAB = Pattern.compile("\\t");
+
+    // Chemnitz
+    public static final Pattern DOUBLE_COLON = Pattern.compile(" :: ");
+    public static final Pattern PIPE = Pattern.compile("\\|");
+
+    static final Pattern SPACES = Pattern.compile("\\s+");
+
+    static final Pattern BRACKETED = Pattern.compile("\\[([^]]+)\\]");
+    static final Pattern PARENTHESIZED = Pattern.compile("\\(([^)]+)\\)");
+    static final Pattern CURLY_BRACED = Pattern.compile("\\{([^}]+)\\}");
+
+    // http://www.regular-expressions.info/unicode.html
+    static final Pattern NON_CHAR_DASH = Pattern.compile("[^-'\\p{L}\\p{M}\\p{N}]+");
+    public static final Pattern NON_CHAR = Pattern.compile("[^\\p{L}\\p{M}\\p{N}]+");
+
+    static final Pattern TRIM_PUNC = Pattern.compile("^[^\\p{L}\\p{M}\\p{N}]+|[^\\p{L}\\p{M}\\p{N}]+$");
+
+    final Charset charset;
+    final boolean flipCols;
+
+    final Pattern fieldSplit;
+    final Pattern subfieldSplit;
+
+    final DictionaryBuilder dictBuilder;
+    final IndexBuilder[] langIndexBuilders;
+    final IndexBuilder bothIndexBuilder;
+
+    EntrySource entrySource;
+
+    // final Set<String> alreadyDone = new HashSet<String>();
+
+    public DictFileParser(final Charset charset, boolean flipCols,
+                          final Pattern fieldSplit, final Pattern subfieldSplit,
+                          final DictionaryBuilder dictBuilder, final IndexBuilder[] langIndexBuilders,
+                          final IndexBuilder bothIndexBuilder) {
+        this.charset = charset;
+        this.flipCols = flipCols;
+        this.fieldSplit = fieldSplit;
+        this.subfieldSplit = subfieldSplit;
+        this.dictBuilder = dictBuilder;
+        this.langIndexBuilders = langIndexBuilders;
+        this.bothIndexBuilder = bothIndexBuilder;
     }
-    final IndexedEntry entryData = new IndexedEntry(pairEntry);
-    entryData.isValid = true;
-    
-    for (int l = 0; l < 2; ++l) {
-      // alreadyDone.clear();
-      
-      for (int j = 0; j < subfields[l].length; ++j) {
-        String subfield = subfields[l][j];
-        final IndexBuilder indexBuilder = langIndexBuilders[l];
-        if (indexBuilder.index.sortLanguage == Language.de) {
-          subfield = parseField_DE(indexBuilder, subfield, entryData, j);
-        } else if (indexBuilder.index.sortLanguage == Language.en) {
-          subfield = parseField_EN(indexBuilder, subfield, entryData, j);
+
+    @Override
+    public void parse(final File file, final EntrySource entrySouce, final int pageLimit) throws IOException {
+        this.entrySource = entrySouce;
+        final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset));
+        String line;
+        int count = 0;
+        while ((line = reader.readLine()) != null) {
+            if (pageLimit >= 0 && count >= pageLimit) {
+                return;
+            }
+            if (count % 10000 == 0) {
+                logger.info("count=" + count + ", line=" + line);
+            }
+            parseLine(line);
+            ++count;
         }
-        parseFieldGeneric(indexBuilder, subfield, entryData, j, subfields[l].length);
-      }
-    }
-  }
-
-  private void parseFieldGeneric(final IndexBuilder indexBuilder, String field,
-      final IndexedEntry entryData, final int subfieldIdx, final int numSubFields) {
-    // remove bracketed and parenthesized stuff.
-    final StringBuilder bracketed = new StringBuilder(); 
-    final StringBuilder parenthesized = new StringBuilder();
-    
-    Matcher matcher;
-    while ((matcher = BRACKETED.matcher(field)).find()) {
-      bracketed.append(matcher.group(1)).append(" ");
-      field = matcher.replaceFirst(" ");
     }
 
-    while ((matcher = PARENTHESIZED.matcher(field)).find()) {
-      parenthesized.append(matcher.group(1)).append(" ");
-      field = matcher.replaceFirst(" ");
-    }
-    
-    field = SPACES.matcher(field).replaceAll(" ").trim();
-
-    // split words on non -A-z0-9, do them.
-    final String[] tokens = NON_CHAR_DASH.split(field);
-
-    final EntryTypeName entryTypeName;
-    if (numSubFields == 1) {
-      assert subfieldIdx == 0;
-      if (tokens.length == 1) {
-        entryTypeName = EntryTypeName.ONE_WORD;
-      } else if (tokens.length == 2) {
-        entryTypeName = EntryTypeName.TWO_WORDS;
-      } else if (tokens.length == 3) {
-        entryTypeName = EntryTypeName.THREE_WORDS;
-      } else if (tokens.length == 4) {
-        entryTypeName = EntryTypeName.FOUR_WORDS;
-      } else {
-        entryTypeName = EntryTypeName.FIVE_OR_MORE_WORDS;
-      }
-    } else {
-      assert numSubFields > 1;
-      if (subfieldIdx == 0) {
-        if (tokens.length == 1) {
-          entryTypeName = EntryTypeName.MULTIROW_HEAD_ONE_WORD;
-        } else {
-          entryTypeName = EntryTypeName.MULTIROW_HEAD_MANY_WORDS;
+    private void parseLine(final String line) {
+        if (line.startsWith("#") || line.length() == 0) {
+            logger.info("Skipping comment line: " + line);
+            return;
+        }
+        final String[] fields = fieldSplit.split(line);
+        // dictcc now has a part of speech field as field #3.
+        if (fields.length < 2 || fields.length > 3) {
+            logger.warning("Malformed line: " + line);
+            return;
         }
-      } else {
-        assert subfieldIdx > 0;
-        if (tokens.length == 1) {
-          entryTypeName = EntryTypeName.MULTIROW_TAIL_ONE_WORD;
+
+        fields[0] = SPACES.matcher(fields[0]).replaceAll(" ").trim();
+        fields[1] = SPACES.matcher(fields[1]).replaceAll(" ").trim();
+        if (flipCols) {
+            final String temp = fields[0];
+            fields[0] = fields[1];
+            fields[1] = temp;
+        }
+
+        final String[][] subfields = new String[2][];
+        if (subfieldSplit != null) {
+            subfields[0] = subfieldSplit.split(fields[0]);
+            subfields[1] = subfieldSplit.split(fields[1]);
+            if (subfields[0].length != subfields[1].length) {
+                logger.warning("Number of subfields doesn't match: " + line);
+                return;
+            }
         } else {
-          entryTypeName = EntryTypeName.MULTIROW_TAIL_MANY_WORDS;
+            subfields[0] = new String[] { fields[0] };
+            subfields[1] = new String[] { fields[1] };
         }
-      }
-    }
 
-    for (String token : tokens) {
-      token = TRIM_PUNC.matcher(token).replaceAll("");
-      if (/*!alreadyDone.contains(token) && */token.length() > 0) {
-        indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), entryTypeName);
-        // alreadyDone.add(token);
-        
-        // also split words on dashes, do them, too.
-        if (token.contains("-")) {
-          final String[] dashed = token.split("-");
-          for (final String dashedToken : dashed) {
-            if (/*!alreadyDone.contains(dashedToken) && */dashedToken.length() > 0) {
-              indexBuilder.addEntryWithTokens(entryData, Collections.singleton(dashedToken), EntryTypeName.PART_OF_HYPHENATED);
+        final PairEntry pairEntry = new PairEntry(entrySource);
+        for (int i = 0; i < subfields[0].length; ++i) {
+            subfields[0][i] = subfields[0][i].trim();
+            subfields[1][i] = subfields[1][i].trim();
+            if (subfields[0][i].length() == 0 && subfields[1][i].length() == 0) {
+                logger.warning("Empty pair: " + line);
+                continue;
+            }
+            if (subfields[0][i].length() == 0) {
+                subfields[0][i] = "__";
+            }
+            if (subfields[1][i].length() == 0) {
+                subfields[1][i] = "__";
             }
-          }
+            pairEntry.pairs.add(new Pair(subfields[0][i], subfields[1][i]));
         }
+        final IndexedEntry entryData = new IndexedEntry(pairEntry);
+        entryData.isValid = true;
 
-      }  // if (!alreadyDone.contains(token)) {
-    }  // for (final String token : tokens) { 
-    
-    // process bracketed stuff (split on spaces and dashes always)
-    final String[] bracketedTokens = NON_CHAR.split(bracketed.toString());
-    for (final String token : bracketedTokens) {
-      assert !token.contains("-");
-      if (/*!alreadyDone.contains(token) && */token.length() > 0) {
-        indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), EntryTypeName.BRACKETED);
-      }
+        for (int l = 0; l < 2; ++l) {
+            // alreadyDone.clear();
+
+            for (int j = 0; j < subfields[l].length; ++j) {
+                String subfield = subfields[l][j];
+                final IndexBuilder indexBuilder = langIndexBuilders[l];
+                if (indexBuilder.index.sortLanguage == Language.de) {
+                    subfield = parseField_DE(indexBuilder, subfield, entryData, j);
+                } else if (indexBuilder.index.sortLanguage == Language.en) {
+                    subfield = parseField_EN(indexBuilder, subfield, entryData, j);
+                }
+                parseFieldGeneric(indexBuilder, subfield, entryData, j, subfields[l].length);
+            }
+        }
     }
-    
-    // process paren stuff
-    final String[] parenTokens = NON_CHAR.split(parenthesized.toString());
-    for (final String token : parenTokens) {
-      assert !token.contains("-");
-      if (/*!alreadyDone.contains(token) && */token.length() > 0) {
-        indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), EntryTypeName.PARENTHESIZED);
-      }
+
+    private void parseFieldGeneric(final IndexBuilder indexBuilder, String field,
+                                   final IndexedEntry entryData, final int subfieldIdx, final int numSubFields) {
+        // remove bracketed and parenthesized stuff.
+        final StringBuilder bracketed = new StringBuilder();
+        final StringBuilder parenthesized = new StringBuilder();
+
+        Matcher matcher;
+        while ((matcher = BRACKETED.matcher(field)).find()) {
+            bracketed.append(matcher.group(1)).append(" ");
+            field = matcher.replaceFirst(" ");
+        }
+
+        while ((matcher = PARENTHESIZED.matcher(field)).find()) {
+            parenthesized.append(matcher.group(1)).append(" ");
+            field = matcher.replaceFirst(" ");
+        }
+
+        field = SPACES.matcher(field).replaceAll(" ").trim();
+
+        // split words on non -A-z0-9, do them.
+        final String[] tokens = NON_CHAR_DASH.split(field);
+
+        final EntryTypeName entryTypeName;
+        if (numSubFields == 1) {
+            assert subfieldIdx == 0;
+            if (tokens.length == 1) {
+                entryTypeName = EntryTypeName.ONE_WORD;
+            } else if (tokens.length == 2) {
+                entryTypeName = EntryTypeName.TWO_WORDS;
+            } else if (tokens.length == 3) {
+                entryTypeName = EntryTypeName.THREE_WORDS;
+            } else if (tokens.length == 4) {
+                entryTypeName = EntryTypeName.FOUR_WORDS;
+            } else {
+                entryTypeName = EntryTypeName.FIVE_OR_MORE_WORDS;
+            }
+        } else {
+            assert numSubFields > 1;
+            if (subfieldIdx == 0) {
+                if (tokens.length == 1) {
+                    entryTypeName = EntryTypeName.MULTIROW_HEAD_ONE_WORD;
+                } else {
+                    entryTypeName = EntryTypeName.MULTIROW_HEAD_MANY_WORDS;
+                }
+            } else {
+                assert subfieldIdx > 0;
+                if (tokens.length == 1) {
+                    entryTypeName = EntryTypeName.MULTIROW_TAIL_ONE_WORD;
+                } else {
+                    entryTypeName = EntryTypeName.MULTIROW_TAIL_MANY_WORDS;
+                }
+            }
+        }
+
+        for (String token : tokens) {
+            token = TRIM_PUNC.matcher(token).replaceAll("");
+            if (/*!alreadyDone.contains(token) && */token.length() > 0) {
+                indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), entryTypeName);
+                // alreadyDone.add(token);
+
+                // also split words on dashes, do them, too.
+                if (token.contains("-")) {
+                    final String[] dashed = token.split("-");
+                    for (final String dashedToken : dashed) {
+                        if (/*!alreadyDone.contains(dashedToken) && */dashedToken.length() > 0) {
+                            indexBuilder.addEntryWithTokens(entryData, Collections.singleton(dashedToken), EntryTypeName.PART_OF_HYPHENATED);
+                        }
+                    }
+                }
+
+            }  // if (!alreadyDone.contains(token)) {
+        }  // for (final String token : tokens) {
+
+        // process bracketed stuff (split on spaces and dashes always)
+        final String[] bracketedTokens = NON_CHAR.split(bracketed.toString());
+        for (final String token : bracketedTokens) {
+            assert !token.contains("-");
+            if (/*!alreadyDone.contains(token) && */token.length() > 0) {
+                indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), EntryTypeName.BRACKETED);
+            }
+        }
+
+        // process paren stuff
+        final String[] parenTokens = NON_CHAR.split(parenthesized.toString());
+        for (final String token : parenTokens) {
+            assert !token.contains("-");
+            if (/*!alreadyDone.contains(token) && */token.length() > 0) {
+                indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), EntryTypeName.PARENTHESIZED);
+            }
+        }
+
     }
-    
-  }
 
-  private String parseField_DE(final IndexBuilder indexBuilder, String field,
-      final IndexedEntry entryData, final int subfieldIdx) {
-    
+    private String parseField_DE(final IndexBuilder indexBuilder, String field,
+                                 final IndexedEntry entryData, final int subfieldIdx) {
+
 //    final Matcher matcher = DE_NOUN.matcher(field);
 //    while (matcher.find()) {
 //      final String noun = matcher.group(1);
-      //final String gender = matcher.group(2);
+        //final String gender = matcher.group(2);
 //      if (alreadyDone.add(noun)) {
         // System.out.println("Found DE noun " + noun + ", " + gender);
 //        final List<EntryData> entries = indexBuilder.getOrCreateEntries(noun, EntryTypeName.NOUN);
@@ -281,26 +281,26 @@ public class DictFileParser implements Parser {
 //      }
 //    }
 
-    // In English, curly braces are used for different tenses.
-    field = CURLY_BRACED.matcher(field).replaceAll(" ");
+        // In English, curly braces are used for different tenses.
+        field = CURLY_BRACED.matcher(field).replaceAll(" ");
+
+        return field;
+    }
+
+    private String parseField_EN(final IndexBuilder indexBuilder, String field,
+                                 final IndexedEntry entryData, final int subfieldIdx) {
+        if (field.startsWith("to ")) {
+            field = field.substring(3);
+        }
+        return field;
+    }
 
-    return field;
-  }
-  
-  private String parseField_EN(final IndexBuilder indexBuilder, String field,
-      final IndexedEntry entryData, final int subfieldIdx) {
-    if (field.startsWith("to ")) {
-      field = field.substring(3);
+    public static final Set<String> tokenize(final String text, final Pattern pattern) {
+        final String[] split = pattern.split(text);
+        final Set<String> result = new LinkedHashSet<String>(Arrays.asList(split));
+        result.remove("");
+        return result;
     }
-    return field;
-  }
-  
-  public static final Set<String> tokenize(final String text, final Pattern pattern) {
-    final String[] split = pattern.split(text);
-    final Set<String> result = new LinkedHashSet<String>(Arrays.asList(split));
-    result.remove("");
-    return result;
-  }
 
 
 }
index b0f2e961214dfa5046583aecbeb3302f3e137042..969796dd7a7c1b5e0c6ccb0d455208e328995569 100644 (file)
@@ -20,7 +20,7 @@ import java.io.IOException;
 import com.hughes.android.dictionary.engine.EntrySource;
 
 public interface Parser {
-  
-  void parse(final File file, final EntrySource entrySource, final int pageLimit) throws IOException;
+
+    void parse(final File file, final EntrySource entrySource, final int pageLimit) throws IOException;
 
 }
index d6c8901aa6a6b6541c1d5b2ccd0e4dc4af56f507..8cf882e7de6933ea77dc19c5f39396146bd3f815 100644 (file)
@@ -22,625 +22,625 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 public final class WikiTokenizer {
-  
-  public static interface Callback {
-    void onPlainText(final String text);
-    void onMarkup(WikiTokenizer wikiTokenizer);
-    void onWikiLink(WikiTokenizer wikiTokenizer);
-    void onNewline(WikiTokenizer wikiTokenizer);
-    void onFunction(final WikiTokenizer tokenizer, String functionName, List<String> functionPositionArgs,
-        Map<String, String> functionNamedArgs);
-    void onHeading(WikiTokenizer wikiTokenizer);
-    void onListItem(WikiTokenizer wikiTokenizer);
-    void onComment(WikiTokenizer wikiTokenizer);
-    void onHtml(WikiTokenizer wikiTokenizer);
-  }
-  
-  public static class DoNothingCallback implements Callback {
-
-    @Override
-    public void onPlainText(String text) {
-    }
-
-    @Override
-    public void onMarkup(WikiTokenizer wikiTokenizer) {
-    }
-
-    @Override
-    public void onWikiLink(WikiTokenizer wikiTokenizer) {
-    }
-
-    @Override
-    public void onNewline(WikiTokenizer wikiTokenizer) {
-    }
-
-    @Override
-    public void onFunction(WikiTokenizer tokenizer, String functionName,
-        List<String> functionPositionArgs, Map<String, String> functionNamedArgs) {
-    }
-
-    @Override
-    public void onHeading(WikiTokenizer wikiTokenizer) {
-    }
-
-    @Override
-    public void onListItem(WikiTokenizer wikiTokenizer) {
-    }
-
-    @Override
-    public void onComment(WikiTokenizer wikiTokenizer) {
-    }
-
-    @Override
-    public void onHtml(WikiTokenizer wikiTokenizer) {
-    }
-  }
-  
-  //private static final Pattern wikiTokenEvent = Pattern.compile("($)", Pattern.MULTILINE);
-  private static final Pattern wikiTokenEvent = Pattern.compile("(" +
-               "\\{\\{|\\}\\}|" +
-               "\\[\\[|\\]\\]|" +
-               "\\||" +  // Need the | because we might have to find unescaped pipes
-        "=|" +  // Need the = because we might have to find unescaped =
-               "<!--|" +
-               "''|" +
-        "<pre>|" +
-        "<math>|" +
-        "<ref>|" +
-               "$)", Pattern.MULTILINE);
-  private static final String listChars = "*#:;";
-  
-    
-  final String wikiText;
-  final Matcher matcher;
-
-  boolean justReturnedNewline = true;
-  int lastLineStart = 0;
-  int end = 0;
-  int start = -1;
-
-  final List<String> errors = new ArrayList<String>();
-  final List<String> tokenStack = new ArrayList<String>();
-  
-
-  private String headingWikiText;
-  private int headingDepth;
-  private int listPrefixEnd;
-  private boolean isPlainText;
-  private boolean isMarkup;
-  private boolean isComment;
-  private boolean isFunction;
-  private boolean isWikiLink;
-  private boolean isHtml;
-  private int firstUnescapedPipePos;
-  
-  private int lastUnescapedPipePos;
-  private int lastUnescapedEqualsPos;
-  private final List<String> positionArgs = new ArrayList<String>();
-  private final Map<String,String> namedArgs = new LinkedHashMap<String,String>();
-  
-
-  public WikiTokenizer(final String wikiText) {
-    this(wikiText, true);
-  }
-
-  public WikiTokenizer(String wikiText, final boolean isNewline) {
-    wikiText = wikiText.replace('\u2028', '\n');
-    wikiText = wikiText.replace('\u0085', '\n');
-    this.wikiText = wikiText;
-    this.matcher = wikiTokenEvent.matcher(wikiText);
-    justReturnedNewline = isNewline;
-  }
-
-  private void clear() {
-    errors.clear();
-    tokenStack.clear();
-
-    headingWikiText = null;
-    headingDepth = -1;
-    listPrefixEnd = -1;
-    isPlainText = false;
-    isMarkup = false;
-    isComment = false;
-    isFunction = false;
-    isWikiLink = false;
-    isHtml = false;
-    
-    firstUnescapedPipePos = -1;
-    lastUnescapedPipePos = -1;
-    lastUnescapedEqualsPos = -1;
-    positionArgs.clear();
-    namedArgs.clear();
-  }
-
-  private static final Pattern POSSIBLE_WIKI_TEXT = Pattern.compile(
-      "\\{\\{|" +
-      "\\[\\[|" +
-      "<!--|" +
-      "''|" +
-      "<pre>|" +
-      "<math>|" +
-      "<ref>|" +
-      "[\n]"
-      );
-
-  public static void dispatch(final String wikiText, final boolean isNewline, final Callback callback) {
-    // Optimization...
-    if (!POSSIBLE_WIKI_TEXT.matcher(wikiText).find()) {
-      callback.onPlainText(wikiText);
-    } else {
-      final WikiTokenizer tokenizer = new WikiTokenizer(wikiText, isNewline);
-      while (tokenizer.nextToken() != null) {
-        if (tokenizer.isPlainText()) {
-          callback.onPlainText(tokenizer.token());
-        } else if (tokenizer.isMarkup()) {
-          callback.onMarkup(tokenizer);
-        } else if (tokenizer.isWikiLink()) {
-          callback.onWikiLink(tokenizer);
-        } else if (tokenizer.isNewline()) {
-          callback.onNewline(tokenizer);
-        } else if (tokenizer.isFunction()) {
-          callback.onFunction(tokenizer, tokenizer.functionName(), tokenizer.functionPositionArgs(), tokenizer.functionNamedArgs());
-        } else if (tokenizer.isHeading()) {
-          callback.onHeading(tokenizer);
-        } else if (tokenizer.isListItem()) {
-          callback.onListItem(tokenizer);
-        } else if (tokenizer.isComment()) {
-          callback.onComment(tokenizer);
-        } else if (tokenizer.isHtml()) {
-          callback.onHtml(tokenizer);
-        } else if (!tokenizer.errors.isEmpty()) {
-          // Log was already printed....
+
+    public static interface Callback {
+        void onPlainText(final String text);
+        void onMarkup(WikiTokenizer wikiTokenizer);
+        void onWikiLink(WikiTokenizer wikiTokenizer);
+        void onNewline(WikiTokenizer wikiTokenizer);
+        void onFunction(final WikiTokenizer tokenizer, String functionName, List<String> functionPositionArgs,
+                        Map<String, String> functionNamedArgs);
+        void onHeading(WikiTokenizer wikiTokenizer);
+        void onListItem(WikiTokenizer wikiTokenizer);
+        void onComment(WikiTokenizer wikiTokenizer);
+        void onHtml(WikiTokenizer wikiTokenizer);
+    }
+
+    public static class DoNothingCallback implements Callback {
+
+        @Override
+        public void onPlainText(String text) {
+        }
+
+        @Override
+        public void onMarkup(WikiTokenizer wikiTokenizer) {
+        }
+
+        @Override
+        public void onWikiLink(WikiTokenizer wikiTokenizer) {
+        }
+
+        @Override
+        public void onNewline(WikiTokenizer wikiTokenizer) {
+        }
+
+        @Override
+        public void onFunction(WikiTokenizer tokenizer, String functionName,
+                               List<String> functionPositionArgs, Map<String, String> functionNamedArgs) {
+        }
+
+        @Override
+        public void onHeading(WikiTokenizer wikiTokenizer) {
+        }
+
+        @Override
+        public void onListItem(WikiTokenizer wikiTokenizer) {
+        }
+
+        @Override
+        public void onComment(WikiTokenizer wikiTokenizer) {
+        }
+
+        @Override
+        public void onHtml(WikiTokenizer wikiTokenizer) {
+        }
+    }
+
+    //private static final Pattern wikiTokenEvent = Pattern.compile("($)", Pattern.MULTILINE);
+    private static final Pattern wikiTokenEvent = Pattern.compile("(" +
+            "\\{\\{|\\}\\}|" +
+            "\\[\\[|\\]\\]|" +
+            "\\||" +  // Need the | because we might have to find unescaped pipes
+            "=|" +  // Need the = because we might have to find unescaped =
+            "<!--|" +
+            "''|" +
+            "<pre>|" +
+            "<math>|" +
+            "<ref>|" +
+            "$)", Pattern.MULTILINE);
+    private static final String listChars = "*#:;";
+
+
+    final String wikiText;
+    final Matcher matcher;
+
+    boolean justReturnedNewline = true;
+    int lastLineStart = 0;
+    int end = 0;
+    int start = -1;
+
+    final List<String> errors = new ArrayList<String>();
+    final List<String> tokenStack = new ArrayList<String>();
+
+
+    private String headingWikiText;
+    private int headingDepth;
+    private int listPrefixEnd;
+    private boolean isPlainText;
+    private boolean isMarkup;
+    private boolean isComment;
+    private boolean isFunction;
+    private boolean isWikiLink;
+    private boolean isHtml;
+    private int firstUnescapedPipePos;
+
+    private int lastUnescapedPipePos;
+    private int lastUnescapedEqualsPos;
+    private final List<String> positionArgs = new ArrayList<String>();
+    private final Map<String,String> namedArgs = new LinkedHashMap<String,String>();
+
+
+    public WikiTokenizer(final String wikiText) {
+        this(wikiText, true);
+    }
+
+    public WikiTokenizer(String wikiText, final boolean isNewline) {
+        wikiText = wikiText.replace('\u2028', '\n');
+        wikiText = wikiText.replace('\u0085', '\n');
+        this.wikiText = wikiText;
+        this.matcher = wikiTokenEvent.matcher(wikiText);
+        justReturnedNewline = isNewline;
+    }
+
+    private void clear() {
+        errors.clear();
+        tokenStack.clear();
+
+        headingWikiText = null;
+        headingDepth = -1;
+        listPrefixEnd = -1;
+        isPlainText = false;
+        isMarkup = false;
+        isComment = false;
+        isFunction = false;
+        isWikiLink = false;
+        isHtml = false;
+
+        firstUnescapedPipePos = -1;
+        lastUnescapedPipePos = -1;
+        lastUnescapedEqualsPos = -1;
+        positionArgs.clear();
+        namedArgs.clear();
+    }
+
+    private static final Pattern POSSIBLE_WIKI_TEXT = Pattern.compile(
+                "\\{\\{|" +
+                "\\[\\[|" +
+                "<!--|" +
+                "''|" +
+                "<pre>|" +
+                "<math>|" +
+                "<ref>|" +
+                "[\n]"
+            );
+
+    public static void dispatch(final String wikiText, final boolean isNewline, final Callback callback) {
+        // Optimization...
+        if (!POSSIBLE_WIKI_TEXT.matcher(wikiText).find()) {
+            callback.onPlainText(wikiText);
         } else {
-          throw new IllegalStateException("Unknown wiki state: " + tokenizer.token());
+            final WikiTokenizer tokenizer = new WikiTokenizer(wikiText, isNewline);
+            while (tokenizer.nextToken() != null) {
+                if (tokenizer.isPlainText()) {
+                    callback.onPlainText(tokenizer.token());
+                } else if (tokenizer.isMarkup()) {
+                    callback.onMarkup(tokenizer);
+                } else if (tokenizer.isWikiLink()) {
+                    callback.onWikiLink(tokenizer);
+                } else if (tokenizer.isNewline()) {
+                    callback.onNewline(tokenizer);
+                } else if (tokenizer.isFunction()) {
+                    callback.onFunction(tokenizer, tokenizer.functionName(), tokenizer.functionPositionArgs(), tokenizer.functionNamedArgs());
+                } else if (tokenizer.isHeading()) {
+                    callback.onHeading(tokenizer);
+                } else if (tokenizer.isListItem()) {
+                    callback.onListItem(tokenizer);
+                } else if (tokenizer.isComment()) {
+                    callback.onComment(tokenizer);
+                } else if (tokenizer.isHtml()) {
+                    callback.onHtml(tokenizer);
+                } else if (!tokenizer.errors.isEmpty()) {
+                    // Log was already printed....
+                } else {
+                    throw new IllegalStateException("Unknown wiki state: " + tokenizer.token());
+                }
+            }
+        }
+    }
+
+    public List<String> errors() {
+        return errors;
+    }
+
+    public boolean isNewline() {
+        return justReturnedNewline;
+    }
+
+    public void returnToLineStart() {
+        end = start = lastLineStart;
+        justReturnedNewline = true;
+    }
+
+    public boolean isHeading() {
+        return headingWikiText != null;
+    }
+
+    public String headingWikiText() {
+        assert isHeading();
+        return headingWikiText;
+    }
+
+    public int headingDepth() {
+        assert isHeading();
+        return headingDepth;
+    }
+
+    public boolean isMarkup() {
+        return isMarkup;
+    }
+
+    public boolean isComment() {
+        return isComment;
+    }
+
+    public boolean isListItem() {
+        return listPrefixEnd != -1;
+    }
+
+    public String listItemPrefix() {
+        assert isListItem();
+        return wikiText.substring(start, listPrefixEnd);
+    }
+
+    public static String getListTag(char c) {
+        if (c == '#') {
+            return "ol";
+        }
+        return "ul";
+    }
+
+    public String listItemWikiText() {
+        assert isListItem();
+        return wikiText.substring(listPrefixEnd, end);
+    }
+
+    public boolean isFunction() {
+        return isFunction;
+    }
+
+    public String functionName() {
+        assert isFunction();
+        // "{{.."
+        if (firstUnescapedPipePos != -1) {
+            return trimNewlines(wikiText.substring(start + 2, firstUnescapedPipePos).trim());
+        }
+        final int safeEnd = Math.max(start + 2, end - 2);
+        return trimNewlines(wikiText.substring(start + 2, safeEnd).trim());
+    }
+
+    public List<String> functionPositionArgs() {
+        return positionArgs;
+    }
+
+    public Map<String, String> functionNamedArgs() {
+        return namedArgs;
+    }
+
+    public boolean isPlainText() {
+        return isPlainText;
+    }
+
+    public boolean isWikiLink() {
+        return isWikiLink;
+    }
+
+    public String wikiLinkText() {
+        assert isWikiLink();
+        // "[[.."
+        if (lastUnescapedPipePos != -1) {
+            return trimNewlines(wikiText.substring(lastUnescapedPipePos + 1, end - 2));
+        }
+        assert start + 2 < wikiText.length() && end >= 2: wikiText;
+        return trimNewlines(wikiText.substring(start + 2, end - 2));
+    }
+
+    public String wikiLinkDest() {
+        assert isWikiLink();
+        // "[[.."
+        if (firstUnescapedPipePos != -1) {
+            return trimNewlines(wikiText.substring(start + 2, firstUnescapedPipePos));
+        }
+        return null;
+    }
+
+    public boolean isHtml() {
+        return isHtml;
+    }
+
+    public boolean remainderStartsWith(final String prefix) {
+        return wikiText.startsWith(prefix, start);
+    }
+
+    public void nextLine() {
+        final int oldStart = start;
+        while(nextToken() != null && !isNewline()) {}
+        if (isNewline()) {
+            --end;
+        }
+        start = oldStart;
+    }
+
+
+    public WikiTokenizer nextToken() {
+        this.clear();
+
+        start = end;
+
+        if (justReturnedNewline) {
+            lastLineStart = start;
+        }
+
+        try {
+
+            final int len = wikiText.length();
+            if (start >= len) {
+                return null;
+            }
+
+            // Eat a newline if we're looking at one:
+            final boolean atNewline = wikiText.charAt(end) == '\n' || wikiText.charAt(end) == '\u2028';
+            if (atNewline) {
+                justReturnedNewline = true;
+                ++end;
+                return this;
+            }
+
+            if (justReturnedNewline) {
+                justReturnedNewline = false;
+
+                final char firstChar = wikiText.charAt(end);
+                if (firstChar == '=') {
+                    final int headerStart = end;
+                    // Skip ===...
+                    while (++end < len && wikiText.charAt(end) == '=') {}
+                    final int headerTitleStart = end;
+                    headingDepth = headerTitleStart - headerStart;
+                    // Skip non-=...
+                    if (end < len) {
+                        final int nextNewline = safeIndexOf(wikiText, end, "\n", "\n");
+                        final int closingEquals = escapedFindEnd(end, "=");
+                        if (wikiText.charAt(closingEquals - 1) == '=') {
+                            end = closingEquals - 1;
+                        } else {
+                            end = nextNewline;
+                        }
+                    }
+                    final int headerTitleEnd = end;
+                    headingWikiText = wikiText.substring(headerTitleStart, headerTitleEnd);
+                    // Skip ===...
+                    while (end < len && ++end < len && wikiText.charAt(end) == '=') {}
+                    final int headerEnd = end;
+                    if (headerEnd - headerTitleEnd != headingDepth) {
+                        errors.add("Mismatched header depth: " + token());
+                    }
+                    return this;
+                }
+                if (listChars.indexOf(firstChar) != -1) {
+                    while (++end < len && listChars.indexOf(wikiText.charAt(end)) != -1) {}
+                    listPrefixEnd = end;
+                    end = escapedFindEnd(start, "\n");
+                    return this;
+                }
+            }
+
+            if (wikiText.startsWith("'''", start)) {
+                isMarkup = true;
+                end = start + 3;
+                return this;
+            }
+
+            if (wikiText.startsWith("''", start)) {
+                isMarkup = true;
+                end = start + 2;
+                return this;
+            }
+
+            if (wikiText.startsWith("[[", start)) {
+                end = escapedFindEnd(start + 2, "]]");
+                isWikiLink = errors.isEmpty();
+                return this;
+            }
+
+            if (wikiText.startsWith("{{", start)) {
+                end = escapedFindEnd(start + 2, "}}");
+                isFunction = errors.isEmpty();
+                return this;
+            }
+
+            if (wikiText.startsWith("<pre>", start)) {
+                end = safeIndexOf(wikiText, start, "</pre>", "\n");
+                isHtml = true;
+                return this;
+            }
+
+            if (wikiText.startsWith("<ref>", start)) {
+                end = safeIndexOf(wikiText, start, "</ref>", "\n");
+                isHtml = true;
+                return this;
+            }
+
+            if (wikiText.startsWith("<math>", start)) {
+                end = safeIndexOf(wikiText, start, "</math>", "\n");
+                isHtml = true;
+                return this;
+            }
+
+            if (wikiText.startsWith("<!--", start)) {
+                isComment = true;
+                end = safeIndexOf(wikiText, start, "-->", "\n");
+                return this;
+            }
+
+            if (wikiText.startsWith("}}", start) || wikiText.startsWith("]]", start)) {
+                errors.add("Close without open!");
+                end += 2;
+                return this;
+            }
+
+            if (wikiText.charAt(start) == '|' || wikiText.charAt(start) == '=') {
+                isPlainText = true;
+                ++end;
+                return this;
+            }
+
+
+            if (this.matcher.find(start)) {
+                end = this.matcher.start(1);
+                isPlainText = true;
+                if (end == start) {
+                    errors.add("Empty group: " + this.matcher.group());
+                    assert false;
+                }
+                return this;
+            }
+
+            end = wikiText.length();
+            return this;
+
+        } finally {
+            if (!errors.isEmpty()) {
+                System.err.println("Errors: " + errors + ", token=" + token());
+            }
         }
-      }
-    }
-  }
-  
-  public List<String> errors() {
-    return errors;
-  }
-  
-  public boolean isNewline() {
-    return justReturnedNewline;
-  }
-  
-  public void returnToLineStart() {
-    end = start = lastLineStart;
-    justReturnedNewline = true;
-  }
-  
-  public boolean isHeading() {
-    return headingWikiText != null;
-  }
-  
-  public String headingWikiText() {
-    assert isHeading();
-    return headingWikiText;
-  }
-  
-  public int headingDepth() {
-    assert isHeading();
-    return headingDepth;
-  }
-  
-  public boolean isMarkup() {
-    return isMarkup;
-  }
-
-  public boolean isComment() {
-    return isComment;
-  }
-
-  public boolean isListItem() {
-    return listPrefixEnd != -1;
-  }
-  
-  public String listItemPrefix() {
-    assert isListItem();
-    return wikiText.substring(start, listPrefixEnd);
-  }
-  
-  public static String getListTag(char c) {
-    if (c == '#') {
-      return "ol";
-    }
-    return "ul";
-  }
-
-  public String listItemWikiText() {
-    assert isListItem();
-    return wikiText.substring(listPrefixEnd, end);
-  }
-  
-  public boolean isFunction() {
-    return isFunction;
-  }
-
-  public String functionName() {
-    assert isFunction();
-    // "{{.."
-    if (firstUnescapedPipePos != -1) {
-      return trimNewlines(wikiText.substring(start + 2, firstUnescapedPipePos).trim());
-    }
-    final int safeEnd = Math.max(start + 2, end - 2);
-    return trimNewlines(wikiText.substring(start + 2, safeEnd).trim());
-  }
-  
-  public List<String> functionPositionArgs() {
-    return positionArgs;
-  }
-
-  public Map<String, String> functionNamedArgs() {
-    return namedArgs;
-  }
-
-  public boolean isPlainText() {
-    return isPlainText;
-  }
-
-  public boolean isWikiLink() {
-    return isWikiLink;
-  }
-
-  public String wikiLinkText() {
-    assert isWikiLink();
-    // "[[.."
-    if (lastUnescapedPipePos != -1) {
-      return trimNewlines(wikiText.substring(lastUnescapedPipePos + 1, end - 2));
-    }
-    assert start + 2 < wikiText.length() && end >= 2: wikiText;
-    return trimNewlines(wikiText.substring(start + 2, end - 2));
-  }
-
-  public String wikiLinkDest() {
-    assert isWikiLink();
-    // "[[.."
-    if (firstUnescapedPipePos != -1) {
-      return trimNewlines(wikiText.substring(start + 2, firstUnescapedPipePos));
-    }
-    return null;
-  }
-  
-  public boolean isHtml() {
-    return isHtml;
-  }
-
-  public boolean remainderStartsWith(final String prefix) {
-    return wikiText.startsWith(prefix, start);
-  }
-  
-  public void nextLine() {
-    final int oldStart = start;
-    while(nextToken() != null && !isNewline()) {}
-    if (isNewline()) {
-      --end;
-    }
-    start = oldStart;
-  }
-
-  
-  public WikiTokenizer nextToken() {
-    this.clear();
-    
-    start = end;
-    
-    if (justReturnedNewline) {
-      lastLineStart = start;
-    }
-    
-    try {
-    
-    final int len = wikiText.length();
-    if (start >= len) {
-      return null;
-    }
-    
-    // Eat a newline if we're looking at one:
-    final boolean atNewline = wikiText.charAt(end) == '\n' || wikiText.charAt(end) == '\u2028';
-    if (atNewline) {
-      justReturnedNewline = true;
-      ++end;
-      return this;
-    }
-    
-    if (justReturnedNewline) {   
-      justReturnedNewline = false;
-
-      final char firstChar = wikiText.charAt(end);
-      if (firstChar == '=') {
-        final int headerStart = end;
-        // Skip ===...
-        while (++end < len && wikiText.charAt(end) == '=') {}
-        final int headerTitleStart = end;
-        headingDepth = headerTitleStart - headerStart;
-        // Skip non-=...
-        if (end < len) {
-          final int nextNewline = safeIndexOf(wikiText, end, "\n", "\n");
-          final int closingEquals = escapedFindEnd(end, "=");
-          if (wikiText.charAt(closingEquals - 1) == '=') {
-            end = closingEquals - 1;
-          } else {
-            end = nextNewline;
-          }
+
+    }
+
+    public String token() {
+        final String token = wikiText.substring(start, end);
+        assert token.equals("\n") || !token.endsWith("\n") : "token='" + token + "'";
+        return token;
+    }
+
+    final static String[] patterns = { "\n", "{{", "}}", "[[", "]]", "|", "=", "<!--" };
+    private int escapedFindEnd(final int start, final String toFind) {
+        assert tokenStack.isEmpty();
+
+        final boolean insideFunction = toFind.equals("}}");
+
+        int end = start;
+        int firstNewline = -1;
+        int[] nextMatch = new int[8];
+        for (int i = 0; i < 8; ++i) {
+            nextMatch[i] = wikiText.indexOf(patterns[i], start);
+            if (nextMatch[i] == -1) nextMatch[i] = i > 0 ? 0x7fffffff : wikiText.length();
         }
-        final int headerTitleEnd = end;
-        headingWikiText = wikiText.substring(headerTitleStart, headerTitleEnd);
-        // Skip ===...
-        while (end < len && ++end < len && wikiText.charAt(end) == '=') {}
-        final int headerEnd = end;
-        if (headerEnd - headerTitleEnd != headingDepth) {
-          errors.add("Mismatched header depth: " + token());
+        while (end < wikiText.length()) {
+            // Manual replacement for matcher.find(end),
+            // because Java regexp is a ridiculously slow implementation.
+            // Initialize to always match the end.
+            int matchIdx = 0;
+            for (int i = 1; i < 8; ++i) {
+                if (nextMatch[i] < nextMatch[matchIdx]) {
+                    matchIdx = i;
+                }
+            }
+
+            int matchStart = nextMatch[matchIdx];
+            String matchText = patterns[matchIdx];
+            int matchEnd = matchStart + matchText.length();
+            nextMatch[matchIdx] = wikiText.indexOf(patterns[matchIdx], matchEnd);
+            if (nextMatch[matchIdx] == -1) nextMatch[matchIdx] = matchIdx > 0 ? 0x7fffffff : wikiText.length();
+            if (matchIdx == 0) {
+                matchText = "";
+                matchEnd = matchStart;
+            }
+
+            assert matchEnd > end || matchText.length() == 0: "Group=" + matchText;
+            if (matchText.length() == 0) {
+                assert matchStart == wikiText.length() || wikiText.charAt(matchStart) == '\n' : wikiText + ", " + matchStart;
+                if (firstNewline == -1) {
+                    firstNewline = matchEnd;
+                }
+                if (tokenStack.isEmpty() && toFind.equals("\n")) {
+                    return matchStart;
+                }
+                ++end;
+            } else if (tokenStack.isEmpty() && matchText.equals(toFind)) {
+                // The normal return....
+                if (insideFunction) {
+                    addFunctionArg(insideFunction, matchStart);
+                }
+                return matchEnd;
+            } else if (matchText.equals("[[") || matchText.equals("{{")) {
+                tokenStack.add(matchText);
+            } else if (matchText.equals("]]") || matchText.equals("}}")) {
+                if (tokenStack.size() > 0) {
+                    final String removed = tokenStack.remove(tokenStack.size() - 1);
+                    if (removed.equals("{{") && !matchText.equals("}}")) {
+                        errors.add("Unmatched {{ error: " + wikiText.substring(start));
+                        return safeIndexOf(wikiText, start, "\n", "\n");
+                    } else if (removed.equals("[[") && !matchText.equals("]]")) {
+                        errors.add("Unmatched [[ error: " + wikiText.substring(start));
+                        return safeIndexOf(wikiText, start, "\n", "\n");
+                    }
+                } else {
+                    errors.add("Pop too many error: " + wikiText.substring(start).replace("\n", "\\\\n"));
+                    // If we were looking for a newline
+                    return safeIndexOf(wikiText, start, "\n", "\n");
+                }
+            } else if (matchText.equals("|")) {
+                if (tokenStack.isEmpty()) {
+                    addFunctionArg(insideFunction, matchStart);
+                }
+            } else if (matchText.equals("=")) {
+                if (tokenStack.isEmpty()) {
+                    lastUnescapedEqualsPos = matchStart;
+                }
+                // Do nothing.  These can match spuriously, and if it's not the thing
+                // we're looking for, keep on going.
+            } else if (matchText.equals("<!--")) {
+                end = wikiText.indexOf("-->");
+                if (end == -1) {
+                    errors.add("Unmatched <!-- error: " + wikiText.substring(start));
+                    return safeIndexOf(wikiText, start, "\n", "\n");
+                }
+            } else if (matchText.equals("''") || (matchText.startsWith("<") && matchText.endsWith(">"))) {
+                // Don't care.
+            } else {
+                assert false : "Match text='" + matchText + "'";
+                throw new IllegalStateException();
+            }
+
+            // Inside the while loop.  Just go forward.
+            end = Math.max(end, matchEnd);
         }
-        return this;
-      }
-      if (listChars.indexOf(firstChar) != -1) {
-        while (++end < len && listChars.indexOf(wikiText.charAt(end)) != -1) {}
-        listPrefixEnd = end;
-        end = escapedFindEnd(start, "\n");
-        return this;
-      }
-    }
-
-    if (wikiText.startsWith("'''", start)) {
-      isMarkup = true;
-      end = start + 3;
-      return this;
-    }
-    
-    if (wikiText.startsWith("''", start)) {
-      isMarkup = true;
-      end = start + 2;
-      return this;
-    }
-
-    if (wikiText.startsWith("[[", start)) {
-      end = escapedFindEnd(start + 2, "]]");
-      isWikiLink = errors.isEmpty();
-      return this;
-    }
-
-    if (wikiText.startsWith("{{", start)) {      
-      end = escapedFindEnd(start + 2, "}}");
-      isFunction = errors.isEmpty();
-      return this;
-    }
-
-    if (wikiText.startsWith("<pre>", start)) {
-      end = safeIndexOf(wikiText, start, "</pre>", "\n");
-      isHtml = true;
-      return this;
-    }
-
-    if (wikiText.startsWith("<ref>", start)) {
-        end = safeIndexOf(wikiText, start, "</ref>", "\n");
-        isHtml = true;
-        return this;
-      }
-
-    if (wikiText.startsWith("<math>", start)) {
-      end = safeIndexOf(wikiText, start, "</math>", "\n");
-      isHtml = true;
-      return this;
-    }
-
-    if (wikiText.startsWith("<!--", start)) {
-      isComment = true;
-      end = safeIndexOf(wikiText, start, "-->", "\n");
-      return this;
-    }
-
-    if (wikiText.startsWith("}}", start) || wikiText.startsWith("]]", start)) {
-      errors.add("Close without open!");
-      end += 2;
-      return this;
-    }
-
-    if (wikiText.charAt(start) == '|' || wikiText.charAt(start) == '=') {
-      isPlainText = true;
-      ++end;
-      return this;
-    }
-
-    
-    if (this.matcher.find(start)) {
-      end = this.matcher.start(1);
-      isPlainText = true;
-      if (end == start) {
-        errors.add("Empty group: " + this.matcher.group());
-        assert false;
-      }
-      return this;
-    }
-    
-    end = wikiText.length();
-    return this;
-    
-    } finally {
-      if (!errors.isEmpty()) {
-        System.err.println("Errors: " + errors + ", token=" + token());
-      }
-    }
-    
-  }
-  
-  public String token() {
-    final String token = wikiText.substring(start, end);
-    assert token.equals("\n") || !token.endsWith("\n") : "token='" + token + "'";
-    return token;
-  }
-  
-  final static String[] patterns = { "\n", "{{", "}}", "[[", "]]", "|", "=", "<!--" };
-  private int escapedFindEnd(final int start, final String toFind) {
-    assert tokenStack.isEmpty();
-    
-    final boolean insideFunction = toFind.equals("}}");
-    
-    int end = start;
-    int firstNewline = -1;
-    int[] nextMatch = new int[8];
-    for (int i = 0; i < 8; ++i) {
-        nextMatch[i] = wikiText.indexOf(patterns[i], start);
-        if (nextMatch[i] == -1) nextMatch[i] = i > 0 ? 0x7fffffff : wikiText.length();
-    }
-    while (end < wikiText.length()) {
-        // Manual replacement for matcher.find(end),
-        // because Java regexp is a ridiculously slow implementation.
-        // Initialize to always match the end.
-        int matchIdx = 0;
-        for (int i = 1; i < 8; ++i) {
-            if (nextMatch[i] < nextMatch[matchIdx]) {
-                matchIdx = i;
+        if (toFind.equals("\n") && tokenStack.isEmpty()) {
+            // We were looking for the end, we got it.
+            return end;
+        }
+        errors.add("Couldn't find: " + toFind + ", "+ wikiText.substring(start));
+        if (firstNewline != -1) {
+            return firstNewline;
+        }
+        return end;
+    }
+
+    private void addFunctionArg(final boolean insideFunction, final int matchStart) {
+        if (firstUnescapedPipePos == -1) {
+            firstUnescapedPipePos = lastUnescapedPipePos = matchStart;
+        } else if (insideFunction) {
+            if (lastUnescapedEqualsPos > lastUnescapedPipePos) {
+                final String key = wikiText.substring(lastUnescapedPipePos + 1, lastUnescapedEqualsPos);
+                final String value = wikiText.substring(lastUnescapedEqualsPos + 1, matchStart);
+                namedArgs.put(trimNewlines(key), trimNewlines(value));
+            } else {
+                final String value = wikiText.substring(lastUnescapedPipePos + 1, matchStart);
+                positionArgs.add(trimNewlines(value));
             }
         }
+        lastUnescapedPipePos = matchStart;
+    }
 
-        int matchStart = nextMatch[matchIdx];
-        String matchText = patterns[matchIdx];
-        int matchEnd = matchStart + matchText.length();
-        nextMatch[matchIdx] = wikiText.indexOf(patterns[matchIdx], matchEnd);
-        if (nextMatch[matchIdx] == -1) nextMatch[matchIdx] = matchIdx > 0 ? 0x7fffffff : wikiText.length();
-        if (matchIdx == 0) {
-            matchText = "";
-            matchEnd = matchStart;
+    static final String trimNewlines(String s) {
+        while (s.startsWith("\n")) {
+            s = s.substring(1);
+        }
+        while (s.endsWith("\n")) {
+            s = s.substring(0, s.length() - 1);
         }
+        return s.replace('\n', ' ');
+    }
 
-        assert matchEnd > end || matchText.length() == 0: "Group=" + matchText;
-        if (matchText.length() == 0) {
-          assert matchStart == wikiText.length() || wikiText.charAt(matchStart) == '\n' : wikiText + ", " + matchStart;
-          if (firstNewline == -1) {
-            firstNewline = matchEnd;
-          }
-          if (tokenStack.isEmpty() && toFind.equals("\n")) {
-            return matchStart;
-          }
-          ++end;
-        } else if (tokenStack.isEmpty() && matchText.equals(toFind)) {
-          // The normal return....
-          if (insideFunction) {
-            addFunctionArg(insideFunction, matchStart);
-          }
-          return matchEnd;
-        } else if (matchText.equals("[[") || matchText.equals("{{")) {
-          tokenStack.add(matchText);
-        } else if (matchText.equals("]]") || matchText.equals("}}")) {
-          if (tokenStack.size() > 0) {
-            final String removed = tokenStack.remove(tokenStack.size() - 1);
-            if (removed.equals("{{") && !matchText.equals("}}")) {
-              errors.add("Unmatched {{ error: " + wikiText.substring(start));
-              return safeIndexOf(wikiText, start, "\n", "\n");
-            } else if (removed.equals("[[") && !matchText.equals("]]")) {
-              errors.add("Unmatched [[ error: " + wikiText.substring(start));
-              return safeIndexOf(wikiText, start, "\n", "\n");
+    static int safeIndexOf(final String s, final int start, final String target, final String backup) {
+        int close = s.indexOf(target, start);
+        if (close != -1) {
+            // Don't step over a \n.
+            return close + (target.equals("\n") ? 0 : target.length());
+        }
+        close = s.indexOf(backup, start);
+        if (close != -1) {
+            return close + (backup.equals("\n") ? 0 : backup.length());
+        }
+        return s.length();
+    }
+
+    public static String toPlainText(final String wikiText) {
+        final WikiTokenizer wikiTokenizer = new WikiTokenizer(wikiText);
+        final StringBuilder builder = new StringBuilder();
+        while (wikiTokenizer.nextToken() != null) {
+            if (wikiTokenizer.isPlainText()) {
+                builder.append(wikiTokenizer.token());
+            } else if (wikiTokenizer.isWikiLink()) {
+                builder.append(wikiTokenizer.wikiLinkText());
+            } else if (wikiTokenizer.isNewline()) {
+                builder.append("\n");
+            } else if (wikiTokenizer.isFunction()) {
+                builder.append(wikiTokenizer.token());
             }
-          } else {
-            errors.add("Pop too many error: " + wikiText.substring(start).replace("\n", "\\\\n"));
-            // If we were looking for a newline
-            return safeIndexOf(wikiText, start, "\n", "\n");
-          }
-        } else if (matchText.equals("|")) { 
-          if (tokenStack.isEmpty()) {
-            addFunctionArg(insideFunction, matchStart);
-          }
-        } else if (matchText.equals("=")) {
-          if (tokenStack.isEmpty()) {
-            lastUnescapedEqualsPos = matchStart;
-          }
-          // Do nothing.  These can match spuriously, and if it's not the thing
-          // we're looking for, keep on going.
-        } else if (matchText.equals("<!--")) {
-          end = wikiText.indexOf("-->");
-          if (end == -1) {
-            errors.add("Unmatched <!-- error: " + wikiText.substring(start));
-            return safeIndexOf(wikiText, start, "\n", "\n");
-          }
-        } else if (matchText.equals("''") || (matchText.startsWith("<") && matchText.endsWith(">"))) {
-          // Don't care.
-        } else {
-          assert false : "Match text='" + matchText + "'";
-          throw new IllegalStateException();
         }
+        return builder.toString();
+    }
 
-      // Inside the while loop.  Just go forward.
-      end = Math.max(end, matchEnd);
-    }
-    if (toFind.equals("\n") && tokenStack.isEmpty()) {
-      // We were looking for the end, we got it.
-      return end;
-    }
-    errors.add("Couldn't find: " + toFind + ", "+ wikiText.substring(start));
-    if (firstNewline != -1) {
-      return firstNewline;
-    }
-    return end;
-  }
-
-  private void addFunctionArg(final boolean insideFunction, final int matchStart) {
-    if (firstUnescapedPipePos == -1) {
-      firstUnescapedPipePos = lastUnescapedPipePos = matchStart;
-    } else if (insideFunction) {
-      if (lastUnescapedEqualsPos > lastUnescapedPipePos) {
-        final String key = wikiText.substring(lastUnescapedPipePos + 1, lastUnescapedEqualsPos);
-        final String value = wikiText.substring(lastUnescapedEqualsPos + 1, matchStart);
-        namedArgs.put(trimNewlines(key), trimNewlines(value));
-      } else {
-        final String value = wikiText.substring(lastUnescapedPipePos + 1, matchStart);
-        positionArgs.add(trimNewlines(value));
-      }
-    }
-    lastUnescapedPipePos = matchStart;
-  }
-  
-  static final String trimNewlines(String s) {
-    while (s.startsWith("\n")) {
-      s = s.substring(1);
-    }
-    while (s.endsWith("\n")) {
-      s = s.substring(0, s.length() - 1);
-    }
-    return s.replace('\n', ' ');
-  }
-
-  static int safeIndexOf(final String s, final int start, final String target, final String backup) {
-    int close = s.indexOf(target, start);
-    if (close != -1) {
-      // Don't step over a \n.
-      return close + (target.equals("\n") ? 0 : target.length());
-    }
-    close = s.indexOf(backup, start);
-    if (close != -1) {
-      return close + (backup.equals("\n") ? 0 : backup.length());
-    }
-    return s.length();
-  }
-
-  public static String toPlainText(final String wikiText) {
-    final WikiTokenizer wikiTokenizer = new WikiTokenizer(wikiText);
-    final StringBuilder builder = new StringBuilder();
-    while (wikiTokenizer.nextToken() != null) {
-      if (wikiTokenizer.isPlainText()) {
-        builder.append(wikiTokenizer.token());
-      } else if (wikiTokenizer.isWikiLink()) {
-        builder.append(wikiTokenizer.wikiLinkText());
-      } else if (wikiTokenizer.isNewline()) {
-        builder.append("\n");
-      } else if (wikiTokenizer.isFunction()) {
-        builder.append(wikiTokenizer.token());
-      }
-    }
-    return builder.toString();
-  }
-
-  public static StringBuilder appendFunction(final StringBuilder builder, final String name, List<String> args,
-      final Map<String, String> namedArgs) {
-    builder.append(name);
-    for (final String arg : args) {
-      builder.append("|").append(arg);
-    }
-    for (final Map.Entry<String, String> entry : namedArgs.entrySet()) {
-      builder.append("|").append(entry.getKey()).append("=").append(entry.getValue());
-    }
-    return builder;
-  }
+    public static StringBuilder appendFunction(final StringBuilder builder, final String name, List<String> args,
+            final Map<String, String> namedArgs) {
+        builder.append(name);
+        for (final String arg : args) {
+            builder.append("|").append(arg);
+        }
+        for (final Map.Entry<String, String> entry : namedArgs.entrySet()) {
+            builder.append("|").append(entry.getKey()).append("=").append(entry.getValue());
+        }
+        return builder;
+    }
 
 }
index b4999d854601a9c822c0051afb389637d8975273..5193c00892a1f407ced4c994a7e669baf142f204 100644 (file)
@@ -21,321 +21,321 @@ import java.util.List;
 import junit.framework.TestCase;
 
 public class WikiTokenizerTest extends TestCase {
-    
-  public void testWikiLink() {
-    String wikiText;
-    
-    wikiText = "[[abc]]";
-    assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
-    assertTrue(new WikiTokenizer(wikiText).nextToken().isWikiLink());
-    assertEquals("abc", new WikiTokenizer(wikiText).nextToken().wikiLinkText());
-    assertEquals(null, new WikiTokenizer(wikiText).nextToken().wikiLinkDest());
-    
-    wikiText = "[[abc|def]]";
-    assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
-    assertTrue(new WikiTokenizer(wikiText).nextToken().isWikiLink());
-    assertEquals("def", new WikiTokenizer(wikiText).nextToken().wikiLinkText());
-    assertEquals("abc", new WikiTokenizer(wikiText).nextToken().wikiLinkDest());
-
-    wikiText = "[[abc|def|ghi{{a|=2}}p]]";
-    assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
-    assertTrue(new WikiTokenizer(wikiText).nextToken().isWikiLink());
-    assertEquals("ghi{{a|=2}}p", new WikiTokenizer(wikiText).nextToken().wikiLinkText());
-    assertEquals("abc", new WikiTokenizer(wikiText).nextToken().wikiLinkDest());
-
-    wikiText = "[[abc]][[def]]";
-    assertEquals("[[abc]]", new WikiTokenizer(wikiText).nextToken().token());
-    assertEquals("abc", new WikiTokenizer(wikiText).nextToken().wikiLinkText());
-    assertEquals("def", new WikiTokenizer(wikiText).nextToken().nextToken().wikiLinkText());
-
-  }
-  
-  public void testWikiList() {
-    String wikiText;
-
-    wikiText = "* This is ''bold''' asdf.";
-    assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
-
-    wikiText = "* {{a|US}} {{IPA|[ˈfɔɹ.wɝd]]}}\nasdf\n";
-    assertEquals("* {{a|US}} {{IPA|[ˈfɔɹ.wɝd]]}}", new WikiTokenizer(wikiText).nextToken().token());
-    assertTrue(new WikiTokenizer(wikiText).nextToken().isListItem());
-    assertEquals("\n", new WikiTokenizer(wikiText).nextToken().nextToken().token());
-
-    
-    wikiText = "* [[asdf|\u2028" +
-               "asdf]]";
-    assertEquals("* [[asdf|\n" +
-        "asdf]]", new WikiTokenizer(wikiText).nextToken().token());
-    assertTrue(new WikiTokenizer(wikiText).nextToken().isListItem());
-
-  }
-  
-  public void testFunction() {
-    String wikiText;
-
-    {
-    WikiTokenizer wt = new WikiTokenizer("'''Προστατευόμενη Ονομασία Προέλευσης''', \"Protected Designation of Origin\" {{");
-        while (wt.nextToken() != null) {
-            if (wt.isFunction()) {
-                assertEquals("", wt.functionName());
+
+    public void testWikiLink() {
+        String wikiText;
+
+        wikiText = "[[abc]]";
+        assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
+        assertTrue(new WikiTokenizer(wikiText).nextToken().isWikiLink());
+        assertEquals("abc", new WikiTokenizer(wikiText).nextToken().wikiLinkText());
+        assertEquals(null, new WikiTokenizer(wikiText).nextToken().wikiLinkDest());
+
+        wikiText = "[[abc|def]]";
+        assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
+        assertTrue(new WikiTokenizer(wikiText).nextToken().isWikiLink());
+        assertEquals("def", new WikiTokenizer(wikiText).nextToken().wikiLinkText());
+        assertEquals("abc", new WikiTokenizer(wikiText).nextToken().wikiLinkDest());
+
+        wikiText = "[[abc|def|ghi{{a|=2}}p]]";
+        assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
+        assertTrue(new WikiTokenizer(wikiText).nextToken().isWikiLink());
+        assertEquals("ghi{{a|=2}}p", new WikiTokenizer(wikiText).nextToken().wikiLinkText());
+        assertEquals("abc", new WikiTokenizer(wikiText).nextToken().wikiLinkDest());
+
+        wikiText = "[[abc]][[def]]";
+        assertEquals("[[abc]]", new WikiTokenizer(wikiText).nextToken().token());
+        assertEquals("abc", new WikiTokenizer(wikiText).nextToken().wikiLinkText());
+        assertEquals("def", new WikiTokenizer(wikiText).nextToken().nextToken().wikiLinkText());
+
+    }
+
+    public void testWikiList() {
+        String wikiText;
+
+        wikiText = "* This is ''bold''' asdf.";
+        assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
+
+        wikiText = "* {{a|US}} {{IPA|[ˈfɔɹ.wɝd]]}}\nasdf\n";
+        assertEquals("* {{a|US}} {{IPA|[ˈfɔɹ.wɝd]]}}", new WikiTokenizer(wikiText).nextToken().token());
+        assertTrue(new WikiTokenizer(wikiText).nextToken().isListItem());
+        assertEquals("\n", new WikiTokenizer(wikiText).nextToken().nextToken().token());
+
+
+        wikiText = "* [[asdf|\u2028" +
+                   "asdf]]";
+        assertEquals("* [[asdf|\n" +
+                     "asdf]]", new WikiTokenizer(wikiText).nextToken().token());
+        assertTrue(new WikiTokenizer(wikiText).nextToken().isListItem());
+
+    }
+
+    public void testFunction() {
+        String wikiText;
+
+        {
+            WikiTokenizer wt = new WikiTokenizer("'''Προστατευόμενη Ονομασία Προέλευσης''', \"Protected Designation of Origin\" {{");
+            while (wt.nextToken() != null) {
+                if (wt.isFunction()) {
+                    assertEquals("", wt.functionName());
+                }
             }
         }
+
+        wikiText = "{{abc}}";
+        assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
+        assertTrue(new WikiTokenizer(wikiText).nextToken().isFunction());
+        assertEquals("abc", new WikiTokenizer(wikiText).nextToken().functionName());
+        assertEquals(0, new WikiTokenizer(wikiText).nextToken().functionPositionArgs().size());
+        assertEquals(0, new WikiTokenizer(wikiText).nextToken().functionNamedArgs().size());
+
+        wikiText = "{{abc|def}}";
+        assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
+        assertTrue(new WikiTokenizer(wikiText).nextToken().isFunction());
+        assertEquals("abc", new WikiTokenizer(wikiText).nextToken().functionName());
+        assertEquals(Arrays.asList("def"), new WikiTokenizer(wikiText).nextToken().functionPositionArgs());
+        assertEquals(0, new WikiTokenizer(wikiText).nextToken().functionNamedArgs().size());
+
+        wikiText = "{{abc|d[[|]]ef|ghi}}";
+        assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
+        assertTrue(new WikiTokenizer(wikiText).nextToken().isFunction());
+        assertEquals("abc", new WikiTokenizer(wikiText).nextToken().functionName());
+        assertEquals(Arrays.asList("d[[|]]ef", "ghi"), new WikiTokenizer(wikiText).nextToken().functionPositionArgs());
+        assertEquals(0, new WikiTokenizer(wikiText).nextToken().functionNamedArgs().size());
+
+        wikiText = "{{abc|arg1=101|ghi|arg2=202|arg3={{n1|n2=7|n3}}|{{d}}}}";
+        assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
+        assertTrue(new WikiTokenizer(wikiText).nextToken().isFunction());
+        assertEquals("abc", new WikiTokenizer(wikiText).nextToken().functionName());
+        assertEquals(Arrays.asList("ghi", "{{d}}"), new WikiTokenizer(wikiText).nextToken().functionPositionArgs());
+        assertEquals(3, new WikiTokenizer(wikiText).nextToken().functionNamedArgs().size());
+        assertEquals("101", new WikiTokenizer(wikiText).nextToken().functionNamedArgs().get("arg1"));
+        assertEquals("202", new WikiTokenizer(wikiText).nextToken().functionNamedArgs().get("arg2"));
+        assertEquals("{{n1|n2=7|n3}}", new WikiTokenizer(wikiText).nextToken().functionNamedArgs().get("arg3"));
+
+        wikiText = "{{gloss|asdf}\nAsdf\n\n";
+        assertEquals("{{gloss|asdf}", new WikiTokenizer(wikiText).nextToken().token());
+
+        wikiText = "#*{{quote-book|year=1960|author={{w|P. G. Wodehouse}}\n" +
+                   "|title={{w|Jeeves in the Offing}}\n" +
+                   "|section=chapter XI\n" +
+                   "|passage=“I'm sorely beset, Jeeves. Do you recall telling me once about someone who told somebody he could tell him something which would make him think a bit? Knitted socks and porcu\n" +
+                   "pines entered into it, I remember.” “I think you may be referring to the ghost of the father of Hamlet, Prince of Denmark, sir. Addressing his son, he said ‘I could a tale unfold whos\n" +
+                   "e lightest word would harrow up thy soul, freeze thy young blood, make thy two eyes, like stars, start from their spheres, thy knotted and combined locks to part and each particular h\n" +
+                   "air to stand on end like quills upon the fretful '''porpentine'''.’&nbsp;” “That's right. Locks, of course, not socks. Odd that he should have said '''porpentine''' when he meant porc\n" +
+                   "upine. Slip of the tongue, no doubt, as so often happens with ghosts.”}}";
+        assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
+
+
     }
 
-    wikiText = "{{abc}}";
-    assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
-    assertTrue(new WikiTokenizer(wikiText).nextToken().isFunction());
-    assertEquals("abc", new WikiTokenizer(wikiText).nextToken().functionName());
-    assertEquals(0, new WikiTokenizer(wikiText).nextToken().functionPositionArgs().size());
-    assertEquals(0, new WikiTokenizer(wikiText).nextToken().functionNamedArgs().size());
-
-    wikiText = "{{abc|def}}";
-    assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
-    assertTrue(new WikiTokenizer(wikiText).nextToken().isFunction());
-    assertEquals("abc", new WikiTokenizer(wikiText).nextToken().functionName());
-    assertEquals(Arrays.asList("def"), new WikiTokenizer(wikiText).nextToken().functionPositionArgs());
-    assertEquals(0, new WikiTokenizer(wikiText).nextToken().functionNamedArgs().size());
-
-    wikiText = "{{abc|d[[|]]ef|ghi}}";
-    assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
-    assertTrue(new WikiTokenizer(wikiText).nextToken().isFunction());
-    assertEquals("abc", new WikiTokenizer(wikiText).nextToken().functionName());
-    assertEquals(Arrays.asList("d[[|]]ef", "ghi"), new WikiTokenizer(wikiText).nextToken().functionPositionArgs());
-    assertEquals(0, new WikiTokenizer(wikiText).nextToken().functionNamedArgs().size());
-
-    wikiText = "{{abc|arg1=101|ghi|arg2=202|arg3={{n1|n2=7|n3}}|{{d}}}}";
-    assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
-    assertTrue(new WikiTokenizer(wikiText).nextToken().isFunction());
-    assertEquals("abc", new WikiTokenizer(wikiText).nextToken().functionName());
-    assertEquals(Arrays.asList("ghi", "{{d}}"), new WikiTokenizer(wikiText).nextToken().functionPositionArgs());
-    assertEquals(3, new WikiTokenizer(wikiText).nextToken().functionNamedArgs().size());
-    assertEquals("101", new WikiTokenizer(wikiText).nextToken().functionNamedArgs().get("arg1"));
-    assertEquals("202", new WikiTokenizer(wikiText).nextToken().functionNamedArgs().get("arg2"));
-    assertEquals("{{n1|n2=7|n3}}", new WikiTokenizer(wikiText).nextToken().functionNamedArgs().get("arg3"));
-
-    wikiText = "{{gloss|asdf}\nAsdf\n\n";
-    assertEquals("{{gloss|asdf}", new WikiTokenizer(wikiText).nextToken().token());
-
-    wikiText = "#*{{quote-book|year=1960|author={{w|P. G. Wodehouse}}\n" +
-    "|title={{w|Jeeves in the Offing}}\n" +
-    "|section=chapter XI\n" +
-    "|passage=“I'm sorely beset, Jeeves. Do you recall telling me once about someone who told somebody he could tell him something which would make him think a bit? Knitted socks and porcu\n" +
-    "pines entered into it, I remember.” “I think you may be referring to the ghost of the father of Hamlet, Prince of Denmark, sir. Addressing his son, he said ‘I could a tale unfold whos\n" +
-    "e lightest word would harrow up thy soul, freeze thy young blood, make thy two eyes, like stars, start from their spheres, thy knotted and combined locks to part and each particular h\n" +
-    "air to stand on end like quills upon the fretful '''porpentine'''.’&nbsp;” “That's right. Locks, of course, not socks. Odd that he should have said '''porpentine''' when he meant porc\n" +
-    "upine. Slip of the tongue, no doubt, as so often happens with ghosts.”}}";
-    assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
-
-    
-  }
-  
-  public void testReturn() {
-    String wikiText;
-
-    wikiText = "hello\n=Heading=\nhello2";
-    
-    final WikiTokenizer tokenizer = new WikiTokenizer(wikiText);
-    
-    assertEquals("hello", tokenizer.nextToken().token());
-    tokenizer.returnToLineStart();
-    assertEquals("hello", tokenizer.nextToken().token());
-    assertEquals("\n", tokenizer.nextToken().token());
-    tokenizer.returnToLineStart();
-    assertEquals("hello", tokenizer.nextToken().token());
-    assertEquals("\n", tokenizer.nextToken().token());
-    
-    assertEquals("=Heading=", tokenizer.nextToken().token());
-    tokenizer.returnToLineStart();
-    assertEquals("=Heading=", tokenizer.nextToken().token());
-    assertEquals("\n", tokenizer.nextToken().token());
-    tokenizer.returnToLineStart();
-    assertEquals("=Heading=", tokenizer.nextToken().token());
-    assertEquals("\n", tokenizer.nextToken().token());
-
-    assertEquals("hello2", tokenizer.nextToken().token());
-    assertEquals(null, tokenizer.nextToken());
-    tokenizer.returnToLineStart();
-    assertEquals("hello2", tokenizer.nextToken().token());
-    assertEquals(null, tokenizer.nextToken());
-    
-    
-  }
-
-  public void testWikiHeading() {
-    String wikiText;
-
-    wikiText = "==";
-    assertEquals("==", new WikiTokenizer(wikiText).nextToken().token());
-    assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading());
-    assertEquals(2, new WikiTokenizer(wikiText).nextToken().headingDepth());
-    assertEquals("", new WikiTokenizer(wikiText).nextToken().headingWikiText());
-    assertEquals(1, new WikiTokenizer(wikiText).nextToken().errors.size());
-
-    
-    wikiText = "=a";
-    assertEquals("=a", new WikiTokenizer(wikiText).nextToken().token());
-    assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading());
-    assertEquals(1, new WikiTokenizer(wikiText).nextToken().headingDepth());
-    assertEquals("a", new WikiTokenizer(wikiText).nextToken().headingWikiText());
-    assertEquals(2, new WikiTokenizer(wikiText).nextToken().errors.size());
-
-    wikiText = "=a==";
-    assertEquals("=a==", new WikiTokenizer(wikiText).nextToken().token());
-    assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading());
-    assertEquals(1, new WikiTokenizer(wikiText).nextToken().headingDepth());
-    assertEquals("a", new WikiTokenizer(wikiText).nextToken().headingWikiText());
-    assertEquals(1, new WikiTokenizer(wikiText).nextToken().errors.size());
-
-    wikiText = "a=";
-    assertEquals("a", new WikiTokenizer(wikiText).nextToken().token());
-    assertFalse(new WikiTokenizer(wikiText).nextToken().isHeading());
-
-    wikiText = "=a=";
-    assertEquals("=a=", new WikiTokenizer(wikiText).nextToken().token());
-    assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading());
-    assertEquals(1, new WikiTokenizer(wikiText).nextToken().headingDepth());
-    assertEquals("a", new WikiTokenizer(wikiText).nextToken().headingWikiText());
-    assertEquals(0, new WikiTokenizer(wikiText).nextToken().errors.size());
-
-    wikiText = "==aa[[|=]] {{|={{=}} }}==";
-    assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
-    assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading());
-    assertEquals(2, new WikiTokenizer(wikiText).nextToken().headingDepth());
-    assertEquals("aa[[|=]] {{|={{=}} }}", new WikiTokenizer(wikiText).nextToken().headingWikiText());
-    assertEquals(0, new WikiTokenizer(wikiText).nextToken().errors.size());
-    
-  }
-
-  
-
-  public void testSimple() {
-    final String wikiText =
-      "Hi" + "\n" +
-      "Hello =thad| you're <!-- not --> '''pretty''' cool '''''over''''' there." + "\n" +
-      "hi <!--" + "\n" +
-      "multi-line" + "\n" +
-      "# comment -->" + "\n" +
-      "" + "\n" +
-      "asdf\n" +
-      "{{template_not_in_list}}" + "\n" +
-      "# {{template_in_list}}" + "\n" +
-      "[[wikitext]]:[[wikitext]]" + "\n" +  // don't want this to trigger a list
-      ": but this is a list!" + "\n" +
-      "*:* and so is this :::" + "\n" +
-      "here's [[some blah|some]] wikitext." + "\n" +
-      "here's a {{template|this has an = sign|blah=2|blah2=3|" + "\n" +
-      "blah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}} and some more text." + "\n" +
-      "== Header 2 ==" + "\n" +
-      "{{some-func|blah={{nested-func|n2}}|blah2=asdf}}" + "\n" +
-      "{{mismatched]]" + "\n" +
-      "[[mismatched}}" + "\n" +
-      "{extraterminated}}" + "\n" +
-      "[extraterminated]]" + "\n" +
-      "=== {{header-template}} ===" + "\n";
-    
-    final String[] expectedTokens = new String[] {
-        "Hi",
-        "\n",
-        "Hello ",
-        "=",
-        "thad",
-        "|",
-        " you're ",
-        "<!-- not -->",
-        " ",
-        "'''",
-        "pretty",
-        "'''",
-        " cool ",
-        "'''",
-        "''",
-        "over",
-        "'''",
-        "''",
-        " there.",
-        "\n",
-        "hi ",
-        "<!--\nmulti-line\n# comment -->",
-        "\n",
-        "\n",
-        "asdf",
-        "\n",
-        "{{template_not_in_list}}",
-        "\n",
-        "# {{template_in_list}}",
-        "\n",
-        "[[wikitext]]",
-        ":",
-        "[[wikitext]]",
-        "\n",
-        ": but this is a list!",
-        "\n",
-        "*:* and so is this :::",
-        "\n",
-        "here's ",
-        "[[some blah|some]]",
-        " wikitext.",
-        "\n",
-        "here's a ",
-        "{{template|this has an = sign|blah=2|blah2=3|\nblah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}}",
-        " and some more text.",
-        "\n",
-        "== Header 2 ==",
-        "\n",
-        "{{some-func|blah={{nested-func|n2}}|blah2=asdf}}",
-        "\n",
-        "{{mismatched]]",
-        "\n",
-        "[[mismatched}}",
-        "\n",
-        "{extraterminated",
-        "}}",
-        "\n",
-        "[extraterminated",
-        "]]",
-        "\n",
-        "=== {{header-template}} ===",
-        "\n",
+    public void testReturn() {
+        String wikiText;
+
+        wikiText = "hello\n=Heading=\nhello2";
+
+        final WikiTokenizer tokenizer = new WikiTokenizer(wikiText);
+
+        assertEquals("hello", tokenizer.nextToken().token());
+        tokenizer.returnToLineStart();
+        assertEquals("hello", tokenizer.nextToken().token());
+        assertEquals("\n", tokenizer.nextToken().token());
+        tokenizer.returnToLineStart();
+        assertEquals("hello", tokenizer.nextToken().token());
+        assertEquals("\n", tokenizer.nextToken().token());
+
+        assertEquals("=Heading=", tokenizer.nextToken().token());
+        tokenizer.returnToLineStart();
+        assertEquals("=Heading=", tokenizer.nextToken().token());
+        assertEquals("\n", tokenizer.nextToken().token());
+        tokenizer.returnToLineStart();
+        assertEquals("=Heading=", tokenizer.nextToken().token());
+        assertEquals("\n", tokenizer.nextToken().token());
+
+        assertEquals("hello2", tokenizer.nextToken().token());
+        assertEquals(null, tokenizer.nextToken());
+        tokenizer.returnToLineStart();
+        assertEquals("hello2", tokenizer.nextToken().token());
+        assertEquals(null, tokenizer.nextToken());
+
+
+    }
+
+    public void testWikiHeading() {
+        String wikiText;
+
+        wikiText = "==";
+        assertEquals("==", new WikiTokenizer(wikiText).nextToken().token());
+        assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading());
+        assertEquals(2, new WikiTokenizer(wikiText).nextToken().headingDepth());
+        assertEquals("", new WikiTokenizer(wikiText).nextToken().headingWikiText());
+        assertEquals(1, new WikiTokenizer(wikiText).nextToken().errors.size());
+
+
+        wikiText = "=a";
+        assertEquals("=a", new WikiTokenizer(wikiText).nextToken().token());
+        assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading());
+        assertEquals(1, new WikiTokenizer(wikiText).nextToken().headingDepth());
+        assertEquals("a", new WikiTokenizer(wikiText).nextToken().headingWikiText());
+        assertEquals(2, new WikiTokenizer(wikiText).nextToken().errors.size());
+
+        wikiText = "=a==";
+        assertEquals("=a==", new WikiTokenizer(wikiText).nextToken().token());
+        assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading());
+        assertEquals(1, new WikiTokenizer(wikiText).nextToken().headingDepth());
+        assertEquals("a", new WikiTokenizer(wikiText).nextToken().headingWikiText());
+        assertEquals(1, new WikiTokenizer(wikiText).nextToken().errors.size());
+
+        wikiText = "a=";
+        assertEquals("a", new WikiTokenizer(wikiText).nextToken().token());
+        assertFalse(new WikiTokenizer(wikiText).nextToken().isHeading());
+
+        wikiText = "=a=";
+        assertEquals("=a=", new WikiTokenizer(wikiText).nextToken().token());
+        assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading());
+        assertEquals(1, new WikiTokenizer(wikiText).nextToken().headingDepth());
+        assertEquals("a", new WikiTokenizer(wikiText).nextToken().headingWikiText());
+        assertEquals(0, new WikiTokenizer(wikiText).nextToken().errors.size());
+
+        wikiText = "==aa[[|=]] {{|={{=}} }}==";
+        assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
+        assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading());
+        assertEquals(2, new WikiTokenizer(wikiText).nextToken().headingDepth());
+        assertEquals("aa[[|=]] {{|={{=}} }}", new WikiTokenizer(wikiText).nextToken().headingWikiText());
+        assertEquals(0, new WikiTokenizer(wikiText).nextToken().errors.size());
+
+    }
+
+
+
+    public void testSimple() {
+        final String wikiText =
+            "Hi" + "\n" +
+            "Hello =thad| you're <!-- not --> '''pretty''' cool '''''over''''' there." + "\n" +
+            "hi <!--" + "\n" +
+            "multi-line" + "\n" +
+            "# comment -->" + "\n" +
+            "" + "\n" +
+            "asdf\n" +
+            "{{template_not_in_list}}" + "\n" +
+            "# {{template_in_list}}" + "\n" +
+            "[[wikitext]]:[[wikitext]]" + "\n" +  // don't want this to trigger a list
+            ": but this is a list!" + "\n" +
+            "*:* and so is this :::" + "\n" +
+            "here's [[some blah|some]] wikitext." + "\n" +
+            "here's a {{template|this has an = sign|blah=2|blah2=3|" + "\n" +
+            "blah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}} and some more text." + "\n" +
+            "== Header 2 ==" + "\n" +
+            "{{some-func|blah={{nested-func|n2}}|blah2=asdf}}" + "\n" +
+            "{{mismatched]]" + "\n" +
+            "[[mismatched}}" + "\n" +
+            "{extraterminated}}" + "\n" +
+            "[extraterminated]]" + "\n" +
+            "=== {{header-template}} ===" + "\n";
+
+        final String[] expectedTokens = new String[] {
+            "Hi",
+            "\n",
+            "Hello ",
+            "=",
+            "thad",
+            "|",
+            " you're ",
+            "<!-- not -->",
+            " ",
+            "'''",
+            "pretty",
+            "'''",
+            " cool ",
+            "'''",
+            "''",
+            "over",
+            "'''",
+            "''",
+            " there.",
+            "\n",
+            "hi ",
+            "<!--\nmulti-line\n# comment -->",
+            "\n",
+            "\n",
+            "asdf",
+            "\n",
+            "{{template_not_in_list}}",
+            "\n",
+            "# {{template_in_list}}",
+            "\n",
+            "[[wikitext]]",
+            ":",
+            "[[wikitext]]",
+            "\n",
+            ": but this is a list!",
+            "\n",
+            "*:* and so is this :::",
+            "\n",
+            "here's ",
+            "[[some blah|some]]",
+            " wikitext.",
+            "\n",
+            "here's a ",
+            "{{template|this has an = sign|blah=2|blah2=3|\nblah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}}",
+            " and some more text.",
+            "\n",
+            "== Header 2 ==",
+            "\n",
+            "{{some-func|blah={{nested-func|n2}}|blah2=asdf}}",
+            "\n",
+            "{{mismatched]]",
+            "\n",
+            "[[mismatched}}",
+            "\n",
+            "{extraterminated",
+            "}}",
+            "\n",
+            "[extraterminated",
+            "]]",
+            "\n",
+            "=== {{header-template}} ===",
+            "\n",
         };
-    
-    final List<String> actualTokens = new ArrayList<String>();
-    
-    final WikiTokenizer wikiTokenizer = new WikiTokenizer(wikiText);
-    WikiTokenizer token;
-    int i = 0;
-    while ((token = wikiTokenizer.nextToken()) != null) {
-      actualTokens.add(token.token());
-      System.out.println("\"" + token.token().replace("\n", "\\n") + "\",");
-      assertEquals(expectedTokens[i++], token.token());
+
+        final List<String> actualTokens = new ArrayList<String>();
+
+        final WikiTokenizer wikiTokenizer = new WikiTokenizer(wikiText);
+        WikiTokenizer token;
+        int i = 0;
+        while ((token = wikiTokenizer.nextToken()) != null) {
+            actualTokens.add(token.token());
+            System.out.println("\"" + token.token().replace("\n", "\\n") + "\",");
+            assertEquals(expectedTokens[i++], token.token());
+        }
+        assertEquals(Arrays.asList(expectedTokens), actualTokens);
     }
-    assertEquals(Arrays.asList(expectedTokens), actualTokens);
-  }
-  
-  public void testHtml() {
-      String wikiText;
-
-      {
-      wikiText = " zz <pre> asdf </pre> ZZ <math> 1234 </math> XX ";
-      final WikiTokenizer tokenizer = new WikiTokenizer(wikiText);
-      assertEquals(" zz ", tokenizer.nextToken().token());
-      assertEquals("<pre> asdf </pre>", tokenizer.nextToken().token());
-      assertEquals(" ZZ ", tokenizer.nextToken().token());
-      assertEquals("<math> 1234 </math>", tokenizer.nextToken().token());
-      assertEquals(" XX ", tokenizer.nextToken().token());
-      }
-      {
-      wikiText = "\n<math> 1234 </math>";
-      final WikiTokenizer tokenizer = new WikiTokenizer(wikiText);
-      assertEquals("<math> 1234 </math>", tokenizer.nextToken().nextToken().token());
-      }
-
-      {
-      wikiText = "# z'' is the '''free''' variable in \"<math>\\forall x\\exists y:xy=z</math>\".''";
-      final WikiTokenizer tokenizer = new WikiTokenizer(wikiText);
-      assertEquals(wikiText, tokenizer.nextToken().token());
-      }
-
-      
-  }
-  
+
+    public void testHtml() {
+        String wikiText;
+
+        {
+            wikiText = " zz <pre> asdf </pre> ZZ <math> 1234 </math> XX ";
+            final WikiTokenizer tokenizer = new WikiTokenizer(wikiText);
+            assertEquals(" zz ", tokenizer.nextToken().token());
+            assertEquals("<pre> asdf </pre>", tokenizer.nextToken().token());
+            assertEquals(" ZZ ", tokenizer.nextToken().token());
+            assertEquals("<math> 1234 </math>", tokenizer.nextToken().token());
+            assertEquals(" XX ", tokenizer.nextToken().token());
+        }
+        {
+            wikiText = "\n<math> 1234 </math>";
+            final WikiTokenizer tokenizer = new WikiTokenizer(wikiText);
+            assertEquals("<math> 1234 </math>", tokenizer.nextToken().nextToken().token());
+        }
+
+        {
+            wikiText = "# z'' is the '''free''' variable in \"<math>\\forall x\\exists y:xy=z</math>\".''";
+            final WikiTokenizer tokenizer = new WikiTokenizer(wikiText);
+            assertEquals(wikiText, tokenizer.nextToken().token());
+        }
+
+
+    }
+
 }
index ea60658164495725cbc8c887c3c02c7d98075325..b77c341649d5f63a0a11ef9a8f1fb1fa1d71fee1 100644 (file)
@@ -42,242 +42,242 @@ import com.hughes.util.EnumUtil;
 
 public abstract class AbstractWiktionaryParser implements Parser {
 
-  static final Logger LOG = Logger.getLogger("WiktionaryParser");
-
-  final SortedMap<String, AtomicInteger> counters = new TreeMap<String, AtomicInteger>();
-  final Set<String> pairsAdded = new LinkedHashSet<String>();
-  
-  public EntrySource entrySource;
-  public String title;
-
-
-  abstract void parseSection(final String heading, final String text);
-  
-  abstract void removeUselessArgs(final Map<String, String> namedArgs);
-  
-  @Override
-  public void parse(final File file, final EntrySource entrySource, final int pageLimit) throws IOException {
-    this.entrySource = entrySource;
-    int pageCount = 0;
-    final DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(file)));
-    try {
-    while (true) {
-      if (pageLimit >= 0 && pageCount >= pageLimit) {
-        return;
-      }
-      
-      try {
-        title = dis.readUTF();
-      } catch (EOFException e) {
-        LOG.log(Level.INFO, "EOF reading split.");
-        dis.close();
-        return;
-      }
-      final String heading = dis.readUTF();
-      final int bytesLength = dis.readInt();
-      final byte[] bytes = new byte[bytesLength];
-      dis.readFully(bytes);
-      final String text = new String(bytes, "UTF8");
-      
-      parseSection(heading, text);
-
-      ++pageCount;
-      if (pageCount % 1000 == 0) {
-        LOG.info("pageCount=" + pageCount);
-      }
-    }
-    } finally {
-      dis.close();
-      LOG.info("***COUNTERS***");
-      for (final Map.Entry<String, AtomicInteger> entry : counters.entrySet()) {
-        LOG.info(entry.getKey() + ": " + entry.getValue());
-      }
-    }
-  }
-  
-  static final Pattern whitespace = Pattern.compile("\\s+");
-  static String trim(final String s) {
-    return whitespace.matcher(s).replaceAll(" ").trim();
-  }
-
-  public void incrementCount(final String string) {
-    AtomicInteger counter = counters.get(string);
-    if (counter == null) {
-      counter = new AtomicInteger();
-      counters.put(string, counter);
-    }
-    counter.incrementAndGet();
-  }
-  
-  public void addLinkToCurrentEntry(final String token, final String lang, final EntryTypeName entryTypeName) {
-      assert false : token + ", title=" + title;
-  }
-
-  
-  // -------------------------------------------------------------------------
-  
-  static class AppendAndIndexWikiCallback<T extends AbstractWiktionaryParser> implements WikiTokenizer.Callback {
-
-    final T parser;
-    StringBuilder builder;
-    IndexedEntry indexedEntry;
-    IndexBuilder indexBuilder;
-    final Map<String,FunctionCallback<T>> functionCallbacks = new LinkedHashMap<String, FunctionCallback<T>>();
-    
-    boolean entryTypeNameSticks = false;
-    EntryTypeName entryTypeName = null;
-    
-    final Map<String,AtomicInteger> langCodeToTCount = new LinkedHashMap<String, AtomicInteger>();
-    
-    final NameAndArgs<T> nameAndArgs = new NameAndArgs<T>();
-    
-    public AppendAndIndexWikiCallback(final T parser) {
-      this.parser = parser;
-    }
-    
-    public void reset(final StringBuilder builder, final IndexedEntry indexedEntry) {
-      this.builder = builder;
-      this.indexedEntry = indexedEntry;
-      this.indexBuilder = null;
-      entryTypeName = null;
-      entryTypeNameSticks = false;
-    }
-    
-    public void dispatch(final String wikiText, final IndexBuilder indexBuilder, final EntryTypeName entryTypeName) {
-      final IndexBuilder oldIndexBuilder = this.indexBuilder;
-      final EntryTypeName oldEntryTypeName = this.entryTypeName;
-      this.indexBuilder = indexBuilder;
-      if (!entryTypeNameSticks) {
-        this.entryTypeName = EnumUtil.min(entryTypeName, this.entryTypeName);
-      }
-      if (entryTypeName == null) this.entryTypeName = null;
-      WikiTokenizer.dispatch(wikiText, false, this);
-      this.indexBuilder = oldIndexBuilder;
-      this.entryTypeName = oldEntryTypeName;
-    }
-    
-    public String dispatch(final String wikiText, final EntryTypeName entryTypeName) {
-      final int start = builder.length();
-      dispatch(wikiText, this.indexBuilder, entryTypeName);
-      return builder.substring(start);
-    }
+    static final Logger LOG = Logger.getLogger("WiktionaryParser");
 
-    @Override
-    public void onPlainText(final String plainText) {
-      // The only non-recursive callback.  Just appends to the builder, and indexes.
-      builder.append(plainText);
-      if (indexBuilder != null && entryTypeName != null && indexedEntry != null) {
-        indexBuilder.addEntryWithString(indexedEntry, plainText, entryTypeName);
-      }
-    }
+    final SortedMap<String, AtomicInteger> counters = new TreeMap<String, AtomicInteger>();
+    final Set<String> pairsAdded = new LinkedHashSet<String>();
 
-    @Override
-    public void onWikiLink(WikiTokenizer wikiTokenizer) {
-      final String text = wikiTokenizer.wikiLinkText();
-      @SuppressWarnings("unused")
-      final String link = wikiTokenizer.wikiLinkDest();
-      dispatch(text, entryTypeName);
-    }
+    public EntrySource entrySource;
+    public String title;
 
-    @Override
-    public void onFunction(
-        final WikiTokenizer wikiTokenizer,
-        final String name,
-        final List<String> args, 
-        final Map<String, String> namedArgs) {
-      
-      FunctionCallback<T> functionCallback = functionCallbacks.get(name);
-      if (functionCallback == null || !functionCallback.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, this)) {
-        // Default function handling:
-        parser.removeUselessArgs(namedArgs);
-        final boolean single = args.isEmpty() && namedArgs.isEmpty();
-        builder.append(single ? "{" : "{{");
-
-        final IndexBuilder oldIndexBuilder = indexBuilder;
-        indexBuilder = null;
-        nameAndArgs.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, this);
-        indexBuilder = oldIndexBuilder;
-
-        builder.append(single ? "}" : "}}");
-      }
-    }
-    
-    @Override
-    public void onHtml(WikiTokenizer wikiTokenizer) {
-      if (wikiTokenizer.token().startsWith("<ref>")) {
-          // Do nothing.
-          return;
-      }
-      // Unindexed for now.
-      builder.append(wikiTokenizer.token());
-    }
+
+    abstract void parseSection(final String heading, final String text);
+
+    abstract void removeUselessArgs(final Map<String, String> namedArgs);
 
     @Override
-    public void onMarkup(WikiTokenizer wikiTokenizer) {
-      // Do nothing.
+    public void parse(final File file, final EntrySource entrySource, final int pageLimit) throws IOException {
+        this.entrySource = entrySource;
+        int pageCount = 0;
+        final DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(file)));
+        try {
+            while (true) {
+                if (pageLimit >= 0 && pageCount >= pageLimit) {
+                    return;
+                }
+
+                try {
+                    title = dis.readUTF();
+                } catch (EOFException e) {
+                    LOG.log(Level.INFO, "EOF reading split.");
+                    dis.close();
+                    return;
+                }
+                final String heading = dis.readUTF();
+                final int bytesLength = dis.readInt();
+                final byte[] bytes = new byte[bytesLength];
+                dis.readFully(bytes);
+                final String text = new String(bytes, "UTF8");
+
+                parseSection(heading, text);
+
+                ++pageCount;
+                if (pageCount % 1000 == 0) {
+                    LOG.info("pageCount=" + pageCount);
+                }
+            }
+        } finally {
+            dis.close();
+            LOG.info("***COUNTERS***");
+            for (final Map.Entry<String, AtomicInteger> entry : counters.entrySet()) {
+                LOG.info(entry.getKey() + ": " + entry.getValue());
+            }
+        }
     }
 
-    @Override
-    public final void onComment(WikiTokenizer wikiTokenizer) {
-      // Do nothing.
+    static final Pattern whitespace = Pattern.compile("\\s+");
+    static String trim(final String s) {
+        return whitespace.matcher(s).replaceAll(" ").trim();
     }
 
-    @Override
-    public void onNewline(WikiTokenizer wikiTokenizer) {
-      assert false;
+    public void incrementCount(final String string) {
+        AtomicInteger counter = counters.get(string);
+        if (counter == null) {
+            counter = new AtomicInteger();
+            counters.put(string, counter);
+        }
+        counter.incrementAndGet();
     }
 
-    @Override
-    public void onHeading(WikiTokenizer wikiTokenizer) {
-      assert false;
+    public void addLinkToCurrentEntry(final String token, final String lang, final EntryTypeName entryTypeName) {
+        assert false : token + ", title=" + title;
     }
 
-    @Override
-    public void onListItem(WikiTokenizer wikiTokenizer) {
-      assert false;
+
+    // -------------------------------------------------------------------------
+
+    static class AppendAndIndexWikiCallback<T extends AbstractWiktionaryParser> implements WikiTokenizer.Callback {
+
+        final T parser;
+        StringBuilder builder;
+        IndexedEntry indexedEntry;
+        IndexBuilder indexBuilder;
+        final Map<String,FunctionCallback<T>> functionCallbacks = new LinkedHashMap<String, FunctionCallback<T>>();
+
+        boolean entryTypeNameSticks = false;
+        EntryTypeName entryTypeName = null;
+
+        final Map<String,AtomicInteger> langCodeToTCount = new LinkedHashMap<String, AtomicInteger>();
+
+        final NameAndArgs<T> nameAndArgs = new NameAndArgs<T>();
+
+        public AppendAndIndexWikiCallback(final T parser) {
+            this.parser = parser;
+        }
+
+        public void reset(final StringBuilder builder, final IndexedEntry indexedEntry) {
+            this.builder = builder;
+            this.indexedEntry = indexedEntry;
+            this.indexBuilder = null;
+            entryTypeName = null;
+            entryTypeNameSticks = false;
+        }
+
+        public void dispatch(final String wikiText, final IndexBuilder indexBuilder, final EntryTypeName entryTypeName) {
+            final IndexBuilder oldIndexBuilder = this.indexBuilder;
+            final EntryTypeName oldEntryTypeName = this.entryTypeName;
+            this.indexBuilder = indexBuilder;
+            if (!entryTypeNameSticks) {
+                this.entryTypeName = EnumUtil.min(entryTypeName, this.entryTypeName);
+            }
+            if (entryTypeName == null) this.entryTypeName = null;
+            WikiTokenizer.dispatch(wikiText, false, this);
+            this.indexBuilder = oldIndexBuilder;
+            this.entryTypeName = oldEntryTypeName;
+        }
+
+        public String dispatch(final String wikiText, final EntryTypeName entryTypeName) {
+            final int start = builder.length();
+            dispatch(wikiText, this.indexBuilder, entryTypeName);
+            return builder.substring(start);
+        }
+
+        @Override
+        public void onPlainText(final String plainText) {
+            // The only non-recursive callback.  Just appends to the builder, and indexes.
+            builder.append(plainText);
+            if (indexBuilder != null && entryTypeName != null && indexedEntry != null) {
+                indexBuilder.addEntryWithString(indexedEntry, plainText, entryTypeName);
+            }
+        }
+
+        @Override
+        public void onWikiLink(WikiTokenizer wikiTokenizer) {
+            final String text = wikiTokenizer.wikiLinkText();
+            @SuppressWarnings("unused")
+            final String link = wikiTokenizer.wikiLinkDest();
+            dispatch(text, entryTypeName);
+        }
+
+        @Override
+        public void onFunction(
+            final WikiTokenizer wikiTokenizer,
+            final String name,
+            final List<String> args,
+            final Map<String, String> namedArgs) {
+
+            FunctionCallback<T> functionCallback = functionCallbacks.get(name);
+            if (functionCallback == null || !functionCallback.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, this)) {
+                // Default function handling:
+                parser.removeUselessArgs(namedArgs);
+                final boolean single = args.isEmpty() && namedArgs.isEmpty();
+                builder.append(single ? "{" : "{{");
+
+                final IndexBuilder oldIndexBuilder = indexBuilder;
+                indexBuilder = null;
+                nameAndArgs.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, this);
+                indexBuilder = oldIndexBuilder;
+
+                builder.append(single ? "}" : "}}");
+            }
+        }
+
+        @Override
+        public void onHtml(WikiTokenizer wikiTokenizer) {
+            if (wikiTokenizer.token().startsWith("<ref>")) {
+                // Do nothing.
+                return;
+            }
+            // Unindexed for now.
+            builder.append(wikiTokenizer.token());
+        }
+
+        @Override
+        public void onMarkup(WikiTokenizer wikiTokenizer) {
+            // Do nothing.
+        }
+
+        @Override
+        public final void onComment(WikiTokenizer wikiTokenizer) {
+            // Do nothing.
+        }
+
+        @Override
+        public void onNewline(WikiTokenizer wikiTokenizer) {
+            assert false;
+        }
+
+        @Override
+        public void onHeading(WikiTokenizer wikiTokenizer) {
+            assert false;
+        }
+
+        @Override
+        public void onListItem(WikiTokenizer wikiTokenizer) {
+            assert false;
+        }
+
     }
 
-  }
-  
-  // --------------------------------------------------------------------
-  
-  static final class NameAndArgs<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
-    @Override
-    public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-        final Map<String, String> namedArgs, final T parser,
-        final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
-      
-      if (name != null) {
-        appendAndIndexWikiCallback.dispatch(name, null);
-      }
-      for (int i = 0; i < args.size(); ++i) {
-        if (args.get(i).length() > 0) {
-          appendAndIndexWikiCallback.builder.append("|");
-          appendAndIndexWikiCallback.dispatch(args.get(i), null, null);
+    // --------------------------------------------------------------------
+
+    static final class NameAndArgs<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
+        @Override
+        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
+                                      final Map<String, String> namedArgs, final T parser,
+                                      final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
+
+            if (name != null) {
+                appendAndIndexWikiCallback.dispatch(name, null);
+            }
+            for (int i = 0; i < args.size(); ++i) {
+                if (args.get(i).length() > 0) {
+                    appendAndIndexWikiCallback.builder.append("|");
+                    appendAndIndexWikiCallback.dispatch(args.get(i), null, null);
+                }
+            }
+            appendNamedArgs(namedArgs, appendAndIndexWikiCallback);
+            return true;
         }
-      }
-      appendNamedArgs(namedArgs, appendAndIndexWikiCallback);
-      return true;
     }
-  }
-  static NameAndArgs<AbstractWiktionaryParser> NAME_AND_ARGS = new NameAndArgs<AbstractWiktionaryParser>();
-
-  static void appendNamedArgs(final Map<String, String> namedArgs,
-      final AppendAndIndexWikiCallback<?> appendAndIndexWikiCallback) {
-    for (final Map.Entry<String, String> entry : namedArgs.entrySet()) {
-      appendAndIndexWikiCallback.builder.append("|");
-      appendAndIndexWikiCallback.dispatch(entry.getKey(), null, null);
-      appendAndIndexWikiCallback.builder.append("=");
-      EntryTypeName entryTypeName = null;
-      IndexBuilder indexBuilder = null;
-      // This doesn't work: we'd need to add to word-forms.
+    static NameAndArgs<AbstractWiktionaryParser> NAME_AND_ARGS = new NameAndArgs<AbstractWiktionaryParser>();
+
+    static void appendNamedArgs(final Map<String, String> namedArgs,
+                                final AppendAndIndexWikiCallback<?> appendAndIndexWikiCallback) {
+        for (final Map.Entry<String, String> entry : namedArgs.entrySet()) {
+            appendAndIndexWikiCallback.builder.append("|");
+            appendAndIndexWikiCallback.dispatch(entry.getKey(), null, null);
+            appendAndIndexWikiCallback.builder.append("=");
+            EntryTypeName entryTypeName = null;
+            IndexBuilder indexBuilder = null;
+            // This doesn't work: we'd need to add to word-forms.
 //      System.out.println(entry.getKey());
 //      if (entry.getKey().equals("tr")) {
 //        entryTypeName = EntryTypeName.WIKTIONARY_TRANSLITERATION;
 //        indexBuilder = appendAndIndexWikiCallback.parser.foreignIndexBuilder;
 //      }
-      appendAndIndexWikiCallback.dispatch(entry.getValue(), indexBuilder, entryTypeName);
+            appendAndIndexWikiCallback.dispatch(entry.getValue(), indexBuilder, entryTypeName);
+        }
     }
-  }
 
 }
index 91184461f9ceadad7e26e3c3ccbfa01606c2f540..871119139af165612aace24e21f337412105368e 100644 (file)
@@ -22,56 +22,56 @@ import java.util.List;
 import java.util.Map;
 
 class DeFunctionCallbacks {
-  
-  static <T extends AbstractWiktionaryParser> void addGenericCallbacks(Map<String, FunctionCallback<T>> callbacks) {
-      FunctionCallback<T> callback = new MakeHeadingFromName<T>("====");
-      callbacks.put("Aussprache", callback);
-      callbacks.put("Worttrennung", callback);
-      callbacks.put("Bedeutungen", callback);
-      callbacks.put("Herkunft", callback);
-      callbacks.put("Synonyme", callback);
-      callbacks.put("Gegenwörter", callback);
-      callbacks.put("Verkleinerungsformen", callback);
-      callbacks.put("Oberbegriffe", callback);
-      callbacks.put("Unterbegriffe", callback);
-      callbacks.put("Beispiele", callback);
-      callbacks.put("Redewendungen", callback);
-      callbacks.put("Charakteristische Wortkombinationen", callback);
-      callbacks.put("Abgeleitete Begriffe", callback);
-      callbacks.put("Übersetzungen", callback);
-      callbacks.put("Referenzen", callback);
-      callbacks.put("Grammatische Merkmale", callback);
-      callbacks.put("Abkürzungen", callback);
-      
-      // TODO:
-      // {{Anmerkung}}
-      // {{Anmerkungen}}
-      // {{Anmerkung|zum Gebrauch}}
-  }
 
-  
-  static final NameAndArgs<EnParser> NAME_AND_ARGS = new NameAndArgs<EnParser>();
+    static <T extends AbstractWiktionaryParser> void addGenericCallbacks(Map<String, FunctionCallback<T>> callbacks) {
+        FunctionCallback<T> callback = new MakeHeadingFromName<T>("====");
+        callbacks.put("Aussprache", callback);
+        callbacks.put("Worttrennung", callback);
+        callbacks.put("Bedeutungen", callback);
+        callbacks.put("Herkunft", callback);
+        callbacks.put("Synonyme", callback);
+        callbacks.put("Gegenwörter", callback);
+        callbacks.put("Verkleinerungsformen", callback);
+        callbacks.put("Oberbegriffe", callback);
+        callbacks.put("Unterbegriffe", callback);
+        callbacks.put("Beispiele", callback);
+        callbacks.put("Redewendungen", callback);
+        callbacks.put("Charakteristische Wortkombinationen", callback);
+        callbacks.put("Abgeleitete Begriffe", callback);
+        callbacks.put("Übersetzungen", callback);
+        callbacks.put("Referenzen", callback);
+        callbacks.put("Grammatische Merkmale", callback);
+        callbacks.put("Abkürzungen", callback);
 
-  
-  static final class MakeHeadingFromName<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
-    final String header;
-    public MakeHeadingFromName(String header) {
-        this.header = header;
+        // TODO:
+        // {{Anmerkung}}
+        // {{Anmerkungen}}
+        // {{Anmerkung|zum Gebrauch}}
     }
 
-    @Override
-      public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-          final Map<String, String> namedArgs,
-          final T parser,
-          final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
-        if (!namedArgs.isEmpty() || args.size() != 0) {
-            return false;
+
+    static final NameAndArgs<EnParser> NAME_AND_ARGS = new NameAndArgs<EnParser>();
+
+
+    static final class MakeHeadingFromName<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
+        final String header;
+        public MakeHeadingFromName(String header) {
+            this.header = header;
+        }
+
+        @Override
+        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
+                                      final Map<String, String> namedArgs,
+                                      final T parser,
+                                      final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
+            if (!namedArgs.isEmpty() || args.size() != 0) {
+                return false;
+            }
+            //appendAndIndexWikiCallback.builder.append(String.format("<%s>", header));
+            appendAndIndexWikiCallback.dispatch("\n" + header + name + header, null);
+            //appendAndIndexWikiCallback.builder.append(String.format("</%s>\n", header));
+            return true;
         }
-        //appendAndIndexWikiCallback.builder.append(String.format("<%s>", header));
-        appendAndIndexWikiCallback.dispatch("\n" + header + name + header, null);
-        //appendAndIndexWikiCallback.builder.append(String.format("</%s>\n", header));
-        return true;
-      }
     }
 
 
index 7dd933e38815ee1d595fb50777653f617120c705..670462f0fdb3b7e06e9fb08883b24c335f31d780 100644 (file)
@@ -30,312 +30,312 @@ import com.hughes.android.dictionary.parser.WikiTokenizer;
 public final class EnForeignParser extends EnParser {
 
     public EnForeignParser(final IndexBuilder enIndexBuilder,
-        final IndexBuilder otherIndexBuilder, final Pattern langPattern,
-        final Pattern langCodePattern, final boolean swap) {
-      super(enIndexBuilder, otherIndexBuilder, langPattern, langCodePattern, swap);
+                           final IndexBuilder otherIndexBuilder, final Pattern langPattern,
+                           final Pattern langCodePattern, final boolean swap) {
+        super(enIndexBuilder, otherIndexBuilder, langPattern, langCodePattern, swap);
     }
 
     @Override
     void parseSection(String heading, String text) {
-      if (isIgnorableTitle(title)) {
-        return;
-      }
-      final String lang = heading.replace("=", "").trim();
-      if (!langPattern.matcher(lang).find()){
-        return;
-      }
-      
-      final WikiTokenizer wikiTokenizer = new WikiTokenizer(text);
-      while (wikiTokenizer.nextToken() != null) {
-        if (wikiTokenizer.isHeading()) {
-          final String headingName = wikiTokenizer.headingWikiText();
-          if (headingName.equals("Translations")) {
-            LOG.warning("Translations not in English section: " + title);
-            incrementCount("WARNING: Translations not in English section");
-          } else if (headingName.equals("Pronunciation")) {
-            //doPronunciation(wikiLineReader);
-          } else if (headingName.startsWith(" {{S|")) {
-            // HACK to support parsing frwiktionary
-            String[] parts = headingName.split("\\|");
-            if (parts.length > 2 && langCodePattern.matcher(parts[2]).find() &&
-                (parts.length < 4 || !parts[3].startsWith("flexion"))) {
-                doForeignPartOfSpeech(lang, headingName, wikiTokenizer.headingDepth(), wikiTokenizer);
+        if (isIgnorableTitle(title)) {
+            return;
+        }
+        final String lang = heading.replace("=", "").trim();
+        if (!langPattern.matcher(lang).find()) {
+            return;
+        }
+
+        final WikiTokenizer wikiTokenizer = new WikiTokenizer(text);
+        while (wikiTokenizer.nextToken() != null) {
+            if (wikiTokenizer.isHeading()) {
+                final String headingName = wikiTokenizer.headingWikiText();
+                if (headingName.equals("Translations")) {
+                    LOG.warning("Translations not in English section: " + title);
+                    incrementCount("WARNING: Translations not in English section");
+                } else if (headingName.equals("Pronunciation")) {
+                    //doPronunciation(wikiLineReader);
+                } else if (headingName.startsWith(" {{S|")) {
+                    // HACK to support parsing frwiktionary
+                    String[] parts = headingName.split("\\|");
+                    if (parts.length > 2 && langCodePattern.matcher(parts[2]).find() &&
+                            (parts.length < 4 || !parts[3].startsWith("flexion"))) {
+                        doForeignPartOfSpeech(lang, headingName, wikiTokenizer.headingDepth(), wikiTokenizer);
+                    }
+                } else if (partOfSpeechHeader.matcher(headingName).matches()) {
+                    doForeignPartOfSpeech(lang, headingName, wikiTokenizer.headingDepth(), wikiTokenizer);
+                }
+            } else {
+                // It's not a heading.
+                // TODO: optimization: skip to next heading.
             }
-          } else if (partOfSpeechHeader.matcher(headingName).matches()) {
-            doForeignPartOfSpeech(lang, headingName, wikiTokenizer.headingDepth(), wikiTokenizer);
-          }
-        } else {
-          // It's not a heading.
-          // TODO: optimization: skip to next heading.
         }
-      }
     }
-    
+
     static final class ListSection {
-      final String firstPrefix;
-      final String firstLine;
-      final List<String> nextPrefixes = new ArrayList<String>();
-      final List<String> nextLines = new ArrayList<String>();
-      
-      public ListSection(String firstPrefix, String firstLine) {
-        this.firstPrefix = firstPrefix;
-        this.firstLine = firstLine;
-      }
-
-      @Override
-      public String toString() {
-        return firstPrefix + firstLine + "{ " + nextPrefixes + "}";
-      }
+        final String firstPrefix;
+        final String firstLine;
+        final List<String> nextPrefixes = new ArrayList<String>();
+        final List<String> nextLines = new ArrayList<String>();
+
+        public ListSection(String firstPrefix, String firstLine) {
+            this.firstPrefix = firstPrefix;
+            this.firstLine = firstLine;
+        }
+
+        @Override
+        public String toString() {
+            return firstPrefix + firstLine + "{ " + nextPrefixes + "}";
+        }
     }
 
     int foreignCount = 0;
     private void doForeignPartOfSpeech(final String lang, String posHeading, final int posDepth, WikiTokenizer wikiTokenizer) {
-      if (++foreignCount % 1000 == 0) {
-        LOG.info("***" + lang + ", " + title + ", pos=" + posHeading + ", foreignCount=" + foreignCount);
-      }
-      if (title.equals("6")) {
-        System.out.println();
-      }
-      
-      final StringBuilder foreignBuilder = new StringBuilder();
-      final List<EnForeignParser.ListSection> listSections = new ArrayList<EnForeignParser.ListSection>();
-      
-      appendAndIndexWikiCallback.reset(foreignBuilder, null);
-      this.state = State.ENGLISH_DEF_OF_FOREIGN;  // TODO: this is wrong, need new category....
-      titleAppended = false;
-      wordForms.clear();
-      
-      try {
-      
-      EnForeignParser.ListSection lastListSection = null;
-      
-      int currentHeadingDepth = posDepth;
-      while (wikiTokenizer.nextToken() != null) {
-        if (wikiTokenizer.isHeading()) {
-          currentHeadingDepth = wikiTokenizer.headingDepth();
-          
-          if (currentHeadingDepth <= posDepth) {
-            wikiTokenizer.returnToLineStart();
-            return;
-          }
-        }  // heading
-        
-        if (currentHeadingDepth > posDepth) {
-          // TODO: deal with other neat info sections inside POS
-          continue;
-        }
-        
-        if (wikiTokenizer.isFunction()) {
-          final String name = wikiTokenizer.functionName();
-          final List<String> args = wikiTokenizer.functionPositionArgs();
-          final Map<String,String> namedArgs = wikiTokenizer.functionNamedArgs();
-          // First line is generally a repeat of the title with some extra information.
-          // We need to build up the left side (foreign text, tokens) separately from the
-          // right side (English).  The left-side may get paired with multiple right sides.
-          // The left side should get filed under every form of the word in question (singular, plural).
-          
-          // For verbs, the conjugation comes later on in a deeper section.
-          // Ideally, we'd want to file every English entry with the verb
-          // under every verb form coming from the conjugation.
-          // Ie. under "fa": see: "make :: fare" and "do :: fare"
-          // But then where should we put the conjugation table?
-          // I think just under fare.  But then we need a way to link to the entry (actually the row, since entries doesn't show up!)
-          // for the conjugation table from "fa".
-          // Would like to be able to link to a lang#token.
-          
-          
-          String head = namedArgs.remove("head");
-          final String tr = namedArgs.remove("tr");
-          if (head == null && tr != null && !titleAppended) {
-            head = title;
-          }
-          if (head != null) {
-            final String form = appendAndIndexWikiCallback.dispatch(head, EntryTypeName.WIKTIONARY_TITLE_MULTI);
-            wordForms.add(form);
-            appendAndIndexWikiCallback.builder.append(" ");
-            titleAppended = true;
-          }
-          if (tr != null) {
-            appendAndIndexWikiCallback.builder.append(" (");
-            final String form = appendAndIndexWikiCallback.dispatch(tr, EntryTypeName.WIKTIONARY_TRANSLITERATION);
-            wordForms.add(form);
-            appendAndIndexWikiCallback.builder.append(") ");
-          }
-          
-          appendAndIndexWikiCallback.onFunction(wikiTokenizer, name, args, namedArgs);
-          
-        } else if (wikiTokenizer.isListItem()) {
-          final String prefix = wikiTokenizer.listItemPrefix();
-          if (lastListSection != null && 
-              prefix.startsWith(lastListSection.firstPrefix) && 
-              prefix.length() > lastListSection.firstPrefix.length()) {
-            lastListSection.nextPrefixes.add(prefix);
-            lastListSection.nextLines.add(wikiTokenizer.listItemWikiText());
-          } else {
-            lastListSection = new ListSection(prefix, wikiTokenizer.listItemWikiText());
-            listSections.add(lastListSection);
-          }
-        } else if (lastListSection != null) {
-          // Don't append anything after the lists, because there's crap.
-        } else if (wikiTokenizer.isWikiLink()) {
-          // Unindexed!
-          foreignBuilder.append(wikiTokenizer.wikiLinkText());
-          
-        } else if (wikiTokenizer.isPlainText()) {
-          // Unindexed!
-          foreignBuilder.append(wikiTokenizer.token());
-        } else if (wikiTokenizer.isHtml()) {
-            if (!wikiTokenizer.token().startsWith("<ref>")) {
-                foreignBuilder.append(wikiTokenizer.token());
-            }
-        } else if (wikiTokenizer.isMarkup() || 
-                wikiTokenizer.isNewline() || 
-                wikiTokenizer.isComment()) {
-          // Do nothing.
-        } else {
-          LOG.warning("Unexpected token: " + wikiTokenizer.token());
-          assert !wikiTokenizer.errors().isEmpty();
+        if (++foreignCount % 1000 == 0) {
+            LOG.info("***" + lang + ", " + title + ", pos=" + posHeading + ", foreignCount=" + foreignCount);
         }
-      }
-      
-      } finally {
-        // Here's where we exit.
-        // Should we make an entry even if there are no foreign list items?
-        String foreign = foreignBuilder.toString().trim();
-        if (!titleAppended && !foreign.toLowerCase().startsWith(title.toLowerCase())) {
-          foreign = String.format("%s %s", title, foreign);
+        if (title.equals("6")) {
+            System.out.println();
         }
-        if (!langPattern.matcher(lang).matches()) {
-          foreign = String.format("(%s) %s", lang, foreign);
-        }
-        for (final EnForeignParser.ListSection listSection : listSections) {
-          doForeignListSection(foreign, title, wordForms, listSection);
+
+        final StringBuilder foreignBuilder = new StringBuilder();
+        final List<EnForeignParser.ListSection> listSections = new ArrayList<EnForeignParser.ListSection>();
+
+        appendAndIndexWikiCallback.reset(foreignBuilder, null);
+        this.state = State.ENGLISH_DEF_OF_FOREIGN;  // TODO: this is wrong, need new category....
+        titleAppended = false;
+        wordForms.clear();
+
+        try {
+
+            EnForeignParser.ListSection lastListSection = null;
+
+            int currentHeadingDepth = posDepth;
+            while (wikiTokenizer.nextToken() != null) {
+                if (wikiTokenizer.isHeading()) {
+                    currentHeadingDepth = wikiTokenizer.headingDepth();
+
+                    if (currentHeadingDepth <= posDepth) {
+                        wikiTokenizer.returnToLineStart();
+                        return;
+                    }
+                }  // heading
+
+                if (currentHeadingDepth > posDepth) {
+                    // TODO: deal with other neat info sections inside POS
+                    continue;
+                }
+
+                if (wikiTokenizer.isFunction()) {
+                    final String name = wikiTokenizer.functionName();
+                    final List<String> args = wikiTokenizer.functionPositionArgs();
+                    final Map<String,String> namedArgs = wikiTokenizer.functionNamedArgs();
+                    // First line is generally a repeat of the title with some extra information.
+                    // We need to build up the left side (foreign text, tokens) separately from the
+                    // right side (English).  The left-side may get paired with multiple right sides.
+                    // The left side should get filed under every form of the word in question (singular, plural).
+
+                    // For verbs, the conjugation comes later on in a deeper section.
+                    // Ideally, we'd want to file every English entry with the verb
+                    // under every verb form coming from the conjugation.
+                    // Ie. under "fa": see: "make :: fare" and "do :: fare"
+                    // But then where should we put the conjugation table?
+                    // I think just under fare.  But then we need a way to link to the entry (actually the row, since entries doesn't show up!)
+                    // for the conjugation table from "fa".
+                    // Would like to be able to link to a lang#token.
+
+
+                    String head = namedArgs.remove("head");
+                    final String tr = namedArgs.remove("tr");
+                    if (head == null && tr != null && !titleAppended) {
+                        head = title;
+                    }
+                    if (head != null) {
+                        final String form = appendAndIndexWikiCallback.dispatch(head, EntryTypeName.WIKTIONARY_TITLE_MULTI);
+                        wordForms.add(form);
+                        appendAndIndexWikiCallback.builder.append(" ");
+                        titleAppended = true;
+                    }
+                    if (tr != null) {
+                        appendAndIndexWikiCallback.builder.append(" (");
+                        final String form = appendAndIndexWikiCallback.dispatch(tr, EntryTypeName.WIKTIONARY_TRANSLITERATION);
+                        wordForms.add(form);
+                        appendAndIndexWikiCallback.builder.append(") ");
+                    }
+
+                    appendAndIndexWikiCallback.onFunction(wikiTokenizer, name, args, namedArgs);
+
+                } else if (wikiTokenizer.isListItem()) {
+                    final String prefix = wikiTokenizer.listItemPrefix();
+                    if (lastListSection != null &&
+                            prefix.startsWith(lastListSection.firstPrefix) &&
+                            prefix.length() > lastListSection.firstPrefix.length()) {
+                        lastListSection.nextPrefixes.add(prefix);
+                        lastListSection.nextLines.add(wikiTokenizer.listItemWikiText());
+                    } else {
+                        lastListSection = new ListSection(prefix, wikiTokenizer.listItemWikiText());
+                        listSections.add(lastListSection);
+                    }
+                } else if (lastListSection != null) {
+                    // Don't append anything after the lists, because there's crap.
+                } else if (wikiTokenizer.isWikiLink()) {
+                    // Unindexed!
+                    foreignBuilder.append(wikiTokenizer.wikiLinkText());
+
+                } else if (wikiTokenizer.isPlainText()) {
+                    // Unindexed!
+                    foreignBuilder.append(wikiTokenizer.token());
+                } else if (wikiTokenizer.isHtml()) {
+                    if (!wikiTokenizer.token().startsWith("<ref>")) {
+                        foreignBuilder.append(wikiTokenizer.token());
+                    }
+                } else if (wikiTokenizer.isMarkup() ||
+                           wikiTokenizer.isNewline() ||
+                           wikiTokenizer.isComment()) {
+                    // Do nothing.
+                } else {
+                    LOG.warning("Unexpected token: " + wikiTokenizer.token());
+                    assert !wikiTokenizer.errors().isEmpty();
+                }
+            }
+
+        } finally {
+            // Here's where we exit.
+            // Should we make an entry even if there are no foreign list items?
+            String foreign = foreignBuilder.toString().trim();
+            if (!titleAppended && !foreign.toLowerCase().startsWith(title.toLowerCase())) {
+                foreign = String.format("%s %s", title, foreign);
+            }
+            if (!langPattern.matcher(lang).matches()) {
+                foreign = String.format("(%s) %s", lang, foreign);
+            }
+            for (final EnForeignParser.ListSection listSection : listSections) {
+                doForeignListSection(foreign, title, wordForms, listSection);
+            }
         }
-      }
     }
-    
+
     private void doForeignListSection(final String foreignText, String title, final Collection<String> forms, final EnForeignParser.ListSection listSection) {
-      state = State.ENGLISH_DEF_OF_FOREIGN;
-      final String prefix = listSection.firstPrefix;
-      if (prefix.length() > 1) {
-        // Could just get looser and say that any prefix longer than first is a sublist.
-        LOG.warning("Prefix '" + prefix + "' too long: " + listSection);
-        incrementCount("WARNING: Prefix too long");
-        return;
-      }
-      
-      final PairEntry pairEntry = new PairEntry(entrySource);
-      final IndexedEntry indexedEntry = new IndexedEntry(pairEntry);
-      indexedEntry.isValid = true;
-
-      entryIsFormOfSomething = false;
-      final StringBuilder englishBuilder = new StringBuilder();
-      final String mainLine = listSection.firstLine;
-      appendAndIndexWikiCallback.reset(englishBuilder, indexedEntry);
-      appendAndIndexWikiCallback.dispatch(mainLine, enIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF);
-
-      final String english = trim(englishBuilder.toString());
-      if (english.length() > 0) {
-        final Pair pair = new Pair(english, trim(foreignText), this.swap);
-        pairEntry.pairs.add(pair);
-        foreignIndexBuilder.addEntryWithString(indexedEntry, title, entryIsFormOfSomething ? EntryTypeName.WIKTIONARY_IS_FORM_OF_SOMETHING_ELSE : EntryTypeName.WIKTIONARY_TITLE_MULTI);
-        for (final String form : forms) {
-          foreignIndexBuilder.addEntryWithString(indexedEntry, form, EntryTypeName.WIKTIONARY_INFLECTED_FORM_MULTI);
-        }
-      }
-      
-      // Do examples.
-      String lastForeign = null;
-      for (int i = 0; i < listSection.nextPrefixes.size(); ++i) {
-        final String nextPrefix = listSection.nextPrefixes.get(i);
-        String nextLine = listSection.nextLines.get(i);
-
-        // TODO: This splitting is not sensitive to wiki code.
-        int dash = nextLine.indexOf("&mdash;");
-        int mdashLen = 7;
-        if (dash == -1) {
-          dash = nextLine.indexOf("—");
-          mdashLen = 1;
-        }
-        if (dash == -1) {
-          dash = nextLine.indexOf(" - ");
-          mdashLen = 3;
+        state = State.ENGLISH_DEF_OF_FOREIGN;
+        final String prefix = listSection.firstPrefix;
+        if (prefix.length() > 1) {
+            // Could just get looser and say that any prefix longer than first is a sublist.
+            LOG.warning("Prefix '" + prefix + "' too long: " + listSection);
+            incrementCount("WARNING: Prefix too long");
+            return;
         }
-        
-        if ((nextPrefix.equals("#:") || nextPrefix.equals("##:")) && dash != -1) {
-          final String foreignEx = nextLine.substring(0, dash);
-          final String englishEx = nextLine.substring(dash + mdashLen);
-          final Pair pair = new Pair(formatAndIndexExampleString(englishEx, enIndexBuilder, indexedEntry), formatAndIndexExampleString(foreignEx, foreignIndexBuilder, indexedEntry), swap);
-          if (pair.lang1 != "--" && pair.lang1 != "--") {
-            pairEntry.pairs.add(pair);
-          }
-          lastForeign = null;
-        // TODO: make #* and #*: work
-        } else if (nextPrefix.equals("#:") || nextPrefix.equals("##:")/* || nextPrefix.equals("#*")*/){
-          final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap);
-          lastForeign = nextLine;
-          if (pair.lang1 != "--" && pair.lang1 != "--") {
+
+        final PairEntry pairEntry = new PairEntry(entrySource);
+        final IndexedEntry indexedEntry = new IndexedEntry(pairEntry);
+        indexedEntry.isValid = true;
+
+        entryIsFormOfSomething = false;
+        final StringBuilder englishBuilder = new StringBuilder();
+        final String mainLine = listSection.firstLine;
+        appendAndIndexWikiCallback.reset(englishBuilder, indexedEntry);
+        appendAndIndexWikiCallback.dispatch(mainLine, enIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF);
+
+        final String english = trim(englishBuilder.toString());
+        if (english.length() > 0) {
+            final Pair pair = new Pair(english, trim(foreignText), this.swap);
             pairEntry.pairs.add(pair);
-          }
-        } else if (nextPrefix.equals("#::") || nextPrefix.equals("#**")/* || nextPrefix.equals("#*:")*/) {
-          if (lastForeign != null && pairEntry.pairs.size() > 0) {
-            if (i + 1 < listSection.nextPrefixes.size()) {
-              // Chinese has sometimes multiple foreign lines
-              final String nextNextPrefix = listSection.nextPrefixes.get(i + 1);
-              if (nextNextPrefix.equals("#::") || nextNextPrefix.equals("#**")) {
-                ++i;
-                nextLine += "\n" + listSection.nextLines.get(i);
-              }
+            foreignIndexBuilder.addEntryWithString(indexedEntry, title, entryIsFormOfSomething ? EntryTypeName.WIKTIONARY_IS_FORM_OF_SOMETHING_ELSE : EntryTypeName.WIKTIONARY_TITLE_MULTI);
+            for (final String form : forms) {
+                foreignIndexBuilder.addEntryWithString(indexedEntry, form, EntryTypeName.WIKTIONARY_INFLECTED_FORM_MULTI);
             }
-            pairEntry.pairs.remove(pairEntry.pairs.size() - 1);
-            final Pair pair = new Pair(formatAndIndexExampleString(nextLine, enIndexBuilder, indexedEntry), formatAndIndexExampleString(lastForeign, foreignIndexBuilder, indexedEntry), swap);
-            if (pair.lang1 != "--" || pair.lang2 != "--") {
-              pairEntry.pairs.add(pair);
+        }
+
+        // Do examples.
+        String lastForeign = null;
+        for (int i = 0; i < listSection.nextPrefixes.size(); ++i) {
+            final String nextPrefix = listSection.nextPrefixes.get(i);
+            String nextLine = listSection.nextLines.get(i);
+
+            // TODO: This splitting is not sensitive to wiki code.
+            int dash = nextLine.indexOf("&mdash;");
+            int mdashLen = 7;
+            if (dash == -1) {
+                dash = nextLine.indexOf("—");
+                mdashLen = 1;
             }
-            lastForeign = null;
-          } else {
-            LOG.warning("TODO: English example with no foreign: " + title + ", " + nextLine);
-            final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap);
-            if (pair.lang1 != "--" || pair.lang2 != "--") {
-              pairEntry.pairs.add(pair);
+            if (dash == -1) {
+                dash = nextLine.indexOf(" - ");
+                mdashLen = 3;
             }
-          }
-        } else if (nextPrefix.equals("#*")) {
-          // Can't really index these.
-          final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap);
-          lastForeign = nextLine;
-          if (pair.lang1 != "--" || pair.lang2 != "--") {
-            pairEntry.pairs.add(pair);
-          }
-        } else if (nextPrefix.equals("#::*") || nextPrefix.equals("##") || nextPrefix.equals("#*:") || nextPrefix.equals("#:*") || true) {
-          final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap);
-          if (pair.lang1 != "--" || pair.lang2 != "--") {
-            pairEntry.pairs.add(pair);
-          }
+
+            if ((nextPrefix.equals("#:") || nextPrefix.equals("##:")) && dash != -1) {
+                final String foreignEx = nextLine.substring(0, dash);
+                final String englishEx = nextLine.substring(dash + mdashLen);
+                final Pair pair = new Pair(formatAndIndexExampleString(englishEx, enIndexBuilder, indexedEntry), formatAndIndexExampleString(foreignEx, foreignIndexBuilder, indexedEntry), swap);
+                if (pair.lang1 != "--" && pair.lang2 != "--") {
+                    pairEntry.pairs.add(pair);
+                }
+                lastForeign = null;
+                // TODO: make #* and #*: work
+            } else if (nextPrefix.equals("#:") || nextPrefix.equals("##:")/* || nextPrefix.equals("#*")*/) {
+                final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap);
+                lastForeign = nextLine;
+                if (pair.lang1 != "--" && pair.lang2 != "--") {
+                    pairEntry.pairs.add(pair);
+                }
+            } else if (nextPrefix.equals("#::") || nextPrefix.equals("#**")/* || nextPrefix.equals("#*:")*/) {
+                if (lastForeign != null && pairEntry.pairs.size() > 0) {
+                    if (i + 1 < listSection.nextPrefixes.size()) {
+                        // Chinese has sometimes multiple foreign lines
+                        final String nextNextPrefix = listSection.nextPrefixes.get(i + 1);
+                        if (nextNextPrefix.equals("#::") || nextNextPrefix.equals("#**")) {
+                            ++i;
+                            nextLine += "\n" + listSection.nextLines.get(i);
+                        }
+                    }
+                    pairEntry.pairs.remove(pairEntry.pairs.size() - 1);
+                    final Pair pair = new Pair(formatAndIndexExampleString(nextLine, enIndexBuilder, indexedEntry), formatAndIndexExampleString(lastForeign, foreignIndexBuilder, indexedEntry), swap);
+                    if (pair.lang1 != "--" || pair.lang2 != "--") {
+                        pairEntry.pairs.add(pair);
+                    }
+                    lastForeign = null;
+                } else {
+                    LOG.warning("TODO: English example with no foreign: " + title + ", " + nextLine);
+                    final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap);
+                    if (pair.lang1 != "--" || pair.lang2 != "--") {
+                        pairEntry.pairs.add(pair);
+                    }
+                }
+            } else if (nextPrefix.equals("#*")) {
+                // Can't really index these.
+                final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap);
+                lastForeign = nextLine;
+                if (pair.lang1 != "--" || pair.lang2 != "--") {
+                    pairEntry.pairs.add(pair);
+                }
+            } else if (nextPrefix.equals("#::*") || nextPrefix.equals("##") || nextPrefix.equals("#*:") || nextPrefix.equals("#:*") || true) {
+                final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap);
+                if (pair.lang1 != "--" || pair.lang2 != "--") {
+                    pairEntry.pairs.add(pair);
+                }
 //        } else {
 //          assert false;
+            }
         }
-      }
     }
-    
+
     private String formatAndIndexExampleString(final String example, final IndexBuilder indexBuilder, final IndexedEntry indexedEntry) {
-      // TODO:
+        // TODO:
 //      if (wikiTokenizer.token().equals("'''")) {
 //        insideTripleQuotes = !insideTripleQuotes;
 //      }
-      final StringBuilder builder = new StringBuilder();
-      appendAndIndexWikiCallback.reset(builder, indexedEntry);
-      appendAndIndexWikiCallback.entryTypeName = EntryTypeName.WIKTIONARY_EXAMPLE;
-      appendAndIndexWikiCallback.entryTypeNameSticks = true;
-      try {
-        // TODO: this is a hack needed because we don't safely split on the dash.
-        appendAndIndexWikiCallback.dispatch(example, indexBuilder, EntryTypeName.WIKTIONARY_EXAMPLE);
-      } catch (AssertionError e) {
-        return "--";
-      }
-      final String result = trim(builder.toString());
-      return result.length() > 0 ? result : "--";
+        final StringBuilder builder = new StringBuilder();
+        appendAndIndexWikiCallback.reset(builder, indexedEntry);
+        appendAndIndexWikiCallback.entryTypeName = EntryTypeName.WIKTIONARY_EXAMPLE;
+        appendAndIndexWikiCallback.entryTypeNameSticks = true;
+        try {
+            // TODO: this is a hack needed because we don't safely split on the dash.
+            appendAndIndexWikiCallback.dispatch(example, indexBuilder, EntryTypeName.WIKTIONARY_EXAMPLE);
+        } catch (AssertionError e) {
+            return "--";
+        }
+        final String result = trim(builder.toString());
+        return result.length() > 0 ? result : "--";
     }
 
 
-  }  // ForeignParser
+}  // ForeignParser
index 5f7f85087db251128bc061f44e8628263745c0d1..5e37a0a572d4cc64727fa56902a84a3cd038e99c 100644 (file)
@@ -33,1138 +33,1143 @@ import java.util.Set;
 import java.util.concurrent.atomic.AtomicInteger;
 
 class EnFunctionCallbacks {
-  
-  static final Map<String,FunctionCallback<EnParser>> DEFAULT = new LinkedHashMap<String, FunctionCallback<EnParser>>();
-
-  static <T extends AbstractWiktionaryParser> void addGenericCallbacks(Map<String, FunctionCallback<T>> callbacks) {
-      FunctionCallback<T> callback = new Gender<T>();
-      callbacks.put("m", callback);
-      callbacks.put("f", callback);
-      callbacks.put("n", callback);
-      callbacks.put("p", callback);
-      callbacks.put("g", callback);
-      
-      callbacks.put("etyl", new etyl<T>());
-      callbacks.put("term", new term<T>());
-      
-      callback = new EncodingCallback<T>();
-      Set<String> encodings = new LinkedHashSet<String>(Arrays.asList(
-          "IPA", "IPAchar",  // Not really encodings, but it works.
-          "zh-ts", "zh-tsp",
-          "sd-Arab", "ku-Arab", "Arab", "unicode", "Laoo", "ur-Arab", "Thai", 
-          "fa-Arab", "Khmr", "Cyrl", "ug-Arab", "ko-inline", 
-          "Jpan", "Kore", "Hebr", "rfscript", "Beng", "Mong", "Knda", "Cyrs",
-          "yue-tsj", "Mlym", "Tfng", "Grek", "yue-yue-j"));
-      for (final String encoding : encodings) {
-          callbacks.put(encoding, callback);
-      }
-      
-      callback = new Ignore<T>();
-      callbacks.put("trreq", callback);
-      callbacks.put("t-image", callback);
-      callbacks.put("defn", callback);
-      callbacks.put("rfdef", callback);
-      callbacks.put("rfdate", callback);
-      callbacks.put("rfex", callback);
-      callbacks.put("rfquote", callback);
-      callbacks.put("attention", callback);
-      callbacks.put("zh-attention", callback);
-      callbacks.put("top2", callback);
-      callbacks.put("mid2", callback);
-      callbacks.put("top3", callback);
-      callbacks.put("mid3", callback);
-      callbacks.put("bottom", callback);
-      callbacks.put("rel-mid", callback);
-      callbacks.put("rel-mid3", callback);
-      callbacks.put("rel-mid4", callback);
-      callbacks.put("rel-bottom", callback);
-      callbacks.put("der-top", callback);
-      callbacks.put("der-mid", callback);
-      callbacks.put("der-mid3", callback);
-      callbacks.put("der-bottom", callback);
-      
-      callback = new AppendName<T>();
-      callbacks.put("...", callback);
-      
-      callbacks.put("qualifier", new QualifierCallback<T>());
-      callbacks.put("italbrac", new italbrac<T>());
-      callbacks.put("gloss", new gloss<T>());
-      callbacks.put("not used", new not_used<T>());
-      callbacks.put("wikipedia", new wikipedia<T>());
-      
-      final it_conj<T> it_conj_cb = new it_conj<T>();
-      callbacks.put("it-conj", it_conj_cb);
-      callbacks.put("it-conj-are", new it_conj_are<T>(it_conj_cb));
-      callbacks.put("it-conj-arsi", new it_conj_are<T>(it_conj_cb));
-      callbacks.put("it-conj-care", new it_conj_are<T>(it_conj_cb));
-      callbacks.put("it-conj-carsi", new it_conj_are<T>(it_conj_cb));
-      callbacks.put("it-conj-ciare", new it_conj_are<T>(it_conj_cb));
-      callbacks.put("it-conj-ciarsi", new it_conj_are<T>(it_conj_cb));
-      callbacks.put("it-conj-iare", new it_conj_are<T>(it_conj_cb));
-      callbacks.put("it-conj-iarsi", new it_conj_are<T>(it_conj_cb));
-      callbacks.put("it-conj-iare-b", new it_conj_are<T>(it_conj_cb));
-      callbacks.put("it-conj-iarsi-b", new it_conj_are<T>(it_conj_cb));
-      callbacks.put("it-conj-ire", new it_conj_ire<T>(it_conj_cb));
-      callbacks.put("it-conj-irsi", new it_conj_ire<T>(it_conj_cb));
-      callbacks.put("it-conj-ire-b", new it_conj_ire<T>(it_conj_cb));
-      callbacks.put("it-conj-irsi-b", new it_conj_ire<T>(it_conj_cb));
-      callbacks.put("it-conj-cire", new it_conj_ire<T>(it_conj_cb));
-      callbacks.put("it-conj-cirsi", new it_conj_ire<T>(it_conj_cb));
-      callbacks.put("it-conj-ire", new it_conj_ire<T>(it_conj_cb));
-      callbacks.put("it-conj-ere", new it_conj_ere<T>(it_conj_cb));
-      callbacks.put("it-conj-ersi", new it_conj_ere<T>(it_conj_cb));
-      callbacks.put("it-conj-urre", new it_conj_urre<T>(it_conj_cb));
-      callbacks.put("it-conj-ursi", new it_conj_urre<T>(it_conj_cb));
-      callbacks.put("it-conj-fare", new it_conj_fare<T>(it_conj_cb));
-
-      
-      //"{{it-conj-fare|putre|avere}}\n" + 
-
-      
-  }
-
-  static {
-    addGenericCallbacks(DEFAULT);
-      
-    FunctionCallback<EnParser> callback = new TranslationCallback<EnParser>();
-    DEFAULT.put("t", callback);
-    DEFAULT.put("t+", callback);
-    DEFAULT.put("t-", callback);
-    DEFAULT.put("tø", callback);
-    DEFAULT.put("apdx-t", callback);
-    
-    callback = new l_term();
-    DEFAULT.put("l", callback);
-    DEFAULT.put("term", callback);
-
-    //callback = new AppendArg0();
-
-    callback = new FormOf();
-    DEFAULT.put("form of", callback);
-    DEFAULT.put("conjugation of", callback);
-    DEFAULT.put("participle of", callback);
-    DEFAULT.put("present participle of", callback);
-    DEFAULT.put("past participle of", callback);
-    DEFAULT.put("feminine past participle of", callback);
-    DEFAULT.put("gerund of", callback);
-    DEFAULT.put("feminine of", callback);
-    DEFAULT.put("plural of", callback);
-    DEFAULT.put("feminine plural of", callback);
-    DEFAULT.put("inflected form of", callback);
-    DEFAULT.put("alternative form of", callback);
-    DEFAULT.put("dated form of", callback);
-    DEFAULT.put("apocopic form of", callback);
-    
-    callback = new InflOrHead();
-    DEFAULT.put("infl", callback);
-    DEFAULT.put("head", callback);
-  }
-  
-  static final NameAndArgs<EnParser> NAME_AND_ARGS = new NameAndArgs<EnParser>();
-
-  // ------------------------------------------------------------------
-
-  static final class TranslationCallback<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
-    @Override
-    public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-        final Map<String, String> namedArgs, final T parser,
-        final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
-
-      final String transliteration = namedArgs.remove("tr");
-      final String alt = namedArgs.remove("alt");
-      namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS);
-      if (args.size() < 2) {
-        if (!name.equals("ttbc")) {
-          EnParser.LOG.warning("{{t...}} with wrong args: title=" + parser.title + ", " + wikiTokenizer.token());
-        }
-        return false;
-      }
-      final String langCode = ListUtil.get(args, 0);
-      if (!appendAndIndexWikiCallback.langCodeToTCount.containsKey(langCode)) {
-        appendAndIndexWikiCallback.langCodeToTCount.put(langCode, new AtomicInteger());
-      }
-      appendAndIndexWikiCallback.langCodeToTCount.get(langCode).incrementAndGet();
-      final String word = ListUtil.get(args, 1);
-      appendAndIndexWikiCallback.dispatch(alt != null ? alt : word, EntryTypeName.WIKTIONARY_TITLE_MULTI);
-
-      // Genders...
-      if (args.size() > 2) {
-        appendAndIndexWikiCallback.builder.append(" {");
-        for (int i = 2; i < args.size(); ++i) {
-          if (i > 2) {
-            appendAndIndexWikiCallback.builder.append("|");
-          }
-          appendAndIndexWikiCallback.builder.append(args.get(i));
+
+    static final Map<String,FunctionCallback<EnParser>> DEFAULT = new LinkedHashMap<String, FunctionCallback<EnParser>>();
+
+    static <T extends AbstractWiktionaryParser> void addGenericCallbacks(Map<String, FunctionCallback<T>> callbacks) {
+        FunctionCallback<T> callback = new Gender<T>();
+        callbacks.put("m", callback);
+        callbacks.put("f", callback);
+        callbacks.put("n", callback);
+        callbacks.put("p", callback);
+        callbacks.put("g", callback);
+
+        callbacks.put("etyl", new etyl<T>());
+        callbacks.put("term", new term<T>());
+
+        callback = new EncodingCallback<T>();
+        Set<String> encodings = new LinkedHashSet<String>(Arrays.asList(
+                    "IPA", "IPAchar",  // Not really encodings, but it works.
+                    "zh-ts", "zh-tsp",
+                    "sd-Arab", "ku-Arab", "Arab", "unicode", "Laoo", "ur-Arab", "Thai",
+                    "fa-Arab", "Khmr", "Cyrl", "ug-Arab", "ko-inline",
+                    "Jpan", "Kore", "Hebr", "rfscript", "Beng", "Mong", "Knda", "Cyrs",
+                    "yue-tsj", "Mlym", "Tfng", "Grek", "yue-yue-j"));
+        for (final String encoding : encodings) {
+            callbacks.put(encoding, callback);
         }
-        appendAndIndexWikiCallback.builder.append("}");
-      }
-
-      if (transliteration != null) {
-        appendAndIndexWikiCallback.builder.append(" (");
-        appendAndIndexWikiCallback.dispatch(transliteration, EntryTypeName.WIKTIONARY_TRANSLITERATION);
-        appendAndIndexWikiCallback.builder.append(")");
-      }
-      
-      if (alt != null) {
-        // If alt wasn't null, we appended alt instead of the actual word
-        // we're filing under..
-        appendAndIndexWikiCallback.builder.append(" (");
-        appendAndIndexWikiCallback.dispatch(word, EntryTypeName.WIKTIONARY_TITLE_MULTI);
-        appendAndIndexWikiCallback.builder.append(")");
-      }
-
-      // Catch-all for anything else...
-      if (!namedArgs.isEmpty()) {
-        appendAndIndexWikiCallback.builder.append(" {");
-        EnParser.appendNamedArgs(namedArgs, appendAndIndexWikiCallback);
-        appendAndIndexWikiCallback.builder.append("}");
-      }
-      
-      return true;
+
+        callback = new Ignore<T>();
+        callbacks.put("trreq", callback);
+        callbacks.put("t-image", callback);
+        callbacks.put("defn", callback);
+        callbacks.put("rfdef", callback);
+        callbacks.put("rfdate", callback);
+        callbacks.put("rfex", callback);
+        callbacks.put("rfquote", callback);
+        callbacks.put("attention", callback);
+        callbacks.put("zh-attention", callback);
+        callbacks.put("top2", callback);
+        callbacks.put("mid2", callback);
+        callbacks.put("top3", callback);
+        callbacks.put("mid3", callback);
+        callbacks.put("bottom", callback);
+        callbacks.put("rel-mid", callback);
+        callbacks.put("rel-mid3", callback);
+        callbacks.put("rel-mid4", callback);
+        callbacks.put("rel-bottom", callback);
+        callbacks.put("der-top", callback);
+        callbacks.put("der-mid", callback);
+        callbacks.put("der-mid3", callback);
+        callbacks.put("der-bottom", callback);
+
+        callback = new AppendName<T>();
+        callbacks.put("...", callback);
+
+        callbacks.put("qualifier", new QualifierCallback<T>());
+        callbacks.put("italbrac", new italbrac<T>());
+        callbacks.put("gloss", new gloss<T>());
+        callbacks.put("not used", new not_used<T>());
+        callbacks.put("wikipedia", new wikipedia<T>());
+
+        final it_conj<T> it_conj_cb = new it_conj<T>();
+        callbacks.put("it-conj", it_conj_cb);
+        callbacks.put("it-conj-are", new it_conj_are<T>(it_conj_cb));
+        callbacks.put("it-conj-arsi", new it_conj_are<T>(it_conj_cb));
+        callbacks.put("it-conj-care", new it_conj_are<T>(it_conj_cb));
+        callbacks.put("it-conj-carsi", new it_conj_are<T>(it_conj_cb));
+        callbacks.put("it-conj-ciare", new it_conj_are<T>(it_conj_cb));
+        callbacks.put("it-conj-ciarsi", new it_conj_are<T>(it_conj_cb));
+        callbacks.put("it-conj-iare", new it_conj_are<T>(it_conj_cb));
+        callbacks.put("it-conj-iarsi", new it_conj_are<T>(it_conj_cb));
+        callbacks.put("it-conj-iare-b", new it_conj_are<T>(it_conj_cb));
+        callbacks.put("it-conj-iarsi-b", new it_conj_are<T>(it_conj_cb));
+        callbacks.put("it-conj-ire", new it_conj_ire<T>(it_conj_cb));
+        callbacks.put("it-conj-irsi", new it_conj_ire<T>(it_conj_cb));
+        callbacks.put("it-conj-ire-b", new it_conj_ire<T>(it_conj_cb));
+        callbacks.put("it-conj-irsi-b", new it_conj_ire<T>(it_conj_cb));
+        callbacks.put("it-conj-cire", new it_conj_ire<T>(it_conj_cb));
+        callbacks.put("it-conj-cirsi", new it_conj_ire<T>(it_conj_cb));
+        callbacks.put("it-conj-ire", new it_conj_ire<T>(it_conj_cb));
+        callbacks.put("it-conj-ere", new it_conj_ere<T>(it_conj_cb));
+        callbacks.put("it-conj-ersi", new it_conj_ere<T>(it_conj_cb));
+        callbacks.put("it-conj-urre", new it_conj_urre<T>(it_conj_cb));
+        callbacks.put("it-conj-ursi", new it_conj_urre<T>(it_conj_cb));
+        callbacks.put("it-conj-fare", new it_conj_fare<T>(it_conj_cb));
+
+
+        //"{{it-conj-fare|putre|avere}}\n" +
+
+
+    }
+
+    static {
+        addGenericCallbacks(DEFAULT);
+
+        FunctionCallback<EnParser> callback = new TranslationCallback<EnParser>();
+        DEFAULT.put("t", callback);
+        DEFAULT.put("t+", callback);
+        DEFAULT.put("t-", callback);
+        DEFAULT.put("tø", callback);
+        DEFAULT.put("apdx-t", callback);
+
+        callback = new l_term();
+        DEFAULT.put("l", callback);
+        DEFAULT.put("term", callback);
+
+        //callback = new AppendArg0();
+
+        callback = new FormOf();
+        DEFAULT.put("form of", callback);
+        DEFAULT.put("conjugation of", callback);
+        DEFAULT.put("participle of", callback);
+        DEFAULT.put("present participle of", callback);
+        DEFAULT.put("past participle of", callback);
+        DEFAULT.put("feminine past participle of", callback);
+        DEFAULT.put("gerund of", callback);
+        DEFAULT.put("feminine of", callback);
+        DEFAULT.put("plural of", callback);
+        DEFAULT.put("feminine plural of", callback);
+        DEFAULT.put("inflected form of", callback);
+        DEFAULT.put("alternative form of", callback);
+        DEFAULT.put("dated form of", callback);
+        DEFAULT.put("apocopic form of", callback);
+
+        callback = new InflOrHead();
+        DEFAULT.put("infl", callback);
+        DEFAULT.put("head", callback);
     }
-  }
-
-  // ------------------------------------------------------------------
-  
-  static final class QualifierCallback<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
-    @Override
-    public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-        final Map<String, String> namedArgs,
-        final T parser,
-        final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
-      namedArgs.remove("lang");
-      if (!namedArgs.isEmpty()) {
-        EnParser.LOG.warning("weird qualifier: " + wikiTokenizer.token());
-        return false;
-      }
-      appendAndIndexWikiCallback.builder.append("(");
-      for (int i = 0; i < args.size(); ++i) {
-          appendAndIndexWikiCallback.dispatch(args.get(i), null);
-          if (i > 0) {
-              appendAndIndexWikiCallback.builder.append(", ");
-          }
-      }
-      appendAndIndexWikiCallback.builder.append(")");
-      return true;
+
+    static final NameAndArgs<EnParser> NAME_AND_ARGS = new NameAndArgs<EnParser>();
+
+    // ------------------------------------------------------------------
+
+    static final class TranslationCallback<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
+        @Override
+        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
+                                      final Map<String, String> namedArgs, final T parser,
+                                      final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
+
+            final String transliteration = namedArgs.remove("tr");
+            final String alt = namedArgs.remove("alt");
+            namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS);
+            if (args.size() < 2) {
+                if (!name.equals("ttbc")) {
+                    EnParser.LOG.warning("{{t...}} with wrong args: title=" + parser.title + ", " + wikiTokenizer.token());
+                }
+                return false;
+            }
+            final String langCode = ListUtil.get(args, 0);
+            if (!appendAndIndexWikiCallback.langCodeToTCount.containsKey(langCode)) {
+                appendAndIndexWikiCallback.langCodeToTCount.put(langCode, new AtomicInteger());
+            }
+            appendAndIndexWikiCallback.langCodeToTCount.get(langCode).incrementAndGet();
+            final String word = ListUtil.get(args, 1);
+            appendAndIndexWikiCallback.dispatch(alt != null ? alt : word, EntryTypeName.WIKTIONARY_TITLE_MULTI);
+
+            // Genders...
+            if (args.size() > 2) {
+                appendAndIndexWikiCallback.builder.append(" {");
+                for (int i = 2; i < args.size(); ++i) {
+                    if (i > 2) {
+                        appendAndIndexWikiCallback.builder.append("|");
+                    }
+                    appendAndIndexWikiCallback.builder.append(args.get(i));
+                }
+                appendAndIndexWikiCallback.builder.append("}");
+            }
+
+            if (transliteration != null) {
+                appendAndIndexWikiCallback.builder.append(" (");
+                appendAndIndexWikiCallback.dispatch(transliteration, EntryTypeName.WIKTIONARY_TRANSLITERATION);
+                appendAndIndexWikiCallback.builder.append(")");
+            }
+
+            if (alt != null) {
+                // If alt wasn't null, we appended alt instead of the actual word
+                // we're filing under..
+                appendAndIndexWikiCallback.builder.append(" (");
+                appendAndIndexWikiCallback.dispatch(word, EntryTypeName.WIKTIONARY_TITLE_MULTI);
+                appendAndIndexWikiCallback.builder.append(")");
+            }
+
+            // Catch-all for anything else...
+            if (!namedArgs.isEmpty()) {
+                appendAndIndexWikiCallback.builder.append(" {");
+                EnParser.appendNamedArgs(namedArgs, appendAndIndexWikiCallback);
+                appendAndIndexWikiCallback.builder.append("}");
+            }
+
+            return true;
+        }
     }
-  }
-
-  // ------------------------------------------------------------------
-  
-  static final class EncodingCallback<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
-    @Override
-    public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-        final Map<String, String> namedArgs,
-        final T parser,
-        final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
-      namedArgs.remove("lang");
-      if (!namedArgs.isEmpty()) {
-        EnParser.LOG.warning("weird encoding: " + wikiTokenizer.token());
-        return false;
-      }
-      if (args.size() == 0) {
-        // Things like "{{Jpan}}" exist.
-        return true;
-      }
-      
-      if (name.equals("IPA")) {
-          appendAndIndexWikiCallback.dispatch("IPA: ", null);
-      }
-      
-      for (int i = 0; i < args.size(); ++i) {
-        if (i > 0) {
-          appendAndIndexWikiCallback.builder.append(", ");
+
+    // ------------------------------------------------------------------
+
+    static final class QualifierCallback<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
+        @Override
+        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
+                                      final Map<String, String> namedArgs,
+                                      final T parser,
+                                      final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
+            namedArgs.remove("lang");
+            if (!namedArgs.isEmpty()) {
+                EnParser.LOG.warning("weird qualifier: " + wikiTokenizer.token());
+                return false;
+            }
+            appendAndIndexWikiCallback.builder.append("(");
+            for (int i = 0; i < args.size(); ++i) {
+                appendAndIndexWikiCallback.dispatch(args.get(i), null);
+                if (i > 0) {
+                    appendAndIndexWikiCallback.builder.append(", ");
+                }
+            }
+            appendAndIndexWikiCallback.builder.append(")");
+            return true;
         }
-        final String arg = args.get(i);
+    }
+
+    // ------------------------------------------------------------------
+
+    static final class EncodingCallback<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
+        @Override
+        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
+                                      final Map<String, String> namedArgs,
+                                      final T parser,
+                                      final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
+            namedArgs.remove("lang");
+            if (!namedArgs.isEmpty()) {
+                EnParser.LOG.warning("weird encoding: " + wikiTokenizer.token());
+                return false;
+            }
+            if (args.size() == 0) {
+                // Things like "{{Jpan}}" exist.
+                return true;
+            }
+
+            if (name.equals("IPA")) {
+                appendAndIndexWikiCallback.dispatch("IPA: ", null);
+            }
+
+            for (int i = 0; i < args.size(); ++i) {
+                if (i > 0) {
+                    appendAndIndexWikiCallback.builder.append(", ");
+                }
+                final String arg = args.get(i);
 //        if (arg.equals(parser.title)) {
 //          parser.titleAppended = true;
 //        }
-        appendAndIndexWikiCallback.dispatch(arg, appendAndIndexWikiCallback.entryTypeName);
-      }
-      
-      return true;
-    }
-  }
-
-  // ------------------------------------------------------------------
-  
-  static final class Gender<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
-    @Override
-    public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-        final Map<String, String> namedArgs,
-        final T parser,
-        final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
-      if (!namedArgs.isEmpty()) {
-        return false;
-      }
-      appendAndIndexWikiCallback.builder.append("{");
-      appendAndIndexWikiCallback.builder.append(name);
-      for (int i = 0; i < args.size(); ++i) {
-        appendAndIndexWikiCallback.builder.append("|").append(args.get(i));
-      }
-      appendAndIndexWikiCallback.builder.append("}");
-      return true;
-    }
-  }
-
-  // ------------------------------------------------------------------
-  
-  static final class l_term implements FunctionCallback<EnParser> {
-    @Override
-    public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-        final Map<String, String> namedArgs,
-        final EnParser parser,
-        final AppendAndIndexWikiCallback<EnParser> appendAndIndexWikiCallback) {
-      
-      // for {{l}}, lang is arg 0, but not for {{term}}
-      if (name.equals("term")) {
-        args.add(0, "");
-      }
-      
-      final EntryTypeName entryTypeName;
-      switch (parser.state) {
-      case TRANSLATION_LINE: entryTypeName = EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT; break;
-      case ENGLISH_DEF_OF_FOREIGN: entryTypeName = EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK; break;
-      default: throw new IllegalStateException("Invalid enum value: " + parser.state);
-      }
-      
-      final String langCode = args.get(0);
-      final IndexBuilder indexBuilder;
-      if ("".equals(langCode)) {
-        indexBuilder = parser.foreignIndexBuilder;
-      } else if ("en".equals(langCode)) {
-        indexBuilder = parser.enIndexBuilder;
-      } else {
-        indexBuilder = parser.foreignIndexBuilder;
-      }
-      
-      String displayText = ListUtil.get(args, 2, "");
-      if (displayText.equals("")) {
-        displayText = ListUtil.get(args, 1, null);
-      }
-      
-      if (displayText != null) {
-        appendAndIndexWikiCallback.dispatch(displayText, indexBuilder, entryTypeName);
-      } else {
-        EnParser.LOG.warning("no display text: " + wikiTokenizer.token());
-      }
-      
-      final String tr = namedArgs.remove("tr");
-      if (tr != null) {
-        appendAndIndexWikiCallback.builder.append(" (");
-        appendAndIndexWikiCallback.dispatch(tr, indexBuilder, EntryTypeName.WIKTIONARY_TRANSLITERATION);
-        appendAndIndexWikiCallback.builder.append(")");
-      }
-      
-      final String gloss = ListUtil.get(args, 3, "");
-      if (!gloss.equals("")) {
-        appendAndIndexWikiCallback.builder.append(" (");
-        appendAndIndexWikiCallback.dispatch(gloss, parser.enIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF);
-        appendAndIndexWikiCallback.builder.append(")");
-      }
-      
-      namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS);
-      if (!namedArgs.isEmpty()) {
-        appendAndIndexWikiCallback.builder.append(" {").append(name);
-        EnParser.appendNamedArgs(namedArgs, appendAndIndexWikiCallback);
-        appendAndIndexWikiCallback.builder.append("}");
-      }
-
-      return true;
-    }
-  }
-
-  // ------------------------------------------------------------------
-  
-  static final class AppendArg0<T extends AbstractWiktionaryParser> implements FunctionCallback<EnParser> {
-    @Override
-    public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-        final Map<String, String> namedArgs,
-        final EnParser parser,
-        final AppendAndIndexWikiCallback<EnParser> appendAndIndexWikiCallback) {
-      if (args.size() != 1 || !namedArgs.isEmpty()) {
-        return false;
-      }
-      appendAndIndexWikiCallback.dispatch(args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
-
-      final String tr = namedArgs.remove("tr");
-      if (tr != null) {
-        appendAndIndexWikiCallback.builder.append(" (");
-        appendAndIndexWikiCallback.dispatch(tr, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
-        appendAndIndexWikiCallback.builder.append(")");
-        parser.wordForms.add(tr);
-      }
-
-      return true;
-    }
-  }
-
-  // ------------------------------------------------------------------
-  
-  static final class italbrac<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
-    @Override
-    public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-        final Map<String, String> namedArgs,
-        final T parser,
-        final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
-      if (args.size() != 1 || !namedArgs.isEmpty()) {
-        return false;
-      }
-      appendAndIndexWikiCallback.builder.append("(");
-      appendAndIndexWikiCallback.dispatch(args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
-      appendAndIndexWikiCallback.builder.append(")");
-      return true;
-    }
-  }
-
-  // ------------------------------------------------------------------
-  
-  static final class gloss<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
-    @Override
-    public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-        final Map<String, String> namedArgs,
-        final T parser,
-        final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
-      if (args.size() != 1 || !namedArgs.isEmpty()) {
-        return false;
-      }
-      appendAndIndexWikiCallback.builder.append("(");
-      appendAndIndexWikiCallback.dispatch(args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
-      appendAndIndexWikiCallback.builder.append(")");
-      return true;
-    }
-  }
-  
-  // ------------------------------------------------------------------
-  
-  static final class Ignore<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
-    @Override
-    public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-        final Map<String, String> namedArgs,
-        final T parser,
-        final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
-      return true;
-    }
-  }
-
-  // ------------------------------------------------------------------
-  
-  static final class not_used<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
-    @Override
-    public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-        final Map<String, String> namedArgs,
-        final T parser,
-        final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
-      appendAndIndexWikiCallback.builder.append("(not used)");
-      return true;
+                appendAndIndexWikiCallback.dispatch(arg, appendAndIndexWikiCallback.entryTypeName);
+            }
+
+            return true;
+        }
     }
-  }
-
-
-  // ------------------------------------------------------------------
-  
-  static final class AppendName<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
-    @Override
-    public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-        final Map<String, String> namedArgs,
-        final T parser,
-        final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
-      if (!args.isEmpty() || !namedArgs.isEmpty()) {
-        return false;
-      }
-      appendAndIndexWikiCallback.builder.append(name);
-      return true;
+
+    // ------------------------------------------------------------------
+
+    static final class Gender<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
+        @Override
+        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
+                                      final Map<String, String> namedArgs,
+                                      final T parser,
+                                      final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
+            if (!namedArgs.isEmpty()) {
+                return false;
+            }
+            appendAndIndexWikiCallback.builder.append("{");
+            appendAndIndexWikiCallback.builder.append(name);
+            for (int i = 0; i < args.size(); ++i) {
+                appendAndIndexWikiCallback.builder.append("|").append(args.get(i));
+            }
+            appendAndIndexWikiCallback.builder.append("}");
+            return true;
+        }
     }
-  }
-
-  // --------------------------------------------------------------------
-  // --------------------------------------------------------------------
-  
-
-  static final class FormOf implements FunctionCallback<EnParser> {
-    @Override
-    public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-        final Map<String, String> namedArgs,
-        final EnParser parser,
-        final AppendAndIndexWikiCallback<EnParser> appendAndIndexWikiCallback) {
-      parser.entryIsFormOfSomething = true;
-      String formName = name;
-      if (name.equals("form of")) {
-        formName = ListUtil.remove(args, 0, null);
-      }
-      if (formName == null) {
-        EnParser.LOG.warning("Missing form name: " + parser.title);
-        formName = "form of";
-      }
-      String baseForm = ListUtil.get(args, 1, "");
-      if ("".equals(baseForm)) {
-        baseForm = ListUtil.get(args, 0, null);
-        ListUtil.remove(args, 1, "");
-      } else {
-        ListUtil.remove(args, 0, null);
-      }
-      namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS);
-      
-      appendAndIndexWikiCallback.builder.append("{");
-      NAME_AND_ARGS.onWikiFunction(wikiTokenizer, formName, args, namedArgs, parser, appendAndIndexWikiCallback);
-      appendAndIndexWikiCallback.builder.append("}");
-      if (baseForm != null && appendAndIndexWikiCallback.indexedEntry != null) {
-        parser.foreignIndexBuilder.addEntryWithString(appendAndIndexWikiCallback.indexedEntry, baseForm, EntryTypeName.WIKTIONARY_BASE_FORM_MULTI);
-      } else {
-        // null baseForm happens in Danish.
-        EnParser.LOG.warning("Null baseform: " + parser.title);
-      }
-      return true;
+
+    // ------------------------------------------------------------------
+
+    static final class l_term implements FunctionCallback<EnParser> {
+        @Override
+        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
+                                      final Map<String, String> namedArgs,
+                                      final EnParser parser,
+                                      final AppendAndIndexWikiCallback<EnParser> appendAndIndexWikiCallback) {
+
+            // for {{l}}, lang is arg 0, but not for {{term}}
+            if (name.equals("term")) {
+                args.add(0, "");
+            }
+
+            final EntryTypeName entryTypeName;
+            switch (parser.state) {
+            case TRANSLATION_LINE:
+                entryTypeName = EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT;
+                break;
+            case ENGLISH_DEF_OF_FOREIGN:
+                entryTypeName = EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK;
+                break;
+            default:
+                throw new IllegalStateException("Invalid enum value: " + parser.state);
+            }
+
+            final String langCode = args.get(0);
+            final IndexBuilder indexBuilder;
+            if ("".equals(langCode)) {
+                indexBuilder = parser.foreignIndexBuilder;
+            } else if ("en".equals(langCode)) {
+                indexBuilder = parser.enIndexBuilder;
+            } else {
+                indexBuilder = parser.foreignIndexBuilder;
+            }
+
+            String displayText = ListUtil.get(args, 2, "");
+            if (displayText.equals("")) {
+                displayText = ListUtil.get(args, 1, null);
+            }
+
+            if (displayText != null) {
+                appendAndIndexWikiCallback.dispatch(displayText, indexBuilder, entryTypeName);
+            } else {
+                EnParser.LOG.warning("no display text: " + wikiTokenizer.token());
+            }
+
+            final String tr = namedArgs.remove("tr");
+            if (tr != null) {
+                appendAndIndexWikiCallback.builder.append(" (");
+                appendAndIndexWikiCallback.dispatch(tr, indexBuilder, EntryTypeName.WIKTIONARY_TRANSLITERATION);
+                appendAndIndexWikiCallback.builder.append(")");
+            }
+
+            final String gloss = ListUtil.get(args, 3, "");
+            if (!gloss.equals("")) {
+                appendAndIndexWikiCallback.builder.append(" (");
+                appendAndIndexWikiCallback.dispatch(gloss, parser.enIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF);
+                appendAndIndexWikiCallback.builder.append(")");
+            }
+
+            namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS);
+            if (!namedArgs.isEmpty()) {
+                appendAndIndexWikiCallback.builder.append(" {").append(name);
+                EnParser.appendNamedArgs(namedArgs, appendAndIndexWikiCallback);
+                appendAndIndexWikiCallback.builder.append("}");
+            }
+
+            return true;
+        }
     }
-  }
-  
-  static final EnFunctionCallbacks.FormOf FORM_OF = new FormOf();
-  
-
-  // --------------------------------------------------------------------
-  // --------------------------------------------------------------------
-  
-  static final class wikipedia<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
-    @Override
-    public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-        final Map<String, String> namedArgs,
-        final T parser,
-        final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
-      namedArgs.remove("lang");
-      if (args.size() > 1 || !namedArgs.isEmpty()) {
-        // Unindexed!
-        return false;
-      } else if (args.size() == 1) {
-        return false;
-      } else {
-        return true;
-      }
+
+    // ------------------------------------------------------------------
+
+    static final class AppendArg0<T extends AbstractWiktionaryParser> implements FunctionCallback<EnParser> {
+        @Override
+        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
+                                      final Map<String, String> namedArgs,
+                                      final EnParser parser,
+                                      final AppendAndIndexWikiCallback<EnParser> appendAndIndexWikiCallback) {
+            if (args.size() != 1 || !namedArgs.isEmpty()) {
+                return false;
+            }
+            appendAndIndexWikiCallback.dispatch(args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
+
+            final String tr = namedArgs.remove("tr");
+            if (tr != null) {
+                appendAndIndexWikiCallback.builder.append(" (");
+                appendAndIndexWikiCallback.dispatch(tr, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
+                appendAndIndexWikiCallback.builder.append(")");
+                parser.wordForms.add(tr);
+            }
+
+            return true;
+        }
     }
-  }
-
-  static final class InflOrHead implements FunctionCallback<EnParser> {
-    @Override
-    public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-        final Map<String, String> namedArgs,
-        final EnParser parser,
-        final AppendAndIndexWikiCallback<EnParser> appendAndIndexWikiCallback) {
-      // See: http://en.wiktionary.org/wiki/Template:infl
-      // TODO: Actually these functions should start a new WordPOS:
-      // See: http://en.wiktionary.org/wiki/quattro
-      final String langCode = ListUtil.get(args, 0);
-      String head = namedArgs.remove("head");
-      if (head == null) {
-        head = namedArgs.remove("title"); // Bug
-      }
-      if (head == null) {
-        head = parser.title;
-      }
-      
-      namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS);
-
-      final String tr = namedArgs.remove("tr");
-      String g = namedArgs.remove("g");
-      if (g == null) {
-        g = namedArgs.remove("gender");
-      }
-      final String g2 = namedArgs.remove("g2");
-      final String g3 = namedArgs.remove("g3");
-
-      // We might have already taken care of this in a generic way...
-      if (!parser.titleAppended) {
-        appendAndIndexWikiCallback.dispatch(head, EntryTypeName.WIKTIONARY_TITLE_MULTI);
-        parser.titleAppended = true;
-      }
-
-      if (g != null) {
-        appendAndIndexWikiCallback.builder.append(" {").append(g);
-        if (g2 != null) {
-          appendAndIndexWikiCallback.builder.append("|").append(g2);
+
+    // ------------------------------------------------------------------
+
+    static final class italbrac<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
+        @Override
+        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
+                                      final Map<String, String> namedArgs,
+                                      final T parser,
+                                      final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
+            if (args.size() != 1 || !namedArgs.isEmpty()) {
+                return false;
+            }
+            appendAndIndexWikiCallback.builder.append("(");
+            appendAndIndexWikiCallback.dispatch(args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
+            appendAndIndexWikiCallback.builder.append(")");
+            return true;
         }
-        if (g3 != null) {
-          appendAndIndexWikiCallback.builder.append("|").append(g3);
+    }
+
+    // ------------------------------------------------------------------
+
+    static final class gloss<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
+        @Override
+        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
+                                      final Map<String, String> namedArgs,
+                                      final T parser,
+                                      final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
+            if (args.size() != 1 || !namedArgs.isEmpty()) {
+                return false;
+            }
+            appendAndIndexWikiCallback.builder.append("(");
+            appendAndIndexWikiCallback.dispatch(args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
+            appendAndIndexWikiCallback.builder.append(")");
+            return true;
         }
-        appendAndIndexWikiCallback.builder.append("}");
-      }
-
-      if (tr != null) {
-        appendAndIndexWikiCallback.builder.append(" (");
-        appendAndIndexWikiCallback.dispatch(tr, EntryTypeName.WIKTIONARY_TITLE_MULTI);
-        appendAndIndexWikiCallback.builder.append(")");
-        parser.wordForms.add(tr);
-      }
-
-      final String pos = ListUtil.get(args, 1);
-      if (pos != null) {
-        appendAndIndexWikiCallback.builder.append(" (").append(pos).append(")");
-      }
-      for (int i = 2; i < args.size(); i += 2) {
-        final String inflName = ListUtil.get(args, i);
-        final String inflValue = ListUtil.get(args, i + 1);
-        appendAndIndexWikiCallback.builder.append(", ");
-        appendAndIndexWikiCallback.dispatch(inflName, null, null);
-        if (inflValue != null && inflValue.length() > 0) {
-          appendAndIndexWikiCallback.builder.append(": ");
-          appendAndIndexWikiCallback.dispatch(inflValue, null, null);
-          parser.wordForms.add(inflValue);
+    }
+
+    // ------------------------------------------------------------------
+
+    static final class Ignore<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
+        @Override
+        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
+                                      final Map<String, String> namedArgs,
+                                      final T parser,
+                                      final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
+            return true;
         }
-      }
-      for (final String key : namedArgs.keySet()) {
-        final String value = WikiTokenizer.toPlainText(namedArgs.get(key));
-        appendAndIndexWikiCallback.builder.append(" ");
-        appendAndIndexWikiCallback.dispatch(key, null, null);
-        appendAndIndexWikiCallback.builder.append("=");
-        appendAndIndexWikiCallback.dispatch(value, null, null);
-        parser.wordForms.add(value);
-      }
-      return true;
     }
-  }
-  
-  static final class etyl<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
-      @Override
-      public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-          final Map<String, String> namedArgs,
-          final T parser,
-          final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
-        final String langCode = ListUtil.get(args, 0);
-        if (langCode == null) {
-            return false;
+
+    // ------------------------------------------------------------------
+
+    static final class not_used<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
+        @Override
+        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
+                                      final Map<String, String> namedArgs,
+                                      final T parser,
+                                      final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
+            appendAndIndexWikiCallback.builder.append("(not used)");
+            return true;
         }
-        String langName = WiktionaryLangs.getEnglishName(langCode);
-        if (langName != null) {
-            appendAndIndexWikiCallback.dispatch(langName, null);
-        } else {
-            appendAndIndexWikiCallback.dispatch("lang:" + langCode, null);
+    }
+
+
+    // ------------------------------------------------------------------
+
+    static final class AppendName<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
+        @Override
+        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
+                                      final Map<String, String> namedArgs,
+                                      final T parser,
+                                      final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
+            if (!args.isEmpty() || !namedArgs.isEmpty()) {
+                return false;
+            }
+            appendAndIndexWikiCallback.builder.append(name);
+            return true;
         }
-        return true;
-      }
-  }
-
-  static final class term<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
-      @Override
-      public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-          final Map<String, String> namedArgs,
-          final T parser,
-          final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
-        namedArgs.remove("sc");
-        
-        // Main text.
-        final String lang = namedArgs.remove("lang");
-        String head = ListUtil.get(args, 0);
-        String display = ListUtil.get(args, 1);
-        if (StringUtil.isNullOrEmpty(head) && StringUtil.isNullOrEmpty(display)) {
-            head = display = parser.title;
+    }
+
+    // --------------------------------------------------------------------
+    // --------------------------------------------------------------------
+
+
    /**
     * Handles "form of" templates ({{form of}}, {{plural of}}, ...): renders
     * "{formName|...}" via NAME_AND_ARGS and cross-indexes the entry under its
     * base form so a lookup of the inflected word can reach the lemma.
     */
    static final class FormOf implements FunctionCallback<EnParser> {
        @Override
        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
                                      final Map<String, String> namedArgs,
                                      final EnParser parser,
                                      final AppendAndIndexWikiCallback<EnParser> appendAndIndexWikiCallback) {
            parser.entryIsFormOfSomething = true;
            String formName = name;
            if (name.equals("form of")) {
                // Generic {{form of|<description>|...}}: arg 0 names the form.
                formName = ListUtil.remove(args, 0, null);
            }
            if (formName == null) {
                EnParser.LOG.warning("Missing form name: " + parser.title);
                formName = "form of";
            }
            // Base form is usually arg 1, falling back to arg 0. The used slot
            // is removed — presumably so NAME_AND_ARGS below does not re-render
            // it (TODO: confirm ListUtil.remove's out-of-range behavior).
            String baseForm = ListUtil.get(args, 1, "");
            if ("".equals(baseForm)) {
                baseForm = ListUtil.get(args, 0, null);
                ListUtil.remove(args, 1, "");
            } else {
                ListUtil.remove(args, 0, null);
            }
            namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS);

            appendAndIndexWikiCallback.builder.append("{");
            NAME_AND_ARGS.onWikiFunction(wikiTokenizer, formName, args, namedArgs, parser, appendAndIndexWikiCallback);
            appendAndIndexWikiCallback.builder.append("}");
            // Index the lemma so the inflected entry is findable from it.
            if (baseForm != null && appendAndIndexWikiCallback.indexedEntry != null) {
                parser.foreignIndexBuilder.addEntryWithString(appendAndIndexWikiCallback.indexedEntry, baseForm, EntryTypeName.WIKTIONARY_BASE_FORM_MULTI);
            } else {
                // null baseForm happens in Danish.
                EnParser.LOG.warning("Null baseform: " + parser.title);
            }
            return true;
        }
    }

    // Shared instance registered for the various form-of template names.
    static final EnFunctionCallbacks.FormOf FORM_OF = new FormOf();
+
+
+    // --------------------------------------------------------------------
+    // --------------------------------------------------------------------
+
+    static final class wikipedia<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
+        @Override
+        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
+                                      final Map<String, String> namedArgs,
+                                      final T parser,
+                                      final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
+            namedArgs.remove("lang");
+            if (args.size() > 1 || !namedArgs.isEmpty()) {
+                // Unindexed!
+                return false;
+            } else if (args.size() == 1) {
+                return false;
+            } else {
+                return true;
+            }
         }
-        
-        // Stuff in ()s.
-        final String tr = namedArgs.remove("tr");
-        final String pos = namedArgs.remove("pos");
-        String gloss = ListUtil.get(args, 2);
-        String literally = namedArgs.remove("lit");
-        if (!StringUtil.isNullOrEmpty(gloss)) {
-            gloss = String.format("\"%s\"", gloss);
+    }
+
+    static final class InflOrHead implements FunctionCallback<EnParser> {
+        @Override
+        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
+                                      final Map<String, String> namedArgs,
+                                      final EnParser parser,
+                                      final AppendAndIndexWikiCallback<EnParser> appendAndIndexWikiCallback) {
+            // See: http://en.wiktionary.org/wiki/Template:infl
+            // TODO: Actually these functions should start a new WordPOS:
+            // See: http://en.wiktionary.org/wiki/quattro
+            final String langCode = ListUtil.get(args, 0);
+            String head = namedArgs.remove("head");
+            if (head == null) {
+                head = namedArgs.remove("title"); // Bug
+            }
+            if (head == null) {
+                head = parser.title;
+            }
+
+            namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS);
+
+            final String tr = namedArgs.remove("tr");
+            String g = namedArgs.remove("g");
+            if (g == null) {
+                g = namedArgs.remove("gender");
+            }
+            final String g2 = namedArgs.remove("g2");
+            final String g3 = namedArgs.remove("g3");
+
+            // We might have already taken care of this in a generic way...
+            if (!parser.titleAppended) {
+                appendAndIndexWikiCallback.dispatch(head, EntryTypeName.WIKTIONARY_TITLE_MULTI);
+                parser.titleAppended = true;
+            }
+
+            if (g != null) {
+                appendAndIndexWikiCallback.builder.append(" {").append(g);
+                if (g2 != null) {
+                    appendAndIndexWikiCallback.builder.append("|").append(g2);
+                }
+                if (g3 != null) {
+                    appendAndIndexWikiCallback.builder.append("|").append(g3);
+                }
+                appendAndIndexWikiCallback.builder.append("}");
+            }
+
+            if (tr != null) {
+                appendAndIndexWikiCallback.builder.append(" (");
+                appendAndIndexWikiCallback.dispatch(tr, EntryTypeName.WIKTIONARY_TITLE_MULTI);
+                appendAndIndexWikiCallback.builder.append(")");
+                parser.wordForms.add(tr);
+            }
+
+            final String pos = ListUtil.get(args, 1);
+            if (pos != null) {
+                appendAndIndexWikiCallback.builder.append(" (").append(pos).append(")");
+            }
+            for (int i = 2; i < args.size(); i += 2) {
+                final String inflName = ListUtil.get(args, i);
+                final String inflValue = ListUtil.get(args, i + 1);
+                appendAndIndexWikiCallback.builder.append(", ");
+                appendAndIndexWikiCallback.dispatch(inflName, null, null);
+                if (inflValue != null && inflValue.length() > 0) {
+                    appendAndIndexWikiCallback.builder.append(": ");
+                    appendAndIndexWikiCallback.dispatch(inflValue, null, null);
+                    parser.wordForms.add(inflValue);
+                }
+            }
+            for (final String key : namedArgs.keySet()) {
+                final String value = WikiTokenizer.toPlainText(namedArgs.get(key));
+                appendAndIndexWikiCallback.builder.append(" ");
+                appendAndIndexWikiCallback.dispatch(key, null, null);
+                appendAndIndexWikiCallback.builder.append("=");
+                appendAndIndexWikiCallback.dispatch(value, null, null);
+                parser.wordForms.add(value);
+            }
+            return true;
         }
-        if (!StringUtil.isNullOrEmpty(literally)) {
-            literally = String.format("literally %s", literally);
+    }
+
+    static final class etyl<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
+        @Override
+        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
+                                      final Map<String, String> namedArgs,
+                                      final T parser,
+                                      final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
+            final String langCode = ListUtil.get(args, 0);
+            if (langCode == null) {
+                return false;
+            }
+            String langName = WiktionaryLangs.getEnglishName(langCode);
+            if (langName != null) {
+                appendAndIndexWikiCallback.dispatch(langName, null);
+            } else {
+                appendAndIndexWikiCallback.dispatch("lang:" + langCode, null);
+            }
+            return true;
         }
-        final List<String> inParens = new ArrayList<String>(Arrays.asList(tr, pos, gloss, literally));
-        cleanList(inParens);
-        appendCommaSeparatedList(appendAndIndexWikiCallback, inParens);
-        
-        if (tr != null) {
-            parser.addLinkToCurrentEntry(tr, lang, EntryTypeName.WIKTIONARY_MENTIONED);
+    }
+
    /**
     * Handles the {{term}} mention template: renders the mentioned word as a
     * wiki link, followed by a parenthesized, comma-separated list of
     * transliteration, part of speech, quoted gloss and literal meaning.
     * Returns true only when every named argument was consumed.
     */
    static final class term<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
        @Override
        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
                                      final Map<String, String> namedArgs,
                                      final T parser,
                                      final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
            namedArgs.remove("sc"); // script hint; not rendered

            // Main text.
            final String lang = namedArgs.remove("lang");
            String head = ListUtil.get(args, 0);
            String display = ListUtil.get(args, 1);
            if (StringUtil.isNullOrEmpty(head) && StringUtil.isNullOrEmpty(display)) {
                // Bare {{term}}: the mentioned word is the page itself.
                head = display = parser.title;
            }
            if (StringUtil.isNullOrEmpty(head)) {
                // Dispatches formatted wiki text.
                appendAndIndexWikiCallback.dispatch(display, null);
            } else {
                if (StringUtil.isNullOrEmpty(display)) {
                    display = head;
                }
                // NOTE(review): wiki syntax is [[target|shown]] but this emits
                // [[display|head]] — looks swapped; confirm against how
                // dispatch() resolves piped links before changing.
                appendAndIndexWikiCallback.dispatch(String.format("[[%s|%s]]", display, head), null);
            }

            // Stuff in ()s.
            final String tr = namedArgs.remove("tr");
            final String pos = namedArgs.remove("pos");
            String gloss = ListUtil.get(args, 2);
            String literally = namedArgs.remove("lit");
            if (!StringUtil.isNullOrEmpty(gloss)) {
                gloss = String.format("\"%s\"", gloss);
            }
            if (!StringUtil.isNullOrEmpty(literally)) {
                literally = String.format("literally %s", literally);
            }
            final List<String> inParens = new ArrayList<String>(Arrays.asList(tr, pos, gloss, literally));
            cleanList(inParens); // drop nulls/empties so separators stay clean
            appendCommaSeparatedList(appendAndIndexWikiCallback, inParens);

            if (tr != null) {
                // Make the transliteration findable from the current entry.
                parser.addLinkToCurrentEntry(tr, lang, EntryTypeName.WIKTIONARY_MENTIONED);
            }
            // Unconsumed named args mean the template wasn't fully understood.
            return namedArgs.isEmpty();
        }

        // Renders " (a, b, c)"; emits nothing for an empty list.
        private void appendCommaSeparatedList(
            final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback,
            final List<String> inParens) {
            if (!inParens.isEmpty()) {
                appendAndIndexWikiCallback.dispatch(" (", null);
                for (int i = 0; i < inParens.size(); ++i) {
                    if (i > 0) {
                        appendAndIndexWikiCallback.dispatch(", ", null);
                    }
                    appendAndIndexWikiCallback.dispatch(inParens.get(i), null);
                }
                appendAndIndexWikiCallback.dispatch(")", null);
            }
        }
-
-  private static void cleanList(List<String> asList) {
-      int pos;
-      while ((pos = asList.indexOf("")) != -1) {
-          asList.remove(pos);
-      }
-      while ((pos = asList.indexOf(null)) != -1) {
-          asList.remove(pos);
-      }
-  }
-
-
-  static {
-    DEFAULT.put("it-noun", new it_noun());
-  } 
-  static final class it_noun implements FunctionCallback<EnParser> {
-    @Override
-    public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-        final Map<String, String> namedArgs,
-        final EnParser parser,
-        final AppendAndIndexWikiCallback<EnParser> appendAndIndexWikiCallback) {
-      parser.titleAppended = true;
-      final String gender = ListUtil.get(args, 0);
-      final String singular = parser.title;
-      final String plural = ListUtil.get(args, 1, null);
-      appendAndIndexWikiCallback.builder.append(" ");
-      appendAndIndexWikiCallback.dispatch(singular, null, null);
-      appendAndIndexWikiCallback.builder.append(" {").append(gender).append("}, ");
-      if (plural != null) {
-        appendAndIndexWikiCallback.dispatch(plural, null, null);
-        appendAndIndexWikiCallback.builder.append(" {pl}");
-        parser.wordForms.add(plural);
-      }
-      final String f = namedArgs.remove("f");
-      if (f != null) {
-          appendAndIndexWikiCallback.builder.append(", ");
-          appendAndIndexWikiCallback.dispatch(f, null, null);
-          appendAndIndexWikiCallback.builder.append(" {f}");
-      }
-      final String m = namedArgs.remove("m");
-      if (m != null) {
-          appendAndIndexWikiCallback.builder.append(", ");
-          appendAndIndexWikiCallback.dispatch(m, null, null);
-          appendAndIndexWikiCallback.builder.append(" {m}");
-      }
-      parser.wordForms.add(singular);
-      if (!namedArgs.isEmpty() || args.size() > 4) {
-        EnParser.LOG.warning("Invalid it-noun: " + wikiTokenizer.token());
-      }
-      return true;
     }
-  }
-
-  static {
-    DEFAULT.put("it-proper noun", new it_proper_noun<EnParser>());
-  } 
-  static final class it_proper_noun<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
-    @Override
-    public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-        final Map<String, String> namedArgs,
-        final T parser,
-        final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
-      return false;
+
+    private static void cleanList(List<String> asList) {
+        int pos;
+        while ((pos = asList.indexOf("")) != -1) {
+            asList.remove(pos);
+        }
+        while ((pos = asList.indexOf(null)) != -1) {
+            asList.remove(pos);
+        }
     }
-  }
-  
-  // -----------------------------------------------------------------------
-  // Italian stuff
-  // -----------------------------------------------------------------------
-  
-static final class it_conj_are<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
-    final it_conj<T> dest;
-    it_conj_are(it_conj<T> dest) {
-      this.dest = dest;
+
+
    static {
        // Register the Italian noun headword-line handler.
        DEFAULT.put("it-noun", new it_noun());
    }
-    @Override
-      public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-          final Map<String, String> namedArgs,
-          final T parser,
-          final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
-        final String h = name.equals("it-conj-care") || name.equals("it-conj-carsi") ? "h" : "";
-        final String i = name.equals("it-conj-ciare") || name.equals("it-conj-ciarsi") ? "i" : "";
-        final String i2 = name.equals("it-conj-iare") || name.equals("it-conj-iarsi") ? "" : "i";
-        final boolean si = name.equals("it-conj-arsi") || name.equals("it-conj-iarsi") || name.equals("it-conj-iarsi-b") || name.equals("it-conj-carsi") || name.equals("it-conj-ciarsi");
-        final String root = args.get(0);
-        passThroughOrFillIn(namedArgs, "inf", root + i + (si ? "arsi" : "are"), false);
-        namedArgs.put("aux", ListUtil.get(args, 1, ""));
-        passThroughOrFillIn(namedArgs, "ger", root + i + "ando" + (si ? "si" : ""), true);
-        passThroughOrFillIn(namedArgs, "presp", root + i + "ante"+ (si ? "si" : ""), true);
-        passThroughOrFillIn(namedArgs, "pastp", root + i + "ato", true);
-        if (si) {
-            passThroughOrFillIn(namedArgs, "pastp2", root + i + "atosi", true);
    /**
     * Handles the Italian {{it-noun}} headword template: renders
     * " singular {gender}, plural {pl}" plus optional feminine/masculine
     * counterparts from f=/m=, recording singular and plural as word forms.
     */
    static final class it_noun implements FunctionCallback<EnParser> {
        @Override
        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
                                      final Map<String, String> namedArgs,
                                      final EnParser parser,
                                      final AppendAndIndexWikiCallback<EnParser> appendAndIndexWikiCallback) {
            // This template supplies the headword itself, so suppress the
            // generic title rendering.
            parser.titleAppended = true;
            final String gender = ListUtil.get(args, 0);
            final String singular = parser.title;
            final String plural = ListUtil.get(args, 1, null);
            appendAndIndexWikiCallback.builder.append(" ");
            appendAndIndexWikiCallback.dispatch(singular, null, null);
            // NOTE(review): the ", " is emitted even when plural is null,
            // leaving a dangling comma — presumably tolerated; confirm.
            appendAndIndexWikiCallback.builder.append(" {").append(gender).append("}, ");
            if (plural != null) {
                appendAndIndexWikiCallback.dispatch(plural, null, null);
                appendAndIndexWikiCallback.builder.append(" {pl}");
                parser.wordForms.add(plural);
            }
            // Feminine counterpart, e.g. f=gatta.
            final String f = namedArgs.remove("f");
            if (f != null) {
                appendAndIndexWikiCallback.builder.append(", ");
                appendAndIndexWikiCallback.dispatch(f, null, null);
                appendAndIndexWikiCallback.builder.append(" {f}");
            }
            // Masculine counterpart, e.g. m=gatto.
            final String m = namedArgs.remove("m");
            if (m != null) {
                appendAndIndexWikiCallback.builder.append(", ");
                appendAndIndexWikiCallback.dispatch(m, null, null);
                appendAndIndexWikiCallback.builder.append(" {m}");
            }
            parser.wordForms.add(singular);
            // Anything beyond the recognized arguments is logged, not fatal.
            if (!namedArgs.isEmpty() || args.size() > 4) {
                EnParser.LOG.warning("Invalid it-noun: " + wikiTokenizer.token());
            }
            return true;
        }
-        final String i2b = (name.equals("it-conj-iare-b") || name.equals("it-conj-iarsi-b")) ? "" : i2;
-        
-        it_conj_passMood(namedArgs, "pres", false, root, Arrays.asList(i + "o", h + i2, i + "a", h + i2 + "amo", i + "ate", i + "ano"));
-        it_conj_passMood(namedArgs, "imperf", false, root, Arrays.asList(i + "avo", i + "avi", i + "ava", i + "avamo", i + "avate", i + "avano"));
-        it_conj_passMood(namedArgs, "prem", false, root, Arrays.asList(i + "ai", i + "asti", i + "ò", i + "ammo", i + "aste", i + "arono"));
-        it_conj_passMood(namedArgs, "fut", true, root, Arrays.asList(h + "erò", h + "erai", h + "erà", h + "eremo", h + "erete", h + "eranno"));
-        it_conj_passMood(namedArgs, "cond", true, root, Arrays.asList(h + "erei", h + "eresti", h + "erebbe", h + "eremmo", h + "ereste", h + "erebbero"));
-        
-        passThroughOrFillIn(namedArgs, "sub123s", root + h + i2, false);
-        passThroughOrFillIn(namedArgs, "sub1p", root + h + i2b + "amo", false);
-        passThroughOrFillIn(namedArgs, "sub2p", root + h + i2b + "ate", false);
-        passThroughOrFillIn(namedArgs, "sub3p", root + h + i2 + "no", false);
-
-        passThroughOrFillIn(namedArgs, "impsub12s", root + i + "assi", false);
-        passThroughOrFillIn(namedArgs, "impsub3s", root + i + "asse", false);
-        passThroughOrFillIn(namedArgs, "impsub1p", root + i + "assimo", false);
-        passThroughOrFillIn(namedArgs, "impsub2p", root + i + "aste", false);
-        passThroughOrFillIn(namedArgs, "impsub3p", root + i + "assero", false);
-
-        passThroughOrFillIn(namedArgs, "imp2s", root + i + "a" + (si ? "ti" : ""), true);
-        passThroughOrFillIn(namedArgs, "imp3s", (si ? "si " : "") + root + h + i2, true);
-        passThroughOrFillIn(namedArgs, "imp1p", root + h + i2b + "amo" + (si ? "ci" : ""), true);
-        passThroughOrFillIn(namedArgs, "imp2p", root + i + "ate" + (si ? "vi" : ""), true);
-        passThroughOrFillIn(namedArgs, "imp3p", (si ? "si " : "") + root + h + i2 + "no", true);
-
-        return dest.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, appendAndIndexWikiCallback);
-      }
     }
 
-  static final class it_conj_ire<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
-    final it_conj<T> dest;
-    it_conj_ire(it_conj<T> dest) {
-      this.dest = dest;
+    static {
+        DEFAULT.put("it-proper noun", new it_proper_noun<EnParser>());
+    }
+    static final class it_proper_noun<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
+        @Override
+        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
+                                      final Map<String, String> namedArgs,
+                                      final T parser,
+                                      final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
+            return false;
+        }
     }
-    @Override
-      public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-          final Map<String, String> namedArgs,
-          final T parser,
-          final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
-        final String root = args.get(0);
-        final String i = name.equals("it-conj-cire") || name.equals("it-conj-cirsi") ? "i" : "";
-        final boolean si = name.equals("it-conj-irsi") || name.equals("it-conj-irsi-b") || name.equals("it-conj-cirsi");
-
-        passThroughOrFillIn(namedArgs, "inf", root + (si ? "irsi" : "ire"), false);
-        namedArgs.put("aux", ListUtil.get(args, 1, ""));
-        passThroughOrFillIn(namedArgs, "ger", root + "endo" + (si ? "si" : ""), true);
-        passThroughOrFillIn(namedArgs, "presp", root + "ente" + (si ? "si" : ""), true);
-        passThroughOrFillIn(namedArgs, "pastp", root + "ito", true);
-        if (si) {
-            passThroughOrFillIn(namedArgs, "pastp2", root + "itosi", true);
+
+    // -----------------------------------------------------------------------
+    // Italian stuff
+    // -----------------------------------------------------------------------
+
+    static final class it_conj_are<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
+        final it_conj<T> dest;
+        it_conj_are(it_conj<T> dest) {
+            this.dest = dest;
+        }
+        @Override
+        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
+                                      final Map<String, String> namedArgs,
+                                      final T parser,
+                                      final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
+            final String h = name.equals("it-conj-care") || name.equals("it-conj-carsi") ? "h" : "";
+            final String i = name.equals("it-conj-ciare") || name.equals("it-conj-ciarsi") ? "i" : "";
+            final String i2 = name.equals("it-conj-iare") || name.equals("it-conj-iarsi") ? "" : "i";
+            final boolean si = name.equals("it-conj-arsi") || name.equals("it-conj-iarsi") || name.equals("it-conj-iarsi-b") || name.equals("it-conj-carsi") || name.equals("it-conj-ciarsi");
+            final String root = args.get(0);
+            passThroughOrFillIn(namedArgs, "inf", root + i + (si ? "arsi" : "are"), false);
+            namedArgs.put("aux", ListUtil.get(args, 1, ""));
+            passThroughOrFillIn(namedArgs, "ger", root + i + "ando" + (si ? "si" : ""), true);
+            passThroughOrFillIn(namedArgs, "presp", root + i + "ante"+ (si ? "si" : ""), true);
+            passThroughOrFillIn(namedArgs, "pastp", root + i + "ato", true);
+            if (si) {
+                passThroughOrFillIn(namedArgs, "pastp2", root + i + "atosi", true);
+            }
+            final String i2b = (name.equals("it-conj-iare-b") || name.equals("it-conj-iarsi-b")) ? "" : i2;
+
+            it_conj_passMood(namedArgs, "pres", false, root, Arrays.asList(i + "o", h + i2, i + "a", h + i2 + "amo", i + "ate", i + "ano"));
+            it_conj_passMood(namedArgs, "imperf", false, root, Arrays.asList(i + "avo", i + "avi", i + "ava", i + "avamo", i + "avate", i + "avano"));
+            it_conj_passMood(namedArgs, "prem", false, root, Arrays.asList(i + "ai", i + "asti", i + "ò", i + "ammo", i + "aste", i + "arono"));
+            it_conj_passMood(namedArgs, "fut", true, root, Arrays.asList(h + "erò", h + "erai", h + "erà", h + "eremo", h + "erete", h + "eranno"));
+            it_conj_passMood(namedArgs, "cond", true, root, Arrays.asList(h + "erei", h + "eresti", h + "erebbe", h + "eremmo", h + "ereste", h + "erebbero"));
+
+            passThroughOrFillIn(namedArgs, "sub123s", root + h + i2, false);
+            passThroughOrFillIn(namedArgs, "sub1p", root + h + i2b + "amo", false);
+            passThroughOrFillIn(namedArgs, "sub2p", root + h + i2b + "ate", false);
+            passThroughOrFillIn(namedArgs, "sub3p", root + h + i2 + "no", false);
+
+            passThroughOrFillIn(namedArgs, "impsub12s", root + i + "assi", false);
+            passThroughOrFillIn(namedArgs, "impsub3s", root + i + "asse", false);
+            passThroughOrFillIn(namedArgs, "impsub1p", root + i + "assimo", false);
+            passThroughOrFillIn(namedArgs, "impsub2p", root + i + "aste", false);
+            passThroughOrFillIn(namedArgs, "impsub3p", root + i + "assero", false);
+
+            passThroughOrFillIn(namedArgs, "imp2s", root + i + "a" + (si ? "ti" : ""), true);
+            passThroughOrFillIn(namedArgs, "imp3s", (si ? "si " : "") + root + h + i2, true);
+            passThroughOrFillIn(namedArgs, "imp1p", root + h + i2b + "amo" + (si ? "ci" : ""), true);
+            passThroughOrFillIn(namedArgs, "imp2p", root + i + "ate" + (si ? "vi" : ""), true);
+            passThroughOrFillIn(namedArgs, "imp3p", (si ? "si " : "") + root + h + i2 + "no", true);
+
+            return dest.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, appendAndIndexWikiCallback);
         }
-        if (!name.endsWith("-b")) {
-            it_conj_passMood(namedArgs, "pres", false, root, Arrays.asList(i + "o", "i", "e", "iamo", "ite", i + "ono"));
-        } else {
-            it_conj_passMood(namedArgs, "pres", false, root, Arrays.asList("isco", "isci", "isce", "iamo", "ite", "iscono"));
+    }
+
+    static final class it_conj_ire<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
+        final it_conj<T> dest;
+        it_conj_ire(it_conj<T> dest) {
+            this.dest = dest;
         }
-        it_conj_passMood(namedArgs, "imperf", false, root, Arrays.asList("ivo", "ivi", "iva", "ivamo", "ivate", "ivano"));
-        it_conj_passMood(namedArgs, "prem", false, root, Arrays.asList("ii", "isti", "ì", "immo", "iste", "irono"));
-        // Regular past historic synonyms:
-        passThroughOrFillIn(namedArgs, "prem3s2", root + "é", true);
-        passThroughOrFillIn(namedArgs, "prem3p2", root + "erono", true);
-        it_conj_passMood(namedArgs, "fut", true, root, Arrays.asList("irò", "irai", "irà", "iremo", "irete", "iranno"));
-        it_conj_passMood(namedArgs, "cond", true, root, Arrays.asList("irei", "iresti", "irebbe", "iremmo", "ireste", "irebbero"));
-
-        if (!name.endsWith("-b")) {
-            passThroughOrFillIn(namedArgs, "sub123s", root + i + "a", false);
-            passThroughOrFillIn(namedArgs, "sub3p", root + i + "ano", false);
-        } else {
-            passThroughOrFillIn(namedArgs, "sub123s", root + "isca", false);
-            passThroughOrFillIn(namedArgs, "sub3p", root + "iscano", false);
+        @Override
+        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
+                                      final Map<String, String> namedArgs,
+                                      final T parser,
+                                      final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
+            final String root = args.get(0);
+            final String i = name.equals("it-conj-cire") || name.equals("it-conj-cirsi") ? "i" : "";
+            final boolean si = name.equals("it-conj-irsi") || name.equals("it-conj-irsi-b") || name.equals("it-conj-cirsi");
+
+            passThroughOrFillIn(namedArgs, "inf", root + (si ? "irsi" : "ire"), false);
+            namedArgs.put("aux", ListUtil.get(args, 1, ""));
+            passThroughOrFillIn(namedArgs, "ger", root + "endo" + (si ? "si" : ""), true);
+            passThroughOrFillIn(namedArgs, "presp", root + "ente" + (si ? "si" : ""), true);
+            passThroughOrFillIn(namedArgs, "pastp", root + "ito", true);
+            if (si) {
+                passThroughOrFillIn(namedArgs, "pastp2", root + "itosi", true);
+            }
+            if (!name.endsWith("-b")) {
+                it_conj_passMood(namedArgs, "pres", false, root, Arrays.asList(i + "o", "i", "e", "iamo", "ite", i + "ono"));
+            } else {
+                it_conj_passMood(namedArgs, "pres", false, root, Arrays.asList("isco", "isci", "isce", "iamo", "ite", "iscono"));
+            }
+            it_conj_passMood(namedArgs, "imperf", false, root, Arrays.asList("ivo", "ivi", "iva", "ivamo", "ivate", "ivano"));
+            it_conj_passMood(namedArgs, "prem", false, root, Arrays.asList("ii", "isti", "ì", "immo", "iste", "irono"));
+            // Regular past historic synonyms:
+            passThroughOrFillIn(namedArgs, "prem3s2", root + "é", true);
+            passThroughOrFillIn(namedArgs, "prem3p2", root + "erono", true);
+            it_conj_passMood(namedArgs, "fut", true, root, Arrays.asList("irò", "irai", "irà", "iremo", "irete", "iranno"));
+            it_conj_passMood(namedArgs, "cond", true, root, Arrays.asList("irei", "iresti", "irebbe", "iremmo", "ireste", "irebbero"));
+
+            if (!name.endsWith("-b")) {
+                passThroughOrFillIn(namedArgs, "sub123s", root + i + "a", false);
+                passThroughOrFillIn(namedArgs, "sub3p", root + i + "ano", false);
+            } else {
+                passThroughOrFillIn(namedArgs, "sub123s", root + "isca", false);
+                passThroughOrFillIn(namedArgs, "sub3p", root + "iscano", false);
+            }
+            passThroughOrFillIn(namedArgs, "sub1p", root + "iamo", false);
+            passThroughOrFillIn(namedArgs, "sub2p", root + "iate", false);
+
+            passThroughOrFillIn(namedArgs, "impsub12s", root + "issi", false);
+            passThroughOrFillIn(namedArgs, "impsub3s", root + "isse", false);
+            passThroughOrFillIn(namedArgs, "impsub1p", root + "issimo", false);
+            passThroughOrFillIn(namedArgs, "impsub2p", root + "iste", false);
+            passThroughOrFillIn(namedArgs, "impsub3p", root + "issero", false);
+
+            if (!name.endsWith("-b")) {
+                passThroughOrFillIn(namedArgs, "imp2s", root + "i" + (si ? "ti" : ""), true);
+                passThroughOrFillIn(namedArgs, "imp3s", (si ? "si " : "") + root + i + "a", true);
+                passThroughOrFillIn(namedArgs, "imp3p", (si ? "si " : "") + root + i + "ano", true);
+            } else {
+                passThroughOrFillIn(namedArgs, "imp2s", root + "isci" + (si ? "ti" : ""), true);
+                passThroughOrFillIn(namedArgs, "imp3s", (si ? "si " : "") + root + "isca", true);
+                passThroughOrFillIn(namedArgs, "imp3p", (si ? "si " : "") + root + "iscano", true);
+            }
+            passThroughOrFillIn(namedArgs, "imp1p", root + "iamo" + (si ? "ci" : ""), true);
+            passThroughOrFillIn(namedArgs, "imp2p", root + "ite" + (si ? "vi" : ""), true);
+
+            return dest.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, appendAndIndexWikiCallback);
         }
-        passThroughOrFillIn(namedArgs, "sub1p", root + "iamo", false);
-        passThroughOrFillIn(namedArgs, "sub2p", root + "iate", false);
+    }
 
-        passThroughOrFillIn(namedArgs, "impsub12s", root + "issi", false);
-        passThroughOrFillIn(namedArgs, "impsub3s", root + "isse", false);
-        passThroughOrFillIn(namedArgs, "impsub1p", root + "issimo", false);
-        passThroughOrFillIn(namedArgs, "impsub2p", root + "iste", false);
-        passThroughOrFillIn(namedArgs, "impsub3p", root + "issero", false);
 
-        if (!name.endsWith("-b")) {
-            passThroughOrFillIn(namedArgs, "imp2s", root + "i" + (si ? "ti" : ""), true);
-            passThroughOrFillIn(namedArgs, "imp3s", (si ? "si " : "") + root + i + "a", true);
-            passThroughOrFillIn(namedArgs, "imp3p", (si ? "si " : "") + root + i + "ano", true);
-        } else {
-            passThroughOrFillIn(namedArgs, "imp2s", root + "isci" + (si ? "ti" : ""), true);
-            passThroughOrFillIn(namedArgs, "imp3s", (si ? "si " : "") + root + "isca", true);
-            passThroughOrFillIn(namedArgs, "imp3p", (si ? "si " : "") + root + "iscano", true);
+    static final class it_conj_ere<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
+        final it_conj<T> dest;
+        it_conj_ere(it_conj<T> dest) {
+            this.dest = dest;
         }
-        passThroughOrFillIn(namedArgs, "imp1p", root + "iamo" + (si ? "ci" : ""), true);
-        passThroughOrFillIn(namedArgs, "imp2p", root + "ite" + (si ? "vi" : ""), true);
+        @Override
+        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
+                                      final Map<String, String> namedArgs,
+                                      final T parser,
+                                      final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
+            final String root = args.get(0);
+            final boolean si = name.equals("it-conj-ersi");
+
+            passThroughOrFillIn(namedArgs, "inf", root + (si ? "ersi" : "ere"), false);
+            namedArgs.put("aux", ListUtil.get(args, 1, ""));
+            passThroughOrFillIn(namedArgs, "ger", root + "endo" + (si ? "si" : ""), true);
+            passThroughOrFillIn(namedArgs, "presp", root + "ente" + (si ? "si" : ""), true);
+            passThroughOrFillIn(namedArgs, "pastp", root + "uto", true);
+            if (si) {
+                passThroughOrFillIn(namedArgs, "pastp2", root + "utosi", true);
+            }
+            it_conj_passMood(namedArgs, "pres", false, root, Arrays.asList("o", "i", "e", "iamo", "ete", "ono"));
+            it_conj_passMood(namedArgs, "imperf", false, root, Arrays.asList("evo", "evi", "eva", "evamo", "evate", "evano"));
+            it_conj_passMood(namedArgs, "prem", false, root, Arrays.asList("ei", "esti", "ette", "emmo", "este", "ettero"));
+            // Regular past historic synonyms:
+            passThroughOrFillIn(namedArgs, "prem3s2", root + "é", true);
+            passThroughOrFillIn(namedArgs, "prem3p2", root + "erono", true);
+            it_conj_passMood(namedArgs, "fut", true, root, Arrays.asList("erò", "erai", "erà", "eremo", "erete", "eranno"));
+            it_conj_passMood(namedArgs, "cond", true, root, Arrays.asList("erei", "eresti", "erebbe", "eremmo", "ereste", "erebbero"));
+
+            passThroughOrFillIn(namedArgs, "sub123s", root + "a", false);
+            passThroughOrFillIn(namedArgs, "sub1p", root + "iamo", false);
+            passThroughOrFillIn(namedArgs, "sub2p", root + "iate", false);
+            passThroughOrFillIn(namedArgs, "sub3p", root + "ano", false);
+
+            passThroughOrFillIn(namedArgs, "impsub12s", root + "essi", false);
+            passThroughOrFillIn(namedArgs, "impsub3s", root + "esse", false);
+            passThroughOrFillIn(namedArgs, "impsub1p", root + "essimo", false);
+            passThroughOrFillIn(namedArgs, "impsub2p", root + "este", false);
+            passThroughOrFillIn(namedArgs, "impsub3p", root + "essero", false);
+
+            passThroughOrFillIn(namedArgs, "imp2s", root + "i" + (si ? "ti" : ""), true);
+            passThroughOrFillIn(namedArgs, "imp3s", (si ? "si " : "") + root + "a", true);
+            passThroughOrFillIn(namedArgs, "imp1p", root + "iamo" + (si ? "ci" : ""), true);
+            passThroughOrFillIn(namedArgs, "imp2p", root + "ete" + (si ? "vi" : ""), true);
+            passThroughOrFillIn(namedArgs, "imp3p", (si ? "si " : "") + root + "ano", true);
 
-        return dest.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, appendAndIndexWikiCallback);
-      }
+            return dest.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, appendAndIndexWikiCallback);
+        }
     }
 
-  
-  static final class it_conj_ere<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
-      final it_conj<T> dest;
-      it_conj_ere(it_conj<T> dest) {
-        this.dest = dest;
-      }
-      @Override
-        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-            final Map<String, String> namedArgs,
-            final T parser,
-            final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
-          final String root = args.get(0);
-          final boolean si = name.equals("it-conj-ersi");
-
-          passThroughOrFillIn(namedArgs, "inf", root + (si ? "ersi" : "ere"), false);
-          namedArgs.put("aux", ListUtil.get(args, 1, ""));
-          passThroughOrFillIn(namedArgs, "ger", root + "endo" + (si ? "si" : ""), true);
-          passThroughOrFillIn(namedArgs, "presp", root + "ente" + (si ? "si" : ""), true);
-          passThroughOrFillIn(namedArgs, "pastp", root + "uto", true);
-          if (si) {
-              passThroughOrFillIn(namedArgs, "pastp2", root + "utosi", true);
-          }
-          it_conj_passMood(namedArgs, "pres", false, root, Arrays.asList("o", "i", "e", "iamo", "ete", "ono"));
-          it_conj_passMood(namedArgs, "imperf", false, root, Arrays.asList("evo", "evi", "eva", "evamo", "evate", "evano"));
-          it_conj_passMood(namedArgs, "prem", false, root, Arrays.asList("ei", "esti", "ette", "emmo", "este", "ettero"));
-          // Regular past historic synonyms:
-          passThroughOrFillIn(namedArgs, "prem3s2", root + "é", true);
-          passThroughOrFillIn(namedArgs, "prem3p2", root + "erono", true);
-          it_conj_passMood(namedArgs, "fut", true, root, Arrays.asList("erò", "erai", "erà", "eremo", "erete", "eranno"));
-          it_conj_passMood(namedArgs, "cond", true, root, Arrays.asList("erei", "eresti", "erebbe", "eremmo", "ereste", "erebbero"));
-
-          passThroughOrFillIn(namedArgs, "sub123s", root + "a", false);
-          passThroughOrFillIn(namedArgs, "sub1p", root + "iamo", false);
-          passThroughOrFillIn(namedArgs, "sub2p", root + "iate", false);
-          passThroughOrFillIn(namedArgs, "sub3p", root + "ano", false);
-
-          passThroughOrFillIn(namedArgs, "impsub12s", root + "essi", false);
-          passThroughOrFillIn(namedArgs, "impsub3s", root + "esse", false);
-          passThroughOrFillIn(namedArgs, "impsub1p", root + "essimo", false);
-          passThroughOrFillIn(namedArgs, "impsub2p", root + "este", false);
-          passThroughOrFillIn(namedArgs, "impsub3p", root + "essero", false);
-
-          passThroughOrFillIn(namedArgs, "imp2s", root + "i" + (si ? "ti" : ""), true);
-          passThroughOrFillIn(namedArgs, "imp3s", (si ? "si " : "") + root + "a", true);
-          passThroughOrFillIn(namedArgs, "imp1p", root + "iamo" + (si ? "ci" : ""), true);
-          passThroughOrFillIn(namedArgs, "imp2p", root + "ete" + (si ? "vi" : ""), true);
-          passThroughOrFillIn(namedArgs, "imp3p", (si ? "si " : "") + root + "ano", true);
-
-          return dest.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, appendAndIndexWikiCallback);
+    static final class it_conj_urre<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
+        final it_conj<T> dest;
+        it_conj_urre(it_conj<T> dest) {
+            this.dest = dest;
         }
-      }
-
-  static final class it_conj_urre<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
-      final it_conj<T> dest;
-      it_conj_urre(it_conj<T> dest) {
-        this.dest = dest;
-      }
-      @Override
+        @Override
         public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-            final Map<String, String> namedArgs,
-            final T parser,
-            final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
-          final String root = args.get(0);
-          final boolean si = name.equals("it-conj-ursi");
-
-          passThroughOrFillIn(namedArgs, "inf", root + (si ? "ursi" : "urre"), false);
-          namedArgs.put("aux", ListUtil.get(args, 1, ""));
-          passThroughOrFillIn(namedArgs, "ger", root + "ucendo" + (si ? "si" : ""), true);
-          passThroughOrFillIn(namedArgs, "presp", root + "ucente" + (si ? "si" : ""), true);
-          passThroughOrFillIn(namedArgs, "pastp", root + "otto", true);
-          if (si) {
-              passThroughOrFillIn(namedArgs, "pastp2", root + "ottosi", true);
-          }
-          it_conj_passMood(namedArgs, "pres", false, root, Arrays.asList("uco", "uci", "uce", "uciamo", "ucete", "ucono"));
-          it_conj_passMood(namedArgs, "imperf", false, root, Arrays.asList("ucevo", "ucevi", "uceva", "ucevamo", "ucevate", "ucevano"));
-          it_conj_passMood(namedArgs, "prem", false, root, Arrays.asList("ussi", "ucesti", "usse", "ucemmo", "uceste", "ussero"));
-          it_conj_passMood(namedArgs, "fut", true, root, Arrays.asList("urrò", "urrai", "urrà", "urremo", "urrete", "urranno"));
-          it_conj_passMood(namedArgs, "cond", true, root, Arrays.asList("urrei", "urresti", "urrebbe", "urremmo", "urreste", "urrebbero"));
-
-          passThroughOrFillIn(namedArgs, "sub123s", root + "uca", false);
-          passThroughOrFillIn(namedArgs, "sub1p", root + "uciamo", false);
-          passThroughOrFillIn(namedArgs, "sub2p", root + "uciate", false);
-          passThroughOrFillIn(namedArgs, "sub3p", root + "ucano", false);
-
-          passThroughOrFillIn(namedArgs, "impsub12s", root + "ucessi", false);
-          passThroughOrFillIn(namedArgs, "impsub3s", root + "ucesse", false);
-          passThroughOrFillIn(namedArgs, "impsub1p", root + "ucessimo", false);
-          passThroughOrFillIn(namedArgs, "impsub2p", root + "uceste", false);
-          passThroughOrFillIn(namedArgs, "impsub3p", root + "ucessero", false);
-
-          passThroughOrFillIn(namedArgs, "imp2s", root + "uci" + (si ? "ti" : ""), true);
-          passThroughOrFillIn(namedArgs, "imp3s", (si ? "si" : "") + root + "uca", true);
-          passThroughOrFillIn(namedArgs, "imp1p", root + "uciamo" + (si ? "ci" : ""), true);
-          passThroughOrFillIn(namedArgs, "imp2p", root + "ucete" + (si ? "vi" : ""), true);
-          passThroughOrFillIn(namedArgs, "imp3p", (si ? "si" : "") + root + "ucano", true);
-
-          return dest.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, appendAndIndexWikiCallback);
+                                      final Map<String, String> namedArgs,
+                                      final T parser,
+                                      final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
+            final String root = args.get(0);
+            final boolean si = name.equals("it-conj-ursi");
+
+            passThroughOrFillIn(namedArgs, "inf", root + (si ? "ursi" : "urre"), false);
+            namedArgs.put("aux", ListUtil.get(args, 1, ""));
+            passThroughOrFillIn(namedArgs, "ger", root + "ucendo" + (si ? "si" : ""), true);
+            passThroughOrFillIn(namedArgs, "presp", root + "ucente" + (si ? "si" : ""), true);
+            passThroughOrFillIn(namedArgs, "pastp", root + "otto", true);
+            if (si) {
+                passThroughOrFillIn(namedArgs, "pastp2", root + "ottosi", true);
+            }
+            it_conj_passMood(namedArgs, "pres", false, root, Arrays.asList("uco", "uci", "uce", "uciamo", "ucete", "ucono"));
+            it_conj_passMood(namedArgs, "imperf", false, root, Arrays.asList("ucevo", "ucevi", "uceva", "ucevamo", "ucevate", "ucevano"));
+            it_conj_passMood(namedArgs, "prem", false, root, Arrays.asList("ussi", "ucesti", "usse", "ucemmo", "uceste", "ussero"));
+            it_conj_passMood(namedArgs, "fut", true, root, Arrays.asList("urrò", "urrai", "urrà", "urremo", "urrete", "urranno"));
+            it_conj_passMood(namedArgs, "cond", true, root, Arrays.asList("urrei", "urresti", "urrebbe", "urremmo", "urreste", "urrebbero"));
+
+            passThroughOrFillIn(namedArgs, "sub123s", root + "uca", false);
+            passThroughOrFillIn(namedArgs, "sub1p", root + "uciamo", false);
+            passThroughOrFillIn(namedArgs, "sub2p", root + "uciate", false);
+            passThroughOrFillIn(namedArgs, "sub3p", root + "ucano", false);
+
+            passThroughOrFillIn(namedArgs, "impsub12s", root + "ucessi", false);
+            passThroughOrFillIn(namedArgs, "impsub3s", root + "ucesse", false);
+            passThroughOrFillIn(namedArgs, "impsub1p", root + "ucessimo", false);
+            passThroughOrFillIn(namedArgs, "impsub2p", root + "uceste", false);
+            passThroughOrFillIn(namedArgs, "impsub3p", root + "ucessero", false);
+
+            passThroughOrFillIn(namedArgs, "imp2s", root + "uci" + (si ? "ti" : ""), true);
+            passThroughOrFillIn(namedArgs, "imp3s", (si ? "si" : "") + root + "uca", true);
+            passThroughOrFillIn(namedArgs, "imp1p", root + "uciamo" + (si ? "ci" : ""), true);
+            passThroughOrFillIn(namedArgs, "imp2p", root + "ucete" + (si ? "vi" : ""), true);
+            passThroughOrFillIn(namedArgs, "imp3p", (si ? "si" : "") + root + "ucano", true);
+
+            return dest.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, appendAndIndexWikiCallback);
+        }
+    }
+
+    static final class it_conj_fare<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
+        final it_conj<T> dest;
+        it_conj_fare(it_conj<T> dest) {
+            this.dest = dest;
         }
-      }
-
-  static final class it_conj_fare<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
-      final it_conj<T> dest;
-      it_conj_fare(it_conj<T> dest) {
-        this.dest = dest;
-      }
-      @Override
+        @Override
         public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-            final Map<String, String> namedArgs,
-            final T parser,
-            final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
-          final String root = args.get(0);
-          passThroughOrFillIn(namedArgs, "inf", root + "fare", false);
-          namedArgs.put("aux", ListUtil.get(args, 1, ""));
-          passThroughOrFillIn(namedArgs, "ger", root + "facendo", true);
-          passThroughOrFillIn(namedArgs, "presp", root + "facente", true);
-          passThroughOrFillIn(namedArgs, "pastp", root + "fatto", true);
-          it_conj_passMood(namedArgs, "pres", false, root, Arrays.asList("faccio", "fai", "fà", "facciamo", "fate", "fanno"));
-          passThroughOrFillIn(namedArgs, "pres1s2", root + "fò", true);
-          it_conj_passMood(namedArgs, "imperf", false, root, Arrays.asList("facevo", "facevi", "faceva", "facevamo", "facevate", "facevano"));
-          it_conj_passMood(namedArgs, "prem", false, root, Arrays.asList("feci", "facesti", "fece", "facemmo", "faceste", "fecero"));
-          it_conj_passMood(namedArgs, "fut", true, root, Arrays.asList("farò", "farai", "farà", "faremo", "farete", "faranno"));
-          it_conj_passMood(namedArgs, "cond", true, root, Arrays.asList("farei", "faresti", "farebbe", "faremmo", "fareste", "farebbero"));
-
-          passThroughOrFillIn(namedArgs, "sub123s", root + "faccia", false);
-          passThroughOrFillIn(namedArgs, "sub1p", root + "facciamo", false);
-          passThroughOrFillIn(namedArgs, "sub2p", root + "facciate", false);
-          passThroughOrFillIn(namedArgs, "sub3p", root + "facciano", false);
-
-          passThroughOrFillIn(namedArgs, "impsub12s", root + "facessi", false);
-          passThroughOrFillIn(namedArgs, "impsub3s", root + "facesse", false);
-          passThroughOrFillIn(namedArgs, "impsub1p", root + "facessimo", false);
-          passThroughOrFillIn(namedArgs, "impsub2p", root + "faceste", false);
-          passThroughOrFillIn(namedArgs, "impsub3p", root + "facessero", false);
-
-          passThroughOrFillIn(namedArgs, "imp2s", root + "fa", true);
-          passThroughOrFillIn(namedArgs, "imp3s", root + "faccia", true);
-          passThroughOrFillIn(namedArgs, "imp1p", root + "facciamo", true);
-          passThroughOrFillIn(namedArgs, "imp2p", root + "fate", true);
-          passThroughOrFillIn(namedArgs, "imp3p", root + "facciano", true);
-
-          return dest.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, appendAndIndexWikiCallback);
+                                      final Map<String, String> namedArgs,
+                                      final T parser,
+                                      final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
+            final String root = args.get(0);
+            passThroughOrFillIn(namedArgs, "inf", root + "fare", false);
+            namedArgs.put("aux", ListUtil.get(args, 1, ""));
+            passThroughOrFillIn(namedArgs, "ger", root + "facendo", true);
+            passThroughOrFillIn(namedArgs, "presp", root + "facente", true);
+            passThroughOrFillIn(namedArgs, "pastp", root + "fatto", true);
+            it_conj_passMood(namedArgs, "pres", false, root, Arrays.asList("faccio", "fai", "fà", "facciamo", "fate", "fanno"));
+            passThroughOrFillIn(namedArgs, "pres1s2", root + "fò", true);
+            it_conj_passMood(namedArgs, "imperf", false, root, Arrays.asList("facevo", "facevi", "faceva", "facevamo", "facevate", "facevano"));
+            it_conj_passMood(namedArgs, "prem", false, root, Arrays.asList("feci", "facesti", "fece", "facemmo", "faceste", "fecero"));
+            it_conj_passMood(namedArgs, "fut", true, root, Arrays.asList("farò", "farai", "farà", "faremo", "farete", "faranno"));
+            it_conj_passMood(namedArgs, "cond", true, root, Arrays.asList("farei", "faresti", "farebbe", "faremmo", "fareste", "farebbero"));
+
+            passThroughOrFillIn(namedArgs, "sub123s", root + "faccia", false);
+            passThroughOrFillIn(namedArgs, "sub1p", root + "facciamo", false);
+            passThroughOrFillIn(namedArgs, "sub2p", root + "facciate", false);
+            passThroughOrFillIn(namedArgs, "sub3p", root + "facciano", false);
+
+            passThroughOrFillIn(namedArgs, "impsub12s", root + "facessi", false);
+            passThroughOrFillIn(namedArgs, "impsub3s", root + "facesse", false);
+            passThroughOrFillIn(namedArgs, "impsub1p", root + "facessimo", false);
+            passThroughOrFillIn(namedArgs, "impsub2p", root + "faceste", false);
+            passThroughOrFillIn(namedArgs, "impsub3p", root + "facessero", false);
+
+            passThroughOrFillIn(namedArgs, "imp2s", root + "fa", true);
+            passThroughOrFillIn(namedArgs, "imp3s", root + "faccia", true);
+            passThroughOrFillIn(namedArgs, "imp1p", root + "facciamo", true);
+            passThroughOrFillIn(namedArgs, "imp2p", root + "fate", true);
+            passThroughOrFillIn(namedArgs, "imp3p", root + "facciano", true);
+
+            return dest.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, appendAndIndexWikiCallback);
         }
-      }
-
-  static final Map<String,String> it_indicativePronouns = new LinkedHashMap<String, String>();
-  static {
-      it_indicativePronouns.put("1s", "io");
-      it_indicativePronouns.put("2s", "tu");
-      it_indicativePronouns.put("3s", "lui/lei");
-      it_indicativePronouns.put("1p", "noi");
-      it_indicativePronouns.put("2p", "voi");
-      it_indicativePronouns.put("3p", "essi/esse");
-  }
-
-  static final Map<String,String> it_subjunctivePronouns = new LinkedHashMap<String, String>();
-  static {
-      it_subjunctivePronouns.put("1s", "che io");
-      it_subjunctivePronouns.put("2s", "che tu");
-      it_subjunctivePronouns.put("3s", "che lui/lei");
-      it_subjunctivePronouns.put("1p", "che noi");
-      it_subjunctivePronouns.put("2p", "che voi");
-      it_subjunctivePronouns.put("3p", "che essi/esse");
-  }
-
-  static final Map<String,String> it_imperativePronouns = new LinkedHashMap<String, String>();
-  static {
-      it_imperativePronouns.put("1s", "-");
-      it_imperativePronouns.put("2s", "tu");
-      it_imperativePronouns.put("3s", "lui/lei");
-      it_imperativePronouns.put("1p", "noi");
-      it_imperativePronouns.put("2p", "voi");
-      it_imperativePronouns.put("3p", "essi/esse");
-  }
-
-
-  static final class it_conj<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
-      @Override
-      public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-          final Map<String, String> namedArgs,
-          final T parser,
-          final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
-        
-        final StringBuilder builder = appendAndIndexWikiCallback.builder;
-        
-        final String inf = namedArgs.get("inf");
-        
-        // TODO: center everything horizontally.
-        builder.append("<table style=\"background:#F0F0F0\">");
-        
-        builder.append("<tr>");
-        builder.append("<th colspan=\"1\" style=\"background:#e2e4c0\">infinito</th>");
-        builder.append("<td colspan=\"1\">");
-        appendAndIndexWikiCallback.dispatch(MapUtil.safeRemove(namedArgs, "inf", "-"), null);
-        builder.append("</td>");
-        builder.append("</tr>\n");
-
-        builder.append("<tr>");
-        builder.append("<th colspan=\"1\" style=\"background:#e2e4c0\">verbo ausiliare</th>");
-        builder.append("<td colspan=\"1\">");
-        appendAndIndexWikiCallback.dispatch(MapUtil.safeRemove(namedArgs, "aux", "-"), null);
-        builder.append("</td>");
-        builder.append("<th colspan=\"1\" style=\"background:#e2e4c0\">gerundio</th>");
-        builder.append("<td colspan=\"1\">");
-        outputKeyVariations(appendAndIndexWikiCallback, builder, "ger", namedArgs, true);
-        builder.append("</td>");
-        builder.append("</tr>\n");
-
-        builder.append("<tr>");
-        builder.append("<th colspan=\"1\" style=\"background:#e2e4c0\">participio presente</th>");
-        builder.append("<td colspan=\"1\">");
-        outputKeyVariations(appendAndIndexWikiCallback, builder, "presp", namedArgs, true);
-        builder.append("</td>");
-        builder.append("<th colspan=\"1\" style=\"background:#e2e4c0\">participio passato</th>");
-        builder.append("<td colspan=\"1\">");
-        outputKeyVariations(appendAndIndexWikiCallback, builder, "pastp", namedArgs, true);
-        builder.append("</td>");
-        builder.append("</tr>\n");
-        
-        final List<String> prefixes = (inf != null && inf.endsWith("si")) ? it_reflexive_pronouns : it_empty; 
-
-        String style = " style=\"background:#c0cfe4\"";
-        outputDataRow(appendAndIndexWikiCallback, style, "indicativo", style, "th", "", new LinkedHashMap<String, String>(it_indicativePronouns), it_empty, false);
-        outputDataRow(appendAndIndexWikiCallback, style, "presente", "", "td", "pres", namedArgs, prefixes, true);
-        outputDataRow(appendAndIndexWikiCallback, style, "imperfetto", "", "td", "imperf", namedArgs, prefixes, true);
-        outputDataRow(appendAndIndexWikiCallback, style, "passato remoto", "", "td", "prem", namedArgs, prefixes, true);
-        outputDataRow(appendAndIndexWikiCallback, style, "futuro", "", "td", "fut", namedArgs, prefixes, true);
-
-        style = " style=\"background:#c0d8e4\"";
-        outputDataRow(appendAndIndexWikiCallback, style, "condizionale", style, "th", "", new LinkedHashMap<String, String>(it_indicativePronouns), it_empty, false);
-        outputDataRow(appendAndIndexWikiCallback, style, "presente", "", "td", "cond", namedArgs, prefixes, true);
-
-        style = " style=\"background:#c0e4c0\"";
-        outputDataRow(appendAndIndexWikiCallback, style, "congiuntivo", style, "th", "", new LinkedHashMap<String, String>(it_subjunctivePronouns), it_empty, false);
-        namedArgs.put("sub3s2", namedArgs.remove("sub3s"));
-        namedArgs.put("sub1s", namedArgs.get("sub123s"));
-        namedArgs.put("sub2s", namedArgs.get("sub123s"));
-        namedArgs.put("sub3s", namedArgs.remove("sub123s"));
-        namedArgs.put("sub1s2", namedArgs.get("sub123s2"));
-        namedArgs.put("sub2s2", namedArgs.get("sub123s2"));
-        namedArgs.put("sub3s2", namedArgs.remove("sub123s2"));
-        outputDataRow(appendAndIndexWikiCallback, style, "presente", "", "td", "sub", namedArgs, prefixes, true);
-        namedArgs.put("impsub1s", namedArgs.get("impsub12s"));
-        namedArgs.put("impsub2s", namedArgs.remove("impsub12s"));
-        namedArgs.put("impsub1s2", namedArgs.get("impsub12s2"));
-        namedArgs.put("impsub2s2", namedArgs.remove("impsub12s2"));
-        outputDataRow(appendAndIndexWikiCallback, style, "imperfetto", "", "td", "impsub", namedArgs, prefixes, true);
-
-        style = " style=\"background:#e4d4c0\"";
-        outputDataRow(appendAndIndexWikiCallback, style, "imperativo", style, "th", "", new LinkedHashMap<String, String>(it_imperativePronouns), it_empty, false);
-        outputDataRow(appendAndIndexWikiCallback, style, "", "", "td", "imp", namedArgs, it_empty, false);  // these are attached to the stem.
-
-        builder.append("</table>\n");
-        
-        if (!namedArgs.isEmpty()) {
-            System.err.println("NON-EMPTY namedArgs: " + namedArgs);
-            if ("muovesse".equals(namedArgs.get("impsib3s2"))) {
-                return false;
-            }
-            if ("percuotesse".equals(namedArgs.get("impsib3s2"))) {
+    }
+
+    static final Map<String,String> it_indicativePronouns = new LinkedHashMap<String, String>();
+    static {
+        it_indicativePronouns.put("1s", "io");
+        it_indicativePronouns.put("2s", "tu");
+        it_indicativePronouns.put("3s", "lui/lei");
+        it_indicativePronouns.put("1p", "noi");
+        it_indicativePronouns.put("2p", "voi");
+        it_indicativePronouns.put("3p", "essi/esse");
+    }
+
+    static final Map<String,String> it_subjunctivePronouns = new LinkedHashMap<String, String>();
+    static {
+        it_subjunctivePronouns.put("1s", "che io");
+        it_subjunctivePronouns.put("2s", "che tu");
+        it_subjunctivePronouns.put("3s", "che lui/lei");
+        it_subjunctivePronouns.put("1p", "che noi");
+        it_subjunctivePronouns.put("2p", "che voi");
+        it_subjunctivePronouns.put("3p", "che essi/esse");
+    }
+
+    static final Map<String,String> it_imperativePronouns = new LinkedHashMap<String, String>();
+    static {
+        it_imperativePronouns.put("1s", "-");
+        it_imperativePronouns.put("2s", "tu");
+        it_imperativePronouns.put("3s", "lui/lei");
+        it_imperativePronouns.put("1p", "noi");
+        it_imperativePronouns.put("2p", "voi");
+        it_imperativePronouns.put("3p", "essi/esse");
+    }
+
+
+    static final class it_conj<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
+        @Override
+        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
+                                      final Map<String, String> namedArgs,
+                                      final T parser,
+                                      final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
+
+            final StringBuilder builder = appendAndIndexWikiCallback.builder;
+
+            final String inf = namedArgs.get("inf");
+
+            // TODO: center everything horizontally.
+            builder.append("<table style=\"background:#F0F0F0\">");
+
+            builder.append("<tr>");
+            builder.append("<th colspan=\"1\" style=\"background:#e2e4c0\">infinito</th>");
+            builder.append("<td colspan=\"1\">");
+            appendAndIndexWikiCallback.dispatch(MapUtil.safeRemove(namedArgs, "inf", "-"), null);
+            builder.append("</td>");
+            builder.append("</tr>\n");
+
+            builder.append("<tr>");
+            builder.append("<th colspan=\"1\" style=\"background:#e2e4c0\">verbo ausiliare</th>");
+            builder.append("<td colspan=\"1\">");
+            appendAndIndexWikiCallback.dispatch(MapUtil.safeRemove(namedArgs, "aux", "-"), null);
+            builder.append("</td>");
+            builder.append("<th colspan=\"1\" style=\"background:#e2e4c0\">gerundio</th>");
+            builder.append("<td colspan=\"1\">");
+            outputKeyVariations(appendAndIndexWikiCallback, builder, "ger", namedArgs, true);
+            builder.append("</td>");
+            builder.append("</tr>\n");
+
+            builder.append("<tr>");
+            builder.append("<th colspan=\"1\" style=\"background:#e2e4c0\">participio presente</th>");
+            builder.append("<td colspan=\"1\">");
+            outputKeyVariations(appendAndIndexWikiCallback, builder, "presp", namedArgs, true);
+            builder.append("</td>");
+            builder.append("<th colspan=\"1\" style=\"background:#e2e4c0\">participio passato</th>");
+            builder.append("<td colspan=\"1\">");
+            outputKeyVariations(appendAndIndexWikiCallback, builder, "pastp", namedArgs, true);
+            builder.append("</td>");
+            builder.append("</tr>\n");
+
+            final List<String> prefixes = (inf != null && inf.endsWith("si")) ? it_reflexive_pronouns : it_empty;
+
+            String style = " style=\"background:#c0cfe4\"";
+            outputDataRow(appendAndIndexWikiCallback, style, "indicativo", style, "th", "", new LinkedHashMap<String, String>(it_indicativePronouns), it_empty, false);
+            outputDataRow(appendAndIndexWikiCallback, style, "presente", "", "td", "pres", namedArgs, prefixes, true);
+            outputDataRow(appendAndIndexWikiCallback, style, "imperfetto", "", "td", "imperf", namedArgs, prefixes, true);
+            outputDataRow(appendAndIndexWikiCallback, style, "passato remoto", "", "td", "prem", namedArgs, prefixes, true);
+            outputDataRow(appendAndIndexWikiCallback, style, "futuro", "", "td", "fut", namedArgs, prefixes, true);
+
+            style = " style=\"background:#c0d8e4\"";
+            outputDataRow(appendAndIndexWikiCallback, style, "condizionale", style, "th", "", new LinkedHashMap<String, String>(it_indicativePronouns), it_empty, false);
+            outputDataRow(appendAndIndexWikiCallback, style, "presente", "", "td", "cond", namedArgs, prefixes, true);
+
+            style = " style=\"background:#c0e4c0\"";
+            outputDataRow(appendAndIndexWikiCallback, style, "congiuntivo", style, "th", "", new LinkedHashMap<String, String>(it_subjunctivePronouns), it_empty, false);
+            namedArgs.put("sub3s2", namedArgs.remove("sub3s"));
+            namedArgs.put("sub1s", namedArgs.get("sub123s"));
+            namedArgs.put("sub2s", namedArgs.get("sub123s"));
+            namedArgs.put("sub3s", namedArgs.remove("sub123s"));
+            namedArgs.put("sub1s2", namedArgs.get("sub123s2"));
+            namedArgs.put("sub2s2", namedArgs.get("sub123s2"));
+            namedArgs.put("sub3s2", namedArgs.remove("sub123s2"));
+            outputDataRow(appendAndIndexWikiCallback, style, "presente", "", "td", "sub", namedArgs, prefixes, true);
+            namedArgs.put("impsub1s", namedArgs.get("impsub12s"));
+            namedArgs.put("impsub2s", namedArgs.remove("impsub12s"));
+            namedArgs.put("impsub1s2", namedArgs.get("impsub12s2"));
+            namedArgs.put("impsub2s2", namedArgs.remove("impsub12s2"));
+            outputDataRow(appendAndIndexWikiCallback, style, "imperfetto", "", "td", "impsub", namedArgs, prefixes, true);
+
+            style = " style=\"background:#e4d4c0\"";
+            outputDataRow(appendAndIndexWikiCallback, style, "imperativo", style, "th", "", new LinkedHashMap<String, String>(it_imperativePronouns), it_empty, false);
+            outputDataRow(appendAndIndexWikiCallback, style, "", "", "td", "imp", namedArgs, it_empty, false);  // these are attached to the stem.
+
+            builder.append("</table>\n");
+
+            if (!namedArgs.isEmpty()) {
+                System.err.println("NON-EMPTY namedArgs: " + namedArgs);
+                if ("muovesse".equals(namedArgs.get("impsib3s2"))) {
+                    return false;
+                }
+                if ("percuotesse".equals(namedArgs.get("impsib3s2"))) {
+                    return false;
+                }
+                // Too many to deal with:
+                //assert false;
                 return false;
             }
-            // Too many to deal with:
-            //assert false;
-            return false;
-        }
 
-        return true;
-      }
+            return true;
+        }
 
         private void outputDataRow(AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback,
-                String col1Style, String headerName, 
-                String col2Style, final String type2, 
-                String moodName, Map<String, String> namedArgs, final List<String> prefixes, final boolean isForm) {
+                                   String col1Style, String headerName,
+                                   String col2Style, final String type2,
+                                   String moodName, Map<String, String> namedArgs, final List<String> prefixes, final boolean isForm) {
             final StringBuilder builder = appendAndIndexWikiCallback.builder;
             builder.append("<tr>");
             builder.append("<th colspan=\"1\"").append(col1Style).append(">").append(headerName).append("</th>");
@@ -1183,49 +1188,49 @@ static final class it_conj_are<T extends AbstractWiktionaryParser> implements Fu
             builder.append("</tr>\n");
         }
     }
-  
-  static void passThroughOrFillIn(final Map<String,String> namedArgs, final String key, final String fillIn, final boolean quoteToEmpty) {
-      final String value = namedArgs.get(key);
-      if (quoteToEmpty && "''".equals(value)) {
-          namedArgs.put(key, "");
-          return;
-      }
-      if (value == null || value.equals("")) {
-          namedArgs.put(key, fillIn);
-      }
-  }
-  
-  static final List<String> it_number_s_p = Arrays.asList("s", "p");
-  static final List<String> it_person_1_2_3 = Arrays.asList("1", "2", "3");
-  static final List<String> it_reflexive_pronouns = Arrays.asList("mi ", "ti ", "si ", "ci ", "vi ", "si ");
-  static final List<String> it_empty = Arrays.asList("", "", "", "", "", "");
-  static void it_conj_passMood(final Map<String,String> namedArgs, final String moodName, final boolean quoteToEmpty, final String root, final List<String> suffixes) {
-      assert suffixes.size() == 6;
-      int i = 0;
-      for (final String number : it_number_s_p) {
-          for (final String person : it_person_1_2_3) {
-              passThroughOrFillIn(namedArgs, String.format("%s%s%s", moodName, person, number), root + suffixes.get(i), quoteToEmpty);
-              ++i;
-          }
-      }
-  }
-
-  private static <T extends AbstractWiktionaryParser> void outputKeyVariations(AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback,
-        final StringBuilder builder, final String keyBase, Map<String, String> namedArgs, boolean isForm) {
-    for (int suffix = 0; suffix <= 4; ++suffix) {
-        final String key = suffix == 0 ? keyBase : keyBase + suffix;
-        final String val = namedArgs.remove(key);
-        if (val != null && !val.trim().equals("")) {
-            if (suffix > 0) {
-                builder.append(", ");
-            }
-            appendAndIndexWikiCallback.dispatch(val, null);
-            if (isForm) {
-                appendAndIndexWikiCallback.parser.addLinkToCurrentEntry(val, null, EntryTypeName.WIKTIONARY_INFLECTED_FORM_MULTI);
+
+    static void passThroughOrFillIn(final Map<String,String> namedArgs, final String key, final String fillIn, final boolean quoteToEmpty) {
+        final String value = namedArgs.get(key);
+        if (quoteToEmpty && "''".equals(value)) {
+            namedArgs.put(key, "");
+            return;
+        }
+        if (value == null || value.equals("")) {
+            namedArgs.put(key, fillIn);
+        }
+    }
+
+    static final List<String> it_number_s_p = Arrays.asList("s", "p");
+    static final List<String> it_person_1_2_3 = Arrays.asList("1", "2", "3");
+    static final List<String> it_reflexive_pronouns = Arrays.asList("mi ", "ti ", "si ", "ci ", "vi ", "si ");
+    static final List<String> it_empty = Arrays.asList("", "", "", "", "", "");
+    static void it_conj_passMood(final Map<String,String> namedArgs, final String moodName, final boolean quoteToEmpty, final String root, final List<String> suffixes) {
+        assert suffixes.size() == 6;
+        int i = 0;
+        for (final String number : it_number_s_p) {
+            for (final String person : it_person_1_2_3) {
+                passThroughOrFillIn(namedArgs, String.format("%s%s%s", moodName, person, number), root + suffixes.get(i), quoteToEmpty);
+                ++i;
+            }
+        }
+    }
+
+    private static <T extends AbstractWiktionaryParser> void outputKeyVariations(AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback,
+            final StringBuilder builder, final String keyBase, Map<String, String> namedArgs, boolean isForm) {
+        for (int suffix = 0; suffix <= 4; ++suffix) {
+            final String key = suffix == 0 ? keyBase : keyBase + suffix;
+            final String val = namedArgs.remove(key);
+            if (val != null && !val.trim().equals("")) {
+                if (suffix > 0) {
+                    builder.append(", ");
+                }
+                appendAndIndexWikiCallback.dispatch(val, null);
+                if (isForm) {
+                    appendAndIndexWikiCallback.parser.addLinkToCurrentEntry(val, null, EntryTypeName.WIKTIONARY_INFLECTED_FORM_MULTI);
+                }
             }
         }
     }
-  }
 
 
 }
index 1c7f912a4d6e533292bf0d1180fa339f72ada554..d15cc9292d34962c0ab69f7b5db398f3d105dceb 100644 (file)
@@ -28,127 +28,127 @@ import com.hughes.android.dictionary.parser.WikiTokenizer;
 
 public abstract class EnParser extends AbstractWiktionaryParser {
 
-  // TODO: process {{ttbc}} lines
-  
-  public static final Pattern partOfSpeechHeader = Pattern.compile(
-      "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" +
-      "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" +
-      "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" +
-      "Ligature|Idiom|Phrase|\\{\\{acronym\\}\\}|\\{\\{initialism\\}\\}|" +
-      "\\{\\{abbreviation\\}\\}|" +
-      // These are @deprecated:
-      "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" +
-      "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb|" +
-      // These are extras I found:
-      "Determiner|Numeral|Number|Cardinal number|Ordinal number|Proverb|" +
-      "Particle|Interjection|Pronominal adverb|" +
-      "Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable");
-  
-  static final Set<String> USELESS_WIKI_ARGS = new LinkedHashSet<String>(
-      Arrays.asList(
-          "lang",
-          "sc",
-          "sort",
-          "cat",
-          "cat2",
-          "xs",
-          "nodot"));
-
-  static boolean isIgnorableTitle(final String title) {
-    return title.startsWith("Wiktionary:") ||
-        title.startsWith("Template:") ||
-        title.startsWith("Appendix:") ||
-        title.startsWith("Category:") ||
-        title.startsWith("Index:") ||
-        title.startsWith("MediaWiki:") ||
-        title.startsWith("TransWiki:") ||
-        title.startsWith("Citations:") ||
-        title.startsWith("Concordance:") ||
-        title.startsWith("Help:");
-  }
-  
-  final IndexBuilder enIndexBuilder;
-  final IndexBuilder foreignIndexBuilder;
-  final Pattern langPattern;
-  final Pattern langCodePattern;
-  final boolean swap;
-  
-  // State used while parsing.
-  enum State {
-    TRANSLATION_LINE,
-    ENGLISH_DEF_OF_FOREIGN,
-    ENGLISH_EXAMPLE,
-    FOREIGN_EXAMPLE,
-  }
-  State state = null;
-
-  public boolean entryIsFormOfSomething = false;
-  final Collection<String> wordForms = new ArrayList<String>();
-  boolean titleAppended = false;
-
-
-  final AppendAndIndexWikiCallback<EnParser> appendAndIndexWikiCallback = new AppendAndIndexCallback(this);
-  {
-    appendAndIndexWikiCallback.functionCallbacks.putAll(EnFunctionCallbacks.DEFAULT);
-    for (final String key : new ArrayList<String>(appendAndIndexWikiCallback.functionCallbacks.keySet())) {
-        // Don't handle the it-conj functions here.
-        if (key.startsWith("it-conj")) {
-            appendAndIndexWikiCallback.functionCallbacks.remove(key);
+    // TODO: process {{ttbc}} lines
+
+    public static final Pattern partOfSpeechHeader = Pattern.compile(
+                "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" +
+                "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" +
+                "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" +
+                "Ligature|Idiom|Phrase|\\{\\{acronym\\}\\}|\\{\\{initialism\\}\\}|" +
+                "\\{\\{abbreviation\\}\\}|" +
+                // These are @deprecated:
+                "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" +
+                "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb|" +
+                // These are extras I found:
+                "Determiner|Numeral|Number|Cardinal number|Ordinal number|Proverb|" +
+                "Particle|Interjection|Pronominal adverb|" +
+                "Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable");
+
+    static final Set<String> USELESS_WIKI_ARGS = new LinkedHashSet<String>(
+        Arrays.asList(
+            "lang",
+            "sc",
+            "sort",
+            "cat",
+            "cat2",
+            "xs",
+            "nodot"));
+
+    static boolean isIgnorableTitle(final String title) {
+        return title.startsWith("Wiktionary:") ||
+               title.startsWith("Template:") ||
+               title.startsWith("Appendix:") ||
+               title.startsWith("Category:") ||
+               title.startsWith("Index:") ||
+               title.startsWith("MediaWiki:") ||
+               title.startsWith("TransWiki:") ||
+               title.startsWith("Citations:") ||
+               title.startsWith("Concordance:") ||
+               title.startsWith("Help:");
+    }
+
+    final IndexBuilder enIndexBuilder;
+    final IndexBuilder foreignIndexBuilder;
+    final Pattern langPattern;
+    final Pattern langCodePattern;
+    final boolean swap;
+
+    // State used while parsing.
+    enum State {
+        TRANSLATION_LINE,
+        ENGLISH_DEF_OF_FOREIGN,
+        ENGLISH_EXAMPLE,
+        FOREIGN_EXAMPLE,
+    }
+    State state = null;
+
+    public boolean entryIsFormOfSomething = false;
+    final Collection<String> wordForms = new ArrayList<String>();
+    boolean titleAppended = false;
+
+
+    final AppendAndIndexWikiCallback<EnParser> appendAndIndexWikiCallback = new AppendAndIndexCallback(this);
+    {
+        appendAndIndexWikiCallback.functionCallbacks.putAll(EnFunctionCallbacks.DEFAULT);
+        for (final String key : new ArrayList<String>(appendAndIndexWikiCallback.functionCallbacks.keySet())) {
+            // Don't handle the it-conj functions here.
+            if (key.startsWith("it-conj")) {
+                appendAndIndexWikiCallback.functionCallbacks.remove(key);
+            }
         }
     }
-  }
-  
-  EnParser(final IndexBuilder enIndexBuilder, final IndexBuilder otherIndexBuilder, final Pattern langPattern, final Pattern langCodePattern, final boolean swap) {
-    this.enIndexBuilder = enIndexBuilder;
-    this.foreignIndexBuilder = otherIndexBuilder;
-    this.langPattern = langPattern;
-    this.langCodePattern = langCodePattern;
-    this.swap = swap;
-  }
-
-  @Override
-  void removeUselessArgs(Map<String, String> namedArgs) {
-    namedArgs.keySet().removeAll(USELESS_WIKI_ARGS);
-  }
-  
-  static class AppendAndIndexCallback extends AppendAndIndexWikiCallback<EnParser> {
-
-    public AppendAndIndexCallback(EnParser parser) {
-      super(parser);
+
+    EnParser(final IndexBuilder enIndexBuilder, final IndexBuilder otherIndexBuilder, final Pattern langPattern, final Pattern langCodePattern, final boolean swap) {
+        this.enIndexBuilder = enIndexBuilder;
+        this.foreignIndexBuilder = otherIndexBuilder;
+        this.langPattern = langPattern;
+        this.langCodePattern = langCodePattern;
+        this.swap = swap;
     }
 
     @Override
-    public void onWikiLink(WikiTokenizer wikiTokenizer) {
-      final String text = wikiTokenizer.wikiLinkText();
-      final String link = wikiTokenizer.wikiLinkDest();
-      if (link != null) {
-        if (link.contains("#English")) {
-          dispatch(text, parser.enIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK);
-        } else if (link.contains("#") && parser.langPattern.matcher(link).find()) {
-          dispatch(text, parser.foreignIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF_OTHER_LANG);
-        } else if (link.equals("plural")) {
-          builder.append(text);
-        } else {
-          //LOG.warning("Special link: " + englishTokenizer.token());
-          dispatch(text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK);
+    void removeUselessArgs(Map<String, String> namedArgs) {
+        namedArgs.keySet().removeAll(USELESS_WIKI_ARGS);
+    }
+
+    static class AppendAndIndexCallback extends AppendAndIndexWikiCallback<EnParser> {
+
+        public AppendAndIndexCallback(EnParser parser) {
+            super(parser);
         }
-      } else {
-        // link == null
-        final EntryTypeName entryTypeName;
-        switch (parser.state) {
-        case TRANSLATION_LINE:
-          entryTypeName = EntryTypeName.WIKTIONARY_TRANSLATION_WIKI_TEXT;
-          break;
-        case ENGLISH_DEF_OF_FOREIGN:
-          entryTypeName = EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK;
-          break;
-          default:
-            throw new IllegalStateException("Invalid enum value: " + parser.state);
+
+        @Override
+        public void onWikiLink(WikiTokenizer wikiTokenizer) {
+            final String text = wikiTokenizer.wikiLinkText();
+            final String link = wikiTokenizer.wikiLinkDest();
+            if (link != null) {
+                if (link.contains("#English")) {
+                    dispatch(text, parser.enIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK);
+                } else if (link.contains("#") && parser.langPattern.matcher(link).find()) {
+                    dispatch(text, parser.foreignIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF_OTHER_LANG);
+                } else if (link.equals("plural")) {
+                    builder.append(text);
+                } else {
+                    //LOG.warning("Special link: " + englishTokenizer.token());
+                    dispatch(text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK);
+                }
+            } else {
+                // link == null
+                final EntryTypeName entryTypeName;
+                switch (parser.state) {
+                case TRANSLATION_LINE:
+                    entryTypeName = EntryTypeName.WIKTIONARY_TRANSLATION_WIKI_TEXT;
+                    break;
+                case ENGLISH_DEF_OF_FOREIGN:
+                    entryTypeName = EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK;
+                    break;
+                default:
+                    throw new IllegalStateException("Invalid enum value: " + parser.state);
+                }
+                dispatch(text, entryTypeName);
+            }
         }
-        dispatch(text, entryTypeName);
-      }
+
     }
-    
-  }
 
 }
index d37c0e36f2a61cc858fbc2e356f6959f5b30abc1..8c9683cf2d4df11b440392e655a72626f1dc812c 100644 (file)
@@ -27,205 +27,205 @@ import com.hughes.android.dictionary.parser.WikiTokenizer;
 public final class EnToTranslationParser extends EnParser {
 
     public EnToTranslationParser(final IndexBuilder enIndexBuilder,
-        final IndexBuilder otherIndexBuilder, final Pattern langPattern,
-        final Pattern langCodePattern, final boolean swap) {
-      super(enIndexBuilder, otherIndexBuilder, langPattern, langCodePattern, swap);
+                                 final IndexBuilder otherIndexBuilder, final Pattern langPattern,
+                                 final Pattern langCodePattern, final boolean swap) {
+        super(enIndexBuilder, otherIndexBuilder, langPattern, langCodePattern, swap);
     }
 
     @Override
     void parseSection(String heading, String text) {
-      if (isIgnorableTitle(title)) {
-        return;
-      }
-      heading = heading.replace("=", "").trim();
-      if (!heading.contains("English")) {
-        return;
-      }
-
-      String pos = null;
-      int posDepth = -1;
-
-      final WikiTokenizer wikiTokenizer = new WikiTokenizer(text);
-      while (wikiTokenizer.nextToken() != null) {
-        
-        if (wikiTokenizer.isHeading()) {
-          final String headerName = wikiTokenizer.headingWikiText();
-          
-          if (wikiTokenizer.headingDepth() <= posDepth) {
-            pos = null;
-            posDepth = -1;
-          }
-          
-          if (partOfSpeechHeader.matcher(headerName).matches()) {
-            posDepth = wikiTokenizer.headingDepth();
-            pos = wikiTokenizer.headingWikiText();
-            // TODO: if we're inside the POS section, we should handle the first title line...
-            
-          } else if (headerName.equals("Translations")) {
-            if (pos == null) {
-              LOG.info("Translations without POS (but using anyway): " + title);
+        if (isIgnorableTitle(title)) {
+            return;
+        }
+        heading = heading.replace("=", "").trim();
+        if (!heading.contains("English")) {
+            return;
+        }
+
+        String pos = null;
+        int posDepth = -1;
+
+        final WikiTokenizer wikiTokenizer = new WikiTokenizer(text);
+        while (wikiTokenizer.nextToken() != null) {
+
+            if (wikiTokenizer.isHeading()) {
+                final String headerName = wikiTokenizer.headingWikiText();
+
+                if (wikiTokenizer.headingDepth() <= posDepth) {
+                    pos = null;
+                    posDepth = -1;
+                }
+
+                if (partOfSpeechHeader.matcher(headerName).matches()) {
+                    posDepth = wikiTokenizer.headingDepth();
+                    pos = wikiTokenizer.headingWikiText();
+                    // TODO: if we're inside the POS section, we should handle the first title line...
+
+                } else if (headerName.equals("Translations")) {
+                    if (pos == null) {
+                        LOG.info("Translations without POS (but using anyway): " + title);
+                    }
+                    doTranslations(wikiTokenizer, pos);
+                } else if (headerName.equals("Pronunciation")) {
+                    //doPronunciation(wikiLineReader);
+                }
+            } else if (wikiTokenizer.isFunction()) {
+                final String name = wikiTokenizer.functionName();
+                if (name.equals("head") && pos == null) {
+                    LOG.warning("{{head}} without POS: " + title);
+                }
             }
-            doTranslations(wikiTokenizer, pos);
-          } else if (headerName.equals("Pronunciation")) {
-            //doPronunciation(wikiLineReader);
-          }
-        } else if (wikiTokenizer.isFunction()) {
-          final String name = wikiTokenizer.functionName();
-          if (name.equals("head") && pos == null) {
-            LOG.warning("{{head}} without POS: " + title);
-          }
         }
-      }
     }
 
     private void doTranslations(final WikiTokenizer wikiTokenizer, final String pos) {
-      if (title.equals("absolutely")) {
-        //System.out.println();
-      }
-      
-      String topLevelLang = null;
-      String sense = null;
-      boolean done = false;
-      while (wikiTokenizer.nextToken() != null) {
-        if (wikiTokenizer.isHeading()) {
-          wikiTokenizer.returnToLineStart();
-          return;
-        }
-        if (done) {
-          continue;
+        if (title.equals("absolutely")) {
+            //System.out.println();
         }
-        
-        // Check whether we care about this line:
-        
-        if (wikiTokenizer.isFunction()) {
-          final String functionName = wikiTokenizer.functionName();
-          final List<String> positionArgs = wikiTokenizer.functionPositionArgs();
-          
-          if (functionName.equals("trans-top")) {
-            sense = null;
-            if (wikiTokenizer.functionPositionArgs().size() >= 1) {
-              sense = positionArgs.get(0);
-              sense = WikiTokenizer.toPlainText(sense);
-              //LOG.info("Sense: " + sense);
+
+        String topLevelLang = null;
+        String sense = null;
+        boolean done = false;
+        while (wikiTokenizer.nextToken() != null) {
+            if (wikiTokenizer.isHeading()) {
+                wikiTokenizer.returnToLineStart();
+                return;
+            }
+            if (done) {
+                continue;
             }
-          } else if (functionName.equals("trans-bottom")) {
-            sense = null;
-          } else if (functionName.equals("trans-mid")) {
-          } else if (functionName.equals("trans-see")) {
-           incrementCount("WARNING:trans-see");
-          } else if (functionName.startsWith("picdic")) {
-          } else if (functionName.startsWith("checktrans")) {
-            done = true;
-          } else if (functionName.startsWith("ttbc")) {
-            wikiTokenizer.nextLine();
-            // TODO: would be great to handle ttbc
-            // TODO: Check this: done = true;
-          } else {
-            LOG.warning("Unexpected translation wikifunction: " + wikiTokenizer.token() + ", title=" + title);
-          }
-        } else if (wikiTokenizer.isListItem()) {
-          final String line = wikiTokenizer.listItemWikiText();
-          // This line could produce an output...
-          
+
+            // Check whether we care about this line:
+
+            if (wikiTokenizer.isFunction()) {
+                final String functionName = wikiTokenizer.functionName();
+                final List<String> positionArgs = wikiTokenizer.functionPositionArgs();
+
+                if (functionName.equals("trans-top")) {
+                    sense = null;
+                    if (wikiTokenizer.functionPositionArgs().size() >= 1) {
+                        sense = positionArgs.get(0);
+                        sense = WikiTokenizer.toPlainText(sense);
+                        //LOG.info("Sense: " + sense);
+                    }
+                } else if (functionName.equals("trans-bottom")) {
+                    sense = null;
+                } else if (functionName.equals("trans-mid")) {
+                } else if (functionName.equals("trans-see")) {
+                    incrementCount("WARNING:trans-see");
+                } else if (functionName.startsWith("picdic")) {
+                } else if (functionName.startsWith("checktrans")) {
+                    done = true;
+                } else if (functionName.startsWith("ttbc")) {
+                    wikiTokenizer.nextLine();
+                    // TODO: would be great to handle ttbc
+                    // TODO: Check this: done = true;
+                } else {
+                    LOG.warning("Unexpected translation wikifunction: " + wikiTokenizer.token() + ", title=" + title);
+                }
+            } else if (wikiTokenizer.isListItem()) {
+                final String line = wikiTokenizer.listItemWikiText();
+                // This line could produce an output...
+
 //          if (line.contains("ich hoan dich gear")) {
 //            //System.out.println();
 //          }
-          
-          // First strip the language and check whether it matches.
-          // And hold onto it for sub-lines.
-          final int colonIndex = line.indexOf(":");
-          if (colonIndex == -1) {
-            continue;
-          }
-          
-          final String lang = trim(WikiTokenizer.toPlainText(line.substring(0, colonIndex)));
-          incrementCount("tCount:" + lang);
-          final boolean appendLang;
-          if (wikiTokenizer.listItemPrefix().length() == 1) {
-            topLevelLang = lang;
-            final boolean thisFind = langPattern.matcher(lang).find();
-            if (!thisFind) {
-              continue;
-            }
-            appendLang = !langPattern.matcher(lang).matches();
-          } else if (topLevelLang == null) {
-            continue;
-          } else {
-            // Two-level -- the only way we won't append is if this second level matches exactly.
-            if (!langPattern.matcher(lang).matches() && !langPattern.matcher(topLevelLang).find()) {
-              continue;
+
+                // First strip the language and check whether it matches.
+                // And hold onto it for sub-lines.
+                final int colonIndex = line.indexOf(":");
+                if (colonIndex == -1) {
+                    continue;
+                }
+
+                final String lang = trim(WikiTokenizer.toPlainText(line.substring(0, colonIndex)));
+                incrementCount("tCount:" + lang);
+                final boolean appendLang;
+                if (wikiTokenizer.listItemPrefix().length() == 1) {
+                    topLevelLang = lang;
+                    final boolean thisFind = langPattern.matcher(lang).find();
+                    if (!thisFind) {
+                        continue;
+                    }
+                    appendLang = !langPattern.matcher(lang).matches();
+                } else if (topLevelLang == null) {
+                    continue;
+                } else {
+                    // Two-level -- the only way we won't append is if this second level matches exactly.
+                    if (!langPattern.matcher(lang).matches() && !langPattern.matcher(topLevelLang).find()) {
+                        continue;
+                    }
+                    appendLang = !langPattern.matcher(lang).matches();
+                }
+
+                String rest = line.substring(colonIndex + 1).trim();
+                if (rest.length() > 0) {
+                    doTranslationLine(line, appendLang ? lang : null, pos, sense, rest);
+                }
+
+            } else if (wikiTokenizer.remainderStartsWith("''See''")) {
+                wikiTokenizer.nextLine();
+                incrementCount("WARNING: ''See''" );
+                LOG.fine("Skipping See line: " + wikiTokenizer.token());
+            } else if (wikiTokenizer.isWikiLink()) {
+                final String wikiLink = wikiTokenizer.wikiLinkText();
+                if (wikiLink.contains(":") && wikiLink.contains(title)) {
+                } else if (wikiLink.contains("Category:")) {
+                } else  {
+                    incrementCount("WARNING: Unexpected wikiLink" );
+                    LOG.warning("Unexpected wikiLink: " + wikiTokenizer.token() + ", title=" + title);
+                }
+            } else if (wikiTokenizer.isNewline() || wikiTokenizer.isMarkup() || wikiTokenizer.isComment()) {
+            } else {
+                final String token = wikiTokenizer.token();
+                if (token.equals("----")) {
+                } else {
+                    LOG.warning("Unexpected translation token: " + wikiTokenizer.token() + ", title=" + title);
+                    incrementCount("WARNING: Unexpected translation token" );
+                }
             }
-            appendLang = !langPattern.matcher(lang).matches();
-          }
-          
-          String rest = line.substring(colonIndex + 1).trim();
-          if (rest.length() > 0) {
-            doTranslationLine(line, appendLang ? lang : null, pos, sense, rest);
-          }
-          
-        } else if (wikiTokenizer.remainderStartsWith("''See''")) {
-          wikiTokenizer.nextLine();
-          incrementCount("WARNING: ''See''" );
-          LOG.fine("Skipping See line: " + wikiTokenizer.token());
-        } else if (wikiTokenizer.isWikiLink()) {
-          final String wikiLink = wikiTokenizer.wikiLinkText();
-          if (wikiLink.contains(":") && wikiLink.contains(title)) {
-          } else if (wikiLink.contains("Category:")) {
-          } else  {
-            incrementCount("WARNING: Unexpected wikiLink" );
-            LOG.warning("Unexpected wikiLink: " + wikiTokenizer.token() + ", title=" + title);
-          }
-        } else if (wikiTokenizer.isNewline() || wikiTokenizer.isMarkup() || wikiTokenizer.isComment()) {
-        } else {
-          final String token = wikiTokenizer.token();
-          if (token.equals("----")) { 
-          } else {
-            LOG.warning("Unexpected translation token: " + wikiTokenizer.token() + ", title=" + title);
-            incrementCount("WARNING: Unexpected translation token" );
-          }
+
         }
-        
-      }
     }
-    
+
     private void doTranslationLine(final String line, final String lang, final String pos, final String sense, final String rest) {
-      state = State.TRANSLATION_LINE;
-      // Good chance we'll actually file this one...
-      final PairEntry pairEntry = new PairEntry(entrySource);
-      final IndexedEntry indexedEntry = new IndexedEntry(pairEntry);
-      indexedEntry.isValid = true;
-      
-      final StringBuilder foreignText = new StringBuilder();
-      appendAndIndexWikiCallback.reset(foreignText, indexedEntry);
-      appendAndIndexWikiCallback.dispatch(rest, foreignIndexBuilder, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
-      
-      if (foreignText.length() == 0) {
-        LOG.warning("Empty foreignText: " + line);
-        incrementCount("WARNING: Empty foreignText" );
-        return;
-      }
-      
-      if (lang != null) {
-        foreignText.insert(0, String.format("(%s) ", lang));
-      }
-      
-      StringBuilder englishText = new StringBuilder();
-      
-      englishText.append(title);
-      if (sense != null) {
-        englishText.append(" (").append(sense).append(")");
-        enIndexBuilder.addEntryWithString(indexedEntry, sense, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE);
-      }
-      if (pos != null) {
-        englishText.append(" (").append(pos.toLowerCase()).append(")");
-      }
-      enIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_MULTI);
-      
-      final Pair pair = new Pair(trim(englishText.toString()), trim(foreignText.toString()), swap);
-      pairEntry.pairs.add(pair);
-      if (!pairsAdded.add(pair.toString())) {
-        LOG.warning("Duplicate pair: " + pair.toString());
-        incrementCount("WARNING: Duplicate pair" );
-      }
+        state = State.TRANSLATION_LINE;
+        // Good chance we'll actually file this one...
+        final PairEntry pairEntry = new PairEntry(entrySource);
+        final IndexedEntry indexedEntry = new IndexedEntry(pairEntry);
+        indexedEntry.isValid = true;
+
+        final StringBuilder foreignText = new StringBuilder();
+        appendAndIndexWikiCallback.reset(foreignText, indexedEntry);
+        appendAndIndexWikiCallback.dispatch(rest, foreignIndexBuilder, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
+
+        if (foreignText.length() == 0) {
+            LOG.warning("Empty foreignText: " + line);
+            incrementCount("WARNING: Empty foreignText" );
+            return;
+        }
+
+        if (lang != null) {
+            foreignText.insert(0, String.format("(%s) ", lang));
+        }
+
+        StringBuilder englishText = new StringBuilder();
+
+        englishText.append(title);
+        if (sense != null) {
+            englishText.append(" (").append(sense).append(")");
+            enIndexBuilder.addEntryWithString(indexedEntry, sense, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE);
+        }
+        if (pos != null) {
+            englishText.append(" (").append(pos.toLowerCase()).append(")");
+        }
+        enIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_MULTI);
+
+        final Pair pair = new Pair(trim(englishText.toString()), trim(foreignText.toString()), swap);
+        pairEntry.pairs.add(pair);
+        if (!pairsAdded.add(pair.toString())) {
+            LOG.warning("Duplicate pair: " + pair.toString());
+            incrementCount("WARNING: Duplicate pair" );
+        }
     }
-  }  // EnToTranslationParser
+}  // EnToTranslationParser
index 8025021aea1460d1f37a41716f349a8d5202560d..14cf43c86c9c8eb4bad956c1e3416fc9516c61cc 100644 (file)
@@ -30,126 +30,126 @@ import com.hughes.android.dictionary.parser.wiktionary.EnFunctionCallbacks.Trans
 import com.hughes.util.ListUtil;
 
 public final class EnTranslationToTranslationParser extends AbstractWiktionaryParser {
-  
+
     final List<IndexBuilder> indexBuilders;
     final Pattern[] langCodePatterns;
 
     PairEntry pairEntry = null;
     IndexedEntry indexedEntry = null;
-    StringBuilder[] builders = null; 
-    
-  public static final String NAME = "EnTranslationToTranslation";
-    
-  final Set<String> Ts = new LinkedHashSet<String>(Arrays.asList("t", "t+",
-      "t-", "tø", "apdx-t", "ttbc"));
-    
+    StringBuilder[] builders = null;
+
+    public static final String NAME = "EnTranslationToTranslation";
+
+    final Set<String> Ts = new LinkedHashSet<String>(Arrays.asList("t", "t+",
+            "t-", "tø", "apdx-t", "ttbc"));
+
     public EnTranslationToTranslationParser(final List<IndexBuilder> indexBuilders,
-        final Pattern[] langCodePatterns) {
-      this.indexBuilders = indexBuilders;
-      this.langCodePatterns = langCodePatterns;
+                                            final Pattern[] langCodePatterns) {
+        this.indexBuilders = indexBuilders;
+        this.langCodePatterns = langCodePatterns;
     }
-    
+
     @Override
     void removeUselessArgs(Map<String, String> namedArgs) {
-      namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS);
+        namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS);
     }
-    
+
     @Override
     void parseSection(String heading, String text) {
-      if (EnParser.isIgnorableTitle(title)) {
-        return;
-      }
-      final WikiTokenizer.Callback callback = new WikiTokenizer.DoNothingCallback() {
-        @Override
-        public void onFunction(WikiTokenizer wikiTokenizer, String name,
-            List<String> functionPositionArgs,
-            Map<String, String> functionNamedArgs) {
-          //System.out.println(wikiTokenizer.token());
-          if (Ts.contains(name)) {
-            onT(wikiTokenizer);
-          } else if (name.equals("trans-top") || name.equals("checktrans-top") || name.equals("checktrans")) {
-            startEntry(title, wikiTokenizer.token());
-          } else if (name.equals("trans-bottom")) {
-            finishEntry(title);
-          }
+        if (EnParser.isIgnorableTitle(title)) {
+            return;
         }
+        final WikiTokenizer.Callback callback = new WikiTokenizer.DoNothingCallback() {
+            @Override
+            public void onFunction(WikiTokenizer wikiTokenizer, String name,
+                                   List<String> functionPositionArgs,
+                                   Map<String, String> functionNamedArgs) {
+                //System.out.println(wikiTokenizer.token());
+                if (Ts.contains(name)) {
+                    onT(wikiTokenizer);
+                } else if (name.equals("trans-top") || name.equals("checktrans-top") || name.equals("checktrans")) {
+                    startEntry(title, wikiTokenizer.token());
+                } else if (name.equals("trans-bottom")) {
+                    finishEntry(title);
+                }
+            }
 
-        @Override
-        public void onListItem(WikiTokenizer wikiTokenizer) {
-          WikiTokenizer.dispatch(wikiTokenizer.listItemWikiText(), false, this);
+            @Override
+            public void onListItem(WikiTokenizer wikiTokenizer) {
+                WikiTokenizer.dispatch(wikiTokenizer.listItemWikiText(), false, this);
+            }
+        };
+        WikiTokenizer.dispatch(text, true, callback);
+
+        if (builders != null) {
+            LOG.warning("unended translations: " + title);
+            finishEntry(title);
         }
-      };
-      WikiTokenizer.dispatch(text, true, callback);
-      
-      if (builders != null) {
-        LOG.warning("unended translations: " + title);
-        finishEntry(title);
-      }
-    }
-    
-  final TranslationCallback<EnTranslationToTranslationParser> translationCallback = new TranslationCallback<EnTranslationToTranslationParser>();
-    
-  final AppendAndIndexWikiCallback<EnTranslationToTranslationParser> appendAndIndexWikiCallback = new AppendAndIndexWikiCallback<EnTranslationToTranslationParser>(
-      this);
-  {
-    for (final String t : Ts) {
-      appendAndIndexWikiCallback.functionCallbacks.put(t, translationCallback);
     }
-  }
-    
-  private void onT(WikiTokenizer wikiTokenizer) {
-    if (builders == null) {
-      LOG.warning("{{t...}} section outside of {{trans-top}}: " + title);
-      startEntry(title, "QUICKDIC_OUTSIDE");
-    }
-    
-    final List<String> args = wikiTokenizer.functionPositionArgs();
-    final String langCode = ListUtil.get(args, 0);
-    if (langCode == null) {
-      LOG.warning("Missing langCode: " + wikiTokenizer.token());
-      return;
+
+    final TranslationCallback<EnTranslationToTranslationParser> translationCallback = new TranslationCallback<EnTranslationToTranslationParser>();
+
+    final AppendAndIndexWikiCallback<EnTranslationToTranslationParser> appendAndIndexWikiCallback = new AppendAndIndexWikiCallback<EnTranslationToTranslationParser>(
+        this);
+    {
+        for (final String t : Ts) {
+            appendAndIndexWikiCallback.functionCallbacks.put(t, translationCallback);
+        }
     }
-    for (int p = 0; p < 2; ++p) {
-      if (langCodePatterns[p].matcher(langCode).matches()) {
-        appendAndIndexWikiCallback.builder = builders[p];
-        if (appendAndIndexWikiCallback.builder.length() > 0) {
-          appendAndIndexWikiCallback.builder.append(", ");
+
+    private void onT(WikiTokenizer wikiTokenizer) {
+        if (builders == null) {
+            LOG.warning("{{t...}} section outside of {{trans-top}}: " + title);
+            startEntry(title, "QUICKDIC_OUTSIDE");
+        }
+
+        final List<String> args = wikiTokenizer.functionPositionArgs();
+        final String langCode = ListUtil.get(args, 0);
+        if (langCode == null) {
+            LOG.warning("Missing langCode: " + wikiTokenizer.token());
+            return;
+        }
+        for (int p = 0; p < 2; ++p) {
+            if (langCodePatterns[p].matcher(langCode).matches()) {
+                appendAndIndexWikiCallback.builder = builders[p];
+                if (appendAndIndexWikiCallback.builder.length() > 0) {
+                    appendAndIndexWikiCallback.builder.append(", ");
+                }
+                appendAndIndexWikiCallback.indexBuilder = indexBuilders.get(p);
+                appendAndIndexWikiCallback.onFunction(wikiTokenizer,
+                                                      wikiTokenizer.functionName(), wikiTokenizer.functionPositionArgs(),
+                                                      wikiTokenizer.functionNamedArgs());
+            }
         }
-        appendAndIndexWikiCallback.indexBuilder = indexBuilders.get(p);
-        appendAndIndexWikiCallback.onFunction(wikiTokenizer,
-            wikiTokenizer.functionName(), wikiTokenizer.functionPositionArgs(),
-            wikiTokenizer.functionNamedArgs());
-      }
     }
-  }
 
     void startEntry(final String title, final String func) {
-      if (pairEntry != null) {
-        LOG.warning("startEntry() twice: " + title + ", " + func);
-        finishEntry(title);
-      }
-      
-      pairEntry = new PairEntry(entrySource);
-      indexedEntry = new IndexedEntry(pairEntry);
-      builders = new StringBuilder[] { new StringBuilder(), new StringBuilder() };
-      appendAndIndexWikiCallback.indexedEntry = indexedEntry;
+        if (pairEntry != null) {
+            LOG.warning("startEntry() twice: " + title + ", " + func);
+            finishEntry(title);
+        }
+
+        pairEntry = new PairEntry(entrySource);
+        indexedEntry = new IndexedEntry(pairEntry);
+        builders = new StringBuilder[] { new StringBuilder(), new StringBuilder() };
+        appendAndIndexWikiCallback.indexedEntry = indexedEntry;
     }
-    
+
     void finishEntry(final String title) {
-      if (pairEntry == null) {
-        LOG.warning("finalizeEntry() twice: " + title);
-        return;
-      }
-      final String lang1 = builders[0].toString();
-      final String lang2 = builders[1].toString();
-      if (lang1.length() > 0 && lang2.length() > 0) {
-        pairEntry.pairs.add(new Pair(lang1, lang2));
-        indexedEntry.isValid = true;
-      }
-      
-      pairEntry = null;
-      indexedEntry = null;
-      builders = null;
+        if (pairEntry == null) {
+            LOG.warning("finalizeEntry() twice: " + title);
+            return;
+        }
+        final String lang1 = builders[0].toString();
+        final String lang2 = builders[1].toString();
+        if (lang1.length() > 0 && lang2.length() > 0) {
+            pairEntry.pairs.add(new Pair(lang1, lang2));
+            indexedEntry.isValid = true;
+        }
+
+        pairEntry = null;
+        indexedEntry = null;
+        builders = null;
     }
 
-  }
\ No newline at end of file
+}
\ No newline at end of file
index 7727ad0e1f4f41b5065b89606fcb0afda51bec74..2edf3acf0fedc9d424546e5d62944a748421e3d3 100644 (file)
@@ -23,50 +23,50 @@ import java.util.List;
 import java.util.Map;
 
 class FrFunctionCallbacks {
-  
-  static <T extends AbstractWiktionaryParser> void addGenericCallbacks(Map<String, FunctionCallback<T>> callbacks) {
-      callbacks.put("-étym-", new Redispatch<T>("\n==== Étymologie ====\n"));
-      callbacks.put("-pron-", new Redispatch<T>("\n==== Prononciation ====\n"));
-      callbacks.put("-voir-", new Redispatch<T>("\n==== Voir aussi ====\n"));
-      callbacks.put("-drv-", new Redispatch<T>("\n==== Dérivés ====\n"));
-      callbacks.put("-syn-", new Redispatch<T>("\n==== Synonymes ====\n"));
 
-      callbacks.put("-apr-", new Redispatch<T>("\n==== Apparentés étymologiques ====\n"));
-      callbacks.put("-hyper-", new Redispatch<T>("\n==== Hyperonymes ====\n"));
-      callbacks.put("-hypo-", new Redispatch<T>("\n==== Hyponymes ====\n"));
-      callbacks.put("-réf-", new Redispatch<T>("\n==== Références ====\n"));
-      callbacks.put("-homo-", new Redispatch<T>("\n==== Homophones ====\n"));
-      callbacks.put("-anagr-", new Redispatch<T>("\n==== Anagrammes ====\n"));
-      callbacks.put("-voc-", new Redispatch<T>("\n==== Vocabulaire apparenté par le sens ====\n"));
-      callbacks.put("-exp-", new Redispatch<T>("\n==== Expressions ====\n"));
-      callbacks.put("-note-", new Redispatch<T>("\n==== Note ====\n"));
+    static <T extends AbstractWiktionaryParser> void addGenericCallbacks(Map<String, FunctionCallback<T>> callbacks) {
+        callbacks.put("-étym-", new Redispatch<T>("\n==== Étymologie ====\n"));
+        callbacks.put("-pron-", new Redispatch<T>("\n==== Prononciation ====\n"));
+        callbacks.put("-voir-", new Redispatch<T>("\n==== Voir aussi ====\n"));
+        callbacks.put("-drv-", new Redispatch<T>("\n==== Dérivés ====\n"));
+        callbacks.put("-syn-", new Redispatch<T>("\n==== Synonymes ====\n"));
 
-      callbacks.put("-trad-", new ItFunctionCallbacks.SkipSection<T>());
-  }
+        callbacks.put("-apr-", new Redispatch<T>("\n==== Apparentés étymologiques ====\n"));
+        callbacks.put("-hyper-", new Redispatch<T>("\n==== Hyperonymes ====\n"));
+        callbacks.put("-hypo-", new Redispatch<T>("\n==== Hyponymes ====\n"));
+        callbacks.put("-réf-", new Redispatch<T>("\n==== Références ====\n"));
+        callbacks.put("-homo-", new Redispatch<T>("\n==== Homophones ====\n"));
+        callbacks.put("-anagr-", new Redispatch<T>("\n==== Anagrammes ====\n"));
+        callbacks.put("-voc-", new Redispatch<T>("\n==== Vocabulaire apparenté par le sens ====\n"));
+        callbacks.put("-exp-", new Redispatch<T>("\n==== Expressions ====\n"));
+        callbacks.put("-note-", new Redispatch<T>("\n==== Note ====\n"));
 
-  
-  static final NameAndArgs<EnParser> NAME_AND_ARGS = new NameAndArgs<EnParser>();
-
-  
-  static final class MakeHeadingFromName<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
-    final String header;
-    public MakeHeadingFromName(String header) {
-        this.header = header;
+        callbacks.put("-trad-", new ItFunctionCallbacks.SkipSection<T>());
     }
 
-    @Override
-      public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
-          final Map<String, String> namedArgs,
-          final T parser,
-          final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
-        if (!namedArgs.isEmpty() || args.size() != 0) {
-            return false;
+
+    static final NameAndArgs<EnParser> NAME_AND_ARGS = new NameAndArgs<EnParser>();
+
+
+    static final class MakeHeadingFromName<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
+        final String header;
+        public MakeHeadingFromName(String header) {
+            this.header = header;
+        }
+
+        @Override
+        public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
+                                      final Map<String, String> namedArgs,
+                                      final T parser,
+                                      final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
+            if (!namedArgs.isEmpty() || args.size() != 0) {
+                return false;
+            }
+            //appendAndIndexWikiCallback.builder.append(String.format("<%s>", header));
+            appendAndIndexWikiCallback.dispatch("\n" + header + name + header, null);
+            //appendAndIndexWikiCallback.builder.append(String.format("</%s>\n", header));
+            return true;
         }
-        //appendAndIndexWikiCallback.builder.append(String.format("<%s>", header));
-        appendAndIndexWikiCallback.dispatch("\n" + header + name + header, null);
-        //appendAndIndexWikiCallback.builder.append(String.format("</%s>\n", header));
-        return true;
-      }
     }
 
 
index 550dd5d27154b01a6dcfb80fed3122c64b5b60d9..059497eb39d18ca61f875fc2deb9ecd581e55bdd 100644 (file)
@@ -23,10 +23,10 @@ import com.hughes.android.dictionary.parser.wiktionary.AbstractWiktionaryParser.
 public interface FunctionCallback<T extends AbstractWiktionaryParser> {
 
     boolean onWikiFunction(
-            final WikiTokenizer tokenizer,
-            final String name,
-            final List<String> args,
-            final Map<String, String> namedArgs,
-            final T parser,
-            final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback);
+        final WikiTokenizer tokenizer,
+        final String name,
+        final List<String> args,
+        final Map<String, String> namedArgs,
+        final T parser,
+        final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback);
 }
index 3b089fde647838f556469afa320de221dfdcff9b..8278ccd09b8804b1096bfea35bff5545ff1ce885 100644 (file)
@@ -24,7 +24,7 @@ import java.util.Map;
 class ItFunctionCallbacks {
 
     static <T extends AbstractWiktionaryParser> void addGenericCallbacks(
-            Map<String, FunctionCallback<T>> callbacks) {
+        Map<String, FunctionCallback<T>> callbacks) {
         callbacks.put("-hyph-", new Redispatch<T>("\n==== Sillabazione ====\n"));
         callbacks.put("-pron-", new Redispatch<T>("\n==== Pronuncia ====\n"));
         callbacks.put("-etim-", new Redispatch<T>("\n==== Etimologia / Derivazione ====\n"));
@@ -44,7 +44,7 @@ class ItFunctionCallbacks {
     static final NameAndArgs<EnParser> NAME_AND_ARGS = new NameAndArgs<EnParser>();
 
     static final class Redispatch<T extends AbstractWiktionaryParser> implements
-            FunctionCallback<T> {
+        FunctionCallback<T> {
         final String newText;
 
         public Redispatch(String newText) {
@@ -53,10 +53,10 @@ class ItFunctionCallbacks {
 
         @Override
         public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name,
-                final List<String> args,
-                final Map<String, String> namedArgs,
-                final T parser,
-                final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
+                                      final List<String> args,
+                                      final Map<String, String> namedArgs,
+                                      final T parser,
+                                      final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
             if (!namedArgs.isEmpty() || args.size() != 0) {
                 return false;
             }
@@ -66,18 +66,18 @@ class ItFunctionCallbacks {
     }
 
     static final class SkipSection<T extends AbstractWiktionaryParser> implements
-            FunctionCallback<T> {
+        FunctionCallback<T> {
         public SkipSection() {
         }
 
         @Override
         public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name,
-                final List<String> args,
-                final Map<String, String> namedArgs,
-                final T parser,
-                final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
+                                      final List<String> args,
+                                      final Map<String, String> namedArgs,
+                                      final T parser,
+                                      final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
             while (wikiTokenizer.nextToken() != null) {
-                if (wikiTokenizer.isFunction() 
+                if (wikiTokenizer.isFunction()
                         && wikiTokenizer.functionName().startsWith("-")
                         && wikiTokenizer.functionName().endsWith("-")
                         // Hack to prevent infinite-looping, would be better to check that this func was at the start of the line.
index e861b9ddabd3da127833f83b0db3ada33c77f97e..0066d3bc7aeed2dabbb85dccad7d89df848be65d 100644 (file)
@@ -28,7 +28,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
         boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
         String adjustWikiLink(String wikiLinkDest, final String wikiLinkText);
         void addFunctionCallbacks(
-                Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
+            Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
     }
     static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<String,LangConfig>();
     static {
@@ -38,7 +38,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
             public boolean skipSection(String headingText) {
                 return enSkipSections.matcher(headingText).matches();
             }
-            
+
             @Override
             public EntryTypeName sectionNameToEntryType(String sectionName) {
                 if (sectionName.equalsIgnoreCase("Synonyms")) {
@@ -56,7 +56,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
                 }
                 return null;
             }
-            
+
             @Override
             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
                 final String wikiText = wikiTokenizer.wikiLinkText();
@@ -82,11 +82,11 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
 
             @Override
             public void addFunctionCallbacks(
-                    Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
+                Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
                 EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
             }
         });
-        
+
         final Pattern esSkipSections = Pattern.compile(".*(Traducciones|Locuciones).*");
         isoToLangConfig.put("ES", new LangConfig() {
             @Override
@@ -130,7 +130,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
 
             @Override
             public void addFunctionCallbacks(
-                    Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
+                Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
                 // TODO: need Spanish variant
             }
         });
@@ -141,7 +141,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
             public boolean skipSection(String headingText) {
                 return deSkipSections.matcher(headingText).matches();
             }
-            
+
             @Override
             public EntryTypeName sectionNameToEntryType(String sectionName) {
                 if (sectionName.equalsIgnoreCase("Synonyme")) {
@@ -152,7 +152,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
                 }
                 return null;
             }
-            
+
             @Override
             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
                 final String wikiText = wikiTokenizer.wikiLinkText();
@@ -178,18 +178,18 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
 
             @Override
             public void addFunctionCallbacks(
-                    Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
+                Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
                 DeFunctionCallbacks.addGenericCallbacks(functionCallbacks);
             }
         });
-        
+
         final Pattern itSkipSections = Pattern.compile(".*(Traduzione|Note / Riferimenti).*");
         isoToLangConfig.put("IT", new LangConfig() {
             @Override
             public boolean skipSection(String headingText) {
                 return itSkipSections.matcher(headingText).matches();
             }
-            
+
             @Override
             public EntryTypeName sectionNameToEntryType(String sectionName) {
                 if (sectionName.equalsIgnoreCase("Sinonimi")) {
@@ -200,7 +200,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
                 }
                 return null;
             }
-            
+
             @Override
             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
                 final String wikiText = wikiTokenizer.wikiLinkText();
@@ -226,7 +226,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
 
             @Override
             public void addFunctionCallbacks(
-                    Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
+                Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
                 ItFunctionCallbacks.addGenericCallbacks(functionCallbacks);
             }
         });
@@ -238,7 +238,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
             public boolean skipSection(String headingText) {
                 return frSkipSections.matcher(headingText).matches();
             }
-            
+
             @Override
             public EntryTypeName sectionNameToEntryType(String sectionName) {
                 if (sectionName.equalsIgnoreCase("Synonymes")) {
@@ -249,7 +249,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
                 }
                 return null;
             }
-            
+
             @Override
             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
                 final String wikiText = wikiTokenizer.wikiLinkText();
@@ -275,7 +275,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
 
             @Override
             public void addFunctionCallbacks(
-                    Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
+                Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
                 FrFunctionCallbacks.addGenericCallbacks(functionCallbacks);
             }
         });
@@ -286,10 +286,10 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
     final String skipLangIso;
     final LangConfig langConfig;
     final String webUrlTemplate;
-    
+
 
     public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final IndexBuilder defIndexBuilder, final String wiktionaryIso, final String skipLangIso,
-            final String webUrlTemplate) {
+                                    final String webUrlTemplate) {
         this.titleIndexBuilder = titleIndexBuilder;
         this.defIndexBuilder = defIndexBuilder;
         assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
@@ -297,7 +297,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
         this.skipLangIso = skipLangIso;
         this.webUrlTemplate = webUrlTemplate;
     }
-    
+
     IndexedEntry indexedEntry = null;
 
     @Override
@@ -307,7 +307,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
         indexedEntry = new IndexedEntry(htmlEntry);
 
         final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
-                this);
+            this);
         langConfig.addFunctionCallbacks(callback.functionCallbacks);
 
         callback.builder = new StringBuilder();
@@ -316,11 +316,11 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
 
         if (webUrlTemplate != null) {
             final String webUrl = String.format(webUrlTemplate, title);
-           // URI.create can raise an exception e.g. if webUrl contains %, just ignore those cases.
-           try {
-            callback.builder.append(String.format("<p> <a href=\"%s\">%s</a>", URI.create(webUrl).toASCIIString(), escapeHtmlLiteral(webUrl)));
-           } catch (Exception e)
-           {}
+            // URI.create can raise an exception e.g. if webUrl contains %, just ignore those cases.
+            try {
+                callback.builder.append(String.format("<p> <a href=\"%s\">%s</a>", URI.create(webUrl).toASCIIString(), escapeHtmlLiteral(webUrl)));
+            } catch (Exception e) {
+            }
         }
         htmlEntry.html = callback.builder.toString();
         indexedEntry.isValid = true;
@@ -332,26 +332,26 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
         tokenData.htmlEntries.add(htmlEntry);
         // titleIndexBuilder.addEntryWithString(indexedEntry, title,
         // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
-        
+
         indexedEntry = null;
     }
 
     @Override
     void removeUselessArgs(Map<String, String> namedArgs) {
     }
-    
+
     @Override
     public void addLinkToCurrentEntry(String token, final String lang, EntryTypeName entryTypeName) {
         if (lang == null || lang.equals(skipLangIso)) {
             titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName);
         }
     }
-    
+
     public static String escapeHtmlLiteral(final String plainText) {
         final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText);
         if (StringUtil.isAscii(htmlEscaped)) {
             return htmlEscaped;
-        } else { 
+        } else {
             return StringUtil.escapeUnicodeToPureHtml(plainText);
         }
 
@@ -399,7 +399,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
 
         @Override
         public void onFunction(WikiTokenizer wikiTokenizer, String name,
-                List<String> args, Map<String, String> namedArgs) {
+                               List<String> args, Map<String, String> namedArgs) {
             if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
                 namedArgs.remove("lang");
             }
@@ -414,7 +414,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
         @Override
         public void onNewline(WikiTokenizer wikiTokenizer) {
         }
-        
+
         EntryTypeName sectionEntryTypeName;
         IndexBuilder currentIndexBuilder;
 
@@ -451,7 +451,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
             final String prefix = wikiTokenizer.listItemPrefix();
             while (listPrefixStack.size() < prefix.length()) {
                 builder.append(String.format("<%s>",
-                        WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
+                                             WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
                 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
             }
             builder.append("<li>");
index 7f52642821bc4b8b2bbafac65be248a0642fa504..9dfa00a3a0932caf2b843bcccfa597017c1def55 100644 (file)
@@ -23,195 +23,195 @@ import java.util.Set;
 import java.util.regex.Pattern;
 
 public class WiktionaryLangs {
-  
-  public static final Map<String,String> isoCodeToEnWikiName = new LinkedHashMap<String,String>();
-  static {
-    isoCodeToEnWikiName.put("AF", "Afrikaans");
-    isoCodeToEnWikiName.put("SQ", "Albanian");
-    isoCodeToEnWikiName.put("AR", "Arabic");
-    isoCodeToEnWikiName.put("HY", "Armenian");
-    isoCodeToEnWikiName.put("BE", "Belarusian");
-    isoCodeToEnWikiName.put("BN", "Bengali");
-    isoCodeToEnWikiName.put("BG", "Bulgarian");
-    isoCodeToEnWikiName.put("CA", "Catalan");
-    isoCodeToEnWikiName.put("SH", "Serbo-Croatian");
-    isoCodeToEnWikiName.put("CS", "Czech");
-    isoCodeToEnWikiName.put("ZH", "Chinese");
-    isoCodeToEnWikiName.put("cmn", "Mandarin");
-    isoCodeToEnWikiName.put("yue", "Cantonese");
-    isoCodeToEnWikiName.put("DA", "Danish");
-    isoCodeToEnWikiName.put("NL", "Dutch");
-    isoCodeToEnWikiName.put("EN", "English");
-    isoCodeToEnWikiName.put("EO", "Esperanto");
-    isoCodeToEnWikiName.put("ET", "Estonian");
-    isoCodeToEnWikiName.put("FI", "Finnish");
-    isoCodeToEnWikiName.put("FR", "French");
-    isoCodeToEnWikiName.put("DE", "German");
-    isoCodeToEnWikiName.put("grc", "Ancient Greek");
-    isoCodeToEnWikiName.put("EL", "Greek");
-    isoCodeToEnWikiName.put("haw", "Hawaiian");
-    isoCodeToEnWikiName.put("HE", "Hebrew");
-    isoCodeToEnWikiName.put("HI", "Hindi");
-    isoCodeToEnWikiName.put("HU", "Hungarian");
-    isoCodeToEnWikiName.put("IS", "Icelandic");
-    isoCodeToEnWikiName.put("ID", "Indonesian");
-    isoCodeToEnWikiName.put("GA", "Irish");
-    isoCodeToEnWikiName.put("GD", "Gaelic");
-    isoCodeToEnWikiName.put("GV", "Manx");
-    isoCodeToEnWikiName.put("IT", "Italian");
-    isoCodeToEnWikiName.put("LA", "Latin");
-    isoCodeToEnWikiName.put("LV", "Latvian");
-    isoCodeToEnWikiName.put("LT", "Lithuanian");
-    isoCodeToEnWikiName.put("JA", "Japanese");
-    isoCodeToEnWikiName.put("KO", "Korean");
-    isoCodeToEnWikiName.put("KU", "Kurdish");
-    isoCodeToEnWikiName.put("LO", "Lao");
-    isoCodeToEnWikiName.put("ML", "Malayalam");
-    isoCodeToEnWikiName.put("MS", "Malay");
-    isoCodeToEnWikiName.put("MI", "Maori");
-    isoCodeToEnWikiName.put("MN", "Mongolian");
-    isoCodeToEnWikiName.put("NE", "Nepali");
-    isoCodeToEnWikiName.put("NO", "Norwegian");
-    isoCodeToEnWikiName.put("FA", "Persian");
-    isoCodeToEnWikiName.put("PL", "Polish");
-    isoCodeToEnWikiName.put("PT", "Portuguese");
-    isoCodeToEnWikiName.put("PA", "Punjabi");
-    isoCodeToEnWikiName.put("RO", "Romanian");
-    isoCodeToEnWikiName.put("RU", "Russian");
-    isoCodeToEnWikiName.put("SA", "Sanskrit");
-    isoCodeToEnWikiName.put("SK", "Slovak");
-    isoCodeToEnWikiName.put("SL", "Slovene|Slovenian");
-    isoCodeToEnWikiName.put("SO", "Somali");
-    isoCodeToEnWikiName.put("ES", "Spanish");
-    isoCodeToEnWikiName.put("SW", "Swahili");
-    isoCodeToEnWikiName.put("SV", "Swedish");
-    isoCodeToEnWikiName.put("TL", "Tagalog");
-    isoCodeToEnWikiName.put("TG", "Tajik");
-    isoCodeToEnWikiName.put("TA", "Tamil");
-    isoCodeToEnWikiName.put("TH", "Thai");
-    isoCodeToEnWikiName.put("BO", "Tibetan");
-    isoCodeToEnWikiName.put("TR", "Turkish");
-    isoCodeToEnWikiName.put("UK", "Ukrainian");
-    isoCodeToEnWikiName.put("UR", "Urdu");
-    isoCodeToEnWikiName.put("VI", "Vietnamese");
-    isoCodeToEnWikiName.put("CI", "Welsh");
-    isoCodeToEnWikiName.put("YI", "Yiddish");
-    isoCodeToEnWikiName.put("ZU", "Zulu");
-    isoCodeToEnWikiName.put("AZ", "Azeri");
-    isoCodeToEnWikiName.put("EU", "Basque");
-    isoCodeToEnWikiName.put("BR", "Breton");
-    isoCodeToEnWikiName.put("MR", "Marathi");
-    isoCodeToEnWikiName.put("FO", "Faroese");
-    isoCodeToEnWikiName.put("GL", "Galician");
-    isoCodeToEnWikiName.put("KA", "Georgian");
-    isoCodeToEnWikiName.put("HT", "Haitian Creole");
-    isoCodeToEnWikiName.put("LB", "Luxembourgish");
-    isoCodeToEnWikiName.put("MK", "Macedonian");
-    isoCodeToEnWikiName.put("GV", "Manx");
-    
-    // No longer exists in EN:
-    // isoCodeToEnWikiName.put("BS", "Bosnian");
-    // isoCodeToEnWikiName.put("SR", "Serbian");
-    // isoCodeToEnWikiName.put("HR", "Croatian");
-    
-    // Font doesn't work:
-    //isoCodeToEnWikiName.put("MY", "Burmese");
-
-
-    {
-        //Set<String> missing = new LinkedHashSet<String>(isoCodeToEnWikiName.keySet());
-        //missing.removeAll(Language.isoCodeToResources.keySet());
-        //System.out.println(missing);
+
+    public static final Map<String,String> isoCodeToEnWikiName = new LinkedHashMap<String,String>();
+    static {
+        isoCodeToEnWikiName.put("AF", "Afrikaans");
+        isoCodeToEnWikiName.put("SQ", "Albanian");
+        isoCodeToEnWikiName.put("AR", "Arabic");
+        isoCodeToEnWikiName.put("HY", "Armenian");
+        isoCodeToEnWikiName.put("BE", "Belarusian");
+        isoCodeToEnWikiName.put("BN", "Bengali");
+        isoCodeToEnWikiName.put("BG", "Bulgarian");
+        isoCodeToEnWikiName.put("CA", "Catalan");
+        isoCodeToEnWikiName.put("SH", "Serbo-Croatian");
+        isoCodeToEnWikiName.put("CS", "Czech");
+        isoCodeToEnWikiName.put("ZH", "Chinese");
+        isoCodeToEnWikiName.put("cmn", "Mandarin");
+        isoCodeToEnWikiName.put("yue", "Cantonese");
+        isoCodeToEnWikiName.put("DA", "Danish");
+        isoCodeToEnWikiName.put("NL", "Dutch");
+        isoCodeToEnWikiName.put("EN", "English");
+        isoCodeToEnWikiName.put("EO", "Esperanto");
+        isoCodeToEnWikiName.put("ET", "Estonian");
+        isoCodeToEnWikiName.put("FI", "Finnish");
+        isoCodeToEnWikiName.put("FR", "French");
+        isoCodeToEnWikiName.put("DE", "German");
+        isoCodeToEnWikiName.put("grc", "Ancient Greek");
+        isoCodeToEnWikiName.put("EL", "Greek");
+        isoCodeToEnWikiName.put("haw", "Hawaiian");
+        isoCodeToEnWikiName.put("HE", "Hebrew");
+        isoCodeToEnWikiName.put("HI", "Hindi");
+        isoCodeToEnWikiName.put("HU", "Hungarian");
+        isoCodeToEnWikiName.put("IS", "Icelandic");
+        isoCodeToEnWikiName.put("ID", "Indonesian");
+        isoCodeToEnWikiName.put("GA", "Irish");
+        isoCodeToEnWikiName.put("GD", "Gaelic");
+        isoCodeToEnWikiName.put("GV", "Manx");
+        isoCodeToEnWikiName.put("IT", "Italian");
+        isoCodeToEnWikiName.put("LA", "Latin");
+        isoCodeToEnWikiName.put("LV", "Latvian");
+        isoCodeToEnWikiName.put("LT", "Lithuanian");
+        isoCodeToEnWikiName.put("JA", "Japanese");
+        isoCodeToEnWikiName.put("KO", "Korean");
+        isoCodeToEnWikiName.put("KU", "Kurdish");
+        isoCodeToEnWikiName.put("LO", "Lao");
+        isoCodeToEnWikiName.put("ML", "Malayalam");
+        isoCodeToEnWikiName.put("MS", "Malay");
+        isoCodeToEnWikiName.put("MI", "Maori");
+        isoCodeToEnWikiName.put("MN", "Mongolian");
+        isoCodeToEnWikiName.put("NE", "Nepali");
+        isoCodeToEnWikiName.put("NO", "Norwegian");
+        isoCodeToEnWikiName.put("FA", "Persian");
+        isoCodeToEnWikiName.put("PL", "Polish");
+        isoCodeToEnWikiName.put("PT", "Portuguese");
+        isoCodeToEnWikiName.put("PA", "Punjabi");
+        isoCodeToEnWikiName.put("RO", "Romanian");
+        isoCodeToEnWikiName.put("RU", "Russian");
+        isoCodeToEnWikiName.put("SA", "Sanskrit");
+        isoCodeToEnWikiName.put("SK", "Slovak");
+        isoCodeToEnWikiName.put("SL", "Slovene|Slovenian");
+        isoCodeToEnWikiName.put("SO", "Somali");
+        isoCodeToEnWikiName.put("ES", "Spanish");
+        isoCodeToEnWikiName.put("SW", "Swahili");
+        isoCodeToEnWikiName.put("SV", "Swedish");
+        isoCodeToEnWikiName.put("TL", "Tagalog");
+        isoCodeToEnWikiName.put("TG", "Tajik");
+        isoCodeToEnWikiName.put("TA", "Tamil");
+        isoCodeToEnWikiName.put("TH", "Thai");
+        isoCodeToEnWikiName.put("BO", "Tibetan");
+        isoCodeToEnWikiName.put("TR", "Turkish");
+        isoCodeToEnWikiName.put("UK", "Ukrainian");
+        isoCodeToEnWikiName.put("UR", "Urdu");
+        isoCodeToEnWikiName.put("VI", "Vietnamese");
+        isoCodeToEnWikiName.put("CI", "Welsh");
+        isoCodeToEnWikiName.put("YI", "Yiddish");
+        isoCodeToEnWikiName.put("ZU", "Zulu");
+        isoCodeToEnWikiName.put("AZ", "Azeri");
+        isoCodeToEnWikiName.put("EU", "Basque");
+        isoCodeToEnWikiName.put("BR", "Breton");
+        isoCodeToEnWikiName.put("MR", "Marathi");
+        isoCodeToEnWikiName.put("FO", "Faroese");
+        isoCodeToEnWikiName.put("GL", "Galician");
+        isoCodeToEnWikiName.put("KA", "Georgian");
+        isoCodeToEnWikiName.put("HT", "Haitian Creole");
+        isoCodeToEnWikiName.put("LB", "Luxembourgish");
+        isoCodeToEnWikiName.put("MK", "Macedonian");
+        isoCodeToEnWikiName.put("GV", "Manx");
+
+        // No longer exists in EN:
+        // isoCodeToEnWikiName.put("BS", "Bosnian");
+        // isoCodeToEnWikiName.put("SR", "Serbian");
+        // isoCodeToEnWikiName.put("HR", "Croatian");
+
+        // Font doesn't work:
+        //isoCodeToEnWikiName.put("MY", "Burmese");
+
+
+        {
+            //Set<String> missing = new LinkedHashSet<String>(isoCodeToEnWikiName.keySet());
+            //missing.removeAll(Language.isoCodeToResources.keySet());
+            //System.out.println(missing);
+        }
+        //assert Language.isoCodeToResources.keySet().containsAll(isoCodeToEnWikiName.keySet());
+    }
+
+    public static final Map<String,Map<String,String>> wikiCodeToIsoCodeToWikiName = new LinkedHashMap<String, Map<String,String>>();
+    static {
+        // en
+        wikiCodeToIsoCodeToWikiName.put("en", isoCodeToEnWikiName);
+
+        Map<String,String> isoCodeToWikiName;
+
+        // egrep -o '\{\{Wortart[^}]+\}\}' dewiktionary-pages-articles.xml | cut -d \| -f3 | sort | uniq -c | sort -nr
+        isoCodeToWikiName = new LinkedHashMap<String, String>();
+        wikiCodeToIsoCodeToWikiName.put("de", isoCodeToWikiName);
+        isoCodeToWikiName.put("DE", "Deutsch");
+        isoCodeToWikiName.put("EN", "Englisch");
+        isoCodeToWikiName.put("IT", "Italienisch");
+        isoCodeToWikiName.put("PL", "Polnisch");
+        isoCodeToWikiName.put("FR", "Französisch");
+        isoCodeToWikiName.put("EO", "Esperanto");
+        isoCodeToWikiName.put("CA", "Katalanisch");
+        isoCodeToWikiName.put("LA", "Latein");
+        isoCodeToWikiName.put("CS", "Tschechisch");
+        isoCodeToWikiName.put("HU", "Ungarisch");
+        isoCodeToWikiName.put("SV", "Schwedisch");
+        isoCodeToWikiName.put("ES", "Spanisch");
+
+        // egrep -o '== *\{\{langue\|[a-zA-Z]+\}\} *==' frwiktionary-pages-articles.xml | sort | uniq -c | sort -nr
+        isoCodeToWikiName = new LinkedHashMap<String, String>();
+        wikiCodeToIsoCodeToWikiName.put("fr", isoCodeToWikiName);
+        isoCodeToWikiName.put("FR", Pattern.quote("{{langue|fr}}"));
+        isoCodeToWikiName.put("RU", Pattern.quote("{{langue|ru}}"));
+        isoCodeToWikiName.put("AR", Pattern.quote("{{langue|ar}}"));  // Arabic
+        isoCodeToWikiName.put("BG", Pattern.quote("{{langue|bg}}"));  // Bulgarian
+        isoCodeToWikiName.put("EN", Pattern.quote("{{langue|en}}"));
+        //isoCodeToWikiName.put("", Pattern.quote("{{langue|sl}}"));
+        isoCodeToWikiName.put("LA", Pattern.quote("{{langue|la}}"));
+        isoCodeToWikiName.put("IT", Pattern.quote("{{langue|it}}"));
+        isoCodeToWikiName.put("EO", Pattern.quote("{{langue|eo}}"));
+        isoCodeToWikiName.put("CS", Pattern.quote("{{langue|cs}}"));  // Czech
+        isoCodeToWikiName.put("NL", Pattern.quote("{{langue|nl}}"));  // Dutch
+        //isoCodeToWikiName.put("", Pattern.quote("{{langue|mg}}"));
+        //isoCodeToWikiName.put("", Pattern.quote("{{langue|hsb}}"));
+        isoCodeToWikiName.put("ZH", Pattern.quote("{{langue|zh}}"));
+        isoCodeToWikiName.put("cmn", Pattern.quote("{{langue|cmn}}"));
+        isoCodeToWikiName.put("yue", Pattern.quote("{{langue|yue}}"));
+        isoCodeToWikiName.put("JA", Pattern.quote("{{langue|ja}}"));
+        isoCodeToWikiName.put("DE", Pattern.quote("{{langue|de}}"));
+        isoCodeToWikiName.put("IS", Pattern.quote("{{langue|is}}"));  // Icelandic
+        isoCodeToWikiName.put("ES", Pattern.quote("{{langue|es}}"));
+        isoCodeToWikiName.put("UK", Pattern.quote("{{langue|uk}}"));
+
+        // egrep -o '= *\{\{-[a-z]+-\}\} *=' itwiktionary-pages-articles.xml | sort | uniq -c | sort -n
+        isoCodeToWikiName = new LinkedHashMap<String, String>();
+        wikiCodeToIsoCodeToWikiName.put("it", isoCodeToWikiName);
+        isoCodeToWikiName.put("IT", "\\{\\{-(it|scn|nap|cal|lmo)-\\}\\}");  // scn, nap, cal, lmo
+        isoCodeToWikiName.put("EN", Pattern.quote("{{-en-}}"));
+        isoCodeToWikiName.put("FR", Pattern.quote("{{-fr-}}"));
+        isoCodeToWikiName.put("DE", Pattern.quote("{{-de-}}"));
+        isoCodeToWikiName.put("ES", Pattern.quote("{{-es-}}"));
+        isoCodeToWikiName.put("JA", Pattern.quote("{{-ja-}}"));
+        isoCodeToWikiName.put("PL", Pattern.quote("{{-pl-}}"));
+        isoCodeToWikiName.put("NL", Pattern.quote("{{-nl-}}"));
+        isoCodeToWikiName.put("LV", Pattern.quote("{{-lv-}}"));
+        isoCodeToWikiName.put("LA", Pattern.quote("{{-la-}}"));
+        isoCodeToWikiName.put("HU", Pattern.quote("{{-hu-}}"));
+        isoCodeToWikiName.put("EL", Pattern.quote("{{-grc-}}"));
+        isoCodeToWikiName.put("SV", Pattern.quote("{{-sv-}}"));
+        isoCodeToWikiName.put("RU", Pattern.quote("{{-ru-}}"));
+
+        // There seems to be no consistent pattern and few foreign language entries anyway
+        isoCodeToWikiName = new LinkedHashMap<String, String>();
+        wikiCodeToIsoCodeToWikiName.put("es", isoCodeToWikiName);
+        isoCodeToWikiName.put("ES", Pattern.quote("{{ES"));
+    }
+    public static String getEnglishName(String langCode) {
+        String name = isoCodeToEnWikiName.get(langCode);
+        if (name == null) {
+            name = isoCodeToEnWikiName.get(langCode.toUpperCase());
+        }
+        if (name == null) {
+            return null;
+        }
+        if (name.indexOf('|') != -1) {
+            return name.substring(0, name.indexOf('|'));
+        }
+        if (name.indexOf('$') != -1) {
+            return name.substring(0, name.indexOf('$'));
+        }
+        return name;  // can be null.
     }
-    //assert Language.isoCodeToResources.keySet().containsAll(isoCodeToEnWikiName.keySet());
-  }
-
-  public static final Map<String,Map<String,String>> wikiCodeToIsoCodeToWikiName = new LinkedHashMap<String, Map<String,String>>();
-  static {
-    // en
-    wikiCodeToIsoCodeToWikiName.put("en", isoCodeToEnWikiName);
-    
-    Map<String,String> isoCodeToWikiName;
-    
-    // egrep -o '\{\{Wortart[^}]+\}\}' dewiktionary-pages-articles.xml | cut -d \| -f3 | sort | uniq -c | sort -nr
-    isoCodeToWikiName = new LinkedHashMap<String, String>();
-    wikiCodeToIsoCodeToWikiName.put("de", isoCodeToWikiName);
-    isoCodeToWikiName.put("DE", "Deutsch");
-    isoCodeToWikiName.put("EN", "Englisch");
-    isoCodeToWikiName.put("IT", "Italienisch");
-    isoCodeToWikiName.put("PL", "Polnisch");
-    isoCodeToWikiName.put("FR", "Französisch");
-    isoCodeToWikiName.put("EO", "Esperanto");
-    isoCodeToWikiName.put("CA", "Katalanisch");
-    isoCodeToWikiName.put("LA", "Latein");
-    isoCodeToWikiName.put("CS", "Tschechisch");
-    isoCodeToWikiName.put("HU", "Ungarisch");
-    isoCodeToWikiName.put("SV", "Schwedisch");
-    isoCodeToWikiName.put("ES", "Spanisch");
-
-    // egrep -o '== *\{\{langue\|[a-zA-Z]+\}\} *==' frwiktionary-pages-articles.xml | sort | uniq -c | sort -nr
-    isoCodeToWikiName = new LinkedHashMap<String, String>();
-    wikiCodeToIsoCodeToWikiName.put("fr", isoCodeToWikiName);
-    isoCodeToWikiName.put("FR", Pattern.quote("{{langue|fr}}"));
-    isoCodeToWikiName.put("RU", Pattern.quote("{{langue|ru}}"));
-    isoCodeToWikiName.put("AR", Pattern.quote("{{langue|ar}}"));  // Arabic
-    isoCodeToWikiName.put("BG", Pattern.quote("{{langue|bg}}"));  // Bulgarian
-    isoCodeToWikiName.put("EN", Pattern.quote("{{langue|en}}"));
-    //isoCodeToWikiName.put("", Pattern.quote("{{langue|sl}}"));
-    isoCodeToWikiName.put("LA", Pattern.quote("{{langue|la}}"));
-    isoCodeToWikiName.put("IT", Pattern.quote("{{langue|it}}"));
-    isoCodeToWikiName.put("EO", Pattern.quote("{{langue|eo}}"));
-    isoCodeToWikiName.put("CS", Pattern.quote("{{langue|cs}}"));  // Czech
-    isoCodeToWikiName.put("NL", Pattern.quote("{{langue|nl}}"));  // Dutch
-    //isoCodeToWikiName.put("", Pattern.quote("{{langue|mg}}"));
-    //isoCodeToWikiName.put("", Pattern.quote("{{langue|hsb}}"));
-    isoCodeToWikiName.put("ZH", Pattern.quote("{{langue|zh}}"));
-    isoCodeToWikiName.put("cmn", Pattern.quote("{{langue|cmn}}"));
-    isoCodeToWikiName.put("yue", Pattern.quote("{{langue|yue}}"));
-    isoCodeToWikiName.put("JA", Pattern.quote("{{langue|ja}}"));
-    isoCodeToWikiName.put("DE", Pattern.quote("{{langue|de}}"));
-    isoCodeToWikiName.put("IS", Pattern.quote("{{langue|is}}"));  // Icelandic
-    isoCodeToWikiName.put("ES", Pattern.quote("{{langue|es}}"));
-    isoCodeToWikiName.put("UK", Pattern.quote("{{langue|uk}}"));
-
-    // egrep -o '= *\{\{-[a-z]+-\}\} *=' itwiktionary-pages-articles.xml | sort | uniq -c | sort -n
-    isoCodeToWikiName = new LinkedHashMap<String, String>();
-    wikiCodeToIsoCodeToWikiName.put("it", isoCodeToWikiName);
-    isoCodeToWikiName.put("IT", "\\{\\{-(it|scn|nap|cal|lmo)-\\}\\}");  // scn, nap, cal, lmo
-    isoCodeToWikiName.put("EN", Pattern.quote("{{-en-}}"));
-    isoCodeToWikiName.put("FR", Pattern.quote("{{-fr-}}"));
-    isoCodeToWikiName.put("DE", Pattern.quote("{{-de-}}"));
-    isoCodeToWikiName.put("ES", Pattern.quote("{{-es-}}"));
-    isoCodeToWikiName.put("JA", Pattern.quote("{{-ja-}}"));
-    isoCodeToWikiName.put("PL", Pattern.quote("{{-pl-}}"));
-    isoCodeToWikiName.put("NL", Pattern.quote("{{-nl-}}"));
-    isoCodeToWikiName.put("LV", Pattern.quote("{{-lv-}}"));
-    isoCodeToWikiName.put("LA", Pattern.quote("{{-la-}}"));
-    isoCodeToWikiName.put("HU", Pattern.quote("{{-hu-}}"));
-    isoCodeToWikiName.put("EL", Pattern.quote("{{-grc-}}"));
-    isoCodeToWikiName.put("SV", Pattern.quote("{{-sv-}}"));
-    isoCodeToWikiName.put("RU", Pattern.quote("{{-ru-}}"));
-
-    // There seems to be no consistent pattern and few foreign language entries anyway
-    isoCodeToWikiName = new LinkedHashMap<String, String>();
-    wikiCodeToIsoCodeToWikiName.put("es", isoCodeToWikiName);
-    isoCodeToWikiName.put("ES", Pattern.quote("{{ES"));
-  }
-  public static String getEnglishName(String langCode) {
-      String name = isoCodeToEnWikiName.get(langCode);
-      if (name == null) {
-          name = isoCodeToEnWikiName.get(langCode.toUpperCase());
-      }
-      if (name == null) {
-          return null;
-      }
-      if (name.indexOf('|') != -1) {
-          return name.substring(0, name.indexOf('|'));
-      }
-      if (name.indexOf('$') != -1) {
-          return name.substring(0, name.indexOf('$'));
-      }
-      return name;  // can be null.
-  }
-  
+
 }