From 511f5633b8f21a7929b0658e06245bb5092a313d Mon Sep 17 00:00:00 2001 From: Thad Hughes Date: Sat, 29 Dec 2012 22:35:44 -0800 Subject: [PATCH] Update URL format and parsing, fix FR handling. --- .../engine/DictionaryBuilderMain.java | 8 +- .../engine/DictionaryBuilderTest.java | 4 + .../dictionary/engine/DictionaryTest.java | 28 +- .../dictionary/engine/WiktionarySplitter.java | 2 +- .../dictionary/parser/GeneralTest.java | 2 +- .../wiktionary/FrFunctionCallbacks.java | 73 + .../wiktionary/WholeSectionToHtmlParser.java | 39 +- .../parser/wiktionary/WiktionaryLangs.java | 38 +- testdata/goldens/SingleLang_DE.quickdic.text | 3538 +++++------ testdata/goldens/SingleLang_EN.quickdic.text | 5474 ++++++++--------- testdata/goldens/SingleLang_IT.quickdic.text | 1538 ++--- testdata/goldens/testItConj.html | 2 +- .../wiktionary.WholeSection.DE.quickdic.text | 614 +- .../wiktionary.WholeSection.EN.quickdic.text | 5474 ++++++++--------- .../wiktionary.WholeSection.IT.quickdic.text | 4608 +++++++------- testdata/outputs/testItConj.html | 2 +- 16 files changed, 10779 insertions(+), 10665 deletions(-) create mode 100644 src/com/hughes/android/dictionary/parser/wiktionary/FrFunctionCallbacks.java diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java index cfa8066..9f41c83 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java @@ -47,8 +47,6 @@ public class DictionaryBuilderMain extends TestCase { {"DE", "EN" }, {"DE", "IT" }, - /* - {"AR", "DE" }, {"AR", "ES" }, {"AR", "FR" }, @@ -136,8 +134,6 @@ public class DictionaryBuilderMain extends TestCase { {"FA", "SV" }, // Persian, Swedish, by request. {"NL", "PL" }, // Dutch, Polish, by request. - */ - }; @@ -319,7 +315,9 @@ public class DictionaryBuilderMain extends TestCase { allPairs.addAll(Arrays.asList(nonEnPairs)); // Add all the EN-XX pairs. for (final String isoCode : WiktionaryLangs.isoCodeToEnWikiName.keySet()) { - allPairs.add(new String[] {"EN", isoCode}); + if (!isoCode.equals("EN")) { + allPairs.add(new String[] {"EN", isoCode}); + } } diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java index 4454015..ba75064 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java @@ -189,6 +189,10 @@ public class DictionaryBuilderTest extends TestCase { wiktionaryTestSingleLang("SingleLang_IT.quickdic", "IT", 100); } + public void testSingleLang_FR() throws Exception { + wiktionaryTestSingleLang("SingleLang_FR.quickdic", "FR", 100); + } + public void wiktionaryTestSingleLang(final String name, final String langCode, final int pageLimit) throws Exception { final File result = new File(TEST_OUTPUTS + name); System.out.println("Writing to: " + result); diff --git a/src/com/hughes/android/dictionary/engine/DictionaryTest.java b/src/com/hughes/android/dictionary/engine/DictionaryTest.java index d93ecd4..f01c846 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryTest.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryTest.java @@ -42,6 +42,9 @@ public class DictionaryTest extends TestCase { } } } + + public void testURLFormatting() { + } public void testEnItWiktionary() throws IOException { final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "EN-IT.quickdic", "r"); @@ -61,9 +64,32 @@ public class DictionaryTest extends TestCase { assertTrue(!((TokenRow)rows.get(0)).getIndexEntry().htmlEntries.isEmpty()); } + { + final IndexEntry searchResult = itIndex.findInsertionPoint("azzurro", new AtomicBoolean( + false)); + HtmlEntry htmlEntry = searchResult.htmlEntries.get(0); + System.out.println("azzurro:\n" + htmlEntry.getHtml()); + } + raf.close(); } + public void testDeEnWiktionary() throws IOException { + final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "DE-EN.quickdic", "r"); + final Dictionary dict = new Dictionary(raf); + + final Index deIndex = dict.indices.get(0); + + { + final IndexEntry searchResult = deIndex.findInsertionPoint("rot", new AtomicBoolean( + false)); + HtmlEntry htmlEntry = searchResult.htmlEntries.get(0); + System.out.println("rot:\n" + htmlEntry.getHtml()); + } + + raf.close(); + } + public void testGermanMetadata() throws IOException { final RandomAccessFile raf = new RandomAccessFile(TEST_OUTPUTS + "de-en.quickdic", "r"); final Dictionary dict = new Dictionary(raf); @@ -170,7 +196,7 @@ public class DictionaryTest extends TestCase { assertSearchResult("Höschen", "Hos", deIndex.findInsertionPoint("Hos", new AtomicBoolean(false))); assertSearchResult("Höschen", "hos", deIndex.findInsertionPoint("hos", new AtomicBoolean(false))); - + raf.close(); } diff --git a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java index 361473e..408ecd9 100644 --- a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java +++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java @@ -56,7 +56,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { private WiktionarySplitter() { List selectors; for (final String code : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.keySet()) { - //if (code.equals("en") || code.equals("de") || code.equals("fr")) {continue;} + //if (!code.equals("fr")) {continue;} selectors = new ArrayList(); pathToSelectors.put(String.format("data/inputs/%swiktionary-pages-articles.xml", code), selectors); for (final Map.Entry entry : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.get(code).entrySet()) { diff --git a/src/com/hughes/android/dictionary/parser/GeneralTest.java b/src/com/hughes/android/dictionary/parser/GeneralTest.java index f52ad6f..d566e9e 100644 --- a/src/com/hughes/android/dictionary/parser/GeneralTest.java +++ b/src/com/hughes/android/dictionary/parser/GeneralTest.java @@ -14,7 +14,7 @@ public class GeneralTest { // This isn't actually valid html: assertEquals("IPA|/dɛɪ̯/|lang=nds", StringEscapeUtils.escapeHtml3("IPA|/dɛɪ̯/|lang=nds")); // Hopefully this is: - assertEquals("IPA|/dɛɪ̯/|lang=nds", StringUtil.escapeToPureHtmlUnicode("IPA|/dɛɪ̯/|lang=nds")); + assertEquals("IPA|/dɛɪ̯/|lang=nds", StringUtil.escapeUnicodeToPureHtml("IPA|/dɛɪ̯/|lang=nds")); } } diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/FrFunctionCallbacks.java b/src/com/hughes/android/dictionary/parser/wiktionary/FrFunctionCallbacks.java new file mode 100644 index 0000000..7727ad0 --- /dev/null +++ b/src/com/hughes/android/dictionary/parser/wiktionary/FrFunctionCallbacks.java @@ -0,0 +1,73 @@ +// Copyright 2012 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package com.hughes.android.dictionary.parser.wiktionary; + +import com.hughes.android.dictionary.parser.WikiTokenizer; +import com.hughes.android.dictionary.parser.wiktionary.AbstractWiktionaryParser.AppendAndIndexWikiCallback; +import com.hughes.android.dictionary.parser.wiktionary.AbstractWiktionaryParser.NameAndArgs; +import com.hughes.android.dictionary.parser.wiktionary.ItFunctionCallbacks.Redispatch; + +import java.util.List; +import java.util.Map; + +class FrFunctionCallbacks { + + static void addGenericCallbacks(Map> callbacks) { + callbacks.put("-étym-", new Redispatch("\n==== Étymologie ====\n")); + callbacks.put("-pron-", new Redispatch("\n==== Prononciation ====\n")); + callbacks.put("-voir-", new Redispatch("\n==== Voir aussi ====\n")); + callbacks.put("-drv-", new Redispatch("\n==== Dérivés ====\n")); + callbacks.put("-syn-", new Redispatch("\n==== Synonymes ====\n")); + + callbacks.put("-apr-", new Redispatch("\n==== Apparentés étymologiques ====\n")); + callbacks.put("-hyper-", new Redispatch("\n==== Hyperonymes ====\n")); + callbacks.put("-hypo-", new Redispatch("\n==== Hyponymes ====\n")); + callbacks.put("-réf-", new Redispatch("\n==== Références ====\n")); + callbacks.put("-homo-", new Redispatch("\n==== Homophones ====\n")); + callbacks.put("-anagr-", new Redispatch("\n==== Anagrammes ====\n")); + callbacks.put("-voc-", new Redispatch("\n==== Vocabulaire apparenté par le sens ====\n")); + callbacks.put("-exp-", new Redispatch("\n==== Expressions ====\n")); + callbacks.put("-note-", new Redispatch("\n==== Note ====\n")); + + callbacks.put("-trad-", new ItFunctionCallbacks.SkipSection()); + } + + + static final NameAndArgs NAME_AND_ARGS = new NameAndArgs(); + + + static final class MakeHeadingFromName implements FunctionCallback { + final String header; + public MakeHeadingFromName(String header) { + this.header = header; + } + + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + if (!namedArgs.isEmpty() || args.size() != 0) { + return false; + } + //appendAndIndexWikiCallback.builder.append(String.format("<%s>", header)); + appendAndIndexWikiCallback.dispatch("\n" + header + name + header, null); + //appendAndIndexWikiCallback.builder.append(String.format("\n", header)); + return true; + } + } + + +} \ No newline at end of file diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java index b3249d1..21a83e4 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java @@ -184,35 +184,46 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { }); - - final LangConfig basicLangConfig = new LangConfig() { + final Pattern frSkipSections = Pattern.compile(".*(Traductions).*"); + isoToLangConfig.put("FR", new LangConfig() { @Override public boolean skipSection(String headingText) { - return false; + return frSkipSections.matcher(headingText).matches(); } + @Override public EntryTypeName sectionNameToEntryType(String sectionName) { - return EntryTypeName.WIKTIONARY_MENTIONED; + if (sectionName.equalsIgnoreCase("Synonymes")) { + return EntryTypeName.SYNONYM_MULTI; + } + return null; } + @Override public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { - final String wikiText = wikiTokenizer.wikiLinkText(); - if (wikiText.startsWith("Category:")) { - return true; - } return false; } @Override - public String adjustWikiLink(String wikiLinkDest, final String wikiLinkText) { + public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) { + if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) { + return null; + } + final int hashPos = wikiLinkDest.indexOf("#"); + if (hashPos != -1) { + wikiLinkDest = wikiLinkDest.substring(0, hashPos); + if (wikiLinkDest.isEmpty()) { + wikiLinkDest = wikiLinkText; + } + } return wikiLinkDest; } @Override public void addFunctionCallbacks( Map> functionCallbacks) { + FrFunctionCallbacks.addGenericCallbacks(functionCallbacks); } - }; - isoToLangConfig.put("FR", basicLangConfig); + }); } final IndexBuilder titleIndexBuilder; @@ -282,7 +293,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { if (StringUtil.isAscii(htmlEscaped)) { return htmlEscaped; } else { - return StringUtil.escapeToPureHtmlUnicode(plainText); + return StringUtil.escapeUnicodeToPureHtml(plainText); } } @@ -354,10 +365,14 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { sectionEntryTypeName = langConfig.sectionNameToEntryType(headingText); final int depth = wikiTokenizer.headingDepth(); if (langConfig.skipSection(headingText)) { + System.out.println("Skipping section:" + headingText); while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) { if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) { + System.out.println("Resume on: " + wikiTokenizer.token()); wikiTokenizer.returnToLineStart(); return; + } else { + System.out.println("Skipped: " + wikiTokenizer.token()); } } return; diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java b/src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java index 4acdef7..3de77dd 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java @@ -14,9 +14,7 @@ package com.hughes.android.dictionary.parser.wiktionary; -import com.hughes.android.dictionary.R; import com.hughes.android.dictionary.engine.Language; -import com.hughes.android.dictionary.engine.Language.LanguageResources; import java.util.LinkedHashMap; import java.util.Map; @@ -140,24 +138,24 @@ public class WiktionaryLangs { // egrep -o '\{\{=[a-zA-Z]+=\}\}' frwiktionary-pages-articles.xml | sort | uniq -c | sort -nr isoCodeToWikiName = new LinkedHashMap(); wikiCodeToIsoCodeToWikiName.put("fr", isoCodeToWikiName); - isoCodeToWikiName.put("FR", Pattern.quote("{{=fr=}}")); - isoCodeToWikiName.put("RU", Pattern.quote("{{=ru=}}")); - isoCodeToWikiName.put("BG", Pattern.quote("{{=bg=}}")); // Bulgarian - isoCodeToWikiName.put("EN", Pattern.quote("{{=en=}}")); - //isoCodeToWikiName.put("", Pattern.quote("{{=sl=}}")); - isoCodeToWikiName.put("LA", Pattern.quote("{{=la=}}")); - isoCodeToWikiName.put("IT", Pattern.quote("{{=it=}}")); - isoCodeToWikiName.put("EO", Pattern.quote("{{=eo=}}")); - isoCodeToWikiName.put("CS", Pattern.quote("{{=cs=}}")); // Czech - isoCodeToWikiName.put("NL", Pattern.quote("{{=nl=}}")); // Dutch - //isoCodeToWikiName.put("", Pattern.quote("{{=mg=}}")); - //isoCodeToWikiName.put("", Pattern.quote("{{=hsb=}}")); - isoCodeToWikiName.put("ZH", Pattern.quote("{{=zh=}}")); - isoCodeToWikiName.put("JA", Pattern.quote("{{=ja=}}")); - isoCodeToWikiName.put("DE", Pattern.quote("{{=de=}}")); - isoCodeToWikiName.put("IS", Pattern.quote("{{=is=}}")); // Icelandic - isoCodeToWikiName.put("ES", Pattern.quote("{{=es=}}")); - isoCodeToWikiName.put("UK", Pattern.quote("{{=uk=}}")); + isoCodeToWikiName.put("FR", Pattern.quote("{{langue|fr}}")); + isoCodeToWikiName.put("RU", Pattern.quote("{{langue|ru}}")); + isoCodeToWikiName.put("BG", Pattern.quote("{{langue|bg}}")); // Bulgarian + isoCodeToWikiName.put("EN", Pattern.quote("{{langue|en}}")); + //isoCodeToWikiName.put("", Pattern.quote("{{langue|sl}}")); + isoCodeToWikiName.put("LA", Pattern.quote("{{langue|la}}")); + isoCodeToWikiName.put("IT", Pattern.quote("{{langue|it}}")); + isoCodeToWikiName.put("EO", Pattern.quote("{{langue|eo}}")); + isoCodeToWikiName.put("CS", Pattern.quote("{{langue|cs}}")); // Czech + isoCodeToWikiName.put("NL", Pattern.quote("{{langue|nl}}")); // Dutch + //isoCodeToWikiName.put("", Pattern.quote("{{langue|mg}}")); + //isoCodeToWikiName.put("", Pattern.quote("{{langue|hsb}}")); + isoCodeToWikiName.put("ZH", Pattern.quote("{{langue|zh}}")); + isoCodeToWikiName.put("JA", Pattern.quote("{{langue|ja}}")); + isoCodeToWikiName.put("DE", Pattern.quote("{{langue|de}}")); + isoCodeToWikiName.put("IS", Pattern.quote("{{langue|is}}")); // Icelandic + isoCodeToWikiName.put("ES", Pattern.quote("{{langue|es}}")); + isoCodeToWikiName.put("UK", Pattern.quote("{{langue|uk}}")); // egrep -o '= *\{\{-[a-z]+-\}\} *=' itwiktionary-pages-articles.xml | sort | uniq -c | sort -n isoCodeToWikiName = new LinkedHashMap(); diff --git a/testdata/goldens/SingleLang_DE.quickdic.text b/testdata/goldens/SingleLang_DE.quickdic.text index 6ddeb4b..104aa95 100644 --- a/testdata/goldens/SingleLang_DE.quickdic.text +++ b/testdata/goldens/SingleLang_DE.quickdic.text @@ -13,40 +13,40 @@ HtmlEntry: Aal <<<

Aussprache

-