X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2Fwiktionary%2FWholeSectionToHtmlParser.java;h=ba915aab86ab49071b8e1a53279fe79407ff4817;hb=2fc669d88306d563fc9c899d8d91b25d591692ea;hp=0066d3bc7aeed2dabbb85dccad7d89df848be65d;hpb=e479ba38bbcb261951399326623c20ffacc147d4;p=DictionaryPC.git

Summary of this patch (WholeSectionToHtmlParser.java):
  * regroups imports and switches StringEscapeUtils from commons-lang3 to commons-text;
  * uses the diamond operator for isoToLangConfig and listPrefixStack;
  * collapses the "if (x) return true; return false;" bodies of skipWikiLink into a single return;
  * replaces two no-op "return null" branches in the EN sectionNameToEntryType with comments;
  * adds a "PT" (Portuguese) LangConfig;
  * drops a redundant String.format around a constant string in onWikiLink.

Review note: in the added "PT" LangConfig, skipSection() matches against
ptSkipSections (the Portuguese pattern declared directly above it) rather than
the Spanish esSkipSections pattern, which left ptSkipSections unused and never
skipped "Tradução" sections. Only that token differs; hunk line counts are unchanged.

Extraction note: line breaks and indentation were reconstructed from the hunk
headers; generic type arguments and HTML string literals appear to have been
stripped from this extract (e.g. "Map> functionCallbacks", String.format("", ...))
and are left exactly as found -- TODO confirm against the repository.

diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java
index 0066d3b..ba915aa 100644
--- a/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java
+++ b/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java
@@ -1,6 +1,15 @@
 
 package com.hughes.android.dictionary.parser.wiktionary;
 
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import org.apache.commons.text.StringEscapeUtils;
+
 import com.hughes.android.dictionary.engine.EntryTypeName;
 import com.hughes.android.dictionary.engine.HtmlEntry;
 import com.hughes.android.dictionary.engine.IndexBuilder;
@@ -9,15 +18,6 @@ import com.hughes.android.dictionary.engine.IndexedEntry;
 import com.hughes.android.dictionary.parser.WikiTokenizer;
 import com.hughes.util.StringUtil;
 
-import org.apache.commons.lang3.StringEscapeUtils;
-
-import java.net.URI;
-import java.util.ArrayList;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.regex.Pattern;
-
 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
 
     public static final String NAME = "WholeSectionToHtmlParser";
@@ -30,7 +30,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
         void addFunctionCallbacks(
             Map> functionCallbacks);
     }
-    static final Map isoToLangConfig = new LinkedHashMap();
+    static final Map isoToLangConfig = new LinkedHashMap<>();
     static {
         final Pattern enSkipSections = Pattern.compile(".*(Translations|Anagrams|References).*");
         isoToLangConfig.put("EN", new LangConfig() {
@@ -47,23 +47,18 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
                 if (sectionName.equalsIgnoreCase("Antonyms")) {
                     return EntryTypeName.ANTONYM_MULTI;
                 }
-                if (EnParser.partOfSpeechHeader.matcher(sectionName).matches()) {
-                    // We need to put it in the other index, too (probably)
-                    return null;
-                }
-                if (sectionName.equalsIgnoreCase("Derived Terms")) {
-                    return null;
-                }
+                // We need to put it in the other index, too (probably) ?
+                // EnParser.partOfSpeechHeader.matcher(sectionName).matches()
+
+                // Needs special handling?
+                // sectionName.equalsIgnoreCase("Derived Terms")
                 return null;
             }
 
             @Override
             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
                 final String wikiText = wikiTokenizer.wikiLinkText();
-                if (wikiText.startsWith("Category:")) {
-                    return true;
-                }
-                return false;
+                return wikiText.startsWith("Category:");
             }
             @Override
             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
@@ -108,10 +103,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
             @Override
             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
                 final String wikiText = wikiTokenizer.wikiLinkText();
-                if (wikiText.startsWith("Categoría:")) {
-                    return true;
-                }
-                return false;
+                return wikiText.startsWith("Categoría:");
             }
             @Override
             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
@@ -135,6 +127,51 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
             }
         });
 
+        final Pattern ptSkipSections = Pattern.compile(".*Tradução.*");
+        isoToLangConfig.put("PT", new LangConfig() {
+            @Override
+            public boolean skipSection(String headingText) {
+                return ptSkipSections.matcher(headingText).matches();
+            }
+
+            @Override
+            public EntryTypeName sectionNameToEntryType(String sectionName) {
+                if (sectionName.equalsIgnoreCase("Sinônimo") || sectionName.equalsIgnoreCase("Sinônimos")) {
+                    return EntryTypeName.SYNONYM_MULTI;
+                }
+                if (sectionName.equalsIgnoreCase("Antônimo") || sectionName.equalsIgnoreCase("Antônimos")) {
+                    return EntryTypeName.ANTONYM_MULTI;
+                }
+                return null;
+            }
+
+            @Override
+            public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
+                final String wikiText = wikiTokenizer.wikiLinkText();
+                return wikiText.startsWith("Categoria:");
+            }
+            @Override
+            public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
+                if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
+                    return null;
+                }
+                final int hashPos = wikiLinkDest.indexOf("#");
+                if (hashPos != -1) {
+                    wikiLinkDest = wikiLinkDest.substring(0, hashPos);
+                    if (wikiLinkDest.isEmpty()) {
+                        wikiLinkDest = wikiLinkText;
+                    }
+                }
+                return wikiLinkDest;
+            }
+
+            @Override
+            public void addFunctionCallbacks(
+                Map> functionCallbacks) {
+                // TODO: need Portuguese variant
+            }
+        });
+
         final Pattern deSkipSections = Pattern.compile(".*(Übersetzungen|Referenzen|Quellen).*");
         isoToLangConfig.put("DE", new LangConfig() {
             @Override
@@ -156,10 +193,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
             @Override
             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
                 final String wikiText = wikiTokenizer.wikiLinkText();
-                if (wikiText.startsWith("Kategorie:")) {
-                    return true;
-                }
-                return false;
+                return wikiText.startsWith("Kategorie:");
             }
             @Override
             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
@@ -204,10 +238,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
             @Override
             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
                 final String wikiText = wikiTokenizer.wikiLinkText();
-                if (wikiText.startsWith("Categoria:")) {
-                    return true;
-                }
-                return false;
+                return wikiText.startsWith("Categoria:");
             }
             @Override
             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
@@ -253,10 +284,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
             @Override
             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
                 final String wikiText = wikiTokenizer.wikiLinkText();
-                if (wikiText.startsWith("Catégorie:")) {
-                    return true;
-                }
-                return false;
+                return wikiText.startsWith("Catégorie:");
             }
             @Override
             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
@@ -391,7 +419,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
         if (!StringUtil.isNullOrEmpty(linkDest)) {
             builder.append(String.format("", HtmlEntry.formatQuickdicUrl("", linkDest)));
             super.onWikiLink(wikiTokenizer);
-            builder.append(String.format(""));
+            builder.append("");
         } else {
             super.onWikiLink(wikiTokenizer);
         }
@@ -441,7 +469,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
         builder.append(String.format("\n", depth));
     }
 
-    final List listPrefixStack = new ArrayList();
+    final List listPrefixStack = new ArrayList<>();
 
     @Override
     public void onListItem(WikiTokenizer wikiTokenizer) {