]> gitweb.fperrin.net Git - DictionaryPC.git/blobdiff - src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java
Minor automated code simplifications.
[DictionaryPC.git] / src / com / hughes / android / dictionary / parser / wiktionary / WholeSectionToHtmlParser.java
index 57f4d587010c6d66d722f26acd8afb640daf293c..ba915aab86ab49071b8e1a53279fe79407ff4817 100644 (file)
@@ -1,6 +1,16 @@
 
 package com.hughes.android.dictionary.parser.wiktionary;
 
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import org.apache.commons.text.StringEscapeUtils;
+
+import com.hughes.android.dictionary.engine.EntryTypeName;
 import com.hughes.android.dictionary.engine.HtmlEntry;
 import com.hughes.android.dictionary.engine.IndexBuilder;
 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
@@ -8,98 +18,374 @@ import com.hughes.android.dictionary.engine.IndexedEntry;
 import com.hughes.android.dictionary.parser.WikiTokenizer;
 import com.hughes.util.StringUtil;
 
-import org.apache.commons.lang3.StringEscapeUtils;
-import org.apache.commons.lang3.StringUtils;
-
-import java.util.ArrayList;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.regex.Pattern;
-
 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
 
     public static final String NAME = "WholeSectionToHtmlParser";
 
     interface LangConfig {
         boolean skipSection(final String name);
+        EntryTypeName sectionNameToEntryType(String sectionName); // null: heading maps to no special entry type
         boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
-        String adjustWikiLink(String wikiLinkDest);
+        String adjustWikiLink(String wikiLinkDest, final String wikiLinkText); // null/empty result: link is emitted without an <a> wrapper
         void addFunctionCallbacks(
-                Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
+            Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
     }
-    static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<String,LangConfig>();
+    static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<>();
     static {
-        final Pattern enSkipSections = Pattern.compile(".*Translations|Anagrams|References.*");
+        final Pattern enSkipSections = Pattern.compile(".*(Translations|Anagrams|References).*"); // grouped so both .* apply to every alternative
         isoToLangConfig.put("EN", new LangConfig() {
             @Override
             public boolean skipSection(String headingText) {
                 return enSkipSections.matcher(headingText).matches();
             }
 
+            @Override
+            public EntryTypeName sectionNameToEntryType(String sectionName) {
+                if (sectionName.equalsIgnoreCase("Synonyms")) {
+                    return EntryTypeName.SYNONYM_MULTI;
+                }
+                if (sectionName.equalsIgnoreCase("Antonyms")) {
+                    return EntryTypeName.ANTONYM_MULTI;
+                }
+                // We need to put it in the other index, too (probably) ?
+                // EnParser.partOfSpeechHeader.matcher(sectionName).matches()
+
+                // Needs special handling?
+                // sectionName.equalsIgnoreCase("Derived Terms")
+                return null; // any other heading: no special entry type
+            }
+
             @Override
             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
                 final String wikiText = wikiTokenizer.wikiLinkText();
-                if (wikiText.startsWith("Category:")) {
-                    return true;
-                }
-                return false;
+                return wikiText.startsWith("Category:"); // true for links whose text starts with the category prefix
             }
             @Override
-            public String adjustWikiLink(String wikiLinkDest) {
+            public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
                     return null;
                 }
+                final int hashPos = wikiLinkDest.indexOf("#"); // position of a "#fragment" suffix, if any
+                if (hashPos != -1) {
+                    wikiLinkDest = wikiLinkDest.substring(0, hashPos); // keep only the part before '#'
+                    if (wikiLinkDest.isEmpty()) {
+                        wikiLinkDest = wikiLinkText; // destination was only "#fragment": fall back to the link text
+                    }
+                }
                 return wikiLinkDest;
             }
 
             @Override
             public void addFunctionCallbacks(
-                    Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
+                Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
                 EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
-            }});
+            }
+        });
+
+        final Pattern esSkipSections = Pattern.compile(".*(Traducciones|Locuciones).*"); // headings fully matching this are skipped
+        isoToLangConfig.put("ES", new LangConfig() {
+            @Override
+            public boolean skipSection(String headingText) {
+                return esSkipSections.matcher(headingText).matches();
+            }
+
+            @Override
+            public EntryTypeName sectionNameToEntryType(String sectionName) {
+                if (sectionName.equalsIgnoreCase("sinónimo") || sectionName.equalsIgnoreCase("sinónimos")) {
+                    return EntryTypeName.SYNONYM_MULTI;
+                }
+                if (sectionName.equalsIgnoreCase("antónimo") || sectionName.equalsIgnoreCase("antónimos")) {
+                    return EntryTypeName.ANTONYM_MULTI;
+                }
+                return null; // any other heading: no special entry type
+            }
+
+            @Override
+            public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
+                final String wikiText = wikiTokenizer.wikiLinkText();
+                return wikiText.startsWith("Categoría:"); // true for links whose text starts with the category prefix
+            }
+            @Override
+            public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
+                if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
+                    return null; // "w:" and "Image:" destinations yield no link target
+                }
+                final int hashPos = wikiLinkDest.indexOf("#"); // position of a "#fragment" suffix, if any
+                if (hashPos != -1) {
+                    wikiLinkDest = wikiLinkDest.substring(0, hashPos); // keep only the part before '#'
+                    if (wikiLinkDest.isEmpty()) {
+                        wikiLinkDest = wikiLinkText; // destination was only "#fragment": fall back to the link text
+                    }
+                }
+                return wikiLinkDest;
+            }
+
+            @Override
+            public void addFunctionCallbacks(
+                Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
+                // TODO: need Spanish variant; until then this registers no callbacks for ES
+            }
+        });
+
+        final Pattern ptSkipSections = Pattern.compile(".*Tradução.*"); // headings fully matching this are skipped
+        isoToLangConfig.put("PT", new LangConfig() {
+            @Override
+            public boolean skipSection(String headingText) {
+                return ptSkipSections.matcher(headingText).matches(); // must be the Portuguese pattern, not the Spanish one
+            }
+
+            @Override
+            public EntryTypeName sectionNameToEntryType(String sectionName) {
+                if (sectionName.equalsIgnoreCase("Sinônimo") || sectionName.equalsIgnoreCase("Sinônimos")) {
+                    return EntryTypeName.SYNONYM_MULTI;
+                }
+                if (sectionName.equalsIgnoreCase("Antônimo") || sectionName.equalsIgnoreCase("Antônimos")) {
+                    return EntryTypeName.ANTONYM_MULTI;
+                }
+                return null; // any other heading: no special entry type
+            }
+
+            @Override
+            public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
+                final String wikiText = wikiTokenizer.wikiLinkText();
+                return wikiText.startsWith("Categoria:"); // true for links whose text starts with the category prefix
+            }
+            @Override
+            public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
+                if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
+                    return null; // "w:" and "Image:" destinations yield no link target
+                }
+                final int hashPos = wikiLinkDest.indexOf("#"); // position of a "#fragment" suffix, if any
+                if (hashPos != -1) {
+                    wikiLinkDest = wikiLinkDest.substring(0, hashPos); // keep only the part before '#'
+                    if (wikiLinkDest.isEmpty()) {
+                        wikiLinkDest = wikiLinkText; // destination was only "#fragment": fall back to the link text
+                    }
+                }
+                return wikiLinkDest;
+            }
+
+            @Override
+            public void addFunctionCallbacks(
+                Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
+                // TODO: need Portuguese variant; until then this registers no callbacks for PT
+            }
+        });
+
+        final Pattern deSkipSections = Pattern.compile(".*(Übersetzungen|Referenzen|Quellen).*"); // headings fully matching this are skipped
+        isoToLangConfig.put("DE", new LangConfig() {
+            @Override
+            public boolean skipSection(String headingText) {
+                return deSkipSections.matcher(headingText).matches();
+            }
+
+            @Override
+            public EntryTypeName sectionNameToEntryType(String sectionName) {
+                if (sectionName.equalsIgnoreCase("Synonyme")) {
+                    return EntryTypeName.SYNONYM_MULTI;
+                }
+                if (sectionName.equalsIgnoreCase("Gegenwörter")) {
+                    return EntryTypeName.ANTONYM_MULTI;
+                }
+                return null; // any other heading: no special entry type
+            }
+
+            @Override
+            public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
+                final String wikiText = wikiTokenizer.wikiLinkText();
+                return wikiText.startsWith("Kategorie:"); // true for links whose text starts with the category prefix
+            }
+            @Override
+            public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
+                if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
+                    return null; // "w:" and "Image:" destinations yield no link target
+                }
+                final int hashPos = wikiLinkDest.indexOf("#"); // position of a "#fragment" suffix, if any
+                if (hashPos != -1) {
+                    wikiLinkDest = wikiLinkDest.substring(0, hashPos); // keep only the part before '#'
+                    if (wikiLinkDest.isEmpty()) {
+                        wikiLinkDest = wikiLinkText; // destination was only "#fragment": fall back to the link text
+                    }
+                }
+                return wikiLinkDest;
+            }
+
+            @Override
+            public void addFunctionCallbacks(
+                Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
+                DeFunctionCallbacks.addGenericCallbacks(functionCallbacks);
+            }
+        });
+
+        final Pattern itSkipSections = Pattern.compile(".*(Traduzione|Note / Riferimenti).*"); // headings fully matching this are skipped
+        isoToLangConfig.put("IT", new LangConfig() {
+            @Override
+            public boolean skipSection(String headingText) {
+                return itSkipSections.matcher(headingText).matches();
+            }
+
+            @Override
+            public EntryTypeName sectionNameToEntryType(String sectionName) {
+                if (sectionName.equalsIgnoreCase("Sinonimi")) {
+                    return EntryTypeName.SYNONYM_MULTI;
+                }
+                if (sectionName.equalsIgnoreCase("Antonimi/Contrari")) {
+                    return EntryTypeName.ANTONYM_MULTI;
+                }
+                return null; // any other heading: no special entry type
+            }
+
+            @Override
+            public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
+                final String wikiText = wikiTokenizer.wikiLinkText();
+                return wikiText.startsWith("Categoria:"); // true for links whose text starts with the category prefix
+            }
+            @Override
+            public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
+                if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
+                    return null; // "w:" and "Image:" destinations yield no link target
+                }
+                final int hashPos = wikiLinkDest.indexOf("#"); // position of a "#fragment" suffix, if any
+                if (hashPos != -1) {
+                    wikiLinkDest = wikiLinkDest.substring(0, hashPos); // keep only the part before '#'
+                    if (wikiLinkDest.isEmpty()) {
+                        wikiLinkDest = wikiLinkText; // destination was only "#fragment": fall back to the link text
+                    }
+                }
+                return wikiLinkDest;
+            }
+
+            @Override
+            public void addFunctionCallbacks(
+                Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
+                ItFunctionCallbacks.addGenericCallbacks(functionCallbacks);
+            }
+        });
+
+
+        final Pattern frSkipSections = Pattern.compile(".*([Tt]raductions|[Aa]nagrammes).*"); // headings fully matching this are skipped
+        isoToLangConfig.put("FR", new LangConfig() {
+            @Override
+            public boolean skipSection(String headingText) {
+                return frSkipSections.matcher(headingText).matches();
+            }
+
+            @Override
+            public EntryTypeName sectionNameToEntryType(String sectionName) {
+                if (sectionName.equalsIgnoreCase("Synonymes")) {
+                    return EntryTypeName.SYNONYM_MULTI;
+                }
+                if (sectionName.equalsIgnoreCase("Antonymes")) {
+                    return EntryTypeName.ANTONYM_MULTI;
+                }
+                return null; // any other heading: no special entry type
+            }
+
+            @Override
+            public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
+                final String wikiText = wikiTokenizer.wikiLinkText();
+                return wikiText.startsWith("Catégorie:"); // true for links whose text starts with the category prefix
+            }
+            @Override
+            public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
+                if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
+                    return null; // "w:" and "Image:" destinations yield no link target
+                }
+                final int hashPos = wikiLinkDest.indexOf("#"); // position of a "#fragment" suffix, if any
+                if (hashPos != -1) {
+                    wikiLinkDest = wikiLinkDest.substring(0, hashPos); // keep only the part before '#'
+                    if (wikiLinkDest.isEmpty()) {
+                        wikiLinkDest = wikiLinkText; // destination was only "#fragment": fall back to the link text
+                    }
+                }
+                return wikiLinkDest;
+            }
+
+            @Override
+            public void addFunctionCallbacks(
+                Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
+                FrFunctionCallbacks.addGenericCallbacks(functionCallbacks);
+            }
+        });
     }
 
     final IndexBuilder titleIndexBuilder;
+    final IndexBuilder defIndexBuilder;
     final String skipLangIso;
     final LangConfig langConfig;
+    final String webUrlTemplate;
 
-    public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final String wiktionaryIso, final String skipLangIso) {
+
+    public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final IndexBuilder defIndexBuilder, final String wiktionaryIso, final String skipLangIso,
+                                    final String webUrlTemplate) {
         this.titleIndexBuilder = titleIndexBuilder;
+        this.defIndexBuilder = defIndexBuilder; // NOTE(review): stored but not read in the code visible here - confirm usage
         assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
         this.langConfig = isoToLangConfig.get(wiktionaryIso);
         this.skipLangIso = skipLangIso;
+        this.webUrlTemplate = webUrlTemplate; // String.format pattern given the title; null means parseSection appends no web link
     }
 
+    IndexedEntry indexedEntry = null;
+
     @Override
     public void parseSection(String heading, String text) {
-        HtmlEntry htmlEntry = new HtmlEntry(entrySource, StringEscapeUtils.escapeHtml3(title));
-        IndexedEntry indexedEntry = new IndexedEntry(htmlEntry);
+        assert entrySource != null;
+        final HtmlEntry htmlEntry = new HtmlEntry(entrySource, title);
+        indexedEntry = new IndexedEntry(htmlEntry); // held in a field so addLinkToCurrentEntry can use it during dispatch
 
         final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
-                this);
+            this);
         langConfig.addFunctionCallbacks(callback.functionCallbacks);
 
         callback.builder = new StringBuilder();
         callback.indexedEntry = indexedEntry;
         callback.dispatch(text, null);
 
+        if (webUrlTemplate != null) {
+            final String webUrl = String.format(webUrlTemplate, title);
+            // URI.create can raise an exception, e.g. if webUrl contains %; in that case the link is simply left out.
+            try {
+                callback.builder.append(String.format("<p> <a href=\"%s\">%s</a>", URI.create(webUrl).toASCIIString(), escapeHtmlLiteral(webUrl)));
+            } catch (Exception e) { // deliberately ignored (best effort): nothing has been appended when this triggers
+            }
+        }
         htmlEntry.html = callback.builder.toString();
         indexedEntry.isValid = true;
 
         final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
+        tokenData.hasMainEntry = true; // mark the title token as having a main entry
 
         htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
         tokenData.htmlEntries.add(htmlEntry);
         // titleIndexBuilder.addEntryWithString(indexedEntry, title,
         // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
+
+        indexedEntry = null; // section finished: clear the current-entry field
     }
 
     @Override
     void removeUselessArgs(Map<String, String> namedArgs) {
     }
-    
-    static final Pattern ALL_ASCII = Pattern.compile("[\\p{ASCII}]*");
+
+    @Override
+    public void addLinkToCurrentEntry(String token, final String lang, EntryTypeName entryTypeName) {
+        // Only untagged links (lang == null) and links tagged with skipLangIso are added to the title index.
+        if (lang != null && !lang.equals(skipLangIso)) { return; }
+        titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName);
+    }
+
+    public static String escapeHtmlLiteral(final String plainText) {
+        // The HTML3-escaped text is returned as-is when it is pure ASCII; otherwise
+        // the original input is run through StringUtil.escapeUnicodeToPureHtml.
+        final String escaped = StringEscapeUtils.escapeHtml3(plainText);
+        if (!StringUtil.isAscii(escaped)) {
+            return StringUtil.escapeUnicodeToPureHtml(plainText);
+        }
+        return escaped;
+    }
+
+
 
     class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
         public AppendCallback(WholeSectionToHtmlParser parser) {
@@ -108,12 +394,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
 
         @Override
         public void onPlainText(String plainText) {
-            final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText);
-            if (ALL_ASCII.matcher(htmlEscaped).matches()) {
-                super.onPlainText(htmlEscaped);
-            } else { 
-                super.onPlainText(StringUtil.escapeToPureHtmlUnicode(plainText));
-            }
+            super.onPlainText(escapeHtmlLiteral(plainText));
         }
 
         @Override
@@ -127,14 +408,18 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
             }
             String linkDest;
             if (wikiTokenizer.wikiLinkDest() != null) {
-                linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest());
+                linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest(), wikiTokenizer.wikiLinkText());
             } else {
                 linkDest = wikiTokenizer.wikiLinkText();
             }
-            if (linkDest != null) {
-                builder.append(String.format("<a href=\"%s\">", linkDest));
+            if (sectionEntryTypeName != null) {
+                // TODO: inside a definition, this could be the wrong language.
+                titleIndexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.wikiLinkText(), sectionEntryTypeName);
+            }
+            if (!StringUtil.isNullOrEmpty(linkDest)) {
+                builder.append(String.format("<a href=\"%s\">", HtmlEntry.formatQuickdicUrl("", linkDest)));
                 super.onWikiLink(wikiTokenizer);
-                builder.append(String.format("</a>"));
+                builder.append("</a>");
             } else {
                 super.onWikiLink(wikiTokenizer);
             }
@@ -142,7 +427,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
 
         @Override
         public void onFunction(WikiTokenizer wikiTokenizer, String name,
-                List<String> args, Map<String, String> namedArgs) {
+                               List<String> args, Map<String, String> namedArgs) {
             if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
                 namedArgs.remove("lang");
             }
@@ -158,15 +443,23 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
         public void onNewline(WikiTokenizer wikiTokenizer) {
         }
 
+        EntryTypeName sectionEntryTypeName;
+        IndexBuilder currentIndexBuilder;
+
         @Override
         public void onHeading(WikiTokenizer wikiTokenizer) {
             final String headingText = wikiTokenizer.headingWikiText();
+            sectionEntryTypeName = langConfig.sectionNameToEntryType(headingText);
             final int depth = wikiTokenizer.headingDepth();
             if (langConfig.skipSection(headingText)) {
+                //System.out.println("Skipping section:" + headingText);
                 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
                     if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
+                        // System.out.println("Resume on: " + wikiTokenizer.token());
                         wikiTokenizer.returnToLineStart();
                         return;
+                    } else {
+                        // System.out.println("Skipped: " + wikiTokenizer.token());
                     }
                 }
                 return;
@@ -176,7 +469,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
             builder.append(String.format("</h%d>\n", depth));
         }
 
-        final List<Character> listPrefixStack = new ArrayList<Character>();
+        final List<Character> listPrefixStack = new ArrayList<>();
 
         @Override
         public void onListItem(WikiTokenizer wikiTokenizer) {
@@ -186,7 +479,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
             final String prefix = wikiTokenizer.listItemPrefix();
             while (listPrefixStack.size() < prefix.length()) {
                 builder.append(String.format("<%s>",
-                        WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
+                                             WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
                 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
             }
             builder.append("<li>");