gitweb.fperrin.net Git - DictionaryPC.git/blobdiff - src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java
Format links properly.
[DictionaryPC.git] / src / com / hughes / android / dictionary / parser / wiktionary / WholeSectionToHtmlParser.java
index 53104fc166e862c39cb89b9ec4a814d0fdc50253..0f7ae2d1cef8d94876a6b97ad0627a7633ee4562 100644 (file)
@@ -1,15 +1,19 @@
 
 package com.hughes.android.dictionary.parser.wiktionary;
 
+import com.hughes.android.dictionary.HtmlDisplayActivity;
+import com.hughes.android.dictionary.engine.EntryTypeName;
 import com.hughes.android.dictionary.engine.HtmlEntry;
 import com.hughes.android.dictionary.engine.IndexBuilder;
 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
 import com.hughes.android.dictionary.engine.IndexedEntry;
 import com.hughes.android.dictionary.parser.WikiTokenizer;
+import com.hughes.util.StringUtil;
 
 import org.apache.commons.lang3.StringEscapeUtils;
 
 import java.util.ArrayList;
+import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.regex.Pattern;
@@ -17,22 +21,129 @@ import java.util.regex.Pattern;
 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
 
     public static final String NAME = "WholeSectionToHtmlParser";
-    public static final Pattern skipSections = Pattern.compile(".*Translations.*");
+
+    /**
+     * Per-Wiktionary-language hooks consulted while a section is converted to HTML.
+     * Implementations are registered in isoToLangConfig under an ISO code.
+     */
+    interface LangConfig {
+        /** Returns true when the section under this heading should be skipped (consulted by onHeading). */
+        boolean skipSection(final String name);
+        /**
+         * Maps a heading to the entry type under which wiki-link text in that section is indexed.
+         * A null result means the wiki-link handler does not index the link text.
+         */
+        EntryTypeName sectionNameToEntryType(String sectionName);
+        /** Returns true when the wiki-link handler should drop the link without emitting anything. */
+        boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
+        /**
+         * Rewrites an explicit link destination. The wiki-link handler only calls this when the
+         * link has a destination, and emits no anchor element when the result is null.
+         */
+        String adjustWikiLink(String wikiLinkDest, final String wikiLinkText);
+        /** Adds language-specific function callbacks to the map passed in by parseSection. */
+        void addFunctionCallbacks(
+                Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
+    }
+    /** Language hooks keyed by Wiktionary ISO code ("EN", "FR", "DE", "IT"). */
+    static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<String,LangConfig>();
+    static {
+        // The alternation is grouped so the leading/trailing ".*" apply to every
+        // alternative; ungrouped, "|" binds loosest and a heading such as
+        // "Translations to be checked" stops matching under matches().
+        final Pattern enSkipSections = Pattern.compile(".*(?:Translations|Anagrams|References).*");
+        isoToLangConfig.put("EN", new LangConfig() {
+            @Override
+            public boolean skipSection(String headingText) {
+                return enSkipSections.matcher(headingText).matches();
+            }
+            
+            @Override
+            public EntryTypeName sectionNameToEntryType(String sectionName) {
+                if (sectionName.equalsIgnoreCase("Synonyms")) {
+                    return EntryTypeName.SYNONYM_MULTI;
+                }
+                if (sectionName.equalsIgnoreCase("Antonyms")) {
+                    return EntryTypeName.ANTONYM_MULTI;
+                }
+                if (EnParser.partOfSpeechHeader.matcher(sectionName).matches()) {
+                    // We need to put it in the other index, too.
+                    return null;
+                }
+                if (sectionName.equalsIgnoreCase("Derived Terms")) {
+                    return null;
+                }
+                return null;
+            }
+            
+            @Override
+            public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
+                // Links whose text starts with "Category:" are dropped entirely.
+                return wikiTokenizer.wikiLinkText().startsWith("Category:");
+            }
+            @Override
+            public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
+                // A null destination tells the caller not to wrap the link in an anchor.
+                if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
+                    return null;
+                }
+                // Strip any "#fragment"; a bare "#fragment" link falls back to the link text.
+                final int hashPos = wikiLinkDest.indexOf("#");
+                if (hashPos != -1) {
+                    wikiLinkDest = wikiLinkDest.substring(0, hashPos);
+                    if (wikiLinkDest.isEmpty()) {
+                        wikiLinkDest = wikiLinkText;
+                    }
+                }
+                return wikiLinkDest;
+            }
+
+            @Override
+            public void addFunctionCallbacks(
+                    Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
+                EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
+            }
+        });
+        
+        // Shared by FR/DE/IT: skips no section, maps every section to WIKTIONARY_MENTIONED, keeps link targets.
+        final LangConfig basicLangConfig = new LangConfig() {
+            @Override
+            public boolean skipSection(String headingText) {
+                return false;
+            }
+            @Override
+            public EntryTypeName sectionNameToEntryType(String sectionName) {
+                return EntryTypeName.WIKTIONARY_MENTIONED;
+            }
+            @Override
+            public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
+                return wikiTokenizer.wikiLinkText().startsWith("Category:");
+            }
+            @Override
+            public String adjustWikiLink(String wikiLinkDest, final String wikiLinkText) {
+                return wikiLinkDest;
+            }
+
+            @Override
+            public void addFunctionCallbacks(
+                    Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
+            }
+        };
+        isoToLangConfig.put("FR", basicLangConfig);
+        isoToLangConfig.put("DE", basicLangConfig);
+        isoToLangConfig.put("IT", basicLangConfig);
+    }
 
     final IndexBuilder titleIndexBuilder;
+    final IndexBuilder defIndexBuilder;
+    final String skipLangIso;
+    final LangConfig langConfig;
+    
 
-    public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder) {
+    /**
+     * Creates a parser bound to the given index builders and Wiktionary language.
+     *
+     * @param titleIndexBuilder index that receives tokens through addLinkToCurrentEntry
+     * @param defIndexBuilder stored on the instance; no use of it is visible in this diff
+     * @param wiktionaryIso key into isoToLangConfig that selects the LangConfig
+     * @param skipLangIso language code whose "lang" named argument onFunction removes
+     */
+    public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final IndexBuilder defIndexBuilder, final String wiktionaryIso, final String skipLangIso) {
         this.titleIndexBuilder = titleIndexBuilder;
-
+        this.defIndexBuilder = defIndexBuilder;
+        // NOTE(review): this check only runs with assertions enabled (-ea); without it an
+        // unregistered code leaves langConfig null - consider an explicit exception.
+        assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
+        this.langConfig = isoToLangConfig.get(wiktionaryIso);
+        this.skipLangIso = skipLangIso;
     }
+    
+    IndexedEntry indexedEntry = null;
 
     @Override
-    void parseSection(String heading, String text) {
-        HtmlEntry htmlEntry = new HtmlEntry(entrySource, StringEscapeUtils.escapeHtml3(title));
-        IndexedEntry indexedEntry = new IndexedEntry(htmlEntry);
+    public void parseSection(String heading, String text) {
+        assert entrySource != null;
+        final HtmlEntry htmlEntry = new HtmlEntry(entrySource, title);
+        indexedEntry = new IndexedEntry(htmlEntry);
 
         final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
                 this);
+        langConfig.addFunctionCallbacks(callback.functionCallbacks);
 
         callback.builder = new StringBuilder();
         callback.indexedEntry = indexedEntry;
@@ -47,11 +158,20 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
         tokenData.htmlEntries.add(htmlEntry);
         // titleIndexBuilder.addEntryWithString(indexedEntry, title,
         // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
+        
+        indexedEntry = null;
     }
 
     @Override
     void removeUselessArgs(Map<String, String> namedArgs) {
+        // No-op: this parser leaves namedArgs untouched.
     }
+    
+    /**
+     * Indexes {@code token} in titleIndexBuilder against the entry currently being parsed.
+     * The indexedEntry field is only set while parseSection is running and is reset to
+     * null at the end of it.
+     *
+     * @param token text to add to the title index
+     * @param entryTypeName entry type passed through to the index builder
+     */
+    @Override
+    public void addLinkToCurrentEntry(String token, EntryTypeName entryTypeName) {
+        titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName);
+    }
+
+
 
     class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
         public AppendCallback(WholeSectionToHtmlParser parser) {
@@ -60,7 +180,12 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
 
+        /**
+         * Escapes plain text before handing it to the superclass. The escapeHtml3 result is
+         * used when it is pure ASCII; otherwise the raw text goes through
+         * StringUtil.escapeToPureHtmlUnicode instead.
+         */
         @Override
         public void onPlainText(String plainText) {
-            super.onPlainText(StringEscapeUtils.escapeHtml3(plainText));
+            final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText);
+            if (StringUtil.isAscii(htmlEscaped)) {
+                super.onPlainText(htmlEscaped);
+            } else { 
+                // NOTE(review): this branch passes the unescaped plainText; confirm that
+                // escapeToPureHtmlUnicode also escapes HTML metacharacters such as '<' and '&'.
+                super.onPlainText(StringUtil.escapeToPureHtmlUnicode(plainText));
+            }
         }
 
         @Override
@@ -69,12 +194,34 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
                 // Skips wikilinks like: [[en::dick]]
                 return;
             }
-            super.onWikiLink(wikiTokenizer);
+            if (langConfig.skipWikiLink(wikiTokenizer)) {
+                return;
+            }
+            String linkDest;
+            if (wikiTokenizer.wikiLinkDest() != null) {
+                linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest(), wikiTokenizer.wikiLinkText());
+            } else {
+                linkDest = wikiTokenizer.wikiLinkText();
+            }
+            if (sectionEntryTypeName != null) {
+                // TODO: inside a definition, this could be the wrong language.
+                titleIndexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.wikiLinkText(), sectionEntryTypeName);
+            }
+            if (linkDest != null) {
+                builder.append(String.format("<a href=\"%s\">", HtmlEntry.formatQuickdicUrl("", linkDest)));
+                super.onWikiLink(wikiTokenizer);
+                builder.append(String.format("</a>"));
+            } else {
+                super.onWikiLink(wikiTokenizer);
+            }
         }
 
         @Override
         public void onFunction(WikiTokenizer wikiTokenizer, String name,
                 List<String> args, Map<String, String> namedArgs) {
+            // Drop a "lang" named argument equal to skipLangIso (case-insensitive) before
+            // delegating; namedArgs is modified in place.
+            // NOTE(review): assumes skipLangIso is non-null - confirm against callers.
+            if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
+                namedArgs.remove("lang");
+            }
             super.onFunction(wikiTokenizer, name, args, namedArgs);
         }
 
@@ -86,12 +233,16 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
         @Override
         public void onNewline(WikiTokenizer wikiTokenizer) {
+            // No-op: newline tokens add nothing to the output.
         }
+        
+        EntryTypeName sectionEntryTypeName;
+        IndexBuilder currentIndexBuilder;
 
         @Override
         public void onHeading(WikiTokenizer wikiTokenizer) {
             final String headingText = wikiTokenizer.headingWikiText();
+            sectionEntryTypeName = langConfig.sectionNameToEntryType(headingText);
             final int depth = wikiTokenizer.headingDepth();
-            if (skipSections.matcher(headingText).matches()) {
+            if (langConfig.skipSection(headingText)) {
                 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
                     if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
                         wikiTokenizer.returnToLineStart();