gitweb.fperrin.net Git - DictionaryPC.git/blobdiff - src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java
Partial support for Spanish Wiktionary.
[DictionaryPC.git] / src / com / hughes / android / dictionary / parser / wiktionary / WholeSectionToHtmlParser.java
index b3249d12f395a81fb0393f4319b5d0a26104426f..8a4bb216c9b75967fc713512ef9f0f32bc63fe34 100644 (file)
@@ -8,10 +8,10 @@ import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
 import com.hughes.android.dictionary.engine.IndexedEntry;
 import com.hughes.android.dictionary.parser.WikiTokenizer;
 import com.hughes.util.StringUtil;
-import com.sun.xml.internal.rngom.util.Uri;
 
 import org.apache.commons.lang3.StringEscapeUtils;
 
+import java.net.URI;
 import java.util.ArrayList;
 import java.util.LinkedHashMap;
 import java.util.List;
@@ -87,6 +87,55 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
             }
         });
         
+        final Pattern esSkipSections = Pattern.compile(".*(Traducciones|Locuciones).*");
+        isoToLangConfig.put("ES", new LangConfig() {
+            @Override
+            public boolean skipSection(String headingText) {
+                return esSkipSections.matcher(headingText).matches();
+            }
+
+            @Override
+            public EntryTypeName sectionNameToEntryType(String sectionName) {
+                if (sectionName.equalsIgnoreCase("sinónimo") || sectionName.equalsIgnoreCase("sinónimos")) {
+                    return EntryTypeName.SYNONYM_MULTI;
+                }
+                if (sectionName.equalsIgnoreCase("antónimo") || sectionName.equalsIgnoreCase("antónimos")) {
+                    return EntryTypeName.ANTONYM_MULTI;
+                }
+                return null;
+            }
+
+            @Override
+            public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
+                final String wikiText = wikiTokenizer.wikiLinkText();
+                if (wikiText.startsWith("Categoría:")) {
+                    return true;
+                }
+                return false;
+            }
+            @Override
+            public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
+                if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
+                    return null;
+                }
+                final int hashPos = wikiLinkDest.indexOf("#");
+                if (hashPos != -1) {
+                    wikiLinkDest = wikiLinkDest.substring(0, hashPos);
+                    if (wikiLinkDest.isEmpty()) {
+                        wikiLinkDest = wikiLinkText;
+                    }
+                }
+                return wikiLinkDest;
+            }
+
+            @Override
+            public void addFunctionCallbacks(
+                    Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
+                // TODO: need Spanish variant
+                EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
+            }
+        });
+
         final Pattern deSkipSections = Pattern.compile(".*(Übersetzungen|Referenzen|Quellen).*");
         isoToLangConfig.put("DE", new LangConfig() {
             @Override
@@ -184,35 +233,46 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
         });
 
 
-        
-        final LangConfig basicLangConfig = new LangConfig() {
+        final Pattern frSkipSections = Pattern.compile(".*(Traductions).*");
+        isoToLangConfig.put("FR", new LangConfig() {
             @Override
             public boolean skipSection(String headingText) {
-                return false;
+                return frSkipSections.matcher(headingText).matches();
             }
+            
             @Override
             public EntryTypeName sectionNameToEntryType(String sectionName) {
-                return EntryTypeName.WIKTIONARY_MENTIONED;
+                if (sectionName.equalsIgnoreCase("Synonymes")) {
+                    return EntryTypeName.SYNONYM_MULTI;
+                }
+                return null;
             }
+            
             @Override
             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
-                final String wikiText = wikiTokenizer.wikiLinkText();
-                if (wikiText.startsWith("Category:")) {
-                    return true;
-                }
                 return false;
             }
             @Override
-            public String adjustWikiLink(String wikiLinkDest, final String wikiLinkText) {
+            public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
+                if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
+                    return null;
+                }
+                final int hashPos = wikiLinkDest.indexOf("#");
+                if (hashPos != -1) {
+                    wikiLinkDest = wikiLinkDest.substring(0, hashPos);
+                    if (wikiLinkDest.isEmpty()) {
+                        wikiLinkDest = wikiLinkText;
+                    }
+                }
                 return wikiLinkDest;
             }
 
             @Override
             public void addFunctionCallbacks(
                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
+                FrFunctionCallbacks.addGenericCallbacks(functionCallbacks);
             }
-        };
-        isoToLangConfig.put("FR", basicLangConfig);
+        });
     }
 
     final IndexBuilder titleIndexBuilder;
@@ -250,7 +310,11 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
 
         if (webUrlTemplate != null) {
             final String webUrl = String.format(webUrlTemplate, title);
-            callback.builder.append(String.format("<p> <a href=\"%s\">%s</a>", Uri.escapeDisallowedChars(webUrl), escapeHtmlLiteral(webUrl)));
+           // URI.create can throw an exception, e.g. if webUrl contains '%'; in those cases the link is simply omitted.
+           try {
+            callback.builder.append(String.format("<p> <a href=\"%s\">%s</a>", URI.create(webUrl).toString(), escapeHtmlLiteral(webUrl)));
+           } catch (Exception e)
+           {}
         }
         htmlEntry.html = callback.builder.toString();
         indexedEntry.isValid = true;
@@ -282,7 +346,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
         if (StringUtil.isAscii(htmlEscaped)) {
             return htmlEscaped;
         } else { 
-            return StringUtil.escapeToPureHtmlUnicode(plainText);
+            return StringUtil.escapeUnicodeToPureHtml(plainText);
         }
 
     }
@@ -354,10 +418,14 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
             sectionEntryTypeName = langConfig.sectionNameToEntryType(headingText);
             final int depth = wikiTokenizer.headingDepth();
             if (langConfig.skipSection(headingText)) {
+                //System.out.println("Skipping section:" + headingText);
                 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
                     if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
+                        // System.out.println("Resume on: " + wikiTokenizer.token());
                         wikiTokenizer.returnToLineStart();
                         return;
+                    } else {
+                        // System.out.println("Skipped: " + wikiTokenizer.token());
                     }
                 }
                 return;