From 2f137177af2a16411d31280a22bd5dad6ba03dd2 Mon Sep 17 00:00:00 2001 From: Thad Hughes Date: Fri, 16 Dec 2011 20:12:13 -0800 Subject: [PATCH] Fixing examples... --- .../parser/EnWiktionaryXmlParser.java | 59 ++++++++++++------- 1 file changed, 39 insertions(+), 20 deletions(-) diff --git a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java index d9a1249..c2e6e7c 100644 --- a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java +++ b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java @@ -23,7 +23,6 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; -import java.util.Collections; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; @@ -656,54 +655,74 @@ public class EnWiktionaryXmlParser { for (int i = 0; i < listSection.nextPrefixes.size(); ++i) { final String nextPrefix = listSection.nextPrefixes.get(i); final String nextLine = listSection.nextLines.get(i); - int mdash = nextLine.indexOf("&mdash;"); + int dash = nextLine.indexOf("&mdash;"); int mdashLen = 7; - if (mdash == -1) { - mdash = nextLine.indexOf("—"); + if (dash == -1) { + dash = nextLine.indexOf("—"); mdashLen = 1; } - if (mdash == -1) { - mdash = nextLine.indexOf("'',"); - mdashLen = 3; - } - if (mdash == -1) { - mdash = nextLine.indexOf(" - "); + if (dash == -1) { + dash = nextLine.indexOf(" - "); mdashLen = 3; } // TODO: index and clean these!!! 
- if (nextPrefix.equals("#:") && mdash != -1) { - final String foreignEx = nextLine.substring(0, mdash); - final String englishEx = nextLine.substring(mdash + mdashLen); - final Pair pair = new Pair(trim(englishEx), trim(foreignEx), swap); + if (nextPrefix.equals("#:") && dash != -1) { + final String foreignEx = nextLine.substring(0, dash); + final String englishEx = nextLine.substring(dash + mdashLen); + final Pair pair = new Pair(formatAndIndexExampleString(englishEx, enIndexBuilder), formatAndIndexExampleString(foreignEx, otherIndexBuilder), swap); pairEntry.pairs.add(pair); lastForeign = null; } else if (nextPrefix.equals("#:")){ - final Pair pair = new Pair("--", trim(nextLine), swap); + final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null), swap); lastForeign = nextLine; pairEntry.pairs.add(pair); } else if (nextPrefix.equals("#::") || nextPrefix.equals("#**")) { if (lastForeign != null) { pairEntry.pairs.remove(pairEntry.pairs.size() - 1); - final Pair pair = new Pair(nextLine, lastForeign, swap); + final Pair pair = new Pair(formatAndIndexExampleString(nextLine, enIndexBuilder), formatAndIndexExampleString(lastForeign, otherIndexBuilder), swap); pairEntry.pairs.add(pair); } else { LOG.warning("English example with no foreign: " + title + ", " + nextLine); } } else if (nextPrefix.equals("#*")) { // Can't really index these. 
- final Pair pair = new Pair("--", trim(nextLine), swap); + final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null), swap); lastForeign = nextLine; pairEntry.pairs.add(pair); } else if (nextPrefix.equals("#::*")) { - final Pair pair = new Pair("--", trim(nextLine), swap); + final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null), swap); pairEntry.pairs.add(pair); } else { assert false; } } - - + } + + private String formatAndIndexExampleString(final String example, final IndexBuilder indexBuilder) { + final WikiTokenizer wikiTokenizer = new WikiTokenizer(example, false); + final StringBuilder builder = new StringBuilder(); + boolean insideTripleQuotes = false; + while (wikiTokenizer.nextToken() != null) { + if (wikiTokenizer.isPlainText()) { + builder.append(wikiTokenizer.token()); + + } else if (wikiTokenizer.isWikiLink()) { + builder.append(wikiTokenizer.wikiLinkText()); + + } else if (wikiTokenizer.isFunction()) { + builder.append(wikiTokenizer.token()); + } else if (wikiTokenizer.isMarkup()) { + if (wikiTokenizer.token().equals("'''")) { + insideTripleQuotes = !insideTripleQuotes; + } + } else if (wikiTokenizer.isComment() || wikiTokenizer.isNewline()) { + // Do nothing. + } else { + LOG.warning("unexpected token: " + wikiTokenizer.token()); + } + } + return trim(builder.toString()); } -- 2.43.0