X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2Fwiktionary%2FEnParser.java;h=b60235c23991744392ce0799f485829595448107;hb=2fc669d88306d563fc9c899d8d91b25d591692ea;hp=0e1a203ff278e02573f5fb2e20d7da5f8adb334c;hpb=58fd4402729f38bf4408e8fef487a9bc359e45a0;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/EnParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/EnParser.java index 0e1a203..b60235c 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/EnParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/EnParser.java @@ -28,127 +28,127 @@ import com.hughes.android.dictionary.parser.WikiTokenizer; public abstract class EnParser extends AbstractWiktionaryParser { - // TODO: process {{ttbc}} lines - - public static final Pattern partOfSpeechHeader = Pattern.compile( - "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" + - "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" + - "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" + - "Ligature|Idiom|Phrase|\\{\\{acronym\\}\\}|\\{\\{initialism\\}\\}|" + - "\\{\\{abbreviation\\}\\}|" + - // These are @deprecated: - "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" + - "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb|" + - // These are extras I found: - "Determiner|Numeral|Number|Cardinal number|Ordinal number|Proverb|" + - "Particle|Interjection|Pronominal adverb" + - "Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable"); - - static final Set USELESS_WIKI_ARGS = new LinkedHashSet( - Arrays.asList( - "lang", - "sc", - "sort", - "cat", - "cat2", - "xs", - "nodot")); - - static boolean isIgnorableTitle(final String title) { - return title.startsWith("Wiktionary:") || - title.startsWith("Template:") || - title.startsWith("Appendix:") || - title.startsWith("Category:") || - title.startsWith("Index:") || - title.startsWith("MediaWiki:") || - title.startsWith("TransWiki:") || - title.startsWith("Citations:") || - title.startsWith("Concordance:") || - title.startsWith("Help:"); - } - - final IndexBuilder enIndexBuilder; - final IndexBuilder foreignIndexBuilder; - final Pattern langPattern; - final Pattern langCodePattern; - final boolean swap; - - // State used while parsing. - enum State { - TRANSLATION_LINE, - ENGLISH_DEF_OF_FOREIGN, - ENGLISH_EXAMPLE, - FOREIGN_EXAMPLE, - } - State state = null; - - public boolean entryIsFormOfSomething = false; - final Collection wordForms = new ArrayList(); - boolean titleAppended = false; - - - final AppendAndIndexWikiCallback appendAndIndexWikiCallback = new AppendAndIndexCallback(this); - { - appendAndIndexWikiCallback.functionCallbacks.putAll(EnFunctionCallbacks.DEFAULT); - for (final String key : new ArrayList(appendAndIndexWikiCallback.functionCallbacks.keySet())) { - // Don't handle the it-conj functions here. - if (key.startsWith("it-conj")) { - appendAndIndexWikiCallback.functionCallbacks.remove(key); + // TODO: process {{ttbc}} lines + + public static final Pattern partOfSpeechHeader = Pattern.compile( + "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" + + "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" + + "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" + + "Ligature|Idiom|Phrase|\\{\\{acronym\\}\\}|\\{\\{initialism\\}\\}|" + + "\\{\\{abbreviation\\}\\}|" + + // These are @deprecated: + "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" + + "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb|" + + // These are extras I found: + "Determiner|Numeral|Number|Cardinal number|Ordinal number|Proverb|" + + "Particle|Interjection|Pronominal adverb|" + + "Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable"); + + static final Set USELESS_WIKI_ARGS = new LinkedHashSet<>( + Arrays.asList( + "lang", + "sc", + "sort", + "cat", + "cat2", + "xs", + "nodot")); + + static boolean isIgnorableTitle(final String title) { + return title.startsWith("Wiktionary:") || + title.startsWith("Template:") || + title.startsWith("Appendix:") || + title.startsWith("Category:") || + title.startsWith("Index:") || + title.startsWith("MediaWiki:") || + title.startsWith("TransWiki:") || + title.startsWith("Citations:") || + title.startsWith("Concordance:") || + title.startsWith("Help:"); + } + + final IndexBuilder enIndexBuilder; + final IndexBuilder foreignIndexBuilder; + final Pattern langPattern; + final Pattern langCodePattern; + final boolean swap; + + // State used while parsing. + enum State { + TRANSLATION_LINE, + ENGLISH_DEF_OF_FOREIGN, + ENGLISH_EXAMPLE, + FOREIGN_EXAMPLE, + } + State state = null; + + public boolean entryIsFormOfSomething = false; + final Collection wordForms = new ArrayList<>(); + boolean titleAppended = false; + + + final AppendAndIndexWikiCallback appendAndIndexWikiCallback = new AppendAndIndexCallback(this); + { + appendAndIndexWikiCallback.functionCallbacks.putAll(EnFunctionCallbacks.DEFAULT); + for (final String key : new ArrayList<>(appendAndIndexWikiCallback.functionCallbacks.keySet())) { + // Don't handle the it-conj functions here. + if (key.startsWith("it-conj")) { + appendAndIndexWikiCallback.functionCallbacks.remove(key); + } } } - } - - EnParser(final IndexBuilder enIndexBuilder, final IndexBuilder otherIndexBuilder, final Pattern langPattern, final Pattern langCodePattern, final boolean swap) { - this.enIndexBuilder = enIndexBuilder; - this.foreignIndexBuilder = otherIndexBuilder; - this.langPattern = langPattern; - this.langCodePattern = langCodePattern; - this.swap = swap; - } - - @Override - void removeUselessArgs(Map namedArgs) { - namedArgs.keySet().removeAll(USELESS_WIKI_ARGS); - } - - static class AppendAndIndexCallback extends AppendAndIndexWikiCallback { - - public AppendAndIndexCallback(EnParser parser) { - super(parser); + + EnParser(final IndexBuilder enIndexBuilder, final IndexBuilder otherIndexBuilder, final Pattern langPattern, final Pattern langCodePattern, final boolean swap) { + this.enIndexBuilder = enIndexBuilder; + this.foreignIndexBuilder = otherIndexBuilder; + this.langPattern = langPattern; + this.langCodePattern = langCodePattern; + this.swap = swap; } @Override - public void onWikiLink(WikiTokenizer wikiTokenizer) { - final String text = wikiTokenizer.wikiLinkText(); - final String link = wikiTokenizer.wikiLinkDest(); - if (link != null) { - if (link.contains("#English")) { - dispatch(text, parser.enIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK); - } else if (link.contains("#") && parser.langPattern.matcher(link).find()) { - dispatch(text, parser.foreignIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF_OTHER_LANG); - } else if (link.equals("plural")) { - builder.append(text); - } else { - //LOG.warning("Special link: " + englishTokenizer.token()); - dispatch(text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK); + void removeUselessArgs(Map namedArgs) { + namedArgs.keySet().removeAll(USELESS_WIKI_ARGS); + } + + static class AppendAndIndexCallback extends AppendAndIndexWikiCallback { + + public AppendAndIndexCallback(EnParser parser) { + super(parser); } - } else { - // link == null - final EntryTypeName entryTypeName; - switch (parser.state) { - case TRANSLATION_LINE: - entryTypeName = EntryTypeName.WIKTIONARY_TRANSLATION_WIKI_TEXT; - break; - case ENGLISH_DEF_OF_FOREIGN: - entryTypeName = EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK; - break; - default: - throw new IllegalStateException("Invalid enum value: " + parser.state); + + @Override + public void onWikiLink(WikiTokenizer wikiTokenizer) { + final String text = wikiTokenizer.wikiLinkText(); + final String link = wikiTokenizer.wikiLinkDest(); + if (link != null) { + if (link.contains("#English")) { + dispatch(text, parser.enIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK); + } else if (link.contains("#") && parser.langPattern.matcher(link).find()) { + dispatch(text, parser.foreignIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF_OTHER_LANG); + } else if (link.equals("plural")) { + builder.append(text); + } else { + //LOG.warning("Special link: " + englishTokenizer.token()); + dispatch(text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK); + } + } else { + // link == null + final EntryTypeName entryTypeName; + switch (parser.state) { + case TRANSLATION_LINE: + entryTypeName = EntryTypeName.WIKTIONARY_TRANSLATION_WIKI_TEXT; + break; + case ENGLISH_DEF_OF_FOREIGN: + entryTypeName = EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK; + break; + default: + throw new IllegalStateException("Invalid enum value: " + parser.state); + } + dispatch(text, entryTypeName); + } } - dispatch(text, entryTypeName); - } + } - - } }