1 // Copyright 2012 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 package com.hughes.android.dictionary.parser.wiktionary;
17 import java.util.ArrayList;
18 import java.util.Arrays;
19 import java.util.Collection;
20 import java.util.LinkedHashSet;
23 import java.util.regex.Pattern;
25 import com.hughes.android.dictionary.engine.EntryTypeName;
26 import com.hughes.android.dictionary.engine.IndexBuilder;
27 import com.hughes.android.dictionary.parser.WikiTokenizer;
29 public abstract class EnParser extends AbstractWiktionaryParser {
31 // TODO: process {{ttbc}} lines
33 public static final Pattern partOfSpeechHeader = Pattern.compile(
34 "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" +
35 "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" +
36 "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" +
37 "Ligature|Idiom|Phrase|\\{\\{acronym\\}\\}|\\{\\{initialism\\}\\}|" +
38 "\\{\\{abbreviation\\}\\}|" +
39 // These are @deprecated:
40 "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" +
41 "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb|" +
42 // These are extras I found:
43 "Determiner|Numeral|Number|Cardinal number|Ordinal number|Proverb|" +
44 "Particle|Interjection|Pronominal adverb|" +
45 "Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable");
47 static final Set<String> USELESS_WIKI_ARGS = new LinkedHashSet<String>(
57 static boolean isIgnorableTitle(final String title) {
58 return title.startsWith("Wiktionary:") ||
59 title.startsWith("Template:") ||
60 title.startsWith("Appendix:") ||
61 title.startsWith("Category:") ||
62 title.startsWith("Index:") ||
63 title.startsWith("MediaWiki:") ||
64 title.startsWith("TransWiki:") ||
65 title.startsWith("Citations:") ||
66 title.startsWith("Concordance:") ||
67 title.startsWith("Help:");
// Index builder that onWikiLink dispatches link text to when the link destination
// contains "#English".
70 final IndexBuilder enIndexBuilder;
// Index builder that onWikiLink dispatches link text to when the link destination
// contains "#" and langPattern finds a match in it. Assigned from the constructor's
// otherIndexBuilder argument.
71 final IndexBuilder foreignIndexBuilder;
// Pattern searched for (with find()) inside wiki-link destinations by onWikiLink.
72 final Pattern langPattern;
// Assigned by the constructor; no use of it is visible in this part of the file.
// NOTE(review): presumably matches the foreign language's code -- confirm against subclasses.
73 final Pattern langCodePattern;
76 // State used while parsing.
79 ENGLISH_DEF_OF_FOREIGN,
// Public, mutable flag that starts out false; the code that sets it is not in this part of the file.
// NOTE(review): presumably marks the current entry as an inflected form of another word -- confirm.
85 public boolean entryIsFormOfSomething = false;
// Mutable list of strings, empty on construction; the code that fills it is not in this part of the file.
// NOTE(review): presumably collects alternative forms of the current word -- confirm.
86 final Collection<String> wordForms = new ArrayList<String>();
// Starts out false; the code that flips it is not in this part of the file.
// NOTE(review): presumably records that the entry title was already emitted -- confirm.
87 boolean titleAppended = false;
// Wiki callback owned by this parser. It is constructed with `this`, which is how
// AppendAndIndexCallback reaches the parser's index builders, langPattern and state.
90 final AppendAndIndexWikiCallback<EnParser> appendAndIndexWikiCallback = new AppendAndIndexCallback(this);
// Copy every entry of EnFunctionCallbacks.DEFAULT into the callback's function table.
92 appendAndIndexWikiCallback.functionCallbacks.putAll(EnFunctionCallbacks.DEFAULT);
// Iterate over a snapshot of the keys so that removing entries from functionCallbacks
// inside the loop cannot disturb the iteration.
93 for (final String key : new ArrayList<String>(appendAndIndexWikiCallback.functionCallbacks.keySet())) {
94 // Don't handle the it-conj functions here.
// Every key beginning with "it-conj" is dropped from the table again.
95 if (key.startsWith("it-conj")) {
96 appendAndIndexWikiCallback.functionCallbacks.remove(key);
/**
 * Creates a parser and records the index builders and language patterns it was given.
 *
 * @param enIndexBuilder stored as {@code enIndexBuilder}
 * @param otherIndexBuilder stored as {@code foreignIndexBuilder}
 * @param langPattern stored as {@code langPattern}
 * @param langCodePattern stored as {@code langCodePattern}
 * @param swap not referenced by any of the assignments shown here
 *     (NOTE(review): confirm where it is consumed)
 */
101 EnParser(final IndexBuilder enIndexBuilder, final IndexBuilder otherIndexBuilder, final Pattern langPattern, final Pattern langCodePattern, final boolean swap) {
102 this.enIndexBuilder = enIndexBuilder;
// The "other" builder is kept under the name foreignIndexBuilder.
103 this.foreignIndexBuilder = otherIndexBuilder;
104 this.langPattern = langPattern;
105 this.langCodePattern = langCodePattern;
110 void removeUselessArgs(Map<String, String> namedArgs) {
111 namedArgs.keySet().removeAll(USELESS_WIKI_ARGS);
114 static class AppendAndIndexCallback extends AppendAndIndexWikiCallback<EnParser> {
116 public AppendAndIndexCallback(EnParser parser) {
121 public void onWikiLink(WikiTokenizer wikiTokenizer) {
122 final String text = wikiTokenizer.wikiLinkText();
123 final String link = wikiTokenizer.wikiLinkDest();
125 if (link.contains("#English")) {
126 dispatch(text, parser.enIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK);
127 } else if (link.contains("#") && parser.langPattern.matcher(link).find()) {
128 dispatch(text, parser.foreignIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF_OTHER_LANG);
129 } else if (link.equals("plural")) {
130 builder.append(text);
132 //LOG.warning("Special link: " + englishTokenizer.token());
133 dispatch(text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK);
137 final EntryTypeName entryTypeName;
138 switch (parser.state) {
139 case TRANSLATION_LINE:
140 entryTypeName = EntryTypeName.WIKTIONARY_TRANSLATION_WIKI_TEXT;
142 case ENGLISH_DEF_OF_FOREIGN:
143 entryTypeName = EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK;
146 throw new IllegalStateException("Invalid enum value: " + parser.state);
148 dispatch(text, entryTypeName);