1 // Copyright 2012 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 package com.hughes.android.dictionary.parser.wiktionary;
17 import java.util.ArrayList;
18 import java.util.Arrays;
19 import java.util.Collection;
20 import java.util.LinkedHashSet;
23 import java.util.regex.Pattern;
25 import com.hughes.android.dictionary.engine.EntryTypeName;
26 import com.hughes.android.dictionary.engine.IndexBuilder;
27 import com.hughes.android.dictionary.parser.WikiTokenizer;
29 public abstract class EnParser extends AbstractWiktionaryParser {
31 // TODO: process {{ttbc}} lines
33 static final Pattern partOfSpeechHeader = Pattern.compile(
34 "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" +
35 "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" +
36 "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" +
37 "Ligature|Idiom|Phrase|\\{\\{acronym\\}\\}|\\{\\{initialism\\}\\}|" +
38 "\\{\\{abbreviation\\}\\}|" +
39 // These are @deprecated:
40 "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" +
41 "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb|" +
42 // These are extras I found:
43 "Determiner|Numeral|Number|Cardinal number|Ordinal number|Proverb|" +
44 "Particle|Interjection|Pronominal adverb" +
45 "Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable");
47 // Might only want to remove "lang" if it's equal to "zh", for example.
48 static final Set<String> USELESS_WIKI_ARGS = new LinkedHashSet<String>(
58 static boolean isIgnorableTitle(final String title) {
59 return title.startsWith("Wiktionary:") ||
60 title.startsWith("Template:") ||
61 title.startsWith("Appendix:") ||
62 title.startsWith("Category:") ||
63 title.startsWith("Index:") ||
64 title.startsWith("MediaWiki:") ||
65 title.startsWith("TransWiki:") ||
66 title.startsWith("Citations:") ||
67 title.startsWith("Concordance:") ||
68 title.startsWith("Help:");
// Index used when a wiki link's destination contains "#English".
final IndexBuilder enIndexBuilder;
// Index used when a wiki link's destination contains '#' and langPattern
// matches it; assigned from the constructor's otherIndexBuilder argument.
final IndexBuilder foreignIndexBuilder;
// Searched (find()) against wiki-link destinations to recognise links into
// the foreign language's section.
final Pattern langPattern;
// Assigned from the constructor; no use is visible in this part of the file.
// NOTE(review): presumably matches the language's code(s) — confirm.
final Pattern langCodePattern;
77 // State used while parsing.
80 ENGLISH_DEF_OF_FOREIGN,
// NOTE(review): presumably true when the entry being parsed is a form of
// another word; it is only initialised here and written elsewhere — confirm.
public boolean entryIsFormOfSomething = false;
// NOTE(review): presumably the word forms gathered for the current entry;
// nothing in view adds to or clears it — confirm against callers.
final Collection<String> wordForms = new ArrayList<String>();
// NOTE(review): presumably records whether the page title has already been
// appended to the current output — confirm.
boolean titleAppended = false;
91 final AppendAndIndexWikiCallback<EnParser> appendAndIndexWikiCallback = new AppendAndIndexCallback(this);
93 appendAndIndexWikiCallback.functionCallbacks.putAll(EnFunctionCallbacks.DEFAULT);
96 EnParser(final IndexBuilder enIndexBuilder, final IndexBuilder otherIndexBuilder, final Pattern langPattern, final Pattern langCodePattern, final boolean swap) {
97 this.enIndexBuilder = enIndexBuilder;
98 this.foreignIndexBuilder = otherIndexBuilder;
99 this.langPattern = langPattern;
100 this.langCodePattern = langCodePattern;
105 void removeUselessArgs(Map<String, String> namedArgs) {
106 namedArgs.keySet().removeAll(USELESS_WIKI_ARGS);
109 static class AppendAndIndexCallback extends AppendAndIndexWikiCallback<EnParser> {
111 public AppendAndIndexCallback(EnParser parser) {
116 public void onWikiLink(WikiTokenizer wikiTokenizer) {
117 final String text = wikiTokenizer.wikiLinkText();
118 final String link = wikiTokenizer.wikiLinkDest();
120 if (link.contains("#English")) {
121 dispatch(text, parser.enIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK);
122 } else if (link.contains("#") && parser.langPattern.matcher(link).find()) {
123 dispatch(text, parser.foreignIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF_OTHER_LANG);
124 } else if (link.equals("plural")) {
125 builder.append(text);
127 //LOG.warning("Special link: " + englishTokenizer.token());
128 dispatch(text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK);
132 final EntryTypeName entryTypeName;
133 switch (parser.state) {
134 case TRANSLATION_LINE:
135 entryTypeName = EntryTypeName.WIKTIONARY_TRANSLATION_WIKI_TEXT;
137 case ENGLISH_DEF_OF_FOREIGN:
138 entryTypeName = EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK;
141 throw new IllegalStateException("Invalid enum value: " + parser.state);
143 dispatch(text, entryTypeName);