]> gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/parser/wiktionary/EnParser.java
d15cc9292d34962c0ab69f7b5db398f3d105dceb
[DictionaryPC.git] / src / com / hughes / android / dictionary / parser / wiktionary / EnParser.java
1 // Copyright 2012 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 package com.hughes.android.dictionary.parser.wiktionary;
16
17 import java.util.ArrayList;
18 import java.util.Arrays;
19 import java.util.Collection;
20 import java.util.LinkedHashSet;
21 import java.util.Map;
22 import java.util.Set;
23 import java.util.regex.Pattern;
24
25 import com.hughes.android.dictionary.engine.EntryTypeName;
26 import com.hughes.android.dictionary.engine.IndexBuilder;
27 import com.hughes.android.dictionary.parser.WikiTokenizer;
28
29 public abstract class EnParser extends AbstractWiktionaryParser {
30
31     // TODO: process {{ttbc}} lines
32
33     public static final Pattern partOfSpeechHeader = Pattern.compile(
34                 "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" +
35                 "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" +
36                 "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" +
37                 "Ligature|Idiom|Phrase|\\{\\{acronym\\}\\}|\\{\\{initialism\\}\\}|" +
38                 "\\{\\{abbreviation\\}\\}|" +
39                 // These are @deprecated:
40                 "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" +
41                 "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb|" +
42                 // These are extras I found:
43                 "Determiner|Numeral|Number|Cardinal number|Ordinal number|Proverb|" +
44                 "Particle|Interjection|Pronominal adverb|" +
45                 "Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable");
46
47     static final Set<String> USELESS_WIKI_ARGS = new LinkedHashSet<String>(
48         Arrays.asList(
49             "lang",
50             "sc",
51             "sort",
52             "cat",
53             "cat2",
54             "xs",
55             "nodot"));
56
57     static boolean isIgnorableTitle(final String title) {
58         return title.startsWith("Wiktionary:") ||
59                title.startsWith("Template:") ||
60                title.startsWith("Appendix:") ||
61                title.startsWith("Category:") ||
62                title.startsWith("Index:") ||
63                title.startsWith("MediaWiki:") ||
64                title.startsWith("TransWiki:") ||
65                title.startsWith("Citations:") ||
66                title.startsWith("Concordance:") ||
67                title.startsWith("Help:");
68     }
69
70     final IndexBuilder enIndexBuilder;
71     final IndexBuilder foreignIndexBuilder;
72     final Pattern langPattern;
73     final Pattern langCodePattern;
74     final boolean swap;
75
76     // State used while parsing.
77     enum State {
78         TRANSLATION_LINE,
79         ENGLISH_DEF_OF_FOREIGN,
80         ENGLISH_EXAMPLE,
81         FOREIGN_EXAMPLE,
82     }
83     State state = null;
84
85     public boolean entryIsFormOfSomething = false;
86     final Collection<String> wordForms = new ArrayList<String>();
87     boolean titleAppended = false;
88
89
90     final AppendAndIndexWikiCallback<EnParser> appendAndIndexWikiCallback = new AppendAndIndexCallback(this);
91     {
92         appendAndIndexWikiCallback.functionCallbacks.putAll(EnFunctionCallbacks.DEFAULT);
93         for (final String key : new ArrayList<String>(appendAndIndexWikiCallback.functionCallbacks.keySet())) {
94             // Don't handle the it-conj functions here.
95             if (key.startsWith("it-conj")) {
96                 appendAndIndexWikiCallback.functionCallbacks.remove(key);
97             }
98         }
99     }
100
101     EnParser(final IndexBuilder enIndexBuilder, final IndexBuilder otherIndexBuilder, final Pattern langPattern, final Pattern langCodePattern, final boolean swap) {
102         this.enIndexBuilder = enIndexBuilder;
103         this.foreignIndexBuilder = otherIndexBuilder;
104         this.langPattern = langPattern;
105         this.langCodePattern = langCodePattern;
106         this.swap = swap;
107     }
108
109     @Override
110     void removeUselessArgs(Map<String, String> namedArgs) {
111         namedArgs.keySet().removeAll(USELESS_WIKI_ARGS);
112     }
113
114     static class AppendAndIndexCallback extends AppendAndIndexWikiCallback<EnParser> {
115
116         public AppendAndIndexCallback(EnParser parser) {
117             super(parser);
118         }
119
120         @Override
121         public void onWikiLink(WikiTokenizer wikiTokenizer) {
122             final String text = wikiTokenizer.wikiLinkText();
123             final String link = wikiTokenizer.wikiLinkDest();
124             if (link != null) {
125                 if (link.contains("#English")) {
126                     dispatch(text, parser.enIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK);
127                 } else if (link.contains("#") && parser.langPattern.matcher(link).find()) {
128                     dispatch(text, parser.foreignIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF_OTHER_LANG);
129                 } else if (link.equals("plural")) {
130                     builder.append(text);
131                 } else {
132                     //LOG.warning("Special link: " + englishTokenizer.token());
133                     dispatch(text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK);
134                 }
135             } else {
136                 // link == null
137                 final EntryTypeName entryTypeName;
138                 switch (parser.state) {
139                 case TRANSLATION_LINE:
140                     entryTypeName = EntryTypeName.WIKTIONARY_TRANSLATION_WIKI_TEXT;
141                     break;
142                 case ENGLISH_DEF_OF_FOREIGN:
143                     entryTypeName = EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK;
144                     break;
145                 default:
146                     throw new IllegalStateException("Invalid enum value: " + parser.state);
147                 }
148                 dispatch(text, entryTypeName);
149             }
150         }
151
152     }
153
154 }