1 // Copyright 2012 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 package com.hughes.android.dictionary.parser.enwiktionary;
17 import java.util.Arrays;
18 import java.util.Collections;
19 import java.util.LinkedHashMap;
20 import java.util.LinkedHashSet;
21 import java.util.List;
24 import java.util.concurrent.atomic.AtomicInteger;
25 import java.util.logging.Logger;
27 import com.hughes.android.dictionary.engine.EntryTypeName;
28 import com.hughes.android.dictionary.engine.IndexBuilder;
29 import com.hughes.android.dictionary.parser.WikiTokenizer;
30 import com.hughes.util.ListUtil;
32 public final class FunctionCallbacksDefault {
// Logger shared by the callbacks in this file. Note that it is created under
// EnWiktionaryXmlParser's class name rather than this class's own name.
34 static final Logger LOG = Logger.getLogger(EnWiktionaryXmlParser.class.getName());
// Registry from wiki function (template) name to the callback that handles it.
// It is a LinkedHashMap, so iteration follows registration order.
36 static final Map<String,FunctionCallback> DEFAULT = new LinkedHashMap<String, FunctionCallback>();
// Translation templates: every variant shares one TranslationCallback instance.
39 FunctionCallback callback = new TranslationCallback();
40 DEFAULT.put("t", callback);
41 DEFAULT.put("t+", callback);
42 DEFAULT.put("t-", callback);
43 DEFAULT.put("tø", callback);
44 DEFAULT.put("apdx-t", callback);
// Script/encoding wrapper names: all of them map to one EncodingCallback instance.
46 callback = new EncodingCallback();
47 Set<String> encodings = new LinkedHashSet<String>(Arrays.asList(
49 "sd-Arab", "ku-Arab", "Arab", "unicode", "Laoo", "ur-Arab", "Thai",
50 "fa-Arab", "Khmr", "Cyrl", "IPAchar", "ug-Arab", "ko-inline",
51 "Jpan", "Kore", "Hebr", "rfscript", "Beng", "Mong", "Knda", "Cyrs",
52 "yue-tsj", "Mlym", "Tfng", "Grek", "yue-yue-j"));
53 for (final String encoding : encodings) {
54 DEFAULT.put(encoding, callback);
// Link templates handled by l_term.
57 callback = new l_term();
58 DEFAULT.put("l", callback);
59 DEFAULT.put("term", callback);
// Gender markers handled by Gender.
61 callback = new Gender();
62 DEFAULT.put("m", callback);
63 DEFAULT.put("f", callback);
64 DEFAULT.put("n", callback);
65 DEFAULT.put("p", callback);
66 DEFAULT.put("g", callback);
// NOTE(review): this AppendArg0 instance is replaced by the next assignment
// before any DEFAULT.put uses it - confirm whether registrations were intended here.
68 callback = new AppendArg0();
// Names registered with the Ignore callback.
70 callback = new Ignore();
71 DEFAULT.put("trreq", callback);
72 DEFAULT.put("t-image", callback);
73 DEFAULT.put("defn", callback);
74 DEFAULT.put("rfdef", callback);
75 DEFAULT.put("rfdate", callback);
76 DEFAULT.put("rfex", callback);
77 DEFAULT.put("rfquote", callback);
78 DEFAULT.put("attention", callback);
79 DEFAULT.put("zh-attention", callback);
// "... of" templates: all share one FormOf instance.
82 callback = new FormOf();
83 DEFAULT.put("form of", callback);
84 DEFAULT.put("conjugation of", callback);
85 DEFAULT.put("participle of", callback);
86 DEFAULT.put("present participle of", callback);
87 DEFAULT.put("past participle of", callback);
88 DEFAULT.put("feminine past participle of", callback);
89 DEFAULT.put("gerund of", callback);
90 DEFAULT.put("feminine of", callback);
91 DEFAULT.put("plural of", callback);
92 DEFAULT.put("feminine plural of", callback);
93 DEFAULT.put("inflected form of", callback);
94 DEFAULT.put("alternative form of", callback);
95 DEFAULT.put("dated form of", callback);
96 DEFAULT.put("apocopic form of", callback);
// Headword-line templates handled by InflOrHead.
98 callback = new InflOrHead();
99 DEFAULT.put("infl", callback);
100 DEFAULT.put("head", callback);
// "..." is handled by AppendName.
102 callback = new AppendName();
103 DEFAULT.put("...", callback);
// Names that each get their own dedicated callback instance.
105 DEFAULT.put("qualifier", new QualifierCallback());
106 DEFAULT.put("italbrac", new italbrac());
107 DEFAULT.put("gloss", new gloss());
108 DEFAULT.put("not used", new not_used());
109 DEFAULT.put("wikipedia", new wikipedia());
// Generic callback that writes a function call back out in "name|arg|...|key=value"
// form: the name, then every non-empty positional argument, then the named arguments.
113 static final class NameAndArgs implements FunctionCallback {
115 public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
116 final Map<String, String> namedArgs, final EnWiktionaryXmlParser parser,
117 final AppendAndIndexWikiCallback appendAndIndexWikiCallback) {
// The function name is written verbatim to the output builder.
120 appendAndIndexWikiCallback.builder.append(name);
// Empty positional arguments are skipped; the others are prefixed with "|" and
// passed to dispatch with a null index builder and a null entry type.
122 for (int i = 0; i < args.size(); ++i) {
123 if (args.get(i).length() > 0) {
124 appendAndIndexWikiCallback.builder.append("|");
125 appendAndIndexWikiCallback.dispatch(args.get(i), null, null);
// Named arguments follow, via the shared helper.
128 appendNamedArgs(namedArgs, appendAndIndexWikiCallback);
// Shared instance; FormOf calls it to render the remaining arguments.
// NOTE(review): unlike FORM_OF this field is not declared final - confirm that is intended.
132 static NameAndArgs NAME_AND_ARGS = new NameAndArgs();
// Writes every entry of namedArgs to the callback's builder as "|key=value".
// Key and value are each passed through dispatch; the "|" and "=" are written directly.
//
// namedArgs: named arguments of the wiki function, iterated in the map's own order.
// appendAndIndexWikiCallback: receives the output through its builder and dispatch.
134 static void appendNamedArgs(final Map<String, String> namedArgs,
135 final AppendAndIndexWikiCallback appendAndIndexWikiCallback) {
136 for (final Map.Entry<String, String> entry : namedArgs.entrySet()) {
137 appendAndIndexWikiCallback.builder.append("|");
138 appendAndIndexWikiCallback.dispatch(entry.getKey(), null, null);
139 appendAndIndexWikiCallback.builder.append("=");
// Both stay null: the code that would have set them for "tr" is commented out below,
// so the value is dispatched with no index builder and no entry type.
140 EntryTypeName entryTypeName = null;
141 IndexBuilder indexBuilder = null;
142 // This doesn't work: we'd need to add to word-forms.
143 // System.out.println(entry.getKey());
144 // if (entry.getKey().equals("tr")) {
145 // entryTypeName = EntryTypeName.WIKTIONARY_TRANSLITERATION;
146 // indexBuilder = appendAndIndexWikiCallback.parser.foreignIndexBuilder;
148 appendAndIndexWikiCallback.dispatch(entry.getValue(), indexBuilder, entryTypeName);
152 // ------------------------------------------------------------------
// Callback for the translation templates registered in DEFAULT ("t", "t+", "t-", ...).
// Positional arg 0 is read as the language code and arg 1 as the translated word;
// the named args "tr" and "alt" get special treatment.
154 static final class TranslationCallback implements FunctionCallback {
156 public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
157 final Map<String, String> namedArgs, final EnWiktionaryXmlParser parser,
158 final AppendAndIndexWikiCallback appendAndIndexWikiCallback) {
// "tr" and "alt" are pulled out of namedArgs (the caller's map is modified), and
// keys listed in USELESS_WIKI_ARGS are dropped so they never reach the catch-all below.
160 final String transliteration = namedArgs.remove("tr");
161 final String alt = namedArgs.remove("alt");
162 namedArgs.keySet().removeAll(EnWiktionaryXmlParser.USELESS_WIKI_ARGS);
// A language code and a word are both expected; anything shorter is logged.
163 if (args.size() < 2) {
164 LOG.warning("{{t...}} with wrong args: title=" + parser.title);
// Keep a per-language count of translation templates seen, creating the counter
// the first time a language code appears.
167 final String langCode = ListUtil.get(args, 0);
168 if (!appendAndIndexWikiCallback.langCodeToTCount.containsKey(langCode)) {
169 appendAndIndexWikiCallback.langCodeToTCount.put(langCode, new AtomicInteger());
171 appendAndIndexWikiCallback.langCodeToTCount.get(langCode).incrementAndGet();
// The displayed text is "alt" when given, otherwise the word itself.
172 final String word = ListUtil.get(args, 1);
173 appendAndIndexWikiCallback.dispatch(alt != null ? alt : word, EntryTypeName.WIKTIONARY_TITLE_MULTI);
// Positional args from index 2 on are written verbatim (not dispatched) inside
// braces, with "|" as the delimiter.
176 if (args.size() > 2) {
177 appendAndIndexWikiCallback.builder.append(" {");
178 for (int i = 2; i < args.size(); ++i) {
180 appendAndIndexWikiCallback.builder.append("|");
182 appendAndIndexWikiCallback.builder.append(args.get(i));
184 appendAndIndexWikiCallback.builder.append("}");
// The transliteration is shown in parentheses and dispatched as WIKTIONARY_TRANSLITERATION.
187 if (transliteration != null) {
188 appendAndIndexWikiCallback.builder.append(" (");
189 appendAndIndexWikiCallback.dispatch(transliteration, EntryTypeName.WIKTIONARY_TRANSLITERATION);
190 appendAndIndexWikiCallback.builder.append(")");
194 // If alt wasn't null, we appended alt instead of the actual word
195 // we're filing under..
196 appendAndIndexWikiCallback.builder.append(" (");
197 appendAndIndexWikiCallback.dispatch(word, EntryTypeName.WIKTIONARY_TITLE_MULTI);
198 appendAndIndexWikiCallback.builder.append(")");
201 // Catch-all for anything else...
202 if (!namedArgs.isEmpty()) {
203 appendAndIndexWikiCallback.builder.append(" {");
204 FunctionCallbacksDefault.appendNamedArgs(namedArgs, appendAndIndexWikiCallback);
205 appendAndIndexWikiCallback.builder.append("}");
213 // ------------------------------------------------------------------
// Callback for "qualifier": writes its single positional argument in parentheses.
215 static final class QualifierCallback implements FunctionCallback {
217 public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
218 final Map<String, String> namedArgs,
219 final EnWiktionaryXmlParser parser,
220 final AppendAndIndexWikiCallback appendAndIndexWikiCallback) {
// Exactly one positional argument and no named arguments is the expected shape;
// anything else is logged. NOTE(review): the warning carries no detail after the colon.
221 if (args.size() != 1 || !namedArgs.isEmpty()) {
222 LOG.warning("weird qualifier: ");
// The qualifier text is dispatched with a null entry type.
225 String qualifier = args.get(0);
226 appendAndIndexWikiCallback.builder.append("(");
227 appendAndIndexWikiCallback.dispatch(qualifier, null);
228 appendAndIndexWikiCallback.builder.append(")");
233 // ------------------------------------------------------------------
// Callback for the script/encoding names registered in DEFAULT ("Arab", "Cyrl", ...).
// The wrapper name itself is not written; only the positional arguments are.
235 static final class EncodingCallback implements FunctionCallback {
237 public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
238 final Map<String, String> namedArgs,
239 final EnWiktionaryXmlParser parser,
240 final AppendAndIndexWikiCallback appendAndIndexWikiCallback) {
// Named arguments are not expected here; the offending token is logged.
241 if (!namedArgs.isEmpty()) {
242 LOG.warning("weird encoding: " + wikiTokenizer.token());
244 if (args.size() == 0) {
245 // Things like "{{Jpan}}" exist.
// Arguments are written in order with ", " as the delimiter, each dispatched under
// the entry type currently set on the callback.
249 for (int i = 0; i < args.size(); ++i) {
251 appendAndIndexWikiCallback.builder.append(", ");
253 final String arg = args.get(i);
254 // if (arg.equals(parser.title)) {
255 // parser.titleAppended = true;
257 appendAndIndexWikiCallback.dispatch(arg, appendAndIndexWikiCallback.entryTypeName);
264 // ------------------------------------------------------------------
// Callback for the gender markers registered in DEFAULT ("m", "f", "n", "p", "g").
// Output is "{name|arg|arg...}", written straight to the builder without dispatch.
266 static final class Gender implements FunctionCallback {
268 public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
269 final Map<String, String> namedArgs,
270 final EnWiktionaryXmlParser parser,
271 final AppendAndIndexWikiCallback appendAndIndexWikiCallback) {
// Named arguments are checked for; none of the statements below write them out.
272 if (!namedArgs.isEmpty()) {
275 appendAndIndexWikiCallback.builder.append("{");
276 appendAndIndexWikiCallback.builder.append(name);
// Every positional argument is appended, each preceded by "|".
277 for (int i = 0; i < args.size(); ++i) {
278 appendAndIndexWikiCallback.builder.append("|").append(args.get(i));
280 appendAndIndexWikiCallback.builder.append("}");
285 // ------------------------------------------------------------------
// Callback shared by the "l" and "term" link templates. It writes the link's display
// text, then optional transliteration and gloss in parentheses, then leftover named args.
287 static final class l_term implements FunctionCallback {
289 public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
290 final Map<String, String> namedArgs,
291 final EnWiktionaryXmlParser parser,
292 final AppendAndIndexWikiCallback appendAndIndexWikiCallback) {
294 // for {{l}}, lang is arg 0, but not for {{term}}
// NOTE(review): presumably the "term" case adjusts args so that index 0 is the
// language slot, matching the indices used below - confirm.
295 if (name.equals("term")) {
// The entry type depends on the parser state; any state other than the two listed
// raises IllegalStateException.
299 final EntryTypeName entryTypeName;
300 switch (parser.state) {
301 case TRANSLATION_LINE: entryTypeName = EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT; break;
302 case ENGLISH_DEF_OF_FOREIGN: entryTypeName = EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK; break;
303 default: throw new IllegalStateException("Invalid enum value: " + parser.state);
// Choose the index to file under: "en" goes to the English index; an empty code
// and every other code go to the foreign index.
306 final String langCode = args.get(0);
307 final IndexBuilder indexBuilder;
308 if ("".equals(langCode)) {
309 indexBuilder = parser.foreignIndexBuilder;
310 } else if ("en".equals(langCode)) {
311 indexBuilder = parser.enIndexBuilder;
313 indexBuilder = parser.foreignIndexBuilder;
// Display text comes from arg 2, falling back to arg 1 when arg 2 is empty.
316 String displayText = ListUtil.get(args, 2, "");
317 if (displayText.equals("")) {
318 displayText = ListUtil.get(args, 1, null);
321 if (displayText != null) {
322 appendAndIndexWikiCallback.dispatch(displayText, indexBuilder, entryTypeName);
324 LOG.warning("no display text: " + wikiTokenizer.token());
// "tr" is removed from namedArgs, written in parentheses and dispatched to the
// chosen index as WIKTIONARY_TRANSLITERATION.
327 final String tr = namedArgs.remove("tr");
329 appendAndIndexWikiCallback.builder.append(" (");
330 appendAndIndexWikiCallback.dispatch(tr, indexBuilder, EntryTypeName.WIKTIONARY_TRANSLITERATION);
331 appendAndIndexWikiCallback.builder.append(")");
// Arg 3, when non-empty, is a gloss: written in parentheses and always dispatched
// to the English index as WIKTIONARY_ENGLISH_DEF.
334 final String gloss = ListUtil.get(args, 3, "");
335 if (!gloss.equals("")) {
336 appendAndIndexWikiCallback.builder.append(" (");
337 appendAndIndexWikiCallback.dispatch(gloss, parser.enIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF);
338 appendAndIndexWikiCallback.builder.append(")");
// Named args that survive the USELESS_WIKI_ARGS filter are echoed as "{name|key=value...}".
341 namedArgs.keySet().removeAll(EnWiktionaryXmlParser.USELESS_WIKI_ARGS);
342 if (!namedArgs.isEmpty()) {
343 appendAndIndexWikiCallback.builder.append(" {").append(name);
344 appendNamedArgs(namedArgs, appendAndIndexWikiCallback);
345 appendAndIndexWikiCallback.builder.append("}");
352 // ------------------------------------------------------------------
// Callback that writes only its first positional argument, dispatched as
// WIKTIONARY_TRANSLATION_OTHER_TEXT.
354 static final class AppendArg0 implements FunctionCallback {
356 public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
357 final Map<String, String> namedArgs,
358 final EnWiktionaryXmlParser parser,
359 final AppendAndIndexWikiCallback appendAndIndexWikiCallback) {
// Expected shape is exactly one positional argument and no named arguments.
360 if (args.size() != 1 || !namedArgs.isEmpty()) {
363 appendAndIndexWikiCallback.dispatch(args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
364 // TODO: transliteration
369 // ------------------------------------------------------------------
// Callback for "italbrac": writes its single positional argument in parentheses,
// dispatched as WIKTIONARY_TRANSLATION_OTHER_TEXT.
371 static final class italbrac implements FunctionCallback {
373 public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
374 final Map<String, String> namedArgs,
375 final EnWiktionaryXmlParser parser,
376 final AppendAndIndexWikiCallback appendAndIndexWikiCallback) {
// Expected shape is exactly one positional argument and no named arguments.
377 if (args.size() != 1 || !namedArgs.isEmpty()) {
380 appendAndIndexWikiCallback.builder.append("(");
381 appendAndIndexWikiCallback.dispatch(args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
382 appendAndIndexWikiCallback.builder.append(")");
387 // ------------------------------------------------------------------
// Callback for "gloss": writes its single positional argument in parentheses,
// dispatched as WIKTIONARY_TRANSLATION_OTHER_TEXT. The visible statements are the
// same as those of italbrac.
389 static final class gloss implements FunctionCallback {
391 public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
392 final Map<String, String> namedArgs,
393 final EnWiktionaryXmlParser parser,
394 final AppendAndIndexWikiCallback appendAndIndexWikiCallback) {
// Expected shape is exactly one positional argument and no named arguments.
395 if (args.size() != 1 || !namedArgs.isEmpty()) {
398 appendAndIndexWikiCallback.builder.append("(");
399 appendAndIndexWikiCallback.dispatch(args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
400 appendAndIndexWikiCallback.builder.append(")");
405 // ------------------------------------------------------------------
// Callback registered in DEFAULT for request/maintenance names such as "trreq",
// "rfdef" and "attention". It has no statements that write to the builder or to
// an index. NOTE(review): confirm the value it returns to the caller.
407 static final class Ignore implements FunctionCallback {
409 public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
410 final Map<String, String> namedArgs,
411 final EnWiktionaryXmlParser parser,
412 final AppendAndIndexWikiCallback appendAndIndexWikiCallback) {
417 // ------------------------------------------------------------------
// Callback for "not used": writes the fixed text "(not used)" and ignores all arguments.
419 static final class not_used implements FunctionCallback {
421 public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
422 final Map<String, String> namedArgs,
423 final EnWiktionaryXmlParser parser,
424 final AppendAndIndexWikiCallback appendAndIndexWikiCallback) {
425 appendAndIndexWikiCallback.builder.append("(not used)");
431 // ------------------------------------------------------------------
// Callback that writes the function's own name verbatim; registered in DEFAULT for "...".
433 static final class AppendName implements FunctionCallback {
435 public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
436 final Map<String, String> namedArgs,
437 final EnWiktionaryXmlParser parser,
438 final AppendAndIndexWikiCallback appendAndIndexWikiCallback) {
// Expected shape is a bare name with no positional or named arguments.
439 if (!args.isEmpty() || !namedArgs.isEmpty()) {
442 appendAndIndexWikiCallback.builder.append(name);
447 // --------------------------------------------------------------------
448 // --------------------------------------------------------------------
// Callback for the "... of" templates registered in DEFAULT ("plural of",
// "past participle of", "form of", ...). It marks the entry as a form of another
// word, writes "{formName|remaining args}" and files the entry under the base form.
// The args list passed in is modified.
451 static final class FormOf implements FunctionCallback {
453 public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
454 final Map<String, String> namedArgs,
455 final EnWiktionaryXmlParser parser,
456 final AppendAndIndexWikiCallback appendAndIndexWikiCallback) {
457 parser.entryIsFormOfSomething = true;
// For the generic "form of", the form name is taken from positional arg 0 instead.
// NOTE(review): presumably ListUtil.remove deletes that element and returns it
// (or the supplied default when absent) - confirm.
458 String formName = name;
459 if (name.equals("form of")) {
460 formName = ListUtil.remove(args, 0, null);
462 if (formName == null) {
463 LOG.warning("Missing form name: " + parser.title);
464 formName = "form of";
// The base form is read from arg 1, falling back to arg 0 when arg 1 is empty.
// The ListUtil.remove calls then take the consumed slots out of args so they are
// not written again below.
466 String baseForm = ListUtil.get(args, 1, "");
467 if ("".equals(baseForm)) {
468 baseForm = ListUtil.get(args, 0, null);
469 ListUtil.remove(args, 1, "");
471 ListUtil.remove(args, 0, null);
473 namedArgs.keySet().removeAll(EnWiktionaryXmlParser.USELESS_WIKI_ARGS);
// Whatever is left is written as "{formName|arg...|key=value...}".
475 appendAndIndexWikiCallback.builder.append("{");
476 NAME_AND_ARGS.onWikiFunction(wikiTokenizer, formName, args, namedArgs, parser, appendAndIndexWikiCallback);
477 appendAndIndexWikiCallback.builder.append("}");
// The current entry is added to the foreign index under the base form, but only
// when a base form exists and the callback has an indexed entry.
478 if (baseForm != null && appendAndIndexWikiCallback.indexedEntry != null) {
479 parser.foreignIndexBuilder.addEntryWithString(appendAndIndexWikiCallback.indexedEntry, baseForm, EntryTypeName.WIKTIONARY_BASE_FORM_MULTI);
481 // null baseForm happens in Danish.
482 LOG.warning("Null baseform: " + parser.title);
// Shared FormOf instance.
488 static final FormOf FORM_OF = new FormOf();
491 // --------------------------------------------------------------------
492 // --------------------------------------------------------------------
// Callback for "wikipedia". The named argument "lang" is discarded, then the call is
// classified by shape: more than one positional argument or any remaining named
// argument in the first branch, exactly one positional argument in the second.
// NOTE(review): confirm what each branch does and returns.
494 static final class wikipedia implements FunctionCallback {
496 public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
497 final Map<String, String> namedArgs,
498 final EnWiktionaryXmlParser parser,
499 final AppendAndIndexWikiCallback appendAndIndexWikiCallback) {
500 namedArgs.remove("lang");
501 if (args.size() > 1 || !namedArgs.isEmpty()) {
504 } else if (args.size() == 1) {
// Callback for the headword-line templates "infl" and "head". It writes the headword,
// gender markers, transliteration, part of speech and inflected forms, and records
// forms in parser.wordForms. The namedArgs map passed in is modified.
512 static final class InflOrHead implements FunctionCallback {
514 public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
515 final Map<String, String> namedArgs,
516 final EnWiktionaryXmlParser parser,
517 final AppendAndIndexWikiCallback appendAndIndexWikiCallback) {
518 // See: http://en.wiktionary.org/wiki/Template:infl
// NOTE(review): langCode is not read by any statement below - confirm whether it is needed.
519 final String langCode = ListUtil.get(args, 0);
// The headword comes from the named arg "head"; "title" is also consulted.
520 String head = namedArgs.remove("head");
522 head = namedArgs.remove("title"); // Bug
// Tells the parser that the title has been written.
527 parser.titleAppended = true;
529 namedArgs.keySet().removeAll(EnWiktionaryXmlParser.USELESS_WIKI_ARGS);
// Transliteration and up to three gender markers ("g" or "gender", "g2", "g3")
// are taken out of namedArgs.
531 final String tr = namedArgs.remove("tr");
532 String g = namedArgs.remove("g");
534 g = namedArgs.remove("gender");
536 final String g2 = namedArgs.remove("g2");
537 final String g3 = namedArgs.remove("g3");
539 appendAndIndexWikiCallback.dispatch(head, EntryTypeName.WIKTIONARY_TITLE_MULTI);
// Gender markers are written verbatim as "{g|g2|g3}".
542 appendAndIndexWikiCallback.builder.append(" {").append(g);
544 appendAndIndexWikiCallback.builder.append("|").append(g2);
547 appendAndIndexWikiCallback.builder.append("|").append(g3);
549 appendAndIndexWikiCallback.builder.append("}");
// The transliteration is written in parentheses and also recorded as a word form.
// NOTE(review): it is dispatched as WIKTIONARY_TITLE_MULTI, whereas the other
// callbacks in this file use WIKTIONARY_TRANSLITERATION for "tr" - confirm intent.
553 appendAndIndexWikiCallback.builder.append(" (");
554 appendAndIndexWikiCallback.dispatch(tr, EntryTypeName.WIKTIONARY_TITLE_MULTI);
555 appendAndIndexWikiCallback.builder.append(")");
556 parser.wordForms.add(tr);
// Positional arg 1 is written verbatim in parentheses (the part of speech,
// going by the variable name).
559 final String pos = ListUtil.get(args, 1);
561 appendAndIndexWikiCallback.builder.append(" (").append(pos).append(")");
// Positional args from index 2 on are consumed as (name, value) pairs; a non-empty
// value is written after ": " and recorded as a word form.
563 for (int i = 2; i < args.size(); i += 2) {
564 final String inflName = ListUtil.get(args, i);
565 final String inflValue = ListUtil.get(args, i + 1);
566 appendAndIndexWikiCallback.builder.append(", ");
567 appendAndIndexWikiCallback.dispatch(inflName, null, null);
568 if (inflValue != null && inflValue.length() > 0) {
569 appendAndIndexWikiCallback.builder.append(": ");
570 appendAndIndexWikiCallback.dispatch(inflValue, null, null);
571 parser.wordForms.add(inflValue);
// Named args still left are written as " key=value"; the value goes through
// WikiTokenizer.toPlainText first and is recorded as a word form.
574 for (final String key : namedArgs.keySet()) {
575 final String value = WikiTokenizer.toPlainText(namedArgs.get(key));
576 appendAndIndexWikiCallback.builder.append(" ");
577 appendAndIndexWikiCallback.dispatch(key, null, null);
578 appendAndIndexWikiCallback.builder.append("=");
579 appendAndIndexWikiCallback.dispatch(value, null, null);
580 parser.wordForms.add(value);
// Registers the it_noun callback under the name "it-noun".
588 DEFAULT.put("it-noun", new it_noun());
// Callback for "it-noun". Positional args are read as: 0 = stem, 1 = gender,
// 2 = singular ending, 3 = plural ending. Output is " singular {gender}, plural {pl}",
// and both forms are recorded in parser.wordForms.
590 static final class it_noun implements FunctionCallback {
592 public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
593 final Map<String, String> namedArgs,
594 final EnWiktionaryXmlParser parser,
595 final AppendAndIndexWikiCallback appendAndIndexWikiCallback) {
// Tells the parser that the title has been written.
596 parser.titleAppended = true;
597 final String base = ListUtil.get(args, 0);
598 final String gender = ListUtil.get(args, 1);
// NOTE(review): if ListUtil.get returns the supplied default (null) for a missing
// ending, the concatenation yields a form ending in the text "null" - confirm and
// consider an empty-string default.
599 final String singular = base + ListUtil.get(args, 2, null);
600 final String plural = base + ListUtil.get(args, 3, null);
601 appendAndIndexWikiCallback.builder.append(" ");
602 appendAndIndexWikiCallback.dispatch(singular, null, null);
603 appendAndIndexWikiCallback.builder.append(" {").append(gender).append("}, ");
604 appendAndIndexWikiCallback.dispatch(plural, null, null);
605 appendAndIndexWikiCallback.builder.append(" {pl}");
606 parser.wordForms.add(singular);
607 parser.wordForms.add(plural);
// Named arguments, or more than four positional ones, are logged; this happens
// after the output above has already been written.
608 if (!namedArgs.isEmpty() || args.size() > 4) {
609 LOG.warning("Invalid it-noun: " + wikiTokenizer.token());
616 DEFAULT.put("it-proper noun", new it_proper_noun());
618 static final class it_proper_noun implements FunctionCallback {
620 public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
621 final Map<String, String> namedArgs,
622 final EnWiktionaryXmlParser parser,
623 final AppendAndIndexWikiCallback appendAndIndexWikiCallback) {