1 // Copyright 2012 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 package com.hughes.android.dictionary.parser.wiktionary;
17 import java.util.List;
19 import java.util.regex.Pattern;
21 import com.hughes.android.dictionary.engine.EntryTypeName;
22 import com.hughes.android.dictionary.engine.IndexBuilder;
23 import com.hughes.android.dictionary.engine.IndexedEntry;
24 import com.hughes.android.dictionary.engine.PairEntry;
25 import com.hughes.android.dictionary.engine.PairEntry.Pair;
26 import com.hughes.android.dictionary.parser.WikiTokenizer;
/**
 * Wiktionary parser built on {@link AbstractWiktionaryParser} that is configured with an
 * array of {@link IndexBuilder}s and an array of name {@link Pattern}s.
 *
 * NOTE(review): presumably indexBuilders[i] and namePatterns[i] describe the same
 * language/index; nothing in the visible code establishes that pairing, so confirm
 * against the caller that constructs this parser.
 */
28 public final class EnTranslationToTranslationParser extends AbstractWiktionaryParser {
// Index builders supplied by the caller; stored by reference (see constructor).
30 final IndexBuilder[] indexBuilders;
// Name patterns supplied by the caller; stored by reference (see constructor).
31 final Pattern[] namePatterns;
/**
 * Creates a parser that keeps the given arrays as-is.
 *
 * Both arrays are stored by reference without a defensive copy, so later changes made
 * to them by the caller are visible to this parser.
 *
 * @param indexBuilders index builders to keep in {@code this.indexBuilders}
 * @param namePatterns  patterns to keep in {@code this.namePatterns}
 */
33 public EnTranslationToTranslationParser(final IndexBuilder[] indexBuilders,
34 final Pattern[] namePatterns) {
35 this.indexBuilders = indexBuilders;
36 this.namePatterns = namePatterns;
/**
 * Drops every entry of {@code namedArgs} whose key is contained in
 * {@code EnParser.USELESS_WIKI_ARGS}.
 *
 * The map is modified in place: {@code keySet()} is a view backed by the map, so removing
 * keys from it removes the corresponding mappings.
 *
 * @param namedArgs named template arguments to filter; must support removal
 */
40 void removeUselessArgs(Map<String, String> namedArgs) {
41 namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS);
/**
 * Tokenizes {@code text} and looks at each heading token for one whose wiki text is
 * exactly "Translations".
 *
 * In the visible code the call that would process such a section
 * ({@code doTranslations}) is commented out, so a matching heading currently triggers
 * no further work here.
 *
 * @param heading section heading; not read by any of the visible statements
 * @param text    wiki text of the section, handed to a new {@link WikiTokenizer}
 */
45 void parseSection(String heading, String text) {
// `title` is not declared in this block; presumably a field provided by the parser
// base class -- TODO confirm. The body of this check is not visible in this view
// (presumably an early return for ignorable titles).
46 if (EnParser.isIgnorableTitle(title)) {
49 final WikiTokenizer wikiTokenizer = new WikiTokenizer(text);
// Walk every token; nextToken() returning null ends the loop.
50 while (wikiTokenizer.nextToken() != null) {
51 if (wikiTokenizer.isHeading()) {
52 final String headerName = wikiTokenizer.headingWikiText();
// Exact, case-sensitive comparison against the heading's wiki text.
53 if (headerName.equals("Translations")) {
54 //doTranslations(wikiTokenizer);
57 // TODO: optimization: skip to next heading, or even skip to translations.
/**
 * Consumes tokens of a translations section from {@code wikiTokenizer}, reacting to
 * headings, template functions and list items.
 *
 * List items are split at the first ':' into a language label (converted to plain text
 * and trimmed) and the remaining text; a non-empty remainder is passed on to
 * {@code doTranslationLine}.
 *
 * NOTE(review): in the visible portion of the file the only call to this method is
 * commented out, and several names used here ({@code langPattern}, {@code title},
 * {@code LOG}, {@code incrementCount}, {@code trim}) are not declared in this block --
 * presumably inherited or declared elsewhere; confirm.
 *
 * @param wikiTokenizer tokenizer positioned inside the translations section
 */
62 private void doTranslations(final WikiTokenizer wikiTokenizer) {
// Language label of the enclosing top-level list item; it is only read below, its
// assignment is not visible in this view.
63 String topLevelLang = null;
// Assigned when a "trans-top" with at least one positional arg is seen; not read by
// any of the visible statements.
65 StringBuilder[] builders;
66 while (wikiTokenizer.nextToken() != null) {
67 if (wikiTokenizer.isHeading()) {
// A heading is handed back to the tokenizer; presumably this ends the translations
// section so the caller can re-read the heading -- the following statement is not
// visible here.
68 wikiTokenizer.returnToLineStart();
75 // Check whether we care about this line:
76 if (wikiTokenizer.isFunction()) {
77 final String functionName = wikiTokenizer.functionName();
// NOTE(review): positionArgs is fetched here, but the check below calls
// functionPositionArgs() again instead of using this local.
78 final List<String> positionArgs = wikiTokenizer.functionPositionArgs();
80 if (functionName.equals("trans-top")) {
81 if (wikiTokenizer.functionPositionArgs().size() >= 1) {
// Fresh pair of builders for this translation table.
82 builders = new StringBuilder[] {new StringBuilder(), new StringBuilder()};
// The following template names are recognised explicitly; the visible branches for
// them contain no statements.
84 } else if (functionName.equals("trans-bottom")) {
86 } else if (functionName.equals("trans-mid")) {
87 } else if (functionName.equals("trans-see")) {
88 } else if (functionName.startsWith("picdic")) {
89 } else if (functionName.startsWith("checktrans")) {
91 } else if (functionName.startsWith("ttbc")) {
// ttbc lines are not handled yet; presumably nextLine() skips the rest of the line.
92 wikiTokenizer.nextLine();
93 // TODO: would be great to handle ttbc
94 // TODO: Check this: done = true;
// Presumably the fall-through branch for any other template name (the `else` itself
// is not visible in this view).
96 LOG.warning("Unexpected translation wikifunction: " + wikiTokenizer.token() + ", title=" + title);
98 } else if (wikiTokenizer.isListItem()) {
99 final String line = wikiTokenizer.listItemWikiText();
100 // This line could produce an output...
102 // First strip the language and check whether it matches.
103 // And hold onto it for sub-lines.
104 final int colonIndex = line.indexOf(":");
// No ':' means there is no "language: text" split; the handling of that case is not
// visible in this view.
105 if (colonIndex == -1) {
// Everything before the first ':' is the language label.
109 final String lang = trim(WikiTokenizer.toPlainText(line.substring(0, colonIndex)));
// Per-language counter keyed as "tCount:<lang>".
110 incrementCount("tCount:" + lang);
113 final boolean appendLang;
// A one-character list prefix is presumably a top-level item; longer prefixes are
// treated as nested ("two-level") items below.
114 if (wikiTokenizer.listItemPrefix().length() == 1) {
// find() succeeds on a partial match; thisFind is not read by any visible statement.
116 final boolean thisFind = langPattern.matcher(lang).find();
// Keep the language label in the output unless it matches the pattern in full.
120 appendLang = !langPattern.matcher(lang).matches();
121 } else if (topLevelLang == null) {
124 // Two-level -- the only way we won't append is if this second level matches exactly.
// True when the sub-language is not a full match AND the top-level language contains
// no match at all; the body of this branch is not visible in this view.
125 if (!langPattern.matcher(lang).matches() && !langPattern.matcher(topLevelLang).find()) {
128 appendLang = !langPattern.matcher(lang).matches();
// Text after the first ':' is the translation payload.
131 String rest = line.substring(colonIndex + 1).trim();
132 if (rest.length() > 0) {
// NOTE(review): this passes three arguments, while the doTranslationLine declared in
// this file takes five (line, lang, pos, sense, rest) -- confirm which signature is
// intended.
133 doTranslationLine(line, appendLang ? lang : null, rest);
/**
 * Builds one English/foreign {@link Pair} for a translation line and registers it with
 * the index builders.
 *
 * The foreign side is produced by dispatching {@code rest} through
 * {@code appendAndIndexWikiCallback}; the English side is assembled from {@code title},
 * {@code sense} and {@code pos}. Duplicate pairs (by {@code Pair.toString()}) are logged
 * and counted.
 *
 * NOTE(review): {@code state}, {@code entrySource}, {@code appendAndIndexWikiCallback},
 * {@code foreignIndexBuilder}, {@code enIndexBuilder}, {@code title}, {@code swap} and
 * {@code pairsAdded} are not declared in this block -- presumably fields declared
 * elsewhere; confirm. The conditions guarding the lang/sense/pos statements are not
 * visible in this view.
 *
 * @param line  original list-item text; in the visible code only used in a warning
 * @param lang  language label to prefix to the foreign text as "(lang) "
 * @param pos   part of speech, appended lower-cased in parentheses to the English text
 * @param sense sense gloss, appended in parentheses to the English text and indexed
 * @param rest  wiki text of the translation itself
 */
139 private void doTranslationLine(final String line, final String lang, final String pos, final String sense, final String rest) {
140 state = State.TRANSLATION_LINE;
141 // Good chance we'll actually file this one...
142 final PairEntry pairEntry = new PairEntry(entrySource);
143 final IndexedEntry indexedEntry = new IndexedEntry(pairEntry);
// Render `rest` into foreignText; the callback is also given indexedEntry and the
// foreign index builder (presumably to index the text as it is appended).
145 final StringBuilder foreignText = new StringBuilder();
146 appendAndIndexWikiCallback.reset(foreignText, indexedEntry);
147 appendAndIndexWikiCallback.dispatch(rest, foreignIndexBuilder, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
// Nothing was rendered for this line: log it and bump a warning counter. What happens
// next (presumably an early return) is not visible in this view.
149 if (foreignText.length() == 0) {
150 LOG.warning("Empty foreignText: " + line);
151 incrementCount("WARNING: Empty foreignText" );
// Prefix the foreign text with the language label, e.g. "(lang) ..."; presumably only
// when lang is non-null -- the guard is not visible here.
156 foreignText.insert(0, String.format("(%s) ", lang));
// English side: title, then optional "(sense)" and "(pos)" suffixes.
159 StringBuilder englishText = new StringBuilder();
161 englishText.append(title);
163 englishText.append(" (").append(sense).append(")");
164 enIndexBuilder.addEntryWithString(indexedEntry, sense, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE);
// NOTE(review): toLowerCase() without a Locale uses the JVM default locale.
167 englishText.append(" (").append(pos.toLowerCase()).append(")");
169 enIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_MULTI);
171 final Pair pair = new Pair(trim(englishText.toString()), trim(foreignText.toString()), swap);
172 pairEntry.pairs.add(pair);
// add() returning false means this pair's string form was already recorded.
173 if (!pairsAdded.add(pair.toString())) {
174 LOG.warning("Duplicate pair: " + pair.toString());
175 incrementCount("WARNING: Duplicate pair" );