src/com/hughes/android/dictionary/parser/wiktionary/EnToTranslationParser.java

   1 // Copyright 2012 Google Inc. All Rights Reserved.
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //     http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 package com.hughes.android.dictionary.parser.wiktionary;
  16
  17 import java.util.List;
  18 import java.util.regex.Pattern;
  19
  20 import com.hughes.android.dictionary.engine.EntryTypeName;
  21 import com.hughes.android.dictionary.engine.IndexBuilder;
  22 import com.hughes.android.dictionary.engine.IndexedEntry;
  23 import com.hughes.android.dictionary.engine.PairEntry;
  24 import com.hughes.android.dictionary.engine.PairEntry.Pair;
  25 import com.hughes.android.dictionary.parser.WikiTokenizer;
  26
  27 public final class EnToTranslationParser extends EnParser {
  28
  29     public EnToTranslationParser(final IndexBuilder enIndexBuilder,
  30                                  final IndexBuilder otherIndexBuilder, final Pattern langPattern,
  31                                  final Pattern langCodePattern, final boolean swap) {
  32         super(enIndexBuilder, otherIndexBuilder, langPattern, langCodePattern, swap);
  33     }
  34
  35     @Override
  36     void parseSection(String heading, String text) {
  37         if (isIgnorableTitle(title)) {
  38             return;
  39         }
  40         heading = heading.replace("=", "").trim();
  41         if (!heading.contains("English")) {
  42             return;
  43         }
  44
  45         String pos = null;
  46         int posDepth = -1;
  47
  48         final WikiTokenizer wikiTokenizer = new WikiTokenizer(text);
  49         while (wikiTokenizer.nextToken() != null) {
  50
  51             if (wikiTokenizer.isHeading()) {
  52                 final String headerName = wikiTokenizer.headingWikiText();
  53
  54                 if (wikiTokenizer.headingDepth() <= posDepth) {
  55                     pos = null;
  56                     posDepth = -1;
  57                 }
  58
  59                 if (partOfSpeechHeader.matcher(headerName).matches()) {
  60                     posDepth = wikiTokenizer.headingDepth();
  61                     pos = wikiTokenizer.headingWikiText();
  62                     // TODO: if we're inside the POS section, we should handle the first title line...
  63
  64                 } else if (headerName.equals("Translations")) {
  65                     if (pos == null) {
  66                         LOG.info("Translations without POS (but using anyway): " + title);
  67                     }
  68                     doTranslations(wikiTokenizer, pos);
  69                 } else if (headerName.equals("Pronunciation")) {
  70                     //doPronunciation(wikiLineReader);
  71                 }
  72             } else if (wikiTokenizer.isFunction()) {
  73                 final String name = wikiTokenizer.functionName();
  74                 if (name.equals("head") && pos == null) {
  75                     LOG.warning("{{head}} without POS: " + title);
  76                 }
  77             }
  78         }
  79     }
  80
  81     private void doTranslations(final WikiTokenizer wikiTokenizer, final String pos) {
  82         if (title.equals("absolutely")) {
  83             //System.out.println();
  84         }
  85
  86         String topLevelLang = null;
  87         String sense = null;
  88         boolean done = false;
  89         while (wikiTokenizer.nextToken() != null) {
  90             if (wikiTokenizer.isHeading()) {
  91                 wikiTokenizer.returnToLineStart();
  92                 return;
  93             }
  94             if (done) {
  95                 continue;
  96             }
  97
  98             // Check whether we care about this line:
  99
 100             if (wikiTokenizer.isFunction()) {
 101                 final String functionName = wikiTokenizer.functionName();
 102                 final List<String> positionArgs = wikiTokenizer.functionPositionArgs();
 103
 104                 if (functionName.equals("trans-top")) {
 105                     sense = null;
 106                     if (wikiTokenizer.functionPositionArgs().size() >= 1) {
 107                         sense = positionArgs.get(0);
 108                         sense = WikiTokenizer.toPlainText(sense);
 109                         //LOG.info("Sense: " + sense);
 110                     }
 111                 } else if (functionName.equals("trans-bottom")) {
 112                     sense = null;
 113                 } else if (functionName.equals("trans-mid")) {
 114                 } else if (functionName.equals("trans-see")) {
 115                     incrementCount("WARNING:trans-see");
 116                 } else if (functionName.startsWith("picdic")) {
 117                 } else if (functionName.startsWith("checktrans")) {
 118                     done = true;
 119                 } else if (functionName.startsWith("ttbc")) {
 120                     wikiTokenizer.nextLine();
 121                     // TODO: would be great to handle ttbc
 122                     // TODO: Check this: done = true;
 123                 } else {
 124                     LOG.warning("Unexpected translation wikifunction: " + wikiTokenizer.token() + ", title=" + title);
 125                 }
 126             } else if (wikiTokenizer.isListItem()) {
 127                 final String line = wikiTokenizer.listItemWikiText();
 128                 // This line could produce an output...
 129
 130 //          if (line.contains("ich hoan dich gear")) {
 131 //            //System.out.println();
 132 //          }
 133
 134                 // First strip the language and check whether it matches.
 135                 // And hold onto it for sub-lines.
 136                 final int colonIndex = line.indexOf(":");
 137                 if (colonIndex == -1) {
 138                     continue;
 139                 }
 140
 141                 final String lang = trim(WikiTokenizer.toPlainText(line.substring(0, colonIndex)));
 142                 incrementCount("tCount:" + lang);
 143                 final boolean appendLang;
 144                 if (wikiTokenizer.listItemPrefix().length() == 1) {
 145                     topLevelLang = lang;
 146                     final boolean thisFind = langPattern.matcher(lang).find();
 147                     if (!thisFind) {
 148                         continue;
 149                     }
 150                     appendLang = !langPattern.matcher(lang).matches();
 151                 } else if (topLevelLang == null) {
 152                     continue;
 153                 } else {
 154                     // Two-level -- the only way we won't append is if this second level matches exactly.
 155                     if (!langPattern.matcher(lang).matches() && !langPattern.matcher(topLevelLang).find()) {
 156                         continue;
 157                     }
 158                     appendLang = !langPattern.matcher(lang).matches();
 159                 }
 160
 161                 String rest = line.substring(colonIndex + 1).trim();
 162                 if (rest.length() > 0) {
 163                     doTranslationLine(line, appendLang ? lang : null, pos, sense, rest);
 164                 }
 165
 166             } else if (wikiTokenizer.remainderStartsWith("''See''")) {
 167                 wikiTokenizer.nextLine();
 168                 incrementCount("WARNING: ''See''" );
 169                 LOG.fine("Skipping See line: " + wikiTokenizer.token());
 170             } else if (wikiTokenizer.isWikiLink()) {
 171                 final String wikiLink = wikiTokenizer.wikiLinkText();
 172                 if (wikiLink.contains(":") && wikiLink.contains(title)) {
 173                 } else if (wikiLink.contains("Category:")) {
 174                 } else  {
 175                     incrementCount("WARNING: Unexpected wikiLink" );
 176                     LOG.warning("Unexpected wikiLink: " + wikiTokenizer.token() + ", title=" + title);
 177                 }
 178             } else if (wikiTokenizer.isNewline() || wikiTokenizer.isMarkup() || wikiTokenizer.isComment()) {
 179             } else {
 180                 final String token = wikiTokenizer.token();
 181                 if (token.equals("----")) {
 182                 } else {
 183                     LOG.warning("Unexpected translation token: " + wikiTokenizer.token() + ", title=" + title);
 184                     incrementCount("WARNING: Unexpected translation token" );
 185                 }
 186             }
 187
 188         }
 189     }
 190
 191     private void doTranslationLine(final String line, final String lang, final String pos, final String sense, final String rest) {
 192         state = State.TRANSLATION_LINE;
 193         // Good chance we'll actually file this one...
 194         final PairEntry pairEntry = new PairEntry(entrySource);
 195         final IndexedEntry indexedEntry = new IndexedEntry(pairEntry);
 196         indexedEntry.isValid = true;
 197
 198         final StringBuilder foreignText = new StringBuilder();
 199         appendAndIndexWikiCallback.reset(foreignText, indexedEntry);
 200         appendAndIndexWikiCallback.dispatch(rest, foreignIndexBuilder, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
 201
 202         if (foreignText.length() == 0) {
 203             LOG.warning("Empty foreignText: " + line);
 204             incrementCount("WARNING: Empty foreignText" );
 205             return;
 206         }
 207
 208         if (lang != null) {
 209             foreignText.insert(0, String.format("(%s) ", lang));
 210         }
 211
 212         StringBuilder englishText = new StringBuilder();
 213
 214         englishText.append(title);
 215         if (sense != null) {
 216             englishText.append(" (").append(sense).append(")");
 217             enIndexBuilder.addEntryWithString(indexedEntry, sense, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE);
 218         }
 219         if (pos != null) {
 220             englishText.append(" (").append(pos.toLowerCase()).append(")");
 221         }
 222         enIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_MULTI);
 223
 224         final Pair pair = new Pair(trim(englishText.toString()), trim(foreignText.toString()), swap);
 225         pairEntry.pairs.add(pair);
 226         if (!pairsAdded.add(pair.toString())) {
 227             LOG.warning("Duplicate pair: " + pair.toString());
 228             incrementCount("WARNING: Duplicate pair" );
 229         }
 230     }
 231 }  // EnToTranslationParser