+ /**
+  * Parses one line of a translation table and files it as an English/other-language pair.
+  *
+  * <p>{@code rest} is tokenized as wiki markup. Plain text, wiki links and the recognised
+  * templates are concatenated into the other-language display text and, where noted below,
+  * registered with {@code otherIndexBuilder}. The English side is built from {@code title},
+  * {@code sense} and {@code pos} and registered with {@code enIndexBuilder}. When no
+  * other-language text is produced, a warning is logged and no pair is added.
+  *
+  * <p>Templates that carry fewer positional arguments than their branch needs are logged and
+  * kept as raw, unindexed text instead of throwing.
+  *
+  * @param line the full source line; used only in log messages
+  * @param lang language label, prefixed to the other-language text as {@code (lang)}; may be null
+  * @param title the English headword; always indexed on the English side
+  * @param pos part of speech, appended lower-cased in parentheses; may be null
+  * @param sense sense gloss, appended in parentheses and indexed; may be null
+  * @param rest the wiki markup holding the translations
+  */
+ private void doTranslationLine(final String line, final String lang, final String title, final String pos, final String sense, final String rest) {
+   // Good chance we'll actually file this one...
+   final PairEntry pairEntry = new PairEntry();
+   final IndexedEntry indexedEntry = new IndexedEntry(pairEntry);
+
+   final StringBuilder otherText = new StringBuilder();
+   final WikiTokenizer wikiTokenizer = new WikiTokenizer(rest, false);
+   while (wikiTokenizer.nextToken() != null) {
+
+     if (wikiTokenizer.isPlainText()) {
+       final String plainText = wikiTokenizer.token();
+       otherText.append(plainText);
+       otherIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
+
+     } else if (wikiTokenizer.isWikiLink()) {
+       final String plainText = wikiTokenizer.wikiLinkText();
+       otherText.append(plainText);
+       otherIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_TRANSLATION_WIKI_TEXT);
+
+     } else if (wikiTokenizer.isFunction()) {
+       final String functionName = wikiTokenizer.functionName();
+       final List<String> args = wikiTokenizer.functionPositionArgs();
+       final Map<String,String> namedArgs = wikiTokenizer.functionNamedArgs();
+
+       if (functionName.equals("t") || functionName.equals("t+") || functionName.equals("t-") || functionName.equals("tø") || functionName.equals("apdx-t")) {
+         if (args.size() < 2) {
+           LOG.warning("{{t}} with too few args: " + line + ", title=" + title);
+           continue;
+         }
+         // Positional arg 0 is the language code (currently unused), arg 1 the translated word.
+         final String word = get(args, 1);
+         // Optional third positional arg; presumably get() yields null when it is absent -- confirm against get().
+         final String gender = get(args, 2);
+         final String transliteration = namedArgs.get("tr");
+         otherText.append(word);
+         otherIndexBuilder.addEntryWithString(indexedEntry, word, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI);
+         if (gender != null) {
+           otherText.append(String.format(" {%s}", gender));
+         }
+         if (transliteration != null) {
+           otherText.append(String.format(TRANSLITERATION_FORMAT, transliteration));
+           otherIndexBuilder.addEntryWithString(indexedEntry, transliteration, EntryTypeName.WIKTIONARY_TRANSLITERATION);
+         }
+       } else if (functionName.equals("qualifier")) {
+         if (args.size() == 0) {
+           otherText.append(wikiTokenizer.token());
+         } else {
+           final String qualifier = args.get(0);
+           if (!namedArgs.isEmpty() || args.size() > 1) {
+             LOG.warning("weird qualifier: " + line);
+           }
+           // Unindexed!
+           otherText.append("(").append(qualifier).append(")");
+         }
+       } else if (encodings.contains(functionName)) {
+         if (args.isEmpty()) {
+           appendUnparsedTemplate(otherText, wikiTokenizer.token(), line);
+         } else {
+           otherText.append(args.get(0));
+           otherIndexBuilder.addEntryWithString(indexedEntry, args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
+         }
+       } else if (isGender(functionName)) {
+         appendGender(otherText, functionName, args);
+       } else if (functionName.equals("g")) {
+         otherText.append("{g}");
+       } else if (functionName.equals("l")) {
+         // encodes text in various langs.
+         // lang is arg 0, the text itself is arg 1.
+         if (args.size() < 2) {
+           appendUnparsedTemplate(otherText, wikiTokenizer.token(), line);
+         } else {
+           otherText.append(args.get(1));
+           otherIndexBuilder.addEntryWithString(indexedEntry, args.get(1), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
+         }
+         // TODO: transliteration
+       } else if (functionName.equals("term")) {
+         // cross-reference to another dictionary
+         if (args.isEmpty()) {
+           appendUnparsedTemplate(otherText, wikiTokenizer.token(), line);
+         } else {
+           otherText.append(args.get(0));
+           otherIndexBuilder.addEntryWithString(indexedEntry, args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
+         }
+         // TODO: transliteration
+       } else if (functionName.equals("italbrac") || functionName.equals("gloss")) {
+         // TODO: put this text aside to use it.
+         if (args.isEmpty()) {
+           appendUnparsedTemplate(otherText, wikiTokenizer.token(), line);
+         } else {
+           otherText.append("[").append(args.get(0)).append("]");
+           otherIndexBuilder.addEntryWithString(indexedEntry, args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
+         }
+       } else if (functionName.equals("ttbc")) {
+         LOG.warning("Unexpected {{ttbc}}");
+       } else if (functionName.equals("trreq")) {
+         // Contributes nothing to the entry.
+       } else if (functionName.equals("not used")) {
+         otherText.append("(not used)");
+       } else if (functionName.equals("t-image")) {
+         // American sign language
+       } else {
+         // Unindexed!
+         otherText.append(wikiTokenizer.token());
+       }
+
+     } else if (wikiTokenizer.isNewline()) {
+       // A newline token is not expected inside a single translation line.
+       assert false;
+     } else if (wikiTokenizer.isComment()) {
+       // Comments contribute nothing.
+     } else if (wikiTokenizer.isMarkup()) {
+       // Markup tokens contribute nothing.
+     } else {
+       LOG.warning("Bad translation token: " + wikiTokenizer.token());
+     }
+   }
+   if (otherText.length() == 0) {
+     LOG.warning("Empty otherText: " + line);
+     return;
+   }
+
+   if (lang != null) {
+     otherText.insert(0, String.format("(%s) ", lang));
+   }
+
+   final StringBuilder englishText = new StringBuilder();
+
+   englishText.append(title);
+   if (sense != null) {
+     englishText.append(" (").append(sense).append(")");
+     enIndexBuilder.addEntryWithString(indexedEntry, sense, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE);
+   }
+   if (pos != null) {
+     // Locale.ROOT keeps the lower-casing independent of the machine's default locale.
+     englishText.append(" (").append(pos.toLowerCase(java.util.Locale.ROOT)).append(")");
+   }
+   enIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI);
+
+   final Pair pair = new Pair(trim(englishText.toString()), trim(otherText.toString()), swap);
+   pairEntry.pairs.add(pair);
+   final String pairString = pair.toString();
+   if (!pairsAdded.add(pairString)) {
+     LOG.warning("Duplicate pair: " + pairString);
+   }
+
+ }
+
+ /**
+  * Logs a template that has fewer positional arguments than its handler needs and appends its
+  * raw token text to {@code otherText} without indexing it.
+  *
+  * @param otherText the other-language display text being built
+  * @param rawToken the template's token text as returned by the tokenizer
+  * @param line the full source line; used only in the log message
+  */
+ private void appendUnparsedTemplate(final StringBuilder otherText, final String rawToken, final String line) {
+   LOG.warning("Template with too few args: " + rawToken + ", line=" + line);
+   // Unindexed!
+   otherText.append(rawToken);
+ }
+
+
+ /**
+  * Appends a gender template to {@code otherText} in brace form: the function name followed by
+  * each positional argument, every argument preceded by {@code |}, all wrapped in
+  * <code>{</code> and <code>}</code>.
+  *
+  * @param otherText the text buffer to append to
+  * @param functionName the template name, written first inside the braces
+  * @param args the template's positional arguments, written in order
+  */
+ private void appendGender(final StringBuilder otherText,
+     final String functionName, final List<String> args) {
+   otherText.append("{").append(functionName);
+   for (final String arg : args) {
+     otherText.append("|").append(arg);
+   }
+   otherText.append("}");
+ }
+
+
+ private boolean isGender(final String functionName) {
+ return functionName.equals("m") || functionName.equals("f") || functionName.equals("n") || functionName.equals("p");