-Setup new ICU
+Handle examples.
+Handle word-info in English.
+
Bad ordering:
===do===
do {{wikipedia|Do (nota)|lang=it}}{{infl|it|noun|g=m}} :: do, the musical note
fare {{it-verb}} {{transitive}} :: To do
-
sub-levels in translations.
-examples.
in wiktionary
futurismo :: futurism () (noun)
-
-
\ No newline at end of file
new Lang("^English$", "EN", null, "en.txt"),
};
Lang[] langs2 = new Lang[] {
- new Lang("^.*Italian.*$", "IT", "italian.data", "it.txt"),
+ //new Lang("^.*Italian.*$", "IT", "italian.data", "it.txt"),
new Lang("^.*French.*$", "FR", "french.data", "empty.txt"),
new Lang("^.*Spanish.*$", "ES", "spanish.data", "empty.txt"),
new Lang("^.*Greek.*$", "EL", "greek.data", "empty.txt"),
// -------------------------------------------------------------------------
- String pos = null;
- int posDepth = -1;
-
private void doEnglishWord(String title, String text) {
+
+ String pos = null;
+ int posDepth = -1;
+
final WikiTokenizer wikiTokenizer = new WikiTokenizer(text);
while (wikiTokenizer.nextToken() != null) {
posDepth = wikiTokenizer.headingDepth();
pos = wikiTokenizer.headingWikiText();
} else if (headerName.equals("Translations")) {
- doTranslations(title, wikiTokenizer);
+ if (pos == null) {
+ LOG.warning("Translations without POS: " + title);
+ }
+ doTranslations(title, wikiTokenizer, pos);
} else if (headerName.equals("Pronunciation")) {
//doPronunciation(wikiLineReader);
}
"Jpan", "Kore", "Hebr", "rfscript", "Beng", "Mong", "Knda", "Cyrs",
"yue-tsj", "Mlym", "Tfng", "Grek", "yue-yue-j"));
- private void doTranslations(final String title, final WikiTokenizer wikiTokenizer) {
+ private void doTranslations(final String title, final WikiTokenizer wikiTokenizer, final String pos) {
if (title.equals("absolutely")) {
System.out.println();
}
} else {
LOG.warning("Unexpected translation wikifunction: " + wikiTokenizer.token() + ", title=" + title);
}
- } else if (wikiTokenizer.isListItem() && wikiTokenizer.listItemPrefix().startsWith("*")) {
+ } else if (wikiTokenizer.isListItem()) {
final String line = wikiTokenizer.listItemWikiText();
// This line could produce an output...
String rest = line.substring(colonIndex + 1).trim();
if (rest.length() > 0) {
- doTranslationLine(line, title, sense, rest);
+ doTranslationLine(line, title, pos, sense, rest);
} else {
// TODO: do lines that are like Greek:
}
return index < list.size() ? list.get(index) : null;
}
- private void doTranslationLine(final String line, final String title, final String sense, final String rest) {
+ private void doTranslationLine(final String line, final String title, final String pos, final String sense, final String rest) {
// Good chance we'll actually file this one...
final PairEntry pairEntry = new PairEntry();
final IndexedEntry indexedEntry = new IndexedEntry(pairEntry);
public String token() {
final String token = wikiText.substring(start, end);
- assert token.equals("\n") || !token.endsWith("\n") : token;
+ assert token.equals("\n") || !token.endsWith("\n") : "token='" + token + "'";
return token;
}
final boolean insideFunction = toFind.equals("}}");
int end = start;
+ int firstNewline = -1;
while (end < wikiText.length()) {
if (matcher.find(end)) {
final String matchText = matcher.group();
assert matcher.end() > end || matchText.length() == 0: "Group=" + matcher.group();
if (matchText.length() == 0) {
assert matchStart == wikiText.length() || wikiText.charAt(matchStart) == '\n';
+ if (firstNewline == -1) {
+ firstNewline = matcher.end();
+ }
if (tokenStack.isEmpty() && toFind.equals("\n")) {
return matchStart;
}
// Inside the while loop. Just go forward.
end = Math.max(end, matcher.end());
}
+ if (toFind.equals("\n") && tokenStack.isEmpty()) {
+ // We were looking for the end, we got it.
+ return end;
+ }
+ if (firstNewline != -1) {
+ errors.add("Couldn't find: " + toFind + ", "+ wikiText.substring(start));
+ return firstNewline;
+ }
return end;
}
assertEquals("202", new WikiTokenizer(wikiText).nextToken().functionNamedArgs().get("arg2"));
assertEquals("{{n1|n2=7|n3}}", new WikiTokenizer(wikiText).nextToken().functionNamedArgs().get("arg3"));
+ wikiText = "{{gloss|asdf}\nAsdf\n\n";
+ assertEquals("{{gloss|asdf}", new WikiTokenizer(wikiText).nextToken().token());
+
+ wikiText = "#*{{quote-book|year=1960|author={{w|P. G. Wodehouse}}\n" +
+ "|title={{w|Jeeves in the Offing}}\n" +
+ "|section=chapter XI\n" +
+ "|passage=“I'm sorely beset, Jeeves. Do you recall telling me once about someone who told somebody he could tell him something which would make him think a bit? Knitted socks and porcu\n" +
+ "pines entered into it, I remember.” “I think you may be referring to the ghost of the father of Hamlet, Prince of Denmark, sir. Addressing his son, he said ‘I could a tale unfold whos\n" +
+ "e lightest word would harrow up thy soul, freeze thy young blood, make thy two eyes, like stars, start from their spheres, thy knotted and combined locks to part and each particular h\n" +
+ "air to stand on end like quills upon the fretful '''porpentine'''.’ ” “That's right. Locks, of course, not socks. Odd that he should have said '''porpentine''' when he meant porc\n" +
+ "upine. Slip of the tongue, no doubt, as so often happens with ghosts.”}}";
+ assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
+
}
assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading());
assertEquals(1, new WikiTokenizer(wikiText).nextToken().headingDepth());
assertEquals("a", new WikiTokenizer(wikiText).nextToken().headingWikiText());
- assertEquals(1, new WikiTokenizer(wikiText).nextToken().errors.size());
+ assertEquals(2, new WikiTokenizer(wikiText).nextToken().errors.size());
wikiText = "=a==";
assertEquals("=a==", new WikiTokenizer(wikiText).nextToken().token());
assertEquals(2, new WikiTokenizer(wikiText).nextToken().headingDepth());
assertEquals("aa[[|=]] {{|={{=}} }}", new WikiTokenizer(wikiText).nextToken().headingWikiText());
assertEquals(0, new WikiTokenizer(wikiText).nextToken().errors.size());
+
}