boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
String adjustWikiLink(String wikiLinkDest, final String wikiLinkText);
void addFunctionCallbacks(
- Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
+ Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
}
static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<String,LangConfig>();
static {
public boolean skipSection(String headingText) {
return enSkipSections.matcher(headingText).matches();
}
-
+
@Override
public EntryTypeName sectionNameToEntryType(String sectionName) {
if (sectionName.equalsIgnoreCase("Synonyms")) {
}
return null;
}
-
+
@Override
public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
final String wikiText = wikiTokenizer.wikiLinkText();
@Override
public void addFunctionCallbacks(
- Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
+ Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
}
});
-
+
+ // Spanish ("ES") configuration: section filter, synonym/antonym mapping and link handling.
+ final Pattern esSkipSections = Pattern.compile(".*(Traducciones|Locuciones).*");
+ isoToLangConfig.put("ES", new LangConfig() {
+     /** Returns true when the whole heading matches the ES skip pattern. */
+     @Override
+     public boolean skipSection(String headingText) {
+         return esSkipSections.matcher(headingText).matches();
+     }
+
+     /**
+      * Maps a section name to an entry type: the singular or plural synonym heading
+      * gives SYNONYM_MULTI, the antonym heading gives ANTONYM_MULTI, anything else null.
+      */
+     @Override
+     public EntryTypeName sectionNameToEntryType(String sectionName) {
+         final boolean isSynonymSection =
+             sectionName.equalsIgnoreCase("sinónimo") || sectionName.equalsIgnoreCase("sinónimos");
+         if (isSynonymSection) {
+             return EntryTypeName.SYNONYM_MULTI;
+         }
+         final boolean isAntonymSection =
+             sectionName.equalsIgnoreCase("antónimo") || sectionName.equalsIgnoreCase("antónimos");
+         if (isAntonymSection) {
+             return EntryTypeName.ANTONYM_MULTI;
+         }
+         return null;
+     }
+
+     /** Skips links whose text starts with the "Categoría:" prefix. */
+     @Override
+     public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
+         return wikiTokenizer.wikiLinkText().startsWith("Categoría:");
+     }
+
+     /**
+      * Returns null for "w:" and "Image:" destinations; otherwise strips everything
+      * from the first '#' on, falling back to the link text when nothing is left.
+      */
+     @Override
+     public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
+         if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
+             return null;
+         }
+         final int anchorStart = wikiLinkDest.indexOf("#");
+         if (anchorStart == -1) {
+             return wikiLinkDest;
+         }
+         final String pageOnly = wikiLinkDest.substring(0, anchorStart);
+         return pageOnly.isEmpty() ? wikiLinkText : pageOnly;
+     }
+
+     /** Registers nothing yet. */
+     @Override
+     public void addFunctionCallbacks(
+         Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
+         // TODO: need Spanish variant
+     }
+ });
+
+ // Portuguese ("PT") configuration: section filter, synonym/antonym mapping and link handling.
+ final Pattern ptSkipSections = Pattern.compile(".*Tradução.*");
+ isoToLangConfig.put("PT", new LangConfig() {
+     /** Returns true when the whole heading matches the PT skip pattern. */
+     @Override
+     public boolean skipSection(String headingText) {
+         // Must use the PT pattern declared above; the ES pattern was used here by mistake.
+         return ptSkipSections.matcher(headingText).matches();
+     }
+
+     /**
+      * Maps a section name to an entry type: the singular or plural synonym heading
+      * gives SYNONYM_MULTI, the antonym heading gives ANTONYM_MULTI, anything else null.
+      */
+     @Override
+     public EntryTypeName sectionNameToEntryType(String sectionName) {
+         if (sectionName.equalsIgnoreCase("Sinônimo") || sectionName.equalsIgnoreCase("Sinônimos")) {
+             return EntryTypeName.SYNONYM_MULTI;
+         }
+         if (sectionName.equalsIgnoreCase("Antônimo") || sectionName.equalsIgnoreCase("Antônimos")) {
+             return EntryTypeName.ANTONYM_MULTI;
+         }
+         return null;
+     }
+
+     /** Skips category links. */
+     @Override
+     public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
+         final String wikiText = wikiTokenizer.wikiLinkText();
+         // Portuguese spells the category prefix "Categoria:" (no accent);
+         // "Categoría:" is the Spanish spelling and would never match here.
+         return wikiText.startsWith("Categoria:");
+     }
+
+     /**
+      * Returns null for "w:" and "Image:" destinations; otherwise strips everything
+      * from the first '#' on, falling back to the link text when nothing is left.
+      */
+     @Override
+     public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
+         if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
+             return null;
+         }
+         final int hashPos = wikiLinkDest.indexOf("#");
+         if (hashPos != -1) {
+             wikiLinkDest = wikiLinkDest.substring(0, hashPos);
+             if (wikiLinkDest.isEmpty()) {
+                 // Pure "#anchor" link: use the visible link text as the destination.
+                 wikiLinkDest = wikiLinkText;
+             }
+         }
+         return wikiLinkDest;
+     }
+
+     /** Registers nothing yet. */
+     @Override
+     public void addFunctionCallbacks(
+         Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
+         // TODO: need Portuguese variant
+     }
+ });
+
final Pattern deSkipSections = Pattern.compile(".*(Übersetzungen|Referenzen|Quellen).*");
isoToLangConfig.put("DE", new LangConfig() {
@Override
public boolean skipSection(String headingText) {
return deSkipSections.matcher(headingText).matches();
}
-
+
@Override
public EntryTypeName sectionNameToEntryType(String sectionName) {
if (sectionName.equalsIgnoreCase("Synonyme")) {
}
return null;
}
-
+
@Override
public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
final String wikiText = wikiTokenizer.wikiLinkText();
- if (wikiText.startsWith("???Category:")) {
+ if (wikiText.startsWith("Kategorie:")) {
return true;
}
return false;
@Override
public void addFunctionCallbacks(
- Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
+ Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
DeFunctionCallbacks.addGenericCallbacks(functionCallbacks);
}
});
-
+
final Pattern itSkipSections = Pattern.compile(".*(Traduzione|Note / Riferimenti).*");
isoToLangConfig.put("IT", new LangConfig() {
@Override
public boolean skipSection(String headingText) {
return itSkipSections.matcher(headingText).matches();
}
-
+
@Override
public EntryTypeName sectionNameToEntryType(String sectionName) {
if (sectionName.equalsIgnoreCase("Sinonimi")) {
}
return null;
}
-
+
@Override
public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
final String wikiText = wikiTokenizer.wikiLinkText();
- if (wikiText.startsWith("???Category:")) {
+ if (wikiText.startsWith("Categoria:")) {
return true;
}
return false;
@Override
public void addFunctionCallbacks(
- Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
+ Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
ItFunctionCallbacks.addGenericCallbacks(functionCallbacks);
}
});
- final Pattern frSkipSections = Pattern.compile(".*(Traductions).*");
+ final Pattern frSkipSections = Pattern.compile(".*([Tt]raductions|[Aa]nagrammes).*");
isoToLangConfig.put("FR", new LangConfig() {
@Override
public boolean skipSection(String headingText) {
return frSkipSections.matcher(headingText).matches();
}
-
+
@Override
public EntryTypeName sectionNameToEntryType(String sectionName) {
if (sectionName.equalsIgnoreCase("Synonymes")) {
return EntryTypeName.SYNONYM_MULTI;
}
+ if (sectionName.equalsIgnoreCase("Antonymes")) {
+ return EntryTypeName.ANTONYM_MULTI;
+ }
return null;
}
-
+
@Override
public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
+ final String wikiText = wikiTokenizer.wikiLinkText();
+ if (wikiText.startsWith("Catégorie:")) {
+ return true;
+ }
return false;
}
@Override
@Override
public void addFunctionCallbacks(
- Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
+ Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
FrFunctionCallbacks.addGenericCallbacks(functionCallbacks);
}
});
final String skipLangIso;
final LangConfig langConfig;
final String webUrlTemplate;
-
+
public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final IndexBuilder defIndexBuilder, final String wiktionaryIso, final String skipLangIso,
- final String webUrlTemplate) {
+ final String webUrlTemplate) {
this.titleIndexBuilder = titleIndexBuilder;
this.defIndexBuilder = defIndexBuilder;
assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
this.skipLangIso = skipLangIso;
this.webUrlTemplate = webUrlTemplate;
}
-
+
IndexedEntry indexedEntry = null;
@Override
indexedEntry = new IndexedEntry(htmlEntry);
final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
- this);
+ this);
langConfig.addFunctionCallbacks(callback.functionCallbacks);
callback.builder = new StringBuilder();
if (webUrlTemplate != null) {
final String webUrl = String.format(webUrlTemplate, title);
- // URI.create can raise an exception e.g. if webUrl contains %, just ignore those cases.
- try {
- callback.builder.append(String.format("<p> <a href=\"%s\">%s</a>", URI.create(webUrl).toString(), escapeHtmlLiteral(webUrl)));
- } catch (Exception e)
- {}
+ // URI.create can raise an exception e.g. if webUrl contains %, just ignore those cases.
+ try {
+ callback.builder.append(String.format("<p> <a href=\"%s\">%s</a>", URI.create(webUrl).toASCIIString(), escapeHtmlLiteral(webUrl)));
+ } catch (Exception e) {
+ }
}
htmlEntry.html = callback.builder.toString();
indexedEntry.isValid = true;
tokenData.htmlEntries.add(htmlEntry);
// titleIndexBuilder.addEntryWithString(indexedEntry, title,
// EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
-
+
indexedEntry = null;
}
@Override
void removeUselessArgs(Map<String, String> namedArgs) {
}
-
+
@Override
public void addLinkToCurrentEntry(String token, final String lang, EntryTypeName entryTypeName) {
if (lang == null || lang.equals(skipLangIso)) {
titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName);
}
}
-
+
public static String escapeHtmlLiteral(final String plainText) {
final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText);
if (StringUtil.isAscii(htmlEscaped)) {
return htmlEscaped;
- } else {
+ } else {
return StringUtil.escapeUnicodeToPureHtml(plainText);
}
@Override
public void onFunction(WikiTokenizer wikiTokenizer, String name,
- List<String> args, Map<String, String> namedArgs) {
+ List<String> args, Map<String, String> namedArgs) {
if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
namedArgs.remove("lang");
}
@Override
public void onNewline(WikiTokenizer wikiTokenizer) {
}
-
+
EntryTypeName sectionEntryTypeName;
IndexBuilder currentIndexBuilder;
final String prefix = wikiTokenizer.listItemPrefix();
while (listPrefixStack.size() < prefix.length()) {
builder.append(String.format("<%s>",
- WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
+ WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
}
builder.append("<li>");