// This matches the whole line, otherwise regexes don't work well on French:
// {{=uk=}}
+ // Spanish has no initial headings, tried to also detect {{ES as such
+ // with "^(\\{\\{ES|(=+)[^=]).*$" but that broke English.
// Group 1 captures the leading run of '=' characters; MULTILINE makes ^ and $ anchor at every line.
static final Pattern headingStart = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE);
// Maps each input path ("data/inputs/<code>wiktionary-pages-articles.xml") to its list of
// selectors; a LinkedHashMap, so iteration follows insertion order.
final Map<String,List<Selector>> pathToSelectors = new LinkedHashMap<String, List<Selector>>();
private WiktionarySplitter() {
List<Selector> selectors;
for (final String code : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.keySet()) {
- //if (code.equals("en") || code.equals("de") || code.equals("fr")) {continue;}
+ //if (!code.equals("fr")) {continue;}
selectors = new ArrayList<WiktionarySplitter.Selector>();
pathToSelectors.put(String.format("data/inputs/%swiktionary-pages-articles.xml", code), selectors);
for (final Map.Entry<String, String> entry : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.get(code).entrySet()) {
title.startsWith("Categoria:") ||
title.startsWith("Aiuto:") ||
title.startsWith("Portail:") ||
+ // ES
+ title.startsWith("Apéndice:") ||
+ title.startsWith("Archivo:") ||
+ title.startsWith("Ayuda:") ||
+ title.startsWith("CategorĂa:") ||
+ title.startsWith("Plantilla:") ||
+ title.startsWith("Wikcionario:") ||
// sentinel
false