// This matches the whole line; otherwise the regexes don't work well on French:
// {{=uk=}}
- static final Pattern headingStart = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE);
+ // Spanish has no initial headings, so also detect {{ES as such...
+ static final Pattern headingStart = Pattern.compile("^(\\{\\{ES|(=+)[^=]).*$", Pattern.MULTILINE);
final Map<String,List<Selector>> pathToSelectors = new LinkedHashMap<String, List<Selector>>();
List<Selector> currentSelectors = null;
private WiktionarySplitter() {
List<Selector> selectors;
for (final String code : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.keySet()) {
- //if (code.equals("en") || code.equals("de") || code.equals("fr")) {continue;}
+ //if (!code.equals("fr")) {continue;}
selectors = new ArrayList<WiktionarySplitter.Selector>();
pathToSelectors.put(String.format("data/inputs/%swiktionary-pages-articles.xml", code), selectors);
for (final Map.Entry<String, String> entry : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.get(code).entrySet()) {
title.startsWith("Categoria:") ||
title.startsWith("Aiuto:") ||
title.startsWith("Portail:") ||
+ // ES
+ title.startsWith("Apéndice:") ||
+ title.startsWith("Archivo:") ||
+ title.startsWith("Ayuda:") ||
+ title.startsWith("Categoría:") ||
+ title.startsWith("Plantilla:") ||
+ title.startsWith("Wikcionario:") ||
// sentinel
false
}
}
-
/**
 * Parses the given file with a SAX parser, passing this object as the
 * handler that receives the parse callbacks.
 *
 * @param file the XML file to parse
 * @throws ParserConfigurationException propagated from creating the SAX parser
 * @throws SAXException propagated from creating the parser or from parsing
 * @throws IOException propagated from reading the file during parsing
 */
public void parse(final File file) throws ParserConfigurationException,
SAXException, IOException {
// NOTE(review): SAXParserFactoryImpl is presumably the Xerces factory
// implementation — confirm against the file's imports.
final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser();
// A new parser is created on every call; this object is handed over as the handler.
parser.parse(file, this);
}
-
-
}