import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
-import javax.xml.parsers.SAXParserFactory;
+import org.apache.xerces.jaxp.SAXParserFactoryImpl;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
StringBuilder textBuilder;
StringBuilder currentBuilder = null;
// Command-line entry point: constructs a WiktionarySplitter and invokes its go() method.
// The args array is not read here. The patched signature declares "throws Exception"
// in place of the three specific checked exception types listed on the removed line.
- public static void main(final String[] args) throws SAXException, IOException, ParserConfigurationException {
+ public static void main(final String[] args) throws Exception {
final WiktionarySplitter wiktionarySplitter = new WiktionarySplitter();
wiktionarySplitter.go();
}
}
}
// Runs the split. Creates a SAX parser and, in the portions visible here, parses the file
// named by a pathToSelectors entry key, using this object as the SAX handler.
// If parsing throws, the title recorded by the last endPage() call and the current contents
// of titleBuilder are printed to stderr, then the same exception is rethrown.
- private void go() throws ParserConfigurationException, SAXException, IOException {
- final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
+ private void go() throws Exception {
// Instantiate the Xerces factory directly. SAXParserFactoryImpl.newInstance() would bind at
// compile time to the static SAXParserFactory.newInstance() inherited from the JAXP base
// class, which selects an implementation through the JAXP lookup (system property,
// service file, platform default) and is therefore not guaranteed to return Xerces.
+ final SAXParser parser = new SAXParserFactoryImpl().newSAXParser();
// Configure things.
for (final Map.Entry<String, List<Selector>> pathToSelectorsEntry : pathToSelectors.entrySet()) {
}
// Do it.
- parser.parse(new File(pathToSelectorsEntry.getKey()), this);
+ try {
+ parser.parse(new File(pathToSelectorsEntry.getKey()), this);
+ } catch (Exception e) {
// Concatenate titleBuilder itself rather than titleBuilder.toString(): the text is the same
// when it is non-null, and a null reference prints "null" instead of throwing a
// NullPointerException that would hide the original parse failure.
+ System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder);
+ throw e;
+ }
// Shutdown.
for (final Selector selector : currentSelectors) {
}
}
+ String lastPageTitle = null;
int pageCount = 0;
private void endPage() {
final String title = titleBuilder.toString();
+ lastPageTitle = title;
if (++pageCount % 1000 == 0) {
System.out.println("endPage: " + title + ", count=" + pageCount);
}
// Parses the given file with a newly created SAX parser, using this object as the handler.
// Throws ParserConfigurationException if the parser cannot be created, and SAXException or
// IOException if parsing or reading the file fails.
public void parse(final File file) throws ParserConfigurationException,
SAXException, IOException {
- final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
// Instantiate the Xerces factory directly. SAXParserFactoryImpl.newInstance() would bind at
// compile time to the static SAXParserFactory.newInstance() inherited from the JAXP base
// class, which selects an implementation through the JAXP lookup and is therefore not
// guaranteed to return Xerces.
+ final SAXParser parser = new SAXParserFactoryImpl().newSAXParser();
parser.parse(file, this);
}