#!/usr/bin/python3
"""Convert OPTED HTML word lists into a single ``OPTED.chemnitz`` text file.

Every ``OPTED/v003/wb1913_*.html`` file is parsed in sorted order and each
``<p>`` entry found in it is written out as one ``headword :: definition``
line.
"""
from glob import glob
from html.parser import HTMLParser
from typing import TextIO


class OptedFormatError(AssertionError):
    """Raised when the input markup does not have the expected entry shape.

    Earlier versions of this script signalled these conditions with bare
    ``assert`` statements; subclassing ``AssertionError`` keeps any caller
    that catches those failures working, while the checks themselves are no
    longer stripped when Python runs with ``-O``.
    """


class OptedParser(HTMLParser):
    """Stream OPTED entries to ``outputfile`` as ``headword :: definition``.

    The parser is a small state machine driven by three tags:

    * ``<b>...</b>`` encloses the headword (stored lower-cased),
    * ``<i>...</i>`` encloses the part of speech, and
    * ``</p>`` terminates the entry, which is then written and the state
      is cleared for the next entry.

    Text seen after ``</b>`` and outside ``<i>`` is collected as the
    definition.  State is kept across ``feed()`` calls, so one instance can
    be fed several files in a row.
    """

    def __init__(self, outputfile: TextIO) -> None:
        """Create a parser that writes finished entries to ``outputfile``."""
        super().__init__()
        # Which part of the current entry incoming text belongs to.
        self.reading_entry = False
        self.reading_pos = False
        self.reading_definition = False
        # Text accumulated so far for the current entry.
        self.entry = ""
        self.pos = ""
        self.definition = ""
        self.outputfile = outputfile

    def _require(self, condition: object, problem: str) -> None:
        """Raise ``OptedFormatError`` describing ``problem`` if ``condition`` is falsy."""
        if not condition:
            raise OptedFormatError(f"{problem} (current entry: {self.entry!r})")

    def handle_starttag(self, tag: str, attrs) -> None:
        """Start collecting a headword on ``<b>`` or a part of speech on ``<i>``.

        Raises:
            OptedFormatError: if the tag appears where the entry structure
                does not allow it.
        """
        if tag == "b":
            self._require(not self.reading_entry, "<b> inside another <b>")
            self._require(not self.reading_pos, "<b> inside <i>")
            self._require(
                not self.reading_definition,
                "<b> before the previous entry was closed by </p>",
            )
            self._require(not self.entry, "<b> while a headword is already stored")
            self.reading_entry = True
        elif tag == "i":
            self._require(not self.reading_entry, "<i> inside <b>")
            self._require(not self.reading_pos, "<i> inside another <i>")
            self._require(not self.pos, "second <i> in one entry")
            self.reading_pos = True

    def handle_data(self, data: str) -> None:
        """Append ``data`` to whichever part of the entry is being read.

        Text arriving while no part is being read is ignored.
        """
        if self.reading_entry:
            self.entry += data.lower()
        elif self.reading_pos:
            self.pos += data
        elif self.reading_definition:
            self.definition += data

    def handle_endtag(self, tag: str) -> None:
        """Advance the state machine; on ``</p>`` write the finished entry.

        Raises:
            OptedFormatError: if the closing tag does not match the current
                state, if the entry or definition is empty, or if either
                contains ``::`` or ``|``.
        """
        if tag == "b":
            self._require(self.reading_entry, "</b> without a matching <b>")
            self._require(not self.reading_definition, "</b> inside a definition")
            self._require(not self.reading_pos, "</b> inside <i>")
            self.reading_entry = False
            # Everything after the headword belongs to the definition.
            self.reading_definition = True
        elif tag == "i":
            self._require(not self.reading_entry, "</i> inside <b>")
            self._require(self.reading_pos, "</i> without a matching <i>")
            self._require(self.reading_definition, "</i> before any headword")
            self.reading_pos = False
        elif tag == "p":
            self._require(not self.reading_entry, "</p> inside <b>")
            self._require(self.reading_definition, "</p> without a headword")
            self._require(not self.reading_pos, "</p> inside <i>")
            self._require(self.entry, "empty headword")
            self._require(self.definition, "empty definition")
            # "::" separates headword and definition in the output line, so
            # it must not occur in the data; "|" is rejected as well.
            self._require(
                "::" not in self.entry and "::" not in self.definition,
                "'::' found in entry text",
            )
            self._require(
                "|" not in self.entry and "|" not in self.definition,
                "'|' found in entry text",
            )

            if self.pos:
                self.definition = f"{self.entry} ({self.pos}) {self.definition}"
            # The part of speech was collected separately, which leaves its
            # surrounding parentheses behind as an empty "()" in the text.
            self.definition = self.definition.replace("()", "")
            self.definition = self.definition.strip()
            self.outputfile.write(f"{self.entry} :: {self.definition}\n")

            # Reset for the next entry.
            self.reading_definition = False
            self.entry = ""
            self.pos = ""
            self.definition = ""


def main() -> None:
    """Parse every ``OPTED/v003/wb1913_*.html`` file into ``OPTED.chemnitz``.

    Input files are read as MacRoman with strict error handling; the output
    file is written as UTF-8.  A single parser instance is shared by all
    input files.
    """
    with open("OPTED.chemnitz", "w", encoding="utf-8") as outputfile:
        opted_parser = OptedParser(outputfile)
        for fname in sorted(glob("OPTED/v003/wb1913_*.html")):
            print(f"Running on {fname}")
            with open(
                fname, mode="r", encoding="macroman", errors="strict"
            ) as inputfile:
                opted_parser.feed(inputfile.read())


if __name__ == "__main__":
    main()