#!/usr/bin/python3
"""Convert the OPTED HTML dictionary pages into one tab-separated file.

Every ``OPTED/v003/wb1913_*.html`` page is fed through :class:`OptedParser`,
and the collected entries are written to ``OPTED.tab_separated`` with one
``headword<TAB>definition`` record per line.
"""
from html.parser import HTMLParser
from glob import glob


class OptedParser(HTMLParser):
    """Collect headwords and their definitions from OPTED markup.

    The parser treats the text inside ``<b>`` as the headword, the text inside
    ``<i>`` as the part of speech, and all other text up to the closing
    ``</p>`` as the definition.  Presumably each entry looks like
    ``<p><b>word</b> (<i>pos</i>) definition</p>`` -- TODO confirm against
    the input files.

    The ``assert`` statements check that the tags arrive in that order.  They
    are disabled when Python runs with ``-O``, in which case malformed input
    is no longer rejected.
    """

    def __init__(self):
        super().__init__()
        # Which part of an entry the incoming character data belongs to.
        self.reading_headword = False
        self.reading_pos = False
        self.reading_definition = False
        # Text accumulated for the entry currently being parsed.
        self.headword = ""
        self.pos = ""
        self.definition = ""
        # Lower-cased headword -> definitions, in the order they were seen.
        self.entries = {}

    def _add_definition(self, headword: str, definition: str) -> None:
        """Append *definition* to the list stored for *headword*."""
        self.entries.setdefault(headword, []).append(definition)

    def write_definitions(self, outputfile) -> None:
        """Write every collected entry to *outputfile*, one line per headword.

        A headword with a single definition is written as
        ``headword<TAB>definition``.  A headword with several definitions gets
        them wrapped in one HTML ordered list so the record stays on one line.
        Headwords containing a space or a dash are reported on stdout.

        Args:
            outputfile: Any object with a ``write(str)`` method.
        """
        for headword, definitions in self.entries.items():
            if len(definitions) == 1:
                outputfile.write(f"{headword}\t{definitions[0]}\n")
            else:
                # NOTE(review): the list markup was reconstructed as
                # <ol>/<li>; the original literals were split across raw line
                # breaks and could not compile -- confirm the intended format.
                items = "".join(f"<li>{entry}</li>" for entry in definitions)
                outputfile.write(f"{headword}\t<ol>{items}</ol>\n")
            if " " in headword or "-" in headword:
                print(f"<{headword}> has space or dash")

    def handle_starttag(self, tag: str, attrs) -> None:
        """Start reading a headword on ``<b>`` or a part of speech on ``<i>``."""
        if tag == "b":
            # A headword may only start when no entry is in progress.
            assert not self.reading_headword
            assert not self.reading_pos
            assert not self.reading_definition
            assert not self.headword
            self.reading_headword = True
        elif tag == "i":
            # Only one part of speech is accepted per entry.
            assert not self.reading_headword
            assert not self.reading_pos
            assert not self.pos
            self.reading_pos = True

    def handle_data(self, data: str) -> None:
        """Append *data* to whichever part of the entry is being read.

        Headword text is lower-cased.  Text seen while no entry is open is
        ignored.
        """
        if self.reading_headword:
            self.headword += data.lower()
        elif self.reading_pos:
            # Checked before reading_definition: both flags are set inside <i>.
            self.pos += data
        elif self.reading_definition:
            self.definition += data

    def handle_endtag(self, tag: str) -> None:
        """Advance the entry state on ``</b>`` / ``</i>``; store it on ``</p>``."""
        if tag == "b":
            assert self.reading_headword
            assert not self.reading_definition
            assert not self.reading_pos
            self.reading_headword = False
            # Everything after the headword belongs to the definition.
            self.reading_definition = True
        elif tag == "i":
            assert not self.reading_headword
            assert self.reading_pos
            assert self.reading_definition
            self.reading_pos = False
        elif tag == "p":
            assert not self.reading_headword
            assert self.reading_definition
            assert not self.reading_pos
            assert self.headword
            assert self.definition
            if self.pos:
                self.definition = f"({self.pos}) {self.definition}"
            # The part of speech was captured separately, so the definition
            # text can contain an empty "()" where it used to be.
            self.definition = self.definition.replace("()", "").strip()
            assert "\\" not in self.headword, f"\\ for word {self.headword}"
            if any(digit in self.headword for digit in "0123456789"):
                print(f"Warning: {self.headword} has digits")
            self._add_definition(self.headword, self.definition)
            # Reset the per-entry state for the next <b>.
            self.reading_definition = False
            self.headword = ""
            self.pos = ""
            self.definition = ""


def main():
    """Parse all OPTED pages in sorted order and write ``OPTED.tab_separated``."""
    opted_parser = OptedParser()
    for fname in sorted(glob("OPTED/v003/wb1913_*.html")):
        print(f"Running on {fname}")
        # errors="strict" makes an undecodable byte abort the run.
        with open(fname, mode="r", encoding="macroman", errors="strict") as inputfile:
            opted_parser.feed(inputfile.read())
    with open("OPTED.tab_separated", "w", encoding="utf-8") as outputfile:
        opted_parser.write_definitions(outputfile)


if __name__ == "__main__":
    main()