--- /dev/null
+#!/usr/bin/python3
+
+from html.parser import HTMLParser
+from glob import glob
+
class OptedParser(HTMLParser):
    """Collect dictionary entries from ``<p><b>..</b> .. <i>..</i> ..</p>`` markup.

    Text inside ``<b>`` becomes the (lower-cased) headword, text inside
    ``<i>`` becomes the part of speech, and all other text between
    ``</b>`` and ``</p>`` becomes the definition.  Each ``</p>`` commits one
    definition to ``entries``, a dict mapping a headword to the list of its
    definitions in the order they were seen.

    Markup that does not follow this shape raises ``AssertionError``.  The
    checks are explicit ``raise`` statements rather than ``assert`` so that
    they still run under ``python -O``.
    """

    def __init__(self):
        super().__init__()
        # Which piece of the current entry incoming text belongs to.
        self.reading_headword = False
        self.reading_pos = False
        self.reading_definition = False
        # Text accumulated so far for the entry being parsed.
        self.headword = ""
        self.pos = ""
        self.definition = ""
        # headword -> list of definition strings.
        self.entries = {}

    @staticmethod
    def _require(condition, message):
        """Raise ``AssertionError(message)`` unless *condition* is truthy."""
        if not condition:
            raise AssertionError(message)

    def _add_definition(self, headword, definition):
        """Append *definition* to the list stored for *headword*."""
        self.entries.setdefault(headword, []).append(definition)

    def write_definitions(self, outputfile):
        """Write one ``headword<TAB>definition`` line per headword.

        A headword with exactly one definition is written as-is; otherwise
        the definitions are wrapped in ``<ol><li>..</li>..</ol>``.  A notice
        is printed for every headword containing a space or a dash.

        Args:
            outputfile: object with a ``write(str)`` method.
        """
        # NOTE(review): definitions are emitted as markup verbatim; character
        # references that HTMLParser decoded on input are not re-escaped
        # here -- confirm the consumer of this file tolerates that.
        for headword, definitions in self.entries.items():
            if len(definitions) == 1:
                body = definitions[0]
            else:
                items = "".join(f"<li>{entry}</li>" for entry in definitions)
                body = f"<ol>{items}</ol>"
            outputfile.write(f"{headword}\t{body}\n")
            if " " in headword or "-" in headword:
                print(f"<{headword}> has space or dash")

    def handle_starttag(self, tag, attrs):
        """Start collecting a headword on ``<b>`` or a part of speech on ``<i>``."""
        if tag == "b":
            self._require(not self.reading_headword, "<b> inside a headword")
            self._require(not self.reading_pos, "<b> inside a part of speech")
            self._require(not self.reading_definition, "<b> inside a definition")
            self._require(not self.headword, "<b> while a headword is pending")

            self.reading_headword = True

        elif tag == "i":
            self._require(not self.reading_headword, "<i> inside a headword")
            self._require(not self.reading_pos, "<i> inside a part of speech")
            self._require(not self.pos, "<i> while a part of speech is pending")

            self.reading_pos = True

    def handle_data(self, data):
        """Append *data* to whichever piece of the entry is being read."""
        if self.reading_headword:
            self.headword += data.lower()
        elif self.reading_pos:
            self.pos += data
        elif self.reading_definition:
            self.definition += data

    def handle_endtag(self, tag):
        """Advance the entry state on ``</b>``, ``</i>`` and ``</p>``.

        ``</b>`` ends the headword and starts the definition, ``</i>`` ends
        the part of speech, and ``</p>`` stores the finished definition and
        resets the per-entry state.
        """
        if tag == "b":
            self._require(self.reading_headword, "</b> without <b>")
            self._require(not self.reading_definition, "</b> inside a definition")
            self._require(not self.reading_pos, "</b> inside a part of speech")

            self.reading_headword = False
            self.reading_definition = True

        elif tag == "i":
            self._require(not self.reading_headword, "</i> inside a headword")
            self._require(self.reading_pos, "</i> without <i>")
            self._require(self.reading_definition, "</i> outside a definition")

            self.reading_pos = False

        elif tag == "p":
            self._require(not self.reading_headword, "</p> inside a headword")
            self._require(self.reading_definition, "</p> outside a definition")
            self._require(not self.reading_pos, "</p> inside a part of speech")
            self._require(self.headword, "</p> with an empty headword")
            self._require(self.definition, "</p> with an empty definition")

            definition = self.definition
            if self.pos:
                definition = f"(<i>{self.pos}</i>) {definition}"
            # Presumably the "()" is what remains of the parentheses that
            # surrounded the part of speech once its text was diverted.
            definition = definition.replace("()", "")
            # Collapse whitespace runs: removing "()" leaves several spaces
            # in a row, and a raw newline or tab inside a definition would
            # break the one-record-per-line, tab-separated output.
            definition = " ".join(definition.split())

            self._require(
                "\\" not in self.headword, f"\\ for word {self.headword}"
            )
            if any(char in "0123456789" for char in self.headword):
                print(f"Warning: {self.headword} has digits")

            self._add_definition(self.headword, definition)
            self.reading_definition = False
            self.headword = ""
            self.pos = ""
            self.definition = ""
+
def main():
    """Parse every ``OPTED/v003/wb1913_*.html`` file into ``OPTED.tab_separated``.

    All input files are fed to a single ``OptedParser`` so that definitions
    of the same headword are merged, then the collected entries are written
    out as UTF-8.
    """
    opted_parser = OptedParser()
    # glob() yields names in filesystem-dependent order; sorting keeps the
    # order of records in the output reproducible from run to run.
    for fname in sorted(glob("OPTED/v003/wb1913_*.html")):
        print(f"Running on {fname}")
        with open(fname, mode="r",
                  encoding="macroman", errors="strict") as inputfile:
            opted_parser.feed(inputfile.read())
    # Flush any text the parser is still buffering after the last file.
    opted_parser.close()

    with open("OPTED.tab_separated", "w", encoding="utf-8") as outputfile:
        opted_parser.write_definitions(outputfile)


if __name__ == "__main__":
    main()