#!/usr/bin/python3
from html.parser import HTMLParser
from glob import glob
class OptedParser(HTMLParser):
    """Collect dictionary entries from HTML fed to the parser.

    Text inside ``<b>`` is accumulated (lower-cased) as the headword, text
    inside ``<i>`` as the part of speech, and any other text seen between
    ``</b>`` and ``</p>`` as the definition.  Every ``</p>`` stores one
    finished definition under its headword in ``self.entries``.

    The ``assert`` statements are sanity checks on the order in which the
    tags arrive; unexpected markup raises ``AssertionError``.  Being plain
    asserts, they are skipped when Python runs with ``-O``.
    """

    def __init__(self):
        super().__init__()
        # Which part of the current entry handle_data() should append to.
        self.reading_headword = False
        self.reading_pos = False
        self.reading_definition = False
        # Text accumulated so far for the entry being parsed.
        self.headword = ""
        self.pos = ""
        self.definition = ""
        # headword -> list of definitions, in the order they were parsed.
        self.entries = {}

    def _add_definition(self, headword: str, definition: str) -> None:
        """Append ``definition`` to the list kept for ``headword``."""
        self.entries.setdefault(headword, []).append(definition)

    def write_definitions(self, outputfile) -> None:
        """Write one tab-separated record per headword to ``outputfile``.

        A headword with a single definition is written as
        ``headword<TAB>definition``.  A headword with several definitions is
        written as one record whose definitions are listed as ``- `` items
        separated by an escaped newline.

        Also prints a notice for every headword containing a space or dash.

        Args:
            outputfile: Object with a ``write(str)`` method.
        """
        for headword, definitions in self.entries.items():
            if len(definitions) == 1:
                outputfile.write(f"{headword}\t{definitions[0]}\n")
            else:
                # The separator is the two characters backslash + "n", not a
                # real line break, so the record stays on one physical line.
                # NOTE(review): assumes the consumer of the file expands the
                # escaped "\n" itself -- confirm against the target format.
                items = "".join(f"- {entry}\\n" for entry in definitions)
                outputfile.write(f"{headword}\t\\n{items}\\n\n")
            if " " in headword or "-" in headword:
                print(f"<{headword}> has space or dash")

    def handle_starttag(self, tag: str, attrs) -> None:
        """Start reading a headword on ``<b>`` or a part of speech on ``<i>``.

        All other start tags are ignored.
        """
        if tag == "b":
            # A headword must open a fresh entry: nothing may be in progress.
            assert not self.reading_headword
            assert not self.reading_pos
            assert not self.reading_definition
            assert not self.headword
            self.reading_headword = True
        elif tag == "i":
            # Only one part-of-speech span is accepted per entry.
            assert not self.reading_headword
            assert not self.reading_pos
            assert not self.pos
            self.reading_pos = True

    def handle_data(self, data: str) -> None:
        """Append ``data`` to whichever part of the entry is being read.

        Text arriving while no part is being read is dropped.
        """
        if self.reading_headword:
            self.headword += data.lower()
        elif self.reading_pos:
            # Checked before reading_definition because both flags are set
            # while inside <i>.
            self.pos += data
        elif self.reading_definition:
            self.definition += data

    def handle_endtag(self, tag: str) -> None:
        """Advance the entry state on ``</b>``, ``</i>`` and ``</p>``.

        ``</b>`` ends the headword and starts the definition, ``</i>`` ends
        the part of speech, and ``</p>`` stores the finished entry and resets
        the per-entry state.  Other end tags are ignored.
        """
        if tag == "b":
            assert self.reading_headword
            assert not self.reading_definition
            assert not self.reading_pos
            self.reading_headword = False
            self.reading_definition = True
        elif tag == "i":
            assert not self.reading_headword
            assert self.reading_pos
            assert self.reading_definition
            self.reading_pos = False
        elif tag == "p":
            assert not self.reading_headword
            assert self.reading_definition
            assert not self.reading_pos
            assert self.headword
            assert self.definition
            if self.pos:
                self.definition = f"({self.pos}) {self.definition}"
            # The part of speech was collected separately, which can leave an
            # empty pair of parentheses behind in the definition text.
            self.definition = self.definition.replace("()", "")
            self.definition = self.definition.strip()
            assert "\\" not in self.headword, f"\\ for word {self.headword}"
            if any(char in "0123456789" for char in self.headword):
                print(f"Warning: {self.headword} has digits")
            self._add_definition(self.headword, self.definition)
            # Reset the per-entry state for the next <b>.
            self.reading_definition = False
            self.headword = ""
            self.pos = ""
            self.definition = ""
def main():
    """Parse the OPTED HTML files and write the merged dictionary.

    Reads every file matching ``OPTED/v003/wb1913_*.html`` in sorted order,
    decoding it as Mac Roman, feeds them all to a single ``OptedParser`` and
    writes the collected entries to ``OPTED.tab_separated`` as UTF-8.  Both
    paths are relative to the current working directory.
    """
    opted_parser = OptedParser()
    for fname in sorted(glob("OPTED/v003/wb1913_*.html")):
        print(f"Running on {fname}")
        # errors="strict": a byte that cannot be decoded aborts the run
        # instead of being silently replaced.
        with open(fname, mode="r",
                  encoding="macroman", errors="strict") as inputfile:
            opted_parser.feed(inputfile.read())
    # Process whatever the parser is still buffering after the last feed().
    opted_parser.close()
    with open("OPTED.tab_separated", "w", encoding="utf-8") as outputfile:
        opted_parser.write_definitions(outputfile)
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()