from glob import glob
from html.parser import HTMLParser
class OptedParser(HTMLParser):
    """Turn dictionary markup into ``entry :: definition`` lines.

    The parser is a small state machine driven by three flags.  Text inside
    the entry tag becomes the (lower-cased) headword, text inside the
    part-of-speech tag becomes ``pos``, and all other text seen after the
    headword is collected as the definition.  When the record closes, one
    line of the form ``entry :: <b>entry</b> (<i>pos</i>) definition`` is
    written to ``outputfile``.

    The ``assert`` statements document the markup shape this converter
    expects; unexpected nesting raises ``AssertionError`` instead of
    silently producing a corrupt line.
    """

    # Tags delimiting the parts of one record.  ENTRY_TAG and POS_TAG mirror
    # the markup that handle_endtag re-emits around the headword and the
    # part of speech.
    # NOTE(review): RECORD_TAG is assumed to be the paragraph tag wrapping
    # each record -- confirm against the input files.
    ENTRY_TAG = "b"
    POS_TAG = "i"
    RECORD_TAG = "p"

    def __init__(self, outputfile):
        """Create a parser that writes finished records to ``outputfile``.

        Args:
            outputfile: Any object with a ``write(str)`` method.
        """
        # HTMLParser keeps its buffer and scanner state on the instance;
        # without this call feed() has nothing to work with.
        super().__init__()

        self.reading_entry = False
        self.reading_pos = False
        self.reading_definition = False

        # Per-record accumulators; cleared again whenever a headword opens.
        self.entry = ""
        self.pos = ""
        self.definition = ""

        self.outputfile = outputfile

    def handle_starttag(self, tag, attrs):
        """Switch into headword or part-of-speech collection."""
        if tag == self.ENTRY_TAG:
            assert not self.reading_entry
            assert not self.reading_pos
            assert not self.reading_definition

            # A headword starts a new record, so drop whatever the previous
            # record accumulated.
            self.entry = ""
            self.pos = ""
            self.definition = ""
            self.reading_entry = True
        elif tag == self.POS_TAG:
            assert not self.reading_entry
            assert not self.reading_pos

            self.reading_pos = True

    def handle_data(self, data):
        """Append ``data`` to whichever field is currently being read.

        The headword takes priority, then the part of speech, then the
        definition; text seen outside a record is ignored.
        """
        if self.reading_entry:
            self.entry += data.lower()
        elif self.reading_pos:
            self.pos += data
        elif self.reading_definition:
            self.definition += data

    def handle_endtag(self, tag):
        """Advance the state machine and emit the record when it closes."""
        if tag == self.ENTRY_TAG:
            assert self.reading_entry
            assert not self.reading_definition
            assert not self.reading_pos

            # Everything after the headword belongs to the definition.
            self.reading_entry = False
            self.reading_definition = True
        elif tag == self.POS_TAG:
            assert not self.reading_entry
            assert self.reading_pos
            assert self.reading_definition

            self.reading_pos = False
        elif tag == self.RECORD_TAG:
            assert not self.reading_entry
            assert self.reading_definition
            assert not self.reading_pos

            assert self.definition

            # "::" separates headword and definition in the output line, so
            # it must not occur inside either field.
            assert "::" not in self.entry and "::" not in self.definition
            # NOTE(review): "|" is presumably reserved by the output format
            # as well -- confirm.
            assert "|" not in self.entry and "|" not in self.definition

            self.definition = f"<b>{self.entry}</b> (<i>{self.pos}</i>) {self.definition}"
            # The part-of-speech text was routed to self.pos, which leaves
            # an empty pair of parentheses in the collected definition.
            self.definition = self.definition.replace("()", "")
            self.definition = self.definition.strip()

            self.outputfile.write(f"{self.entry} :: {self.definition}\n")
            self.reading_definition = False
def main():
    """Convert every matching input file into one combined output file.

    Reads each ``OPTED/v003/wb1913_*.html`` file in sorted order, decodes it
    as MacRoman (failing on undecodable bytes), and feeds it through a
    single ``OptedParser`` that writes to ``OPTED.chemnitz`` as UTF-8.
    """
    with open("OPTED.chemnitz", "w", encoding="utf-8") as outputfile:
        opted_parser = OptedParser(outputfile)
        for fname in sorted(glob("OPTED/v003/wb1913_*.html")):
            print(f"Running on {fname}")
            with open(fname, mode="r",
                      encoding="macroman", errors="strict") as inputfile:
                opted_parser.feed(inputfile.read())


if __name__ == "__main__":
    main()