from glob import glob
from html.parser import HTMLParser
class OptedParser(HTMLParser):
    """Collect headword/definition pairs from dictionary HTML.

    The parser is a small state machine driven by three flags
    (``reading_headword``, ``reading_pos``, ``reading_definition``).  Text is
    routed to ``headword``, ``pos`` or ``definition`` depending on which flag
    is set, and a finished entry is appended to ``entries`` (a dict mapping a
    lower-cased headword to the list of its definitions).

    NOTE(review): the excerpt this class was restored from had lost its
    indentation and several short lines.  The tag names below, the body of
    ``__init__`` beyond the three ``reading_*`` flags, the ``pos``
    accumulation in ``handle_data`` and the state reset at the end of an
    entry are reconstructed from how the surviving lines use them -- confirm
    against the original file.  The assumed markup for one entry is
    ``<p><b>headword</b> (<i>part of speech</i>) definition</p>``.
    """

    # NOTE(review): reconstructed tag names -- TODO confirm.
    _HEADWORD_TAG = "b"
    _POS_TAG = "i"
    _ENTRY_TAG = "p"

    def __init__(self):
        """Initialise the HTML parser and an empty parsing state."""
        super().__init__()
        self.reading_headword = False
        self.reading_pos = False
        self.reading_definition = False
        # Text accumulated for the entry currently being parsed.
        self.headword = ""
        self.pos = ""
        self.definition = ""
        # headword -> list of definition strings, in order of appearance.
        self.entries = {}

    def _add_definition(self, headword, definition):
        """Append ``definition`` to the list stored under ``headword``."""
        self.entries.setdefault(headword, []).append(definition)

    def write_definitions(self, outputfile):
        """Write one tab-separated line per headword to ``outputfile``.

        A headword with exactly one definition is written as
        ``headword<TAB>definition``; otherwise the definitions are wrapped in
        an ``<ol>`` list with one ``<li>`` per definition.  Headwords that
        contain a space or a dash are additionally reported on stdout.

        Args:
            outputfile: any object with a ``write(str)`` method.
        """
        for headword, definitions in self.entries.items():
            if len(definitions) == 1:
                outputfile.write(f"{headword}\t{definitions[0]}\n")
            else:
                outputfile.write(f"{headword}\t<ol>")
                for entry in definitions:
                    outputfile.write(f"<li>{entry}</li>")
                outputfile.write("</ol>\n")
            # NOTE(review): assumed to run for every headword (loop level),
            # not only for multi-definition ones -- confirm.
            if " " in headword or "-" in headword:
                print(f"<{headword}> has space or dash")

    def handle_starttag(self, tag, attrs):
        """Enter the headword or part-of-speech state on the matching tag.

        The assertions check that the parser is not already inside a
        conflicting state when the tag opens.
        """
        if tag == self._HEADWORD_TAG:
            assert not self.reading_headword
            assert not self.reading_pos
            assert not self.reading_definition
            assert not self.headword
            self.reading_headword = True
        elif tag == self._POS_TAG:
            assert not self.reading_headword
            assert not self.reading_pos
            self.reading_pos = True

    def handle_data(self, data):
        """Route text to the headword, part of speech or definition."""
        if self.reading_headword:
            self.headword += data.lower()
        elif self.reading_pos:
            # Checked before reading_definition: both flags are set while
            # inside the part-of-speech tag (see handle_endtag assertions).
            # NOTE(review): reconstructed line -- self.pos is read when the
            # entry is finalised, so it has to be collected here.
            self.pos += data
        elif self.reading_definition:
            self.definition += data

    def handle_endtag(self, tag):
        """Leave the current state; on the entry tag, store the definition.

        Closing the headword tag switches to reading the definition.  Closing
        the part-of-speech tag only clears ``reading_pos``.  Closing the
        entry tag prefixes the definition with the italicised part of speech,
        removes the now-empty ``()`` left in the text, strips it, stores it
        under the headword and resets the per-entry state.
        """
        if tag == self._HEADWORD_TAG:
            assert self.reading_headword
            assert not self.reading_definition
            assert not self.reading_pos
            self.reading_headword = False
            self.reading_definition = True
        elif tag == self._POS_TAG:
            assert not self.reading_headword
            assert self.reading_pos
            assert self.reading_definition
            self.reading_pos = False
        elif tag == self._ENTRY_TAG:
            assert not self.reading_headword
            assert self.reading_definition
            assert not self.reading_pos
            assert self.definition

            self.definition = f"(<i>{self.pos}</i>) {self.definition}"
            self.definition = self.definition.replace("()", "")
            self.definition = self.definition.strip()

            assert "\\" not in self.headword, f"\\ for word {self.headword}"
            if any(char in "0123456789" for char in self.headword):
                print(f"Warning: {self.headword} has digits")

            self._add_definition(self.headword, self.definition)
            self.reading_definition = False
            # NOTE(review): reconstructed reset.  handle_starttag asserts
            # that headword is empty when the next entry begins, so the
            # per-entry buffers must be cleared here.
            self.headword = ""
            self.pos = ""
            self.definition = ""
def main():
    """Parse every matching dictionary page and write the merged output.

    Feeds each file matching ``OPTED/v003/wb1913_*.html`` (in sorted order,
    decoded as Mac Roman with strict error handling) into a single
    ``OptedParser``, then writes the collected definitions to
    ``OPTED.tab_separated`` as UTF-8.

    NOTE(review): the ``def main():`` header and the call under the
    ``__main__`` guard were missing from the excerpt and are reconstructed.
    """
    opted_parser = OptedParser()
    for fname in sorted(glob("OPTED/v003/wb1913_*.html")):
        print(f"Running on {fname}")
        with open(fname, mode="r",
                  encoding="macroman", errors="strict") as inputfile:
            opted_parser.feed(inputfile.read())
    # Flush any text the parser is still buffering after the last file.
    opted_parser.close()

    with open("OPTED.tab_separated", "w", encoding="utf-8") as outputfile:
        opted_parser.write_definitions(outputfile)


if __name__ == "__main__":
    main()