]> gitweb.fperrin.net Git - gen-quickdic.git/blob - OPTED_to_tab_separated
GCIDE conversion
[gen-quickdic.git] / OPTED_to_tab_separated
1 #!/usr/bin/python3
2
3 from html.parser import HTMLParser
4 from glob import glob
5
class OptedParser(HTMLParser):
    """Collect dictionary entries from OPTED HTML into ``self.entries``.

    The parser is a small state machine driven by three tags:

    * ``<b>`` ... ``</b>`` holds the headword (stored lower-cased);
    * ``<i>`` ... ``</i>`` holds the part of speech;
    * any other text between ``</b>`` and ``</p>`` is the definition;
    * ``</p>`` closes the entry and stores it.

    ``entries`` maps each headword to the list of its definitions, in the
    order they were parsed.  One parser instance may be fed several files
    in a row; entries accumulate.

    Markup that does not follow this structure raises ``AssertionError``.

    NOTE(review): ``HTMLParser`` decodes character references (``&amp;``
    and friends) before ``handle_data`` sees them, and the decoded text is
    written back out inside HTML markup without re-escaping -- confirm the
    consumer of the output tolerates that.
    """

    # Tabs and line breaks inside a field would split or shift a record in
    # the tab-separated output, so they are mapped to plain spaces before
    # an entry is stored.
    _FIELD_BREAKERS = str.maketrans({"\t": " ", "\n": " ", "\r": " "})

    def __init__(self):
        super().__init__()
        # Which part of the current entry handle_data() is collecting.
        self.reading_headword = False
        self.reading_pos = False
        self.reading_definition = False
        # Text accumulated so far for the entry being parsed.
        self.headword = ""
        self.pos = ""
        self.definition = ""
        # headword -> list of finished definitions.
        self.entries = {}

    def _expect(self, condition, message):
        """Raise ``AssertionError`` with the parser position if *condition* is false.

        This replaces bare ``assert`` statements, which ``python -O``
        removes; the exception type is kept so that existing callers see
        the same failure as before.
        """
        if not condition:
            line, column = self.getpos()
            raise AssertionError(f"{message} (at line {line}, column {column})")

    def _add_definition(self, headword, definition):
        """Append *definition* to the list of definitions of *headword*."""
        self.entries.setdefault(headword, []).append(definition)

    def write_definitions(self, outputfile):
        """Write one ``headword<TAB>definition`` line per headword.

        A headword with a single definition gets that definition as is; a
        headword with several gets them wrapped in an ``<ol>`` list.
        Headwords containing a space or a dash are reported on stdout.

        *outputfile* only needs a ``write(str)`` method.
        """
        for headword, definitions in self.entries.items():
            if len(definitions) == 1:
                outputfile.write(f"{headword}\t{definitions[0]}\n")
            else:
                items = "".join(f"<li>{entry}</li>" for entry in definitions)
                outputfile.write(f"{headword}\t<ol>{items}</ol>\n")
            if " " in headword or "-" in headword:
                print(f"<{headword}> has space or dash")

    def handle_starttag(self, tag, attrs):
        """Start collecting a headword on ``<b>`` or a part of speech on ``<i>``."""
        if tag == "b":
            self._expect(not self.reading_headword, "<b> inside a headword")
            self._expect(not self.reading_pos, "<b> inside a part of speech")
            self._expect(not self.reading_definition, "<b> inside a definition")
            self._expect(not self.headword,
                         "<b> while a previous headword is still pending")

            self.reading_headword = True

        elif tag == "i":
            self._expect(not self.reading_headword, "<i> inside a headword")
            self._expect(not self.reading_pos, "<i> inside a part of speech")
            self._expect(not self.pos,
                         "second part of speech in the same entry")

            self.reading_pos = True

    def handle_data(self, data):
        """Append *data* to whichever part of the entry is being read."""
        if self.reading_headword:
            self.headword += data.lower()
        elif self.reading_pos:
            # Checked before reading_definition: <i> occurs inside the
            # definition, so both flags are set while a POS is being read.
            self.pos += data
        elif self.reading_definition:
            self.definition += data

    def handle_endtag(self, tag):
        """Advance the state machine; ``</p>`` stores the finished entry."""
        if tag == "b":
            self._expect(self.reading_headword, "</b> without a matching <b>")
            self._expect(not self.reading_definition,
                         "</b> inside a definition")
            self._expect(not self.reading_pos, "</b> inside a part of speech")

            # Everything after the headword belongs to the definition.
            self.reading_headword = False
            self.reading_definition = True

        elif tag == "i":
            self._expect(not self.reading_headword, "</i> inside a headword")
            self._expect(self.reading_pos, "</i> without a matching <i>")
            self._expect(self.reading_definition,
                         "</i> outside of a definition")

            self.reading_pos = False

        elif tag == "p":
            self._expect(not self.reading_headword, "</p> inside a headword")
            self._expect(self.reading_definition,
                         "</p> without a preceding headword")
            self._expect(not self.reading_pos, "</p> inside a part of speech")
            self._expect(self.headword, "entry with an empty headword")
            self._expect(self.definition, "entry with an empty definition")

            definition = self.definition
            if self.pos:
                definition = f"(<i>{self.pos}</i>) {definition}"
            # The part of speech was captured separately, which leaves the
            # parentheses that surrounded it empty in the definition text.
            definition = definition.replace("()", "")
            # Keep every record on one line with exactly one tab separator.
            definition = definition.translate(self._FIELD_BREAKERS).strip()
            headword = self.headword.translate(self._FIELD_BREAKERS)

            self._expect("\\" not in headword, f"\\ for word {headword}")
            if any(digit in headword for digit in "0123456789"):
                print(f"Warning: {headword} has digits")

            self._add_definition(headword, definition)

            # Reset for the next entry.
            self.reading_definition = False
            self.headword = ""
            self.pos = ""
            self.definition = ""
95
def main():
    """Convert the OPTED HTML files into a single tab-separated file.

    Every file matching ``OPTED/v003/wb1913_*.html`` is read (in sorted
    name order) and fed to one shared ``OptedParser``; the collected
    entries are then written to ``OPTED.tab_separated``.
    """
    converter = OptedParser()

    source_paths = glob("OPTED/v003/wb1913_*.html")
    source_paths.sort()
    for path in source_paths:
        print(f"Running on {path}")
        # errors="strict": a byte that cannot be decoded aborts the run
        # instead of being silently replaced.
        with open(path, mode="r",
                  encoding="macroman", errors="strict") as source:
            markup = source.read()
        converter.feed(markup)

    with open("OPTED.tab_separated", "w", encoding="utf-8") as sink:
        converter.write_definitions(sink)


if __name__ == "__main__":
    main()