]> gitweb.fperrin.net Git - gen-quickdic.git/blob - OPTED_to_chemnitz
GCIDE conversion
[gen-quickdic.git] / OPTED_to_chemnitz
1 #!/usr/bin/python3
2
3 from html.parser import HTMLParser
4 from glob import glob
5
class OptedParser(HTMLParser):
    """Convert OPTED dictionary HTML into "entry :: definition" lines.

    The parser reacts to three tags: the text inside ``<b>`` is collected as
    the (lower-cased) headword, the text inside ``<i>`` as the part of speech,
    and any other text between ``</b>`` and ``</p>`` as the definition.  On
    every ``</p>`` one line is written to the output stream and the state is
    cleared for the next entry.

    The markup is validated while parsing.  The checks are explicit ``raise``
    statements rather than ``assert`` statements so that they are still
    enforced when Python runs with ``-O``; the exception type remains
    ``AssertionError`` so existing behaviour on malformed input is unchanged.
    """

    def __init__(self, outputfile):
        """Create a parser that writes converted entries to *outputfile*.

        Args:
            outputfile: Writable text stream; only ``write()`` is called on it.
        """
        super().__init__()
        # Flags telling handle_data() which piece of the entry it is reading.
        self.reading_entry = False
        self.reading_pos = False
        self.reading_definition = False
        # Text accumulated so far for the entry in progress.
        self.entry = ""
        self.pos = ""
        self.definition = ""
        self.outputfile = outputfile

    def _require(self, condition, message):
        """Raise ``AssertionError`` describing *message* unless *condition* holds.

        The current headword is appended to the message so that a failure can
        be located in the input.
        """
        if not condition:
            raise AssertionError(f"{message} (current entry: {self.entry!r})")

    def handle_starttag(self, tag, attrs):
        """Start collecting a headword on ``<b>`` or a part of speech on ``<i>``.

        Other tags are ignored.

        Raises:
            AssertionError: If the tag appears in an unexpected position.
        """
        if tag == "b":
            self._require(not self.reading_entry,
                          "<b> opened inside another <b>")
            self._require(not self.reading_pos,
                          "<b> opened inside <i>")
            self._require(not self.reading_definition,
                          "<b> opened before the previous entry was closed")
            self._require(not self.entry,
                          "<b> opened while a headword is already pending")

            self.reading_entry = True

        elif tag == "i":
            self._require(not self.reading_entry,
                          "<i> opened inside <b>")
            self._require(not self.reading_pos,
                          "<i> opened inside another <i>")
            self._require(not self.pos,
                          "more than one <i> in the same entry")

            self.reading_pos = True

    def handle_data(self, data):
        """Append *data* to whichever field is currently being read.

        Headword text is lower-cased; text seen outside of an entry is dropped.
        """
        if self.reading_entry:
            self.entry += data.lower()
        elif self.reading_pos:
            self.pos += data
        elif self.reading_definition:
            self.definition += data

    def handle_endtag(self, tag):
        """Finish a headword, a part of speech, or (on ``</p>``) a whole entry.

        Raises:
            AssertionError: If the tag appears in an unexpected position, if
                the finished entry has no headword or no definition, or if
                either contains the ``::`` or ``|`` separator.
        """
        if tag == "b":
            self._require(self.reading_entry,
                          "</b> without a matching <b>")
            self._require(not self.reading_definition,
                          "</b> while already reading a definition")
            self._require(not self.reading_pos,
                          "</b> inside <i>")

            # Everything after the headword belongs to the definition.
            self.reading_entry = False
            self.reading_definition = True

        elif tag == "i":
            self._require(not self.reading_entry,
                          "</i> inside <b>")
            self._require(self.reading_pos,
                          "</i> without a matching <i>")
            self._require(self.reading_definition,
                          "</i> outside of a definition")

            self.reading_pos = False

        elif tag == "p":
            self._require(not self.reading_entry,
                          "</p> inside <b>")
            self._require(self.reading_definition,
                          "</p> without a preceding headword")
            self._require(not self.reading_pos,
                          "</p> inside <i>")
            self._require(self.entry, "empty headword")
            self._require(self.definition, "empty definition")

            # "::" separates headword and definition in the output line, so it
            # must not occur in the data; "|" is rejected as well (presumably
            # also reserved by the output format -- confirm with its consumer).
            self._require(
                "::" not in self.entry and "::" not in self.definition,
                'headword or definition contains "::"')
            self._require(
                "|" not in self.entry and "|" not in self.definition,
                'headword or definition contains "|"')

            if self.pos:
                self.definition = f"<b>{self.entry}</b> (<i>{self.pos}</i>) {self.definition}"

            # Drop empty parentheses (presumably what remains in the text once
            # the part of speech has been lifted out of them).
            self.definition = self.definition.replace("()", "")
            self.definition = self.definition.strip()

            self.outputfile.write(f"{self.entry} :: {self.definition}\n")

            # Reset for the next entry.
            self.reading_definition = False
            self.entry = ""
            self.pos = ""
            self.definition = ""
78
def main():
    """Convert all OPTED letter files into a single ``OPTED.chemnitz`` file.

    Every file matching ``OPTED/v003/wb1913_*.html`` is read (decoded as
    Mac Roman, failing on undecodable bytes) in sorted order and fed to one
    shared ``OptedParser`` that writes to ``OPTED.chemnitz`` in UTF-8.
    """
    with open("OPTED.chemnitz", "w", encoding="utf-8") as outputfile:
        opted_parser = OptedParser(outputfile)
        for fname in sorted(glob("OPTED/v003/wb1913_*.html")):
            print(f"Running on {fname}")
            with open(fname, mode="r",
                      encoding="macroman", errors="strict") as inputfile:
                opted_parser.feed(inputfile.read())
        # HTMLParser may hold back trailing input until told that no more is
        # coming; flush it while the output file is still open.
        opted_parser.close()
87
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()