gitweb.fperrin.net Git - gen-quickdic.git/blobdiff - OPTED_to_chemnitz
Convert the OPTED dictionary for use on Tolino devices
[gen-quickdic.git] / OPTED_to_chemnitz
diff --git a/OPTED_to_chemnitz b/OPTED_to_chemnitz
new file mode 100755 (executable)
index 0000000..87b1773
--- /dev/null
@@ -0,0 +1,89 @@
+#!/usr/bin/python3
+
+from html.parser import HTMLParser
+from glob import glob
+
+class OptedParser(HTMLParser):
+    """Turn OPTED HTML entries into ``headword :: definition`` lines.
+
+    The parser reacts to three tags: ``<b>`` wraps the headword, ``<i>``
+    wraps ``pos`` (presumably the part of speech), and ``</p>`` ends the
+    entry and writes one line to ``outputfile``.  Text seen between ``</b>``
+    and ``</p>`` that is not inside ``<i>`` becomes the definition.
+
+    Markup that does not follow this shape raises ``AssertionError``.  The
+    checks are explicit ``raise`` statements rather than ``assert`` so they
+    are not stripped when Python runs with ``-O``, and every message carries
+    the position reported by ``getpos()`` to help locate the bad markup.
+    """
+
+    def __init__(self, outputfile):
+        """Create a parser with no entry in progress.
+
+        Args:
+            outputfile: object with a ``write(str)`` method; it receives one
+                line per completed entry.
+        """
+        super().__init__()
+        self.reading_entry = False  # True between <b> and </b>
+        self.reading_pos = False  # True between <i> and </i>
+        self.reading_definition = False  # True from </b> until </p>
+        self.entry = ""  # lower-cased headword text
+        self.pos = ""  # text found inside <i>...</i>
+        self.definition = ""  # definition text collected outside <i>
+        self.outputfile = outputfile
+
+    def _check(self, condition, problem):
+        """Raise AssertionError describing ``problem`` unless ``condition``."""
+        if not condition:
+            line, column = self.getpos()
+            raise AssertionError(
+                f"malformed OPTED markup at line {line}, column {column}: "
+                f"{problem}")
+
+    def handle_starttag(self, tag, attrs):
+        """Start collecting the headword on ``<b>`` or ``pos`` on ``<i>``.
+
+        Other tags are ignored and ``attrs`` is unused.
+        """
+        if tag == "b":
+            self._check(not self.reading_entry, "<b> inside another <b>")
+            self._check(not self.reading_pos, "<b> inside <i>")
+            self._check(not self.reading_definition,
+                        "<b> before the previous entry was closed by </p>")
+            self._check(not self.entry, "<b> while a headword is pending")
+            self.reading_entry = True
+        elif tag == "i":
+            self._check(not self.reading_entry, "<i> inside <b>")
+            self._check(not self.reading_pos, "<i> inside another <i>")
+            self._check(not self.pos, "second <i> in the same entry")
+            self.reading_pos = True
+
+    def handle_data(self, data):
+        """Append ``data`` to whichever field is currently being read.
+
+        The headword is lower-cased; text seen outside any field is dropped.
+        """
+        if self.reading_entry:
+            self.entry += data.lower()
+        elif self.reading_pos:
+            # Checked before reading_definition: <i> sits inside the
+            # definition, so both flags are set while pos text arrives.
+            self.pos += data
+        elif self.reading_definition:
+            self.definition += data
+
+    def handle_endtag(self, tag):
+        """Close the headword (``</b>``), ``pos`` (``</i>``) or entry (``</p>``)."""
+        if tag == "b":
+            self._check(self.reading_entry, "</b> without <b>")
+            self._check(not self.reading_definition,
+                        "</b> inside a definition")
+            self._check(not self.reading_pos, "</b> inside <i>")
+            self.reading_entry = False
+            # Everything up to </p> now belongs to the definition.
+            self.reading_definition = True
+        elif tag == "i":
+            self._check(not self.reading_entry, "</i> inside <b>")
+            self._check(self.reading_pos, "</i> without <i>")
+            self._check(self.reading_definition,
+                        "<i>...</i> outside a definition")
+            self.reading_pos = False
+        elif tag == "p":
+            self._write_entry()
+
+    def _write_entry(self):
+        """Validate the collected entry, write its line and reset the state."""
+        self._check(not self.reading_entry, "</p> inside <b>")
+        self._check(self.reading_definition,
+                    "</p> without a preceding <b>...</b>")
+        self._check(not self.reading_pos, "</p> inside <i>")
+        self._check(self.entry, "empty headword")
+        self._check(self.definition, "empty definition")
+        # "::" is the separator written between headword and definition;
+        # "|" is rejected too (presumably reserved by the output format).
+        for text in (self.entry, self.definition):
+            self._check("::" not in text, 'text contains "::"')
+            self._check("|" not in text, 'text contains "|"')
+
+        if self.pos:
+            self.definition = f"<b>{self.entry}</b> (<i>{self.pos}</i>) {self.definition}"
+
+        # The parentheses that surrounded <i>...</i> are left behind as
+        # "()" in the raw definition text; drop them.
+        self.definition = self.definition.replace("()", "")
+        self.definition = self.definition.strip()
+
+        self.outputfile.write(f"{self.entry} :: {self.definition}\n")
+        self.reading_definition = False
+        self.entry = ""
+        self.pos = ""
+        self.definition = ""
+def main(input_glob="OPTED/v003/wb1913_*.html", output_path="OPTED.chemnitz"):
+    """Convert every file matching ``input_glob`` into ``output_path``.
+
+    Matching files are read in sorted order as MacRoman text and fed to a
+    single ``OptedParser`` that writes to ``output_path`` (UTF-8).
+
+    Args:
+        input_glob: glob pattern selecting the OPTED HTML files.
+        output_path: file to create or overwrite with the converted lines.
+
+    Raises:
+        FileNotFoundError: if nothing matches ``input_glob``.  Raised before
+            ``output_path`` is opened so an existing output file is not
+            truncated by a run started from the wrong directory.
+    """
+    fnames = sorted(glob(input_glob))
+    if not fnames:
+        raise FileNotFoundError(f"no input files match {input_glob!r}")
+
+    with open(output_path, "w", encoding="utf-8") as outputfile:
+        opted_parser = OptedParser(outputfile)
+        for fname in fnames:
+            print(f"Running on {fname}")
+            with open(fname, mode="r",
+                      encoding="macroman", errors="strict") as inputfile:
+                opted_parser.feed(inputfile.read())
+        # feed() may keep trailing text buffered; close() forces it through.
+        opted_parser.close()
+
+# Run the conversion only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()