X-Git-Url: http://gitweb.fperrin.net/?p=gen-quickdic.git;a=blobdiff_plain;f=OPTED_to_chemnitz;fp=OPTED_to_chemnitz;h=87b1773a3325d3fb028d5c4a90622965fe8b4332;hp=0000000000000000000000000000000000000000;hb=5da87a9ec2370bb2f7ce11e107f07625e42f7171;hpb=5598132b30a07363a5b2d1b9966c761150614158
diff --git a/OPTED_to_chemnitz b/OPTED_to_chemnitz
new file mode 100755
index 0000000..87b1773
--- /dev/null
+++ b/OPTED_to_chemnitz
@@ -0,0 +1,89 @@
+#!/usr/bin/python3
+
+from html.parser import HTMLParser
+from glob import glob
+
class OptedParser(HTMLParser):
    """Turn OPTED dictionary HTML into ``entry :: definition`` lines.

    While HTML is fed in, text inside ``<b>`` is collected (lower-cased) as
    the entry, text inside ``<i>`` as the part of speech, and all other text
    seen between ``</b>`` and ``</p>`` as the definition.  Each ``</p>``
    writes one line to ``outputfile`` and clears the per-entry state.

    Input that does not follow this shape raises ``AssertionError``.  The
    checks are explicit ``raise`` statements rather than ``assert`` so that
    they keep guarding the output when Python runs with ``-O``.
    """

    def __init__(self, outputfile):
        """Initialise an idle parser.

        Args:
            outputfile: Object with a ``write(str)`` method; it receives one
                line of text per completed entry.
        """
        super().__init__()
        # Which part of an entry the incoming text currently belongs to.
        self.reading_entry = False
        self.reading_pos = False
        self.reading_definition = False
        # Text accumulated so far for the entry being parsed.
        self.entry = ""
        self.pos = ""
        self.definition = ""
        self.outputfile = outputfile

    def _require(self, condition, message):
        """Raise ``AssertionError`` unless *condition* is truthy.

        The message is prefixed with the position reported by ``getpos()``
        to help locate the offending markup.
        """
        if not condition:
            line, column = self.getpos()
            raise AssertionError(f"line {line}, column {column}: {message}")

    def handle_starttag(self, tag, attrs):
        """Begin reading the entry on ``<b>`` or the part of speech on ``<i>``.

        All other tags are ignored.

        Raises:
            AssertionError: If the tag opens while another field is being
                read, or while data from an unfinished entry is pending.
        """
        if tag == "b":
            self._require(not self.reading_entry, "<b> inside <b>")
            self._require(not self.reading_pos, "<b> inside <i>")
            self._require(not self.reading_definition,
                          "<b> before the previous entry was closed by </p>")
            self._require(not self.entry, "<b> while an entry is pending")

            self.reading_entry = True

        elif tag == "i":
            self._require(not self.reading_entry, "<i> inside <b>")
            self._require(not self.reading_pos, "<i> inside <i>")
            self._require(not self.pos,
                          "second <i> element within one entry")

            self.reading_pos = True

    def handle_data(self, data):
        """Append *data* to the field currently being read.

        Entry text is lower-cased.  While ``<i>`` is open its text goes to
        the part of speech rather than the definition.  Text seen while no
        field is being read is dropped.
        """
        if self.reading_entry:
            self.entry += data.lower()
        elif self.reading_pos:
            self.pos += data
        elif self.reading_definition:
            self.definition += data

    def handle_endtag(self, tag):
        """Close the current field; on ``</p>`` write the finished entry.

        Raises:
            AssertionError: If the closing tag does not match the parser
                state, if the entry or definition is empty at ``</p>``, or
                if either of them contains ``::`` or ``|``.
        """
        if tag == "b":
            self._require(self.reading_entry, "</b> without <b>")
            self._require(not self.reading_definition,
                          "</b> while reading a definition")
            self._require(not self.reading_pos, "</b> inside <i>")

            # Everything after the headword belongs to the definition.
            self.reading_entry = False
            self.reading_definition = True

        elif tag == "i":
            self._require(not self.reading_entry, "</i> inside <b>")
            self._require(self.reading_pos, "</i> without <i>")
            self._require(self.reading_definition,
                          "</i> outside of a definition")

            self.reading_pos = False

        elif tag == "p":
            self._require(not self.reading_entry, "</p> inside <b>")
            self._require(self.reading_definition,
                          "</p> without a preceding <b>...</b>")
            self._require(not self.reading_pos, "</p> inside <i>")
            self._require(self.entry, "</p> with an empty entry")
            self._require(self.definition, "</p> with an empty definition")

            # "::" separates entry and definition in the line written below.
            # "|" is rejected too -- presumably the target format reserves
            # it as well (TODO confirm).
            for separator in ("::", "|"):
                self._require(
                    separator not in self.entry
                    and separator not in self.definition,
                    f"{separator!r} found in entry {self.entry!r}",
                )

            definition = self.definition
            if self.pos:
                definition = f"{self.entry} ({self.pos}) {definition}"

            # Drop empty "()" pairs -- presumably the parentheses that
            # surrounded the <i> element in the source -- then trim.
            definition = definition.replace("()", "").strip()

            self.outputfile.write(f"{self.entry} :: {definition}\n")

            # Reset the per-entry state for the next <b>.
            self.reading_definition = False
            self.entry = ""
            self.pos = ""
            self.definition = ""
+
def main():
    """Convert the OPTED HTML pages into a single ``OPTED.chemnitz`` file.

    Every file matching ``OPTED/v003/wb1913_*.html`` is read, in sorted
    filename order, with the ``macroman`` codec (strict error handling) and
    fed to one ``OptedParser`` that writes to ``OPTED.chemnitz`` as UTF-8.
    """
    with open("OPTED.chemnitz", "w", encoding="utf-8") as outputfile:
        opted_parser = OptedParser(outputfile)
        for fname in sorted(glob("OPTED/v003/wb1913_*.html")):
            print(f"Running on {fname}")
            with open(fname, mode="r",
                      encoding="macroman", errors="strict") as inputfile:
                opted_parser.feed(inputfile.read())
        # HTMLParser may hold back trailing input until it is told that no
        # more data will arrive; close() forces it to be processed while
        # the output file is still open.
        opted_parser.close()


if __name__ == "__main__":
    main()