#!/usr/bin/python3
# X-Git-Url: http://gitweb.fperrin.net/?p=gen-quickdic.git;a=blobdiff_plain;f=OPTED_to_chemnitz;fp=OPTED_to_chemnitz;h=87b1773a3325d3fb028d5c4a90622965fe8b4332;hp=0000000000000000000000000000000000000000;hb=5da87a9ec2370bb2f7ce11e107f07625e42f7171;hpb=5598132b30a07363a5b2d1b9966c761150614158
"""Convert OPTED HTML pages into a ``headword :: definition`` text file.

Each entry in the input is expected to look like::

    <P><B>Headword</B> (<I>part of speech</I>) Definition text</P>

and is written to the output as one line::

    headword :: headword (part of speech) Definition text
"""

from html.parser import HTMLParser
from glob import glob


class OptedFormatError(AssertionError):
    """Raised when the HTML does not follow the expected entry layout.

    Derives from AssertionError because these checks used to be plain
    ``assert`` statements: handlers written for those keep working, while
    the checks themselves are no longer stripped by ``python -O``.
    """


class OptedParser(HTMLParser):
    """Streaming parser that turns ``<b>``/``<i>``/``<p>`` entries into lines.

    The parser is a small state machine driven by three flags:

    * ``reading_entry``      -- inside ``<b>...</b>`` (the headword);
    * ``reading_pos``        -- inside ``<i>...</i>`` (the part of speech);
    * ``reading_definition`` -- between ``</b>`` and ``</p>``.

    One line is written to ``outputfile`` every time a ``</p>`` closes an
    entry. Any deviation from the expected tag order raises
    ``OptedFormatError``.
    """

    def __init__(self, outputfile):
        """Create a parser that writes finished entries to *outputfile*.

        *outputfile* only needs a ``write(str)`` method.
        """
        super().__init__()
        self.reading_entry = False
        self.reading_pos = False
        self.reading_definition = False
        self.entry = ""
        self.pos = ""
        self.definition = ""
        self.outputfile = outputfile

    def _require(self, condition, message):
        """Raise OptedFormatError describing *message* unless *condition*."""
        if not condition:
            line, column = self.getpos()
            raise OptedFormatError(
                f"{message} (line {line}, column {column}, "
                f"entry={self.entry!r}, pos={self.pos!r})"
            )

    def handle_starttag(self, tag, attrs):
        """Enter headword mode on ``<b>`` and part-of-speech mode on ``<i>``.

        Other tags are ignored; *attrs* is not used.
        """
        if tag == "b":
            self._require(not self.reading_entry, "nested <b>")
            self._require(not self.reading_pos, "<b> inside <i>")
            self._require(not self.reading_definition,
                          "<b> before the previous entry was closed")
            self._require(not self.entry, "<b> with a pending headword")

            self.reading_entry = True

        elif tag == "i":
            self._require(not self.reading_entry, "<i> inside <b>")
            self._require(not self.reading_pos, "nested <i>")
            self._require(not self.pos,
                          "second <i> within a single entry")

            self.reading_pos = True

    def handle_data(self, data):
        """Append text to whichever field is currently being read.

        The headword is lower-cased. The part of speech takes priority over
        the definition because ``<i>`` occurs while the definition flag is
        already set. Text outside any entry is dropped.
        """
        if self.reading_entry:
            self.entry += data.lower()
        elif self.reading_pos:
            self.pos += data
        elif self.reading_definition:
            self.definition += data

    def handle_endtag(self, tag):
        """Advance the state machine; ``</p>`` writes the finished entry."""
        if tag == "b":
            self._require(self.reading_entry, "</b> without <b>")
            self._require(not self.reading_definition,
                          "</b> while reading a definition")
            self._require(not self.reading_pos, "</b> inside <i>")

            # Everything after the headword belongs to the definition.
            self.reading_entry = False
            self.reading_definition = True

        elif tag == "i":
            self._require(not self.reading_entry, "</i> inside <b>")
            self._require(self.reading_pos, "</i> without <i>")
            self._require(self.reading_definition,
                          "</i> outside of an entry")

            self.reading_pos = False

        elif tag == "p":
            self._emit_entry()

    def _emit_entry(self):
        """Validate the collected fields, write one line, reset the state."""
        self._require(not self.reading_entry, "</p> inside <b>")
        self._require(self.reading_definition, "</p> without an entry")
        self._require(not self.reading_pos, "</p> inside <i>")
        self._require(self.entry, "empty headword")
        self._require(self.definition, "empty definition")

        # "::" and "|" are rejected in the fields themselves; "::" is the
        # headword/definition separator written below.
        self._require(
            "::" not in self.entry and "::" not in self.definition,
            'field contains "::"',
        )
        self._require(
            "|" not in self.entry and "|" not in self.definition,
            'field contains "|"',
        )

        text = self.definition
        if self.pos:
            text = f"{self.entry} ({self.pos}) {text}"

        # The <i> element sat between literal parentheses in the source, so
        # the definition text still holds an empty "()" placeholder.
        text = text.replace("()", "").strip()

        self.outputfile.write(f"{self.entry} :: {text}\n")
        self.reading_definition = False
        self.entry = ""
        self.pos = ""
        self.definition = ""

    def close(self):
        """Flush buffered input and reject an entry cut off by end of input."""
        super().close()
        self._require(
            not (self.reading_entry or self.reading_pos
                 or self.reading_definition),
            "input ended in the middle of an entry",
        )


def main(pattern="OPTED/v003/wb1913_*.html", output_path="OPTED.chemnitz"):
    """Convert every file matching *pattern* into *output_path*.

    Input files are processed in sorted name order and decoded as MacRoman
    with strict error handling; the output is written as UTF-8. The defaults
    are the paths the script has always used.
    """
    with open(output_path, "w", encoding="utf-8") as outputfile:
        for fname in sorted(glob(pattern)):
            print(f"Running on {fname}")
            with open(fname, mode="r",
                      encoding="macroman", errors="strict") as inputfile:
                # One parser per file, closed at the end, so buffered data is
                # flushed and a truncated file is reported against that file.
                opted_parser = OptedParser(outputfile)
                opted_parser.feed(inputfile.read())
                opted_parser.close()


if __name__ == "__main__":
    main()