#!/usr/bin/python3
# Provenance: gen-quickdic.git, file OPTED_to_tab_separated (blob 479b3f0),
# retrieved through http://gitweb.fperrin.net/?p=gen-quickdic.git
"""Convert the OPTED HTML dictionary pages into one tab-separated file.

Each dictionary entry is expected to look like::

    <p><b>headword</b> (<i>part of speech</i>) definition text</p>

The headwords are lower-cased, every definition found for the same headword
is grouped together, and the result is written as one line per headword:
``headword<TAB>definition``.
"""

from glob import glob
from html.parser import HTMLParser


class OptedParser(HTMLParser):
    """Collect ``headword -> [definitions]`` from OPTED-style HTML.

    The parser is a small state machine: text inside ``<b>`` is the headword,
    text inside ``<i>`` is the part of speech, and every other piece of text
    between ``</b>`` and ``</p>`` belongs to the definition.  The ``assert``
    statements check that the markup follows this shape; an
    ``AssertionError`` means the input deviates from it (note that asserts
    are disabled when Python runs with ``-O``).
    """

    def __init__(self):
        super().__init__()
        # Which field the text currently being received belongs to.
        self.reading_headword = False
        self.reading_pos = False
        self.reading_definition = False
        # Accumulators for the entry currently being parsed.
        self.headword = ""
        self.pos = ""
        self.definition = ""
        # Finished entries: lower-cased headword -> list of definitions,
        # in the order they were encountered.
        self.entries = {}

    def _add_definition(self, headword, definition):
        """Append *definition* to the list kept for *headword*."""
        self.entries.setdefault(headword, []).append(definition)

    def write_definitions(self, outputfile):
        """Write every collected entry to *outputfile*, one line per headword.

        A headword with a single definition is written as
        ``headword<TAB>definition``.  A headword with several definitions
        gets them wrapped in an HTML ordered list on the same line.
        *outputfile* only needs a ``write(str)`` method.

        A notice is printed for every headword containing a space or a dash.
        """
        for headword, definitions in self.entries.items():
            if len(definitions) == 1:
                outputfile.write(f"{headword}\t{definitions[0]}\n")
            else:
                # NOTE(review): the <ol>/<li> markup was stripped from the
                # copy this file was recovered from and has been
                # reconstructed -- confirm against the repository.
                items = "".join(f"<li>{entry}</li>" for entry in definitions)
                outputfile.write(f"{headword}\t<ol>{items}</ol>\n")
            if " " in headword or "-" in headword:
                print(f"<{headword}> has space or dash")

    def handle_starttag(self, tag, attrs):
        """Enter headword mode on ``<b>`` and part-of-speech mode on ``<i>``.

        All other start tags are ignored.
        """
        if tag == "b":
            # A new entry must not start while another one is in progress.
            assert not self.reading_headword
            assert not self.reading_pos
            assert not self.reading_definition
            assert not self.headword

            self.reading_headword = True

        elif tag == "i":
            # Only one part-of-speech element is expected per entry.
            assert not self.reading_headword
            assert not self.reading_pos
            assert not self.pos

            self.reading_pos = True

    def handle_data(self, data):
        """Route text to the field currently being read.

        The part of speech takes precedence over the definition because
        ``<i>`` is nested inside the definition part of the entry.  Text
        received while no field is being read is dropped.
        """
        if self.reading_headword:
            self.headword += data.lower()
        elif self.reading_pos:
            self.pos += data
        elif self.reading_definition:
            self.definition += data

    def handle_endtag(self, tag):
        """Leave the current mode; ``</p>`` completes and stores the entry."""
        if tag == "b":
            assert self.reading_headword
            assert not self.reading_definition
            assert not self.reading_pos

            # Everything after the headword is part of the definition.
            self.reading_headword = False
            self.reading_definition = True

        elif tag == "i":
            assert not self.reading_headword
            assert self.reading_pos
            assert self.reading_definition

            self.reading_pos = False

        elif tag == "p":
            self._finish_entry()

    def _finish_entry(self):
        """Normalise the entry being parsed, store it and reset the state."""
        assert not self.reading_headword
        assert self.reading_definition
        assert not self.reading_pos
        assert self.headword
        assert self.definition

        if self.pos:
            self.definition = f"({self.pos}) {self.definition}"
        # The part of speech was captured separately, which leaves its
        # now-empty parentheses behind in the definition text.
        self.definition = self.definition.replace("()", "").strip()

        assert "\\" not in self.headword, f"\\ for word {self.headword}"
        if any(digit in self.headword for digit in "0123456789"):
            print(f"Warning: {self.headword} has digits")

        self._add_definition(self.headword, self.definition)
        self.reading_definition = False
        self.headword = ""
        self.pos = ""
        self.definition = ""


def main():
    """Parse ``OPTED/v003/wb1913_*.html`` and write ``OPTED.tab_separated``.

    Both paths are relative to the current working directory.  The input
    files are decoded as MacRoman; an undecodable byte raises an error.
    """
    opted_parser = OptedParser()
    # glob() returns paths in arbitrary order; sorting keeps the order of
    # the entries in the output file reproducible from run to run.
    for fname in sorted(glob("OPTED/v003/wb1913_*.html")):
        print(f"Running on {fname}")
        with open(fname, mode="r",
                  encoding="macroman", errors="strict") as inputfile:
            opted_parser.feed(inputfile.read())

    with open("OPTED.tab_separated", "w", encoding="utf-8") as outputfile:
        opted_parser.write_definitions(outputfile)


if __name__ == "__main__":
    main()