From c1db997ca48690b7e1405e67f887be60567c6b3e Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fr=C3=A9d=C3=A9ric=20Perrin?= Date: Sat, 13 Feb 2021 09:58:14 +0000 Subject: [PATCH] GCIDE conversion --- GCIDE_to_tab_separated | 362 ++++++++++++++++++++++++++++++++++++++--- Makefile | 4 +- 2 files changed, 338 insertions(+), 28 deletions(-) diff --git a/GCIDE_to_tab_separated b/GCIDE_to_tab_separated index 99a4973..9c861f3 100755 --- a/GCIDE_to_tab_separated +++ b/GCIDE_to_tab_separated @@ -2,17 +2,21 @@ import lxml.etree import re +from glob import glob +import unicodedata entity_map = { - "
", + "
", "&": "&", "": "”", # close double quote "": "“", # open double quotes "2", + "<3dot/": "3", + # only used as an example in the entry for progression + "2, 4, 6, 8, 1010, 8, 6, 4, 22, 4, 8, 16, 32, 6464, 32, 16, 8, 4, 2", replace_fake_comments, rawtext, flags=re.DOTALL) - rawtext = f"" + rawtext + f"" - for entity, char in entity_map.items(): - rawtext = rawtext.replace(entity, char) - print(rawtext.splitlines()[5724:5730]) - e = lxml.etree.XML(rawtext) +re_unknownentity = re.compile(r"<[a-z0-9]+/") +re_sources = re.compile(r"\[(.*?)\]") +re_pos = re.compile(r"(.*?)") +re_headword = re.compile(r"(.*)") +re_subsense = re.compile(r"[0-9.]+") +re_altspelling = re.compile(r"(.+?)") +re_greekwords = re.compile(r"(.+?)") +skipped_hw = re.compile(r"^[0-9/]+(st|nd|rd|th)?$") +re_simplefraction = re.compile(r"(.*?)") + +# delete the tags and it's content +delete_tags = [re.compile(r".*?,?\.?"), + re.compile(r".*?,?\.?"), + re.compile(r".*?,?"), + re.compile(r".*?"), + re.compile(r".*?

"), # blah BUG song not closed + re.compile(r""), + re.compile(r".*?"), + re.compile(r'\(hō"m or ") \ + .replace(" and ") + text = text.replace(pos_part_match.group(0), f"{pos_part}") + text = text.replace("", f"<{formattag}>") \ + .replace(f"", f"") + else: + text = text.replace(f"<{tag}>", "") \ + .replace(f"", "") + + text = text \ + .replace("☞", "☞").replace(" ☞", "") \ + .replace("", "☞ ").replace("", "") \ + .replace("", "(").replace("", ")") \ + .replace("(", "(").replace(")", ")") \ + .replace("", "").replace("", "") \ + .replace("", '') \ + .replace("", "") \ + .replace("", '') \ + .replace("", "") \ + .replace("", '') \ + .replace("", "") \ + .replace("[R.]", "[Rare]") \ + .replace("[Obs.]", "[Obsolete]") \ + .replace("[", "[").replace("]", "]") \ + + wrapol = False + for subsensemark in re_subsense.findall(text): + text = text.replace(subsensemark, "
  • ") + wrapol = True + if wrapol: + text = text.replace("
  • ", "
    1. ", 1) + text = f"{text}
    " + + # embedded greek + text = re_greekwords.sub(detransliterate_greek, text) + + # move etymology at the end + match_ety = re_etymology.search(text) + if match_ety: + text = text.replace(match_ety.group(0), "") + text += f"

    Etymology: {match_ety.group(1)}

    " + + # simplify sources : if all identical, just have the one at the end; + # otherwise format it a bit better + sources = list(re_sources.finditer(text)) + #assert sources, f"Looking for sources in {text}" + sourcenames = [source.group(1) for source in sources] + if len(set(sourcenames)) == 1 or \ + (sourcenames and all("Webster" in s for s in sourcenames)): + text = re_sources.sub("", text) + text += f"

    Source: {sources[0].group(1)}

    " + else: + for source in set(sources): + text = text.replace(source.group(0), f"

    Source: {source.group(1)}

    ") + + unknwn_ent = re_unknownentity.search(text) + assert not unknwn_ent, f"Unknown entity {unknwn_ent} in {text}" + + # TODO: add check for unknown tags + + return text + +def detransliterate_greek(roman): + if isinstance(roman, re.Match): + roman = roman.group(1) + # not in alphabetical order, as ch for χ needs to appear before h for η + greek = roman\ + .replace("ch", "χ") \ + .replace("a", "α").replace("b", "β").replace("g", "γ") \ + .replace("d", "δ").replace("e", "ε").replace("z", "ζ") \ + .replace("h", "η").replace("q", "θ").replace("i", "ι") \ + .replace("k", "κ").replace("l", "λ").replace("m", "μ") \ + .replace("n", "ν").replace("x", "ξ").replace("o", "ο") \ + .replace("p", "π").replace("r", "ρ").replace("s", "σ") \ + .replace("t", "τ").replace("y", "υ").replace("u", "υ") \ + .replace("f", "φ").replace("ps", "ψ").replace("w", "ω") \ + .replace("~", "\N{COMBINING GRAVE ACCENT}") \ + .replace("`", "\N{COMBINING ACUTE ACCENT}") \ + .replace(":", "\N{COMBINING DIAERESIS}") \ + .replace(",", "\N{COMBINING GREEK YPOGEGRAMMENI}") \ + .replace("^", "\N{COMBINING GREEK PERISPOMENI}") + if greek.endswith("σ"): + greek = greek[:-1] + "ς" + # these appear before the letter, so need to move after + greek = re.sub("'(.)", "\\1\N{COMBINING COMMA ABOVE}", greek) + greek = re.sub("\"(.)", "\\1\N{COMBINING REVERSED COMMA ABOVE}", greek) + return unicodedata.normalize("NFC", greek) + +def test_transliterate_greek(): + expected = "" + actual = "" + for roman, greek in [("'archai:`zein", "ἀρχαΐζειν"), # BUG in the example the accent on the ι is wrong + ("zw^,on", "ζῷον"), + ("o'i^nos", "οἶνος"), + ("\"ydra`rgyros", "ὑδράργυρος"), + ("lyth`rios", "λυτήριος"), + ("poihth`s", "ποιητής")]: + expected += f"{roman}\n{greek} - {greek.encode('unicode_escape')}\n" + actual += f"{roman}\n{detransliterate_greek(roman)} - " \ + "{detransliterate_greek(roman).encode('unicode_escape')}\n" + assert actual == expected + +def mk_fraction(fraction): + num = fraction.group(1).translate({"1": "¹", "2": "²", "3": "³", + "4": "⁴", "5": "⁵", "6": "⁶", + "7": "⁷", "8": "⁸", "9": "⁹", + "0": "⁰"}) + den = fraction.group(2).translate({"1": "₁", "2": "₂", "3": "₃", + "4": "₄", "5": "₅", "6": "₆", + "7": "₇", "8": "₈", "9": "₉", + "0": "₀"}) + return f"{num}⁄{den}" + +class Converter: + def __init__(self): + self.entries = {} + self.current_hw = [] + self.current_def = "" + + def start_new_entry(self): + current_hw = tuple(hw.upper() for hw in self.current_hw) + if current_hw not in self.entries: + self.entries[current_hw] = [] + try: + defn = format_definition(self.current_def) + except: + print(f"Trying to parse {current_hw}: {self.current_def}") + raise + self.entries[current_hw].append(defn) + + self.current_def = "" + self.current_hw = [] + + def convert_file(self, fname): + rawtext = open(fname, "r", encoding="latin-1").read() + rawtext = re.sub(r"<--.+?-->", "", rawtext, flags=re.DOTALL) + for entity, char in entity_map.items(): + rawtext = rawtext.replace(entity, char) + + for line in rawtext.splitlines(): + match = re_headword.search(line) + if match: + if self.current_def: + self.start_new_entry() + + hw = match.group(1) + if skipped_hw.match(hw): + continue + elif hw == "Tracer/y": # BUG for these words + hw = "Tracery" + elif hw == "Adder fly/": + hw = "Adder fly" + elif hw == "Gairish/ness": + hw = "Gairishness" + elif hw == "Pi": + hw = "Pi" + + assert re.match("^[^<>/]+$", hw), hw + self.current_hw.append(hw) + else: + if self.current_hw: + for altsp in re_altspelling.findall(line): + self.current_hw.append(altsp) + self.current_def += line + + print(f"after{fname}, {len(self.entries)} definitions") + + def write(self, fname): + with open(fname, "w") as f: + for hws, entry in self.entries.items(): + hws = ", ".join(hws) + f.write(f"\n

    {hws}

    ") + if len(entry) > 1: + for i, subentry in enumerate(entry): + f.write(f"

    {i+1}. {hws}

    ") + f.write(subentry) + else: + f.write(entry[0]) + + def write_to_tabseparated(self, fname): + with open(fname, "w") as f: + for hws, entry in self.entries.items(): + hws = ", ".join(hws) + f.write(f"{hws}\t") + if len(entry) > 1: + for i, subentry in enumerate(entry): + f.write(f"

    {i+1}. {hws}

    ") + assert "\n" not in subentry + f.write(subentry) + else: + assert "\n" not in entry[0] + f.write(entry[0]) + f.write("\n") + +def main(): + c = Converter() + for fname in glob("../gcide/CIDE.?"): + print(f"Running over {fname}") + c.convert_file(fname) + + letter = fname.rsplit(".")[-1] + # c.write(f"GCIDE-{letter}.html") + c.write_to_tabseparated("GCIDE.tab_separated") + +if __name__ == "__main__": + main() diff --git a/Makefile b/Makefile index d3ebd70..db67dc5 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -all: OPTED.v006.quickdic XMLittre.v006.quickdic +all: OPTED.v006.quickdic XMLittre.v006.quickdic GCIDE.v006.quickdic %.tab_separated: %_to_tab_separated ./$*_to_tab_separated @@ -17,6 +17,8 @@ clean: rm -f *.quickdic *.quickdic.txt rm -fr OPTED/ +GCIDE.v007.quickdic: dictlang := EN +IT.v007.quickdic: dictlang := IT OPTED.v007.quickdic: dictlang := EN XMLittre.v007.quickdic: dictlang := FR -- 2.43.0