#!/usr/bin/python3
"""Convert the GCIDE source files (../gcide/CIDE.A … CIDE.Z) into a single
tab-separated dictionary file, one entry per line.  Converter.write can also
emit a simple HTML version (currently commented out in main)."""
import lxml.etree
import re
from glob import glob
import unicodedata

entity_map = {
    # keys follow the GCIDE "<name/" entity style; several of the original
    # names were lost, so treat those keys as placeholders
    "<br/": "\n",
    "&amp;": "&",
    "<rdquo/": "”",  # close double quote
    "<ldquo/": "“",  # open double quotes
    "<sup2/": "2",
    "<3dot/": "3",
    # the following are only used as examples in the entry for progression
    "<prog1/": "2, 4, 6, 8, 10",
    "<prog2/": "10, 8, 6, 4, 2",
    "<prog3/": "2, 4, 8, 16, 32, 64",
    "<prog4/": "64, 32, 16, 8, 4, 2",
}

# Regular expressions for the bits of GCIDE markup we care about.  The
# GCIDE-style tag names are assumptions where the originals were lost.
re_unknownentity = re.compile(r"<[a-z0-9]+/")
re_sources = re.compile(r"<source>(.*?)</source>")
re_etymology = re.compile(r"<ety>\[(.*?)\]</ety>")
re_pos = re.compile(r"<pos>(.*?)</pos>")
re_headword = re.compile(r"<hw>(.*)</hw>")
re_subsense = re.compile(r"<sn>[0-9.]+</sn>")
re_altspelling = re.compile(r"<asp>(.+?)</asp>")
re_greekwords = re.compile(r"<grk>(.+?)</grk>")
skipped_hw = re.compile(r"^[0-9/]+(st|nd|rd|th)?$")
re_simplefraction = re.compile(r"<frac(.*?)x(.*?)/")  # e.g. <frac1x2/

# delete these tags and their content (tag names partly assumed)
delete_tags = [
    re.compile(r"<qau>.*?</qau>,?\.?"),  # quotation author
    re.compile(r"<au>.*?</au>,?\.?"),    # author
    re.compile(r"<wf>.*?</wf>,?"),
    re.compile(r"<cref>.*?</cref>"),
    re.compile(r"<q>.*?(</q>|$)"),  # BUG: one song quotation is never closed
    re.compile(r"<\?/"),            # illegible-character marker
    re.compile(r"<mark>.*?</mark>"),
    re.compile(r'\(hō"m or hōm\)'),  # stray pronunciation in one entry (tail of the pattern assumed)
]

# GCIDE tag -> HTML tag used in the output; an empty value means the tag
# pair is simply stripped.  The concrete pairs here are assumptions.
tag_map = {
    "it": "i",
    "bold": "b",
    "sups": "sup",
    "subs": "sub",
    "def": "",
    "col": "b",
    "cd": "",
}
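

def test_skipped_hw():
    # Quick sanity check for the skipped_hw pattern above: headwords that
    # are just numbers or fractions, optionally with an ordinal suffix,
    # are dropped by Converter.convert_file; real words are kept.
    assert skipped_hw.match("1st")
    assert skipped_hw.match("2/3")
    assert not skipped_hw.match("Tracery")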


def format_definition(text):
    # strip material we do not want in the output at all
    for tag_re in delete_tags:
        text = tag_re.sub("", text)

    # expand simple fractions such as <frac1x2/
    text = re_simplefraction.sub(mk_fraction, text)

    # set the part of speech in italics
    pos_part_match = re_pos.search(text)
    if pos_part_match:
        pos_part = pos_part_match.group(1) \
            .replace("&", " and ")
        text = text.replace(pos_part_match.group(0), f"<i>{pos_part}</i>")

    # map the remaining GCIDE tags to HTML, or strip them
    for tag, formattag in tag_map.items():
        if formattag:
            text = text.replace(f"<{tag}>", f"<{formattag}>") \
                       .replace(f"</{tag}>", f"</{formattag}>")
        else:
            text = text.replace(f"<{tag}>", "") \
                       .replace(f"</{tag}>", "")

    # assorted cleanups (entity and tag names partly assumed)
    text = text \
        .replace("<note>☞", "☞").replace("<note>", "☞ ") \
        .replace("</note>", "") \
        .replace("<pr>(", "(").replace(")</pr>", ")") \
        .replace("<pr>", "(").replace("</pr>", ")") \
        .replace("[R.]", "[Rare]") \
        .replace("[Obs.]", "[Obsolete]") \
        .replace("<lbrack/", "[").replace("<rbrack/", "]")

    # turn numbered subsenses into an ordered list
    wrapol = False
    for subsensemark in re_subsense.findall(text):
        text = text.replace(subsensemark, "<li>")
        wrapol = True
    if wrapol:
        text = text.replace("<li>", "<ol><li>", 1)
        text = f"{text}</ol>"

    # embedded greek
    text = re_greekwords.sub(detransliterate_greek, text)

    # move the etymology to the end
    match_ety = re_etymology.search(text)
    if match_ety:
        text = text.replace(match_ety.group(0), "")
        text += f"<p>Etymology: {match_ety.group(1)}</p>"

    # simplify sources: if all identical, just have the one at the end;
    # otherwise format it a bit better
    sources = list(re_sources.finditer(text))
    # assert sources, f"Looking for sources in {text}"
    sourcenames = [source.group(1) for source in sources]
    if len(set(sourcenames)) == 1 or \
            (sourcenames and all("Webster" in s for s in sourcenames)):
        text = re_sources.sub("", text)
        text += f"<p>Source: {sources[0].group(1)}</p>"
    else:
        for source in set(sources):
            text = text.replace(source.group(0),
                                f"<p>Source: {source.group(1)}</p>")

    unknwn_ent = re_unknownentity.search(text)
    assert not unknwn_ent, f"Unknown entity {unknwn_ent} in {text}"
    # TODO: add a check for unknown tags as well
    return text


def detransliterate_greek(roman):
    if isinstance(roman, re.Match):
        roman = roman.group(1)
    # not in alphabetical order: digraphs such as ch for χ and ps for ψ have
    # to be handled before their constituent letters
    greek = roman \
        .replace("ch", "χ").replace("ps", "ψ") \
        .replace("a", "α").replace("b", "β").replace("g", "γ") \
        .replace("d", "δ").replace("e", "ε").replace("z", "ζ") \
        .replace("h", "η").replace("q", "θ").replace("i", "ι") \
        .replace("k", "κ").replace("l", "λ").replace("m", "μ") \
        .replace("n", "ν").replace("x", "ξ").replace("o", "ο") \
        .replace("p", "π").replace("r", "ρ").replace("s", "σ") \
        .replace("t", "τ").replace("y", "υ").replace("u", "υ") \
        .replace("f", "φ").replace("w", "ω") \
        .replace("~", "\N{COMBINING GRAVE ACCENT}") \
        .replace("`", "\N{COMBINING ACUTE ACCENT}") \
        .replace(":", "\N{COMBINING DIAERESIS}") \
        .replace(",", "\N{COMBINING GREEK YPOGEGRAMMENI}") \
        .replace("^", "\N{COMBINING GREEK PERISPOMENI}")
    if greek.endswith("σ"):
        greek = greek[:-1] + "ς"
    # breathing marks appear before the letter, so they need to move after it
    greek = re.sub("'(.)", "\\1\N{COMBINING COMMA ABOVE}", greek)
    greek = re.sub("\"(.)", "\\1\N{COMBINING REVERSED COMMA ABOVE}", greek)
    return unicodedata.normalize("NFC", greek)


def test_transliterate_greek():
    expected = ""
    actual = ""
    for roman, greek in [("'archai:`zein", "ἀρχαΐζειν"),
                         # BUG in the example the accent on the ι is wrong
                         ("zw^,on", "ζῷον"),
                         ("o'i^nos", "οἶνος"),
                         ("\"ydra`rgyros", "ὑδράργυρος"),
                         ("lyth`rios", "λυτήριος"),
                         ("poihth`s", "ποιητής")]:
        expected += f"{roman}\n{greek} - {greek.encode('unicode_escape')}\n"
        actual += f"{roman}\n{detransliterate_greek(roman)} - " \
                  f"{detransliterate_greek(roman).encode('unicode_escape')}\n"
    assert actual == expected


def mk_fraction(fraction):
    # str.translate needs a table built with str.maketrans, not a plain dict
    num = fraction.group(1).translate(str.maketrans(
        {"1": "¹", "2": "²", "3": "³", "4": "⁴", "5": "⁵",
         "6": "⁶", "7": "⁷", "8": "⁸", "9": "⁹", "0": "⁰"}))
    den = fraction.group(2).translate(str.maketrans(
        {"1": "₁", "2": "₂", "3": "₃", "4": "₄", "5": "₅",
         "6": "₆", "7": "₇", "8": "₈", "9": "₉", "0": "₀"}))
    return f"{num}⁄{den}"
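

def test_mk_fraction():
    # Quick check of the fraction rewriting; it assumes the <frac…x…/ form
    # matched by re_simplefraction above (numerator first, then denominator).
    assert re_simplefraction.sub(mk_fraction, "a <frac1x2/ turn") == "a ¹⁄₂ turn"
    assert re_simplefraction.sub(mk_fraction, "<frac23x100/") == "²³⁄₁₀₀"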


class Converter:
    def __init__(self):
        self.entries = {}
        self.current_hw = []
        self.current_def = ""

    def start_new_entry(self):
        current_hw = tuple(hw.upper() for hw in self.current_hw)
        if current_hw not in self.entries:
            self.entries[current_hw] = []
        try:
            defn = format_definition(self.current_def)
        except Exception:
            print(f"Trying to parse {current_hw}: {self.current_def}")
            raise
        self.entries[current_hw].append(defn)
        self.current_def = ""
        self.current_hw = []

    def convert_file(self, fname):
        with open(fname, "r", encoding="latin-1") as f:
            rawtext = f.read()
        # drop the <-- ... --> comments
        rawtext = re.sub(r"<--.+?-->", "", rawtext, flags=re.DOTALL)
        for entity, char in entity_map.items():
            rawtext = rawtext.replace(entity, char)
        for line in rawtext.splitlines():
            match = re_headword.search(line)
            if match:
                if self.current_def:
                    self.start_new_entry()
                hw = match.group(1)
                if skipped_hw.match(hw):
                    continue
                elif hw == "Tracer/y":  # BUG for these words
                    hw = "Tracery"
                elif hw == "Adder fly/":
                    hw = "Adder fly"
                elif hw == "Gairish/ness":
                    hw = "Gairishness"
                elif hw == "Pi":
                    hw = "Pi"
                assert re.match("^[^<>/]+$", hw), hw
                self.current_hw.append(hw)
            else:
                if self.current_hw:
                    for altsp in re_altspelling.findall(line):
                        self.current_hw.append(altsp)
                    self.current_def += line
        # flush the last pending entry of the file
        if self.current_def:
            self.start_new_entry()
        print(f"after {fname}, {len(self.entries)} definitions")

    def write(self, fname):
        # simple HTML output (heading markup assumed)
        with open(fname, "w") as f:
            for hws, entry in self.entries.items():
                hws = ", ".join(hws)
                f.write(f"\n<h1>{hws}</h1>\n")
                if len(entry) > 1:
                    for i, subentry in enumerate(entry):
                        f.write(f"<h2>{i + 1}. {hws}</h2>\n")
                        f.write(subentry)
                else:
                    f.write(entry[0])

    def write_to_tabseparated(self, fname):
        with open(fname, "w") as f:
            for hws, entry in self.entries.items():
                hws = ", ".join(hws)
                f.write(f"{hws}\t")
                if len(entry) > 1:
                    for i, subentry in enumerate(entry):
                        # inline sub-entry heading (markup assumed)
                        f.write(f"<p><b>{i + 1}. {hws}</b></p>")
                        assert "\n" not in subentry
                        f.write(subentry)
                else:
                    assert "\n" not in entry[0]
                    f.write(entry[0])
                f.write("\n")


def main():
    c = Converter()
    for fname in glob("../gcide/CIDE.?"):
        print(f"Running over {fname}")
        c.convert_file(fname)
        letter = fname.rsplit(".")[-1]
        # c.write(f"GCIDE-{letter}.html")
    c.write_to_tabseparated("GCIDE.tab_separated")


if __name__ == "__main__":
    main()