# Entity-style markup (``<name/``) that is still present after conversion.
re_unknownentity = re.compile(r"<[a-z0-9]+/")

# Headword lines, and headwords made only of digits and slashes (with an
# optional ordinal suffix), which the converter skips.
re_headword = re.compile(r"<ent>(.*)</ent>")
skipped_hw = re.compile(r"^[0-9/]+(st|nd|rd|th)?$")

# Elements whose content is captured for further processing.
re_pos = re.compile(r"<pos>(.*?)</pos>")
re_altspelling = re.compile(r"<asp>(.+?)</asp>")
re_greekwords = re.compile(r"<grk>(.+?)</grk>")
re_etymology = re.compile(r"<ety>(.*?)</ety>")
re_sources = re.compile(r"\[<source>(.*?)</source>\]")

# Sub-sense numbering markers; matched as a whole, nothing is captured.
re_subsense = re.compile(r"<sn>[0-9.]+</sn>")

# Fractions: either two single digits, or numerator "x" denominator.
re_simplefraction = re.compile(r"<frac([0-9])([0-9])/")
re_fraction = re.compile(r"<frac([0-9]+)x([0-9]+)/")
+
+# delete the tags and it's content
+delete_tags = [re.compile(r"<pr>.*?</pr>,?\.?"),
+ re.compile(r"<hw>.*?</hw>,?\.?"),
+ re.compile(r"<mhw>.*?</mhw>,?"),
+ re.compile(r"<song>.*?</song>"),
+ re.compile(r"<song>.*?</p>"), # blah BUG song not closed
+ re.compile(r"<table.*?</table>"),
+ re.compile(r"<mtable>.*?</mtable>"),
+ re.compile(r'<pr>\(hō"m<stil/r\).'), # BUG pr not closed
+ ]
# Functional tags that are swapped for a formatting tag while their content
# is kept.  Each key is a space-separated list of source tag names; the value
# is the HTML tag to use instead, or "" to strip the tag altogether.
replace_markup = {
    " ".join((
        "plu", "plw", "def", "rj", "col", "cs", "cd", "altsp", "sansserif",
    )): "",
    " ".join((
        "ex", "xex", "qex", "er", "it", "ets", "etsep", "asp", "spn",
        "kingdom", "phylum", "subphylum", "class", "subclass", "ord",
        "subord", "fam", "subfam", "gen", "var", "varn",
    )): "i",
    " ".join(("bold", "ct", "colf", "stypec")): "b",
    "subs": "sub",
    "sups": "sup",
}
+
def format_definition(text):
    """Turn the raw markup of one definition into simplified HTML.

    The steps run in this order: elements matched by ``delete_tags`` are
    removed; ``<or/`` and ``<and/`` are spelled out; fractions are rendered;
    functional tags are mapped through ``replace_markup``; notes, fields,
    author names and usage marks are rewritten; sub-sense markers become an
    ordered list; embedded Greek is converted; the first etymology is moved
    to the end; and source attributions are tidied up.

    Args:
        text: The raw markup of a single definition.

    Returns:
        The formatted definition.

    Raises:
        AssertionError: If something that looks like an unconverted entity
            (``<name/``) is still present at the end.
    """
    for del_tag in delete_tags:
        text = del_tag.sub("", text)

    # <and/ and <or/ should be in roman font.  Inside the part-of-speech
    # element the surrounding italics therefore have to be closed and
    # reopened around them; everywhere else they are plain words.
    pos_part_match = re_pos.search(text)
    if pos_part_match:
        pos_part = (
            pos_part_match.group(1)
            .replace("<or/", "</i> or <i>")
            .replace("<and/", "</i> and <i>")
        )
        # Only the first <pos> element (and exact repeats of it) is handled
        # here; any other <pos> is converted further down.
        text = text.replace(pos_part_match.group(0), f"<i>{pos_part}</i>")
    text = text.replace("<or/", " or ").replace("<and/", " and ")

    # fractions
    text = re_simplefraction.sub(mk_fraction, text)
    text = re_fraction.sub(mk_fraction, text)

    # translate functional tags into formatting; an empty formatting tag
    # means the functional tag is stripped
    for funtags, formattag in replace_markup.items():
        opening = f"<{formattag}>" if formattag else ""
        closing = f"</{formattag}>" if formattag else ""
        for tag in funtags.split():
            text = text.replace(f"<{tag}>", opening) \
                       .replace(f"</{tag}>", closing)

    small_caps = '<span style="font-variant:small-caps">'
    text = (
        text
        # Every note ends up starting with a single pointing hand, whether
        # or not the source already had one.
        .replace("<note>☞", "☞").replace("<note> ☞", "☞")
        .replace("<note>", "☞ ").replace("</note>", "")
        .replace("<pos>", "(<i>").replace("</pos>", "</i>)")
        .replace("<fld>(", "(<i>").replace(")</fld>", "</i>)")
        .replace("<fld>", "<i>").replace("</fld>", "</i>")
        .replace("<au>", small_caps).replace("</au>", "</span>")
        .replace("<qau>", small_caps).replace("</qau>", "</span>")
        .replace("<sc>", small_caps).replace("</sc>", "</span>")
        .replace("<mark>[R.]</mark>", "[<i>Rare</i>]")
        .replace("<mark>[Obs.]</mark>", "[<i>Obsolete</i>]")
        .replace("<mark>[", "[<i>").replace("]</mark>", "</i>]")
    )

    # each sub-sense marker starts a list item; the list is opened at the
    # first item and closed at the very end of the definition
    text, subsense_count = re_subsense.subn("<li>", text)
    if subsense_count:
        text = text.replace("<li>", "<ol><li>", 1)
        text = f"{text}</ol>"

    # embedded greek
    text = re_greekwords.sub(detransliterate_greek, text)

    # move etymology at the end
    match_ety = re_etymology.search(text)
    if match_ety:
        text = text.replace(match_ety.group(0), "")
        text += f"<p><b>Etymology:</b> {match_ety.group(1)}</p>"

    # simplify sources : if all identical (or all some Webster edition),
    # just have the first one at the end; otherwise format each in place
    sourcenames = re_sources.findall(text)
    if sourcenames and (len(set(sourcenames)) == 1
                        or all("Webster" in s for s in sourcenames)):
        text = re_sources.sub("", text)
        text += f"<p>Source: {sourcenames[0]}</p>"
    else:
        text = re_sources.sub(
            lambda source: f"<p>Source: {source.group(1)}</p>", text)

    # Raised explicitly rather than with ``assert`` so that the check still
    # runs when Python is started with -O.
    unknwn_ent = re_unknownentity.search(text)
    if unknwn_ent:
        raise AssertionError(f"Unknown entity {unknwn_ent} in {text}")

    # TODO: add check for unknown tags

    return text
+
# Two-letter transliterations.  They must be handled before the single
# letters, otherwise "ch" would turn into two letters via "h" and "ps" via
# "p" and "s".
_GREEK_DIGRAPHS = (("ch", "χ"), ("ps", "ψ"))

# Single-character transliterations: letters, then accents written after
# the letter they belong to.
_GREEK_CHARS = str.maketrans({
    "a": "α", "b": "β", "g": "γ", "d": "δ", "e": "ε", "z": "ζ",
    "h": "η", "q": "θ", "i": "ι", "k": "κ", "l": "λ", "m": "μ",
    "n": "ν", "x": "ξ", "o": "ο", "p": "π", "r": "ρ", "s": "σ",
    "t": "τ", "y": "υ", "u": "υ", "f": "φ", "w": "ω",
    "~": "\N{COMBINING GRAVE ACCENT}",
    "`": "\N{COMBINING ACUTE ACCENT}",
    ":": "\N{COMBINING DIAERESIS}",
    ",": "\N{COMBINING GREEK YPOGEGRAMMENI}",
    "^": "\N{COMBINING GREEK PERISPOMENI}",
})


def detransliterate_greek(roman):
    """Convert romanised Greek into Greek script.

    Args:
        roman: The transliterated text, or an ``re.Match`` whose first
            group holds it (so the function can be used directly as the
            replacement callback of ``re.sub``).

    Returns:
        The Greek text, NFC-normalised.
    """
    if isinstance(roman, re.Match):
        roman = roman.group(1)

    greek = roman
    for latin, letter in _GREEK_DIGRAPHS:
        greek = greek.replace(latin, letter)
    greek = greek.translate(_GREEK_CHARS)

    # a sigma that is not followed by another letter is a final sigma
    greek = re.sub(r"σ(?!\w)", "ς", greek)

    # breathing marks appear before the letter, so need to move after
    greek = re.sub("'(.)", "\\1\N{COMBINING COMMA ABOVE}", greek)
    greek = re.sub("\"(.)", "\\1\N{COMBINING REVERSED COMMA ABOVE}", greek)
    return unicodedata.normalize("NFC", greek)
+
def test_transliterate_greek():
    """Check ``detransliterate_greek`` against a few known words.

    The expected and actual results of all cases are collected into two
    reports (each word plus its escaped code points) and compared once, so
    a failure shows every case side by side.
    """
    cases = [
        # BUG in the example the accent on the ι is wrong
        ("'archai:`zein", "ἀρχαΐζειν"),
        ("zw^,on", "ζῷον"),
        ("o'i^nos", "οἶνος"),
        ("\"ydra`rgyros", "ὑδράργυρος"),
        ("lyth`rios", "λυτήριος"),
        ("poihth`s", "ποιητής"),
    ]
    expected = ""
    actual = ""
    for roman, greek in cases:
        # The function returns NFC text, so compare against the NFC form of
        # the literal regardless of how it is encoded in this file.
        greek = unicodedata.normalize("NFC", greek)
        result = detransliterate_greek(roman)
        expected += f"{roman}\n{greek} - {greek.encode('unicode_escape')}\n"
        actual += f"{roman}\n{result} - {result.encode('unicode_escape')}\n"
    assert actual == expected
+
# Translation tables for str.translate, which looks characters up by code
# point; str.maketrans builds the tables in that form.
_SUPERSCRIPT_DIGITS = str.maketrans("0123456789", "⁰¹²³⁴⁵⁶⁷⁸⁹")
_SUBSCRIPT_DIGITS = str.maketrans("0123456789", "₀₁₂₃₄₅₆₇₈₉")


def mk_fraction(fraction):
    """Render a matched fraction with superscript and subscript digits.

    Args:
        fraction: A match object whose group 1 is the numerator and whose
            group 2 is the denominator, both as strings of digits.

    Returns:
        The numerator in superscript digits, a fraction slash, and the
        denominator in subscript digits.
    """
    num = fraction.group(1).translate(_SUPERSCRIPT_DIGITS)
    den = fraction.group(2).translate(_SUBSCRIPT_DIGITS)
    return f"{num}⁄{den}"
+
class Converter:
    """Collect dictionary entries from source files and write them out.

    Entries are accumulated across calls to ``convert_file`` and written
    with ``write`` (HTML) or ``write_to_tabseparated`` (one line per entry).
    """

    # BUG in the data: these headwords contain stray markup
    _HEADWORD_FIXES = {
        "Tracer/y": "Tracery",
        "Adder fly/": "Adder fly",
        "Gairish/ness": "Gairishness",
        "P<sub>i</sub>": "Pi",
    }

    def __init__(self):
        # Maps a tuple of upper-cased headwords to the list of formatted
        # definitions found for exactly that group of headwords.
        self.entries = {}
        # Headwords and raw definition text of the entry being read.
        self.current_hw = []
        self.current_def = ""

    def start_new_entry(self):
        """Format and store the entry being read, then reset the state.

        Raises:
            Whatever ``format_definition`` raises; the headwords and the raw
            definition are printed first to help locate the problem.
        """
        current_hw = tuple(hw.upper() for hw in self.current_hw)
        try:
            defn = format_definition(self.current_def)
        except Exception:
            print(f"Trying to parse {current_hw}: {self.current_def}")
            raise
        self.entries.setdefault(current_hw, []).append(defn)

        self.current_def = ""
        self.current_hw = []

    def convert_file(self, fname):
        """Read one source file and add its entries to ``self.entries``.

        Comments (``<-- ... -->``) are removed and the entities in
        ``entity_map`` are replaced before the text is split into lines.
        A line containing a headword starts (or extends) an entry; any
        other line is appended to the current definition, and its
        alternative spellings become additional headwords.  Lines that
        follow a skipped headword are ignored.

        Args:
            fname: Path of the file to read; it is decoded as latin-1.

        Raises:
            AssertionError: If a headword still contains ``<``, ``>`` or
                ``/`` after the known corrections were applied.
        """
        with open(fname, "r", encoding="latin-1") as f:
            rawtext = f.read()
        rawtext = re.sub(r"<--.+?-->", "", rawtext, flags=re.DOTALL)
        for entity, char in entity_map.items():
            rawtext = rawtext.replace(entity, char)

        for line in rawtext.splitlines():
            match = re_headword.search(line)
            if match:
                # A headword after some definition text closes the previous
                # entry; consecutive headwords share one entry.
                if self.current_def:
                    self.start_new_entry()

                hw = match.group(1)
                if skipped_hw.match(hw):
                    continue
                hw = self._HEADWORD_FIXES.get(hw, hw)

                # Raised explicitly rather than with ``assert`` so that the
                # check still runs when Python is started with -O.
                if not re.match("^[^<>/]+$", hw):
                    raise AssertionError(hw)
                self.current_hw.append(hw)
            elif self.current_hw:
                self.current_hw.extend(re_altspelling.findall(line))
                self.current_def += line

        # The last entry of the file has no following headword to close it,
        # so store it here and make sure nothing leaks into the next file.
        if self.current_def:
            self.start_new_entry()
        self.current_hw = []
        self.current_def = ""

        print(f"after{fname}, {len(self.entries)} definitions")

    def write(self, fname):
        """Write all entries to ``fname`` as HTML, encoded as UTF-8.

        Each entry gets an ``<h1>`` with its headwords; when there are
        several definitions each one is preceded by a numbered ``<h2>``.
        """
        with open(fname, "w", encoding="utf-8") as f:
            for hws, entry in self.entries.items():
                title = ", ".join(hws)
                f.write(f"\n<h1>{title}</h1>")
                if len(entry) > 1:
                    for number, subentry in enumerate(entry, start=1):
                        f.write(f"<h2>{number}. {title}</h2>")
                        f.write(subentry)
                else:
                    f.write(entry[0])

    def write_to_tabseparated(self, fname):
        """Write all entries to ``fname``, one line each, encoded as UTF-8.

        A line holds the headwords, a tab, and the definition(s); several
        definitions are each preceded by a numbered ``<h2>``.

        Raises:
            AssertionError: If a definition contains a newline, which would
                break the one-entry-per-line layout.
        """
        with open(fname, "w", encoding="utf-8") as f:
            for hws, entry in self.entries.items():
                title = ", ".join(hws)
                f.write(f"{title}\t")
                if len(entry) > 1:
                    for number, subentry in enumerate(entry, start=1):
                        f.write(f"<h2>{number}. {title}</h2>")
                        assert "\n" not in subentry
                        f.write(subentry)
                else:
                    assert "\n" not in entry[0]
                    f.write(entry[0])
                f.write("\n")
+
def main():
    """Convert every ``../gcide/CIDE.?`` file into ``GCIDE.tab_separated``.

    The files are processed in sorted order so that the output does not
    depend on the order in which the file system lists them.  Nothing is
    written when no input file is found.
    """
    fnames = sorted(glob("../gcide/CIDE.?"))
    if not fnames:
        print("No input files matching ../gcide/CIDE.? found")
        return

    c = Converter()
    for fname in fnames:
        print(f"Running over {fname}")
        c.convert_file(fname)

    c.write_to_tabseparated("GCIDE.tab_separated")


if __name__ == "__main__":
    main()