From: Frédéric Perrin
",
+ "
",
"&": "&",
"
Etymology: {match_ety.group(1)}
" + + # simplify sources : if all identical, just have the one at the end; + # otherwise format it a bit better + sources = list(re_sources.finditer(text)) + #assert sources, f"Looking for sources in {text}" + sourcenames = [source.group(1) for source in sources] + if len(set(sourcenames)) == 1 or \ + (sourcenames and all("Webster" in s for s in sourcenames)): + text = re_sources.sub("", text) + text += f"Source: {sources[0].group(1)}
" + else: + for source in set(sources): + text = text.replace(source.group(0), f"Source: {source.group(1)}
# NOTE(review): the statements below are the tail of a definition that starts
# before this view. They are reproduced verbatim as comments (the opening of
# the definition is not available here, so they cannot stand as code) and
# were not reviewed.
#
# ")
#
#     unknwn_ent = re_unknownentity.search(text)
#     assert not unknwn_ent, f"Unknown entity {unknwn_ent} in {text}"
#
#     # TODO: add check for unknown tags
#
#     return text


# Two-letter transliterations. They must be substituted before the
# single-letter table is applied, otherwise "ch" would be consumed by "h"
# (eta) and "ps" by "p" + "s" (pi + sigma).
_GREEK_DIGRAPHS = (("ch", "χ"), ("ps", "ψ"))

# Single-character transliterations: Latin letters to Greek letters, and the
# ASCII accent markers to the matching combining characters.
_GREEK_SINGLE_CHARS = str.maketrans({
    "a": "α", "b": "β", "g": "γ", "d": "δ", "e": "ε", "z": "ζ",
    "h": "η", "q": "θ", "i": "ι", "k": "κ", "l": "λ", "m": "μ",
    "n": "ν", "x": "ξ", "o": "ο", "p": "π", "r": "ρ", "s": "σ",
    "t": "τ", "y": "υ", "u": "υ", "f": "φ", "w": "ω",
    "~": "\N{COMBINING GRAVE ACCENT}",
    "`": "\N{COMBINING ACUTE ACCENT}",
    ":": "\N{COMBINING DIAERESIS}",
    ",": "\N{COMBINING GREEK YPOGEGRAMMENI}",
    "^": "\N{COMBINING GREEK PERISPOMENI}",
})

# Digit tables used by mk_fraction().
_SUPERSCRIPT_DIGITS = str.maketrans("0123456789", "⁰¹²³⁴⁵⁶⁷⁸⁹")
_SUBSCRIPT_DIGITS = str.maketrans("0123456789", "₀₁₂₃₄₅₆₇₈₉")


def detransliterate_greek(roman):
    """Convert an ASCII transliteration of Greek into Greek script.

    ``roman`` is either a string or an ``re.Match`` whose first group holds
    the transliterated text, so the function can be passed directly as the
    replacement callback of ``re.sub``.

    Returns the Greek text as an NFC-normalised string.
    """
    if isinstance(roman, re.Match):
        roman = roman.group(1)

    greek = roman
    for latin, letter in _GREEK_DIGRAPHS:
        greek = greek.replace(latin, letter)
    greek = greek.translate(_GREEK_SINGLE_CHARS)

    # A sigma at the very end of the text takes its final form.
    if greek.endswith("σ"):
        greek = greek[:-1] + "ς"

    # Breathing marks are written before the letter in the transliteration,
    # but a combining character has to follow the letter it modifies.
    greek = re.sub("'(.)", "\\1\N{COMBINING COMMA ABOVE}", greek)
    greek = re.sub("\"(.)", "\\1\N{COMBINING REVERSED COMMA ABOVE}", greek)
    return unicodedata.normalize("NFC", greek)


def test_transliterate_greek():
    """Check detransliterate_greek() against known transliterations."""
    cases = [
        # BUG in the example the accent on the ι is wrong
        ("'archai:`zein", "ἀρχαΐζειν"),
        ("zw^,on", "ζῷον"),
        ("o'i^nos", "οἶνος"),
        ("\"ydra`rgyros", "ὑδράργυρος"),
        ("lyth`rios", "λυτήριος"),
        ("poihth`s", "ποιητής"),
    ]
    for roman, greek in cases:
        # Normalise the expectation too, so the comparison does not depend on
        # whether this source file stores the literals composed or decomposed.
        wanted = unicodedata.normalize("NFC", greek)
        actual = detransliterate_greek(roman)
        assert actual == wanted, (
            f"{roman!r}: got {actual!r} ({actual.encode('unicode_escape')}), "
            f"expected {wanted!r} ({wanted.encode('unicode_escape')})"
        )


def mk_fraction(fraction):
    """Render a matched fraction with superscript and subscript digits.

    ``fraction`` is an ``re.Match`` whose group 1 is the numerator and whose
    group 2 is the denominator. The result is the numerator in superscript
    digits, a fraction slash, and the denominator in subscript digits.
    """
    # str.translate() looks characters up by code point, so the tables are
    # built with str.maketrans() rather than plain str-keyed dicts.
    num = fraction.group(1).translate(_SUPERSCRIPT_DIGITS)
    den = fraction.group(2).translate(_SUBSCRIPT_DIGITS)
    return f"{num}⁄{den}"


class Converter:
    """Accumulate dictionary entries from source files, keyed by headwords."""

    def __init__(self):
        # Maps a tuple of upper-cased headwords to its formatted definitions.
        self.entries = {}
        # Headwords and raw definition text of the entry being read.
        self.current_hw = []
        self.current_def = ""

    def start_new_entry(self):
        """Format the pending definition, store it, and reset the pending state.

        Any exception raised by ``format_definition`` is re-raised after the
        offending entry has been printed.
        """
        current_hw = tuple(hw.upper() for hw in self.current_hw)
        definitions = self.entries.setdefault(current_hw, [])
        try:
            defn = format_definition(self.current_def)
        except Exception:
            print(f"Trying to parse {current_hw}: {self.current_def}")
            raise
        definitions.append(defn)

        self.current_def = ""
        self.current_hw = []

    def convert_file(self, fname):
        """Read ``fname`` and collect its entries into ``self.entries``.

        The file is decoded as latin-1, ``<-- ... -->`` spans are removed and
        every key of ``entity_map`` is replaced by its value before the text
        is scanned line by line for headwords.
        """
        with open(fname, "r", encoding="latin-1") as source:
            rawtext = source.read()
        rawtext = re.sub(r"<--.+?-->", "", rawtext, flags=re.DOTALL)
        for entity, char in entity_map.items():
            rawtext = rawtext.replace(entity, char)

        # BUG for these words
        # NOTE(review): the "Pi" entry maps to itself as it stands; its key
        # may have been damaged before this code reached review -- confirm.
        headword_fixes = {
            "Tracer/y": "Tracery",
            "Adder fly/": "Adder fly",
            "Gairish/ness": "Gairishness",
            "Pi": "Pi",
        }

        for line in rawtext.splitlines():
            match = re_headword.search(line)
            if match:
                # A headword line closes the entry accumulated so far.
                if self.current_def:
                    self.start_new_entry()

                hw = match.group(1)
                if skipped_hw.match(hw):
                    continue
                hw = headword_fixes.get(hw, hw)

                assert re.match("^[^<>/]+$", hw), hw
                self.current_hw.append(hw)
            elif self.current_hw:
                self.current_hw.extend(re_altspelling.findall(line))
                # NOTE(review): the nesting of this statement was not visible
                # in the reviewed text; it is assumed to apply only while an
                # entry is open -- confirm. Lines are joined without a
                # separator, as before.
                self.current_def += line

        # NOTE(review): an entry still pending here is not stored until the
        # next headword is seen or start_new_entry() is called -- confirm the
        # caller flushes it.
        print(f"after{fname}, {len(self.entries)} definitions")

    # NOTE(review): write() runs past the end of this view. Its visible part
    # is reproduced verbatim as comments and was not reviewed.
    #
    # def write(self, fname):
    #     with open(fname, "w") as f:
    #         for hws, entry in self.entries.items():
    #             hws = ", ".join(hws)
    #             f.write(f"\n