#!/usr/bin/python3
import lxml.etree
import re
from glob import glob
import unicodedata
entity_map = {
"
",
"&": "&",
"
Etymology: {match_ety.group(1)}
" # simplify sources : if all identical, just have the one at the end; # otherwise format it a bit better sources = list(re_sources.finditer(text)) #assert sources, f"Looking for sources in {text}" sourcenames = [source.group(1) for source in sources] if len(set(sourcenames)) == 1 or \ (sourcenames and all("Webster" in s for s in sourcenames)): text = re_sources.sub("", text) text += f"Source: {sources[0].group(1)}
" else: for source in set(sources): text = text.replace(source.group(0), f"Source: {source.group(1)}
") unknwn_ent = re_unknownentity.search(text) assert not unknwn_ent, f"Unknown entity {unknwn_ent} in {text}" # TODO: add check for unknown tags return text def detransliterate_greek(roman): if isinstance(roman, re.Match): roman = roman.group(1) # not in alphabetical order, as ch for χ needs to appear before h for η greek = roman\ .replace("ch", "χ") \ .replace("a", "α").replace("b", "β").replace("g", "γ") \ .replace("d", "δ").replace("e", "ε").replace("z", "ζ") \ .replace("h", "η").replace("q", "θ").replace("i", "ι") \ .replace("k", "κ").replace("l", "λ").replace("m", "μ") \ .replace("n", "ν").replace("x", "ξ").replace("o", "ο") \ .replace("p", "π").replace("r", "ρ").replace("s", "σ") \ .replace("t", "τ").replace("y", "υ").replace("u", "υ") \ .replace("f", "φ").replace("ps", "ψ").replace("w", "ω") \ .replace("~", "\N{COMBINING GRAVE ACCENT}") \ .replace("`", "\N{COMBINING ACUTE ACCENT}") \ .replace(":", "\N{COMBINING DIAERESIS}") \ .replace(",", "\N{COMBINING GREEK YPOGEGRAMMENI}") \ .replace("^", "\N{COMBINING GREEK PERISPOMENI}") if greek.endswith("σ"): greek = greek[:-1] + "ς" # these appear before the letter, so need to move after greek = re.sub("'(.)", "\\1\N{COMBINING COMMA ABOVE}", greek) greek = re.sub("\"(.)", "\\1\N{COMBINING REVERSED COMMA ABOVE}", greek) return unicodedata.normalize("NFC", greek) def test_transliterate_greek(): expected = "" actual = "" for roman, greek in [("'archai:`zein", "ἀρχαΐζειν"), # BUG in the example the accent on the ι is wrong ("zw^,on", "ζῷον"), ("o'i^nos", "οἶνος"), ("\"ydra`rgyros", "ὑδράργυρος"), ("lyth`rios", "λυτήριος"), ("poihth`s", "ποιητής")]: expected += f"{roman}\n{greek} - {greek.encode('unicode_escape')}\n" actual += f"{roman}\n{detransliterate_greek(roman)} - " \ "{detransliterate_greek(roman).encode('unicode_escape')}\n" assert actual == expected def mk_fraction(fraction): num = fraction.group(1).translate({"1": "¹", "2": "²", "3": "³", "4": "⁴", "5": "⁵", "6": "⁶", "7": "⁷", "8": "⁸", "9": "⁹", "0": "⁰"}) den = fraction.group(2).translate({"1": "₁", "2": "₂", "3": "₃", "4": "₄", "5": "₅", "6": "₆", "7": "₇", "8": "₈", "9": "₉", "0": "₀"}) return f"{num}⁄{den}" class Converter: def __init__(self): self.entries = {} self.current_hw = [] self.current_def = "" def start_new_entry(self): current_hw = tuple(hw.upper() for hw in self.current_hw) if current_hw not in self.entries: self.entries[current_hw] = [] try: defn = format_definition(self.current_def) except: print(f"Trying to parse {current_hw}: {self.current_def}") raise self.entries[current_hw].append(defn) self.current_def = "" self.current_hw = [] def convert_file(self, fname): rawtext = open(fname, "r", encoding="latin-1").read() rawtext = re.sub(r"<--.+?-->", "", rawtext, flags=re.DOTALL) for entity, char in entity_map.items(): rawtext = rawtext.replace(entity, char) for line in rawtext.splitlines(): match = re_headword.search(line) if match: if self.current_def: self.start_new_entry() hw = match.group(1) if skipped_hw.match(hw): continue elif hw == "Tracer/y": # BUG for these words hw = "Tracery" elif hw == "Adder fly/": hw = "Adder fly" elif hw == "Gairish/ness": hw = "Gairishness" elif hw == "Pi": hw = "Pi" assert re.match("^[^<>/]+$", hw), hw self.current_hw.append(hw) else: if self.current_hw: for altsp in re_altspelling.findall(line): self.current_hw.append(altsp) self.current_def += line print(f"after{fname}, {len(self.entries)} definitions") def write(self, fname): with open(fname, "w") as f: for hws, entry in self.entries.items(): hws = ", ".join(hws) f.write(f"\n