import re
import unicodedata
from glob import glob

import lxml.etree
# Mapping from GCIDE pseudo-SGML entities (and a few stray escape sequences)
# to the Unicode text they represent.  Applied by simple string replacement
# over the whole raw file, so multi-character keys are fine.
entity_map = {
    "<br/": "<br>",
    "&": "&",
    "<lt/": "<",
    "<gt/": ">",
    "--": "—",  # long (em) dash

    "<Cced/": "Ç",  # C cedilla
    "<uum/": "ü",   # u umlaut (diaeresis)
    "<eacute/": "é",  # e acute
    "<acir/": "â",  # a circumflex
    "<acirc/": "â",  # a circumflex
    "<aum/": "ä",   # a umlaut (diaeresis)
    "<agrave/": "à",  # a grave
    "<aring/": "å",  # a ring above
    "<igrave/": "ì",  # i grave
    "<Aum/": "Ä",   # A umlaut
    "<Eacute/": "É",  # E acute
    "<ncir/": "n̂",  # n circumflex
    "<ocir/": "ô",  # o circumflex
    "<oum/": "ö",   # o umlaut (diaeresis)
    "<ograve/": "ò",  # o grave
    "<uacute/": "ú",  # u acute
    "<ntil/": "ñ",  # n tilde
    "<Ntil/": "Ñ",  # N tilde
    "<mtil/": "m̆",  # used in one location, looks more like m breve than m tilde BUG
    "<ltil/": "ɫ",  # l tilde
    "<sec/": "˝",   # seconds (of degree or time). Also, inches or double prime.
    "<frac23/": "⅔",  # two-thirds
    "<frac13/": "⅓",  # one-third
    "<frac12/": "½",  # one-half
    "<frac14/": "¼",  # one-quarter
    "<frac34/": "¾",  # three-quarters
    "<frac16/": "⅙",  # one sixth
    "<hand/": "☞",  # pointing hand (printer's "fist")
    "<bprime/": "˝",  # bold accent (used in pronunciations)
    "<prime/": "´",  # light accent (used in pronunciations) also minutes (of
                     # arc or time)
    "<min/": "´",   # well some minutes of arc have a different entity
    "<rdquo/": "”",  # close double quote
    "</q>": "”",    # close double quote
    "<sect/": "§",  # section mark
    "<para/": "¶",  # paragraph mark
    "<ldquo/": "“",  # open double quotes
    "<q>": "“",     # open double quotes
    "<amac/": "ā",  # a macron
    "<lsquo/": "‘",  # left single quote
    "<rsquo/": "’",  # right single quote
    "<nsm/": "ṉ",   # n sub-macron
    "<sharp/": "♯",  # musical sharp
    "<flat/": "♭",  # musical flat
    "<imac/": "ī",  # i macron
    "<dsdot/": "ḍ",  # Sanskrit/Tamil d dot
    "<nsdot/": "ṇ",  # Sanskrit/Tamil n dot
    "<tsdot/": "ṭ",  # Sanskrit/Tamil t dot
    "<tsdo/": "ṭ",   # Sanskrit/Tamil t dot (misspelled variant in the source)
    "<ecr/": "ĕ",   # e breve
    "<icr/": "ĭ",   # i breve
    "<ocr/": "ŏ",   # o breve
    "<omac/": "ō",  # o macron
    "<umac/": "ū",  # u macron
    "<ocar/": "ǒ",  # o hacek
    "<aemac/": "ǣ",  # ae ligature macron
    "<oemac/": "ōē",  # oe ligature macron BUG
    "<ucr/": "ŭ",   # u breve
    "<acr/": "ă",   # a breve
    "<cre/": "˘",   # crescent (like a breve, but vertically centered --
    "<deg/": "°",   # degree sign
    "<middot/": "•",  # bold middle dot
    "<root/": "√",  # root sign
    "<cuberoot/": "∛",  # cubic root sign
    "<adot/": "ȧ",  # a with dot above
    "<mdot/": "ṁ",  # m with dot above
    "<breve/": "˘",  # breve
    "<dagger/": "†",  # dagger
    "<ounceap/": "℥",  # ounce
    "<asterism/": "⁂",  # asterism
    "<times/": "×",  # multiplication
    "<8star/": "⚹",  # sextile, badly named BUG
    "<upslur/": "⁀",  # musical slur, approx. with the IPA tie
    "<downslur/": "‿",  # musical slur, approx. with the IPA undertie
    "<natural/": "♮",  # natural key
    "<schwa/": "ə",  # schwa
    "<astascending/": "☊",  # ascending node, Dragon's head
    "<astdescending": "☋",  # descending node, Dragon's tail
    "<integral2l/": "∫",  # integration symbol
    "<iques/": "¿",  # inverted question mark as in Spanish
    "<pause/": "𝄐",  # pause aka. corona
    "<nabla/": "∇",  # nabla operator
    "<dele/": "₰",  # dele proofreading mark; closest Unicode is the pfennig symbol which has a similar origin
    "<umlaut/": "¨",  # diaeresis
    "<rarr/": "→",  # right arrow

    "<ae/": "æ",    # ligature ae
    "<AE/": "Æ",    # ligature AE
    "<OE/": "Œ",    # OE ligature
    "<oe/": "œ",    # oe ligature
    "<filig/": "fi",   # fi ligature
    "<fllig/": "fl",   # fl ligature
    "<fflig/": "ff",   # ff ligature
    "<ffllig/": "ffl",  # ffl ligature
    "<?/": "(???)",  # Place-holder for unknown or illegible character.
    # used only in pronunciation key; not able to find what "short vertical
    # bar on top" looks like with unicode chars.
    "<osl/": "o",   # o "semilong"
    "<usl/": "u",   # u "semilong"
    "<th/": "th",   # th ligature

    "<ait/": "𝑎",   # a italic
    "<eit/": "𝑒",
    "<iit/": "𝑖",
    "<oit/": "𝑜",
    "<uit/": "𝑢",

    "<add/": "a",   # a with two dot below
    "<edd/": "e",
    "<idd/": "i",
    "<etil/": "ẽ",
    "<ycr/": "ў",
    # not perfect but good enough; only used in the definition of repetend
    "<2dot/": "<span style='text-decoration:overline'>2</span>",
    "<3dot/": "<span style='text-decoration:overline'>3</span>",
    # only used as an example in the entry for progression
    "<lbrace2/<matrix2x5><row>2, 4, 6, 8, 10</row><row>10, 8, 6, 4, 2</row></matrix2x5><rbrace2/":
        "{2, 4, 6, 8, 10; 10, 8, 6, 4, 2}",
    # note: value now lists all six numbers of each row, matching the key
    "<lbrace2/<matrix2x5><row>2, 4, 8, 16, 32, 64</row><row>64, 32, 16, 8, 4, 2</row></matrix2x5><rbrace2/":
        "{2, 4, 8, 16, 32, 64; 64, 32, 16, 8, 4, 2}",

    # greek letters
    "<alpha/": "α", "<ALPHA/": "Α",
    "<beta/": "β", "<BETA/": "Β",
    "<pi/": "π", "<PI/": "Π",
    "<rho/": "ρ", "<RHO/": "Ρ",
    "<sigma/": "σ", "<SIGMA/": "Σ",
    "<sigmat/": "ς",
    "<digamma/": "ϝ",
    "<tau/": "τ", "<TAU/": "Τ",
    "<upsilon/": "υ", "<UPSILON/": "Υ",
    "<phi/": "φ", "<PHI/": "Φ",
    "<chi/": "χ", "<CHI/": "Χ",
    "<psi/": "ψ", "<PSI/": "Ψ",
    "<omega/": "ω", "<OMEGA/": "Ω",
    "<asper/": "ʽ",
    # then there are some characters that are shown as escape sequences BUG
    r"\'94": "ö",
    r"\'d8": "",  # ‖ in the dictionary, no point in keeping
    r"/'bd": "“",  # one instance where / is used instead of \
    r" 'bd": "“",  # two instances where \ is missing
    r"`'b8": "”",  # one instance where ` is used instead of \
    "<hsdot/": "ḥ",
    "<zsdot/": "ẓ",
    "<msdot/": "ṃ",
    "<zdot/": "ż",
    "<uring/": "ů",
    "<usdot/": "ụ",
    "<lsdot/": "ḷ",
    "<cacute/": "ć",
    "<ccar/": "č",
    "<csdot/": "c̣",

    "<sb/": "",  # ??? BUG
    "<colbreak/": "",
}
# Regular expressions used while parsing the GCIDE pseudo-SGML markup.
re_unknownentity = re.compile(r"<[a-z0-9]+/")
re_sources = re.compile(r"\[<source>(.*?)</source>\]")
re_pos = re.compile(r"<pos>(.*?)</pos>")
re_headword = re.compile(r"<ent>(.*)</ent>")
re_subsense = re.compile(r"<sn>[0-9.]+</sn>")
re_altspelling = re.compile(r"<asp>(.+?)</asp>")
re_greekwords = re.compile(r"<grk>(.+?)</grk>")
skipped_hw = re.compile(r"^[0-9/]+(st|nd|rd|th)?$")  # headwords like "42nd" are skipped
re_simplefraction = re.compile(r"<frac([0-9])([0-9])/")
re_fraction = re.compile(r"<frac([0-9]+)x([0-9]+)/")
re_etymology = re.compile(r"<ety>(.*?)</ety>")

# delete the tags and their content
delete_tags = [re.compile(r"<pr>.*?</pr>,?\.?"),
               re.compile(r"<hw>.*?</hw>,?\.?"),
               re.compile(r"<mhw>.*?</mhw>,?"),
               re.compile(r"<song>.*?</song>"),
               re.compile(r"<song>.*?</p>"),  # blah BUG song not closed
               re.compile(r"<table.*?</table>"),
               re.compile(r"<mtable>.*?</mtable>"),
               re.compile(r'<pr>\(hō"m<stil/r\).'),  # BUG pr not closed
               ]
# replace the markup, keeping the contents intact; each key is a
# space-separated list of source tags, the value is the HTML tag they become
# ("" means the tags are simply stripped)
replace_markup = {
    "plu plw def rj col cs cd altsp sansserif": "",
    "ex xex qex er it ets etsep asp spn kingdom phylum subphylum class subclass ord subord fam subfam gen var varn": "i",
    "bold ct colf stypec": "b",
    "subs": "sub",
    "sups": "sup",
}
+
def format_definition(text):
    """Convert one entry's pseudo-SGML definition text into simple HTML.

    Deletes pronunciation/table/song spans, italicizes part-of-speech and
    quote markup, renders fractions, numbers sub-senses as an <ol>,
    de-transliterates embedded Greek, moves the etymology to the end and
    folds the per-sense source lines into "Source:" paragraphs.
    Raises AssertionError if an unconverted entity survives.
    """
    for del_tag in delete_tags:
        text = del_tag.sub("", text)

    # <and/ and <or/ should be in roman font, so need to check where they
    # occur for replacement
    pos_part_match = re_pos.search(text)
    if pos_part_match:
        pos_part = pos_part_match.group(1)
        pos_part = pos_part.replace("<or/", "</i> or <i>") \
                           .replace("<and/", "</i> and <i>")
        text = text.replace(pos_part_match.group(0), f"<i>{pos_part}</i>")
    text = text.replace("<or/", " or ").replace("<and/", " and ")

    # fractions
    text = re_simplefraction.sub(mk_fraction, text)
    text = re_fraction.sub(mk_fraction, text)

    # translate functional tags into formatting
    for funtag, formattag in replace_markup.items():
        for tag in funtag.split():
            if formattag:
                text = text.replace(f"<{tag}>", f"<{formattag}>") \
                           .replace(f"</{tag}>", f"</{formattag}>")
            else:
                text = text.replace(f"<{tag}>", "") \
                           .replace(f"</{tag}>", "")

    text = text \
        .replace("<note>☞", "☞").replace("<note> ☞", "") \
        .replace("<note>", "☞ ").replace("</note>", "") \
        .replace("<pos>", "(<i>").replace("</pos>", "</i>)") \
        .replace("<fld>(", "(<i>").replace(")</fld>", "</i>)") \
        .replace("<fld>", "<i>").replace("</fld>", "</i>") \
        .replace("<au>", '<span style="font-variant:small-caps">') \
        .replace("</au>", "</span>") \
        .replace("<qau>", '<span style="font-variant:small-caps">') \
        .replace("</qau>", "</span>") \
        .replace("<sc>", '<span style="font-variant:small-caps">') \
        .replace("</sc>", "</span>") \
        .replace("<mark>[R.]</mark>", "[<i>Rare</i>]") \
        .replace("<mark>[Obs.]</mark>", "[<i>Obsolete</i>]") \
        .replace("<mark>[", "[<i>").replace("]</mark>", "</i>]")

    # sub-sense numbers <sn>1.</sn> become <li>; wrap the whole thing in an
    # <ol> when at least one was seen
    wrapol = False
    for subsensemark in re_subsense.findall(text):
        text = text.replace(subsensemark, "<li>")
        wrapol = True
    if wrapol:
        text = text.replace("<li>", "<ol><li>", 1)
        text = f"{text}</ol>"

    # embedded greek
    text = re_greekwords.sub(detransliterate_greek, text)

    # move etymology at the end
    match_ety = re_etymology.search(text)
    if match_ety:
        text = text.replace(match_ety.group(0), "")
        text += f"<p><b>Etymology:</b> {match_ety.group(1)}</p>"

    # simplify sources : if all identical, just have the one at the end;
    # otherwise format it a bit better
    sources = list(re_sources.finditer(text))
    #assert sources, f"Looking for sources in {text}"
    sourcenames = [source.group(1) for source in sources]
    if len(set(sourcenames)) == 1 or \
       (sourcenames and all("Webster" in s for s in sourcenames)):
        text = re_sources.sub("", text)
        text += f"<p>Source: {sources[0].group(1)}</p>"
    else:
        # note: replace() rewrites every occurrence of a given source string
        # in one call, so duplicates in the list are harmless no-ops
        for source in sources:
            text = text.replace(source.group(0), f"<p>Source: {source.group(1)}</p>")

    unknwn_ent = re_unknownentity.search(text)
    assert not unknwn_ent, f"Unknown entity {unknwn_ent} in {text}"

    # TODO: add check for unknown tags

    return text
+
def detransliterate_greek(roman):
    """Turn the GCIDE romanized Greek notation back into Greek script.

    Accepts either a string or an re.Match whose group(1) is the romanized
    word.  Letters map one-to-one; the punctuation marks ~ ` : , ^ become
    combining accents, and ' / " (rough/smooth breathing) are written BEFORE
    the letter in the source, so they are moved after it before normalizing.
    Returns the NFC-normalized Greek string.
    """
    if isinstance(roman, re.Match):
        roman = roman.group(1)
    # not in alphabetical order, as ch for χ needs to appear before h for η
    greek = roman\
        .replace("ch", "χ") \
        .replace("a", "α").replace("b", "β").replace("g", "γ") \
        .replace("d", "δ").replace("e", "ε").replace("z", "ζ") \
        .replace("h", "η").replace("q", "θ").replace("i", "ι") \
        .replace("k", "κ").replace("l", "λ").replace("m", "μ") \
        .replace("n", "ν").replace("x", "ξ").replace("o", "ο") \
        .replace("p", "π").replace("r", "ρ").replace("s", "σ") \
        .replace("t", "τ").replace("y", "υ").replace("u", "υ") \
        .replace("f", "φ").replace("ps", "ψ").replace("w", "ω") \
        .replace("~", "\N{COMBINING GRAVE ACCENT}") \
        .replace("`", "\N{COMBINING ACUTE ACCENT}") \
        .replace(":", "\N{COMBINING DIAERESIS}") \
        .replace(",", "\N{COMBINING GREEK YPOGEGRAMMENI}") \
        .replace("^", "\N{COMBINING GREEK PERISPOMENI}")
    if greek.endswith("σ"):
        greek = greek[:-1] + "ς"  # word-final sigma
    # these appear before the letter, so need to move after
    greek = re.sub("'(.)", "\\1\N{COMBINING COMMA ABOVE}", greek)
    greek = re.sub("\"(.)", "\\1\N{COMBINING REVERSED COMMA ABOVE}", greek)
    return unicodedata.normalize("NFC", greek)
+
def test_transliterate_greek():
    """Self-test of detransliterate_greek against known romanized/Greek pairs."""
    expected = ""
    actual = ""
    for roman, greek in [("'archai:`zein", "ἀρχαΐζειν"),  # BUG in the example the accent on the ι is wrong
                         ("zw^,on", "ζῷον"),
                         ("o'i^nos", "οἶνος"),
                         ("\"ydra`rgyros", "ὑδράργυρος"),
                         ("lyth`rios", "λυτήριος"),
                         ("poihth`s", "ποιητής")]:
        expected += f"{roman}\n{greek} - {greek.encode('unicode_escape')}\n"
        # fixed: the continuation string was missing its f prefix, so actual
        # used to contain the literal "{detransliterate_greek(roman)...}" text
        actual += f"{roman}\n{detransliterate_greek(roman)} - " \
                  f"{detransliterate_greek(roman).encode('unicode_escape')}\n"
    assert actual == expected
+
# translation tables for str.translate: ordinals of '0'-'9' to the
# superscript / subscript digit characters
_SUPERSCRIPT_DIGITS = str.maketrans("0123456789", "⁰¹²³⁴⁵⁶⁷⁸⁹")
_SUBSCRIPT_DIGITS = str.maketrans("0123456789", "₀₁₂₃₄₅₆₇₈₉")

def mk_fraction(fraction):
    """Render a <fracNxM/ (or <fracNM/) regex match as "ᴺ⁄ₘ".

    fraction is an re.Match with group(1) = numerator digits and
    group(2) = denominator digits.  Fixed: str.translate needs a table keyed
    by code points (as produced by str.maketrans); the previous dicts with
    string keys were silently ignored and the digits came through unchanged.
    """
    num = fraction.group(1).translate(_SUPERSCRIPT_DIGITS)
    den = fraction.group(2).translate(_SUBSCRIPT_DIGITS)
    return f"{num}⁄{den}"  # U+2044 FRACTION SLASH
+
class Converter:
    """Accumulates GCIDE entries file by file and writes them out.

    entries maps a tuple of upper-cased headwords (including alternative
    spellings) to the list of formatted HTML definitions for those headwords.
    """

    def __init__(self):
        self.entries = {}
        self.current_hw = []   # headwords of the entry being accumulated
        self.current_def = ""  # raw definition text of that entry

    def start_new_entry(self):
        """Format the accumulated definition and file it under its headwords."""
        current_hw = tuple(hw.upper() for hw in self.current_hw)
        if current_hw not in self.entries:
            self.entries[current_hw] = []
        try:
            defn = format_definition(self.current_def)
        except Exception:
            # leave a trace of which entry broke the parser, then re-raise
            print(f"Trying to parse {current_hw}: {self.current_def}")
            raise
        self.entries[current_hw].append(defn)

        self.current_def = ""
        self.current_hw = []

    def convert_file(self, fname):
        """Read one CIDE.? file (latin-1), decode entities, collect entries."""
        rawtext = open(fname, "r", encoding="latin-1").read()
        # <-- ... --> spans are comments in the GCIDE markup
        rawtext = re.sub(r"<--.+?-->", "", rawtext, flags=re.DOTALL)
        for entity, char in entity_map.items():
            rawtext = rawtext.replace(entity, char)

        for line in rawtext.splitlines():
            match = re_headword.search(line)
            if match:
                if self.current_def:
                    self.start_new_entry()

                hw = match.group(1)
                if skipped_hw.match(hw):
                    continue
                elif hw == "Tracer/y":  # BUG for these words
                    hw = "Tracery"
                elif hw == "Adder fly/":
                    hw = "Adder fly"
                elif hw == "Gairish/ness":
                    hw = "Gairishness"
                elif hw == "P<sub>i</sub>":
                    hw = "Pi"

                assert re.match("^[^<>/]+$", hw), hw
                self.current_hw.append(hw)
            else:
                if self.current_hw:
                    for altsp in re_altspelling.findall(line):
                        self.current_hw.append(altsp)
                    self.current_def += line

        # fixed: flush the file's last entry; it used to be flushed only when
        # the NEXT file's first headword arrived, so the very last entry of
        # the last file was silently dropped
        if self.current_def:
            self.start_new_entry()

        print(f"after {fname}, {len(self.entries)} definitions")

    def write(self, fname):
        """Write all entries as a single HTML document."""
        with open(fname, "w", encoding="utf-8") as f:
            for hws, entry in self.entries.items():
                hws = ", ".join(hws)
                f.write(f"\n<h1>{hws}</h1>")
                if len(entry) > 1:
                    # homographs get numbered sub-headings
                    for i, subentry in enumerate(entry):
                        f.write(f"<h2>{i+1}. {hws}</h2>")
                        f.write(subentry)
                else:
                    f.write(entry[0])

    def write_to_tabseparated(self, fname):
        """Write all entries as one "headwords<TAB>definition" line each."""
        with open(fname, "w", encoding="utf-8") as f:
            for hws, entry in self.entries.items():
                hws = ", ".join(hws)
                f.write(f"{hws}\t")
                if len(entry) > 1:
                    for i, subentry in enumerate(entry):
                        f.write(f"<h2>{i+1}. {hws}</h2>")
                        assert "\n" not in subentry
                        f.write(subentry)
                else:
                    assert "\n" not in entry[0]
                    f.write(entry[0])
                f.write("\n")
+
def main():
    """Convert every ../gcide/CIDE.? file and write the tab-separated output."""
    c = Converter()
    for fname in glob("../gcide/CIDE.?"):
        print(f"Running over {fname}")
        c.convert_file(fname)
        # letter = fname.rsplit(".")[-1]
        # c.write(f"GCIDE-{letter}.html")

    # write once, after all files are processed (previously this ran inside
    # the loop, rewriting the whole output file on every iteration)
    c.write_to_tabseparated("GCIDE.tab_separated")

if __name__ == "__main__":
    main()