+#!/usr/bin/python3
+
+import lxml.etree
+import re
+
# Maps markup found in the raw dictionary text to its replacement string.
#
# convert_file applies the entries one by one with str.replace, in insertion
# order, and then parses the result as XML.  Two consequences:
#   * replacements for markup-significant characters must be XML character
#     entities, otherwise the parser sees stray "&" / "<" characters;
#   * the "&" entry must stay ahead of every entry whose replacement contains
#     an "&", or those replacements would be escaped a second time.
entity_map = {
    "<br/": "<br/>",
    "&": "&amp;",
    "<lt/": "&lt;",
    "<gt/": "&gt;",
    "--": "—",  # long (em) dash
    "<Cced/": "Ç",  # C cedilla
    "<uum/": "ü",  # u umlaut (diaeresis)
    "<eacute/": "é",  # e acute
    "<acir/": "â",  # a circumflex
    "<aum/": "ä",  # a umlaut (diaeresis)
    "<agrave/": "à",  # a grave
    "<aring/": "å",  # a ring above
    "<cced/": "ç",  # c cedilla
    "<ecir/": "ê",  # e circumflex
    "<eum/": "ë",  # e umlaut (diaeresis)
    "<egrave/": "è",  # e grave
    "<ium/": "ï",  # i umlaut (diaeresis)
    "<icir/": "î",  # i circumflex
    "<icirc/": "î",  # i circumflex
    "<igrave/": "ì",  # i grave
    "<Aum/": "Ä",  # A umlaut
    "<Eacute/": "É",  # E acute
    "<ae/": "æ",  # ligature ae
    "<AE/": "Æ",  # ligature AE
    "<ocir/": "ô",  # o circumflex
    "<oum/": "ö",  # o umlaut (diaeresis)
    "<ograve/": "ò",  # o grave
    "<ucir/": "û",  # u circumflex
    "<ugrave/": "ù",  # u grave
    "<yum/": "ÿ",  # y umlaut
    "<Oum/": "Ö",  # O umlaut
    "<Uum/": "Ü",  # U umlaut (diaeresis)
    "<pound/": "£",  # pound sign (British)
    "<aacute/": "á",  # a acute
    "<iacute/": "í",  # i acute
    "<oacute/": "ó",  # o acute
    "<uacute/": "ú",  # u acute
    "<ntil/": "ñ",  # n tilde
    "<Ntil/": "Ñ",  # N tilde
    "<frac23/": "⅔",  # two-thirds
    "<frac13/": "⅓",  # one-third
    "<sec/": "˝",  # seconds (of degree or time). Also, inches or double prime.
    "<frac12/": "½",  # one-half
    "<frac14/": "¼",  # one-quarter
    "<hand/": "☞",  # pointing hand (printer's "fist")
    "<bprime/": "˝",  # bold accent (used in pronunciations)
    "<prime/": "´",  # light accent (used in pronunciations) also minutes (of
                     # arc or time)
    "<rdquo/": "”",  # close double quote
    "<sect/": "§",  # section mark
    "<ldquo/": "“",  # open double quotes
    "<amac/": "ā",  # a macron
    "<lsquo/": "‘",  # left single quote
    "<nsm/": "ṉ",  # "n sub-macron"
    "<sharp/": "♯",  # musical sharp
    "<flat/": "♭",  # musical flat
    "<imac/": "ī",  # i macron
    "<emac/": "ē",  # e macron
    "<dsdot/": "ḍ",  # Sanskrit/Tamil d dot
    "<nsdot/": "ṇ",  # Sanskrit/Tamil n dot
    "<tsdot/": "ṭ",  # Sanskrit/Tamil t dot
    "<ecr/": "ĕ",  # e breve
    "<icr/": "ĭ",  # i breve
    "<ocr/": "ŏ",  # o breve
    "<OE/": "Œ",  # OE ligature
    "<oe/": "œ",  # oe ligature
    "<omac/": "ō",  # o macron
    "<umac/": "ū",  # u macron
    "<ocar/": "ǒ",  # o hacek
    "<aemac/": "ǣ",  # ae ligature macron
    "<oemac/": "ō",  # oe ligature macron (rendered here as a plain o macron)
    "<ucr/": "ŭ",  # u breve
    "<acr/": "ă",  # a breve
    "<cre/": "˘",  # crescent (like a breve, but vertically centered --
                   # represents the short accent in poetic meter)
    "<ymac/": "ȳ",  # y macron
    "<edh/": "ð",  # small eth
    "<thorn/": "þ",  # small thorn
    "<atil/": "ã",  # a tilde
    "<ndot/": "ṅ",  # n with dot above
    "<rsdot/": "ṛ",  # r with a dot below
    "<yogh/": "ȝ",  # small yogh
    "<mdash/": "—",  # em dash
    "<divide/": "÷",  # division sign
    "<deg/": "°",  # degree sign
    "<middot/": "•",  # bold middle dot
    "<root/": "√",  # root sign
    "<adot/": "ȧ",  # a with dot above

    "<?/": "?",  # (?) Place-holder for unknown or illegible character.

    # Used only in the pronunciation key; not able to find what "short
    # vertical bar on top" looks like with Unicode characters, so the
    # "semilong" vowels fall back to the bare letter.
    "<asl/": "a",  # a "semilong" (has a macron above with a short
                   # vertical bar on top of the center of the macron)
                   # Used in pronunciations.
    "<esl/": "e",  # e "semilong"
    "<isl/": "i",  # i "semilong"
    "<osl/": "o",  # o "semilong"
    "<usl/": "u",  # u "semilong"
    "<th/": "th",  # th ligature
    "<ait/": "𝑎",  # a italic
    "<eit/": "𝑒",
    "<iit/": "𝑖",
    "<oit/": "𝑜",
    "<uit/": "𝑢",
    "<add/": "a",  # a with two dots below
    "<edd/": "e",
    "<idd/": "i",
    "<odd/": "o",
    "<udd/": "u",
    "<oocr/": "oo",
    "<oomac/": "oo",
    "<etil/": "ẽ",
    "<ycr/": "ў",

    # Greek letters
    "<alpha/": "α", "<ALPHA/": "Α",
    "<beta/": "β", "<BETA/": "Β",
    "<gamma/": "γ", "<GAMMA/": "Γ",
    "<delta/": "δ", "<DELTA/": "Δ",
    "<epsilon/": "ε", "<EPSILON/": "Ε",
    "<zeta/": "ζ", "<ZETA/": "Ζ",
    "<eta/": "η", "<ETA/": "Η",
    "<theta/": "θ", "<THETA/": "Θ",
    "<iota/": "ι", "<IOTA/": "Ι",
    "<kappa/": "κ", "<KAPPA/": "Κ",
    "<lambda/": "λ", "<LAMBDA/": "Λ",
    "<mu/": "μ", "<MU/": "Μ",
    "<nu/": "ν", "<NU/": "Ν",
    "<xi/": "ξ", "<XI/": "Ξ",
    "<omicron/": "ο", "<OMICRON/": "Ο",
    "<pi/": "π", "<PI/": "Π",
    "<rho/": "ρ", "<RHO/": "Ρ",
    "<sigma/": "σ", "<SIGMA/": "Σ",
    "<tau/": "τ", "<TAU/": "Τ",
    "<upsilon/": "υ", "<UPSILON/": "Υ",
    "<phi/": "φ", "<PHI/": "Φ",
    "<chi/": "χ", "<CHI/": "Χ",
    "<psi/": "ψ", "<PSI/": "Ψ",
    "<omega/": "ω", "<OMEGA/": "Ω",

    # Then there are some characters that are shown as escape sequences.
    r"\'94": "ö",
    r"\'d8": "‖",
    r"/'bd": "“",  # one instance where / is used instead of \
    r" 'bd": "“",  # two instances where \ is missing
    r"`'b8": "”",  # one instance where ` is used instead of \

    # Entities that appear in the etymology of Arabic words, but with no
    # explanation of what they stand for. Not displayed at all by GNU dico.
    "<hsdot/": "",
    "<zsdot/": "",
}
+
def replace_fake_comments(match):
    """Return the replacement text for one matched pseudo-comment.

    Intended as the ``repl`` callback of ``re.sub``: everything in the
    matched span is dropped except its line breaks, so the replacement
    contains exactly as many ``"\\n"`` characters as the match did.

    Args:
        match: Match object covering the span to blank out.

    Returns:
        A string made only of the newline characters found in the match.
    """
    matched_text = match.group(0)
    return "".join(ch for ch in matched_text if ch == "\n")
+
def convert_file(fname):
    """Read a raw dictionary file, clean up its markup and parse it as XML.

    The text goes through three preprocessing steps, in this order:

    1. ``<-- ... -->`` pseudo-comments (possibly spanning several lines) are
       replaced by the newlines they contained.  This runs before the entity
       pass, which would otherwise rewrite their ``--`` delimiters.
    2. The text is wrapped in a single ``<dict>`` root element.
    3. Every key of ``entity_map`` is replaced by its value, in the map's
       insertion order.

    Args:
        fname: Path of the file to read.

    Raises:
        OSError: If the file cannot be opened or read.
        lxml.etree.XMLSyntaxError: If the preprocessed text is not
            well-formed XML.
    """
    # Use a context manager so the handle is closed even if reading fails.
    with open(fname, "r") as source:
        rawtext = source.read()

    rawtext = re.sub(r"<--.+?-->", replace_fake_comments, rawtext, flags=re.DOTALL)
    rawtext = "<dict>" + rawtext + "</dict>"
    for entity, char in entity_map.items():
        rawtext = rawtext.replace(entity, char)

    # NOTE(review): debugging output with a hard-coded line window; presumably
    # left over from chasing a parse error near line 5725 -- confirm it can be
    # removed before merging.
    print(rawtext.splitlines()[5724:5730])
    e = lxml.etree.XML(rawtext)