11 "--": "—", # long (em) dash
12 "<Cced/": "Ç", # C cedilla
13 "<uum/": "ü", # u umlaut (diaeresis)
14 "<eacute/":"é", # e acute
15 "<acir/": "â", # a circumflex
16 "<aum/": "ä", # a umlaut (diaeresis)
17 "<agrave/":"à", # a grave
18 "<aring/": "å", # a ring above
19 "<cced/": "ç", # c cedilla
20 "<ecir/": "ê", # e circumflex
21 "<eum/": "ë", # e umlaut (diaeresis)
22 "<egrave/":"è", # e grave
23 "<ium/": "ï", # i umlaut (diaeresis)
24 "<icir/": "î", # i circumflex
25 "<icirc/": "î", # i circumflex
26 "<igrave/":"ì", # i grave
27 "<Aum/": "Ä", # A umlaut
28 "<Eacute/":"É", # E acute
29 "<ae/": "æ", # ligature ae
30 "<AE/": "Æ", # ligature AE
31 "<ocir/": "ô", # o circumflex
32 "<oum/": "ö", # o umlaut (diaeresis)
33 "<ograve/":"ò", # o grave
34 "<ucir/": "û", # u circumflex
35 "<ugrave/":"ù", # u grave
36 "<yum/": "ÿ", # y umlaut
37 "<Oum/": "Ö", # O umlaut
38 "<Uum/": "Ü", # U umlaut (diaeresis)
39 "<pound/": "£", # pound sign (British)
40 "<aacute/":"á", # a acute
41 "<iacute/":"í", # i acute
42 "<oacute/":"ó", # o acute
43 "<uacute/":"ú", # u acute
44 "<ntil/": "ñ", # n tilde
45 "<Ntil/": "Ñ", # N tilde
46 "<frac23/":"⅔", # two-thirds
47 "<frac13/":"⅓", # one-third
48 "<sec/": "˝", # seconds (of degree or time). Also, inches or double prime.
49 "<frac12/":"½", # one-half
50 "<frac14/":"¼", # one-quarter
51 "<hand/": "☞", # pointing hand (printer's "fist")
52 "<bprime/":"˝", # bold accent (used in pronunciations)
53 "<prime/": "´", # light accent (used in pronunciations) also minutes (of
55 "<rdquo/": "”", # close double quote
56 "<sect/": "§", # section mark
57 "<ldquo/": "“", # open double quotes
58 "<amac/": "ā", # a macron
59 "<lsquo/": "‘", # left single quote
60 "<nsm/": "ṉ", # "n sub-macron"
61 "<sharp/": "♯", # musical sharp
62 "<flat/": "♭", # musical flat
63 "<imac/": "ī", # i macron
64 "<emac/": "ē", # e macron
65 "<dsdot/": "ḍ", # Sanskrit/Tamil d dot
66 "<nsdot/": "ṇ", # Sanskrit/Tamil n dot
67 "<tsdot/": "ṭ", # Sanskrit/Tamil t dot
68 "<ecr/": "ĕ", # e breve
69 "<icr/": "ĭ", # i breve
70 "<ocr/": "ŏ", # o breve
71 "<OE/": "Œ", # OE ligature
72 "<oe/": "œ", # oe ligature
73 "<omac/": "ō", # o macron
74 "<umac/": "ū", # u macron
75 "<ocar/": "ǒ", # o hacek
76 "<aemac/": "ǣ", # ae ligature macron
77 "<oemac/": "ō", # oe ligature macron
78 "<ucr/": "ŭ", # u breve
79 "<acr/": "ă", # a breve
80 "<cre/": "˘", # crescent (like a breve, but vertically centered --
81 # represents the short accent in poetic meter)
82 "<ymac/": "ȳ", # y macron
83 "<edh/": "ð", # small eth
84 "<thorn/": "þ", # small thorn
85 "<atil/": "ã", # a tilde
86 "<ndot/": "ṅ", # n with dot above
87 "<rsdot/": "ṛ", # r with a dot below
88 "<yogh/": "ȝ", # small yogh
89 "<mdash/": "—", # em dash
90 "<divide/":"÷", # division sign
91 "<deg/": "°", # degree sign
92 "<middot/":"•", # bold middle dot
93 "<root/": "√", # root sign
94 "<adot/": "ȧ", # a with dot above
96 "<?/": "?", #(?) Place-holder for unknown or illegible character.
98 # used only in pronunciation key; not able to find what "short vertical
99 # bar on top" looks like with unicode chars.
100 "<asl/": "a", # a "semilong" (has a macron above with a short
101 # vertical bar on top the center of the macron)
102 # Used in pronunciations.
103 "<esl/": "e", # e "semilong"
104 "<isl/": "i", # i "semilong"
105 "<osl/": "o", # o "semilong"
106 "<usl/": "u", # u "semilong"
107 "<th/": "th",# th ligature
108 "<ait/": "𝑎", # a italic
113 "<add/": "a", # a with two dot below
124 "<alpha/": "α", "<ALPHA/": "Α",
125 "<beta/": "β", "<BETA/": "Β",
126 "<gamma/": "γ", "<GAMMA/": "Γ",
127 "<delta/": "δ", "<DELTA/": "Δ",
128 "<epsilon/": "ε", "<EPSILON/": "Ε",
129 "<zeta/": "ζ", "<ZETA/": "Ζ",
130 "<eta/": "η", "<ETA/": "Η",
131 "<theta/": "θ", "<THETA/": "Θ",
132 "<iota/": "ι", "<IOTA/": "Ι",
133 "<kappa/": "κ", "<KAPPA/": "Κ",
134 "<lambda/": "λ", "<LAMBDA/": "Λ",
135 "<mu/": "μ", "<MU/": "Μ",
136 "<nu/": "ν", "<NU/": "Ν",
137 "<xi/": "ξ", "<XI/": "Ξ",
138 "<omicron/": "ο", "<OMICRON/": "Ο",
139 "<pi/": "π", "<PI/": "Π",
140 "<rho/": "ρ", "<RHO/": "Ρ",
141 "<sigma/": "σ", "<SIGMA/": "Σ",
142 "<tau/": "τ", "<TAU/": "Τ",
143 "<upsilon/": "υ", "<UPSILON/": "Υ",
144 "<phi/": "φ", "<PHI/": "Φ",
145 "<chi/": "χ", "<CHI/": "Χ",
146 "<psi/": "ψ", "<PSI/": "Ψ",
147 "<omega/": "ω", "<OMEGA/": "Ω",
149 # then there are some characters that are shown as escape sequences
152 r"/'bd": "“", # one instance where / is used instead of \
153 r" 'bd": "“", # two instances where \ is missing
154 r"`'b8": "”", # one instance where ` is used instead of \
156 # entities that appear in the etymology of Arabic words, but no explanation
157 # of what they stand for. Not displayed at all by GNU dico.
def replace_fake_comments(match):
    """Return one newline for every newline inside the matched text.

    Written as a replacement callback for ``re.sub``: ``convert_file``
    passes it as the ``repl`` argument for the ``<--.+?-->`` pattern, so
    each matched span is swapped for a string made only of its own line
    breaks.  Presumably this keeps the line count of the text unchanged
    so later line-based diagnostics still line up -- confirm with caller.

    Args:
        match: a regex match object; only ``match.group(0)`` is read.

    Returns:
        A string of ``"\\n"`` characters, one per newline in the match
        (the empty string when the match spans a single line).
    """
    matched_text = match.group(0)
    # Keep the line breaks, drop everything else.
    return "".join(ch for ch in matched_text if ch == "\n")
166 def convert_file(fname):
167 rawtext = open(fname, "r").read()
168 rawtext = re.sub(r"<--.+?-->", replace_fake_comments, rawtext, flags=re.DOTALL)
169 rawtext = f"<dict>" + rawtext + f"</dict>"
170 for entity, char in entity_map.items():
171 rawtext = rawtext.replace(entity, char)
172 print(rawtext.splitlines()[5724:5730])
173 e = lxml.etree.XML(rawtext)