13 "--": "—", # long (em) dash
15 "<Cced/": "Ç", # C cedilla
16 "<uum/": "ü", # u umlaut (diaeresis)
17 "<eacute/":"é", # e acute
18 "<acir/": "â", # a circumflex
19 "<acirc/": "â", # a circumflex
20 "<aum/": "ä", # a umlaut (diaeresis)
21 "<agrave/":"à", # a grave
22 "<aring/": "å", # a ring above
23 "<cced/": "ç", # c cedilla
24 "<ecir/": "ê", # e circumflex
25 "<eum/": "ë", # e umlaut (diaeresis)
26 "<egrave/":"è", # e grave
27 "<ium/": "ï", # i umlaut (diaeresis)
28 "<icir/": "î", # i circumflex
29 "<icirc/": "î", # i circumflex
30 "<igrave/":"ì", # i grave
31 "<Aum/": "Ä", # A umlaut
32 "<Eacute/":"É", # E acute
33 "<ncir/": "n̂", # n circumflex
34 "<ocir/": "ô", # o circumflex
35 "<oum/": "ö", # o umlaut (diaeresis)
36 "<ograve/":"ò", # o grave
37 "<ucir/": "û", # u circumflex
38 "<ugrave/":"ù", # u grave
39 "<yum/": "ÿ", # y umlaut
40 "<Oum/": "Ö", # O umlaut
41 "<Uum/": "Ü", # U umlaut (diaeresis)
42 "<pound/": "£", # pound sign (British)
43 "<aacute/":"á", # a acute
44 "<iacute/":"í", # i acute
45 "<oacute/":"ó", # o acute
46 "<uacute/":"ú", # u acute
47 "<ntil/": "ñ", # n tilde
48 "<Ntil/": "Ñ", # N tilde
49 "<mtil/": "m̆", # used in one location, looks more like m breve than m tilde BUG
50 "<ltil/": "ɫ", # l tilde
51 "<sec/": "˝", # seconds (of degree or time). Also, inches or double prime.
52 "<frac23/":"⅔", # two-thirds
53 "<frac13/":"⅓", # one-third
54 "<frac12/":"½", # one-half
55 "<frac14/":"¼", # one-quarter
56 "<frac34/":"¾", # three-quarters
57 "<frac16/": "⅙", # one sixth
58 "<hand/": "☞", # pointing hand (printer's "fist")
59 "<bprime/":"˝", # bold accent (used in pronunciations)
60 "<prime/": "´", # light accent (used in pronunciations) also minutes (of
62 "<min/": "´", # well some minutes of arc have a different entity
63 "<rdquo/": "”", # close double quote
64 "</q>": "”", # close double quote
65 "<sect/": "§", # section mark
66 "<para/": "¶", # paragraph mark
67 "<ldquo/": "“", # open double quotes
68 "<q>": "“", # open double quotes
69 "<amac/": "ā", # a macron
70 "<lsquo/": "‘", # left single quote
71 "<rsquo/": "’", # right single quote
72 "<nsm/": "ṉ", # n sub-macron
73 "<sharp/": "♯", # musical sharp
74 "<flat/": "♭", # musical flat
75 "<imac/": "ī", # i macron
76 "<emac/": "ē", # e macron
77 "<dsdot/": "ḍ", # Sanskrit/Tamil d dot
78 "<nsdot/": "ṇ", # Sanskrit/Tamil n dot
79 "<tsdot/": "ṭ", # Sanskrit/Tamil t dot
80 "<tsdo/": "ṭ", # Sanskrit/Tamil t dot
81 "<ecr/": "ĕ", # e breve
82 "<icr/": "ĭ", # i breve
83 "<ocr/": "ŏ", # o breve
84 "<omac/": "ō", # o macron
85 "<umac/": "ū", # u macron
86 "<ocar/": "ǒ", # o hacek
87 "<aemac/": "ǣ", # ae ligature macron
88 "<oemac/": "ōē", # oe ligature macron BUG
89 "<ucr/": "ŭ", # u breve
90 "<acr/": "ă", # a breve
91 "<cre/": "˘", # crescent (like a breve, but vertically centered --
92 # represents the short accent in poetic meter)
93 "<ymac/": "ȳ", # y macron
94 "<edh/": "ð", # small eth
95 "<thorn/": "þ", # small thorn
96 "<atil/": "ã", # a tilde
97 "<ndot/": "ṅ", # n with dot above
98 "<rsdot/": "ṛ", # r with a dot below
99 "<yogh/": "ȝ", # small yogh
100 "<mdash/": "—", # em dash
101 "<divide/":"÷", # division sign
102 "<deg/": "°", # degree sign
103 "<middot/":"•", # bold middle dot
104 "<root/": "√", # root sign
105 "<cuberoot/": "∛", # cubic root sign
106 "<adot/": "ȧ", # a with dot above
107 "<mdot/": "ṁ", # m with dot above
108 "<breve/": "˘", # breve
109 "<dagger/": "†", # dagger
110 "<ounceap/": "℥", # ounce
111 "<asterism/": "⁂", # asterism
112 "<times/": "×", # multiplication
113 "<8star/": "⚹", # sextile, badly named BUG
114 "<upslur/": "⁀", # musical slur, approx. with the IPA tie
115 "<downslur/": "‿", # musical slur, approx. with the IPA undertie
116 "<natural/": "♮", # natural key
117 "<schwa/": "ə", # schwa
118 "<astascending/": "☊", # ascending node, Dragon's head
119 "<astdescending": "☋", # descending node, Dragon's tail
120 "<integral2l/": "∫", # integration symbol
121 "<iques/": "¿", # inverted question mark as in Spanish
122 "<pause/": "𝄐", # pause aka. corona
123 "<nabla/": "∇", # nabla operator
124 "<dele/": "₰", # dele proofreading mark; closest Unicode is the pfennig symbol which has a similar origin
125 "<umlaut/": "¨", # diaeresis
126 "<rarr/": "→", # right arrow
128 "<ae/": "æ", # ligature ae
129 "<AE/": "Æ", # ligature AE
130 "<OE/": "Œ", # OE ligature
131 "<oe/": "œ", # oe ligature
132 "<filig/": "fi", # fi ligature
133 "<fllig/": "fl", # fl ligature
134 "<fflig/": "ff", # ff ligature
135 "<ffllig/": "ffl", # ffl ligature
137 "<?/": "(???)", # Place-holder for unknown or illegible character.
139 # used only in pronunciation key; not able to find what "short vertical
140 # bar on top" looks like with unicode chars.
141 "<asl/": "a", # a "semilong" (has a macron above with a short
142 # vertical bar on top the center of the macron)
143 # Used in pronunciations.
144 "<esl/": "e", # e "semilong"
145 "<isl/": "i", # i "semilong"
146 "<osl/": "o", # o "semilong"
147 "<usl/": "u", # u "semilong"
148 "<th/": "th",# th ligature
150 "<ait/": "𝑎", # a italic
156 "<add/": "a", # a with two dot below
166 # not perfect but good enough; only used in the definition of repetend
167 "<2dot/": "<span style='text-decoration:overline'>2</span>",
168 "<3dot/": "<span style='text-decoration:overline'>3</span>",
169 # only used as an example in the entry for progression
170 "<lbrace2/<matrix2x5><row>2, 4, 6, 8, 10</row><row>10, 8, 6, 4, 2</row></matrix2x5><rbrace2/":
171 "{2, 4, 6, 8, 10; 10, 8, 6, 4, 2}",
172 "<lbrace2/<matrix2x5><row>2, 4, 8, 16, 32, 64</row><row>64, 32, 16, 8, 4, 2</row></matrix2x5><rbrace2/":
173 "{2, 4, 8, 16, 32; 64, 32 ,16, 8, 4, 2}",
176 "<alpha/": "α", "<ALPHA/": "Α",
177 "<beta/": "β", "<BETA/": "Β",
178 "<gamma/": "γ", "<GAMMA/": "Γ",
179 "<delta/": "δ", "<DELTA/": "Δ",
180 "<epsilon/": "ε", "<EPSILON/": "Ε",
181 "<zeta/": "ζ", "<ZETA/": "Ζ",
182 "<eta/": "η", "<ETA/": "Η",
183 "<theta/": "θ", "<THETA/": "Θ",
184 "<iota/": "ι", "<IOTA/": "Ι",
185 "<kappa/": "κ", "<KAPPA/": "Κ",
186 "<lambda/": "λ", "<LAMBDA/": "Λ",
187 "<mu/": "μ", "<MU/": "Μ",
188 "<nu/": "ν", "<NU/": "Ν",
189 "<xi/": "ξ", "<XI/": "Ξ",
190 "<omicron/": "ο", "<OMICRON/": "Ο",
191 "<pi/": "π", "<PI/": "Π",
192 "<rho/": "ρ", "<RHO/": "Ρ",
193 "<sigma/": "σ", "<SIGMA/": "Σ",
196 "<tau/": "τ", "<TAU/": "Τ",
197 "<upsilon/": "υ", "<UPSILON/": "Υ",
198 "<phi/": "φ", "<PHI/": "Φ",
199 "<chi/": "χ", "<CHI/": "Χ",
200 "<psi/": "ψ", "<PSI/": "Ψ",
201 "<omega/": "ω", "<OMEGA/": "Ω",
204 # then there are some characters that are shown as escape sequences BUG
206 r"\'d8": "", # ‖ in the dictionary, no point in keeping
207 r"/'bd": "“", # one instance where / is used instead of \
208 r" 'bd": "“", # two instances where \ is missing
209 r"`'b8": "”", # one instance where ` is used instead of \
222 "<sb/": "", # ??? BUG
# Precompiled patterns for GCIDE markup; group 1, where present, captures the
# tag's contents.
226 re_unknownentity = re.compile(r"<[a-z0-9]+/")  # any <entity/ left unmapped after entity_map substitution
227 re_sources = re.compile(r"\[<source>(.*?)</source>\]")  # source attribution, e.g. [<source>1913 Webster</source>]
228 re_pos = re.compile(r"<pos>(.*?)</pos>")  # part of speech
229 re_headword = re.compile(r"<ent>(.*)</ent>")  # entry headword
230 re_subsense = re.compile(r"<sn>[0-9.]+</sn>")  # numbered sub-sense marker
231 re_altspelling = re.compile(r"<asp>(.+?)</asp>")  # alternative spelling
232 re_greekwords = re.compile(r"<grk>(.+?)</grk>")  # romanized Greek, fed to detransliterate_greek
233 skipped_hw = re.compile(r"^[0-9/]+(st|nd|rd|th)?$")  # purely numeric headwords like "1st"
234 re_simplefraction = re.compile(r"<frac([0-9])([0-9])/")  # e.g. <frac12/ = 1/2
235 re_fraction = re.compile(r"<frac([0-9]+)x([0-9]+)/")  # e.g. <frac3x16/ = 3/16
236 re_etymology = re.compile(r"<ety>(.*?)</ety>")  # etymology, moved to the end by format_definition
238 # delete the tags and their content
239 delete_tags = [re.compile(r"<pr>.*?</pr>,?\.?"),
240 re.compile(r"<hw>.*?</hw>,?\.?"),
241 re.compile(r"<mhw>.*?</mhw>,?"),
242 re.compile(r"<song>.*?</song>"),
243 re.compile(r"<song>.*?</p>"), # workaround: one <song> in the source is never closed, so match to end of paragraph
244 re.compile(r"<table.*?</table>"),
245 re.compile(r"<mtable>.*?</mtable>"),
246 re.compile(r'<pr>\(hō"m<stil/r\).'), # workaround: one <pr> in the source is never closed; match it literally
248 # replace the markup, keeping the contents intact
250 "plu plw def rj col cs cd altsp sansserif": "",
251 "ex xex qex er it ets etsep asp spn kingdom phylum subphylum class subclass ord subord fam subfam gen var varn": "i",
252 "bold ct colf stypec": "b",
# Turn one raw GCIDE definition body into presentable HTML: strip unwanted
# tags, italicize the part of speech, render fractions, map functional markup
# to formatting tags, number sub-senses, move the etymology to the end and
# fold duplicate source attributions together.
# NOTE(review): this listing has gaps; guard lines (e.g. the `if` that must
# precede pos_part_match.group(1), and the `else` branches around lines
# 278/281 and 327) are not visible here.
257 def format_definition(text):
258 for del_tag in delete_tags:
259 text = del_tag.sub("", text)
261 # <and/ and <or/ should be in roman font, so need to check where they
262 # occur for replacement
263 pos_part_match = re_pos.search(text)
265 pos_part = pos_part_match.group(1)
266 pos_part = pos_part.replace("<or/", "</i> or <i>") \
267 .replace("<and/", "</i> and <i>")
268 text = text.replace(pos_part_match.group(0), f"<i>{pos_part}</i>")
269 text = text.replace("<or/", " or ").replace("<and/", " and ")
# Render <fracNM/> and <fracNxM/> markers as Unicode fractions.
272 text = re_simplefraction.sub(mk_fraction, text)
273 text = re_fraction.sub(mk_fraction, text)
275 # translate functional tags into formatting
276 for funtag, formattag in replace_markup.items():
277 for tag in funtag.split():
# A non-empty formattag rewrites <tag>…</tag> to <formattag>…</formattag>;
# an empty one drops the markup and keeps the contents.
279 text = text.replace(f"<{tag}>", f"<{formattag}>") \
280 .replace(f"</{tag}>", f"</{formattag}>")
282 text = text.replace(f"<{tag}>", "") \
283 .replace(f"</{tag}>", "")
# Hand-mapped one-off tags: notes (printer's fist), part of speech, field
# labels, author names in small caps, and usage marks.
286 .replace("<note>☞", "☞").replace("<note> ☞", "") \
287 .replace("<note>", "☞ ").replace("</note>", "") \
288 .replace("<pos>", "(<i>").replace("</pos>", "</i>)") \
289 .replace("<fld>(", "(<i>").replace(")</fld>", "</i>)") \
290 .replace("<fld>", "<i>").replace("</fld>", "</i>") \
291 .replace("<au>", '<span style="font-variant:small-caps">') \
292 .replace("</au>", "</span>") \
293 .replace("<qau>", '<span style="font-variant:small-caps">') \
294 .replace("</qau>", "</span>") \
295 .replace("<sc>", '<span style="font-variant:small-caps">') \
296 .replace("</sc>", "</span>") \
297 .replace("<mark>[R.]</mark>", "[<i>Rare</i>]") \
298 .replace("<mark>[Obs.]</mark>", "[<i>Obsolete</i>]") \
299 .replace("<mark>[", "[<i>").replace("]</mark>", "</i>]") \
# Sub-sense markers become list items; the first occurrence opens the <ol>
# and the list is closed after the text.
302 for subsensemark in re_subsense.findall(text):
303 text = text.replace(subsensemark, "<li>")
306 text = text.replace("<li>", "<ol><li>", 1)
307 text = f"{text}</ol>"
310 text = re_greekwords.sub(detransliterate_greek, text)
312 # move etymology at the end
313 match_ety = re_etymology.search(text)
315 text = text.replace(match_ety.group(0), "")
316 text += f"<p><b>Etymology:</b> {match_ety.group(1)}</p>"
318 # simplify sources : if all identical, just have the one at the end;
319 # otherwise format it a bit better
320 sources = list(re_sources.finditer(text))
321 #assert sources, f"Looking for sources in {text}"
322 sourcenames = [source.group(1) for source in sources]
323 if len(set(sourcenames)) == 1 or \
324 (sourcenames and all("Webster" in s for s in sourcenames)):
325 text = re_sources.sub("", text)
326 text += f"<p>Source: {sources[0].group(1)}</p>"
# NOTE(review): Match objects are always distinct, so set(sources) never
# de-duplicates; two sources with identical text would make the replace
# below fire twice on the same string — confirm intended.
328 for source in set(sources):
329 text = text.replace(source.group(0), f"<p>Source: {source.group(1)}</p>")
# Fail loudly if any <entity/ escaped the entity_map substitution.
331 unknwn_ent = re_unknownentity.search(text)
332 assert not unknwn_ent, f"Unknown entity {unknwn_ent} in {text}"
334 # TODO: add check for unknown tags
def detransliterate_greek(roman):
    """Convert the dictionary's romanized Greek (<grk> contents) to Greek.

    Accepts either a plain string or an ``re.Match`` whose ``group(1)`` holds
    the romanized text, so it can be used directly as an ``re.sub`` callback.
    Returns the NFC-normalized Greek string.
    """
    if isinstance(roman, re.Match):
        roman = roman.group(1)
    # not in alphabetical order, as ch for χ needs to appear before h for η.
    # BUG FIX: ps for ψ likewise must be replaced before p and s; the original
    # did it after, so "ps" never survived long enough to match and ψ was
    # unreachable.
    greek = roman \
        .replace("ch", "χ") \
        .replace("ps", "ψ") \
        .replace("a", "α").replace("b", "β").replace("g", "γ") \
        .replace("d", "δ").replace("e", "ε").replace("z", "ζ") \
        .replace("h", "η").replace("q", "θ").replace("i", "ι") \
        .replace("k", "κ").replace("l", "λ").replace("m", "μ") \
        .replace("n", "ν").replace("x", "ξ").replace("o", "ο") \
        .replace("p", "π").replace("r", "ρ").replace("s", "σ") \
        .replace("t", "τ").replace("y", "υ").replace("u", "υ") \
        .replace("f", "φ").replace("w", "ω") \
        .replace("~", "\N{COMBINING GRAVE ACCENT}") \
        .replace("`", "\N{COMBINING ACUTE ACCENT}") \
        .replace(":", "\N{COMBINING DIAERESIS}") \
        .replace(",", "\N{COMBINING GREEK YPOGEGRAMMENI}") \
        .replace("^", "\N{COMBINING GREEK PERISPOMENI}")
    # A word-final sigma takes its final form.
    if greek.endswith("σ"):
        greek = greek[:-1] + "ς"
    # these appear before the letter, so need to move after
    greek = re.sub("'(.)", "\\1\N{COMBINING COMMA ABOVE}", greek)
    greek = re.sub("\"(.)", "\\1\N{COMBINING REVERSED COMMA ABOVE}", greek)
    return unicodedata.normalize("NFC", greek)
def test_transliterate_greek():
    """Self-test: known romanization/Greek pairs must round-trip.

    Builds one big expected/actual transcript (with unicode_escape dumps to
    make mismatching code points visible) and compares in a single assert.
    """
    expected = ""
    actual = ""
    for roman, greek in [("'archai:`zein", "ἀρχαΐζειν"), # BUG in the example the accent on the ι is wrong
                         ("o'i^nos", "οἶνος"),
                         ("\"ydra`rgyros", "ὑδράργυρος"),
                         ("lyth`rios", "λυτήριος"),
                         ("poihth`s", "ποιητής")]:
        expected += f"{roman}\n{greek} - {greek.encode('unicode_escape')}\n"
        # BUG FIX: the continuation string was missing its f prefix, so the
        # actual transcript contained the literal text
        # "{detransliterate_greek(roman).encode('unicode_escape')}" and the
        # comparison could never succeed.
        actual += f"{roman}\n{detransliterate_greek(roman)} - " \
                  f"{detransliterate_greek(roman).encode('unicode_escape')}\n"
    assert actual == expected
def mk_fraction(fraction):
    """Render a <fracNM/> / <fracNxM/> match as a Unicode vulgar fraction.

    ``fraction`` is an ``re.Match`` whose group(1) is the numerator digits and
    group(2) the denominator digits (see re_simplefraction / re_fraction).
    Returns e.g. "³⁄₁₆" using the FRACTION SLASH so renderers can compose it.

    BUG FIX: the original passed a dict with *string* keys straight to
    str.translate, which looks characters up by ordinal (int) and therefore
    silently left every digit unchanged; str.maketrans builds the proper
    ordinal-keyed table.
    """
    superscripts = str.maketrans("0123456789", "⁰¹²³⁴⁵⁶⁷⁸⁹")
    subscripts = str.maketrans("0123456789", "₀₁₂₃₄₅₆₇₈₉")
    num = fraction.group(1).translate(superscripts)
    den = fraction.group(2).translate(subscripts)
    return f"{num}⁄{den}"
393 self.current_def = ""
# Flush the definition text accumulated so far into self.entries, keyed by
# the upper-cased tuple of headwords, then reset the buffer for the next
# entry. NOTE(review): lines are missing from this listing (presumably the
# try/except around format_definition — the print reads like failure
# diagnostics — and the reset of self.current_hw).
395 def start_new_entry(self):
396 current_hw = tuple(hw.upper() for hw in self.current_hw)
397 if current_hw not in self.entries:
398 self.entries[current_hw] = []
400 defn = format_definition(self.current_def)
402 print(f"Trying to parse {current_hw}: {self.current_def}")
404 self.entries[current_hw].append(defn)
406 self.current_def = ""
# Read one CIDE.<letter> file (latin-1), strip <-- ... --> comments, map the
# GCIDE character entities to Unicode via entity_map, then walk the lines:
# each <ent> headword closes the previous entry and starts a new one, and all
# lines are accumulated into self.current_def.
# NOTE(review): the bodies of the elif branches (and the `continue`/fix-up
# statements they presumably contain) are missing from this listing.
409 def convert_file(self, fname):
410 rawtext = open(fname, "r", encoding="latin-1").read()
411 rawtext = re.sub(r"<--.+?-->", "", rawtext, flags=re.DOTALL)
412 for entity, char in entity_map.items():
413 rawtext = rawtext.replace(entity, char)
415 for line in rawtext.splitlines():
416 match = re_headword.search(line)
419 self.start_new_entry()
# Skip purely numeric headwords (e.g. "1st") and special-case a few
# malformed headwords present in the source.
422 if skipped_hw.match(hw):
424 elif hw == "Tracer/y": # BUG for these words
426 elif hw == "Adder fly/":
428 elif hw == "Gairish/ness":
430 elif hw == "P<sub>i</sub>":
# Anything else must be markup-free by now.
433 assert re.match("^[^<>/]+$", hw), hw
434 self.current_hw.append(hw)
# Alternative spellings become additional headwords for the same entry.
437 for altsp in re_altspelling.findall(line):
438 self.current_hw.append(altsp)
439 self.current_def += line
441 print(f"after{fname}, {len(self.entries)} definitions")
# Dump all parsed entries to one HTML file: an <h1> per headword tuple, then
# numbered <h2> sub-entries. NOTE(review): the lines that write the subentry
# bodies are missing from this listing.
443 def write(self, fname):
444 with open(fname, "w") as f:
445 for hws, entry in self.entries.items():
447 f.write(f"\n<h1>{hws}</h1>")
449 for i, subentry in enumerate(entry):
450 f.write(f"<h2>{i+1}. {hws}</h2>")
# Dump entries in a tab-separated, one-entry-per-line format; the asserts
# enforce that no definition contains an embedded newline, which would break
# the format. NOTE(review): the lines writing the headword/definition fields
# themselves are missing from this listing.
455 def write_to_tabseparated(self, fname):
456 with open(fname, "w") as f:
457 for hws, entry in self.entries.items():
461 for i, subentry in enumerate(entry):
462 f.write(f"<h2>{i+1}. {hws}</h2>")
463 assert "\n" not in subentry
466 assert "\n" not in entry[0]
# Driver: convert every per-letter CIDE source file, then write the combined
# tab-separated output (the per-letter HTML output is kept but disabled).
# NOTE(review): the enclosing definition and the construction of `c` are not
# visible in this listing — presumably a converter instance created just above.
472 for fname in glob("../gcide/CIDE.?"):
473 print(f"Running over {fname}")
474 c.convert_file(fname)
476 letter = fname.rsplit(".")[-1]
477 # c.write(f"GCIDE-{letter}.html")
478 c.write_to_tabseparated("GCIDE.tab_separated")
480 if __name__ == "__main__":