GCIDE_to_tab_separated

   1 #!/usr/bin/python3
   2
   3 import lxml.etree
   4 import re
   5 from glob import glob
   6 import unicodedata
   7
   8 entity_map = {
   9     "<br/":    "<br>",
  10     "&":       "&amp;",
  11     "<lt/":    "&lt;",
  12     "<gt/":    "&gt;",
  13     "--":      "—", # long (em) dash
  14
  15     "<Cced/":  "Ç", # C cedilla
  16     "<uum/":   "ü", # u umlaut (diaeresis)
  17     "<eacute/":"é", # e acute
  18     "<acir/":  "â", # a circumflex
  19     "<acirc/": "â", # a circumflex
  20     "<aum/":   "ä", # a umlaut (diaeresis)
  21     "<agrave/":"à", # a grave
  22     "<aring/": "å", # a ring above
  23     "<cced/":  "ç", # c cedilla
  24     "<ecir/":  "ê", # e circumflex
  25     "<eum/":   "ë", # e umlaut (diaeresis)
  26     "<egrave/":"è", # e grave
  27     "<ium/":   "ï", # i umlaut (diaeresis)
  28     "<icir/":  "î", # i circumflex
  29     "<icirc/": "î", # i circumflex
  30     "<igrave/":"ì", # i grave
  31     "<Aum/":   "Ä", # A umlaut
  32     "<Eacute/":"É", # E acute
  33     "<ncir/":  "n̂", # n circumflex
  34     "<ocir/":  "ô", # o circumflex
  35     "<oum/":   "ö", # o umlaut (diaeresis)
  36     "<ograve/":"ò", # o grave
  37     "<ucir/":  "û", # u circumflex
  38     "<ugrave/":"ù", # u grave
  39     "<yum/":   "ÿ", # y umlaut
  40     "<Oum/":   "Ö", # O umlaut
  41     "<Uum/":   "Ü", # U umlaut (diaeresis)
  42     "<pound/": "£", # pound sign (British)
  43     "<aacute/":"á", # a acute
  44     "<iacute/":"í", # i acute
  45     "<oacute/":"ó", # o acute
  46     "<uacute/":"ú", # u acute
  47     "<ntil/":  "ñ", # n tilde
  48     "<Ntil/":  "Ñ", # N tilde
  49     "<mtil/":  "m̆", # used in one location, looks more like m breve than m tilde BUG
  50     "<ltil/":  "ɫ", # l tilde
  51     "<sec/":   "˝", # seconds (of degree or time). Also, inches or double prime.
  52     "<frac23/":"⅔", # two-thirds
  53     "<frac13/":"⅓", # one-third
  54     "<frac12/":"½", # one-half
  55     "<frac14/":"¼", # one-quarter
  56     "<frac34/":"¾", # three-quarters
  57     "<frac16/":  "⅙", # one sixth
  58     "<hand/":  "☞", # pointing hand (printer's "fist")
  59     "<bprime/":"˝", # bold accent (used in pronunciations)
  60     "<prime/": "´", # light accent (used in pronunciations) also minutes (of
  61                     # arc or time)
  62     "<min/":   "´", # well some minutes of arc have a different entity
  63     "<rdquo/": "”", # close double quote
  64     "</q>":    "”", # close double quote
  65     "<sect/":  "§", # section mark
  66     "<para/":  "¶", # paragraph mark
  67     "<ldquo/": "“", # open double quotes
  68     "<q>":     "“", # open double quotes
  69     "<amac/":  "ā", # a macron
  70     "<lsquo/": "‘", # left single quote
  71     "<rsquo/": "’", # right single quote
  72     "<nsm/":   "ṉ", # n sub-macron
  73     "<sharp/": "♯", # musical sharp
  74     "<flat/":  "♭", # musical flat
  75     "<imac/":  "ī", # i macron
  76     "<emac/":  "ē", # e macron
  77     "<dsdot/": "ḍ", # Sanskrit/Tamil d dot
  78     "<nsdot/": "ṇ", # Sanskrit/Tamil n dot
  79     "<tsdot/": "ṭ", # Sanskrit/Tamil t dot
  80     "<tsdo/":  "ṭ", # Sanskrit/Tamil t dot
  81     "<ecr/":   "ĕ", # e breve
  82     "<icr/":   "ĭ", # i breve
  83     "<ocr/":   "ŏ", # o breve
  84     "<omac/":  "ō", # o macron
  85     "<umac/":  "ū", # u macron
  86     "<ocar/":  "ǒ", # o hacek
  87     "<aemac/": "ǣ", # ae ligature macron
  88     "<oemac/": "ōē", # oe ligature macron BUG
  89     "<ucr/":   "ŭ", # u breve
  90     "<acr/":   "ă", # a breve
  91     "<cre/":   "˘", # crescent (like a breve, but vertically centered --
  92                     # represents the short accent in poetic meter)
  93     "<ymac/":  "ȳ", # y macron
  94     "<edh/":   "ð", # small eth
  95     "<thorn/": "þ", # small thorn
  96     "<atil/":  "ã", # a tilde
  97     "<ndot/":  "ṅ", # n with dot above
  98     "<rsdot/": "ṛ", # r with a dot below
  99     "<yogh/":  "ȝ", # small yogh
 100     "<mdash/": "—", # em dash
 101     "<divide/":"÷", # division sign
 102     "<deg/":   "°", # degree sign
 103     "<middot/":"•", # bold middle dot
 104     "<root/":  "√", # root sign
 105     "<cuberoot/": "∛", # cubic root sign
 106     "<adot/":  "ȧ", # a with dot above
 107     "<mdot/":  "ṁ", # m with dot above
 108     "<breve/": "˘", # breve
 109     "<dagger/": "†", # dagger
 110     "<ounceap/": "℥", # ounce
 111     "<asterism/": "⁂", # asterism
 112     "<times/": "×", # multiplication
 113     "<8star/": "⚹", # sextile, badly named BUG
 114     "<upslur/": "⁀", # musical slur, approx. with the IPA tie
 115     "<downslur/": "‿", # musical slur, approx. with the IPA undertiw
 116     "<natural/": "♮", # natural key
 117     "<schwa/": "ə", # schwa
 118     "<astascending/": "☊", # ascending node, Dragon's head
 119     "<astdescending": "☋", # descending node, Dragon's tail
 120     "<integral2l/": "∫", # integration symbol
 121     "<iques/": "¿", # inverted question mark as in Spanish
 122     "<pause/": "𝄐", # pause aka. corona
 123     "<nabla/": "∇", # nabla operator
 124     "<dele/": "₰", # dele proofreading mark; closest Unicode is the pfennig symbol which has a similar origin
 125     "<umlaut/": "¨", # diaeresis
 126     "<rarr/":  "→", # right arrow
 127
 128     "<ae/":    "æ", # ligature ae
 129     "<AE/":    "Æ", # ligature AE
 130     "<OE/":    "Œ", # OE ligature
 131     "<oe/":    "œ", # oe ligature
 132     "<filig/": "ﬁ", # fi ligature
 133     "<fllig/": "ﬂ", # fl ligature
 134     "<fflig/": "ﬀ", # ff ligature
 135     "<ffllig/": "ﬄ", # ffl ligature
 136
 137     "<?/":     "(???)", # Place-holder for unknown or illegible character.
 138
 139     # used only in prononciation key; not able to find what "short vertical
 140     # bar on top" looks like with unicode chars.
 141     "<asl/":   "a", #  a "semilong" (has a macron above with a short
 142                     # vertical bar on top the center of the macron)
 143                     # Used in pronunciations.
 144     "<esl/":   "e", # e "semilong"
 145     "<isl/":   "i", # i "semilong"
 146     "<osl/":   "o", # o "semilong"
 147     "<usl/":   "u", # u "semilong"
 148     "<th/":    "th",# th ligature
 149
 150     "<ait/":   "𝑎", # a italic
 151     "<eit/":   "𝑒",
 152     "<iit/":   "𝑖",
 153     "<oit/":   "𝑜",
 154     "<uit/":   "𝑢",
 155
 156     "<add/":   "a", # a with two dot below
 157     "<edd/":   "e",
 158     "<idd/":   "i",
 159     "<odd/":   "o",
 160     "<udd/":   "u",
 161     "<oocr/":  "oo",
 162     "<oomac/":  "oo",
 163     "<etil/":  "ẽ",
 164     "<ycr/":   "ў",
 165
 166     # not perfect but good enough; only used in the definition of repetend
 167     "<2dot/": "<span style='text-decoration:overline'>2</span>",
 168     "<3dot/": "<span style='text-decoration:overline'>3</span>",
 169     # only used as an example in the entry for progression
 170     "<lbrace2/<matrix2x5><row>2, 4, 6, 8, 10</row><row>10, 8, 6, 4, 2</row></matrix2x5><rbrace2/":
 171     "{2, 4, 6, 8, 10; 10, 8, 6, 4, 2}",
 172     "<lbrace2/<matrix2x5><row>2, 4, 8, 16, 32, 64</row><row>64, 32, 16, 8, 4, 2</row></matrix2x5><rbrace2/":
 173     "{2, 4, 8, 16, 32; 64, 32 ,16, 8, 4, 2}",
 174
 175     # greek letters
 176     "<alpha/": "α",         "<ALPHA/": "Α",
 177     "<beta/": "β",          "<BETA/": "Β",
 178     "<gamma/": "γ",         "<GAMMA/": "Γ",
 179     "<delta/": "δ",         "<DELTA/": "Δ",
 180     "<epsilon/": "ε",       "<EPSILON/": "Ε",
 181     "<zeta/": "ζ",          "<ZETA/": "Ζ",
 182     "<eta/": "η",           "<ETA/": "Η",
 183     "<theta/": "θ",         "<THETA/": "Θ",
 184     "<iota/": "ι",          "<IOTA/": "Ι",
 185     "<kappa/": "κ",         "<KAPPA/": "Κ",
 186     "<lambda/": "λ",        "<LAMBDA/": "Λ",
 187     "<mu/": "μ",            "<MU/": "Μ",
 188     "<nu/": "ν",            "<NU/": "Ν",
 189     "<xi/": "ξ",            "<XI/": "Ξ",
 190     "<omicron/": "ο",       "<OMICRON/": "Ο",
 191     "<pi/": "π",            "<PI/": "Π",
 192     "<rho/": "ρ",           "<RHO/": "Ρ",
 193     "<sigma/": "σ",         "<SIGMA/": "Σ",
 194     "<sigmat/": "ς",
 195     "<digamma/": "ϝ",
 196     "<tau/": "τ",           "<TAU/": "Τ",
 197     "<upsilon/": "υ",       "<UPSILON/": "Υ",
 198     "<phi/": "φ",           "<PHI/": "Φ",
 199     "<chi/": "χ",           "<CHI/": "Χ",
 200     "<psi/": "ψ",           "<PSI/": "Ψ",
 201     "<omega/": "ω",         "<OMEGA/": "Ω",
 202     "<asper/": "ʽ",
 203
 204     # then there are some characters that are shown as escape sequences BUG
 205     r"\'94":   "ö",
 206     r"\'d8":   "",         # ‖ in the dictonary, no point in keeping
 207     r"/'bd":   "“",        # one instance where / is used instead of \
 208     r" 'bd":   "“",        # two instances where \ is misssing
 209     r"`'b8":   "”",        # one instance where ` is used instead of \
 210
 211     "<hsdot/": "ḥ",
 212     "<zsdot/": "ẓ",
 213     "<msdot/": "ṃ",
 214     "<zdot/": "ż",
 215     "<uring/": "ů",
 216     "<usdot/": "ụ",
 217     "<lsdot/": "ḷ",
 218     "<cacute/": "ć",
 219     "<ccar/": "č",
 220     "<csdot/": "c̣",
 221
 222     "<sb/": "",                 # ??? BUG
 223     "<colbreak/": "",
 224 }
 225
# A "<name/" entity made of lowercase letters/digits; format_definition()
# searches for it at the very end to catch entities nothing has replaced.
re_unknownentity = re.compile(r"<[a-z0-9]+/")
# A bracketed source attribution, "[<source>NAME</source>]"; group 1 is NAME.
re_sources = re.compile(r"\[<source>(.*?)</source>\]")
# A part-of-speech tag; group 1 is its content.
re_pos = re.compile(r"<pos>(.*?)</pos>")
# A headword line; group 1 is the headword.  The match is greedy, so it runs
# to the last "</ent>" on the line.
re_headword = re.compile(r"<ent>(.*)</ent>")
# A sub-sense number such as "<sn>2.</sn>"; turned into "<li>" list items.
re_subsense = re.compile(r"<sn>[0-9.]+</sn>")
# An alternative spelling; group 1 is added to the entry's headwords.
re_altspelling = re.compile(r"<asp>(.+?)</asp>")
# Transliterated Greek; group 1 is handed to detransliterate_greek().
re_greekwords = re.compile(r"<grk>(.+?)</grk>")
# Headwords consisting only of digits and slashes, with an optional ordinal
# suffix; Converter.convert_file() skips these.
skipped_hw = re.compile(r"^[0-9/]+(st|nd|rd|th)?$")
# "<fracNM/": single-digit numerator (group 1) and denominator (group 2).
re_simplefraction = re.compile(r"<frac([0-9])([0-9])/")
# "<fracNxM/": multi-digit numerator (group 1) and denominator (group 2).
re_fraction = re.compile(r"<frac([0-9]+)x([0-9]+)/")
# The etymology section; group 1 is moved to the end of the definition.
re_etymology = re.compile(r"<ety>(.*?)</ety>")
 237
# Patterns whose whole match (the tags together with their content) is
# removed from a definition; format_definition() applies them in this order
# before doing anything else.
delete_tags = [re.compile(r"<pr>.*?</pr>,?\.?"),
               re.compile(r"<hw>.*?</hw>,?\.?"),
               re.compile(r"<mhw>.*?</mhw>,?"),
               re.compile(r"<song>.*?</song>"),
               re.compile(r"<song>.*?</p>"), # BUG in the data: <song> is not closed
               re.compile(r"<table.*?</table>"),
               re.compile(r"<mtable>.*?</mtable>"),
               re.compile(r'<pr>\(hō"m<stil/r\).'), # BUG in the data: <pr> is not closed
              ]
# Replace the markup, keeping the contents intact.
# Each key is a space-separated list of GCIDE tag names; the value is the
# HTML tag name that replaces them in format_definition().  An empty value
# means the opening and closing tags are simply dropped.
replace_markup = {
    "plu plw def rj col cs cd altsp sansserif": "",
    "ex xex qex er it ets etsep asp spn kingdom phylum subphylum class subclass ord subord fam subfam gen var varn": "i",
    "bold ct colf stypec": "b",
    "subs": "sub",
    "sups": "sup",
}
 256
 257 def format_definition(text):
 258     for del_tag in delete_tags:
 259         text = del_tag.sub("", text)
 260
 261     # <and/ and <or/ should be in roman font, so need to check where they
 262     # occur for replacement
 263     pos_part_match = re_pos.search(text)
 264     if pos_part_match:
 265         pos_part = pos_part_match.group(1)
 266         pos_part = pos_part.replace("<or/", "</i> or <i>") \
 267                            .replace("<and/", "</i> and <i>")
 268         text = text.replace(pos_part_match.group(0), f"<i>{pos_part}</i>")
 269     text = text.replace("<or/", " or ").replace("<and/", " and ")
 270
 271     # fractions
 272     text = re_simplefraction.sub(mk_fraction, text)
 273     text = re_fraction.sub(mk_fraction, text)
 274
 275     # translate functional tags into formatting
 276     for funtag, formattag in replace_markup.items():
 277         for tag in funtag.split():
 278             if formattag:
 279                 text = text.replace(f"<{tag}>", f"<{formattag}>") \
 280                            .replace(f"</{tag}>", f"</{formattag}>")
 281             else:
 282                 text = text.replace(f"<{tag}>", "") \
 283                            .replace(f"</{tag}>", "")
 284
 285     text = text \
 286         .replace("<note>☞", "☞").replace("<note> ☞", "") \
 287         .replace("<note>", "☞ ").replace("</note>", "") \
 288         .replace("<pos>", "(<i>").replace("</pos>", "</i>)") \
 289         .replace("<fld>(", "(<i>").replace(")</fld>", "</i>)") \
 290         .replace("<fld>", "<i>").replace("</fld>", "</i>") \
 291         .replace("<au>", '<span style="font-variant:small-caps">') \
 292         .replace("</au>", "</span>") \
 293         .replace("<qau>", '<span style="font-variant:small-caps">') \
 294         .replace("</qau>", "</span>") \
 295         .replace("<sc>", '<span style="font-variant:small-caps">') \
 296         .replace("</sc>", "</span>") \
 297         .replace("<mark>[R.]</mark>", "[<i>Rare</i>]") \
 298         .replace("<mark>[Obs.]</mark>", "[<i>Obsolete</i>]") \
 299         .replace("<mark>[", "[<i>").replace("]</mark>", "</i>]") \
 300
 301     wrapol = False
 302     for subsensemark in re_subsense.findall(text):
 303         text = text.replace(subsensemark, "<li>")
 304         wrapol = True
 305     if wrapol:
 306         text = text.replace("<li>", "<ol><li>", 1)
 307         text = f"{text}</ol>"
 308
 309     # embedded greek
 310     text = re_greekwords.sub(detransliterate_greek, text)
 311
 312     # move etymology at the end
 313     match_ety = re_etymology.search(text)
 314     if match_ety:
 315         text = text.replace(match_ety.group(0), "")
 316         text += f"<p><b>Etymology:</b> {match_ety.group(1)}</p>"
 317
 318     # simplify sources : if all identical, just have the one at the end;
 319     # otherwise format it a bit better
 320     sources = list(re_sources.finditer(text))
 321     #assert sources, f"Looking for sources in {text}"
 322     sourcenames = [source.group(1) for source in sources]
 323     if len(set(sourcenames)) == 1 or \
 324        (sourcenames and all("Webster" in s for s in sourcenames)):
 325         text = re_sources.sub("", text)
 326         text +=  f"<p>Source: {sources[0].group(1)}</p>"
 327     else:
 328         for source in set(sources):
 329             text = text.replace(source.group(0), f"<p>Source: {source.group(1)}</p>")
 330
 331     unknwn_ent = re_unknownentity.search(text)
 332     assert not unknwn_ent, f"Unknown entity {unknwn_ent} in {text}"
 333
 334     # TODO: add check for unknown tags
 335
 336     return text
 337
 338 def detransliterate_greek(roman):
 339     if isinstance(roman, re.Match):
 340         roman = roman.group(1)
 341     # not in alphabetical order, as ch for χ needs to appear before h for η
 342     greek = roman\
 343         .replace("ch", "χ") \
 344         .replace("a", "α").replace("b", "β").replace("g", "γ") \
 345         .replace("d", "δ").replace("e", "ε").replace("z", "ζ") \
 346         .replace("h", "η").replace("q", "θ").replace("i", "ι") \
 347         .replace("k", "κ").replace("l", "λ").replace("m", "μ") \
 348         .replace("n", "ν").replace("x", "ξ").replace("o", "ο") \
 349         .replace("p", "π").replace("r", "ρ").replace("s", "σ") \
 350         .replace("t", "τ").replace("y", "υ").replace("u", "υ") \
 351         .replace("f", "φ").replace("ps", "ψ").replace("w", "ω") \
 352         .replace("~", "\N{COMBINING GRAVE ACCENT}") \
 353         .replace("`", "\N{COMBINING ACUTE ACCENT}") \
 354         .replace(":", "\N{COMBINING DIAERESIS}") \
 355         .replace(",", "\N{COMBINING GREEK YPOGEGRAMMENI}") \
 356         .replace("^", "\N{COMBINING GREEK PERISPOMENI}")
 357     if greek.endswith("σ"):
 358         greek = greek[:-1] + "ς"
 359     # these appear before the letter, so need to move after
 360     greek = re.sub("'(.)", "\\1\N{COMBINING COMMA ABOVE}", greek)
 361     greek = re.sub("\"(.)", "\\1\N{COMBINING REVERSED COMMA ABOVE}", greek)
 362     return unicodedata.normalize("NFC", greek)
 363
 364 def test_transliterate_greek():
 365     expected = ""
 366     actual = ""
 367     for roman, greek in [("'archai:`zein", "ἀρχαΐζειν"), # BUG in the example the accent on the ι is wrong
 368                          ("zw^,on", "ζῷον"),
 369                          ("o'i^nos", "οἶνος"),
 370                          ("\"ydra`rgyros", "ὑδράργυρος"),
 371                          ("lyth`rios", "λυτήριος"),
 372                          ("poihth`s", "ποιητής")]:
 373         expected += f"{roman}\n{greek} - {greek.encode('unicode_escape')}\n"
 374         actual += f"{roman}\n{detransliterate_greek(roman)} - " \
 375             "{detransliterate_greek(roman).encode('unicode_escape')}\n"
 376     assert actual == expected
 377
 378 def mk_fraction(fraction):
 379     num = fraction.group(1).translate({"1": "¹", "2": "²", "3": "³",
 380                                        "4": "⁴", "5": "⁵", "6": "⁶",
 381                                        "7": "⁷", "8": "⁸", "9": "⁹",
 382                                        "0": "⁰"})
 383     den = fraction.group(2).translate({"1": "₁", "2": "₂", "3": "₃",
 384                                        "4": "₄", "5": "₅", "6": "₆",
 385                                        "7": "₇", "8": "₈", "9": "₉",
 386                                        "0": "₀"})
 387     return f"{num}⁄{den}"
 388
 389 class Converter:
 390     def __init__(self):
 391         self.entries = {}
 392         self.current_hw = []
 393         self.current_def = ""
 394
 395     def start_new_entry(self):
 396         current_hw = tuple(hw.upper() for hw in self.current_hw)
 397         if current_hw not in self.entries:
 398             self.entries[current_hw] = []
 399         try:
 400             defn = format_definition(self.current_def)
 401         except:
 402             print(f"Trying to parse {current_hw}: {self.current_def}")
 403             raise
 404         self.entries[current_hw].append(defn)
 405
 406         self.current_def = ""
 407         self.current_hw = []
 408
 409     def convert_file(self, fname):
 410         rawtext = open(fname, "r", encoding="latin-1").read()
 411         rawtext = re.sub(r"<--.+?-->", "", rawtext, flags=re.DOTALL)
 412         for entity, char in entity_map.items():
 413             rawtext = rawtext.replace(entity, char)
 414
 415         for line in rawtext.splitlines():
 416             match = re_headword.search(line)
 417             if match:
 418                 if self.current_def:
 419                     self.start_new_entry()
 420
 421                 hw = match.group(1)
 422                 if skipped_hw.match(hw):
 423                     continue
 424                 elif hw == "Tracer/y": # BUG for these words
 425                     hw = "Tracery"
 426                 elif hw == "Adder fly/":
 427                     hw = "Adder fly"
 428                 elif hw == "Gairish/ness":
 429                     hw = "Gairishness"
 430                 elif hw == "P<sub>i</sub>":
 431                     hw = "Pi"
 432
 433                 assert re.match("^[^<>/]+$", hw), hw
 434                 self.current_hw.append(hw)
 435             else:
 436                 if self.current_hw:
 437                     for altsp in re_altspelling.findall(line):
 438                         self.current_hw.append(altsp)
 439                     self.current_def += line
 440
 441         print(f"after{fname}, {len(self.entries)} definitions")
 442
 443     def write(self, fname):
 444         with open(fname, "w") as f:
 445             for hws, entry in self.entries.items():
 446                 hws = ", ".join(hws)
 447                 f.write(f"\n<h1>{hws}</h1>")
 448                 if len(entry) > 1:
 449                     for i, subentry in enumerate(entry):
 450                         f.write(f"<h2>{i+1}. {hws}</h2>")
 451                         f.write(subentry)
 452                 else:
 453                     f.write(entry[0])
 454
 455     def write_to_tabseparated(self, fname):
 456         with open(fname, "w") as f:
 457             for hws, entry in self.entries.items():
 458                 hws = ", ".join(hws)
 459                 f.write(f"{hws}\t")
 460                 if len(entry) > 1:
 461                     for i, subentry in enumerate(entry):
 462                         f.write(f"<h2>{i+1}. {hws}</h2>")
 463                         assert "\n" not in subentry
 464                         f.write(subentry)
 465                 else:
 466                     assert "\n" not in entry[0]
 467                     f.write(entry[0])
 468                 f.write("\n")
 469
 470 def main():
 471     c = Converter()
 472     for fname in glob("../gcide/CIDE.?"):
 473         print(f"Running over {fname}")
 474         c.convert_file(fname)
 475
 476         letter = fname.rsplit(".")[-1]
 477         # c.write(f"GCIDE-{letter}.html")
 478     c.write_to_tabseparated("GCIDE.tab_separated")
 479
 480 if __name__ == "__main__":
 481     main()