import re
import unicodedata
from glob import glob

import lxml.etree
# Mapping from GCIDE pseudo-SGML entities (and a few stray escape sequences)
# to the Unicode text they represent.  Applied by simple string replacement
# over the whole raw file, so multi-character keys are fine.
entity_map = {
    "<br/": "<br>",
    "&": "&",
    "<lt/": "<",
    "<gt/": ">",
    "--": "—",  # long (em) dash

    "<Cced/": "Ç",  # C cedilla
    "<uum/": "ü",   # u umlaut (diaeresis)
    "<eacute/": "é",  # e acute
    "<acir/": "â",  # a circumflex
    "<acirc/": "â",  # a circumflex
    "<aum/": "ä",   # a umlaut (diaeresis)
    "<agrave/": "à",  # a grave
    "<aring/": "å",  # a ring above
    "<igrave/": "ì",  # i grave
    "<Aum/": "Ä",   # A umlaut
    "<Eacute/": "É",  # E acute
    "<ncir/": "n̂",  # n circumflex
    "<ocir/": "ô",  # o circumflex
    "<oum/": "ö",   # o umlaut (diaeresis)
    "<ograve/": "ò",  # o grave
    "<uacute/": "ú",  # u acute
    "<ntil/": "ñ",  # n tilde
    "<Ntil/": "Ñ",  # N tilde
    "<mtil/": "m̆",  # used in one location, looks more like m breve than m tilde BUG
    "<ltil/": "ɫ",  # l tilde
    "<sec/": "˝",   # seconds (of degree or time). Also, inches or double prime.
    "<frac23/": "⅔",  # two-thirds
    "<frac13/": "⅓",  # one-third
    "<frac12/": "½",  # one-half
    "<frac14/": "¼",  # one-quarter
    "<frac34/": "¾",  # three-quarters
    "<frac16/": "⅙",  # one sixth
    "<hand/": "☞",  # pointing hand (printer's "fist")
    "<bprime/": "˝",  # bold accent (used in pronunciations)
    "<prime/": "´",  # light accent (used in pronunciations) also minutes (of
                     # arc or time)
    "<min/": "´",   # well some minutes of arc have a different entity
    "<rdquo/": "”",  # close double quote
    "</q>": "”",    # close double quote
    "<sect/": "§",  # section mark
    "<para/": "¶",  # paragraph mark
    "<ldquo/": "“",  # open double quotes
    "<q>": "“",     # open double quotes
    "<amac/": "ā",  # a macron
    "<lsquo/": "‘",  # left single quote
    "<rsquo/": "’",  # right single quote
    "<nsm/": "ṉ",   # n sub-macron
    "<sharp/": "♯",  # musical sharp
    "<flat/": "♭",  # musical flat
    "<imac/": "ī",  # i macron
    "<dsdot/": "ḍ",  # Sanskrit/Tamil d dot
    "<nsdot/": "ṇ",  # Sanskrit/Tamil n dot
    "<tsdot/": "ṭ",  # Sanskrit/Tamil t dot
    "<tsdo/": "ṭ",   # Sanskrit/Tamil t dot (misspelled variant in the source)
    "<ecr/": "ĕ",   # e breve
    "<icr/": "ĭ",   # i breve
    "<ocr/": "ŏ",   # o breve
    "<omac/": "ō",  # o macron
    "<umac/": "ū",  # u macron
    "<ocar/": "ǒ",  # o hacek
    "<aemac/": "ǣ",  # ae ligature macron
    "<oemac/": "ōē",  # oe ligature macron BUG
    "<ucr/": "ŭ",   # u breve
    "<acr/": "ă",   # a breve
    "<cre/": "˘",   # crescent (like a breve, but vertically centered --
    "<deg/": "°",   # degree sign
    "<middot/": "•",  # bold middle dot
    "<root/": "√",  # root sign
    "<cuberoot/": "∛",  # cubic root sign
    "<adot/": "ȧ",  # a with dot above
    "<mdot/": "ṁ",  # m with dot above
    "<breve/": "˘",  # breve
    "<dagger/": "†",  # dagger
    "<ounceap/": "℥",  # ounce
    "<asterism/": "⁂",  # asterism
    "<times/": "×",  # multiplication
    "<8star/": "⚹",  # sextile, badly named BUG
    "<upslur/": "⁀",  # musical slur, approx. with the IPA tie
    "<downslur/": "‿",  # musical slur, approx. with the IPA undertie
    "<natural/": "♮",  # natural key
    "<schwa/": "ə",  # schwa
    "<astascending/": "☊",  # ascending node, Dragon's head
    "<astdescending": "☋",  # descending node, Dragon's tail
    "<integral2l/": "∫",  # integration symbol
    "<iques/": "¿",  # inverted question mark as in Spanish
    "<pause/": "𝄐",  # pause aka. corona
    "<nabla/": "∇",  # nabla operator
    "<dele/": "₰",  # dele proofreading mark; closest Unicode is the pfennig symbol which has a similar origin
    "<umlaut/": "¨",  # diaeresis
    "<rarr/": "→",  # right arrow

    "<ae/": "æ",    # ligature ae
    "<AE/": "Æ",    # ligature AE
    "<OE/": "Œ",    # OE ligature
    "<oe/": "œ",    # oe ligature
    "<filig/": "fi",   # fi ligature
    "<fllig/": "fl",   # fl ligature
    "<fflig/": "ff",   # ff ligature
    "<ffllig/": "ffl",  # ffl ligature
    "<?/": "(???)",  # Place-holder for unknown or illegible character.
    # used only in pronunciation key; not able to find what "short vertical
    # bar on top" looks like with unicode chars.
    "<osl/": "o",   # o "semilong"
    "<usl/": "u",   # u "semilong"
    "<th/": "th",   # th ligature

    "<ait/": "𝑎",   # a italic
    "<eit/": "𝑒",
    "<iit/": "𝑖",
    "<oit/": "𝑜",
    "<uit/": "𝑢",

    "<add/": "a",   # a with two dot below
    "<edd/": "e",
    "<idd/": "i",
    "<etil/": "ẽ",
    "<ycr/": "ў",
    # not perfect but good enough; only used in the definition of repetend
    "<2dot/": "<span style='text-decoration:overline'>2</span>",
    "<3dot/": "<span style='text-decoration:overline'>3</span>",
    # only used as an example in the entry for progression
    "<lbrace2/<matrix2x5><row>2, 4, 6, 8, 10</row><row>10, 8, 6, 4, 2</row></matrix2x5><rbrace2/":
        "{2, 4, 6, 8, 10; 10, 8, 6, 4, 2}",
    # note: value now lists all six numbers of each row, matching the key
    "<lbrace2/<matrix2x5><row>2, 4, 8, 16, 32, 64</row><row>64, 32, 16, 8, 4, 2</row></matrix2x5><rbrace2/":
        "{2, 4, 8, 16, 32, 64; 64, 32, 16, 8, 4, 2}",

    # greek letters
    "<alpha/": "α", "<ALPHA/": "Α",
    "<beta/": "β", "<BETA/": "Β",
    "<pi/": "π", "<PI/": "Π",
    "<rho/": "ρ", "<RHO/": "Ρ",
    "<sigma/": "σ", "<SIGMA/": "Σ",
    "<sigmat/": "ς",
    "<digamma/": "ϝ",
    "<tau/": "τ", "<TAU/": "Τ",
    "<upsilon/": "υ", "<UPSILON/": "Υ",
    "<phi/": "φ", "<PHI/": "Φ",
    "<chi/": "χ", "<CHI/": "Χ",
    "<psi/": "ψ", "<PSI/": "Ψ",
    "<omega/": "ω", "<OMEGA/": "Ω",
    "<asper/": "ʽ",
    # then there are some characters that are shown as escape sequences BUG
    r"\'94": "ö",
    r"\'d8": "",  # ‖ in the dictionary, no point in keeping
    r"/'bd": "“",  # one instance where / is used instead of \
    r" 'bd": "“",  # two instances where \ is missing
    r"`'b8": "”",  # one instance where ` is used instead of \
    "<hsdot/": "ḥ",
    "<zsdot/": "ẓ",
    "<msdot/": "ṃ",
    "<zdot/": "ż",
    "<uring/": "ů",
    "<usdot/": "ụ",
    "<lsdot/": "ḷ",
    "<cacute/": "ć",
    "<ccar/": "č",
    "<csdot/": "c̣",

    "<sb/": "",  # ??? BUG
    "<colbreak/": "",
}
# Regular expressions used while parsing the GCIDE pseudo-SGML markup.
re_unknownentity = re.compile(r"<[a-z0-9]+/")
re_sources = re.compile(r"\[<source>(.*?)</source>\]")
re_pos = re.compile(r"<pos>(.*?)</pos>")
re_headword = re.compile(r"<ent>(.*)</ent>")
re_subsense = re.compile(r"<sn>[0-9.]+</sn>")
re_altspelling = re.compile(r"<asp>(.+?)</asp>")
re_greekwords = re.compile(r"<grk>(.+?)</grk>")
skipped_hw = re.compile(r"^[0-9/]+(st|nd|rd|th)?$")  # headwords like "42nd" are skipped
re_simplefraction = re.compile(r"<frac([0-9])([0-9])/")
re_fraction = re.compile(r"<frac([0-9]+)x([0-9]+)/")
re_etymology = re.compile(r"<ety>(.*?)</ety>")

# delete the tags and their content
delete_tags = [re.compile(r"<pr>.*?</pr>,?\.?"),
               re.compile(r"<hw>.*?</hw>,?\.?"),
               re.compile(r"<mhw>.*?</mhw>,?"),
               re.compile(r"<song>.*?</song>"),
               re.compile(r"<song>.*?</p>"),  # blah BUG song not closed
               re.compile(r"<table.*?</table>"),
               re.compile(r"<mtable>.*?</mtable>"),
               re.compile(r'<pr>\(hō"m<stil/r\).'),  # BUG pr not closed
               ]
# replace the markup, keeping the contents intact; each key is a
# space-separated list of source tags, the value is the HTML tag they become
# ("" means the tags are simply stripped)
replace_markup = {
    "plu plw def rj col cs cd altsp sansserif": "",
    "ex xex qex er it ets etsep asp spn kingdom phylum subphylum class subclass ord subord fam subfam gen var varn": "i",
    "bold ct colf stypec": "b",
    "subs": "sub",
    "sups": "sup",
}
+
def format_definition(text):
    """Convert one entry's pseudo-SGML definition text into simple HTML.

    Deletes pronunciation/table/song spans, italicizes part-of-speech and
    quote markup, renders fractions, numbers sub-senses as an <ol>,
    de-transliterates embedded Greek, moves the etymology to the end and
    folds the per-sense source lines into "Source:" paragraphs.
    Raises AssertionError if an unconverted entity survives.
    """
    for del_tag in delete_tags:
        text = del_tag.sub("", text)

    # <and/ and <or/ should be in roman font, so need to check where they
    # occur for replacement
    pos_part_match = re_pos.search(text)
    if pos_part_match:
        pos_part = pos_part_match.group(1)
        pos_part = pos_part.replace("<or/", "</i> or <i>") \
                           .replace("<and/", "</i> and <i>")
        text = text.replace(pos_part_match.group(0), f"<i>{pos_part}</i>")
    text = text.replace("<or/", " or ").replace("<and/", " and ")

    # fractions
    text = re_simplefraction.sub(mk_fraction, text)
    text = re_fraction.sub(mk_fraction, text)

    # translate functional tags into formatting
    for funtag, formattag in replace_markup.items():
        for tag in funtag.split():
            if formattag:
                text = text.replace(f"<{tag}>", f"<{formattag}>") \
                           .replace(f"</{tag}>", f"</{formattag}>")
            else:
                text = text.replace(f"<{tag}>", "") \
                           .replace(f"</{tag}>", "")

    text = text \
        .replace("<note>☞", "☞").replace("<note> ☞", "") \
        .replace("<note>", "☞ ").replace("</note>", "") \
        .replace("<pos>", "(<i>").replace("</pos>", "</i>)") \
        .replace("<fld>(", "(<i>").replace(")</fld>", "</i>)") \
        .replace("<fld>", "<i>").replace("</fld>", "</i>") \
        .replace("<au>", '<span style="font-variant:small-caps">') \
        .replace("</au>", "</span>") \
        .replace("<qau>", '<span style="font-variant:small-caps">') \
        .replace("</qau>", "</span>") \
        .replace("<sc>", '<span style="font-variant:small-caps">') \
        .replace("</sc>", "</span>") \
        .replace("<mark>[R.]</mark>", "[<i>Rare</i>]") \
        .replace("<mark>[Obs.]</mark>", "[<i>Obsolete</i>]") \
        .replace("<mark>[", "[<i>").replace("]</mark>", "</i>]")

    # sub-sense numbers <sn>1.</sn> become <li>; wrap the whole thing in an
    # <ol> when at least one was seen
    wrapol = False
    for subsensemark in re_subsense.findall(text):
        text = text.replace(subsensemark, "<li>")
        wrapol = True
    if wrapol:
        text = text.replace("<li>", "<ol><li>", 1)
        text = f"{text}</ol>"

    # embedded greek
    text = re_greekwords.sub(detransliterate_greek, text)

    # move etymology at the end
    match_ety = re_etymology.search(text)
    if match_ety:
        text = text.replace(match_ety.group(0), "")
        text += f"<p><b>Etymology:</b> {match_ety.group(1)}</p>"

    # simplify sources : if all identical, just have the one at the end;
    # otherwise format it a bit better
    sources = list(re_sources.finditer(text))
    #assert sources, f"Looking for sources in {text}"
    sourcenames = [source.group(1) for source in sources]
    if len(set(sourcenames)) == 1 or \
       (sourcenames and all("Webster" in s for s in sourcenames)):
        text = re_sources.sub("", text)
        text += f"<p>Source: {sources[0].group(1)}</p>"
    else:
        # note: replace() rewrites every occurrence of a given source string
        # in one call, so duplicates in the list are harmless no-ops
        for source in sources:
            text = text.replace(source.group(0), f"<p>Source: {source.group(1)}</p>")

    unknwn_ent = re_unknownentity.search(text)
    assert not unknwn_ent, f"Unknown entity {unknwn_ent} in {text}"

    # TODO: add check for unknown tags

    return text
+
def detransliterate_greek(roman):
    """Turn the GCIDE romanized Greek notation back into Greek script.

    Accepts either a string or an re.Match whose group(1) is the romanized
    word.  Letters map one-to-one; the punctuation marks ~ ` : , ^ become
    combining accents, and ' / " (rough/smooth breathing) are written BEFORE
    the letter in the source, so they are moved after it before normalizing.
    Returns the NFC-normalized Greek string.
    """
    if isinstance(roman, re.Match):
        roman = roman.group(1)
    # not in alphabetical order, as ch for χ needs to appear before h for η
    greek = roman\
        .replace("ch", "χ") \
        .replace("a", "α").replace("b", "β").replace("g", "γ") \
        .replace("d", "δ").replace("e", "ε").replace("z", "ζ") \
        .replace("h", "η").replace("q", "θ").replace("i", "ι") \
        .replace("k", "κ").replace("l", "λ").replace("m", "μ") \
        .replace("n", "ν").replace("x", "ξ").replace("o", "ο") \
        .replace("p", "π").replace("r", "ρ").replace("s", "σ") \
        .replace("t", "τ").replace("y", "υ").replace("u", "υ") \
        .replace("f", "φ").replace("ps", "ψ").replace("w", "ω") \
        .replace("~", "\N{COMBINING GRAVE ACCENT}") \
        .replace("`", "\N{COMBINING ACUTE ACCENT}") \
        .replace(":", "\N{COMBINING DIAERESIS}") \
        .replace(",", "\N{COMBINING GREEK YPOGEGRAMMENI}") \
        .replace("^", "\N{COMBINING GREEK PERISPOMENI}")
    if greek.endswith("σ"):
        greek = greek[:-1] + "ς"  # word-final sigma
    # these appear before the letter, so need to move after
    greek = re.sub("'(.)", "\\1\N{COMBINING COMMA ABOVE}", greek)
    greek = re.sub("\"(.)", "\\1\N{COMBINING REVERSED COMMA ABOVE}", greek)
    return unicodedata.normalize("NFC", greek)
+
def test_transliterate_greek():
    """Self-test of detransliterate_greek against known romanized/Greek pairs."""
    expected = ""
    actual = ""
    for roman, greek in [("'archai:`zein", "ἀρχαΐζειν"),  # BUG in the example the accent on the ι is wrong
                         ("zw^,on", "ζῷον"),
                         ("o'i^nos", "οἶνος"),
                         ("\"ydra`rgyros", "ὑδράργυρος"),
                         ("lyth`rios", "λυτήριος"),
                         ("poihth`s", "ποιητής")]:
        expected += f"{roman}\n{greek} - {greek.encode('unicode_escape')}\n"
        # fixed: the continuation string was missing its f prefix, so actual
        # used to contain the literal "{detransliterate_greek(roman)...}" text
        actual += f"{roman}\n{detransliterate_greek(roman)} - " \
                  f"{detransliterate_greek(roman).encode('unicode_escape')}\n"
    assert actual == expected
+
# translation tables for str.translate: ordinals of '0'-'9' to the
# superscript / subscript digit characters
_SUPERSCRIPT_DIGITS = str.maketrans("0123456789", "⁰¹²³⁴⁵⁶⁷⁸⁹")
_SUBSCRIPT_DIGITS = str.maketrans("0123456789", "₀₁₂₃₄₅₆₇₈₉")

def mk_fraction(fraction):
    """Render a <fracNxM/ (or <fracNM/) regex match as "ᴺ⁄ₘ".

    fraction is an re.Match with group(1) = numerator digits and
    group(2) = denominator digits.  Fixed: str.translate needs a table keyed
    by code points (as produced by str.maketrans); the previous dicts with
    string keys were silently ignored and the digits came through unchanged.
    """
    num = fraction.group(1).translate(_SUPERSCRIPT_DIGITS)
    den = fraction.group(2).translate(_SUBSCRIPT_DIGITS)
    return f"{num}⁄{den}"  # U+2044 FRACTION SLASH
+
class Converter:
    """Accumulates GCIDE entries file by file and writes them out.

    entries maps a tuple of upper-cased headwords (including alternative
    spellings) to the list of formatted HTML definitions for those headwords.
    """

    def __init__(self):
        self.entries = {}
        self.current_hw = []   # headwords of the entry being accumulated
        self.current_def = ""  # raw definition text of that entry

    def start_new_entry(self):
        """Format the accumulated definition and file it under its headwords."""
        current_hw = tuple(hw.upper() for hw in self.current_hw)
        if current_hw not in self.entries:
            self.entries[current_hw] = []
        try:
            defn = format_definition(self.current_def)
        except Exception:
            # leave a trace of which entry broke the parser, then re-raise
            print(f"Trying to parse {current_hw}: {self.current_def}")
            raise
        self.entries[current_hw].append(defn)

        self.current_def = ""
        self.current_hw = []

    def convert_file(self, fname):
        """Read one CIDE.? file (latin-1), decode entities, collect entries."""
        rawtext = open(fname, "r", encoding="latin-1").read()
        # <-- ... --> spans are comments in the GCIDE markup
        rawtext = re.sub(r"<--.+?-->", "", rawtext, flags=re.DOTALL)
        for entity, char in entity_map.items():
            rawtext = rawtext.replace(entity, char)

        for line in rawtext.splitlines():
            match = re_headword.search(line)
            if match:
                if self.current_def:
                    self.start_new_entry()

                hw = match.group(1)
                if skipped_hw.match(hw):
                    continue
                elif hw == "Tracer/y":  # BUG for these words
                    hw = "Tracery"
                elif hw == "Adder fly/":
                    hw = "Adder fly"
                elif hw == "Gairish/ness":
                    hw = "Gairishness"
                elif hw == "P<sub>i</sub>":
                    hw = "Pi"

                assert re.match("^[^<>/]+$", hw), hw
                self.current_hw.append(hw)
            else:
                if self.current_hw:
                    for altsp in re_altspelling.findall(line):
                        self.current_hw.append(altsp)
                    self.current_def += line

        # fixed: flush the file's last entry; it used to be flushed only when
        # the NEXT file's first headword arrived, so the very last entry of
        # the last file was silently dropped
        if self.current_def:
            self.start_new_entry()

        print(f"after {fname}, {len(self.entries)} definitions")

    def write(self, fname):
        """Write all entries as a single HTML document."""
        with open(fname, "w", encoding="utf-8") as f:
            for hws, entry in self.entries.items():
                hws = ", ".join(hws)
                f.write(f"\n<h1>{hws}</h1>")
                if len(entry) > 1:
                    # homographs get numbered sub-headings
                    for i, subentry in enumerate(entry):
                        f.write(f"<h2>{i+1}. {hws}</h2>")
                        f.write(subentry)
                else:
                    f.write(entry[0])

    def write_to_tabseparated(self, fname):
        """Write all entries as one "headwords<TAB>definition" line each."""
        with open(fname, "w", encoding="utf-8") as f:
            for hws, entry in self.entries.items():
                hws = ", ".join(hws)
                f.write(f"{hws}\t")
                if len(entry) > 1:
                    for i, subentry in enumerate(entry):
                        f.write(f"<h2>{i+1}. {hws}</h2>")
                        assert "\n" not in subentry
                        f.write(subentry)
                else:
                    assert "\n" not in entry[0]
                    f.write(entry[0])
                f.write("\n")
+
def main():
    """Convert every ../gcide/CIDE.? file and write the tab-separated output."""
    c = Converter()
    for fname in glob("../gcide/CIDE.?"):
        print(f"Running over {fname}")
        c.convert_file(fname)
        # letter = fname.rsplit(".")[-1]
        # c.write(f"GCIDE-{letter}.html")

    # write once, after all files are processed (previously this ran inside
    # the loop, rewriting the whole output file on every iteration)
    c.write_to_tabseparated("GCIDE.tab_separated")

if __name__ == "__main__":
    main()