From: Frédéric Perrin <fred@fperrin.net>
Date: Sat, 13 Feb 2021 09:58:14 +0000 (+0000)
Subject: GCIDE conversion
X-Git-Url: http://gitweb.fperrin.net/?p=gen-quickdic.git;a=commitdiff_plain;h=c1db997ca48690b7e1405e67f887be60567c6b3e;ds=sidebyside

GCIDE conversion
---

diff --git a/GCIDE_to_tab_separated b/GCIDE_to_tab_separated
index 99a4973..9c861f3 100755
--- a/GCIDE_to_tab_separated
+++ b/GCIDE_to_tab_separated
@@ -2,17 +2,21 @@
 
 import lxml.etree
 import re
+from glob import glob
+import unicodedata
 
 entity_map = {
-    "<br/":    "<br/>",
+    "<br/":    "<br>",
     "&":       "&amp;",
     "<lt/":    "&lt;",
     "<gt/":    "&gt;",
     "--":      "â", # long (em) dash
+
     "<Cced/":  "Ã", # C cedilla
     "<uum/":   "Ã¼", # u umlaut (diaeresis)
     "<eacute/":"Ã©", # e acute
     "<acir/":  "Ã¢", # a circumflex
+    "<acirc/": "Ã¢", # a circumflex
     "<aum/":   "Ã¤", # a umlaut (diaeresis)
     "<agrave/":"Ã ", # a grave
     "<aring/": "Ã¥", # a ring above
@@ -26,8 +30,7 @@ entity_map = {
     "<igrave/":"Ã¬", # i grave
     "<Aum/":   "Ã", # A umlaut
     "<Eacute/":"Ã", # E acute
-    "<ae/":    "Ã¦", # ligature ae
-    "<AE/":    "Ã", # ligature AE
+    "<ncir/":  "nÌ", # n circumflex
     "<ocir/":  "Ã´", # o circumflex
     "<oum/":   "Ã¶", # o umlaut (diaeresis)
     "<ograve/":"Ã²", # o grave
@@ -43,21 +46,30 @@ entity_map = {
     "<uacute/":"Ãº", # u acute
     "<ntil/":  "Ã±", # n tilde
     "<Ntil/":  "Ã", # N tilde
+    "<mtil/":  "mÌ", # used in one location, looks more like m breve than m tilde BUG
+    "<ltil/":  "É«", # l tilde
+    "<sec/":   "Ë", # seconds (of degree or time). Also, inches or double prime.
     "<frac23/":"â", # two-thirds
     "<frac13/":"â", # one-third
-    "<sec/":   "Ë", # seconds (of degree or time). Also, inches or double prime.
     "<frac12/":"Â½", # one-half
     "<frac14/":"Â¼", # one-quarter
+    "<frac34/":"Â¾", # three-quarters
+    "<frac16/":  "â", # one sixth
     "<hand/":  "â", # pointing hand (printer's "fist")
     "<bprime/":"Ë", # bold accent (used in pronunciations)
     "<prime/": "Â´", # light accent (used in pronunciations) also minutes (of
                     # arc or time)
+    "<min/":   "Â´", # well some minutes of arc have a different entity
     "<rdquo/": "â", # close double quote
+    "</q>":    "â", # close double quote
     "<sect/":  "Â§", # section mark
+    "<para/":  "Â¶", # paragraph mark
     "<ldquo/": "â", # open double quotes
+    "<q>":     "â", # open double quotes
     "<amac/":  "Ä", # a macron
     "<lsquo/": "â", # left single quote
-    "<nsm/":   "á¹", # "n sub-macron"
+    "<rsquo/": "â", # right single quote
+    "<nsm/":   "á¹", # n sub-macron
     "<sharp/": "â¯", # musical sharp
     "<flat/":  "â­", # musical flat
     "<imac/":  "Ä«", # i macron
@@ -65,16 +77,15 @@ entity_map = {
     "<dsdot/": "á¸", # Sanskrit/Tamil d dot 
     "<nsdot/": "á¹", # Sanskrit/Tamil n dot
     "<tsdot/": "á¹­", # Sanskrit/Tamil t dot
+    "<tsdo/":  "á¹­", # Sanskrit/Tamil t dot
     "<ecr/":   "Ä", # e breve
     "<icr/":   "Ä­", # i breve
     "<ocr/":   "Å", # o breve
-    "<OE/":    "Å", # OE ligature
-    "<oe/":    "Å", # oe ligature
     "<omac/":  "Å", # o macron
     "<umac/":  "Å«", # u macron
     "<ocar/":  "Ç", # o hacek
     "<aemac/": "Ç£", # ae ligature macron
-    "<oemac/": "Å", # oe ligature macron
+    "<oemac/": "ÅÄ", # oe ligature macron BUG
     "<ucr/":   "Å­", # u breve
     "<acr/":   "Ä", # a breve
     "<cre/":   "Ë", # crescent (like a breve, but vertically centered --
@@ -91,9 +102,39 @@ entity_map = {
     "<deg/":   "Â°", # degree sign
     "<middot/":"â¢", # bold middle dot
     "<root/":  "â", # root sign
+    "<cuberoot/": "â", # cubic root sign
     "<adot/":  "È§", # a with dot above
+    "<mdot/":  "á¹", # m with dot above
+    "<breve/": "Ë", # breve
+    "<dagger/": "â ", # dagger
+    "<ounceap/": "â¥", # ounce
+    "<asterism/": "â", # asterism
+    "<times/": "Ã", # multiplication
+    "<8star/": "â¹", # sextile, badly named BUG
+    "<upslur/": "â", # musical slur, approx. with the IPA tie
+    "<downslur/": "â¿", # musical slur, approx. with the IPA undertiw
+    "<natural/": "â®", # natural key
+    "<schwa/": "É", # schwa
+    "<astascending/": "â", # ascending node, Dragon's head
+    "<astdescending": "â", # descending node, Dragon's tail
+    "<integral2l/": "â«", # integration symbol
+    "<iques/": "Â¿", # inverted question mark as in Spanish
+    "<pause/": "ð", # pause aka. corona
+    "<nabla/": "â", # nabla operator
+    "<dele/": "â°", # dele proofreading mark; closest Unicode is the pfennig symbol which has a similar origin
+    "<umlaut/": "Â¨", # diaeresis
+    "<rarr/":  "â", # right arrow
+    
+    "<ae/":    "Ã¦", # ligature ae
+    "<AE/":    "Ã", # ligature AE
+    "<OE/":    "Å", # OE ligature
+    "<oe/":    "Å", # oe ligature
+    "<filig/": "ï¬", # fi ligature
+    "<fllig/": "ï¬", # fl ligature
+    "<fflig/": "ï¬", # ff ligature
+    "<ffllig/": "ï¬", # ffl ligature
 
-    "<?/":     "?", #(?) Place-holder for unknown or illegible character.
+    "<?/":     "(???)", # Place-holder for unknown or illegible character.
 
     # used only in prononciation key; not able to find what "short vertical
     # bar on top" looks like with unicode chars.
@@ -105,11 +146,13 @@ entity_map = {
     "<osl/":   "o", # o "semilong"
     "<usl/":   "u", # u "semilong"
     "<th/":    "th",# th ligature
+
     "<ait/":   "ð", # a italic
     "<eit/":   "ð",
     "<iit/":   "ð",
     "<oit/":   "ð",
     "<uit/":   "ð¢",
+
     "<add/":   "a", # a with two dot below
     "<edd/":   "e",
     "<idd/":   "i",
@@ -120,6 +163,15 @@ entity_map = {
     "<etil/":  "áº½",
     "<ycr/":   "Ñ",
 
+    # not perfect but good enough; only used in the definition of repetend
+    "<2dot/": "<span style='text-decoration:overline'>2</span>",
+    "<3dot/": "<span style='text-decoration:overline'>3</span>",
+    # only used as an example in the entry for progression
+    "<lbrace2/<matrix2x5><row>2, 4, 6, 8, 10</row><row>10, 8, 6, 4, 2</row></matrix2x5><rbrace2/":
+    "{2, 4, 6, 8, 10; 10, 8, 6, 4, 2}",
+    "<lbrace2/<matrix2x5><row>2, 4, 8, 16, 32, 64</row><row>64, 32, 16, 8, 4, 2</row></matrix2x5><rbrace2/":
+    "{2, 4, 8, 16, 32; 64, 32 ,16, 8, 4, 2}",
+
     # greek letters
     "<alpha/": "Î±",         "<ALPHA/": "Î",
     "<beta/": "Î²",          "<BETA/": "Î",
@@ -139,35 +191,291 @@ entity_map = {
     "<pi/": "Ï",            "<PI/": "Î ",
     "<rho/": "Ï",           "<RHO/": "Î¡",
     "<sigma/": "Ï",         "<SIGMA/": "Î£",
+    "<sigmat/": "Ï",
+    "<digamma/": "Ï",
     "<tau/": "Ï",           "<TAU/": "Î¤",
     "<upsilon/": "Ï",       "<UPSILON/": "Î¥",
     "<phi/": "Ï",           "<PHI/": "Î¦",
     "<chi/": "Ï",           "<CHI/": "Î§",
     "<psi/": "Ï",           "<PSI/": "Î¨",
     "<omega/": "Ï",         "<OMEGA/": "Î©",
+    "<asper/": "Ê½",
 
-    # then there are some characters that are shown as escape sequences
+    # then there are some characters that are shown as escape sequences BUG
     r"\'94":   "Ã¶",
-    r"\'d8":   "â",
+    r"\'d8":   "",         # â in the dictonary, no point in keeping
     r"/'bd":   "â",        # one instance where / is used instead of \
     r" 'bd":   "â",        # two instances where \ is misssing
     r"`'b8":   "â",        # one instance where ` is used instead of \
 
-    # entities that appear in the etymology of Arabic words, but no explanation
-    # of what they stand for. Not displayed at all by GNU dico.
-    "<hsdot/": "",
-    "<zsdot/": "",
+    "<hsdot/": "á¸¥",
+    "<zsdot/": "áº",
+    "<msdot/": "á¹",
+    "<zdot/": "Å¼",
+    "<uring/": "Å¯",
+    "<usdot/": "á»¥",
+    "<lsdot/": "á¸·",
+    "<cacute/": "Ä",
+    "<ccar/": "Ä",
+    "<csdot/": "cÌ£",
+
+    "<sb/": "",                 # ??? BUG
+    "<colbreak/": "",
 }
 
-def replace_fake_comments(match):
-    nblines = match.group(0).count("\n")
-    return "\n" * nblines
-
-def convert_file(fname):
-    rawtext = open(fname, "r").read()
-    rawtext = re.sub(r"<--.+?-->", replace_fake_comments, rawtext, flags=re.DOTALL)
-    rawtext = f"<dict>" + rawtext + f"</dict>"
-    for entity, char in entity_map.items():
-        rawtext = rawtext.replace(entity, char)
-    print(rawtext.splitlines()[5724:5730])
-    e = lxml.etree.XML(rawtext)
+re_unknownentity = re.compile(r"<[a-z0-9]+/")  # "<name/" entity; format_definition asserts none is left
+re_sources = re.compile(r"\[<source>(.*?)</source>\]")  # bracketed <source> attribution
+re_pos = re.compile(r"<pos>(.*?)</pos>")  # <pos> element, may embed <or/ and <and/
+re_headword = re.compile(r"<ent>(.*)</ent>")  # <ent> line: starts an entry in convert_file
+re_subsense = re.compile(r"<sn>[0-9.]+</sn>")  # <sn> sense number, turned into <li>
+re_altspelling = re.compile(r"<asp>(.+?)</asp>")  # <asp> content is added as an extra headword
+re_greekwords = re.compile(r"<grk>(.+?)</grk>")  # <grk> content goes through detransliterate_greek
+skipped_hw = re.compile(r"^[0-9/]+(st|nd|rd|th)?$")  # headwords like "1st" or "1/2" are skipped
+re_simplefraction = re.compile(r"<frac([0-9])([0-9])/")  # one digit each: numerator, denominator
+re_fraction = re.compile(r"<frac([0-9]+)x([0-9]+)/")  # numerator "x" denominator, any length
+re_etymology = re.compile(r"<ety>(.*?)</ety>")  # <ety> element, moved to the end of the entry
+
+# delete the tags and their content
+delete_tags = [re.compile(r"<pr>.*?</pr>,?\.?"),
+               re.compile(r"<hw>.*?</hw>,?\.?"),
+               re.compile(r"<mhw>.*?</mhw>,?"),
+               re.compile(r"<song>.*?</song>"),
+               re.compile(r"<song>.*?</p>"), # blah BUG song not closed
+               re.compile(r"<table.*?</table>"),
+               re.compile(r"<mtable>.*?</mtable>"),
+               re.compile(r'<pr>\(hÅ"m<stil/r\).'), # BUG pr not closed
+              ]
+# replace the markup, keeping the contents intact
+replace_markup = {  # "space-separated source tags": HTML tag to use instead ("" drops the tag)
+    "plu plw def rj col cs cd altsp sansserif": "",
+    "ex xex qex er it ets etsep asp spn kingdom phylum subphylum class subclass ord subord fam subfam gen var varn": "i",
+    "bold ct colf stypec": "b",
+    "subs": "sub",
+    "sups": "sup",
+}
+            
+def format_definition(text):
+    for del_tag in delete_tags:
+        text = del_tag.sub("", text)
+
+    # <and/ and <or/ should be in roman font, so need to check where they
+    # occur for replacement
+    pos_part_match = re_pos.search(text)
+    if pos_part_match:
+        pos_part = pos_part_match.group(1)
+        pos_part = pos_part.replace("<or/", "</i> or <i>") \
+                           .replace("<and/", "</i> and <i>")
+        text = text.replace(pos_part_match.group(0), f"<i>{pos_part}</i>")
+    text = text.replace("<or/", " or ").replace("<and/", " and ")
+
+    # fractions
+    text = re_simplefraction.sub(mk_fraction, text)
+    text = re_fraction.sub(mk_fraction, text)
+
+    # translate functional tags into formatting
+    for funtag, formattag in replace_markup.items():
+        for tag in funtag.split():
+            if formattag:
+                text = text.replace(f"<{tag}>", f"<{formattag}>") \
+                           .replace(f"</{tag}>", f"</{formattag}>")
+            else:
+                text = text.replace(f"<{tag}>", "") \
+                           .replace(f"</{tag}>", "")
+
+    text = text \
+        .replace("<note>â", "â").replace("<note> â", "") \
+        .replace("<note>", "â ").replace("</note>", "") \
+        .replace("<pos>", "(<i>").replace("</pos>", "</i>)") \
+        .replace("<fld>(", "(<i>").replace(")</fld>", "</i>)") \
+        .replace("<fld>", "<i>").replace("</fld>", "</i>") \
+        .replace("<au>", '<span style="font-variant:small-caps">') \
+        .replace("</au>", "</span>") \
+        .replace("<qau>", '<span style="font-variant:small-caps">') \
+        .replace("</qau>", "</span>") \
+        .replace("<sc>", '<span style="font-variant:small-caps">') \
+        .replace("</sc>", "</span>") \
+        .replace("<mark>[R.]</mark>", "[<i>Rare</i>]") \
+        .replace("<mark>[Obs.]</mark>", "[<i>Obsolete</i>]") \
+        .replace("<mark>[", "[<i>").replace("]</mark>", "</i>]") \
+
+    wrapol = False
+    for subsensemark in re_subsense.findall(text):
+        text = text.replace(subsensemark, "<li>")
+        wrapol = True
+    if wrapol:
+        text = text.replace("<li>", "<ol><li>", 1)
+        text = f"{text}</ol>"
+
+    # embedded greek
+    text = re_greekwords.sub(detransliterate_greek, text)
+
+    # move etymology at the end
+    match_ety = re_etymology.search(text)
+    if match_ety:
+        text = text.replace(match_ety.group(0), "")
+        text += f"<p><b>Etymology:</b> {match_ety.group(1)}</p>"
+
+    # simplify sources : if all identical, just have the one at the end;
+    # otherwise format it a bit better
+    sources = list(re_sources.finditer(text))
+    #assert sources, f"Looking for sources in {text}"
+    sourcenames = [source.group(1) for source in sources]
+    if len(set(sourcenames)) == 1 or \
+       (sourcenames and all("Webster" in s for s in sourcenames)):
+        text = re_sources.sub("", text)
+        text +=  f"<p>Source: {sources[0].group(1)}</p>"
+    else:
+        for source in set(sources):
+            text = text.replace(source.group(0), f"<p>Source: {source.group(1)}</p>")
+
+    unknwn_ent = re_unknownentity.search(text)
+    assert not unknwn_ent, f"Unknown entity {unknwn_ent} in {text}"
+
+    # TODO: add check for unknown tags
+
+    return text
+
+def detransliterate_greek(roman):
+    """Convert GCIDE's ASCII transliteration of Greek to NFC Greek text.
+
+    roman is the transliterated string, or a regex match holding it in
+    group 1 (so that the function can serve as an re.sub() callback).
+    """
+    if isinstance(roman, re.Match):
+        roman = roman.group(1)
+    # the digraphs go first: "ch" before h becomes eta, "ps" before p and s
+    # become pi and sigma (psi was never produced when it came last)
+    greek = roman.replace("ch", "χ").replace("ps", "ψ")
+    greek = greek.translate(str.maketrans(
+        "abgdezhqiklmnxoprstyufw~`:,^",
+        "αβγδεζηθικλμνξοπρστυυφω"
+        "\N{COMBINING GRAVE ACCENT}\N{COMBINING ACUTE ACCENT}"
+        "\N{COMBINING DIAERESIS}\N{COMBINING GREEK YPOGEGRAMMENI}"
+        "\N{COMBINING GREEK PERISPOMENI}"))
+    # a sigma that ends the text takes its final form
+    if greek.endswith("σ"):
+        greek = greek[:-1] + "ς"
+    # these marks are written before their letter in the transliteration,
+    # but a combining character has to follow its base: move them after
+    greek = re.sub("'(.)", "\\1\N{COMBINING COMMA ABOVE}", greek)
+    greek = re.sub("\"(.)", "\\1\N{COMBINING REVERSED COMMA ABOVE}", greek)
+    return unicodedata.normalize("NFC", greek)
+
+def test_transliterate_greek():
+    """Check detransliterate_greek() against known transliterated words."""
+    cases = [("'archai:`zein", "ἀρχαΐζειν"),  # BUG in the example the accent on the ι is wrong
+             ("zw^,on", "ζῷον"),
+             ("o'i^nos", "οἶνος"),
+             ("\"ydra`rgyros", "ὑδράργυρος"),
+             ("lyth`rios", "λυτήριος"),
+             ("poihth`s", "ποιητής")]
+    for roman, greek in cases:
+        expected = unicodedata.normalize("NFC", greek)
+        actual = detransliterate_greek(roman)
+        # report the escaped forms too: look-alike code points cannot be told apart by eye
+        assert actual == expected, f"{roman}: {actual!a} != {expected!a}"
+
+def mk_fraction(fraction):
+    """Render a (numerator, denominator) regex match as super/subscripts."""
+    # str.translate() needs a table keyed by code point, which is what
+    # str.maketrans() builds; a dict keyed by "1", "2"... replaces nothing
+    sup = str.maketrans("0123456789", "\u2070\u00b9\u00b2\u00b3\u2074"
+                                      "\u2075\u2076\u2077\u2078\u2079")
+    sub = str.maketrans("0123456789", "\u2080\u2081\u2082\u2083\u2084"
+                                      "\u2085\u2086\u2087\u2088\u2089")
+    num, den = fraction.group(1).translate(sup), fraction.group(2).translate(sub)
+    return f"{num}\N{FRACTION SLASH}{den}"
+
+class Converter:
+    """Accumulate GCIDE entries, keyed by their upper-cased headwords."""
+
+    def __init__(self):
+        self.entries = {}      # tuple of headwords -> list of definitions
+        self.current_hw = []   # headwords of the entry being read
+        self.current_def = ""  # raw text of the entry being read
+
+    def start_new_entry(self):
+        """Format the pending definition, file it, and reset the state."""
+        current_hw = tuple(hw.upper() for hw in self.current_hw)
+        try:
+            defn = format_definition(self.current_def)
+        except Exception:
+            print(f"Trying to parse {current_hw}: {self.current_def}")
+            raise
+        self.entries.setdefault(current_hw, []).append(defn)
+        self.current_def = ""
+        self.current_hw = []
+
+    def convert_file(self, fname):
+        """Read one GCIDE file and add its entries to self.entries.
+
+        A line matching re_headword opens an entry (or adds a headword to
+        the one being opened); other lines extend the current definition.
+        """
+        with open(fname, "r", encoding="latin-1") as f:
+            rawtext = f.read()
+        rawtext = re.sub(r"<--.+?-->", "", rawtext, flags=re.DOTALL)
+        for entity, char in entity_map.items():
+            rawtext = rawtext.replace(entity, char)
+
+        # headwords whose markup is broken in the dictionary source
+        fixed_hw = {"Tracer/y": "Tracery", "Adder fly/": "Adder fly",
+                    "Gairish/ness": "Gairishness", "P<sub>i</sub>": "Pi"}
+        for line in rawtext.splitlines():
+            match = re_headword.search(line)
+            if not match:
+                if self.current_hw:
+                    self.current_hw.extend(re_altspelling.findall(line))
+                    self.current_def += line
+                continue
+            if self.current_def:
+                self.start_new_entry()
+            hw = match.group(1)
+            if skipped_hw.match(hw):
+                continue
+            hw = fixed_hw.get(hw, hw)
+            assert re.match("^[^<>/]+$", hw), hw
+            self.current_hw.append(hw)
+
+        # no headword follows the last entry of a file to flush it: do it here
+        if self.current_def:
+            self.start_new_entry()
+        self.current_hw = []
+        print(f"after{fname}, {len(self.entries)} definitions")
+
+    @staticmethod
+    def _body(hws, entry):
+        """Return an entry's HTML; homographs get a numbered <h2> each."""
+        if len(entry) == 1:
+            return entry[0]
+        return "".join(f"<h2>{i}. {hws}</h2>{sub}"
+                       for i, sub in enumerate(entry, 1))
+
+    def write(self, fname):
+        """Write every entry to fname as HTML under an <h1> heading."""
+        with open(fname, "w", encoding="utf-8") as f:
+            for hws, entry in self.entries.items():
+                hws = ", ".join(hws)
+                f.write(f"\n<h1>{hws}</h1>{self._body(hws, entry)}")
+
+    def write_to_tabseparated(self, fname):
+        """Write one "headwords<TAB>definition" line per entry to fname."""
+        with open(fname, "w", encoding="utf-8") as f:
+            for hws, entry in self.entries.items():
+                hws = ", ".join(hws)
+                body = self._body(hws, entry)
+                assert "\n" not in body
+                f.write(f"{hws}\t{body}\n")
+
+def main():
+    """Convert every ../gcide/CIDE.? file into GCIDE.tab_separated."""
+    c = Converter()
+    # glob() returns files in arbitrary order; sort them so that the output
+    # (and the numbering of repeated headwords) is the same on every run
+    for fname in sorted(glob("../gcide/CIDE.?")):
+        print(f"Running over {fname}")
+        c.convert_file(fname)
+    c.write_to_tabseparated("GCIDE.tab_separated")
+
+if __name__ == "__main__":
+    main()
diff --git a/Makefile b/Makefile
index d3ebd70..db67dc5 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-all: OPTED.v006.quickdic XMLittre.v006.quickdic
+all: OPTED.v006.quickdic XMLittre.v006.quickdic GCIDE.v006.quickdic
 
 %.tab_separated: %_to_tab_separated
 	./$*_to_tab_separated
@@ -17,6 +17,8 @@ clean:
 	rm -f *.quickdic *.quickdic.txt
 	rm -fr OPTED/
 
+GCIDE.v007.quickdic: dictlang := EN
+IT.v007.quickdic: dictlang := IT
 OPTED.v007.quickdic: dictlang := EN
 XMLittre.v007.quickdic: dictlang := FR