GCIDE_to_tab_separated

   1 #!/usr/bin/python3
   2
   3 import lxml.etree
   4 import re
   5
# Replacement table applied to the raw dictionary text by convert_file().
# Keys are the literal tokens found in the input (mostly entity-like tags
# written without a closing ">", e.g. "<eacute/"); values are the text that
# replaces them.
#
# Order matters: convert_file() applies the replacements one after another in
# insertion order with str.replace(), so an entry can affect the output of the
# entries that precede it.
entity_map = {
    "<br/":    "<br/>",     # complete the tag so it is a well-formed empty element
    "&":       "&amp;",     # must come before the entries below that emit "&...;"
    "<lt/":    "&lt;",
    "<gt/":    "&gt;",
    "--":      "—", # long (em) dash
    "<Cced/":  "Ç", # C cedilla
    "<uum/":   "ü", # u umlaut (diaeresis)
    "<eacute/":"é", # e acute
    "<acir/":  "â", # a circumflex
    "<aum/":   "ä", # a umlaut (diaeresis)
    "<agrave/":"à", # a grave
    "<aring/": "å", # a ring above
    "<cced/":  "ç", # c cedilla
    "<ecir/":  "ê", # e circumflex
    "<eum/":   "ë", # e umlaut (diaeresis)
    "<egrave/":"è", # e grave
    "<ium/":   "ï", # i umlaut (diaeresis)
    "<icir/":  "î", # i circumflex
    "<icirc/": "î", # i circumflex
    "<igrave/":"ì", # i grave
    "<Aum/":   "Ä", # A umlaut
    "<Eacute/":"É", # E acute
    "<ae/":    "æ", # ligature ae
    "<AE/":    "Æ", # ligature AE
    "<ocir/":  "ô", # o circumflex
    "<oum/":   "ö", # o umlaut (diaeresis)
    "<ograve/":"ò", # o grave
    "<ucir/":  "û", # u circumflex
    "<ugrave/":"ù", # u grave
    "<yum/":   "ÿ", # y umlaut
    "<Oum/":   "Ö", # O umlaut
    "<Uum/":   "Ü", # U umlaut (diaeresis)
    "<pound/": "£", # pound sign (British)
    "<aacute/":"á", # a acute
    "<iacute/":"í", # i acute
    "<oacute/":"ó", # o acute
    "<uacute/":"ú", # u acute
    "<ntil/":  "ñ", # n tilde
    "<Ntil/":  "Ñ", # N tilde
    "<frac23/":"⅔", # two-thirds
    "<frac13/":"⅓", # one-third
    "<sec/":   "˝", # seconds (of degree or time). Also, inches or double prime.
    "<frac12/":"½", # one-half
    "<frac14/":"¼", # one-quarter
    "<hand/":  "☞", # pointing hand (printer's "fist")
    "<bprime/":"˝", # bold accent (used in pronunciations)
    "<prime/": "´", # light accent (used in pronunciations) also minutes (of
                    # arc or time)
    "<rdquo/": "”", # close double quote
    "<sect/":  "§", # section mark
    "<ldquo/": "“", # open double quotes
    "<amac/":  "ā", # a macron
    "<lsquo/": "‘", # left single quote
    "<nsm/":   "ṉ", # "n sub-macron"
    "<sharp/": "♯", # musical sharp
    "<flat/":  "♭", # musical flat
    "<imac/":  "ī", # i macron
    "<emac/":  "ē", # e macron
    "<dsdot/": "ḍ", # Sanskrit/Tamil d dot
    "<nsdot/": "ṇ", # Sanskrit/Tamil n dot
    "<tsdot/": "ṭ", # Sanskrit/Tamil t dot
    "<ecr/":   "ĕ", # e breve
    "<icr/":   "ĭ", # i breve
    "<ocr/":   "ŏ", # o breve
    "<OE/":    "Œ", # OE ligature
    "<oe/":    "œ", # oe ligature
    "<omac/":  "ō", # o macron
    "<umac/":  "ū", # u macron
    "<ocar/":  "ǒ", # o hacek
    "<aemac/": "ǣ", # ae ligature macron
    "<oemac/": "ō", # oe ligature macron
                    # NOTE(review): the value looks identical to the plain
                    # "o macron" used for "<omac/" -- confirm this
                    # approximation is intended.
    "<ucr/":   "ŭ", # u breve
    "<acr/":   "ă", # a breve
    "<cre/":   "˘", # crescent (like a breve, but vertically centered --
                    # represents the short accent in poetic meter)
    "<ymac/":  "ȳ", # y macron
    "<edh/":   "ð", # small eth
    "<thorn/": "þ", # small thorn
    "<atil/":  "ã", # a tilde
    "<ndot/":  "ṅ", # n with dot above
    "<rsdot/": "ṛ", # r with a dot below
    "<yogh/":  "ȝ", # small yogh
    "<mdash/": "—", # em dash
    "<divide/":"÷", # division sign
    "<deg/":   "°", # degree sign
    "<middot/":"•", # bold middle dot
    "<root/":  "√", # root sign
    "<adot/":  "ȧ", # a with dot above

    "<?/":     "?", #(?) Place-holder for unknown or illegible character.

    # Used only in the pronunciation key; no Unicode character was found for
    # the "short vertical bar on top" mark, so these fall back to the plain
    # base letter.
    "<asl/":   "a", #  a "semilong" (has a macron above with a short
                    # vertical bar on top the center of the macron)
                    # Used in pronunciations.
    "<esl/":   "e", # e "semilong"
    "<isl/":   "i", # i "semilong"
    "<osl/":   "o", # o "semilong"
    "<usl/":   "u", # u "semilong"
    "<th/":    "th",# th ligature
    "<ait/":   "𝑎", # a italic
    "<eit/":   "𝑒", # e italic
    "<iit/":   "𝑖", # i italic
    "<oit/":   "𝑜", # o italic
    "<uit/":   "𝑢", # u italic
    "<add/":   "a", # a with two dots below (mapped to the plain letter)
    "<edd/":   "e",
    "<idd/":   "i",
    "<odd/":   "o",
    "<udd/":   "u",
    "<oocr/":  "oo",
    "<oomac/":  "oo",
    "<etil/":  "ẽ",
    "<ycr/":   "ў", # NOTE(review): by analogy with "<ecr/" etc. this is
                    # presumably y breve, but the value appears to be the
                    # Cyrillic short u -- confirm.

    # Greek letters (lower case on the left, upper case on the right)
    "<alpha/": "α",         "<ALPHA/": "Α",
    "<beta/": "β",          "<BETA/": "Β",
    "<gamma/": "γ",         "<GAMMA/": "Γ",
    "<delta/": "δ",         "<DELTA/": "Δ",
    "<epsilon/": "ε",       "<EPSILON/": "Ε",
    "<zeta/": "ζ",          "<ZETA/": "Ζ",
    "<eta/": "η",           "<ETA/": "Η",
    "<theta/": "θ",         "<THETA/": "Θ",
    "<iota/": "ι",          "<IOTA/": "Ι",
    "<kappa/": "κ",         "<KAPPA/": "Κ",
    "<lambda/": "λ",        "<LAMBDA/": "Λ",
    "<mu/": "μ",            "<MU/": "Μ",
    "<nu/": "ν",            "<NU/": "Ν",
    "<xi/": "ξ",            "<XI/": "Ξ",
    "<omicron/": "ο",       "<OMICRON/": "Ο",
    "<pi/": "π",            "<PI/": "Π",
    "<rho/": "ρ",           "<RHO/": "Ρ",
    "<sigma/": "σ",         "<SIGMA/": "Σ",
    "<tau/": "τ",           "<TAU/": "Τ",
    "<upsilon/": "υ",       "<UPSILON/": "Υ",
    "<phi/": "φ",           "<PHI/": "Φ",
    "<chi/": "χ",           "<CHI/": "Χ",
    "<psi/": "ψ",           "<PSI/": "Ψ",
    "<omega/": "ω",         "<OMEGA/": "Ω",

    # Then there are some characters that are written as escape sequences.
    r"\'94":   "ö",
    r"\'d8":   "‖",
    r"/'bd":   "“",        # one instance where / is used instead of \
    r" 'bd":   "“",        # two instances where \ is missing
    r"`'b8":   "”",        # one instance where ` is used instead of \

    # Entities that appear in the etymology of Arabic words, with no
    # explanation of what they stand for. Not displayed at all by GNU dico,
    # so they are dropped here as well.
    "<hsdot/": "",
    "<zsdot/": "",
}
 161
 162 def replace_fake_comments(match):
 163     nblines = match.group(0).count("\n")
 164     return "\n" * nblines
 165
 166 def convert_file(fname):
 167     rawtext = open(fname, "r").read()
 168     rawtext = re.sub(r"<--.+?-->", replace_fake_comments, rawtext, flags=re.DOTALL)
 169     rawtext = f"<dict>" + rawtext + f"</dict>"
 170     for entity, char in entity_map.items():
 171         rawtext = rawtext.replace(entity, char)
 172     print(rawtext.splitlines()[5724:5730])
 173     e = lxml.etree.XML(rawtext)