snapshot of current effort

author Frédéric Perrin <fred@fperrin.net>

Sat, 6 Feb 2021 11:09:46 +0000 (11:09 +0000)

committer Frédéric Perrin <fred@fperrin.net>

Sat, 6 Feb 2021 11:09:46 +0000 (11:09 +0000)
author Frédéric Perrin <fred@fperrin.net>
Sat, 6 Feb 2021 11:09:46 +0000 (11:09 +0000)
committer Frédéric Perrin <fred@fperrin.net>
Sat, 6 Feb 2021 11:09:46 +0000 (11:09 +0000)
diff --git a/GCIDE_to_tab_separated b/GCIDE_to_tab_separated

new file mode 100755 (executable)

index 0000000..99a4973
--- /dev/null
+++ b/GCIDE_to_tab_separated
@@ -0,0 +1,173 @@
+#!/usr/bin/python3
+
+import lxml.etree
+import re
+
+# Table of literal substitutions that convert_file() applies to the raw text:
+# every occurrence of a key is replaced by its value, one entry after another,
+# in the order the entries are written here.
+#
+# The order matters: "&" is escaped to "&amp;" before the entries whose value
+# itself contains "&" ("<lt/", "<gt/") are applied, so the "&lt;" and "&gt;"
+# they produce are not escaped a second time.
+#
+# Most keys have the form "<name/", without a closing ">" -- presumably the
+# way the GCIDE source spells its character entities (TODO confirm).
+entity_map = {
+    "<br/":    "<br/>",
+    "&":       "&amp;",
+    "<lt/":    "&lt;",
+    "<gt/":    "&gt;",
+    "--":      "—", # long (em) dash
+    "<Cced/":  "Ç", # C cedilla
+    "<uum/":   "ü", # u umlaut (diaeresis)
+    "<eacute/":"é", # e acute
+    "<acir/":  "â", # a circumflex
+    "<aum/":   "ä", # a umlaut (diaeresis)
+    "<agrave/":"à", # a grave
+    "<aring/": "å", # a ring above
+    "<cced/":  "ç", # c cedilla
+    "<ecir/":  "ê", # e circumflex
+    "<eum/":   "ë", # e umlaut (diaeresis)
+    "<egrave/":"è", # e grave
+    "<ium/":   "ï", # i umlaut (diaeresis)
+    "<icir/":  "î", # i circumflex
+    "<icirc/": "î", # i circumflex
+    "<igrave/":"ì", # i grave
+    "<Aum/":   "Ä", # A umlaut
+    "<Eacute/":"É", # E acute
+    "<ae/":    "æ", # ligature ae
+    "<AE/":    "Æ", # ligature AE
+    "<ocir/":  "ô", # o circumflex
+    "<oum/":   "ö", # o umlaut (diaeresis)
+    "<ograve/":"ò", # o grave
+    "<ucir/":  "û", # u circumflex
+    "<ugrave/":"ù", # u grave
+    "<yum/":   "ÿ", # y umlaut
+    "<Oum/":   "Ö", # O umlaut
+    "<Uum/":   "Ü", # U umlaut (diaeresis)
+    "<pound/": "£", # pound sign (British)
+    "<aacute/":"á", # a acute
+    "<iacute/":"í", # i acute
+    "<oacute/":"ó", # o acute
+    "<uacute/":"ú", # u acute
+    "<ntil/":  "ñ", # n tilde
+    "<Ntil/":  "Ñ", # N tilde
+    "<frac23/":"⅔", # two-thirds
+    "<frac13/":"⅓", # one-third
+    # NOTE(review): <sec/ and <bprime/ (below) are mapped to the same
+    # character; a double prime may be what <sec/ calls for -- confirm.
+    "<sec/":   "˝", # seconds (of degree or time). Also, inches or double prime.
+    "<frac12/":"½", # one-half
+    "<frac14/":"¼", # one-quarter
+    "<hand/":  "☞", # pointing hand (printer's "fist")
+    "<bprime/":"˝", # bold accent (used in pronunciations)
+    "<prime/": "´", # light accent (used in pronunciations) also minutes (of
+                    # arc or time)
+    "<rdquo/": "”", # close double quote
+    "<sect/":  "§", # section mark
+    "<ldquo/": "“", # open double quotes
+    "<amac/":  "ā", # a macron
+    "<lsquo/": "‘", # left single quote
+    "<nsm/":   "ṉ", # "n sub-macron"
+    "<sharp/": "♯", # musical sharp
+    "<flat/":  "♭", # musical flat
+    "<imac/":  "ī", # i macron
+    "<emac/":  "ē", # e macron
+    "<dsdot/": "ḍ", # Sanskrit/Tamil d dot
+    "<nsdot/": "ṇ", # Sanskrit/Tamil n dot
+    "<tsdot/": "ṭ", # Sanskrit/Tamil t dot
+    "<ecr/":   "ĕ", # e breve
+    "<icr/":   "ĭ", # i breve
+    "<ocr/":   "ŏ", # o breve
+    "<OE/":    "Œ", # OE ligature
+    "<oe/":    "œ", # oe ligature
+    "<omac/":  "ō", # o macron
+    "<umac/":  "ū", # u macron
+    "<ocar/":  "ǒ", # o hacek
+    "<aemac/": "ǣ", # ae ligature macron
+    # NOTE(review): the value is a plain o macron, the same as <omac/, not an
+    # oe ligature carrying a macron -- confirm this fallback is intended.
+    "<oemac/": "ō", # oe ligature macron
+    "<ucr/":   "ŭ", # u breve
+    "<acr/":   "ă", # a breve
+    "<cre/":   "˘", # crescent (like a breve, but vertically centered --
+                    # represents the short accent in poetic meter)
+    "<ymac/":  "ȳ", # y macron
+    "<edh/":   "ð", # small eth
+    "<thorn/": "þ", # small thorn
+    "<atil/":  "ã", # a tilde
+    "<ndot/":  "ṅ", # n with dot above
+    "<rsdot/": "ṛ", # r with a dot below
+    "<yogh/":  "ȝ", # small yogh
+    "<mdash/": "—", # em dash
+    "<divide/":"÷", # division sign
+    "<deg/":   "°", # degree sign
+    "<middot/":"•", # bold middle dot
+    "<root/":  "√", # root sign
+    "<adot/":  "ȧ", # a with dot above
+
+    "<?/":     "?", #(?) Place-holder for unknown or illegible character.
+
+    # used only in the pronunciation key; not able to find what "short vertical
+    # bar on top" looks like with unicode chars, so the bare letter is used.
+    "<asl/":   "a", #  a "semilong" (has a macron above with a short
+                    # vertical bar on top the center of the macron)
+                    # Used in pronunciations.
+    "<esl/":   "e", # e "semilong"
+    "<isl/":   "i", # i "semilong"
+    "<osl/":   "o", # o "semilong"
+    "<usl/":   "u", # u "semilong"
+    "<th/":    "th",# th ligature
+    "<ait/":   "𝑎", # a italic
+    "<eit/":   "𝑒", # e italic
+    "<iit/":   "𝑖", # i italic
+    "<oit/":   "𝑜", # o italic
+    "<uit/":   "𝑢", # u italic
+    "<add/":   "a", # a with two dots below (dots dropped in the output)
+    "<edd/":   "e", # presumably e with two dots below, as for <add/
+    "<idd/":   "i", # presumably i with two dots below
+    "<odd/":   "o", # presumably o with two dots below
+    "<udd/":   "u", # presumably u with two dots below
+    "<oocr/":  "oo", # presumably oo with a breve; output as plain "oo"
+    "<oomac/":  "oo", # presumably oo with a macron; output as plain "oo"
+    "<etil/":  "ẽ", # e tilde
+    # NOTE(review): the value looks like the Cyrillic short u rather than a
+    # Latin y carrying a breve -- confirm which character is wanted.
+    "<ycr/":   "ў",
+
+    # greek letters
+    "<alpha/": "α",         "<ALPHA/": "Α",
+    "<beta/": "β",          "<BETA/": "Β",
+    "<gamma/": "γ",         "<GAMMA/": "Γ",
+    "<delta/": "δ",         "<DELTA/": "Δ",
+    "<epsilon/": "ε",       "<EPSILON/": "Ε",
+    "<zeta/": "ζ",          "<ZETA/": "Ζ",
+    "<eta/": "η",           "<ETA/": "Η",
+    "<theta/": "θ",         "<THETA/": "Θ",
+    "<iota/": "ι",          "<IOTA/": "Ι",
+    "<kappa/": "κ",         "<KAPPA/": "Κ",
+    "<lambda/": "λ",        "<LAMBDA/": "Λ",
+    "<mu/": "μ",            "<MU/": "Μ",
+    "<nu/": "ν",            "<NU/": "Ν",
+    "<xi/": "ξ",            "<XI/": "Ξ",
+    "<omicron/": "ο",       "<OMICRON/": "Ο",
+    "<pi/": "π",            "<PI/": "Π",
+    "<rho/": "ρ",           "<RHO/": "Ρ",
+    "<sigma/": "σ",         "<SIGMA/": "Σ",
+    "<tau/": "τ",           "<TAU/": "Τ",
+    "<upsilon/": "υ",       "<UPSILON/": "Υ",
+    "<phi/": "φ",           "<PHI/": "Φ",
+    "<chi/": "χ",           "<CHI/": "Χ",
+    "<psi/": "ψ",           "<PSI/": "Ψ",
+    "<omega/": "ω",         "<OMEGA/": "Ω",
+
+    # then there are some characters that are shown as escape sequences
+    r"\'94":   "ö",
+    r"\'d8":   "‖",
+    r"/'bd":   "“",        # one instance where / is used instead of \
+    r" 'bd":   "“",        # two instances where \ is missing
+    r"`'b8":   "”",        # one instance where ` is used instead of \
+
+    # entities that appear in the etymology of Arabic words, but no explanation
+    # of what they stand for. Not displayed at all by GNU dico, so they are
+    # replaced by nothing here as well.
+    "<hsdot/": "",
+    "<zsdot/": "",
+}
+
+def replace_fake_comments(match):
+    """Return one newline for every newline inside the matched text.
+
+    Meant to be used as the replacement callback of re.sub(): the matched
+    span disappears but the text keeps the same number of lines.
+    """
+    matched_text = match.group(0)
+    return "".join("\n" for ch in matched_text if ch == "\n")
+
+def convert_file(fname):
+    """Read one raw dictionary file, rewrite it and hand it to the XML parser.
+
+    Work in progress: the parsed tree is not used yet, so nothing is returned.
+
+    fname -- path of the file to read; it is opened in text mode with the
+    default encoding.
+    """
+    # The context manager closes the file handle as soon as it has been read.
+    with open(fname, "r") as source:
+        rawtext = source.read()
+    # "<-- ... -->" spans are replaced by the newlines they contain, so the
+    # number of lines in the text does not change.
+    rawtext = re.sub(r"<--.+?-->", replace_fake_comments, rawtext, flags=re.DOTALL)
+    # Wrap the whole text in a single root element for the XML parser.
+    rawtext = "<dict>" + rawtext + "</dict>"
+    # Entries are applied one after another, in the order entity_map lists them.
+    for entity, char in entity_map.items():
+        rawtext = rawtext.replace(entity, char)
+    # NOTE(review): debugging output with hard-coded line numbers, presumably
+    # left in while chasing a parse error; remove once it is no longer needed.
+    print(rawtext.splitlines()[5724:5730])
+    e = lxml.etree.XML(rawtext)
diff --git a/Makefile b/Makefile

index 3fc8c53ea0bc371f03eca0cb5c813bfd675474c3..d3ebd707e42b23854f3ab7ccb78c0990339a9c9f 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,41 +1,47 @@
-OPTED_SOURCEDIR = OPTED/v003
+all: OPTED.v006.quickdic XMLittre.v006.quickdic
  
-OPTED_FILES = $(addprefix $(OPTED_SOURCEDIR)/wb1913_,$(addsuffix .html,$(shell bash -c 'echo {a..z} new')))
+%.tab_separated: %_to_tab_separated
+       ./$*_to_tab_separated
  
-essai:
-       @echo $(FILES)
+# Build the v007 dictionary from the tab-separated file with the same stem.
+# dictlang has to be set (as a target-specific variable) for each dictionary;
+# the quoted test stops the build with a failure when it is empty.
+%.v007.quickdic: %.tab_separated
+       echo $(dictlang)
+       [ -n "$(dictlang)" ]
+       cd ../DictionaryPC && ./run.sh --dictInfo=@$(CURDIR)/$*-dictinfo.txt --input1Charset=UTF8 --input1Format=tab_separated --input1Name=$* --lang1=$(dictlang) --lang1Stoplist=$(CURDIR)/$*-stoplist.txt --input1=$(CURDIR)/$< --dictOut=$(CURDIR)/$@  --print=$(CURDIR)/$@.txt
  
-all: OPTED.v006-from-tab_separated.quickdic
+%.v006.quickdic: %.v007.quickdic
+       rm -f $@
+       cd ../DictionaryPC && ./convert_to_v6.sh $(CURDIR)/$< $(CURDIR)/$@
  
-# optedv003.hqx:
-#      wget 'http://www.mso.anu.edu.au/~ralph/OPTED/optedv003.hqx'
+clean:
+       rm -f *.tab_separated *.chemnitz
+       rm -f *.quickdic *.quickdic.txt
+       rm -fr OPTED/
  
-$(OPTED_FILES): optedv003.hqx
-       hexbin -d $<
-       unar OPTED.sit.data
-       find OPTED -type f | xargs sed -i 's/\r/\n/g'
-       cd OPTED && patch -p1 < ../OPTED.patch
+OPTED.v007.quickdic: dictlang := EN
+XMLittre.v007.quickdic: dictlang := FR
  
-OPTED.tab_separated: $(OPTED_FILES)
-       ./OPTED_to_tab_separated
+OPTED_SOURCEDIR = OPTED/v003
  
-OPTED.v007-from-tab_separated.quickdic: OPTED.tab_separated
-       cd ../DictionaryPC && ./run.sh --dictInfo="Webster's Unabridged Dictionary, from the OPTED project" --input1=$(CURDIR)/$< --input1Charset=UTF8 --input1Format=tab_separated --input1Name="Webster-OPTED" --lang1=EN --lang1Stoplist=data/inputs/stoplists/en.txt --dictOut=$(CURDIR)/$@  --print=$(CURDIR)/$@.txt
+OPTED_FILES = $(shell bash -c 'for l in {a..z} new; do echo $(OPTED_SOURCEDIR)/wb1913_$$l.html; done')
  
-OPTED.v006-from-tab_separated.quickdic: OPTED.v007-from-tab_separated.quickdic
-       rm -f $@
-       cd ../DictionaryPC && ./convert_to_v6.sh $(CURDIR)/$< $(CURDIR)/$@
+optedv003.hqx:
+       echo Manually run that command to fetch the OPTED raw data
+       echo wget 'http://www.mso.anu.edu.au/~ralph/OPTED/optedv003.hqx'
+       false
  
-OPTED.chemnitz: OPTED/v003/wb1913_a.html
-       ./OPTED_to_chemnitz
+$(OPTED_FILES): optedv003.hqx
+       hexbin -d optedv003.hqx
+       unar -f OPTED.sit.data
+       find OPTED -type f | xargs sed -i 's/\r/\n/g'
+       cd OPTED && patch -p1 < ../OPTED.patch
  
-OPTED.v007-from-chemnitz.quickdic: OPTED.chemnitz
-       cd ../DictionaryPC && ./run.sh --dictInfo="Webster's Unabridged Dictionary, from the OPTED project" --input1=$(CURDIR)/$< --input1Charset=UTF8 --input1Format=chemnitz --input1Name="Webster-OPTED" --lang1=EN --lang1Stoplist=data/inputs/stoplists/en.txt --dictOut=$(CURDIR)/$@  --print=$(CURDIR)/$@.txt
+OPTED.tab_separated: $(OPTED_FILES) OPTED.patch
  
-OPTED.v006-from-chemnitz.quickdic: OPTED.v007-from-chemnitz.quickdic
-       rm -f $@
-       cd ../DictionaryPC && ./convert_to_v6.sh $(CURDIR)/$< $(CURDIR)/$@
+XMLittre.tab_separated: XMLittre-entree.xslt
+XMLittre.tab_separated: ../xmlittre-data/
  
-clean:
-       rm -f OPTED.*.quickdic OPTED.tab_separated OPTED.chemnitz
-       rm -fr OPTED/
+# The data is expected in a checkout next to this directory, as the
+# "cd $(CURDIR)/.." hint printed below says.  The target has to be spelled
+# exactly like the ../xmlittre-data/ prerequisite of XMLittre.tab_separated,
+# otherwise make never reaches these instructions.
+../xmlittre-data/:
+       echo Manually run that command to fetch XMLittre data
+       echo cd $(CURDIR)/..
+       echo git clone 'https://bitbucket.org/Mytskine/xmlittre-data.git'
+       false
diff --git a/OPTED-dictinfo.txt b/OPTED-dictinfo.txt

new file mode 100644 (file)

index 0000000..8f9558b
--- /dev/null
+++ b/OPTED-dictinfo.txt
@@ -0,0 +1,9 @@
+The Online Plain Text English Dictionary is a public domain English word
+list dictionary, based on the public domain portion of "The Project
+Gutenberg Etext of Webster's Unabridged Dictionary" which is in turn
+based on the 1913 US Webster's Unabridged Dictionary.
+
+This version has been extensively stripped down and set out as one
+definition per line by Ralph Sutherland.
+
+Quickdic version prepared by Frédéric Perrin.
diff --git a/OPTED_to_tab_separated b/OPTED_to_tab_separated

index 479b3f0c5479ecd3ac345179ebb48b6033656eb0..7e99a6c18be72abbeced8d954be95423878ecc5b 100755 (executable)
--- a/OPTED_to_tab_separated
+++ b/OPTED_to_tab_separated
@@ -95,7 +95,7 @@ class OptedParser(HTMLParser):
  
  def main():
      opted_parser = OptedParser()
-    for fname in glob("OPTED/v003/wb1913_*.html"):
+    for fname in sorted(glob("OPTED/v003/wb1913_*.html")):
          print(f"Running on {fname}")
          with open(fname, mode="r",
                    encoding="macroman", errors="strict") as inputfile:
diff --git a/README.txt b/README.txt

new file mode 100644 (file)

index 0000000..108c28d
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,14 @@
+http://www.gutenberg.org/files/29765/29765-8.zip
+
+OPTED http://www.mso.anu.edu.au/~ralph/OPTED/optedv003.hqx
+Decode with "hexbin -d optedv003.hqx" then "unar -forks skip OPTED.sit.data"
+hexbin from package macutils ; unar from package of the same name.
+
+./run.sh --dictInfo="Webster-OPTED" --dictOut=OPTED.v007.quickdic --input1=../dictionaries/OPTED.tab_separated --input1Charset=UTF8 --input1Format=tab_separated --input1Name="Webster-OPTED" --lang1=EN --lang2=EN
+./run.sh --dictInfo="Webster's Unabridged Dictionary, from the OPTED project" --dictOut=OPTED.v007.quickdic --input1=../dictionaries/OPTED.chemnitz --input1Charset=UTF8 --input1Format=chemnitz --input1Name="Webster-OPTED" --lang1=EN --lang1Stoplist=data/inputs/stoplists/en.txt
+./convert_to_v6.sh OPTED.v007.quickdic OPTED.v006.quickdic
+
+
+ftp://ftp.gnu.org/gnu/gcide/gcide-0.52.tar.xz
+
+https://www.ebooksgratuits.com/details.php?book=1609
diff --git a/XMLittre-entree.xslt b/XMLittre-entree.xslt

new file mode 100644 (file)

index 0000000..97c1b1e
--- /dev/null
+++ b/XMLittre-entree.xslt
@@ -0,0 +1,99 @@
+<?xml version="1.0" encoding="utf-8"?>
+<xsl:stylesheet version="1.0"
+               xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+  <xsl:output method="xml" version="1.0" encoding="UTF-8"
+             indent="yes" standalone="no" omit-xml-declaration="yes"/>
+
+  <xsl:template match="@*|node()">
+    <xsl:copy>
+      <xsl:apply-templates select="@*|node()" />
+    </xsl:copy>
+  </xsl:template>
+
+  <!-- Render one entree as an HTML fragment: selected header elements, the
+       optional résumé, the corps, every rubrique, then the pronunciation. -->
+  <xsl:template match="entree">
+    <!-- Templates are applied to the selected header elements in document
+         order. -->
+    <xsl:apply-templates select="entete/nature | entete/indent | entete/cit | entete/a | entete/semantique"/>
+
+    <xsl:if test="résumé">
+      <p><b>RÉSUMÉ</b></p>
+    </xsl:if>
+    <!-- Applying templates to an empty selection does nothing, so no
+         surrounding test is needed. -->
+    <xsl:apply-templates select="résumé/indent"/>
+    <!-- A résumé without any variante produces no list at all. -->
+    <xsl:choose>
+      <xsl:when test="count(résumé/variante) &gt; 1">
+        <xsl:apply-templates select="résumé" mode="plusieurs_variantes"/>
+      </xsl:when>
+      <xsl:when test="count(résumé/variante) = 1">
+        <xsl:apply-templates select="résumé" mode="unique_variante"/>
+      </xsl:when>
+    </xsl:choose>
+    <xsl:if test="résumé">
+      <p><b>ENTRÉE PRINCIPALE</b></p>
+    </xsl:if>
+
+    <!-- Unlike the résumé, the corps is always rendered: as a numbered list
+         when it has several variantes, as a single block otherwise. -->
+    <xsl:choose>
+      <xsl:when test="count(corps/variante) &gt; 1">
+        <xsl:apply-templates select="corps" mode="plusieurs_variantes"/>
+      </xsl:when>
+      <xsl:otherwise>
+        <xsl:apply-templates select="corps" mode="unique_variante"/>
+      </xsl:otherwise>
+    </xsl:choose>
+
+    <!-- Rendered by the rubrique template, so that markup is defined once. -->
+    <xsl:apply-templates select="rubrique"/>
+
+    <xsl:if test="entete/prononciation">
+      <p><b>PRONONCIATION</b></p>
+      <div>
+        <xsl:apply-templates select="entete/prononciation/node()" />
+      </div>
+    </xsl:if>
+  </xsl:template>
+
+  <xsl:template match="nature">
+    <i><xsl:value-of select="text()"/></i>
+  </xsl:template>
+  <xsl:template match="indent">
+    <div style="padding-left: 1em"><xsl:apply-templates select="node()"/></div>
+  </xsl:template>
+
+  <xsl:template match="corps|résumé" mode="plusieurs_variantes">
+    <ol>
+      <xsl:for-each select="variante">
+       <li><xsl:apply-templates select="node()"/></li>
+      </xsl:for-each>
+    </ol>
+  </xsl:template>
+
+  <xsl:template match="corps|résumé" mode="unique_variante">
+    <div><xsl:apply-templates select="node()"/></div>
+  </xsl:template>
+
+  <xsl:template match="cit">
+    <p><q><xsl:apply-templates select="node()" /></q>
+      <xsl:if test="string-length(@ref)>0">, <i><xsl:value-of select="@ref"/></i></xsl:if>
+      <xsl:if test="string-length(@aut)>0">, <xsl:value-of select="@aut"/></xsl:if>
+    </p>
+  </xsl:template>
+
+  <xsl:template match="rubrique">
+    <p><b><xsl:value-of select="@nom"/></b></p>
+    <xsl:apply-templates select="node()"/>
+  </xsl:template>
+
+  <xsl:template match="exemple">
+    <q><xsl:apply-templates select="node()"/></q>
+  </xsl:template>
+  <xsl:template match="semantique">
+    <xsl:apply-templates select="node()"/>
+  </xsl:template>
+  <xsl:template match="span">
+    <xsl:apply-templates select="node()"/>
+  </xsl:template>
+
+</xsl:stylesheet>
diff --git a/XMLittre-stoplist.txt b/XMLittre-stoplist.txt

new file mode 100644 (file)

index 0000000..1008221
--- /dev/null
+++ b/XMLittre-stoplist.txt
@@ -0,0 +1,45 @@
+ou
+euse
+ante
+ale
+ive
+ienne
+ie
+s
+ue
+trice
+ente
+elle
+ine
+enne
+et
+es
+ette
+aine
+ite
+ise
+arde
+onne
+ane
+de
+se
+aite
+oise
+aude
+einte
+ate
+erte
+la
+in
+use
+d
+eure
+ote
+eule
+l
+un
+en
+ecte
+uite
+une
+simplement
diff --git a/XMLittre_to_tab_separated b/XMLittre_to_tab_separated

new file mode 100755 (executable)

index 0000000..0c041d9
--- /dev/null
+++ b/XMLittre_to_tab_separated
@@ -0,0 +1,48 @@
+#!/usr/bin/python3
+
+from glob import glob
+from lxml import etree
+
+xslt_entree = etree.XSLT(etree.parse("XMLittre-entree.xslt"))
+
+class Parser:
+    """Collect rendered dictionary entries, grouped by headword.
+
+    parse_file() may be called for any number of XML files; writeout() then
+    writes one "headword<TAB>html" line per headword.
+    """
+
+    def __init__(self):
+        # headword (the "terme" attribute) -> list of rendered HTML fragments,
+        # one per entree carrying that headword, in the order they were parsed
+        self.entrees = {}
+
+    def parse_file(self, fname):
+        """Render every child of the root element of fname and record it.
+
+        Each child must be an "entree" element with a "terme" attribute.
+        """
+        root = etree.parse(fname).getroot()
+        # Iterating an element yields its children; this replaces the
+        # deprecated getchildren().
+        for entree in root:
+            assert entree.tag == "entree"
+            terme = entree.attrib["terme"]
+            # The output format is line-based, so the newlines produced by the
+            # stylesheet are removed.
+            entree_text = str(xslt_entree(entree)).replace("\n", "")
+            self.entrees.setdefault(terme, []).append(entree_text)
+
+    def writeout(self, fname):
+        """Write the collected entries to fname, one line per headword.
+
+        A headword with several entries gets them wrapped in an ordered list;
+        a headword with a single entry gets that entry as is.
+        """
+        # The encoding is explicit because the Makefile hands this file to
+        # DictionaryPC with input1Charset=UTF8.
+        with open(fname, "w", encoding="utf-8") as f:
+            for terme, entrees in self.entrees.items():
+                if len(entrees) > 1:
+                    items = "".join("<li>" + entree + "</li>" for entree in entrees)
+                    body = "<ol>" + items + "</ol>"
+                else:
+                    body = entrees[0]
+                f.write(terme + "\t" + body + "\n")
+
+def main():
+    """Convert the XML files of ../xmlittre-data into XMLittre.tab_separated.
+
+    Only files whose name is a single character followed by ".xml" are read.
+    They are processed in sorted order so that the output does not depend on
+    the order in which the filesystem happens to list them.
+    """
+    p = Parser()
+    for fname in sorted(glob("../xmlittre-data/?.xml")):
+        p.parse_file(fname)
+    p.writeout("XMLittre.tab_separated")
+if __name__ == "__main__":
+    main()
author	Frédéric Perrin <fred@fperrin.net>
	Sat, 6 Feb 2021 11:09:46 +0000 (11:09 +0000)
committer	Frédéric Perrin <fred@fperrin.net>
	Sat, 6 Feb 2021 11:09:46 +0000 (11:09 +0000)
GCIDE_to_tab_separated	[new file with mode: 0755]	patch \| blob
Makefile		patch \| blob \| history
OPTED-dictinfo.txt	[new file with mode: 0644]	patch \| blob
OPTED_to_tab_separated		patch \| blob \| history
README.txt	[new file with mode: 0644]	patch \| blob
XMLittre-entree.xslt	[new file with mode: 0644]	patch \| blob
XMLittre-stoplist.txt	[new file with mode: 0644]	patch \| blob
XMLittre_to_tab_separated	[new file with mode: 0755]	patch \| blob