From 028a6accf7d7d721797482d65dd4dc9d13611840 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Fr=C3=A9d=C3=A9ric=20Perrin?= <fred@fperrin.net>
Date: Sat, 6 Feb 2021 11:09:46 +0000
Subject: [PATCH] snapshot of current effort

---
 GCIDE_to_tab_separated    | 173 ++++++++++++++++++++++++++++++++++++++
 Makefile                  |  64 +++++++-------
 OPTED-dictinfo.txt        |   9 ++
 OPTED_to_tab_separated    |   2 +-
 README.txt                |  14 +++
 XMLittre-entree.xslt      |  99 ++++++++++++++++++++++
 XMLittre-stoplist.txt     |  45 ++++++++++
 XMLittre_to_tab_separated |  48 +++++++++++
 8 files changed, 424 insertions(+), 30 deletions(-)
 create mode 100755 GCIDE_to_tab_separated
 create mode 100644 OPTED-dictinfo.txt
 create mode 100644 README.txt
 create mode 100644 XMLittre-entree.xslt
 create mode 100644 XMLittre-stoplist.txt
 create mode 100755 XMLittre_to_tab_separated
diff --git a/GCIDE_to_tab_separated b/GCIDE_to_tab_separated
new file mode 100755
index 0000000..99a4973
--- /dev/null
+++ b/GCIDE_to_tab_separated
@@ -0,0 +1,173 @@
+#!/usr/bin/python3
+
+import lxml.etree
+import re
+
+entity_map = {
+    "<br/":    "<br/>",
+    "&":       "&amp;",
+    "<lt/":    "&lt;",
+    "<gt/":    "&gt;",
+    "--":      "â", # long (em) dash
+    "<Cced/":  "Ã", # C cedilla
+    "<uum/":   "Ã¼", # u umlaut (diaeresis)
+    "<eacute/":"Ã©", # e acute
+    "<acir/":  "Ã¢", # a circumflex
+    "<aum/":   "Ã¤", # a umlaut (diaeresis)
+    "<agrave/":"Ã ", # a grave
+    "<aring/": "Ã¥", # a ring above
+    "<cced/":  "Ã§", # c cedilla
+    "<ecir/":  "Ãª", # e circumflex
+    "<eum/":   "Ã«", # e umlaut (diaeresis)
+    "<egrave/":"Ã¨", # e grave
+    "<ium/":   "Ã¯", # i umlaut (diaeresis)
+    "<icir/":  "Ã®", # i circumflex
+    "<icirc/": "Ã®", # i circumflex
+    "<igrave/":"Ã¬", # i grave
+    "<Aum/":   "Ã", # A umlaut
+    "<Eacute/":"Ã", # E acute
+    "<ae/":    "Ã¦", # ligature ae
+    "<AE/":    "Ã", # ligature AE
+    "<ocir/":  "Ã´", # o circumflex
+    "<oum/":   "Ã¶", # o umlaut (diaeresis)
+    "<ograve/":"Ã²", # o grave
+    "<ucir/":  "Ã»", # u circumflex
+    "<ugrave/":"Ã¹", # u grave
+    "<yum/":   "Ã¿", # y umlaut
+    "<Oum/":   "Ã", # O umlaut
+    "<Uum/":   "Ã", # U umlaut (diaeresis)
+    "<pound/": "Â£", # pound sign (British)
+    "<aacute/":"Ã¡", # a acute
+    "<iacute/":"Ã­", # i acute
+    "<oacute/":"Ã³", # o acute
+    "<uacute/":"Ãº", # u acute
+    "<ntil/":  "Ã±", # n tilde
+    "<Ntil/":  "Ã", # N tilde
+    "<frac23/":"â", # two-thirds
+    "<frac13/":"â", # one-third
+    "<sec/":   "Ë", # seconds (of degree or time). Also, inches or double prime.
+    "<frac12/":"Â½", # one-half
+    "<frac14/":"Â¼", # one-quarter
+    "<hand/":  "â", # pointing hand (printer's "fist")
+    "<bprime/":"Ë", # bold accent (used in pronunciations)
+    "<prime/": "Â´", # light accent (used in pronunciations) also minutes (of
+                    # arc or time)
+    "<rdquo/": "â", # close double quote
+    "<sect/":  "Â§", # section mark
+    "<ldquo/": "â", # open double quotes
+    "<amac/":  "Ä", # a macron
+    "<lsquo/": "â", # left single quote
+    "<nsm/":   "á¹", # "n sub-macron"
+    "<sharp/": "â¯", # musical sharp
+    "<flat/":  "â­", # musical flat
+    "<imac/":  "Ä«", # i macron
+    "<emac/":  "Ä", # e macron
+    "<dsdot/": "á¸", # Sanskrit/Tamil d dot 
+    "<nsdot/": "á¹", # Sanskrit/Tamil n dot
+    "<tsdot/": "á¹­", # Sanskrit/Tamil t dot
+    "<ecr/":   "Ä", # e breve
+    "<icr/":   "Ä­", # i breve
+    "<ocr/":   "Å", # o breve
+    "<OE/":    "Å", # OE ligature
+    "<oe/":    "Å", # oe ligature
+    "<omac/":  "Å", # o macron
+    "<umac/":  "Å«", # u macron
+    "<ocar/":  "Ç", # o hacek
+    "<aemac/": "Ç£", # ae ligature macron
+    "<oemac/": "Å", # oe ligature macron
+    "<ucr/":   "Å­", # u breve
+    "<acr/":   "Ä", # a breve
+    "<cre/":   "Ë", # crescent (like a breve, but vertically centered --
+                    # represents the short accent in poetic meter)
+    "<ymac/":  "È³", # y macron
+    "<edh/":   "Ã°", # small eth
+    "<thorn/": "Ã¾", # small thorn
+    "<atil/":  "Ã£", # a tilde
+    "<ndot/":  "á¹", # n with dot above
+    "<rsdot/": "á¹", # r with a dot below
+    "<yogh/":  "È", # small yogh
+    "<mdash/": "â", # em dash
+    "<divide/":"Ã·", # division sign
+    "<deg/":   "Â°", # degree sign
+    "<middot/":"â¢", # bold middle dot
+    "<root/":  "â", # root sign
+    "<adot/":  "È§", # a with dot above
+
+    "<?/":     "?", #(?) Place-holder for unknown or illegible character.
+
+    # used only in the pronunciation key; I could not find a Unicode character
+    # showing the "short vertical bar on top".
+    "<asl/":   "a", #  a "semilong" (has a macron above with a short
+                    # vertical bar on top the center of the macron)
+                    # Used in pronunciations.
+    "<esl/":   "e", # e "semilong"
+    "<isl/":   "i", # i "semilong"
+    "<osl/":   "o", # o "semilong"
+    "<usl/":   "u", # u "semilong"
+    "<th/":    "th",# th ligature
+    "<ait/":   "ð", # a italic
+    "<eit/":   "ð",
+    "<iit/":   "ð",
+    "<oit/":   "ð",
+    "<uit/":   "ð¢",
+    "<add/":   "a", # a with two dot below
+    "<edd/":   "e",
+    "<idd/":   "i",
+    "<odd/":   "o",
+    "<udd/":   "u",
+    "<oocr/":  "oo",
+    "<oomac/":  "oo",
+    "<etil/":  "áº½",
+    "<ycr/":   "Ñ",
+
+    # greek letters
+    "<alpha/": "Î±",         "<ALPHA/": "Î",
+    "<beta/": "Î²",          "<BETA/": "Î",
+    "<gamma/": "Î³",         "<GAMMA/": "Î",
+    "<delta/": "Î´",         "<DELTA/": "Î",
+    "<epsilon/": "Îµ",       "<EPSILON/": "Î",
+    "<zeta/": "Î¶",          "<ZETA/": "Î",
+    "<eta/": "Î·",           "<ETA/": "Î",
+    "<theta/": "Î¸",         "<THETA/": "Î",
+    "<iota/": "Î¹",          "<IOTA/": "Î",
+    "<kappa/": "Îº",         "<KAPPA/": "Î",
+    "<lambda/": "Î»",        "<LAMBDA/": "Î",
+    "<mu/": "Î¼",            "<MU/": "Î",
+    "<nu/": "Î½",            "<NU/": "Î",
+    "<xi/": "Î¾",            "<XI/": "Î",
+    "<omicron/": "Î¿",       "<OMICRON/": "Î",
+    "<pi/": "Ï",            "<PI/": "Î ",
+    "<rho/": "Ï",           "<RHO/": "Î¡",
+    "<sigma/": "Ï",         "<SIGMA/": "Î£",
+    "<tau/": "Ï",           "<TAU/": "Î¤",
+    "<upsilon/": "Ï",       "<UPSILON/": "Î¥",
+    "<phi/": "Ï",           "<PHI/": "Î¦",
+    "<chi/": "Ï",           "<CHI/": "Î§",
+    "<psi/": "Ï",           "<PSI/": "Î¨",
+    "<omega/": "Ï",         "<OMEGA/": "Î©",
+
+    # then there are some characters that are shown as escape sequences
+    r"\'94":   "Ã¶",
+    r"\'d8":   "â",
+    r"/'bd":   "â",        # one instance where / is used instead of \
+    r" 'bd":   "â",        # two instances where \ is misssing
+    r"`'b8":   "â",        # one instance where ` is used instead of \
+
+    # entities that appear in the etymology of Arabic words, but no explanation
+    # of what they stand for. Not displayed at all by GNU dico.
+    "<hsdot/": "",
+    "<zsdot/": "",
+}
+
+def replace_fake_comments(match):
+    """Return as many newlines as the matched text has, and nothing else."""
+    return "\n" * match[0].count("\n")
+
+def convert_file(fname):
+    """Parse GCIDE file `fname`; return its content under a <dict> root."""
+    with open(fname, "r") as f:  # close the handle instead of leaking it
+        rawtext = f.read()
+    rawtext = re.sub(r"<--.+?-->", replace_fake_comments, rawtext, flags=re.DOTALL)
+    for entity, char in entity_map.items():  # no key can match the <dict> tags
+        rawtext = rawtext.replace(entity, char)
+    return lxml.etree.XML("<dict>" + rawtext + "</dict>")  # one root element
diff --git a/Makefile b/Makefile
index 3fc8c53..d3ebd70 100644
--- a/Makefile
+++ b/Makefile
@@ -1,41 +1,47 @@
-OPTED_SOURCEDIR = OPTED/v003
+all: OPTED.v006.quickdic XMLittre.v006.quickdic
 
-OPTED_FILES = $(addprefix $(OPTED_SOURCEDIR)/wb1913_,$(addsuffix .html,$(shell bash -c 'echo {a..z} new')))
+%.tab_separated: %_to_tab_separated
+	./$*_to_tab_separated
 
-essai:
-	@echo $(FILES)
+%.v007.quickdic: %.tab_separated
+	echo $(dictlang)
+	[ ! -z $(dictlang) ]
+	cd ../DictionaryPC && ./run.sh --dictInfo=@$(CURDIR)/$*-dictinfo.txt --input1Charset=UTF8 --input1Format=tab_separated --input1Name=$* --lang1=$(dictlang) --lang1Stoplist=$(CURDIR)/$*-stoplist.txt --input1=$(CURDIR)/$< --dictOut=$(CURDIR)/$@  --print=$(CURDIR)/$@.txt
 
-all: OPTED.v006-from-tab_separated.quickdic
+%.v006.quickdic: %.v007.quickdic
+	rm -f $@
+	cd ../DictionaryPC && ./convert_to_v6.sh $(CURDIR)/$< $(CURDIR)/$@
 
-# optedv003.hqx:
-# 	wget 'http://www.mso.anu.edu.au/~ralph/OPTED/optedv003.hqx'
+clean:
+	rm -f *.tab_separated *.chemnitz
+	rm -f *.quickdic *.quickdic.txt
+	rm -fr OPTED/
 
-$(OPTED_FILES): optedv003.hqx
-	hexbin -d $<
-	unar OPTED.sit.data
-	find OPTED -type f | xargs sed -i 's/\r/\n/g'
-	cd OPTED && patch -p1 < ../OPTED.patch
+OPTED.v007.quickdic: dictlang := EN
+XMLittre.v007.quickdic: dictlang := FR
 
-OPTED.tab_separated: $(OPTED_FILES)
-	./OPTED_to_tab_separated
+OPTED_SOURCEDIR = OPTED/v003
 
-OPTED.v007-from-tab_separated.quickdic: OPTED.tab_separated
-	cd ../DictionaryPC && ./run.sh --dictInfo="Webster's Unabridged Dictionary, from the OPTED project" --input1=$(CURDIR)/$< --input1Charset=UTF8 --input1Format=tab_separated --input1Name="Webster-OPTED" --lang1=EN --lang1Stoplist=data/inputs/stoplists/en.txt --dictOut=$(CURDIR)/$@  --print=$(CURDIR)/$@.txt
+OPTED_FILES = $(shell bash -c 'for l in {a..z} new; do echo $(OPTED_SOURCEDIR)/wb1913_$$l.html; done')
 
-OPTED.v006-from-tab_separated.quickdic: OPTED.v007-from-tab_separated.quickdic
-	rm -f $@
-	cd ../DictionaryPC && ./convert_to_v6.sh $(CURDIR)/$< $(CURDIR)/$@
+optedv003.hqx:
+	echo Manually run that command to fetch the OPTED raw data
+	echo wget 'http://www.mso.anu.edu.au/~ralph/OPTED/optedv003.hqx'
+	false
 
-OPTED.chemnitz: OPTED/v003/wb1913_a.html
-	./OPTED_to_chemnitz
+$(OPTED_FILES): optedv003.hqx
+	hexbin -d optedv003.hqx
+	unar -f OPTED.sit.data
+	find OPTED -type f | xargs sed -i 's/\r/\n/g'
+	cd OPTED && patch -p1 < ../OPTED.patch
 
-OPTED.v007-from-chemnitz.quickdic: OPTED.chemnitz
-	cd ../DictionaryPC && ./run.sh --dictInfo="Webster's Unabridged Dictionary, from the OPTED project" --input1=$(CURDIR)/$< --input1Charset=UTF8 --input1Format=chemnitz --input1Name="Webster-OPTED" --lang1=EN --lang1Stoplist=data/inputs/stoplists/en.txt --dictOut=$(CURDIR)/$@  --print=$(CURDIR)/$@.txt
+OPTED.tab_separated: $(OPTED_FILES) OPTED.patch
 
-OPTED.v006-from-chemnitz.quickdic: OPTED.v007-from-chemnitz.quickdic
-	rm -f $@
-	cd ../DictionaryPC && ./convert_to_v6.sh $(CURDIR)/$< $(CURDIR)/$@
+XMLittre.tab_separated: XMLittre-entree.xslt
+XMLittre.tab_separated: ../xmlittre-data/
 
-clean:
-	rm -f OPTED.*.quickdic OPTED.tab_separated OPTED.chemnitz
-	rm -fr OPTED/
+../xmlittre-data/:  # must be spelled like the XMLittre.tab_separated prerequisite
+	echo Manually run that command to fetch XMLittre data
+	echo cd $(CURDIR)/..
+	echo git clone 'https://bitbucket.org/Mytskine/xmlittre-data.git'
+	false
diff --git a/OPTED-dictinfo.txt b/OPTED-dictinfo.txt
new file mode 100644
index 0000000..8f9558b
--- /dev/null
+++ b/OPTED-dictinfo.txt
@@ -0,0 +1,9 @@
+The Online Plain Text English Dictionary is a public domain English word
+list dictionary, based on the public domain portion of "The Project
+Gutenberg Etext of Webster's Unabridged Dictionary" which is in turn
+based on the 1913 US Webster's Unabridged Dictionary.
+
+This version has been extensively stripped down and set out as one
+definition per line by Ralph Sutherland.
+
+Version Quickdic prepared by Frédéric Perrin.
diff --git a/OPTED_to_tab_separated b/OPTED_to_tab_separated
index 479b3f0..7e99a6c 100755
--- a/OPTED_to_tab_separated
+++ b/OPTED_to_tab_separated
@@ -95,7 +95,7 @@ class OptedParser(HTMLParser):
 
 def main():
     opted_parser = OptedParser()
-    for fname in glob("OPTED/v003/wb1913_*.html"):
+    for fname in sorted(glob("OPTED/v003/wb1913_*.html")):
         print(f"Running on {fname}")
         with open(fname, mode="r",
                   encoding="macroman", errors="strict") as inputfile:
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..108c28d
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,14 @@
+http://www.gutenberg.org/files/29765/29765-8.zip
+
+OPTED http://www.mso.anu.edu.au/~ralph/OPTED/optedv003.hqx
+Decode with "hexbin -d optedv003.hqx" then "unar -forks skip OPTED.sit.data"
+hexbin from package macutils ; unar from package of the same name.
+
+./run.sh --dictInfo="Webster-OPTED" --dictOut=OPTED.v007.quickdic --input1=../dictionaries/OPTED.tab_separated --input1Charset=UTF8 --input1Format=tab_separated --input1Name="Webster-OPTED" --lang1=EN --lang2=EN
+./run.sh --dictInfo="Webster's Unabridged Dictionary, from the OPTED project" --dictOut=OPTED.v007.quickdic --input1=../dictionaries/OPTED.chemnitz --input1Charset=UTF8 --input1Format=chemnitz --input1Name="Webster-OPTED" --lang1=EN --lang1Stoplist=data/inputs/stoplists/en.txt
+./convert_to_v6.sh OPTED.v007.quickdic OPTED.v006.quickdic
+
+
+ftp://ftp.gnu.org/gnu/gcide/gcide-0.52.tar.xz
+
+https://www.ebooksgratuits.com/details.php?book=1609
diff --git a/XMLittre-entree.xslt b/XMLittre-entree.xslt
new file mode 100644
index 0000000..97c1b1e
--- /dev/null
+++ b/XMLittre-entree.xslt
@@ -0,0 +1,99 @@
+<?xml version="1.0" encoding="utf-8"?>
+<xsl:stylesheet version="1.0"
+		xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+  <xsl:output method="xml" version="1.0" encoding="UTF-8"
+  	      indent="yes" standalone="no" omit-xml-declaration="yes"/>
+
+  <!-- Default rule: copy every attribute and node as is, recursing so that
+       the more specific templates of this stylesheet apply to descendants. -->
+  <xsl:template match="@*|node()">
+    <xsl:copy><xsl:apply-templates select="@*|node()"/></xsl:copy>
+  </xsl:template>
+
+  <!-- One entry: header items, optional résumé, corps, rubrique sections,
+       then the pronunciation.  Names and headings are real UTF-8, as the XML
+       declaration says; a doubly-encoded spelling cannot match "résumé". -->
+  <xsl:template match="entree">
+    <xsl:apply-templates select="entete/nature | entete/indent | entete/cit | entete/a | entete/semantique"/>
+
+    <xsl:if test="résumé">
+      <p><b>RÉSUMÉ</b></p>
+    </xsl:if>
+    <!-- Applying templates to an empty node-set does nothing: no xsl:if. -->
+    <xsl:apply-templates select="résumé/indent"/>
+    <xsl:choose>
+      <xsl:when test="count(résumé/variante) &gt; 1">
+        <xsl:apply-templates select="résumé" mode="plusieurs_variantes"/>
+      </xsl:when>
+      <xsl:when test="count(résumé/variante) = 1">
+        <xsl:apply-templates select="résumé" mode="unique_variante"/>
+      </xsl:when>
+    </xsl:choose>
+    <xsl:if test="résumé">
+      <p><b>ENTRÉE PRINCIPALE</b></p>
+    </xsl:if>
+
+    <xsl:choose>
+      <xsl:when test="count(corps/variante) &gt; 1">
+        <xsl:apply-templates select="corps" mode="plusieurs_variantes"/>
+      </xsl:when>
+      <xsl:otherwise>
+        <xsl:apply-templates select="corps" mode="unique_variante"/>
+      </xsl:otherwise>
+    </xsl:choose>
+
+    <!-- Rendered by the match="rubrique" template of this stylesheet. -->
+    <xsl:apply-templates select="rubrique"/>
+
+    <xsl:if test="entete/prononciation">
+      <p><b>PRONONCIATION</b></p>
+      <div>
+        <xsl:apply-templates select="entete/prononciation/node()"/>
+      </div>
+    </xsl:if>
+  </xsl:template>
+
+  <xsl:template match="nature">
+    <i><xsl:value-of select="text()"/></i>
+  </xsl:template>
+  <xsl:template match="indent">
+    <div style="padding-left: 1em"><xsl:apply-templates select="node()"/></div>
+  </xsl:template>
+
+  <!-- Several variantes: one numbered list item per variante. -->
+  <xsl:template match="corps|résumé" mode="plusieurs_variantes">
+    <ol>
+      <xsl:for-each select="variante">
+        <li><xsl:apply-templates select="node()"/></li>
+      </xsl:for-each>
+    </ol>
+  </xsl:template>
+
+  <!-- At most one variante: the whole content in a plain block. -->
+  <xsl:template match="corps|résumé" mode="unique_variante">
+    <div><xsl:apply-templates select="node()"/></div>
+  </xsl:template>
+
+  <!-- A quotation, followed by its @ref and @aut when they are non-empty. -->
+  <xsl:template match="cit">
+    <p><q><xsl:apply-templates/></q>
+      <xsl:if test="@ref != ''">, <i><xsl:value-of select="@ref"/></i></xsl:if>
+      <xsl:if test="@aut != ''">, <xsl:value-of select="@aut"/></xsl:if></p>
+  </xsl:template>
+
+  <xsl:template match="rubrique"> <!-- @nom as a bold heading, then content -->
+    <p><b><xsl:value-of select="@nom"/></b></p>
+    <xsl:apply-templates/>
+  </xsl:template>
+
+  <!-- An exemple is wrapped in an inline quotation element. -->
+  <xsl:template match="exemple">
+    <q><xsl:apply-templates/></q>
+  </xsl:template>
+  <!-- semantique and span are transparent: only their content is kept
+       (apply-templates without select processes all child nodes). -->
+  <xsl:template match="semantique|span">
+    <xsl:apply-templates/>
+  </xsl:template>
+
+</xsl:stylesheet>
diff --git a/XMLittre-stoplist.txt b/XMLittre-stoplist.txt
new file mode 100644
index 0000000..1008221
--- /dev/null
+++ b/XMLittre-stoplist.txt
@@ -0,0 +1,45 @@
+ou
+euse
+ante
+ale
+ive
+ienne
+ie
+s
+ue
+trice
+ente
+elle
+ine
+enne
+et
+es
+ette
+aine
+ite
+ise
+arde
+onne
+ane
+de
+se
+aite
+oise
+aude
+einte
+ate
+erte
+la
+in
+use
+d
+eure
+ote
+eule
+l
+un
+en
+ecte
+uite
+une
+simplement
diff --git a/XMLittre_to_tab_separated b/XMLittre_to_tab_separated
new file mode 100755
index 0000000..0c041d9
--- /dev/null
+++ b/XMLittre_to_tab_separated
@@ -0,0 +1,48 @@
+#!/usr/bin/python3
+
+from glob import glob
+from lxml import etree
+
+xslt_entree = etree.XSLT(etree.parse("XMLittre-entree.xslt"))
+
+class Parser:
+    """Collect XMLittre entries and write them as tab-separated HTML."""
+
+    def __init__(self):
+        # Maps each headword ("terme") to the HTML of every entry sharing it.
+        self.entrees = {}
+
+    def parse_file(self, fname):
+        """Render every <entree> child of the root of `fname` and record it."""
+        # Iterate the root directly: lxml deprecates Element.getchildren().
+        for entree in etree.parse(fname).getroot():
+            # Explicit check: an assert would be stripped by "python -O".
+            if entree.tag != "entree":
+                raise ValueError(f"{fname}: unexpected element {entree.tag!r}")
+            # One record per line, so drop the newlines of the indented HTML.
+            html = str(xslt_entree(entree)).replace("\n", "")
+            self.entrees.setdefault(entree.attrib["terme"], []).append(html)
+
+    def writeout(self, fname):
+        """Write one "terme<TAB>html" line per headword to `fname`.
+
+        Headwords with several entries get them wrapped in a single <ol>.
+        """
+        # UTF-8 explicitly: the Makefile passes --input1Charset=UTF8.
+        with open(fname, "w", encoding="utf-8") as f:
+            for terme, entrees in self.entrees.items():
+                if len(entrees) > 1:
+                    items = "".join(f"<li>{e}</li>" for e in entrees)
+                    html = f"<ol>{items}</ol>"
+                else:
+                    html = entrees[0]
+                f.write(f"{terme}\t{html}\n")
+
+def main():
+    p = Parser()
+    for fname in sorted(glob("../xmlittre-data/?.xml")):  # reproducible order
+        p.parse_file(fname)
+    p.writeout("XMLittre.tab_separated")  # prerequisite of the quickdic rule
+
+if __name__ == "__main__":
+    main()
-- 
2.43.0