--- /dev/null
+#!/usr/bin/python3
+
+import lxml.etree
+import re
+
# Maps the dictionary's pseudo-entities (written "<name/", i.e. without a
# closing ">") and a few stray escape sequences to their replacement text.
#
# The replacements are applied one after another, in insertion order, with
# plain str.replace() and the result is then parsed as XML.  Consequences:
#   * characters that are special in XML ("&", "<", ">") must be emitted as
#     XML character entities, otherwise the parser rejects the document;
#   * the "&" entry must stay BEFORE every entry whose replacement contains
#     "&" (currently "<lt/" and "<gt/"), or those would be escaped twice.
entity_map = {
    "<br/": "<br/>",
    "&": "&amp;",
    "<lt/": "&lt;",
    "<gt/": "&gt;",
    "--": "—",  # long (em) dash
    "<Cced/": "Ç",  # C cedilla
    "<uum/": "ü",  # u umlaut (diaeresis)
    "<eacute/": "é",  # e acute
    "<acir/": "â",  # a circumflex
    "<aum/": "ä",  # a umlaut (diaeresis)
    "<agrave/": "à",  # a grave
    "<aring/": "å",  # a ring above
    "<cced/": "ç",  # c cedilla
    "<ecir/": "ê",  # e circumflex
    "<eum/": "ë",  # e umlaut (diaeresis)
    "<egrave/": "è",  # e grave
    "<ium/": "ï",  # i umlaut (diaeresis)
    "<icir/": "î",  # i circumflex
    "<icirc/": "î",  # i circumflex
    "<igrave/": "ì",  # i grave
    "<Aum/": "Ä",  # A umlaut
    "<Eacute/": "É",  # E acute
    "<ae/": "æ",  # ligature ae
    "<AE/": "Æ",  # ligature AE
    "<ocir/": "ô",  # o circumflex
    "<oum/": "ö",  # o umlaut (diaeresis)
    "<ograve/": "ò",  # o grave
    "<ucir/": "û",  # u circumflex
    "<ugrave/": "ù",  # u grave
    "<yum/": "ÿ",  # y umlaut
    "<Oum/": "Ö",  # O umlaut
    "<Uum/": "Ü",  # U umlaut (diaeresis)
    "<pound/": "£",  # pound sign (British)
    "<aacute/": "á",  # a acute
    "<iacute/": "í",  # i acute
    "<oacute/": "ó",  # o acute
    "<uacute/": "ú",  # u acute
    "<ntil/": "ñ",  # n tilde
    "<Ntil/": "Ñ",  # N tilde
    "<frac23/": "⅔",  # two-thirds
    "<frac13/": "⅓",  # one-third
    "<sec/": "˝",  # seconds (of degree or time). Also, inches or double prime.
    "<frac12/": "½",  # one-half
    "<frac14/": "¼",  # one-quarter
    "<hand/": "☞",  # pointing hand (printer's "fist")
    "<bprime/": "˝",  # bold accent (used in pronunciations)
    "<prime/": "´",  # light accent (used in pronunciations) also minutes (of
                     # arc or time)
    "<rdquo/": "”",  # close double quote
    "<sect/": "§",  # section mark
    "<ldquo/": "“",  # open double quotes
    "<amac/": "ā",  # a macron
    "<lsquo/": "‘",  # left single quote
    "<nsm/": "ṉ",  # "n sub-macron"
    "<sharp/": "♯",  # musical sharp
    "<flat/": "♭",  # musical flat
    "<imac/": "ī",  # i macron
    "<emac/": "ē",  # e macron
    "<dsdot/": "ḍ",  # Sanskrit/Tamil d dot
    "<nsdot/": "ṇ",  # Sanskrit/Tamil n dot
    "<tsdot/": "ṭ",  # Sanskrit/Tamil t dot
    "<ecr/": "ĕ",  # e breve
    "<icr/": "ĭ",  # i breve
    "<ocr/": "ŏ",  # o breve
    "<OE/": "Œ",  # OE ligature
    "<oe/": "œ",  # oe ligature
    "<omac/": "ō",  # o macron
    "<umac/": "ū",  # u macron
    "<ocar/": "ǒ",  # o hacek
    "<aemac/": "ǣ",  # ae ligature macron
    "<oemac/": "ō",  # oe ligature macron (approximated by o macron)
    "<ucr/": "ŭ",  # u breve
    "<acr/": "ă",  # a breve
    "<cre/": "˘",  # crescent (like a breve, but vertically centered --
                   # represents the short accent in poetic meter)
    "<ymac/": "ȳ",  # y macron
    "<edh/": "ð",  # small eth
    "<thorn/": "þ",  # small thorn
    "<atil/": "ã",  # a tilde
    "<ndot/": "ṅ",  # n with dot above
    "<rsdot/": "ṛ",  # r with a dot below
    "<yogh/": "ȝ",  # small yogh
    "<mdash/": "—",  # em dash
    "<divide/": "÷",  # division sign
    "<deg/": "°",  # degree sign
    "<middot/": "•",  # bold middle dot
    "<root/": "√",  # root sign
    "<adot/": "ȧ",  # a with dot above

    "<?/": "?",  # (?) Place-holder for unknown or illegible character.

    # Used only in the pronunciation key; no Unicode character was found for
    # the "short vertical bar on top", so the bare letter is used instead.
    "<asl/": "a",  # a "semilong" (has a macron above with a short
                   # vertical bar on top of the center of the macron).
                   # Used in pronunciations.
    "<esl/": "e",  # e "semilong"
    "<isl/": "i",  # i "semilong"
    "<osl/": "o",  # o "semilong"
    "<usl/": "u",  # u "semilong"
    "<th/": "th",  # th ligature
    "<ait/": "𝑎",  # a italic
    "<eit/": "𝑒",
    "<iit/": "𝑖",
    "<oit/": "𝑜",
    "<uit/": "𝑢",
    "<add/": "a",  # a with two dots below
    "<edd/": "e",
    "<idd/": "i",
    "<odd/": "o",
    "<udd/": "u",
    "<oocr/": "oo",
    "<oomac/": "oo",
    "<etil/": "ẽ",
    "<ycr/": "ў",

    # Greek letters
    "<alpha/": "α", "<ALPHA/": "Α",
    "<beta/": "β", "<BETA/": "Β",
    "<gamma/": "γ", "<GAMMA/": "Γ",
    "<delta/": "δ", "<DELTA/": "Δ",
    "<epsilon/": "ε", "<EPSILON/": "Ε",
    "<zeta/": "ζ", "<ZETA/": "Ζ",
    "<eta/": "η", "<ETA/": "Η",
    "<theta/": "θ", "<THETA/": "Θ",
    "<iota/": "ι", "<IOTA/": "Ι",
    "<kappa/": "κ", "<KAPPA/": "Κ",
    "<lambda/": "λ", "<LAMBDA/": "Λ",
    "<mu/": "μ", "<MU/": "Μ",
    "<nu/": "ν", "<NU/": "Ν",
    "<xi/": "ξ", "<XI/": "Ξ",
    "<omicron/": "ο", "<OMICRON/": "Ο",
    "<pi/": "π", "<PI/": "Π",
    "<rho/": "ρ", "<RHO/": "Ρ",
    "<sigma/": "σ", "<SIGMA/": "Σ",
    "<tau/": "τ", "<TAU/": "Τ",
    "<upsilon/": "υ", "<UPSILON/": "Υ",
    "<phi/": "φ", "<PHI/": "Φ",
    "<chi/": "χ", "<CHI/": "Χ",
    "<psi/": "ψ", "<PSI/": "Ψ",
    "<omega/": "ω", "<OMEGA/": "Ω",

    # Then there are some characters that are shown as escape sequences.
    r"\'94": "ö",
    r"\'d8": "‖",
    r"/'bd": "“",  # one instance where / is used instead of \
    r" 'bd": "“",  # two instances where \ is missing
    r"`'b8": "”",  # one instance where ` is used instead of \

    # Entities that appear in the etymology of Arabic words, with no
    # explanation of what they stand for.  Not displayed at all by GNU dico.
    "<hsdot/": "",
    "<zsdot/": "",
}
+
def replace_fake_comments(match):
    """Return the matched text reduced to its newline characters.

    Intended as the replacement callback of ``re.sub``: the matched span is
    dropped, but one "\n" is emitted for every "\n" it contained, so the
    number of lines in the surrounding text does not change.

    :param match: regular-expression match object for the span to blank out.
    :returns: a string made only of newlines (empty if the span had none).
    """
    matched_text = match.group(0)
    return "".join(ch for ch in matched_text if ch == "\n")
+
def convert_file(fname):
    """Read a raw dictionary file and parse it as a single XML document.

    Steps, in order:
      1. read the whole file as text;
      2. blank out every ``<-- ... -->`` pseudo-comment, keeping its line
         breaks so that line numbers reported by the parser still match the
         source file;
      3. wrap the text in a ``<dict>`` root element;
      4. replace every key of ``entity_map`` by its value, in the map's
         insertion order;
      5. parse the result with ``lxml.etree.XML``.

    :param fname: path of the file to convert.
    :returns: the root element produced by ``lxml.etree.XML``.
    :raises OSError: if the file cannot be opened or read.
    :raises lxml.etree.XMLSyntaxError: if the substituted text is not
        well-formed XML.
    """
    # NOTE(review): the file is decoded with the platform default encoding,
    # as before -- confirm the encoding of the source files and pin it.
    with open(fname, "r") as source:
        rawtext = source.read()

    rawtext = re.sub(r"<--.+?-->", replace_fake_comments, rawtext,
                     flags=re.DOTALL)
    rawtext = "<dict>" + rawtext + "</dict>"
    for entity, char in entity_map.items():
        rawtext = rawtext.replace(entity, char)

    return lxml.etree.XML(rawtext)
# Builds QuickDic dictionaries (v007, then converted to v006) from
# tab-separated exports of the OPTED and XMLittre sources.

# Neither of these names a file that the recipes create.
.PHONY: all clean

all: OPTED.v006.quickdic XMLittre.v006.quickdic

# <name>.tab_separated is produced by the converter script <name>_to_tab_separated.
%.tab_separated: %_to_tab_separated
	./$*_to_tab_separated

# dictlang is a target-specific variable (set below for each dictionary);
# the test aborts the build early when it has not been set.
%.v007.quickdic: %.tab_separated
	echo $(dictlang)
	[ -n "$(dictlang)" ]
	cd ../DictionaryPC && ./run.sh --dictInfo=@$(CURDIR)/$*-dictinfo.txt --input1Charset=UTF8 --input1Format=tab_separated --input1Name=$* --lang1=$(dictlang) --lang1Stoplist=$(CURDIR)/$*-stoplist.txt --input1=$(CURDIR)/$< --dictOut=$(CURDIR)/$@ --print=$(CURDIR)/$@.txt

%.v006.quickdic: %.v007.quickdic
	rm -f $@
	cd ../DictionaryPC && ./convert_to_v6.sh $(CURDIR)/$< $(CURDIR)/$@

clean:
	rm -f *.tab_separated *.chemnitz
	rm -f *.quickdic *.quickdic.txt
	rm -fr OPTED/

OPTED.v007.quickdic: dictlang := EN
XMLittre.v007.quickdic: dictlang := FR

OPTED_SOURCEDIR = OPTED/v003

# wb1913_a.html ... wb1913_z.html plus wb1913_new.html
OPTED_FILES = $(shell bash -c 'for l in {a..z} new; do echo $(OPTED_SOURCEDIR)/wb1913_$$l.html; done')

# The raw archive is not downloaded automatically: print the command to run
# and fail.
optedv003.hqx:
	echo Manually run that command to fetch the OPTED raw data
	echo wget 'http://www.mso.anu.edu.au/~ralph/OPTED/optedv003.hqx'
	false

$(OPTED_FILES): optedv003.hqx
	hexbin -d optedv003.hqx
	unar -f OPTED.sit.data
	find OPTED -type f | xargs sed -i 's/\r/\n/g'
	cd OPTED && patch -p1 < ../OPTED.patch

OPTED.tab_separated: $(OPTED_FILES) OPTED.patch

XMLittre.tab_separated: XMLittre-entree.xslt
XMLittre.tab_separated: ../xmlittre-data/

# The target must be spelled exactly like the prerequisite above
# (../xmlittre-data/, a sibling of this directory), otherwise make never
# selects this rule and the instructions are not shown.
../xmlittre-data/:
	echo Manually run that command to fetch XMLittre data
	echo cd $(CURDIR)/..
	echo git clone 'https://bitbucket.org/Mytskine/xmlittre-data.git'
	false
--- /dev/null
<?xml version="1.0" encoding="utf-8"?>
<!--
  Turns one <entree> element into an HTML fragment.

  Output order: selected header items, the optional "RÉSUMÉ" section, the
  body (<corps>), every <rubrique>, then the pronunciation.  Nodes that have
  no dedicated template are copied unchanged by the identity template.
-->
<xsl:stylesheet version="1.0"
                xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:output method="xml" version="1.0" encoding="UTF-8"
              indent="yes" standalone="no" omit-xml-declaration="yes"/>

  <!-- Identity template: copy anything not matched more specifically. -->
  <xsl:template match="@*|node()">
    <xsl:copy>
      <xsl:apply-templates select="@*|node()"/>
    </xsl:copy>
  </xsl:template>

  <xsl:template match="entree">
    <!-- Header items, in document order; the pronunciation is emitted last. -->
    <xsl:apply-templates
        select="entete/nature | entete/indent | entete/cit | entete/a | entete/semantique"/>

    <!-- Optional summary section. -->
    <xsl:if test="résumé">
      <p><b>RÉSUMÉ</b></p>
    </xsl:if>
    <xsl:apply-templates select="résumé/indent"/>
    <xsl:choose>
      <xsl:when test="count(résumé/variante) > 1">
        <xsl:apply-templates select="résumé" mode="plusieurs_variantes"/>
      </xsl:when>
      <xsl:when test="count(résumé/variante) = 1">
        <xsl:apply-templates select="résumé" mode="unique_variante"/>
      </xsl:when>
    </xsl:choose>
    <!-- The body only gets a heading when a summary precedes it. -->
    <xsl:if test="résumé">
      <p><b>ENTRÉE PRINCIPALE</b></p>
    </xsl:if>

    <!-- Body: a numbered list when it has several variants, a plain block
         otherwise. -->
    <xsl:choose>
      <xsl:when test="count(corps/variante) > 1">
        <xsl:apply-templates select="corps" mode="plusieurs_variantes"/>
      </xsl:when>
      <xsl:otherwise>
        <xsl:apply-templates select="corps" mode="unique_variante"/>
      </xsl:otherwise>
    </xsl:choose>

    <!-- Rendered by the "rubrique" template below. -->
    <xsl:apply-templates select="rubrique"/>

    <xsl:if test="entete/prononciation">
      <p><b>PRONONCIATION</b></p>
      <div>
        <xsl:apply-templates select="entete/prononciation/node()"/>
      </div>
    </xsl:if>
  </xsl:template>

  <xsl:template match="nature">
    <i><xsl:value-of select="text()"/></i>
  </xsl:template>

  <xsl:template match="indent">
    <div style="padding-left: 1em"><xsl:apply-templates select="node()"/></div>
  </xsl:template>

  <!-- Several variants: one list item per <variante>; other children of the
       matched element are not output. -->
  <xsl:template match="corps|résumé" mode="plusieurs_variantes">
    <ol>
      <xsl:for-each select="variante">
        <li><xsl:apply-templates select="node()"/></li>
      </xsl:for-each>
    </ol>
  </xsl:template>

  <!-- At most one variant: output all children inside a single block. -->
  <xsl:template match="corps|résumé" mode="unique_variante">
    <div><xsl:apply-templates select="node()"/></div>
  </xsl:template>

  <!-- Quotation, followed by its reference and author when non-empty. -->
  <xsl:template match="cit">
    <p><q><xsl:apply-templates select="node()"/></q>
      <xsl:if test="string-length(@ref)>0">, <i><xsl:value-of select="@ref"/></i></xsl:if>
      <xsl:if test="string-length(@aut)>0">, <xsl:value-of select="@aut"/></xsl:if>
    </p>
  </xsl:template>

  <!-- Section titled by its "nom" attribute. -->
  <xsl:template match="rubrique">
    <p><b><xsl:value-of select="@nom"/></b></p>
    <xsl:apply-templates select="node()"/>
  </xsl:template>

  <xsl:template match="exemple">
    <q><xsl:apply-templates select="node()"/></q>
  </xsl:template>

  <!-- Transparent wrappers: keep the content, drop the element itself. -->
  <xsl:template match="semantique">
    <xsl:apply-templates select="node()"/>
  </xsl:template>
  <xsl:template match="span">
    <xsl:apply-templates select="node()"/>
  </xsl:template>

</xsl:stylesheet>