From 028a6accf7d7d721797482d65dd4dc9d13611840 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fr=C3=A9d=C3=A9ric=20Perrin?= Date: Sat, 6 Feb 2021 11:09:46 +0000 Subject: [PATCH] snapshot of current effort --- GCIDE_to_tab_separated | 173 ++++++++++++++++++++++++++++++++++++++ Makefile | 64 +++++++------- OPTED-dictinfo.txt | 9 ++ OPTED_to_tab_separated | 2 +- README.txt | 14 +++ XMLittre-entree.xslt | 99 ++++++++++++++++++++++ XMLittre-stoplist.txt | 45 ++++++++++ XMLittre_to_tab_separated | 48 +++++++++++ 8 files changed, 424 insertions(+), 30 deletions(-) create mode 100755 GCIDE_to_tab_separated create mode 100644 OPTED-dictinfo.txt create mode 100644 README.txt create mode 100644 XMLittre-entree.xslt create mode 100644 XMLittre-stoplist.txt create mode 100755 XMLittre_to_tab_separated diff --git a/GCIDE_to_tab_separated b/GCIDE_to_tab_separated new file mode 100755 index 0000000..99a4973 --- /dev/null +++ b/GCIDE_to_tab_separated @@ -0,0 +1,173 @@ +#!/usr/bin/python3 + +import lxml.etree +import re + +entity_map = { + "
", + "&": "&", + "", replace_fake_comments, rawtext, flags=re.DOTALL) + rawtext = f"" + rawtext + f"" + for entity, char in entity_map.items(): + rawtext = rawtext.replace(entity, char) + print(rawtext.splitlines()[5724:5730]) + e = lxml.etree.XML(rawtext) diff --git a/Makefile b/Makefile index 3fc8c53..d3ebd70 100644 --- a/Makefile +++ b/Makefile @@ -1,41 +1,47 @@ -OPTED_SOURCEDIR = OPTED/v003 +all: OPTED.v006.quickdic XMLittre.v006.quickdic -OPTED_FILES = $(addprefix $(OPTED_SOURCEDIR)/wb1913_,$(addsuffix .html,$(shell bash -c 'echo {a..z} new'))) +%.tab_separated: %_to_tab_separated + ./$*_to_tab_separated -essai: - @echo $(FILES) +%.v007.quickdic: %.tab_separated + echo $(dictlang) + [ ! -z $(dictlang) ] + cd ../DictionaryPC && ./run.sh --dictInfo=@$(CURDIR)/$*-dictinfo.txt --input1Charset=UTF8 --input1Format=tab_separated --input1Name=$* --lang1=$(dictlang) --lang1Stoplist=$(CURDIR)/$*-stoplist.txt --input1=$(CURDIR)/$< --dictOut=$(CURDIR)/$@ --print=$(CURDIR)/$@.txt -all: OPTED.v006-from-tab_separated.quickdic +%.v006.quickdic: %.v007.quickdic + rm -f $@ + cd ../DictionaryPC && ./convert_to_v6.sh $(CURDIR)/$< $(CURDIR)/$@ -# optedv003.hqx: -# wget 'http://www.mso.anu.edu.au/~ralph/OPTED/optedv003.hqx' +clean: + rm -f *.tab_separated *.chemnitz + rm -f *.quickdic *.quickdic.txt + rm -fr OPTED/ -$(OPTED_FILES): optedv003.hqx - hexbin -d $< - unar OPTED.sit.data - find OPTED -type f | xargs sed -i 's/\r/\n/g' - cd OPTED && patch -p1 < ../OPTED.patch +OPTED.v007.quickdic: dictlang := EN +XMLittre.v007.quickdic: dictlang := FR -OPTED.tab_separated: $(OPTED_FILES) - ./OPTED_to_tab_separated +OPTED_SOURCEDIR = OPTED/v003 -OPTED.v007-from-tab_separated.quickdic: OPTED.tab_separated - cd ../DictionaryPC && ./run.sh --dictInfo="Webster's Unabridged Dictionary, from the OPTED project" --input1=$(CURDIR)/$< --input1Charset=UTF8 --input1Format=tab_separated --input1Name="Webster-OPTED" --lang1=EN --lang1Stoplist=data/inputs/stoplists/en.txt --dictOut=$(CURDIR)/$@ 
--print=$(CURDIR)/$@.txt +OPTED_FILES = $(shell bash -c 'for l in {a..z} new; do echo $(OPTED_SOURCEDIR)/wb1913_$$l.html; done') -OPTED.v006-from-tab_separated.quickdic: OPTED.v007-from-tab_separated.quickdic - rm -f $@ - cd ../DictionaryPC && ./convert_to_v6.sh $(CURDIR)/$< $(CURDIR)/$@ +optedv003.hqx: + echo Manually run that command to fetch the OPTED raw data + echo wget 'http://www.mso.anu.edu.au/~ralph/OPTED/optedv003.hqx' + false -OPTED.chemnitz: OPTED/v003/wb1913_a.html - ./OPTED_to_chemnitz +$(OPTED_FILES): optedv003.hqx + hexbin -d optedv003.hqx + unar -f OPTED.sit.data + find OPTED -type f | xargs sed -i 's/\r/\n/g' + cd OPTED && patch -p1 < ../OPTED.patch -OPTED.v007-from-chemnitz.quickdic: OPTED.chemnitz - cd ../DictionaryPC && ./run.sh --dictInfo="Webster's Unabridged Dictionary, from the OPTED project" --input1=$(CURDIR)/$< --input1Charset=UTF8 --input1Format=chemnitz --input1Name="Webster-OPTED" --lang1=EN --lang1Stoplist=data/inputs/stoplists/en.txt --dictOut=$(CURDIR)/$@ --print=$(CURDIR)/$@.txt +OPTED.tab_separated: $(OPTED_FILES) OPTED.patch -OPTED.v006-from-chemnitz.quickdic: OPTED.v007-from-chemnitz.quickdic - rm -f $@ - cd ../DictionaryPC && ./convert_to_v6.sh $(CURDIR)/$< $(CURDIR)/$@ +XMLittre.tab_separated: XMLittre-entree.xslt +XMLittre.tab_separated: ../xmlittre-data/ -clean: - rm -f OPTED.*.quickdic OPTED.tab_separated OPTED.chemnitz - rm -fr OPTED/ +../xmlittre-data/: # must be spelled exactly like the XMLittre.tab_separated prerequisite, else this help rule never fires + echo Manually run that command to fetch XMLittre data + echo cd $(CURDIR)/.. + echo git clone 'https://bitbucket.org/Mytskine/xmlittre-data.git' + false diff --git a/OPTED-dictinfo.txt b/OPTED-dictinfo.txt new file mode 100644 index 0000000..8f9558b --- /dev/null +++ b/OPTED-dictinfo.txt @@ -0,0 +1,9 @@ +The Online Plain Text English Dictionary is a public domain English word +list dictionary, based on the public domain portion of "The Project +Gutenberg Etext of Webster's Unabridged Dictionary" which is in turn +based on the 1913 US Webster's Unabridged Dictionary. 
+ +This version has been extensively stripped down and set out as one +definition per line by Ralph Sutherland. + +Version Quickdic prepared by Frédéric Perrin. diff --git a/OPTED_to_tab_separated b/OPTED_to_tab_separated index 479b3f0..7e99a6c 100755 --- a/OPTED_to_tab_separated +++ b/OPTED_to_tab_separated @@ -95,7 +95,7 @@ class OptedParser(HTMLParser): def main(): opted_parser = OptedParser() - for fname in glob("OPTED/v003/wb1913_*.html"): + for fname in sorted(glob("OPTED/v003/wb1913_*.html")): print(f"Running on {fname}") with open(fname, mode="r", encoding="macroman", errors="strict") as inputfile: diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..108c28d --- /dev/null +++ b/README.txt @@ -0,0 +1,14 @@ +http://www.gutenberg.org/files/29765/29765-8.zip + +OPTED http://www.mso.anu.edu.au/~ralph/OPTED/optedv003.hqx +Decode with "hexbin -d optedv003.hqx" then "unar -forks skip OPTED.sit.data" +hexbin from package macutils ; unar from package of the same name. + +./run.sh --dictInfo="Webster-OPTED" --dictOut=OPTED.v007.quickdic --input1=../dictionaries/OPTED.tab_separated --input1Charset=UTF8 --input1Format=tab_separated --input1Name="Webster-OPTED" --lang1=EN --lang2=EN +./run.sh --dictInfo="Webster's Unabridged Dictionary, from the OPTED project" --dictOut=OPTED.v007.quickdic --input1=../dictionaries/OPTED.chemnitz --input1Charset=UTF8 --input1Format=chemnitz --input1Name="Webster-OPTED" --lang1=EN --lang1Stoplist=data/inputs/stoplists/en.txt +./convert_to_v6.sh OPTED.v007.quickdic OPTED.v006.quickdic + + +ftp://ftp.gnu.org/gnu/gcide/gcide-0.52.tar.xz + +https://www.ebooksgratuits.com/details.php?book=1609 diff --git a/XMLittre-entree.xslt b/XMLittre-entree.xslt new file mode 100644 index 0000000..97c1b1e --- /dev/null +++ b/XMLittre-entree.xslt @@ -0,0 +1,99 @@ + + + + + + + + + + + + + + + + +

RÉSUMÉ

+
+ + + + + + + + + + + + +

ENTRÉE PRINCIPALE

+
+ + + + + + + + + + + +

+ +
+ + +

PRONONCIATION

+
+ +
+
+
+ + + + + +
+
+ + +
    + +
  1. +
    +
+
+ + +
+
+ + +

+ , + , +

+
+ + +

+ +
+ + + + + + + + + + + +
diff --git a/XMLittre-stoplist.txt b/XMLittre-stoplist.txt new file mode 100644 index 0000000..1008221 --- /dev/null +++ b/XMLittre-stoplist.txt @@ -0,0 +1,45 @@ +ou +euse +ante +ale +ive +ienne +ie +s +ue +trice +ente +elle +ine +enne +et +es +ette +aine +ite +ise +arde +onne +ane +de +se +aite +oise +aude +einte +ate +erte +la +in +use +d +eure +ote +eule +l +un +en +ecte +uite +une +simplement diff --git a/XMLittre_to_tab_separated b/XMLittre_to_tab_separated new file mode 100755 index 0000000..0c041d9 --- /dev/null +++ b/XMLittre_to_tab_separated @@ -0,0 +1,48 @@ +#!/usr/bin/python3 + +from glob import glob +from lxml import etree + +xslt_entree = etree.XSLT(etree.parse("XMLittre-entree.xslt")) + +class Parser: + def __init__(self): + self.entrees = {} + + def parse_file(self, fname): + fxml = etree.parse(fname) + root = fxml.getroot() + for entree in root: # iterate the children directly; getchildren() is deprecated in lxml + assert entree.tag == "entree" + terme = entree.attrib["terme"] + entree_html = xslt_entree(entree) + entree_text = str(entree_html) + entree_text = entree_text.replace("\n", "") + if terme not in self.entrees: + self.entrees[terme] = [] + self.entrees[terme].append(entree_text) + + def writeout(self, fname): + with open(fname, "w", encoding="utf-8") as f: # the Makefile feeds this file to DictionaryPC with --input1Charset=UTF8 + for terme in self.entrees: + f.write(terme) + f.write("\t") + if len(self.entrees[terme]) > 1: + f.write("
    ") + for entree in self.entrees[terme]: + f.write("
  1. ") + f.write(entree) + f.write("
  2. ") + f.write("
") + else: + f.write(self.entrees[terme][0]) + f.write("\n") + +def main(): + p = Parser() + for fname in sorted(glob("../xmlittre-data/?.xml")): # sorted so entries are written in a reproducible order + p.parse_file(fname) + p.writeout("XMLittre.tab_separated") + +if __name__ == "__main__": + main() -- 2.43.0