From 5da87a9ec2370bb2f7ce11e107f07625e42f7171 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Fr=C3=A9d=C3=A9ric=20Perrin?=
Date: Sun, 31 Jan 2021 00:57:09 +0000
Subject: [PATCH] Convert the OPTED dictionary for use on Tolino devices

---
 Makefile               |  51 ++++++++++++++++++++
 OPTED.patch            |  76 ++++++++++++++++++++++++++++
 OPTED_to_chemnitz      |  89 +++++++++++++++++++++++++++++++++
 OPTED_to_tab_separated | 109 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 325 insertions(+)
 create mode 100644 Makefile
 create mode 100644 OPTED.patch
 create mode 100755 OPTED_to_chemnitz
 create mode 100755 OPTED_to_tab_separated

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..3fc8c53
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,51 @@
+# Build QuickDic dictionaries out of the OPTED HTML files.
+# The conversion to .quickdic relies on the scripts in ../DictionaryPC.
+
+OPTED_SOURCEDIR = OPTED/v003
+
+# wb1913_a.html ... wb1913_z.html, plus wb1913_new.html
+OPTED_FILES = $(addprefix $(OPTED_SOURCEDIR)/wb1913_,$(addsuffix .html,$(shell bash -c 'echo {a..z} new')))
+
+.PHONY: all essai clean
+
+# Default goal; it has to stay the first target of the file.
+all: OPTED.v006-from-tab_separated.quickdic
+
+# Debugging aid: print the list of OPTED source files.
+essai:
+	@echo $(OPTED_FILES)
+
+# optedv003.hqx:
+#	wget 'http://www.mso.anu.edu.au/~ralph/OPTED/optedv003.hqx'
+
+# Unpack the archive, turn CR into LF, then apply the local corrections.
+$(OPTED_FILES): optedv003.hqx
+	hexbin -d $<
+	unar OPTED.sit.data
+	find OPTED -type f | xargs sed -i 's/\r/\n/g'
+	cd OPTED && patch -p1 < ../OPTED.patch
+
+OPTED.tab_separated: $(OPTED_FILES)
+	./OPTED_to_tab_separated
+
+OPTED.v007-from-tab_separated.quickdic: OPTED.tab_separated
+	cd ../DictionaryPC && ./run.sh --dictInfo="Webster's Unabridged Dictionary, from the OPTED project" --input1=$(CURDIR)/$< --input1Charset=UTF8 --input1Format=tab_separated --input1Name="Webster-OPTED" --lang1=EN --lang1Stoplist=data/inputs/stoplists/en.txt --dictOut=$(CURDIR)/$@ --print=$(CURDIR)/$@.txt
+
+OPTED.v006-from-tab_separated.quickdic: OPTED.v007-from-tab_separated.quickdic
+	rm -f $@
+	cd ../DictionaryPC && ./convert_to_v6.sh $(CURDIR)/$< $(CURDIR)/$@
+
+# OPTED_to_chemnitz reads every OPTED file, hence the dependency on all of them.
+OPTED.chemnitz: $(OPTED_FILES)
+	./OPTED_to_chemnitz
+
+OPTED.v007-from-chemnitz.quickdic: OPTED.chemnitz
+	cd ../DictionaryPC && ./run.sh --dictInfo="Webster's Unabridged Dictionary, from the OPTED project" --input1=$(CURDIR)/$< --input1Charset=UTF8 --input1Format=chemnitz --input1Name="Webster-OPTED" --lang1=EN --lang1Stoplist=data/inputs/stoplists/en.txt --dictOut=$(CURDIR)/$@ --print=$(CURDIR)/$@.txt
+
+OPTED.v006-from-chemnitz.quickdic: OPTED.v007-from-chemnitz.quickdic
+	rm -f $@
+	cd ../DictionaryPC && ./convert_to_v6.sh $(CURDIR)/$< $(CURDIR)/$@
+
+clean:
+	rm -f OPTED.*.quickdic OPTED.tab_separated OPTED.chemnitz
+	rm -fr OPTED/
diff --git a/OPTED.patch b/OPTED.patch
new file mode 100644
index 0000000..6298b03
--- /dev/null
+++ b/OPTED.patch
@@ -0,0 +1,76 @@
+diff '--exclude=*~' -U0 -r OPTED/v003/wb1913_f.html OPTED.mine/v003/wb1913_f.html
+--- OPTED/v003/wb1913_f.html 2021-01-31 00:20:51.926049448 +0000
++++ OPTED.mine/v003/wb1913_f.html 2021-01-30 23:18:15.986529554 +0000
+@@ -5148 +5147,0 @@
+-

+diff '--exclude=*~' -U0 -r OPTED/v003/wb1913_r.html OPTED.mine/v003/wb1913_r.html +--- OPTED/v003/wb1913_r.html 2021-01-31 00:20:51.870050835 +0000 ++++ OPTED.mine/v003/wb1913_r.html 2021-01-30 23:18:16.058528321 +0000 +@@ -2674 +2674,2 @@ +-

Reenforce (v.) That part of a cannon near the breech which is thicker than the rest of the piece, so as better to resist the force of the exploding powder. See Illust. of Cannon.

Reenforce (v.) (b)

Reenforce (v.) An additional thickness of canvas, cloth, or the like, around an eyelet, buttonhole, etc.

++

Reenforce (v.) That part of a cannon near the breech which is thicker than the rest of the piece, so as better to resist the force of the exploding powder. See Illust. of Cannon.

++

Reenforce (v.) An additional thickness of canvas, cloth, or the like, around an eyelet, buttonhole, etc.

+diff '--exclude=*~' -U0 -r OPTED/v003/wb1913_s.html OPTED.mine/v003/wb1913_s.html +--- OPTED/v003/wb1913_s.html 2021-01-31 00:20:51.894050242 +0000 ++++ OPTED.mine/v003/wb1913_s.html 2021-01-30 23:23:25.685218008 +0000 +@@ -245,7 +245,7 @@ +-

Sad (supperl.) Sated; satisfied; weary; tired.

+-

Sad (supperl.) Heavy; weighty; ponderous; close; hard.

+-

Sad (supperl.) Dull; grave; dark; somber; -- said of colors.

+-

Sad (supperl.) Serious; grave; sober; steadfast; not light or frivolous.

+-

Sad (supperl.) Affected with grief or unhappiness; cast down with affliction; downcast; gloomy; mournful.

+-

Sad (supperl.) Afflictive; calamitous; causing sorrow; as, a sad accident; a sad misfortune.

+-

Sad (supperl.) Hence, bad; naughty; troublesome; wicked.

++

Sad (superl.) Sated; satisfied; weary; tired.

++

Sad (superl.) Heavy; weighty; ponderous; close; hard.

++

Sad (superl.) Dull; grave; dark; somber; -- said of colors.

++

Sad (superl.) Serious; grave; sober; steadfast; not light or frivolous.

++

Sad (superl.) Affected with grief or unhappiness; cast down with affliction; downcast; gloomy; mournful.

++

Sad (superl.) Afflictive; calamitous; causing sorrow; as, a sad accident; a sad misfortune.

++

Sad (superl.) Hence, bad; naughty; troublesome; wicked.

+@@ -12942,2 +12942 @@ +-

Spilikin (n.) One of a number of small pieces or pegs of wood, ivory, bone, or other material, for playing a game, or for counting the score in a game, as in cribbage. In the plural (spilikins

spilikins (pl. ) of Spilikin

+-), a game played with such pieces; pushpin.

++

Spilikin (n.) One of a number of small pieces or pegs of wood, ivory, bone, or other material, for playing a game, or for counting the score in a game, as in cribbage. In the plural (spilikins), a game played with such pieces; pushpin.

+diff '--exclude=*~' -U0 -r OPTED/v003/wb1913_f.html OPTED.mine/v003/wb1913_f.html +--- OPTED/v003/wb1913_f.html 2021-01-31 12:11:13.664266778 +0000 ++++ OPTED.mine/v003/wb1913_f.html 2021-01-31 12:04:23.084611285 +0000 +@@ -4950 +4950 @@ +-

Forewent 2 (imp.) of Forego

++

Forewent (imp.) of Forego

+diff '--exclude=*~' -U0 -r OPTED/v003/wb1913_g.html OPTED.mine/v003/wb1913_g.html +--- OPTED/v003/wb1913_g.html 2021-01-31 12:11:13.660266621 +0000 ++++ OPTED.mine/v003/wb1913_g.html 2021-01-31 12:10:32.334605895 +0000 +@@ -4288 +4288 @@ +-

\d8Gregarin\91 (n. pl.) An order of Protozoa, allied to the Rhizopoda, and parasitic in other animals, as in the earthworm, lobster, etc. When adult, they have a small, wormlike body inclosing a nucleus, but without external organs; in one of the young stages, they are amoebiform; -- called also Gregarinida, and Gregarinaria.

++

GregarinĀ¾ (n. pl.) An order of Protozoa, allied to the Rhizopoda, and parasitic in other animals, as in the earthworm, lobster, etc. When adult, they have a small, wormlike body inclosing a nucleus, but without external organs; in one of the young stages, they are amoebiform; -- called also Gregarinida, and Gregarinaria.

+@@ -4291 +4291 @@ +-

\d8Gregarinida () Gregarinae.

++

Gregarinida () Gregarinae.

+diff '--exclude=*~' -U0 -r OPTED/v003/wb1913_s.html OPTED.mine/v003/wb1913_s.html +--- OPTED/v003/wb1913_s.html 2021-01-31 12:11:13.672267087 +0000 ++++ OPTED.mine/v003/wb1913_s.html 2021-01-31 12:03:57.578461661 +0000 +@@ -1804,2 +1804,3 @@ +-

, a , or an . PCP. It is presumably an older spelling of scanned. --2. () Specifically (Pros.), to go through with, as a verse, marking and distinguishing the feet of which it is composed; to show, in reading, the metrical structure of; to recite metrically.

+-

, a , or an . PCP. It is presumably an older spelling of scanned. --2. Specifically (Pros.), to go through with, as a verse, marking and distinguishing the feet of which it is composed; to show, in reading, the metrical structure of; to recite metrically () To go over and examine point by point; to examine with care; to look closely at or into; to scrutinize.

++

Scan It is presumably an older spelling of scanned. Specifically (Pros.), to go through with, as a verse, marking and distinguishing the feet of which it is composed; to show, in reading, the metrical structure of; to recite metrically.

++

Scan It is presumably an older spelling of scanned. Specifically (Pros.), to go through with, as a verse, marking and distinguishing the feet of which it is composed; to show, in reading, the metrical structure of; to recite metrically

++

Scan () To go over and examine point by point; to examine with care; to look closely at or into; to scrutinize.

+diff '--exclude=*~' -U0 -r OPTED/v003/wb1913_w.html OPTED.mine/v003/wb1913_w.html +--- OPTED/v003/wb1913_w.html 2021-01-31 12:11:13.596264143 +0000 ++++ OPTED.mine/v003/wb1913_w.html 2021-01-31 12:04:39.293924871 +0000 +@@ -3286 +3286 @@ +-

Winnard 2 (n.) The redwing.

++

Winnard (n.) The redwing.

+diff '--exclude=*~' -U0 -r OPTED/v003/wb1913_p.html OPTED.mine/v003/wb1913_p.html +--- OPTED/v003/wb1913_p.html 2021-01-31 12:12:06.466217986 +0000 ++++ OPTED.mine/v003/wb1913_p.html 2021-01-31 12:14:29.514677672 +0000 +@@ -8283 +8283 @@ +-

Poach (v. t.) To stab; to pierce; to spear, \as fish.

++

Poach (v. t.) To stab; to pierce; to spear, as fish.

+diff '--exclude=*~' -U0 -r OPTED/v003/wb1913_c.html OPTED.mine/v003/wb1913_c.html +--- OPTED/v003/wb1913_c.html 2021-01-31 12:34:47.541056945 +0000 ++++ OPTED.mine/v003/wb1913_c.html 2021-01-31 12:35:33.646765527 +0000 +@@ -12084 +12084 @@ +-

Convict1ible (a.) Capable of being convicted.

++

Convictible (a.) Capable of being convicted.

diff --git a/OPTED_to_chemnitz b/OPTED_to_chemnitz
new file mode 100755
index 0000000..87b1773
--- /dev/null
+++ b/OPTED_to_chemnitz
@@ -0,0 +1,127 @@
+#!/usr/bin/python3
+"""Convert the OPTED HTML files into a chemnitz-format dictionary.
+
+Reads every OPTED/v003/wb1913_*.html file and writes one
+"headword :: definition" line per entry to OPTED.chemnitz.
+"""
+
+from html.parser import HTMLParser
+from glob import glob
+
+
+class OptedFormatError(ValueError):
+    """Raised when the OPTED markup does not follow the expected layout."""
+
+
+class OptedParser(HTMLParser):
+    """Stream OPTED entries to a file in chemnitz format.
+
+    An entry starts with its headword in <b>...</b>, may carry a part of
+    speech in <i>...</i>, and ends at </p>.  The remaining text between
+    </b> and </p> is the definition.
+    """
+
+    def __init__(self, outputfile):
+        """Create a parser that writes its entries to *outputfile*."""
+        super().__init__()
+        self.reading_entry = False
+        self.reading_pos = False
+        self.reading_definition = False
+        self.entry = ""
+        self.pos = ""
+        self.definition = ""
+        self.outputfile = outputfile
+
+    def _require(self, condition, message):
+        """Raise OptedFormatError unless *condition* holds.
+
+        Exceptions are raised explicitly, rather than through assert, so
+        that the checks are still performed when Python runs with -O.
+        """
+        if not condition:
+            raise OptedFormatError(
+                f"{message} (while parsing entry {self.entry!r})")
+
+    def handle_starttag(self, tag, attrs):
+        """Start reading a headword on <b>, a part of speech on <i>."""
+        if tag == "b":
+            self._require(
+                not (self.reading_entry or self.reading_pos
+                     or self.reading_definition or self.entry),
+                "<b> found before the previous entry was closed")
+            self.reading_entry = True
+
+        elif tag == "i":
+            self._require(
+                not (self.reading_entry or self.reading_pos or self.pos),
+                "unexpected <i>")
+            self.reading_pos = True
+
+    def handle_data(self, data):
+        """Append *data* to whichever part of the entry is being read."""
+        if self.reading_entry:
+            self.entry += data.lower()
+        elif self.reading_pos:
+            self.pos += data
+        elif self.reading_definition:
+            self.definition += data
+
+    def handle_endtag(self, tag):
+        """Close the headword or part of speech; write the entry on </p>."""
+        if tag == "b":
+            self._require(
+                self.reading_entry
+                and not (self.reading_definition or self.reading_pos),
+                "unexpected </b>")
+            self.reading_entry = False
+            self.reading_definition = True
+
+        elif tag == "i":
+            self._require(
+                self.reading_pos and self.reading_definition
+                and not self.reading_entry,
+                "unexpected </i>")
+            self.reading_pos = False
+
+        elif tag == "p":
+            self._require(
+                self.reading_definition
+                and not (self.reading_entry or self.reading_pos),
+                "unexpected </p>")
+            self._require(self.entry, "entry without a headword")
+            self._require(self.definition, "entry without a definition")
+            # "::" separates the two fields of an output line; "|" is
+            # presumably special to the chemnitz format too.
+            for text in (self.entry, self.definition):
+                self._require("::" not in text and "|" not in text,
+                              "'::' or '|' in the entry text")
+
+            definition = self.definition
+            if self.pos:
+                definition = f"{self.entry} ({self.pos}) {definition}"
+            # Remove empty "()" pairs, then collapse runs of whitespace so
+            # that the entry stays on a single line.
+            definition = " ".join(definition.replace("()", "").split())
+
+            self.outputfile.write(f"{self.entry} :: {definition}\n")
+            self.reading_definition = False
+            self.entry = ""
+            self.pos = ""
+            self.definition = ""
+
+
+def main():
+    """Convert every OPTED HTML file into OPTED.chemnitz."""
+    with open("OPTED.chemnitz", "w", encoding="utf-8") as outputfile:
+        opted_parser = OptedParser(outputfile)
+        for fname in sorted(glob("OPTED/v003/wb1913_*.html")):
+            print(f"Running on {fname}")
+            with open(fname, mode="r",
+                      encoding="macroman", errors="strict") as inputfile:
+                opted_parser.feed(inputfile.read())
+        # Process any text the parser is still buffering.
+        opted_parser.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/OPTED_to_tab_separated b/OPTED_to_tab_separated
new file mode 100755
index 0000000..479b3f0
--- /dev/null
+++ b/OPTED_to_tab_separated
@@ -0,0 +1,151 @@
+#!/usr/bin/python3
+"""Convert the OPTED HTML files into a tab-separated dictionary.
+
+Reads every OPTED/v003/wb1913_*.html file and writes one line per
+headword, "headword<TAB>definition(s)", to OPTED.tab_separated.
+"""
+
+from html.parser import HTMLParser
+from glob import glob
+
+
+class OptedFormatError(ValueError):
+    """Raised when the OPTED markup does not follow the expected layout."""
+
+
+class OptedParser(HTMLParser):
+    """Collect OPTED entries, grouping the definitions by headword.
+
+    An entry starts with its headword in <b>...</b>, may carry a part of
+    speech in <i>...</i>, and ends at </p>.  The remaining text between
+    </b> and </p> is the definition.
+    """
+
+    def __init__(self):
+        """Create a parser that has not collected any entry yet."""
+        super().__init__()
+        self.reading_headword = False
+        self.reading_pos = False
+        self.reading_definition = False
+        self.headword = ""
+        self.pos = ""
+        self.definition = ""
+        # Maps each headword to the list of its definitions.
+        self.entries = {}
+
+    def _require(self, condition, message):
+        """Raise OptedFormatError unless *condition* holds.
+
+        Exceptions are raised explicitly, rather than through assert, so
+        that the checks are still performed when Python runs with -O.
+        """
+        if not condition:
+            raise OptedFormatError(
+                f"{message} (while parsing entry {self.headword!r})")
+
+    def _add_definition(self, headword, definition):
+        """Record *definition* as one more definition of *headword*."""
+        self.entries.setdefault(headword, []).append(definition)
+
+    def write_definitions(self, outputfile):
+        """Write every collected headword to *outputfile*, one per line.
+
+        A headword with several definitions gets them as an HTML ordered
+        list.  Headwords containing a space or a dash are reported on
+        standard output.
+        """
+        for headword, definitions in self.entries.items():
+            if len(definitions) == 1:
+                outputfile.write(f"{headword}\t{definitions[0]}\n")
+            else:
+                items = "".join(f"<li>{entry}</li>" for entry in definitions)
+                outputfile.write(f"{headword}\t<ol>{items}</ol>\n")
+            if " " in headword or "-" in headword:
+                print(f"<{headword}> has space or dash")
+
+    def handle_starttag(self, tag, attrs):
+        """Start reading a headword on <b>, a part of speech on <i>."""
+        if tag == "b":
+            self._require(
+                not (self.reading_headword or self.reading_pos
+                     or self.reading_definition or self.headword),
+                "<b> found before the previous entry was closed")
+            self.reading_headword = True
+
+        elif tag == "i":
+            self._require(
+                not (self.reading_headword or self.reading_pos or self.pos),
+                "unexpected <i>")
+            self.reading_pos = True
+
+    def handle_data(self, data):
+        """Append *data* to whichever part of the entry is being read."""
+        if self.reading_headword:
+            self.headword += data.lower()
+        elif self.reading_pos:
+            self.pos += data
+        elif self.reading_definition:
+            self.definition += data
+
+    def handle_endtag(self, tag):
+        """Close the headword or part of speech; store the entry on </p>."""
+        if tag == "b":
+            self._require(
+                self.reading_headword
+                and not (self.reading_definition or self.reading_pos),
+                "unexpected </b>")
+            self.reading_headword = False
+            self.reading_definition = True
+
+        elif tag == "i":
+            self._require(
+                self.reading_pos and self.reading_definition
+                and not self.reading_headword,
+                "unexpected </i>")
+            self.reading_pos = False
+
+        elif tag == "p":
+            self._require(
+                self.reading_definition
+                and not (self.reading_headword or self.reading_pos),
+                "unexpected </p>")
+            self._require(self.headword, "entry without a headword")
+            self._require(self.definition, "entry without a definition")
+            self._require("\\" not in self.headword, "backslash in headword")
+
+            definition = self.definition
+            if self.pos:
+                definition = f"({self.pos}) {definition}"
+            # Remove empty "()" pairs, then collapse runs of whitespace so
+            # that no tab or newline ends up inside the definition.
+            definition = " ".join(definition.replace("()", "").split())
+
+            if any(char in "0123456789" for char in self.headword):
+                print(f"Warning: {self.headword} has digits")
+
+            self._add_definition(self.headword, definition)
+            self.reading_definition = False
+            self.headword = ""
+            self.pos = ""
+            self.definition = ""
+
+
+def main():
+    """Convert every OPTED HTML file into OPTED.tab_separated."""
+    opted_parser = OptedParser()
+    # Sorted so that the output does not depend on the order in which
+    # the file system happens to list the files.
+    for fname in sorted(glob("OPTED/v003/wb1913_*.html")):
+        print(f"Running on {fname}")
+        with open(fname, mode="r",
+                  encoding="macroman", errors="strict") as inputfile:
+            opted_parser.feed(inputfile.read())
+    # Process any text the parser is still buffering.
+    opted_parser.close()
+
+    with open("OPTED.tab_separated", "w", encoding="utf-8") as outputfile:
+        opted_parser.write_definitions(outputfile)
+
+
+if __name__ == "__main__":
+    main()
-- 
2.43.0