From a2ef5759504df493d5d5968223984cc81aeb768b Mon Sep 17 00:00:00 2001
From: =?utf8?q?Reimar=20D=C3=B6ffinger?=
Date: Thu, 27 Aug 2015 18:05:57 +0200
Subject: [PATCH] Add script to help with dictionary generation.

Also update en and sv stoplists and parse the Spanish
wiktionary a bit.
---
 data/inputs/stoplists/en.txt                       |  1 +
 data/inputs/stoplists/sv.txt                       | 50 +++++++++++++++++++
 generate_dictionaries.sh                           | 11 ++++
 .../parser/wiktionary/WiktionaryLangs.java         |  6 ++-
 4 files changed, 67 insertions(+), 1 deletion(-)
 create mode 100644 data/inputs/stoplists/sv.txt
 create mode 100755 generate_dictionaries.sh

diff --git a/data/inputs/stoplists/en.txt b/data/inputs/stoplists/en.txt
index 9f96dce..a91a6d1 100644
--- a/data/inputs/stoplists/en.txt
+++ b/data/inputs/stoplists/en.txt
@@ -52,3 +52,4 @@ sth
 sb
 at
 it
+not
diff --git a/data/inputs/stoplists/sv.txt b/data/inputs/stoplists/sv.txt
new file mode 100644
index 0000000..f89cd28
--- /dev/null
+++ b/data/inputs/stoplists/sv.txt
@@ -0,0 +1,50 @@
+i
+och
+en
+på
+är
+att
+det
+som
+till
+har
+av
+med
+för
+den
+inte
+jag
+ett
+Jag
+du
+om
+sig
+Det
+var
+de
+så
+gå
+upp
+mig
+vi
+han
+ta
+där
+Han
+in
+e
+här
+ha
+dig
+man
+Den
+kan
+De
+ut
+över
+men
+hade
+eller
+min
+nu
+sin
diff --git a/generate_dictionaries.sh b/generate_dictionaries.sh
new file mode 100755
index 0000000..dec94b9
--- /dev/null
+++ b/generate_dictionaries.sh
@@ -0,0 +1,11 @@
+#./run.sh --lang1=EN --dictOut=test --dictInfo=test --input0=data/inputs/wikiSplit/en/EN.data --input0Name=enwikitionary --input0Format=enwiktionary --input0LangPattern=English --input0LangCodePattern=en --input0EnIndex=1 --input0WiktionaryType=EnEnglish
+while read -r langcode langname ; do
+lang=$(echo "$langcode" | tr '[a-z]' '[A-Z]')
+test "$lang" = "CY" && lang=CI
+stoplist=""
+test -e "data/inputs/stoplists/${langcode}.txt" && stoplist="--lang2Stoplist=data/inputs/stoplists/${langcode}.txt"
+./run.sh --lang1=EN --lang2="$lang" --lang1Stoplist=data/inputs/stoplists/en.txt ${stoplist:+"$stoplist"} --dictOut="data/outputs/EN-${lang}.quickdic" --dictInfo="(EN)Wiktionary-based EN-$lang dictionary." --input0="data/inputs/wikiSplit/en/${lang}.data" --input0Name=enwikitionary --input0Format=enwiktionary --input0LangPattern="${langname}" --input0LangCodePattern="${langcode}" --input0EnIndex=1 --input0WiktionaryType=EnForeign --input1=data/inputs/wikiSplit/en/EN.data --input1Name=enwikitionary --input1Format=enwiktionary --input1LangPattern="${langname}" --input1LangCodePattern="${langcode}" --input1EnIndex=1 --input1WiktionaryType=EnToTranslation
+rm -f "data/outputs/EN-${lang}.quickdic.v006.zip"
+7z a -mx=9 "data/outputs/EN-${lang}.quickdic.v006.zip" "./data/outputs/EN-${lang}.quickdic"
+done < EN-foreign-dictlist.txt
+./run.sh --lang1=EN --lang1Stoplist=data/inputs/stoplists/en.txt --dictOut=data/outputs/EN.quickdic --dictInfo="Wiktionary-based EN dictionary." --input0=data/inputs/wikiSplit/en/EN.data --input0Name=enwikitionary --input0Format=enwiktionary --input0LangPattern=English --input0LangCodePattern=en --input0EnIndex=1 --input0WiktionaryType=EnEnglish
diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java b/src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java
index f05e71e..93d1f1f 100644
--- a/src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java
+++ b/src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java
@@ -149,7 +149,7 @@ public class WiktionaryLangs {
         isoCodeToWikiName.put("SV", "Schwedisch");
         isoCodeToWikiName.put("ES", "Spanisch");
 
-        // egrep -o '\{\{=[a-zA-Z]+=\}\}' frwiktionary-pages-articles.xml | sort | uniq -c | sort -nr
+        // egrep -o '== *\{\{langue\|[a-zA-Z]+\}\} *==' frwiktionary-pages-articles.xml | sort | uniq -c | sort -nr
         isoCodeToWikiName = new LinkedHashMap();
         wikiCodeToIsoCodeToWikiName.put("fr", isoCodeToWikiName);
         isoCodeToWikiName.put("FR", Pattern.quote("{{langue|fr}}"));
@@ -190,6 +190,10 @@ public class WiktionaryLangs {
         isoCodeToWikiName.put("EL", Pattern.quote("{{-grc-}}"));
         isoCodeToWikiName.put("SV", Pattern.quote("{{-sv-}}"));
+        // There seems to be no consistent pattern and few foreign language entries anyway
+        isoCodeToWikiName = new LinkedHashMap();
+        wikiCodeToIsoCodeToWikiName.put("es", isoCodeToWikiName);
+        isoCodeToWikiName.put("ES", Pattern.quote("{{ES}}"));
     }
 
     public static String getEnglishName(String langCode) {
         String name = isoCodeToEnWikiName.get(langCode);
-- 
2.43.0