]> gitweb.fperrin.net Git - DictionaryPC.git/commitdiff
Add script to help with dictionary generation.
author Reimar Döffinger <Reimar.Doeffinger@gmx.de>
Thu, 27 Aug 2015 16:05:57 +0000 (18:05 +0200)
committer Reimar Döffinger <Reimar.Doeffinger@gmx.de>
Thu, 27 Aug 2015 16:05:57 +0000 (18:05 +0200)
Also update en and sv stoplists and parse the
Spanish wiktionary a bit.

data/inputs/stoplists/en.txt
data/inputs/stoplists/sv.txt [new file with mode: 0644]
generate_dictionaries.sh [new file with mode: 0755]
src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java

index 9f96dce2892b3c87cb9c558977ea9679208164da..a91a6d15e9603a635ef75d97581f0567fe65bd9f 100644 (file)
@@ -52,3 +52,4 @@ sth
 sb
 at
 it
+not
diff --git a/data/inputs/stoplists/sv.txt b/data/inputs/stoplists/sv.txt
new file mode 100644 (file)
index 0000000..f89cd28
--- /dev/null
@@ -0,0 +1,50 @@
+i
+och
+en
+på
+är
+att
+det
+som
+till
+har
+av
+med
+för
+den
+inte
+jag
+ett
+Jag
+du
+om
+sig
+Det
+var
+de
+så
+gå
+upp
+mig
+vi
+han
+ta
+där
+Han
+in
+e
+här
+ha
+dig
+man
+Den
+kan
+De
+ut
+över
+men
+hade
+eller
+min
+nu
+sin
diff --git a/generate_dictionaries.sh b/generate_dictionaries.sh
new file mode 100755 (executable)
index 0000000..dec94b9
--- /dev/null
@@ -0,0 +1,20 @@
+#!/bin/sh
+# Build one EN-<LANG> dictionary for every "langcode langname" line of
+# EN-foreign-dictlist.txt, zip each result, then build the monolingual EN dictionary.
+#
+# Single-dictionary test invocation, kept for reference:
+#./run.sh --lang1=EN --dictOut=test --dictInfo=test --input0=data/inputs/wikiSplit/en/EN.data  --input0Name=enwikitionary --input0Format=enwiktionary --input0LangPattern=English --input0LangCodePattern=en --input0EnIndex=1 --input0WiktionaryType=EnEnglish
+while read -r langcode langname ; do
+# Upper-case the code; it is used for --lang2 and for the input/output file names.
+lang=$(echo "$langcode" | tr 'a-z' 'A-Z')
+# CY is remapped to CI (the reason is not stated in this script).
+test "$lang" = "CY" && lang=CI
+# Pass a stoplist for the second language only when one exists for this code.
+stoplist=""
+test -e "data/inputs/stoplists/${langcode}.txt" && stoplist="--lang2Stoplist=data/inputs/stoplists/${langcode}.txt"
+./run.sh --lang1=EN --lang2="$lang" --lang1Stoplist=data/inputs/stoplists/en.txt ${stoplist:+"$stoplist"} --dictOut="data/outputs/EN-${lang}.quickdic" --dictInfo="(EN)Wiktionary-based EN-$lang dictionary." --input0="data/inputs/wikiSplit/en/${lang}.data"  --input0Name=enwikitionary --input0Format=enwiktionary --input0LangPattern="${langname}" --input0LangCodePattern="${langcode}" --input0EnIndex=1 --input0WiktionaryType=EnForeign --input1=data/inputs/wikiSplit/en/EN.data --input1Name=enwikitionary --input1Format=enwiktionary --input1LangPattern="${langname}" --input1LangCodePattern="${langcode}" --input1EnIndex=1 --input1WiktionaryType=EnToTranslation
+# Remove any archive left by an earlier run so 7z creates a fresh one; -f keeps the first run quiet.
+rm -f "data/outputs/EN-${lang}.quickdic.v006.zip"
+7z a -mx=9 "data/outputs/EN-${lang}.quickdic.v006.zip" "./data/outputs/EN-${lang}.quickdic"
+done < EN-foreign-dictlist.txt
+./run.sh --lang1=EN --lang1Stoplist=data/inputs/stoplists/en.txt --dictOut=data/outputs/EN.quickdic --dictInfo="Wiktionary-based EN dictionary." --input0=data/inputs/wikiSplit/en/EN.data  --input0Name=enwikitionary --input0Format=enwiktionary --input0LangPattern=English --input0LangCodePattern=en --input0EnIndex=1 --input0WiktionaryType=EnEnglish
index f05e71efdedd580554dc0cdaf054a29a50d3498f..93d1f1ff32b696e6eac7d3d422cd0797057295a8 100644 (file)
@@ -149,7 +149,7 @@ public class WiktionaryLangs {
     isoCodeToWikiName.put("SV", "Schwedisch");
     isoCodeToWikiName.put("ES", "Spanisch");
 
-    // egrep -o '\{\{=[a-zA-Z]+=\}\}' frwiktionary-pages-articles.xml | sort | uniq -c | sort -nr
+    // egrep -o '== *\{\{langue\|[a-zA-Z]+\}\} *==' frwiktionary-pages-articles.xml | sort | uniq -c | sort -nr
     isoCodeToWikiName = new LinkedHashMap<String, String>();
     wikiCodeToIsoCodeToWikiName.put("fr", isoCodeToWikiName);
     isoCodeToWikiName.put("FR", Pattern.quote("{{langue|fr}}"));
@@ -190,6 +190,10 @@ public class WiktionaryLangs {
     isoCodeToWikiName.put("EL", Pattern.quote("{{-grc-}}"));
     isoCodeToWikiName.put("SV", Pattern.quote("{{-sv-}}"));
 
+    // There seems to be no consistent pattern and few foreign language entries anyway
+    isoCodeToWikiName = new LinkedHashMap<String, String>();
+    wikiCodeToIsoCodeToWikiName.put("es", isoCodeToWikiName);
+    isoCodeToWikiName.put("ES", Pattern.quote("{{ES}}"));
   }
   public static String getEnglishName(String langCode) {
       String name = isoCodeToEnWikiName.get(langCode);