L=en
echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
-WIKI=${L}wiktionary-20120714-pages-articles.xml
-curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120714/${WIKI}.bz2
+WIKI=${L}wiktionary-20120930-pages-articles.xml
+curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120930/${WIKI}.bz2
bunzip2 ${WIKI}.bz2
mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml
L=fr
echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
-WIKI=${L}wiktionary-20120719-pages-articles.xml
-curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120719/${WIKI}.bz2
+WIKI=${L}wiktionary-20120926-pages-articles.xml
+curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120926/${WIKI}.bz2
bunzip2 ${WIKI}.bz2
mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml
L=it
echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
-WIKI=${L}wiktionary-20120720-pages-articles.xml
-curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120720/${WIKI}.bz2
+WIKI=${L}wiktionary-20120926-pages-articles.xml
+curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120926/${WIKI}.bz2
bunzip2 ${WIKI}.bz2
mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml
L=de
echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
-WIKI=${L}wiktionary-20120714-pages-articles.xml
-curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120714/${WIKI}.bz2
+WIKI=${L}wiktionary-20120928-pages-articles.xml
+curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120928/${WIKI}.bz2
bunzip2 ${WIKI}.bz2
mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml
L=es
echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
-WIKI=${L}wiktionary-20120718-pages-articles.xml
-curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120718/${WIKI}.bz2
+WIKI=${L}wiktionary-20120924-pages-articles.xml
+curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120924/${WIKI}.bz2
bunzip2 ${WIKI}.bz2
mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml
static final String INPUTS = "data/inputs/";
static final String STOPLISTS = "data/inputs/stoplists/";
static final String OUTPUTS = "data/outputs/";
+
+ // Explicitly listed language combinations (single languages and pairs, mostly non-EN).
+ static final String[][] nonEnPairs = new String[][] {
+ {"EN"},
+ {"DE"},
+ {"IT"},
+ {"FR"},
+
+ // The 3 I use most:
+ {"IT", "EN" },
+ {"DE", "EN" },
+ {"DE", "IT" },
+
+
+ {"AR", "DE" },
+ {"AR", "ES" },
+ {"AR", "FR" },
+ {"AR", "HE" },
+ {"AR", "IT" },
+ {"AR", "JA" },
+ {"AR", "RU" },
+ {"AR", "TR" }, // Turkish
+ {"AR", "ZH" },
+
+ {"DE", "AR" },
+ {"DE", "FR" },
+ {"DE", "CA" }, // Catalan
+ {"DE", "CS" }, // Czech
+ {"DE", "EO" }, // Esperanto
+ {"DE", "ES" },
+ {"DE", "FR" },
+ {"DE", "HE" },
+ {"DE", "HU" }, // Hungarian
+ {"DE", "IT" },
+ {"DE", "JA" },
+ {"DE", "LA" }, // Latin
+ {"DE", "NL" }, // Dutch
+ {"DE", "PL" }, // Polish
+ {"DE", "RU" },
+ {"DE", "SV" }, // Swedish
+ {"DE", "TR" }, // Turkish
+ {"DE", "ZH" },
+ {"DE", "TA" }, // Tamil
+
+ {"ES", "RU" }, // Spanish-Russian
+
+ {"FR", "BG" }, // Bulgarian
+ {"FR", "CS" }, // Czech
+ {"FR", "DE" },
+ {"FR", "ES" },
+ {"FR", "IT" },
+ {"FR", "JA" },
+ {"FR", "LA" },
+ {"FR", "NL" }, // Dutch
+ {"FR", "RU" },
+ {"FR", "TR" }, // Turkish
+ {"FR", "ZH" },
+
+ {"IT", "DE" },
+ {"IT", "EL" }, // Greek
+ {"IT", "ES" },
+ {"IT", "FR" },
+ {"IT", "HU" },
+ {"IT", "JA" },
+ {"IT", "LA" }, // Latin
+ {"IT", "LV" }, // Latvian
+ {"IT", "NL" },
+ {"IT", "PL" },
+ {"IT", "RU" },
+ {"IT", "SV" },
+ {"IT", "TR" }, // Turkish
+ {"IT", "ZH" },
+
+ {"JA", "ZH" },
+ {"JA", "AR" },
+ {"JA", "KO" },
+
+ {"ZH", "AR" },
+ {"ZH", "DE" },
+ {"ZH", "ES" },
+ {"ZH", "FR" },
+ {"ZH", "IT" },
+ {"ZH", "KO" },
+
+
+ {"NO", "SV" },
+ {"NO", "FI" },
+ {"FI", "SV" },
+
+ {"PL", "FR" }, // Polish
+ {"PL", "RU" }, // Polish
+ {"PL", "HU" }, // Polish
+ {"PL", "ES" }, // Polish
+
+ {"TR", "EL" }, // Turkish, Greek
+
+ {"FA", "HY" }, // Persian, Armenian, by request.
+ {"FA", "SV" }, // Persian, Swedish, by request.
+
+ };
+
static final Map<String,String> isoToDedication = new LinkedHashMap<String, String>();
result.add(String.format("--input%dWiktionaryLang=%s", i, lang1));
result.add(String.format("--input%dSkipLang=%s", i, lang1));
result.add(String.format("--input%dWebUrlTemplate=http://%s.wiktionary.org/wiki/%%s", i, lang1.toLowerCase()));
- result.add(String.format("--input%dPageLimit=100", i));
+ //result.add(String.format("--input%dPageLimit=100", i));
++i;
} else {
System.err.println("Can't read file: " + wikiSplitFile);
result.add(String.format("--input%dLangPattern=%s", i, "English"));
result.add(String.format("--input%dLangCodePattern=%s", i, lang1.toLowerCase()));
result.add(String.format("--input%dEnIndex=%d", i, 1));
- result.add(String.format("--input%dPageLimit=100", i));
+ //result.add(String.format("--input%dPageLimit=100", i));
++i;
}
final List<String[]> allPairs = new ArrayList<String[]>();
- // Build the non EN ones.
- final String[][] nonEnPairs = new String[][] {
- {"EN"},
- {"DE"},
- {"IT"},
- {"FR"},
-
- // The 3 I use most:
- {"IT", "EN" },
- {"DE", "EN" },
- {"DE", "IT" },
-
-
- {"AR", "DE" },
- {"AR", "ES" },
- {"AR", "FR" },
- {"AR", "HE" },
- {"AR", "IT" },
- {"AR", "JA" },
- {"AR", "RU" },
- {"AR", "TR" }, // Turkish
- {"AR", "ZH" },
-
- {"DE", "AR" },
- {"DE", "FR" },
- {"DE", "CA" }, // Catalan
- {"DE", "CS" }, // Czech
- {"DE", "EO" }, // Esperanto
- {"DE", "ES" },
- {"DE", "FR" },
- {"DE", "HE" },
- {"DE", "HU" }, // Hungarian
- {"DE", "IT" },
- {"DE", "JA" },
- {"DE", "LA" }, // Latin
- {"DE", "NL" }, // Dutch
- {"DE", "PL" }, // Polish
- {"DE", "RU" },
- {"DE", "SV" }, // Swedish
- {"DE", "TR" }, // Turkish
- {"DE", "ZH" },
- {"DE", "TA" }, // Tamil
-
- {"ES", "RU" }, // Spanish-Russian
-
- {"FR", "BG" }, // Bulgarian
- {"FR", "CS" }, // Czech
- {"FR", "DE" },
- {"FR", "ES" },
- {"FR", "IT" },
- {"FR", "JA" },
- {"FR", "LA" },
- {"FR", "NL" }, // Dutch
- {"FR", "RU" },
- {"FR", "TR" }, // Turkish
- {"FR", "ZH" },
-
- {"IT", "DE" },
- {"IT", "EL" }, // Greek
- {"IT", "ES" },
- {"IT", "FR" },
- {"IT", "HU" },
- {"IT", "JA" },
- {"IT", "LA" }, // Latin
- {"IT", "LV" }, // Latvian
- {"IT", "NL" },
- {"IT", "PL" },
- {"IT", "RU" },
- {"IT", "SV" },
- {"IT", "TR" }, // Turkish
- {"IT", "ZH" },
-
- {"JA", "ZH" },
- {"JA", "AR" },
- {"JA", "KO" },
-
- {"ZH", "AR" },
- {"ZH", "DE" },
- {"ZH", "ES" },
- {"ZH", "FR" },
- {"ZH", "IT" },
- {"ZH", "KO" },
-
-
- {"NO", "SV" },
- {"NO", "FI" },
- {"FI", "SV" },
-
- {"PL", "FR" }, // Polish
- {"PL", "RU" }, // Polish
- {"PL", "HU" }, // Polish
- {"PL", "ES" }, // Polish
-
- {"TR", "EL" }, // Turkish, Greek
-
- {"FA", "HY" }, // Persian, Armenian, by request.
- {"FA", "SV" }, // Persian, Swedish, by request.
-
- };
allPairs.addAll(Arrays.asList(nonEnPairs));
// Add all the EN-XX pairs.