L=en
echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
WIKI=${L}wiktionary-20120109-pages-articles.xml
-curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120109/${WIKI}.bz2
-bunzip2 ${WIKI}.bz2
-mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml
+#curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120109/${WIKI}.bz2
+#bunzip2 ${WIKI}.bz2
+#mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml
L=fr
echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
public class CheckDictionariesMain {
static final String BASE_URL = "http://quickdic-dictionary.googlecode.com/files/";
- static final String VERSION_CODE = "v003";
+ static final String VERSION_CODE = "v002";
public static void main(String[] args) throws IOException {
final File dictDir = new File(DictionaryBuilderMain.OUTPUTS);
public final Dictionary dictionary;
public final List<IndexBuilder> indexBuilders = new ArrayList<IndexBuilder>();
- public DictionaryBuilder(final String dictInfo, final Language lang0, final Language lang1, final String normalizerRules1, final String normalizerRules2, final Set<String> lang1Stoplist, final Set<String> lang2Stoplist) {
- dictionary = new Dictionary(dictInfo);
+ public DictionaryBuilder(final String dictInfoString, final Language lang0, final Language lang1, final String normalizerRules1, final String normalizerRules2, final Set<String> lang1Stoplist, final Set<String> lang2Stoplist) {
+ dictionary = new Dictionary(dictInfoString);
indexBuilders.add(new IndexBuilder(this, lang0.getIsoCode(), lang0.getIsoCode() + "->" + lang1.getIsoCode(), lang0, normalizerRules1, lang1Stoplist, false));
indexBuilders.add(new IndexBuilder(this, lang1.getIsoCode(), lang1.getIsoCode() + "->" + lang0.getIsoCode(), lang1, normalizerRules2, lang2Stoplist, true));
}
fatalError("Must specify human readable name for: " + prefix + "Name");
}
- final EntrySource entrySource = new EntrySource(dictionaryBuilder.dictionary.sources.size(), inputName);
+ final EntrySource entrySource = new EntrySource(dictionaryBuilder.dictionary.sources.size(), inputName, 0);
System.out.println("");
String inputFormat = keyValueArgs.remove(prefix + "Format");
}
if (tokenEntryDatas.add(entryData)) {
rows.add(new PairEntry.Row(entryData.index(), rows.size(), index));
+ ++entryData.entry.entrySource.numEntries;
++numRows;
// System.out.print(" " + typeToEntry.getKey() + ": ");
isoCodeToWikiName.put("BO", "Tibetan");
isoCodeToWikiName.put("TR", "Turkish");
isoCodeToWikiName.put("UK", "Ukrainian");
+ isoCodeToWikiName.put("UR", "Urdu");
isoCodeToWikiName.put("VI", "Vietnamese");
// Welsh is "CY" in ISO 639-1; "CI" is not an assigned language code.
isoCodeToWikiName.put("CY", "Welsh");
isoCodeToWikiName.put("YI", "Yiddish");
Source: http://dict.tu-chemnitz.de/
Thanks to Frank Richter.
+EntrySource: chemnitz 980
+EntrySource: dictcc 13
+
Index: DE DE->EN
***40***
40 :: 40
dictInfo=SomeWikiData
+EntrySource: enwiktionary.arabic 13363
+
Index: AR AR->EN
***أ***
أ / أ (’álifu hámzatin) :: The first letter of the Arabic alphabet is the small hamza (ء) that sits on top of أ, and the tall column is its bearer. The composite letter is called الف (’álif) and the hamza represents a glottal stop (/ʔ/). (For the pronunciation without hamza, see ا.) It is followed by ب.
dictInfo=SomeWikiData
+EntrySource: enwiktionary.german 5303
+
Index: DE DE->EN
===001===
ward (verb form) :: {archaic} Third-person singular indicative past form of werden.
dictInfo=SomeWikiData
+EntrySource: enwiktionary.english 4965
+
Index: DE DE->EN
===2===
Zehn {f} (2) :: ten (the number following nine) (noun)
dictInfo=SomeWikiData
+EntrySource: enwiktionary.french 6667
+
Index: FR FR->EN
===00===
de {fr-prep} :: from (used to indicate the start of a time or range)
dictInfo=SomeWikiData
+EntrySource: enwiktionary.english 3462
+
Index: IT IT->EN
===15===
(periodo di) due settimane ; quindicina {f} (actually 15 days) :: fortnight (period of two weeks) (noun)
dictInfo=SomeWikiData
+EntrySource: enwiktionary.italian 5146
+
Index: IT IT->EN
===1963===
dal :: since
dictInfo=SomeWikiData
+EntrySource: enwiktionary.english 4579
+
Index: ZH ZH->EN
===1===
(Cantonese) 今日 (gam<sup>1</sup>yat<sup>6</sup>) :: today (on the current day) (adverb)
dictInfo=SomeWikiData
+EntrySource: enwiktionary.chinese 628
+
Index: ZH ZH->EN
===3===
NB {{cmn-adj|p|pint=nb}} :: {{slang|skey=nb}} fucking awesome
For next release:
+help screen
+eng_urdu
fix up dictionary manager:
use one background thread that handles unzipping and downloading for the life of the application (so that screen changes don't interrupt it).
check over UI.