From: Reimar Döffinger Date: Sun, 20 Dec 2020 09:55:20 +0000 (+0100) Subject: Merge pull request #3 from zorun/swedish X-Git-Url: http://gitweb.fperrin.net/?p=DictionaryPC.git;a=commitdiff_plain;h=96e85ff86f44888a9994710d55e760893d87da76;hp=cb48e386855b695b54c6e9682ef7bffaf1d2ba38 Merge pull request #3 from zorun/swedish Add french-swedish dictionary support. --- diff --git a/.classpath b/.classpath index 60b221e..96fa364 100755 --- a/.classpath +++ b/.classpath @@ -1,10 +1,17 @@ - - + + + + + + + + - - + + + diff --git a/.gitignore b/.gitignore index 287cc93..38a503e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ -dictInputs -dictOutputs/ +data/inputs/ +data/outputs/ bin wikiSplit wikiSplit_2011 @@ -7,4 +7,3 @@ wikiSplit_201106 wikiSplit_201111 .project .settings/ -*.class diff --git a/WiktionarySplitter.sh b/WiktionarySplitter.sh index 705bf2b..66d38a2 100755 --- a/WiktionarySplitter.sh +++ b/WiktionarySplitter.sh @@ -1,10 +1,12 @@ # Run after downloading (data/downloadInputs.sh) to generate # per-language data files from enwiktionary. -ICU4J=/usr/share/java/icu4j-49.1.jar -test -r "$ICU4J" || ICU4J=/usr/share/icu4j-55/lib/icu4j.jar -XERCES=/usr/share/java/xercesImpl.jar -test -r "$XERCES" || XERCES=/usr/share/xerces-2/lib/xercesImpl.jar -COMMONS_COMPRESS=/usr/share/java/commons-compress.jar -JAVA=/usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java -test -x "$JAVA" || JAVA=java -"$JAVA" -Xmx4096m -Xverify:none -classpath src:../Util/src/:../Dictionary/src/:"$ICU4J":"$XERCES":"$COMMONS_COMPRESS" com.hughes.android.dictionary.engine.WiktionarySplitter "$@" +RUNNER=./DictionaryPC +if ! test -x "$RUNNER" ; then + ICU4J=/usr/share/java/icu4j-49.1.jar + test -r "$ICU4J" || ICU4J=/usr/share/icu4j-55/lib/icu4j.jar + COMMONS_COMPRESS=/usr/share/java/commons-compress.jar + JAVA=/usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java + test -x "$JAVA" || JAVA=java + RUNNER="$JAVA -Xmx4096m -Xverify:none -classpath bin/:$ICU4J:$COMMONS_COMPRESS com.hughes.android.dictionary.engine.Runner" +fi +$RUNNER WiktionarySplitter "$@" diff --git a/compile.sh b/compile.sh index 7b0021f..0780eff 100755 --- a/compile.sh +++ b/compile.sh @@ -2,10 +2,7 @@ ICU4J=/usr/share/java/icu4j-49.1.jar test -r "$ICU4J" || ICU4J=/usr/share/icu4j-55/lib/icu4j.jar JUNIT=/usr/share/java/junit.jar test -r "$JUNIT" || JUNIT=/usr/share/junit/lib/junit.jar -XERCES=/usr/share/java/xercesImpl.jar -test -r "$XERCES" || XERCES=/usr/share/xerces-2/lib/xercesImpl.jar -COMMONS=/usr/share/java/commons-lang3.jar -test -r "$COMMONS" || COMMONS=/usr/share/commons-lang-3.3/lib/commons-lang.jar +COMMONS=/usr/share/java/commons-text.jar COMMONS_COMPRESS=/usr/share/java/commons-compress.jar if [ ! -x ../Dictionary ] ; then echo "You need to clone the Dictionary repository (including subprojects) into .." @@ -19,10 +16,6 @@ if [ ! -r "$JUNIT" ] ; then echo "Junit needs to be installed" exit 1; fi -if [ ! -r "$XERCES" ] ; then - echo "Xerces needs to be installed" - exit 1; -fi if [ ! -r "$COMMONS" ] ; then echo "commons-lang needs to be installed" exit 1; @@ -31,4 +24,8 @@ if [ ! -r "$COMMONS_COMPRESS" ] ; then echo "commons-compress needs to be installed" exit 1; fi -javac -g ../Dictionary/Util/src/com/hughes/util/*.java ../Dictionary/Util/src/com/hughes/util/raf/*.java ../Dictionary/src/com/hughes/android/dictionary/DictionaryInfo.java ../Dictionary/src/com/hughes/android/dictionary/engine/*.java ../Dictionary/src/com/hughes/android/dictionary/C.java src/com/hughes/util/*.java src/com/hughes/android/dictionary/*.java src/com/hughes/android/dictionary/*/*.java src/com/hughes/android/dictionary/*/*/*.java -classpath "$ICU4J:$JUNIT:$XERCES:$COMMONS:$COMMONS_COMPRESS" +mkdir -p bin +# -encoding is just a work around for user that still run systems +# with non-UTF8 locales +# Limit to Java 11 for compatibility with native-image +javac --source 11 --target 11 --limit-modules java.xml,java.logging -Xlint:all -encoding UTF-8 -g -d bin/ ../Dictionary/Util/src/com/hughes/util/*.java ../Dictionary/Util/src/com/hughes/util/raf/*.java ../Dictionary/src/com/hughes/android/dictionary/DictionaryInfo.java ../Dictionary/src/com/hughes/android/dictionary/engine/*.java ../Dictionary/src/com/hughes/android/dictionary/C.java src/com/hughes/util/*.java src/com/hughes/android/dictionary/*.java src/com/hughes/android/dictionary/*/*.java src/com/hughes/android/dictionary/*/*/*.java -classpath "$ICU4J:$JUNIT:$COMMONS:$COMMONS_COMPRESS" diff --git a/convert_to_v6.sh b/convert_to_v6.sh new file mode 100755 index 0000000..2443c1c --- /dev/null +++ b/convert_to_v6.sh @@ -0,0 +1,7 @@ +RUNNER=./DictionaryPC +if ! test -x "$RUNNER" ; then + JAVA=/usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java + test -x "$JAVA" || JAVA=java + RUNNER="$JAVA -classpath bin/ com.hughes.android.dictionary.engine.Runner" +fi +$RUNNER ConvertToV6 "$@" diff --git a/genv6.sh b/genv6.sh new file mode 100755 index 0000000..9bb5147 --- /dev/null +++ b/genv6.sh @@ -0,0 +1,20 @@ +set -e +rm -rf data/outputsv6 +mkdir data/outputsv6 +for i in data/outputs/*.quickdic ; do + o=data/outputsv6/$(basename "$i") + ./convert_to_v6.sh "$i" "$o" + 7z a -mx=9 "$o".v006.zip "$o" + rm "$o" + # skipHtml makes no sense for single-language dictionaries + if echo "$o" | grep -q '-' ; then + if ./convert_to_v6.sh "$i" "$o" skipHtmlOpt ; then + 7z a -mx=9 "$o".small.v006.zip "$o" + rm "$o" + elif [ $? -ne 3 ] ; then + # Check for magic 3 indicating "no HTML entries in dictionary" + echo "Converting dictionary failed!" + exit 1 + fi + fi +done diff --git a/googlecode_upload.py b/googlecode_upload.py deleted file mode 100755 index d2d5f97..0000000 --- a/googlecode_upload.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2006, 2007 Google Inc. All Rights Reserved. -# Author: danderson@google.com (David Anderson) -# -# Script for uploading files to a Google Code project. -# -# This is intended to be both a useful script for people who want to -# streamline project uploads and a reference implementation for -# uploading files to Google Code projects. -# -# To upload a file to Google Code, you need to provide a path to the -# file on your local machine, a small summary of what the file is, a -# project name, and a valid account that is a member or owner of that -# project. You can optionally provide a list of labels that apply to -# the file. The file will be uploaded under the same name that it has -# in your local filesystem (that is, the "basename" or last path -# component). Run the script with '--help' to get the exact syntax -# and available options. -# -# Note that the upload script requests that you enter your -# googlecode.com password. This is NOT your Gmail account password! -# This is the password you use on googlecode.com for committing to -# Subversion and uploading files. You can find your password by going -# to http://code.google.com/hosting/settings when logged in with your -# Gmail account. If you have already committed to your project's -# Subversion repository, the script will automatically retrieve your -# credentials from there (unless disabled, see the output of '--help' -# for details). -# -# If you are looking at this script as a reference for implementing -# your own Google Code file uploader, then you should take a look at -# the upload() function, which is the meat of the uploader. You -# basically need to build a multipart/form-data POST request with the -# right fields and send it to https://PROJECT.googlecode.com/files . -# Authenticate the request using HTTP Basic authentication, as is -# shown below. -# -# Licensed under the terms of the Apache Software License 2.0: -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Questions, comments, feature requests and patches are most welcome. -# Please direct all of these to the Google Code users group: -# http://groups.google.com/group/google-code-hosting - -"""Google Code file uploader script. -""" - -__author__ = 'danderson@google.com (David Anderson)' - -import httplib -import os.path -import optparse -import getpass -import base64 -import sys - - -def upload(file, project_name, user_name, password, summary, labels=None): - """Upload a file to a Google Code project's file server. - - Args: - file: The local path to the file. - project_name: The name of your project on Google Code. - user_name: Your Google account name. - password: The googlecode.com password for your account. - Note that this is NOT your global Google Account password! - summary: A small description for the file. - labels: an optional list of label strings with which to tag the file. - - Returns: a tuple: - http_status: 201 if the upload succeeded, something else if an - error occured. - http_reason: The human-readable string associated with http_status - file_url: If the upload succeeded, the URL of the file on Google - Code, None otherwise. - """ - # The login is the user part of user@gmail.com. If the login provided - # is in the full user@domain form, strip it down. - if user_name.endswith('@gmail.com'): - user_name = user_name[:user_name.index('@gmail.com')] - - form_fields = [('summary', summary)] - if labels is not None: - form_fields.extend([('label', l.strip()) for l in labels]) - - content_type, body = encode_upload_request(form_fields, file) - - upload_host = '%s.googlecode.com' % project_name - upload_uri = '/files' - auth_token = base64.b64encode('%s:%s'% (user_name, password)) - headers = { - 'Authorization': 'Basic %s' % auth_token, - 'User-Agent': 'Googlecode.com uploader v0.9.4', - 'Content-Type': content_type, - } - - server = httplib.HTTPSConnection(upload_host) - server.request('POST', upload_uri, body, headers) - resp = server.getresponse() - server.close() - - if resp.status == 201: - location = resp.getheader('Location', None) - else: - location = None - return resp.status, resp.reason, location - - -def encode_upload_request(fields, file_path): - """Encode the given fields and file into a multipart form body. - - fields is a sequence of (name, value) pairs. file is the path of - the file to upload. The file will be uploaded to Google Code with - the same file name. - - Returns: (content_type, body) ready for httplib.HTTP instance - """ - BOUNDARY = '----------Googlecode_boundary_reindeer_flotilla' - CRLF = '\r\n' - - body = [] - - # Add the metadata about the upload first - for key, value in fields: - body.extend( - ['--' + BOUNDARY, - 'Content-Disposition: form-data; name="%s"' % key, - '', - value, - ]) - - # Now add the file itself - file_name = os.path.basename(file_path) - f = open(file_path, 'rb') - file_content = f.read() - f.close() - - body.extend( - ['--' + BOUNDARY, - 'Content-Disposition: form-data; name="filename"; filename="%s"' - % file_name, - # The upload server determines the mime-type, no need to set it. - 'Content-Type: application/octet-stream', - '', - file_content, - ]) - - # Finalize the form body - body.extend(['--' + BOUNDARY + '--', '']) - - return 'multipart/form-data; boundary=%s' % BOUNDARY, CRLF.join(body) - - -def upload_find_auth(file_path, project_name, summary, labels=None, - user_name=None, password=None, tries=3): - """Find credentials and upload a file to a Google Code project's file server. - - file_path, project_name, summary, and labels are passed as-is to upload. - - Args: - file_path: The local path to the file. - project_name: The name of your project on Google Code. - summary: A small description for the file. - labels: an optional list of label strings with which to tag the file. - config_dir: Path to Subversion configuration directory, 'none', or None. - user_name: Your Google account name. - tries: How many attempts to make. - """ - - while tries > 0: - if user_name is None: - # Read username if not specified or loaded from svn config, or on - # subsequent tries. - sys.stdout.write('Please enter your googlecode.com username: ') - sys.stdout.flush() - user_name = sys.stdin.readline().rstrip() - if password is None: - # Read password if not loaded from svn config, or on subsequent tries. - print 'Please enter your googlecode.com password.' - print '** Note that this is NOT your Gmail account password! **' - print 'It is the password you use to access Subversion repositories,' - print 'and can be found here: http://code.google.com/hosting/settings' - password = getpass.getpass() - - status, reason, url = upload(file_path, project_name, user_name, password, - summary, labels) - # Returns 403 Forbidden instead of 401 Unauthorized for bad - # credentials as of 2007-07-17. - if status in [httplib.FORBIDDEN, httplib.UNAUTHORIZED]: - # Rest for another try. - user_name = password = None - tries = tries - 1 - else: - # We're done. - break - - return status, reason, url - - -def main(): - parser = optparse.OptionParser(usage='googlecode-upload.py -s SUMMARY ' - '-p PROJECT [options] FILE') - parser.add_option('-s', '--summary', dest='summary', - help='Short description of the file') - parser.add_option('-p', '--project', dest='project', - help='Google Code project name') - parser.add_option('-u', '--user', dest='user', - help='Your Google Code username') - parser.add_option('-w', '--password', dest='password', - help='Your Google Code password') - parser.add_option('-l', '--labels', dest='labels', - help='An optional list of comma-separated labels to attach ' - 'to the file') - - options, args = parser.parse_args() - - if not options.summary: - parser.error('File summary is missing.') - elif not options.project: - parser.error('Project name is missing.') - elif len(args) < 1: - parser.error('File to upload not provided.') - elif len(args) > 1: - parser.error('Only one file may be specified.') - - file_path = args[0] - - if options.labels: - labels = options.labels.split(',') - else: - labels = None - - status, reason, url = upload_find_auth(file_path, options.project, - options.summary, labels, - options.user, options.password) - if url: - print 'The file was uploaded successfully.' - print 'URL: %s' % url - return 0 - else: - print 'An error occurred. Your file was not uploaded.' - print 'Google Code upload server said: %s (%s)' % (reason, status) - return 1 - - -if __name__ == '__main__': - sys.exit(main()) diff --git a/jars/commons-lang3-3.1.jar b/jars/commons-lang3-3.1.jar deleted file mode 100644 index a85e539..0000000 Binary files a/jars/commons-lang3-3.1.jar and /dev/null differ diff --git a/jars/xerces-2_11_0/xercesImpl.jar b/jars/xerces-2_11_0/xercesImpl.jar deleted file mode 100644 index 0aaa990..0000000 Binary files a/jars/xerces-2_11_0/xercesImpl.jar and /dev/null differ diff --git a/native-image-reflection.json b/native-image-reflection.json new file mode 100644 index 0000000..e86958e --- /dev/null +++ b/native-image-reflection.json @@ -0,0 +1,8 @@ +[ + { + "name": "com.ibm.icu.text.CollatorServiceShim", + "methods": [ + { "name": "", "parameterTypes": [] } + ] + } +] diff --git a/native-image.cmd b/native-image.cmd new file mode 100755 index 0000000..d0e0c6d --- /dev/null +++ b/native-image.cmd @@ -0,0 +1,2 @@ +REM --allow-incomplete-classpath due to missing XZ implementation +%GRAALVM_HOME%/bin/native-image --allow-incomplete-classpath --no-server -H:Name="DictionaryPC" com.hughes.android.dictionary.engine.Runner --no-fallback -cp bin/;commons-compress.jar;commons-text.jar;commons-lang3.jar;icu4j-49.1.jar -H:IncludeResources="com/ibm/icu/.*" -H:ReflectionConfigurationFiles=native-image-reflection.json diff --git a/native-image.sh b/native-image.sh new file mode 100755 index 0000000..a332f2f --- /dev/null +++ b/native-image.sh @@ -0,0 +1 @@ +"$GRAALVM_HOME"/bin/native-image --no-server -H:Name="DictionaryPC" com.hughes.android.dictionary.engine.Runner --no-fallback -cp bin/:/usr/share/java/commons-compress.jar:/usr/share/java/commons-text.jar:/usr/share/java/commons-lang3.jar:/usr/share/java/icu4j-49.1.jar -H:IncludeResources="com/ibm/icu/.*" -H:ReflectionConfigurationFiles=native-image-reflection.json diff --git a/run.sh b/run.sh index 12ea566..3a3c1d9 100755 --- a/run.sh +++ b/run.sh @@ -1,11 +1,14 @@ -# -agentlib:hprof=heap=sites,depth=20 -ICU4J=/usr/share/java/icu4j-49.1.jar -test -r "$ICU4J" || ICU4J=/usr/share/icu4j-55/lib/icu4j.jar -XERCES=/usr/share/java/xercesImpl.jar -test -r "$XERCES" || XERCES=/usr/share/xerces-2/lib/xercesImpl.jar -COMMONS=/usr/share/java/commons-lang3.jar -test -r "$COMMONS" || COMMONS=/usr/share/commons-lang-3.3/lib/commons-lang.jar -COMMONS_COMPRESS=/usr/share/java/commons-compress.jar -JAVA=/usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java -test -x "$JAVA" || JAVA=java -"$JAVA" -Djava.util.logging.config.file="logging.properties" -Xmx4096m -classpath src:../Dictionary/Util/src/:../Dictionary/src/:"$ICU4J":"$XERCES":"$COMMONS":"$COMMONS_COMPRESS" com.hughes.android.dictionary.engine.DictionaryBuilder "$@" +RUNNER=./DictionaryPC +if ! test -x "$RUNNER" ; then + # -agentlib:hprof=heap=sites,depth=20 + ICU4J=/usr/share/java/icu4j-49.1.jar + test -r "$ICU4J" || ICU4J=/usr/share/icu4j-55/lib/icu4j.jar + COMMONS_LANG3=/usr/share/java/commons-lang3.jar + test -r "$COMMONS_LANG3" || COMMONS_LANG3=/usr/share/commons-lang-3.3/lib/commons-lang.jar + COMMONS_TEXT=/usr/share/java/commons-text.jar + COMMONS_COMPRESS=/usr/share/java/commons-compress.jar + JAVA=/usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java + test -x "$JAVA" || JAVA=java + RUNNER="$JAVA -Djava.util.logging.config.file=logging.properties -Xmx4096m -classpath bin/:$ICU4J:$COMMONS_LANG3:$COMMONS_TEXT:$COMMONS_COMPRESS com.hughes.android.dictionary.engine.Runner" +fi +$RUNNER DictionaryBuilder "$@" diff --git a/src/com/hughes/android/dictionary/CollatorWrapper.java b/src/com/hughes/android/dictionary/CollatorWrapper.java index 2373629..295847d 100644 --- a/src/com/hughes/android/dictionary/CollatorWrapper.java +++ b/src/com/hughes/android/dictionary/CollatorWrapper.java @@ -18,11 +18,11 @@ import java.util.Locale; import com.ibm.icu.text.Collator; -final public class CollatorWrapper { -static public Collator getInstance() { +public final class CollatorWrapper { +public static Collator getInstance() { return Collator.getInstance(); } -static public Collator getInstanceStrengthIdentical(Locale l) { +public static Collator getInstanceStrengthIdentical(Locale l) { Collator c = Collator.getInstance(l); c.setStrength(Collator.IDENTICAL); return c; diff --git a/src/com/hughes/android/dictionary/DateFormatTest.java b/src/com/hughes/android/dictionary/DateFormatTest.java index fce2095..8be638c 100644 --- a/src/com/hughes/android/dictionary/DateFormatTest.java +++ b/src/com/hughes/android/dictionary/DateFormatTest.java @@ -1,29 +1,29 @@ -// Copyright 2011 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package com.hughes.android.dictionary; - -import java.text.SimpleDateFormat; -import java.util.Date; - -public class DateFormatTest { - - /** - * @param args - */ - public static void main(String[] args) { - System.out.println(new SimpleDateFormat("yyyy.MM.dd HH:mm:ss").format(new Date())); - } - -} +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package com.hughes.android.dictionary; + +import java.text.SimpleDateFormat; +import java.util.Date; + +public class DateFormatTest { + + /** + * @param args + */ + public static void main(String[] args) { + System.out.println(new SimpleDateFormat("yyyy.MM.dd HH:mm:ss").format(new Date())); + } + +} diff --git a/src/com/hughes/android/dictionary/DictionaryApplication.java b/src/com/hughes/android/dictionary/DictionaryApplication.java index da5e67c..b18c7a0 100644 --- a/src/com/hughes/android/dictionary/DictionaryApplication.java +++ b/src/com/hughes/android/dictionary/DictionaryApplication.java @@ -1,5 +1,5 @@ package com.hughes.android.dictionary; -final public class DictionaryApplication { - final static public boolean USE_COLLATOR = true; +public final class DictionaryApplication { + public static final boolean USE_COLLATOR = true; } diff --git a/src/com/hughes/android/dictionary/FeatureConfig.java b/src/com/hughes/android/dictionary/FeatureConfig.java index 4642e11..291c010 100644 --- a/src/com/hughes/android/dictionary/FeatureConfig.java +++ b/src/com/hughes/android/dictionary/FeatureConfig.java @@ -1,5 +1,5 @@ package com.hughes.android.dictionary; -final public class FeatureConfig { - final static public boolean enableWrite = true; +public final class FeatureConfig { + public static final boolean enableWrite = true; } diff --git a/src/com/hughes/android/dictionary/SerializeCollatorTest.java b/src/com/hughes/android/dictionary/SerializeCollatorTest.java index 7a1e42e..2980e10 100644 --- a/src/com/hughes/android/dictionary/SerializeCollatorTest.java +++ b/src/com/hughes/android/dictionary/SerializeCollatorTest.java @@ -1,36 +1,35 @@ -// Copyright 2011 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package com.hughes.android.dictionary; - -import java.io.File; -import java.io.IOException; -import java.util.Comparator; - -import com.hughes.android.dictionary.engine.Language; -import java.text.Collator; - -public class SerializeCollatorTest { - - /** - * @param args - * @throws IOException - */ - public static void main(String[] args) throws IOException { - File temp = File.createTempFile("temp", null); - final Comparator c = Language.de.getCollator(); - //FileUtil.writeObject(c, temp); - } - -} +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package com.hughes.android.dictionary; + +import java.io.File; +import java.io.IOException; +import java.util.Comparator; + +import com.hughes.android.dictionary.engine.Language; + +public class SerializeCollatorTest { + + /** + * @param args + * @throws IOException + */ + public static void main(String[] args) throws IOException { + File temp = File.createTempFile("temp", null); + final Comparator c = Language.de.getCollator(); + //FileUtil.writeObject(c, temp); + } + +} diff --git a/src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java b/src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java index d82f190..e43f1d0 100644 --- a/src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java +++ b/src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java @@ -1,9 +1,5 @@ package com.hughes.android.dictionary.engine; -import com.hughes.android.dictionary.DictionaryInfo; -import com.hughes.android.dictionary.DictionaryInfo.IndexInfo; -import com.hughes.util.CollectionUtil; - import java.io.File; import java.io.IOException; import java.io.PrintWriter; @@ -13,9 +9,13 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; +import com.hughes.android.dictionary.DictionaryInfo; +import com.hughes.android.dictionary.DictionaryInfo.IndexInfo; +import com.hughes.util.CollectionUtil; + public class CheckDictionariesMain { - static final String BASE_URL = "http://github.com/rdoeffinger/Dictionary/releases/download/v0.2-dictionaries/"; + static final String BASE_URL = "https://github.com/rdoeffinger/Dictionary/releases/download/v0.3-dictionaries/"; static final String VERSION_CODE_OLD = "v006"; static final String VERSION_CODE = "v007"; @@ -26,7 +26,7 @@ public class CheckDictionariesMain { // dictionaryInfoOut.println("# LANG_1\t%LANG_2\tFILENAME\tVERSION_CODE\tFILESIZE\tNUM_MAIN_WORDS_1\tNUM_MAIN_WORDS_2\tNUM_ALL_WORDS_1\tNUM_ALL_WORDS_2"); final File[] files = dictDir.listFiles(); - final List dictNames = new ArrayList(); + final List dictNames = new ArrayList<>(); Arrays.sort(files); for (final File dictFile : files) { if (!dictFile.getName().endsWith("quickdic")) { @@ -63,7 +63,7 @@ public class CheckDictionariesMain { // Find the stats. System.out.println("Stats..."); - final List indexNames = new ArrayList(); + final List indexNames = new ArrayList<>(); for (final IndexInfo indexInfo : dictionaryInfo.indexInfos) { indexNames.add(indexInfo.shortName); } diff --git a/src/com/hughes/android/dictionary/engine/ConvertToV6.java b/src/com/hughes/android/dictionary/engine/ConvertToV6.java new file mode 100644 index 0000000..05a801b --- /dev/null +++ b/src/com/hughes/android/dictionary/engine/ConvertToV6.java @@ -0,0 +1,74 @@ +// Copyright 2020 Reimar Döffinger. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package com.hughes.android.dictionary.engine; + +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.RandomAccessFile; + +public class ConvertToV6 { + public static void main(final String[] args) throws IOException { + if (args.length != 2 && args.length != 3) { + System.out.println("Usage: ConvertToV6 [skipHtml]"); + System.out.println("If the option third argument is given as 'skipHtml'"); + System.out.println("the v6 dictionary will be without all HTML entries to reduce its size"); + return; + } + boolean skipHtml = false; + boolean skipHtmlOpt = false; + if (args.length == 3) { + if (!args[2].equals("skipHtml") && !args[2].equals("skipHtmlOpt")) { + System.out.println("Unknown extra argument '" + args[2] + "'"); + return; + } + skipHtml = true; + skipHtmlOpt = args[2].equals("skipHtmlOpt"); + } + final String inname = args[0]; + final String outname = args[1]; + FileInputStream in; + try { + in = new FileInputStream(inname); + } catch (FileNotFoundException e) { + System.out.println("Could not open input file '" + inname + "'"); + System.out.println(e); + return; + } + final Dictionary dictionary = new Dictionary(in.getChannel()); + if (dictionary.dictFileVersion <= 6) { + System.out.println("Input dictionary is already v6 or older!"); + return; + } + if (skipHtmlOpt && dictionary.htmlEntries.size() == 0) { + System.exit(3); + } + RandomAccessFile out; + try { + out = new RandomAccessFile(outname, "rw"); + } catch (FileNotFoundException e) { + System.out.println("Could not open output file '" + outname + "'"); + System.out.println(e); + return; + } + if (out.length() > 0) { + System.out.println("Output file '" + outname + "' already exists, aborting!"); + return; + } + new DictionaryV6Writer(dictionary).writev6(out, skipHtml); + out.close(); + in.close(); + } +} diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java index d105af2..34cc705 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java @@ -42,7 +42,7 @@ import com.hughes.util.FileUtil; public class DictionaryBuilder { public final Dictionary dictionary; - public final List indexBuilders = new ArrayList(); + public final List indexBuilders = new ArrayList<>(); public DictionaryBuilder(final String dictInfoString, final Language lang0, final Language lang1, final String normalizerRules1, final String normalizerRules2, final Set lang1Stoplist, final Set lang2Stoplist) { dictionary = new Dictionary(dictInfoString); @@ -61,7 +61,7 @@ public class DictionaryBuilder { } } - public static void main(final String[] args) throws IOException, ParserConfigurationException, SAXException { + public static void main(final String[] args) throws IOException { System.out.println("Running with arguments:"); for (final String arg : args) { System.out.println(arg); @@ -80,8 +80,8 @@ public class DictionaryBuilder { lang2 = null; } - final Set lang1Stoplist = new LinkedHashSet(); - final Set lang2Stoplist = new LinkedHashSet(); + final Set lang1Stoplist = new LinkedHashSet<>(); + final Set lang2Stoplist = new LinkedHashSet<>(); final String lang1StoplistFile = keyValueArgs.remove("lang1Stoplist"); final String lang2StoplistFile = keyValueArgs.remove("lang2Stoplist"); if (lang1StoplistFile != null) { @@ -145,7 +145,7 @@ public class DictionaryBuilder { final int pageLimit = Integer.parseInt(pageLimitString); final EntrySource entrySource = new EntrySource(dictionaryBuilder.dictionary.sources.size(), inputName, 0); - System.out.println(""); + System.out.println(); String inputFormat = keyValueArgs.remove(prefix + "Format"); if ("tab_separated".equals(inputFormat)) { diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java index cf5fa96..8c2ebd0 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java @@ -14,12 +14,6 @@ package com.hughes.android.dictionary.engine; -import com.hughes.android.dictionary.parser.wiktionary.EnTranslationToTranslationParser; -import com.hughes.android.dictionary.parser.wiktionary.WholeSectionToHtmlParser; -import com.hughes.android.dictionary.parser.wiktionary.WiktionaryLangs; - -import junit.framework.TestCase; - import java.io.File; import java.util.ArrayList; import java.util.Arrays; @@ -29,6 +23,12 @@ import java.util.List; import java.util.Map; import java.util.Set; +import com.hughes.android.dictionary.parser.wiktionary.EnTranslationToTranslationParser; +import com.hughes.android.dictionary.parser.wiktionary.WholeSectionToHtmlParser; +import com.hughes.android.dictionary.parser.wiktionary.WiktionaryLangs; + +import junit.framework.TestCase; + public class DictionaryBuilderMain extends TestCase { static final String INPUTS = "data/inputs/"; @@ -36,7 +36,7 @@ public class DictionaryBuilderMain extends TestCase { static final String OUTPUTS = "data/outputs/"; // Build the non EN ones. - static final String[][] nonEnPairs = new String[][] { + static final String[][] nonEnPairs = { {"EN"}, {"DE"}, {"IT"}, @@ -138,7 +138,7 @@ public class DictionaryBuilderMain extends TestCase { - static final Map isoToDedication = new LinkedHashMap(); + static final Map isoToDedication = new LinkedHashMap<>(); static { isoToDedication.put("AF", "Wiktionary-based Afrikaans dictionary dedicated to Heiko and Mariëtte Horn."); isoToDedication.put("HR", "Wiktionary-based Croatian dictionary dedicated to Ines Viskic and Miro Kresonja."); @@ -158,7 +158,7 @@ public class DictionaryBuilderMain extends TestCase { return isoToDedication.containsKey(iso) ? isoToDedication.get(iso) : String.format("Wiktionary-based %s dictionary.", iso); } - static final Map isoToStoplist = new LinkedHashMap(); + static final Map isoToStoplist = new LinkedHashMap<>(); static { isoToStoplist.put("DE", "de.txt"); isoToStoplist.put("EN", "en.txt"); @@ -167,7 +167,7 @@ public class DictionaryBuilderMain extends TestCase { isoToStoplist.put("FR", "fr.txt"); } private static String getStoplist(String iso) { - return isoToStoplist.containsKey(iso) ? isoToStoplist.get(iso) : "empty.txt"; + return isoToStoplist.getOrDefault(iso, "empty.txt"); } static String getOtherLang(final String[] pair, final String first) { @@ -177,7 +177,7 @@ public class DictionaryBuilderMain extends TestCase { } static List getMainArgs(final String[] pair) { - final List result = new ArrayList(); + final List result = new ArrayList<>(); int i = 1; @@ -311,9 +311,7 @@ public class DictionaryBuilderMain extends TestCase { public static void main(final String[] args) throws Exception { - final List allPairs = new ArrayList(); - - allPairs.addAll(Arrays.asList(nonEnPairs)); + final List allPairs = new ArrayList<>(Arrays.asList(nonEnPairs)); // Add all the EN-XX pairs. for (final String isoCode : WiktionaryLangs.isoCodeToEnWikiName.keySet()) { if (!isoCode.equals("EN")) { @@ -322,7 +320,7 @@ public class DictionaryBuilderMain extends TestCase { } - final Set> done = new LinkedHashSet>(); + final Set> done = new LinkedHashSet<>(); boolean go = true; for (final String[] pair : allPairs) { Arrays.sort(pair); @@ -332,11 +330,7 @@ public class DictionaryBuilderMain extends TestCase { } done.add(pairList); - if (pairList.contains("EN") && pairList.contains("DE")) { - go = true; - } else { - go = false; - } + go = pairList.contains("EN") && pairList.contains("DE"); if (!go) { continue; diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java index 72ad9eb..ee1e664 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java @@ -15,7 +15,6 @@ package com.hughes.android.dictionary.engine; import java.io.File; -import java.io.FileNotFoundException; import java.io.IOException; import java.io.PrintStream; import java.io.RandomAccessFile; @@ -347,7 +346,7 @@ public class DictionaryBuilderTest extends TestCase { } private void checkGolden(final String dictName, final File dictFile) - throws IOException, FileNotFoundException { + throws IOException { // Check it once: assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text"); diff --git a/src/com/hughes/android/dictionary/engine/DictionaryTest.java b/src/com/hughes/android/dictionary/engine/DictionaryTest.java index d1dcc2b..0a9c673 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryTest.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryTest.java @@ -21,11 +21,11 @@ import java.util.List; import java.util.Random; import java.util.concurrent.atomic.AtomicBoolean; -import junit.framework.TestCase; - import com.hughes.android.dictionary.engine.Index.IndexEntry; import com.hughes.util.CollectionUtil; +import junit.framework.TestCase; + public class DictionaryTest extends TestCase { @@ -61,7 +61,7 @@ public class DictionaryTest extends TestCase { assertTrue(rows.toString(), rows.size() > 0); assertTrue(rows.get(0).toString().startsWith("come mai@")); assertTrue(rows.get(0) instanceof TokenRow); - assertTrue(!((TokenRow)rows.get(0)).getIndexEntry().htmlEntries.isEmpty()); + assertFalse(((TokenRow) rows.get(0)).getIndexEntry().htmlEntries.isEmpty()); } { @@ -70,7 +70,7 @@ public class DictionaryTest extends TestCase { assertTrue(rows.toString(), rows.size() > 0); assertTrue(rows.get(0).toString().startsWith("buon giorno@")); assertTrue(rows.get(0) instanceof TokenRow); - assertTrue(!((TokenRow)rows.get(0)).getIndexEntry().htmlEntries.isEmpty()); + assertFalse(((TokenRow) rows.get(0)).getIndexEntry().htmlEntries.isEmpty()); } { @@ -171,7 +171,7 @@ public class DictionaryTest extends TestCase { // Check that search in lowercase works. assertSearchResult("Alibi", "Alibi", deIndex.findInsertionPoint("alib", new AtomicBoolean(false))); - System.out.println(deIndex.findInsertionPoint("alib", new AtomicBoolean(false)).toString()); + System.out.println(deIndex.findInsertionPoint("alib", new AtomicBoolean(false))); raf.close(); } diff --git a/src/com/hughes/android/dictionary/engine/DictionaryV6Writer.java b/src/com/hughes/android/dictionary/engine/DictionaryV6Writer.java new file mode 100644 index 0000000..3da4d4a --- /dev/null +++ b/src/com/hughes/android/dictionary/engine/DictionaryV6Writer.java @@ -0,0 +1,336 @@ +// Copyright 2020 Reimar Döffinger. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package com.hughes.android.dictionary.engine; + +import java.io.BufferedOutputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataOutputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectOutputStream; +import java.io.RandomAccessFile; +import java.nio.charset.StandardCharsets; +import java.util.Collections; +import java.util.List; +import java.util.zip.GZIPOutputStream; + +public class DictionaryV6Writer { + private final Dictionary d; + + public DictionaryV6Writer(Dictionary dictionary) { + d = dictionary; + } + + private void writev6Sources(RandomAccessFile out) throws IOException { + ByteArrayOutputStream toc = new ByteArrayOutputStream(); + DataOutputStream tocout = new DataOutputStream(toc); + + out.writeInt(d.sources.size()); + long tocPos = out.getFilePointer(); + out.seek(tocPos + d.sources.size() * 8 + 8); + for (EntrySource s : d.sources) { + long dataPos = out.getFilePointer(); + tocout.writeLong(dataPos); + + out.writeUTF(s.getName()); + out.writeInt(s.getNumEntries()); + } + long dataPos = out.getFilePointer(); + tocout.writeLong(dataPos); + tocout.close(); + + out.seek(tocPos); + out.write(toc.toByteArray()); + out.seek(dataPos); + } + + private void writev6PairEntries(RandomAccessFile out) throws IOException { + ByteArrayOutputStream toc = new ByteArrayOutputStream(); + DataOutputStream tocout = new DataOutputStream(toc); + + long tocPos = out.getFilePointer(); + long dataPos = tocPos + 4 + d.pairEntries.size() * 8 + 8; + + out.seek(dataPos); + DataOutputStream outb = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(out.getFD()))); + + tocout.writeInt(d.pairEntries.size()); + for (PairEntry pe : d.pairEntries) { + tocout.writeLong(dataPos + outb.size()); + + outb.writeShort(pe.entrySource.index()); + outb.writeInt(pe.pairs.size()); + for (PairEntry.Pair p : pe.pairs) { + outb.writeUTF(p.lang1); + outb.writeUTF(p.lang2); + } + } + dataPos += outb.size(); + outb.flush(); + tocout.writeLong(dataPos); + tocout.close(); + + out.seek(tocPos); + out.write(toc.toByteArray()); + out.seek(dataPos); + } + + private void writev6TextEntries(RandomAccessFile out) throws IOException { + ByteArrayOutputStream toc = new ByteArrayOutputStream(); + DataOutputStream tocout = new DataOutputStream(toc); + + out.writeInt(d.textEntries.size()); + long tocPos = out.getFilePointer(); + out.seek(tocPos + d.textEntries.size() * 8 + 8); + for (TextEntry t : d.textEntries) { + long dataPos = out.getFilePointer(); + tocout.writeLong(dataPos); + + out.writeShort(t.entrySource.index()); + out.writeUTF(t.text); + } + long dataPos = out.getFilePointer(); + tocout.writeLong(dataPos); + tocout.close(); + + out.seek(tocPos); + out.write(toc.toByteArray()); + out.seek(dataPos); + } + + private void writev6EmptyList(RandomAccessFile out) throws IOException { + out.writeInt(0); + out.writeLong(out.getFilePointer() + 8); + } + + private void writev6HtmlEntries(RandomAccessFile out) throws IOException { + ByteArrayOutputStream toc = new ByteArrayOutputStream(); + DataOutputStream tocout = new DataOutputStream(toc); + + long tocPos = out.getFilePointer(); + long dataPos = tocPos + 4 + d.htmlEntries.size() * 8 + 8; + + out.seek(dataPos); + DataOutputStream outb = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(out.getFD()))); + + tocout.writeInt(d.htmlEntries.size()); + for (HtmlEntry h : d.htmlEntries) { + tocout.writeLong(dataPos + outb.size()); + + outb.writeShort(h.entrySource.index()); + outb.writeUTF(h.title); + byte[] data = h.getHtml().getBytes(StandardCharsets.UTF_8); + outb.writeInt(data.length); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + GZIPOutputStream gzout = new GZIPOutputStream(baos); + gzout.write(data); + gzout.close(); + outb.writeInt(baos.size()); + outb.write(baos.toByteArray()); + } + dataPos += outb.size(); + outb.flush(); + tocout.writeLong(dataPos); + tocout.close(); + + out.seek(tocPos); + out.write(toc.toByteArray()); + out.seek(dataPos); + } + + private void writev6HtmlIndices(DataOutputStream out, long pos, List entries) throws IOException { + long dataPos = pos + 4 + entries.size() * 8 + 8; + + out.writeInt(entries.size()); + + // TOC is trivial, so optimize writing it + for (int i = 0; i < entries.size(); i++) { + out.writeLong(dataPos); + dataPos += 4; + } + out.writeLong(dataPos); + + for (HtmlEntry e : entries) { + out.writeInt(e.index()); + } + } + + private void writev6IndexEntries(RandomAccessFile out, List entries, int[] prunedRowIdx) throws IOException { + ByteArrayOutputStream toc = new ByteArrayOutputStream(); + DataOutputStream tocout = new DataOutputStream(toc); + + long tocPos = out.getFilePointer(); + long dataPos = tocPos + 4 + entries.size() * 8 + 8; + + out.seek(dataPos); + DataOutputStream outb = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(out.getFD()))); + + tocout.writeInt(entries.size()); + for (Index.IndexEntry e : entries) { + tocout.writeLong(dataPos + outb.size()); + + outb.writeUTF(e.token); + + int startRow = e.startRow; + int numRows = e.numRows; + if (prunedRowIdx != null) { + // note: the start row will always be a TokenRow + // and thus never be pruned + int newNumRows = 1; + for (int i = 1; i < numRows; i++) { + if (prunedRowIdx[startRow + i] >= 0) newNumRows++; + } + startRow = prunedRowIdx[startRow]; + numRows = newNumRows; + } + + outb.writeInt(startRow); + outb.writeInt(numRows); + final boolean hasNormalizedForm = !e.token.equals(e.normalizedToken()); + outb.writeBoolean(hasNormalizedForm); + if (hasNormalizedForm) outb.writeUTF(e.normalizedToken()); + writev6HtmlIndices(outb, dataPos + outb.size(), + prunedRowIdx == null ? e.htmlEntries : Collections.emptyList()); + } + dataPos += outb.size(); + outb.flush(); + tocout.writeLong(dataPos); + tocout.close(); + + out.seek(tocPos); + out.write(toc.toByteArray()); + out.seek(dataPos); + } + + private void writev6Index(RandomAccessFile out, boolean skipHtml) throws IOException { + ByteArrayOutputStream toc = new ByteArrayOutputStream(); + DataOutputStream tocout = new DataOutputStream(toc); + + out.writeInt(d.indices.size()); + long tocPos = out.getFilePointer(); + out.seek(tocPos + d.indices.size() * 8 + 8); + for (Index idx : d.indices) { + // create pruned index for skipHtml feature + int[] prunedRowIdx = null; + int prunedSize = 0; + if (skipHtml) { + prunedRowIdx = new int[idx.rows.size()]; + for (int i = 0; i < idx.rows.size(); i++) { + final RowBase r = idx.rows.get(i); + // prune Html entries + boolean pruned = r instanceof HtmlEntry.Row; + prunedRowIdx[i] = pruned ? -1 : prunedSize; + if (!pruned) prunedSize++; + } + } + + long dataPos = out.getFilePointer(); + tocout.writeLong(dataPos); + + out.writeUTF(idx.shortName); + out.writeUTF(idx.longName); + out.writeUTF(idx.sortLanguage.getIsoCode()); + out.writeUTF(idx.normalizerRules); + out.writeBoolean(idx.swapPairEntries); + out.writeInt(idx.mainTokenCount); + writev6IndexEntries(out, idx.sortedIndexEntries, prunedRowIdx); + + // write stoplist, serializing the whole Set *shudder* + // Actually just emulate ObjectOutputStream serialization + final byte[] hashSetSerialized = { + (byte)0xac, (byte)0xed, // magic + 0x00, 0x05, // version + 0x73, // object + 0x72, // class + // "java.util.HashSet" + 0x00, 0x11, 0x6a, 0x61, 0x76, 0x61, 0x2e, 0x75, 0x74, 0x69, + 0x6c, 0x2e, 0x48, 0x61, 0x73, 0x68, 0x53, 0x65, 0x74, + // serialization ID + (byte)0xba, 0x44, (byte)0x85, (byte)0x95, (byte)0x96, (byte)0xb8, (byte)0xb7, 0x34, + 0x03, // flags: serialized, custom serialization function + 0x00, 0x00, // fields count + 0x78, // blockdata end + 0x70, // null (superclass) + 0x77, 0x0c // blockdata short, 0xc bytes + }; + int stoplistlen = hashSetSerialized.length; + stoplistlen += 12; // block data: capacity (int), load factor (float), size (int) + for (String s : idx.stoplist) { + stoplistlen += 3 + s.length(); + } + stoplistlen++; + + DataOutputStream outb = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(out.getFD()))); + outb.writeInt(stoplistlen); + outb.write(hashSetSerialized); + outb.writeInt(idx.stoplist.size()); // capacity + outb.writeFloat(0.75f); // load factor + outb.writeInt(idx.stoplist.size()); // size + for (String s : idx.stoplist) { + outb.writeByte(0x74); // String type + outb.writeUTF(s); + } + outb.writeByte(0x78); // blockdata end + + outb.writeInt(skipHtml ? prunedSize : idx.rows.size()); + outb.writeInt(5); + for (RowBase r : idx.rows) { + int type = 0; + if (r instanceof PairEntry.Row) { + type = 0; + } else if (r instanceof TokenRow) { + final TokenRow tokenRow = (TokenRow)r; + type = tokenRow.hasMainEntry ? 1 : 3; + } else if (r instanceof TextEntry.Row) { + type = 2; + } else if (r instanceof HtmlEntry.Row) { + type = 4; + if (skipHtml) continue; + } else { + throw new RuntimeException("Row type not supported for v6"); + } + outb.writeByte(type); + outb.writeInt(r.referenceIndex); + } + outb.flush(); + } + long dataPos = out.getFilePointer(); + tocout.writeLong(dataPos); + tocout.close(); + + out.seek(tocPos); + out.write(toc.toByteArray()); + out.seek(dataPos); + } + + public void writev6(RandomAccessFile raf, boolean skipHtml) throws IOException { + raf.writeInt(6); + raf.writeLong(d.creationMillis); + raf.writeUTF(d.dictInfo); + System.out.println("sources start: " + raf.getFilePointer()); + writev6Sources(raf); + System.out.println("pair start: " + raf.getFilePointer()); + writev6PairEntries(raf); + System.out.println("text start: " + raf.getFilePointer()); + writev6TextEntries(raf); + System.out.println("html index start: " + raf.getFilePointer()); + if (skipHtml) writev6EmptyList(raf); + else writev6HtmlEntries(raf); + System.out.println("indices start: " + raf.getFilePointer()); + writev6Index(raf, skipHtml); + System.out.println("end: " + raf.getFilePointer()); + raf.writeUTF("END OF DICTIONARY"); + } +} diff --git a/src/com/hughes/android/dictionary/engine/IndexBuilder.java b/src/com/hughes/android/dictionary/engine/IndexBuilder.java index e7e1b43..2db537b 100644 --- a/src/com/hughes/android/dictionary/engine/IndexBuilder.java +++ b/src/com/hughes/android/dictionary/engine/IndexBuilder.java @@ -14,16 +14,7 @@ package com.hughes.android.dictionary.engine; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.EnumMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.SortedMap; -import java.util.TreeMap; +import java.util.*; import com.hughes.android.dictionary.engine.Index.IndexEntry; import com.hughes.android.dictionary.parser.DictFileParser; @@ -34,17 +25,19 @@ public class IndexBuilder { public final Index index; final Set stoplist; - final SortedMap tokenToData; + final Map fastTokenToData; + final SortedMap tokenToData; IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language, final String normalizerRules, final Set stoplist, final boolean swapPairEntries) { this.dictionaryBuilder = dictionaryBuilder; index = new Index(dictionaryBuilder.dictionary, shortName, longName, language, normalizerRules, swapPairEntries, stoplist); - tokenToData = new TreeMap(index.getSortComparator()); + tokenToData = new TreeMap<>(new FastNormalizeComparator(index.getSortComparator())); + fastTokenToData = new HashMap<>(); this.stoplist = stoplist; } public void build() { - final Set tokenIndexedEntries = new HashSet(); + final Set tokenIndexedEntries = new HashSet<>(); final List rows = index.rows; index.mainTokenCount = 0; for (final TokenData tokenData : tokenToData.values()) { @@ -101,13 +94,8 @@ public class IndexBuilder { } } - final List entriesSortedByNumRows = new ArrayList(index.sortedIndexEntries); - Collections.sort(entriesSortedByNumRows, new Comparator() { - @Override - public int compare(IndexEntry object1, IndexEntry object2) { - return object2.numRows - object1.numRows; - } - }); + final List entriesSortedByNumRows = new ArrayList<>(index.sortedIndexEntries); + entriesSortedByNumRows.sort((object1, object2) -> object2.numRows - object1.numRows); System.out.println("Most common tokens:"); for (int i = 0; i < 50 && i < entriesSortedByNumRows.size(); ++i) { System.out.println(" " + entriesSortedByNumRows.get(i)); @@ -117,10 +105,10 @@ public class IndexBuilder { public static class TokenData { final String token; - final Map> typeToEntries = new EnumMap>(EntryTypeName.class); + final Map> typeToEntries = new EnumMap<>(EntryTypeName.class); public boolean hasMainEntry = false; - public List htmlEntries = new ArrayList(); + public final List htmlEntries = new ArrayList<>(); TokenData(final String token) { assert token.equals(token.trim()); @@ -130,11 +118,16 @@ public class IndexBuilder { } public TokenData getOrCreateTokenData(final String token) { - TokenData tokenData = tokenToData.get(token); - if (tokenData == null) { - tokenData = new TokenData(token); - tokenToData.put(token, tokenData); + TokenData tokenData = fastTokenToData.get(token); + if (tokenData != null) return tokenData; + tokenData = new TokenData(token); + final FastCompareString c = new FastCompareString(token); + if (tokenToData.put(c, tokenData) != null) { + // The parallel HashMap assumes that the TreeMap Comparator + // is compatible with the equals it uses to compare. + throw new RuntimeException("TokenData TreeMap and HashMap out of sync, Comparator may be broken?"); } + fastTokenToData.put(token, tokenData); return tokenData; } @@ -145,7 +138,7 @@ public class IndexBuilder { tokenData.hasMainEntry = true; } if (entries == null) { - entries = new ArrayList(); + entries = new ArrayList<>(); tokenData.typeToEntries.put(entryTypeName, entries); } return entries; diff --git a/src/com/hughes/android/dictionary/engine/LanguageTest.java b/src/com/hughes/android/dictionary/engine/LanguageTest.java index 24fe094..be787eb 100644 --- a/src/com/hughes/android/dictionary/engine/LanguageTest.java +++ b/src/com/hughes/android/dictionary/engine/LanguageTest.java @@ -21,12 +21,12 @@ import java.util.LinkedHashSet; import java.util.List; import java.util.Set; -import junit.framework.TestCase; - import com.hughes.android.dictionary.parser.DictFileParser; import com.hughes.android.dictionary.parser.wiktionary.WiktionaryLangs; import com.ibm.icu.text.Transliterator; +import junit.framework.TestCase; + public class LanguageTest extends TestCase { public void testGermanSort() { @@ -73,10 +73,10 @@ public class LanguageTest extends TestCase { assertEquals("hulle", normalizer.transform("Hulle")); - final List sorted = new ArrayList(words); + final List sorted = new ArrayList<>(words); // Collections.shuffle(shuffled, new Random(0)); - Collections.sort(sorted, comparator); - System.out.println(sorted.toString()); + sorted.sort(comparator); + System.out.println(sorted); for (int i = 0; i < words.size(); ++i) { System.out.println(words.get(i) + "\t" + sorted.get(i)); assertEquals(words.get(i), sorted.get(i)); @@ -92,9 +92,9 @@ public class LanguageTest extends TestCase { "preppy", "preprocess"); - final List sorted = new ArrayList(words); + final List sorted = new ArrayList<>(words); final NormalizeComparator comparator = new NormalizeComparator(normalizer, Language.en.getCollator(), 7); - Collections.sort(sorted, comparator); + sorted.sort(comparator); for (int i = 0; i < words.size(); ++i) { if (i > 0) { assertTrue(comparator.compare(words.get(i-1), words.get(i)) < 0); @@ -183,8 +183,8 @@ public class LanguageTest extends TestCase { public void testEnWiktionaryNames() { - final Set enLangs = new LinkedHashSet(WiktionaryLangs.isoCodeToEnWikiName.keySet()); - final List names = new ArrayList(); + final Set enLangs = new LinkedHashSet<>(WiktionaryLangs.isoCodeToEnWikiName.keySet()); + final List names = new ArrayList<>(); for (final String code : WiktionaryLangs.isoCodeToEnWikiName.keySet()) { names.add(WiktionaryLangs.isoCodeToEnWikiName.get(code)); enLangs.add(code.toLowerCase()); diff --git a/src/com/hughes/android/dictionary/engine/ReadAheadBuffer.java b/src/com/hughes/android/dictionary/engine/ReadAheadBuffer.java index d4b3ab5..b38ee2b 100644 --- a/src/com/hughes/android/dictionary/engine/ReadAheadBuffer.java +++ b/src/com/hughes/android/dictionary/engine/ReadAheadBuffer.java @@ -14,8 +14,8 @@ package com.hughes.android.dictionary.engine; -import java.io.InputStream; import java.io.IOException; +import java.io.InputStream; import java.io.PipedInputStream; import java.io.PipedOutputStream; @@ -24,29 +24,24 @@ public class ReadAheadBuffer extends PipedInputStream { public ReadAheadBuffer(InputStream in, int size) { super(size); assert size >= 2 * BLOCK_SIZE; - this.in = in; try { pipe = new PipedOutputStream(this); - buffer = new byte[BLOCK_SIZE]; - new Thread(new Runnable() { - public void run() { - int read; - try { - while ((read = in.read(buffer)) > 0) - { - pipe.write(buffer, 0, read); - pipe.flush(); - } - } catch (IOException e) {} - try { - pipe.close(); - } catch (IOException e) {} - } - }).start(); } catch (IOException e) {} + new Thread(() -> { + try { + int read; + final byte[] buffer = new byte[BLOCK_SIZE]; + while ((read = in.read(buffer)) > 0) + { + pipe.write(buffer, 0, read); + pipe.flush(); + } + } catch (IOException e) {} + try { + pipe.close(); + } catch (IOException e) {} + }).start(); } - InputStream in; PipedOutputStream pipe; - byte buffer[]; } diff --git a/src/com/hughes/android/dictionary/engine/Runner.java b/src/com/hughes/android/dictionary/engine/Runner.java new file mode 100644 index 0000000..b150613 --- /dev/null +++ b/src/com/hughes/android/dictionary/engine/Runner.java @@ -0,0 +1,38 @@ +// Copyright 2020 Reimar Döffinger. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package com.hughes.android.dictionary.engine; + +import java.util.Arrays; + +public class Runner { + public static void main(final String[] args) throws Exception { + if (args.length == 0) { + System.out.println("Specify WiktionarySplitter, DictionaryBuilder or ConvertToV6 as first argument"); + return; + } + String[] newargs = Arrays.copyOfRange(args, 1, args.length); + if (args[0].equals("WiktionarySplitter")) { + WiktionarySplitter.main(newargs); + } else if (args[0].equals("DictionaryBuilder")) { + DictionaryBuilder.main(newargs); + } else if (args[0].equals("ConvertToV6")) { + ConvertToV6.main(newargs); + } else if (args[0].equals("CheckDictionariesMain")) { + CheckDictionariesMain.main(newargs); + } else { + System.out.println("Unknown command '" + args[0] + "'. Use one of WiktionarySplitter, DictionaryBuilder, ConvertToV6 or CheckDictionariesMain instead."); + } + } +} diff --git a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java index 5935df8..9d51b78 100644 --- a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java +++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java @@ -20,35 +20,39 @@ import java.io.DataOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; -import java.io.InputStream; import java.io.IOException; +import java.io.InputStream; import java.io.OutputStream; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; -import org.apache.xerces.jaxp.SAXParserFactoryImpl; import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import com.hughes.android.dictionary.parser.wiktionary.WiktionaryLangs; -public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { +public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler implements Runnable { // The matches the whole line, otherwise regexes don't work well on French: // {{=uk=}} // Spanish has no initial headings, tried to also detect {{ES as such // with "^(\\{\\{ES|(=+)[^=]).*$" but that broke English. - static final Pattern headingStart = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE); + static final Pattern headingStartPattern = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE); + static final Pattern startSpanish = Pattern.compile("\\{\\{ES(\\|[^{}=]*)?}}"); - final Map> pathToSelectors = new LinkedHashMap>(); + final Map.Entry> pathToSelectorsEntry; List currentSelectors = null; StringBuilder titleBuilder; @@ -56,15 +60,28 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { StringBuilder currentBuilder = null; public static void main(final String[] args) throws Exception { - final WiktionarySplitter wiktionarySplitter = new WiktionarySplitter(); - wiktionarySplitter.go(); + boolean parallel = args.length > 0 && args[0].equals("parallel"); + final ExecutorService e = Executors.newCachedThreadPool(); + final Map> pathToSelectors = createSelectorsMap(); + for (final Map.Entry> pathToSelectorsEntry : pathToSelectors.entrySet()) { + final WiktionarySplitter wiktionarySplitter = new WiktionarySplitter(pathToSelectorsEntry); + if (parallel) { + e.submit(wiktionarySplitter); + } else wiktionarySplitter.go(); + } + e.shutdown(); } - private WiktionarySplitter() { + private WiktionarySplitter(final Map.Entry> pathToSelectorsEntry) { + this.pathToSelectorsEntry = pathToSelectorsEntry; + } + + private static Map> createSelectorsMap() { + final Map> pathToSelectors = new LinkedHashMap<>(); List selectors; for (final String code : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.keySet()) { //if (!code.equals("fr")) {continue;} - selectors = new ArrayList(); + selectors = new ArrayList<>(); pathToSelectors.put(String.format("data/inputs/%swiktionary-pages-articles.xml", code), selectors); for (final Map.Entry entry : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.get(code).entrySet()) { final String dir = String.format("data/inputs/wikiSplit/%s", code); @@ -72,13 +89,22 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { selectors.add(new Selector(String.format("%s/%s.data", dir, entry.getKey()), entry.getValue())); } } + return pathToSelectors; + } + + @Override + public void run() { + try { + go(); + } catch (Exception e) { + throw new RuntimeException(e); + } } private void go() throws Exception { - final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser(); + final SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); // Configure things. - for (final Map.Entry> pathToSelectorsEntry : pathToSelectors.entrySet()) { currentSelectors = pathToSelectorsEntry.getValue(); @@ -86,7 +112,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { OutputStream tmp = new FileOutputStream(selector.outFilename + ".gz"); tmp = new BufferedOutputStream(tmp); tmp = new CompressorStreamFactory().createCompressorOutputStream(CompressorStreamFactory.GZIP, tmp); - tmp = new WriteBuffer(tmp, 20 * 1024 * 1024); + tmp = new WriteBuffer(tmp, 1024 * 1024); selector.out = new DataOutputStream(tmp); } @@ -105,7 +131,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { parser.parse(new BufferedInputStream(in), this); } } catch (Exception e) { - System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString() + " of file " + pathToSelectorsEntry.getKey()); + System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder + " of file " + pathToSelectorsEntry.getKey()); throw e; } @@ -113,17 +139,15 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { for (final Selector selector : currentSelectors) { selector.out.close(); } - - } } String lastPageTitle = null; int pageCount = 0; - Pattern endPatterns[] = new Pattern[100]; + final Matcher[] endPatterns = new Matcher[100]; - private Pattern getEndPattern(int depth) { + private Matcher getEndPattern(int depth) { if (endPatterns[depth] == null) - endPatterns[depth] = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE); + endPatterns[depth] = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE).matcher(""); return endPatterns[depth]; } @@ -152,6 +176,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { title.startsWith("Template:") || title.startsWith("Summary:") || title.startsWith("Module:") || + title.startsWith("Reconstruction:") || // DE title.startsWith("Datei:") || title.startsWith("Verzeichnis:") || @@ -160,6 +185,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { title.startsWith("Kategorie:") || title.startsWith("Hilfe:") || title.startsWith("Reim:") || + title.startsWith("Modul:") || // FR: title.startsWith("Annexe:") || title.startsWith("Catégori:") || @@ -169,16 +195,20 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { title.startsWith("Aide:") || title.startsWith("Fichier:") || title.startsWith("Wiktionnaire:") || + title.startsWith("Translations:Wiktionnaire:") || + title.startsWith("Translations:Projet:") || title.startsWith("Catégorie:") || title.startsWith("Portail:") || title.startsWith("utiliusateur:") || title.startsWith("Kategorio:") || + title.startsWith("Tutoriel:") || // IT title.startsWith("Wikizionario:") || title.startsWith("Appendice:") || title.startsWith("Categoria:") || title.startsWith("Aiuto:") || title.startsWith("Portail:") || + title.startsWith("Modulo:") || // ES title.startsWith("Apéndice:") || title.startsWith("Archivo:") || @@ -195,39 +225,40 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { title.startsWith("Predefinição:") || title.startsWith("Vocabulário:") || title.startsWith("Wikcionário:") || + title.startsWith("Módulo:") || // sentinel false ) return; - if (!title.startsWith("Sign gloss:")) { + // leave the Flexion: pages in for now and do not warn about them + if (!title.startsWith("Sign gloss:") && !title.startsWith("Flexion:")) { System.err.println("title with colon: " + title); } } String text = textBuilder.toString(); // Workaround for Spanish wiktionary {{ES}} and {{ES|word}} patterns - text = text.replaceAll("\\{\\{ES(\\|[^{}=]*)?}}", "== {{lengua|es}} =="); + text = startSpanish.matcher(text).replaceAll("== {{lengua|es}} =="); String translingual = ""; int start = 0; - final Matcher startMatcher = headingStart.matcher(text); + Matcher headingStart = headingStartPattern.matcher(text); while (start < text.length()) { // Find start. - if (!startMatcher.find(start)) { + if (!headingStart.find(start)) { return; } - start = startMatcher.end(); + start = headingStart.end(); - final String heading = startMatcher.group(); + final String heading = headingStart.group(); // For Translingual entries just store the text for later // use in the per-language sections - if (heading.indexOf("Translingual") != -1) { + if (heading.contains("Translingual")) { // Find end. - final int depth = startMatcher.group(1).length(); - final Pattern endPattern = getEndPattern(depth); + final int depth = headingStart.group(1).length(); + final Matcher endMatcher = getEndPattern(depth).reset(text); - final Matcher endMatcher = endPattern.matcher(text); if (endMatcher.find(start)) { int end = endMatcher.start(); translingual = text.substring(start, end); @@ -237,12 +268,11 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { } for (final Selector selector : currentSelectors) { - if (selector.pattern.matcher(heading).find()) { + if (selector.pattern.reset(heading).find()) { // Find end. - final int depth = startMatcher.group(1).length(); - final Pattern endPattern = getEndPattern(depth); + final int depth = headingStart.group(1).length(); + final Matcher endMatcher = getEndPattern(depth).reset(text); - final Matcher endMatcher = endPattern.matcher(text); final int end; if (endMatcher.find(start)) { end = endMatcher.start(); @@ -259,13 +289,13 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { sectionText.charAt(dummy_end + 1) == '\n') ++dummy_end; sectionText = sectionText.substring(dummy_end); } - if (heading.indexOf("Japanese") == -1) sectionText += translingual; + if (!heading.contains("Japanese")) sectionText += translingual; final Section section = new Section(title, heading, sectionText); try { selector.out.writeUTF(section.title); selector.out.writeUTF(section.heading); - final byte[] bytes = section.text.getBytes("UTF8"); + final byte[] bytes = section.text.getBytes(StandardCharsets.UTF_8); selector.out.writeInt(bytes.length); selector.out.write(bytes); } catch (IOException e) { @@ -300,13 +330,13 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { static class Selector { final String outFilename; - final Pattern pattern; + final Matcher pattern; DataOutputStream out; public Selector(final String filename, final String pattern) { this.outFilename = filename; - this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE); + this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE).matcher(""); } } @@ -329,15 +359,14 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { } @Override - public void characters(char[] ch, int start, int length) throws SAXException { + public void characters(char[] ch, int start, int length) { if (currentBuilder != null) { currentBuilder.append(ch, start, length); } } @Override - public void endElement(String uri, String localName, String qName) - throws SAXException { + public void endElement(String uri, String localName, String qName) { currentBuilder = null; if ("page".equals(qName)) { endPage(); @@ -346,7 +375,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { public void parse(final File file) throws ParserConfigurationException, SAXException, IOException { - final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser(); + final SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); parser.parse(file, this); } diff --git a/src/com/hughes/android/dictionary/engine/WriteBuffer.java b/src/com/hughes/android/dictionary/engine/WriteBuffer.java index c68264e..09e2655 100644 --- a/src/com/hughes/android/dictionary/engine/WriteBuffer.java +++ b/src/com/hughes/android/dictionary/engine/WriteBuffer.java @@ -14,35 +14,33 @@ package com.hughes.android.dictionary.engine; -import java.io.OutputStream; import java.io.IOException; +import java.io.OutputStream; import java.io.PipedInputStream; import java.io.PipedOutputStream; public class WriteBuffer extends PipedOutputStream { - static int BLOCK_SIZE = 1024 * 1024; + static int BLOCK_SIZE = 256 * 1024; public WriteBuffer(OutputStream out, int size) { assert size >= 2 * BLOCK_SIZE; this.out = out; try { pipe = new PipedInputStream(this, size); buffer = new byte[BLOCK_SIZE]; - writeThread = new Thread(new Runnable() { - public void run() { - int read; - try { - while ((read = pipe.read(buffer)) > 0) - { - out.write(buffer, 0, read); - out.flush(); - } - } catch (IOException e) { - System.out.println("Error writing to file " + e); + writeThread = new Thread(() -> { + int read; + try { + while ((read = pipe.read(buffer)) > 0) + { + out.write(buffer, 0, read); + out.flush(); } - try { - out.close(); - } catch (IOException e) {} + } catch (IOException e) { + System.out.println("Error writing to file " + e); } + try { + out.close(); + } catch (IOException e) {} }); writeThread.start(); } catch (IOException e) {} @@ -61,5 +59,5 @@ public class WriteBuffer extends PipedOutputStream { Thread writeThread; OutputStream out; PipedInputStream pipe; - byte buffer[]; + byte[] buffer; } diff --git a/src/com/hughes/android/dictionary/parser/DictFileParser.java b/src/com/hughes/android/dictionary/parser/DictFileParser.java index 07d0775..e9c6180 100644 --- a/src/com/hughes/android/dictionary/parser/DictFileParser.java +++ b/src/com/hughes/android/dictionary/parser/DictFileParser.java @@ -23,7 +23,6 @@ import java.nio.charset.Charset; import java.util.Arrays; import java.util.Collections; import java.util.LinkedHashSet; -import java.util.List; import java.util.Set; import java.util.logging.Logger; import java.util.regex.Matcher; @@ -31,12 +30,11 @@ import java.util.regex.Pattern; import com.hughes.android.dictionary.engine.DictionaryBuilder; import com.hughes.android.dictionary.engine.EntrySource; -import com.hughes.android.dictionary.engine.IndexedEntry; import com.hughes.android.dictionary.engine.EntryTypeName; import com.hughes.android.dictionary.engine.IndexBuilder; +import com.hughes.android.dictionary.engine.IndexedEntry; import com.hughes.android.dictionary.engine.Language; import com.hughes.android.dictionary.engine.PairEntry; -import com.hughes.android.dictionary.engine.PairEntry.Pair; public class DictFileParser implements Parser { @@ -153,7 +151,7 @@ public class DictFileParser implements Parser { if (subfields[1][i].length() == 0) { subfields[1][i] = "__"; } - pairEntry.pairs.add(new Pair(subfields[0][i], subfields[1][i])); + pairEntry.pairs.add(new PairEntry.Pair(subfields[0][i], subfields[1][i])); } final IndexedEntry entryData = new IndexedEntry(pairEntry); entryData.isValid = true; @@ -295,9 +293,9 @@ public class DictFileParser implements Parser { return field; } - public static final Set tokenize(final String text, final Pattern pattern) { + public static Set tokenize(final String text, final Pattern pattern) { final String[] split = pattern.split(text); - final Set result = new LinkedHashSet(Arrays.asList(split)); + final Set result = new LinkedHashSet<>(Arrays.asList(split)); result.remove(""); return result; } diff --git a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java index 7212319..ca0193a 100644 --- a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java +++ b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java @@ -14,16 +14,13 @@ package com.hughes.android.dictionary.parser; -import java.util.ArrayList; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; public final class WikiTokenizer { - public static interface Callback { + public interface Callback { void onPlainText(final String text); void onMarkup(WikiTokenizer wikiTokenizer); void onWikiLink(WikiTokenizer wikiTokenizer); @@ -77,7 +74,7 @@ public final class WikiTokenizer { } //private static final Pattern wikiTokenEvent = Pattern.compile("($)", Pattern.MULTILINE); - private static final Pattern wikiTokenEvent = Pattern.compile("(" + + private static final Pattern wikiTokenEvent = Pattern.compile( "\\{\\{|\\}\\}|" + "\\[\\[|\\]\\]|" + "\\||" + // Need the | because we might have to find unescaped pipes @@ -87,7 +84,7 @@ public final class WikiTokenizer { "
|" +
             "|" +
             "|" +
-            "$)", Pattern.MULTILINE);
+            "\n", Pattern.MULTILINE);
     private static final String listChars = "*#:;";
 
 
@@ -99,8 +96,8 @@ public final class WikiTokenizer {
     int end = 0;
     int start = -1;
 
-    final List errors = new ArrayList();
-    final List tokenStack = new ArrayList();
+    final List errors = new ArrayList<>();
+    final List tokenStack = new ArrayList<>();
 
 
     private String headingWikiText;
@@ -116,8 +113,8 @@ public final class WikiTokenizer {
 
     private int lastUnescapedPipePos;
     private int lastUnescapedEqualsPos;
-    private final List positionArgs = new ArrayList();
-    private final Map namedArgs = new LinkedHashMap();
+    private final List positionArgs = new ArrayList<>();
+    private final Map namedArgs = new LinkedHashMap<>();
 
 
     public WikiTokenizer(final String wikiText) {
@@ -126,6 +123,7 @@ public final class WikiTokenizer {
 
     public WikiTokenizer(String wikiText, final boolean isNewline) {
         wikiText = wikiText.replace('\u2028', '\n');
+        wikiText = wikiText.replace('\u2029', '\n');
         wikiText = wikiText.replace('\u0085', '\n');
         this.wikiText = wikiText;
         this.matcher = wikiTokenEvent.matcher(wikiText);
@@ -153,7 +151,7 @@ public final class WikiTokenizer {
         namedArgs.clear();
     }
 
-    private static final Pattern POSSIBLE_WIKI_TEXT = Pattern.compile(
+    private static final Matcher POSSIBLE_WIKI_TEXT = Pattern.compile(
                 "\\{\\{|" +
                 "\\[\\[|" +
                 "", matchStart);
                 if (end == -1) {
                     errors.add("Unmatched