gitweb.fperrin.net Git - DictionaryPC.git/commitdiff
Merge pull request #3 from zorun/swedish
author Reimar Döffinger <rdoeffinger@users.noreply.github.com>
Sun, 20 Dec 2020 09:55:20 +0000 (10:55 +0100)
committer GitHub <noreply@github.com>
Sun, 20 Dec 2020 09:55:20 +0000 (10:55 +0100)
Add French-Swedish dictionary support.

50 files changed:
.classpath
.gitignore
WiktionarySplitter.sh
compile.sh
convert_to_v6.sh [new file with mode: 0755]
genv6.sh [new file with mode: 0755]
googlecode_upload.py [deleted file]
jars/commons-lang3-3.1.jar [deleted file]
jars/xerces-2_11_0/xercesImpl.jar [deleted file]
native-image-reflection.json [new file with mode: 0644]
native-image.cmd [new file with mode: 0755]
native-image.sh [new file with mode: 0755]
run.sh
src/com/hughes/android/dictionary/CollatorWrapper.java
src/com/hughes/android/dictionary/DateFormatTest.java
src/com/hughes/android/dictionary/DictionaryApplication.java
src/com/hughes/android/dictionary/FeatureConfig.java
src/com/hughes/android/dictionary/SerializeCollatorTest.java
src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java
src/com/hughes/android/dictionary/engine/ConvertToV6.java [new file with mode: 0644]
src/com/hughes/android/dictionary/engine/DictionaryBuilder.java
src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java
src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java
src/com/hughes/android/dictionary/engine/DictionaryTest.java
src/com/hughes/android/dictionary/engine/DictionaryV6Writer.java [new file with mode: 0644]
src/com/hughes/android/dictionary/engine/IndexBuilder.java
src/com/hughes/android/dictionary/engine/LanguageTest.java
src/com/hughes/android/dictionary/engine/ReadAheadBuffer.java
src/com/hughes/android/dictionary/engine/Runner.java [new file with mode: 0644]
src/com/hughes/android/dictionary/engine/WiktionarySplitter.java
src/com/hughes/android/dictionary/engine/WriteBuffer.java
src/com/hughes/android/dictionary/parser/DictFileParser.java
src/com/hughes/android/dictionary/parser/WikiTokenizer.java
src/com/hughes/android/dictionary/parser/WikiTokenizerTest.java
src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java
src/com/hughes/android/dictionary/parser/wiktionary/DeFunctionCallbacks.java
src/com/hughes/android/dictionary/parser/wiktionary/EnForeignParser.java
src/com/hughes/android/dictionary/parser/wiktionary/EnFunctionCallbacks.java
src/com/hughes/android/dictionary/parser/wiktionary/EnParser.java
src/com/hughes/android/dictionary/parser/wiktionary/EnToTranslationParser.java
src/com/hughes/android/dictionary/parser/wiktionary/EnTranslationToTranslationParser.java
src/com/hughes/android/dictionary/parser/wiktionary/FrFunctionCallbacks.java
src/com/hughes/android/dictionary/parser/wiktionary/ItFunctionCallbacks.java
src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java
src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java
src/com/hughes/util/Args.java
src/com/hughes/util/EnumUtil.java
src/com/hughes/util/FileUtil.java
src/com/hughes/util/MapUtil.java
update_dict_list.sh

diff --git a/.classpath b/.classpath
index 60b221e67987b5895fca74585dac129286ba019d..96fa3642f1494a62fa10a8aa21ceb5f3eb3ab1f1 100755 (executable)
@@ -1,10 +1,17 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <classpath>
        <classpathentry kind="src" path="src"/>
-       <classpathentry combineaccessrules="false" kind="src" path="/Dictionary"/>
-       <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
+       <classpathentry including="com/hughes/android/dictionary/DictionaryInfo.java|com/hughes/android/dictionary/engine/" kind="src" path="Dictionary/src"/>
+       <classpathentry kind="src" path="Dictionary/Util/src"/>
+       <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER">
+               <attributes>
+                       <attribute name="module" value="true"/>
+                       <attribute name="limit-modules" value="java.xml,java.logging"/>
+               </attributes>
+       </classpathentry>
        <classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
-       <classpathentry kind="lib" path="jars/xerces-2_11_0/xercesImpl.jar"/>
-       <classpathentry kind="lib" path="jars/commons-lang3-3.1.jar"/>
+       <classpathentry kind="lib" path="/usr/share/java/icu4j-49.1.jar"/>
+       <classpathentry kind="lib" path="/usr/share/java/commons-text.jar"/>
+       <classpathentry kind="lib" path="/usr/share/java/commons-compress.jar"/>
        <classpathentry kind="output" path="bin"/>
 </classpath>
diff --git a/.gitignore b/.gitignore
index 287cc93e6762676d75cb896d74389de9ade476e5..38a503e23f5f2e2082bc8682f7d493febfbbadae 100644 (file)
@@ -1,5 +1,5 @@
-dictInputs
-dictOutputs/
+data/inputs/
+data/outputs/
 bin
 wikiSplit
 wikiSplit_2011
@@ -7,4 +7,3 @@ wikiSplit_201106
 wikiSplit_201111
 .project
 .settings/
-*.class
diff --git a/WiktionarySplitter.sh b/WiktionarySplitter.sh
index 705bf2b946948d483da041ede73fed96239e1281..66d38a23cfe8d20309a38b436121fd0cd8002099 100755 (executable)
@@ -1,10 +1,12 @@
 # Run after downloading (data/downloadInputs.sh) to generate
 # per-language data files from enwiktionary.
-ICU4J=/usr/share/java/icu4j-49.1.jar
-test -r "$ICU4J" || ICU4J=/usr/share/icu4j-55/lib/icu4j.jar
-XERCES=/usr/share/java/xercesImpl.jar
-test -r "$XERCES" || XERCES=/usr/share/xerces-2/lib/xercesImpl.jar
-COMMONS_COMPRESS=/usr/share/java/commons-compress.jar
-JAVA=/usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
-test -x "$JAVA" || JAVA=java
-"$JAVA" -Xmx4096m -Xverify:none -classpath src:../Util/src/:../Dictionary/src/:"$ICU4J":"$XERCES":"$COMMONS_COMPRESS" com.hughes.android.dictionary.engine.WiktionarySplitter "$@"
+RUNNER=./DictionaryPC
+if ! test -x "$RUNNER" ; then
+  ICU4J=/usr/share/java/icu4j-49.1.jar
+  test -r "$ICU4J" || ICU4J=/usr/share/icu4j-55/lib/icu4j.jar
+  COMMONS_COMPRESS=/usr/share/java/commons-compress.jar
+  JAVA=/usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
+  test -x "$JAVA" || JAVA=java
+  RUNNER="$JAVA -Xmx4096m -Xverify:none -classpath bin/:$ICU4J:$COMMONS_COMPRESS com.hughes.android.dictionary.engine.Runner"
+fi
+$RUNNER WiktionarySplitter "$@"
diff --git a/compile.sh b/compile.sh
index 7b0021f4b9f7f6ffa2201a487ebccad7b51ca145..0780effa987277498dbc431ecfe0dad39a95c226 100755 (executable)
@@ -2,10 +2,7 @@ ICU4J=/usr/share/java/icu4j-49.1.jar
 test -r "$ICU4J" || ICU4J=/usr/share/icu4j-55/lib/icu4j.jar
 JUNIT=/usr/share/java/junit.jar
 test -r "$JUNIT" || JUNIT=/usr/share/junit/lib/junit.jar
-XERCES=/usr/share/java/xercesImpl.jar
-test -r "$XERCES" || XERCES=/usr/share/xerces-2/lib/xercesImpl.jar
-COMMONS=/usr/share/java/commons-lang3.jar
-test -r "$COMMONS" || COMMONS=/usr/share/commons-lang-3.3/lib/commons-lang.jar
+COMMONS=/usr/share/java/commons-text.jar
 COMMONS_COMPRESS=/usr/share/java/commons-compress.jar
 if [ ! -x ../Dictionary ] ; then
     echo "You need to clone the Dictionary repository (including subprojects) into .."
@@ -19,10 +16,6 @@ if [ ! -r "$JUNIT" ] ; then
     echo "Junit needs to be installed"
     exit 1;
 fi
-if [ ! -r "$XERCES" ] ; then
-    echo "Xerces needs to be installed"
-    exit 1;
-fi
 if [ ! -r "$COMMONS" ] ; then
     echo "commons-lang needs to be installed"
     exit 1;
@@ -31,4 +24,8 @@ if [ ! -r "$COMMONS_COMPRESS" ] ; then
     echo "commons-compress needs to be installed"
     exit 1;
 fi
-javac -g ../Dictionary/Util/src/com/hughes/util/*.java ../Dictionary/Util/src/com/hughes/util/raf/*.java ../Dictionary/src/com/hughes/android/dictionary/DictionaryInfo.java ../Dictionary/src/com/hughes/android/dictionary/engine/*.java ../Dictionary/src/com/hughes/android/dictionary/C.java src/com/hughes/util/*.java src/com/hughes/android/dictionary/*.java src/com/hughes/android/dictionary/*/*.java src/com/hughes/android/dictionary/*/*/*.java -classpath "$ICU4J:$JUNIT:$XERCES:$COMMONS:$COMMONS_COMPRESS"
+mkdir -p bin
+# -encoding is just a workaround for users that still run systems
+# with non-UTF-8 locales
+# Limit to Java 11 for compatibility with native-image
+javac --source 11 --target 11 --limit-modules java.xml,java.logging -Xlint:all -encoding UTF-8 -g -d bin/ ../Dictionary/Util/src/com/hughes/util/*.java ../Dictionary/Util/src/com/hughes/util/raf/*.java ../Dictionary/src/com/hughes/android/dictionary/DictionaryInfo.java ../Dictionary/src/com/hughes/android/dictionary/engine/*.java ../Dictionary/src/com/hughes/android/dictionary/C.java src/com/hughes/util/*.java src/com/hughes/android/dictionary/*.java src/com/hughes/android/dictionary/*/*.java src/com/hughes/android/dictionary/*/*/*.java -classpath "$ICU4J:$JUNIT:$COMMONS:$COMMONS_COMPRESS"
diff --git a/convert_to_v6.sh b/convert_to_v6.sh
new file mode 100755 (executable)
index 0000000..2443c1c
--- /dev/null
+++ b/convert_to_v6.sh
@@ -0,0 +1,7 @@
+RUNNER=./DictionaryPC
+if ! test -x "$RUNNER" ; then
+  JAVA=/usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
+  test -x "$JAVA" || JAVA=java
+  RUNNER="$JAVA -classpath bin/ com.hughes.android.dictionary.engine.Runner"
+fi
+$RUNNER ConvertToV6 "$@"
diff --git a/genv6.sh b/genv6.sh
new file mode 100755 (executable)
index 0000000..9bb5147
--- /dev/null
+++ b/genv6.sh
@@ -0,0 +1,20 @@
+set -e
+rm -rf data/outputsv6
+mkdir data/outputsv6
+for i in data/outputs/*.quickdic ; do
+    o=data/outputsv6/$(basename "$i")
+    ./convert_to_v6.sh "$i" "$o"
+    7z a -mx=9 "$o".v006.zip "$o"
+    rm "$o"
+    # skipHtml makes no sense for single-language dictionaries
+    if echo "$o" | grep -q '-' ; then
+        if ./convert_to_v6.sh "$i" "$o" skipHtmlOpt ; then
+            7z a -mx=9 "$o".small.v006.zip "$o"
+            rm "$o"
+        elif [ $? -ne 3 ] ; then
+            # Check for magic 3 indicating "no HTML entries in dictionary"
+            echo "Converting dictionary failed!"
+            exit 1
+        fi
+    fi
+done
diff --git a/googlecode_upload.py b/googlecode_upload.py
deleted file mode 100755 (executable)
index d2d5f97..0000000
--- a/googlecode_upload.py
+++ /dev/null
@@ -1,248 +0,0 @@
-#!/usr/bin/env python
-#
-# Copyright 2006, 2007 Google Inc. All Rights Reserved.
-# Author: danderson@google.com (David Anderson)
-#
-# Script for uploading files to a Google Code project.
-#
-# This is intended to be both a useful script for people who want to
-# streamline project uploads and a reference implementation for
-# uploading files to Google Code projects.
-#
-# To upload a file to Google Code, you need to provide a path to the
-# file on your local machine, a small summary of what the file is, a
-# project name, and a valid account that is a member or owner of that
-# project.  You can optionally provide a list of labels that apply to
-# the file.  The file will be uploaded under the same name that it has
-# in your local filesystem (that is, the "basename" or last path
-# component).  Run the script with '--help' to get the exact syntax
-# and available options.
-#
-# Note that the upload script requests that you enter your
-# googlecode.com password.  This is NOT your Gmail account password!
-# This is the password you use on googlecode.com for committing to
-# Subversion and uploading files.  You can find your password by going
-# to http://code.google.com/hosting/settings when logged in with your
-# Gmail account. If you have already committed to your project's
-# Subversion repository, the script will automatically retrieve your
-# credentials from there (unless disabled, see the output of '--help'
-# for details).
-#
-# If you are looking at this script as a reference for implementing
-# your own Google Code file uploader, then you should take a look at
-# the upload() function, which is the meat of the uploader.  You
-# basically need to build a multipart/form-data POST request with the
-# right fields and send it to https://PROJECT.googlecode.com/files .
-# Authenticate the request using HTTP Basic authentication, as is
-# shown below.
-#
-# Licensed under the terms of the Apache Software License 2.0:
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# Questions, comments, feature requests and patches are most welcome.
-# Please direct all of these to the Google Code users group:
-#  http://groups.google.com/group/google-code-hosting
-
-"""Google Code file uploader script.
-"""
-
-__author__ = 'danderson@google.com (David Anderson)'
-
-import httplib
-import os.path
-import optparse
-import getpass
-import base64
-import sys
-
-
-def upload(file, project_name, user_name, password, summary, labels=None):
-  """Upload a file to a Google Code project's file server.
-
-  Args:
-    file: The local path to the file.
-    project_name: The name of your project on Google Code.
-    user_name: Your Google account name.
-    password: The googlecode.com password for your account.
-              Note that this is NOT your global Google Account password!
-    summary: A small description for the file.
-    labels: an optional list of label strings with which to tag the file.
-
-  Returns: a tuple:
-    http_status: 201 if the upload succeeded, something else if an
-                 error occured.
-    http_reason: The human-readable string associated with http_status
-    file_url: If the upload succeeded, the URL of the file on Google
-              Code, None otherwise.
-  """
-  # The login is the user part of user@gmail.com. If the login provided
-  # is in the full user@domain form, strip it down.
-  if user_name.endswith('@gmail.com'):
-    user_name = user_name[:user_name.index('@gmail.com')]
-
-  form_fields = [('summary', summary)]
-  if labels is not None:
-    form_fields.extend([('label', l.strip()) for l in labels])
-
-  content_type, body = encode_upload_request(form_fields, file)
-
-  upload_host = '%s.googlecode.com' % project_name
-  upload_uri = '/files'
-  auth_token = base64.b64encode('%s:%s'% (user_name, password))
-  headers = {
-    'Authorization': 'Basic %s' % auth_token,
-    'User-Agent': 'Googlecode.com uploader v0.9.4',
-    'Content-Type': content_type,
-    }
-
-  server = httplib.HTTPSConnection(upload_host)
-  server.request('POST', upload_uri, body, headers)
-  resp = server.getresponse()
-  server.close()
-
-  if resp.status == 201:
-    location = resp.getheader('Location', None)
-  else:
-    location = None
-  return resp.status, resp.reason, location
-
-
-def encode_upload_request(fields, file_path):
-  """Encode the given fields and file into a multipart form body.
-
-  fields is a sequence of (name, value) pairs. file is the path of
-  the file to upload. The file will be uploaded to Google Code with
-  the same file name.
-
-  Returns: (content_type, body) ready for httplib.HTTP instance
-  """
-  BOUNDARY = '----------Googlecode_boundary_reindeer_flotilla'
-  CRLF = '\r\n'
-
-  body = []
-
-  # Add the metadata about the upload first
-  for key, value in fields:
-    body.extend(
-      ['--' + BOUNDARY,
-       'Content-Disposition: form-data; name="%s"' % key,
-       '',
-       value,
-       ])
-
-  # Now add the file itself
-  file_name = os.path.basename(file_path)
-  f = open(file_path, 'rb')
-  file_content = f.read()
-  f.close()
-
-  body.extend(
-    ['--' + BOUNDARY,
-     'Content-Disposition: form-data; name="filename"; filename="%s"'
-     % file_name,
-     # The upload server determines the mime-type, no need to set it.
-     'Content-Type: application/octet-stream',
-     '',
-     file_content,
-     ])
-
-  # Finalize the form body
-  body.extend(['--' + BOUNDARY + '--', ''])
-
-  return 'multipart/form-data; boundary=%s' % BOUNDARY, CRLF.join(body)
-
-
-def upload_find_auth(file_path, project_name, summary, labels=None,
-                     user_name=None, password=None, tries=3):
-  """Find credentials and upload a file to a Google Code project's file server.
-
-  file_path, project_name, summary, and labels are passed as-is to upload.
-
-  Args:
-    file_path: The local path to the file.
-    project_name: The name of your project on Google Code.
-    summary: A small description for the file.
-    labels: an optional list of label strings with which to tag the file.
-    config_dir: Path to Subversion configuration directory, 'none', or None.
-    user_name: Your Google account name.
-    tries: How many attempts to make.
-  """
-
-  while tries > 0:
-    if user_name is None:
-      # Read username if not specified or loaded from svn config, or on
-      # subsequent tries.
-      sys.stdout.write('Please enter your googlecode.com username: ')
-      sys.stdout.flush()
-      user_name = sys.stdin.readline().rstrip()
-    if password is None:
-      # Read password if not loaded from svn config, or on subsequent tries.
-      print 'Please enter your googlecode.com password.'
-      print '** Note that this is NOT your Gmail account password! **'
-      print 'It is the password you use to access Subversion repositories,'
-      print 'and can be found here: http://code.google.com/hosting/settings'
-      password = getpass.getpass()
-
-    status, reason, url = upload(file_path, project_name, user_name, password,
-                                 summary, labels)
-    # Returns 403 Forbidden instead of 401 Unauthorized for bad
-    # credentials as of 2007-07-17.
-    if status in [httplib.FORBIDDEN, httplib.UNAUTHORIZED]:
-      # Rest for another try.
-      user_name = password = None
-      tries = tries - 1
-    else:
-      # We're done.
-      break
-
-  return status, reason, url
-
-
-def main():
-  parser = optparse.OptionParser(usage='googlecode-upload.py -s SUMMARY '
-                                 '-p PROJECT [options] FILE')
-  parser.add_option('-s', '--summary', dest='summary',
-                    help='Short description of the file')
-  parser.add_option('-p', '--project', dest='project',
-                    help='Google Code project name')
-  parser.add_option('-u', '--user', dest='user',
-                    help='Your Google Code username')
-  parser.add_option('-w', '--password', dest='password',
-                    help='Your Google Code password')
-  parser.add_option('-l', '--labels', dest='labels',
-                    help='An optional list of comma-separated labels to attach '
-                    'to the file')
-
-  options, args = parser.parse_args()
-
-  if not options.summary:
-    parser.error('File summary is missing.')
-  elif not options.project:
-    parser.error('Project name is missing.')
-  elif len(args) < 1:
-    parser.error('File to upload not provided.')
-  elif len(args) > 1:
-    parser.error('Only one file may be specified.')
-
-  file_path = args[0]
-
-  if options.labels:
-    labels = options.labels.split(',')
-  else:
-    labels = None
-
-  status, reason, url = upload_find_auth(file_path, options.project,
-                                         options.summary, labels,
-                                         options.user, options.password)
-  if url:
-    print 'The file was uploaded successfully.'
-    print 'URL: %s' % url
-    return 0
-  else:
-    print 'An error occurred. Your file was not uploaded.'
-    print 'Google Code upload server said: %s (%s)' % (reason, status)
-    return 1
-
-
-if __name__ == '__main__':
-  sys.exit(main())
diff --git a/jars/commons-lang3-3.1.jar b/jars/commons-lang3-3.1.jar
deleted file mode 100644 (file)
index a85e539..0000000
Binary files a/jars/commons-lang3-3.1.jar and /dev/null differ
diff --git a/jars/xerces-2_11_0/xercesImpl.jar b/jars/xerces-2_11_0/xercesImpl.jar
deleted file mode 100644 (file)
index 0aaa990..0000000
Binary files a/jars/xerces-2_11_0/xercesImpl.jar and /dev/null differ
diff --git a/native-image-reflection.json b/native-image-reflection.json
new file mode 100644 (file)
index 0000000..e86958e
--- /dev/null
+++ b/native-image-reflection.json
@@ -0,0 +1,8 @@
+[
+  {
+    "name": "com.ibm.icu.text.CollatorServiceShim",
+    "methods": [
+      { "name": "<init>", "parameterTypes": [] }
+    ]
+  }
+]
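
The reflection configuration above registers the no-argument constructor of com.ibm.icu.text.CollatorServiceShim, the class ICU4J constructs reflectively the first time collation is requested; without this entry a GraalVM native image would strip that constructor and collator creation would fail at run time. As an illustration only (not part of the commit), the code path that triggers the reflective load is simply asking ICU for a collator, assuming icu4j is on the classpath as in the build scripts:

// Illustrative probe, not part of this commit: requesting an ICU collator is
// what forces the reflective construction of CollatorServiceShim that
// native-image-reflection.json declares for ahead-of-time compilation.
import com.ibm.icu.text.Collator;
import java.util.Locale;

public class CollatorProbe {
    public static void main(String[] args) {
        // ICU resolves its collation service shim behind this call and
        // invokes its no-argument constructor.
        Collator collator = Collator.getInstance(Locale.GERMAN);
        System.out.println("strength=" + collator.getStrength());
    }
}
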
diff --git a/native-image.cmd b/native-image.cmd
new file mode 100755 (executable)
index 0000000..d0e0c6d
--- /dev/null
+++ b/native-image.cmd
@@ -0,0 +1,2 @@
+REM --allow-incomplete-classpath due to missing XZ implementation\r
+%GRAALVM_HOME%/bin/native-image --allow-incomplete-classpath --no-server -H:Name="DictionaryPC" com.hughes.android.dictionary.engine.Runner --no-fallback -cp bin/;commons-compress.jar;commons-text.jar;commons-lang3.jar;icu4j-49.1.jar -H:IncludeResources="com/ibm/icu/.*" -H:ReflectionConfigurationFiles=native-image-reflection.json\r
diff --git a/native-image.sh b/native-image.sh
new file mode 100755 (executable)
index 0000000..a332f2f
--- /dev/null
+++ b/native-image.sh
@@ -0,0 +1 @@
+"$GRAALVM_HOME"/bin/native-image --no-server -H:Name="DictionaryPC" com.hughes.android.dictionary.engine.Runner --no-fallback -cp bin/:/usr/share/java/commons-compress.jar:/usr/share/java/commons-text.jar:/usr/share/java/commons-lang3.jar:/usr/share/java/icu4j-49.1.jar -H:IncludeResources="com/ibm/icu/.*" -H:ReflectionConfigurationFiles=native-image-reflection.json
diff --git a/run.sh b/run.sh
index 12ea566019ed17c7d4378276f6575477709e0ba8..3a3c1d9b9e488ca254399dbc40f0c64212098bea 100755 (executable)
--- a/run.sh
+++ b/run.sh
@@ -1,11 +1,14 @@
-# -agentlib:hprof=heap=sites,depth=20
-ICU4J=/usr/share/java/icu4j-49.1.jar
-test -r "$ICU4J" || ICU4J=/usr/share/icu4j-55/lib/icu4j.jar
-XERCES=/usr/share/java/xercesImpl.jar
-test -r "$XERCES" || XERCES=/usr/share/xerces-2/lib/xercesImpl.jar
-COMMONS=/usr/share/java/commons-lang3.jar
-test -r "$COMMONS" || COMMONS=/usr/share/commons-lang-3.3/lib/commons-lang.jar
-COMMONS_COMPRESS=/usr/share/java/commons-compress.jar
-JAVA=/usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
-test -x "$JAVA" || JAVA=java
-"$JAVA" -Djava.util.logging.config.file="logging.properties" -Xmx4096m -classpath src:../Dictionary/Util/src/:../Dictionary/src/:"$ICU4J":"$XERCES":"$COMMONS":"$COMMONS_COMPRESS" com.hughes.android.dictionary.engine.DictionaryBuilder "$@"
+RUNNER=./DictionaryPC
+if ! test -x "$RUNNER" ; then
+  # -agentlib:hprof=heap=sites,depth=20
+  ICU4J=/usr/share/java/icu4j-49.1.jar
+  test -r "$ICU4J" || ICU4J=/usr/share/icu4j-55/lib/icu4j.jar
+  COMMONS_LANG3=/usr/share/java/commons-lang3.jar
+  test -r "$COMMONS_LANG3" || COMMONS_LANG3=/usr/share/commons-lang-3.3/lib/commons-lang.jar
+  COMMONS_TEXT=/usr/share/java/commons-text.jar
+  COMMONS_COMPRESS=/usr/share/java/commons-compress.jar
+  JAVA=/usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
+  test -x "$JAVA" || JAVA=java
+  RUNNER="$JAVA -Djava.util.logging.config.file=logging.properties -Xmx4096m -classpath bin/:$ICU4J:$COMMONS_LANG3:$COMMONS_TEXT:$COMMONS_COMPRESS com.hughes.android.dictionary.engine.Runner"
+fi
+$RUNNER DictionaryBuilder "$@"
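
All three wrapper scripts (WiktionarySplitter.sh, convert_to_v6.sh and run.sh above) now prefer a prebuilt ./DictionaryPC native image and fall back to the JVM otherwise, and in both cases they go through the new single entry point com.hughes.android.dictionary.engine.Runner, passing the tool name as the first argument. Runner.java is added by this commit but its diff is not shown in this excerpt; a minimal sketch of such a dispatcher, assuming it simply forwards to each tool's existing main method, might look like this:

// Hypothetical sketch only; the committed Runner.java may differ.
// Dispatch the first command-line argument to the matching tool and pass
// the remaining arguments through unchanged.
package com.hughes.android.dictionary.engine;

import java.util.Arrays;

public class RunnerSketch {
    public static void main(String[] args) throws Exception {
        if (args.length == 0) {
            System.out.println("Usage: Runner <DictionaryBuilder|WiktionarySplitter|ConvertToV6> [args...]");
            return;
        }
        final String[] rest = Arrays.copyOfRange(args, 1, args.length);
        switch (args[0]) {
        case "DictionaryBuilder":
            DictionaryBuilder.main(rest);
            break;
        case "WiktionarySplitter":
            WiktionarySplitter.main(rest);
            break;
        case "ConvertToV6":
            ConvertToV6.main(rest);
            break;
        default:
            throw new IllegalArgumentException("Unknown tool: " + args[0]);
        }
    }
}

An explicit switch rather than Class.forName would also keep such a dispatcher friendly to the native-image build above, which otherwise needs reflection configuration for every reachable main class.
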
diff --git a/src/com/hughes/android/dictionary/CollatorWrapper.java b/src/com/hughes/android/dictionary/CollatorWrapper.java
index 23736295ccb14d986f8ca22c01e5ca9ae6c01c6c..295847dbe51b001ca2dd43aebd5f41f83716c45e 100644 (file)
@@ -18,11 +18,11 @@ import java.util.Locale;
 
 import com.ibm.icu.text.Collator;
 
-final public class CollatorWrapper {
-static public Collator getInstance() {
+public final class CollatorWrapper {
+public static Collator getInstance() {
     return Collator.getInstance();
 }
-static public Collator getInstanceStrengthIdentical(Locale l) {
+public static Collator getInstanceStrengthIdentical(Locale l) {
     Collator c = Collator.getInstance(l);
     c.setStrength(Collator.IDENTICAL);
     return c;
diff --git a/src/com/hughes/android/dictionary/DateFormatTest.java b/src/com/hughes/android/dictionary/DateFormatTest.java
index fce209525089d281b22dfd22822d85260e0b8aae..8be638c6c39d13bab42b03539c539835d2fe0200 100644 (file)
@@ -1,29 +1,29 @@
-// Copyright 2011 Google Inc. All Rights Reserved.\r
-//\r
-// Licensed under the Apache License, Version 2.0 (the "License");\r
-// you may not use this file except in compliance with the License.\r
-// You may obtain a copy of the License at\r
-//\r
-//     http://www.apache.org/licenses/LICENSE-2.0\r
-//\r
-// Unless required by applicable law or agreed to in writing, software\r
-// distributed under the License is distributed on an "AS IS" BASIS,\r
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r
-// See the License for the specific language governing permissions and\r
-// limitations under the License.\r
-\r
-package com.hughes.android.dictionary;\r
-\r
-import java.text.SimpleDateFormat;\r
-import java.util.Date;\r
-\r
-public class DateFormatTest {\r
-\r
-    /**\r
-     * @param args\r
-     */\r
-    public static void main(String[] args) {\r
-        System.out.println(new SimpleDateFormat("yyyy.MM.dd HH:mm:ss").format(new Date()));\r
-    }\r
-\r
-}\r
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package com.hughes.android.dictionary;
+
+import java.text.SimpleDateFormat;
+import java.util.Date;
+
+public class DateFormatTest {
+
+    /**
+     * @param args
+     */
+    public static void main(String[] args) {
+        System.out.println(new SimpleDateFormat("yyyy.MM.dd HH:mm:ss").format(new Date()));
+    }
+
+}
diff --git a/src/com/hughes/android/dictionary/DictionaryApplication.java b/src/com/hughes/android/dictionary/DictionaryApplication.java
index da5e67cefb1affc852320741d78110bdd4069e4e..b18c7a02a0e0dfabb3d145b74c5eaf4c3b69eca5 100644 (file)
@@ -1,5 +1,5 @@
 package com.hughes.android.dictionary;
 
-final public class DictionaryApplication {
-    final static public boolean USE_COLLATOR = true;
+public final class DictionaryApplication {
+    public static final boolean USE_COLLATOR = true;
 }
diff --git a/src/com/hughes/android/dictionary/FeatureConfig.java b/src/com/hughes/android/dictionary/FeatureConfig.java
index 4642e1171f4759e1f847bc2a7ae15181acf88c25..291c010376b04180bc378f4ccd2495235ffae888 100644 (file)
@@ -1,5 +1,5 @@
 package com.hughes.android.dictionary;
 
-final public class FeatureConfig {
-    final static public boolean enableWrite = true;
+public final class FeatureConfig {
+    public static final boolean enableWrite = true;
 }
diff --git a/src/com/hughes/android/dictionary/SerializeCollatorTest.java b/src/com/hughes/android/dictionary/SerializeCollatorTest.java
index 7a1e42e41b74703faf0a563a10eecbc4c5c94dfe..2980e10b40304c0cb93c107d14a3d699fbbe4168 100644 (file)
@@ -1,36 +1,35 @@
-// Copyright 2011 Google Inc. All Rights Reserved.\r
-//\r
-// Licensed under the Apache License, Version 2.0 (the "License");\r
-// you may not use this file except in compliance with the License.\r
-// You may obtain a copy of the License at\r
-//\r
-//     http://www.apache.org/licenses/LICENSE-2.0\r
-//\r
-// Unless required by applicable law or agreed to in writing, software\r
-// distributed under the License is distributed on an "AS IS" BASIS,\r
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r
-// See the License for the specific language governing permissions and\r
-// limitations under the License.\r
-\r
-package com.hughes.android.dictionary;\r
-\r
-import java.io.File;\r
-import java.io.IOException;\r
-import java.util.Comparator;\r
-\r
-import com.hughes.android.dictionary.engine.Language;\r
-import java.text.Collator;\r
-\r
-public class SerializeCollatorTest {\r
-\r
-    /**\r
-     * @param args\r
-     * @throws IOException\r
-     */\r
-    public static void main(String[] args) throws IOException {\r
-        File temp = File.createTempFile("temp", null);\r
-        final Comparator c = Language.de.getCollator();\r
-        //FileUtil.writeObject(c, temp);\r
-    }\r
-\r
-}\r
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package com.hughes.android.dictionary;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Comparator;
+
+import com.hughes.android.dictionary.engine.Language;
+
+public class SerializeCollatorTest {
+
+    /**
+     * @param args
+     * @throws IOException
+     */
+    public static void main(String[] args) throws IOException {
+        File temp = File.createTempFile("temp", null);
+        final Comparator<Object> c = Language.de.getCollator();
+        //FileUtil.writeObject(c, temp);
+    }
+
+}
diff --git a/src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java b/src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java
index d82f190215555c0372271019de3845bf0925e5c4..e43f1d098ff7c99903abfd3fecf2638b31e1bd53 100644 (file)
@@ -1,9 +1,5 @@
 package com.hughes.android.dictionary.engine;
 
-import com.hughes.android.dictionary.DictionaryInfo;
-import com.hughes.android.dictionary.DictionaryInfo.IndexInfo;
-import com.hughes.util.CollectionUtil;
-
 import java.io.File;
 import java.io.IOException;
 import java.io.PrintWriter;
@@ -13,9 +9,13 @@ import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
 
+import com.hughes.android.dictionary.DictionaryInfo;
+import com.hughes.android.dictionary.DictionaryInfo.IndexInfo;
+import com.hughes.util.CollectionUtil;
+
 public class CheckDictionariesMain {
 
-    static final String BASE_URL = "http://github.com/rdoeffinger/Dictionary/releases/download/v0.2-dictionaries/";
+    static final String BASE_URL = "https://github.com/rdoeffinger/Dictionary/releases/download/v0.3-dictionaries/";
     static final String VERSION_CODE_OLD = "v006";
     static final String VERSION_CODE = "v007";
 
@@ -26,7 +26,7 @@ public class CheckDictionariesMain {
 //    dictionaryInfoOut.println("# LANG_1\t%LANG_2\tFILENAME\tVERSION_CODE\tFILESIZE\tNUM_MAIN_WORDS_1\tNUM_MAIN_WORDS_2\tNUM_ALL_WORDS_1\tNUM_ALL_WORDS_2");
 
         final File[] files = dictDir.listFiles();
-        final List<String> dictNames = new ArrayList<String>();
+        final List<String> dictNames = new ArrayList<>();
         Arrays.sort(files);
         for (final File dictFile : files) {
             if (!dictFile.getName().endsWith("quickdic")) {
@@ -63,7 +63,7 @@ public class CheckDictionariesMain {
 
             // Find the stats.
             System.out.println("Stats...");
-            final List<String> indexNames = new ArrayList<String>();
+            final List<String> indexNames = new ArrayList<>();
             for (final IndexInfo indexInfo : dictionaryInfo.indexInfos) {
                 indexNames.add(indexInfo.shortName);
             }
diff --git a/src/com/hughes/android/dictionary/engine/ConvertToV6.java b/src/com/hughes/android/dictionary/engine/ConvertToV6.java
new file mode 100644 (file)
index 0000000..05a801b
--- /dev/null
+++ b/src/com/hughes/android/dictionary/engine/ConvertToV6.java
@@ -0,0 +1,74 @@
+// Copyright 2020 Reimar Döffinger. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package com.hughes.android.dictionary.engine;
+
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+
+public class ConvertToV6 {
+    public static void main(final String[] args) throws IOException {
+        if (args.length != 2 && args.length != 3) {
+            System.out.println("Usage: ConvertToV6 <input.v007> <output.v006> [skipHtml]");
+            System.out.println("If the optional third argument is given as 'skipHtml'");
+            System.out.println("the v6 dictionary will be without all HTML entries to reduce its size");
+            return;
+        }
+        boolean skipHtml = false;
+        boolean skipHtmlOpt = false;
+        if (args.length == 3) {
+            if (!args[2].equals("skipHtml") && !args[2].equals("skipHtmlOpt")) {
+                System.out.println("Unknown extra argument '" + args[2] + "'");
+                return;
+            }
+            skipHtml = true;
+            skipHtmlOpt = args[2].equals("skipHtmlOpt");
+        }
+        final String inname = args[0];
+        final String outname = args[1];
+        FileInputStream in;
+        try {
+            in = new FileInputStream(inname);
+        } catch (FileNotFoundException e) {
+            System.out.println("Could not open input file '" + inname + "'");
+            System.out.println(e);
+            return;
+        }
+        final Dictionary dictionary = new Dictionary(in.getChannel());
+        if (dictionary.dictFileVersion <= 6) {
+            System.out.println("Input dictionary is already v6 or older!");
+            return;
+        }
+        if (skipHtmlOpt && dictionary.htmlEntries.size() == 0) {
+            System.exit(3);
+        }
+        RandomAccessFile out;
+        try {
+            out = new RandomAccessFile(outname, "rw");
+        } catch (FileNotFoundException e) {
+            System.out.println("Could not open output file '" + outname + "'");
+            System.out.println(e);
+            return;
+        }
+        if (out.length() > 0) {
+            System.out.println("Output file '" + outname + "' already exists, aborting!");
+            return;
+        }
+        new DictionaryV6Writer(dictionary).writev6(out, skipHtml);
+        out.close();
+        in.close();
+    }
+}
diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java
index d105af2d4764e45778b12b574c3eeeebe47c1401..34cc705c39592cf45a5b3efa22bd4da39460a9c9 100644 (file)
@@ -42,7 +42,7 @@ import com.hughes.util.FileUtil;
 public class DictionaryBuilder {
 
     public final Dictionary dictionary;
-    public final List<IndexBuilder> indexBuilders = new ArrayList<IndexBuilder>();
+    public final List<IndexBuilder> indexBuilders = new ArrayList<>();
 
     public DictionaryBuilder(final String dictInfoString, final Language lang0, final Language lang1, final String normalizerRules1, final String normalizerRules2, final Set<String> lang1Stoplist, final Set<String> lang2Stoplist) {
         dictionary = new Dictionary(dictInfoString);
@@ -61,7 +61,7 @@ public class DictionaryBuilder {
         }
     }
 
-    public static void main(final String[] args) throws IOException, ParserConfigurationException, SAXException {
+    public static void main(final String[] args) throws IOException {
         System.out.println("Running with arguments:");
         for (final String arg : args) {
             System.out.println(arg);
@@ -80,8 +80,8 @@ public class DictionaryBuilder {
             lang2 = null;
         }
 
-        final Set<String> lang1Stoplist = new LinkedHashSet<String>();
-        final Set<String> lang2Stoplist = new LinkedHashSet<String>();
+        final Set<String> lang1Stoplist = new LinkedHashSet<>();
+        final Set<String> lang2Stoplist = new LinkedHashSet<>();
         final String lang1StoplistFile = keyValueArgs.remove("lang1Stoplist");
         final String lang2StoplistFile = keyValueArgs.remove("lang2Stoplist");
         if (lang1StoplistFile != null) {
@@ -145,7 +145,7 @@ public class DictionaryBuilder {
                 final int pageLimit = Integer.parseInt(pageLimitString);
 
                 final EntrySource entrySource = new EntrySource(dictionaryBuilder.dictionary.sources.size(), inputName, 0);
-                System.out.println("");
+                System.out.println();
 
                 String inputFormat = keyValueArgs.remove(prefix + "Format");
                 if ("tab_separated".equals(inputFormat)) {
diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java
index cf5fa96fc172acf6ee8fc4633b3180fcf6806c40..8c2ebd087f38cfd3aa50ab5966045fa10e99c7a1 100644 (file)
 
 package com.hughes.android.dictionary.engine;
 
-import com.hughes.android.dictionary.parser.wiktionary.EnTranslationToTranslationParser;
-import com.hughes.android.dictionary.parser.wiktionary.WholeSectionToHtmlParser;
-import com.hughes.android.dictionary.parser.wiktionary.WiktionaryLangs;
-
-import junit.framework.TestCase;
-
 import java.io.File;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -29,6 +23,12 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
+import com.hughes.android.dictionary.parser.wiktionary.EnTranslationToTranslationParser;
+import com.hughes.android.dictionary.parser.wiktionary.WholeSectionToHtmlParser;
+import com.hughes.android.dictionary.parser.wiktionary.WiktionaryLangs;
+
+import junit.framework.TestCase;
+
 public class DictionaryBuilderMain extends TestCase {
 
     static final String INPUTS = "data/inputs/";
@@ -36,7 +36,7 @@ public class DictionaryBuilderMain extends TestCase {
     static final String OUTPUTS = "data/outputs/";
 
     // Build the non EN ones.
-    static final String[][] nonEnPairs = new String[][] {
+    static final String[][] nonEnPairs = {
         {"EN"},
         {"DE"},
         {"IT"},
@@ -138,7 +138,7 @@ public class DictionaryBuilderMain extends TestCase {
 
 
 
-    static final Map<String,String>  isoToDedication = new LinkedHashMap<String, String>();
+    static final Map<String,String>  isoToDedication = new LinkedHashMap<>();
     static {
         isoToDedication.put("AF", "Wiktionary-based Afrikaans dictionary dedicated to Heiko and Mariëtte Horn.");
         isoToDedication.put("HR", "Wiktionary-based Croatian dictionary dedicated to Ines Viskic and Miro Kresonja.");
@@ -158,7 +158,7 @@ public class DictionaryBuilderMain extends TestCase {
         return isoToDedication.containsKey(iso) ? isoToDedication.get(iso) : String.format("Wiktionary-based %s dictionary.", iso);
     }
 
-    static final Map<String,String>  isoToStoplist = new LinkedHashMap<String, String>();
+    static final Map<String,String>  isoToStoplist = new LinkedHashMap<>();
     static {
         isoToStoplist.put("DE", "de.txt");
         isoToStoplist.put("EN", "en.txt");
@@ -167,7 +167,7 @@ public class DictionaryBuilderMain extends TestCase {
         isoToStoplist.put("FR", "fr.txt");
     }
     private static String getStoplist(String iso) {
-        return isoToStoplist.containsKey(iso) ? isoToStoplist.get(iso) : "empty.txt";
+        return isoToStoplist.getOrDefault(iso, "empty.txt");
     }
 
     static String getOtherLang(final String[] pair, final String first) {
@@ -177,7 +177,7 @@ public class DictionaryBuilderMain extends TestCase {
     }
 
     static List<String> getMainArgs(final String[] pair) {
-        final List<String> result = new ArrayList<String>();
+        final List<String> result = new ArrayList<>();
 
         int i = 1;
 
@@ -311,9 +311,7 @@ public class DictionaryBuilderMain extends TestCase {
 
     public static void main(final String[] args) throws Exception {
 
-        final List<String[]> allPairs = new ArrayList<String[]>();
-
-        allPairs.addAll(Arrays.asList(nonEnPairs));
+        final List<String[]> allPairs = new ArrayList<>(Arrays.asList(nonEnPairs));
         // Add all the EN-XX pairs.
         for (final String isoCode : WiktionaryLangs.isoCodeToEnWikiName.keySet()) {
             if (!isoCode.equals("EN")) {
@@ -322,7 +320,7 @@ public class DictionaryBuilderMain extends TestCase {
         }
 
 
-        final Set<List<String>> done = new LinkedHashSet<List<String>>();
+        final Set<List<String>> done = new LinkedHashSet<>();
         boolean go = true;
         for (final String[] pair : allPairs) {
             Arrays.sort(pair);
@@ -332,11 +330,7 @@ public class DictionaryBuilderMain extends TestCase {
             }
             done.add(pairList);
 
-            if (pairList.contains("EN") && pairList.contains("DE")) {
-                go = true;
-            } else {
-                go = false;
-            }
+            go = pairList.contains("EN") && pairList.contains("DE");
 
             if (!go) {
                 continue;
diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java
index 72ad9eba138e81f957c4633ba661eacefeb00b5a..ee1e6648e42ea46f301e64aa0f9de45391e8dc83 100644 (file)
@@ -15,7 +15,6 @@
 package com.hughes.android.dictionary.engine;
 
 import java.io.File;
-import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.PrintStream;
 import java.io.RandomAccessFile;
@@ -347,7 +346,7 @@ public class DictionaryBuilderTest extends TestCase {
     }
 
     private void checkGolden(final String dictName, final File dictFile)
-    throws IOException, FileNotFoundException {
+    throws IOException {
         // Check it once:
         assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text");
 
diff --git a/src/com/hughes/android/dictionary/engine/DictionaryTest.java b/src/com/hughes/android/dictionary/engine/DictionaryTest.java
index d1dcc2b36ebe05e4536aa23d2d7fe3d0c7ce910a..0a9c6733d7d31c969e862e59c9e28c94f93aa3e9 100644 (file)
@@ -21,11 +21,11 @@ import java.util.List;
 import java.util.Random;
 import java.util.concurrent.atomic.AtomicBoolean;
 
-import junit.framework.TestCase;
-
 import com.hughes.android.dictionary.engine.Index.IndexEntry;
 import com.hughes.util.CollectionUtil;
 
+import junit.framework.TestCase;
+
 
 public class DictionaryTest extends TestCase {
 
@@ -61,7 +61,7 @@ public class DictionaryTest extends TestCase {
             assertTrue(rows.toString(), rows.size() > 0);
             assertTrue(rows.get(0).toString().startsWith("come mai@"));
             assertTrue(rows.get(0) instanceof TokenRow);
-            assertTrue(!((TokenRow)rows.get(0)).getIndexEntry().htmlEntries.isEmpty());
+            assertFalse(((TokenRow) rows.get(0)).getIndexEntry().htmlEntries.isEmpty());
         }
 
         {
@@ -70,7 +70,7 @@ public class DictionaryTest extends TestCase {
             assertTrue(rows.toString(), rows.size() > 0);
             assertTrue(rows.get(0).toString().startsWith("buon giorno@"));
             assertTrue(rows.get(0) instanceof TokenRow);
-            assertTrue(!((TokenRow)rows.get(0)).getIndexEntry().htmlEntries.isEmpty());
+            assertFalse(((TokenRow) rows.get(0)).getIndexEntry().htmlEntries.isEmpty());
         }
 
         {
@@ -171,7 +171,7 @@ public class DictionaryTest extends TestCase {
 
         // Check that search in lowercase works.
         assertSearchResult("Alibi", "Alibi", deIndex.findInsertionPoint("alib", new AtomicBoolean(false)));
-        System.out.println(deIndex.findInsertionPoint("alib", new AtomicBoolean(false)).toString());
+        System.out.println(deIndex.findInsertionPoint("alib", new AtomicBoolean(false)));
 
         raf.close();
     }
diff --git a/src/com/hughes/android/dictionary/engine/DictionaryV6Writer.java b/src/com/hughes/android/dictionary/engine/DictionaryV6Writer.java
new file mode 100644 (file)
index 0000000..3da4d4a
--- /dev/null
+++ b/src/com/hughes/android/dictionary/engine/DictionaryV6Writer.java
@@ -0,0 +1,336 @@
+// Copyright 2020 Reimar Döffinger. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package com.hughes.android.dictionary.engine;
+
+import java.io.BufferedOutputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataOutputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.ObjectOutputStream;
+import java.io.RandomAccessFile;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+import java.util.List;
+import java.util.zip.GZIPOutputStream;
+
+public class DictionaryV6Writer {
+    private final Dictionary d;
+
+    public DictionaryV6Writer(Dictionary dictionary) {
+        d = dictionary;
+    }
+
+    private void writev6Sources(RandomAccessFile out) throws IOException {
+        ByteArrayOutputStream toc = new ByteArrayOutputStream();
+        DataOutputStream tocout = new DataOutputStream(toc);
+
+        out.writeInt(d.sources.size());
+        long tocPos = out.getFilePointer();
+        out.seek(tocPos + d.sources.size() * 8 + 8);
+        for (EntrySource s : d.sources) {
+            long dataPos = out.getFilePointer();
+            tocout.writeLong(dataPos);
+
+            out.writeUTF(s.getName());
+            out.writeInt(s.getNumEntries());
+        }
+        long dataPos = out.getFilePointer();
+        tocout.writeLong(dataPos);
+        tocout.close();
+
+        out.seek(tocPos);
+        out.write(toc.toByteArray());
+        out.seek(dataPos);
+    }
+
+    private void writev6PairEntries(RandomAccessFile out) throws IOException {
+        ByteArrayOutputStream toc = new ByteArrayOutputStream();
+        DataOutputStream tocout = new DataOutputStream(toc);
+
+        long tocPos = out.getFilePointer();
+        long dataPos = tocPos + 4 + d.pairEntries.size() * 8 + 8;
+
+        out.seek(dataPos);
+        DataOutputStream outb = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(out.getFD())));
+
+        tocout.writeInt(d.pairEntries.size());
+        for (PairEntry pe : d.pairEntries) {
+            tocout.writeLong(dataPos + outb.size());
+
+            outb.writeShort(pe.entrySource.index());
+            outb.writeInt(pe.pairs.size());
+            for (PairEntry.Pair p : pe.pairs) {
+                outb.writeUTF(p.lang1);
+                outb.writeUTF(p.lang2);
+            }
+        }
+        dataPos += outb.size();
+        outb.flush();
+        tocout.writeLong(dataPos);
+        tocout.close();
+
+        out.seek(tocPos);
+        out.write(toc.toByteArray());
+        out.seek(dataPos);
+    }
+
+    private void writev6TextEntries(RandomAccessFile out) throws IOException {
+        ByteArrayOutputStream toc = new ByteArrayOutputStream();
+        DataOutputStream tocout = new DataOutputStream(toc);
+
+        out.writeInt(d.textEntries.size());
+        long tocPos = out.getFilePointer();
+        out.seek(tocPos + d.textEntries.size() * 8 + 8);
+        for (TextEntry t : d.textEntries) {
+            long dataPos = out.getFilePointer();
+            tocout.writeLong(dataPos);
+
+            out.writeShort(t.entrySource.index());
+            out.writeUTF(t.text);
+        }
+        long dataPos = out.getFilePointer();
+        tocout.writeLong(dataPos);
+        tocout.close();
+
+        out.seek(tocPos);
+        out.write(toc.toByteArray());
+        out.seek(dataPos);
+    }
+
+    private void writev6EmptyList(RandomAccessFile out) throws IOException {
+        out.writeInt(0);
+        out.writeLong(out.getFilePointer() + 8);
+    }
+
+    private void writev6HtmlEntries(RandomAccessFile out) throws IOException {
+        ByteArrayOutputStream toc = new ByteArrayOutputStream();
+        DataOutputStream tocout = new DataOutputStream(toc);
+
+        long tocPos = out.getFilePointer();
+        long dataPos = tocPos + 4 + d.htmlEntries.size() * 8 + 8;
+
+        out.seek(dataPos);
+        DataOutputStream outb = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(out.getFD())));
+
+        tocout.writeInt(d.htmlEntries.size());
+        for (HtmlEntry h : d.htmlEntries) {
+            tocout.writeLong(dataPos + outb.size());
+
+            outb.writeShort(h.entrySource.index());
+            outb.writeUTF(h.title);
+            byte[] data = h.getHtml().getBytes(StandardCharsets.UTF_8);
+            outb.writeInt(data.length);
+            ByteArrayOutputStream baos = new ByteArrayOutputStream();
+            GZIPOutputStream gzout = new GZIPOutputStream(baos);
+            gzout.write(data);
+            gzout.close();
+            outb.writeInt(baos.size());
+            outb.write(baos.toByteArray());
+        }
+        dataPos += outb.size();
+        outb.flush();
+        tocout.writeLong(dataPos);
+        tocout.close();
+
+        out.seek(tocPos);
+        out.write(toc.toByteArray());
+        out.seek(dataPos);
+    }
+
+    private void writev6HtmlIndices(DataOutputStream out, long pos, List<HtmlEntry> entries) throws IOException {
+        long dataPos = pos + 4 + entries.size() * 8 + 8;
+
+        out.writeInt(entries.size());
+
+        // TOC is trivial, so optimize writing it
+        for (int i = 0; i < entries.size(); i++) {
+            out.writeLong(dataPos);
+            dataPos += 4;
+        }
+        out.writeLong(dataPos);
+
+        for (HtmlEntry e : entries) {
+            out.writeInt(e.index());
+        }
+    }
+
+    private void writev6IndexEntries(RandomAccessFile out, List<Index.IndexEntry> entries, int[] prunedRowIdx) throws IOException {
+        ByteArrayOutputStream toc = new ByteArrayOutputStream();
+        DataOutputStream tocout = new DataOutputStream(toc);
+
+        long tocPos = out.getFilePointer();
+        long dataPos = tocPos + 4 + entries.size() * 8 + 8;
+
+        out.seek(dataPos);
+        DataOutputStream outb = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(out.getFD())));
+
+        tocout.writeInt(entries.size());
+        for (Index.IndexEntry e : entries) {
+            tocout.writeLong(dataPos + outb.size());
+
+            outb.writeUTF(e.token);
+
+            int startRow = e.startRow;
+            int numRows = e.numRows;
+            if (prunedRowIdx != null) {
+                // note: the start row will always be a TokenRow
+                // and thus never be pruned
+                int newNumRows = 1;
+                for (int i = 1; i < numRows; i++) {
+                    if (prunedRowIdx[startRow + i] >= 0) newNumRows++;
+                }
+                startRow = prunedRowIdx[startRow];
+                numRows = newNumRows;
+            }
+
+            outb.writeInt(startRow);
+            outb.writeInt(numRows);
+            final boolean hasNormalizedForm = !e.token.equals(e.normalizedToken());
+            outb.writeBoolean(hasNormalizedForm);
+            if (hasNormalizedForm) outb.writeUTF(e.normalizedToken());
+            writev6HtmlIndices(outb, dataPos + outb.size(),
+                               prunedRowIdx == null ? e.htmlEntries : Collections.emptyList());
+        }
+        dataPos += outb.size();
+        outb.flush();
+        tocout.writeLong(dataPos);
+        tocout.close();
+
+        out.seek(tocPos);
+        out.write(toc.toByteArray());
+        out.seek(dataPos);
+    }
+
+    private void writev6Index(RandomAccessFile out, boolean skipHtml) throws IOException {
+        ByteArrayOutputStream toc = new ByteArrayOutputStream();
+        DataOutputStream tocout = new DataOutputStream(toc);
+
+        out.writeInt(d.indices.size());
+        long tocPos = out.getFilePointer();
+        out.seek(tocPos + d.indices.size() * 8 + 8);
+        for (Index idx : d.indices) {
+            // create pruned index for skipHtml feature
+            int[] prunedRowIdx = null;
+            int prunedSize = 0;
+            if (skipHtml) {
+                prunedRowIdx = new int[idx.rows.size()];
+                for (int i = 0; i < idx.rows.size(); i++) {
+                    final RowBase r = idx.rows.get(i);
+                    // prune Html entries
+                    boolean pruned = r instanceof HtmlEntry.Row;
+                    prunedRowIdx[i] = pruned ? -1 : prunedSize;
+                    if (!pruned) prunedSize++;
+                }
+            }
+
+            long dataPos = out.getFilePointer();
+            tocout.writeLong(dataPos);
+
+            out.writeUTF(idx.shortName);
+            out.writeUTF(idx.longName);
+            out.writeUTF(idx.sortLanguage.getIsoCode());
+            out.writeUTF(idx.normalizerRules);
+            out.writeBoolean(idx.swapPairEntries);
+            out.writeInt(idx.mainTokenCount);
+            writev6IndexEntries(out, idx.sortedIndexEntries, prunedRowIdx);
+
+            // write stoplist, serializing the whole Set *shudder*
+            // Actually just emulate ObjectOutputStream serialization
+            final byte[] hashSetSerialized = {
+                (byte)0xac, (byte)0xed, // magic
+                0x00, 0x05, // version
+                0x73, // object
+                0x72, // class
+                // "java.util.HashSet"
+                0x00, 0x11, 0x6a, 0x61, 0x76, 0x61, 0x2e, 0x75, 0x74, 0x69,
+                0x6c, 0x2e, 0x48, 0x61, 0x73, 0x68, 0x53, 0x65, 0x74,
+                // serialization ID
+                (byte)0xba, 0x44, (byte)0x85, (byte)0x95, (byte)0x96, (byte)0xb8, (byte)0xb7, 0x34,
+                0x03, // flags: serialized, custom serialization function
+                0x00, 0x00, // fields count
+                0x78, // blockdata end
+                0x70, // null (superclass)
+                0x77, 0x0c // blockdata short, 0xc bytes
+            };
+            int stoplistlen = hashSetSerialized.length;
+            stoplistlen += 12; // block data: capacity (int), load factor (float), size (int)
+            for (String s : idx.stoplist) {
+                stoplistlen += 3 + s.length();
+            }
+            stoplistlen++;
+
+            DataOutputStream outb = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(out.getFD())));
+            outb.writeInt(stoplistlen);
+            outb.write(hashSetSerialized);
+            outb.writeInt(idx.stoplist.size()); // capacity
+            outb.writeFloat(0.75f); // load factor
+            outb.writeInt(idx.stoplist.size()); // size
+            for (String s : idx.stoplist) {
+                outb.writeByte(0x74); // String type
+                outb.writeUTF(s);
+            }
+            outb.writeByte(0x78); // blockdata end
+
+            outb.writeInt(skipHtml ? prunedSize : idx.rows.size());
+            outb.writeInt(5);
+            for (RowBase r : idx.rows) {
+                int type = 0;
+                if (r instanceof PairEntry.Row) {
+                    type = 0;
+                } else if (r instanceof TokenRow) {
+                    final TokenRow tokenRow = (TokenRow)r;
+                    type = tokenRow.hasMainEntry ? 1 : 3;
+                } else if (r instanceof TextEntry.Row) {
+                    type = 2;
+                } else if (r instanceof HtmlEntry.Row) {
+                    type = 4;
+                    if (skipHtml) continue;
+                } else {
+                    throw new RuntimeException("Row type not supported for v6");
+                }
+                outb.writeByte(type);
+                outb.writeInt(r.referenceIndex);
+            }
+            outb.flush();
+        }
+        long dataPos = out.getFilePointer();
+        tocout.writeLong(dataPos);
+        tocout.close();
+
+        out.seek(tocPos);
+        out.write(toc.toByteArray());
+        out.seek(dataPos);
+    }
+
+    public void writev6(RandomAccessFile raf, boolean skipHtml) throws IOException {
+        raf.writeInt(6);
+        raf.writeLong(d.creationMillis);
+        raf.writeUTF(d.dictInfo);
+        System.out.println("sources start: " + raf.getFilePointer());
+        writev6Sources(raf);
+        System.out.println("pair start: " + raf.getFilePointer());
+        writev6PairEntries(raf);
+        System.out.println("text start: " + raf.getFilePointer());
+        writev6TextEntries(raf);
+        System.out.println("html index start: " + raf.getFilePointer());
+        if (skipHtml) writev6EmptyList(raf);
+        else writev6HtmlEntries(raf);
+        System.out.println("indices start: " + raf.getFilePointer());
+        writev6Index(raf, skipHtml);
+        System.out.println("end: " + raf.getFilePointer());
+        raf.writeUTF("END OF DICTIONARY");
+    }
+}
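
Every writev6* method above uses the same layout trick: write the entry count, leave a gap sized for a table of contents of absolute file offsets (one long per entry plus a trailing end offset), write the entries while recording where each one starts, then seek back and fill the table in. The larger sections (pair and HTML entries) buffer their payload through a DataOutputStream over the same file descriptor and derive offsets from outb.size(), but the idea is identical. A stripped-down, self-contained illustration of the pattern with plain string payloads (not part of the commit):

// Minimal illustration of the TOC-then-data layout used by DictionaryV6Writer.
// Payloads are plain strings here; the real writer stores sources, entries
// and indices in the same count / offset-table / data arrangement.
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.List;

public class TocLayoutDemo {
    static void writeSection(RandomAccessFile out, List<String> entries) throws IOException {
        out.writeInt(entries.size());
        final long tocPos = out.getFilePointer();
        final long[] offsets = new long[entries.size() + 1];
        // Skip over the not-yet-written table: one long per entry plus the end offset.
        out.seek(tocPos + entries.size() * 8L + 8);
        for (int i = 0; i < entries.size(); i++) {
            offsets[i] = out.getFilePointer();
            out.writeUTF(entries.get(i));
        }
        offsets[entries.size()] = out.getFilePointer();
        // Now that the offsets are known, go back and write the table,
        // then restore the file pointer to the end of the section.
        out.seek(tocPos);
        for (final long o : offsets) {
            out.writeLong(o);
        }
        out.seek(offsets[entries.size()]);
    }

    public static void main(String[] args) throws IOException {
        try (RandomAccessFile raf = new RandomAccessFile("toc-demo.bin", "rw")) {
            writeSection(raf, List.of("alpha", "beta", "gamma"));
        }
    }
}
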
diff --git a/src/com/hughes/android/dictionary/engine/IndexBuilder.java b/src/com/hughes/android/dictionary/engine/IndexBuilder.java
index e7e1b43635627d146d263dbf0c5a6ea5d85b1570..2db537bcd0c548bef2dabe22d245cae511a4451c 100644 (file)
 
 package com.hughes.android.dictionary.engine;
 
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.EnumMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.SortedMap;
-import java.util.TreeMap;
+import java.util.*;
 
 import com.hughes.android.dictionary.engine.Index.IndexEntry;
 import com.hughes.android.dictionary.parser.DictFileParser;
@@ -34,17 +25,19 @@ public class IndexBuilder {
     public final Index index;
     final Set<String> stoplist;
 
-    final SortedMap<String, TokenData> tokenToData;
+    final Map<String, TokenData> fastTokenToData;
+    final SortedMap<FastCompareString, TokenData> tokenToData;
 
     IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language, final String normalizerRules, final Set<String> stoplist, final boolean swapPairEntries) {
         this.dictionaryBuilder = dictionaryBuilder;
         index = new Index(dictionaryBuilder.dictionary, shortName, longName, language, normalizerRules, swapPairEntries, stoplist);
-        tokenToData = new TreeMap<String, TokenData>(index.getSortComparator());
+        tokenToData = new TreeMap<>(new FastNormalizeComparator(index.getSortComparator()));
+        fastTokenToData = new HashMap<>();
         this.stoplist = stoplist;
     }
 
     public void build() {
-        final Set<IndexedEntry> tokenIndexedEntries = new HashSet<IndexedEntry>();
+        final Set<IndexedEntry> tokenIndexedEntries = new HashSet<>();
         final List<RowBase> rows = index.rows;
         index.mainTokenCount = 0;
         for (final TokenData tokenData : tokenToData.values()) {
@@ -101,13 +94,8 @@ public class IndexBuilder {
             }
         }
 
-        final List<IndexEntry> entriesSortedByNumRows = new ArrayList<IndexEntry>(index.sortedIndexEntries);
-        Collections.sort(entriesSortedByNumRows, new Comparator<IndexEntry>() {
-            @Override
-            public int compare(IndexEntry object1, IndexEntry object2) {
-                return object2.numRows - object1.numRows;
-            }
-        });
+        final List<IndexEntry> entriesSortedByNumRows = new ArrayList<>(index.sortedIndexEntries);
+        entriesSortedByNumRows.sort((object1, object2) -> object2.numRows - object1.numRows);
         System.out.println("Most common tokens:");
         for (int i = 0; i < 50 && i < entriesSortedByNumRows.size(); ++i) {
             System.out.println("  " + entriesSortedByNumRows.get(i));
@@ -117,10 +105,10 @@ public class IndexBuilder {
     public static class TokenData {
         final String token;
 
-        final Map<EntryTypeName, List<IndexedEntry>> typeToEntries = new EnumMap<EntryTypeName, List<IndexedEntry>>(EntryTypeName.class);
+        final Map<EntryTypeName, List<IndexedEntry>> typeToEntries = new EnumMap<>(EntryTypeName.class);
         public boolean hasMainEntry = false;
 
-        public List<HtmlEntry> htmlEntries = new ArrayList<HtmlEntry>();
+        public final List<HtmlEntry> htmlEntries = new ArrayList<>();
 
         TokenData(final String token) {
             assert token.equals(token.trim());
@@ -130,11 +118,16 @@ public class IndexBuilder {
     }
 
     public TokenData getOrCreateTokenData(final String token) {
-        TokenData tokenData = tokenToData.get(token);
-        if (tokenData == null) {
-            tokenData = new TokenData(token);
-            tokenToData.put(token, tokenData);
+        TokenData tokenData = fastTokenToData.get(token);
+        if (tokenData != null) return tokenData;
+        tokenData = new TokenData(token);
+        final FastCompareString c = new FastCompareString(token);
+        if (tokenToData.put(c, tokenData) != null) {
+            // The parallel HashMap assumes that the TreeMap Comparator
+            // is compatible with the equals it uses to compare.
+            throw new RuntimeException("TokenData TreeMap and HashMap out of sync, Comparator may be broken?");
         }
+        fastTokenToData.put(token, tokenData);
         return tokenData;
     }
 
@@ -145,7 +138,7 @@ public class IndexBuilder {
             tokenData.hasMainEntry = true;
         }
         if (entries == null) {
-            entries = new ArrayList<IndexedEntry>();
+            entries = new ArrayList<>();
             tokenData.typeToEntries.put(entryTypeName, entries);
         }
         return entries;
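
The IndexBuilder change keeps two views of one token table: a HashMap for constant-time lookups while parsing, and a TreeMap (keyed through FastCompareString with a FastNormalizeComparator, both defined outside this diff) for sorted iteration when the index is written out. The check on put() guards against the two views drifting apart should the comparator ever disagree with String.equals. A stripped-down sketch of the same pattern, using a plain Collator in place of the project's comparator classes (illustrative only, not taken from this commit):

    import java.text.Collator;
    import java.util.HashMap;
    import java.util.Map;
    import java.util.SortedMap;
    import java.util.TreeMap;
    import java.util.function.Supplier;

    // HashMap gives O(1) lookup during parsing; TreeMap gives sorted iteration later.
    class TokenTable<V> {
        private final Map<String, V> fast = new HashMap<>();
        private final SortedMap<String, V> sorted = new TreeMap<>(Collator.getInstance());

        V getOrCreate(String token, Supplier<V> factory) {
            V value = fast.get(token);
            if (value != null) return value;
            value = factory.get();
            if (sorted.put(token, value) != null) {
                // The comparator treated two distinct keys as equal, so the maps would diverge.
                throw new IllegalStateException("comparator not consistent with equals: " + token);
            }
            fast.put(token, value);
            return value;
        }
    }
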
index 24fe094ffe99563e7478ed36167ac1ac9b331a94..be787ebf53fa1be7f7a55b0fa9bc2c1a662c391b 100644 (file)
@@ -21,12 +21,12 @@ import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Set;
 
-import junit.framework.TestCase;
-
 import com.hughes.android.dictionary.parser.DictFileParser;
 import com.hughes.android.dictionary.parser.wiktionary.WiktionaryLangs;
 import com.ibm.icu.text.Transliterator;
 
+import junit.framework.TestCase;
+
 public class LanguageTest extends TestCase {
 
     public void testGermanSort() {
@@ -73,10 +73,10 @@ public class LanguageTest extends TestCase {
         assertEquals("hulle", normalizer.transform("Hulle"));
 
 
-        final List<String> sorted = new ArrayList<String>(words);
+        final List<String> sorted = new ArrayList<>(words);
 //    Collections.shuffle(shuffled, new Random(0));
-        Collections.sort(sorted, comparator);
-        System.out.println(sorted.toString());
+        sorted.sort(comparator);
+        System.out.println(sorted);
         for (int i = 0; i < words.size(); ++i) {
             System.out.println(words.get(i) + "\t" + sorted.get(i));
             assertEquals(words.get(i), sorted.get(i));
@@ -92,9 +92,9 @@ public class LanguageTest extends TestCase {
                                        "preppy",
                                        "preprocess");
 
-        final List<String> sorted = new ArrayList<String>(words);
+        final List<String> sorted = new ArrayList<>(words);
         final NormalizeComparator comparator = new NormalizeComparator(normalizer, Language.en.getCollator(), 7);
-        Collections.sort(sorted, comparator);
+        sorted.sort(comparator);
         for (int i = 0; i < words.size(); ++i) {
             if (i > 0) {
                 assertTrue(comparator.compare(words.get(i-1), words.get(i)) < 0);
@@ -183,8 +183,8 @@ public class LanguageTest extends TestCase {
 
 
     public void testEnWiktionaryNames() {
-        final Set<String> enLangs = new LinkedHashSet<String>(WiktionaryLangs.isoCodeToEnWikiName.keySet());
-        final List<String> names = new ArrayList<String>();
+        final Set<String> enLangs = new LinkedHashSet<>(WiktionaryLangs.isoCodeToEnWikiName.keySet());
+        final List<String> names = new ArrayList<>();
         for (final String code : WiktionaryLangs.isoCodeToEnWikiName.keySet()) {
             names.add(WiktionaryLangs.isoCodeToEnWikiName.get(code));
             enLangs.add(code.toLowerCase());
index d4b3ab51a281ee5a177b0259530ebe8ba7c86c90..b38ee2b9785f9feae050670ef8feadc9208946c0 100644 (file)
@@ -14,8 +14,8 @@
 
 package com.hughes.android.dictionary.engine;
 
-import java.io.InputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.PipedInputStream;
 import java.io.PipedOutputStream;
 
@@ -24,29 +24,24 @@ public class ReadAheadBuffer extends PipedInputStream {
     public ReadAheadBuffer(InputStream in, int size) {
         super(size);
         assert size >= 2 * BLOCK_SIZE;
-        this.in = in;
         try {
             pipe = new PipedOutputStream(this);
-            buffer = new byte[BLOCK_SIZE];
-            new Thread(new Runnable() {
-                public void run() {
-                    int read;
-                    try {
-                        while ((read = in.read(buffer)) > 0)
-                        {
-                            pipe.write(buffer, 0, read);
-                            pipe.flush();
-                        }
-                    } catch (IOException e) {}
-                    try {
-                        pipe.close();
-                    } catch (IOException e) {}
-                }
-            }).start();
         } catch (IOException e) {}
+        new Thread(() -> {
+            try {
+                int read;
+                final byte[] buffer = new byte[BLOCK_SIZE];
+                while ((read = in.read(buffer)) > 0)
+                {
+                    pipe.write(buffer, 0, read);
+                    pipe.flush();
+                }
+            } catch (IOException e) {}
+            try {
+                pipe.close();
+            } catch (IOException e) {}
+        }).start();
     }
 
-    InputStream in;
     PipedOutputStream pipe;
-    byte buffer[];
 }
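
ReadAheadBuffer now starts its copy thread from a lambda and keeps the block buffer local to that thread, so the wrapped stream (typically a decompressor) is drained ahead of the consumer on its own thread. A usage sketch, not taken from this diff; the path is a placeholder and the buffer size must stay at least twice the class's BLOCK_SIZE:

    import java.io.BufferedInputStream;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.util.zip.GZIPInputStream;

    import com.hughes.android.dictionary.engine.ReadAheadBuffer;

    class ReadAheadDemo {
        // Decompression runs on the ReadAheadBuffer's copy thread; the caller just reads.
        static InputStream open(String path) throws IOException {
            InputStream in = new FileInputStream(path);
            in = new GZIPInputStream(in);
            in = new ReadAheadBuffer(in, 4 * 1024 * 1024);
            return new BufferedInputStream(in);
        }
    }
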
diff --git a/src/com/hughes/android/dictionary/engine/Runner.java b/src/com/hughes/android/dictionary/engine/Runner.java
new file mode 100644 (file)
index 0000000..b150613
--- /dev/null
@@ -0,0 +1,38 @@
+// Copyright 2020 Reimar Döffinger. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package com.hughes.android.dictionary.engine;
+
+import java.util.Arrays;
+
+public class Runner {
+    public static void main(final String[] args) throws Exception {
+        if (args.length == 0) {
+            System.out.println("Specify WiktionarySplitter, DictionaryBuilder or ConvertToV6 as first argument");
+            return;
+        }
+        String[] newargs = Arrays.copyOfRange(args, 1, args.length);
+        if (args[0].equals("WiktionarySplitter")) {
+            WiktionarySplitter.main(newargs);
+        } else if (args[0].equals("DictionaryBuilder")) {
+            DictionaryBuilder.main(newargs);
+        } else if (args[0].equals("ConvertToV6")) {
+            ConvertToV6.main(newargs);
+        } else if (args[0].equals("CheckDictionariesMain")) {
+            CheckDictionariesMain.main(newargs);
+        } else {
+            System.out.println("Unknown command '" + args[0] + "'. Use one of WiktionarySplitter, DictionaryBuilder, ConvertToV6 or CheckDictionariesMain instead.");
+        }
+    }
+}
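
Runner gives the tools a single entry point that forwards to their existing main() methods based on the first argument. A usage sketch follows; the dictionary file names and the exact arguments expected by ConvertToV6 are assumptions, not shown in this diff:

    import com.hughes.android.dictionary.engine.Runner;

    public class RunnerDemo {
        public static void main(String[] args) throws Exception {
            // Equivalent to: java -cp ... com.hughes.android.dictionary.engine.Runner ConvertToV6 <in> <out>
            Runner.main(new String[] { "ConvertToV6", "EN-DE.quickdic", "EN-DE.v006.quickdic" });
        }
    }
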
index 5935df8f51646f28d6462f9d59bf679ec65cabc5..9d51b7841fd07fc851237035c4a0574d6f250396 100644 (file)
@@ -20,35 +20,39 @@ import java.io.DataOutputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
-import java.io.InputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
 
-import org.apache.xerces.jaxp.SAXParserFactoryImpl;
 import org.apache.commons.compress.compressors.CompressorStreamFactory;
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
 
 import com.hughes.android.dictionary.parser.wiktionary.WiktionaryLangs;
 
-public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
+public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler implements Runnable {
 
     // The matches the whole line, otherwise regexes don't work well on French:
     // {{=uk=}}
     // Spanish has no initial headings, tried to also detect {{ES as such
     // with "^(\\{\\{ES|(=+)[^=]).*$" but that broke English.
-    static final Pattern headingStart = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE);
+    static final Pattern headingStartPattern = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE);
+    static final Pattern startSpanish = Pattern.compile("\\{\\{ES(\\|[^{}=]*)?}}");
 
-    final Map<String,List<Selector>> pathToSelectors = new LinkedHashMap<String, List<Selector>>();
+    final Map.Entry<String, List<Selector>> pathToSelectorsEntry;
     List<Selector> currentSelectors = null;
 
     StringBuilder titleBuilder;
@@ -56,15 +60,28 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
     StringBuilder currentBuilder = null;
 
     public static void main(final String[] args) throws Exception {
-        final WiktionarySplitter wiktionarySplitter = new WiktionarySplitter();
-        wiktionarySplitter.go();
+        boolean parallel = args.length > 0 && args[0].equals("parallel");
+        final ExecutorService e = Executors.newCachedThreadPool();
+        final Map<String,List<Selector>> pathToSelectors = createSelectorsMap();
+        for (final Map.Entry<String, List<Selector>> pathToSelectorsEntry : pathToSelectors.entrySet()) {
+            final WiktionarySplitter wiktionarySplitter = new WiktionarySplitter(pathToSelectorsEntry);
+            if (parallel) {
+                e.submit(wiktionarySplitter);
+            } else wiktionarySplitter.go();
+        }
+        e.shutdown();
     }
 
-    private WiktionarySplitter() {
+    private WiktionarySplitter(final Map.Entry<String, List<Selector>> pathToSelectorsEntry) {
+        this.pathToSelectorsEntry = pathToSelectorsEntry;
+    }
+
+    private static Map<String,List<Selector>> createSelectorsMap() {
+        final Map<String,List<Selector>> pathToSelectors = new LinkedHashMap<>();
         List<Selector> selectors;
         for (final String code : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.keySet()) {
             //if (!code.equals("fr")) {continue;}
-            selectors = new ArrayList<WiktionarySplitter.Selector>();
+            selectors = new ArrayList<>();
             pathToSelectors.put(String.format("data/inputs/%swiktionary-pages-articles.xml", code), selectors);
             for (final Map.Entry<String, String> entry : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.get(code).entrySet()) {
                 final String dir = String.format("data/inputs/wikiSplit/%s", code);
@@ -72,13 +89,22 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                 selectors.add(new Selector(String.format("%s/%s.data", dir, entry.getKey()), entry.getValue()));
             }
         }
+        return pathToSelectors;
+    }
+
+    @Override
+    public void run() {
+        try {
+            go();
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
     }
 
     private void go() throws Exception {
-        final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser();
+        final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
 
         // Configure things.
-        for (final Map.Entry<String, List<Selector>> pathToSelectorsEntry : pathToSelectors.entrySet()) {
 
             currentSelectors = pathToSelectorsEntry.getValue();
 
@@ -86,7 +112,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                 OutputStream tmp = new FileOutputStream(selector.outFilename + ".gz");
                 tmp = new BufferedOutputStream(tmp);
                 tmp = new CompressorStreamFactory().createCompressorOutputStream(CompressorStreamFactory.GZIP, tmp);
-                tmp = new WriteBuffer(tmp, 20 * 1024 * 1024);
+                tmp = new WriteBuffer(tmp, 1024 * 1024);
                 selector.out = new DataOutputStream(tmp);
             }
 
@@ -105,7 +131,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                     parser.parse(new BufferedInputStream(in), this);
                 }
             } catch (Exception e) {
-                System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString() + " of file " + pathToSelectorsEntry.getKey());
+                System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder + " of file " + pathToSelectorsEntry.getKey());
                 throw e;
             }
 
@@ -113,17 +139,15 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
             for (final Selector selector : currentSelectors) {
                 selector.out.close();
             }
-
-        }
     }
 
     String lastPageTitle = null;
     int pageCount = 0;
-    Pattern endPatterns[] = new Pattern[100];
+    final Matcher[] endPatterns = new Matcher[100];
 
-    private Pattern getEndPattern(int depth) {
+    private Matcher getEndPattern(int depth) {
         if (endPatterns[depth] == null)
-            endPatterns[depth] = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE);
+            endPatterns[depth] = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE).matcher("");
         return endPatterns[depth];
     }
 
@@ -152,6 +176,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                 title.startsWith("Template:") ||
                 title.startsWith("Summary:") ||
                 title.startsWith("Module:") ||
+                title.startsWith("Reconstruction:") ||
                 // DE
                 title.startsWith("Datei:") ||
                 title.startsWith("Verzeichnis:") ||
@@ -160,6 +185,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                 title.startsWith("Kategorie:") ||
                 title.startsWith("Hilfe:") ||
                 title.startsWith("Reim:") ||
+                title.startsWith("Modul:") ||
                 // FR:
                 title.startsWith("Annexe:") ||
                 title.startsWith("Catégori:") ||
@@ -169,16 +195,20 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                 title.startsWith("Aide:") ||
                 title.startsWith("Fichier:") ||
                 title.startsWith("Wiktionnaire:") ||
+                title.startsWith("Translations:Wiktionnaire:") ||
+                title.startsWith("Translations:Projet:") ||
                 title.startsWith("Catégorie:") ||
                 title.startsWith("Portail:") ||
                 title.startsWith("utiliusateur:") ||
                 title.startsWith("Kategorio:") ||
+                title.startsWith("Tutoriel:") ||
                 // IT
                 title.startsWith("Wikizionario:") ||
                 title.startsWith("Appendice:") ||
                 title.startsWith("Categoria:") ||
                 title.startsWith("Aiuto:") ||
                 title.startsWith("Portail:") ||
+                title.startsWith("Modulo:") ||
                 // ES
                 title.startsWith("Apéndice:") ||
                 title.startsWith("Archivo:") ||
@@ -195,39 +225,40 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                 title.startsWith("Predefinição:") ||
                 title.startsWith("Vocabulário:") ||
                 title.startsWith("Wikcionário:") ||
+                title.startsWith("Módulo:") ||
 
                 // sentinel
                 false
                ) return;
-            if (!title.startsWith("Sign gloss:")) {
+            // leave the Flexion: pages in for now and do not warn about them
+            if (!title.startsWith("Sign gloss:") && !title.startsWith("Flexion:")) {
                 System.err.println("title with colon: " + title);
             }
         }
 
         String text = textBuilder.toString();
         // Workaround for Spanish wiktionary {{ES}} and {{ES|word}} patterns
-        text = text.replaceAll("\\{\\{ES(\\|[^{}=]*)?}}", "== {{lengua|es}} ==");
+        text = startSpanish.matcher(text).replaceAll("== {{lengua|es}} ==");
         String translingual = "";
         int start = 0;
-        final Matcher startMatcher = headingStart.matcher(text);
+        Matcher headingStart = headingStartPattern.matcher(text);
 
         while (start < text.length()) {
             // Find start.
-            if (!startMatcher.find(start)) {
+            if (!headingStart.find(start)) {
                 return;
             }
-            start = startMatcher.end();
+            start = headingStart.end();
 
-            final String heading = startMatcher.group();
+            final String heading = headingStart.group();
 
             // For Translingual entries just store the text for later
             // use in the per-language sections
-            if (heading.indexOf("Translingual") != -1) {
+            if (heading.contains("Translingual")) {
                 // Find end.
-                final int depth = startMatcher.group(1).length();
-                final Pattern endPattern = getEndPattern(depth);
+                final int depth = headingStart.group(1).length();
+                final Matcher endMatcher = getEndPattern(depth).reset(text);
 
-                final Matcher endMatcher = endPattern.matcher(text);
                 if (endMatcher.find(start)) {
                     int end = endMatcher.start();
                     translingual = text.substring(start, end);
@@ -237,12 +268,11 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
             }
 
             for (final Selector selector : currentSelectors) {
-                if (selector.pattern.matcher(heading).find()) {
+                if (selector.pattern.reset(heading).find()) {
                     // Find end.
-                    final int depth = startMatcher.group(1).length();
-                    final Pattern endPattern = getEndPattern(depth);
+                    final int depth = headingStart.group(1).length();
+                    final Matcher endMatcher = getEndPattern(depth).reset(text);
 
-                    final Matcher endMatcher = endPattern.matcher(text);
                     final int end;
                     if (endMatcher.find(start)) {
                         end = endMatcher.start();
@@ -259,13 +289,13 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                                 sectionText.charAt(dummy_end + 1) == '\n') ++dummy_end;
                         sectionText = sectionText.substring(dummy_end);
                     }
-                    if (heading.indexOf("Japanese") == -1) sectionText += translingual;
+                    if (!heading.contains("Japanese")) sectionText += translingual;
                     final Section section = new Section(title, heading, sectionText);
 
                     try {
                         selector.out.writeUTF(section.title);
                         selector.out.writeUTF(section.heading);
-                        final byte[] bytes = section.text.getBytes("UTF8");
+                        final byte[] bytes = section.text.getBytes(StandardCharsets.UTF_8);
                         selector.out.writeInt(bytes.length);
                         selector.out.write(bytes);
                     } catch (IOException e) {
@@ -300,13 +330,13 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
 
     static class Selector {
         final String outFilename;
-        final Pattern pattern;
+        final Matcher pattern;
 
         DataOutputStream out;
 
         public Selector(final String filename, final String pattern) {
             this.outFilename = filename;
-            this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE);
+            this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE).matcher("");
         }
     }
 
@@ -329,15 +359,14 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
     }
 
     @Override
-    public void characters(char[] ch, int start, int length) throws SAXException {
+    public void characters(char[] ch, int start, int length) {
         if (currentBuilder != null) {
             currentBuilder.append(ch, start, length);
         }
     }
 
     @Override
-    public void endElement(String uri, String localName, String qName)
-    throws SAXException {
+    public void endElement(String uri, String localName, String qName) {
         currentBuilder = null;
         if ("page".equals(qName)) {
             endPage();
@@ -346,7 +375,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
 
     public void parse(final File file) throws ParserConfigurationException,
         SAXException, IOException {
-        final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser();
+        final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
         parser.parse(file, this);
     }
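
Two changes run through WiktionarySplitter: each input wiki now gets its own splitter instance, optionally submitted to a thread pool when "parallel" is passed as the first argument, and the hot regular expressions are held as pre-allocated Matcher objects that are reset() onto new input rather than building a fresh Matcher per page. Matchers are not thread-safe, so each splitter instance keeps its own selectors and end-pattern cache. The reset() idiom in isolation looks like this (an editor's sketch, not code from the commit):

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    class HeadingScanner {
        private static final Pattern HEADING = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE);
        // One Matcher per scanner instance; reset() reuses its internal state
        // instead of allocating a new Matcher for every page checked.
        private final Matcher heading = HEADING.matcher("");

        int countHeadings(String pageText) {
            heading.reset(pageText);
            int n = 0;
            while (heading.find()) n++;
            return n;
        }
    }
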
 
index c68264ebb583523283039adb9a70a313e207b6dd..09e2655ffa88dbf19a13b1831ecc30c0a228dd15 100644 (file)
 
 package com.hughes.android.dictionary.engine;
 
-import java.io.OutputStream;
 import java.io.IOException;
+import java.io.OutputStream;
 import java.io.PipedInputStream;
 import java.io.PipedOutputStream;
 
 public class WriteBuffer extends PipedOutputStream {
-    static int BLOCK_SIZE = 1024 * 1024;
+    static int BLOCK_SIZE = 256 * 1024;
     public WriteBuffer(OutputStream out, int size) {
         assert size >= 2 * BLOCK_SIZE;
         this.out = out;
         try {
             pipe = new PipedInputStream(this, size);
             buffer = new byte[BLOCK_SIZE];
-            writeThread = new Thread(new Runnable() {
-                public void run() {
-                    int read;
-                    try {
-                        while ((read = pipe.read(buffer)) > 0)
-                        {
-                            out.write(buffer, 0, read);
-                            out.flush();
-                        }
-                    } catch (IOException e) {
-                        System.out.println("Error writing to file " + e);
+            writeThread = new Thread(() -> {
+                int read;
+                try {
+                    while ((read = pipe.read(buffer)) > 0)
+                    {
+                        out.write(buffer, 0, read);
+                        out.flush();
                     }
-                    try {
-                        out.close();
-                    } catch (IOException e) {}
+                } catch (IOException e) {
+                    System.out.println("Error writing to file " + e);
                 }
+                try {
+                    out.close();
+                } catch (IOException e) {}
             });
             writeThread.start();
         } catch (IOException e) {}
@@ -61,5 +59,5 @@ public class WriteBuffer extends PipedOutputStream {
     Thread writeThread;
     OutputStream out;
     PipedInputStream pipe;
-    byte buffer[];
+    byte[] buffer;
 }
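
WriteBuffer is the write-side counterpart of ReadAheadBuffer: the caller writes into a pipe and a dedicated thread drains it into the real, typically compressing, output stream; its block size drops from 1 MiB to 256 KiB in this commit. A usage sketch mirroring the output chain the splitter builds above, with java.util.zip standing in for commons-compress to stay self-contained (not code from this commit):

    import java.io.BufferedOutputStream;
    import java.io.DataOutputStream;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.OutputStream;
    import java.util.zip.GZIPOutputStream;

    import com.hughes.android.dictionary.engine.WriteBuffer;

    class WriteBufferDemo {
        // Compression runs on WriteBuffer's writer thread; the producer only fills the pipe.
        static DataOutputStream open(String path) throws IOException {
            OutputStream out = new FileOutputStream(path);
            out = new BufferedOutputStream(out);
            out = new GZIPOutputStream(out);
            out = new WriteBuffer(out, 1024 * 1024);
            return new DataOutputStream(out);
        }
    }
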
index 07d077562b8b106392d201123eb26aa753e48781..e9c61808af5b8a91573cfea2898a7f08fc3bf349 100644 (file)
@@ -23,7 +23,6 @@ import java.nio.charset.Charset;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.LinkedHashSet;
-import java.util.List;
 import java.util.Set;
 import java.util.logging.Logger;
 import java.util.regex.Matcher;
@@ -31,12 +30,11 @@ import java.util.regex.Pattern;
 
 import com.hughes.android.dictionary.engine.DictionaryBuilder;
 import com.hughes.android.dictionary.engine.EntrySource;
-import com.hughes.android.dictionary.engine.IndexedEntry;
 import com.hughes.android.dictionary.engine.EntryTypeName;
 import com.hughes.android.dictionary.engine.IndexBuilder;
+import com.hughes.android.dictionary.engine.IndexedEntry;
 import com.hughes.android.dictionary.engine.Language;
 import com.hughes.android.dictionary.engine.PairEntry;
-import com.hughes.android.dictionary.engine.PairEntry.Pair;
 
 public class DictFileParser implements Parser {
 
@@ -153,7 +151,7 @@ public class DictFileParser implements Parser {
             if (subfields[1][i].length() == 0) {
                 subfields[1][i] = "__";
             }
-            pairEntry.pairs.add(new Pair(subfields[0][i], subfields[1][i]));
+            pairEntry.pairs.add(new PairEntry.Pair(subfields[0][i], subfields[1][i]));
         }
         final IndexedEntry entryData = new IndexedEntry(pairEntry);
         entryData.isValid = true;
@@ -295,9 +293,9 @@ public class DictFileParser implements Parser {
         return field;
     }
 
-    public static final Set<String> tokenize(final String text, final Pattern pattern) {
+    public static Set<String> tokenize(final String text, final Pattern pattern) {
         final String[] split = pattern.split(text);
-        final Set<String> result = new LinkedHashSet<String>(Arrays.asList(split));
+        final Set<String> result = new LinkedHashSet<>(Arrays.asList(split));
         result.remove("");
         return result;
     }
index 721231905326bfc4d70a388782e57415a72fd36c..ca0193a5074604665ca7db55c28e9511e4e2f0f8 100644 (file)
 
 package com.hughes.android.dictionary.parser;
 
-import java.util.ArrayList;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 public final class WikiTokenizer {
 
-    public static interface Callback {
+    public interface Callback {
         void onPlainText(final String text);
         void onMarkup(WikiTokenizer wikiTokenizer);
         void onWikiLink(WikiTokenizer wikiTokenizer);
@@ -77,7 +74,7 @@ public final class WikiTokenizer {
     }
 
     //private static final Pattern wikiTokenEvent = Pattern.compile("($)", Pattern.MULTILINE);
-    private static final Pattern wikiTokenEvent = Pattern.compile("(" +
+    private static final Pattern wikiTokenEvent = Pattern.compile(
             "\\{\\{|\\}\\}|" +
             "\\[\\[|\\]\\]|" +
             "\\||" +  // Need the | because we might have to find unescaped pipes
@@ -87,7 +84,7 @@ public final class WikiTokenizer {
             "<pre>|" +
             "<math>|" +
             "<ref>|" +
-            "$)", Pattern.MULTILINE);
+            "\n", Pattern.MULTILINE);
     private static final String listChars = "*#:;";
 
 
@@ -99,8 +96,8 @@ public final class WikiTokenizer {
     int end = 0;
     int start = -1;
 
-    final List<String> errors = new ArrayList<String>();
-    final List<String> tokenStack = new ArrayList<String>();
+    final List<String> errors = new ArrayList<>();
+    final List<TokenDelim> tokenStack = new ArrayList<>();
 
 
     private String headingWikiText;
@@ -116,8 +113,8 @@ public final class WikiTokenizer {
 
     private int lastUnescapedPipePos;
     private int lastUnescapedEqualsPos;
-    private final List<String> positionArgs = new ArrayList<String>();
-    private final Map<String,String> namedArgs = new LinkedHashMap<String,String>();
+    private final List<String> positionArgs = new ArrayList<>();
+    private final Map<String,String> namedArgs = new LinkedHashMap<>();
 
 
     public WikiTokenizer(final String wikiText) {
@@ -126,6 +123,7 @@ public final class WikiTokenizer {
 
     public WikiTokenizer(String wikiText, final boolean isNewline) {
         wikiText = wikiText.replace('\u2028', '\n');
+        wikiText = wikiText.replace('\u2029', '\n');
         wikiText = wikiText.replace('\u0085', '\n');
         this.wikiText = wikiText;
         this.matcher = wikiTokenEvent.matcher(wikiText);
@@ -153,7 +151,7 @@ public final class WikiTokenizer {
         namedArgs.clear();
     }
 
-    private static final Pattern POSSIBLE_WIKI_TEXT = Pattern.compile(
+    private static final Matcher POSSIBLE_WIKI_TEXT = Pattern.compile(
                 "\\{\\{|" +
                 "\\[\\[|" +
                 "<!--|" +
@@ -161,12 +159,33 @@ public final class WikiTokenizer {
                 "<pre>|" +
                 "<math>|" +
                 "<ref>|" +
-                "[\n]"
-            );
+                "\n"
+            ).matcher("");
 
     public static void dispatch(final String wikiText, final boolean isNewline, final Callback callback) {
-        // Optimization...
-        if (!POSSIBLE_WIKI_TEXT.matcher(wikiText).find()) {
+        // Statistical background, from EN-DE dictionary generation:
+        // out of 12083000 calls, 9697686 can be skipped via the test
+        // for ', \n and ((c - 0x3b) & 0xff9f) < 2 (which covers among others
+        // <, { and [).
+        // This increased to 10006466 checking for <, { and [ specifically,
+        // and is minimally faster overall.
+        // An even more precise one using regex and checking for {{, [[, <!--, '',
+        // <pre>, <math>, <ref> and \n increased that to 10032846.
+        // Regex thus seems far too costly for a measly increase from 80%/82% to 83% rejection rate
+        // However completely removing it changes output (likely a bug), so leave it in for now
+        // but at least run it only on the 18% not caught by the faster logic.
+        // Original runtime: 1m29.708s
+        // Optimized: 1m19.170s
+        // Regex removed: 1m20.314s (not statistically significant)
+        boolean matched = false;
+        for (int i = 0; i < wikiText.length(); i++) {
+            int c = wikiText.charAt(i);
+            if (c == '\'' || c == '\n' || c == '<' || c == '[' || c == '{') {
+                matched = true;
+                break;
+            }
+        }
+        if (!matched || !POSSIBLE_WIKI_TEXT.reset(wikiText).find()) {
             callback.onPlainText(wikiText);
         } else {
             final WikiTokenizer tokenizer = new WikiTokenizer(wikiText, isNewline);
@@ -338,7 +357,7 @@ public final class WikiTokenizer {
             }
 
             // Eat a newline if we're looking at one:
-            final boolean atNewline = wikiText.charAt(end) == '\n' || wikiText.charAt(end) == '\u2028';
+            final boolean atNewline = wikiText.charAt(end) == '\n';
             if (atNewline) {
                 justReturnedNewline = true;
                 ++end;
@@ -358,7 +377,7 @@ public final class WikiTokenizer {
                     // Skip non-=...
                     if (end < len) {
                         final int nextNewline = safeIndexOf(wikiText, end, "\n", "\n");
-                        final int closingEquals = escapedFindEnd(end, "=");
+                        final int closingEquals = escapedFindEnd(end, TokenDelim.EQUALS);
                         if (wikiText.charAt(closingEquals - 1) == '=') {
                             end = closingEquals - 1;
                         } else {
@@ -378,7 +397,7 @@ public final class WikiTokenizer {
                 if (listChars.indexOf(firstChar) != -1) {
                     while (++end < len && listChars.indexOf(wikiText.charAt(end)) != -1) {}
                     listPrefixEnd = end;
-                    end = escapedFindEnd(start, "\n");
+                    end = escapedFindEnd(start, TokenDelim.NEWLINE);
                     return this;
                 }
             }
@@ -396,13 +415,13 @@ public final class WikiTokenizer {
             }
 
             if (wikiText.startsWith("[[", start)) {
-                end = escapedFindEnd(start + 2, "]]");
+                end = escapedFindEnd(start + 2, TokenDelim.DBRACKET_CLOSE);
                 isWikiLink = errors.isEmpty();
                 return this;
             }
 
             if (wikiText.startsWith("{{", start)) {
-                end = escapedFindEnd(start + 2, "}}");
+                end = escapedFindEnd(start + 2, TokenDelim.BRACE_CLOSE);
                 isFunction = errors.isEmpty();
                 return this;
             }
@@ -444,17 +463,28 @@ public final class WikiTokenizer {
             }
 
 
-            if (this.matcher.find(start)) {
-                end = this.matcher.start(1);
+            while (end < wikiText.length()) {
+                int c = wikiText.charAt(end);
+                if (c == '\n' || c == '\'' || ((c - 0x1b) & 0xff9f) < 3) {
+                    matcher.region(end, wikiText.length());
+                    if (matcher.lookingAt()) break;
+                }
+                end++;
+            }
+            if (end != wikiText.length()) {
                 isPlainText = true;
                 if (end == start) {
-                    errors.add("Empty group: " + this.matcher.group());
+                    // stumbled over a new type of newline?
+                    // Or matcher is out of sync with checks above
+                    errors.add("Empty group: " + this.matcher.group() + " char: " + (int)wikiText.charAt(end));
                     assert false;
+                    // Note: all newlines should be normalized to \n before calling this function
+                    throw new RuntimeException("matcher not in sync with code, or new type of newline, errors :" + errors);
                 }
                 return this;
             }
 
-            end = wikiText.length();
+            isPlainText = true;
             return this;
 
         } finally {
@@ -471,68 +501,90 @@ public final class WikiTokenizer {
         return token;
     }
 
-    final static String[] patterns = { "\n", "{{", "}}", "[[", "]]", "[", "]", "|", "=", "<!--" };
-    private int escapedFindEnd(final int start, final String toFind) {
+    enum TokenDelim { NEWLINE, BRACE_OPEN, BRACE_CLOSE, DBRACKET_OPEN, DBRACKET_CLOSE, BRACKET_OPEN, BRACKET_CLOSE, PIPE, EQUALS, COMMENT }
+
+    private int tokenDelimLen(TokenDelim d) {
+        switch (d) {
+            case NEWLINE:
+            case BRACKET_OPEN:
+            case BRACKET_CLOSE:
+            case PIPE:
+            case EQUALS:
+                return 1;
+            case BRACE_OPEN:
+            case BRACE_CLOSE:
+            case DBRACKET_OPEN:
+            case DBRACKET_CLOSE:
+                return 2;
+            case COMMENT:
+                return 4;
+            default:
+                throw new RuntimeException();
+        }
+    }
+
+    static final String[] patterns = { "\n", "{{", "}}", "[[", "]]", "[", "]", "|", "=", "<!--" };
+    private int escapedFindEnd(final int start, final TokenDelim toFind) {
         assert tokenStack.isEmpty();
 
-        final boolean insideFunction = toFind.equals("}}");
+        final boolean insideFunction = toFind == TokenDelim.BRACE_CLOSE;
 
         int end = start;
         int firstNewline = -1;
-        int[] nextMatch = new int[patterns.length];
-        for (int i = 0; i < nextMatch.length; ++i) {
-            nextMatch[i] = -2;
-        }
         int singleBrackets = 0;
         while (end < wikiText.length()) {
             // Manual replacement for matcher.find(end),
             // because Java regexp is a ridiculously slow implementation.
             // Initialize to always match the end.
-            int matchIdx = 0;
-            for (int i = 0; i < nextMatch.length; ++i) {
-                if (nextMatch[i] <= end) {
-                    nextMatch[i] = wikiText.indexOf(patterns[i], end);
-                    if (nextMatch[i] == -1) nextMatch[i] = i > 0 ? 0x7fffffff : wikiText.length();
-                }
-                if (nextMatch[i] < nextMatch[matchIdx]) {
-                    matchIdx = i;
-                }
+            TokenDelim match = TokenDelim.NEWLINE;
+            int matchStart = end;
+            for (; matchStart < wikiText.length(); matchStart++) {
+                int i = matchStart;
+                int c = wikiText.charAt(i);
+                if (c == '\n') break;
+                if (c == '{' && wikiText.startsWith("{{", i)) { match = TokenDelim.BRACE_OPEN; break; }
+                if (c == '}' && wikiText.startsWith("}}", i)) { match = TokenDelim.BRACE_CLOSE; break; }
+                if (c == '[') { match = wikiText.startsWith("[[", i) ? TokenDelim.DBRACKET_OPEN : TokenDelim.BRACKET_OPEN ; break; }
+                if (c == ']') { match = wikiText.startsWith("]]", i) ? TokenDelim.DBRACKET_CLOSE : TokenDelim.BRACKET_CLOSE ; break; }
+                if (c == '|') { match = TokenDelim.PIPE; break; }
+                if (c == '=') { match = TokenDelim.EQUALS; break; }
+                if (c == '<' && wikiText.startsWith("<!--", i)) { match = TokenDelim.COMMENT; break; }
             }
 
-            int matchStart = nextMatch[matchIdx];
-            String matchText = patterns[matchIdx];
-            int matchEnd = matchStart + matchText.length();
-            if (matchIdx == 0) {
-                matchText = "";
-                matchEnd = matchStart;
+            int matchEnd = matchStart + (match == TokenDelim.NEWLINE ? 0 : tokenDelimLen(match));
+            if (match != TokenDelim.NEWLINE && tokenStack.isEmpty() && match == toFind) {
+                // The normal return....
+                if (insideFunction) {
+                    addFunctionArg(insideFunction, matchStart);
+                }
+                return matchEnd;
             }
-
-            assert matchEnd > end || matchText.length() == 0: "Group=" + matchText;
-            if (matchText.length() == 0) {
+            switch (match) {
+                case NEWLINE:
                 assert matchStart == wikiText.length() || wikiText.charAt(matchStart) == '\n' : wikiText + ", " + matchStart;
                 if (firstNewline == -1) {
                     firstNewline = matchEnd;
                 }
-                if (tokenStack.isEmpty() && toFind.equals("\n")) {
+                if (tokenStack.isEmpty() && toFind == TokenDelim.NEWLINE) {
                     return matchStart;
                 }
                 ++end;
-            } else if (tokenStack.isEmpty() && matchText.equals(toFind)) {
-                // The normal return....
-                if (insideFunction) {
-                    addFunctionArg(insideFunction, matchStart);
-                }
-                return matchEnd;
-            } else if (matchText.equals("[")) {
+                break;
+                case BRACKET_OPEN:
                 singleBrackets++;
-            } else if (matchText.equals("]")) {
+                break;
+                case BRACKET_CLOSE:
                 if (singleBrackets > 0) singleBrackets--;
-            } else if (matchText.equals("[[") || matchText.equals("{{")) {
-                tokenStack.add(matchText);
-            } else if (matchText.equals("]]") || matchText.equals("}}")) {
-                if (tokenStack.size() > 0) {
-                    final String removed = tokenStack.remove(tokenStack.size() - 1);
-                    if (removed.equals("{{") && !matchText.equals("}}")) {
+                break;
+                case DBRACKET_OPEN:
+                case BRACE_OPEN:
+                tokenStack.add(match);
+                break;
+                case DBRACKET_CLOSE:
+                case BRACE_CLOSE:
+                if (!tokenStack.isEmpty()) {
+                    final TokenDelim removed = tokenStack.remove(tokenStack.size() - 1);
+                    if (removed == TokenDelim.BRACE_OPEN && match != TokenDelim.BRACE_CLOSE) {
                         if (singleBrackets >= 2) { // assume this is really two closing single ]
                             singleBrackets -= 2;
                             tokenStack.add(removed);
@@ -540,46 +592,47 @@ public final class WikiTokenizer {
                             errors.add("Unmatched {{ error: " + wikiText.substring(start, matchEnd));
                             return safeIndexOf(wikiText, start, "\n", "\n");
                         }
-                    } else if (removed.equals("[[") && !matchText.equals("]]")) {
+                    } else if (removed == TokenDelim.DBRACKET_OPEN && match != TokenDelim.DBRACKET_CLOSE) {
                         errors.add("Unmatched [[ error: " + wikiText.substring(start, matchEnd));
                         return safeIndexOf(wikiText, start, "\n", "\n");
                     }
                 } else {
-                    errors.add("Pop too many " + matchText + " error: " + wikiText.substring(start, matchEnd).replace("\n", "\\\\n"));
+                    errors.add("Pop too many " + wikiText.substring(matchStart, matchEnd) + " error: " + wikiText.substring(start, matchEnd).replace("\n", "\\\\n"));
                     // If we were looking for a newline
                     return safeIndexOf(wikiText, start, "\n", "\n");
                 }
-            } else if (matchText.equals("|")) {
+                break;
+                case PIPE:
                 if (tokenStack.isEmpty()) {
                     addFunctionArg(insideFunction, matchStart);
                 }
-            } else if (matchText.equals("=")) {
+                break;
+                case EQUALS:
                 if (tokenStack.isEmpty()) {
                     lastUnescapedEqualsPos = matchStart;
                 }
                 // Do nothing.  These can match spuriously, and if it's not the thing
                 // we're looking for, keep on going.
-            } else if (matchText.equals("<!--")) {
+                break;
+                case COMMENT:
                 end = wikiText.indexOf("-->", matchStart);
                 if (end == -1) {
                     errors.add("Unmatched <!-- error: " + wikiText.substring(start));
                     return safeIndexOf(wikiText, start, "\n", "\n");
                 }
-            } else if (matchText.equals("''") || (matchText.startsWith("<") && matchText.endsWith(">"))) {
-                // Don't care.
-            } else {
-                assert false : "Match text='" + matchText + "'";
-                throw new IllegalStateException();
+                break;
+                default:
+                    throw new RuntimeException();
             }
 
             // Inside the while loop.  Just go forward.
             end = Math.max(end, matchEnd);
         }
-        if (toFind.equals("\n") && tokenStack.isEmpty()) {
+        if (toFind == TokenDelim.NEWLINE && tokenStack.isEmpty()) {
             // We were looking for the end, we got it.
             return end;
         }
-        errors.add("Couldn't find: " + (toFind.equals("\n") ? "newline" : toFind) + ", "+ wikiText.substring(start));
+        errors.add("Couldn't find: " + toFind + ", "+ wikiText.substring(start));
         if (firstNewline != -1) {
             return firstNewline;
         }
@@ -602,7 +655,7 @@ public final class WikiTokenizer {
         lastUnescapedPipePos = matchStart;
     }
 
-    static final String trimNewlines(String s) {
+    static String trimNewlines(String s) {
         while (s.startsWith("\n")) {
             s = s.substring(1);
         }
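
nextToken() now scans for the next interesting character by hand and only falls back to the regular expression once a candidate is found; the test ((c - 0x1b) & 0xff9f) < 3 is a branch-cheap filter for the delimiter characters the tokenizer cares about. The sketch below (an editor's illustration, not part of the commit) enumerates what that test accepts: the control characters 0x1B to 0x1D plus ; < = [ \ ] { | }, which covers every starting character of wikiTokenEvent's delimiters except '\n' and '\'', and those two are tested separately in the same condition.

    // Prints the ASCII characters for which ((c - 0x1b) & 0xff9f) < 3 holds.
    public class BitTestDemo {
        public static void main(String[] args) {
            for (int c = 0; c < 128; c++) {
                if (((c - 0x1b) & 0xff9f) < 3) {
                    System.out.printf("0x%02x %s%n", c, c >= 0x20 ? String.valueOf((char) c) : "(control)");
                }
            }
        }
    }
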
index 5193c00892a1f407ced4c994a7e669baf142f204..bc48e529cb0347d05b1b12032c9cf5034cff4f17 100644 (file)
@@ -16,6 +16,7 @@ package com.hughes.android.dictionary.parser;
 
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 
 import junit.framework.TestCase;
@@ -29,7 +30,7 @@ public class WikiTokenizerTest extends TestCase {
         assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
         assertTrue(new WikiTokenizer(wikiText).nextToken().isWikiLink());
         assertEquals("abc", new WikiTokenizer(wikiText).nextToken().wikiLinkText());
-        assertEquals(null, new WikiTokenizer(wikiText).nextToken().wikiLinkDest());
+        assertNull(new WikiTokenizer(wikiText).nextToken().wikiLinkDest());
 
         wikiText = "[[abc|def]]";
         assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
@@ -93,7 +94,7 @@ public class WikiTokenizerTest extends TestCase {
         assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
         assertTrue(new WikiTokenizer(wikiText).nextToken().isFunction());
         assertEquals("abc", new WikiTokenizer(wikiText).nextToken().functionName());
-        assertEquals(Arrays.asList("def"), new WikiTokenizer(wikiText).nextToken().functionPositionArgs());
+        assertEquals(Collections.singletonList("def"), new WikiTokenizer(wikiText).nextToken().functionPositionArgs());
         assertEquals(0, new WikiTokenizer(wikiText).nextToken().functionNamedArgs().size());
 
         wikiText = "{{abc|d[[|]]ef|ghi}}";
@@ -153,10 +154,10 @@ public class WikiTokenizerTest extends TestCase {
         assertEquals("\n", tokenizer.nextToken().token());
 
         assertEquals("hello2", tokenizer.nextToken().token());
-        assertEquals(null, tokenizer.nextToken());
+        assertNull(tokenizer.nextToken());
         tokenizer.returnToLineStart();
         assertEquals("hello2", tokenizer.nextToken().token());
-        assertEquals(null, tokenizer.nextToken());
+        assertNull(tokenizer.nextToken());
 
 
     }
@@ -233,7 +234,7 @@ public class WikiTokenizerTest extends TestCase {
             "[extraterminated]]" + "\n" +
             "=== {{header-template}} ===" + "\n";
 
-        final String[] expectedTokens = new String[] {
+        final String[] expectedTokens = {
             "Hi",
             "\n",
             "Hello ",
@@ -298,7 +299,7 @@ public class WikiTokenizerTest extends TestCase {
             "\n",
         };
 
-        final List<String> actualTokens = new ArrayList<String>();
+        final List<String> actualTokens = new ArrayList<>();
 
         final WikiTokenizer wikiTokenizer = new WikiTokenizer(wikiText);
         WikiTokenizer token;
index c66a0d78010ed32e325e731843af33026bb0c640..81a676c8755909cd19badc6fe799aabd538609dd 100644 (file)
@@ -19,8 +19,9 @@ import java.io.DataInputStream;
 import java.io.EOFException;
 import java.io.File;
 import java.io.FileInputStream;
-import java.io.InputStream;
 import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
 import java.util.LinkedHashMap;
 import java.util.LinkedHashSet;
 import java.util.List;
@@ -31,10 +32,11 @@ import java.util.TreeMap;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.logging.Level;
 import java.util.logging.Logger;
+import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
-import org.apache.commons.compress.compressors.CompressorStreamFactory;
 import org.apache.commons.compress.compressors.CompressorException;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
 
 import com.hughes.android.dictionary.engine.EntrySource;
 import com.hughes.android.dictionary.engine.EntryTypeName;
@@ -49,8 +51,10 @@ public abstract class AbstractWiktionaryParser implements Parser {
 
     static final Logger LOG = Logger.getLogger("WiktionaryParser");
 
-    final SortedMap<String, AtomicInteger> counters = new TreeMap<String, AtomicInteger>();
-    final Set<String> pairsAdded = new LinkedHashSet<String>();
+    private static final Pattern SUPERSCRIPT = Pattern.compile("<sup>[0-9]*</sup>");
+
+    final SortedMap<String, AtomicInteger> counters = new TreeMap<>();
+    final Set<String> pairsAdded = new LinkedHashSet<>();
 
     public EntrySource entrySource;
     public String title;
@@ -60,6 +64,34 @@ public abstract class AbstractWiktionaryParser implements Parser {
 
     abstract void removeUselessArgs(final Map<String, String> namedArgs);
 
+    private static String replaceSuperscript(String in) {
+        Matcher matcher;
+        while ((matcher = SUPERSCRIPT.matcher(in)).find()) {
+            String replace = "";
+            String orig = matcher.group();
+            for (int i = 5; i < orig.length() - 6; i++)
+            {
+                char c = 0;
+                switch (orig.charAt(i)) {
+                case '0': c = '\u2070'; break;
+                case '1': c = '\u00b9'; break;
+                case '2': c = '\u00b2'; break;
+                case '3': c = '\u00b3'; break;
+                case '4': c = '\u2074'; break;
+                case '5': c = '\u2075'; break;
+                case '6': c = '\u2076'; break;
+                case '7': c = '\u2077'; break;
+                case '8': c = '\u2078'; break;
+                case '9': c = '\u2079'; break;
+                }
+                if (c == 0) throw new RuntimeException();
+                replace += c;
+            }
+            in = matcher.replaceFirst(replace);
+        }
+        return in;
+    }
+
     @Override
     public void parse(final File file, final EntrySource entrySource, final int pageLimit) throws IOException {
         this.entrySource = entrySource;
@@ -67,7 +99,7 @@ public abstract class AbstractWiktionaryParser implements Parser {
         File input = new File(file.getPath() + ".bz2");
         if (!input.exists()) input = new File(file.getPath() + ".gz");
         if (!input.exists()) input = new File(file.getPath() + ".xz");
-       DataInputStream dis;
+        DataInputStream dis;
         if (!input.exists()) {
             // Fallback to uncompressed file
             dis = new DataInputStream(new BufferedInputStream(new FileInputStream(file)));
@@ -98,9 +130,9 @@ public abstract class AbstractWiktionaryParser implements Parser {
                 final int bytesLength = dis.readInt();
                 final byte[] bytes = new byte[bytesLength];
                 dis.readFully(bytes);
-                final String text = new String(bytes, "UTF8");
+                final String text = new String(bytes, StandardCharsets.UTF_8);
 
-                parseSection(heading, text);
+                parseSection(heading, replaceSuperscript(text));
 
                 ++pageCount;
                 if (pageCount % 1000 == 0) {
@@ -143,14 +175,14 @@ public abstract class AbstractWiktionaryParser implements Parser {
         StringBuilder builder;
         IndexedEntry indexedEntry;
         IndexBuilder indexBuilder;
-        final Map<String,FunctionCallback<T>> functionCallbacks = new LinkedHashMap<String, FunctionCallback<T>>();
+        final Map<String,FunctionCallback<T>> functionCallbacks = new LinkedHashMap<>();
 
         boolean entryTypeNameSticks = false;
         EntryTypeName entryTypeName = null;
 
-        final Map<String,AtomicInteger> langCodeToTCount = new LinkedHashMap<String, AtomicInteger>();
+        final Map<String,AtomicInteger> langCodeToTCount = new LinkedHashMap<>();
 
-        final NameAndArgs<T> nameAndArgs = new NameAndArgs<T>();
+        final NameAndArgs<T> nameAndArgs = new NameAndArgs<>();
 
         public AppendAndIndexWikiCallback(final T parser) {
             this.parser = parser;
@@ -271,17 +303,17 @@ public abstract class AbstractWiktionaryParser implements Parser {
             if (name != null) {
                 appendAndIndexWikiCallback.dispatch(name, null);
             }
-            for (int i = 0; i < args.size(); ++i) {
-                if (args.get(i).length() > 0) {
+            for (String arg : args) {
+                if (arg.length() > 0) {
                     appendAndIndexWikiCallback.builder.append("|");
-                    appendAndIndexWikiCallback.dispatch(args.get(i), null, null);
+                    appendAndIndexWikiCallback.dispatch(arg, null, null);
                 }
             }
             appendNamedArgs(namedArgs, appendAndIndexWikiCallback);
             return true;
         }
     }
-    static NameAndArgs<AbstractWiktionaryParser> NAME_AND_ARGS = new NameAndArgs<AbstractWiktionaryParser>();
+    static NameAndArgs<AbstractWiktionaryParser> NAME_AND_ARGS = new NameAndArgs<>();
 
     static void appendNamedArgs(final Map<String, String> namedArgs,
                                 final AppendAndIndexWikiCallback<?> appendAndIndexWikiCallback) {
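
replaceSuperscript() rewrites numeric <sup>...</sup> markup into Unicode superscript characters before each section reaches parseSection(); the loop bounds 5 and length() - 6 skip the "<sup>" prefix and "</sup>" suffix of each match. A self-contained restatement with illustrative names (an editor's sketch, not code from this commit):

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    class SuperscriptDemo {
        private static final Pattern SUPERSCRIPT = Pattern.compile("<sup>[0-9]*</sup>");
        private static final String DIGITS = "\u2070\u00b9\u00b2\u00b3\u2074\u2075\u2076\u2077\u2078\u2079";

        static String replace(String in) {
            Matcher m;
            while ((m = SUPERSCRIPT.matcher(in)).find()) {
                StringBuilder sup = new StringBuilder();
                String match = m.group();
                for (int i = 5; i < match.length() - 6; i++) {   // skip "<sup>" and "</sup>"
                    sup.append(DIGITS.charAt(match.charAt(i) - '0'));
                }
                in = m.replaceFirst(sup.toString());
            }
            return in;
        }

        public static void main(String[] args) {
            System.out.println(replace("km<sup>2</sup>"));       // prints "km²"
        }
    }
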
index 871119139af165612aace24e21f337412105368e..fd6ea725c3f21cc49bd7dcf5266efd029b9630bf 100644 (file)
 
 package com.hughes.android.dictionary.parser.wiktionary;
 
+import java.util.List;
+import java.util.Map;
+
 import com.hughes.android.dictionary.parser.WikiTokenizer;
 import com.hughes.android.dictionary.parser.wiktionary.AbstractWiktionaryParser.AppendAndIndexWikiCallback;
 import com.hughes.android.dictionary.parser.wiktionary.AbstractWiktionaryParser.NameAndArgs;
 
-import java.util.List;
-import java.util.Map;
-
 class DeFunctionCallbacks {
 
     static <T extends AbstractWiktionaryParser> void addGenericCallbacks(Map<String, FunctionCallback<T>> callbacks) {
-        FunctionCallback<T> callback = new MakeHeadingFromName<T>("====");
+        FunctionCallback<T> callback = new MakeHeadingFromName<>("====");
         callbacks.put("Aussprache", callback);
         callbacks.put("Worttrennung", callback);
         callbacks.put("Bedeutungen", callback);
@@ -50,7 +50,7 @@ class DeFunctionCallbacks {
     }
 
 
-    static final NameAndArgs<EnParser> NAME_AND_ARGS = new NameAndArgs<EnParser>();
+    static final NameAndArgs<EnParser> NAME_AND_ARGS = new NameAndArgs<>();
 
 
     static final class MakeHeadingFromName<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
index 670462f0fdb3b7e06e9fb08883b24c335f31d780..8954f3784472fe328bebffeb556f1e1357ad2c2b 100644 (file)
@@ -24,7 +24,6 @@ import com.hughes.android.dictionary.engine.EntryTypeName;
 import com.hughes.android.dictionary.engine.IndexBuilder;
 import com.hughes.android.dictionary.engine.IndexedEntry;
 import com.hughes.android.dictionary.engine.PairEntry;
-import com.hughes.android.dictionary.engine.PairEntry.Pair;
 import com.hughes.android.dictionary.parser.WikiTokenizer;
 
 public final class EnForeignParser extends EnParser {
@@ -74,8 +73,8 @@ public final class EnForeignParser extends EnParser {
     static final class ListSection {
         final String firstPrefix;
         final String firstLine;
-        final List<String> nextPrefixes = new ArrayList<String>();
-        final List<String> nextLines = new ArrayList<String>();
+        final List<String> nextPrefixes = new ArrayList<>();
+        final List<String> nextLines = new ArrayList<>();
 
         public ListSection(String firstPrefix, String firstLine) {
             this.firstPrefix = firstPrefix;
@@ -98,7 +97,7 @@ public final class EnForeignParser extends EnParser {
         }
 
         final StringBuilder foreignBuilder = new StringBuilder();
-        final List<EnForeignParser.ListSection> listSections = new ArrayList<EnForeignParser.ListSection>();
+        final List<EnForeignParser.ListSection> listSections = new ArrayList<>();
 
         appendAndIndexWikiCallback.reset(foreignBuilder, null);
         this.state = State.ENGLISH_DEF_OF_FOREIGN;  // TODO: this is wrong, need new category....
@@ -236,7 +235,7 @@ public final class EnForeignParser extends EnParser {
 
         final String english = trim(englishBuilder.toString());
         if (english.length() > 0) {
-            final Pair pair = new Pair(english, trim(foreignText), this.swap);
+            final PairEntry.Pair pair = new PairEntry.Pair(english, trim(foreignText), this.swap);
             pairEntry.pairs.add(pair);
             foreignIndexBuilder.addEntryWithString(indexedEntry, title, entryIsFormOfSomething ? EntryTypeName.WIKTIONARY_IS_FORM_OF_SOMETHING_ELSE : EntryTypeName.WIKTIONARY_TITLE_MULTI);
             for (final String form : forms) {
@@ -265,14 +264,14 @@ public final class EnForeignParser extends EnParser {
             if ((nextPrefix.equals("#:") || nextPrefix.equals("##:")) && dash != -1) {
                 final String foreignEx = nextLine.substring(0, dash);
                 final String englishEx = nextLine.substring(dash + mdashLen);
-                final Pair pair = new Pair(formatAndIndexExampleString(englishEx, enIndexBuilder, indexedEntry), formatAndIndexExampleString(foreignEx, foreignIndexBuilder, indexedEntry), swap);
+                final PairEntry.Pair pair = new PairEntry.Pair(formatAndIndexExampleString(englishEx, enIndexBuilder, indexedEntry), formatAndIndexExampleString(foreignEx, foreignIndexBuilder, indexedEntry), swap);
                 if (pair.lang1 != "--" && pair.lang1 != "--") {
                     pairEntry.pairs.add(pair);
                 }
                 lastForeign = null;
                 // TODO: make #* and #*: work
             } else if (nextPrefix.equals("#:") || nextPrefix.equals("##:")/* || nextPrefix.equals("#*")*/) {
-                final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap);
+                final PairEntry.Pair pair = new PairEntry.Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap);
                 lastForeign = nextLine;
                 if (pair.lang1 != "--" && pair.lang1 != "--") {
                     pairEntry.pairs.add(pair);
@@ -288,27 +287,27 @@ public final class EnForeignParser extends EnParser {
                         }
                     }
                     pairEntry.pairs.remove(pairEntry.pairs.size() - 1);
-                    final Pair pair = new Pair(formatAndIndexExampleString(nextLine, enIndexBuilder, indexedEntry), formatAndIndexExampleString(lastForeign, foreignIndexBuilder, indexedEntry), swap);
+                    final PairEntry.Pair pair = new PairEntry.Pair(formatAndIndexExampleString(nextLine, enIndexBuilder, indexedEntry), formatAndIndexExampleString(lastForeign, foreignIndexBuilder, indexedEntry), swap);
                     if (pair.lang1 != "--" || pair.lang2 != "--") {
                         pairEntry.pairs.add(pair);
                     }
                     lastForeign = null;
                 } else {
                     LOG.warning("TODO: English example with no foreign: " + title + ", " + nextLine);
-                    final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap);
+                    final PairEntry.Pair pair = new PairEntry.Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap);
                     if (pair.lang1 != "--" || pair.lang2 != "--") {
                         pairEntry.pairs.add(pair);
                     }
                 }
             } else if (nextPrefix.equals("#*")) {
                 // Can't really index these.
-                final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap);
+                final PairEntry.Pair pair = new PairEntry.Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap);
                 lastForeign = nextLine;
                 if (pair.lang1 != "--" || pair.lang2 != "--") {
                     pairEntry.pairs.add(pair);
                 }
             } else if (nextPrefix.equals("#::*") || nextPrefix.equals("##") || nextPrefix.equals("#*:") || nextPrefix.equals("#:*") || true) {
-                final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap);
+                final PairEntry.Pair pair = new PairEntry.Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap);
                 if (pair.lang1 != "--" || pair.lang2 != "--") {
                     pairEntry.pairs.add(pair);
                 }
index 5e37a0a572d4cc64727fa56902a84a3cd038e99c..18df94fdfbb2619a92762ebeee1cbdc608f99202 100644 (file)
 
 package com.hughes.android.dictionary.parser.wiktionary;
 
-import com.hughes.android.dictionary.engine.EntryTypeName;
-import com.hughes.android.dictionary.engine.IndexBuilder;
-import com.hughes.android.dictionary.parser.WikiTokenizer;
-import com.hughes.android.dictionary.parser.wiktionary.AbstractWiktionaryParser.AppendAndIndexWikiCallback;
-import com.hughes.android.dictionary.parser.wiktionary.AbstractWiktionaryParser.NameAndArgs;
-import com.hughes.util.ListUtil;
-import com.hughes.util.MapUtil;
-import com.hughes.util.StringUtil;
-
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.LinkedHashMap;
@@ -32,34 +23,43 @@ import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.atomic.AtomicInteger;
 
+import com.hughes.android.dictionary.engine.EntryTypeName;
+import com.hughes.android.dictionary.engine.IndexBuilder;
+import com.hughes.android.dictionary.parser.WikiTokenizer;
+import com.hughes.android.dictionary.parser.wiktionary.AbstractWiktionaryParser.AppendAndIndexWikiCallback;
+import com.hughes.android.dictionary.parser.wiktionary.AbstractWiktionaryParser.NameAndArgs;
+import com.hughes.util.ListUtil;
+import com.hughes.util.MapUtil;
+import com.hughes.util.StringUtil;
+
 class EnFunctionCallbacks {
 
-    static final Map<String,FunctionCallback<EnParser>> DEFAULT = new LinkedHashMap<String, FunctionCallback<EnParser>>();
+    static final Map<String,FunctionCallback<EnParser>> DEFAULT = new LinkedHashMap<>();
 
     static <T extends AbstractWiktionaryParser> void addGenericCallbacks(Map<String, FunctionCallback<T>> callbacks) {
-        FunctionCallback<T> callback = new Gender<T>();
+        FunctionCallback<T> callback = new Gender<>();
         callbacks.put("m", callback);
         callbacks.put("f", callback);
         callbacks.put("n", callback);
         callbacks.put("p", callback);
         callbacks.put("g", callback);
 
-        callbacks.put("etyl", new etyl<T>());
-        callbacks.put("term", new term<T>());
-
-        callback = new EncodingCallback<T>();
-        Set<String> encodings = new LinkedHashSet<String>(Arrays.asList(
-                    "IPA", "IPAchar",  // Not really encodings, but it works.
-                    "zh-ts", "zh-tsp",
-                    "sd-Arab", "ku-Arab", "Arab", "unicode", "Laoo", "ur-Arab", "Thai",
-                    "fa-Arab", "Khmr", "Cyrl", "ug-Arab", "ko-inline",
-                    "Jpan", "Kore", "Hebr", "rfscript", "Beng", "Mong", "Knda", "Cyrs",
-                    "yue-tsj", "Mlym", "Tfng", "Grek", "yue-yue-j"));
+        callbacks.put("etyl", new etyl<>());
+        callbacks.put("term", new term<>());
+
+        callback = new EncodingCallback<>();
+        Set<String> encodings = new LinkedHashSet<>(Arrays.asList(
+                "IPA", "IPAchar",  // Not really encodings, but it works.
+                "zh-ts", "zh-tsp",
+                "sd-Arab", "ku-Arab", "Arab", "unicode", "Laoo", "ur-Arab", "Thai",
+                "fa-Arab", "Khmr", "Cyrl", "ug-Arab", "ko-inline",
+                "Jpan", "Kore", "Hebr", "rfscript", "Beng", "Mong", "Knda", "Cyrs",
+                "yue-tsj", "Mlym", "Tfng", "Grek", "yue-yue-j"));
         for (final String encoding : encodings) {
             callbacks.put(encoding, callback);
         }
 
-        callback = new Ignore<T>();
+        callback = new Ignore<>();
         callbacks.put("trreq", callback);
         callbacks.put("t-image", callback);
         callbacks.put("defn", callback);
@@ -83,39 +83,39 @@ class EnFunctionCallbacks {
         callbacks.put("der-mid3", callback);
         callbacks.put("der-bottom", callback);
 
-        callback = new AppendName<T>();
+        callback = new AppendName<>();
         callbacks.put("...", callback);
 
-        callbacks.put("qualifier", new QualifierCallback<T>());
-        callbacks.put("italbrac", new italbrac<T>());
-        callbacks.put("gloss", new gloss<T>());
-        callbacks.put("not used", new not_used<T>());
-        callbacks.put("wikipedia", new wikipedia<T>());
+        callbacks.put("qualifier", new QualifierCallback<>());
+        callbacks.put("italbrac", new italbrac<>());
+        callbacks.put("gloss", new gloss<>());
+        callbacks.put("not used", new not_used<>());
+        callbacks.put("wikipedia", new wikipedia<>());
 
-        final it_conj<T> it_conj_cb = new it_conj<T>();
+        final it_conj<T> it_conj_cb = new it_conj<>();
         callbacks.put("it-conj", it_conj_cb);
-        callbacks.put("it-conj-are", new it_conj_are<T>(it_conj_cb));
-        callbacks.put("it-conj-arsi", new it_conj_are<T>(it_conj_cb));
-        callbacks.put("it-conj-care", new it_conj_are<T>(it_conj_cb));
-        callbacks.put("it-conj-carsi", new it_conj_are<T>(it_conj_cb));
-        callbacks.put("it-conj-ciare", new it_conj_are<T>(it_conj_cb));
-        callbacks.put("it-conj-ciarsi", new it_conj_are<T>(it_conj_cb));
-        callbacks.put("it-conj-iare", new it_conj_are<T>(it_conj_cb));
-        callbacks.put("it-conj-iarsi", new it_conj_are<T>(it_conj_cb));
-        callbacks.put("it-conj-iare-b", new it_conj_are<T>(it_conj_cb));
-        callbacks.put("it-conj-iarsi-b", new it_conj_are<T>(it_conj_cb));
-        callbacks.put("it-conj-ire", new it_conj_ire<T>(it_conj_cb));
-        callbacks.put("it-conj-irsi", new it_conj_ire<T>(it_conj_cb));
-        callbacks.put("it-conj-ire-b", new it_conj_ire<T>(it_conj_cb));
-        callbacks.put("it-conj-irsi-b", new it_conj_ire<T>(it_conj_cb));
-        callbacks.put("it-conj-cire", new it_conj_ire<T>(it_conj_cb));
-        callbacks.put("it-conj-cirsi", new it_conj_ire<T>(it_conj_cb));
-        callbacks.put("it-conj-ire", new it_conj_ire<T>(it_conj_cb));
-        callbacks.put("it-conj-ere", new it_conj_ere<T>(it_conj_cb));
-        callbacks.put("it-conj-ersi", new it_conj_ere<T>(it_conj_cb));
-        callbacks.put("it-conj-urre", new it_conj_urre<T>(it_conj_cb));
-        callbacks.put("it-conj-ursi", new it_conj_urre<T>(it_conj_cb));
-        callbacks.put("it-conj-fare", new it_conj_fare<T>(it_conj_cb));
+        callbacks.put("it-conj-are", new it_conj_are<>(it_conj_cb));
+        callbacks.put("it-conj-arsi", new it_conj_are<>(it_conj_cb));
+        callbacks.put("it-conj-care", new it_conj_are<>(it_conj_cb));
+        callbacks.put("it-conj-carsi", new it_conj_are<>(it_conj_cb));
+        callbacks.put("it-conj-ciare", new it_conj_are<>(it_conj_cb));
+        callbacks.put("it-conj-ciarsi", new it_conj_are<>(it_conj_cb));
+        callbacks.put("it-conj-iare", new it_conj_are<>(it_conj_cb));
+        callbacks.put("it-conj-iarsi", new it_conj_are<>(it_conj_cb));
+        callbacks.put("it-conj-iare-b", new it_conj_are<>(it_conj_cb));
+        callbacks.put("it-conj-iarsi-b", new it_conj_are<>(it_conj_cb));
+        callbacks.put("it-conj-ire", new it_conj_ire<>(it_conj_cb));
+        callbacks.put("it-conj-irsi", new it_conj_ire<>(it_conj_cb));
+        callbacks.put("it-conj-ire-b", new it_conj_ire<>(it_conj_cb));
+        callbacks.put("it-conj-irsi-b", new it_conj_ire<>(it_conj_cb));
+        callbacks.put("it-conj-cire", new it_conj_ire<>(it_conj_cb));
+        callbacks.put("it-conj-cirsi", new it_conj_ire<>(it_conj_cb));
+        callbacks.put("it-conj-ire", new it_conj_ire<>(it_conj_cb));
+        callbacks.put("it-conj-ere", new it_conj_ere<>(it_conj_cb));
+        callbacks.put("it-conj-ersi", new it_conj_ere<>(it_conj_cb));
+        callbacks.put("it-conj-urre", new it_conj_urre<>(it_conj_cb));
+        callbacks.put("it-conj-ursi", new it_conj_urre<>(it_conj_cb));
+        callbacks.put("it-conj-fare", new it_conj_fare<>(it_conj_cb));
 
 
         //"{{it-conj-fare|putre|avere}}\n" +
@@ -126,7 +126,7 @@ class EnFunctionCallbacks {
     static {
         addGenericCallbacks(DEFAULT);
 
-        FunctionCallback<EnParser> callback = new TranslationCallback<EnParser>();
+        FunctionCallback<EnParser> callback = new TranslationCallback<>();
         DEFAULT.put("t", callback);
         DEFAULT.put("t+", callback);
         DEFAULT.put("t-", callback);
@@ -160,7 +160,7 @@ class EnFunctionCallbacks {
         DEFAULT.put("head", callback);
     }
 
-    static final NameAndArgs<EnParser> NAME_AND_ARGS = new NameAndArgs<EnParser>();
+    static final NameAndArgs<EnParser> NAME_AND_ARGS = new NameAndArgs<>();
 
     // ------------------------------------------------------------------
 
@@ -175,7 +175,7 @@ class EnFunctionCallbacks {
             namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS);
             if (args.size() < 2) {
                 if (!name.equals("ttbc")) {
-                    EnParser.LOG.warning("{{t...}} with wrong args: title=" + parser.title + ", " + wikiTokenizer.token());
+                    AbstractWiktionaryParser.LOG.warning("{{t...}} with wrong args: title=" + parser.title + ", " + wikiTokenizer.token());
                 }
                 return false;
             }
@@ -216,7 +216,7 @@ class EnFunctionCallbacks {
             // Catch-all for anything else...
             if (!namedArgs.isEmpty()) {
                 appendAndIndexWikiCallback.builder.append(" {");
-                EnParser.appendNamedArgs(namedArgs, appendAndIndexWikiCallback);
+                AbstractWiktionaryParser.appendNamedArgs(namedArgs, appendAndIndexWikiCallback);
                 appendAndIndexWikiCallback.builder.append("}");
             }
 
@@ -234,7 +234,7 @@ class EnFunctionCallbacks {
                                       final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
             namedArgs.remove("lang");
             if (!namedArgs.isEmpty()) {
-                EnParser.LOG.warning("weird qualifier: " + wikiTokenizer.token());
+                AbstractWiktionaryParser.LOG.warning("weird qualifier: " + wikiTokenizer.token());
                 return false;
             }
             appendAndIndexWikiCallback.builder.append("(");
@@ -259,7 +259,7 @@ class EnFunctionCallbacks {
                                       final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
             namedArgs.remove("lang");
             if (!namedArgs.isEmpty()) {
-                EnParser.LOG.warning("weird encoding: " + wikiTokenizer.token());
+                AbstractWiktionaryParser.LOG.warning("weird encoding: " + wikiTokenizer.token());
                 return false;
             }
             if (args.size() == 0) {
@@ -299,8 +299,8 @@ class EnFunctionCallbacks {
             }
             appendAndIndexWikiCallback.builder.append("{");
             appendAndIndexWikiCallback.builder.append(name);
-            for (int i = 0; i < args.size(); ++i) {
-                appendAndIndexWikiCallback.builder.append("|").append(args.get(i));
+            for (String arg : args) {
+                appendAndIndexWikiCallback.builder.append("|").append(arg);
             }
             appendAndIndexWikiCallback.builder.append("}");
             return true;
@@ -351,7 +351,7 @@ class EnFunctionCallbacks {
             if (displayText != null) {
                 appendAndIndexWikiCallback.dispatch(displayText, indexBuilder, entryTypeName);
             } else {
-                EnParser.LOG.warning("no display text: " + wikiTokenizer.token());
+                AbstractWiktionaryParser.LOG.warning("no display text: " + wikiTokenizer.token());
             }
 
             final String tr = namedArgs.remove("tr");
@@ -371,7 +371,7 @@ class EnFunctionCallbacks {
             namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS);
             if (!namedArgs.isEmpty()) {
                 appendAndIndexWikiCallback.builder.append(" {").append(name);
-                EnParser.appendNamedArgs(namedArgs, appendAndIndexWikiCallback);
+                AbstractWiktionaryParser.appendNamedArgs(namedArgs, appendAndIndexWikiCallback);
                 appendAndIndexWikiCallback.builder.append("}");
             }
 
@@ -498,7 +498,7 @@ class EnFunctionCallbacks {
                 formName = ListUtil.remove(args, 0, null);
             }
             if (formName == null) {
-                EnParser.LOG.warning("Missing form name: " + parser.title);
+                AbstractWiktionaryParser.LOG.warning("Missing form name: " + parser.title);
                 formName = "form of";
             }
             String baseForm = ListUtil.get(args, 1, "");
@@ -517,7 +517,7 @@ class EnFunctionCallbacks {
                 parser.foreignIndexBuilder.addEntryWithString(appendAndIndexWikiCallback.indexedEntry, baseForm, EntryTypeName.WIKTIONARY_BASE_FORM_MULTI);
             } else {
                 // null baseForm happens in Danish.
-                EnParser.LOG.warning("Null baseform: " + parser.title);
+                AbstractWiktionaryParser.LOG.warning("Null baseform: " + parser.title);
             }
             return true;
         }
@@ -539,11 +539,7 @@ class EnFunctionCallbacks {
             if (args.size() > 1 || !namedArgs.isEmpty()) {
                 // Unindexed!
                 return false;
-            } else if (args.size() == 1) {
-                return false;
-            } else {
-                return true;
-            }
+            } else return args.size() != 1;
         }
     }
 
@@ -637,11 +633,7 @@ class EnFunctionCallbacks {
                 return false;
             }
             String langName = WiktionaryLangs.getEnglishName(langCode);
-            if (langName != null) {
-                appendAndIndexWikiCallback.dispatch(langName, null);
-            } else {
-                appendAndIndexWikiCallback.dispatch("lang:" + langCode, null);
-            }
+            appendAndIndexWikiCallback.dispatch(langName == null ? "lang:" + langCode : langName, null);
             return true;
         }
     }
@@ -682,7 +674,7 @@ class EnFunctionCallbacks {
             if (!StringUtil.isNullOrEmpty(literally)) {
                 literally = String.format("literally %s", literally);
             }
-            final List<String> inParens = new ArrayList<String>(Arrays.asList(tr, pos, gloss, literally));
+            final List<String> inParens = new ArrayList<>(Arrays.asList(tr, pos, gloss, literally));
             cleanList(inParens);
             appendCommaSeparatedList(appendAndIndexWikiCallback, inParens);
 
@@ -755,14 +747,14 @@ class EnFunctionCallbacks {
             }
             parser.wordForms.add(singular);
             if (!namedArgs.isEmpty() || args.size() > 4) {
-                EnParser.LOG.warning("Invalid it-noun: " + wikiTokenizer.token());
+                AbstractWiktionaryParser.LOG.warning("Invalid it-noun: " + wikiTokenizer.token());
             }
             return true;
         }
     }
 
     static {
-        DEFAULT.put("it-proper noun", new it_proper_noun<EnParser>());
+        DEFAULT.put("it-proper noun", new it_proper_noun<>());
     }
     static final class it_proper_noun<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
         @Override
@@ -1041,7 +1033,7 @@ class EnFunctionCallbacks {
         }
     }
 
-    static final Map<String,String> it_indicativePronouns = new LinkedHashMap<String, String>();
+    static final Map<String,String> it_indicativePronouns = new LinkedHashMap<>();
     static {
         it_indicativePronouns.put("1s", "io");
         it_indicativePronouns.put("2s", "tu");
@@ -1051,7 +1043,7 @@ class EnFunctionCallbacks {
         it_indicativePronouns.put("3p", "essi/esse");
     }
 
-    static final Map<String,String> it_subjunctivePronouns = new LinkedHashMap<String, String>();
+    static final Map<String,String> it_subjunctivePronouns = new LinkedHashMap<>();
     static {
         it_subjunctivePronouns.put("1s", "che io");
         it_subjunctivePronouns.put("2s", "che tu");
@@ -1061,7 +1053,7 @@ class EnFunctionCallbacks {
         it_subjunctivePronouns.put("3p", "che essi/esse");
     }
 
-    static final Map<String,String> it_imperativePronouns = new LinkedHashMap<String, String>();
+    static final Map<String,String> it_imperativePronouns = new LinkedHashMap<>();
     static {
         it_imperativePronouns.put("1s", "-");
         it_imperativePronouns.put("2s", "tu");
@@ -1118,18 +1110,18 @@ class EnFunctionCallbacks {
             final List<String> prefixes = (inf != null && inf.endsWith("si")) ? it_reflexive_pronouns : it_empty;
 
             String style = " style=\"background:#c0cfe4\"";
-            outputDataRow(appendAndIndexWikiCallback, style, "indicativo", style, "th", "", new LinkedHashMap<String, String>(it_indicativePronouns), it_empty, false);
+            outputDataRow(appendAndIndexWikiCallback, style, "indicativo", style, "th", "", new LinkedHashMap<>(it_indicativePronouns), it_empty, false);
             outputDataRow(appendAndIndexWikiCallback, style, "presente", "", "td", "pres", namedArgs, prefixes, true);
             outputDataRow(appendAndIndexWikiCallback, style, "imperfetto", "", "td", "imperf", namedArgs, prefixes, true);
             outputDataRow(appendAndIndexWikiCallback, style, "passato remoto", "", "td", "prem", namedArgs, prefixes, true);
             outputDataRow(appendAndIndexWikiCallback, style, "futuro", "", "td", "fut", namedArgs, prefixes, true);
 
             style = " style=\"background:#c0d8e4\"";
-            outputDataRow(appendAndIndexWikiCallback, style, "condizionale", style, "th", "", new LinkedHashMap<String, String>(it_indicativePronouns), it_empty, false);
+            outputDataRow(appendAndIndexWikiCallback, style, "condizionale", style, "th", "", new LinkedHashMap<>(it_indicativePronouns), it_empty, false);
             outputDataRow(appendAndIndexWikiCallback, style, "presente", "", "td", "cond", namedArgs, prefixes, true);
 
             style = " style=\"background:#c0e4c0\"";
-            outputDataRow(appendAndIndexWikiCallback, style, "congiuntivo", style, "th", "", new LinkedHashMap<String, String>(it_subjunctivePronouns), it_empty, false);
+            outputDataRow(appendAndIndexWikiCallback, style, "congiuntivo", style, "th", "", new LinkedHashMap<>(it_subjunctivePronouns), it_empty, false);
             namedArgs.put("sub3s2", namedArgs.remove("sub3s"));
             namedArgs.put("sub1s", namedArgs.get("sub123s"));
             namedArgs.put("sub2s", namedArgs.get("sub123s"));
@@ -1145,7 +1137,7 @@ class EnFunctionCallbacks {
             outputDataRow(appendAndIndexWikiCallback, style, "imperfetto", "", "td", "impsub", namedArgs, prefixes, true);
 
             style = " style=\"background:#e4d4c0\"";
-            outputDataRow(appendAndIndexWikiCallback, style, "imperativo", style, "th", "", new LinkedHashMap<String, String>(it_imperativePronouns), it_empty, false);
+            outputDataRow(appendAndIndexWikiCallback, style, "imperativo", style, "th", "", new LinkedHashMap<>(it_imperativePronouns), it_empty, false);
             outputDataRow(appendAndIndexWikiCallback, style, "", "", "td", "imp", namedArgs, it_empty, false);  // these are attached to the stem.
 
             builder.append("</table>\n");
@@ -1177,7 +1169,7 @@ class EnFunctionCallbacks {
             for (final String number : it_number_s_p) {
                 for (final String person : it_person_1_2_3) {
                     // Output <td> or <th>
-                    builder.append("<").append(type2).append("").append(col2Style).append(">");
+                    builder.append("<").append(type2).append(col2Style).append(">");
                     final String keyBase = String.format("%s%s%s", moodName, person, number);
                     appendAndIndexWikiCallback.dispatch(prefixes.get(i++), null);
                     outputKeyVariations(appendAndIndexWikiCallback, builder, keyBase, namedArgs, isForm);
index d15cc9292d34962c0ab69f7b5db398f3d105dceb..b60235c23991744392ce0799f485829595448107 100644 (file)
@@ -44,15 +44,15 @@ public abstract class EnParser extends AbstractWiktionaryParser {
                 "Particle|Interjection|Pronominal adverb|" +
                 "Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable");
 
-    static final Set<String> USELESS_WIKI_ARGS = new LinkedHashSet<String>(
-        Arrays.asList(
-            "lang",
-            "sc",
-            "sort",
-            "cat",
-            "cat2",
-            "xs",
-            "nodot"));
+    static final Set<String> USELESS_WIKI_ARGS = new LinkedHashSet<>(
+            Arrays.asList(
+                    "lang",
+                    "sc",
+                    "sort",
+                    "cat",
+                    "cat2",
+                    "xs",
+                    "nodot"));
 
     static boolean isIgnorableTitle(final String title) {
         return title.startsWith("Wiktionary:") ||
@@ -83,14 +83,14 @@ public abstract class EnParser extends AbstractWiktionaryParser {
     State state = null;
 
     public boolean entryIsFormOfSomething = false;
-    final Collection<String> wordForms = new ArrayList<String>();
+    final Collection<String> wordForms = new ArrayList<>();
     boolean titleAppended = false;
 
 
     final AppendAndIndexWikiCallback<EnParser> appendAndIndexWikiCallback = new AppendAndIndexCallback(this);
     {
         appendAndIndexWikiCallback.functionCallbacks.putAll(EnFunctionCallbacks.DEFAULT);
-        for (final String key : new ArrayList<String>(appendAndIndexWikiCallback.functionCallbacks.keySet())) {
+        for (final String key : new ArrayList<>(appendAndIndexWikiCallback.functionCallbacks.keySet())) {
             // Don't handle the it-conj functions here.
             if (key.startsWith("it-conj")) {
                 appendAndIndexWikiCallback.functionCallbacks.remove(key);
index 8c9683cf2d4df11b440392e655a72626f1dc812c..73b2db697ce86a8a92179b30baaaeefac38e5dea 100644 (file)
@@ -224,7 +224,7 @@ public final class EnToTranslationParser extends EnParser {
         final Pair pair = new Pair(trim(englishText.toString()), trim(foreignText.toString()), swap);
         pairEntry.pairs.add(pair);
         if (!pairsAdded.add(pair.toString())) {
-            LOG.warning("Duplicate pair: " + pair.toString());
+            LOG.warning("Duplicate pair: " + pair);
             incrementCount("WARNING: Duplicate pair" );
         }
     }
index 4bf49033dca99a39c6e1ada0eefde892d283c8f2..042f0fac3289280cb91db44ac86f5404d09fba6d 100644 (file)
 package com.hughes.android.dictionary.parser.wiktionary;
 
 import java.util.Arrays;
+import java.util.HashSet;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
-import java.util.HashSet;
 import java.util.regex.Pattern;
 
 import com.hughes.android.dictionary.engine.IndexBuilder;
 import com.hughes.android.dictionary.engine.IndexedEntry;
 import com.hughes.android.dictionary.engine.PairEntry;
-import com.hughes.android.dictionary.engine.PairEntry.Pair;
 import com.hughes.android.dictionary.parser.WikiTokenizer;
 import com.hughes.android.dictionary.parser.wiktionary.EnFunctionCallbacks.TranslationCallback;
 import com.hughes.util.ListUtil;
@@ -38,11 +37,11 @@ public final class EnTranslationToTranslationParser extends AbstractWiktionaryPa
     PairEntry pairEntry = null;
     IndexedEntry indexedEntry = null;
     StringBuilder[] builders = null;
-    HashSet<Pair> allPairs = new HashSet<Pair>();
+    final HashSet<PairEntry.Pair> allPairs = new HashSet<>();
 
     public static final String NAME = "EnTranslationToTranslation";
 
-    final Set<String> Ts = new LinkedHashSet<String>(Arrays.asList("t", "t+",
+    final Set<String> Ts = new LinkedHashSet<>(Arrays.asList("t", "t+",
             "t-", "tø", "apdx-t", "ttbc"));
 
     public EnTranslationToTranslationParser(final List<IndexBuilder> indexBuilders,
@@ -89,10 +88,10 @@ public final class EnTranslationToTranslationParser extends AbstractWiktionaryPa
         }
     }
 
-    final TranslationCallback<EnTranslationToTranslationParser> translationCallback = new TranslationCallback<EnTranslationToTranslationParser>();
+    final TranslationCallback<EnTranslationToTranslationParser> translationCallback = new TranslationCallback<>();
 
-    final AppendAndIndexWikiCallback<EnTranslationToTranslationParser> appendAndIndexWikiCallback = new AppendAndIndexWikiCallback<EnTranslationToTranslationParser>(
-        this);
+    final AppendAndIndexWikiCallback<EnTranslationToTranslationParser> appendAndIndexWikiCallback = new AppendAndIndexWikiCallback<>(
+            this);
     {
         for (final String t : Ts) {
             appendAndIndexWikiCallback.functionCallbacks.put(t, translationCallback);
@@ -145,12 +144,12 @@ public final class EnTranslationToTranslationParser extends AbstractWiktionaryPa
         final String lang1 = builders[0].toString();
         final String lang2 = builders[1].toString();
         if (lang1.length() > 0 && lang2.length() > 0) {
-            final Pair newPair = new Pair(lang1, lang2);
+            final PairEntry.Pair newPair = new PairEntry.Pair(lang1, lang2);
             // brute-force approach to prevent adding duplicates
             if (!allPairs.contains(newPair))
             {
                 allPairs.add(newPair);
-                pairEntry.pairs.add(new Pair(lang1, lang2));
+                pairEntry.pairs.add(new PairEntry.Pair(lang1, lang2));
                 indexedEntry.isValid = true;
             }
         }
index 2edf3acf0fedc9d424546e5d62944a748421e3d3..9f4ef05127817b77530663057057af4fb25619a2 100644 (file)
 
 package com.hughes.android.dictionary.parser.wiktionary;
 
+import java.util.List;
+import java.util.Map;
+
 import com.hughes.android.dictionary.parser.WikiTokenizer;
 import com.hughes.android.dictionary.parser.wiktionary.AbstractWiktionaryParser.AppendAndIndexWikiCallback;
 import com.hughes.android.dictionary.parser.wiktionary.AbstractWiktionaryParser.NameAndArgs;
 import com.hughes.android.dictionary.parser.wiktionary.ItFunctionCallbacks.Redispatch;
 
-import java.util.List;
-import java.util.Map;
-
 class FrFunctionCallbacks {
 
     static <T extends AbstractWiktionaryParser> void addGenericCallbacks(Map<String, FunctionCallback<T>> callbacks) {
-        callbacks.put("-étym-", new Redispatch<T>("\n==== Étymologie ====\n"));
-        callbacks.put("-pron-", new Redispatch<T>("\n==== Prononciation ====\n"));
-        callbacks.put("-voir-", new Redispatch<T>("\n==== Voir aussi ====\n"));
-        callbacks.put("-drv-", new Redispatch<T>("\n==== Dérivés ====\n"));
-        callbacks.put("-syn-", new Redispatch<T>("\n==== Synonymes ====\n"));
+        callbacks.put("-étym-", new Redispatch<>("\n==== Étymologie ====\n"));
+        callbacks.put("-pron-", new Redispatch<>("\n==== Prononciation ====\n"));
+        callbacks.put("-voir-", new Redispatch<>("\n==== Voir aussi ====\n"));
+        callbacks.put("-drv-", new Redispatch<>("\n==== Dérivés ====\n"));
+        callbacks.put("-syn-", new Redispatch<>("\n==== Synonymes ====\n"));
 
-        callbacks.put("-apr-", new Redispatch<T>("\n==== Apparentés étymologiques ====\n"));
-        callbacks.put("-hyper-", new Redispatch<T>("\n==== Hyperonymes ====\n"));
-        callbacks.put("-hypo-", new Redispatch<T>("\n==== Hyponymes ====\n"));
-        callbacks.put("-réf-", new Redispatch<T>("\n==== Références ====\n"));
-        callbacks.put("-homo-", new Redispatch<T>("\n==== Homophones ====\n"));
-        callbacks.put("-anagr-", new Redispatch<T>("\n==== Anagrammes ====\n"));
-        callbacks.put("-voc-", new Redispatch<T>("\n==== Vocabulaire apparenté par le sens ====\n"));
-        callbacks.put("-exp-", new Redispatch<T>("\n==== Expressions ====\n"));
-        callbacks.put("-note-", new Redispatch<T>("\n==== Note ====\n"));
+        callbacks.put("-apr-", new Redispatch<>("\n==== Apparentés étymologiques ====\n"));
+        callbacks.put("-hyper-", new Redispatch<>("\n==== Hyperonymes ====\n"));
+        callbacks.put("-hypo-", new Redispatch<>("\n==== Hyponymes ====\n"));
+        callbacks.put("-réf-", new Redispatch<>("\n==== Références ====\n"));
+        callbacks.put("-homo-", new Redispatch<>("\n==== Homophones ====\n"));
+        callbacks.put("-anagr-", new Redispatch<>("\n==== Anagrammes ====\n"));
+        callbacks.put("-voc-", new Redispatch<>("\n==== Vocabulaire apparenté par le sens ====\n"));
+        callbacks.put("-exp-", new Redispatch<>("\n==== Expressions ====\n"));
+        callbacks.put("-note-", new Redispatch<>("\n==== Note ====\n"));
 
-        callbacks.put("-trad-", new ItFunctionCallbacks.SkipSection<T>());
+        callbacks.put("-trad-", new ItFunctionCallbacks.SkipSection<>());
     }
 
 
-    static final NameAndArgs<EnParser> NAME_AND_ARGS = new NameAndArgs<EnParser>();
+    static final NameAndArgs<EnParser> NAME_AND_ARGS = new NameAndArgs<>();
 
 
     static final class MakeHeadingFromName<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
index 8278ccd09b8804b1096bfea35bff5545ff1ce885..17ee4beabfe7fef12259426eb9918168b0ebfd79 100644 (file)
 
 package com.hughes.android.dictionary.parser.wiktionary;
 
+import java.util.List;
+import java.util.Map;
+
 import com.hughes.android.dictionary.parser.WikiTokenizer;
 import com.hughes.android.dictionary.parser.wiktionary.AbstractWiktionaryParser.AppendAndIndexWikiCallback;
 import com.hughes.android.dictionary.parser.wiktionary.AbstractWiktionaryParser.NameAndArgs;
 
-import java.util.List;
-import java.util.Map;
-
 class ItFunctionCallbacks {
 
     static <T extends AbstractWiktionaryParser> void addGenericCallbacks(
         Map<String, FunctionCallback<T>> callbacks) {
-        callbacks.put("-hyph-", new Redispatch<T>("\n==== Sillabazione ====\n"));
-        callbacks.put("-pron-", new Redispatch<T>("\n==== Pronuncia ====\n"));
-        callbacks.put("-etim-", new Redispatch<T>("\n==== Etimologia / Derivazione ====\n"));
-        callbacks.put("-syn-", new Redispatch<T>("\n==== Sinonimi ====\n"));
-        callbacks.put("-ant-", new Redispatch<T>("\n==== Antonimi/Contrari ====\n"));
-        callbacks.put("-drv-", new Redispatch<T>("\n==== Parole derivate ====\n"));
-        callbacks.put("-prov-", new Redispatch<T>("\n==== Proverbi e modi di dire ====\n"));
-        callbacks.put("-ref-", new Redispatch<T>("\n==== Note / Riferimenti ====\n"));
-        callbacks.put("-rel-", new Redispatch<T>("\n==== Termini correlati ====\n"));
-        callbacks.put("-var-", new Redispatch<T>("\n==== Varianti ====\n"));
+        callbacks.put("-hyph-", new Redispatch<>("\n==== Sillabazione ====\n"));
+        callbacks.put("-pron-", new Redispatch<>("\n==== Pronuncia ====\n"));
+        callbacks.put("-etim-", new Redispatch<>("\n==== Etimologia / Derivazione ====\n"));
+        callbacks.put("-syn-", new Redispatch<>("\n==== Sinonimi ====\n"));
+        callbacks.put("-ant-", new Redispatch<>("\n==== Antonimi/Contrari ====\n"));
+        callbacks.put("-drv-", new Redispatch<>("\n==== Parole derivate ====\n"));
+        callbacks.put("-prov-", new Redispatch<>("\n==== Proverbi e modi di dire ====\n"));
+        callbacks.put("-ref-", new Redispatch<>("\n==== Note / Riferimenti ====\n"));
+        callbacks.put("-rel-", new Redispatch<>("\n==== Termini correlati ====\n"));
+        callbacks.put("-var-", new Redispatch<>("\n==== Varianti ====\n"));
 
-        callbacks.put("-trans1-", new SkipSection<T>());
-        callbacks.put("-trans2-", new SkipSection<T>());
-        callbacks.put("-ref-", new SkipSection<T>());
+        callbacks.put("-trans1-", new SkipSection<>());
+        callbacks.put("-trans2-", new SkipSection<>());
+        callbacks.put("-ref-", new SkipSection<>());
     }
 
-    static final NameAndArgs<EnParser> NAME_AND_ARGS = new NameAndArgs<EnParser>();
+    static final NameAndArgs<EnParser> NAME_AND_ARGS = new NameAndArgs<>();
 
     static final class Redispatch<T extends AbstractWiktionaryParser> implements
         FunctionCallback<T> {
index e744fce9e538e752f9e8ae34ab7faad42d35f068..2b719db747b3b5f64edf4c8f938d333c9da04073 100644 (file)
@@ -1,6 +1,15 @@
 
 package com.hughes.android.dictionary.parser.wiktionary;
 
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import org.apache.commons.text.StringEscapeUtils;
+
 import com.hughes.android.dictionary.engine.EntryTypeName;
 import com.hughes.android.dictionary.engine.HtmlEntry;
 import com.hughes.android.dictionary.engine.IndexBuilder;
@@ -9,15 +18,6 @@ import com.hughes.android.dictionary.engine.IndexedEntry;
 import com.hughes.android.dictionary.parser.WikiTokenizer;
 import com.hughes.util.StringUtil;
 
-import org.apache.commons.lang3.StringEscapeUtils;
-
-import java.net.URI;
-import java.util.ArrayList;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.regex.Pattern;
-
 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
 
     public static final String NAME = "WholeSectionToHtmlParser";
@@ -30,7 +30,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
         void addFunctionCallbacks(
             Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
     }
-    static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<String,LangConfig>();
+    static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<>();
     static {
         final Pattern enSkipSections = Pattern.compile(".*(Translations|Anagrams|References).*");
         isoToLangConfig.put("EN", new LangConfig() {
@@ -47,23 +47,18 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
                 if (sectionName.equalsIgnoreCase("Antonyms")) {
                     return EntryTypeName.ANTONYM_MULTI;
                 }
-                if (EnParser.partOfSpeechHeader.matcher(sectionName).matches()) {
-                    // We need to put it in the other index, too (probably)
-                    return null;
-                }
-                if (sectionName.equalsIgnoreCase("Derived Terms")) {
-                    return null;
-                }
+                // We need to put it in the other index, too (probably) ?
+                // EnParser.partOfSpeechHeader.matcher(sectionName).matches()
+
+                // Needs special handling?
+                // sectionName.equalsIgnoreCase("Derived Terms")
                 return null;
             }
 
             @Override
             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
                 final String wikiText = wikiTokenizer.wikiLinkText();
-                if (wikiText.startsWith("Category:")) {
-                    return true;
-                }
-                return false;
+                return wikiText.startsWith("Category:");
             }
             @Override
             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
@@ -108,10 +103,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
             @Override
             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
                 final String wikiText = wikiTokenizer.wikiLinkText();
-                if (wikiText.startsWith("Categoría:")) {
-                    return true;
-                }
-                return false;
+                return wikiText.startsWith("Categoría:");
             }
             @Override
             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
@@ -156,10 +148,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
             @Override
             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
                 final String wikiText = wikiTokenizer.wikiLinkText();
-                if (wikiText.startsWith("Categoria:")) {
-                    return true;
-                }
-                return false;
+                return wikiText.startsWith("Categoria:");
             }
             @Override
             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
@@ -204,10 +193,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
             @Override
             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
                 final String wikiText = wikiTokenizer.wikiLinkText();
-                if (wikiText.startsWith("Kategorie:")) {
-                    return true;
-                }
-                return false;
+                return wikiText.startsWith("Kategorie:");
             }
             @Override
             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
@@ -252,10 +238,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
             @Override
             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
                 final String wikiText = wikiTokenizer.wikiLinkText();
-                if (wikiText.startsWith("Categoria:")) {
-                    return true;
-                }
-                return false;
+                return wikiText.startsWith("Categoria:");
             }
             @Override
             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
@@ -301,10 +284,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
             @Override
             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
                 final String wikiText = wikiTokenizer.wikiLinkText();
-                if (wikiText.startsWith("Catégorie:")) {
-                    return true;
-                }
-                return false;
+                return wikiText.startsWith("Catégorie:");
             }
             @Override
             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
@@ -364,11 +344,19 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
 
         if (webUrlTemplate != null) {
             final String webUrl = String.format(webUrlTemplate, title);
+            String asciiWebUrl = null;
             // URI.create can raise an exception e.g. if webUrl contains %, just ignore those cases.
             try {
-                callback.builder.append(String.format("<p> <a href=\"%s\">%s</a>", URI.create(webUrl).toASCIIString(), escapeHtmlLiteral(webUrl)));
+                asciiWebUrl = URI.create(webUrl).toASCIIString();
             } catch (Exception e) {
             }
+            if (asciiWebUrl != null) {
+                callback.builder.append("<p> <a href=\"");
+                callback.builder.append(asciiWebUrl);
+                callback.builder.append("\">");
+                callback.builder.append(escapeHtmlLiteral(webUrl));
+                callback.builder.append("</a>");
+            }
         }
         htmlEntry.html = callback.builder.toString();
         indexedEntry.isValid = true;
@@ -437,9 +425,11 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
                 titleIndexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.wikiLinkText(), sectionEntryTypeName);
             }
             if (!StringUtil.isNullOrEmpty(linkDest)) {
-                builder.append(String.format("<a href=\"%s\">", HtmlEntry.formatQuickdicUrl("", linkDest)));
+                builder.append("<a href=\"");
+                builder.append(HtmlEntry.formatQuickdicUrl("", linkDest));
+                builder.append("\">");
                 super.onWikiLink(wikiTokenizer);
-                builder.append(String.format("</a>"));
+                builder.append("</a>");
             } else {
                 super.onWikiLink(wikiTokenizer);
             }
@@ -484,12 +474,16 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
                 }
                 return;
             }
-            builder.append(String.format("\n<h%d>", depth));
+            builder.append("\n<h");
+            builder.append(depth);
+            builder.append('>');
             dispatch(headingText, null);
-            builder.append(String.format("</h%d>\n", depth));
+            builder.append("</h");
+            builder.append(depth);
+            builder.append(">\n");
         }
 
-        final List<Character> listPrefixStack = new ArrayList<Character>();
+        final List<Character> listPrefixStack = new ArrayList<>();
 
         @Override
         public void onListItem(WikiTokenizer wikiTokenizer) {
@@ -498,8 +492,9 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
             }
             final String prefix = wikiTokenizer.listItemPrefix();
             while (listPrefixStack.size() < prefix.length()) {
-                builder.append(String.format("<%s>",
-                                             WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
+                builder.append('<');
+                builder.append(WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size())));
+                builder.append('>');
                 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
             }
             builder.append("<li>");
@@ -523,7 +518,9 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
             }
             while (listPrefixStack.size() > nextListHeader.length()) {
                 final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
-                builder.append(String.format("</%s>\n", WikiTokenizer.getListTag(prefixChar)));
+                builder.append("</");
+                builder.append(WikiTokenizer.getListTag(prefixChar));
+                builder.append(">\n");
             }
         }
 
index 86206a060bbbc68f09c4314c16ba1eeae6f245e9..fb9f283f52c2668285d3f5a1030f648319f65d0f 100644 (file)
 
 package com.hughes.android.dictionary.parser.wiktionary;
 
-import com.hughes.android.dictionary.engine.Language;
-
 import java.util.LinkedHashMap;
-import java.util.LinkedHashSet;
 import java.util.Map;
-import java.util.Set;
 import java.util.regex.Pattern;
 
 public class WiktionaryLangs {
 
-    public static final Map<String,String> isoCodeToEnWikiName = new LinkedHashMap<String,String>();
+    public static final Map<String,String> isoCodeToEnWikiName = new LinkedHashMap<>();
     static {
         isoCodeToEnWikiName.put("AF", "Afrikaans");
         isoCodeToEnWikiName.put("SQ", "Albanian");
@@ -111,7 +107,6 @@ public class WiktionaryLangs {
         isoCodeToEnWikiName.put("HT", "Haitian Creole");
         isoCodeToEnWikiName.put("LB", "Luxembourgish");
         isoCodeToEnWikiName.put("MK", "Macedonian");
-        isoCodeToEnWikiName.put("GV", "Manx");
         isoCodeToEnWikiName.put("scn", "Sicilian");
         isoCodeToEnWikiName.put("cu", "Old Church Slavonic");
         isoCodeToEnWikiName.put("rom", "Romani");
@@ -133,7 +128,7 @@ public class WiktionaryLangs {
         //assert Language.isoCodeToResources.keySet().containsAll(isoCodeToEnWikiName.keySet());
     }
 
-    public static final Map<String,Map<String,String>> wikiCodeToIsoCodeToWikiName = new LinkedHashMap<String, Map<String,String>>();
+    public static final Map<String,Map<String,String>> wikiCodeToIsoCodeToWikiName = new LinkedHashMap<>();
     static {
         Map<String,String> isoCodeToWikiName;
 
@@ -141,7 +136,7 @@ public class WiktionaryLangs {
         wikiCodeToIsoCodeToWikiName.put("en", isoCodeToEnWikiName);
 
         // egrep -o '\{\{Wortart[^}]+\}\}' dewiktionary-pages-articles.xml | cut -d \| -f3 | sort | uniq -c | sort -nr
-        isoCodeToWikiName = new LinkedHashMap<String, String>();
+        isoCodeToWikiName = new LinkedHashMap<>();
         wikiCodeToIsoCodeToWikiName.put("de", isoCodeToWikiName);
         isoCodeToWikiName.put("nds", "Niederdeutsch");
         isoCodeToWikiName.put("DE", "Deutsch");
@@ -159,7 +154,7 @@ public class WiktionaryLangs {
         isoCodeToWikiName.put("RO", "Rumänisch");
 
         // egrep -o '== *\{\{langue\|[a-zA-Z]+\}\} *==' frwiktionary-pages-articles.xml | sort | uniq -c | sort -nr
-        isoCodeToWikiName = new LinkedHashMap<String, String>();
+        isoCodeToWikiName = new LinkedHashMap<>();
         wikiCodeToIsoCodeToWikiName.put("fr", isoCodeToWikiName);
         isoCodeToWikiName.put("FR", Pattern.quote("{{langue|fr}}"));
         isoCodeToWikiName.put("RU", Pattern.quote("{{langue|ru}}"));
@@ -187,7 +182,7 @@ public class WiktionaryLangs {
         isoCodeToWikiName.put("SV", Pattern.quote("{{langue|sv}}"));
 
         // egrep -o '= *\{\{-[a-z]+-\}\} *=' itwiktionary-pages-articles.xml | sort | uniq -c | sort -n
-        isoCodeToWikiName = new LinkedHashMap<String, String>();
+        isoCodeToWikiName = new LinkedHashMap<>();
         wikiCodeToIsoCodeToWikiName.put("it", isoCodeToWikiName);
         isoCodeToWikiName.put("IT", "\\{\\{-(it|scn|nap|cal|lmo)-\\}\\}");  // scn, nap, cal, lmo
         isoCodeToWikiName.put("EN", Pattern.quote("{{-en-}}"));
@@ -205,7 +200,7 @@ public class WiktionaryLangs {
         isoCodeToWikiName.put("RU", Pattern.quote("{{-ru-}}"));
 
         // egrep -o '== *\{\{lengua\|[a-zA-Z]+\}\} *==' frwiktionary-pages-articles.xml | sort | uniq -c | sort -nr
-        isoCodeToWikiName = new LinkedHashMap<String, String>();
+        isoCodeToWikiName = new LinkedHashMap<>();
         wikiCodeToIsoCodeToWikiName.put("es", isoCodeToWikiName);
         isoCodeToWikiName.put("AR", Pattern.quote("{{lengua|ar}}"));
         isoCodeToWikiName.put("ES", Pattern.quote("{{lengua|es}}"));
@@ -214,7 +209,7 @@ public class WiktionaryLangs {
         isoCodeToWikiName.put("IT", Pattern.quote("{{lengua|it}}"));
 
         // Pattern seems to match Italian one
-        isoCodeToWikiName = new LinkedHashMap<String, String>();
+        isoCodeToWikiName = new LinkedHashMap<>();
         wikiCodeToIsoCodeToWikiName.put("pt", isoCodeToWikiName);
         isoCodeToWikiName.put("PT", Pattern.quote("{{-pt-}}"));
         isoCodeToWikiName.put("EN", Pattern.quote("{{-en-}}"));
index 6c35dd14626ae82698b04e8665b872bf41df65af..6e30ee17bcb0ace6ce9bc727f74e2582093e13ab 100644 (file)
@@ -26,7 +26,7 @@ public class Args {
             int equalsIndex;
             if (arg.startsWith("--") && (equalsIndex = arg.indexOf("=")) >= 0) {
                 final String key = arg.substring(2, equalsIndex);
-                final String value = arg.substring(equalsIndex + 1, arg.length());
+                final String value = arg.substring(equalsIndex + 1);
                 dest.put(key, value);
             }
         }
index 4cd956b3dca2c02de61ffcffeb4ad92abe5b4496..046adca70e7047015565f22d4cb14d512545d087 100644 (file)
@@ -18,7 +18,7 @@ package com.hughes.util;
 @SuppressWarnings("WeakerAccess")
 public final class EnumUtil {
 
-    public static final <T extends Enum<T>> T min(final T e1, final T e2) {
+    public static <T extends Enum<T>> T min(final T e1, final T e2) {
         if (e1 == null) {
             return e2;
         }
index d3024da18947d68cecf06930a3acaf7ae81efda1..d280cc1a232e8e2798b5e57e6e25ec0d59689b73 100644 (file)
@@ -1,67 +1,67 @@
-// Copyright 2011 Google Inc. All Rights Reserved.\r
-//\r
-// Licensed under the Apache License, Version 2.0 (the "License");\r
-// you may not use this file except in compliance with the License.\r
-// You may obtain a copy of the License at\r
-//\r
-//     http://www.apache.org/licenses/LICENSE-2.0\r
-//\r
-// Unless required by applicable law or agreed to in writing, software\r
-// distributed under the License is distributed on an "AS IS" BASIS,\r
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r
-// See the License for the specific language governing permissions and\r
-// limitations under the License.\r
-\r
-package com.hughes.util;\r
-\r
-import java.io.BufferedReader;\r
-import java.io.File;\r
-import java.io.FileInputStream;\r
-import java.io.FileOutputStream;\r
-import java.io.IOException;\r
-import java.io.InputStreamReader;\r
-import java.io.PrintStream;\r
-import java.io.RandomAccessFile;\r
-import java.util.ArrayList;\r
-import java.util.List;\r
-\r
-@SuppressWarnings("WeakerAccess")\r
-public final class FileUtil {\r
-    public static String readLine(final RandomAccessFile file, final long startPos) throws IOException {\r
-        file.seek(startPos);\r
-        return file.readLine();\r
-    }\r
-\r
-    public static List<String> readLines(final File file) throws IOException {\r
-        final List<String> result = new ArrayList<>();\r
-        try (BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(file)))) {\r
-            String line;\r
-            while ((line = in.readLine()) != null) {\r
-                result.add(line);\r
-            }\r
-        }\r
-        return result;\r
-    }\r
-\r
-    public static String readToString(final File file) throws IOException {\r
-        StringBuilder result = new StringBuilder();\r
-        try (BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(file)))) {\r
-            String line;\r
-            while ((line = in.readLine()) != null) {\r
-                result.append(line).append("\n");\r
-            }\r
-        }\r
-        return result.toString();\r
-    }\r
-\r
-    public static void writeStringToUTF8File(final String string, final File file) {\r
-        throw new IllegalStateException();\r
-    }\r
-\r
-    public static void printString(final File file, final String s) throws IOException {\r
-        final PrintStream out = new PrintStream(new FileOutputStream(file));\r
-        out.print(s);\r
-        out.close();\r
-    }\r
-\r
-}\r
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package com.hughes.util;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.PrintStream;
+import java.io.RandomAccessFile;
+import java.util.ArrayList;
+import java.util.List;
+
+@SuppressWarnings("WeakerAccess")
+public final class FileUtil {
+    public static String readLine(final RandomAccessFile file, final long startPos) throws IOException {
+        file.seek(startPos);
+        return file.readLine();
+    }
+
+    public static List<String> readLines(final File file) throws IOException {
+        final List<String> result = new ArrayList<>();
+        try (BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(file)))) {
+            String line;
+            while ((line = in.readLine()) != null) {
+                result.add(line);
+            }
+        }
+        return result;
+    }
+
+    public static String readToString(final File file) throws IOException {
+        StringBuilder result = new StringBuilder();
+        try (BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(file)))) {
+            String line;
+            while ((line = in.readLine()) != null) {
+                result.append(line).append("\n");
+            }
+        }
+        return result.toString();
+    }
+
+    public static void writeStringToUTF8File(final String string, final File file) {
+        throw new IllegalStateException();
+    }
+
+    public static void printString(final File file, final String s) throws IOException {
+        final PrintStream out = new PrintStream(new FileOutputStream(file));
+        out.print(s);
+        out.close();
+    }
+
+}
index a62a86ec0e557acbd11df56b854995898d015552..0976288f631e3057e349c8e1c2c0beed6a0b5f19 100644 (file)
@@ -16,40 +16,11 @@ package com.hughes.util;
 
 import java.util.Map;
 
-@SuppressWarnings({"WeakerAccess", "unused"})
 public class MapUtil {
-
-    public static <K,V> V safeGet(final Map<K,V> map, K key, V defaultValue) {
-        if (!map.containsKey(key)) {
-            return defaultValue;
-        }
-        return map.get(key);
-    }
-
-    public static <K,V> V safeGetOrPut(final Map<K,V> map, K key, V defaultValue) {
-        if (!map.containsKey(key)) {
-            map.put(key, defaultValue);
-        }
-        return map.get(key);
-    }
-
-    public static <K,V> V safeGet(final Map<K,V> map, K key, Class<V> valueClass) {
-        if (!map.containsKey(key)) {
-            try {
-                map.put(key, valueClass.newInstance());
-            } catch (Exception e) {
-                throw new RuntimeException(e);
-            }
-        }
-        return map.get(key);
-    }
-
     public static <K,V> V safeRemove(final Map<K,V> map, K key, V defaultValue) {
         if (!map.containsKey(key)) {
             return defaultValue;
         }
         return map.remove(key);
     }
-
-
 }
index efd40320022cc7f3e130b08077f204f0514d173a..3d09064333a105af36103f60dafc30b15fbffcfd 100755 (executable)
@@ -1,6 +1,9 @@
 # Run to update ..//Dictionary/res/raw/dictionary_info.txt to reference
 # all dictionaries in /data/outputs (needs to contain both zip and uncompressed files).
-CLASS=CheckDictionariesMain
-JAVA=/usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
-test -x "$JAVA" || JAVA=java
-$JAVA -classpath src:../Dictionary/Util/src/:../Dictionary/src/:/usr/share/java/com.ibm.icu.jar:/usr/share/java/xercesImpl.jar com.hughes.android.dictionary.engine.$CLASS "$@"
+RUNNER=./DictionaryPC
+if ! test -x "$RUNNER" ; then
+  JAVA=/usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
+  test -x "$JAVA" || JAVA=java
+  RUNNER="$JAVA -classpath bin/:/usr/share/java/com.ibm.icu.jar com.hughes.android.dictionary.engine.Runner"
+fi
+$RUNNER CheckDictionariesMain "$@"