--- /dev/null
+package com.hughes.android.dictionary;\r
+\r
+import java.io.IOException;\r
+import java.io.RandomAccessFile;\r
+import java.util.ArrayList;\r
+import java.util.Arrays;\r
+import java.util.LinkedHashMap;\r
+import java.util.LinkedHashSet;\r
+import java.util.List;\r
+import java.util.Map;\r
+import java.util.Set;\r
+import java.util.regex.Matcher;\r
+import java.util.regex.Pattern;\r
+\r
+import com.hughes.util.raf.RAFFactory;\r
+import com.hughes.util.raf.RAFSerializable;\r
+\r
+public final class SimpleEntry implements Entry {\r
+\r
+ // Language selectors: getAllText() returns lang1 for LANG1 and lang2 for LANG2.\r
+ static final byte LANG1 = 0;\r
+ static final byte LANG2 = 1;\r
+\r
+ // Matches "::" with one whitespace character on each side; parseFromLine() splits a line into its two language halves on it.\r
+ static final Pattern lineSplitPattern = Pattern.compile("\\s::\\s");\r
+ // Matches "|" with one whitespace character on each side; parseFromLine() splits each half into sub-entries on it.\r
+ static final Pattern sublineSplitPattern = Pattern.compile("\\s\\|\\s");\r
+\r
+ // Row texts per language, indexed in parallel: write() and RAF_FACTORY store them as (lang1[i], lang2[i]) pairs.\r
+ final String[] lang1;\r
+ final String[] lang2;\r
+ \r
+ /**\r
+  * Creates an entry from two arrays of row texts, one per language.\r
+  * The arrays are stored by reference; no copy is made.\r
+  *\r
+  * @param lang1 row texts for the first language\r
+  * @param lang2 row texts for the second language\r
+  */\r
+ SimpleEntry(final String[] lang1, final String[] lang2) {\r
+   this.lang2 = lang2;\r
+   this.lang1 = lang1;\r
+ }\r
+\r
+ /**\r
+  * Factory that reads one entry back from a file: a one-byte row count followed by\r
+  * that many (lang1, lang2) string pairs, each read with readUTF().\r
+  */\r
+ public static final RAFFactory<SimpleEntry> RAF_FACTORY = new RAFFactory<SimpleEntry>() {\r
+   public SimpleEntry create(RandomAccessFile raf) throws IOException {\r
+     final int rowCount = raf.readByte();\r
+     final String[] first = new String[rowCount];\r
+     final String[] second = new String[rowCount];\r
+     int row = 0;\r
+     while (row < rowCount) {\r
+       // The two texts of a row are stored next to each other, lang1 first.\r
+       first[row] = raf.readUTF();\r
+       second[row] = raf.readUTF();\r
+       row++;\r
+     }\r
+     return new SimpleEntry(first, second);\r
+   }\r
+ };\r
+ /**\r
+  * Serializes this entry: a one-byte row count followed by, for every row, the\r
+  * lang1 text and then the lang2 text, each written with writeUTF().\r
+  *\r
+  * @param raf file to write to, starting at its current position\r
+  * @throws IOException if the underlying file cannot be written\r
+  * @throws IllegalStateException if the entry has more than Byte.MAX_VALUE rows or\r
+  *         the two language arrays differ in length\r
+  */\r
+ public void write(RandomAccessFile raf) throws IOException {\r
+   final int rowCount = lang1.length;\r
+   // The count is stored in one signed byte and read back with readByte(); checked\r
+   // explicitly because assertions are normally disabled and a wrapped count would\r
+   // silently corrupt the record.\r
+   if (rowCount > Byte.MAX_VALUE) {\r
+     throw new IllegalStateException("Too many rows to serialize: " + rowCount);\r
+   }\r
+   if (lang2.length != rowCount) {\r
+     throw new IllegalStateException(\r
+         "Row count mismatch: " + rowCount + " vs " + lang2.length);\r
+   }\r
+   raf.writeByte(rowCount);\r
+   for (int row = 0; row < rowCount; ++row) {\r
+     raf.writeUTF(lang1[row]);\r
+     raf.writeUTF(lang2[row]);\r
+   }\r
+ }\r
+\r
+ /**\r
+  * Two entries are equal when both of their language arrays hold equal texts in\r
+  * the same order.\r
+  */\r
+ @Override\r
+ public boolean equals(Object o) {\r
+   if (o instanceof SimpleEntry) {\r
+     final SimpleEntry other = (SimpleEntry) o;\r
+     if (!Arrays.deepEquals(lang1, other.lang1)) {\r
+       return false;\r
+     }\r
+     return Arrays.deepEquals(lang2, other.lang2);\r
+   }\r
+   return false;\r
+ }\r
+\r
+ /** Hash built from the contents of both language arrays, matching equals(). */\r
+ @Override\r
+ public int hashCode() {\r
+   final int firstHash = Arrays.deepHashCode(lang1);\r
+   final int secondHash = Arrays.deepHashCode(lang2);\r
+   return firstHash + secondHash;\r
+ }\r
+\r
+ /** Returns the raw text of the entry with every row included. */\r
+ @Override\r
+ public String toString() {\r
+   final String everyRow = getRawText(false);\r
+   return everyRow;\r
+ }\r
+\r
+ /**\r
+  * Returns the number of rows in this entry, taken from the lang1 array; both\r
+  * arrays are asserted to have the same length.\r
+  */\r
+ public int getRowCount() {\r
+   final int rows = lang1.length;\r
+   assert rows == lang2.length;\r
+   return rows;\r
+ }\r
+\r
+ /**\r
+  * Returns the row texts of one language.\r
+  *\r
+  * @param lang LANG1 or LANG2 (asserted)\r
+  * @return the internal lang1 array for LANG1, otherwise the internal lang2 array\r
+  */\r
+ String[] getAllText(final byte lang) {\r
+   final boolean wantsFirst = lang == LANG1;\r
+   assert wantsFirst || lang == LANG2;\r
+   return wantsFirst ? lang1 : lang2;\r
+ }\r
+ \r
+ /**\r
+  * Renders the entry as text: the lang1 rows joined by " | ", a tab, then the\r
+  * lang2 rows joined by " | ".\r
+  *\r
+  * @param onlyFirstSubentry when true, only the first row of each language is\r
+  *        included; an entry without rows then contributes nothing for that side\r
+  *        instead of failing with an index error\r
+  * @return the rendered text\r
+  */\r
+ String getRawText(boolean onlyFirstSubentry) {\r
+   final StringBuilder result = new StringBuilder();\r
+   appendRows(result, lang1, onlyFirstSubentry);\r
+   result.append("\t");\r
+   appendRows(result, lang2, onlyFirstSubentry);\r
+   return result.toString();\r
+ }\r
+\r
+ /** Appends the rows (or just the first one, if present) to out, separated by " | ". */\r
+ private static void appendRows(final StringBuilder out, final String[] rows, final boolean onlyFirst) {\r
+   // Math.min guards the empty-array case, which parseFromLine() can produce.\r
+   final int limit = onlyFirst ? Math.min(1, rows.length) : rows.length;\r
+   for (int i = 0; i < limit; ++i) {\r
+     if (i > 0) {\r
+       out.append(" | ");\r
+     }\r
+     out.append(rows[i]);\r
+   }\r
+ }\r
+ \r
+ /**\r
+  * Returns the opposite language selector.\r
+  *\r
+  * @param lang LANG1 or LANG2 (asserted)\r
+  * @return LANG2 when given LANG1, otherwise LANG1\r
+  */\r
+ static byte otherLang(final byte lang) {\r
+   assert lang == LANG1 || lang == LANG2;\r
+   if (lang == LANG1) {\r
+     return LANG2;\r
+   }\r
+   return LANG1;\r
+ }\r
+ \r
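+/*\r
+Unicode general category abbreviations, for reference when reading the \p{...}\r
+character classes in the regular expressions below:\r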
+Lu Letter, Uppercase\r
+Ll Letter, Lowercase\r
+Lt Letter, Titlecase\r
+Lm Letter, Modifier\r
+Lo Letter, Other\r
+Mn Mark, Nonspacing\r
+Mc Mark, Spacing Combining\r
+Me Mark, Enclosing\r
+Nd Number, Decimal Digit\r
+Nl Number, Letter\r
+No Number, Other\r
+Pc Punctuation, Connector\r
+Pd Punctuation, Dash\r
+Ps Punctuation, Open\r
+Pe Punctuation, Close\r
+Pi Punctuation, Initial quote (may behave like Ps or Pe depending on usage)\r
+Pf Punctuation, Final quote (may behave like Ps or Pe depending on usage)\r
+Po Punctuation, Other\r
+Sm Symbol, Math\r
+Sc Symbol, Currency\r
+Sk Symbol, Modifier\r
+So Symbol, Other\r
+Zs Separator, Space\r
+Zl Separator, Line\r
+Zp Separator, Paragraph\r
+*/\r
+\r
+ // Matches a decimal HTML character reference such as "&#228;"; group 1 is the number.\r
+ static Pattern htmlDecimalCode = Pattern.compile("&#([0-9]+);");\r
+ // Matches any "&#...;" reference; used to report references left undecoded.\r
+ static Pattern htmlCode = Pattern.compile("&#[^;]+;");\r
+ \r
+ /**\r
+  * Parses one dictionary line of the form "lang1 :: lang2" into an entry.\r
+  * The entities "&lt;" and "&gt;" and decimal character references are decoded\r
+  * first; any other "&#...;" reference still present is reported on stderr.\r
+  *\r
+  * @param line the raw line\r
+  * @param hasMultipleSubentries when true, each half is further split on " | "\r
+  *        into rows and both halves must yield the same number of rows\r
+  * @return the parsed entry, or null (after reporting on stderr) when the line\r
+  *         does not split into exactly two halves or the row counts differ\r
+  */\r
+ static SimpleEntry parseFromLine(String line, final boolean hasMultipleSubentries) {\r
+   // Literal replacement (not regex) of the two named entities.\r
+   line = line.replace("&lt;", "<");\r
+   line = line.replace("&gt;", ">");\r
+   line = decodeDecimalEntities(line);\r
+\r
+   final Matcher leftover = htmlCode.matcher(line);\r
+   if (leftover.find()) {\r
+     System.err.println("HTML code: " + leftover.group());\r
+   }\r
+\r
+   final String[] parts = lineSplitPattern.split(line);\r
+   if (parts.length != 2) {\r
+     System.err.println("Entry:" + "Invalid line: " + line);\r
+     return null;\r
+   }\r
+   if (!hasMultipleSubentries) {\r
+     return new SimpleEntry(new String[] {parts[0].trim()}, new String[] {parts[1].trim()});\r
+   }\r
+\r
+   // Pad with spaces so a "|" at the very start or end of a half still matches\r
+   // the whitespace-delimited separator pattern.\r
+   final String[] lang1 = sublineSplitPattern.split(" " + parts[0].trim() + " ");\r
+   final String[] lang2 = sublineSplitPattern.split(" " + parts[1].trim() + " ");\r
+   if (lang1.length != lang2.length) {\r
+     System.err.println("Entry:" + "Invalid subline: " + line);\r
+     return null;\r
+   }\r
+   for (int i = 0; i < lang1.length; ++i) {\r
+     lang1[i] = lang1[i].trim();\r
+     lang2[i] = lang2[i].trim();\r
+   }\r
+   return new SimpleEntry(lang1, lang2);\r
+ }\r
+\r
+ /**\r
+  * Replaces every decimal character reference with the character it names, in a\r
+  * single pass. References whose number is out of range are left untouched.\r
+  */\r
+ private static String decodeDecimalEntities(final String line) {\r
+   final Matcher matcher = htmlDecimalCode.matcher(line);\r
+   final StringBuffer decoded = new StringBuffer();\r
+   while (matcher.find()) {\r
+     String replacement = matcher.group();\r
+     try {\r
+       final int codePoint = Integer.parseInt(matcher.group(1));\r
+       if (Character.isValidCodePoint(codePoint)) {\r
+         // toChars() yields a surrogate pair for code points above U+FFFF.\r
+         replacement = new String(Character.toChars(codePoint));\r
+         System.out.println("Replacing " + matcher.group() + " with " + replacement);\r
+       }\r
+     } catch (NumberFormatException e) {\r
+       // Number too large for an int: keep the reference as written.\r
+     }\r
+     // Each match gets its own replacement; quoting keeps '$' and '\' literal.\r
+     matcher.appendReplacement(decoded, Matcher.quoteReplacement(replacement));\r
+   }\r
+   matcher.appendTail(decoded);\r
+   return decoded.toString();\r
+ }\r
+ \r
+ // Maps an opening quote token to the token that closes it: a double quote closes\r
+ // a double quote, and space-apostrophe is closed by apostrophe-space.\r
+ static final Map<String, String> bracketToClose = new LinkedHashMap<String, String>();\r
+ static {\r
+   final String doubleQuote = "\"";\r
+   bracketToClose.put(doubleQuote, doubleQuote);\r
+   final String openingApostrophe = " '";\r
+   final String closingApostrophe = "' ";\r
+   bracketToClose.put(openingApostrophe, closingApostrophe);\r
+ }\r
+ \r
+ // This used to be called WHITESPACE.\r
+ // Matches a run of one or more whitespace characters; used below to split text into tokens.\r
+ static final Pattern NON_TOKEN_CHAR = Pattern.compile("\\s+");\r
+ \r
+ /**\r
+  * Collects the tokens of one language that are worth indexing.\r
+  * All rows of the language are joined, punctuation is stripped in several\r
+  * passes, the text is split on whitespace, and only tokens accepted by\r
+  * isIndexable() are kept, in first-seen order without duplicates.\r
+  *\r
+  * NOTE(review): the declared return type is Map<String,Integer>, but the value\r
+  * returned below (result2) is a Set<String>; as written this does not look like\r
+  * it can compile. The intended meaning of the Integer values is not visible\r
+  * here -- confirm against the Entry interface before changing either side.\r
+  *\r
+  * @param lang LANG1 or LANG2, selecting which language's rows to tokenize\r
+  */\r
+ public Map<String,Integer> getIndexableTokens(final byte lang) {\r
+ final Set<String> result = new LinkedHashSet<String>();\r
+ // Join every row, with a space before, between and after, so the patterns\r
+ // below that look for a neighbouring space also apply at the text edges.\r
+ String text = " ";\r
+ for (final String subentry : getAllText(lang)) {\r
+ text += subentry + " ";\r
+ }\r
+\r
+ // Drop the literal marker "fig." and anything enclosed in {braces}.\r
+ text = text.replaceAll("fig\\.", " ");\r
+ text = text.replaceAll("\\{[^\\}]+}", " ");\r
+ // Remove a double quote that directly touches a hyphen, keeping the hyphen.\r
+ text = text.replaceAll("\"-", "-");\r
+ text = text.replaceAll("-\"", "-");\r
+ // Turn the listed punctuation characters into spaces.\r
+ text = text.replaceAll("[\"/\\()<>\\[\\],;?!.]", " ");\r
+ // Remove a hyphen or colon that is followed or preceded by a space, so only\r
+ // hyphens/colons with non-space neighbours on both sides remain.\r
+ text = text.replaceAll("[-:] ", " ");\r
+ text = text.replaceAll(" [-:]", " ");\r
+ \r
+ // Now be really conservative about what we allow inside a token:\r
+ // See: http://unicode.org/Public/UNIDATA/UCD.html#General_Category_Values\r
+ // Everything except '-', ':', letters, numbers and symbols becomes a space.\r
+ text = text.replaceAll("[^-:\\p{L}\\p{N}\\p{S}]", " ");\r
+ // First pass: tokens that may still contain hyphens.\r
+ result.addAll(Arrays.asList(NON_TOKEN_CHAR.split(text)));\r
+\r
+ // Second pass: split at hyphens too, adding the individual parts as tokens.\r
+ text = text.replaceAll("[-]", " ");\r
+ result.addAll(Arrays.asList(NON_TOKEN_CHAR.split(text)));\r
+ \r
+ // Keep only the tokens that isIndexable() accepts.\r
+ final Set<String> result2 = new LinkedHashSet<String>();\r
+ for (final String token : result) {\r
+ if (isIndexable(token)) {\r
+ result2.add(token);\r
+ }\r
+ }\r
+ return result2;\r
+ }\r
+\r
+ /**\r
+  * Tells whether a token is worth indexing: it must contain at least one letter,\r
+  * digit or underscore somewhere.\r
+  * Letters and digits are judged by Unicode code point, so tokens made only of\r
+  * non-ASCII letters are accepted too (the ASCII-only regex class \w rejected them).\r
+  *\r
+  * @param text the token to test; must not be null\r
+  * @return true if any code point is a letter, a digit or '_'; false for an empty\r
+  *         string or one made only of other characters\r
+  */\r
+ static boolean isIndexable(final String text) {\r
+   int index = 0;\r
+   while (index < text.length()) {\r
+     final int codePoint = text.codePointAt(index);\r
+     // '_' is kept because the previous \w-based check accepted it as well.\r
+     if (codePoint == '_' || Character.isLetterOrDigit(codePoint)) {\r
+       return true;\r
+     }\r
+     index += Character.charCount(codePoint);\r
+   }\r
+   return false;\r
+ }\r
+ \r
+}
\ No newline at end of file