1 package com.hughes.android.dictionary;
\r
3 import java.io.IOException;
\r
4 import java.io.RandomAccessFile;
\r
5 import java.util.ArrayList;
\r
6 import java.util.Arrays;
\r
7 import java.util.LinkedHashMap;
\r
8 import java.util.LinkedHashSet;
\r
9 import java.util.List;
\r
10 import java.util.Map;
\r
11 import java.util.Set;
\r
12 import java.util.regex.Matcher;
\r
13 import java.util.regex.Pattern;
\r
15 import com.hughes.util.raf.RAFFactory;
\r
16 import com.hughes.util.raf.RAFSerializable;
\r
18 public final class SimpleEntry implements Entry {
\r
20 static final byte LANG1 = 0;
\r
21 static final byte LANG2 = 1;
\r
23 static final Pattern lineSplitPattern = Pattern.compile("\\s::\\s");
\r
24 static final Pattern sublineSplitPattern = Pattern.compile("\\s\\|\\s");
\r
26 final String[] lang1;
\r
27 final String[] lang2;
\r
29 SimpleEntry(final String[] lang1, final String[] lang2) {
\r
34 public static final RAFFactory<SimpleEntry> RAF_FACTORY = new RAFFactory<SimpleEntry>() {
\r
35 public SimpleEntry create(RandomAccessFile raf) throws IOException {
\r
36 final int rows = raf.readByte();
\r
37 final String[] lang1 = new String[rows];
\r
38 final String[] lang2 = new String[rows];
\r
39 for (int i = 0; i < lang1.length; ++i) {
\r
40 lang1[i] = raf.readUTF();
\r
41 lang2[i] = raf.readUTF();
\r
43 return new SimpleEntry(lang1, lang2);
\r
45 public void write(RandomAccessFile raf) throws IOException {
\r
46 assert lang1.length == (byte) lang1.length;
\r
47 raf.writeByte(lang1.length);
\r
48 for (int i = 0; i < lang1.length; ++i) {
\r
49 raf.writeUTF(lang1[i]);
\r
50 raf.writeUTF(lang2[i]);
\r
55 public boolean equals(Object o) {
\r
56 if (!(o instanceof SimpleEntry)) {
\r
59 final SimpleEntry that = (SimpleEntry) o;
\r
60 return Arrays.deepEquals(this.lang1, that.lang1) && Arrays.deepEquals(this.lang2, that.lang2);
\r
64 public int hashCode() {
\r
65 return Arrays.deepHashCode(lang1) + Arrays.deepHashCode(lang2);
\r
69 public String toString() {
\r
70 return getRawText(false);
\r
73 public int getRowCount() {
\r
74 assert lang1.length == lang2.length;
\r
75 return lang1.length;
\r
78 String[] getAllText(final byte lang) {
\r
79 if (lang == LANG1) {
\r
82 assert lang == LANG2;
\r
86 String getRawText(boolean onlyFirstSubentry) {
\r
87 final StringBuilder result = new StringBuilder();
\r
88 for (int i = 0; i < (onlyFirstSubentry ? 1 : lang1.length); ++i) {
\r
89 result.append(i == 0 ? "" : " | ").append(lang1[i]);
\r
91 result.append("\t");
\r
92 for (int i = 0; i < (onlyFirstSubentry ? 1 : lang2.length); ++i) {
\r
93 result.append(i == 0 ? "" : " | ").append(lang2[i]);
\r
95 return result.toString();
\r
98 static byte otherLang(final byte lang) {
\r
99 assert lang == LANG1 || lang == LANG2;
\r
100 return lang == LANG1 ? LANG2 : LANG1;
\r
104 Lu Letter, Uppercase
\r
105 Ll Letter, Lowercase
\r
106 Lt Letter, Titlecase
\r
107 Lm Letter, Modifier
\r
109 Mn Mark, Nonspacing
\r
110 Mc Mark, Spacing Combining
\r
112 Nd Number, Decimal Digit
\r
115 Pc Punctuation, Connector
\r
116 Pd Punctuation, Dash
\r
117 Ps Punctuation, Open
\r
118 Pe Punctuation, Close
\r
119 Pi Punctuation, Initial quote (may behave like Ps or Pe depending on usage)
\r
120 Pf Punctuation, Final quote (may behave like Ps or Pe depending on usage)
\r
121 Po Punctuation, Other
\r
123 Sc Symbol, Currency
\r
124 Sk Symbol, Modifier
\r
126 Zs Separator, Space
\r
128 Zp Separator, Paragraph
\r
131 static Pattern htmlDecimalCode = Pattern.compile("&#([0-9]+);");
\r
132 static Pattern htmlCode = Pattern.compile("&#[^;]+;");
\r
134 static SimpleEntry parseFromLine(String line, final boolean hasMultipleSubentries) {
\r
136 line = line.replaceAll("<", "<");
\r
137 line = line.replaceAll(">", ">");
\r
139 while ((matcher = htmlDecimalCode.matcher(line)).find()) {
\r
140 final int intVal = Integer.parseInt(matcher.group(1));
\r
141 final String charCode = "" + ((char) intVal);
\r
142 System.out.println("Replacing " + matcher.group() + " with " + charCode);
\r
143 line = matcher.replaceAll(charCode);
\r
145 if ((matcher = htmlCode.matcher(line)).find()) {
\r
146 System.err.println("HTML code: " + matcher.group());
\r
149 final String[] parts = lineSplitPattern.split(line);
\r
150 if (parts.length != 2) {
\r
151 System.err.println("Entry:" + "Invalid line: " + line);
\r
154 if (!hasMultipleSubentries) {
\r
155 return new SimpleEntry(new String[] {parts[0].trim()}, new String[] {parts[1].trim()});
\r
158 final String[] lang1 = sublineSplitPattern.split(" " + parts[0].trim() + " ");
\r
159 final String[] lang2 = sublineSplitPattern.split(" " + parts[1].trim() + " ");
\r
160 if (lang1.length != lang2.length) {
\r
161 System.err.println("Entry:" + "Invalid subline: " + line);
\r
164 for (int i = 0; i < lang1.length; ++i) {
\r
165 lang1[i] = lang1[i].trim();
\r
166 lang2[i] = lang2[i].trim();
\r
168 return new SimpleEntry(lang1, lang2);
\r
171 static final Map<String, String> bracketToClose = new LinkedHashMap<String, String>();
\r
173 bracketToClose.put("\"", "\"");
\r
174 bracketToClose.put(" '", "' ");
\r
177 // This used to be called WHITESPACE.
\r
178 static final Pattern NON_TOKEN_CHAR = Pattern.compile("\\s+");
\r
180 public Map<String,Integer> getIndexableTokens(final byte lang) {
\r
181 final Set<String> result = new LinkedHashSet<String>();
\r
183 for (final String subentry : getAllText(lang)) {
\r
184 text += subentry + " ";
\r
187 text = text.replaceAll("fig\\.", " ");
\r
188 text = text.replaceAll("\\{[^\\}]+}", " ");
\r
189 text = text.replaceAll("\"-", "-");
\r
190 text = text.replaceAll("-\"", "-");
\r
191 text = text.replaceAll("[\"/\\()<>\\[\\],;?!.]", " ");
\r
192 text = text.replaceAll("[-:] ", " ");
\r
193 text = text.replaceAll(" [-:]", " ");
\r
195 // Now be really conservative about what we allow inside a token:
\r
196 // See: http://unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
\r
197 text = text.replaceAll("[^-:\\p{L}\\p{N}\\p{S}]", " ");
\r
198 result.addAll(Arrays.asList(NON_TOKEN_CHAR.split(text)));
\r
200 text = text.replaceAll("[-]", " ");
\r
201 result.addAll(Arrays.asList(NON_TOKEN_CHAR.split(text)));
\r
203 final Set<String> result2 = new LinkedHashSet<String>();
\r
204 for (final String token : result) {
\r
205 if (isIndexable(token)) {
\r
206 result2.add(token);
\r
212 static boolean isIndexable(final String text) {
\r
213 // Does it have an alpha-numeric anywhere?
\r
214 return text.matches(".*\\w.*");
\r