1 package com.hughes.android.dictionary;
\r
3 import java.io.IOException;
\r
4 import java.io.RandomAccessFile;
\r
5 import java.util.Arrays;
\r
6 import java.util.LinkedHashMap;
\r
7 import java.util.LinkedHashSet;
\r
8 import java.util.Map;
\r
9 import java.util.Set;
\r
10 import java.util.regex.Matcher;
\r
11 import java.util.regex.Pattern;
\r
13 import com.hughes.util.raf.RAFFactory;
\r
15 public final class SimpleEntry implements Entry {
\r
17 static final byte LANG1 = 0;
\r
18 static final byte LANG2 = 1;
\r
20 static final Pattern lineSplitPattern = Pattern.compile("\\s::\\s");
\r
21 static final Pattern sublineSplitPattern = Pattern.compile("\\s\\|\\s");
\r
23 final String[] lang1;
\r
24 final String[] lang2;
\r
26 SimpleEntry(final String[] lang1, final String[] lang2) {
\r
31 public static final RAFFactory<SimpleEntry> RAF_FACTORY = new RAFFactory<SimpleEntry>() {
\r
32 public SimpleEntry create(RandomAccessFile raf) throws IOException {
\r
33 final int rows = raf.readByte();
\r
34 final String[] lang1 = new String[rows];
\r
35 final String[] lang2 = new String[rows];
\r
36 for (int i = 0; i < lang1.length; ++i) {
\r
37 lang1[i] = raf.readUTF();
\r
38 lang2[i] = raf.readUTF();
\r
40 return new SimpleEntry(lang1, lang2);
\r
42 public void write(RandomAccessFile raf) throws IOException {
\r
43 assert lang1.length == (byte) lang1.length;
\r
44 raf.writeByte(lang1.length);
\r
45 for (int i = 0; i < lang1.length; ++i) {
\r
46 raf.writeUTF(lang1[i]);
\r
47 raf.writeUTF(lang2[i]);
\r
52 public boolean equals(Object o) {
\r
53 if (!(o instanceof SimpleEntry)) {
\r
56 final SimpleEntry that = (SimpleEntry) o;
\r
57 return Arrays.deepEquals(this.lang1, that.lang1) && Arrays.deepEquals(this.lang2, that.lang2);
\r
61 public int hashCode() {
\r
62 return Arrays.deepHashCode(lang1) + Arrays.deepHashCode(lang2);
\r
66 public String toString() {
\r
67 return getRawText(false);
\r
70 public int getRowCount() {
\r
71 assert lang1.length == lang2.length;
\r
72 return lang1.length;
\r
75 String[] getAllText(final byte lang) {
\r
76 if (lang == LANG1) {
\r
79 assert lang == LANG2;
\r
83 String getRawText(boolean onlyFirstSubentry) {
\r
84 final StringBuilder result = new StringBuilder();
\r
85 for (int i = 0; i < (onlyFirstSubentry ? 1 : lang1.length); ++i) {
\r
86 result.append(i == 0 ? "" : " | ").append(lang1[i]);
\r
88 result.append("\t");
\r
89 for (int i = 0; i < (onlyFirstSubentry ? 1 : lang2.length); ++i) {
\r
90 result.append(i == 0 ? "" : " | ").append(lang2[i]);
\r
92 return result.toString();
\r
95 static byte otherLang(final byte lang) {
\r
96 assert lang == LANG1 || lang == LANG2;
\r
97 return lang == LANG1 ? LANG2 : LANG1;
\r
101 Lu Letter, Uppercase
\r
102 Ll Letter, Lowercase
\r
103 Lt Letter, Titlecase
\r
104 Lm Letter, Modifier
\r
106 Mn Mark, Nonspacing
\r
107 Mc Mark, Spacing Combining
\r
109 Nd Number, Decimal Digit
\r
112 Pc Punctuation, Connector
\r
113 Pd Punctuation, Dash
\r
114 Ps Punctuation, Open
\r
115 Pe Punctuation, Close
\r
116 Pi Punctuation, Initial quote (may behave like Ps or Pe depending on usage)
\r
117 Pf Punctuation, Final quote (may behave like Ps or Pe depending on usage)
\r
118 Po Punctuation, Other
\r
120 Sc Symbol, Currency
\r
121 Sk Symbol, Modifier
\r
123 Zs Separator, Space
\r
125 Zp Separator, Paragraph
\r
128 static Pattern htmlDecimalCode = Pattern.compile("&#([0-9]+);");
\r
129 static Pattern htmlCode = Pattern.compile("&#[^;]+;");
\r
131 static SimpleEntry parseFromLine(String line, final boolean hasMultipleSubentries) {
\r
133 line = line.replaceAll("<", "<");
\r
134 line = line.replaceAll(">", ">");
\r
136 while ((matcher = htmlDecimalCode.matcher(line)).find()) {
\r
137 final int intVal = Integer.parseInt(matcher.group(1));
\r
138 final String charCode = "" + ((char) intVal);
\r
139 System.out.println("Replacing " + matcher.group() + " with " + charCode);
\r
140 line = matcher.replaceAll(charCode);
\r
142 if ((matcher = htmlCode.matcher(line)).find()) {
\r
143 System.err.println("HTML code: " + matcher.group());
\r
146 final String[] parts = lineSplitPattern.split(line);
\r
147 if (parts.length != 2) {
\r
148 System.err.println("Entry:" + "Invalid line: " + line);
\r
151 if (!hasMultipleSubentries) {
\r
152 return new SimpleEntry(new String[] {parts[0].trim()}, new String[] {parts[1].trim()});
\r
155 final String[] lang1 = sublineSplitPattern.split(" " + parts[0].trim() + " ");
\r
156 final String[] lang2 = sublineSplitPattern.split(" " + parts[1].trim() + " ");
\r
157 if (lang1.length != lang2.length) {
\r
158 System.err.println("Entry:" + "Invalid subline: " + line);
\r
161 for (int i = 0; i < lang1.length; ++i) {
\r
162 lang1[i] = lang1[i].trim();
\r
163 lang2[i] = lang2[i].trim();
\r
165 return new SimpleEntry(lang1, lang2);
\r
168 static final Map<String, String> bracketToClose = new LinkedHashMap<String, String>();
\r
170 bracketToClose.put("\"", "\"");
\r
171 bracketToClose.put(" '", "' ");
\r
174 // This used to be called WHITESPACE.
\r
175 static final Pattern NON_TOKEN_CHAR = Pattern.compile("\\s+");
\r
177 public Set<String> getIndexableTokens(final byte lang) {
\r
178 final Set<String> result = new LinkedHashSet<String>();
\r
180 for (final String subentry : getAllText(lang)) {
\r
181 text += subentry + " ";
\r
184 text = text.replaceAll("fig\\.", " ");
\r
185 text = text.replaceAll("\\{[^\\}]+}", " ");
\r
186 text = text.replaceAll("\"-", "-");
\r
187 text = text.replaceAll("-\"", "-");
\r
188 text = text.replaceAll("[\"/\\()<>\\[\\],;?!.]", " ");
\r
189 text = text.replaceAll("[-:] ", " ");
\r
190 text = text.replaceAll(" [-:]", " ");
\r
192 // Now be really conservative about what we allow inside a token:
\r
193 // See: http://unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
\r
194 text = text.replaceAll("[^-:\\p{L}\\p{N}\\p{S}]", " ");
\r
195 result.addAll(Arrays.asList(NON_TOKEN_CHAR.split(text)));
\r
197 text = text.replaceAll("[-]", " ");
\r
198 result.addAll(Arrays.asList(NON_TOKEN_CHAR.split(text)));
\r
200 final Set<String> result2 = new LinkedHashSet<String>();
\r
201 for (final String token : result) {
\r
202 if (isIndexable(token)) {
\r
203 result2.add(token);
\r
209 static boolean isIndexable(final String text) {
\r
210 // Does it have an alpha-numeric anywhere?
\r
211 return text.matches(".*\\w.*");
\r