1 package com.hughes.android.dictionary;
\r
3 import java.io.IOException;
\r
4 import java.io.RandomAccessFile;
\r
5 import java.util.ArrayList;
\r
6 import java.util.Arrays;
\r
7 import java.util.LinkedHashMap;
\r
8 import java.util.LinkedHashSet;
\r
9 import java.util.List;
\r
10 import java.util.Map;
\r
11 import java.util.Set;
\r
12 import java.util.regex.Matcher;
\r
13 import java.util.regex.Pattern;
\r
15 import com.hughes.util.raf.RAFFactory;
\r
16 import com.hughes.util.raf.RAFSerializable;
\r
18 public final class Entry implements RAFSerializable<Entry> {
\r
20 static final byte LANG1 = 0;
\r
21 static final byte LANG2 = 1;
\r
23 static final Pattern lineSplitPattern = Pattern.compile("\\s::\\s");
\r
24 static final Pattern sublineSplitPattern = Pattern.compile("\\s\\|\\s");
\r
/** Sub-entry texts in the first language; this is the array returned for LANG1. */
final String[] lang1;
/** Sub-entry texts in the second language; this is the array returned for LANG2. */
final String[] lang2;

// public Entry(final String lang1, final String lang2) {
//   this.lang1 = new String[] {lang1};
//   this.lang2 = new String[] {lang2};
// }

/**
 * Creates an entry from parallel arrays of sub-entry texts.
 *
 * The arrays are stored as given (no defensive copy). Other methods of this
 * class assume both arrays have the same length.
 *
 * NOTE(review): the original constructor body was not visible in the reviewed
 * text; since both fields are final, the two assignments below are the minimum
 * it must have contained. Confirm nothing else (e.g. assertions) was lost.
 *
 * @param lang1 sub-entry texts in the first language
 * @param lang2 sub-entry texts in the second language
 */
Entry(final String[] lang1, final String[] lang2) {
  this.lang1 = lang1;
  this.lang2 = lang2;
}
\r
39 public static final RAFFactory<Entry> RAF_FACTORY = new RAFFactory<Entry>() {
\r
40 public Entry create(RandomAccessFile raf) throws IOException {
\r
41 final int rows = raf.readByte();
\r
42 final String[] lang1 = new String[rows];
\r
43 final String[] lang2 = new String[rows];
\r
44 for (int i = 0; i < lang1.length; ++i) {
\r
45 lang1[i] = raf.readUTF();
\r
46 lang2[i] = raf.readUTF();
\r
48 return new Entry(lang1, lang2);
\r
50 public void write(RandomAccessFile raf) throws IOException {
\r
51 assert lang1.length == (byte) lang1.length;
\r
52 raf.writeByte(lang1.length);
\r
53 for (int i = 0; i < lang1.length; ++i) {
\r
54 raf.writeUTF(lang1[i]);
\r
55 raf.writeUTF(lang2[i]);
\r
60 public boolean equals(Object o) {
\r
61 if (!(o instanceof Entry)) {
\r
64 final Entry that = (Entry) o;
\r
65 return Arrays.deepEquals(this.lang1, that.lang1) && Arrays.deepEquals(this.lang2, that.lang2);
\r
69 public int hashCode() {
\r
70 return Arrays.deepHashCode(lang1) + Arrays.deepHashCode(lang2);
\r
74 public String toString() {
\r
75 return getRawText();
\r
78 public int getRowCount() {
\r
79 assert lang1.length == lang2.length;
\r
80 return lang1.length;
\r
83 String[] getAllText(final byte lang) {
\r
84 if (lang == LANG1) {
\r
87 assert lang == LANG2;
\r
91 String getRawText() {
\r
92 final StringBuilder result = new StringBuilder();
\r
93 for (int i = 0; i < lang1.length; ++i) {
\r
94 result.append(i == 0 ? "" : " | ").append(lang1[i]);
\r
96 result.append("\t");
\r
97 for (int i = 0; i < lang2.length; ++i) {
\r
98 result.append(i == 0 ? "" : " | ").append(lang2[i]);
\r
100 return result.toString();
\r
103 static byte otherLang(final byte lang) {
\r
104 assert lang == LANG1 || lang == LANG2;
\r
105 return lang == LANG1 ? LANG2 : LANG1;
\r
/*
 * Unicode General Category values
 * (see http://unicode.org/Public/UNIDATA/UCD.html#General_Category_Values):
 *
 * Lu Letter, Uppercase
 * Ll Letter, Lowercase
 * Lt Letter, Titlecase
 * Lm Letter, Modifier
 * Lo Letter, Other
 * Mn Mark, Nonspacing
 * Mc Mark, Spacing Combining
 * Me Mark, Enclosing
 * Nd Number, Decimal Digit
 * Nl Number, Letter
 * No Number, Other
 * Pc Punctuation, Connector
 * Pd Punctuation, Dash
 * Ps Punctuation, Open
 * Pe Punctuation, Close
 * Pi Punctuation, Initial quote (may behave like Ps or Pe depending on usage)
 * Pf Punctuation, Final quote (may behave like Ps or Pe depending on usage)
 * Po Punctuation, Other
 * Sm Symbol, Math
 * Sc Symbol, Currency
 * Sk Symbol, Modifier
 * So Symbol, Other
 * Zs Separator, Space
 * Zl Separator, Line
 * Zp Separator, Paragraph
 */
\r
136 static Pattern htmlDecimalCode = Pattern.compile("&#([0-9]+);");
\r
137 static Pattern htmlCode = Pattern.compile("&#[^;]+;");
\r
139 static Entry parseFromLine(String line, final boolean hasMultipleSubentries) {
\r
141 line = line.replaceAll("<", "<");
\r
142 line = line.replaceAll(">", ">");
\r
144 while ((matcher = htmlDecimalCode.matcher(line)).find()) {
\r
145 final int intVal = Integer.parseInt(matcher.group(1));
\r
146 final String charCode = "" + ((char) intVal);
\r
147 System.out.println("Replacing " + matcher.group() + " with " + charCode);
\r
148 line = matcher.replaceAll(charCode);
\r
150 if ((matcher = htmlCode.matcher(line)).find()) {
\r
151 System.err.println("HTML code: " + matcher.group());
\r
154 final String[] parts = lineSplitPattern.split(line);
\r
155 if (parts.length != 2) {
\r
156 System.err.println("Entry:" + "Invalid line: " + line);
\r
159 if (!hasMultipleSubentries) {
\r
160 return new Entry(new String[] {parts[0].trim()}, new String[] {parts[1].trim()});
\r
163 final String[] lang1 = sublineSplitPattern.split(" " + parts[0].trim() + " ");
\r
164 final String[] lang2 = sublineSplitPattern.split(" " + parts[1].trim() + " ");
\r
165 if (lang1.length != lang2.length) {
\r
166 System.err.println("Entry:" + "Invalid subline: " + line);
\r
169 for (int i = 0; i < lang1.length; ++i) {
\r
170 lang1[i] = lang1[i].trim();
\r
171 lang2[i] = lang2[i].trim();
\r
173 return new Entry(lang1, lang2);
\r
/** Maps an opening delimiter to its closing delimiter, in insertion order. */
static final Map<String, String> bracketToClose = new LinkedHashMap<String, String>();
static {
  bracketToClose.put("\"", "\"");
  bracketToClose.put(" '", "' ");
}
\r
182 // This used to be called WHITESPACE.
\r
183 static final Pattern NON_TOKEN_CHAR = Pattern.compile("\\s+");
\r
185 public Set<String> getIndexableTokens(final byte lang) {
\r
186 final Set<String> result = new LinkedHashSet<String>();
\r
188 for (final String subentry : getAllText(lang)) {
\r
189 text += subentry + " ";
\r
192 text = text.replaceAll("fig\\.", " ");
\r
193 text = text.replaceAll("\\{[^\\}]+}", " ");
\r
194 text = text.replaceAll("\"-", "-");
\r
195 text = text.replaceAll("-\"", "-");
\r
196 text = text.replaceAll("[\"/\\()<>\\[\\],;?!.]", " ");
\r
197 text = text.replaceAll("[-:] ", " ");
\r
198 text = text.replaceAll(" [-:]", " ");
\r
200 // Now be really conservative about what we allow inside a token:
\r
201 // See: http://unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
\r
202 text = text.replaceAll("[^-:\\p{L}\\p{N}\\p{S}]", " ");
\r
203 result.addAll(Arrays.asList(NON_TOKEN_CHAR.split(text)));
\r
205 text = text.replaceAll("[-]", " ");
\r
206 result.addAll(Arrays.asList(NON_TOKEN_CHAR.split(text)));
\r
208 final Set<String> result2 = new LinkedHashSet<String>();
\r
209 for (final String token : result) {
\r
210 if (isIndexable(token)) {
\r
211 result2.add(token);
\r
217 static boolean isIndexable(final String text) {
\r
218 // Does it have an alpha-numeric anywhere?
\r
219 return text.matches(".*\\w.*");
\r
222 static List<String> getTextInside(final String text, final String start, final String end) {
\r
223 final List<String> result = new ArrayList<String>();
\r
225 while ((startPos = text.indexOf(start)) != -1) {
\r
226 final int endPos = text.indexOf(end, startPos + 1);
\r
227 result.add(text.substring(startPos + 1, endPos));
\r
228 startPos = endPos + 1;
\r