1 package com.hughes.android.dictionary;
\r
3 import java.io.IOException;
\r
4 import java.io.RandomAccessFile;
\r
5 import java.util.ArrayList;
\r
6 import java.util.Arrays;
\r
7 import java.util.LinkedHashMap;
\r
8 import java.util.LinkedHashSet;
\r
9 import java.util.List;
\r
10 import java.util.Map;
\r
11 import java.util.Set;
\r
12 import java.util.regex.Pattern;
\r
14 import com.hughes.util.raf.RAFFactory;
\r
15 import com.hughes.util.raf.RAFSerializable;
\r
17 public final class Entry implements RAFSerializable<Entry> {
\r
19 static final byte LANG1 = 0;
\r
20 static final byte LANG2 = 1;
\r
22 static final Pattern lineSplitPattern = Pattern.compile("\\s::\\s");
\r
23 static final Pattern sublineSplitPattern = Pattern.compile("\\s\\|\\s");
\r
25 final String[] lang1;
\r
26 final String[] lang2;
\r
28 // public Entry(final String lang1, final String lang2) {
\r
29 // this.lang1 = new String[] {lang1};
\r
30 // this.lang2 = new String[] {lang2};
\r
/**
 * Creates an entry from two row arrays.
 *
 * NOTE(review): the constructor body is not visible in the reviewed listing; plain
 * assignment of the two final fields is assumed — confirm against the real file.
 *
 * @param lang1 rows for {@link #LANG1}; stored by reference, not copied
 * @param lang2 rows for {@link #LANG2}; stored by reference, not copied
 */
Entry(final String[] lang1, final String[] lang2) {
  this.lang1 = lang1;
  this.lang2 = lang2;
}
\r
38 public static final RAFFactory<Entry> RAF_FACTORY = new RAFFactory<Entry>() {
\r
39 public Entry create(RandomAccessFile raf) throws IOException {
\r
40 final int rows = raf.readByte();
\r
41 final String[] lang1 = new String[rows];
\r
42 final String[] lang2 = new String[rows];
\r
43 for (int i = 0; i < lang1.length; ++i) {
\r
44 lang1[i] = raf.readUTF();
\r
45 lang2[i] = raf.readUTF();
\r
47 return new Entry(lang1, lang2);
\r
49 public void write(RandomAccessFile raf) throws IOException {
\r
50 assert lang1.length == (byte) lang1.length;
\r
51 raf.writeByte(lang1.length);
\r
52 for (int i = 0; i < lang1.length; ++i) {
\r
53 raf.writeUTF(lang1[i]);
\r
54 raf.writeUTF(lang2[i]);
\r
59 public boolean equals(Object o) {
\r
60 if (!(o instanceof Entry)) {
\r
63 final Entry that = (Entry) o;
\r
64 return Arrays.deepEquals(this.lang1, that.lang1) && Arrays.deepEquals(this.lang2, that.lang2);
\r
68 public int hashCode() {
\r
69 return Arrays.deepHashCode(lang1) + Arrays.deepHashCode(lang2);
\r
73 public String toString() {
\r
74 return getRawText();
\r
77 public int getRowCount() {
\r
78 assert lang1.length == lang2.length;
\r
79 return lang1.length;
\r
82 String[] getAllText(final byte lang) {
\r
83 if (lang == LANG1) {
\r
86 assert lang == LANG2;
\r
90 String getRawText() {
\r
91 final StringBuilder result = new StringBuilder();
\r
92 for (int i = 0; i < lang1.length; ++i) {
\r
93 result.append(i == 0 ? "" : " | ").append(lang1[i]);
\r
95 result.append("\t");
\r
96 for (int i = 0; i < lang2.length; ++i) {
\r
97 result.append(i == 0 ? "" : " | ").append(lang2[i]);
\r
99 return result.toString();
\r
102 static byte otherLang(final byte lang) {
\r
103 assert lang == LANG1 || lang == LANG2;
\r
104 return lang == LANG1 ? LANG2 : LANG1;
\r
108 static Entry parseFromLine(String line, final boolean hasMultipleSubentries) {
\r
109 line = line.replaceAll("<", "<");
\r
110 line = line.replaceAll(">", ">");
\r
111 final String[] parts = lineSplitPattern.split(line);
\r
112 if (parts.length != 2) {
\r
113 System.err.println("Entry:" + "Invalid line: " + line);
\r
116 if (!hasMultipleSubentries) {
\r
117 return new Entry(new String[] {parts[0].trim()}, new String[] {parts[1].trim()});
\r
120 final String[] lang1 = sublineSplitPattern.split(" " + parts[0].trim() + " ");
\r
121 final String[] lang2 = sublineSplitPattern.split(" " + parts[1].trim() + " ");
\r
122 if (lang1.length != lang2.length) {
\r
123 System.err.println("Entry:" + "Invalid subline: " + line);
\r
126 for (int i = 0; i < lang1.length; ++i) {
\r
127 lang1[i] = lang1[i].trim();
\r
128 lang2[i] = lang2[i].trim();
\r
130 return new Entry(lang1, lang2);
\r
133 static final Map<String, String> bracketToClose = new LinkedHashMap<String, String>();
\r
135 bracketToClose.put("\"", "\"");
\r
136 bracketToClose.put(" '", "' ");
\r
139 static final Pattern WHITESPACE = Pattern.compile("\\s+");
\r
141 public Set<String> getIndexableTokens(final byte lang) {
\r
142 final Set<String> result = new LinkedHashSet<String>();
\r
144 for (final String subentry : getAllText(lang)) {
\r
145 text += subentry + " ";
\r
148 text = text.replaceAll("fig\\.", " ");
\r
149 text = text.replaceAll("\\{[^\\}]+}", " ");
\r
150 text = text.replaceAll("\"-", "-");
\r
151 text = text.replaceAll("-\"", "-");
\r
152 text = text.replaceAll("[\"/\\()<>\\[\\],;?!.]", " ");
\r
153 text = text.replaceAll("[:] ", " ");
\r
154 text = text.replaceAll(" [:]", " ");
\r
156 // Now be really conservative about what we allow inside a token:
\r
157 // See: http://unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
\r
158 text = text.replaceAll("[^-:\\p{Lu}\\p{Ll}\\p{Lt}\\p{Lm}\\p{Lo}\\p{Nd}\\p{Nl}\\p{No}]", " ");
\r
160 result.addAll(Arrays.asList(WHITESPACE.split(text)));
\r
162 text = text.replaceAll("[-]", " ");
\r
163 result.addAll(Arrays.asList(WHITESPACE.split(text)));
\r
165 final Set<String> result2 = new LinkedHashSet<String>();
\r
166 for (final String token : result) {
\r
167 if (isIndexable(token)) {
\r
168 result2.add(token);
\r
174 static boolean isIndexable(final String text) {
\r
175 // Does it have an alpha-numeric anywhere?
\r
176 return text.matches(".*\\w.*");
\r
179 static List<String> getTextInside(final String text, final String start, final String end) {
\r
180 final List<String> result = new ArrayList<String>();
\r
182 while ((startPos = text.indexOf(start)) != -1) {
\r
183 final int endPos = text.indexOf(end, startPos + 1);
\r
184 result.add(text.substring(startPos + 1, endPos));
\r
185 startPos = endPos + 1;
\r