X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2FEntry.java;h=fb95f4ce2bb6c34b7b48bf4cf4cc49c95a531460;hb=3f7c2b6aef9ede05120637caa2525f3f60a61ba2;hp=5d1a2d9c05c50433a4da0bebbd3117537d92e518;hpb=127973afabe0c34015667c599d68bf9453d85652;p=Dictionary.git diff --git a/src/com/hughes/android/dictionary/Entry.java b/src/com/hughes/android/dictionary/Entry.java old mode 100755 new mode 100644 index 5d1a2d9..fb95f4c --- a/src/com/hughes/android/dictionary/Entry.java +++ b/src/com/hughes/android/dictionary/Entry.java @@ -1,233 +1,22 @@ -package com.hughes.android.dictionary; - -import java.io.IOException; -import java.io.RandomAccessFile; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.LinkedHashMap; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import com.hughes.util.raf.RAFFactory; -import com.hughes.util.raf.RAFSerializable; - -public final class Entry implements RAFSerializable { - - static final byte LANG1 = 0; - static final byte LANG2 = 1; - - static final Pattern lineSplitPattern = Pattern.compile("\\s::\\s"); - static final Pattern sublineSplitPattern = Pattern.compile("\\s\\|\\s"); - - final String[] lang1; - final String[] lang2; - -// public Entry(final String lang1, final String lang2) { -// this.lang1 = new String[] {lang1}; -// this.lang2 = new String[] {lang2}; -// } - - Entry(final String[] lang1, final String[] lang2) { - this.lang1 = lang1; - this.lang2 = lang2; - } - - public static final RAFFactory RAF_FACTORY = new RAFFactory() { - public Entry create(RandomAccessFile raf) throws IOException { - final int rows = raf.readByte(); - final String[] lang1 = new String[rows]; - final String[] lang2 = new String[rows]; - for (int i = 0; i < lang1.length; ++i) { - lang1[i] = raf.readUTF(); - lang2[i] = raf.readUTF(); - } - return new Entry(lang1, lang2); - }}; - public void write(RandomAccessFile raf) throws IOException { - assert lang1.length == (byte) lang1.length; - raf.writeByte(lang1.length); - for (int i = 0; i < lang1.length; ++i) { - raf.writeUTF(lang1[i]); - raf.writeUTF(lang2[i]); - } - } - - @Override - public boolean equals(Object o) { - if (!(o instanceof Entry)) { - return false; - } - final Entry that = (Entry) o; - return Arrays.deepEquals(this.lang1, that.lang1) && Arrays.deepEquals(this.lang2, that.lang2); - } - - @Override - public int hashCode() { - return Arrays.deepHashCode(lang1) + Arrays.deepHashCode(lang2); - } - - @Override - public String toString() { - return getRawText(false); - } - - public int getRowCount() { - assert lang1.length == lang2.length; - return lang1.length; - } - - String[] getAllText(final byte lang) { - if (lang == LANG1) { - return lang1; - } - assert lang == LANG2; - return lang2; - } - - String getRawText(boolean onlyFirstSubentry) { - final StringBuilder result = new StringBuilder(); - for (int i = 0; i < (onlyFirstSubentry ? 1 : lang1.length); ++i) { - result.append(i == 0 ? "" : " | ").append(lang1[i]); - } - result.append("\t"); - for (int i = 0; i < (onlyFirstSubentry ? 1 : lang2.length); ++i) { - result.append(i == 0 ? "" : " | ").append(lang2[i]); - } - return result.toString(); - } - - static byte otherLang(final byte lang) { - assert lang == LANG1 || lang == LANG2; - return lang == LANG1 ? LANG2 : LANG1; - } - -/* -Lu Letter, Uppercase -Ll Letter, Lowercase -Lt Letter, Titlecase -Lm Letter, Modifier -Lo Letter, Other -Mn Mark, Nonspacing -Mc Mark, Spacing Combining -Me Mark, Enclosing -Nd Number, Decimal Digit -Nl Number, Letter -No Number, Other -Pc Punctuation, Connector -Pd Punctuation, Dash -Ps Punctuation, Open -Pe Punctuation, Close -Pi Punctuation, Initial quote (may behave like Ps or Pe depending on usage) -Pf Punctuation, Final quote (may behave like Ps or Pe depending on usage) -Po Punctuation, Other -Sm Symbol, Math -Sc Symbol, Currency -Sk Symbol, Modifier -So Symbol, Other -Zs Separator, Space -Zl Separator, Line -Zp Separator, Paragraph -*/ - - static Pattern htmlDecimalCode = Pattern.compile("&#([0-9]+);"); - static Pattern htmlCode = Pattern.compile("&#[^;]+;"); - - static Entry parseFromLine(String line, final boolean hasMultipleSubentries) { - - line = line.replaceAll("<", "<"); - line = line.replaceAll(">", ">"); - Matcher matcher; - while ((matcher = htmlDecimalCode.matcher(line)).find()) { - final int intVal = Integer.parseInt(matcher.group(1)); - final String charCode = "" + ((char) intVal); - System.out.println("Replacing " + matcher.group() + " with " + charCode); - line = matcher.replaceAll(charCode); - } - if ((matcher = htmlCode.matcher(line)).find()) { - System.err.println("HTML code: " + matcher.group()); - } - - final String[] parts = lineSplitPattern.split(line); - if (parts.length != 2) { - System.err.println("Entry:" + "Invalid line: " + line); - return null; - } - if (!hasMultipleSubentries) { - return new Entry(new String[] {parts[0].trim()}, new String[] {parts[1].trim()}); - } - - final String[] lang1 = sublineSplitPattern.split(" " + parts[0].trim() + " "); - final String[] lang2 = sublineSplitPattern.split(" " + parts[1].trim() + " "); - if (lang1.length != lang2.length) { - System.err.println("Entry:" + "Invalid subline: " + line); - return null; - } - for (int i = 0; i < lang1.length; ++i) { - lang1[i] = lang1[i].trim(); - lang2[i] = lang2[i].trim(); - } - return new Entry(lang1, lang2); - } - - static final Map bracketToClose = new LinkedHashMap(); - static { - bracketToClose.put("\"", "\""); - bracketToClose.put(" '", "' "); - } - - // This used to be called WHITESPACE. - static final Pattern NON_TOKEN_CHAR = Pattern.compile("\\s+"); - - public Set getIndexableTokens(final byte lang) { - final Set result = new LinkedHashSet(); - String text = " "; - for (final String subentry : getAllText(lang)) { - text += subentry + " "; - } - - text = text.replaceAll("fig\\.", " "); - text = text.replaceAll("\\{[^\\}]+}", " "); - text = text.replaceAll("\"-", "-"); - text = text.replaceAll("-\"", "-"); - text = text.replaceAll("[\"/\\()<>\\[\\],;?!.]", " "); - text = text.replaceAll("[-:] ", " "); - text = text.replaceAll(" [-:]", " "); - - // Now be really conservative about what we allow inside a token: - // See: http://unicode.org/Public/UNIDATA/UCD.html#General_Category_Values - text = text.replaceAll("[^-:\\p{L}\\p{N}\\p{S}]", " "); - result.addAll(Arrays.asList(NON_TOKEN_CHAR.split(text))); - - text = text.replaceAll("[-]", " "); - result.addAll(Arrays.asList(NON_TOKEN_CHAR.split(text))); - - final Set result2 = new LinkedHashSet(); - for (final String token : result) { - if (isIndexable(token)) { - result2.add(token); - } - } - return result2; - } - - static boolean isIndexable(final String text) { - // Does it have an alpha-numeric anywhere? - return text.matches(".*\\w.*"); - } - - static List getTextInside(final String text, final String start, final String end) { - final List result = new ArrayList(); - int startPos = 0; - while ((startPos = text.indexOf(start)) != -1) { - final int endPos = text.indexOf(end, startPos + 1); - result.add(text.substring(startPos + 1, endPos)); - startPos = endPos + 1; - } - return result; - } - -} \ No newline at end of file +package com.hughes.android.dictionary; + +import java.io.IOException; +import java.io.RandomAccessFile; + +import com.hughes.util.raf.RAFFactory; +import com.hughes.util.raf.RAFSerializable; + +public abstract class Entry implements RAFSerializable { + + public static final RAFFactory RAF_FACTORY = new RAFFactory() { + public Entry create(RandomAccessFile raf) throws IOException { + final byte type = raf.readByte(); + switch (type) { + case 0: + return SimpleEntry.RAF_FACTORY.create(raf); + } + throw new RuntimeException("Invalid entry type: " + type); + }}; + + +}