for (int i = 0; i < lang1.length; ++i) {\r
result.append(i == 0 ? "" : " | ").append(lang1[i]);\r
}\r
- result.append(" :: ");\r
+ result.append("\t");\r
for (int i = 0; i < lang2.length; ++i) {\r
result.append(i == 0 ? "" : " | ").append(lang2[i]);\r
}\r
\r
public Set<String> getIndexableTokens(final byte lang) {\r
final Set<String> result = new LinkedHashSet<String>();\r
- String text = Arrays.asList(getAllText(lang)).toString();\r
+ String text = " ";\r
+ for (final String subentry : getAllText(lang)) {\r
+ text += subentry + " ";\r
+ }\r
\r
text = text.replaceAll("fig\\.", " ");\r
text = text.replaceAll("\\{[^\\}]+}", " ");\r
text = text.replaceAll("\"-", "-");\r
text = text.replaceAll("-\"", "-");\r
- text = text.replaceAll("[\":/\\()<>\\[\\],;?!.]", " ");\r
+ text = text.replaceAll("[\"/\\()<>\\[\\],;?!.]", " ");\r
+ text = text.replaceAll("[:] ", " ");\r
+ text = text.replaceAll(" [:]", " ");\r
+ \r
+ // Now be really conservative about what we allow inside a token:\r
+ // See: http://unicode.org/Public/UNIDATA/UCD.html#General_Category_Values\r
+ text = text.replaceAll("[^-:\\p{Lu}\\p{Ll}\\p{Lt}\\p{Lm}\\p{Lo}\\p{Nd}\\p{Nl}\\p{No}]", " ");\r
+ \r
result.addAll(Arrays.asList(WHITESPACE.split(text)));\r
\r
text = text.replaceAll("[-]", " ");\r