import java.util.List;\r
import java.util.Map;\r
import java.util.Set;\r
+import java.util.regex.Matcher;\r
import java.util.regex.Pattern;\r
\r
import com.hughes.util.raf.RAFFactory;\r
return lang == LANG1 ? LANG2 : LANG1;\r
}\r
\r
-\r
+/* Unicode General Category abbreviations (usable in regexes as \p{Lu}, \p{Nd}, ...):
+Lu Letter, Uppercase\r
+Ll Letter, Lowercase\r
+Lt Letter, Titlecase\r
+Lm Letter, Modifier\r
+Lo Letter, Other\r
+Mn Mark, Nonspacing\r
+Mc Mark, Spacing Combining\r
+Me Mark, Enclosing\r
+Nd Number, Decimal Digit\r
+Nl Number, Letter\r
+No Number, Other\r
+Pc Punctuation, Connector\r
+Pd Punctuation, Dash\r
+Ps Punctuation, Open\r
+Pe Punctuation, Close\r
+Pi Punctuation, Initial quote (may behave like Ps or Pe depending on usage)\r
+Pf Punctuation, Final quote (may behave like Ps or Pe depending on usage)\r
+Po Punctuation, Other\r
+Sm Symbol, Math\r
+Sc Symbol, Currency\r
+Sk Symbol, Modifier\r
+So Symbol, Other\r
+Zs Separator, Space\r
+Zl Separator, Line\r
+Zp Separator, Paragraph\r
+*/\r
+\r
+ static Pattern htmlDecimalCode = Pattern.compile("&#([0-9]+);");\r
+ static Pattern htmlCode = Pattern.compile("&#[^;]+;");\r
+ \r
static Entry parseFromLine(String line, final boolean hasMultipleSubentries) {\r
+ \r
line = line.replaceAll("<", "<");\r
line = line.replaceAll(">", ">");\r
+ Matcher matcher;\r
+ while ((matcher = htmlDecimalCode.matcher(line)).find()) {\r
+ final int intVal = Integer.parseInt(matcher.group(1));\r
+ final String charCode = "" + ((char) intVal);\r
+ System.out.println("Replacing " + matcher.group() + " with " + charCode);\r
+ line = matcher.replaceAll(charCode);\r
+ }\r
+ if ((matcher = htmlCode.matcher(line)).find()) {\r
+ System.err.println("HTML code: " + matcher.group());\r
+ }\r
+ \r
final String[] parts = lineSplitPattern.split(line);\r
if (parts.length != 2) {\r
System.err.println("Entry:" + "Invalid line: " + line);\r
bracketToClose.put(" '", "' ");\r
}\r
\r
- static final Pattern WHITESPACE = Pattern.compile("\\s+");\r
+ // Token separator: matches runs of whitespace only ("\\s+"). This used to be called WHITESPACE.
+ static final Pattern NON_TOKEN_CHAR = Pattern.compile("\\s+");\r
\r
public Set<String> getIndexableTokens(final byte lang) {\r
final Set<String> result = new LinkedHashSet<String>();\r
text = text.replaceAll("\"-", "-");\r
text = text.replaceAll("-\"", "-");\r
text = text.replaceAll("[\"/\\()<>\\[\\],;?!.]", " ");\r
- text = text.replaceAll("[:] ", " ");\r
- text = text.replaceAll(" [:]", " ");\r
+ text = text.replaceAll("[-:] ", " ");\r
+ text = text.replaceAll(" [-:]", " ");\r
\r
// Now be really conservative about what we allow inside a token:\r
// See: http://unicode.org/Public/UNIDATA/UCD.html#General_Category_Values\r
- text = text.replaceAll("[^-:\\p{Lu}\\p{Ll}\\p{Lt}\\p{Lm}\\p{Lo}\\p{Nd}\\p{Nl}\\p{No}]", " ");\r
- \r
- result.addAll(Arrays.asList(WHITESPACE.split(text)));\r
+ text = text.replaceAll("[^-:\\p{L}\\p{N}\\p{S}]", " ");\r
+ result.addAll(Arrays.asList(NON_TOKEN_CHAR.split(text)));\r
\r
text = text.replaceAll("[-]", " ");\r
- result.addAll(Arrays.asList(WHITESPACE.split(text)));\r
+ result.addAll(Arrays.asList(NON_TOKEN_CHAR.split(text)));\r
\r
final Set<String> result2 = new LinkedHashSet<String>();\r
for (final String token : result) {\r