/**
 *******************************************************************************
 * Copyright (C) 1996-2010, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
package com.ibm.icu.text;

import java.io.IOException;
import java.text.ParseException;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Vector;

import com.ibm.icu.impl.IntTrieBuilder;
import com.ibm.icu.impl.Norm2AllModes;
import com.ibm.icu.impl.Normalizer2Impl;
import com.ibm.icu.impl.TrieBuilder;
import com.ibm.icu.impl.TrieIterator;
import com.ibm.icu.impl.UCharacterProperty;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.util.RangeValueIterator;
import com.ibm.icu.util.VersionInfo;

/**
 * Class for building a collator from a list of collation rules. This class
 * uses CollationRuleParser.
 *
 * @author Syn Wee Quek
 * @since release 2.2, June 11 2002
 */
final class CollationParsedRuleBuilder {
    // package private constructors ------------------------------------------

    /**
     * Constructor. Hands the rules to a new CollationRuleParser, asks the
     * parser to assemble its token list and creates the utility collation
     * element iterator from RuleBasedCollator.UCA_ over an empty string.
     *
     * @param rules
     *            collation rules
     * @exception ParseException
     *                thrown when argument rules have an invalid syntax
     */
    CollationParsedRuleBuilder(String rules) throws ParseException {
        m_parser_ = new CollationRuleParser(rules);
        m_parser_.assembleTokenList();
        m_utilColEIter_ = RuleBasedCollator.UCA_
                .getCollationElementIterator("");
    }

    // package private inner classes -----------------------------------------

    /**
     * Inverse UCA wrapper
     */
    static class InverseUCA {
        // package private constructor ---------------------------------------

        InverseUCA() {
        }

        // package private data member ---------------------------------------

        /**
         * Array list of characters. The methods of this class read it in
         * groups of three ints: entry 3*i is used as a CE and entry 3*i+1 as
         * its continuation CE.
         */
        int m_table_[];
        /**
         * Array list of continuation characters
         */
        char m_continuations_[];

        /**
         * UCA version of inverse UCA table
         */
VersionInfo m_UCA_version_; // package private method -------------------------------------------- /** * Returns the previous inverse ces of the argument ces * * @param ce * ce to test * @param contce * continuation ce to test * @param strength * collation strength * @param prevresult * an array to store the return results previous inverse ce * and previous inverse continuation ce * @return result of the inverse ce */ final int getInversePrevCE(int ce, int contce, int strength, int prevresult[]) { int result = findInverseCE(ce, contce); if (result < 0) { prevresult[0] = CollationElementIterator.NULLORDER; return -1; } ce &= STRENGTH_MASK_[strength]; contce &= STRENGTH_MASK_[strength]; prevresult[0] = ce; prevresult[1] = contce; while ((prevresult[0] & STRENGTH_MASK_[strength]) == ce && (prevresult[1] & STRENGTH_MASK_[strength]) == contce && result > 0) { // this condition should prevent falling off the edge of the // world // here, we end up in a singularity - zero prevresult[0] = m_table_[3 * (--result)]; prevresult[1] = m_table_[3 * result + 1]; } return result; } final int getCEStrengthDifference(int CE, int contCE, int prevCE, int prevContCE) { int strength = Collator.TERTIARY; while (((prevCE & STRENGTH_MASK_[strength]) != (CE & STRENGTH_MASK_[strength]) || (prevContCE & STRENGTH_MASK_[strength]) != (contCE & STRENGTH_MASK_[strength])) && (strength != 0)) { strength--; } return strength; } private int compareCEs(int source0, int source1, int target0, int target1) { int s1 = source0, s2, t1 = target0, t2; if (RuleBasedCollator.isContinuation(source1)) { s2 = source1; } else { s2 = 0; } if (RuleBasedCollator.isContinuation(target1)) { t2 = target1; } else { t2 = 0; } int s = 0, t = 0; if (s1 == t1 && s2 == t2) { return 0; } s = (s1 & 0xFFFF0000) | ((s2 & 0xFFFF0000) >>> 16); t = (t1 & 0xFFFF0000) | ((t2 & 0xFFFF0000) >>> 16); if (s == t) { s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00) >> 8; t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00) >> 8; if (s == t) { s = (s1 & 
0x000000FF) << 8 | (s2 & 0x000000FF); t = (t1 & 0x000000FF) << 8 | (t2 & 0x000000FF); return Utility.compareUnsigned(s, t); } else { return Utility.compareUnsigned(s, t); } } else { return Utility.compareUnsigned(s, t); } } /** * Finding the inverse CE of the argument CEs * * @param ce * CE to be tested * @param contce * continuation CE * @return inverse CE */ int findInverseCE(int ce, int contce) { int bottom = 0; int top = m_table_.length / 3; int result = 0; while (bottom < top - 1) { result = (top + bottom) >> 1; int first = m_table_[3 * result]; int second = m_table_[3 * result + 1]; int comparison = compareCEs(first, second, ce, contce); if (comparison > 0) { top = result; } else if (comparison < 0) { bottom = result; } else { break; } } return result; } /** * Getting gap offsets in the inverse UCA * * @param listheader * parsed token lists * @exception Exception * thrown when error occurs while finding the collation * gaps */ void getInverseGapPositions( CollationRuleParser.TokenListHeader listheader) throws Exception { // reset all the gaps CollationRuleParser.Token token = listheader.m_first_; int tokenstrength = token.m_strength_; for (int i = 0; i < 3; i++) { listheader.m_gapsHi_[3 * i] = 0; listheader.m_gapsHi_[3 * i + 1] = 0; listheader.m_gapsHi_[3 * i + 2] = 0; listheader.m_gapsLo_[3 * i] = 0; listheader.m_gapsLo_[3 * i + 1] = 0; listheader.m_gapsLo_[3 * i + 2] = 0; listheader.m_numStr_[i] = 0; listheader.m_fStrToken_[i] = null; listheader.m_lStrToken_[i] = null; listheader.m_pos_[i] = -1; } if ((listheader.m_baseCE_ >>> 24) >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_ && (listheader.m_baseCE_ >>> 24) <= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_) { // implicits - listheader.m_pos_[0] = 0; int t1 = listheader.m_baseCE_; int t2 = listheader.m_baseContCE_; listheader.m_gapsLo_[0] = mergeCE(t1, t2, Collator.PRIMARY); listheader.m_gapsLo_[1] = mergeCE(t1, t2, Collator.SECONDARY); listheader.m_gapsLo_[2] = mergeCE(t1, t2, 
Collator.TERTIARY); int primaryCE = t1 & RuleBasedCollator.CE_PRIMARY_MASK_ | (t2 & RuleBasedCollator.CE_PRIMARY_MASK_) >>> 16; primaryCE = RuleBasedCollator.impCEGen_ .getImplicitFromRaw(RuleBasedCollator.impCEGen_ .getRawFromImplicit(primaryCE) + 1); t1 = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505; t2 = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_ | RuleBasedCollator.CE_CONTINUATION_MARKER_; // if (listheader.m_baseCE_ < 0xEF000000) { // // first implicits have three byte primaries, with a gap of // // one so we esentially need to add 2 to the top byte in // // listheader.m_baseContCE_ // t2 += 0x02000000; // } // else { // // second implicits have four byte primaries, with a gap of // // IMPLICIT_LAST2_MULTIPLIER_ // // Now, this guy is not really accessible here, so until we // // find a better way to pass it around, assume that the gap // is 1 // t2 += 0x00020000; // } listheader.m_gapsHi_[0] = mergeCE(t1, t2, Collator.PRIMARY); listheader.m_gapsHi_[1] = mergeCE(t1, t2, Collator.SECONDARY); listheader.m_gapsHi_[2] = mergeCE(t1, t2, Collator.TERTIARY); } else if (listheader.m_indirect_ == true && listheader.m_nextCE_ != 0) { listheader.m_pos_[0] = 0; int t1 = listheader.m_baseCE_; int t2 = listheader.m_baseContCE_; listheader.m_gapsLo_[0] = mergeCE(t1, t2, Collator.PRIMARY); listheader.m_gapsLo_[1] = mergeCE(t1, t2, Collator.SECONDARY); listheader.m_gapsLo_[2] = mergeCE(t1, t2, Collator.TERTIARY); t1 = listheader.m_nextCE_; t2 = listheader.m_nextContCE_; listheader.m_gapsHi_[0] = mergeCE(t1, t2, Collator.PRIMARY); listheader.m_gapsHi_[1] = mergeCE(t1, t2, Collator.SECONDARY); listheader.m_gapsHi_[2] = mergeCE(t1, t2, Collator.TERTIARY); } else { while (true) { if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_) { listheader.m_pos_[tokenstrength] = getInverseNext( listheader, tokenstrength); if (listheader.m_pos_[tokenstrength] >= 0) { listheader.m_fStrToken_[tokenstrength] = token; } else { // The CE must be implicit, since it's not in the // 
table // Error throw new Exception("Internal program error"); } } while (token != null && token.m_strength_ >= tokenstrength) { if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_) { listheader.m_lStrToken_[tokenstrength] = token; } token = token.m_next_; } if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_ - 1) { // check if previous interval is the same and merge the // intervals if it is so if (listheader.m_pos_[tokenstrength] == listheader.m_pos_[tokenstrength + 1]) { listheader.m_fStrToken_[tokenstrength] = listheader.m_fStrToken_[tokenstrength + 1]; listheader.m_fStrToken_[tokenstrength + 1] = null; listheader.m_lStrToken_[tokenstrength + 1] = null; listheader.m_pos_[tokenstrength + 1] = -1; } } if (token != null) { tokenstrength = token.m_strength_; } else { break; } } for (int st = 0; st < 3; st++) { int pos = listheader.m_pos_[st]; if (pos >= 0) { int t1 = m_table_[3 * pos]; int t2 = m_table_[3 * pos + 1]; listheader.m_gapsHi_[3 * st] = mergeCE(t1, t2, Collator.PRIMARY); listheader.m_gapsHi_[3 * st + 1] = mergeCE(t1, t2, Collator.SECONDARY); listheader.m_gapsHi_[3 * st + 2] = (t1 & 0x3f) << 24 | (t2 & 0x3f) << 16; // pos --; // t1 = m_table_[3 * pos]; // t2 = m_table_[3 * pos + 1]; t1 = listheader.m_baseCE_; t2 = listheader.m_baseContCE_; listheader.m_gapsLo_[3 * st] = mergeCE(t1, t2, Collator.PRIMARY); listheader.m_gapsLo_[3 * st + 1] = mergeCE(t1, t2, Collator.SECONDARY); listheader.m_gapsLo_[3 * st + 2] = (t1 & 0x3f) << 24 | (t2 & 0x3f) << 16; } } } } /** * Gets the next CE in the inverse table * * @param listheader * token list header * @param strength * collation strength * @return next ce */ private final int getInverseNext( CollationRuleParser.TokenListHeader listheader, int strength) { int ce = listheader.m_baseCE_; int secondce = listheader.m_baseContCE_; int result = findInverseCE(ce, secondce); if (result < 0) { return -1; } ce &= STRENGTH_MASK_[strength]; secondce &= STRENGTH_MASK_[strength]; int nextce = ce; int nextcontce = secondce; while ((nextce & 
STRENGTH_MASK_[strength]) == ce && (nextcontce & STRENGTH_MASK_[strength]) == secondce) { nextce = m_table_[3 * (++result)]; nextcontce = m_table_[3 * result + 1]; } listheader.m_nextCE_ = nextce; listheader.m_nextContCE_ = nextcontce; return result; } } // package private data members ------------------------------------------ /** * Inverse UCA, instantiate only when required */ static final InverseUCA INVERSE_UCA_; /** * UCA and Inverse UCA version do not match */ private static final String INV_UCA_VERSION_MISMATCH_ = "UCA versions of UCA and inverse UCA should match"; /** * UCA and Inverse UCA version do not match */ private static final String UCA_NOT_INSTANTIATED_ = "UCA is not instantiated!"; /** * Initializing the inverse UCA */ static { InverseUCA temp = null; try { temp = CollatorReader.getInverseUCA(); } catch (IOException e) { } /* * try { String invdat = "/com/ibm/icu/impl/data/invuca.icu"; * InputStream i = * CollationParsedRuleBuilder.class.getResourceAsStream(invdat); * BufferedInputStream b = new BufferedInputStream(i, 110000); * INVERSE_UCA_ = CollatorReader.readInverseUCA(b); b.close(); * i.close(); } catch (Exception e) { e.printStackTrace(); throw new * RuntimeException(e.getMessage()); } */ if (temp != null && RuleBasedCollator.UCA_ != null) { if (!temp.m_UCA_version_ .equals(RuleBasedCollator.UCA_.m_UCA_version_)) { throw new RuntimeException(INV_UCA_VERSION_MISMATCH_); } } else { throw new RuntimeException(UCA_NOT_INSTANTIATED_); } INVERSE_UCA_ = temp; } // package private methods ----------------------------------------------- /** * Parse and sets the collation rules in the argument collator * * @param collator * to set * @exception Exception * thrown when internal program error occurs */ void setRules(RuleBasedCollator collator) throws Exception { if (m_parser_.m_resultLength_ > 0 || m_parser_.m_removeSet_ != null) { // we have a set of rules, let's make something of it assembleTailoringTable(collator); } else { // no rules, but no error 
either must be only options // We will init the collator from UCA collator.setWithUCATables(); } // And set only the options m_parser_.setDefaultOptionsInCollator(collator); } private void copyRangeFromUCA(BuildTable t, int start, int end) { int u = 0; for (u = start; u <= end; u++) { // if ((CE = ucmpe32_get(t.m_mapping, u)) == UCOL_NOT_FOUND int CE = t.m_mapping_.getValue(u); if (CE == CE_NOT_FOUND_ // this test is for contractions that are missing the starting // element. Looks like latin-1 should be done before // assembling the table, even if it results in more false // closure elements || (isContractionTableElement(CE) && getCE( t.m_contractions_, CE, 0) == CE_NOT_FOUND_)) { // m_utilElement_.m_uchars_ = str.toString(); m_utilElement_.m_uchars_ = UCharacter.toString(u); m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_; m_utilElement_.m_prefix_ = 0; m_utilElement_.m_CELength_ = 0; m_utilElement_.m_prefixChars_ = null; m_utilColEIter_.setText(m_utilElement_.m_uchars_); while (CE != CollationElementIterator.NULLORDER) { CE = m_utilColEIter_.next(); if (CE != CollationElementIterator.NULLORDER) { m_utilElement_.m_CEs_[m_utilElement_.m_CELength_++] = CE; } } addAnElement(t, m_utilElement_); } } } /** * 2. Eliminate the negative lists by doing the following for each non-null * negative list: o if previousCE(baseCE, strongestN) != some ListHeader X's * baseCE, create new ListHeader X o reverse the list, add to the end of X's * positive list. Reset the strength of the first item you add, based on the * stronger strength levels of the two lists. * * 3. For each ListHeader with a non-null positive list: o Find all * character strings with CEs between the baseCE and the next/previous CE, * at the strength of the first token. Add these to the tailoring. ? That * is, if UCA has ... x <<< X << x' <<< X' < y ..., and the tailoring has & * x < z... ? Then we change the tailoring to & x <<< X << x' <<< X' < z ... 
* * It is possible that this part should be done even while constructing list * The problem is that it is unknown what is going to be the strongest * weight. So we might as well do it here o Allocate CEs for each token in * the list, based on the total number N of the largest level difference, * and the gap G between baseCE and nextCE at that level. The relation * * between the last item and nextCE is the same as the strongest strength. o * Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1) ? There are 3 * primary items: a, d, e. Fit them into the primary gap. Then fit b and c * into the secondary gap between a and d, then fit q into the tertiary gap * between b and c. o Example: baseCE << b <<< q << c * nextCE(X,2) ? There * are 2 secondary items: b, c. Fit them into the secondary gap. Then fit q * into the tertiary gap between b and c. o When incrementing primary * values, we will not cross high byte boundaries except where there is only * a single-byte primary. That is to ensure that the script reordering will * continue to work. * * @param collator * the rule based collator to update * @exception Exception * thrown when internal program error occurs */ void assembleTailoringTable(RuleBasedCollator collator) throws Exception { for (int i = 0; i < m_parser_.m_resultLength_; i++) { // now we need to generate the CEs // We stuff the initial value in the buffers, and increase the // appropriate buffer according to strength if (m_parser_.m_listHeader_[i].m_first_ != null) { // if there are any elements // due to the way parser works, subsequent tailorings // may remove all the elements from a sequence, therefore // leaving an empty tailoring sequence. 
initBuffers(m_parser_.m_listHeader_[i]); } } if (m_parser_.m_variableTop_ != null) { // stuff the variable top value m_parser_.m_options_.m_variableTopValue_ = m_parser_.m_variableTop_.m_CE_[0] >>> 16; // remove it from the list if (m_parser_.m_variableTop_.m_listHeader_.m_first_ == m_parser_.m_variableTop_) { // first // in // list m_parser_.m_variableTop_.m_listHeader_.m_first_ = m_parser_.m_variableTop_.m_next_; } if (m_parser_.m_variableTop_.m_listHeader_.m_last_ == m_parser_.m_variableTop_) { // first in list m_parser_.m_variableTop_.m_listHeader_.m_last_ = m_parser_.m_variableTop_.m_previous_; } if (m_parser_.m_variableTop_.m_next_ != null) { m_parser_.m_variableTop_.m_next_.m_previous_ = m_parser_.m_variableTop_.m_previous_; } if (m_parser_.m_variableTop_.m_previous_ != null) { m_parser_.m_variableTop_.m_previous_.m_next_ = m_parser_.m_variableTop_.m_next_; } } BuildTable t = new BuildTable(m_parser_); // After this, we have assigned CE values to all regular CEs now we // will go through list once more and resolve expansions, make // UCAElements structs and add them to table for (int i = 0; i < m_parser_.m_resultLength_; i++) { // now we need to generate the CEs // We stuff the initial value in the buffers, and increase the // appropriate buffer according to strength */ createElements(t, m_parser_.m_listHeader_[i]); } m_utilElement_.clear(); StringBuilder str = new StringBuilder(); // add latin-1 stuff copyRangeFromUCA(t, 0, 0xFF); // add stuff for copying if (m_parser_.m_copySet_ != null) { int i = 0; for (i = 0; i < m_parser_.m_copySet_.getRangeCount(); i++) { copyRangeFromUCA(t, m_parser_.m_copySet_.getRangeStart(i), m_parser_.m_copySet_.getRangeEnd(i)); } } // copy contractions from the UCA - this is felt mostly for cyrillic char conts[] = RuleBasedCollator.UCA_CONTRACTIONS_; int offset = 0; while (conts[offset] != 0) { // tailoredCE = ucmpe32_get(t.m_mapping, *conts); int tailoredCE = t.m_mapping_.getValue(conts[offset]); Elements prefixElm = null; if 
(tailoredCE != CE_NOT_FOUND_) { boolean needToAdd = true; if (isContractionTableElement(tailoredCE)) { if (isTailored(t.m_contractions_, tailoredCE, conts, offset + 1) == true) { needToAdd = false; } } if (!needToAdd && isPrefix(tailoredCE) && conts[offset + 1] == 0) { // pre-context character in UCA // The format for pre-context character is // conts[0]: baseCP conts[1]:0 conts[2]:pre-context CP Elements elm = new Elements(); elm.m_cPoints_ = m_utilElement_.m_uchars_; elm.m_CELength_ = 0; elm.m_uchars_ = UCharacter.toString(conts[offset]); elm.m_prefixChars_ = UCharacter.toString(conts[offset + 2]); elm.m_prefix_ = 0; // TODO(claireho) : confirm! prefixElm = t.m_prefixLookup_.get(elm); if ((prefixElm == null) || (prefixElm.m_prefixChars_.charAt(0) != conts[offset + 2])) { needToAdd = true; } } if (m_parser_.m_removeSet_ != null && m_parser_.m_removeSet_.contains(conts[offset])) { needToAdd = false; } if (needToAdd == true) { // we need to add if this contraction is not tailored. if (conts[offset + 1] != 0) { // not precontext m_utilElement_.m_prefix_ = 0; m_utilElement_.m_prefixChars_ = null; m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_; str.delete(0, str.length()); str.append(conts[offset]); str.append(conts[offset + 1]); if (conts[offset + 2] != 0) { str.append(conts[offset + 2]); } m_utilElement_.m_uchars_ = str.toString(); m_utilElement_.m_CELength_ = 0; m_utilColEIter_.setText(m_utilElement_.m_uchars_); } else { // add a pre-context element int preKeyLen = 0; str.delete(0, str.length()); // clean up m_utilElement_.m_cPoints_ = UCharacter .toString(conts[offset]); m_utilElement_.m_CELength_ = 0; m_utilElement_.m_uchars_ = UCharacter .toString(conts[offset]); m_utilElement_.m_prefixChars_ = UCharacter .toString(conts[offset + 2]); if (prefixElm == null) { m_utilElement_.m_prefix_ = 0; } else { // TODO (claireho): confirm! 
m_utilElement_.m_prefix_ = m_utilElement_.m_prefix_; // m_utilElement_.m_prefix_= prefixElm.m_prefix_; } m_utilColEIter_.setText(m_utilElement_.m_prefixChars_); while (m_utilColEIter_.next() != CollationElementIterator.NULLORDER) { // count number of keys for pre-context char. preKeyLen++; } str.append(conts[offset + 2]); str.append(conts[offset]); m_utilColEIter_.setText(str.toString()); // Skip the keys for prefix character, then copy the // rest to el. while ((preKeyLen-- > 0) && m_utilColEIter_.next() != CollationElementIterator.NULLORDER) { continue; } } while (true) { int CE = m_utilColEIter_.next(); if (CE != CollationElementIterator.NULLORDER) { m_utilElement_.m_CEs_[m_utilElement_.m_CELength_++] = CE; } else { break; } } addAnElement(t, m_utilElement_); } } else if (m_parser_.m_removeSet_ != null && m_parser_.m_removeSet_.contains(conts[offset])) { copyRangeFromUCA(t, conts[offset], conts[offset]); } offset += 3; } // Add completely ignorable elements processUCACompleteIgnorables(t); // canonical closure canonicalClosure(t); // still need to produce compatibility closure assembleTable(t, collator); } // private inner classes ------------------------------------------------- @SuppressWarnings("unused") private static class CEGenerator { // package private data members -------------------------------------- WeightRange m_ranges_[]; int m_rangesLength_; int m_byteSize_; int m_start_; int m_limit_; int m_maxCount_; int m_count_; int m_current_; int m_fLow_; // forbidden Low int m_fHigh_; // forbidden High // package private constructor --------------------------------------- CEGenerator() { m_ranges_ = new WeightRange[7]; for (int i = 6; i >= 0; i--) { m_ranges_[i] = new WeightRange(); } } } private static class WeightRange implements Comparable { // public methods ---------------------------------------------------- /** * Compares this object with target * * @param target object to compare with * @return 0 if equals, 1 if this is > target, -1 otherwise */ 
public int compareTo(WeightRange target) { if (this == target) { return 0; } int tstart = target.m_start_; if (m_start_ == tstart) { return 0; } if (m_start_ > tstart) { return 1; } return -1; } /** * Initialize */ public void clear() { m_start_ = 0; m_end_ = 0; m_length_ = 0; m_count_ = 0; m_length2_ = 0; m_count2_ = 0; } // package private data members -------------------------------------- int m_start_; int m_end_; int m_length_; int m_count_; int m_length2_; int m_count2_; // package private constructor --------------------------------------- WeightRange() { clear(); } /** * Copy constructor. Cloneable is troublesome, needs to check for * exception * * @param source * to clone */ WeightRange(WeightRange source) { m_start_ = source.m_start_; m_end_ = source.m_end_; m_length_ = source.m_length_; m_count_ = source.m_count_; m_length2_ = source.m_length2_; m_count2_ = source.m_count2_; } } private static class MaxJamoExpansionTable { // package private data members -------------------------------------- Vector m_endExpansionCE_; // vector of booleans Vector m_isV_; byte m_maxLSize_; byte m_maxVSize_; byte m_maxTSize_; // package private constructor --------------------------------------- MaxJamoExpansionTable() { m_endExpansionCE_ = new Vector(); m_isV_ = new Vector(); m_endExpansionCE_.add(new Integer(0)); m_isV_.add(Boolean.FALSE); m_maxLSize_ = 1; m_maxVSize_ = 1; m_maxTSize_ = 1; } MaxJamoExpansionTable(MaxJamoExpansionTable table) { m_endExpansionCE_ = new Vector(table.m_endExpansionCE_); m_isV_ = new Vector(table.m_isV_); m_maxLSize_ = table.m_maxLSize_; m_maxVSize_ = table.m_maxVSize_; m_maxTSize_ = table.m_maxTSize_; } } private static class MaxExpansionTable { // package private constructor -------------------------------------- MaxExpansionTable() { m_endExpansionCE_ = new Vector(); m_expansionCESize_ = new Vector(); m_endExpansionCE_.add(new Integer(0)); m_expansionCESize_.add(new Byte((byte) 0)); } MaxExpansionTable(MaxExpansionTable table) { 
m_endExpansionCE_ = new Vector(table.m_endExpansionCE_); m_expansionCESize_ = new Vector(table.m_expansionCESize_); } // package private data member -------------------------------------- Vector m_endExpansionCE_; Vector m_expansionCESize_; } private static class BasicContractionTable { // package private constructors ------------------------------------- BasicContractionTable() { m_CEs_ = new Vector(); m_codePoints_ = new StringBuilder(); } // package private data members ------------------------------------- StringBuilder m_codePoints_; Vector m_CEs_; } private static class ContractionTable { // package private constructor -------------------------------------- /** * Builds a contraction table * * @param mapping */ ContractionTable(IntTrieBuilder mapping) { m_mapping_ = mapping; m_elements_ = new Vector(); m_CEs_ = new Vector(); m_codePoints_ = new StringBuilder(); m_offsets_ = new Vector(); m_currentTag_ = CE_NOT_FOUND_TAG_; } /** * Copies a contraction table. Not all data will be copied into their * own object. * * @param table */ ContractionTable(ContractionTable table) { m_mapping_ = table.m_mapping_; m_elements_ = new Vector(table.m_elements_); m_codePoints_ = new StringBuilder(table.m_codePoints_); m_CEs_ = new Vector(table.m_CEs_); m_offsets_ = new Vector(table.m_offsets_); m_currentTag_ = table.m_currentTag_; } // package private data members ------------------------------------ /** * Vector of BasicContractionTable */ Vector m_elements_; IntTrieBuilder m_mapping_; StringBuilder m_codePoints_; Vector m_CEs_; Vector m_offsets_; int m_currentTag_; } /** * Private class for combining mark table. The table is indexed by the class * value(0-255). */ @SuppressWarnings("unused") private static class CombinClassTable { /** * accumulated numbers of combining marks. */ int[] index = new int[256]; /** * code point array for combining marks. */ char[] cPoints; /** * size of cPoints. 
*/ int size; // constructor CombinClassTable() { cPoints = null; size = 0; pos = 0; curClass = 1; } /** * Copy the combining mark table from ccc and index in compact way. * * @param cps * : code point array * @param size * : size of ccc * @param index * : index of combining classes(0-255) */ void generate(char[] cps, int numOfCM, int[] ccIndex) { int count = 0; cPoints = new char[numOfCM]; for (int i = 0; i < 256; i++) { for (int j = 0; j < ccIndex[i]; j++) { cPoints[count++] = cps[(i << 8) + j]; } index[i] = count; } size = count; } /** * Get first CM(combining mark) with the combining class value cClass. * * @param cClass * : combining class value. * @return combining mark codepoint or 0 if no combining make with class * value cClass */ char GetFirstCM(int cClass) { curClass = cClass; if (cPoints == null || cClass == 0 || index[cClass] == index[cClass - 1]) { return 0; } pos = 1; return cPoints[index[cClass - 1]]; } /** * Get next CM(combining mark) with the combining class value cClass. * Return combining mark codepoint or 0 if no next CM. */ char GetNextCM() { if (cPoints == null || index[curClass] == (index[curClass - 1] + pos)) { return 0; } return cPoints[index[curClass - 1] + (pos++)]; } // private data members int pos; int curClass; } private static final class BuildTable implements TrieBuilder.DataManipulate { // package private methods ------------------------------------------ /** * For construction of the Trie tables. Has to be labeled public * * @param cp The value of the code point. * @param offset The value of the offset. 
* @return data offset or 0 */ public int getFoldedValue(int cp, int offset) { int limit = cp + 0x400; while (cp < limit) { int value = m_mapping_.getValue(cp); boolean inBlockZero = m_mapping_.isInZeroBlock(cp); int tag = getCETag(value); if (inBlockZero == true) { cp += TrieBuilder.DATA_BLOCK_LENGTH; } else if (!(isSpecial(value) && (tag == CE_IMPLICIT_TAG_ || tag == CE_NOT_FOUND_TAG_))) { // These are values that are starting in either UCA // (IMPLICIT_TAG) or in the tailorings (NOT_FOUND_TAG). // Presence of these tags means that there is nothing in // this position and that it should be skipped. return RuleBasedCollator.CE_SPECIAL_FLAG_ | (CE_SURROGATE_TAG_ << 24) | offset; } else { ++cp; } } return 0; } // package private constructor -------------------------------------- /** * Returns a table */ BuildTable(CollationRuleParser parser) { m_collator_ = new RuleBasedCollator(); m_collator_.setWithUCAData(); MaxExpansionTable maxet = new MaxExpansionTable(); MaxJamoExpansionTable maxjet = new MaxJamoExpansionTable(); m_options_ = parser.m_options_; m_expansions_ = new Vector(); // Do your own mallocs for the structure, array and have linear // Latin 1 int trieinitialvalue = RuleBasedCollator.CE_SPECIAL_FLAG_ | (CE_NOT_FOUND_TAG_ << 24); // temporary fix for jb3822, 0x100000 -> 30000 m_mapping_ = new IntTrieBuilder(null, 0x30000, trieinitialvalue, trieinitialvalue, true); m_prefixLookup_ = new Hashtable(); // uhash_open(prefixLookupHash, prefixLookupComp); m_contractions_ = new ContractionTable(m_mapping_); // copy UCA's maxexpansion and merge as we go along m_maxExpansions_ = maxet; // adding an extra initial value for easier manipulation for (int i = 0; i < RuleBasedCollator.UCA_.m_expansionEndCE_.length; i++) { maxet.m_endExpansionCE_.add(new Integer( RuleBasedCollator.UCA_.m_expansionEndCE_[i])); maxet.m_expansionCESize_.add(new Byte( RuleBasedCollator.UCA_.m_expansionEndCEMaxSize_[i])); } m_maxJamoExpansions_ = maxjet; m_unsafeCP_ = new 
byte[UNSAFECP_TABLE_SIZE_]; m_contrEndCP_ = new byte[UNSAFECP_TABLE_SIZE_]; Arrays.fill(m_unsafeCP_, (byte) 0); Arrays.fill(m_contrEndCP_, (byte) 0); } /** * Duplicating a BuildTable. Not all data will be duplicated into their * own object. * * @param table * to clone */ BuildTable(BuildTable table) { m_collator_ = table.m_collator_; m_mapping_ = new IntTrieBuilder(table.m_mapping_); m_expansions_ = new Vector(table.m_expansions_); m_contractions_ = new ContractionTable(table.m_contractions_); m_contractions_.m_mapping_ = m_mapping_; m_options_ = table.m_options_; m_maxExpansions_ = new MaxExpansionTable(table.m_maxExpansions_); m_maxJamoExpansions_ = new MaxJamoExpansionTable( table.m_maxJamoExpansions_); m_unsafeCP_ = new byte[table.m_unsafeCP_.length]; System.arraycopy(table.m_unsafeCP_, 0, m_unsafeCP_, 0, m_unsafeCP_.length); m_contrEndCP_ = new byte[table.m_contrEndCP_.length]; System.arraycopy(table.m_contrEndCP_, 0, m_contrEndCP_, 0, m_contrEndCP_.length); } // package private data members ------------------------------------- RuleBasedCollator m_collator_; IntTrieBuilder m_mapping_; Vector m_expansions_; ContractionTable m_contractions_; // UCATableHeader image; CollationRuleParser.OptionSet m_options_; MaxExpansionTable m_maxExpansions_; MaxJamoExpansionTable m_maxJamoExpansions_; byte m_unsafeCP_[]; byte m_contrEndCP_[]; Hashtable m_prefixLookup_; CombinClassTable cmLookup = null; } private static class Elements { // package private data members ------------------------------------- String m_prefixChars_; int m_prefix_; String m_uchars_; /** * Working string */ String m_cPoints_; /** * Offset to the working string */ int m_cPointsOffset_; /** * These are collation elements - there could be more than one - in case * of expansion */ int m_CEs_[]; int m_CELength_; /** * This is the value element maps in original table */ int m_mapCE_; int m_sizePrim_[]; int m_sizeSec_[]; int m_sizeTer_[]; boolean m_variableTop_; boolean m_caseBit_; // package private 
// constructors -------------------------------------

        /**
         * Creates an empty element. Allocates the three size buffers (128 ints
         * each) and the CE buffer (256 ints) and marks the CE buffer as
         * holding no CEs.
         */
        Elements() {
            m_sizePrim_ = new int[128];
            m_sizeSec_ = new int[128];
            m_sizeTer_ = new int[128];
            m_CEs_ = new int[256];
            m_CELength_ = 0;
        }

        /**
         * Copy constructor. This is a shallow copy: every field, including the
         * array-valued ones (m_CEs_, m_sizePrim_, m_sizeSec_, m_sizeTer_), is
         * assigned by reference, so the new element shares those arrays with
         * the argument.
         *
         * @param element
         *            element to copy the field values from
         */
        Elements(Elements element) {
            m_prefixChars_ = element.m_prefixChars_;
            m_prefix_ = element.m_prefix_;
            m_uchars_ = element.m_uchars_;
            m_cPoints_ = element.m_cPoints_;
            m_cPointsOffset_ = element.m_cPointsOffset_;
            m_CEs_ = element.m_CEs_;
            m_CELength_ = element.m_CELength_;
            m_mapCE_ = element.m_mapCE_;
            m_sizePrim_ = element.m_sizePrim_;
            m_sizeSec_ = element.m_sizeSec_;
            m_sizeTer_ = element.m_sizeTer_;
            m_variableTop_ = element.m_variableTop_;
            m_caseBit_ = element.m_caseBit_;
        }

        // package private methods -------------------------------------------

        /**
         * Resets the element for reuse. String fields are set to null,
         * scalar fields to zero/false and the three size buffers are
         * zero-filled. The contents of m_CEs_ are left as they are; only
         * m_CELength_ is reset to 0.
         */
        public void clear() {
            m_prefixChars_ = null;
            m_prefix_ = 0;
            m_uchars_ = null;
            m_cPoints_ = null;
            m_cPointsOffset_ = 0;
            m_CELength_ = 0;
            m_mapCE_ = 0;
            Arrays.fill(m_sizePrim_, 0);
            Arrays.fill(m_sizeSec_, 0);
            Arrays.fill(m_sizeTer_, 0);
            m_variableTop_ = false;
            m_caseBit_ = false;
        }

        /**
         * Hashcode calculation for token. Only the part of m_cPoints_ that
         * starts at m_cPointsOffset_ contributes, which keeps the hash
         * consistent with {@link #equals(Object)}.
         *
         * @return the hashcode
         */
        public int hashCode() {
            String str = m_cPoints_.substring(m_cPointsOffset_);
            return str.hashCode();
        }

        /**
         * Equals calculation. Two elements are equal when the parts of their
         * m_cPoints_ strings that start at the respective m_cPointsOffset_
         * have the same length and the same characters; no other field is
         * compared.
         *
         * @param target
         *            Object to compare
         * @return true if target is the same as this object
         */
        public boolean equals(Object target) {
            if (target == this) {
                return true;
            }
            if (target instanceof Elements) {
                Elements t = (Elements) target;
                int size = m_cPoints_.length() - m_cPointsOffset_;
                if (size == t.m_cPoints_.length() - t.m_cPointsOffset_) {
                    return t.m_cPoints_.regionMatches(t.m_cPointsOffset_,
                            m_cPoints_, m_cPointsOffset_, size);
                }
            }
            return false;
        }
    }

    // private data member ---------------------------------------------------

    /**
     * Maximum strength used in CE building; also the length of
     * m_utilCEBuffer_.
     */
    private static final int CE_BASIC_STRENGTH_LIMIT_ = 3;
    /**
     * Maximum collation strength; also the length of m_utilIntBuffer_.
     */
    private static final int CE_STRENGTH_LIMIT_ = 16;
    /**
     * Strength mask array, used in inverse UCA. One mask per strength: the
     * top 16 bits, the top 24 bits, and all 32 bits.
     */
    private static final int STRENGTH_MASK_[] = { 0xFFFF0000, 0xFFFFFF00,
            0xFFFFFFFF };
    /**
     * CE value that stands for "not found".
     */
    private static final int CE_NOT_FOUND_ = 0xF0000000;
    /**
     * Tag number for "not found" (first of the CE_*_TAG_ tag numbers below).
     */
    private static final int CE_NOT_FOUND_TAG_ = 0;
    /**
     * This code point results in an expansion
     */
    private static final int CE_EXPANSION_TAG_ = 1;
    /**
     * Start of a contraction
     */
    private static final int CE_CONTRACTION_TAG_ = 2;
    /*
     * Thai character - do the reordering
     */
    // private static final int CE_THAI_TAG_ = 3;
    /*
     * Charset processing, not yet implemented
     */
    // private static final int CE_CHARSET_TAG_ = 4;
    /**
     * Lead surrogate that is tailored and doesn't start a contraction
     */
    private static final int CE_SURROGATE_TAG_ = 5;
    /*
     * AC00-D7AF
     */
    // private static final int CE_HANGUL_SYLLABLE_TAG_ = 6;
    /*
     * D800-DBFF
     */
    // private static final int CE_LEAD_SURROGATE_TAG_ = 7;
    /*
     * DC00-DFFF
     */
    // private static final int CE_TRAIL_SURROGATE_TAG_ = 8;
    /*
     * 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
     */
    // private static final int CE_CJK_IMPLICIT_TAG_ = 9;
    // Tag number 10. NOTE(review): presumably marks implicitly weighted code
    // points - confirm against the collator's special-CE handling.
    private static final int CE_IMPLICIT_TAG_ = 10;
    // Tag number 11; addPrefix() stores it as the current contraction-table
    // tag when it builds prefix entries.
    private static final int CE_SPEC_PROC_TAG_ = 11;
    /**
     * This is a three byte primary with starting secondaries and tertiaries. It
     * fits in a single 32 bit CE and is used instead of expansion to save space
     * without affecting the performance (hopefully)
     */
    private static final int CE_LONG_PRIMARY_TAG_ = 12;
    /**
     * Unsafe UChar hash table size, in bytes. 32 bytes give 1 bit for each
     * latin 1 char; the rest is a power of two used for hashing all other
     * chars (1056 = 32 + 1024).
     */
    private static final int UNSAFECP_TABLE_SIZE_ = 1056;
    /**
     * Mask value down to "some power of two" - 1. Number of bits, not num of
     * bytes (0x1fff = 1024 * 8 - 1).
     */
    private static final int UNSAFECP_TABLE_MASK_ = 0x1fff;
    /**
     * Case values, as returned by getCaseBits(). Both set bits lie inside the
     * 0xC0 field that doCE() copies into the first CE of a token.
     */
    private static final int UPPER_CASE_ = 0x80;
    private static final int MIXED_CASE_ = 0x40;
    private static final int LOWER_CASE_ = 0x00;
    /*
     * Initial table size
     */
    // private static final int INIT_TABLE_SIZE_ = 1028;
    /*
     * Header size, copied from ICU4C, to be changed when that value changes
     */
    // private static final int HEADER_SIZE_ = 0xC4;
    /**
     * Contraction table new element indicator
     */
    private static final int CONTRACTION_TABLE_NEW_ELEMENT_ = 0xFFFFFF;
    /**
     * Parser for the rules
     */
    private CollationRuleParser m_parser_;
    /**
     * Utility UCA collation element iterator
     */
    private CollationElementIterator m_utilColEIter_;
    /**
     * Utility data members. These are scratch objects that the methods of
     * this class reuse instead of allocating new ones on every call.
     */
    // one weight generator per basic strength (indexed by Collator.PRIMARY,
    // SECONDARY and TERTIARY in initBuffers())
    private CEGenerator m_utilGens_[] = { new CEGenerator(),
            new CEGenerator(), new CEGenerator() };
    // the primary/secondary/tertiary parts of the CE being assembled
    private int m_utilCEBuffer_[] = new int[CE_BASIC_STRENGTH_LIMIT_];
    // per-strength counters in initBuffers(); byte counts in doCE()
    private int m_utilIntBuffer_[] = new int[CE_STRENGTH_LIMIT_];
    private Elements m_utilElement_ = new Elements();
    private Elements m_utilElement2_ = new Elements();
    private CollationRuleParser.Token m_utilToken_ = new CollationRuleParser.Token();
    private int m_utilCountBuffer_[] = new int[6];
    private long m_utilLongBuffer_[] = new long[5];
    private WeightRange m_utilLowerWeightRange_[] = { new WeightRange(),
            new WeightRange(), new WeightRange(), new WeightRange(),
            new WeightRange() };
    private WeightRange m_utilUpperWeightRange_[] = { new WeightRange(),
            new WeightRange(), new WeightRange(), new WeightRange(),
            new WeightRange() };
    private WeightRange m_utilWeightRange_ = new WeightRange();
    private final Normalizer2Impl m_nfcImpl_ = Norm2AllModes.getNFCInstance().impl;
    private CanonicalIterator m_utilCanIter_ = new CanonicalIterator("");
    private StringBuilder m_utilStringBuffer_ = new StringBuilder("");
    // Flag indicating a combining marks table is required or not.
private static boolean buildCMTabFlag = false;

    // private methods -------------------------------------------------------

    /**
     * Computes the collation elements for every token of one token list.
     * <p>
     * The list is walked twice. The first pass goes backwards, from
     * m_last_ to the head, and fills in m_toInsert_ for every token, using
     * m_utilIntBuffer_ as one counter per strength. After the gap positions
     * have been obtained from the inverse UCA, the second pass goes forwards
     * from m_first_: for each token it produces the primary, secondary and
     * tertiary parts in m_utilCEBuffer_ and hands them to doCE().
     *
     * @param listheader
     *            parsed rule tokens
     * @exception Exception
     *                thrown when internal error occurs
     */
    private void initBuffers(CollationRuleParser.TokenListHeader listheader)
            throws Exception {
        CollationRuleParser.Token token = listheader.m_last_;
        Arrays.fill(m_utilIntBuffer_, 0, CE_STRENGTH_LIMIT_, 0);

        // pass 1: backwards walk that counts tokens per strength
        token.m_toInsert_ = 1;
        m_utilIntBuffer_[token.m_strength_] = 1;
        while (token.m_previous_ != null) {
            if (token.m_previous_.m_strength_ < token.m_strength_) {
                // going up
                m_utilIntBuffer_[token.m_strength_] = 0;
                m_utilIntBuffer_[token.m_previous_.m_strength_]++;
            } else if (token.m_previous_.m_strength_ > token.m_strength_) {
                // going down
                m_utilIntBuffer_[token.m_previous_.m_strength_] = 1;
            } else {
                m_utilIntBuffer_[token.m_strength_]++;
            }
            token = token.m_previous_;
            token.m_toInsert_ = m_utilIntBuffer_[token.m_strength_];
        }
        token.m_toInsert_ = m_utilIntBuffer_[token.m_strength_];
        INVERSE_UCA_.getInverseGapPositions(listheader);

        // pass 2: forwards walk that generates the CE parts
        token = listheader.m_first_;
        int fstrength = Collator.IDENTICAL;
        // lowest strength value seen so far in this list
        int initstrength = Collator.IDENTICAL;

        // start out from the base CE (and its continuation) of the list
        m_utilCEBuffer_[Collator.PRIMARY] = mergeCE(listheader.m_baseCE_,
                listheader.m_baseContCE_, Collator.PRIMARY);
        m_utilCEBuffer_[Collator.SECONDARY] = mergeCE(listheader.m_baseCE_,
                listheader.m_baseContCE_, Collator.SECONDARY);
        m_utilCEBuffer_[Collator.TERTIARY] = mergeCE(listheader.m_baseCE_,
                listheader.m_baseContCE_, Collator.TERTIARY);
        while (token != null) {
            fstrength = token.m_strength_;
            if (fstrength < initstrength) {
                // first token with this (numerically lower) strength: the
                // generators are (re)seeded from the gap tables
                initstrength = fstrength;
                if (listheader.m_pos_[fstrength] == -1) {
                    // no position recorded for this strength; fall back to
                    // the nearest lower strength value that has one
                    while (listheader.m_pos_[fstrength] == -1 && fstrength > 0) {
                        fstrength--;
                    }
                    if (listheader.m_pos_[fstrength] == -1) {
                        throw new Exception("Internal program error");
                    }
                }
                // the gap tables hold three entries (primary, secondary,
                // tertiary) per strength, hence the fstrength * 3 indexing
                if (initstrength == Collator.TERTIARY) {
                    // starting with tertiary
                    m_utilCEBuffer_[Collator.PRIMARY] = listheader.m_gapsLo_[fstrength * 3];
                    m_utilCEBuffer_[Collator.SECONDARY] = listheader.m_gapsLo_[fstrength * 3 + 1];
                    m_utilCEBuffer_[Collator.TERTIARY] = getCEGenerator(
                            m_utilGens_[Collator.TERTIARY],
                            listheader.m_gapsLo_, listheader.m_gapsHi_, token,
                            fstrength);
                } else if (initstrength == Collator.SECONDARY) {
                    // secondaries
                    m_utilCEBuffer_[Collator.PRIMARY] = listheader.m_gapsLo_[fstrength * 3];
                    m_utilCEBuffer_[Collator.SECONDARY] = getCEGenerator(
                            m_utilGens_[Collator.SECONDARY],
                            listheader.m_gapsLo_, listheader.m_gapsHi_, token,
                            fstrength);
                    m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
                            m_utilGens_[Collator.TERTIARY], token,
                            Collator.TERTIARY);
                } else {
                    // primaries
                    m_utilCEBuffer_[Collator.PRIMARY] = getCEGenerator(
                            m_utilGens_[Collator.PRIMARY],
                            listheader.m_gapsLo_, listheader.m_gapsHi_, token,
                            fstrength);
                    m_utilCEBuffer_[Collator.SECONDARY] = getSimpleCEGenerator(
                            m_utilGens_[Collator.SECONDARY], token,
                            Collator.SECONDARY);
                    m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
                            m_utilGens_[Collator.TERTIARY], token,
                            Collator.TERTIARY);
                }
            } else {
                // same or weaker strength than already seen: take the next
                // weight at the token's own level and restart the simple
                // generators of all levels below it
                if (token.m_strength_ == Collator.TERTIARY) {
                    m_utilCEBuffer_[Collator.TERTIARY] = getNextGenerated(m_utilGens_[Collator.TERTIARY]);
                } else if (token.m_strength_ == Collator.SECONDARY) {
                    m_utilCEBuffer_[Collator.SECONDARY] = getNextGenerated(m_utilGens_[Collator.SECONDARY]);
                    m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
                            m_utilGens_[Collator.TERTIARY], token,
                            Collator.TERTIARY);
                } else if (token.m_strength_ == Collator.PRIMARY) {
                    m_utilCEBuffer_[Collator.PRIMARY] = getNextGenerated(m_utilGens_[Collator.PRIMARY]);
                    m_utilCEBuffer_[Collator.SECONDARY] = getSimpleCEGenerator(
                            m_utilGens_[Collator.SECONDARY], token,
                            Collator.SECONDARY);
                    m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
                            m_utilGens_[Collator.TERTIARY], token,
                            Collator.TERTIARY);
                }
            }
            // pack the three parts into the token's CEs
            doCE(m_utilCEBuffer_, token);
            token = token.m_next_;
        }
    }

    /**
     * Get the next generated ce. The weight obtained from nextWeight(g) is
     * also stored in g.m_current_.
     *
     * @param g
     *            ce generator
     * @return next generated ce
     */
    private int
getNextGenerated(CEGenerator g) {
        final int next = nextWeight(g);
        g.m_current_ = next;
        return next;
    }

    /**
     * Prepares a generator for a level that starts over at the common weight.
     * A weight range is allocated for the tokens that follow at the given
     * strength, and the generator's current weight is set to the common byte
     * placed in the top byte.
     *
     * @param g
     *            CEGenerator to set up
     * @param token
     *            rule token; when its successor has the argument strength,
     *            the successor's m_toInsert_ is the number of weights asked
     *            for
     * @param strength
     *            level to generate for; Collator.SECONDARY selects the
     *            secondary range, any other value the tertiary range
     * @return the generator's current weight (the common weight)
     * @exception Exception
     *                thrown when no weight range could be allocated
     */
    private int getSimpleCEGenerator(CEGenerator g,
            CollationRuleParser.Token token, int strength) throws Exception {
        final boolean forSecondary = (strength == Collator.SECONDARY);

        int maxbyte;
        if (strength == Collator.TERTIARY) {
            maxbyte = 0x3F;
        } else {
            maxbyte = 0xFF;
        }

        // secondary weights live above COMMON_TOP_2_, everything else between
        // the common byte (0x05000000) and 0x40000000
        final int low = forSecondary
                ? RuleBasedCollator.COMMON_TOP_2_ << 24
                : RuleBasedCollator.BYTE_COMMON_ << 24;
        final int high = forSecondary ? 0xFFFFFFFF : 0x40000000;

        int count = forSecondary
                ? 0xFF - RuleBasedCollator.COMMON_TOP_2_
                : 0x40 - RuleBasedCollator.BYTE_COMMON_;
        final CollationRuleParser.Token following = token.m_next_;
        if (following != null && following.m_strength_ == strength) {
            // the next token tells us how many weights are really needed
            count = following.m_toInsert_;
        }

        g.m_rangesLength_ = allocateWeights(low, high, count, maxbyte,
                g.m_ranges_);
        g.m_current_ = RuleBasedCollator.BYTE_COMMON_ << 24;

        if (g.m_rangesLength_ == 0) {
            throw new Exception("Internal program error");
        }
        return g.m_current_;
    }

    /**
     * Merges the weights of one level from a CE and its continuation CE into
     * a single left-aligned value.
     *
     * @param ce1
     *            first ce
     * @param ce2
     *            second (continuation) ce
     * @param strength
     *            level to extract: Collator.PRIMARY, Collator.SECONDARY, or
     *            anything else for tertiary
     * @return combined ce
     */
    private static int mergeCE(int ce1, int ce2, int strength) {
        if (strength == Collator.PRIMARY) {
            final int lead = ce1 & RuleBasedCollator.CE_PRIMARY_MASK_;
            final int cont = ce2 & RuleBasedCollator.CE_PRIMARY_MASK_;
            return lead | (cont >>> 16);
        }
        if (strength == Collator.SECONDARY) {
            final int lead = ce1 & RuleBasedCollator.CE_SECONDARY_MASK_;
            final int cont = ce2 & RuleBasedCollator.CE_SECONDARY_MASK_;
            return (lead << 16) | (cont << 8);
        }
        final int lead = ce1 & RuleBasedCollator.CE_TERTIARY_MASK_;
        final int cont = ce2 & RuleBasedCollator.CE_TERTIARY_MASK_;
        return (lead << 24) | (cont << 16);
    }

    /**
     * Prepares a generator from the gap between two existing weights and
     * returns the first weight taken from it.
     *
     * @param g
     *            CEGenerator to set up
     * @param lows
     *            low gap array, three entries per strength
     * @param highs
     *            high gap array, three entries per strength
     * @param token
     *            rule token; supplies the level (m_strength_) and the number
     *            of weights to allocate (m_toInsert_)
     * @param fstrength
     *            strength whose group of three entries is read from the gap
     *            arrays
     * @return the first generated weight, also stored in g.m_current_
     * @exception Exception
     *                thrown when internal error occurs
     */
    private int getCEGenerator(CEGenerator g, int lows[], int highs[],
CollationRuleParser.Token token, int fstrength) throws Exception { int strength = token.m_strength_; int low = lows[fstrength * 3 + strength]; int high = highs[fstrength * 3 + strength]; int maxbyte = 0; if (strength == Collator.TERTIARY) { maxbyte = 0x3F; } else if (strength == Collator.PRIMARY) { maxbyte = 0xFE; } else { maxbyte = 0xFF; } int count = token.m_toInsert_; if (Utility.compareUnsigned(low, high) >= 0 && strength > Collator.PRIMARY) { int s = strength; while (true) { s--; if (lows[fstrength * 3 + s] != highs[fstrength * 3 + s]) { if (strength == Collator.SECONDARY) { if (low < (RuleBasedCollator.COMMON_TOP_2_ << 24)) { // Override if low range is less than // UCOL_COMMON_TOP2. low = RuleBasedCollator.COMMON_TOP_2_ << 24; } high = 0xFFFFFFFF; } else { if (low < RuleBasedCollator.COMMON_BOTTOM_3 << 24) { // Override if low range is less than // UCOL_COMMON_BOT3. low = RuleBasedCollator.COMMON_BOTTOM_3 << 24; } high = 0x40000000; } break; } if (s < 0) { throw new Exception("Internal program error"); } } } if (low == 0) { low = 0x01000000; } if (strength == Collator.SECONDARY) { // similar as simple if (Utility.compareUnsigned(low, RuleBasedCollator.COMMON_BOTTOM_2_ << 24) >= 0 && Utility.compareUnsigned(low, RuleBasedCollator.COMMON_TOP_2_ << 24) < 0) { low = RuleBasedCollator.COMMON_TOP_2_ << 24; } if (Utility.compareUnsigned(high, RuleBasedCollator.COMMON_BOTTOM_2_ << 24) > 0 && Utility.compareUnsigned(high, RuleBasedCollator.COMMON_TOP_2_ << 24) < 0) { high = RuleBasedCollator.COMMON_TOP_2_ << 24; } if (Utility.compareUnsigned(low, RuleBasedCollator.COMMON_BOTTOM_2_ << 24) < 0) { g.m_rangesLength_ = allocateWeights( RuleBasedCollator.BYTE_UNSHIFTED_MIN_ << 24, high, count, maxbyte, g.m_ranges_); g.m_current_ = nextWeight(g); // g.m_current_ = RuleBasedCollator.COMMON_BOTTOM_2_ << 24; return g.m_current_; } } g.m_rangesLength_ = allocateWeights(low, high, count, maxbyte, g.m_ranges_); if (g.m_rangesLength_ == 0) { throw new Exception("Internal program 
error"); } g.m_current_ = nextWeight(g); return g.m_current_; } /** * @param ceparts * list of collation elements parts * @param token * rule token * @exception Exception * thrown when forming case bits for expansions fails */ private void doCE(int ceparts[], CollationRuleParser.Token token) throws Exception { // this one makes the table and stuff // int noofbytes[] = new int[3]; for (int i = 0; i < 3; i++) { // noofbytes[i] = countBytes(ceparts[i]); m_utilIntBuffer_[i] = countBytes(ceparts[i]); } // Here we have to pack CEs from parts int cei = 0; int value = 0; while ((cei << 1) < m_utilIntBuffer_[0] || cei < m_utilIntBuffer_[1] || cei < m_utilIntBuffer_[2]) { if (cei > 0) { value = RuleBasedCollator.CE_CONTINUATION_MARKER_; } else { value = 0; } if ((cei << 1) < m_utilIntBuffer_[0]) { value |= ((ceparts[0] >> (32 - ((cei + 1) << 4))) & 0xFFFF) << 16; } if (cei < m_utilIntBuffer_[1]) { value |= ((ceparts[1] >> (32 - ((cei + 1) << 3))) & 0xFF) << 8; } if (cei < m_utilIntBuffer_[2]) { value |= ((ceparts[2] >> (32 - ((cei + 1) << 3))) & 0x3F); } token.m_CE_[cei] = value; cei++; } if (cei == 0) { // totally ignorable token.m_CELength_ = 1; token.m_CE_[0] = 0; } else { // there is at least something token.m_CELength_ = cei; } // Case bits handling for expansion if (token.m_CE_[0] != 0) { // case bits should be set only for // non-ignorables int startoftokenrule = token.m_source_ & 0xFF; if ((token.m_source_ >>> 24) > 1) { // Do it manually int length = token.m_source_ >>> 24; String tokenstr = token.m_rules_.substring(startoftokenrule, startoftokenrule + length); token.m_CE_[0] |= getCaseBits(tokenstr); } else { // Copy it from the UCA int caseCE = getFirstCE(token.m_rules_.charAt(startoftokenrule)); token.m_CE_[0] |= (caseCE & 0xC0); } } } /** * Count the number of non-zero bytes used in the ce * * @param ce * @return number of non-zero bytes used in ce */ private static final int countBytes(int ce) { int mask = 0xFFFFFFFF; int result = 0; while (mask != 0) { if ((ce 
& mask) != 0) { result++; } mask >>>= 8; } return result; } /** * We are ready to create collation elements * * @param t * build table to insert * @param lh * rule token list header */ private void createElements(BuildTable t, CollationRuleParser.TokenListHeader lh) { CollationRuleParser.Token tok = lh.m_first_; m_utilElement_.clear(); while (tok != null) { // first, check if there are any expansions // if there are expansions, we need to do a little bit more // processing since parts of expansion can be tailored, while // others are not if (tok.m_expansion_ != 0) { int len = tok.m_expansion_ >>> 24; int currentSequenceLen = len; int expOffset = tok.m_expansion_ & 0x00FFFFFF; m_utilToken_.m_source_ = currentSequenceLen | expOffset; m_utilToken_.m_rules_ = m_parser_.m_source_; while (len > 0) { currentSequenceLen = len; while (currentSequenceLen > 0) { m_utilToken_.m_source_ = (currentSequenceLen << 24) | expOffset; CollationRuleParser.Token expt = m_parser_.m_hashTable_.get(m_utilToken_); if (expt != null && expt.m_strength_ != CollationRuleParser.TOKEN_RESET_) { // expansion is tailored int noOfCEsToCopy = expt.m_CELength_; for (int j = 0; j < noOfCEsToCopy; j++) { tok.m_expCE_[tok.m_expCELength_ + j] = expt.m_CE_[j]; } tok.m_expCELength_ += noOfCEsToCopy; // never try to add codepoints and CEs. // For some odd reason, it won't work. expOffset += currentSequenceLen; // noOfCEsToCopy; len -= currentSequenceLen; // noOfCEsToCopy; break; } else { currentSequenceLen--; } } if (currentSequenceLen == 0) { // couldn't find any tailored subsequence, will have to // get one from UCA. 
first, get the UChars from the // rules then pick CEs out until there is no more and // stuff them into expansion m_utilColEIter_.setText(m_parser_.m_source_.substring( expOffset, expOffset + 1)); while (true) { int order = m_utilColEIter_.next(); if (order == CollationElementIterator.NULLORDER) { break; } tok.m_expCE_[tok.m_expCELength_++] = order; } expOffset++; len--; } } } else { tok.m_expCELength_ = 0; } // set the ucaelement with obtained values m_utilElement_.m_CELength_ = tok.m_CELength_ + tok.m_expCELength_; // copy CEs System.arraycopy(tok.m_CE_, 0, m_utilElement_.m_CEs_, 0, tok.m_CELength_); System.arraycopy(tok.m_expCE_, 0, m_utilElement_.m_CEs_, tok.m_CELength_, tok.m_expCELength_); // copy UChars // We kept prefix and source kind of together, as it is a kind of a // contraction. // However, now we have to slice the prefix off the main thing - m_utilElement_.m_prefix_ = 0;// el.m_prefixChars_; m_utilElement_.m_cPointsOffset_ = 0; // el.m_uchars_; if (tok.m_prefix_ != 0) { // we will just copy the prefix here, and adjust accordingly in // the addPrefix function in ucol_elm. The reason is that we // need to add both composed AND decomposed elements to the // unsafe table. 
int size = tok.m_prefix_ >> 24; int offset = tok.m_prefix_ & 0x00FFFFFF; m_utilElement_.m_prefixChars_ = m_parser_.m_source_.substring( offset, offset + size); size = (tok.m_source_ >> 24) - (tok.m_prefix_ >> 24); offset = (tok.m_source_ & 0x00FFFFFF) + (tok.m_prefix_ >> 24); m_utilElement_.m_uchars_ = m_parser_.m_source_.substring( offset, offset + size); } else { m_utilElement_.m_prefixChars_ = null; int offset = tok.m_source_ & 0x00FFFFFF; int size = tok.m_source_ >>> 24; m_utilElement_.m_uchars_ = m_parser_.m_source_.substring( offset, offset + size); } m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_; boolean containCombinMarks = false; for (int i = 0; i < m_utilElement_.m_cPoints_.length() - m_utilElement_.m_cPointsOffset_; i++) { if (isJamo(m_utilElement_.m_cPoints_.charAt(i))) { t.m_collator_.m_isJamoSpecial_ = true; break; } if (!buildCMTabFlag) { // check combining class int fcd = m_nfcImpl_.getFCD16FromSingleLead(m_utilElement_.m_cPoints_.charAt(i)); // TODO: review for handling supplementary characters if ((fcd & 0xff) == 0) { // reset flag when current char is not combining mark. 
containCombinMarks = false; } else { containCombinMarks = true; } } } if (!buildCMTabFlag && containCombinMarks) { buildCMTabFlag = true; } /*** * // Case bits handling m_utilElement_.m_CEs_[0] &= 0xFFFFFF3F; // * Clean the case bits field if (m_utilElement_.m_cPoints_.length() * - m_utilElement_.m_cPointsOffset_ > 1) { // Do it manually * m_utilElement_.m_CEs_[0] |= * getCaseBits(m_utilElement_.m_cPoints_); } else { // Copy it from * the UCA int caseCE = * getFirstCE(m_utilElement_.m_cPoints_.charAt(0)); * m_utilElement_.m_CEs_[0] |= (caseCE & 0xC0); } ***/ // and then, add it addAnElement(t, m_utilElement_); tok = tok.m_next_; } } /** * Testing if the string argument has case * * @param src * string * @return the case for this char array * @exception Exception * thrown when internal program error occurs */ private final int getCaseBits(String src) throws Exception { int uCount = 0; int lCount = 0; src = Normalizer.decompose(src, true); m_utilColEIter_.setText(src); for (int i = 0; i < src.length(); i++) { m_utilColEIter_.setText(src.substring(i, i + 1)); int order = m_utilColEIter_.next(); if (RuleBasedCollator.isContinuation(order)) { throw new Exception("Internal program error"); } if ((order & RuleBasedCollator.CE_CASE_BIT_MASK_) == UPPER_CASE_) { uCount++; } else { char ch = src.charAt(i); if (UCharacter.isLowerCase(ch)) { lCount++; } else { if (toSmallKana(ch) == ch && toLargeKana(ch) != ch) { lCount++; } } } } if (uCount != 0 && lCount != 0) { return MIXED_CASE_; } else if (uCount != 0) { return UPPER_CASE_; } else { return LOWER_CASE_; } } /** * Converts a char to the uppercase Kana * * @param ch * character to convert * @return the converted Kana character */ private static final char toLargeKana(char ch) { if (0x3042 < ch && ch < 0x30ef) { // Kana range switch (ch - 0x3000) { case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E: case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 
0xE5: case 0xEE: ch++; break; case 0xF5: ch = 0x30AB; break; case 0xF6: ch = 0x30B1; break; } } return ch; } /** * Converts a char to the lowercase Kana * * @param ch * character to convert * @return the converted Kana character */ private static final char toSmallKana(char ch) { if (0x3042 < ch && ch < 0x30ef) { // Kana range switch (ch - 0x3000) { case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F: case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF: ch--; break; case 0xAB: ch = 0x30F5; break; case 0xB1: ch = 0x30F6; break; } } return ch; } /** * This should be connected to special Jamo handling. */ private int getFirstCE(char ch) { m_utilColEIter_.setText(UCharacter.toString(ch)); return m_utilColEIter_.next(); } /** * This adds a read element, while testing for existence * * @param t * build table * @param element * @return ce */ private int addAnElement(BuildTable t, Elements element) { Vector expansions = t.m_expansions_; element.m_mapCE_ = 0; if (element.m_CELength_ == 1) { element.m_mapCE_ = element.m_CEs_[0]; } else { // unfortunately, it looks like we have to look for a long primary // here since in canonical closure we are going to hit some long // primaries from the first phase, and they will come back as // continuations/expansions destroying the effect of the previous // opitimization. A long primary is a three byte primary with // starting secondaries and tertiaries. 
// It can appear in long runs
            // of only primary differences (like east Asian tailorings) also,
            // it should not be an expansion, as expansions would break with
            // this
            if (element.m_CELength_ == 2 // a two CE expansion
                    && RuleBasedCollator.isContinuation(element.m_CEs_[1])
                    && (element.m_CEs_[1] & (~(0xFF << 24 | RuleBasedCollator.CE_CONTINUATION_MARKER_))) == 0 // that has only primaries in continuation
                    && (((element.m_CEs_[0] >> 8) & 0xFF) == RuleBasedCollator.BYTE_COMMON_)
                    // a common secondary
                    && ((element.m_CEs_[0] & 0xFF) == RuleBasedCollator.BYTE_COMMON_) // and a common tertiary
            ) {
                element.m_mapCE_ = RuleBasedCollator.CE_SPECIAL_FLAG_
                        // a long primary special
                        | (CE_LONG_PRIMARY_TAG_ << 24)
                        // first and second byte of primary
                        | ((element.m_CEs_[0] >> 8) & 0xFFFF00)
                        // third byte of primary
                        | ((element.m_CEs_[1] >> 24) & 0xFF);
            } else {
                // omitting expansion offset in builder
                // (HEADER_SIZE_ >> 2)
                // "&" binds tighter than "|": the mask applies to the shifted
                // expansion index only, which is then or-ed with flag and tag
                int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_
                        | (CE_EXPANSION_TAG_ << RuleBasedCollator.CE_TAG_SHIFT_)
                        | (addExpansion(expansions, element.m_CEs_[0]) << 4)
                        & 0xFFFFF0;

                for (int i = 1; i < element.m_CELength_; i++) {
                    addExpansion(expansions, element.m_CEs_[i]);
                }
                if (element.m_CELength_ <= 0xF) {
                    // the length fits into the low four bits
                    expansion |= element.m_CELength_;
                } else {
                    // too long for the length field: the sequence is
                    // terminated by a 0 entry instead
                    addExpansion(expansions, 0);
                }
                element.m_mapCE_ = expansion;
                setMaxExpansion(element.m_CEs_[element.m_CELength_ - 1],
                        (byte) element.m_CELength_, t.m_maxExpansions_);
                if (isJamo(element.m_cPoints_.charAt(0))) {
                    t.m_collator_.m_isJamoSpecial_ = true;
                    setMaxJamoExpansion(element.m_cPoints_.charAt(0),
                            element.m_CEs_[element.m_CELength_ - 1],
                            (byte) element.m_CELength_, t.m_maxJamoExpansions_);
                }
            }
        }

        // We treat digits differently - they are "uber special" and should be
        // processed differently if numeric collation is on.
        // uniChar stays 0 unless the element is a single code point (one
        // char, or two chars that start with a lead surrogate).
        int uniChar = 0;
        if ((element.m_uchars_.length() == 2)
                && UTF16.isLeadSurrogate(element.m_uchars_.charAt(0))) {
            uniChar = UCharacterProperty.getRawSupplementary(element.m_uchars_
                    .charAt(0), element.m_uchars_.charAt(1));
        } else if (element.m_uchars_.length() == 1) {
            uniChar = element.m_uchars_.charAt(0);
        }

        // Here, we either have one normal CE OR mapCE is set. Therefore, we
        // stuff only one element to the expansion buffer. When we encounter a
        // digit and we don't do numeric collation, we will just pick the CE
        // we have and break out of case (see ucol.cpp ucol_prv_getSpecialCE
        // && ucol_prv_getSpecialPrevCE). If we picked a special, further
        // processing will occur. If it's a simple CE, we'll return due
        // to how the loop is constructed.
        if (uniChar != 0 && UCharacter.isDigit(uniChar)) {
            // prepare the element
            int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_
                    | (CollationElementIterator.CE_DIGIT_TAG_ << RuleBasedCollator.CE_TAG_SHIFT_)
                    | 1;
            if (element.m_mapCE_ != 0) {
                // if there is an expansion, we'll pick it here
                expansion |= (addExpansion(expansions, element.m_mapCE_) << 4);
            } else {
                expansion |= (addExpansion(expansions, element.m_CEs_[0]) << 4);
            }
            element.m_mapCE_ = expansion;
        }

        // here we want to add the prefix structure.
        // I will try to process it as a reverse contraction, if possible.
        // prefix buffer is already reversed.

        if (element.m_prefixChars_ != null
                && element.m_prefixChars_.length() - element.m_prefix_ > 0) {
            // We keep the seen prefix starter elements in a hashtable we need
            // it to be able to distinguish between the simple codepoints and
            // prefix starters. Also, we need to use it for canonical closure.
            // m_utilElement2_ becomes a copy of the element whose prefix is
            // the composed form of the element's prefix. The arrays are
            // shared with the element, not copied.
            m_utilElement2_.m_caseBit_ = element.m_caseBit_;
            m_utilElement2_.m_CELength_ = element.m_CELength_;
            m_utilElement2_.m_CEs_ = element.m_CEs_;
            m_utilElement2_.m_mapCE_ = element.m_mapCE_;
            // m_utilElement2_.m_prefixChars_ = element.m_prefixChars_;
            m_utilElement2_.m_sizePrim_ = element.m_sizePrim_;
            m_utilElement2_.m_sizeSec_ = element.m_sizeSec_;
            m_utilElement2_.m_sizeTer_ = element.m_sizeTer_;
            m_utilElement2_.m_variableTop_ = element.m_variableTop_;
            m_utilElement2_.m_prefix_ = element.m_prefix_;
            m_utilElement2_.m_prefixChars_ = Normalizer.compose(
                    element.m_prefixChars_, false);
            m_utilElement2_.m_uchars_ = element.m_uchars_;
            m_utilElement2_.m_cPoints_ = element.m_cPoints_;
            m_utilElement2_.m_cPointsOffset_ = 0;

            if (t.m_prefixLookup_ != null) {
                Elements uCE = t.m_prefixLookup_.get(element);
                if (uCE != null) {
                    // there is already a set of code points here
                    element.m_mapCE_ = addPrefix(t, uCE.m_mapCE_, element);
                } else { // no code points, so this spot is clean
                    element.m_mapCE_ = addPrefix(t, CE_NOT_FOUND_, element);
                    // remember this prefix starter; the stored copy is keyed
                    // by the element's own chars
                    uCE = new Elements(element);
                    uCE.m_cPoints_ = uCE.m_uchars_;
                    t.m_prefixLookup_.put(uCE, uCE);
                }
                // the composed prefix is added as well, but only if it
                // differs from the element's prefix
                if (m_utilElement2_.m_prefixChars_.length() != element.m_prefixChars_
                        .length()
                        - element.m_prefix_
                        || !m_utilElement2_.m_prefixChars_.regionMatches(0,
                                element.m_prefixChars_, element.m_prefix_,
                                m_utilElement2_.m_prefixChars_.length())) {
                    // do it!
                    m_utilElement2_.m_mapCE_ = addPrefix(t, element.m_mapCE_,
                            m_utilElement2_);
                }
            }
        }

        // We need to use the canonical iterator here
        // the way we do it is to generate the canonically equivalent strings
        // for the contraction and then add the sequences that pass FCD check
        // (more than one char, and not just a single surrogate pair)
        if (element.m_cPoints_.length() - element.m_cPointsOffset_ > 1
                && !(element.m_cPoints_.length() - element.m_cPointsOffset_ == 2
                        && UTF16.isLeadSurrogate(element.m_cPoints_.charAt(0)) && UTF16
                        .isTrailSurrogate(element.m_cPoints_.charAt(1)))) {
            // this is a contraction, we should check whether a composed form
            // should also be included
            m_utilCanIter_.setSource(element.m_cPoints_);
            String source = m_utilCanIter_.next();
            while (source != null && source.length() > 0) {
                if (Normalizer.quickCheck(source, Normalizer.FCD, 0) != Normalizer.NO) {
                    element.m_uchars_ = source;
                    element.m_cPoints_ = element.m_uchars_;
                    finalizeAddition(t, element);
                }
                source = m_utilCanIter_.next();
            }

            return element.m_mapCE_;
        } else {
            return finalizeAddition(t, element);
        }
    }

    /**
     * Adds an expansion ce to the expansion vector
     *
     * @param expansions
     *            vector to add to
     * @param value
     *            of the expansion
     * @return the current position of the new element, that is the index of
     *         the last element of the vector after the addition
     */
    private static final int addExpansion(Vector expansions, int value) {
        expansions.add(new Integer(value));
        return expansions.size() - 1;
    }

    /**
     * Looks for the maximum length of all expansion sequences ending with the
     * same collation element. If the end collation element is already known,
     * its stored size is raised when the new size is larger; otherwise a new
     * entry is inserted.
     *
     * @param endexpansion
     *            the last expansion collation element to be added
     * @param expansionsize
     *            size of the expansion
     * @param maxexpansion
     *            data structure to store the maximum expansion data.
     * @return number of entries of the end expansion CE table of
     *         maxexpansion after the update
*/ private static int setMaxExpansion(int endexpansion, byte expansionsize, MaxExpansionTable maxexpansion) { int start = 0; int limit = maxexpansion.m_endExpansionCE_.size(); long unsigned = (long) endexpansion; unsigned &= 0xFFFFFFFFl; // using binary search to determine if last expansion element is // already in the array int result = -1; while (start < limit - 1) { int mid = start + ((limit - start) >> 1); long unsignedce = (maxexpansion.m_endExpansionCE_ .get(mid)).intValue(); unsignedce &= 0xFFFFFFFFl; if (unsigned <= unsignedce) { limit = mid; } else { start = mid; } } if ((maxexpansion.m_endExpansionCE_.get(start)).intValue() == endexpansion) { result = start; } else if ((maxexpansion.m_endExpansionCE_.get(limit)) .intValue() == endexpansion) { result = limit; } if (result > -1) { // found the ce in expansion, we'll just modify the size if it // is smaller Object currentsize = maxexpansion.m_expansionCESize_.get(result); if (((Byte) currentsize).byteValue() < expansionsize) { maxexpansion.m_expansionCESize_.set(result, new Byte( expansionsize)); } } else { // we'll need to squeeze the value into the array. initial // implementation. shifting the subarray down by 1 maxexpansion.m_endExpansionCE_.insertElementAt(new Integer( endexpansion), start + 1); maxexpansion.m_expansionCESize_.insertElementAt(new Byte( expansionsize), start + 1); } return maxexpansion.m_endExpansionCE_.size(); } /** * Sets the maximum length of all jamo expansion sequences ending with the * same collation element. The size required for maxexpansion and maxsize is * returned if the arrays are too small. * * @param ch * the jamo codepoint * @param endexpansion * the last expansion collation element to be added * @param expansionsize * size of the expansion * @param maxexpansion * data structure to store the maximum expansion data. * @returns size of the maxexpansion and maxsize used. 
*/ private static int setMaxJamoExpansion(char ch, int endexpansion, byte expansionsize, MaxJamoExpansionTable maxexpansion) { boolean isV = true; if (ch >= 0x1100 && ch <= 0x1112) { // determines L for Jamo, doesn't need to store this since it is // never at the end of a expansion if (maxexpansion.m_maxLSize_ < expansionsize) { maxexpansion.m_maxLSize_ = expansionsize; } return maxexpansion.m_endExpansionCE_.size(); } if (ch >= 0x1161 && ch <= 0x1175) { // determines V for Jamo if (maxexpansion.m_maxVSize_ < expansionsize) { maxexpansion.m_maxVSize_ = expansionsize; } } if (ch >= 0x11A8 && ch <= 0x11C2) { isV = false; // determines T for Jamo if (maxexpansion.m_maxTSize_ < expansionsize) { maxexpansion.m_maxTSize_ = expansionsize; } } int pos = maxexpansion.m_endExpansionCE_.size(); while (pos > 0) { pos--; if ((maxexpansion.m_endExpansionCE_.get(pos)).intValue() == endexpansion) { return maxexpansion.m_endExpansionCE_.size(); } } maxexpansion.m_endExpansionCE_.add(new Integer(endexpansion)); maxexpansion.m_isV_.add(isV ? Boolean.TRUE : Boolean.FALSE); return maxexpansion.m_endExpansionCE_.size(); } /** * Adds a prefix to the table * * @param t * build table to update * @param CE * collation element to add * @param element * rule element to add * @return modified ce */ private int addPrefix(BuildTable t, int CE, Elements element) { // currently the longest prefix we're supporting in Japanese is two // characters long. Although this table could quite easily mimic // complete contraction stuff there is no good reason to make a general // solution, as it would require some error prone messing. ContractionTable contractions = t.m_contractions_; String oldCP = element.m_cPoints_; int oldCPOffset = element.m_cPointsOffset_; contractions.m_currentTag_ = CE_SPEC_PROC_TAG_; // here, we will normalize & add prefix to the table. 
int size = element.m_prefixChars_.length() - element.m_prefix_; for (int j = 1; j < size; j++) { // First add NFD prefix chars to unsafe CP hash table // Unless it is a trail surrogate, which is handled algoritmically // and shouldn't take up space in the table. char ch = element.m_prefixChars_.charAt(j + element.m_prefix_); if (!UTF16.isTrailSurrogate(ch)) { unsafeCPSet(t.m_unsafeCP_, ch); } } // StringBuffer reversed = new StringBuffer(); m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length()); for (int j = 0; j < size; j++) { // prefixes are going to be looked up backwards // therefore, we will promptly reverse the prefix buffer... int offset = element.m_prefixChars_.length() - j - 1; m_utilStringBuffer_.append(element.m_prefixChars_.charAt(offset)); } element.m_prefixChars_ = m_utilStringBuffer_.toString(); element.m_prefix_ = 0; // the first codepoint is also unsafe, as it forms a 'contraction' with // the prefix if (!UTF16.isTrailSurrogate(element.m_cPoints_.charAt(0))) { unsafeCPSet(t.m_unsafeCP_, element.m_cPoints_.charAt(0)); } element.m_cPoints_ = element.m_prefixChars_; element.m_cPointsOffset_ = element.m_prefix_; // Add the last char of the contraction to the contraction-end hash // table. 
unless it is a trail surrogate, which is handled // algorithmically and shouldn't be in the table if (!UTF16.isTrailSurrogate(element.m_cPoints_ .charAt(element.m_cPoints_.length() - 1))) { ContrEndCPSet(t.m_contrEndCP_, element.m_cPoints_ .charAt(element.m_cPoints_.length() - 1)); } // First we need to check if contractions starts with a surrogate // int cp = UTF16.charAt(element.m_cPoints_, element.m_cPointsOffset_); // If there are any Jamos in the contraction, we should turn on special // processing for Jamos if (isJamo(element.m_prefixChars_.charAt(element.m_prefix_))) { t.m_collator_.m_isJamoSpecial_ = true; } // then we need to deal with it // we could aready have something in table - or we might not if (!isPrefix(CE)) { // if it wasn't contraction, we wouldn't end up here int firstContractionOffset = addContraction(contractions, CONTRACTION_TABLE_NEW_ELEMENT_, (char) 0, CE); int newCE = processContraction(contractions, element, CE_NOT_FOUND_); addContraction(contractions, firstContractionOffset, element.m_prefixChars_.charAt(element.m_prefix_), newCE); addContraction(contractions, firstContractionOffset, (char) 0xFFFF, CE); CE = constructSpecialCE(CE_SPEC_PROC_TAG_, firstContractionOffset); } else { // we are adding to existing contraction // there were already some elements in the table, so we need to add // a new contraction // Two things can happen here: either the codepoint is already in // the table, or it is not char ch = element.m_prefixChars_.charAt(element.m_prefix_); int position = findCP(contractions, CE, ch); if (position > 0) { // if it is we just continue down the chain int eCE = getCE(contractions, CE, position); int newCE = processContraction(contractions, element, eCE); setContraction(contractions, CE, position, ch, newCE); } else { // if it isn't, we will have to create a new sequence processContraction(contractions, element, CE_NOT_FOUND_); insertContraction(contractions, CE, ch, element.m_mapCE_); } } element.m_cPoints_ = oldCP; 
element.m_cPointsOffset_ = oldCPOffset; return CE; } /** * Checks if the argument ce is a contraction * * @param CE * collation element * @return true if argument ce is a contraction */ private static final boolean isContraction(int CE) { return isSpecial(CE) && (getCETag(CE) == CE_CONTRACTION_TAG_); } /** * Checks if the argument ce has a prefix * * @param CE * collation element * @return true if argument ce has a prefix */ private static final boolean isPrefix(int CE) { return isSpecial(CE) && (getCETag(CE) == CE_SPEC_PROC_TAG_); } /** * Checks if the argument ce is special * * @param CE * collation element * @return true if argument ce is special */ private static final boolean isSpecial(int CE) { return (CE & RuleBasedCollator.CE_SPECIAL_FLAG_) == 0xF0000000; } /** * Checks if the argument ce has a prefix * * @param CE * collation element * @return true if argument ce has a prefix */ private static final int getCETag(int CE) { return (CE & RuleBasedCollator.CE_TAG_MASK_) >>> RuleBasedCollator.CE_TAG_SHIFT_; } /** * Gets the ce at position in contraction table * * @param table * contraction table * @param position * offset to the contraction table * @return ce */ private static final int getCE(ContractionTable table, int element, int position) { element &= 0xFFFFFF; BasicContractionTable tbl = getBasicContractionTable(table, element); if (tbl == null) { return CE_NOT_FOUND_; } if (position > tbl.m_CEs_.size() || position == -1) { return CE_NOT_FOUND_; } else { return tbl.m_CEs_.get(position).intValue(); } } /** * Sets the unsafe character * * @param table * unsafe table * @param c * character to be added */ private static final void unsafeCPSet(byte table[], char c) { int hash = c; if (hash >= (UNSAFECP_TABLE_SIZE_ << 3)) { if (hash >= 0xd800 && hash <= 0xf8ff) { // Part of a surrogate, or in private use area. 
// These don't go in the table return; } hash = (hash & UNSAFECP_TABLE_MASK_) + 256; } table[hash >> 3] |= (1 << (hash & 7)); } /** * Sets the contraction end character * * @param table * contraction end table * @param c * character to be added */ private static final void ContrEndCPSet(byte table[], char c) { int hash = c; if (hash >= (UNSAFECP_TABLE_SIZE_ << 3)) { hash = (hash & UNSAFECP_TABLE_MASK_) + 256; } table[hash >> 3] |= (1 << (hash & 7)); } /** * Adds more contractions in table. If element is non existant, it creates * on. Returns element handle * * @param table * contraction table * @param element * offset to the contraction table * @param codePoint * codepoint to add * @param value * @return collation element */ private static int addContraction(ContractionTable table, int element, char codePoint, int value) { BasicContractionTable tbl = getBasicContractionTable(table, element); if (tbl == null) { tbl = addAContractionElement(table); element = table.m_elements_.size() - 1; } tbl.m_CEs_.add(new Integer(value)); tbl.m_codePoints_.append(codePoint); return constructSpecialCE(table.m_currentTag_, element); } /** * Adds a contraction element to the table * * @param table * contraction table to update * @return contraction */ private static BasicContractionTable addAContractionElement( ContractionTable table) { BasicContractionTable result = new BasicContractionTable(); table.m_elements_.add(result); return result; } /** * Constructs a special ce * * @param tag * special tag * @param CE * collation element * @return a contraction ce */ private static final int constructSpecialCE(int tag, int CE) { return RuleBasedCollator.CE_SPECIAL_FLAG_ | (tag << RuleBasedCollator.CE_TAG_SHIFT_) | (CE & 0xFFFFFF); } /** * Sets and inserts the element that has a contraction * * @param contractions * contraction table * @param element * contracting element * @param existingCE * @return contraction ce */ private static int processContraction(ContractionTable contractions, 
Elements element, int existingCE) { int firstContractionOffset = 0; // end of recursion if (element.m_cPoints_.length() - element.m_cPointsOffset_ == 1) { if (isContractionTableElement(existingCE) && getCETag(existingCE) == contractions.m_currentTag_) { changeContraction(contractions, existingCE, (char) 0, element.m_mapCE_); changeContraction(contractions, existingCE, (char) 0xFFFF, element.m_mapCE_); return existingCE; } else { // can't do just that. existingCe might be a contraction, // meaning that we need to do another step return element.m_mapCE_; } } // this recursion currently feeds on the only element we have... // We will have to copy it in order to accomodate for both backward // and forward cycles // we encountered either an empty space or a non-contraction element // this means we are constructing a new contraction sequence element.m_cPointsOffset_++; if (!isContractionTableElement(existingCE)) { // if it wasn't contraction, we wouldn't end up here firstContractionOffset = addContraction(contractions, CONTRACTION_TABLE_NEW_ELEMENT_, (char) 0, existingCE); int newCE = processContraction(contractions, element, CE_NOT_FOUND_); addContraction(contractions, firstContractionOffset, element.m_cPoints_.charAt(element.m_cPointsOffset_), newCE); addContraction(contractions, firstContractionOffset, (char) 0xFFFF, existingCE); existingCE = constructSpecialCE(contractions.m_currentTag_, firstContractionOffset); } else { // we are adding to existing contraction // there were already some elements in the table, so we need to add // a new contraction // Two things can happen here: either the codepoint is already in // the table, or it is not int position = findCP(contractions, existingCE, element.m_cPoints_ .charAt(element.m_cPointsOffset_)); if (position > 0) { // if it is we just continue down the chain int eCE = getCE(contractions, existingCE, position); int newCE = processContraction(contractions, element, eCE); setContraction(contractions, existingCE, position, 
element.m_cPoints_.charAt(element.m_cPointsOffset_), newCE); } else { // if it isn't, we will have to create a new sequence int newCE = processContraction(contractions, element, CE_NOT_FOUND_); insertContraction(contractions, existingCE, element.m_cPoints_ .charAt(element.m_cPointsOffset_), newCE); } } element.m_cPointsOffset_--; return existingCE; } /** * Checks if CE belongs to the contraction table * * @param CE * collation element to test * @return true if CE belongs to the contraction table */ private static final boolean isContractionTableElement(int CE) { return isSpecial(CE) && (getCETag(CE) == CE_CONTRACTION_TAG_ || getCETag(CE) == CE_SPEC_PROC_TAG_); } /** * Gets the codepoint * * @param table * contraction table * @param element * offset to the contraction element in the table * @param codePoint * code point to look for * @return the offset to the code point */ private static int findCP(ContractionTable table, int element, char codePoint) { BasicContractionTable tbl = getBasicContractionTable(table, element); if (tbl == null) { return -1; } int position = 0; while (codePoint > tbl.m_codePoints_.charAt(position)) { position++; if (position > tbl.m_codePoints_.length()) { return -1; } } if (codePoint == tbl.m_codePoints_.charAt(position)) { return position; } else { return -1; } } /** * Gets the contraction element out of the contraction table * * @param table * contraction table * @param offset * to the element in the contraction table * @return basic contraction element at offset in the contraction table */ private static final BasicContractionTable getBasicContractionTable( ContractionTable table, int offset) { offset &= 0xFFFFFF; if (offset == 0xFFFFFF) { return null; } return table.m_elements_.get(offset); } /** * Changes the contraction element * * @param table * contraction table * @param element * offset to the element in the contraction table * @param codePoint * codepoint * @param newCE * new collation element * @return basic contraction element 
at offset in the contraction table */
    private static final int changeContraction(ContractionTable table,
            int element, char codePoint, int newCE) {
        BasicContractionTable tbl = getBasicContractionTable(table, element);
        if (tbl == null) {
            return 0;
        }

        // Scan the ascending code points for codePoint. The bound is tested
        // before every charAt() so that running off the end of the buffer
        // yields CE_NOT_FOUND_ instead of an exception.
        final int length = tbl.m_codePoints_.length();
        int position = 0;
        while (position < length
                && codePoint > tbl.m_codePoints_.charAt(position)) {
            position++;
        }
        if (position < length
                && codePoint == tbl.m_codePoints_.charAt(position)) {
            tbl.m_CEs_.set(position, Integer.valueOf(newCE));
            return element & 0xFFFFFF;
        }
        return CE_NOT_FOUND_;
    }

    /**
     * Sets a part of contraction sequence in table. If element is non existent,
     * it creates one. Returns element handle.
     *
     * @param table
     *            contraction table
     * @param element
     *            offset to the contraction table
     * @param offset
     *            index inside the basic contraction table that is overwritten
     * @param codePoint
     *            contraction character
     * @param value
     *            ce value
     * @return new contraction ce
     */
    private static final int setContraction(ContractionTable table,
            int element, int offset, char codePoint, int value) {
        element &= 0xFFFFFF;
        BasicContractionTable tbl = getBasicContractionTable(table, element);
        if (tbl == null) {
            tbl = addAContractionElement(table);
            element = table.m_elements_.size() - 1;
        }

        tbl.m_CEs_.set(offset, Integer.valueOf(value));
        tbl.m_codePoints_.setCharAt(offset, codePoint);
        return constructSpecialCE(table.m_currentTag_, element);
    }

    /**
     * Inserts a part of contraction sequence in table. Sequences behind the
     * offset are moved back. If element is non existent, it creates on.
* * @param table * contraction * @param element * offset to the table contraction * @param codePoint * code point * @param value * collation element value * @return contraction collation element */ private static final int insertContraction(ContractionTable table, int element, char codePoint, int value) { element &= 0xFFFFFF; BasicContractionTable tbl = getBasicContractionTable(table, element); if (tbl == null) { tbl = addAContractionElement(table); element = table.m_elements_.size() - 1; } int offset = 0; while (tbl.m_codePoints_.charAt(offset) < codePoint && offset < tbl.m_codePoints_.length()) { offset++; } tbl.m_CEs_.insertElementAt(new Integer(value), offset); tbl.m_codePoints_.insert(offset, codePoint); return constructSpecialCE(table.m_currentTag_, element); } /** * Finalize addition * * @param t * build table * @param element * to add */ private final static int finalizeAddition(BuildTable t, Elements element) { int CE = CE_NOT_FOUND_; // This should add a completely ignorable element to the // unsafe table, so that backward iteration will skip // over it when treating contractions. if (element.m_mapCE_ == 0) { for (int i = 0; i < element.m_cPoints_.length(); i++) { char ch = element.m_cPoints_.charAt(i); if (!UTF16.isTrailSurrogate(ch)) { unsafeCPSet(t.m_unsafeCP_, ch); } } } if (element.m_cPoints_.length() - element.m_cPointsOffset_ > 1) { // we're adding a contraction int cp = UTF16.charAt(element.m_cPoints_, element.m_cPointsOffset_); CE = t.m_mapping_.getValue(cp); CE = addContraction(t, CE, element); } else { // easy case CE = t.m_mapping_.getValue(element.m_cPoints_ .charAt(element.m_cPointsOffset_)); if (CE != CE_NOT_FOUND_) { if (isContractionTableElement(CE)) { // adding a non contraction element (thai, expansion, // single) to already existing contraction if (!isPrefix(element.m_mapCE_)) { // we cannot reenter prefix elements - as we are going // to create a dead loop // Only expansions and regular CEs can go here... 
// Contractions will never happen in this place setContraction(t.m_contractions_, CE, 0, (char) 0, element.m_mapCE_); // This loop has to change the CE at the end of // contraction REDO! changeLastCE(t.m_contractions_, CE, element.m_mapCE_); } } else { t.m_mapping_ .setValue(element.m_cPoints_ .charAt(element.m_cPointsOffset_), element.m_mapCE_); if (element.m_prefixChars_ != null && element.m_prefixChars_.length() > 0 && getCETag(CE) != CE_IMPLICIT_TAG_) { // Add CE for standalone precontext char. Elements origElem = new Elements(); origElem.m_prefixChars_ = null; origElem.m_uchars_ = element.m_cPoints_; origElem.m_cPoints_ = origElem.m_uchars_; origElem.m_CEs_[0] = CE; origElem.m_mapCE_ = CE; origElem.m_CELength_ = 1; finalizeAddition(t, origElem); } } } else { t.m_mapping_.setValue(element.m_cPoints_ .charAt(element.m_cPointsOffset_), element.m_mapCE_); } } return CE; } /** * Note regarding surrogate handling: We are interested only in the single * or leading surrogates in a contraction. If a surrogate is somewhere else * in the contraction, it is going to be handled as a pair of code units, as * it doesn't affect the performance AND handling surrogates specially would * complicate code way too much. */ private static int addContraction(BuildTable t, int CE, Elements element) { ContractionTable contractions = t.m_contractions_; contractions.m_currentTag_ = CE_CONTRACTION_TAG_; // First we need to check if contractions starts with a surrogate int cp = UTF16.charAt(element.m_cPoints_, 0); int cpsize = 1; if (UCharacter.isSupplementary(cp)) { cpsize = 2; } if (cpsize < element.m_cPoints_.length()) { // This is a real contraction, if there are other characters after // the first int size = element.m_cPoints_.length() - element.m_cPointsOffset_; for (int j = 1; j < size; j++) { // First add contraction chars to unsafe CP hash table // Unless it is a trail surrogate, which is handled // algoritmically and shouldn't take up space in the table. 
if (!UTF16.isTrailSurrogate(element.m_cPoints_ .charAt(element.m_cPointsOffset_ + j))) { unsafeCPSet(t.m_unsafeCP_, element.m_cPoints_ .charAt(element.m_cPointsOffset_ + j)); } } // Add the last char of the contraction to the contraction-end // hash table. unless it is a trail surrogate, which is handled // algorithmically and shouldn't be in the table if (!UTF16.isTrailSurrogate(element.m_cPoints_ .charAt(element.m_cPoints_.length() - 1))) { ContrEndCPSet(t.m_contrEndCP_, element.m_cPoints_ .charAt(element.m_cPoints_.length() - 1)); } // If there are any Jamos in the contraction, we should turn on // special processing for Jamos if (isJamo(element.m_cPoints_.charAt(element.m_cPointsOffset_))) { t.m_collator_.m_isJamoSpecial_ = true; } // then we need to deal with it // we could aready have something in table - or we might not element.m_cPointsOffset_ += cpsize; if (!isContraction(CE)) { // if it wasn't contraction, we wouldn't end up here int firstContractionOffset = addContraction(contractions, CONTRACTION_TABLE_NEW_ELEMENT_, (char) 0, CE); int newCE = processContraction(contractions, element, CE_NOT_FOUND_); addContraction(contractions, firstContractionOffset, element.m_cPoints_.charAt(element.m_cPointsOffset_), newCE); addContraction(contractions, firstContractionOffset, (char) 0xFFFF, CE); CE = constructSpecialCE(CE_CONTRACTION_TAG_, firstContractionOffset); } else { // we are adding to existing contraction // there were already some elements in the table, so we need to // add a new contraction // Two things can happen here: either the codepoint is already // in the table, or it is not int position = findCP(contractions, CE, element.m_cPoints_ .charAt(element.m_cPointsOffset_)); if (position > 0) { // if it is we just continue down the chain int eCE = getCE(contractions, CE, position); int newCE = processContraction(contractions, element, eCE); setContraction( contractions, CE, position, element.m_cPoints_.charAt(element.m_cPointsOffset_), newCE); } else { // 
if it isn't, we will have to create a new sequence int newCE = processContraction(contractions, element, CE_NOT_FOUND_); insertContraction(contractions, CE, element.m_cPoints_ .charAt(element.m_cPointsOffset_), newCE); } } element.m_cPointsOffset_ -= cpsize; t.m_mapping_.setValue(cp, CE); } else if (!isContraction(CE)) { // this is just a surrogate, and there is no contraction t.m_mapping_.setValue(cp, element.m_mapCE_); } else { // fill out the first stage of the contraction with the surrogate // CE changeContraction(contractions, CE, (char) 0, element.m_mapCE_); changeContraction(contractions, CE, (char) 0xFFFF, element.m_mapCE_); } return CE; } /** * this is for adding non contractions * * @param table * contraction table * @param element * offset to the contraction table * @param value * collation element value * @return new collation element */ private static final int changeLastCE(ContractionTable table, int element, int value) { BasicContractionTable tbl = getBasicContractionTable(table, element); if (tbl == null) { return 0; } tbl.m_CEs_.set(tbl.m_CEs_.size() - 1, new Integer(value)); return constructSpecialCE(table.m_currentTag_, element & 0xFFFFFF); } /** * Given a set of ranges calculated by allocWeights(), iterate through the * weights. Sets the next weight in cegenerator.m_current_. 
* * @param cegenerator * object that contains ranges weight range array and its * rangeCount * @return the next weight */ private static int nextWeight(CEGenerator cegenerator) { if (cegenerator.m_rangesLength_ > 0) { // get maxByte from the .count field int maxByte = cegenerator.m_ranges_[0].m_count_; // get the next weight int weight = cegenerator.m_ranges_[0].m_start_; if (weight == cegenerator.m_ranges_[0].m_end_) { // this range is finished, remove it and move the following // ones up cegenerator.m_rangesLength_--; if (cegenerator.m_rangesLength_ > 0) { System.arraycopy(cegenerator.m_ranges_, 1, cegenerator.m_ranges_, 0, cegenerator.m_rangesLength_); cegenerator.m_ranges_[0].m_count_ = maxByte; // keep maxByte in ranges[0] } } else { // increment the weight for the next value cegenerator.m_ranges_[0].m_start_ = incWeight(weight, cegenerator.m_ranges_[0].m_length2_, maxByte); } return weight; } return -1; } /** * Increment the collation weight * * @param weight * to increment * @param length * @param maxByte * @return new incremented weight */ private static final int incWeight(int weight, int length, int maxByte) { while (true) { int b = getWeightByte(weight, length); if (b < maxByte) { return setWeightByte(weight, length, b + 1); } else { // roll over, set this byte to BYTE_FIRST_TAILORED_ and // increment the previous one weight = setWeightByte(weight, length, RuleBasedCollator.BYTE_FIRST_TAILORED_); --length; } } } /** * Gets the weight byte * * @param weight * @param index * @return byte */ private static final int getWeightByte(int weight, int index) { return (weight >> ((4 - index) << 3)) & 0xff; } /** * Set the weight byte in table * * @param weight * @param index * @param b * byte */ private static final int setWeightByte(int weight, int index, int b) { index <<= 3; // 0xffffffff except a 00 "hole" for the index-th byte int mask = 0xffffffff >>> index; index = 32 - index; mask |= 0xffffff00 << index; return (weight & mask) | (b << index); } /** * Call 
getWeightRanges and then determine heuristically which ranges to use * for a given number of weights between (excluding) two limits * * @param lowerLimit * @param upperLimit * @param n * @param maxByte * @param ranges * @return */ private int allocateWeights(int lowerLimit, int upperLimit, int n, int maxByte, WeightRange ranges[]) { // number of usable byte values 3..maxByte int countBytes = maxByte - RuleBasedCollator.BYTE_FIRST_TAILORED_ + 1; // [0] unused, [5] to make index checks unnecessary, m_utilCountBuffer_ // countBytes to the power of index, m_utilLongBuffer_ for unsignedness // gcc requires explicit initialization m_utilLongBuffer_[0] = 1; m_utilLongBuffer_[1] = countBytes; m_utilLongBuffer_[2] = m_utilLongBuffer_[1] * countBytes; m_utilLongBuffer_[3] = m_utilLongBuffer_[2] * countBytes; m_utilLongBuffer_[4] = m_utilLongBuffer_[3] * countBytes; int rangeCount = getWeightRanges(lowerLimit, upperLimit, maxByte, countBytes, ranges); if (rangeCount <= 0) { return 0; } // what is the maximum number of weights with these ranges? 
long maxCount = 0; for (int i = 0; i < rangeCount; ++i) { maxCount += (long) ranges[i].m_count_ * m_utilLongBuffer_[4 - ranges[i].m_length_]; } if (maxCount < n) { return 0; } // set the length2 and count2 fields for (int i = 0; i < rangeCount; ++i) { ranges[i].m_length2_ = ranges[i].m_length_; ranges[i].m_count2_ = ranges[i].m_count_; } // try until we find suitably large ranges while (true) { // get the smallest number of bytes in a range int minLength = ranges[0].m_length2_; // sum up the number of elements that fit into ranges of each byte // length Arrays.fill(m_utilCountBuffer_, 0); for (int i = 0; i < rangeCount; ++i) { m_utilCountBuffer_[ranges[i].m_length2_] += ranges[i].m_count2_; } // now try to allocate n elements in the available short ranges if (n <= m_utilCountBuffer_[minLength] + m_utilCountBuffer_[minLength + 1]) { // trivial cases, use the first few ranges maxCount = 0; rangeCount = 0; do { maxCount += ranges[rangeCount].m_count2_; ++rangeCount; } while (n > maxCount); break; } else if (n <= ranges[0].m_count2_ * countBytes) { // easy case, just make this one range large enough by // lengthening it once more, possibly split it rangeCount = 1; // calculate how to split the range between maxLength-1 // (count1) and maxLength (count2) long power_1 = m_utilLongBuffer_[minLength - ranges[0].m_length_]; long power = power_1 * countBytes; int count2 = (int) ((n + power - 1) / power); int count1 = ranges[0].m_count_ - count2; // split the range if (count1 < 1) { // lengthen the entire range to maxLength lengthenRange(ranges, 0, maxByte, countBytes); } else { // really split the range // create a new range with the end and initial and current // length of the old one rangeCount = 2; ranges[1].m_end_ = ranges[0].m_end_; ranges[1].m_length_ = ranges[0].m_length_; ranges[1].m_length2_ = minLength; // set the end of the first range according to count1 int i = ranges[0].m_length_; int b = getWeightByte(ranges[0].m_start_, i) + count1 - 1; // ranges[0].count and 
count1 may be >countBytes from // merging adjacent ranges; b > maxByte is possible if (b <= maxByte) { ranges[0].m_end_ = setWeightByte(ranges[0].m_start_, i, b); } else { ranges[0].m_end_ = setWeightByte(incWeight( ranges[0].m_start_, i - 1, maxByte), i, b - countBytes); } // set the bytes in the end weight at length + 1..length2 // to maxByte b = (maxByte << 24) | (maxByte << 16) | (maxByte << 8) | maxByte; // this used to be 0xffffffff ranges[0].m_end_ = truncateWeight(ranges[0].m_end_, i) | (b >>> (i << 3)) & (b << ((4 - minLength) << 3)); // set the start of the second range to immediately follow // the end of the first one ranges[1].m_start_ = incWeight(ranges[0].m_end_, minLength, maxByte); // set the count values (informational) ranges[0].m_count_ = count1; ranges[1].m_count_ = count2; ranges[0].m_count2_ = (int) (count1 * power_1); // will be *countBytes when lengthened ranges[1].m_count2_ = (int) (count2 * power_1); // lengthen the second range to maxLength lengthenRange(ranges, 1, maxByte, countBytes); } break; } // no good match, lengthen all minLength ranges and iterate for (int i = 0; ranges[i].m_length2_ == minLength; ++i) { lengthenRange(ranges, i, maxByte, countBytes); } } if (rangeCount > 1) { // sort the ranges by weight values Arrays.sort(ranges, 0, rangeCount); } // set maxByte in ranges[0] for ucol_nextWeight() ranges[0].m_count_ = maxByte; return rangeCount; } /** * Updates the range length * * @param range * weight range array * @param offset * to weight range array * @param maxByte * @param countBytes * @return new length */ private static final int lengthenRange(WeightRange range[], int offset, int maxByte, int countBytes) { int length = range[offset].m_length2_ + 1; range[offset].m_start_ = setWeightTrail(range[offset].m_start_, length, RuleBasedCollator.BYTE_FIRST_TAILORED_); range[offset].m_end_ = setWeightTrail(range[offset].m_end_, length, maxByte); range[offset].m_count2_ *= countBytes; range[offset].m_length2_ = length; return 
length; } /** * Gets the weight * * @param weight * @param length * @param trail * @return new weight */ private static final int setWeightTrail(int weight, int length, int trail) { length = (4 - length) << 3; return (weight & (0xffffff00 << length)) | (trail << length); } /** * take two CE weights and calculate the possible ranges of weights between * the two limits, excluding them for weights with up to 4 bytes there are * up to 2*4-1=7 ranges * * @param lowerLimit * @param upperLimit * @param maxByte * @param countBytes * @param ranges * @return weight ranges */ private int getWeightRanges(int lowerLimit, int upperLimit, int maxByte, int countBytes, WeightRange ranges[]) { // assume that both lowerLimit & upperLimit are not 0 // get the lengths of the limits int lowerLength = lengthOfWeight(lowerLimit); int upperLength = lengthOfWeight(upperLimit); if (Utility.compareUnsigned(lowerLimit, upperLimit) >= 0) { return 0; } // check that neither is a prefix of the other if (lowerLength < upperLength) { if (lowerLimit == truncateWeight(upperLimit, lowerLength)) { return 0; } } // if the upper limit is a prefix of the lower limit then the earlier // test lowerLimit >= upperLimit has caught it // reset local variables // With the limit lengths of 1..4, there are up to 7 ranges for // allocation: // range minimum length // lower[4] 4 // lower[3] 3 // lower[2] 2 // middle 1 // upper[2] 2 // upper[3] 3 // upper[4] 4 // We are now going to calculate up to 7 ranges. // Some of them will typically overlap, so we will then have to merge // and eliminate ranges. // We have to clean cruft from previous invocations // before doing anything. 
C++ already does that for (int length = 0; length < 5; length++) { m_utilLowerWeightRange_[length].clear(); m_utilUpperWeightRange_[length].clear(); } m_utilWeightRange_.clear(); int weight = lowerLimit; for (int length = lowerLength; length >= 2; --length) { m_utilLowerWeightRange_[length].clear(); int trail = getWeightByte(weight, length); if (trail < maxByte) { m_utilLowerWeightRange_[length].m_start_ = incWeightTrail( weight, length); m_utilLowerWeightRange_[length].m_end_ = setWeightTrail(weight, length, maxByte); m_utilLowerWeightRange_[length].m_length_ = length; m_utilLowerWeightRange_[length].m_count_ = maxByte - trail; } weight = truncateWeight(weight, length - 1); } m_utilWeightRange_.m_start_ = incWeightTrail(weight, 1); weight = upperLimit; // [0] and [1] are not used - this simplifies indexing, // m_utilUpperWeightRange_ for (int length = upperLength; length >= 2; length--) { int trail = getWeightByte(weight, length); if (trail > RuleBasedCollator.BYTE_FIRST_TAILORED_) { m_utilUpperWeightRange_[length].m_start_ = setWeightTrail( weight, length, RuleBasedCollator.BYTE_FIRST_TAILORED_); m_utilUpperWeightRange_[length].m_end_ = decWeightTrail(weight, length); m_utilUpperWeightRange_[length].m_length_ = length; m_utilUpperWeightRange_[length].m_count_ = trail - RuleBasedCollator.BYTE_FIRST_TAILORED_; } weight = truncateWeight(weight, length - 1); } m_utilWeightRange_.m_end_ = decWeightTrail(weight, 1); // set the middle range m_utilWeightRange_.m_length_ = 1; if (Utility.compareUnsigned(m_utilWeightRange_.m_end_, m_utilWeightRange_.m_start_) >= 0) { // if (m_utilWeightRange_.m_end_ >= m_utilWeightRange_.m_start_) { m_utilWeightRange_.m_count_ = ((m_utilWeightRange_.m_end_ - m_utilWeightRange_.m_start_) >>> 24) + 1; } else { // eliminate overlaps // remove the middle range m_utilWeightRange_.m_count_ = 0; // reduce or remove the lower ranges that go beyond upperLimit for (int length = 4; length >= 2; --length) { if (m_utilLowerWeightRange_[length].m_count_ 
> 0 && m_utilUpperWeightRange_[length].m_count_ > 0) { int start = m_utilUpperWeightRange_[length].m_start_; int end = m_utilLowerWeightRange_[length].m_end_; if (end >= start || incWeight(end, length, maxByte) == start) { // lower and upper ranges collide or are directly // adjacent: merge these two and remove all shorter // ranges start = m_utilLowerWeightRange_[length].m_start_; end = m_utilLowerWeightRange_[length].m_end_ = m_utilUpperWeightRange_[length].m_end_; // merging directly adjacent ranges needs to subtract // the 0/1 gaps in between; // it may result in a range with count>countBytes m_utilLowerWeightRange_[length].m_count_ = getWeightByte( end, length) - getWeightByte(start, length) + 1 + countBytes * (getWeightByte(end, length - 1) - getWeightByte( start, length - 1)); m_utilUpperWeightRange_[length].m_count_ = 0; while (--length >= 2) { m_utilLowerWeightRange_[length].m_count_ = m_utilUpperWeightRange_[length].m_count_ = 0; } break; } } } } // copy the ranges, shortest first, into the result array int rangeCount = 0; if (m_utilWeightRange_.m_count_ > 0) { ranges[0] = new WeightRange(m_utilWeightRange_); rangeCount = 1; } for (int length = 2; length <= 4; ++length) { // copy upper first so that later the middle range is more likely // the first one to use if (m_utilUpperWeightRange_[length].m_count_ > 0) { ranges[rangeCount] = new WeightRange( m_utilUpperWeightRange_[length]); ++rangeCount; } if (m_utilLowerWeightRange_[length].m_count_ > 0) { ranges[rangeCount] = new WeightRange( m_utilLowerWeightRange_[length]); ++rangeCount; } } return rangeCount; } /** * Truncates the weight with length * * @param weight * @param length * @return truncated weight */ private static final int truncateWeight(int weight, int length) { return weight & (0xffffffff << ((4 - length) << 3)); } /** * Length of the weight * * @param weight * @return length of the weight */ private static final int lengthOfWeight(int weight) { if ((weight & 0xffffff) == 0) { return 1; } else 
if ((weight & 0xffff) == 0) { return 2; } else if ((weight & 0xff) == 0) { return 3; } return 4; } /** * Increment the weight trail * * @param weight * @param length * @return new weight */ private static final int incWeightTrail(int weight, int length) { return weight + (1 << ((4 - length) << 3)); } /** * Decrement the weight trail * * @param weight * @param length * @return new weight */ private static int decWeightTrail(int weight, int length) { return weight - (1 << ((4 - length) << 3)); } /** * Gets the codepoint * * @param tbl * contraction table * @param codePoint * code point to look for * @return the offset to the code point */ private static int findCP(BasicContractionTable tbl, char codePoint) { int position = 0; while (codePoint > tbl.m_codePoints_.charAt(position)) { position++; if (position > tbl.m_codePoints_.length()) { return -1; } } if (codePoint == tbl.m_codePoints_.charAt(position)) { return position; } else { return -1; } } /** * Finds a contraction ce * * @param table * @param element * @param ch * @return ce */ private static int findCE(ContractionTable table, int element, char ch) { if (table == null) { return CE_NOT_FOUND_; } BasicContractionTable tbl = getBasicContractionTable(table, element); if (tbl == null) { return CE_NOT_FOUND_; } int position = findCP(tbl, ch); if (position > tbl.m_CEs_.size() || position < 0) { return CE_NOT_FOUND_; } return tbl.m_CEs_.get(position).intValue(); } /** * Checks if the string is tailored in the contraction * * @param table * contraction table * @param element * @param array * character array to check * @param offset * array offset * @return true if it is tailored */ private static boolean isTailored(ContractionTable table, int element, char array[], int offset) { while (array[offset] != 0) { element = findCE(table, element, array[offset]); if (element == CE_NOT_FOUND_) { return false; } if (!isContractionTableElement(element)) { return true; } offset++; } if (getCE(table, element, 0) != CE_NOT_FOUND_) 
{ return true; } else { return false; } } /** * Assemble RuleBasedCollator * * @param t * build table * @param collator * to update */ private void assembleTable(BuildTable t, RuleBasedCollator collator) { IntTrieBuilder mapping = t.m_mapping_; Vector expansions = t.m_expansions_; ContractionTable contractions = t.m_contractions_; MaxExpansionTable maxexpansion = t.m_maxExpansions_; // contraction offset has to be in since we are building on the // UCA contractions // int beforeContractions = (HEADER_SIZE_ // + paddedsize(expansions.size() << 2)) >>> 1; collator.m_contractionOffset_ = 0; int contractionsSize = constructTable(contractions); // the following operation depends on the trie data. Therefore, we have // to do it before the trie is compacted // sets jamo expansions getMaxExpansionJamo(mapping, maxexpansion, t.m_maxJamoExpansions_, collator.m_isJamoSpecial_); // TODO: LATIN1 array is now in the utrie - it should be removed from // the calculation setAttributes(collator, t.m_options_); // copy expansions int size = expansions.size(); collator.m_expansion_ = new int[size]; for (int i = 0; i < size; i++) { collator.m_expansion_[i] = expansions.get(i).intValue(); } // contractions block if (contractionsSize != 0) { // copy contraction index collator.m_contractionIndex_ = new char[contractionsSize]; contractions.m_codePoints_.getChars(0, contractionsSize, collator.m_contractionIndex_, 0); // copy contraction collation elements collator.m_contractionCE_ = new int[contractionsSize]; for (int i = 0; i < contractionsSize; i++) { collator.m_contractionCE_[i] = contractions.m_CEs_.get(i).intValue(); } } // copy mapping table collator.m_trie_ = mapping.serialize(t, RuleBasedCollator.DataManipulate.getInstance()); // copy max expansion table // not copying the first element which is a dummy // to be in synch with icu4c's builder, we continue to use the // expansion offset // omitting expansion offset in builder collator.m_expansionOffset_ = 0; size = 
maxexpansion.m_endExpansionCE_.size(); collator.m_expansionEndCE_ = new int[size - 1]; for (int i = 1; i < size; i++) { collator.m_expansionEndCE_[i - 1] = maxexpansion.m_endExpansionCE_ .get(i).intValue(); } collator.m_expansionEndCEMaxSize_ = new byte[size - 1]; for (int i = 1; i < size; i++) { collator.m_expansionEndCEMaxSize_[i - 1] = maxexpansion.m_expansionCESize_ .get(i).byteValue(); } // Unsafe chars table. Finish it off, then copy it. unsafeCPAddCCNZ(t); // Or in unsafebits from UCA, making a combined table. for (int i = 0; i < UNSAFECP_TABLE_SIZE_; i++) { t.m_unsafeCP_[i] |= RuleBasedCollator.UCA_.m_unsafe_[i]; } collator.m_unsafe_ = t.m_unsafeCP_; // Finish building Contraction Ending chars hash table and then copy it // out. // Or in unsafebits from UCA, making a combined table for (int i = 0; i < UNSAFECP_TABLE_SIZE_; i++) { t.m_contrEndCP_[i] |= RuleBasedCollator.UCA_.m_contractionEnd_[i]; } collator.m_contractionEnd_ = t.m_contrEndCP_; } /** * Sets this collator to use the all options and tables in UCA. 
* * @param collator * which attribute is to be set * @param option * to set with */ private static final void setAttributes(RuleBasedCollator collator, CollationRuleParser.OptionSet option) { collator.latinOneFailed_ = true; collator.m_caseFirst_ = option.m_caseFirst_; collator.setDecomposition(option.m_decomposition_); collator .setAlternateHandlingShifted(option.m_isAlternateHandlingShifted_); collator.setCaseLevel(option.m_isCaseLevel_); collator.setFrenchCollation(option.m_isFrenchCollation_); collator.m_isHiragana4_ = option.m_isHiragana4_; collator.setStrength(option.m_strength_); collator.m_variableTopValue_ = option.m_variableTopValue_; collator.latinOneFailed_ = false; } /** * Constructing the contraction table * * @param table * contraction table * @return */ private int constructTable(ContractionTable table) { // See how much memory we need int tsize = table.m_elements_.size(); if (tsize == 0) { return 0; } table.m_offsets_.clear(); int position = 0; for (int i = 0; i < tsize; i++) { table.m_offsets_.add(new Integer(position)); position += table.m_elements_.get(i).m_CEs_ .size(); } table.m_CEs_.clear(); table.m_codePoints_.delete(0, table.m_codePoints_.length()); // Now stuff the things in StringBuilder cpPointer = table.m_codePoints_; Vector CEPointer = table.m_CEs_; for (int i = 0; i < tsize; i++) { BasicContractionTable bct = table.m_elements_.get(i); int size = bct.m_CEs_.size(); char ccMax = 0; char ccMin = 255; int offset = CEPointer.size(); CEPointer.add(bct.m_CEs_.get(0)); for (int j = 1; j < size; j++) { char ch = bct.m_codePoints_.charAt(j); char cc = (char) (UCharacter.getCombiningClass(ch) & 0xFF); if (cc > ccMax) { ccMax = cc; } if (cc < ccMin) { ccMin = cc; } cpPointer.append(ch); CEPointer.add(bct.m_CEs_.get(j)); } cpPointer.insert(offset, (char) (((ccMin == ccMax) ? 
1 : 0 << 8) | ccMax)); for (int j = 0; j < size; j++) { if (isContractionTableElement(CEPointer.get(offset + j).intValue())) { int ce = CEPointer.get(offset + j).intValue(); CEPointer.set(offset + j, new Integer(constructSpecialCE(getCETag(ce), table.m_offsets_.get(getContractionOffset(ce)) .intValue()))); } } } for (int i = 0; i <= 0x10FFFF; i++) { int CE = table.m_mapping_.getValue(i); if (isContractionTableElement(CE)) { CE = constructSpecialCE(getCETag(CE), table.m_offsets_.get(getContractionOffset(CE)).intValue()); table.m_mapping_.setValue(i, CE); } } return position; } /** * Get contraction offset * * @param ce * collation element * @return contraction offset */ private static final int getContractionOffset(int ce) { return ce & 0xFFFFFF; } /** * Gets the maximum Jamo expansion * * @param mapping * trie table * @param maxexpansion * maximum expansion table * @param maxjamoexpansion * maximum jamo expansion table * @param jamospecial * is jamo special? */ private static void getMaxExpansionJamo(IntTrieBuilder mapping, MaxExpansionTable maxexpansion, MaxJamoExpansionTable maxjamoexpansion, boolean jamospecial) { int VBASE = 0x1161; int TBASE = 0x11A8; int VCOUNT = 21; int TCOUNT = 28; int v = VBASE + VCOUNT - 1; int t = TBASE + TCOUNT - 1; while (v >= VBASE) { int ce = mapping.getValue(v); if ((ce & RuleBasedCollator.CE_SPECIAL_FLAG_) != RuleBasedCollator.CE_SPECIAL_FLAG_) { setMaxExpansion(ce, (byte) 2, maxexpansion); } v--; } while (t >= TBASE) { int ce = mapping.getValue(t); if ((ce & RuleBasedCollator.CE_SPECIAL_FLAG_) != RuleBasedCollator.CE_SPECIAL_FLAG_) { setMaxExpansion(ce, (byte) 3, maxexpansion); } t--; } // According to the docs, 99% of the time, the Jamo will not be special if (jamospecial) { // gets the max expansion in all unicode characters int count = maxjamoexpansion.m_endExpansionCE_.size(); byte maxTSize = (byte) (maxjamoexpansion.m_maxLSize_ + maxjamoexpansion.m_maxVSize_ + maxjamoexpansion.m_maxTSize_); byte maxVSize = (byte) 
(maxjamoexpansion.m_maxLSize_ + maxjamoexpansion.m_maxVSize_); while (count > 0) { count--; if ((maxjamoexpansion.m_isV_.get(count)) .booleanValue() == true) { setMaxExpansion( (maxjamoexpansion.m_endExpansionCE_ .get(count)).intValue(), maxVSize, maxexpansion); } else { setMaxExpansion( (maxjamoexpansion.m_endExpansionCE_ .get(count)).intValue(), maxTSize, maxexpansion); } } } } /** * To the UnsafeCP hash table, add all chars with combining class != 0 * * @param t * build table */ private final void unsafeCPAddCCNZ(BuildTable t) { boolean buildCMTable = (buildCMTabFlag & (t.cmLookup == null)); char[] cm = null; // combining mark array int[] index = new int[256]; int count = 0; if (buildCMTable) { cm = new char[0x10000]; } for (char c = 0; c < 0xffff; c++) { int fcd = m_nfcImpl_.getFCD16FromSingleLead(c); // TODO: review for handling supplementary characters if (fcd >= 0x100 || // if the leading combining class(c) > 0 || (UTF16.isLeadSurrogate(c) && fcd != 0)) { // c is a leading surrogate with some FCD data unsafeCPSet(t.m_unsafeCP_, c); if (buildCMTable && (fcd != 0)) { int cc = (fcd & 0xff); int pos = (cc << 8) + index[cc]; cm[pos] = c; index[cc]++; count++; } } } if (t.m_prefixLookup_ != null) { Enumeration els = t.m_prefixLookup_.elements(); while (els.hasMoreElements()) { Elements e = els.nextElement(); // codepoints here are in the NFD form. We need to add the // first code point of the NFC form to unsafe, because // strcoll needs to backup over them. // weiv: This is wrong! See the comment above. 
// String decomp = Normalizer.decompose(e.m_cPoints_, true); // unsafeCPSet(t.m_unsafeCP_, decomp.charAt(0)); // it should be: String comp = Normalizer.compose(e.m_cPoints_, false); unsafeCPSet(t.m_unsafeCP_, comp.charAt(0)); } } if (buildCMTable) { t.cmLookup = new CombinClassTable(); t.cmLookup.generate(cm, count, index); } } /** * Create closure * * @param t * build table * @param collator * RuleBasedCollator * @param colEl * collation element iterator * @param start * @param limit * @param type * character type * @return */ private boolean enumCategoryRangeClosureCategory(BuildTable t, RuleBasedCollator collator, CollationElementIterator colEl, int start, int limit, int type) { if (type != UCharacterCategory.UNASSIGNED && type != UCharacterCategory.PRIVATE_USE) { // if the range is assigned - we might ommit more categories later for (int u32 = start; u32 < limit; u32++) { String decomp = m_nfcImpl_.getDecomposition(u32); if (decomp != null) { String comp = UCharacter.toString(u32); if (!collator.equals(comp, decomp)) { m_utilElement_.m_cPoints_ = decomp; m_utilElement_.m_prefix_ = 0; Elements prefix = t.m_prefixLookup_.get(m_utilElement_); if (prefix == null) { m_utilElement_.m_cPoints_ = comp; m_utilElement_.m_prefix_ = 0; m_utilElement_.m_prefixChars_ = null; colEl.setText(decomp); int ce = colEl.next(); m_utilElement_.m_CELength_ = 0; while (ce != CollationElementIterator.NULLORDER) { m_utilElement_.m_CEs_[m_utilElement_.m_CELength_++] = ce; ce = colEl.next(); } } else { m_utilElement_.m_cPoints_ = comp; m_utilElement_.m_prefix_ = 0; m_utilElement_.m_prefixChars_ = null; m_utilElement_.m_CELength_ = 1; m_utilElement_.m_CEs_[0] = prefix.m_mapCE_; // This character uses a prefix. We have to add it // to the unsafe table, as it decomposed form is // already in. 
In Japanese, this happens for \u309e // & \u30fe // Since unsafeCPSet is static in ucol_elm, we are // going to wrap it up in the unsafeCPAddCCNZ // function } addAnElement(t, m_utilElement_); } } } } return true; } /** * Determine if a character is a Jamo * * @param ch * character to test * @return true if ch is a Jamo, false otherwise */ private static final boolean isJamo(char ch) { return (ch >= 0x1100 && ch <= 0x1112) || (ch >= 0x1175 && ch <= 0x1161) || (ch >= 0x11A8 && ch <= 0x11C2); } /** * Produces canonical closure */ private void canonicalClosure(BuildTable t) { BuildTable temp = new BuildTable(t); assembleTable(temp, temp.m_collator_); // produce canonical closure CollationElementIterator coleiter = temp.m_collator_ .getCollationElementIterator(""); RangeValueIterator typeiter = UCharacter.getTypeIterator(); RangeValueIterator.Element element = new RangeValueIterator.Element(); while (typeiter.next(element)) { enumCategoryRangeClosureCategory(t, temp.m_collator_, coleiter, element.start, element.limit, element.value); } t.cmLookup = temp.cmLookup; temp.cmLookup = null; for (int i = 0; i < m_parser_.m_resultLength_; i++) { char baseChar, firstCM; // now we need to generate the CEs // We stuff the initial value in the buffers, and increase the // appropriate buffer according to strength */ // createElements(t, m_parser_.m_listHeader_[i]); CollationRuleParser.Token tok = m_parser_.m_listHeader_[i].m_first_; m_utilElement_.clear(); while (tok != null) { m_utilElement_.m_prefix_ = 0;// el.m_prefixChars_; m_utilElement_.m_cPointsOffset_ = 0; // el.m_uchars_; if (tok.m_prefix_ != 0) { // we will just copy the prefix here, and adjust accordingly // in // the addPrefix function in ucol_elm. The reason is that we // need to add both composed AND decomposed elements to the // unsafe table. 
int size = tok.m_prefix_ >> 24; int offset = tok.m_prefix_ & 0x00FFFFFF; m_utilElement_.m_prefixChars_ = m_parser_.m_source_ .substring(offset, offset + size); size = (tok.m_source_ >> 24) - (tok.m_prefix_ >> 24); offset = (tok.m_source_ & 0x00FFFFFF) + (tok.m_prefix_ >> 24); m_utilElement_.m_uchars_ = m_parser_.m_source_.substring( offset, offset + size); } else { m_utilElement_.m_prefixChars_ = null; int offset = tok.m_source_ & 0x00FFFFFF; int size = tok.m_source_ >>> 24; m_utilElement_.m_uchars_ = m_parser_.m_source_.substring( offset, offset + size); } m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_; baseChar = firstCM = 0; // reset for (int j = 0; j < m_utilElement_.m_cPoints_.length() - m_utilElement_.m_cPointsOffset_; j++) { int fcd = m_nfcImpl_.getFCD16FromSingleLead(m_utilElement_.m_cPoints_.charAt(j)); // TODO: review for handling supplementary characters if ((fcd & 0xff) == 0) { baseChar = m_utilElement_.m_cPoints_.charAt(j); } else { if ((baseChar != 0) && (firstCM == 0)) { firstCM = m_utilElement_.m_cPoints_.charAt(j); // first // combining // mark } } } if ((baseChar != 0) && (firstCM != 0)) { addTailCanonicalClosures(t, temp.m_collator_, coleiter, baseChar, firstCM); } tok = tok.m_next_; } } } private void addTailCanonicalClosures(BuildTable t, RuleBasedCollator m_collator, CollationElementIterator colEl, char baseChar, char cMark) { if (t.cmLookup == null) { return; } CombinClassTable cmLookup = t.cmLookup; int[] index = cmLookup.index; int cClass = m_nfcImpl_.getFCD16FromSingleLead(cMark) & 0xff; // TODO: review for handling supplementary characters int maxIndex = 0; char[] precompCh = new char[256]; int[] precompClass = new int[256]; int precompLen = 0; Elements element = new Elements(); if (cClass > 0) { maxIndex = index[cClass - 1]; } for (int i = 0; i < maxIndex; i++) { StringBuilder decompBuf = new StringBuilder(); decompBuf.append(baseChar).append(cmLookup.cPoints[i]); String comp = Normalizer.compose(decompBuf.toString(), false); if 
(comp.length() == 1) { precompCh[precompLen] = comp.charAt(0); precompClass[precompLen] = (m_nfcImpl_.getFCD16FromSingleLead(cmLookup.cPoints[i]) & 0xff); // TODO: review for handling supplementary characters precompLen++; StringBuilder decomp = new StringBuilder(); for (int j = 0; j < m_utilElement_.m_cPoints_.length(); j++) { if (m_utilElement_.m_cPoints_.charAt(j) == cMark) { decomp.append(cmLookup.cPoints[i]); } else { decomp.append(m_utilElement_.m_cPoints_.charAt(j)); } } comp = Normalizer.compose(decomp.toString(), false); StringBuilder buf = new StringBuilder(comp); buf.append(cMark); decomp.append(cMark); comp = buf.toString(); element.m_cPoints_ = decomp.toString(); element.m_CELength_ = 0; element.m_prefix_ = 0; Elements prefix = t.m_prefixLookup_.get(element); element.m_cPoints_ = comp; element.m_uchars_ = comp; if (prefix == null) { element.m_prefix_ = 0; element.m_prefixChars_ = null; colEl.setText(decomp.toString()); int ce = colEl.next(); element.m_CELength_ = 0; while (ce != CollationElementIterator.NULLORDER) { element.m_CEs_[element.m_CELength_++] = ce; ce = colEl.next(); } } else { element.m_cPoints_ = comp; element.m_prefix_ = 0; element.m_prefixChars_ = null; element.m_CELength_ = 1; element.m_CEs_[0] = prefix.m_mapCE_; } setMapCE(t, element); finalizeAddition(t, element); if (comp.length() > 2) { // This is a fix for tailoring contractions with accented // character at the end of contraction string. 
addFCD4AccentedContractions(t, colEl, comp, element); } if (precompLen > 1) { precompLen = addMultiCMontractions(t, colEl, element, precompCh, precompClass, precompLen, cMark, i, decomp.toString()); } } } } private void setMapCE(BuildTable t, Elements element) { Vector expansions = t.m_expansions_; element.m_mapCE_ = 0; if (element.m_CELength_ == 2 // a two CE expansion && RuleBasedCollator.isContinuation(element.m_CEs_[1]) && (element.m_CEs_[1] & (~(0xFF << 24 | RuleBasedCollator.CE_CONTINUATION_MARKER_))) == 0 // that // has // only // primaries // in // continuation && (((element.m_CEs_[0] >> 8) & 0xFF) == RuleBasedCollator.BYTE_COMMON_) // a common secondary && ((element.m_CEs_[0] & 0xFF) == RuleBasedCollator.BYTE_COMMON_)) { // and // a // common // tertiary element.m_mapCE_ = RuleBasedCollator.CE_SPECIAL_FLAG_ // a long primary special | (CE_LONG_PRIMARY_TAG_ << 24) // first and second byte of primary | ((element.m_CEs_[0] >> 8) & 0xFFFF00) // third byte of primary | ((element.m_CEs_[1] >> 24) & 0xFF); } else { // omitting expansion offset in builder // (HEADER_SIZE_ >> 2) int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_ | (CE_EXPANSION_TAG_ << RuleBasedCollator.CE_TAG_SHIFT_) | (addExpansion(expansions, element.m_CEs_[0]) << 4) & 0xFFFFF0; for (int i = 1; i < element.m_CELength_; i++) { addExpansion(expansions, element.m_CEs_[i]); } if (element.m_CELength_ <= 0xF) { expansion |= element.m_CELength_; } else { addExpansion(expansions, 0); } element.m_mapCE_ = expansion; setMaxExpansion(element.m_CEs_[element.m_CELength_ - 1], (byte) element.m_CELength_, t.m_maxExpansions_); } } private int addMultiCMontractions(BuildTable t, CollationElementIterator colEl, Elements element, char[] precompCh, int[] precompClass, int maxComp, char cMark, int cmPos, String decomp) { CombinClassTable cmLookup = t.cmLookup; char[] combiningMarks = { cMark }; int cMarkClass = UCharacter.getCombiningClass(cMark) & 0xFF; String comMark = new String(combiningMarks); int 
noOfPrecomposedChs = maxComp; for (int j = 0; j < maxComp; j++) { int count = 0; StringBuilder temp; do { String newDecomp, comp; if (count == 0) { // Decompose the saved precomposed char. newDecomp = Normalizer.decompose( new String(precompCh, j, 1), false); temp = new StringBuilder(newDecomp); temp.append(cmLookup.cPoints[cmPos]); newDecomp = temp.toString(); } else { temp = new StringBuilder(decomp); temp.append(precompCh[j]); newDecomp = temp.toString(); } comp = Normalizer.compose(newDecomp, false); if (comp.length() == 1) { temp.append(cMark); element.m_cPoints_ = temp.toString(); element.m_CELength_ = 0; element.m_prefix_ = 0; Elements prefix = t.m_prefixLookup_.get(element); element.m_cPoints_ = comp + comMark; if (prefix == null) { element.m_prefix_ = 0; element.m_prefixChars_ = null; colEl.setText(temp.toString()); int ce = colEl.next(); element.m_CELength_ = 0; while (ce != CollationElementIterator.NULLORDER) { element.m_CEs_[element.m_CELength_++] = ce; ce = colEl.next(); } } else { element.m_cPoints_ = comp; element.m_prefix_ = 0; element.m_prefixChars_ = null; element.m_CELength_ = 1; element.m_CEs_[0] = prefix.m_mapCE_; } setMapCE(t, element); finalizeAddition(t, element); precompCh[noOfPrecomposedChs] = comp.charAt(0); precompClass[noOfPrecomposedChs] = cMarkClass; noOfPrecomposedChs++; } } while (++count < 2 && (precompClass[j] == cMarkClass)); } return noOfPrecomposedChs; } private void addFCD4AccentedContractions(BuildTable t, CollationElementIterator colEl, String data, Elements element) { String decomp = Normalizer.decompose(data, false); String comp = Normalizer.compose(data, false); element.m_cPoints_ = decomp; element.m_CELength_ = 0; element.m_prefix_ = 0; Elements prefix = t.m_prefixLookup_.get(element); if (prefix == null) { element.m_cPoints_ = comp; element.m_prefix_ = 0; element.m_prefixChars_ = null; element.m_CELength_ = 0; colEl.setText(decomp); int ce = colEl.next(); element.m_CELength_ = 0; while (ce != 
CollationElementIterator.NULLORDER) { element.m_CEs_[element.m_CELength_++] = ce; ce = colEl.next(); } addAnElement(t, element); } } private void processUCACompleteIgnorables(BuildTable t) { TrieIterator trieiterator = new TrieIterator( RuleBasedCollator.UCA_.m_trie_); RangeValueIterator.Element element = new RangeValueIterator.Element(); while (trieiterator.next(element)) { int start = element.start; int limit = element.limit; if (element.value == 0) { while (start < limit) { int CE = t.m_mapping_.getValue(start); if (CE == CE_NOT_FOUND_) { m_utilElement_.m_prefix_ = 0; m_utilElement_.m_uchars_ = UCharacter.toString(start); m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_; m_utilElement_.m_cPointsOffset_ = 0; m_utilElement_.m_CELength_ = 1; m_utilElement_.m_CEs_[0] = 0; addAnElement(t, m_utilElement_); } start++; } } } } }