2 *******************************************************************************
3 * Copyright (C) 1996-2011, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
8 package com.ibm.icu.util;
10 import java.util.Enumeration;
11 import java.util.NoSuchElementException;
13 import com.ibm.icu.text.UTF16;
14 import com.ibm.icu.text.UnicodeSet;
17 * {@icuenhanced java.util.StringTokenizer}.{@icu _usage_}
19 * <p>The string tokenizer class allows an application to break a string
20 * into tokens by performing code point comparison.
21 * The <code>StringTokenizer</code> methods do not distinguish
22 * among identifiers, numbers, and quoted strings, nor do they recognize
23 * and skip comments.</p>
25 * The set of delimiters (the codepoints that separate tokens) may be
26 * specified either at creation time or on a per-token basis.
29 * An instance of <code>StringTokenizer</code> behaves in one of three ways,
30 * depending on whether it was created with the <code>returnDelims</code>
31 * and <code>coalesceDelims</code>
32 * flags having the value <code>true</code> or <code>false</code>:
34 * <li>If returnDelims is <code>false</code>, delimiter code points serve to
35 * separate tokens. A token is a maximal sequence of consecutive
36 * code points that are not delimiters.
37 * <li>If returnDelims is <code>true</code>, delimiter code points are
38 * themselves considered to be tokens. In this case, if coalesceDelims is
39 * <code>true</code>, such tokens will be the maximal sequence of consecutive
40 * code points that <em>are</em> delimiters. If coalesceDelims is false,
41 * a token will be received for each delimiter code point.
43 * <p>A token is thus either one
44 * delimiter code point, a maximal sequence of consecutive code points that
45 * are delimiters, or a maximal sequence of consecutive code
46 * points that are not delimiters.
49 * A <tt>StringTokenizer</tt> object internally maintains a current
50 * position within the string to be tokenized. Some operations advance this
51 * current position past the code point processed.
54 * A token is returned by taking a substring of the string that was used to
55 * create the <tt>StringTokenizer</tt> object.
58 * Example of the use of the default delimiter tokenizer.
60 * StringTokenizer st = new StringTokenizer("this is a test");
61 * while (st.hasMoreTokens()) {
62 * println(st.nextToken());
67 * prints the following output:
76 * Example of the use of the tokenizer with user specified delimiter.
78 * StringTokenizer st = new StringTokenizer(
79 * "this is a test with supplementary characters \ud800\ud800\udc00\udc00",
80 * " \ud800\udc00");
81 * while (st.hasMoreTokens()) {
82 * println(st.nextToken());
87 * prints the following output:
103 public final class StringTokenizer implements Enumeration<Object>
105 // public constructors ---------------------------------------------
108 * {@icu} Constructs a string tokenizer for the specified string. All
109 * characters in the delim argument are the delimiters for separating
111 * <p>If the returnDelims flag is false, the delimiter characters are
112 * skipped and only serve as separators between tokens.</p>
113 * <p>If the returnDelims flag is true, then the delimiter characters
114 * are also returned as tokens, one per delimiter.
115 * @param str a string to be parsed.
116 * @param delim the delimiters.
117 * @param returndelims flag indicating whether to return the delimiters
119 * @exception NullPointerException if str is null
// Convenience overload: forwards to the four-argument UnicodeSet constructor
// with coalescedelims = false, i.e. one token per delimiter when delimiters
// are returned.
122 public StringTokenizer(String str, UnicodeSet delim, boolean returndelims)
124 this(str, delim, returndelims, false);
128 * {@icu} Constructs a string tokenizer for the specified string. All
129 * characters in the delim argument are the delimiters for separating
131 * <p>If the returnDelims flag is false, the delimiter characters are
132 * skipped and only serve as separators between tokens.</p>
133 * <p>If the returnDelims flag is true, then the delimiter characters
134 * are also returned as tokens. If coalescedelims is true, one token
135 * is returned for each run of delimiter characters, otherwise one
136 * token is returned per delimiter. Since surrogate pairs can be
137 * delimiters, the returned token might be two chars in length.</p>
138 * @param str a string to be parsed.
139 * @param delim the delimiters.
140 * @param returndelims flag indicating whether to return the delimiters
142 * @param coalescedelims flag indicating whether to return a run of
143 * delimiters as a single token or as one token per delimiter.
144 * This only takes effect if returndelims is true.
145 * @exception NullPointerException if str is null
147 * @deprecated This API is ICU internal only.
// Primary UnicodeSet-based constructor: records the source length, the
// delimiter set and both flags, then positions m_nextOffset_.
149 public StringTokenizer(String str, UnicodeSet delim, boolean returndelims, boolean coalescedelims)
// str.length() is what raises the documented NullPointerException for a null str.
152 m_length_ = str.length();
// NOTE(review): the condition choosing between the next two assignments is
// not visible in this extract; presumably the shared empty set is used when
// delim is null and the caller's set otherwise - confirm.
154 m_delimiters_ = EMPTY_DELIMITER_;
157 m_delimiters_ = delim;
159 m_returnDelimiters_ = returndelims;
160 m_coalesceDelimiters_ = coalescedelims;
163 if (m_length_ == 0) {
164 // string length 0, no tokens
// NOTE(review): the guard around this call is not visible here; presumably
// leading delimiters are only skipped when they are not returned as tokens
// (compare nextToken(UnicodeSet)) - confirm.
170 m_nextOffset_ = getNextNonDelimiter(0);
176 * {@icu} Constructs a string tokenizer for the specified string. The
177 * characters in the delim argument are the delimiters for separating
179 * <p>Delimiter characters themselves will not be treated as tokens.</p>
180 * @param str a string to be parsed.
181 * @param delim the delimiters.
182 * @exception NullPointerException if str is null
// Forwards with returndelims = false and coalescedelims = false, so the
// delimiters only separate tokens and are never returned.
185 public StringTokenizer(String str, UnicodeSet delim)
187 this(str, delim, false, false);
191 * <p>Constructs a string tokenizer for the specified string. All
192 * characters in the delim argument are the delimiters for separating
194 * <p>If the returnDelims flag is false, the delimiter characters are
195 * skipped and only serve as separators between tokens.</p>
196 * <p>If the returnDelims flag is true, then the delimiter characters
197 * are also returned as tokens, one per delimiter.
198 * @param str a string to be parsed.
199 * @param delim the delimiters.
200 * @param returndelims flag indicating whether to return the delimiters
202 * @exception NullPointerException if str is null
// Forwards to the four-argument String constructor with coalescedelims =
// false: when delimiters are returned, each one is its own token.
205 public StringTokenizer(String str, String delim, boolean returndelims)
207 this(str, delim, returndelims, false); // java default behavior
211 * <p>Constructs a string tokenizer for the specified string. All
212 * characters in the delim argument are the delimiters for separating
214 * <p>If the returnDelims flag is false, the delimiter characters are
215 * skipped and only serve as separators between tokens.</p>
216 * <p>If the returnDelims flag is true, then the delimiter characters
217 * are also returned as tokens. If coalescedelims is true, one token
218 * is returned for each run of delimiter characters, otherwise one
219 * token is returned per delimiter. Since surrogate pairs can be
220 * delimiters, the returned token might be two chars in length.</p>
221 * @param str a string to be parsed.
222 * @param delim the delimiters.
223 * @param returndelims flag indicating whether to return the delimiters
225 * @param coalescedelims flag indicating whether to return a run of
226 * delimiters as a single token or as one token per delimiter.
227 * This only takes effect if returndelims is true.
228 * @exception NullPointerException if str is null
230 * @deprecated This API is ICU internal only.
// Primary String-based constructor: turns the delimiter string into a
// UnicodeSet, records the flags and source length, then positions
// m_nextOffset_.
232 public StringTokenizer(String str, String delim, boolean returndelims, boolean coalescedelims)
234 // don't ignore whitespace
// Start from the shared empty set, so a null or empty delim string means
// "no delimiters".
235 m_delimiters_ = EMPTY_DELIMITER_;
236 if (delim != null && delim.length() > 0) {
// A fresh set is built per tokenizer; presumably addAll(String) adds every
// code point of delim - confirm against the UnicodeSet API.
237 m_delimiters_ = new UnicodeSet();
238 m_delimiters_.addAll(delim);
241 m_coalesceDelimiters_ = coalescedelims;
// str.length() is what raises the documented NullPointerException for a null str.
243 m_length_ = str.length();
244 m_returnDelimiters_ = returndelims;
247 if (m_length_ == 0) {
248 // string length 0, no tokens
// NOTE(review): the guard around this call is not visible here; presumably
// leading delimiters are only skipped when they are not returned as tokens
// - confirm.
254 m_nextOffset_ = getNextNonDelimiter(0);
260 * <p>Constructs a string tokenizer for the specified string. The
261 * characters in the delim argument are the delimiters for separating
263 * <p>Delimiter characters themselves will not be treated as tokens.</p>
264 * @param str a string to be parsed.
265 * @param delim the delimiters.
266 * @exception NullPointerException if str is null
// Forwards with returndelims = false and coalescedelims = false, so the
// delimiters only separate tokens and are never returned.
269 public StringTokenizer(String str, String delim)
271 // don't ignore whitespace
272 this(str, delim, false, false);
276 * <p>Constructs a string tokenizer for the specified string.
277 * The tokenizer uses the default delimiter set, which is
278 * " \t\n\r\f":
279 * the space character, the tab character, the newline character, the
280 * carriage-return character, and the form-feed character.</p>
281 * <p>Delimiter characters themselves will not be treated as tokens.</p>
282 * @param str a string to be parsed
283 * @exception NullPointerException if str is null
// Uses the shared DEFAULT_DELIMITERS_ set and never returns delimiters as
// tokens (both flags false).
286 public StringTokenizer(String str)
288 this(str, DEFAULT_DELIMITERS_, false, false);
291 // public methods --------------------------------------------------
294 * Tests if there are more tokens available from this tokenizer's
296 * If this method returns <tt>true</tt>, then a subsequent call to
297 * <tt>nextToken</tt> with no argument will successfully return a token.
298 * @return <code>true</code> if and only if there is at least one token
299 * in the string after the current position; <code>false</code>
// A negative m_nextOffset_ is the "no tokens left" state (nextToken() throws
// on it), so any non-negative offset means another token is available.
303 public boolean hasMoreTokens()
305 return m_nextOffset_ >= 0;
309 * Returns the next token from this string tokenizer.
310 * @return the next token from this string tokenizer.
311 * @exception NoSuchElementException if there are no more tokens in
312 * this tokenizer's string.
// Returns the next token, either by scanning the source on demand or, once
// countTokens() has pre-computed token boundaries, by reading them from the
// m_tokenStart_/m_tokenLimit_ arrays.
// NOTE(review): several short lines of this body are missing from this
// extract; the comments below describe only the visible statements.
315 public String nextToken()
// Negative m_tokenOffset_: boundaries were not pre-computed, scan on demand.
317 if (m_tokenOffset_ < 0) {
318 if (m_nextOffset_ < 0) {
319 throw new NoSuchElementException("No more tokens in String");
321 // pre-calculations of tokens not done
322 if (m_returnDelimiters_) {
// Read the code point at the current position and test it against the
// delimiters: the boolean table `delims` is used when it exists, otherwise
// the UnicodeSet itself.
324 int c = UTF16.charAt(m_source_, m_nextOffset_);
325 boolean contains = delims == null
326 ? m_delimiters_.contains(c)
327 : c < delims.length && delims[c];
329 if (m_coalesceDelimiters_) {
// Coalescing: the token is the whole run of delimiters, ending at the next
// non-delimiter.
330 tokenlimit = getNextNonDelimiter(m_nextOffset_);
// Single-delimiter token: advance by the delimiter's length in chars so a
// supplementary (two-char) delimiter stays in one piece.
332 tokenlimit = m_nextOffset_ + UTF16.getCharCount(c);
333 if (tokenlimit == m_length_) {
// Presumably the branch for a non-delimiter at the current position: the
// token runs up to the next delimiter.
339 tokenlimit = getNextDelimiter(m_nextOffset_);
// A negative limit means no boundary was found before the end of the
// source, so the token is the remaining tail.
342 if (tokenlimit < 0) {
343 result = m_source_.substring(m_nextOffset_);
346 result = m_source_.substring(m_nextOffset_, tokenlimit);
348 m_nextOffset_ = tokenlimit;
// Presumably the branch taken when delimiters are not returned: the token
// ends at the next delimiter.
352 int tokenlimit = getNextDelimiter(m_nextOffset_);
354 if (tokenlimit < 0) {
355 result = m_source_.substring(m_nextOffset_);
356 m_nextOffset_ = tokenlimit;
359 result = m_source_.substring(m_nextOffset_, tokenlimit);
// Skip the delimiters that follow so the next call starts on a token.
360 m_nextOffset_ = getNextNonDelimiter(tokenlimit);
366 // count was called before and we have all the tokens
367 if (m_tokenOffset_ >= m_tokenSize_) {
368 throw new NoSuchElementException("No more tokens in String");
// A negative stored limit marks a token that extends to the end of the source.
371 if (m_tokenLimit_[m_tokenOffset_] >= 0) {
372 result = m_source_.substring(m_tokenStart_[m_tokenOffset_],
373 m_tokenLimit_[m_tokenOffset_]);
376 result = m_source_.substring(m_tokenStart_[m_tokenOffset_]);
// NOTE(review): the statement that advances m_tokenOffset_ is not visible in
// this extract; the check below keeps m_nextOffset_ in step with the array
// cursor while pre-computed tokens remain.
380 if (m_tokenOffset_ < m_tokenSize_) {
381 m_nextOffset_ = m_tokenStart_[m_tokenOffset_];
387 * Returns the next token in this string tokenizer's string. First,
388 * the set of characters considered to be delimiters by this
389 * <tt>StringTokenizer</tt> object is changed to be the characters in
390 * the string <tt>delim</tt>. Then the next token in the string
391 * after the current position is returned. The current position is
392 * advanced beyond the recognized token. The new delimiter set
393 * remains the default after this call.
394 * @param delim the new delimiters.
395 * @return the next token, after switching to the new delimiter set.
396 * @exception NoSuchElementException if there are no more tokens in
397 * this tokenizer's string.
// Converts the delimiter string to a UnicodeSet and delegates to
// nextToken(UnicodeSet).
400 public String nextToken(String delim)
// Start from the shared empty set, so a null or empty delim string means
// "no delimiters".
402 m_delimiters_ = EMPTY_DELIMITER_;
403 if (delim != null && delim.length() > 0) {
404 m_delimiters_ = new UnicodeSet();
405 m_delimiters_.addAll(delim);
407 return nextToken(m_delimiters_);
411 * {@icu} Returns the next token in this string tokenizer's string. First,
412 * the set of characters considered to be delimiters by this
413 * <tt>StringTokenizer</tt> object is changed to be the code points in
414 * the set <tt>delim</tt>. Then the next token in the string
415 * after the current position is returned. The current position is
416 * advanced beyond the recognized token. The new delimiter set
417 * remains the default after this call.
418 * @param delim the new delimiters.
419 * @return the next token, after switching to the new delimiter set.
420 * @exception NoSuchElementException if there are no more tokens in
421 * this tokenizer's string.
// Switches to a new delimiter set and then returns the next token.
424 public String nextToken(UnicodeSet delim)
// NOTE(review): unlike the constructors, no fallback to EMPTY_DELIMITER_ is
// visible here for a null delim - confirm whether the lines missing from
// this extract handle it.
426 m_delimiters_ = delim;
// When delimiters are not returned, re-skip from the current position so
// the next token does not start on a delimiter of the new set.
430 if (!m_returnDelimiters_) {
431 m_nextOffset_ = getNextNonDelimiter(m_nextOffset_);
437 * Returns the same value as the <code>hasMoreTokens</code> method.
438 * It exists so that this class can implement the
439 * <code>Enumeration</code> interface.
440 * @return <code>true</code> if there are more tokens;
441 * <code>false</code> otherwise.
442 * @see #hasMoreTokens()
// Enumeration counterpart of hasMoreTokens(); pure delegation.
445 public boolean hasMoreElements()
447 return hasMoreTokens();
451 * Returns the same value as the <code>nextToken</code> method, except
452 * that its declared return value is <code>Object</code> rather than
453 * <code>String</code>. It exists so that this class can implement the
454 * <code>Enumeration</code> interface.
455 * @return the next token in the string.
456 * @exception NoSuchElementException if there are no more tokens in
457 * this tokenizer's string.
// Enumeration counterpart of nextToken().
// NOTE(review): the body is not visible in this extract.
461 public Object nextElement()
467 * Calculates the number of times that this tokenizer's
468 * <code>nextToken</code> method can be called before it generates an
469 * exception. The current position is not advanced.
470 * @return the number of tokens remaining in the string using the
471 * current delimiter set.
// Counts the remaining tokens by scanning the rest of the source once and
// caching every token's start and limit in m_tokenStart_/m_tokenLimit_;
// nextToken() reads from that cache afterwards.
// NOTE(review): several short lines of this body are missing from this
// extract; the comments below describe only the visible statements.
475 public int countTokens()
478 if (hasMoreTokens()) {
// Boundaries were already pre-computed: the answer is what is left in the cache.
479 if (m_tokenOffset_ >= 0) {
480 return m_tokenSize_ - m_tokenOffset_;
482 if (m_tokenStart_ == null) {
483 m_tokenStart_ = new int[TOKEN_SIZE_];
484 m_tokenLimit_ = new int[TOKEN_SIZE_];
// Both arrays are full: grow them by TOKEN_SIZE_ entries and copy the
// existing boundaries across.
487 if (m_tokenStart_.length == result) {
488 int temptokenindex[] = m_tokenStart_;
489 int temptokensize[] = m_tokenLimit_;
490 int originalsize = temptokenindex.length;
491 int newsize = originalsize + TOKEN_SIZE_;
492 m_tokenStart_ = new int[newsize];
493 m_tokenLimit_ = new int[newsize];
494 System.arraycopy(temptokenindex, 0, m_tokenStart_, 0,
496 System.arraycopy(temptokensize, 0, m_tokenLimit_, 0,
499 m_tokenStart_[result] = m_nextOffset_;
500 if (m_returnDelimiters_) {
// Same delimiter test as nextToken(): the boolean table when it exists,
// otherwise the UnicodeSet.
501 int c = UTF16.charAt(m_source_, m_nextOffset_);
502 boolean contains = delims == null
503 ? m_delimiters_.contains(c)
504 : c < delims.length && delims[c];
506 if (m_coalesceDelimiters_) {
// Coalescing: the token is the whole run of delimiters.
507 m_tokenLimit_[result] = getNextNonDelimiter(
// NOTE(review): this advances by one char, whereas nextToken() advances by
// UTF16.getCharCount(c) in the same situation. For a supplementary
// (two-char) delimiter the two paths would disagree on the token boundary,
// which looks like a bug - confirm and align with nextToken().
510 int p = m_nextOffset_ + 1;
511 if (p == m_length_) {
514 m_tokenLimit_[result] = p;
// Presumably the branch for a non-delimiter at the current position: the
// token runs up to the next delimiter.
519 m_tokenLimit_[result] = getNextDelimiter(m_nextOffset_);
521 m_nextOffset_ = m_tokenLimit_[result];
// Presumably the branch taken when delimiters are not returned: the token
// ends at the next delimiter and the delimiters after it are skipped.
524 m_tokenLimit_[result] = getNextDelimiter(m_nextOffset_);
525 m_nextOffset_ = getNextNonDelimiter(m_tokenLimit_[result]);
// The scan stops once the position goes negative (no more tokens).
528 } while (m_nextOffset_ >= 0);
530 m_tokenSize_ = result;
// Rewind to the first cached token so the current position is not advanced,
// as the method's contract states.
531 m_nextOffset_ = m_tokenStart_[0];
536 // private data members -------------------------------------------------
539 * Current offset to the token array. If the array token is not set up yet,
// nextToken() treats a negative value as "token boundaries not pre-computed".
542 private int m_tokenOffset_;
544 * Size of the token array. If the array token is not set up yet,
// Once the cache exists, countTokens() returns m_tokenSize_ - m_tokenOffset_.
547 private int m_tokenSize_;
549 * Array of pre-calculated tokens start indexes in source string terminated
551 * This is only set up during countTokens() and only stores the remaining
552 * tokens, not all tokens including parsed ones
554 private int m_tokenStart_[];
556 * Array of pre-calculated tokens limit indexes in source string.
557 * This is only set up during countTokens() and only stores the remaining
558 * tokens, not all tokens including parsed ones
// A negative entry marks a token that extends to the end of the source
// (see the cached branch of nextToken()).
560 private int m_tokenLimit_[];
562 * UnicodeSet containing delimiters
564 private UnicodeSet m_delimiters_;
566 * String to parse for tokens
568 private String m_source_;
570 * Length of m_source_
572 private int m_length_;
574 * Current position in string to parse for tokens
// Negative once no tokens remain; hasMoreTokens() tests for >= 0.
576 private int m_nextOffset_;
578 * Flag indicator if delimiters are to be treated as tokens too
580 private boolean m_returnDelimiters_;
583 * Flag indicating whether to coalesce runs of delimiters into single tokens
// Only consulted on the paths where m_returnDelimiters_ is true.
585 private boolean m_coalesceDelimiters_;
588 * Default set of delimiters: space, \t, \n, \r and \f
// The arguments appear to be inclusive (start, end) code point pairs:
// 0x09-0x0a (\t, \n), 0x0c-0x0d (\f, \r) and 0x20-0x20 (space), matching
// the pattern quoted in the trailing comment.
590 private static final UnicodeSet DEFAULT_DELIMITERS_
591 = new UnicodeSet(0x09, 0x0a, 0x0c, 0x0d, 0x20, 0x20); // UnicodeSet("[ \t\n\r\f]", false)
593 * Array size increments
// Used both as the initial capacity of the token boundary arrays and as the
// amount they grow by in countTokens().
595 private static final int TOKEN_SIZE_ = 100;
597 * An empty delimiter UnicodeSet, used when user specified null delimiters
// Shared instance; the String-based constructor and nextToken(String) also
// fall back to it when the delimiter string is empty.
599 private static final UnicodeSet EMPTY_DELIMITER_ = UnicodeSet.EMPTY;
601 // private methods ------------------------------------------------------
604 * Gets the index of the next delimiter after offset
605 * @param offset to the source string
606 * @return offset of the immediate next delimiter, otherwise
607 * (- source string length - 1) if there
608 * are no more delimiters after offset
// Scans forward through the source for a delimiter code point. There are
// two equivalent loops: one consulting the UnicodeSet (when no lookup table
// exists) and one consulting the boolean table `delims`.
// NOTE(review): local declarations, loop headers and the statements inside
// the matches are missing from this extract.
610 private int getNextDelimiter(int offset)
615 if (delims == null) {
617 c = UTF16.charAt(m_source_, result);
618 if (m_delimiters_.contains(c)) {
622 } while (result < m_length_);
625 c = UTF16.charAt(m_source_, result);
// Code points beyond the table's length are never delimiters.
626 if (c < delims.length && delims[c]) {
630 } while (result < m_length_);
632 if (result < m_length_) {
// Not-found sentinel: always negative, which is what callers test with "< 0".
636 return -1 - m_length_;
640 * Gets the index of the next non-delimiter after offset
641 * @param offset to the source string
642 * @return offset of the immediate next non-delimiter, otherwise
643 * (- source string length - 1) if there
644 * are no more non-delimiters after offset
// Mirror image of getNextDelimiter(): scans forward for a code point that
// is NOT a delimiter, again with one loop per lookup strategy.
// NOTE(review): local declarations, loop headers and the statements inside
// the matches are missing from this extract.
646 private int getNextNonDelimiter(int offset)
651 if (delims == null) {
653 c = UTF16.charAt(m_source_, result);
654 if (!m_delimiters_.contains(c)) {
658 } while (result < m_length_);
661 c = UTF16.charAt(m_source_, result);
// Anything outside the table, or not flagged in it, counts as a non-delimiter.
662 if (!(c < delims.length && delims[c])) {
666 } while (result < m_length_);
668 if (result < m_length_) {
// Not-found sentinel: always negative, which is what callers test with "< 0".
672 return -1 - m_length_;
// Builds the boolean lookup table `delims` from the current delimiter set.
// NOTE(review): the loop body and the branch for larger sets are missing
// from this extract; presumably `delims` is left null in that case so the
// scanners fall back to the UnicodeSet (they test `delims == null`) - confirm.
675 void checkDelimiters() {
// No delimiters at all: an empty table, so no code point ever matches.
676 if (m_delimiters_ == null || m_delimiters_.size() == 0) {
677 delims = new boolean[0];
// End of the last range; presumably the largest code point in the set.
679 int maxChar = m_delimiters_.getRangeEnd(m_delimiters_.getRangeCount()-1);
// Only small sets get a table, sized so every delimiter can be an index.
680 if (maxChar < 0x7f) {
681 delims = new boolean[maxChar+1];
// Iterates the set's members by index until charAt(i) returns -1.
682 for (int i = 0, ch; -1 != (ch = m_delimiters_.charAt(i)); ++i) {
// Lookup table indexed by code point, assigned in checkDelimiters(). When
// it is null, the scanning code consults m_delimiters_ directly.
690 private boolean[] delims;