2 *******************************************************************************
\r
3 * Copyright (C) 1996-2006, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
8 package com.ibm.icu.util;
\r
10 import java.util.Enumeration;
\r
11 import java.util.NoSuchElementException;
\r
12 import com.ibm.icu.text.UnicodeSet;
\r
13 import com.ibm.icu.text.UTF16;
\r
16 * <p>The string tokenizer class allows an application to break a string
\r
17 * into tokens by performing code point comparison.
\r
18 * The <code>StringTokenizer</code> methods do not distinguish
\r
19 * among identifiers, numbers, and quoted strings, nor do they recognize
\r
20 * and skip comments.</p>
\r
22 * The set of delimiters (the codepoints that separate tokens) may be
\r
23 * specified either at creation time or on a per-token basis.
\r
26 * An instance of <code>StringTokenizer</code> behaves in one of three ways,
\r
27 * depending on whether it was created with the <code>returnDelims</code>
\r
28 * and <code>coalesceDelims</code>
\r
29 * flags having the value <code>true</code> or <code>false</code>:
\r
31 * <li>If returnDelims is <code>false</code>, delimiter code points serve to
\r
32 * separate tokens. A token is a maximal sequence of consecutive
\r
33 * code points that are not delimiters.
\r
34 * <li>If returnDelims is <code>true</code>, delimiter code points are
\r
35 * themselves considered to be tokens. In this case, if coalesceDelims is
\r
36 * <code>true</code>, such tokens will be the maximal sequence of consecutive
\r
37 * code points that <em>are</em> delimiters. If coalesceDelims is false,
\r
38 * a token will be received for each delimiter code point.
\r
40 * <p>A token is thus either one
\r
41 * delimiter code point, a maximal sequence of consecutive code points that
\r
42 * are delimiters, or a maximal sequence of consecutive code
\r
43 * points that are not delimiters.
\r
46 * A <tt>StringTokenizer</tt> object internally maintains a current
\r
47 * position within the string to be tokenized. Some operations advance this
\r
48 * current position past the code point processed.
\r
51 * A token is returned by taking a substring of the string that was used to
\r
52 * create the <tt>StringTokenizer</tt> object.
\r
55 * Example of the use of the default delimiter tokenizer.
\r
57 * StringTokenizer st = new StringTokenizer("this is a test");
\r
58 * while (st.hasMoreTokens()) {
\r
59 * println(st.nextToken());
\r
61 * </pre></blockquote>
\r
64 * prints the following output:
\r
70 * </pre></blockquote>
\r
73 * Example of the use of the tokenizer with user specified delimiter.
\r
75 * StringTokenizer st = new StringTokenizer(
\r
76 * "this is a test with supplementary characters \ud800\ud800\udc00\udc00",
\r
77 * " \ud800\udc00");
\r
78 * while (st.hasMoreTokens()) {
\r
79 * println(st.nextToken());
\r
81 * </pre></blockquote>
\r
84 * prints the following output:
\r
95 * </pre></blockquote>
\r
100 public final class StringTokenizer implements Enumeration
\r
102 // public constructors ---------------------------------------------
\r
105 * <p>Constructs a string tokenizer for the specified string. All
\r
106 * characters in the delim argument are the delimiters for separating
\r
108 * <p>If the returnDelims flag is false, the delimiter characters are
\r
109 * skipped and only serve as separators between tokens.</p>
\r
110 * <p>If the returnDelims flag is true, then the delimiter characters
\r
111 * are also returned as tokens, one per delimiter.</p>
\r
112 * @param str a string to be parsed.
\r
113 * @param delim the delimiters.
\r
114 * @param returndelims flag indicating whether to return the delimiters
\r
116 * @exception NullPointerException if str is null
\r
119 public StringTokenizer(String str, UnicodeSet delim, boolean returndelims)
\r
121 this(str, delim, returndelims, false);
\r
125 * <p>Constructs a string tokenizer for the specified string. All
\r
126 * characters in the delim argument are the delimiters for separating
\r
128 * <p>If the returnDelims flag is false, the delimiter characters are
\r
129 * skipped and only serve as separators between tokens.</p>
\r
130 * <p>If the returnDelims flag is true, then the delimiter characters
\r
131 * are also returned as tokens. If coalescedelims is true, one token
\r
132 * is returned for each run of delimiter characters, otherwise one
\r
133 * token is returned per delimiter. Since surrogate pairs can be
\r
134 * delimiters, the returned token might be two chars in length.</p>
\r
135 * @param str a string to be parsed.
\r
136 * @param delim the delimiters.
\r
137 * @param returndelims flag indicating whether to return the delimiters
\r
139 * @param coalescedelims flag indicating whether to return a run of
\r
140 * delimiters as a single token or as one token per delimiter.
\r
141 * This only takes effect if returndelims is true.
\r
142 * @exception NullPointerException if str is null
\r
143 * @internal ICU 3.4.3
\r
144 * @deprecated This API is ICU internal only.
\r
146 public StringTokenizer(String str, UnicodeSet delim, boolean returndelims, boolean coalescedelims)
\r
149 m_length_ = str.length();
\r
150 if (delim == null) {
\r
151 m_delimiters_ = EMPTY_DELIMITER_;
\r
154 m_delimiters_ = delim;
\r
156 m_returnDelimiters_ = returndelims;
\r
157 m_coalesceDelimiters_ = coalescedelims;
\r
158 m_tokenOffset_ = -1;
\r
160 if (m_length_ == 0) {
\r
161 // string length 0, no tokens
\r
162 m_nextOffset_ = -1;
\r
166 if (!returndelims) {
\r
167 m_nextOffset_ = getNextNonDelimiter(0);
\r
173 * <p>Constructs a string tokenizer for the specified string. The
\r
174 * characters in the delim argument are the delimiters for separating
\r
176 * <p>Delimiter characters themselves will not be treated as tokens.</p>
\r
177 * @param str a string to be parsed.
\r
178 * @param delim the delimiters.
\r
179 * @exception NullPointerException if str is null
\r
182 public StringTokenizer(String str, UnicodeSet delim)
\r
184 this(str, delim, false, false);
\r
188 * <p>Constructs a string tokenizer for the specified string. All
\r
189 * characters in the delim argument are the delimiters for separating
\r
191 * <p>If the returnDelims flag is false, the delimiter characters are
\r
192 * skipped and only serve as separators between tokens.</p>
\r
193 * <p>If the returnDelims flag is true, then the delimiter characters
\r
194 * are also returned as tokens, one per delimiter.</p>
\r
195 * @param str a string to be parsed.
\r
196 * @param delim the delimiters.
\r
197 * @param returndelims flag indicating whether to return the delimiters
\r
199 * @exception NullPointerException if str is null
\r
202 public StringTokenizer(String str, String delim, boolean returndelims)
\r
204 this(str, delim, returndelims, false); // java default behavior
\r
208 * <p>Constructs a string tokenizer for the specified string. All
\r
209 * characters in the delim argument are the delimiters for separating
\r
211 * <p>If the returnDelims flag is false, the delimiter characters are
\r
212 * skipped and only serve as separators between tokens.</p>
\r
213 * <p>If the returnDelims flag is true, then the delimiter characters
\r
214 * are also returned as tokens. If coalescedelims is true, one token
\r
215 * is returned for each run of delimiter characters, otherwise one
\r
216 * token is returned per delimiter. Since surrogate pairs can be
\r
217 * delimiters, the returned token might be two chars in length.</p>
\r
218 * @param str a string to be parsed.
\r
219 * @param delim the delimiters.
\r
220 * @param returndelims flag indicating whether to return the delimiters
\r
222 * @param coalescedelims flag indicating whether to return a run of
\r
223 * delimiters as a single token or as one token per delimiter.
\r
224 * This only takes effect if returndelims is true.
\r
225 * @exception NullPointerException if str is null
\r
226 * @internal ICU 3.4.3
\r
227 * @deprecated This API is ICU internal only.
\r
229 public StringTokenizer(String str, String delim, boolean returndelims, boolean coalescedelims)
\r
231 // don't ignore whitespace
\r
232 m_delimiters_ = EMPTY_DELIMITER_;
\r
233 if (delim != null && delim.length() > 0) {
\r
234 m_delimiters_ = new UnicodeSet();
\r
235 m_delimiters_.addAll(delim);
\r
238 m_coalesceDelimiters_ = coalescedelims;
\r
240 m_length_ = str.length();
\r
241 m_returnDelimiters_ = returndelims;
\r
242 m_tokenOffset_ = -1;
\r
244 if (m_length_ == 0) {
\r
245 // string length 0, no tokens
\r
246 m_nextOffset_ = -1;
\r
250 if (!returndelims) {
\r
251 m_nextOffset_ = getNextNonDelimiter(0);
\r
257 * <p>Constructs a string tokenizer for the specified string. The
\r
258 * characters in the delim argument are the delimiters for separating
\r
260 * <p>Delimiter characters themselves will not be treated as tokens.</p>
\r
261 * @param str a string to be parsed.
\r
262 * @param delim the delimiters.
\r
263 * @exception NullPointerException if str is null
\r
266 public StringTokenizer(String str, String delim)
\r
268 // don't ignore whitespace
\r
269 this(str, delim, false, false);
\r
273 * <p>Constructs a string tokenizer for the specified string.
\r
274 * The tokenizer uses the default delimiter set, which is
\r
275 * " \t\n\r\f":
\r
276 * the space character, the tab character, the newline character, the
\r
277 * carriage-return character, and the form-feed character.</p>
\r
278 * <p>Delimiter characters themselves will not be treated as tokens.</p>
\r
279 * @param str a string to be parsed
\r
280 * @exception NullPointerException if str is null
\r
283 public StringTokenizer(String str)
\r
285 this(str, DEFAULT_DELIMITERS_, false, false);
\r
288 // public methods --------------------------------------------------
\r
291 * Tests if there are more tokens available from this tokenizer's
\r
293 * If this method returns <tt>true</tt>, then a subsequent call to
\r
294 * <tt>nextToken</tt> with no argument will successfully return a token.
\r
295 * @return <code>true</code> if and only if there is at least one token
\r
296 * in the string after the current position; <code>false</code>
\r
300 public boolean hasMoreTokens()
\r
302 return m_nextOffset_ >= 0;
\r
306 * Returns the next token from this string tokenizer.
\r
307 * @return the next token from this string tokenizer.
\r
308 * @exception NoSuchElementException if there are no more tokens in
\r
309 * this tokenizer's string.
\r
312 public String nextToken()
\r
314 if (m_tokenOffset_ < 0) {
\r
315 if (m_nextOffset_ < 0) {
\r
316 throw new NoSuchElementException("No more tokens in String");
\r
318 // pre-calculations of tokens not done
\r
319 if (m_returnDelimiters_) {
\r
320 int tokenlimit = 0;
\r
321 int c = UTF16.charAt(m_source_, m_nextOffset_);
\r
322 boolean contains = delims == null
\r
323 ? m_delimiters_.contains(c)
\r
324 : c < delims.length && delims[c];
\r
326 if (m_coalesceDelimiters_) {
\r
327 tokenlimit = getNextNonDelimiter(m_nextOffset_);
\r
329 tokenlimit = m_nextOffset_ + UTF16.getCharCount(c);
\r
330 if (tokenlimit == m_length_) {
\r
336 tokenlimit = getNextDelimiter(m_nextOffset_);
\r
339 if (tokenlimit < 0) {
\r
340 result = m_source_.substring(m_nextOffset_);
\r
343 result = m_source_.substring(m_nextOffset_, tokenlimit);
\r
345 m_nextOffset_ = tokenlimit;
\r
349 int tokenlimit = getNextDelimiter(m_nextOffset_);
\r
351 if (tokenlimit < 0) {
\r
352 result = m_source_.substring(m_nextOffset_);
\r
353 m_nextOffset_ = tokenlimit;
\r
356 result = m_source_.substring(m_nextOffset_, tokenlimit);
\r
357 m_nextOffset_ = getNextNonDelimiter(tokenlimit);
\r
363 // count was called before and we have all the tokens
\r
364 if (m_tokenOffset_ >= m_tokenSize_) {
\r
365 throw new NoSuchElementException("No more tokens in String");
\r
368 if (m_tokenLimit_[m_tokenOffset_] >= 0) {
\r
369 result = m_source_.substring(m_tokenStart_[m_tokenOffset_],
\r
370 m_tokenLimit_[m_tokenOffset_]);
\r
373 result = m_source_.substring(m_tokenStart_[m_tokenOffset_]);
\r
376 m_nextOffset_ = -1;
\r
377 if (m_tokenOffset_ < m_tokenSize_) {
\r
378 m_nextOffset_ = m_tokenStart_[m_tokenOffset_];
\r
384 * Returns the next token in this string tokenizer's string. First,
\r
385 * the set of characters considered to be delimiters by this
\r
386 * <tt>StringTokenizer</tt> object is changed to be the characters in
\r
387 * the string <tt>delim</tt>. Then the next token in the string
\r
388 * after the current position is returned. The current position is
\r
389 * advanced beyond the recognized token. The new delimiter set
\r
390 * remains the default after this call.
\r
391 * @param delim the new delimiters.
\r
392 * @return the next token, after switching to the new delimiter set.
\r
393 * @exception NoSuchElementException if there are no more tokens in
\r
394 * this tokenizer's string.
\r
397 public String nextToken(String delim)
\r
399 m_delimiters_ = EMPTY_DELIMITER_;
\r
400 if (delim != null && delim.length() > 0) {
\r
401 m_delimiters_ = new UnicodeSet();
\r
402 m_delimiters_.addAll(delim);
\r
404 return nextToken(m_delimiters_);
\r
408 * Returns the next token in this string tokenizer's string. First,
\r
409 * the set of characters considered to be delimiters by this
\r
410 * <tt>StringTokenizer</tt> object is changed to be the characters in
\r
411 * the set <tt>delim</tt>. Then the next token in the string
\r
412 * after the current position is returned. The current position is
\r
413 * advanced beyond the recognized token. The new delimiter set
\r
414 * remains the default after this call.
\r
415 * @param delim the new delimiters.
\r
416 * @return the next token, after switching to the new delimiter set.
\r
417 * @exception NoSuchElementException if there are no more tokens in
\r
418 * this tokenizer's string.
\r
421 public String nextToken(UnicodeSet delim)
\r
423 m_delimiters_ = delim;
\r
425 m_tokenOffset_ = -1;
\r
427 if (!m_returnDelimiters_) {
\r
428 m_nextOffset_ = getNextNonDelimiter(m_nextOffset_);
\r
430 return nextToken();
\r
434 * Returns the same value as the <code>hasMoreTokens</code> method.
\r
435 * It exists so that this class can implement the
\r
436 * <code>Enumeration</code> interface.
\r
437 * @return <code>true</code> if there are more tokens;
\r
438 * <code>false</code> otherwise.
\r
439 * @see #hasMoreTokens()
\r
442 public boolean hasMoreElements()
\r
444 return hasMoreTokens();
\r
448 * Returns the same value as the <code>nextToken</code> method, except
\r
449 * that its declared return value is <code>Object</code> rather than
\r
450 * <code>String</code>. It exists so that this class can implement the
\r
451 * <code>Enumeration</code> interface.
\r
452 * @return the next token in the string.
\r
453 * @exception NoSuchElementException if there are no more tokens in
\r
454 * this tokenizer's string.
\r
455 * @see #nextToken()
\r
458 public Object nextElement()
\r
460 return nextToken();
\r
464 * Calculates the number of times that this tokenizer's
\r
465 * <code>nextToken</code> method can be called before it generates an
\r
466 * exception. The current position is not advanced.
\r
467 * @return the number of tokens remaining in the string using the
\r
468 * current delimiter set.
\r
469 * @see #nextToken()
\r
472 public int countTokens()
\r
475 if (hasMoreTokens()) {
\r
476 if (m_tokenOffset_ >= 0) {
\r
477 return m_tokenSize_ - m_tokenOffset_;
\r
479 if (m_tokenStart_ == null) {
\r
480 m_tokenStart_ = new int[TOKEN_SIZE_];
\r
481 m_tokenLimit_ = new int[TOKEN_SIZE_];
\r
484 if (m_tokenStart_.length == result) {
\r
485 int temptokenindex[] = m_tokenStart_;
\r
486 int temptokensize[] = m_tokenLimit_;
\r
487 int originalsize = temptokenindex.length;
\r
488 int newsize = originalsize + TOKEN_SIZE_;
\r
489 m_tokenStart_ = new int[newsize];
\r
490 m_tokenLimit_ = new int[newsize];
\r
491 System.arraycopy(temptokenindex, 0, m_tokenStart_, 0,
\r
493 System.arraycopy(temptokensize, 0, m_tokenLimit_, 0,
\r
496 m_tokenStart_[result] = m_nextOffset_;
\r
497 if (m_returnDelimiters_) {
\r
498 int c = UTF16.charAt(m_source_, m_nextOffset_);
\r
499 boolean contains = delims == null
\r
500 ? m_delimiters_.contains(c)
\r
501 : c < delims.length && delims[c];
\r
503 if (m_coalesceDelimiters_) {
\r
504 m_tokenLimit_[result] = getNextNonDelimiter(
\r
507 int p = m_nextOffset_ + 1;
\r
508 if (p == m_length_) {
\r
511 m_tokenLimit_[result] = p;
\r
516 m_tokenLimit_[result] = getNextDelimiter(m_nextOffset_);
\r
518 m_nextOffset_ = m_tokenLimit_[result];
\r
521 m_tokenLimit_[result] = getNextDelimiter(m_nextOffset_);
\r
522 m_nextOffset_ = getNextNonDelimiter(m_tokenLimit_[result]);
\r
525 } while (m_nextOffset_ >= 0);
\r
526 m_tokenOffset_ = 0;
\r
527 m_tokenSize_ = result;
\r
528 m_nextOffset_ = m_tokenStart_[0];
\r
533 // private data members -------------------------------------------------
\r
536 * Current offset to the token array. If the token array is not set up yet,
\r
537 * this value is -1
\r
539 private int m_tokenOffset_;
\r
541 * Size of the token array. If the array token is not set up yet,
\r
542 * this value is a -1
\r
544 private int m_tokenSize_;
\r
546 * Array of pre-calculated tokens start indexes in source string terminated
\r
548 * This is only set up during countTokens() and only stores the remaining
\r
549 * tokens, not all tokens including parsed ones
\r
551 private int m_tokenStart_[];
\r
553 * Array of pre-calculated tokens limit indexes in source string.
\r
554 * This is only set up during countTokens() and only stores the remaining
\r
555 * tokens, not all tokens including parsed ones
\r
557 private int m_tokenLimit_[];
\r
559 * UnicodeSet containing delimiters
\r
561 private UnicodeSet m_delimiters_;
\r
563 * String to parse for tokens
\r
565 private String m_source_;
\r
567 * Length of m_source_
\r
569 private int m_length_;
\r
571 * Current position in string to parse for tokens
\r
573 private int m_nextOffset_;
\r
575 * Flag indicating whether delimiters are to be treated as tokens too
\r
577 private boolean m_returnDelimiters_;
\r
580 * Flag indicating whether to coalesce runs of delimiters into single tokens
\r
582 private boolean m_coalesceDelimiters_;
\r
585 * Default set of delimiters: space, \t, \n, \r, \f
\r
587 private static final UnicodeSet DEFAULT_DELIMITERS_
\r
588 = new UnicodeSet("[ \t\n\r\f]", false);
\r
590 * Array size increments
\r
592 private static final int TOKEN_SIZE_ = 100;
\r
594 * An empty delimiter UnicodeSet, used when the user specifies null or empty delimiters
\r
596 private static final UnicodeSet EMPTY_DELIMITER_ = new UnicodeSet();
\r
598 // private methods ------------------------------------------------------
\r
601 * Gets the index of the next delimiter at or after offset
\r
602 * @param offset to the source string
\r
603 * @return offset of the immediate next delimiter, otherwise
\r
604 * (- source string length - 1) if there
\r
605 * are no more delimiters at or after offset
\r
607 private int getNextDelimiter(int offset)
\r
610 int result = offset;
\r
612 if (delims == null) {
\r
614 c = UTF16.charAt(m_source_, result);
\r
615 if (m_delimiters_.contains(c)) {
\r
619 } while (result < m_length_);
\r
622 c = UTF16.charAt(m_source_, result);
\r
623 if (c < delims.length && delims[c]) {
\r
627 } while (result < m_length_);
\r
629 if (result < m_length_) {
\r
633 return -1 - m_length_;
\r
637 * Gets the index of the next non-delimiter at or after offset
\r
638 * @param offset to the source string
\r
639 * @return offset of the immediate next non-delimiter, otherwise
\r
640 * (- source string length - 1) if there
\r
641 * are no more non-delimiters at or after offset
\r
643 private int getNextNonDelimiter(int offset)
\r
646 int result = offset;
\r
648 if (delims == null) {
\r
650 c = UTF16.charAt(m_source_, result);
\r
651 if (!m_delimiters_.contains(c)) {
\r
655 } while (result < m_length_);
\r
658 c = UTF16.charAt(m_source_, result);
\r
659 if (!(c < delims.length && delims[c])) {
\r
663 } while (result < m_length_);
\r
665 if (result < m_length_) {
\r
669 return -1 - m_length_;
\r
672 void checkDelimiters() {
\r
673 if (m_delimiters_ == null || m_delimiters_.size() == 0) {
\r
674 delims = new boolean[0];
\r
676 int maxChar = m_delimiters_.getRangeEnd(m_delimiters_.getRangeCount()-1);
\r
677 if (maxChar < 0x7f) {
\r
678 delims = new boolean[maxChar+1];
\r
679 for (int i = 0, ch; -1 != (ch = m_delimiters_.charAt(i)); ++i) {
\r
687 private boolean[] delims;
\r