/* ********************************************************************** * Copyright (c) 2004-2008, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Author: Alan Liu * Created: March 16 2004 * Since: ICU 3.0 ********************************************************************** */ package com.ibm.icu.impl.data; import java.io.IOException; import com.ibm.icu.impl.UCharacterProperty; import com.ibm.icu.impl.Utility; import com.ibm.icu.text.UTF16; /** * An iterator class that returns successive string tokens from some * source. String tokens are, in general, separated by rule white * space in the source test. Furthermore, they may be delimited by * either single or double quotes (opening and closing quotes must * match). Escapes are processed using standard ICU unescaping. */ public class TokenIterator { private ResourceReader reader; private String line; private StringBuffer buf; private boolean done; private int pos; private int lastpos; /** * Construct an iterator over the tokens returned by the given * ResourceReader, ignoring blank lines and comment lines (first * non-blank character is '#'). Note that trailing comments on a * line, beginning with the first unquoted '#', are recognized. */ public TokenIterator(ResourceReader r) { reader = r; line = null; done = false; buf = new StringBuffer(); pos = lastpos = -1; } /** * Return the next token from this iterator, or null if the last * token has been returned. */ public String next() throws IOException { if (done) { return null; } for (;;) { if (line == null) { line = reader.readLineSkippingComments(); if (line == null) { done = true; return null; } pos = 0; } buf.setLength(0); lastpos = pos; pos = nextToken(pos); if (pos < 0) { line = null; continue; } return buf.toString(); } } /** * Return the one-based line number of the line of the last token returned by * next(). Should only be called * after a call to next(); otherwise the return * value is undefined. */ public int getLineNumber() { return reader.getLineNumber(); } /** * Return a string description of the position of the last line * returned by readLine() or readLineSkippingComments(). */ public String describePosition() { return reader.describePosition() + ':' + (lastpos+1); } /** * Read the next token from 'this.line' and append it to * 'this.buf'. Tokens are separated by rule white space. Tokens * may also be delimited by double or single quotes. The closing * quote must match the opening quote. If a '#' is encountered, * the rest of the line is ignored, unless it is backslash-escaped * or within quotes. * @param position the offset into the string * @return offset to the next character to read from line, or if * the end of the line is reached without scanning a valid token, * -1 */ private int nextToken(int position) { position = Utility.skipWhitespace(line, position); if (position == line.length()) { return -1; } int startpos = position; char c = line.charAt(position++); char quote = 0; switch (c) { case '"': case '\'': quote = c; break; case '#': return -1; default: buf.append(c); break; } int[] posref = null; while (position < line.length()) { c = line.charAt(position); // 16-bit ok if (c == '\\') { if (posref == null) { posref = new int[1]; } posref[0] = position+1; int c32 = Utility.unescapeAt(line, posref); if (c32 < 0) { throw new RuntimeException("Invalid escape at " + reader.describePosition() + ':' + position); } UTF16.append(buf, c32); position = posref[0]; } else if ((quote != 0 && c == quote) || (quote == 0 && UCharacterProperty.isRuleWhiteSpace(c))) { return ++position; } else if (quote == 0 && c == '#') { return position; // do NOT increment } else { buf.append(c); ++position; } } if (quote != 0) { throw new RuntimeException("Unterminated quote at " + reader.describePosition() + ':' + startpos); } return position; } }