From 7755640784fa50ecbd138360e6a9e65823519cfe Mon Sep 17 00:00:00 2001 From: Sebastian Bazley Date: Tue, 6 Aug 2013 15:43:10 +0000 Subject: [PATCH] No longer wanted git-svn-id: https://svn.apache.org/repos/asf/commons/proper/csv/trunk@1511005 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/commons/csv/CSVLexer1.java | 247 ----------------- .../apache/commons/csv/CSVLexer1306663.java | 235 ---------------- .../apache/commons/csv/CSVLexer1306667.java | 235 ---------------- .../org/apache/commons/csv/CSVLexer3.java | 254 ------------------ 4 files changed, 971 deletions(-) delete mode 100644 src/test/java/org/apache/commons/csv/CSVLexer1.java delete mode 100644 src/test/java/org/apache/commons/csv/CSVLexer1306663.java delete mode 100644 src/test/java/org/apache/commons/csv/CSVLexer1306667.java delete mode 100644 src/test/java/org/apache/commons/csv/CSVLexer3.java diff --git a/src/test/java/org/apache/commons/csv/CSVLexer1.java b/src/test/java/org/apache/commons/csv/CSVLexer1.java deleted file mode 100644 index fc5c9857..00000000 --- a/src/test/java/org/apache/commons/csv/CSVLexer1.java +++ /dev/null @@ -1,247 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.commons.csv; - -import static org.apache.commons.csv.Constants.UNDEFINED; -import static org.apache.commons.csv.Token.Type.EOF; -import static org.apache.commons.csv.Token.Type.EORECORD; -import static org.apache.commons.csv.Token.Type.TOKEN; - -import java.io.IOException; - -/** - * - * - * @version $Id$ - */ -class CSVLexer1 extends Lexer { - - private final StringBuilder wsBuf = new StringBuilder(); - - // ctor needs to be public so can be called dynamically by PerformanceTest class - public CSVLexer1(final CSVFormat format, final ExtendedBufferedReader in) { - super(format, in); - } - - /** - * Returns the next token. - *

- * A token corresponds to a term, a record change or an end-of-file indicator. - * - * @param tkn an existing Token object to reuse. The caller is responsible to initialize the Token. - * @return the next token found - * @throws java.io.IOException on stream access error - */ - @Override - Token nextToken(Token tkn) throws IOException { - wsBuf.setLength(0); // reuse - - // get the last read char (required for empty line detection) - int lastChar = in.getLastChar(); - - // read the next char and set eol - /* note: unfortunately isEndOfLine may consumes a character silently. - * this has no effect outside of the method. so a simple workaround - * is to call 'readAgain' on the stream... - */ - int c = in.read(); - boolean eol = readEndOfLine(c); - c = in.getLastChar(); - - // empty line detection: eol AND (last char was EOL or beginning) - if (format.getIgnoreEmptyLines()) { - while (eol - && (lastChar == '\n' || lastChar == '\r' || lastChar == UNDEFINED) - && !isEndOfFile(lastChar)) { - // go on char ahead ... - lastChar = c; - c = in.read(); - eol = readEndOfLine(c); - c = in.getLastChar(); - // reached end of file without any content (empty line at the end) - if (isEndOfFile(c)) { - tkn.type = EOF; - return tkn; - } - } - } - - // did we reach eof during the last iteration already ? EOF - if (isEndOfFile(lastChar) || (lastChar != format.getDelimiter() && isEndOfFile(c))) { - tkn.type = EOF; - return tkn; - } - - // important: make sure a new char gets consumed in each iteration - while (!tkn.isReady && tkn.type != EOF) { - // ignore whitespaces at beginning of a token - if (format.getIgnoreSurroundingSpaces()) { - while (isWhitespace(c) && !eol) { - wsBuf.append((char) c); - c = in.read(); - eol = readEndOfLine(c); - } - } - - // ok, start of token reached: comment, encapsulated, or token - if (c == format.getCommentStart()) { - // ignore everything till end of line and continue (incr linecount) - in.readLine(); - tkn.reset(); - tkn = nextToken(tkn); - } else if (c == format.getDelimiter()) { - // empty token return TOKEN("") - tkn.type = TOKEN; - tkn.isReady = true; - } else if (eol) { - // empty token return EORECORD("") - //noop: tkn.content.append(""); - tkn.type = EORECORD; - tkn.isReady = true; - } else if (c == format.getQuoteChar()) { - // consume encapsulated token - encapsulatedTokenLexer(tkn, c); - } else if (isEndOfFile(c)) { - // end of file return EOF() - //noop: tkn.content.append(""); - tkn.type = EOF; - tkn.isReady = true; - } else { - // next token must be a simple token - // add removed blanks when not ignoring whitespace chars... - if (!format.getIgnoreSurroundingSpaces()) { - tkn.content.append(wsBuf); - } - simpleTokenLexer(tkn, c); - } - } - return tkn; - } - - /** - * A simple token lexer - *

- * Simple token are tokens which are not surrounded by encapsulators. - * A simple token might contain escaped delimiters (as \, or \;). The - * token is finished when one of the following conditions become true: - *

- * - * @param tkn the current token - * @param c the current character - * @return the filled token - * @throws IOException on stream access error - */ - private Token simpleTokenLexer(final Token tkn, int c) throws IOException { - while (true) { - if (readEndOfLine(c)) { - // end of record - tkn.type = EORECORD; - tkn.isReady = true; - break; - } else if (isEndOfFile(c)) { - // end of file - tkn.type = EOF; - tkn.isReady = true; - break; - } else if (c == format.getDelimiter()) { - // end of token - tkn.type = TOKEN; - tkn.isReady = true; - break; - } else if (c == format.getEscape()) { - tkn.content.append((char) readEscape()); - } else { - tkn.content.append((char) c); - } - - c = in.read(); - } - - if (format.getIgnoreSurroundingSpaces()) { - trimTrailingSpaces(tkn.content); - } - - return tkn; - } - - /** - * An encapsulated token lexer - *

- * Encapsulated tokens are surrounded by the given encapsulating-string. - * The encapsulator itself might be included in the token using a - * doubling syntax (as "", '') or using escaping (as in \", \'). - * Whitespaces before and after an encapsulated token are ignored. - * - * @param tkn the current token - * @param c the current character - * @return a valid token object - * @throws IOException on invalid state - */ - private Token encapsulatedTokenLexer(final Token tkn, int c) throws IOException { - // save current line - final long startLineNumber = getCurrentLineNumber(); - // ignore the given delimiter - // assert c == delimiter; - while (true) { - c = in.read(); - - if (c == format.getEscape()) { - tkn.content.append((char) readEscape()); - } else if (c == format.getQuoteChar()) { - if (in.lookAhead() == format.getQuoteChar()) { - // double or escaped encapsulator -> add single encapsulator to token - c = in.read(); - tkn.content.append((char) c); - } else { - // token finish mark (encapsulator) reached: ignore whitespace till delimiter - while (true) { - c = in.read(); - if (c == format.getDelimiter()) { - tkn.type = TOKEN; - tkn.isReady = true; - return tkn; - } else if (isEndOfFile(c)) { - tkn.type = EOF; - tkn.isReady = true; - return tkn; - } else if (readEndOfLine(c)) { - // ok eo token reached - tkn.type = EORECORD; - tkn.isReady = true; - return tkn; - } else if (!isWhitespace(c)) { - // error invalid char between token and next delimiter - throw new IOException("(line " + getCurrentLineNumber() + ") invalid char between encapsulated token and delimiter"); - } - } - } - } else if (isEndOfFile(c)) { - // error condition (end of file before end of token) - throw new IOException("(startline " + startLineNumber + ") EOF reached before encapsulated token finished"); - } else { - // consume character - tkn.content.append((char) c); - } - } - } - -} \ No newline at end of file diff --git a/src/test/java/org/apache/commons/csv/CSVLexer1306663.java b/src/test/java/org/apache/commons/csv/CSVLexer1306663.java deleted file mode 100644 index 27d349af..00000000 --- a/src/test/java/org/apache/commons/csv/CSVLexer1306663.java +++ /dev/null @@ -1,235 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.commons.csv; - -import static org.apache.commons.csv.Token.Type.COMMENT; -import static org.apache.commons.csv.Token.Type.EOF; -import static org.apache.commons.csv.Token.Type.EORECORD; -import static org.apache.commons.csv.Token.Type.INVALID; -import static org.apache.commons.csv.Token.Type.TOKEN; - -import java.io.IOException; - -/** - * - * - * @version $Id$ - */ -class CSVLexer1306663 extends Lexer { - - // ctor needs to be public so can be called dynamically by PerformanceTest class - public CSVLexer1306663(final CSVFormat format, final ExtendedBufferedReader in) { - super(format, in); - } - - /** - * Returns the next token. - *

- * A token corresponds to a term, a record change or an end-of-file indicator. - * - * @param tkn an existing Token object to reuse. The caller is responsible to initialize the Token. - * @return the next token found - * @throws java.io.IOException on stream access error - */ - @Override - Token nextToken(final Token tkn) throws IOException { - - // get the last read char (required for empty line detection) - int lastChar = in.getLastChar(); - - // read the next char and set eol - int c = in.read(); - - if (isStartOfLine(lastChar) && isCommentStart(c)) { - in.readLine(); - tkn.type = COMMENT; - return tkn; - } - - /* note: unfortunately isEndOfLine may consumes a character silently. - * this has no effect outside of the method. so a simple workaround - * is to call 'readAgain' on the stream... - */ - boolean eol = readEndOfLine(c); - c = in.getLastChar(); - - // empty line detection: eol AND (last char was EOL or beginning) - if (ignoreEmptyLines) { - while (eol && isStartOfLine(lastChar)) { - // go on char ahead ... - lastChar = c; - c = in.read(); - eol = readEndOfLine(c); - c = in.getLastChar(); - // reached end of file without any content (empty line at the end) - if (isEndOfFile(c)) { - tkn.type = EOF; - // don't set tkn.isReady here because no content - return tkn; - } - } - } - - // did we reach eof during the last iteration already ? EOF - if (isEndOfFile(lastChar) || (!isDelimiter(lastChar) && isEndOfFile(c))) { - tkn.type = EOF; - // don't set tkn.isReady here because no content - return tkn; - } - - // important: make sure a new char gets consumed in each iteration - while (tkn.type == INVALID) { - // ignore whitespaces at beginning of a token - if (ignoreSurroundingSpaces) { - while (isWhitespace(c) && !eol) { - c = in.read(); - eol = readEndOfLine(c); - } - } - - // ok, start of token reached: encapsulated, or token - if (isDelimiter(c)) { - // empty token return TOKEN("") - tkn.type = TOKEN; - } else if (eol) { - // empty token return EORECORD("") - //noop: tkn.content.append(""); - tkn.type = EORECORD; - } else if (isQuoteChar(c)) { - // consume encapsulated token - encapsulatedTokenLexer(tkn); - } else if (isEndOfFile(c)) { - // end of file return EOF() - //noop: tkn.content.append(""); - tkn.type = EOF; - tkn.isReady = true; // there is data at EOF - } else { - // next token must be a simple token - // add removed blanks when not ignoring whitespace chars... - simpleTokenLexer(tkn, c); - } - } - return tkn; - } - - /** - * A simple token lexer - *

- * Simple token are tokens which are not surrounded by encapsulators. - * A simple token might contain escaped delimiters (as \, or \;). The - * token is finished when one of the following conditions become true: - *

- * - * @param tkn the current token - * @param c the current character - * @return the filled token - * @throws IOException on stream access error - */ - private Token simpleTokenLexer(final Token tkn, int c) throws IOException { - // Faster to use while(true)+break than while(tkn.type == INVALID) - while (true) { - if (readEndOfLine(c)) { - tkn.type = EORECORD; - break; - } else if (isEndOfFile(c)) { - tkn.type = EOF; - tkn.isReady = true; // There is data at EOF - break; - } else if (isDelimiter(c)) { - tkn.type = TOKEN; - break; - } else if (isEscape(c)) { - tkn.content.append((char) readEscape()); - c = in.read(); // continue - } else { - tkn.content.append((char) c); - c = in.read(); // continue - } - } - - if (ignoreSurroundingSpaces) { - trimTrailingSpaces(tkn.content); - } - - return tkn; - } - - /** - * An encapsulated token lexer - *

- * Encapsulated tokens are surrounded by the given encapsulating-string. - * The encapsulator itself might be included in the token using a - * doubling syntax (as "", '') or using escaping (as in \", \'). - * Whitespaces before and after an encapsulated token are ignored. - * - * @param tkn the current token - * @return a valid token object - * @throws IOException on invalid state - */ - private Token encapsulatedTokenLexer(final Token tkn) throws IOException { - // save current line - final long startLineNumber = getCurrentLineNumber(); - // ignore the given delimiter - // assert c == delimiter; - int c; - while (true) { - c = in.read(); - - if (isEscape(c)) { - tkn.content.append((char) readEscape()); - } else if (isQuoteChar(c)) { - if (isQuoteChar(in.lookAhead())) { - // double or escaped encapsulator -> add single encapsulator to token - c = in.read(); - tkn.content.append((char) c); - } else { - // token finish mark (encapsulator) reached: ignore whitespace till delimiter - while (true) { - c = in.read(); - if (isDelimiter(c)) { - tkn.type = TOKEN; - return tkn; - } else if (isEndOfFile(c)) { - tkn.type = EOF; - tkn.isReady = true; // There is data at EOF - return tkn; - } else if (readEndOfLine(c)) { - // ok eo token reached - tkn.type = EORECORD; - return tkn; - } else if (!isWhitespace(c)) { - // error invalid char between token and next delimiter - throw new IOException("(line " + getCurrentLineNumber() + ") invalid char between encapsulated token and delimiter"); - } - } - } - } else if (isEndOfFile(c)) { - // error condition (end of file before end of token) - throw new IOException("(startline " + startLineNumber + ") EOF reached before encapsulated token finished"); - } else { - // consume character - tkn.content.append((char) c); - } - } - } - -} \ No newline at end of file diff --git a/src/test/java/org/apache/commons/csv/CSVLexer1306667.java b/src/test/java/org/apache/commons/csv/CSVLexer1306667.java deleted file mode 100644 index 6697166c..00000000 --- a/src/test/java/org/apache/commons/csv/CSVLexer1306667.java +++ /dev/null @@ -1,235 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.commons.csv; - -import static org.apache.commons.csv.Token.Type.COMMENT; -import static org.apache.commons.csv.Token.Type.EOF; -import static org.apache.commons.csv.Token.Type.EORECORD; -import static org.apache.commons.csv.Token.Type.INVALID; -import static org.apache.commons.csv.Token.Type.TOKEN; - -import java.io.IOException; - -/** - * - * - * @version $Id$ - */ -class CSVLexer1306667 extends Lexer { - - // ctor needs to be public so can be called dynamically by PerformanceTest class - public CSVLexer1306667(final CSVFormat format, final ExtendedBufferedReader in) { - super(format, in); - } - - /** - * Returns the next token. - *

- * A token corresponds to a term, a record change or an end-of-file indicator. - * - * @param tkn an existing Token object to reuse. The caller is responsible to initialize the Token. - * @return the next token found - * @throws java.io.IOException on stream access error - */ - @Override - Token nextToken(final Token tkn) throws IOException { - - // get the last read char (required for empty line detection) - int lastChar = in.getLastChar(); - - // read the next char and set eol - int c = in.read(); - - /* note: unfortunately isEndOfLine may consumes a character silently. - * this has no effect outside of the method. so a simple workaround - * is to call 'readAgain' on the stream... - */ - boolean eol = readEndOfLine(c); - c = in.getLastChar(); - - // empty line detection: eol AND (last char was EOL or beginning) - if (ignoreEmptyLines) { - while (eol && isStartOfLine(lastChar)) { - // go on char ahead ... - lastChar = c; - c = in.read(); - eol = readEndOfLine(c); - c = in.getLastChar(); - // reached end of file without any content (empty line at the end) - if (isEndOfFile(c)) { - tkn.type = EOF; - // don't set tkn.isReady here because no content - return tkn; - } - } - } - - // did we reach eof during the last iteration already ? EOF - if (isEndOfFile(lastChar) || (!isDelimiter(lastChar) && isEndOfFile(c))) { - tkn.type = EOF; - // don't set tkn.isReady here because no content - return tkn; - } - - if (isStartOfLine(lastChar) && isCommentStart(c)) { - in.readLine(); - tkn.type = COMMENT; - return tkn; - } - - // important: make sure a new char gets consumed in each iteration - while (tkn.type == INVALID) { - // ignore whitespaces at beginning of a token - if (ignoreSurroundingSpaces) { - while (isWhitespace(c) && !eol) { - c = in.read(); - eol = readEndOfLine(c); - } - } - - // ok, start of token reached: encapsulated, or token - if (isDelimiter(c)) { - // empty token return TOKEN("") - tkn.type = TOKEN; - } else if (eol) { - // empty token return EORECORD("") - //noop: tkn.content.append(""); - tkn.type = EORECORD; - } else if (isQuoteChar(c)) { - // consume encapsulated token - encapsulatedTokenLexer(tkn); - } else if (isEndOfFile(c)) { - // end of file return EOF() - //noop: tkn.content.append(""); - tkn.type = EOF; - tkn.isReady = true; // there is data at EOF - } else { - // next token must be a simple token - // add removed blanks when not ignoring whitespace chars... - simpleTokenLexer(tkn, c); - } - } - return tkn; - } - - /** - * A simple token lexer - *

- * Simple token are tokens which are not surrounded by encapsulators. - * A simple token might contain escaped delimiters (as \, or \;). The - * token is finished when one of the following conditions become true: - *

- * - * @param tkn the current token - * @param c the current character - * @return the filled token - * @throws IOException on stream access error - */ - private Token simpleTokenLexer(final Token tkn, int c) throws IOException { - // Faster to use while(true)+break than while(tkn.type == INVALID) - while (true) { - if (readEndOfLine(c)) { - tkn.type = EORECORD; - break; - } else if (isEndOfFile(c)) { - tkn.type = EOF; - tkn.isReady = true; // There is data at EOF - break; - } else if (isDelimiter(c)) { - tkn.type = TOKEN; - break; - } else if (isEscape(c)) { - tkn.content.append((char) readEscape()); - c = in.read(); // continue - } else { - tkn.content.append((char) c); - c = in.read(); // continue - } - } - - if (ignoreSurroundingSpaces) { - trimTrailingSpaces(tkn.content); - } - - return tkn; - } - - /** - * An encapsulated token lexer - *

- * Encapsulated tokens are surrounded by the given encapsulating-string. - * The encapsulator itself might be included in the token using a - * doubling syntax (as "", '') or using escaping (as in \", \'). - * Whitespaces before and after an encapsulated token are ignored. - * - * @param tkn the current token - * @return a valid token object - * @throws IOException on invalid state - */ - private Token encapsulatedTokenLexer(final Token tkn) throws IOException { - // save current line - final long startLineNumber = getCurrentLineNumber(); - // ignore the given delimiter - // assert c == delimiter; - int c; - while (true) { - c = in.read(); - - if (isEscape(c)) { - tkn.content.append((char) readEscape()); - } else if (isQuoteChar(c)) { - if (isQuoteChar(in.lookAhead())) { - // double or escaped encapsulator -> add single encapsulator to token - c = in.read(); - tkn.content.append((char) c); - } else { - // token finish mark (encapsulator) reached: ignore whitespace till delimiter - while (true) { - c = in.read(); - if (isDelimiter(c)) { - tkn.type = TOKEN; - return tkn; - } else if (isEndOfFile(c)) { - tkn.type = EOF; - tkn.isReady = true; // There is data at EOF - return tkn; - } else if (readEndOfLine(c)) { - // ok eo token reached - tkn.type = EORECORD; - return tkn; - } else if (!isWhitespace(c)) { - // error invalid char between token and next delimiter - throw new IOException("(line " + getCurrentLineNumber() + ") invalid char between encapsulated token and delimiter"); - } - } - } - } else if (isEndOfFile(c)) { - // error condition (end of file before end of token) - throw new IOException("(startline " + startLineNumber + ") EOF reached before encapsulated token finished"); - } else { - // consume character - tkn.content.append((char) c); - } - } - } - -} \ No newline at end of file diff --git a/src/test/java/org/apache/commons/csv/CSVLexer3.java b/src/test/java/org/apache/commons/csv/CSVLexer3.java deleted file mode 100644 index 0559a67f..00000000 --- a/src/test/java/org/apache/commons/csv/CSVLexer3.java +++ /dev/null @@ -1,254 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.commons.csv; - -import static org.apache.commons.csv.Constants.END_OF_STREAM; -import static org.apache.commons.csv.Token.Type.COMMENT; -import static org.apache.commons.csv.Token.Type.EOF; -import static org.apache.commons.csv.Token.Type.EORECORD; -import static org.apache.commons.csv.Token.Type.INVALID; -import static org.apache.commons.csv.Token.Type.TOKEN; - -import java.io.IOException; - -/** - * Experimental Lexer using enums to keep track of state and character type. - * Unfortunately it is twice as slow. - * For reference purpose only. - * - * @version $Id$ - */ -class CSVLexer3 extends Lexer { - - private final char escape; - - // ctor needs to be public so can be called dynamically by PerformanceTest class - public CSVLexer3(final CSVFormat format, final ExtendedBufferedReader in) { - super(format, in); - this.escape = format.getEscape(); - } - - /** - * Classify the character types - */ - private static enum CharType { - DELIM, - ESCAPE, - ENCAP, - EOL, - COMMENT_START, - WHITESPACE, - OTHER, - EOFCHAR - } - - private CharType classify(final int intch) { - if (isDelimiter(intch)) { - return CharType.DELIM; - } - if (isCommentStart(intch)) { - return CharType.COMMENT_START; - } - if (isQuoteChar(intch)) { - return CharType.ENCAP; - } - if (isEscape(intch)) { - return CharType.ESCAPE; - } - if (intch == '\r' || intch == '\n') { - return CharType.EOL; - } - if (isWhitespace(intch)) { // Must be after EOL check - return CharType.WHITESPACE; - } - if (intch == END_OF_STREAM) { - return CharType.EOFCHAR; - } - return CharType.OTHER; - } - - /** - * Parsing states - */ - private static enum State { - BEGIN, PLAIN, INQUOTE, QUOTEQUOTE, ESCAPE_PLAIN, ESCAPE_QUOTE, - } - - /** - * Returns the next token. - *

- * A token corresponds to a term, a record change or an end-of-file indicator. - * - * @param tkn an existing Token object to reuse. The caller is responsible to initialize the Token. - * @return the next token found - * @throws java.io.IOException on stream access error - */ - @Override - Token nextToken(final Token tkn) throws IOException { - - State state = State.BEGIN; - int intch; - boolean trimTrailingSpaces = false; - while(tkn.type == INVALID) { - intch = in.read(); - final CharType type = classify(intch); - switch(state) { - case BEGIN: - switch(type){ - case COMMENT_START: - in.readLine(); - tkn.type = COMMENT; - break; - case ENCAP: - state = State.INQUOTE; - break; - case DELIM: - tkn.type = TOKEN; - break; - case EOL: - tkn.type = EORECORD; - break; - case EOFCHAR: - tkn.type = EOF; - break; - case ESCAPE: - state = State.ESCAPE_PLAIN; - break; - case OTHER: - tkn.content.append((char) intch); - state = State.PLAIN; - break; - case WHITESPACE: - if (!ignoreSurroundingSpaces){ - tkn.content.append((char) intch); - state = State.PLAIN; - } - break; - } - break; - case PLAIN: - switch(type){ - case DELIM: - tkn.type = TOKEN; - break; - case EOL: - tkn.type = EORECORD; - break; - case EOFCHAR: - tkn.type = EOF; - break; - case ESCAPE: - state = State.ESCAPE_PLAIN; - break; - default: - trimTrailingSpaces = ignoreSurroundingSpaces; // we have a plain token - tkn.content.append((char) intch); - break; - } - break; - case INQUOTE: // Started a quoted string - switch(type){ - case ENCAP: - state = State.QUOTEQUOTE; - break; - case ESCAPE: - state = State.ESCAPE_QUOTE; - break; - case EOFCHAR: - throw new IOException("(line " + getCurrentLineNumber() + ") unexpected EOF in quoted string"); - default: - tkn.content.append((char) intch); - break; - } - break; - case QUOTEQUOTE: // "..." seen, expecting end of token or " - switch(type){ - case DELIM: - tkn.type = TOKEN; - break; - case EOL: - tkn.type = EORECORD; - break; - case EOFCHAR: - tkn.type = EOF; - break; - case ENCAP: // "..."" seen, append it - tkn.content.append((char) intch); - state = State.INQUOTE; - break; - case WHITESPACE: // trailing whitespace may be allowed - if (!ignoreSurroundingSpaces) { - // error invalid char between token and next delimiter - throw new IOException("(line " + getCurrentLineNumber() + ") invalid char between encapsulated token and delimiter"); - } - break; - // Everything else is invalid - case ESCAPE: - case OTHER: - case COMMENT_START: - // error invalid char between token and next delimiter - throw new IOException("(line " + getCurrentLineNumber() + ") invalid char between encapsulated token and delimiter"); - } - break; - case ESCAPE_PLAIN: - switch(type){ - case DELIM: - case ESCAPE: - case EOL: - tkn.content.append((char) intch); - state = State.PLAIN; - break; - case COMMENT_START: // TODO should comment be escaped? - case ENCAP: // TODO is this correct? - case OTHER: // TODO may need to escape further - case WHITESPACE: - tkn.content.append(escape); - tkn.content.append((char) intch); - break; - case EOFCHAR: - throw new IOException("(line " + getCurrentLineNumber() + ") unexpected EOF in escape sequence"); - } - break; - case ESCAPE_QUOTE: - switch(type){ - case ESCAPE: - case ENCAP: // this is the only required escape - tkn.content.append((char) intch); - break; - case COMMENT_START: - case DELIM: - case EOL: - case OTHER: - case WHITESPACE: - tkn.content.append(escape); - tkn.content.append((char) intch); - break; - case EOFCHAR: - throw new IOException("(line " + getCurrentLineNumber() + ") unexpected EOF in escape sequence"); - } - break; - default: - break; - } - } - if (trimTrailingSpaces) { - trimTrailingSpaces(tkn.content); - } - return tkn; - } -} \ No newline at end of file