No longer wanted
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/csv/trunk@1511005 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 8d8bbf459e
commit 7755640784
@@ -1,247 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.csv;

import static org.apache.commons.csv.Constants.UNDEFINED;
import static org.apache.commons.csv.Token.Type.EOF;
import static org.apache.commons.csv.Token.Type.EORECORD;
import static org.apache.commons.csv.Token.Type.TOKEN;

import java.io.IOException;

/**
 *
 *
 * @version $Id$
 */
class CSVLexer1 extends Lexer {

    private final StringBuilder wsBuf = new StringBuilder();

    // ctor needs to be public so can be called dynamically by PerformanceTest class
    public CSVLexer1(final CSVFormat format, final ExtendedBufferedReader in) {
        super(format, in);
    }

    /**
     * Returns the next token.
     * <p/>
     * A token corresponds to a term, a record change or an end-of-file indicator.
     *
     * @param tkn an existing Token object to reuse. The caller is responsible to initialize the Token.
     * @return the next token found
     * @throws java.io.IOException on stream access error
     */
    @Override
    Token nextToken(Token tkn) throws IOException {
        wsBuf.setLength(0); // reuse

        // get the last read char (required for empty line detection)
        int lastChar = in.getLastChar();

        // read the next char and set eol
        /* note: unfortunately isEndOfLine may consumes a character silently.
         * this has no effect outside of the method. so a simple workaround
         * is to call 'readAgain' on the stream...
         */
        int c = in.read();
        boolean eol = readEndOfLine(c);
        c = in.getLastChar();

        // empty line detection: eol AND (last char was EOL or beginning)
        if (format.getIgnoreEmptyLines()) {
            while (eol
                    && (lastChar == '\n' || lastChar == '\r' || lastChar == UNDEFINED)
                    && !isEndOfFile(lastChar)) {
                // go on char ahead ...
                lastChar = c;
                c = in.read();
                eol = readEndOfLine(c);
                c = in.getLastChar();
                // reached end of file without any content (empty line at the end)
                if (isEndOfFile(c)) {
                    tkn.type = EOF;
                    return tkn;
                }
            }
        }

        // did we reach eof during the last iteration already ? EOF
        if (isEndOfFile(lastChar) || (lastChar != format.getDelimiter() && isEndOfFile(c))) {
            tkn.type = EOF;
            return tkn;
        }

        // important: make sure a new char gets consumed in each iteration
        while (!tkn.isReady && tkn.type != EOF) {
            // ignore whitespaces at beginning of a token
            if (format.getIgnoreSurroundingSpaces()) {
                while (isWhitespace(c) && !eol) {
                    wsBuf.append((char) c);
                    c = in.read();
                    eol = readEndOfLine(c);
                }
            }

            // ok, start of token reached: comment, encapsulated, or token
            if (c == format.getCommentStart()) {
                // ignore everything till end of line and continue (incr linecount)
                in.readLine();
                tkn.reset();
                tkn = nextToken(tkn);
            } else if (c == format.getDelimiter()) {
                // empty token return TOKEN("")
                tkn.type = TOKEN;
                tkn.isReady = true;
            } else if (eol) {
                // empty token return EORECORD("")
                //noop: tkn.content.append("");
                tkn.type = EORECORD;
                tkn.isReady = true;
            } else if (c == format.getQuoteChar()) {
                // consume encapsulated token
                encapsulatedTokenLexer(tkn, c);
            } else if (isEndOfFile(c)) {
                // end of file return EOF()
                //noop: tkn.content.append("");
                tkn.type = EOF;
                tkn.isReady = true;
            } else {
                // next token must be a simple token
                // add removed blanks when not ignoring whitespace chars...
                if (!format.getIgnoreSurroundingSpaces()) {
                    tkn.content.append(wsBuf);
                }
                simpleTokenLexer(tkn, c);
            }
        }
        return tkn;
    }

    /**
     * A simple token lexer
     * <p/>
     * Simple token are tokens which are not surrounded by encapsulators.
     * A simple token might contain escaped delimiters (as \, or \;). The
     * token is finished when one of the following conditions become true:
     * <ul>
     * <li>end of line has been reached (EORECORD)</li>
     * <li>end of stream has been reached (EOF)</li>
     * <li>an unescaped delimiter has been reached (TOKEN)</li>
     * </ul>
     *
     * @param tkn the current token
     * @param c the current character
     * @return the filled token
     * @throws IOException on stream access error
     */
    private Token simpleTokenLexer(final Token tkn, int c) throws IOException {
        while (true) {
            if (readEndOfLine(c)) {
                // end of record
                tkn.type = EORECORD;
                tkn.isReady = true;
                break;
            } else if (isEndOfFile(c)) {
                // end of file
                tkn.type = EOF;
                tkn.isReady = true;
                break;
            } else if (c == format.getDelimiter()) {
                // end of token
                tkn.type = TOKEN;
                tkn.isReady = true;
                break;
            } else if (c == format.getEscape()) {
                tkn.content.append((char) readEscape());
            } else {
                tkn.content.append((char) c);
            }

            c = in.read();
        }

        if (format.getIgnoreSurroundingSpaces()) {
            trimTrailingSpaces(tkn.content);
        }

        return tkn;
    }

    /**
     * An encapsulated token lexer
     * <p/>
     * Encapsulated tokens are surrounded by the given encapsulating-string.
     * The encapsulator itself might be included in the token using a
     * doubling syntax (as "", '') or using escaping (as in \", \').
     * Whitespaces before and after an encapsulated token are ignored.
     *
     * @param tkn the current token
     * @param c the current character
     * @return a valid token object
     * @throws IOException on invalid state
     */
    private Token encapsulatedTokenLexer(final Token tkn, int c) throws IOException {
        // save current line
        final long startLineNumber = getCurrentLineNumber();
        // ignore the given delimiter
        // assert c == delimiter;
        while (true) {
            c = in.read();

            if (c == format.getEscape()) {
                tkn.content.append((char) readEscape());
            } else if (c == format.getQuoteChar()) {
                if (in.lookAhead() == format.getQuoteChar()) {
                    // double or escaped encapsulator -> add single encapsulator to token
                    c = in.read();
                    tkn.content.append((char) c);
                } else {
                    // token finish mark (encapsulator) reached: ignore whitespace till delimiter
                    while (true) {
                        c = in.read();
                        if (c == format.getDelimiter()) {
                            tkn.type = TOKEN;
                            tkn.isReady = true;
                            return tkn;
                        } else if (isEndOfFile(c)) {
                            tkn.type = EOF;
                            tkn.isReady = true;
                            return tkn;
                        } else if (readEndOfLine(c)) {
                            // ok eo token reached
                            tkn.type = EORECORD;
                            tkn.isReady = true;
                            return tkn;
                        } else if (!isWhitespace(c)) {
                            // error invalid char between token and next delimiter
                            throw new IOException("(line " + getCurrentLineNumber() + ") invalid char between encapsulated token and delimiter");
                        }
                    }
                }
            } else if (isEndOfFile(c)) {
                // error condition (end of file before end of token)
                throw new IOException("(startline " + startLineNumber + ") EOF reached before encapsulated token finished");
            } else {
                // consume character
                tkn.content.append((char) c);
            }
        }
    }

}
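All four deleted lexers share the same package-private contract inherited from Lexer: the caller repeatedly passes a reused Token to nextToken(Token) until a token of type EOF comes back. Below is a minimal, hypothetical driver loop illustrating that contract. It is a sketch, not code from this commit: it assumes package-private access (so it would have to live in org.apache.commons.csv), that Token has a no-argument constructor, and that ExtendedBufferedReader wraps a plain java.io.Reader.

// Hypothetical driver sketch for the deleted lexers; not part of this commit.
package org.apache.commons.csv;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

final class LexerDriverSketch {

    // Tokenizes a CSV string with CSVLexer1 and returns "TYPE:content" entries.
    static List<String> tokenize(final String csv) throws IOException {
        // Assumption: ExtendedBufferedReader can wrap any java.io.Reader.
        final Lexer lexer = new CSVLexer1(CSVFormat.DEFAULT,
                new ExtendedBufferedReader(new StringReader(csv)));
        final List<String> tokens = new ArrayList<String>();
        final Token tkn = new Token(); // reused across calls, as the Javadoc above requires
        Token result;
        do {
            tkn.reset();               // the caller is responsible for (re)initializing the Token
            result = lexer.nextToken(tkn);
            tokens.add(result.type + ":" + result.content);
        } while (result.type != Token.Type.EOF); // EOF terminates the token stream
        return tokens;
    }
}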
@@ -1,235 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.csv;

import static org.apache.commons.csv.Token.Type.COMMENT;
import static org.apache.commons.csv.Token.Type.EOF;
import static org.apache.commons.csv.Token.Type.EORECORD;
import static org.apache.commons.csv.Token.Type.INVALID;
import static org.apache.commons.csv.Token.Type.TOKEN;

import java.io.IOException;

/**
 *
 *
 * @version $Id$
 */
class CSVLexer1306663 extends Lexer {

    // ctor needs to be public so can be called dynamically by PerformanceTest class
    public CSVLexer1306663(final CSVFormat format, final ExtendedBufferedReader in) {
        super(format, in);
    }

    /**
     * Returns the next token.
     * <p/>
     * A token corresponds to a term, a record change or an end-of-file indicator.
     *
     * @param tkn an existing Token object to reuse. The caller is responsible to initialize the Token.
     * @return the next token found
     * @throws java.io.IOException on stream access error
     */
    @Override
    Token nextToken(final Token tkn) throws IOException {

        // get the last read char (required for empty line detection)
        int lastChar = in.getLastChar();

        // read the next char and set eol
        int c = in.read();

        if (isStartOfLine(lastChar) && isCommentStart(c)) {
            in.readLine();
            tkn.type = COMMENT;
            return tkn;
        }

        /* note: unfortunately isEndOfLine may consumes a character silently.
         * this has no effect outside of the method. so a simple workaround
         * is to call 'readAgain' on the stream...
         */
        boolean eol = readEndOfLine(c);
        c = in.getLastChar();

        // empty line detection: eol AND (last char was EOL or beginning)
        if (ignoreEmptyLines) {
            while (eol && isStartOfLine(lastChar)) {
                // go on char ahead ...
                lastChar = c;
                c = in.read();
                eol = readEndOfLine(c);
                c = in.getLastChar();
                // reached end of file without any content (empty line at the end)
                if (isEndOfFile(c)) {
                    tkn.type = EOF;
                    // don't set tkn.isReady here because no content
                    return tkn;
                }
            }
        }

        // did we reach eof during the last iteration already ? EOF
        if (isEndOfFile(lastChar) || (!isDelimiter(lastChar) && isEndOfFile(c))) {
            tkn.type = EOF;
            // don't set tkn.isReady here because no content
            return tkn;
        }

        // important: make sure a new char gets consumed in each iteration
        while (tkn.type == INVALID) {
            // ignore whitespaces at beginning of a token
            if (ignoreSurroundingSpaces) {
                while (isWhitespace(c) && !eol) {
                    c = in.read();
                    eol = readEndOfLine(c);
                }
            }

            // ok, start of token reached: encapsulated, or token
            if (isDelimiter(c)) {
                // empty token return TOKEN("")
                tkn.type = TOKEN;
            } else if (eol) {
                // empty token return EORECORD("")
                //noop: tkn.content.append("");
                tkn.type = EORECORD;
            } else if (isQuoteChar(c)) {
                // consume encapsulated token
                encapsulatedTokenLexer(tkn);
            } else if (isEndOfFile(c)) {
                // end of file return EOF()
                //noop: tkn.content.append("");
                tkn.type = EOF;
                tkn.isReady = true; // there is data at EOF
            } else {
                // next token must be a simple token
                // add removed blanks when not ignoring whitespace chars...
                simpleTokenLexer(tkn, c);
            }
        }
        return tkn;
    }

    /**
     * A simple token lexer
     * <p/>
     * Simple token are tokens which are not surrounded by encapsulators.
     * A simple token might contain escaped delimiters (as \, or \;). The
     * token is finished when one of the following conditions become true:
     * <ul>
     * <li>end of line has been reached (EORECORD)</li>
     * <li>end of stream has been reached (EOF)</li>
     * <li>an unescaped delimiter has been reached (TOKEN)</li>
     * </ul>
     *
     * @param tkn the current token
     * @param c the current character
     * @return the filled token
     * @throws IOException on stream access error
     */
    private Token simpleTokenLexer(final Token tkn, int c) throws IOException {
        // Faster to use while(true)+break than while(tkn.type == INVALID)
        while (true) {
            if (readEndOfLine(c)) {
                tkn.type = EORECORD;
                break;
            } else if (isEndOfFile(c)) {
                tkn.type = EOF;
                tkn.isReady = true; // There is data at EOF
                break;
            } else if (isDelimiter(c)) {
                tkn.type = TOKEN;
                break;
            } else if (isEscape(c)) {
                tkn.content.append((char) readEscape());
                c = in.read(); // continue
            } else {
                tkn.content.append((char) c);
                c = in.read(); // continue
            }
        }

        if (ignoreSurroundingSpaces) {
            trimTrailingSpaces(tkn.content);
        }

        return tkn;
    }

    /**
     * An encapsulated token lexer
     * <p/>
     * Encapsulated tokens are surrounded by the given encapsulating-string.
     * The encapsulator itself might be included in the token using a
     * doubling syntax (as "", '') or using escaping (as in \", \').
     * Whitespaces before and after an encapsulated token are ignored.
     *
     * @param tkn the current token
     * @return a valid token object
     * @throws IOException on invalid state
     */
    private Token encapsulatedTokenLexer(final Token tkn) throws IOException {
        // save current line
        final long startLineNumber = getCurrentLineNumber();
        // ignore the given delimiter
        // assert c == delimiter;
        int c;
        while (true) {
            c = in.read();

            if (isEscape(c)) {
                tkn.content.append((char) readEscape());
            } else if (isQuoteChar(c)) {
                if (isQuoteChar(in.lookAhead())) {
                    // double or escaped encapsulator -> add single encapsulator to token
                    c = in.read();
                    tkn.content.append((char) c);
                } else {
                    // token finish mark (encapsulator) reached: ignore whitespace till delimiter
                    while (true) {
                        c = in.read();
                        if (isDelimiter(c)) {
                            tkn.type = TOKEN;
                            return tkn;
                        } else if (isEndOfFile(c)) {
                            tkn.type = EOF;
                            tkn.isReady = true; // There is data at EOF
                            return tkn;
                        } else if (readEndOfLine(c)) {
                            // ok eo token reached
                            tkn.type = EORECORD;
                            return tkn;
                        } else if (!isWhitespace(c)) {
                            // error invalid char between token and next delimiter
                            throw new IOException("(line " + getCurrentLineNumber() + ") invalid char between encapsulated token and delimiter");
                        }
                    }
                }
            } else if (isEndOfFile(c)) {
                // error condition (end of file before end of token)
                throw new IOException("(startline " + startLineNumber + ") EOF reached before encapsulated token finished");
            } else {
                // consume character
                tkn.content.append((char) c);
            }
        }
    }

}
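The encapsulatedTokenLexer Javadoc above describes the doubling syntax: a "" inside a quoted field yields a single quote character in the token. A small, hedged example of that behaviour through the public Commons CSV API (CSVParser.parse and CSVFormat.DEFAULT as found in released 1.x versions, not the package-private lexer deleted here):

import java.io.IOException;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

public class DoubledQuoteExample {
    public static void main(final String[] args) throws IOException {
        // "a""b",c uses the doubling syntax: "" inside a quoted field becomes one "
        try (CSVParser parser = CSVParser.parse("\"a\"\"b\",c", CSVFormat.DEFAULT)) {
            for (final CSVRecord record : parser) {
                System.out.println(record.get(0)); // prints: a"b
                System.out.println(record.get(1)); // prints: c
            }
        }
    }
}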
@@ -1,235 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.csv;

import static org.apache.commons.csv.Token.Type.COMMENT;
import static org.apache.commons.csv.Token.Type.EOF;
import static org.apache.commons.csv.Token.Type.EORECORD;
import static org.apache.commons.csv.Token.Type.INVALID;
import static org.apache.commons.csv.Token.Type.TOKEN;

import java.io.IOException;

/**
 *
 *
 * @version $Id$
 */
class CSVLexer1306667 extends Lexer {

    // ctor needs to be public so can be called dynamically by PerformanceTest class
    public CSVLexer1306667(final CSVFormat format, final ExtendedBufferedReader in) {
        super(format, in);
    }

    /**
     * Returns the next token.
     * <p/>
     * A token corresponds to a term, a record change or an end-of-file indicator.
     *
     * @param tkn an existing Token object to reuse. The caller is responsible to initialize the Token.
     * @return the next token found
     * @throws java.io.IOException on stream access error
     */
    @Override
    Token nextToken(final Token tkn) throws IOException {

        // get the last read char (required for empty line detection)
        int lastChar = in.getLastChar();

        // read the next char and set eol
        int c = in.read();

        /* note: unfortunately isEndOfLine may consumes a character silently.
         * this has no effect outside of the method. so a simple workaround
         * is to call 'readAgain' on the stream...
         */
        boolean eol = readEndOfLine(c);
        c = in.getLastChar();

        // empty line detection: eol AND (last char was EOL or beginning)
        if (ignoreEmptyLines) {
            while (eol && isStartOfLine(lastChar)) {
                // go on char ahead ...
                lastChar = c;
                c = in.read();
                eol = readEndOfLine(c);
                c = in.getLastChar();
                // reached end of file without any content (empty line at the end)
                if (isEndOfFile(c)) {
                    tkn.type = EOF;
                    // don't set tkn.isReady here because no content
                    return tkn;
                }
            }
        }

        // did we reach eof during the last iteration already ? EOF
        if (isEndOfFile(lastChar) || (!isDelimiter(lastChar) && isEndOfFile(c))) {
            tkn.type = EOF;
            // don't set tkn.isReady here because no content
            return tkn;
        }

        if (isStartOfLine(lastChar) && isCommentStart(c)) {
            in.readLine();
            tkn.type = COMMENT;
            return tkn;
        }

        // important: make sure a new char gets consumed in each iteration
        while (tkn.type == INVALID) {
            // ignore whitespaces at beginning of a token
            if (ignoreSurroundingSpaces) {
                while (isWhitespace(c) && !eol) {
                    c = in.read();
                    eol = readEndOfLine(c);
                }
            }

            // ok, start of token reached: encapsulated, or token
            if (isDelimiter(c)) {
                // empty token return TOKEN("")
                tkn.type = TOKEN;
            } else if (eol) {
                // empty token return EORECORD("")
                //noop: tkn.content.append("");
                tkn.type = EORECORD;
            } else if (isQuoteChar(c)) {
                // consume encapsulated token
                encapsulatedTokenLexer(tkn);
            } else if (isEndOfFile(c)) {
                // end of file return EOF()
                //noop: tkn.content.append("");
                tkn.type = EOF;
                tkn.isReady = true; // there is data at EOF
            } else {
                // next token must be a simple token
                // add removed blanks when not ignoring whitespace chars...
                simpleTokenLexer(tkn, c);
            }
        }
        return tkn;
    }

    /**
     * A simple token lexer
     * <p/>
     * Simple token are tokens which are not surrounded by encapsulators.
     * A simple token might contain escaped delimiters (as \, or \;). The
     * token is finished when one of the following conditions become true:
     * <ul>
     * <li>end of line has been reached (EORECORD)</li>
     * <li>end of stream has been reached (EOF)</li>
     * <li>an unescaped delimiter has been reached (TOKEN)</li>
     * </ul>
     *
     * @param tkn the current token
     * @param c the current character
     * @return the filled token
     * @throws IOException on stream access error
     */
    private Token simpleTokenLexer(final Token tkn, int c) throws IOException {
        // Faster to use while(true)+break than while(tkn.type == INVALID)
        while (true) {
            if (readEndOfLine(c)) {
                tkn.type = EORECORD;
                break;
            } else if (isEndOfFile(c)) {
                tkn.type = EOF;
                tkn.isReady = true; // There is data at EOF
                break;
            } else if (isDelimiter(c)) {
                tkn.type = TOKEN;
                break;
            } else if (isEscape(c)) {
                tkn.content.append((char) readEscape());
                c = in.read(); // continue
            } else {
                tkn.content.append((char) c);
                c = in.read(); // continue
            }
        }

        if (ignoreSurroundingSpaces) {
            trimTrailingSpaces(tkn.content);
        }

        return tkn;
    }

    /**
     * An encapsulated token lexer
     * <p/>
     * Encapsulated tokens are surrounded by the given encapsulating-string.
     * The encapsulator itself might be included in the token using a
     * doubling syntax (as "", '') or using escaping (as in \", \').
     * Whitespaces before and after an encapsulated token are ignored.
     *
     * @param tkn the current token
     * @return a valid token object
     * @throws IOException on invalid state
     */
    private Token encapsulatedTokenLexer(final Token tkn) throws IOException {
        // save current line
        final long startLineNumber = getCurrentLineNumber();
        // ignore the given delimiter
        // assert c == delimiter;
        int c;
        while (true) {
            c = in.read();

            if (isEscape(c)) {
                tkn.content.append((char) readEscape());
            } else if (isQuoteChar(c)) {
                if (isQuoteChar(in.lookAhead())) {
                    // double or escaped encapsulator -> add single encapsulator to token
                    c = in.read();
                    tkn.content.append((char) c);
                } else {
                    // token finish mark (encapsulator) reached: ignore whitespace till delimiter
                    while (true) {
                        c = in.read();
                        if (isDelimiter(c)) {
                            tkn.type = TOKEN;
                            return tkn;
                        } else if (isEndOfFile(c)) {
                            tkn.type = EOF;
                            tkn.isReady = true; // There is data at EOF
                            return tkn;
                        } else if (readEndOfLine(c)) {
                            // ok eo token reached
                            tkn.type = EORECORD;
                            return tkn;
                        } else if (!isWhitespace(c)) {
                            // error invalid char between token and next delimiter
                            throw new IOException("(line " + getCurrentLineNumber() + ") invalid char between encapsulated token and delimiter");
                        }
                    }
                }
            } else if (isEndOfFile(c)) {
                // error condition (end of file before end of token)
                throw new IOException("(startline " + startLineNumber + ") EOF reached before encapsulated token finished");
            } else {
                // consume character
                tkn.content.append((char) c);
            }
        }
    }

}
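Like the other deleted lexers, CSVLexer1306667 skips leading whitespace and calls trimTrailingSpaces only when ignoreSurroundingSpaces is set. A hedged sketch of how that option looks from the public API, assuming the CSVFormat.withIgnoreSurroundingSpaces(boolean) method available in released 1.x versions:

import java.io.IOException;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

public class SurroundingSpacesExample {
    public static void main(final String[] args) throws IOException {
        // With ignoreSurroundingSpaces enabled, leading blanks are skipped and trailing
        // blanks are trimmed, which is what the lexer's trimTrailingSpaces call does internally.
        final CSVFormat format = CSVFormat.DEFAULT.withIgnoreSurroundingSpaces(true);
        try (CSVParser parser = CSVParser.parse("  a  , b ", format)) {
            for (final CSVRecord record : parser) {
                System.out.println("[" + record.get(0) + "][" + record.get(1) + "]"); // prints: [a][b]
            }
        }
    }
}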
@@ -1,254 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.csv;

import static org.apache.commons.csv.Constants.END_OF_STREAM;
import static org.apache.commons.csv.Token.Type.COMMENT;
import static org.apache.commons.csv.Token.Type.EOF;
import static org.apache.commons.csv.Token.Type.EORECORD;
import static org.apache.commons.csv.Token.Type.INVALID;
import static org.apache.commons.csv.Token.Type.TOKEN;

import java.io.IOException;

/**
 * Experimental Lexer using enums to keep track of state and character type.
 * Unfortunately it is twice as slow.
 * For reference purpose only.
 *
 * @version $Id$
 */
class CSVLexer3 extends Lexer {

    private final char escape;

    // ctor needs to be public so can be called dynamically by PerformanceTest class
    public CSVLexer3(final CSVFormat format, final ExtendedBufferedReader in) {
        super(format, in);
        this.escape = format.getEscape();
    }

    /**
     * Classify the character types
     */
    private static enum CharType {
        DELIM,
        ESCAPE,
        ENCAP,
        EOL,
        COMMENT_START,
        WHITESPACE,
        OTHER,
        EOFCHAR
    }

    private CharType classify(final int intch) {
        if (isDelimiter(intch)) {
            return CharType.DELIM;
        }
        if (isCommentStart(intch)) {
            return CharType.COMMENT_START;
        }
        if (isQuoteChar(intch)) {
            return CharType.ENCAP;
        }
        if (isEscape(intch)) {
            return CharType.ESCAPE;
        }
        if (intch == '\r' || intch == '\n') {
            return CharType.EOL;
        }
        if (isWhitespace(intch)) { // Must be after EOL check
            return CharType.WHITESPACE;
        }
        if (intch == END_OF_STREAM) {
            return CharType.EOFCHAR;
        }
        return CharType.OTHER;
    }

    /**
     * Parsing states
     */
    private static enum State {
        BEGIN, PLAIN, INQUOTE, QUOTEQUOTE, ESCAPE_PLAIN, ESCAPE_QUOTE,
    }

    /**
     * Returns the next token.
     * <p/>
     * A token corresponds to a term, a record change or an end-of-file indicator.
     *
     * @param tkn an existing Token object to reuse. The caller is responsible to initialize the Token.
     * @return the next token found
     * @throws java.io.IOException on stream access error
     */
    @Override
    Token nextToken(final Token tkn) throws IOException {

        State state = State.BEGIN;
        int intch;
        boolean trimTrailingSpaces = false;
        while (tkn.type == INVALID) {
            intch = in.read();
            final CharType type = classify(intch);
            switch (state) {
            case BEGIN:
                switch (type) {
                case COMMENT_START:
                    in.readLine();
                    tkn.type = COMMENT;
                    break;
                case ENCAP:
                    state = State.INQUOTE;
                    break;
                case DELIM:
                    tkn.type = TOKEN;
                    break;
                case EOL:
                    tkn.type = EORECORD;
                    break;
                case EOFCHAR:
                    tkn.type = EOF;
                    break;
                case ESCAPE:
                    state = State.ESCAPE_PLAIN;
                    break;
                case OTHER:
                    tkn.content.append((char) intch);
                    state = State.PLAIN;
                    break;
                case WHITESPACE:
                    if (!ignoreSurroundingSpaces) {
                        tkn.content.append((char) intch);
                        state = State.PLAIN;
                    }
                    break;
                }
                break;
            case PLAIN:
                switch (type) {
                case DELIM:
                    tkn.type = TOKEN;
                    break;
                case EOL:
                    tkn.type = EORECORD;
                    break;
                case EOFCHAR:
                    tkn.type = EOF;
                    break;
                case ESCAPE:
                    state = State.ESCAPE_PLAIN;
                    break;
                default:
                    trimTrailingSpaces = ignoreSurroundingSpaces; // we have a plain token
                    tkn.content.append((char) intch);
                    break;
                }
                break;
            case INQUOTE: // Started a quoted string
                switch (type) {
                case ENCAP:
                    state = State.QUOTEQUOTE;
                    break;
                case ESCAPE:
                    state = State.ESCAPE_QUOTE;
                    break;
                case EOFCHAR:
                    throw new IOException("(line " + getCurrentLineNumber() + ") unexpected EOF in quoted string");
                default:
                    tkn.content.append((char) intch);
                    break;
                }
                break;
            case QUOTEQUOTE: // "..." seen, expecting end of token or "
                switch (type) {
                case DELIM:
                    tkn.type = TOKEN;
                    break;
                case EOL:
                    tkn.type = EORECORD;
                    break;
                case EOFCHAR:
                    tkn.type = EOF;
                    break;
                case ENCAP: // "..."" seen, append it
                    tkn.content.append((char) intch);
                    state = State.INQUOTE;
                    break;
                case WHITESPACE: // trailing whitespace may be allowed
                    if (!ignoreSurroundingSpaces) {
                        // error invalid char between token and next delimiter
                        throw new IOException("(line " + getCurrentLineNumber() + ") invalid char between encapsulated token and delimiter");
                    }
                    break;
                // Everything else is invalid
                case ESCAPE:
                case OTHER:
                case COMMENT_START:
                    // error invalid char between token and next delimiter
                    throw new IOException("(line " + getCurrentLineNumber() + ") invalid char between encapsulated token and delimiter");
                }
                break;
            case ESCAPE_PLAIN:
                switch (type) {
                case DELIM:
                case ESCAPE:
                case EOL:
                    tkn.content.append((char) intch);
                    state = State.PLAIN;
                    break;
                case COMMENT_START: // TODO should comment be escaped?
                case ENCAP: // TODO is this correct?
                case OTHER: // TODO may need to escape further
                case WHITESPACE:
                    tkn.content.append(escape);
                    tkn.content.append((char) intch);
                    break;
                case EOFCHAR:
                    throw new IOException("(line " + getCurrentLineNumber() + ") unexpected EOF in escape sequence");
                }
                break;
            case ESCAPE_QUOTE:
                switch (type) {
                case ESCAPE:
                case ENCAP: // this is the only required escape
                    tkn.content.append((char) intch);
                    break;
                case COMMENT_START:
                case DELIM:
                case EOL:
                case OTHER:
                case WHITESPACE:
                    tkn.content.append(escape);
                    tkn.content.append((char) intch);
                    break;
                case EOFCHAR:
                    throw new IOException("(line " + getCurrentLineNumber() + ") unexpected EOF in escape sequence");
                }
                break;
            default:
                break;
            }
        }
        if (trimTrailingSpaces) {
            trimTrailingSpaces(tkn.content);
        }
        return tkn;
    }
}
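CSVLexer3's class Javadoc explains the experiment it documents: classify every character into a CharType, then dispatch on an explicit State enum, which turned out to be roughly twice as slow as the hand-rolled if/else lexers. A stripped-down, self-contained illustration of that classify-then-dispatch pattern (hypothetical class and method names, not part of Commons CSV):

import java.util.ArrayList;
import java.util.List;

// Hypothetical sketch of the enum state-machine pattern used by CSVLexer3:
// each character is mapped to a CharType, then a switch on the current State decides the transition.
public class EnumStateMachineSketch {

    private enum CharType { DELIM, ENCAP, OTHER, END }
    private enum State { BEGIN, PLAIN, INQUOTE, QUOTEQUOTE }

    private static CharType classify(final int ch) {
        if (ch == -1)  { return CharType.END; }
        if (ch == ',') { return CharType.DELIM; }
        if (ch == '"') { return CharType.ENCAP; }
        return CharType.OTHER;
    }

    /** Splits a single CSV line into fields, honouring the "" doubling syntax. */
    static List<String> split(final String line) {
        final List<String> fields = new ArrayList<String>();
        final StringBuilder field = new StringBuilder();
        State state = State.BEGIN;
        for (int i = 0; i <= line.length(); i++) {
            final int ch = i < line.length() ? line.charAt(i) : -1; // -1 marks end of input
            final CharType type = classify(ch);
            switch (state) {
            case BEGIN:
            case PLAIN:
                if (type == CharType.DELIM || type == CharType.END) {
                    fields.add(field.toString());    // delimiter or end closes the field
                    field.setLength(0);
                    state = State.BEGIN;
                } else if (type == CharType.ENCAP && state == State.BEGIN) {
                    state = State.INQUOTE;           // opening quote starts a quoted field
                } else {
                    field.append((char) ch);
                    state = State.PLAIN;
                }
                break;
            case INQUOTE:
                if (type == CharType.ENCAP) {
                    state = State.QUOTEQUOTE;        // maybe end of field, maybe an escaped quote
                } else {
                    field.append((char) ch);
                }
                break;
            case QUOTEQUOTE:
                if (type == CharType.ENCAP) {
                    field.append('"');               // "" inside quotes -> literal quote
                    state = State.INQUOTE;
                } else {                             // delimiter or end closes the field
                    fields.add(field.toString());
                    field.setLength(0);
                    state = State.BEGIN;
                }
                break;
            }
        }
        return fields;
    }

    public static void main(final String[] args) {
        System.out.println(split("a,\"b\"\"c\",d")); // prints: [a, b"c, d]
    }
}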