Merge Lexer with CSVLexer
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/csv/trunk@1511006 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
7755640784
commit
7b168ebbbe
|
@ -17,6 +17,13 @@
|
||||||
|
|
||||||
package org.apache.commons.csv;
|
package org.apache.commons.csv;
|
||||||
|
|
||||||
|
import static org.apache.commons.csv.Constants.BACKSPACE;
|
||||||
|
import static org.apache.commons.csv.Constants.CR;
|
||||||
|
import static org.apache.commons.csv.Constants.END_OF_STREAM;
|
||||||
|
import static org.apache.commons.csv.Constants.FF;
|
||||||
|
import static org.apache.commons.csv.Constants.LF;
|
||||||
|
import static org.apache.commons.csv.Constants.TAB;
|
||||||
|
import static org.apache.commons.csv.Constants.UNDEFINED;
|
||||||
import static org.apache.commons.csv.Token.Type.COMMENT;
|
import static org.apache.commons.csv.Token.Type.COMMENT;
|
||||||
import static org.apache.commons.csv.Token.Type.EOF;
|
import static org.apache.commons.csv.Token.Type.EOF;
|
||||||
import static org.apache.commons.csv.Token.Type.EORECORD;
|
import static org.apache.commons.csv.Token.Type.EORECORD;
|
||||||
|
@ -30,11 +37,38 @@ import java.io.IOException;
|
||||||
*
|
*
|
||||||
* @version $Id$
|
* @version $Id$
|
||||||
*/
|
*/
|
||||||
final class CSVLexer extends Lexer {
|
final class CSVLexer {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constant char to use for disabling comments, escapes and encapsulation. The value {@code '\ufffe'} (that is,
|
||||||
|
* {@code (char) -2}) is used because it won't be confused with an EOF signal (-1), and because {@code U+FFFE} is a
|
||||||
|
* Unicode noncharacter (a single char, never assigned to real text), so there should never be a collision with a real text char.
|
||||||
|
*/
|
||||||
|
private static final char DISABLED = '\ufffe';
|
||||||
|
|
||||||
|
private final char delimiter;
|
||||||
|
private final char escape;
|
||||||
|
private final char quoteChar;
|
||||||
|
private final char commmentStart;
|
||||||
|
|
||||||
|
final boolean ignoreSurroundingSpaces;
|
||||||
|
final boolean ignoreEmptyLines;
|
||||||
|
|
||||||
|
final CSVFormat format;
|
||||||
|
|
||||||
|
/** The input stream */
|
||||||
|
final ExtendedBufferedReader in;
|
||||||
|
|
||||||
/** INTERNAL API. ctor needs to be public so can be called dynamically by PerformanceTest class */
|
/** INTERNAL API. ctor needs to be public so can be called dynamically by PerformanceTest class */
|
||||||
CSVLexer(final CSVFormat format, final ExtendedBufferedReader in) {
|
CSVLexer(final CSVFormat format, final ExtendedBufferedReader in) {
|
||||||
super(format, in);
|
this.format = format;
|
||||||
|
this.in = in;
|
||||||
|
this.delimiter = format.getDelimiter();
|
||||||
|
this.escape = mapNullToDisabled(format.getEscape());
|
||||||
|
this.quoteChar = mapNullToDisabled(format.getQuoteChar());
|
||||||
|
this.commmentStart = mapNullToDisabled(format.getCommentStart());
|
||||||
|
this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
|
||||||
|
this.ignoreEmptyLines = format.getIgnoreEmptyLines();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -48,7 +82,6 @@ final class CSVLexer extends Lexer {
|
||||||
* @throws java.io.IOException
|
* @throws java.io.IOException
|
||||||
* on stream access error
|
* on stream access error
|
||||||
*/
|
*/
|
||||||
@Override
|
|
||||||
Token nextToken(final Token token) throws IOException {
|
Token nextToken(final Token token) throws IOException {
|
||||||
|
|
||||||
// get the last read char (required for empty line detection)
|
// get the last read char (required for empty line detection)
|
||||||
|
@ -257,4 +290,144 @@ final class CSVLexer extends Lexer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private final char mapNullToDisabled(final Character c) {
|
||||||
|
return c == null ? DISABLED : c.charValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the current line number
|
||||||
|
*
|
||||||
|
* @return the current line number
|
||||||
|
*/
|
||||||
|
long getCurrentLineNumber() {
|
||||||
|
return in.getCurrentLineNumber();
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO escape handling needs more work
|
||||||
|
/**
|
||||||
|
* Handle an escape sequence.
|
||||||
|
* The current character must be the escape character.
|
||||||
|
* On return, the next character is available by calling {@link ExtendedBufferedReader#getLastChar()}
|
||||||
|
* on the input stream.
|
||||||
|
*
|
||||||
|
* @return the unescaped character (as an int) or {@link Constants#END_OF_STREAM} if char following the escape is invalid.
|
||||||
|
* @throws IOException if there is a problem reading the stream or the end of stream is detected:
|
||||||
|
* the escape character is not allowed at end of stream
|
||||||
|
*/
|
||||||
|
int readEscape() throws IOException {
|
||||||
|
// the escape char has just been read (normally a backslash)
|
||||||
|
final int ch = in.read();
|
||||||
|
switch (ch) {
|
||||||
|
case 'r':
|
||||||
|
return CR;
|
||||||
|
case 'n':
|
||||||
|
return LF;
|
||||||
|
case 't':
|
||||||
|
return TAB;
|
||||||
|
case 'b':
|
||||||
|
return BACKSPACE;
|
||||||
|
case 'f':
|
||||||
|
return FF;
|
||||||
|
case CR:
|
||||||
|
case LF:
|
||||||
|
case FF: // TODO is this correct?
|
||||||
|
case TAB: // TODO is this correct? Do tabs need to be escaped?
|
||||||
|
case BACKSPACE: // TODO is this correct?
|
||||||
|
return ch;
|
||||||
|
case END_OF_STREAM:
|
||||||
|
throw new IOException("EOF whilst processing escape sequence");
|
||||||
|
default:
|
||||||
|
// Now check for meta-characters
|
||||||
|
if (isMetaChar(ch)) {
|
||||||
|
return ch;
|
||||||
|
}
|
||||||
|
// indicate unexpected char - available from in.getLastChar()
|
||||||
|
return END_OF_STREAM;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void trimTrailingSpaces(final StringBuilder buffer) {
|
||||||
|
int length = buffer.length();
|
||||||
|
while (length > 0 && Character.isWhitespace(buffer.charAt(length - 1))) {
|
||||||
|
length = length - 1;
|
||||||
|
}
|
||||||
|
if (length != buffer.length()) {
|
||||||
|
buffer.setLength(length);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Greedily accepts \n, \r and \r\n. This checker silently consumes the second control-character...
|
||||||
|
*
|
||||||
|
* @return true if the given or next character is a line-terminator
|
||||||
|
*/
|
||||||
|
boolean readEndOfLine(int ch) throws IOException {
|
||||||
|
// check if we have \r\n...
|
||||||
|
if (ch == CR && in.lookAhead() == LF) {
|
||||||
|
// note: does not change ch outside of this method!
|
||||||
|
ch = in.read();
|
||||||
|
}
|
||||||
|
return ch == LF || ch == CR;
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean isClosed() {
|
||||||
|
return in.isClosed();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return true if the given char is a whitespace character
|
||||||
|
*/
|
||||||
|
boolean isWhitespace(final int ch) {
|
||||||
|
return !isDelimiter(ch) && Character.isWhitespace((char) ch);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks if the current character represents the start of a line: a CR, LF or is at the start of the file.
|
||||||
|
*
|
||||||
|
* @param ch the character to check
|
||||||
|
* @return true if the character is at the start of a line.
|
||||||
|
*/
|
||||||
|
boolean isStartOfLine(final int ch) {
|
||||||
|
return ch == LF || ch == CR || ch == UNDEFINED;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return true if the given character indicates end of file
|
||||||
|
*/
|
||||||
|
boolean isEndOfFile(final int ch) {
|
||||||
|
return ch == END_OF_STREAM;
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean isDelimiter(final int ch) {
|
||||||
|
return ch == delimiter;
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean isEscape(final int ch) {
|
||||||
|
return ch == escape;
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean isQuoteChar(final int ch) {
|
||||||
|
return ch == quoteChar;
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean isCommentStart(final int ch) {
|
||||||
|
return ch == commmentStart;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isMetaChar(final int ch) {
|
||||||
|
return ch == delimiter ||
|
||||||
|
ch == escape ||
|
||||||
|
ch == quoteChar ||
|
||||||
|
ch == commmentStart;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Closes resources.
|
||||||
|
*
|
||||||
|
* @throws IOException
|
||||||
|
* If an I/O error occurs
|
||||||
|
*/
|
||||||
|
void close() throws IOException {
|
||||||
|
in.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -217,7 +217,7 @@ public final class CSVParser implements Iterable<CSVRecord>, Closeable {
|
||||||
private final CSVFormat format;
|
private final CSVFormat format;
|
||||||
private final Map<String, Integer> headerMap;
|
private final Map<String, Integer> headerMap;
|
||||||
|
|
||||||
private final Lexer lexer;
|
private final CSVLexer lexer;
|
||||||
|
|
||||||
/** A record buffer for getRecord(). Grows as necessary and is reused. */
|
/** A record buffer for getRecord(). Grows as necessary and is reused. */
|
||||||
private final List<String> record = new ArrayList<String>();
|
private final List<String> record = new ArrayList<String>();
|
||||||
|
|
|
@ -1,211 +0,0 @@
|
||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.apache.commons.csv;
|
|
||||||
|
|
||||||
import static org.apache.commons.csv.Constants.BACKSPACE;
|
|
||||||
import static org.apache.commons.csv.Constants.CR;
|
|
||||||
import static org.apache.commons.csv.Constants.END_OF_STREAM;
|
|
||||||
import static org.apache.commons.csv.Constants.FF;
|
|
||||||
import static org.apache.commons.csv.Constants.LF;
|
|
||||||
import static org.apache.commons.csv.Constants.TAB;
|
|
||||||
import static org.apache.commons.csv.Constants.UNDEFINED;
|
|
||||||
|
|
||||||
import java.io.Closeable;
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Abstract lexer class; contains common utility routines shared by lexers
|
|
||||||
*
|
|
||||||
* @version $Id$
|
|
||||||
*/
|
|
||||||
abstract class Lexer implements Closeable {
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Constant char to use for disabling comments, escapes and encapsulation. The value -2 is used because it
|
|
||||||
* won't be confused with an EOF signal (-1), and because the Unicode value {@code FFFE} would be encoded as two
|
|
||||||
* chars (using surrogates) and thus there should never be a collision with a real text char.
|
|
||||||
*/
|
|
||||||
private static final char DISABLED = '\ufffe';
|
|
||||||
|
|
||||||
private final char delimiter;
|
|
||||||
private final char escape;
|
|
||||||
private final char quoteChar;
|
|
||||||
private final char commmentStart;
|
|
||||||
|
|
||||||
final boolean ignoreSurroundingSpaces;
|
|
||||||
final boolean ignoreEmptyLines;
|
|
||||||
|
|
||||||
final CSVFormat format;
|
|
||||||
|
|
||||||
/** The input stream */
|
|
||||||
final ExtendedBufferedReader in;
|
|
||||||
|
|
||||||
Lexer(final CSVFormat format, final ExtendedBufferedReader in) {
|
|
||||||
this.format = format;
|
|
||||||
this.in = in;
|
|
||||||
this.delimiter = format.getDelimiter();
|
|
||||||
this.escape = mapNullToDisabled(format.getEscape());
|
|
||||||
this.quoteChar = mapNullToDisabled(format.getQuoteChar());
|
|
||||||
this.commmentStart = mapNullToDisabled(format.getCommentStart());
|
|
||||||
this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
|
|
||||||
this.ignoreEmptyLines = format.getIgnoreEmptyLines();
|
|
||||||
}
|
|
||||||
|
|
||||||
private final char mapNullToDisabled(final Character c) {
|
|
||||||
return c == null ? DISABLED : c.charValue();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the current line number
|
|
||||||
*
|
|
||||||
* @return the current line number
|
|
||||||
*/
|
|
||||||
long getCurrentLineNumber() {
|
|
||||||
return in.getCurrentLineNumber();
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO escape handling needs more work
|
|
||||||
/**
|
|
||||||
* Handle an escape sequence.
|
|
||||||
* The current character must be the escape character.
|
|
||||||
* On return, the next character is available by calling {@link ExtendedBufferedReader#getLastChar()}
|
|
||||||
* on the input stream.
|
|
||||||
*
|
|
||||||
* @return the unescaped character (as an int) or {@link END_OF_STREAM} if char following the escape is invalid.
|
|
||||||
* @throws IOException if there is a problem reading the stream or the end of stream is detected:
|
|
||||||
* the escape character is not allowed at end of strem
|
|
||||||
*/
|
|
||||||
int readEscape() throws IOException {
|
|
||||||
// the escape char has just been read (normally a backslash)
|
|
||||||
final int ch = in.read();
|
|
||||||
switch (ch) {
|
|
||||||
case 'r':
|
|
||||||
return CR;
|
|
||||||
case 'n':
|
|
||||||
return LF;
|
|
||||||
case 't':
|
|
||||||
return TAB;
|
|
||||||
case 'b':
|
|
||||||
return BACKSPACE;
|
|
||||||
case 'f':
|
|
||||||
return FF;
|
|
||||||
case CR:
|
|
||||||
case LF:
|
|
||||||
case FF: // TODO is this correct?
|
|
||||||
case TAB: // TODO is this correct? Do tabs need to be escaped?
|
|
||||||
case BACKSPACE: // TODO is this correct?
|
|
||||||
return ch;
|
|
||||||
case END_OF_STREAM:
|
|
||||||
throw new IOException("EOF whilst processing escape sequence");
|
|
||||||
default:
|
|
||||||
// Now check for meta-characters
|
|
||||||
if (isMetaChar(ch)) {
|
|
||||||
return ch;
|
|
||||||
}
|
|
||||||
// indicate unexpected char - available from in.getLastChar()
|
|
||||||
return END_OF_STREAM;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void trimTrailingSpaces(final StringBuilder buffer) {
|
|
||||||
int length = buffer.length();
|
|
||||||
while (length > 0 && Character.isWhitespace(buffer.charAt(length - 1))) {
|
|
||||||
length = length - 1;
|
|
||||||
}
|
|
||||||
if (length != buffer.length()) {
|
|
||||||
buffer.setLength(length);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Greedily accepts \n, \r and \r\n This checker consumes silently the second control-character...
|
|
||||||
*
|
|
||||||
* @return true if the given or next character is a line-terminator
|
|
||||||
*/
|
|
||||||
boolean readEndOfLine(int ch) throws IOException {
|
|
||||||
// check if we have \r\n...
|
|
||||||
if (ch == CR && in.lookAhead() == LF) {
|
|
||||||
// note: does not change ch outside of this method!
|
|
||||||
ch = in.read();
|
|
||||||
}
|
|
||||||
return ch == LF || ch == CR;
|
|
||||||
}
|
|
||||||
|
|
||||||
abstract Token nextToken(Token reusableToken) throws IOException;
|
|
||||||
|
|
||||||
boolean isClosed() {
|
|
||||||
return in.isClosed();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @return true if the given char is a whitespace character
|
|
||||||
*/
|
|
||||||
boolean isWhitespace(final int ch) {
|
|
||||||
return !isDelimiter(ch) && Character.isWhitespace((char) ch);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Checks if the current character represents the start of a line: a CR, LF or is at the start of the file.
|
|
||||||
*
|
|
||||||
* @param ch the character to check
|
|
||||||
* @return true if the character is at the start of a line.
|
|
||||||
*/
|
|
||||||
boolean isStartOfLine(final int ch) {
|
|
||||||
return ch == LF || ch == CR || ch == UNDEFINED;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @return true if the given character indicates end of file
|
|
||||||
*/
|
|
||||||
boolean isEndOfFile(final int ch) {
|
|
||||||
return ch == END_OF_STREAM;
|
|
||||||
}
|
|
||||||
|
|
||||||
boolean isDelimiter(final int ch) {
|
|
||||||
return ch == delimiter;
|
|
||||||
}
|
|
||||||
|
|
||||||
boolean isEscape(final int ch) {
|
|
||||||
return ch == escape;
|
|
||||||
}
|
|
||||||
|
|
||||||
boolean isQuoteChar(final int ch) {
|
|
||||||
return ch == quoteChar;
|
|
||||||
}
|
|
||||||
|
|
||||||
boolean isCommentStart(final int ch) {
|
|
||||||
return ch == commmentStart;
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean isMetaChar(final int ch) {
|
|
||||||
return ch == delimiter ||
|
|
||||||
ch == escape ||
|
|
||||||
ch == quoteChar ||
|
|
||||||
ch == commmentStart;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Closes resources.
|
|
||||||
*
|
|
||||||
* @throws IOException
|
|
||||||
* If an I/O error occurs
|
|
||||||
*/
|
|
||||||
public void close() throws IOException {
|
|
||||||
in.close();
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -52,14 +52,14 @@ public class CSVLexerTest {
|
||||||
formatWithEscaping = CSVFormat.DEFAULT.withEscape('\\');
|
formatWithEscaping = CSVFormat.DEFAULT.withEscape('\\');
|
||||||
}
|
}
|
||||||
|
|
||||||
private Lexer getLexer(final String input, final CSVFormat format) {
|
private CSVLexer getLexer(final String input, final CSVFormat format) {
|
||||||
return new CSVLexer(format, new ExtendedBufferedReader(new StringReader(input)));
|
return new CSVLexer(format, new ExtendedBufferedReader(new StringReader(input)));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testSurroundingSpacesAreDeleted() throws IOException {
|
public void testSurroundingSpacesAreDeleted() throws IOException {
|
||||||
final String code = "noSpaces, leadingSpaces,trailingSpaces , surroundingSpaces , ,,";
|
final String code = "noSpaces, leadingSpaces,trailingSpaces , surroundingSpaces , ,,";
|
||||||
final Lexer parser = getLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces(true));
|
final CSVLexer parser = getLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces(true));
|
||||||
assertThat(parser.nextToken(new Token()), matches(TOKEN, "noSpaces"));
|
assertThat(parser.nextToken(new Token()), matches(TOKEN, "noSpaces"));
|
||||||
assertThat(parser.nextToken(new Token()), matches(TOKEN, "leadingSpaces"));
|
assertThat(parser.nextToken(new Token()), matches(TOKEN, "leadingSpaces"));
|
||||||
assertThat(parser.nextToken(new Token()), matches(TOKEN, "trailingSpaces"));
|
assertThat(parser.nextToken(new Token()), matches(TOKEN, "trailingSpaces"));
|
||||||
|
@ -72,7 +72,7 @@ public class CSVLexerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testSurroundingTabsAreDeleted() throws IOException {
|
public void testSurroundingTabsAreDeleted() throws IOException {
|
||||||
final String code = "noTabs,\tleadingTab,trailingTab\t,\tsurroundingTabs\t,\t\t,,";
|
final String code = "noTabs,\tleadingTab,trailingTab\t,\tsurroundingTabs\t,\t\t,,";
|
||||||
final Lexer parser = getLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces(true));
|
final CSVLexer parser = getLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces(true));
|
||||||
assertThat(parser.nextToken(new Token()), matches(TOKEN, "noTabs"));
|
assertThat(parser.nextToken(new Token()), matches(TOKEN, "noTabs"));
|
||||||
assertThat(parser.nextToken(new Token()), matches(TOKEN, "leadingTab"));
|
assertThat(parser.nextToken(new Token()), matches(TOKEN, "leadingTab"));
|
||||||
assertThat(parser.nextToken(new Token()), matches(TOKEN, "trailingTab"));
|
assertThat(parser.nextToken(new Token()), matches(TOKEN, "trailingTab"));
|
||||||
|
@ -99,7 +99,7 @@ public class CSVLexerTest {
|
||||||
"\n"+
|
"\n"+
|
||||||
"\n";
|
"\n";
|
||||||
final CSVFormat format = CSVFormat.DEFAULT.withIgnoreEmptyLines(true);
|
final CSVFormat format = CSVFormat.DEFAULT.withIgnoreEmptyLines(true);
|
||||||
final Lexer parser = getLexer(code, format);
|
final CSVLexer parser = getLexer(code, format);
|
||||||
|
|
||||||
assertThat(parser.nextToken(new Token()), matches(TOKEN, "first"));
|
assertThat(parser.nextToken(new Token()), matches(TOKEN, "first"));
|
||||||
assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
|
assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
|
||||||
|
@ -123,7 +123,7 @@ public class CSVLexerTest {
|
||||||
"# penultimate comment\n"+
|
"# penultimate comment\n"+
|
||||||
"# Final comment\n";
|
"# Final comment\n";
|
||||||
final CSVFormat format = CSVFormat.DEFAULT.withCommentStart('#');
|
final CSVFormat format = CSVFormat.DEFAULT.withCommentStart('#');
|
||||||
final Lexer parser = getLexer(code, format);
|
final CSVLexer parser = getLexer(code, format);
|
||||||
|
|
||||||
assertThat(parser.nextToken(new Token()), matches(TOKEN, "first"));
|
assertThat(parser.nextToken(new Token()), matches(TOKEN, "first"));
|
||||||
assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
|
assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
|
||||||
|
@ -161,7 +161,7 @@ public class CSVLexerTest {
|
||||||
final CSVFormat format = CSVFormat.DEFAULT.withCommentStart('#').withIgnoreEmptyLines(false);
|
final CSVFormat format = CSVFormat.DEFAULT.withCommentStart('#').withIgnoreEmptyLines(false);
|
||||||
assertFalse("Should not ignore empty lines", format.getIgnoreEmptyLines());
|
assertFalse("Should not ignore empty lines", format.getIgnoreEmptyLines());
|
||||||
|
|
||||||
final Lexer parser = getLexer(code, format);
|
final CSVLexer parser = getLexer(code, format);
|
||||||
|
|
||||||
|
|
||||||
assertThat(parser.nextToken(new Token()), matches(TOKEN, "1"));
|
assertThat(parser.nextToken(new Token()), matches(TOKEN, "1"));
|
||||||
|
@ -199,7 +199,7 @@ public class CSVLexerTest {
|
||||||
final String code = "a,\\,,b\\\n\\,,";
|
final String code = "a,\\,,b\\\n\\,,";
|
||||||
final CSVFormat format = CSVFormat.DEFAULT;
|
final CSVFormat format = CSVFormat.DEFAULT;
|
||||||
assertFalse(format.isEscaping());
|
assertFalse(format.isEscaping());
|
||||||
final Lexer parser = getLexer(code, format);
|
final CSVLexer parser = getLexer(code, format);
|
||||||
|
|
||||||
assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
|
assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
|
||||||
// an unquoted single backslash is not an escape char
|
// an unquoted single backslash is not an escape char
|
||||||
|
@ -221,7 +221,7 @@ public class CSVLexerTest {
|
||||||
final String code = "a,\\,,b\\\\\n\\,,\\\nc,d\\\r\ne";
|
final String code = "a,\\,,b\\\\\n\\,,\\\nc,d\\\r\ne";
|
||||||
final CSVFormat format = formatWithEscaping.withIgnoreEmptyLines(false);
|
final CSVFormat format = formatWithEscaping.withIgnoreEmptyLines(false);
|
||||||
assertTrue(format.isEscaping());
|
assertTrue(format.isEscaping());
|
||||||
final Lexer parser = getLexer(code, format);
|
final CSVLexer parser = getLexer(code, format);
|
||||||
|
|
||||||
assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
|
assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
|
||||||
assertThat(parser.nextToken(new Token()), matches(TOKEN, ","));
|
assertThat(parser.nextToken(new Token()), matches(TOKEN, ","));
|
||||||
|
@ -241,7 +241,7 @@ public class CSVLexerTest {
|
||||||
* a, " foo " ,b
|
* a, " foo " ,b
|
||||||
*/
|
*/
|
||||||
final String code = "a,\"foo\",b\na, \" foo\",b\na,\"foo \" ,b\na, \" foo \" ,b";
|
final String code = "a,\"foo\",b\na, \" foo\",b\na,\"foo \" ,b\na, \" foo \" ,b";
|
||||||
final Lexer parser = getLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces(true));
|
final CSVLexer parser = getLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces(true));
|
||||||
assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
|
assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
|
||||||
assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo"));
|
assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo"));
|
||||||
assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
|
assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
|
||||||
|
@ -261,7 +261,7 @@ public class CSVLexerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testNextToken5() throws IOException {
|
public void testNextToken5() throws IOException {
|
||||||
final String code = "a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\"";
|
final String code = "a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\"";
|
||||||
final Lexer parser = getLexer(code, CSVFormat.DEFAULT);
|
final CSVLexer parser = getLexer(code, CSVFormat.DEFAULT);
|
||||||
assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
|
assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
|
||||||
assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo\n"));
|
assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo\n"));
|
||||||
assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
|
assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
|
||||||
|
@ -280,7 +280,7 @@ public class CSVLexerTest {
|
||||||
*/
|
*/
|
||||||
final String code = "a;'b and '' more\n'\n!comment;;;;\n;;";
|
final String code = "a;'b and '' more\n'\n!comment;;;;\n;;";
|
||||||
final CSVFormat format = CSVFormat.DEFAULT.withQuoteChar('\'').withCommentStart('!').withDelimiter(';');
|
final CSVFormat format = CSVFormat.DEFAULT.withQuoteChar('\'').withCommentStart('!').withDelimiter(';');
|
||||||
final Lexer parser = getLexer(code, format);
|
final CSVLexer parser = getLexer(code, format);
|
||||||
assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
|
assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
|
||||||
assertThat(parser.nextToken(new Token()), matches(EORECORD, "b and ' more\n"));
|
assertThat(parser.nextToken(new Token()), matches(EORECORD, "b and ' more\n"));
|
||||||
}
|
}
|
||||||
|
@ -289,7 +289,7 @@ public class CSVLexerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testDelimiterIsWhitespace() throws IOException {
|
public void testDelimiterIsWhitespace() throws IOException {
|
||||||
final String code = "one\ttwo\t\tfour \t five\t six";
|
final String code = "one\ttwo\t\tfour \t five\t six";
|
||||||
final Lexer parser = getLexer(code, CSVFormat.TDF);
|
final CSVLexer parser = getLexer(code, CSVFormat.TDF);
|
||||||
assertThat(parser.nextToken(new Token()), matches(TOKEN, "one"));
|
assertThat(parser.nextToken(new Token()), matches(TOKEN, "one"));
|
||||||
assertThat(parser.nextToken(new Token()), matches(TOKEN, "two"));
|
assertThat(parser.nextToken(new Token()), matches(TOKEN, "two"));
|
||||||
assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
|
assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
|
||||||
|
@ -300,96 +300,96 @@ public class CSVLexerTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testEscapedCR() throws Exception {
|
public void testEscapedCR() throws Exception {
|
||||||
final Lexer lexer = getLexer("character\\" + CR + "Escaped", formatWithEscaping);
|
final CSVLexer lexer = getLexer("character\\" + CR + "Escaped", formatWithEscaping);
|
||||||
assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
|
assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testCR() throws Exception {
|
public void testCR() throws Exception {
|
||||||
final Lexer lexer = getLexer("character" + CR + "NotEscaped", formatWithEscaping);
|
final CSVLexer lexer = getLexer("character" + CR + "NotEscaped", formatWithEscaping);
|
||||||
assertThat(lexer.nextToken(new Token()), hasContent("character"));
|
assertThat(lexer.nextToken(new Token()), hasContent("character"));
|
||||||
assertThat(lexer.nextToken(new Token()), hasContent("NotEscaped"));
|
assertThat(lexer.nextToken(new Token()), hasContent("NotEscaped"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testEscapedLF() throws Exception {
|
public void testEscapedLF() throws Exception {
|
||||||
final Lexer lexer = getLexer("character\\" + LF + "Escaped", formatWithEscaping);
|
final CSVLexer lexer = getLexer("character\\" + LF + "Escaped", formatWithEscaping);
|
||||||
assertThat(lexer.nextToken(new Token()), hasContent("character" + LF + "Escaped"));
|
assertThat(lexer.nextToken(new Token()), hasContent("character" + LF + "Escaped"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testLF() throws Exception {
|
public void testLF() throws Exception {
|
||||||
final Lexer lexer = getLexer("character" + LF + "NotEscaped", formatWithEscaping);
|
final CSVLexer lexer = getLexer("character" + LF + "NotEscaped", formatWithEscaping);
|
||||||
assertThat(lexer.nextToken(new Token()), hasContent("character"));
|
assertThat(lexer.nextToken(new Token()), hasContent("character"));
|
||||||
assertThat(lexer.nextToken(new Token()), hasContent("NotEscaped"));
|
assertThat(lexer.nextToken(new Token()), hasContent("NotEscaped"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test // TODO is this correct? Do we expect <esc>TAB to be unescaped?
|
@Test // TODO is this correct? Do we expect <esc>TAB to be unescaped?
|
||||||
public void testEscapedTab() throws Exception {
|
public void testEscapedTab() throws Exception {
|
||||||
final Lexer lexer = getLexer("character\\" + TAB + "Escaped", formatWithEscaping);
|
final CSVLexer lexer = getLexer("character\\" + TAB + "Escaped", formatWithEscaping);
|
||||||
assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "Escaped"));
|
assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "Escaped"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testTab() throws Exception {
|
public void testTab() throws Exception {
|
||||||
final Lexer lexer = getLexer("character" + TAB + "NotEscaped", formatWithEscaping);
|
final CSVLexer lexer = getLexer("character" + TAB + "NotEscaped", formatWithEscaping);
|
||||||
assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "NotEscaped"));
|
assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "NotEscaped"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test // TODO is this correct? Do we expect <esc>BACKSPACE to be unescaped?
|
@Test // TODO is this correct? Do we expect <esc>BACKSPACE to be unescaped?
|
||||||
public void testEscapedBackspace() throws Exception {
|
public void testEscapedBackspace() throws Exception {
|
||||||
final Lexer lexer = getLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping);
|
final CSVLexer lexer = getLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping);
|
||||||
assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "Escaped"));
|
assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "Escaped"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testBackspace() throws Exception {
|
public void testBackspace() throws Exception {
|
||||||
final Lexer lexer = getLexer("character" + BACKSPACE + "NotEscaped", formatWithEscaping);
|
final CSVLexer lexer = getLexer("character" + BACKSPACE + "NotEscaped", formatWithEscaping);
|
||||||
assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "NotEscaped"));
|
assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "NotEscaped"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test // TODO is this correct? Do we expect <esc>FF to be unescaped?
|
@Test // TODO is this correct? Do we expect <esc>FF to be unescaped?
|
||||||
public void testEscapedFF() throws Exception {
|
public void testEscapedFF() throws Exception {
|
||||||
final Lexer lexer = getLexer("character\\" + FF + "Escaped", formatWithEscaping);
|
final CSVLexer lexer = getLexer("character\\" + FF + "Escaped", formatWithEscaping);
|
||||||
assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "Escaped"));
|
assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "Escaped"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testFF() throws Exception {
|
public void testFF() throws Exception {
|
||||||
final Lexer lexer = getLexer("character" + FF + "NotEscaped", formatWithEscaping);
|
final CSVLexer lexer = getLexer("character" + FF + "NotEscaped", formatWithEscaping);
|
||||||
assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "NotEscaped"));
|
assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "NotEscaped"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testEscapedMySqlNullValue() throws Exception {
|
public void testEscapedMySqlNullValue() throws Exception {
|
||||||
// MySQL uses \N to symbolize null values. We have to restore this
|
// MySQL uses \N to symbolize null values. We have to restore this
|
||||||
final Lexer lexer = getLexer("character\\NEscaped", formatWithEscaping);
|
final CSVLexer lexer = getLexer("character\\NEscaped", formatWithEscaping);
|
||||||
assertThat(lexer.nextToken(new Token()), hasContent("character\\NEscaped"));
|
assertThat(lexer.nextToken(new Token()), hasContent("character\\NEscaped"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testEscapedCharacter() throws Exception {
|
public void testEscapedCharacter() throws Exception {
|
||||||
final Lexer lexer = getLexer("character\\aEscaped", formatWithEscaping);
|
final CSVLexer lexer = getLexer("character\\aEscaped", formatWithEscaping);
|
||||||
assertThat(lexer.nextToken(new Token()), hasContent("character\\aEscaped"));
|
assertThat(lexer.nextToken(new Token()), hasContent("character\\aEscaped"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testEscapedControlCharacter() throws Exception {
|
public void testEscapedControlCharacter() throws Exception {
|
||||||
// we are explicitly using an escape different from \ here
|
// we are explicitly using an escape different from \ here
|
||||||
final Lexer lexer = getLexer("character!rEscaped", CSVFormat.DEFAULT.withEscape('!'));
|
final CSVLexer lexer = getLexer("character!rEscaped", CSVFormat.DEFAULT.withEscape('!'));
|
||||||
assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
|
assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testEscapedControlCharacter2() throws Exception {
|
public void testEscapedControlCharacter2() throws Exception {
|
||||||
final Lexer lexer = getLexer("character\\rEscaped", CSVFormat.DEFAULT.withEscape('\\'));
|
final CSVLexer lexer = getLexer("character\\rEscaped", CSVFormat.DEFAULT.withEscape('\\'));
|
||||||
assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
|
assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(expected = IOException.class)
|
@Test(expected = IOException.class)
|
||||||
public void testEscapingAtEOF() throws Exception {
|
public void testEscapingAtEOF() throws Exception {
|
||||||
final String code = "escaping at EOF is evil\\";
|
final String code = "escaping at EOF is evil\\";
|
||||||
final Lexer lexer = getLexer(code, formatWithEscaping);
|
final CSVLexer lexer = getLexer(code, formatWithEscaping);
|
||||||
|
|
||||||
lexer.nextToken(new Token());
|
lexer.nextToken(new Token());
|
||||||
}
|
}
|
||||||
|
|
|
@ -224,9 +224,9 @@ public class PerformanceTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static Constructor<Lexer> getLexerCtor(final String clazz) throws Exception {
|
private static Constructor<CSVLexer> getLexerCtor(final String clazz) throws Exception {
|
||||||
@SuppressWarnings("unchecked")
|
@SuppressWarnings("unchecked")
|
||||||
final Class<Lexer> lexer = (Class<Lexer>) Class.forName("org.apache.commons.csv." + clazz);
|
final Class<CSVLexer> lexer = (Class<CSVLexer>) Class.forName("org.apache.commons.csv." + clazz);
|
||||||
return lexer.getConstructor(new Class<?>[]{CSVFormat.class, ExtendedBufferedReader.class});
|
return lexer.getConstructor(new Class<?>[]{CSVFormat.class, ExtendedBufferedReader.class});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -235,7 +235,7 @@ public class PerformanceTest {
|
||||||
String dynamic = "";
|
String dynamic = "";
|
||||||
for (int i = 0; i < max; i++) {
|
for (int i = 0; i < max; i++) {
|
||||||
final ExtendedBufferedReader input = new ExtendedBufferedReader(getReader());
|
final ExtendedBufferedReader input = new ExtendedBufferedReader(getReader());
|
||||||
Lexer lexer = null;
|
CSVLexer lexer = null;
|
||||||
if (test.startsWith("CSVLexer")) {
|
if (test.startsWith("CSVLexer")) {
|
||||||
dynamic="!";
|
dynamic="!";
|
||||||
lexer = getLexerCtor(test).newInstance(new Object[]{format, input});
|
lexer = getLexerCtor(test).newInstance(new Object[]{format, input});
|
||||||
|
|
Loading…
Reference in New Issue