Extracted the lexer from CSVParser into a distinct class (suggested by Bob Smith)
git-svn-id: https://svn.apache.org/repos/asf/commons/sandbox/csv/trunk@1298033 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
898b7f9629
commit
ca7bbae40e
|
@ -25,7 +25,9 @@ import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.NoSuchElementException;
|
import java.util.NoSuchElementException;
|
||||||
|
|
||||||
import static org.apache.commons.csv.CSVParser.Token.Type.*;
|
import org.apache.commons.csv.CSVLexer.Token;
|
||||||
|
|
||||||
|
import static org.apache.commons.csv.CSVLexer.Token.Type.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parses CSV files according to the specified configuration.
|
* Parses CSV files according to the specified configuration.
|
||||||
|
@ -59,65 +61,16 @@ import static org.apache.commons.csv.CSVParser.Token.Type.*;
|
||||||
*/
|
*/
|
||||||
public class CSVParser implements Iterable<String[]> {
|
public class CSVParser implements Iterable<String[]> {
|
||||||
|
|
||||||
/** length of the initial token (content-)buffer */
|
|
||||||
private static final int INITIAL_TOKEN_LENGTH = 50;
|
|
||||||
|
|
||||||
/** Immutable empty String array. */
|
/** Immutable empty String array. */
|
||||||
private static final String[] EMPTY_STRING_ARRAY = new String[0];
|
private static final String[] EMPTY_STRING_ARRAY = new String[0];
|
||||||
|
|
||||||
/** The input stream */
|
private CSVLexer lexer;
|
||||||
private final ExtendedBufferedReader in;
|
|
||||||
|
|
||||||
private final CSVFormat format;
|
|
||||||
|
|
||||||
// the following objects are shared to reduce garbage
|
// the following objects are shared to reduce garbage
|
||||||
|
|
||||||
/** A record buffer for getLine(). Grows as necessary and is reused. */
|
/** A record buffer for getLine(). Grows as necessary and is reused. */
|
||||||
private final List<String> record = new ArrayList<String>();
|
private final List<String> record = new ArrayList<String>();
|
||||||
private final Token reusableToken = new Token();
|
private final Token reusableToken = new Token();
|
||||||
private final CharBuffer wsBuf = new CharBuffer();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Token is an internal token representation.
|
|
||||||
* <p/>
|
|
||||||
* It is used as contract between the lexer and the parser.
|
|
||||||
*/
|
|
||||||
static class Token {
|
|
||||||
|
|
||||||
enum Type {
|
|
||||||
/** Token has no valid content, i.e. is in its initialized state. */
|
|
||||||
INVALID,
|
|
||||||
|
|
||||||
/** Token with content, at beginning or in the middle of a line. */
|
|
||||||
TOKEN,
|
|
||||||
|
|
||||||
/** Token (which can have content) when end of file is reached. */
|
|
||||||
EOF,
|
|
||||||
|
|
||||||
/** Token with content when end of a line is reached. */
|
|
||||||
EORECORD
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Token type */
|
|
||||||
Type type = INVALID;
|
|
||||||
|
|
||||||
/** The content buffer. */
|
|
||||||
CharBuffer content = new CharBuffer(INITIAL_TOKEN_LENGTH);
|
|
||||||
|
|
||||||
/** Token ready flag: indicates a valid token with content (ready for the parser). */
|
|
||||||
boolean isReady;
|
|
||||||
|
|
||||||
Token reset() {
|
|
||||||
content.clear();
|
|
||||||
type = INVALID;
|
|
||||||
isReady = false;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ======================================================
|
|
||||||
// the constructor
|
|
||||||
// ======================================================
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* CSV parser using the default {@link CSVFormat}.
|
* CSV parser using the default {@link CSVFormat}.
|
||||||
|
@ -139,8 +92,7 @@ public class CSVParser implements Iterable<String[]> {
|
||||||
input = new UnicodeUnescapeReader(input);
|
input = new UnicodeUnescapeReader(input);
|
||||||
}
|
}
|
||||||
|
|
||||||
this.in = new ExtendedBufferedReader(input);
|
this.lexer = new CSVLexer(format, new ExtendedBufferedReader(input));
|
||||||
this.format = format;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -153,9 +105,6 @@ public class CSVParser implements Iterable<String[]> {
|
||||||
this(new StringReader(input), format);
|
this(new StringReader(input), format);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ======================================================
|
|
||||||
// the parser
|
|
||||||
// ======================================================
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parses the CSV according to the given format and returns the content
|
* Parses the CSV according to the given format and returns the content
|
||||||
|
@ -191,7 +140,7 @@ public class CSVParser implements Iterable<String[]> {
|
||||||
record.clear();
|
record.clear();
|
||||||
while (true) {
|
while (true) {
|
||||||
reusableToken.reset();
|
reusableToken.reset();
|
||||||
nextToken(reusableToken);
|
lexer.nextToken(reusableToken);
|
||||||
switch (reusableToken.type) {
|
switch (reusableToken.type) {
|
||||||
case TOKEN:
|
case TOKEN:
|
||||||
record.add(reusableToken.content.toString());
|
record.add(reusableToken.content.toString());
|
||||||
|
@ -274,12 +223,69 @@ public class CSVParser implements Iterable<String[]> {
|
||||||
* @return current line number
|
* @return current line number
|
||||||
*/
|
*/
|
||||||
public int getLineNumber() {
|
public int getLineNumber() {
|
||||||
return in.getLineNumber();
|
return lexer.getLineNumber();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ======================================================
|
|
||||||
// the lexer(s)
|
class CSVLexer {
|
||||||
// ======================================================
|
|
||||||
|
/** length of the initial token (content-)buffer */
|
||||||
|
private static final int INITIAL_TOKEN_LENGTH = 50;
|
||||||
|
|
||||||
|
private final CharBuffer wsBuf = new CharBuffer();
|
||||||
|
|
||||||
|
private CSVFormat format;
|
||||||
|
|
||||||
|
/** The input stream */
|
||||||
|
private ExtendedBufferedReader in;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Token is an internal token representation.
|
||||||
|
* <p/>
|
||||||
|
* It is used as contract between the lexer and the parser.
|
||||||
|
*/
|
||||||
|
static class Token {
|
||||||
|
|
||||||
|
enum Type {
|
||||||
|
/** Token has no valid content, i.e. is in its initialized state. */
|
||||||
|
INVALID,
|
||||||
|
|
||||||
|
/** Token with content, at beginning or in the middle of a line. */
|
||||||
|
TOKEN,
|
||||||
|
|
||||||
|
/** Token (which can have content) when end of file is reached. */
|
||||||
|
EOF,
|
||||||
|
|
||||||
|
/** Token with content when end of a line is reached. */
|
||||||
|
EORECORD
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Token type */
|
||||||
|
Type type = INVALID;
|
||||||
|
|
||||||
|
/** The content buffer. */
|
||||||
|
CharBuffer content = new CharBuffer(INITIAL_TOKEN_LENGTH);
|
||||||
|
|
||||||
|
/** Token ready flag: indicates a valid token with content (ready for the parser). */
|
||||||
|
boolean isReady;
|
||||||
|
|
||||||
|
Token reset() {
|
||||||
|
content.clear();
|
||||||
|
type = INVALID;
|
||||||
|
isReady = false;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
CSVLexer(CSVFormat format, ExtendedBufferedReader in) {
|
||||||
|
this.format = format;
|
||||||
|
this.in = in;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getLineNumber() {
|
||||||
|
return in.getLineNumber();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the next token.
|
* Returns the next token.
|
||||||
|
@ -503,19 +509,6 @@ public class CSVParser implements Iterable<String[]> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Obtain the specified CSV format.
|
|
||||||
*
|
|
||||||
* @return format currently being used
|
|
||||||
*/
|
|
||||||
public CSVFormat getFormat() {
|
|
||||||
return this.format;
|
|
||||||
}
|
|
||||||
|
|
||||||
// ======================================================
|
|
||||||
// Character class checker
|
|
||||||
// ======================================================
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return true if the given char is a whitespace character
|
* @return true if the given char is a whitespace character
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -0,0 +1,167 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.commons.csv;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
import org.apache.commons.csv.CSVLexer.Token;
|
||||||
|
|
||||||
|
import static org.apache.commons.csv.CSVLexer.Token.Type.*;
|
||||||
|
|
||||||
|
public class CSVLexerTest extends TestCase {
|
||||||
|
|
||||||
|
private CSVLexer getLexer(String input, CSVFormat format) {
|
||||||
|
return new CSVLexer(format, new ExtendedBufferedReader(new StringReader(input)));
|
||||||
|
}
|
||||||
|
|
||||||
|
private void assertTokenEquals(Token.Type expectedType, String expectedContent, Token token) {
|
||||||
|
assertEquals("Token type", expectedType, token.type);
|
||||||
|
assertEquals("Token content", expectedContent, token.content.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Single line (without comment)
|
||||||
|
public void testNextToken1() throws IOException {
|
||||||
|
String code = "abc,def, hijk, lmnop, qrst,uv ,wxy ,z , ,";
|
||||||
|
CSVLexer parser = getLexer(code, CSVFormat.DEFAULT);
|
||||||
|
assertTokenEquals(TOKEN, "abc", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "def", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "hijk", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "lmnop", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "qrst", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "uv", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "wxy", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "z", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(EOF, "", parser.nextToken(new Token()));
|
||||||
|
}
|
||||||
|
|
||||||
|
// multiline including comments (and empty lines)
|
||||||
|
public void testNextToken2() throws IOException {
|
||||||
|
/* file: 1,2,3,
|
||||||
|
* a,b x,c
|
||||||
|
*
|
||||||
|
* # this is a comment
|
||||||
|
* d,e,
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
String code = "1,2,3,\na,b x,c\n#foo\n\nd,e,\n\n";
|
||||||
|
CSVFormat format = CSVFormat.DEFAULT.withCommentStart('#');
|
||||||
|
|
||||||
|
CSVLexer parser = getLexer(code, format);
|
||||||
|
|
||||||
|
|
||||||
|
assertTokenEquals(TOKEN, "1", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "2", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "3", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(EORECORD, "", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "a", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "b x", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(EORECORD, "c", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(EORECORD, "", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "d", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "e", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(EORECORD, "", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(EOF, "", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(EOF, "", parser.nextToken(new Token()));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// simple token with escaping
|
||||||
|
public void testNextToken3() throws IOException {
|
||||||
|
/* file: a,\,,b
|
||||||
|
* \,,
|
||||||
|
*/
|
||||||
|
String code = "a,\\,,b\n\\,,";
|
||||||
|
CSVFormat format = CSVFormat.DEFAULT.withCommentStart('#');
|
||||||
|
CSVLexer parser = getLexer(code, format);
|
||||||
|
|
||||||
|
assertTokenEquals(TOKEN, "a", parser.nextToken(new Token()));
|
||||||
|
// an unquoted single backslash is not an escape char
|
||||||
|
assertTokenEquals(TOKEN, "\\", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(EORECORD, "b", parser.nextToken(new Token()));
|
||||||
|
// an unquoted single backslash is not an escape char
|
||||||
|
assertTokenEquals(TOKEN, "\\", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(EOF, "", parser.nextToken(new Token()));
|
||||||
|
}
|
||||||
|
|
||||||
|
// encapsulator tokenizer (single line)
|
||||||
|
public void testNextToken4() throws IOException {
|
||||||
|
/* file: a,"foo",b
|
||||||
|
* a, " foo",b
|
||||||
|
* a,"foo " ,b // whitespace after closing encapsulator
|
||||||
|
* a, " foo " ,b
|
||||||
|
*/
|
||||||
|
String code = "a,\"foo\",b\na, \" foo\",b\na,\"foo \" ,b\na, \" foo \" ,b";
|
||||||
|
CSVLexer parser = getLexer(code, CSVFormat.DEFAULT);
|
||||||
|
assertTokenEquals(TOKEN, "a", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "foo", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(EORECORD, "b", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "a", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, " foo", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(EORECORD, "b", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "a", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "foo ", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(EORECORD, "b", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "a", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, " foo ", parser.nextToken(new Token()));
|
||||||
|
// assertTokenEquals(EORECORD, "b", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(EOF, "b", parser.nextToken(new Token()));
|
||||||
|
}
|
||||||
|
|
||||||
|
// encapsulator tokenizer (multi line, delimiter in string)
|
||||||
|
public void testNextToken5() throws IOException {
|
||||||
|
String code = "a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\"";
|
||||||
|
CSVLexer parser = getLexer(code, CSVFormat.DEFAULT);
|
||||||
|
assertTokenEquals(TOKEN, "a", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "foo\n", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(EORECORD, "b", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(EORECORD, "foo\n baar ,,,", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(EOF, "\n\t \n", parser.nextToken(new Token()));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// change delimiters, comment, encapsulator
|
||||||
|
public void testNextToken6() throws IOException {
|
||||||
|
/* file: a;'b and '' more
|
||||||
|
* '
|
||||||
|
* !comment;;;;
|
||||||
|
* ;;
|
||||||
|
*/
|
||||||
|
String code = "a;'b and '' more\n'\n!comment;;;;\n;;";
|
||||||
|
CSVFormat format = new CSVFormat(';', '\'', '!');
|
||||||
|
CSVLexer parser = getLexer(code, format);
|
||||||
|
assertTokenEquals(TOKEN, "a", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(EORECORD, "b and ' more\n", parser.nextToken(new Token()));
|
||||||
|
}
|
||||||
|
|
||||||
|
// From SANDBOX-153
|
||||||
|
public void testDelimiterIsWhitespace() throws IOException {
|
||||||
|
String code = "one\ttwo\t\tfour \t five\t six";
|
||||||
|
CSVLexer parser = getLexer(code, CSVFormat.TDF);
|
||||||
|
assertTokenEquals(TOKEN, "one", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "two", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "four", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(TOKEN, "five", parser.nextToken(new Token()));
|
||||||
|
assertTokenEquals(EOF, "six", parser.nextToken(new Token()));
|
||||||
|
}
|
||||||
|
}
|
|
@ -28,8 +28,6 @@ import java.util.NoSuchElementException;
|
||||||
|
|
||||||
import junit.framework.TestCase;
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
import static org.apache.commons.csv.CSVParser.Token.Type.*;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* CSVParserTest
|
* CSVParserTest
|
||||||
*
|
*
|
||||||
|
@ -41,168 +39,7 @@ import static org.apache.commons.csv.CSVParser.Token.Type.*;
|
||||||
*/
|
*/
|
||||||
public class CSVParserTest extends TestCase {
|
public class CSVParserTest extends TestCase {
|
||||||
|
|
||||||
/**
|
String code = "a,b,c,d\n"
|
||||||
* TestCSVParser.
|
|
||||||
*/
|
|
||||||
class TestCSVParser extends CSVParser {
|
|
||||||
/**
|
|
||||||
* Test parser to investigate the type of the internal Token.
|
|
||||||
*
|
|
||||||
* @param in a Reader
|
|
||||||
*/
|
|
||||||
TestCSVParser(Reader in) {
|
|
||||||
super(in);
|
|
||||||
}
|
|
||||||
|
|
||||||
TestCSVParser(Reader in, CSVFormat format) {
|
|
||||||
super(in, format);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Calls super.nextToken() and prints out a String representation of token
|
|
||||||
* type and content.
|
|
||||||
*
|
|
||||||
* @return String representation of token type and content
|
|
||||||
* @throws IOException like {@link CSVParser#nextToken(Token)}
|
|
||||||
*/
|
|
||||||
public String testNextToken() throws IOException {
|
|
||||||
Token t = super.nextToken(new Token());
|
|
||||||
return t.type.name() + ";" + t.content + ";";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ======================================================
|
|
||||||
// lexer tests
|
|
||||||
// ======================================================
|
|
||||||
|
|
||||||
// Single line (without comment)
|
|
||||||
public void testNextToken1() throws IOException {
|
|
||||||
String code = "abc,def, hijk, lmnop, qrst,uv ,wxy ,z , ,";
|
|
||||||
TestCSVParser parser = new TestCSVParser(new StringReader(code));
|
|
||||||
assertEquals(TOKEN + ";abc;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";def;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";hijk;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";lmnop;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";qrst;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";uv;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";wxy;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";z;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";;", parser.testNextToken());
|
|
||||||
assertEquals(EOF + ";;", parser.testNextToken());
|
|
||||||
}
|
|
||||||
|
|
||||||
// multiline including comments (and empty lines)
|
|
||||||
public void testNextToken2() throws IOException {
|
|
||||||
/* file: 1,2,3,
|
|
||||||
* a,b x,c
|
|
||||||
*
|
|
||||||
* # this is a comment
|
|
||||||
* d,e,
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
String code = "1,2,3,\na,b x,c\n#foo\n\nd,e,\n\n";
|
|
||||||
CSVFormat format = CSVFormat.DEFAULT.withCommentStart('#');
|
|
||||||
|
|
||||||
TestCSVParser parser = new TestCSVParser(new StringReader(code), format);
|
|
||||||
|
|
||||||
|
|
||||||
assertEquals(TOKEN + ";1;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";2;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";3;", parser.testNextToken());
|
|
||||||
assertEquals(EORECORD + ";;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";a;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";b x;", parser.testNextToken());
|
|
||||||
assertEquals(EORECORD + ";c;", parser.testNextToken());
|
|
||||||
assertEquals(EORECORD + ";;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";d;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";e;", parser.testNextToken());
|
|
||||||
assertEquals(EORECORD + ";;", parser.testNextToken());
|
|
||||||
assertEquals(EOF + ";;", parser.testNextToken());
|
|
||||||
assertEquals(EOF + ";;", parser.testNextToken());
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
// simple token with escaping
|
|
||||||
public void testNextToken3() throws IOException {
|
|
||||||
/* file: a,\,,b
|
|
||||||
* \,,
|
|
||||||
*/
|
|
||||||
String code = "a,\\,,b\n\\,,";
|
|
||||||
CSVFormat format = CSVFormat.DEFAULT.withCommentStart('#');
|
|
||||||
TestCSVParser parser = new TestCSVParser(new StringReader(code), format);
|
|
||||||
|
|
||||||
assertEquals(TOKEN + ";a;", parser.testNextToken());
|
|
||||||
// an unquoted single backslash is not an escape char
|
|
||||||
assertEquals(TOKEN + ";\\;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";;", parser.testNextToken());
|
|
||||||
assertEquals(EORECORD + ";b;", parser.testNextToken());
|
|
||||||
// an unquoted single backslash is not an escape char
|
|
||||||
assertEquals(TOKEN + ";\\;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";;", parser.testNextToken());
|
|
||||||
assertEquals(EOF + ";;", parser.testNextToken());
|
|
||||||
}
|
|
||||||
|
|
||||||
// encapsulator tokenizer (sinle line)
|
|
||||||
public void testNextToken4() throws IOException {
|
|
||||||
/* file: a,"foo",b
|
|
||||||
* a, " foo",b
|
|
||||||
* a,"foo " ,b // whitespace after closing encapsulator
|
|
||||||
* a, " foo " ,b
|
|
||||||
*/
|
|
||||||
String code =
|
|
||||||
"a,\"foo\",b\na, \" foo\",b\na,\"foo \" ,b\na, \" foo \" ,b";
|
|
||||||
TestCSVParser parser = new TestCSVParser(new StringReader(code));
|
|
||||||
assertEquals(TOKEN + ";a;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";foo;", parser.testNextToken());
|
|
||||||
assertEquals(EORECORD + ";b;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";a;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + "; foo;", parser.testNextToken());
|
|
||||||
assertEquals(EORECORD + ";b;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";a;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";foo ;", parser.testNextToken());
|
|
||||||
assertEquals(EORECORD + ";b;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";a;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + "; foo ;", parser.testNextToken());
|
|
||||||
// assertEquals(EORECORD + ";b;", parser.testNextToken());
|
|
||||||
assertEquals(EOF + ";b;", parser.testNextToken());
|
|
||||||
}
|
|
||||||
|
|
||||||
// encapsulator tokenizer (multi line, delimiter in string)
|
|
||||||
public void testNextToken5() throws IOException {
|
|
||||||
String code =
|
|
||||||
"a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\"";
|
|
||||||
TestCSVParser parser = new TestCSVParser(new StringReader(code));
|
|
||||||
assertEquals(TOKEN + ";a;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";foo\n;", parser.testNextToken());
|
|
||||||
assertEquals(EORECORD + ";b;", parser.testNextToken());
|
|
||||||
assertEquals(EORECORD + ";foo\n baar ,,,;",
|
|
||||||
parser.testNextToken());
|
|
||||||
assertEquals(EOF + ";\n\t \n;", parser.testNextToken());
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
// change delimiters, comment, encapsulater
|
|
||||||
public void testNextToken6() throws IOException {
|
|
||||||
/* file: a;'b and \' more
|
|
||||||
* '
|
|
||||||
* !comment;;;;
|
|
||||||
* ;;
|
|
||||||
*/
|
|
||||||
String code = "a;'b and '' more\n'\n!comment;;;;\n;;";
|
|
||||||
TestCSVParser parser = new TestCSVParser(new StringReader(code), new CSVFormat(';', '\'', '!'));
|
|
||||||
assertEquals(TOKEN + ";a;", parser.testNextToken());
|
|
||||||
assertEquals(
|
|
||||||
EORECORD + ";b and ' more\n;",
|
|
||||||
parser.testNextToken());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// ======================================================
|
|
||||||
// parser tests
|
|
||||||
// ======================================================
|
|
||||||
|
|
||||||
String code =
|
|
||||||
"a,b,c,d\n"
|
|
||||||
+ " a , b , 1 2 \n"
|
+ " a , b , 1 2 \n"
|
||||||
+ "\"foo baar\", b,\n"
|
+ "\"foo baar\", b,\n"
|
||||||
// + " \"foo\n,,\n\"\",,\n\\\"\",d,e\n";
|
// + " \"foo\n,,\n\"\",,\n\\\"\",d,e\n";
|
||||||
|
@ -552,18 +389,6 @@ public class CSVParserTest extends TestCase {
|
||||||
assertEquals(3, data.length);
|
assertEquals(3, data.length);
|
||||||
}
|
}
|
||||||
|
|
||||||
// From SANDBOX-153
|
|
||||||
public void testDelimiterIsWhitespace() throws IOException {
|
|
||||||
String code = "one\ttwo\t\tfour \t five\t six";
|
|
||||||
TestCSVParser parser = new TestCSVParser(new StringReader(code), CSVFormat.TDF);
|
|
||||||
assertEquals(TOKEN + ";one;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";two;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";four;", parser.testNextToken());
|
|
||||||
assertEquals(TOKEN + ";five;", parser.testNextToken());
|
|
||||||
assertEquals(EOF + ";six;", parser.testNextToken());
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testForEach() {
|
public void testForEach() {
|
||||||
List<String[]> records = new ArrayList<String[]>();
|
List<String[]> records = new ArrayList<String[]>();
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue