[CSV-131] Save positions of records to enable random access. The floor is open for code review and further discussion based on the comments in the Jira.
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/csv/trunk@1635052 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b466ec0ecd
commit
e28e28e1f2
|
@ -39,6 +39,7 @@
|
||||||
</properties>
|
</properties>
|
||||||
<body>
|
<body>
|
||||||
<release version="1.1" date="2014-mm-dd" description="Feature and bug fix release">
|
<release version="1.1" date="2014-mm-dd" description="Feature and bug fix release">
|
||||||
|
<action issue="CSV-131" type="add" dev="ggregory" due-to="Holger Stratmann">Save positions of records to enable random access</action>
|
||||||
<action issue="CSV-130" type="fix" dev="ggregory" due-to="Sergei Lebedev">CSVFormat#withHeader doesn't work well with #printComment, add withHeaderComments(String...)</action>
|
<action issue="CSV-130" type="fix" dev="ggregory" due-to="Sergei Lebedev">CSVFormat#withHeader doesn't work well with #printComment, add withHeaderComments(String...)</action>
|
||||||
<action issue="CSV-128" type="fix" dev="ggregory">CSVFormat.EXCEL should ignore empty header names</action>
|
<action issue="CSV-128" type="fix" dev="ggregory">CSVFormat.EXCEL should ignore empty header names</action>
|
||||||
<action issue="CSV-129" type="add" dev="ggregory">Add CSVFormat#with 0-arg methods matching boolean arg methods</action>
|
<action issue="CSV-129" type="add" dev="ggregory">Add CSVFormat#with 0-arg methods matching boolean arg methods</action>
|
||||||
|
|
|
@ -220,6 +220,12 @@ public final class CSVParser implements Iterable<CSVRecord>, Closeable {
|
||||||
|
|
||||||
private long recordNumber;
|
private long recordNumber;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Lexer offset if the parser does not start parsing at the beginning of the source. Usually used in combination
|
||||||
|
* with {@link #setNextRecordNumber(long)}.
|
||||||
|
*/
|
||||||
|
private long characterOffset;
|
||||||
|
|
||||||
private final Token reusableToken = new Token();
|
private final Token reusableToken = new Token();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -295,6 +301,43 @@ public final class CSVParser implements Iterable<CSVRecord>, Closeable {
|
||||||
return this.headerMap == null ? null : new LinkedHashMap<String, Integer>(this.headerMap);
|
return this.headerMap == null ? null : new LinkedHashMap<String, Integer>(this.headerMap);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the record number to be assigned to the next record read.
|
||||||
|
* <p>
|
||||||
|
* Use this if the reader is not positioned at the first record when you create the parser. For example, the first
|
||||||
|
* record read might be the 51st record in the source file.
|
||||||
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* If you want the records to also have the correct character position referring to the underlying source, call
|
||||||
|
* {@link #setNextCharacterPosition(long)}.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param nextRecordNumber
|
||||||
|
* the next record number
|
||||||
|
* @since 1.1
|
||||||
|
*/
|
||||||
|
public void setNextRecordNumber(long nextRecordNumber) {
|
||||||
|
this.recordNumber = nextRecordNumber - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the current position in the source stream regardless of where the parser and lexer start reading.
|
||||||
|
* <p>
|
||||||
|
* For example: We open a file and seek to position 5434 in order to start reading at record 42. In order to have
|
||||||
|
* the parser assign the correct characterPosition to records, we call this method.
|
||||||
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* If you want the records to also have the correct record numbers, call {@link #setNextRecordNumber(long)}.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param position
|
||||||
|
* the new character position
|
||||||
|
* @since 1.1
|
||||||
|
*/
|
||||||
|
public void setNextCharacterPosition(long position) {
|
||||||
|
this.characterOffset = position - lexer.getCharacterPosition();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the current record number in the input stream.
|
* Returns the current record number in the input stream.
|
||||||
*
|
*
|
||||||
|
@ -445,6 +488,7 @@ public final class CSVParser implements Iterable<CSVRecord>, Closeable {
|
||||||
CSVRecord result = null;
|
CSVRecord result = null;
|
||||||
this.record.clear();
|
this.record.clear();
|
||||||
StringBuilder sb = null;
|
StringBuilder sb = null;
|
||||||
|
final long startCharPosition = lexer.getCharacterPosition() + this.characterOffset;
|
||||||
do {
|
do {
|
||||||
this.reusableToken.reset();
|
this.reusableToken.reset();
|
||||||
this.lexer.nextToken(this.reusableToken);
|
this.lexer.nextToken(this.reusableToken);
|
||||||
|
@ -480,7 +524,7 @@ public final class CSVParser implements Iterable<CSVRecord>, Closeable {
|
||||||
this.recordNumber++;
|
this.recordNumber++;
|
||||||
final String comment = sb == null ? null : sb.toString();
|
final String comment = sb == null ? null : sb.toString();
|
||||||
result = new CSVRecord(this.record.toArray(new String[this.record.size()]), this.headerMap, comment,
|
result = new CSVRecord(this.record.toArray(new String[this.record.size()]), this.headerMap, comment,
|
||||||
this.recordNumber);
|
this.recordNumber, startCharPosition);
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
|
@ -36,6 +36,8 @@ public final class CSVRecord implements Serializable, Iterable<String> {
|
||||||
|
|
||||||
private static final long serialVersionUID = 1L;
|
private static final long serialVersionUID = 1L;
|
||||||
|
|
||||||
|
private final long characterPosition;
|
||||||
|
|
||||||
/** The accumulated comments (if any) */
|
/** The accumulated comments (if any) */
|
||||||
private final String comment;
|
private final String comment;
|
||||||
|
|
||||||
|
@ -48,11 +50,12 @@ public final class CSVRecord implements Serializable, Iterable<String> {
|
||||||
/** The values of the record */
|
/** The values of the record */
|
||||||
private final String[] values;
|
private final String[] values;
|
||||||
|
|
||||||
CSVRecord(final String[] values, final Map<String, Integer> mapping, final String comment, final long recordNumber) {
|
CSVRecord(final String[] values, final Map<String, Integer> mapping, final String comment, final long recordNumber, long characterPosition) {
|
||||||
this.recordNumber = recordNumber;
|
this.recordNumber = recordNumber;
|
||||||
this.values = values != null ? values : EMPTY_STRING_ARRAY;
|
this.values = values != null ? values : EMPTY_STRING_ARRAY;
|
||||||
this.mapping = mapping;
|
this.mapping = mapping;
|
||||||
this.comment = comment;
|
this.comment = comment;
|
||||||
|
this.characterPosition = characterPosition;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -109,6 +112,16 @@ public final class CSVRecord implements Serializable, Iterable<String> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the start position of this record as a character position in the source stream. This may or may not
|
||||||
|
* correspond to the byte position depending on the character set.
|
||||||
|
*
|
||||||
|
* @return the position of this record in the source stream.
|
||||||
|
*/
|
||||||
|
public long getCharacterPosition() {
|
||||||
|
return characterPosition;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the comment for this record, if any.
|
* Returns the comment for this record, if any.
|
||||||
*
|
*
|
||||||
|
|
|
@ -299,22 +299,23 @@ public class CSVParserTest {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// @Test
|
// @Test
|
||||||
// public void testStartWithEmptyLinesThenHeaders() throws Exception {
|
// public void testStartWithEmptyLinesThenHeaders() throws Exception {
|
||||||
// final String[] codes = { "\r\n\r\n\r\nhello,\r\n\r\n\r\n", "hello,\n\n\n", "hello,\"\"\r\n\r\n\r\n", "hello,\"\"\n\n\n" };
|
// final String[] codes = { "\r\n\r\n\r\nhello,\r\n\r\n\r\n", "hello,\n\n\n", "hello,\"\"\r\n\r\n\r\n",
|
||||||
// final String[][] res = { { "hello", "" }, { "" }, // Excel format does not ignore empty lines
|
// "hello,\"\"\n\n\n" };
|
||||||
// { "" } };
|
// final String[][] res = { { "hello", "" }, { "" }, // Excel format does not ignore empty lines
|
||||||
// for (final String code : codes) {
|
// { "" } };
|
||||||
// final CSVParser parser = CSVParser.parse(code, CSVFormat.EXCEL);
|
// for (final String code : codes) {
|
||||||
// final List<CSVRecord> records = parser.getRecords();
|
// final CSVParser parser = CSVParser.parse(code, CSVFormat.EXCEL);
|
||||||
// assertEquals(res.length, records.size());
|
// final List<CSVRecord> records = parser.getRecords();
|
||||||
// assertTrue(records.size() > 0);
|
// assertEquals(res.length, records.size());
|
||||||
// for (int i = 0; i < res.length; i++) {
|
// assertTrue(records.size() > 0);
|
||||||
// assertArrayEquals(res[i], records.get(i).values());
|
// for (int i = 0; i < res.length; i++) {
|
||||||
// }
|
// assertArrayEquals(res[i], records.get(i).values());
|
||||||
// parser.close();
|
// }
|
||||||
// }
|
// parser.close();
|
||||||
// }
|
// }
|
||||||
|
// }
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testEndOfFileBehaviorCSV() throws Exception {
|
public void testEndOfFileBehaviorCSV() throws Exception {
|
||||||
|
@ -474,6 +475,16 @@ public class CSVParserTest {
|
||||||
this.validateLineNumbers(String.valueOf(LF));
|
this.validateLineNumbers(String.valueOf(LF));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetRecordPositionWithCRLF() throws Exception {
|
||||||
|
this.validateRecordPosition(CRLF);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetRecordPositionWithLF() throws Exception {
|
||||||
|
this.validateRecordPosition(String.valueOf(LF));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testGetOneLine() throws IOException {
|
public void testGetOneLine() throws IOException {
|
||||||
final CSVParser parser = CSVParser.parse(CSV_INPUT_1, CSVFormat.DEFAULT);
|
final CSVParser parser = CSVParser.parse(CSV_INPUT_1, CSVFormat.DEFAULT);
|
||||||
|
@ -902,4 +913,65 @@ public class CSVParserTest {
|
||||||
parser.close();
|
parser.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void validateRecordPosition(final String lineSeparator) throws IOException {
|
||||||
|
final String nl = lineSeparator; // used as linebreak in values for better distinction
|
||||||
|
|
||||||
|
String code = "a,b,c" + lineSeparator + "1,2,3" + lineSeparator +
|
||||||
|
// to see if the record's character position correctly points to the opening quote
|
||||||
|
"'A" + nl + "A','B" + nl + "B',CC" + lineSeparator +
|
||||||
|
// Unicode test... not very relevant while operating on strings instead of bytes, but for
|
||||||
|
// completeness...
|
||||||
|
"\u00c4,\u00d6,\u00dc" + lineSeparator + "EOF,EOF,EOF";
|
||||||
|
|
||||||
|
final CSVFormat format = CSVFormat.newFormat(',').withQuote('\'').withRecordSeparator(lineSeparator);
|
||||||
|
CSVParser parser = CSVParser.parse(code, format);
|
||||||
|
|
||||||
|
CSVRecord record;
|
||||||
|
assertEquals(0, parser.getRecordNumber());
|
||||||
|
|
||||||
|
assertNotNull(record = parser.nextRecord());
|
||||||
|
assertEquals(1, record.getRecordNumber());
|
||||||
|
assertEquals(code.indexOf('a'), record.getCharacterPosition());
|
||||||
|
|
||||||
|
assertNotNull(record = parser.nextRecord());
|
||||||
|
assertEquals(2, record.getRecordNumber());
|
||||||
|
assertEquals(code.indexOf('1'), record.getCharacterPosition());
|
||||||
|
|
||||||
|
assertNotNull(record = parser.nextRecord());
|
||||||
|
final long positionRecord3 = record.getCharacterPosition();
|
||||||
|
assertEquals(3, record.getRecordNumber());
|
||||||
|
assertEquals(code.indexOf("'A"), record.getCharacterPosition());
|
||||||
|
assertEquals("A" + lineSeparator + "A", record.get(0));
|
||||||
|
assertEquals("B" + lineSeparator + "B", record.get(1));
|
||||||
|
assertEquals("CC", record.get(2));
|
||||||
|
|
||||||
|
assertNotNull(record = parser.nextRecord());
|
||||||
|
assertEquals(4, record.getRecordNumber());
|
||||||
|
assertEquals(code.indexOf('\u00c4'), record.getCharacterPosition());
|
||||||
|
|
||||||
|
assertNotNull(record = parser.nextRecord());
|
||||||
|
assertEquals(5, record.getRecordNumber());
|
||||||
|
assertEquals(code.indexOf("EOF"), record.getCharacterPosition());
|
||||||
|
|
||||||
|
parser.close();
|
||||||
|
|
||||||
|
// now try to read starting at record 3
|
||||||
|
parser = CSVParser.parse(code.substring((int) positionRecord3), format);
|
||||||
|
parser.setNextRecordNumber(3);
|
||||||
|
parser.setNextCharacterPosition(positionRecord3);
|
||||||
|
|
||||||
|
assertNotNull(record = parser.nextRecord());
|
||||||
|
assertEquals(3, record.getRecordNumber());
|
||||||
|
assertEquals(code.indexOf("'A"), record.getCharacterPosition());
|
||||||
|
assertEquals("A" + lineSeparator + "A", record.get(0));
|
||||||
|
assertEquals("B" + lineSeparator + "B", record.get(1));
|
||||||
|
assertEquals("CC", record.get(2));
|
||||||
|
|
||||||
|
assertNotNull(record = parser.nextRecord());
|
||||||
|
assertEquals(4, record.getRecordNumber());
|
||||||
|
assertEquals(code.indexOf('\u00c4'), record.getCharacterPosition());
|
||||||
|
assertEquals("\u00c4", record.get(0));
|
||||||
|
|
||||||
|
parser.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -45,12 +45,12 @@ public class CSVRecordTest {
|
||||||
@Before
|
@Before
|
||||||
public void setUp() throws Exception {
|
public void setUp() throws Exception {
|
||||||
values = new String[] { "A", "B", "C" };
|
values = new String[] { "A", "B", "C" };
|
||||||
record = new CSVRecord(values, null, null, 0);
|
record = new CSVRecord(values, null, null, 0, -1);
|
||||||
header = new HashMap<String, Integer>();
|
header = new HashMap<String, Integer>();
|
||||||
header.put("first", Integer.valueOf(0));
|
header.put("first", Integer.valueOf(0));
|
||||||
header.put("second", Integer.valueOf(1));
|
header.put("second", Integer.valueOf(1));
|
||||||
header.put("third", Integer.valueOf(2));
|
header.put("third", Integer.valueOf(2));
|
||||||
recordWithHeader = new CSVRecord(values, header, null, 0);
|
recordWithHeader = new CSVRecord(values, header, null, 0, -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
Loading…
Reference in New Issue