[CSV-93] Allow the handling of NULL values.

- [CSV-253] Handle absent values in input (null).
- Cleaned up version of PR 77 from dota17 where:
- Don't duplicate two state items from the format.
- Use try-with-resources.
- Remove useless parens.
- Update Javaodc.
- Sort members in the new tests.
- Use builder.
This commit is contained in:
Gary Gregory 2021-07-08 12:36:23 -04:00
parent 4083b7a2a0
commit ac280e705d
7 changed files with 243 additions and 6 deletions

View File

@ -446,8 +446,7 @@ public final class CSVParser implements Iterable<CSVRecord>, Closeable {
if (lastRecord && inputClean.isEmpty() && this.format.getTrailingDelimiter()) {
return;
}
final String nullString = this.format.getNullString();
this.recordList.add(inputClean.equals(nullString) ? null : inputClean);
this.recordList.add(handleNull(inputClean));
}
/**
@ -636,7 +635,26 @@ public final class CSVParser implements Iterable<CSVRecord>, Closeable {
}
/**
* Gets whether this parser is closed.
* Handle whether input is parsed as null
*
* @param input
* the cell data to further processed
* @return null if input is parsed as null, or input itself if input isn't parsed as null
*/
private String handleNull(final String input) {
final boolean isQuoted = this.reusableToken.isQuoted;
final String nullString = format.getNullString();
final boolean strictQuoteMode = isStrictQuoteMode();
if (input.equals(nullString)) {
// nullString = NULL(String), distinguish between "NULL" and NULL in ALL_NON_NULL or NON_NUMERIC quote mode
return strictQuoteMode && isQuoted ? input : null;
}
// don't set nullString, distinguish between "" and ,, (absent values) in All_NON_NULL or NON_NUMERIC quote mode
return strictQuoteMode && nullString == null && input.isEmpty() && !isQuoted ? null : input;
}
/**
* Tests whether this parser is closed.
*
* @return whether this parser is closed.
*/
@ -645,7 +663,18 @@ public final class CSVParser implements Iterable<CSVRecord>, Closeable {
}
/**
* Returns an iterator on the records.
* Tests whether the format's {@link QuoteMode} is {@link QuoteMode#ALL_NON_NULL} or {@link QuoteMode#NON_NUMERIC}.
*
* @return true if the format's {@link QuoteMode} is {@link QuoteMode#ALL_NON_NULL} or
* {@link QuoteMode#NON_NUMERIC}.
*/
private boolean isStrictQuoteMode() {
return this.format.getQuoteMode() == QuoteMode.ALL_NON_NULL ||
this.format.getQuoteMode() == QuoteMode.NON_NUMERIC;
}
/**
* Returns the record iterator.
*
* <p>
* An {@link IOException} caught during the iteration are re-thrown as an

View File

@ -324,6 +324,7 @@ final class Lexer implements Closeable {
* on invalid state: EOF before closing encapsulator or invalid character before delimiter or EOL
*/
private Token parseEncapsulatedToken(final Token token) throws IOException {
token.isQuoted = true;
// save current line number in case needed for IOE
final long startLineNumber = getCurrentLineNumber();
int c;

View File

@ -17,7 +17,7 @@
package org.apache.commons.csv;
/**
* Defines quoting behavior when printing.
* Defines quoting behavior.
*/
public enum QuoteMode {

View File

@ -55,10 +55,13 @@ final class Token {
/** Token ready flag: indicates a valid token with content (ready for the parser). */
boolean isReady;
boolean isQuoted;
void reset() {
content.setLength(0);
type = INVALID;
isReady = false;
isQuoted = false;
}
/**

View File

@ -873,7 +873,7 @@ public class CSVPrinterTest {
String expected = "\"NULL\"\tNULL\n";
assertEquals(expected, writer.toString());
String[] record0 = toFirstRecordValues(expected, format);
assertArrayEquals(new Object[2], record0);
assertArrayEquals(s, record0);
s = new String[] { "\\N", null };
format = CSVFormat.MYSQL.withNullString("\\N");

View File

@ -0,0 +1,54 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.csv.issues;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.io.IOException;
import java.io.StringReader;
import java.util.Iterator;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.csv.QuoteMode;
import org.junit.jupiter.api.Test;
/**
* Setting QuoteMode:ALL_NON_NULL or NON_NUMERIC can distinguish between empty string columns and absent value columns.
*/
public class JiraCsv253Test {
private void assertArrayEqual(final String[] expected, final CSVRecord actual) {
for (int i = 0; i < expected.length; i++) {
assertEquals(expected[i], actual.get(i));
}
}
@Test
public void testHandleAbsentValues() throws IOException {
final String source = "\"John\",,\"Doe\"\n" + ",\"AA\",123\n" + "\"John\",90,\n" + "\"\",,90";
final CSVFormat csvFormat = CSVFormat.DEFAULT.builder().setQuoteMode(QuoteMode.NON_NUMERIC).build();
try (final CSVParser parser = csvFormat.parse(new StringReader(source))) {
final Iterator<CSVRecord> csvRecords = parser.iterator();
assertArrayEqual(new String[] {"John", null, "Doe"}, csvRecords.next());
assertArrayEqual(new String[] {null, "AA", "123"}, csvRecords.next());
assertArrayEqual(new String[] {"John", "90", null}, csvRecords.next());
assertArrayEqual(new String[] {"", null, "90"}, csvRecords.next());
}
}
}

View File

@ -0,0 +1,150 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.csv.issues;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.io.IOException;
import java.io.StringReader;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.csv.QuoteMode;
import org.junit.jupiter.api.Test;
/**
* Add more tests about null value.
* <p>
* QuoteMode:ALL_NON_NULL (Quotes all non-null fields, null will not be quoted but not null will be quoted). when
* withNullString("NULL"), NULL String value ("NULL") and null value (null) will be formatted as '"NULL",NULL'. So it
* also should be parsed as NULL String value and null value (["NULL", null]), It should be distinguish in parsing. And
* when don't set nullString in CSVFormat, String '"",' should be parsed as "" and null (["", null]) according to null
* will not be quoted but not null will be quoted. QuoteMode:NON_NUMERIC, same as ALL_NON_NULL.
* </p>
* <p>
* This can solve the problem of distinguishing between empty string columns and absent value columns which just like
* Jira CSV-253 to a certain extent.
* </p>
*/
public class JiraCsv93Test {
private static Object[] objects1 = {"abc", "", null, "a,b,c", 123};
private static Object[] objects2 = {"abc", "NULL", null, "a,b,c", 123};
private void every(final CSVFormat csvFormat, final Object[] objects, final String format, final String[] data)
throws IOException {
final String source = csvFormat.format(objects);
assertEquals(format, csvFormat.format(objects));
try (final CSVParser csvParser = csvFormat.parse(new StringReader(source))) {
final CSVRecord csvRecord = csvParser.iterator().next();
for (int i = 0; i < data.length; i++) {
assertEquals(csvRecord.get(i), data[i]);
}
}
}
@Test
public void testWithNotSetNullString() throws IOException {
// @formatter:off
every(CSVFormat.DEFAULT,
objects1,
"abc,,,\"a,b,c\",123",
new String[]{"abc", "", "", "a,b,c", "123"});
every(CSVFormat.DEFAULT.builder().setQuoteMode(QuoteMode.ALL).build(),
objects1,
"\"abc\",\"\",,\"a,b,c\",\"123\"",
new String[]{"abc", "", "", "a,b,c", "123"});
every(CSVFormat.DEFAULT.builder().setQuoteMode(QuoteMode.ALL_NON_NULL).build(),
objects1,
"\"abc\",\"\",,\"a,b,c\",\"123\"",
new String[]{"abc", "", null, "a,b,c", "123"});
every(CSVFormat.DEFAULT.builder().setQuoteMode(QuoteMode.MINIMAL).build(),
objects1,
"abc,,,\"a,b,c\",123",
new String[]{"abc", "", "", "a,b,c", "123"});
every(CSVFormat.DEFAULT.builder().setEscape('?').setQuoteMode(QuoteMode.NONE).build(),
objects1,
"abc,,,a?,b?,c,123",
new String[]{"abc", "", "", "a,b,c", "123"});
every(CSVFormat.DEFAULT.builder().setQuoteMode(QuoteMode.NON_NUMERIC).build(),
objects1,
"\"abc\",\"\",,\"a,b,c\",123",
new String[]{"abc", "", null, "a,b,c", "123"});
// @formatter:on
}
@Test
public void testWithSetNullStringEmptyString() throws IOException {
// @formatter:off
every(CSVFormat.DEFAULT.builder().setNullString("").build(),
objects1,
"abc,,,\"a,b,c\",123",
new String[]{"abc", null, null, "a,b,c", "123"});
every(CSVFormat.DEFAULT.builder().setNullString("").setQuoteMode(QuoteMode.ALL).build(),
objects1,
"\"abc\",\"\",\"\",\"a,b,c\",\"123\"",
new String[]{"abc", null, null, "a,b,c", "123"});
every(CSVFormat.DEFAULT.builder().setNullString("").setQuoteMode(QuoteMode.ALL_NON_NULL).build(),
objects1,
"\"abc\",\"\",,\"a,b,c\",\"123\"",
new String[]{"abc", "", null, "a,b,c", "123"});
every(CSVFormat.DEFAULT.builder().setNullString("").setQuoteMode(QuoteMode.MINIMAL).build(),
objects1,
"abc,,,\"a,b,c\",123",
new String[]{"abc", null, null, "a,b,c", "123"});
every(CSVFormat.DEFAULT.builder().setNullString("").setEscape('?').setQuoteMode(QuoteMode.NONE).build(),
objects1,
"abc,,,a?,b?,c,123",
new String[]{"abc", null, null, "a,b,c", "123"});
every(CSVFormat.DEFAULT.builder().setNullString("").setQuoteMode(QuoteMode.NON_NUMERIC).build(),
objects1,
"\"abc\",\"\",,\"a,b,c\",123",
new String[]{"abc", "", null, "a,b,c", "123"});
// @formatter:on
}
@Test
public void testWithSetNullStringNULL() throws IOException {
// @formatter:off
every(CSVFormat.DEFAULT.builder().setNullString("NULL").build(),
objects2,
"abc,NULL,NULL,\"a,b,c\",123",
new String[]{"abc", null, null, "a,b,c", "123"});
every(CSVFormat.DEFAULT.builder().setNullString("NULL").setQuoteMode(QuoteMode.ALL).build(),
objects2,
"\"abc\",\"NULL\",\"NULL\",\"a,b,c\",\"123\"",
new String[]{"abc", null, null, "a,b,c", "123"});
every(CSVFormat.DEFAULT.builder().setNullString("NULL").setQuoteMode(QuoteMode.ALL_NON_NULL).build(),
objects2,
"\"abc\",\"NULL\",NULL,\"a,b,c\",\"123\"",
new String[]{"abc", "NULL", null, "a,b,c", "123"});
every(CSVFormat.DEFAULT.builder().setNullString("NULL").setQuoteMode(QuoteMode.MINIMAL).build(),
objects2,
"abc,NULL,NULL,\"a,b,c\",123",
new String[]{"abc", null, null, "a,b,c", "123"});
every(CSVFormat.DEFAULT.builder().setNullString("NULL").setEscape('?').setQuoteMode(QuoteMode.NONE).build(),
objects2,
"abc,NULL,NULL,a?,b?,c,123",
new String[]{"abc", null, null, "a,b,c", "123"});
every(CSVFormat.DEFAULT.builder().setNullString("NULL").setQuoteMode(QuoteMode.NON_NUMERIC).build(),
objects2,
"\"abc\",\"NULL\",NULL,\"a,b,c\",123",
new String[]{"abc", "NULL", null, "a,b,c", "123"});
// @formatter:on
}
}