Handle missing values for delimited text files when null handling is enabled (#8779)

* Handle missing values

* Fix multi value tests

* Fix firehose tests

* Fix conflicts
This commit is contained in:
Atul Mohan 2019-11-20 00:35:22 -06:00 committed by Clint Wylie
parent 4ae6466ae2
commit f5fbd0bea0
5 changed files with 30 additions and 6 deletions

View File

@ -21,6 +21,9 @@ package org.apache.druid.java.util.common.parsers;
import com.google.common.annotations.VisibleForTesting;
import com.opencsv.RFC4180Parser;
import com.opencsv.RFC4180ParserBuilder;
import com.opencsv.enums.CSVReaderNullFieldIndicator;
import org.apache.druid.common.config.NullHandling;
import javax.annotation.Nullable;
import java.io.IOException;
@ -29,7 +32,10 @@ import java.util.List;
public class CSVParser extends AbstractFlatTextFormatParser
{
private final RFC4180Parser parser = new RFC4180Parser();
private final RFC4180Parser parser = NullHandling.replaceWithDefault()
? new RFC4180Parser()
: new RFC4180ParserBuilder().withFieldAsNull(
CSVReaderNullFieldIndicator.EMPTY_SEPARATORS).build();
public CSVParser(
@Nullable final String listDelimiter,

View File

@ -22,6 +22,7 @@ package org.apache.druid.java.util.common.parsers;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import org.apache.druid.common.config.NullHandling;
import javax.annotation.Nullable;
import java.util.ArrayList;
@ -94,7 +95,12 @@ public class DelimitedParser extends AbstractFlatTextFormatParser
List<String> result = new ArrayList<String>();
while (iterator.hasNext()) {
result.add(iterator.next());
String splitValue = iterator.next();
if (!NullHandling.replaceWithDefault() && splitValue.isEmpty()) {
result.add(null);
} else {
result.add(splitValue);
}
}
return Collections.unmodifiableList(result);

View File

@ -215,6 +215,20 @@ public class FlatTextFormatParserTest
parser.parseToMap(body[0]);
}
/**
 * Checks that an empty trailing field in a flat-text row is surfaced as {@code null}.
 *
 * <p>Two lines are fed to the parser in order: a line naming the columns
 * ({@code time}, {@code value1}, {@code value2}) and a data line whose last field is empty.
 * Parsing the first line is expected to yield {@code null}; the map produced for the second
 * line must hold {@code null} under {@code value2}.
 */
@Test
public void testWithNullValues()
{
  final Parser<String, Object> flatTextParser = PARSER_FACTORY.get(format, true, 0);
  flatTextParser.startFileFromBeginning();

  // Built up front, in the same order as they are parsed below.
  final String columnLine = concat(format, "time", "value1", "value2");
  final String rowWithEmptyLastField = concat(format, "hello", "world", "");

  // NOTE(review): presumably the first line is consumed as the header row — confirm against PARSER_FACTORY.get.
  Assert.assertNull(flatTextParser.parseToMap(columnLine));

  final Map<String, Object> parsedRow = flatTextParser.parseToMap(rowWithEmptyLastField);
  Assert.assertNull(parsedRow.get("value2"));
}
private static class FlatTextFormatParserFactory
{
public Parser<String, Object> get(FlatTextFormat format)

View File

@ -1190,9 +1190,7 @@ public class FirehoseSamplerTest
/**
 * Returns the "unparseable timestamp" message expected for the current parser type.
 *
 * <p>For {@code ParserType.STR_CSV} the message includes a {@code dim2} entry, rendered as
 * {@code dim2=null} when {@code USE_DEFAULT_VALUE_FOR_NULL} is set and as an empty
 * {@code dim2=} otherwise. For every other parser type the message has no {@code dim2} entry.
 *
 * @return the expected message text
 */
private String getUnparseableTimestampString()
{
  // Anything other than the CSV string parser: the event has no dim2 key.
  if (!ParserType.STR_CSV.equals(parserType)) {
    return "Unparseable timestamp found! Event: {t=bad_timestamp, dim1=foo, met1=6}";
  }

  if (USE_DEFAULT_VALUE_FOR_NULL) {
    return "Unparseable timestamp found! Event: {t=bad_timestamp, dim1=foo, dim2=null, met1=6}";
  }

  return "Unparseable timestamp found! Event: {t=bad_timestamp, dim1=foo, dim2=, met1=6}";
}

View File

@ -161,7 +161,7 @@ public class MultiValuedDimensionTest
"2011-01-12T00:00:00.000Z,product_1,t1\tt2\tt3,u1\tu2",
"2011-01-13T00:00:00.000Z,product_2,t3\tt4\tt5,u3\tu4",
"2011-01-14T00:00:00.000Z,product_3,t5\tt6\tt7,u1\tu5",
"2011-01-14T00:00:00.000Z,product_4,,u2"
"2011-01-14T00:00:00.000Z,product_4,\"\",u2"
};
for (String row : rows) {