From f5fbd0bea0715bc3fd3d7ac63028e168cf257712 Mon Sep 17 00:00:00 2001 From: Atul Mohan Date: Wed, 20 Nov 2019 00:35:22 -0600 Subject: [PATCH] Handle missing values for delimited text files when Nullhandling is enabled (#8779) * Handle missing values * Fix multi value tests * Fix firehose tests * Fix conflicts --- .../druid/java/util/common/parsers/CSVParser.java | 8 +++++++- .../java/util/common/parsers/DelimitedParser.java | 8 +++++++- .../common/parsers/FlatTextFormatParserTest.java | 14 ++++++++++++++ .../overlord/sampler/FirehoseSamplerTest.java | 4 +--- .../druid/query/MultiValuedDimensionTest.java | 2 +- 5 files changed, 30 insertions(+), 6 deletions(-) diff --git a/core/src/main/java/org/apache/druid/java/util/common/parsers/CSVParser.java b/core/src/main/java/org/apache/druid/java/util/common/parsers/CSVParser.java index 0a26433933c..9b99014dc69 100644 --- a/core/src/main/java/org/apache/druid/java/util/common/parsers/CSVParser.java +++ b/core/src/main/java/org/apache/druid/java/util/common/parsers/CSVParser.java @@ -21,6 +21,9 @@ package org.apache.druid.java.util.common.parsers; import com.google.common.annotations.VisibleForTesting; import com.opencsv.RFC4180Parser; +import com.opencsv.RFC4180ParserBuilder; +import com.opencsv.enums.CSVReaderNullFieldIndicator; +import org.apache.druid.common.config.NullHandling; import javax.annotation.Nullable; import java.io.IOException; @@ -29,7 +32,10 @@ import java.util.List; public class CSVParser extends AbstractFlatTextFormatParser { - private final RFC4180Parser parser = new RFC4180Parser(); + private final RFC4180Parser parser = NullHandling.replaceWithDefault() + ? new RFC4180Parser() + : new RFC4180ParserBuilder().withFieldAsNull( + CSVReaderNullFieldIndicator.EMPTY_SEPARATORS).build(); public CSVParser( @Nullable final String listDelimiter, diff --git a/core/src/main/java/org/apache/druid/java/util/common/parsers/DelimitedParser.java b/core/src/main/java/org/apache/druid/java/util/common/parsers/DelimitedParser.java index 61eafa70f19..849d45e8557 100644 --- a/core/src/main/java/org/apache/druid/java/util/common/parsers/DelimitedParser.java +++ b/core/src/main/java/org/apache/druid/java/util/common/parsers/DelimitedParser.java @@ -22,6 +22,7 @@ package org.apache.druid.java.util.common.parsers; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.base.Splitter; +import org.apache.druid.common.config.NullHandling; import javax.annotation.Nullable; import java.util.ArrayList; @@ -94,7 +95,12 @@ public class DelimitedParser extends AbstractFlatTextFormatParser List result = new ArrayList(); while (iterator.hasNext()) { - result.add(iterator.next()); + String splitValue = iterator.next(); + if (!NullHandling.replaceWithDefault() && splitValue.isEmpty()) { + result.add(null); + } else { + result.add(splitValue); + } } return Collections.unmodifiableList(result); diff --git a/core/src/test/java/org/apache/druid/java/util/common/parsers/FlatTextFormatParserTest.java b/core/src/test/java/org/apache/druid/java/util/common/parsers/FlatTextFormatParserTest.java index 571a0d359b8..bda9674cba6 100644 --- a/core/src/test/java/org/apache/druid/java/util/common/parsers/FlatTextFormatParserTest.java +++ b/core/src/test/java/org/apache/druid/java/util/common/parsers/FlatTextFormatParserTest.java @@ -215,6 +215,20 @@ public class FlatTextFormatParserTest parser.parseToMap(body[0]); } + @Test + public void testWithNullValues() + { + final Parser parser = PARSER_FACTORY.get(format, true, 0); + parser.startFileFromBeginning(); + final String[] body = new String[]{ + concat(format, "time", "value1", "value2"), + concat(format, "hello", "world", "") + }; + Assert.assertNull(parser.parseToMap(body[0])); + final Map jsonMap = parser.parseToMap(body[1]); + Assert.assertNull(jsonMap.get("value2")); + } + private static class FlatTextFormatParserFactory { public Parser get(FlatTextFormat format) diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/sampler/FirehoseSamplerTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/sampler/FirehoseSamplerTest.java index 62526cfe327..96e12e66478 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/sampler/FirehoseSamplerTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/sampler/FirehoseSamplerTest.java @@ -1190,9 +1190,7 @@ public class FirehoseSamplerTest private String getUnparseableTimestampString() { return ParserType.STR_CSV.equals(parserType) - ? (USE_DEFAULT_VALUE_FOR_NULL - ? "Unparseable timestamp found! Event: {t=bad_timestamp, dim1=foo, dim2=null, met1=6}" - : "Unparseable timestamp found! Event: {t=bad_timestamp, dim1=foo, dim2=, met1=6}") + ? "Unparseable timestamp found! Event: {t=bad_timestamp, dim1=foo, dim2=null, met1=6}" : "Unparseable timestamp found! Event: {t=bad_timestamp, dim1=foo, met1=6}"; } diff --git a/processing/src/test/java/org/apache/druid/query/MultiValuedDimensionTest.java b/processing/src/test/java/org/apache/druid/query/MultiValuedDimensionTest.java index b570677a0ed..93cf2dced8a 100644 --- a/processing/src/test/java/org/apache/druid/query/MultiValuedDimensionTest.java +++ b/processing/src/test/java/org/apache/druid/query/MultiValuedDimensionTest.java @@ -161,7 +161,7 @@ public class MultiValuedDimensionTest "2011-01-12T00:00:00.000Z,product_1,t1\tt2\tt3,u1\tu2", "2011-01-13T00:00:00.000Z,product_2,t3\tt4\tt5,u3\tu4", "2011-01-14T00:00:00.000Z,product_3,t5\tt6\tt7,u1\tu5", - "2011-01-14T00:00:00.000Z,product_4,,u2" + "2011-01-14T00:00:00.000Z,product_4,\"\",u2" }; for (String row : rows) {