From b37c9b5fe049f27aa73dc6ff70926bb6f08a1fa1 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Sat, 24 Jun 2017 06:29:26 +0900 Subject: [PATCH] Fix a bug of CSV/TSV parsers when extracting columns from header (#4443) * Reset fieldNames whenever a new file begins * Fix test failure * Fix test failure --- .../indexing/common/task/IndexTaskTest.java | 2 +- .../java/util/common/parsers/CSVParser.java | 5 +- .../util/common/parsers/DelimitedParser.java | 5 +- .../util/common/parsers/CSVParserTest.java | 74 ++++++++++++++++++- .../common/parsers/DelimitedParserTest.java | 70 +++++++++++++++++- 5 files changed, 148 insertions(+), 8 deletions(-) diff --git a/indexing-service/src/test/java/io/druid/indexing/common/task/IndexTaskTest.java b/indexing-service/src/test/java/io/druid/indexing/common/task/IndexTaskTest.java index 2881ab0c519..d25811517fa 100644 --- a/indexing-service/src/test/java/io/druid/indexing/common/task/IndexTaskTest.java +++ b/indexing-service/src/test/java/io/druid/indexing/common/task/IndexTaskTest.java @@ -482,7 +482,7 @@ public class IndexTaskTest Assert.assertEquals(1, segments.size()); - Assert.assertEquals(Arrays.asList("dim"), segments.get(0).getDimensions()); + Assert.assertEquals(Arrays.asList("d"), segments.get(0).getDimensions()); Assert.assertEquals(Arrays.asList("val"), segments.get(0).getMetrics()); Assert.assertEquals(new Interval("2014/P1D"), segments.get(0).getInterval()); } diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java index 957e72560d2..180b095ae94 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java @@ -113,9 +113,12 @@ public class CSVParser implements Parser @Override public void startFileFromBeginning() { - supportSkipHeaderRows = true; + if (hasHeaderRow) { + fieldNames = null; + } hasParsedHeader = false; skippedHeaderRows = 0; + supportSkipHeaderRows = true; } @Override diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java index 27114df31e9..c052daee17e 100644 --- a/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java +++ b/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java @@ -126,9 +126,12 @@ public class DelimitedParser implements Parser @Override public void startFileFromBeginning() { - supportSkipHeaderRows = true; + if (hasHeaderRow) { + fieldNames = null; + } hasParsedHeader = false; skippedHeaderRows = 0; + supportSkipHeaderRows = true; } @Override diff --git a/java-util/src/test/java/io/druid/java/util/common/parsers/CSVParserTest.java b/java-util/src/test/java/io/druid/java/util/common/parsers/CSVParserTest.java index 37a589b276a..f57bd4077a1 100644 --- a/java-util/src/test/java/io/druid/java/util/common/parsers/CSVParserTest.java +++ b/java-util/src/test/java/io/druid/java/util/common/parsers/CSVParserTest.java @@ -22,12 +22,16 @@ package io.druid.java.util.common.parsers; import com.google.common.base.Optional; import com.google.common.collect.ImmutableMap; import org.junit.Assert; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.ExpectedException; import java.util.Map; public class CSVParserTest { + @Rule + public ExpectedException expectedException = ExpectedException.none(); @Test public void testValidHeader() @@ -117,9 +121,71 @@ public class CSVParserTest ); } - @Test(expected = UnsupportedOperationException.class) + @Test + public void testCSVParserWithHeaderRow() + { + final Parser csvParser = new CSVParser( + Optional.absent(), + true, + 0 + ); + csvParser.startFileFromBeginning(); + final String[] body = new String[] { + "time,value1,value2", + "hello,world,foo" + }; + Assert.assertNull(csvParser.parse(body[0])); + final Map jsonMap = csvParser.parse(body[1]); + Assert.assertEquals( + "jsonMap", + ImmutableMap.of("time", "hello", "value1", "world", "value2", "foo"), + jsonMap + ); + } + + @Test + public void testCSVParserWithDifferentHeaderRows() + { + final Parser csvParser = new CSVParser( + Optional.absent(), + true, + 0 + ); + csvParser.startFileFromBeginning(); + final String[] body = new String[] { + "time,value1,value2", + "hello,world,foo" + }; + Assert.assertNull(csvParser.parse(body[0])); + Map jsonMap = csvParser.parse(body[1]); + Assert.assertEquals( + "jsonMap", + ImmutableMap.of("time", "hello", "value1", "world", "value2", "foo"), + jsonMap + ); + + csvParser.startFileFromBeginning(); + final String[] body2 = new String[] { + "time,value1,value2,value3", + "hello,world,foo,bar" + }; + Assert.assertNull(csvParser.parse(body2[0])); + jsonMap = csvParser.parse(body2[1]); + Assert.assertEquals( + "jsonMap", + ImmutableMap.of("time", "hello", "value1", "world", "value2", "foo", "value3", "bar"), + jsonMap + ); + } + + @Test public void testCSVParserWithoutStartFileFromBeginning() { + expectedException.expect(UnsupportedOperationException.class); + expectedException.expectMessage( + "hasHeaderRow or maxSkipHeaderRows is not supported. Please check the indexTask supports these options." + ); + final int skipHeaderRows = 2; final Parser csvParser = new CSVParser( Optional.absent(), @@ -127,9 +193,9 @@ public class CSVParserTest skipHeaderRows ); final String[] body = new String[] { - "header\tline\t1", - "header\tline\t2", - "hello\tworld\tfoo" + "header,line,1", + "header,line,2", + "hello,world,foo" }; csvParser.parse(body[0]); } diff --git a/java-util/src/test/java/io/druid/java/util/common/parsers/DelimitedParserTest.java b/java-util/src/test/java/io/druid/java/util/common/parsers/DelimitedParserTest.java index d91ed25cbbc..ca913253eb6 100644 --- a/java-util/src/test/java/io/druid/java/util/common/parsers/DelimitedParserTest.java +++ b/java-util/src/test/java/io/druid/java/util/common/parsers/DelimitedParserTest.java @@ -22,12 +22,16 @@ package io.druid.java.util.common.parsers; import com.google.common.base.Optional; import com.google.common.collect.ImmutableMap; import org.junit.Assert; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.ExpectedException; import java.util.Map; public class DelimitedParserTest { + @Rule + public ExpectedException expectedException = ExpectedException.none(); @Test public void testValidHeader() @@ -127,9 +131,73 @@ public class DelimitedParserTest ); } - @Test(expected = UnsupportedOperationException.class) + @Test + public void testTSVParserWithHeaderRow() + { + final Parser parser = new DelimitedParser( + Optional.of("\t"), + Optional.absent(), + true, + 0 + ); + parser.startFileFromBeginning(); + final String[] body = new String[] { + "time\tvalue1\tvalue2", + "hello\tworld\tfoo" + }; + Assert.assertNull(parser.parse(body[0])); + final Map jsonMap = parser.parse(body[1]); + Assert.assertEquals( + "jsonMap", + ImmutableMap.of("time", "hello", "value1", "world", "value2", "foo"), + jsonMap + ); + } + + @Test + public void testTSVParserWithDifferentHeaderRows() + { + final Parser csvParser = new DelimitedParser( + Optional.of("\t"), + Optional.absent(), + true, + 0 + ); + csvParser.startFileFromBeginning(); + final String[] body = new String[] { + "time\tvalue1\tvalue2", + "hello\tworld\tfoo" + }; + Assert.assertNull(csvParser.parse(body[0])); + Map jsonMap = csvParser.parse(body[1]); + Assert.assertEquals( + "jsonMap", + ImmutableMap.of("time", "hello", "value1", "world", "value2", "foo"), + jsonMap + ); + + csvParser.startFileFromBeginning(); + final String[] body2 = new String[] { + "time\tvalue1\tvalue2\tvalue3", + "hello\tworld\tfoo\tbar" + }; + Assert.assertNull(csvParser.parse(body2[0])); + jsonMap = csvParser.parse(body2[1]); + Assert.assertEquals( + "jsonMap", + ImmutableMap.of("time", "hello", "value1", "world", "value2", "foo", "value3", "bar"), + jsonMap + ); + } + + @Test public void testTSVParserWithoutStartFileFromBeginning() { + expectedException.expect(UnsupportedOperationException.class); + expectedException.expectMessage( + "hasHeaderRow or maxSkipHeaderRows is not supported. Please check the indexTask supports these options." + ); + final int skipHeaderRows = 2; final Parser delimitedParser = new DelimitedParser( Optional.of("\t"),