From b37c9b5fe049f27aa73dc6ff70926bb6f08a1fa1 Mon Sep 17 00:00:00 2001
From: Jihoon Son <jihoonson@apache.org>
Date: Sat, 24 Jun 2017 06:29:26 +0900
Subject: [PATCH] Fix a bug of CSV/TSV parsers when extracting columns from
 header (#4443)

* Reset fieldNames whenever a new file begins

* Fix test failure

* Fix test failure
---
 .../indexing/common/task/IndexTaskTest.java   |  2 +-
 .../java/util/common/parsers/CSVParser.java   |  5 +-
 .../util/common/parsers/DelimitedParser.java  |  5 +-
 .../util/common/parsers/CSVParserTest.java    | 74 ++++++++++++++++++-
 .../common/parsers/DelimitedParserTest.java   | 70 +++++++++++++++++-
 5 files changed, 148 insertions(+), 8 deletions(-)

diff --git a/indexing-service/src/test/java/io/druid/indexing/common/task/IndexTaskTest.java b/indexing-service/src/test/java/io/druid/indexing/common/task/IndexTaskTest.java
index 2881ab0c519..d25811517fa 100644
--- a/indexing-service/src/test/java/io/druid/indexing/common/task/IndexTaskTest.java
+++ b/indexing-service/src/test/java/io/druid/indexing/common/task/IndexTaskTest.java
@@ -482,7 +482,7 @@ public class IndexTaskTest
 
     Assert.assertEquals(1, segments.size());
 
-    Assert.assertEquals(Arrays.asList("dim"), segments.get(0).getDimensions());
+    Assert.assertEquals(Arrays.asList("d"), segments.get(0).getDimensions());
     Assert.assertEquals(Arrays.asList("val"), segments.get(0).getMetrics());
     Assert.assertEquals(new Interval("2014/P1D"), segments.get(0).getInterval());
   }
diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java
index 957e72560d2..180b095ae94 100644
--- a/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java
+++ b/java-util/src/main/java/io/druid/java/util/common/parsers/CSVParser.java
@@ -113,9 +113,12 @@ public class CSVParser implements Parser<String, Object>
   @Override
   public void startFileFromBeginning()
   {
-    supportSkipHeaderRows = true;
+    if (hasHeaderRow) {
+      fieldNames = null;
+    }
     hasParsedHeader = false;
     skippedHeaderRows = 0;
+    supportSkipHeaderRows = true;
   }
 
   @Override
diff --git a/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java b/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java
index 27114df31e9..c052daee17e 100644
--- a/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java
+++ b/java-util/src/main/java/io/druid/java/util/common/parsers/DelimitedParser.java
@@ -126,9 +126,12 @@ public class DelimitedParser implements Parser<String, Object>
   @Override
   public void startFileFromBeginning()
   {
-    supportSkipHeaderRows = true;
+    if (hasHeaderRow) {
+      fieldNames = null;
+    }
     hasParsedHeader = false;
     skippedHeaderRows = 0;
+    supportSkipHeaderRows = true;
   }
 
   @Override
diff --git a/java-util/src/test/java/io/druid/java/util/common/parsers/CSVParserTest.java b/java-util/src/test/java/io/druid/java/util/common/parsers/CSVParserTest.java
index 37a589b276a..f57bd4077a1 100644
--- a/java-util/src/test/java/io/druid/java/util/common/parsers/CSVParserTest.java
+++ b/java-util/src/test/java/io/druid/java/util/common/parsers/CSVParserTest.java
@@ -22,12 +22,16 @@ package io.druid.java.util.common.parsers;
 import com.google.common.base.Optional;
 import com.google.common.collect.ImmutableMap;
 import org.junit.Assert;
+import org.junit.Rule;
 import org.junit.Test;
+import org.junit.rules.ExpectedException;
 
 import java.util.Map;
 
 public class CSVParserTest
 {
+  @Rule
+  public ExpectedException expectedException = ExpectedException.none();
 
   @Test
   public void testValidHeader()
@@ -117,9 +121,71 @@ public class CSVParserTest
     );
   }
 
-  @Test(expected = UnsupportedOperationException.class)
+  @Test
+  public void testCSVParserWithHeaderRow()
+  {
+    final Parser<String, Object> csvParser = new CSVParser(
+        Optional.absent(),
+        true,
+        0
+    );
+    csvParser.startFileFromBeginning();
+    final String[] body = new String[] {
+        "time,value1,value2",
+        "hello,world,foo"
+    };
+    Assert.assertNull(csvParser.parse(body[0]));
+    final Map<String, Object> jsonMap = csvParser.parse(body[1]);
+    Assert.assertEquals(
+        "jsonMap",
+        ImmutableMap.of("time", "hello", "value1", "world", "value2", "foo"),
+        jsonMap
+    );
+  }
+
+  @Test
+  public void testCSVParserWithDifferentHeaderRows()
+  {
+    final Parser<String, Object> csvParser = new CSVParser(
+        Optional.absent(),
+        true,
+        0
+    );
+    csvParser.startFileFromBeginning();
+    final String[] body = new String[] {
+        "time,value1,value2",
+        "hello,world,foo"
+    };
+    Assert.assertNull(csvParser.parse(body[0]));
+    Map<String, Object> jsonMap = csvParser.parse(body[1]);
+    Assert.assertEquals(
+        "jsonMap",
+        ImmutableMap.of("time", "hello", "value1", "world", "value2", "foo"),
+        jsonMap
+    );
+
+    csvParser.startFileFromBeginning();
+    final String[] body2 = new String[] {
+        "time,value1,value2,value3",
+        "hello,world,foo,bar"
+    };
+    Assert.assertNull(csvParser.parse(body2[0]));
+    jsonMap = csvParser.parse(body2[1]);
+    Assert.assertEquals(
+        "jsonMap",
+        ImmutableMap.of("time", "hello", "value1", "world", "value2", "foo", "value3", "bar"),
+        jsonMap
+    );
+  }
+
+  @Test
   public void testCSVParserWithoutStartFileFromBeginning()
   {
+    expectedException.expect(UnsupportedOperationException.class);
+    expectedException.expectMessage(
+        "hasHeaderRow or maxSkipHeaderRows is not supported. Please check the indexTask supports these options."
+    );
+
     final int skipHeaderRows = 2;
     final Parser<String, Object> csvParser = new CSVParser(
         Optional.absent(),
@@ -127,9 +193,9 @@ public class CSVParserTest
         skipHeaderRows
     );
     final String[] body = new String[] {
-        "header\tline\t1",
-        "header\tline\t2",
-        "hello\tworld\tfoo"
+        "header,line,1",
+        "header,line,2",
+        "hello,world,foo"
     };
     csvParser.parse(body[0]);
   }
diff --git a/java-util/src/test/java/io/druid/java/util/common/parsers/DelimitedParserTest.java b/java-util/src/test/java/io/druid/java/util/common/parsers/DelimitedParserTest.java
index d91ed25cbbc..ca913253eb6 100644
--- a/java-util/src/test/java/io/druid/java/util/common/parsers/DelimitedParserTest.java
+++ b/java-util/src/test/java/io/druid/java/util/common/parsers/DelimitedParserTest.java
@@ -22,12 +22,16 @@ package io.druid.java.util.common.parsers;
 import com.google.common.base.Optional;
 import com.google.common.collect.ImmutableMap;
 import org.junit.Assert;
+import org.junit.Rule;
 import org.junit.Test;
+import org.junit.rules.ExpectedException;
 
 import java.util.Map;
 
 public class DelimitedParserTest
 {
+  @Rule
+  public ExpectedException expectedException = ExpectedException.none();
 
   @Test
   public void testValidHeader()
@@ -127,9 +131,73 @@ public class DelimitedParserTest
     );
   }
 
-  @Test(expected = UnsupportedOperationException.class)
+  @Test
+  public void testTSVParserWithHeaderRow()
+  {
+    final Parser<String, Object> parser = new DelimitedParser(
+        Optional.of("\t"),
+        Optional.absent(),
+        true,
+        0
+    );
+    parser.startFileFromBeginning();
+    final String[] body = new String[] {
+        "time\tvalue1\tvalue2",
+        "hello\tworld\tfoo"
+    };
+    Assert.assertNull(parser.parse(body[0]));
+    final Map<String, Object> jsonMap = parser.parse(body[1]);
+    Assert.assertEquals(
+        "jsonMap",
+        ImmutableMap.of("time", "hello", "value1", "world", "value2", "foo"),
+        jsonMap
+    );
+  }
+
+  @Test
+  public void testTSVParserWithDifferentHeaderRows()
+  {
+    final Parser<String, Object> csvParser = new DelimitedParser(
+        Optional.of("\t"),
+        Optional.absent(),
+        true,
+        0
+    );
+    csvParser.startFileFromBeginning();
+    final String[] body = new String[] {
+        "time\tvalue1\tvalue2",
+        "hello\tworld\tfoo"
+    };
+    Assert.assertNull(csvParser.parse(body[0]));
+    Map<String, Object> jsonMap = csvParser.parse(body[1]);
+    Assert.assertEquals(
+        "jsonMap",
+        ImmutableMap.of("time", "hello", "value1", "world", "value2", "foo"),
+        jsonMap
+    );
+
+    csvParser.startFileFromBeginning();
+    final String[] body2 = new String[] {
+        "time\tvalue1\tvalue2\tvalue3",
+        "hello\tworld\tfoo\tbar"
+    };
+    Assert.assertNull(csvParser.parse(body2[0]));
+    jsonMap = csvParser.parse(body2[1]);
+    Assert.assertEquals(
+        "jsonMap",
+        ImmutableMap.of("time", "hello", "value1", "world", "value2", "foo", "value3", "bar"),
+        jsonMap
+    );
+  }
+
+  @Test
   public void testTSVParserWithoutStartFileFromBeginning()
   {
+    expectedException.expect(UnsupportedOperationException.class);
+    expectedException.expectMessage(
+        "hasHeaderRow or maxSkipHeaderRows is not supported. Please check the indexTask supports these options."
+    );
+
     final int skipHeaderRows = 2;
     final Parser<String, Object> delimitedParser = new DelimitedParser(
         Optional.of("\t"),