Hadoop indexing: Fix NPE when intervals not provided (#4686)

* Fix #4647 * NPE protect bucketInterval as well * Add test to verify timezone as well * Also handle case when intervals are already present * Fix checkstyle error * Use factory method instead for Datetime * Use Intervals factory method
2017-10-05 22:46:07 -07:00 · 2017-10-05 22:46:07 -07:00 · 4ff12e4394
parent 716a5ec1a8
commit 4ff12e4394
4 changed files with 75 additions and 13 deletions
--- a/indexing-hadoop/src/main/java/io/druid/indexer/DetermineHashedPartitionsJob.java
+++ b/indexing-hadoop/src/main/java/io/druid/indexer/DetermineHashedPartitionsJob.java
@ -300,12 +300,14 @@ public class DetermineHashedPartitionsJob implements Jobby
  {
    private final List<Interval> intervals = Lists.newArrayList();
    protected HadoopDruidIndexerConfig config = null;
    private boolean determineIntervals;
    @Override
    protected void setup(Context context)
        throws IOException, InterruptedException
    {
      config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
      determineIntervals = !config.getSegmentGranularIntervals().isPresent();
    }
    @Override
@ -321,12 +323,20 @@ public class DetermineHashedPartitionsJob implements Jobby
            HyperLogLogCollector.makeCollector(ByteBuffer.wrap(value.getBytes(), 0, value.getLength()))
        );
      }
      Optional<Interval> intervalOptional = config.getGranularitySpec().bucketInterval(DateTimes.utc(key.get()));
-      if (!intervalOptional.isPresent()) {
+      Interval interval;
-        throw new ISE("WTF?! No bucket found for timestamp: %s", key.get());
+
      if (determineIntervals) {
        interval = config.getGranularitySpec().getSegmentGranularity().bucket(DateTimes.utc(key.get()));
      } else {
        Optional<Interval> intervalOptional = config.getGranularitySpec().bucketInterval(DateTimes.utc(key.get()));
        if (!intervalOptional.isPresent()) {
          throw new ISE("WTF?! No bucket found for timestamp: %s", key.get());
        }
        interval = intervalOptional.get();
      }
-      Interval interval = intervalOptional.get();
+
      intervals.add(interval);
      final Path outPath = config.makeSegmentPartitionInfoPath(interval);
      final OutputStream out = Utils.makePathAndOutputStream(
@ -353,7 +363,7 @@ public class DetermineHashedPartitionsJob implements Jobby
        throws IOException, InterruptedException
    {
      super.run(context);
-      if (!config.getSegmentGranularIntervals().isPresent()) {
+      if (determineIntervals) {
        final Path outPath = config.makeIntervalInfoPath();
        final OutputStream out = Utils.makePathAndOutputStream(
            context, outPath, config.isOverwriteFiles()
--- a/indexing-hadoop/src/test/java/io/druid/indexer/DetermineHashedPartitionsJobTest.java
+++ b/indexing-hadoop/src/test/java/io/druid/indexer/DetermineHashedPartitionsJobTest.java
@ -29,10 +29,15 @@ import io.druid.data.input.impl.TimestampSpec;
 import io.druid.indexer.partitions.HashedPartitionsSpec;
 import io.druid.java.util.common.Intervals;
 import io.druid.java.util.common.granularity.Granularities;
 import io.druid.java.util.common.granularity.Granularity;
 import io.druid.java.util.common.granularity.PeriodGranularity;
 import io.druid.query.aggregation.AggregatorFactory;
 import io.druid.query.aggregation.DoubleSumAggregatorFactory;
 import io.druid.segment.indexing.DataSchema;
 import io.druid.segment.indexing.granularity.UniformGranularitySpec;
 import org.joda.time.DateTimeZone;
 import org.joda.time.Interval;
 import org.joda.time.Period;
 import org.junit.Assert;
 import org.junit.Test;
 import org.junit.runner.RunWith;
@ -53,7 +58,7 @@ public class DetermineHashedPartitionsJobTest
  private int[] expectedNumOfShards;
  private int errorMargin;
-  @Parameterized.Parameters(name = "File={0}, TargetPartitionSize={1}, Interval={2}, ErrorMargin={3}, NumTimeBuckets={4}, NumShards={5}")
+  @Parameterized.Parameters(name = "File={0}, TargetPartitionSize={1}, Interval={2}, ErrorMargin={3}, NumTimeBuckets={4}, NumShards={5}, SegmentGranularity={6}")
  public static Collection<?> data()
  {
    int[] first = new int[1];
@ -73,7 +78,8 @@ public class DetermineHashedPartitionsJobTest
                "2011-04-10T00:00:00.000Z/2011-04-11T00:00:00.000Z",
                0,
                1,
-                first
+                first,
                Granularities.DAY
            },
            {
                DetermineHashedPartitionsJobTest.class.getResource("/druid.test.data.with.duplicate.rows.tsv").getPath(),
@ -81,7 +87,8 @@ public class DetermineHashedPartitionsJobTest
                "2011-04-10T00:00:00.000Z/2011-04-16T00:00:00.000Z",
                0,
                6,
-                second
+                second,
                Granularities.DAY
            },
            {
                DetermineHashedPartitionsJobTest.class.getResource("/druid.test.data.with.duplicate.rows.tsv").getPath(),
@ -89,7 +96,26 @@ public class DetermineHashedPartitionsJobTest
                "2011-04-10T00:00:00.000Z/2011-04-16T00:00:00.000Z",
                0,
                6,
-                third
+                third,
                Granularities.DAY
            },
            {
                DetermineHashedPartitionsJobTest.class.getResource("/druid.test.data.with.duplicate.rows.tsv").getPath(),
                1L,
                null,
                0,
                6,
                third,
                Granularities.DAY
            },
            {
                DetermineHashedPartitionsJobTest.class.getResource("/druid.test.data.with.rows.in.timezone.tsv").getPath(),
                1L,
                null,
                0,
                1,
                first,
                new PeriodGranularity(new Period("P1D"), null, DateTimeZone.forID("America/Los_Angeles"))
            }
        }
    );
@ -101,7 +127,8 @@ public class DetermineHashedPartitionsJobTest
      String interval,
      int errorMargin,
      int expectedNumTimeBuckets,
-      int[] expectedNumOfShards
+      int[] expectedNumOfShards,
      Granularity segmentGranularity
  ) throws IOException
  {
    this.expectedNumOfShards = expectedNumOfShards;
@ -109,6 +136,11 @@ public class DetermineHashedPartitionsJobTest
    this.errorMargin = errorMargin;
    File tmpDir = Files.createTempDir();
    ImmutableList<Interval> intervals = null;
    if (interval != null) {
      intervals = ImmutableList.of(Intervals.of(interval));
    }
    HadoopIngestionSpec ingestionSpec = new HadoopIngestionSpec(
        new DataSchema(
            "test_schema",
@ -145,9 +177,9 @@ public class DetermineHashedPartitionsJobTest
            ),
            new AggregatorFactory[]{new DoubleSumAggregatorFactory("index", "index")},
            new UniformGranularitySpec(
-                Granularities.DAY,
+                segmentGranularity,
                Granularities.NONE,
-                ImmutableList.of(Intervals.of(interval))
+                intervals
            ),
            HadoopDruidIndexerConfig.JSON_MAPPER
        ),
--- a/indexing-hadoop/src/test/resources/druid.test.data.with.rows.in.timezone.tsv
+++ b/indexing-hadoop/src/test/resources/druid.test.data.with.rows.in.timezone.tsv
@ -0,0 +1,16 @@
 2011-04-10T00:00:00.000-07:00	spot	automotive	preferred	apreferred	113.221448
 2011-04-10T00:00:00.000-07:00	spot	automotive	preferred	apreferred	11.221448
 2011-04-10T00:00:00.000-07:00	spot	automotive	preferred	apreferred	103.221448
 2011-04-10T00:00:00.000-07:00	spot	automotive	preferred	apreferred	53.221448
 2011-04-10T00:00:00.000-07:00	spot	business	preferred	bpreferred	95.570457
 2011-04-10T00:00:00.000-07:00	spot	entertainment	preferred	epreferred	131.766616
 2011-04-10T00:00:00.000-07:00	spot	health	preferred	hpreferred	99.950855
 2011-04-10T00:00:00.000-07:00	spot	mezzanine	preferred	mpreferred	91.470524
 2011-04-10T00:00:00.000-07:00	spot	news	preferred	npreferred	99.393076
 2011-04-10T00:00:00.000-07:00	spot	premium	preferred	ppreferred	123.207579
 2011-04-10T00:00:00.000-07:00	spot	technology	preferred	tpreferred	84.898691
 2011-04-10T00:00:00.000-07:00	spot	travel	preferred	tpreferred	114.353962
 2011-04-10T00:00:00.000-07:00	total_market	mezzanine	preferred	mpreferred	1005.253077
 2011-04-10T00:00:00.000-07:00	total_market	premium	preferred	ppreferred	1030.094757
 2011-04-10T00:00:00.000-07:00	upfront	mezzanine	preferred	mpreferred	1031.741509
 2011-04-10T00:00:00.000-07:00	upfront	premium	preferred	ppreferred	775.965555
--- a/server/src/main/java/io/druid/segment/indexing/granularity/UniformGranularitySpec.java
+++ b/server/src/main/java/io/druid/segment/indexing/granularity/UniformGranularitySpec.java
@ -97,7 +97,11 @@ public class UniformGranularitySpec implements GranularitySpec
  @Override
  public Optional<Interval> bucketInterval(DateTime dt)
  {
-    return wrappedSpec.bucketInterval(dt);
+    if (wrappedSpec == null) {
      return Optional.absent();
    } else {
      return wrappedSpec.bucketInterval(dt);
    }
  }
  @Override