mirror of https://github.com/apache/druid.git
Hadoop indexing: Fix NPE when intervals not provided (#4686)
* Fix #4647
* NPE protect bucketInterval as well
* Add test to verify timezone as well
* Also handle case when intervals are already present
* Fix checkstyle error
* Use factory method instead for DateTime
* Use Intervals factory method
parent 716a5ec1a8
commit 4ff12e4394
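The underlying bug (#4647): when a Hadoop indexing spec supplies no intervals, UniformGranularitySpec has no wrapped spec, so the bucketInterval() call made unconditionally by DetermineHashedPartitionsJob ended in a NullPointerException. The change below tracks whether intervals still need to be determined (determineIntervals) and, in that case, buckets each timestamp with the segment granularity instead; bucketInterval() itself is also guarded to report an absent value when there is no wrapped spec. As a minimal sketch of the new control flow, here is a plain-Java/Joda-Time stand-in — the class and method names are illustrative, not the Druid code, which follows in the diffs below:

import java.util.List;

import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.Interval;

// Illustrative stand-in for the fixed logic: when no intervals are configured,
// derive the bucket from the segment granularity instead of failing.
class IntervalBucketingSketch
{
  // Stand-in for GranularitySpec.bucketInterval(): the configured interval containing
  // the timestamp, or null when intervals were never supplied or none matches.
  static Interval bucketInterval(List<Interval> configuredIntervals, DateTime dt)
  {
    if (configuredIntervals == null) {
      return null;
    }
    for (Interval candidate : configuredIntervals) {
      if (candidate.contains(dt)) {
        return candidate;
      }
    }
    return null;
  }

  // Stand-in for Granularity.bucket() with DAY granularity: a one-day bucket in the given zone.
  static Interval dayBucket(DateTime dt, DateTimeZone zone)
  {
    DateTime start = dt.withZone(zone).withTimeAtStartOfDay();
    return new Interval(start, start.plusDays(1));
  }

  static Interval resolveInterval(List<Interval> configuredIntervals, DateTime dt, DateTimeZone zone)
  {
    if (configuredIntervals == null) {
      // determineIntervals == true in the patched job: fall back to the segment granularity.
      return dayBucket(dt, zone);
    }
    Interval interval = bucketInterval(configuredIntervals, dt);
    if (interval == null) {
      // Only when intervals were explicitly configured is a missing bucket an error.
      throw new IllegalStateException("No bucket found for timestamp: " + dt);
    }
    return interval;
  }
}

With explicit intervals the old behavior is preserved: a timestamp outside every configured interval is still an error.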
DetermineHashedPartitionsJob.java

@@ -300,12 +300,14 @@ public class DetermineHashedPartitionsJob implements Jobby
   {
     private final List<Interval> intervals = Lists.newArrayList();
     protected HadoopDruidIndexerConfig config = null;
+    private boolean determineIntervals;
 
     @Override
     protected void setup(Context context)
         throws IOException, InterruptedException
     {
       config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
+      determineIntervals = !config.getSegmentGranularIntervals().isPresent();
     }
 
     @Override

@@ -321,12 +323,20 @@ public class DetermineHashedPartitionsJob implements Jobby
           HyperLogLogCollector.makeCollector(ByteBuffer.wrap(value.getBytes(), 0, value.getLength()))
       );
     }
-    Optional<Interval> intervalOptional = config.getGranularitySpec().bucketInterval(DateTimes.utc(key.get()));
+    Interval interval;
 
-    if (!intervalOptional.isPresent()) {
-      throw new ISE("WTF?! No bucket found for timestamp: %s", key.get());
+    if (determineIntervals) {
+      interval = config.getGranularitySpec().getSegmentGranularity().bucket(DateTimes.utc(key.get()));
+    } else {
+      Optional<Interval> intervalOptional = config.getGranularitySpec().bucketInterval(DateTimes.utc(key.get()));
+
+      if (!intervalOptional.isPresent()) {
+        throw new ISE("WTF?! No bucket found for timestamp: %s", key.get());
+      }
+      interval = intervalOptional.get();
     }
-    Interval interval = intervalOptional.get();
     intervals.add(interval);
     final Path outPath = config.makeSegmentPartitionInfoPath(interval);
     final OutputStream out = Utils.makePathAndOutputStream(

@@ -353,7 +363,7 @@ public class DetermineHashedPartitionsJob implements Jobby
         throws IOException, InterruptedException
     {
       super.run(context);
-      if (!config.getSegmentGranularIntervals().isPresent()) {
+      if (determineIntervals) {
        final Path outPath = config.makeIntervalInfoPath();
        final OutputStream out = Utils.makePathAndOutputStream(
            context, outPath, config.isOverwriteFiles()
DetermineHashedPartitionsJobTest.java

@@ -29,10 +29,15 @@ import io.druid.data.input.impl.TimestampSpec;
 import io.druid.indexer.partitions.HashedPartitionsSpec;
 import io.druid.java.util.common.Intervals;
 import io.druid.java.util.common.granularity.Granularities;
+import io.druid.java.util.common.granularity.Granularity;
+import io.druid.java.util.common.granularity.PeriodGranularity;
 import io.druid.query.aggregation.AggregatorFactory;
 import io.druid.query.aggregation.DoubleSumAggregatorFactory;
 import io.druid.segment.indexing.DataSchema;
 import io.druid.segment.indexing.granularity.UniformGranularitySpec;
+import org.joda.time.DateTimeZone;
+import org.joda.time.Interval;
+import org.joda.time.Period;
 import org.junit.Assert;
 import org.junit.Test;
 import org.junit.runner.RunWith;

@@ -53,7 +58,7 @@ public class DetermineHashedPartitionsJobTest
   private int[] expectedNumOfShards;
   private int errorMargin;
 
-  @Parameterized.Parameters(name = "File={0}, TargetPartitionSize={1}, Interval={2}, ErrorMargin={3}, NumTimeBuckets={4}, NumShards={5}")
+  @Parameterized.Parameters(name = "File={0}, TargetPartitionSize={1}, Interval={2}, ErrorMargin={3}, NumTimeBuckets={4}, NumShards={5}, SegmentGranularity={6}")
   public static Collection<?> data()
   {
     int[] first = new int[1];

@@ -73,7 +78,8 @@ public class DetermineHashedPartitionsJobTest
             "2011-04-10T00:00:00.000Z/2011-04-11T00:00:00.000Z",
             0,
             1,
-            first
+            first,
+            Granularities.DAY
         },
         {
             DetermineHashedPartitionsJobTest.class.getResource("/druid.test.data.with.duplicate.rows.tsv").getPath(),

@@ -81,7 +87,8 @@ public class DetermineHashedPartitionsJobTest
             "2011-04-10T00:00:00.000Z/2011-04-16T00:00:00.000Z",
             0,
             6,
-            second
+            second,
+            Granularities.DAY
         },
         {
             DetermineHashedPartitionsJobTest.class.getResource("/druid.test.data.with.duplicate.rows.tsv").getPath(),

@@ -89,7 +96,26 @@ public class DetermineHashedPartitionsJobTest
             "2011-04-10T00:00:00.000Z/2011-04-16T00:00:00.000Z",
             0,
             6,
-            third
+            third,
+            Granularities.DAY
+        },
+        {
+            DetermineHashedPartitionsJobTest.class.getResource("/druid.test.data.with.duplicate.rows.tsv").getPath(),
+            1L,
+            null,
+            0,
+            6,
+            third,
+            Granularities.DAY
+        },
+        {
+            DetermineHashedPartitionsJobTest.class.getResource("/druid.test.data.with.rows.in.timezone.tsv").getPath(),
+            1L,
+            null,
+            0,
+            1,
+            first,
+            new PeriodGranularity(new Period("P1D"), null, DateTimeZone.forID("America/Los_Angeles"))
         }
     }
     );

@@ -101,7 +127,8 @@ public class DetermineHashedPartitionsJobTest
       String interval,
       int errorMargin,
       int expectedNumTimeBuckets,
-      int[] expectedNumOfShards
+      int[] expectedNumOfShards,
+      Granularity segmentGranularity
   ) throws IOException
   {
     this.expectedNumOfShards = expectedNumOfShards;

@@ -109,6 +136,11 @@ public class DetermineHashedPartitionsJobTest
     this.errorMargin = errorMargin;
     File tmpDir = Files.createTempDir();
 
+    ImmutableList<Interval> intervals = null;
+    if (interval != null) {
+      intervals = ImmutableList.of(Intervals.of(interval));
+    }
+
     HadoopIngestionSpec ingestionSpec = new HadoopIngestionSpec(
         new DataSchema(
             "test_schema",

@@ -145,9 +177,9 @@ public class DetermineHashedPartitionsJobTest
             ),
             new AggregatorFactory[]{new DoubleSumAggregatorFactory("index", "index")},
             new UniformGranularitySpec(
-                Granularities.DAY,
+                segmentGranularity,
                 Granularities.NONE,
-                ImmutableList.of(Intervals.of(interval))
+                intervals
             ),
             HadoopDruidIndexerConfig.JSON_MAPPER
         ),
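The test changes above follow the standard JUnit 4 Parameterized pattern: one more value is appended to every existing parameter set and threaded through the constructor, and two new sets exercise a null interval. A generic, hypothetical sketch of that pattern (not the Druid test itself):

import java.util.Arrays;
import java.util.Collection;

import org.junit.Assert;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;

// Hypothetical example of extending a parameterized test with an extra argument,
// here a "granularity" label, so new cases can leave the interval unset.
@RunWith(Parameterized.class)
public class ParameterizedExampleTest
{
  private final String interval;    // may be null, like the new Druid test cases
  private final String granularity;

  public ParameterizedExampleTest(String interval, String granularity)
  {
    this.interval = interval;
    this.granularity = granularity;
  }

  @Parameterized.Parameters(name = "Interval={0}, SegmentGranularity={1}")
  public static Collection<Object[]> data()
  {
    return Arrays.asList(new Object[][]{
        {"2011-04-10/2011-04-11", "DAY"},
        {null, "DAY"}                // the interesting case: no interval supplied
    });
  }

  @Test
  public void testHandlesMissingInterval()
  {
    // A null interval must not blow up; fall back to the granularity instead.
    String effective = interval != null ? interval : "derived from " + granularity;
    Assert.assertNotNull(effective);
  }
}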
druid.test.data.with.rows.in.timezone.tsv (new test data file)

@@ -0,0 +1,16 @@
+2011-04-10T00:00:00.000-07:00 spot automotive preferred apreferred 113.221448
+2011-04-10T00:00:00.000-07:00 spot automotive preferred apreferred 11.221448
+2011-04-10T00:00:00.000-07:00 spot automotive preferred apreferred 103.221448
+2011-04-10T00:00:00.000-07:00 spot automotive preferred apreferred 53.221448
+2011-04-10T00:00:00.000-07:00 spot business preferred bpreferred 95.570457
+2011-04-10T00:00:00.000-07:00 spot entertainment preferred epreferred 131.766616
+2011-04-10T00:00:00.000-07:00 spot health preferred hpreferred 99.950855
+2011-04-10T00:00:00.000-07:00 spot mezzanine preferred mpreferred 91.470524
+2011-04-10T00:00:00.000-07:00 spot news preferred npreferred 99.393076
+2011-04-10T00:00:00.000-07:00 spot premium preferred ppreferred 123.207579
+2011-04-10T00:00:00.000-07:00 spot technology preferred tpreferred 84.898691
+2011-04-10T00:00:00.000-07:00 spot travel preferred tpreferred 114.353962
+2011-04-10T00:00:00.000-07:00 total_market mezzanine preferred mpreferred 1005.253077
+2011-04-10T00:00:00.000-07:00 total_market premium preferred ppreferred 1030.094757
+2011-04-10T00:00:00.000-07:00 upfront mezzanine preferred mpreferred 1031.741509
+2011-04-10T00:00:00.000-07:00 upfront premium preferred ppreferred 775.965555
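All sixteen rows carry the -07:00 offset (US Pacific daylight time), so under a P1D PeriodGranularity anchored to America/Los_Angeles they land in a single day bucket, which appears to be why the new parameter set expects one time bucket. A rough plain-Joda-Time check of that bucketing (not Druid's PeriodGranularity):

import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.Interval;

// Rough check that the timezone test rows all fall into one Los Angeles day bucket.
public class TimezoneBucketCheck
{
  public static void main(String[] args)
  {
    DateTimeZone la = DateTimeZone.forID("America/Los_Angeles");
    DateTime row = new DateTime("2011-04-10T00:00:00.000-07:00", la);
    DateTime bucketStart = row.withTimeAtStartOfDay();
    Interval bucket = new Interval(bucketStart, bucketStart.plusDays(1));
    // The rows sit at local midnight (2011-04-10T07:00Z); a UTC day bucket would start
    // at 2011-04-10T00:00Z, but the Los Angeles bucket starts at the local midnight.
    System.out.println(bucket.contains(row)); // true
    System.out.println(bucket);               // 2011-04-10T00:00:00.000-07:00/2011-04-11T00:00:00.000-07:00
  }
}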
UniformGranularitySpec.java

@@ -97,7 +97,11 @@ public class UniformGranularitySpec implements GranularitySpec
   @Override
   public Optional<Interval> bucketInterval(DateTime dt)
   {
-    return wrappedSpec.bucketInterval(dt);
+    if (wrappedSpec == null) {
+      return Optional.absent();
+    } else {
+      return wrappedSpec.bucketInterval(dt);
+    }
   }
 
   @Override
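A sketch of what the guard buys at the call site, assuming the classes shown in the diffs above (UniformGranularitySpec, Granularities) and the DateTimes utility the job code references; since no input intervals means no wrapped spec, bucketInterval() is expected to report absent instead of throwing:

import io.druid.java.util.common.DateTimes; // assumed to live alongside Intervals in io.druid.java.util.common
import io.druid.java.util.common.granularity.Granularities;
import io.druid.segment.indexing.granularity.UniformGranularitySpec;

// Illustrative use of the guarded bucketInterval() with no input intervals.
public class BucketIntervalGuardExample
{
  public static void main(String[] args)
  {
    // Same shape as the test spec above, but with intervals left null.
    UniformGranularitySpec spec = new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null);
    // Before this commit the wrapped spec was null here and the call below threw an NPE.
    System.out.println(spec.bucketInterval(DateTimes.utc(0)).isPresent()); // expected: false
  }
}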