diff --git a/indexing-hadoop/src/main/java/io/druid/indexer/JobHelper.java b/indexing-hadoop/src/main/java/io/druid/indexer/JobHelper.java
index f71ea239020..c703e490637 100644
--- a/indexing-hadoop/src/main/java/io/druid/indexer/JobHelper.java
+++ b/indexing-hadoop/src/main/java/io/druid/indexer/JobHelper.java
@@ -737,7 +737,16 @@ public class JobHelper
     } else if ("hdfs".equals(type)) {
      segmentLocURI = URI.create(loadSpec.get("path").toString());
    } else if ("google".equals(type)) {
-      segmentLocURI = URI.create(String.format("gs://%s/%s", loadSpec.get("bucket"), loadSpec.get("path")));
+      // Segment names contain : in their path.
+      // Google Cloud Storage supports : but Hadoop does not.
+      // This becomes an issue when re-indexing using the current segments.
+      // The Hadoop getSplits code doesn't understand the : and fails with "Relative path in absolute URI".
+      // This could be fixed using the same code that generates path names for hdfs segments using
+      // getHdfsStorageDir. But that wouldn't fix this issue for people who already have segments with ":".
+      // Because of this we just URL-encode the :, making everything work as it should.
+      segmentLocURI = URI.create(
+          String.format("gs://%s/%s", loadSpec.get("bucket"), loadSpec.get("path").toString().replace(":", "%3A"))
+      );
    } else if ("local".equals(type)) {
      try {
        segmentLocURI = new URI("file", null, loadSpec.get("path").toString(), null, null);
diff --git a/indexing-hadoop/src/test/java/io/druid/indexer/JobHelperTest.java b/indexing-hadoop/src/test/java/io/druid/indexer/JobHelperTest.java
index 9aa233c8f77..a18984bcd47 100644
--- a/indexing-hadoop/src/test/java/io/druid/indexer/JobHelperTest.java
+++ b/indexing-hadoop/src/test/java/io/druid/indexer/JobHelperTest.java
@@ -31,6 +31,8 @@ import io.druid.query.aggregation.AggregatorFactory;
 import io.druid.query.aggregation.LongSumAggregatorFactory;
 import io.druid.segment.indexing.DataSchema;
 import io.druid.segment.indexing.granularity.UniformGranularitySpec;
+import io.druid.timeline.DataSegment;
+import io.druid.timeline.partition.NoneShardSpec;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.mapreduce.Job;
 import org.joda.time.Interval;
@@ -42,6 +44,8 @@ import org.junit.rules.TemporaryFolder;
 
 import java.io.File;
 import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
 import java.util.HashMap;
 import java.util.Map;
 
@@ -144,7 +148,30 @@ public class JobHelperTest
     );
   }
 
+  @Test
+  public void testGoogleGetURIFromSegment() throws URISyntaxException
+  {
+    DataSegment segment = new DataSegment(
+        "test1",
+        Interval.parse("2000/3000"),
+        "ver",
+        ImmutableMap.of(
+            "type", "google",
+            "bucket", "test-test",
+            "path", "tmp/foo:bar/index1.zip"
+        ),
+        ImmutableList.of(),
+        ImmutableList.of(),
+        NoneShardSpec.instance(),
+        9,
+        1024
+    );
+    Assert.assertEquals(
+        new URI("gs://test-test/tmp/foo%3Abar/index1.zip"),
+        JobHelper.getURIFromSegment(segment)
+    );
+  }
   private static class HadoopDruidIndexerConfigSpy extends HadoopDruidIndexerConfig
   {
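
A minimal sketch of the failure mode this patch works around, using only java.net.URI so it runs without Hadoop on the classpath; the class name ColonPathDemo is made up for illustration and is not part of the patch. Hadoop's Path treats a ':' that appears before the first '/' in a path string as a scheme delimiter, and java.net.URI then rejects a scheme-qualified URI whose path is relative, which is where the "Relative path in absolute URI" message comes from:

import java.net.URI;
import java.net.URISyntaxException;

public class ColonPathDemo
{
  public static void main(String[] args)
  {
    // Hadoop's Path splits "foo:bar" on the first ':' and ends up building
    // a URI with scheme "foo" and relative path "bar". java.net.URI rejects
    // that combination, which is the error getSplits surfaces.
    try {
      new URI("foo", null, "bar", null, null);
    }
    catch (URISyntaxException e) {
      System.out.println(e.getMessage()); // Relative path in absolute URI: foo:bar
    }

    // With the ':' percent-encoded, as in the patched getURIFromSegment,
    // the whole string parses as an ordinary gs:// URI, and getPath()
    // still decodes %3A back to ':'.
    URI encoded = URI.create("gs://test-test/tmp/foo%3Abar/index1.zip");
    System.out.println(encoded.getPath()); // prints /tmp/foo:bar/index1.zip
  }
}

This is also why the patch only encodes the ':' rather than rewriting segment paths: existing segments whose stored paths already contain ':' keep working, since the encoding is applied when the URI is built, not when the segment is written.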