add ability to specify multiple PathSpecs in batch ingestion, so that data can be ingested from multiple places in the same ingestion

Conflicts:
	indexing-hadoop/src/main/java/io/druid/indexer/HadoopDruidIndexerConfig.java
	indexing-hadoop/src/main/java/io/druid/indexer/JobHelper.java
	indexing-hadoop/src/main/java/io/druid/indexer/path/PathSpec.java
Himanshu Gupta 2015-05-18 13:41:06 -05:00
parent 1ae56f139b
commit 45947a1021
13 changed files with 235 additions and 57 deletions
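For illustration, a minimal sketch of how the new multi path spec can be composed, mirroring the MultiplePathSpec and StaticPathSpec constructors added in this commit (the paths are hypothetical, taken from the new test below):

    import com.google.common.collect.Lists;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

    // Combine two input locations into a single batch ingestion.
    // A null input format falls back to TextInputFormat (or CombineTextInputFormat
    // when combineText is enabled), per StaticPathSpec.addToMultipleInputs below.
    PathSpec pathSpec = new MultiplePathSpec(
        Lists.<PathSpec>newArrayList(
            new StaticPathSpec("/tmp/path1", null),
            new StaticPathSpec("/tmp/path2", TextInputFormat.class)
        )
    );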

DetermineHashedPartitionsJob.java

@@ -88,7 +88,6 @@ public class DetermineHashedPartitionsJob implements Jobby
       );
       JobHelper.injectSystemProperties(groupByJob);
-      JobHelper.setInputFormat(groupByJob, config);
       groupByJob.setMapperClass(DetermineCardinalityMapper.class);
       groupByJob.setMapOutputKeyClass(LongWritable.class);
       groupByJob.setMapOutputValueClass(BytesWritable.class);

DeterminePartitionsJob.java

@@ -126,7 +126,6 @@ public class DeterminePartitionsJob implements Jobby
       );
       JobHelper.injectSystemProperties(groupByJob);
-      JobHelper.setInputFormat(groupByJob, config);
       groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
       groupByJob.setMapOutputKeyClass(BytesWritable.class);
       groupByJob.setMapOutputValueClass(NullWritable.class);
@@ -173,7 +172,6 @@ public class DeterminePartitionsJob implements Jobby
       } else {
         // Directly read the source data, since we assume it's already grouped.
         dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
-        JobHelper.setInputFormat(dimSelectionJob, config);
         config.addInputPaths(dimSelectionJob);
       }

HadoopDruidIndexerConfig.java

@@ -55,7 +55,6 @@ import io.druid.timeline.partition.ShardSpecLookup;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.InputFormat;
 import org.apache.hadoop.mapreduce.Job;
 import org.joda.time.DateTime;
 import org.joda.time.Interval;
@@ -354,11 +353,6 @@ public class HadoopDruidIndexerConfig
     return pathSpec.addInputPaths(this, job);
   }

-  public Class<? extends InputFormat> getInputFormatClass()
-  {
-    return pathSpec.getInputFormat();
-  }
-
   /********************************************
    Granularity/Bucket Helper Methods
    ********************************************/

IndexGeneratorJob.java

@@ -140,8 +140,6 @@ public class IndexGeneratorJob implements Jobby
       JobHelper.injectSystemProperties(job);

-      JobHelper.setInputFormat(job, config);
-
       job.setMapperClass(IndexGeneratorMapper.class);
       job.setMapOutputValueClass(BytesWritable.class);

JobHelper.java

@@ -44,8 +44,6 @@ import org.apache.hadoop.io.retry.RetryProxy;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.TaskAttemptContext;
 import org.apache.hadoop.mapreduce.TaskAttemptID;
-import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
-import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
 import org.apache.hadoop.util.Progressable;
 import org.joda.time.DateTime;
 import org.joda.time.Interval;
@@ -207,17 +205,6 @@ public class JobHelper
     return true;
   }

-  public static void setInputFormat(Job job, HadoopDruidIndexerConfig indexerConfig)
-  {
-    if (indexerConfig.getInputFormatClass() != null) {
-      job.setInputFormatClass(indexerConfig.getInputFormatClass());
-    } else if (indexerConfig.isCombineText()) {
-      job.setInputFormatClass(CombineTextInputFormat.class);
-    } else {
-      job.setInputFormatClass(TextInputFormat.class);
-    }
-  }
-
   public static DataSegment serializeOutIndex(
       final DataSegment segmentTemplate,
       final Configuration configuration,

DatasourcePathSpec.java

@@ -35,7 +35,6 @@ import io.druid.indexing.overlord.IndexerMetadataStorageCoordinator;
 import io.druid.query.aggregation.AggregatorFactory;
 import io.druid.timeline.DataSegment;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.InputFormat;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
@@ -146,15 +145,11 @@ public class DatasourcePathSpec implements PathSpec
     job.getConfiguration().set(DatasourceInputFormat.CONF_DRUID_SCHEMA, mapper.writeValueAsString(ingestionSpec));
     job.getConfiguration().set(DatasourceInputFormat.CONF_INPUT_SEGMENTS, mapper.writeValueAsString(segments));
     job.getConfiguration().set(DatasourceInputFormat.CONF_MAX_SPLIT_SIZE, String.valueOf(maxSplitSize));
+    MultipleInputs.addInputPath(job, new Path("/dummy/tobe/ignored"), DatasourceInputFormat.class);

     return job;
   }

-  public Class<? extends InputFormat> getInputFormat()
-  {
-    return DatasourceInputFormat.class;
-  }
-
   @Override
   public boolean equals(Object o)
   {

GranularityPathSpec.java

@@ -30,7 +30,10 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.mapreduce.InputFormat;
 import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
 import org.joda.time.DateTime;
 import org.joda.time.Interval;
 import org.joda.time.format.DateTimeFormat;
@@ -152,7 +155,7 @@ public class GranularityPathSpec implements PathSpec
     for (String path : paths) {
       log.info("Appending path[%s]", path);
-      FileInputFormat.addInputPath(job, new Path(path));
+      StaticPathSpec.addToMultipleInputs(config, job, path, inputFormat);
     }

     return job;

MultiplePathSpec.java (new file)

@@ -0,0 +1,80 @@
/*
* Licensed to Metamarkets Group Inc. (Metamarkets) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Metamarkets licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package io.druid.indexer.path;

import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.api.client.repackaged.com.google.common.base.Preconditions;
import io.druid.indexer.HadoopDruidIndexerConfig;
import org.apache.hadoop.mapreduce.Job;

import java.io.IOException;
import java.util.List;

public class MultiplePathSpec implements PathSpec
{
  private List<PathSpec> children;

  public MultiplePathSpec(
      @JsonProperty("children") List<PathSpec> children
  )
  {
    Preconditions.checkArgument(children != null && children.size() > 0, "Null/Empty list of child PathSpecs");
    this.children = children;
  }

  @JsonProperty
  public List<PathSpec> getChildren()
  {
    return children;
  }

  @Override
  public Job addInputPaths(
      HadoopDruidIndexerConfig config, Job job
  ) throws IOException
  {
    for (PathSpec spec : children) {
      spec.addInputPaths(config, job);
    }
    return job;
  }

  @Override
  public boolean equals(Object o)
  {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }

    MultiplePathSpec that = (MultiplePathSpec) o;

    return children.equals(that.children);
  }

  @Override
  public int hashCode()
  {
    return children.hashCode();
  }
}

PathSpec.java

@@ -20,8 +20,6 @@ package io.druid.indexer.path;
 import com.fasterxml.jackson.annotation.JsonSubTypes;
 import com.fasterxml.jackson.annotation.JsonTypeInfo;
 import io.druid.indexer.HadoopDruidIndexerConfig;
-import org.apache.hadoop.mapreduce.InputFormat;
 import org.apache.hadoop.mapreduce.Job;

 import java.io.IOException;
@@ -33,10 +31,10 @@ import java.io.IOException;
     @JsonSubTypes.Type(name="granular_unprocessed", value=GranularUnprocessedPathSpec.class),
     @JsonSubTypes.Type(name="granularity", value=GranularityPathSpec.class),
     @JsonSubTypes.Type(name="static", value=StaticPathSpec.class),
-    @JsonSubTypes.Type(name="dataSource", value=DatasourcePathSpec.class)
+    @JsonSubTypes.Type(name="dataSource", value=DatasourcePathSpec.class),
+    @JsonSubTypes.Type(name="multi", value=MultiplePathSpec.class)
 })
 public interface PathSpec
 {
   public Job addInputPaths(HadoopDruidIndexerConfig config, Job job) throws IOException;
-  public Class<? extends InputFormat> getInputFormat();
 }
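With the "multi" subtype registered above, a pathSpec can be expressed in JSON and deserialized like any other PathSpec. A hedged sketch (the JSON shape is inferred from the Jackson annotations in this commit and mirrors the round trip exercised in MultiplePathSpecTest below; the paths are hypothetical):

    // Assumes a DefaultObjectMapper as used by the tests in this commit.
    ObjectMapper jsonMapper = new DefaultObjectMapper();
    String json = "{\"type\" : \"multi\", \"children\" : ["
                + "{\"type\" : \"static\", \"paths\" : \"/tmp/path1\"},"
                + "{\"type\" : \"static\", \"paths\" : \"/tmp/path2\"}"
                + "]}";
    // Polymorphic deserialization selects MultiplePathSpec via the "multi" type name.
    MultiplePathSpec spec = (MultiplePathSpec) jsonMapper.readValue(json, PathSpec.class);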

StaticPathSpec.java

@@ -20,10 +20,12 @@ package io.druid.indexer.path;
 import com.fasterxml.jackson.annotation.JsonProperty;
 import com.metamx.common.logger.Logger;
 import io.druid.indexer.HadoopDruidIndexerConfig;
+import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.mapreduce.InputFormat;
 import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

 import java.io.IOException;
@@ -60,7 +62,9 @@ public class StaticPathSpec implements PathSpec
   public Job addInputPaths(HadoopDruidIndexerConfig config, Job job) throws IOException
   {
     log.info("Adding paths[%s]", paths);
-    FileInputFormat.addInputPaths(job, paths);
+
+    addToMultipleInputs(config, job, paths, inputFormat);
+
     return job;
   }
@@ -68,4 +72,54 @@
   {
     return inputFormat;
   }

+  public String getPaths()
+  {
+    return paths;
+  }
+
+  public final static void addToMultipleInputs(
+      HadoopDruidIndexerConfig config,
+      Job job,
+      String path,
+      Class<? extends InputFormat> inputFormatClass
+  )
+  {
+    if (inputFormatClass == null) {
+      if (config.isCombineText()) {
+        MultipleInputs.addInputPath(job, new Path(path), CombineTextInputFormat.class);
+      } else {
+        MultipleInputs.addInputPath(job, new Path(path), TextInputFormat.class);
+      }
+    } else {
+      MultipleInputs.addInputPath(job, new Path(path), inputFormatClass);
+    }
+  }
+
+  @Override
+  public boolean equals(Object o)
+  {
+    if (this == o) {
+      return true;
+    }
+    if (o == null || getClass() != o.getClass()) {
+      return false;
+    }
+
+    StaticPathSpec that = (StaticPathSpec) o;
+
+    if (paths != null ? !paths.equals(that.paths) : that.paths != null) {
+      return false;
+    }
+    return !(inputFormat != null ? !inputFormat.equals(that.inputFormat) : that.inputFormat != null);
+  }
+
+  @Override
+  public int hashCode()
+  {
+    int result = paths != null ? paths.hashCode() : 0;
+    result = 31 * result + (inputFormat != null ? inputFormat.hashCode() : 0);
+    return result;
+  }
 }

GranularityPathSpecTest.java

@@ -76,18 +76,18 @@ public class GranularityPathSpecTest
   }

   @Test
-  public void testDeserialization() throws Exception
+  public void testSerdeCustomInputFormat() throws Exception
   {
-    testDeserialization("/test/path", "*.test", "pat_pat", Granularity.SECOND, TextInputFormat.class);
+    testSerde("/test/path", "*.test", "pat_pat", Granularity.SECOND, TextInputFormat.class);
   }

   @Test
-  public void testDeserializationNoInputFormat() throws Exception
+  public void testSerdeNoInputFormat() throws Exception
   {
-    testDeserialization("/test/path", "*.test", "pat_pat", Granularity.SECOND, null);
+    testSerde("/test/path", "*.test", "pat_pat", Granularity.SECOND, null);
   }

-  private void testDeserialization(
+  private void testSerde(
       String inputPath,
       String filePattern,
       String pathFormat,
@@ -114,7 +114,7 @@ public class GranularityPathSpecTest
     }
     sb.append("\"type\" : \"granularity\"}");

-    GranularityPathSpec pathSpec = (GranularityPathSpec)jsonMapper.readValue(sb.toString(), PathSpec.class);
+    GranularityPathSpec pathSpec = (GranularityPathSpec) StaticPathSpecTest.readWriteRead(sb.toString(), jsonMapper);
     Assert.assertEquals(inputFormat, pathSpec.getInputFormat());
     Assert.assertEquals(inputPath, pathSpec.getInputPath());
     Assert.assertEquals(filePattern, pathSpec.getFilePattern());

MultiplePathSpecTest.java (new file)

@@ -0,0 +1,65 @@
/*
* Licensed to Metamarkets Group Inc. (Metamarkets) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Metamarkets licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package io.druid.indexer.path;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;
import io.druid.jackson.DefaultObjectMapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.easymock.EasyMock;
import org.junit.Assert;
import org.junit.Test;

/**
 */
public class MultiplePathSpecTest
{
  @Test
  public void testSerde() throws Exception
  {
    PathSpec expected = new MultiplePathSpec(
        Lists.<PathSpec>newArrayList(
            new StaticPathSpec("/tmp/path1", null),
            new StaticPathSpec("/tmp/path2", TextInputFormat.class)
        )
    );

    ObjectMapper jsonMapper = new DefaultObjectMapper();
    PathSpec actual = jsonMapper.readValue(jsonMapper.writeValueAsString(expected), PathSpec.class);

    Assert.assertEquals(expected, actual);
  }

  @Test
  public void testAddInputPaths() throws Exception
  {
    PathSpec ps1 = EasyMock.createMock(PathSpec.class);
    EasyMock.expect(ps1.addInputPaths(null, null)).andReturn(null);

    PathSpec ps2 = EasyMock.createMock(PathSpec.class);
    EasyMock.expect(ps2.addInputPaths(null, null)).andReturn(null);

    EasyMock.replay(ps1, ps2);

    new MultiplePathSpec(Lists.newArrayList(ps1, ps2)).addInputPaths(null, null);

    EasyMock.verify(ps1, ps2);
  }
}

StaticPathSpecTest.java

@@ -21,8 +21,6 @@ package io.druid.indexer.path;
 import io.druid.jackson.DefaultObjectMapper;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
 import org.junit.Assert;
 import org.junit.Test;
@@ -34,18 +32,18 @@ public class StaticPathSpecTest
   private final ObjectMapper jsonMapper = new DefaultObjectMapper();

   @Test
-  public void testDeserialization() throws Exception
+  public void testSerdeCustomInputFormat() throws Exception
   {
-    testDeserialization("/sample/path", TextInputFormat.class);
+    testSerde("/sample/path", TextInputFormat.class);
   }

   @Test
   public void testDeserializationNoInputFormat() throws Exception
   {
-    testDeserialization("/sample/path", null);
+    testSerde("/sample/path", null);
   }

-  private void testDeserialization(String path, Class inputFormat) throws Exception
+  private void testSerde(String path, Class inputFormat) throws Exception
   {
     StringBuilder sb = new StringBuilder();
     sb.append("{\"paths\" : \"");
@@ -57,13 +55,22 @@ public class StaticPathSpecTest
       sb.append("\",");
     }
     sb.append("\"type\" : \"static\"}");
-    StaticPathSpec pathSpec = (StaticPathSpec)jsonMapper.readValue(sb.toString(), PathSpec.class);
+
+    StaticPathSpec pathSpec = (StaticPathSpec) readWriteRead(sb.toString(), jsonMapper);
     Assert.assertEquals(inputFormat, pathSpec.getInputFormat());
-
-    Job job = Job.getInstance();
-    pathSpec.addInputPaths(null, job);
-    Assert.assertEquals(
-        "file:" + path,
-        job.getConfiguration().get(FileInputFormat.INPUT_DIR));
+    Assert.assertEquals(path, pathSpec.getPaths());
+  }
+
+  public static final PathSpec readWriteRead(String jsonStr, ObjectMapper jsonMapper) throws Exception
+  {
+    return jsonMapper.readValue(
+        jsonMapper.writeValueAsString(
+            jsonMapper.readValue(
+                jsonStr,
+                PathSpec.class
+            )
+        ),
+        PathSpec.class
+    );
   }
 }