HDFS input source (#8899)

* HDFS input source

Add support for using HDFS as an input source. In this version, commas
or globs are not supported in HDFS paths.

* Fix forbidden api

* Address review comments
Chi Cao Minh 2019-11-19 22:19:39 -08:00 committed by Jihoon Son
parent 074a45219d
commit 4ae6466ae2
7 changed files with 606 additions and 40 deletions
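
Below is a usage sketch, not part of this commit: it mirrors the ObjectMapper wiring in HdfsInputSourceTest and shows how a JSON spec with "type": "hdfs" resolves to the new HdfsInputSource through HdfsStorageDruidModule. The example class name and the namenode URI/path are placeholders, and it assumes the usual polymorphic type handling on InputSource (the same binding the test exercises through its Wrapper).

import com.fasterxml.jackson.databind.InjectableValues;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.druid.data.input.InputSource;
import org.apache.druid.storage.hdfs.HdfsStorageDruidModule;
import org.apache.hadoop.conf.Configuration;

public class HdfsInputSourceJsonSketch
{
  public static void main(String[] args) throws Exception
  {
    // Same mapper setup as HdfsInputSourceTest: inject a Hadoop Configuration and register
    // the HDFS extension's Jackson modules so the "hdfs" type name is known.
    ObjectMapper mapper = new ObjectMapper();
    mapper.setInjectableValues(
        new InjectableValues.Std().addValue(Configuration.class, new Configuration())
    );
    new HdfsStorageDruidModule().getJacksonModules().forEach(mapper::registerModule);

    // "paths" may be a single string or an array of strings; the URI below is a placeholder.
    InputSource inputSource = mapper.readValue(
        "{\"type\": \"hdfs\", \"paths\": \"hdfs://namenode:8020/foo/bar\"}",
        InputSource.class
    );
    System.out.println(inputSource.getClass().getSimpleName()); // prints HdfsInputSource
  }
}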


@@ -47,7 +47,7 @@ public class CsvInputFormat implements InputFormat
@JsonCreator
public CsvInputFormat(
@JsonProperty("columns") @Nullable List<String> columns,
@JsonProperty("listDelimiter") String listDelimiter,
@JsonProperty("listDelimiter") @Nullable String listDelimiter,
@Deprecated @JsonProperty("hasHeaderRow") @Nullable Boolean hasHeaderRow,
@JsonProperty("findColumnsFromHeader") @Nullable Boolean findColumnsFromHeader,
@JsonProperty("skipHeaderRows") int skipHeaderRows


@@ -201,10 +201,6 @@
<groupId>org.mortbay.jetty</groupId>
<artifactId>jetty-util</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-annotations</artifactId>
</exclusion>
<exclusion>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>


@@ -27,23 +27,17 @@ import org.apache.druid.data.input.FiniteFirehoseFactory;
import org.apache.druid.data.input.InputSplit;
import org.apache.druid.data.input.impl.StringInputRowParser;
import org.apache.druid.data.input.impl.prefetch.PrefetchableTextFilesFirehoseFactory;
import org.apache.druid.java.util.common.IAE;
import org.apache.druid.inputsource.hdfs.HdfsInputSource;
import org.apache.druid.storage.hdfs.HdfsDataSegmentPuller;
import org.apache.druid.utils.CompressionUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
public class HdfsFirehoseFactory extends PrefetchableTextFilesFirehoseFactory<Path>
{
@@ -62,16 +56,8 @@ public class HdfsFirehoseFactory extends PrefetchableTextFilesFirehoseFactory<Path>
)
{
super(maxCacheCapacityBytes, maxFetchCapacityBytes, prefetchTriggerBytes, fetchTimeout, maxFetchRetry);
this.inputPaths = HdfsInputSource.coerceInputPathsToList(inputPaths, "inputPaths");
this.conf = conf;
// Coerce 'inputPaths' to List<String>
if (inputPaths instanceof String) {
this.inputPaths = Collections.singletonList((String) inputPaths);
} else if (inputPaths instanceof List && ((List<?>) inputPaths).stream().allMatch(x -> x instanceof String)) {
this.inputPaths = ((List<?>) inputPaths).stream().map(x -> (String) x).collect(Collectors.toList());
} else {
throw new IAE("'inputPaths' must be a string or an array of strings");
}
}
@JsonProperty("paths")
@@ -83,23 +69,7 @@ public class HdfsFirehoseFactory extends PrefetchableTextFilesFirehoseFactory<Path>
@Override
protected Collection<Path> initObjects() throws IOException
{
// Use TextInputFormat to read splits. To do this, we need to make a fake Job.
final Job job = Job.getInstance(conf);
// Add paths to the fake JobContext.
inputPaths.forEach(input -> {
try {
FileInputFormat.addInputPaths(job, input);
}
catch (IOException e) {
throw new RuntimeException(e);
}
});
return new TextInputFormat().getSplits(job)
.stream()
.map(split -> ((FileSplit) split).getPath())
.collect(Collectors.toSet());
return HdfsInputSource.getPaths(inputPaths, conf);
}
@Override


@@ -0,0 +1,63 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.inputsource.hdfs;
import com.google.common.base.Predicate;
import org.apache.druid.data.input.InputEntity;
import org.apache.druid.storage.hdfs.HdfsDataSegmentPuller;
import org.apache.druid.utils.CompressionUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
public class HdfsInputEntity implements InputEntity
{
private final Configuration conf;
private final Path path;
HdfsInputEntity(Configuration conf, Path path)
{
this.conf = conf;
this.path = path;
}
@Override
public URI getUri()
{
return path.toUri();
}
@Override
public InputStream open() throws IOException
{
FileSystem fs = path.getFileSystem(conf);
return CompressionUtils.decompress(fs.open(path), path.getName());
}
@Override
public Predicate<Throwable> getFetchRetryCondition()
{
return HdfsDataSegmentPuller.RETRY_PREDICATE;
}
}


@@ -0,0 +1,233 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.inputsource.hdfs;
import com.fasterxml.jackson.annotation.JacksonInject;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import org.apache.druid.data.input.AbstractInputSource;
import org.apache.druid.data.input.InputFormat;
import org.apache.druid.data.input.InputRowSchema;
import org.apache.druid.data.input.InputSourceReader;
import org.apache.druid.data.input.InputSplit;
import org.apache.druid.data.input.SplitHintSpec;
import org.apache.druid.data.input.impl.InputEntityIteratingReader;
import org.apache.druid.data.input.impl.SplittableInputSource;
import org.apache.druid.java.util.common.IAE;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import javax.annotation.Nullable;
import java.io.File;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class HdfsInputSource extends AbstractInputSource implements SplittableInputSource<Path>
{
private static final String PROP_PATHS = "paths";
private final List<String> inputPaths;
private final Configuration configuration;
// Although the javadocs for SplittableInputSource say to avoid caching splits to reduce memory, HdfsInputSource
// *does* cache the splits for the following reasons:
//
// 1) It will improve compatibility with the index_hadoop task, allowing people to easily migrate from Hadoop.
// For example, input paths with globs will be supported (lazily expanding the wildcard glob is tricky).
//
// 2) The index_hadoop task allocates splits eagerly, so the memory usage should not be a problem for anyone
// migrating from Hadoop.
private List<Path> cachedPaths;
@JsonCreator
public HdfsInputSource(
@JsonProperty(PROP_PATHS) Object inputPaths,
@JacksonInject Configuration configuration
)
{
this.inputPaths = coerceInputPathsToList(inputPaths, PROP_PATHS);
this.configuration = configuration;
this.cachedPaths = null;
}
public static List<String> coerceInputPathsToList(Object inputPaths, String propertyName)
{
final List<String> paths;
if (inputPaths instanceof String) {
paths = Collections.singletonList((String) inputPaths);
} else if (inputPaths instanceof List && ((List<?>) inputPaths).stream().allMatch(x -> x instanceof String)) {
paths = ((List<?>) inputPaths).stream().map(x -> (String) x).collect(Collectors.toList());
} else {
throw new IAE("'%s' must be a string or an array of strings", propertyName);
}
return paths;
}
public static Collection<Path> getPaths(List<String> inputPaths, Configuration configuration) throws IOException
{
if (inputPaths.isEmpty()) {
return Collections.emptySet();
}
// Use FileInputFormat to read splits. To do this, we need to make a fake Job.
Job job = Job.getInstance(configuration);
// Add paths to the fake JobContext.
for (String inputPath : inputPaths) {
FileInputFormat.addInputPaths(job, inputPath);
}
return new HdfsFileInputFormat().getSplits(job)
.stream()
.map(split -> ((FileSplit) split).getPath())
.collect(Collectors.toSet());
}
/**
* Helper for leveraging Hadoop code to interpret HDFS paths with globs
*/
private static class HdfsFileInputFormat extends FileInputFormat<Object, Object>
{
@Override
public RecordReader<Object, Object> createRecordReader(
org.apache.hadoop.mapreduce.InputSplit inputSplit,
TaskAttemptContext taskAttemptContext
)
{
throw new UnsupportedOperationException();
}
@Override
protected boolean isSplitable(JobContext context, Path filename)
{
return false; // prevent generating extra paths
}
}
@JsonProperty(PROP_PATHS)
private List<String> getInputPaths()
{
return inputPaths;
}
@Override
protected InputSourceReader formattableReader(
InputRowSchema inputRowSchema,
InputFormat inputFormat,
@Nullable File temporaryDirectory
)
{
final Stream<InputSplit<Path>> splits;
try {
splits = createSplits(inputFormat, null);
}
catch (IOException e) {
throw new UncheckedIOException(e);
}
return new InputEntityIteratingReader(
inputRowSchema,
inputFormat,
splits.map(split -> new HdfsInputEntity(configuration, split.get())),
temporaryDirectory
);
}
@Override
public Stream<InputSplit<Path>> createSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec)
throws IOException
{
cachePathsIfNeeded();
return cachedPaths.stream().map(InputSplit::new);
}
@Override
public int getNumSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec) throws IOException
{
cachePathsIfNeeded();
return cachedPaths.size();
}
@Override
public SplittableInputSource<Path> withSplit(InputSplit<Path> split)
{
return new HdfsInputSource(split.get().toString(), configuration);
}
@Override
public boolean needsFormat()
{
return true;
}
private void cachePathsIfNeeded() throws IOException
{
if (cachedPaths == null) {
cachedPaths = ImmutableList.copyOf(Preconditions.checkNotNull(getPaths(inputPaths, configuration), "paths"));
}
}
static Builder builder()
{
return new Builder();
}
static final class Builder
{
private Object paths;
private Configuration configuration;
private Builder()
{
}
Builder paths(Object paths)
{
this.paths = paths;
return this;
}
Builder configuration(Configuration configuration)
{
this.configuration = configuration;
return this;
}
HdfsInputSource build()
{
return new HdfsInputSource(paths, configuration);
}
}
}
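
A minimal sketch, not from this commit, of using HdfsInputSource directly: the public constructor accepts either a single path string or a list of strings (coerceInputPathsToList), and createSplits() expands any glob through the FileInputFormat helper above, producing one cached split per matched file. The example class name, namenode URI, and glob are placeholders.

import org.apache.druid.data.input.InputSplit;
import org.apache.druid.inputsource.hdfs.HdfsInputSource;
import org.apache.hadoop.conf.Configuration;

public class HdfsInputSourceSplitsSketch
{
  public static void main(String[] args) throws Exception
  {
    // The glob is expanded to concrete file paths; the resulting splits are cached after the
    // first call, so getNumSplits() and createSplits() do not re-list the files.
    HdfsInputSource inputSource =
        new HdfsInputSource("hdfs://namenode:8020/foo/*", new Configuration());

    System.out.println("splits: " + inputSource.getNumSplits(null, null));
    inputSource.createSplits(null, null)
               .map(InputSplit::get)
               .forEach(path -> System.out.println(path));
  }
}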


@@ -33,6 +33,7 @@ import org.apache.druid.guice.LazySingleton;
import org.apache.druid.guice.LifecycleModule;
import org.apache.druid.guice.ManageLifecycle;
import org.apache.druid.initialization.DruidModule;
import org.apache.druid.inputsource.hdfs.HdfsInputSource;
import org.apache.druid.storage.hdfs.tasklog.HdfsTaskLogs;
import org.apache.druid.storage.hdfs.tasklog.HdfsTaskLogsConfig;
import org.apache.hadoop.conf.Configuration;
@@ -48,7 +49,7 @@ import java.util.Properties;
*/
public class HdfsStorageDruidModule implements DruidModule
{
public static final String SCHEME = "hdfs";
static final String SCHEME = "hdfs";
private Properties props = null;
@Inject
@ -63,7 +64,8 @@ public class HdfsStorageDruidModule implements DruidModule
return Collections.singletonList(
new SimpleModule().registerSubtypes(
new NamedType(HdfsLoadSpec.class, HdfsStorageDruidModule.SCHEME),
new NamedType(HdfsFirehoseFactory.class, HdfsStorageDruidModule.SCHEME)
new NamedType(HdfsFirehoseFactory.class, HdfsStorageDruidModule.SCHEME),
new NamedType(HdfsInputSource.class, HdfsStorageDruidModule.SCHEME)
)
);
}


@@ -0,0 +1,302 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.inputsource.hdfs;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.InjectableValues;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.druid.data.input.InputFormat;
import org.apache.druid.data.input.InputRow;
import org.apache.druid.data.input.InputRowSchema;
import org.apache.druid.data.input.InputSource;
import org.apache.druid.data.input.InputSourceReader;
import org.apache.druid.data.input.impl.CsvInputFormat;
import org.apache.druid.data.input.impl.DimensionsSpec;
import org.apache.druid.data.input.impl.TimestampSpec;
import org.apache.druid.java.util.common.parsers.CloseableIterator;
import org.apache.druid.storage.hdfs.HdfsStorageDruidModule;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.runners.Enclosed;
import org.junit.rules.ExpectedException;
import org.junit.rules.TemporaryFolder;
import org.junit.runner.RunWith;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UncheckedIOException;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
@RunWith(Enclosed.class)
public class HdfsInputSourceTest
{
private static final String PATH = "/foo/bar";
private static final Configuration CONFIGURATION = new Configuration();
private static final String COLUMN = "value";
private static final InputRowSchema INPUT_ROW_SCHEMA = new InputRowSchema(
new TimestampSpec(null, null, null),
DimensionsSpec.EMPTY,
Collections.emptyList()
);
private static final InputFormat INPUT_FORMAT = new CsvInputFormat(
Arrays.asList(TimestampSpec.DEFAULT_COLUMN, COLUMN),
null,
false,
null,
0
);
public static class SerializeDeserializeTest
{
private static final ObjectMapper OBJECT_MAPPER = createObjectMapper();
private HdfsInputSource.Builder hdfsInputSourceBuilder;
@Rule
public ExpectedException exception = ExpectedException.none();
@Before
public void setup()
{
hdfsInputSourceBuilder = HdfsInputSource.builder()
.paths(PATH)
.configuration(CONFIGURATION);
}
@Test
public void requiresPathsAsStringOrArrayOfStrings()
{
exception.expect(IllegalArgumentException.class);
exception.expectMessage("'paths' must be a string or an array of strings");
hdfsInputSourceBuilder.paths(Arrays.asList("a", 1)).build();
}
@Test
public void serializesDeserializesWithArrayPaths()
{
Wrapper target = new Wrapper(hdfsInputSourceBuilder.paths(Collections.singletonList(PATH)));
testSerializesDeserializes(target);
}
@Test
public void serializesDeserializesStringPaths()
{
Wrapper target = new Wrapper(hdfsInputSourceBuilder.paths(PATH));
testSerializesDeserializes(target);
}
private static void testSerializesDeserializes(Wrapper hdfsInputSourceWrapper)
{
try {
String serialized = OBJECT_MAPPER.writeValueAsString(hdfsInputSourceWrapper);
Wrapper deserialized = OBJECT_MAPPER.readValue(serialized, Wrapper.class);
Assert.assertEquals(serialized, OBJECT_MAPPER.writeValueAsString(deserialized));
}
catch (IOException e) {
throw new UncheckedIOException(e);
}
}
private static ObjectMapper createObjectMapper()
{
final ObjectMapper mapper = new ObjectMapper();
mapper.setInjectableValues(new InjectableValues.Std().addValue(Configuration.class, new Configuration()));
new HdfsStorageDruidModule().getJacksonModules().forEach(mapper::registerModule);
return mapper;
}
// Helper to test HdfsInputSource is added correctly to HdfsStorageDruidModule
private static class Wrapper
{
@JsonProperty
InputSource inputSource;
@SuppressWarnings("unused") // used by Jackson
private Wrapper()
{
}
Wrapper(HdfsInputSource.Builder hdfsInputSourceBuilder)
{
this.inputSource = hdfsInputSourceBuilder.build();
}
}
}
public static class ReaderTest
{
private static final String PATH = "/test";
private static final int NUM_FILE = 3;
private static final String KEY_VALUE_SEPARATOR = ",";
private static final String ALPHABET = "abcdefghijklmnopqrstuvwxyz";
@Rule
public TemporaryFolder temporaryFolder = new TemporaryFolder();
private MiniDFSCluster dfsCluster;
private HdfsInputSource target;
private Set<Path> paths;
private Map<Long, String> timestampToValue;
@Before
public void setup() throws IOException
{
timestampToValue = new HashMap<>();
File dir = temporaryFolder.getRoot();
Configuration configuration = new Configuration(true);
configuration.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, dir.getAbsolutePath());
dfsCluster = new MiniDFSCluster.Builder(configuration).build();
paths = IntStream.range(0, NUM_FILE)
.mapToObj(
i -> {
char value = ALPHABET.charAt(i % ALPHABET.length());
timestampToValue.put((long) i, Character.toString(value));
return createFile(
dfsCluster,
String.valueOf(i),
i + KEY_VALUE_SEPARATOR + value
);
}
)
.collect(Collectors.toSet());
target = HdfsInputSource.builder()
.paths(dfsCluster.getURI() + PATH + "*")
.configuration(CONFIGURATION)
.build();
}
@After
public void teardown()
{
if (dfsCluster != null) {
dfsCluster.shutdown(true);
}
}
private static Path createFile(MiniDFSCluster dfsCluster, String pathSuffix, String contents)
{
try {
Path path = new Path(PATH + pathSuffix);
try (Writer writer = new BufferedWriter(
new OutputStreamWriter(dfsCluster.getFileSystem().create(path), StandardCharsets.UTF_8)
)) {
writer.write(contents);
}
return path;
}
catch (IOException e) {
throw new UncheckedIOException(e);
}
}
@Test
public void readsSplitsCorrectly() throws IOException
{
InputSourceReader reader = target.formattableReader(INPUT_ROW_SCHEMA, INPUT_FORMAT, null);
Map<Long, String> actualTimestampToValue = new HashMap<>();
try (CloseableIterator<InputRow> iterator = reader.read()) {
while (iterator.hasNext()) {
InputRow row = iterator.next();
actualTimestampToValue.put(row.getTimestampFromEpoch(), row.getDimension(COLUMN).get(0));
}
}
Assert.assertEquals(timestampToValue, actualTimestampToValue);
}
@Test
public void hasCorrectSplits() throws IOException
{
Set<Path> actualPaths = target.createSplits(null, null)
.map(split -> Path.getPathWithoutSchemeAndAuthority(split.get()))
.collect(Collectors.toSet());
Assert.assertEquals(paths, actualPaths);
}
@Test
public void hasCorrectNumberOfSplits() throws IOException
{
int numSplits = target.getNumSplits(null, null);
Assert.assertEquals(NUM_FILE, numSplits);
}
}
public static class EmptyPathsTest
{
private HdfsInputSource target;
@Before
public void setup()
{
target = HdfsInputSource.builder()
.paths(Collections.emptyList())
.configuration(CONFIGURATION)
.build();
}
@Test
public void readsSplitsCorrectly() throws IOException
{
InputSourceReader reader = target.formattableReader(INPUT_ROW_SCHEMA, INPUT_FORMAT, null);
try (CloseableIterator<InputRow> iterator = reader.read()) {
Assert.assertFalse(iterator.hasNext());
}
}
@Test
public void hasCorrectSplits() throws IOException
{
List<Path> paths = target.createSplits(null, null)
.map(split -> Path.getPathWithoutSchemeAndAuthority(split.get()))
.collect(Collectors.toList());
Assert.assertTrue(String.valueOf(paths), paths.isEmpty());
}
@Test
public void hasCorrectNumberOfSplits() throws IOException
{
int numSplits = target.getNumSplits(null, null);
Assert.assertEquals(0, numSplits);
}
}
}