Add integration tests for HDFS (#9542)

* HDFS IT * HDFS IT * HDFS IT * fix checkstyle
2020-03-20 15:46:08 -07:00 · 2020-03-20 15:46:08 -07:00 · 5f127a1829
parent 4870ad7b56
commit 5f127a1829
22 changed files with 773 additions and 107 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -344,7 +344,7 @@ jobs:
      name: "(Compile=openjdk8, Run=openjdk8) other integration test"
      jdk: openjdk8
      services: *integration_test_services
-      env: TESTNG_GROUPS='-DexcludedGroups=batch-index,perfect-rollup-parallel-batch-index,kafka-index,query,realtime-index,security,s3-deep-storage,gcs-deep-storage,azure-deep-storage' JVM_RUNTIME='-Djvm.runtime=8'
+      env: TESTNG_GROUPS='-DexcludedGroups=batch-index,perfect-rollup-parallel-batch-index,kafka-index,query,realtime-index,security,s3-deep-storage,gcs-deep-storage,azure-deep-storage,hdfs-deep-storage' JVM_RUNTIME='-Djvm.runtime=8'
      script: *run_integration_test
      after_failure: *integration_test_diags
    # END - Integration tests for Compile with Java 8 and Run with Java 8
@ -383,7 +383,7 @@ jobs:
    - <<: *integration_tests
      name: "(Compile=openjdk8, Run=openjdk11) other integration test"
      jdk: openjdk8
-      env: TESTNG_GROUPS='-DexcludedGroups=batch-index,perfect-rollup-parallel-batch-index,kafka-index,query,realtime-index,security,s3-deep-storage,gcs-deep-storage,azure-deep-storage' JVM_RUNTIME='-Djvm.runtime=11'
+      env: TESTNG_GROUPS='-DexcludedGroups=batch-index,perfect-rollup-parallel-batch-index,kafka-index,query,realtime-index,security,s3-deep-storage,gcs-deep-storage,azure-deep-storage,hdfs-deep-storage' JVM_RUNTIME='-Djvm.runtime=11'
    # END - Integration tests for Compile with Java 8 and Run with Java 11

    - name: "security vulnerabilities"
--- a/integration-tests/README.md
+++ b/integration-tests/README.md
@ -140,15 +140,22 @@ Running a Test That Uses Hadoop
 The integration test that indexes from hadoop is not run as part
 of the integration test run discussed above.  This is because druid
 test clusters might not, in general, have access to hadoop.
-That's the case (for now, at least) when using the docker cluster set 
-up by the integration-tests profile, so the hadoop test
-has to be run using a cluster specified in a configuration file.
+This also applies to integration tests that use Hadoop HDFS as an inputSource or as deep storage.
+To run an integration test that uses Hadoop, you will have to run a Hadoop cluster. This can be done in two ways:
+1) Run your own Druid + Hadoop cluster and specify the Hadoop configs in the configuration file (CONFIG_FILE).
+2) Run the Druid Docker test clusters with a Hadoop container by passing -Dstart.hadoop.docker=true to the mvn command.

-The data file is 
-integration-tests/src/test/resources/hadoop/batch_hadoop.data.
+Currently, hdfs-deep-storage and other <cloud>-deep-storage integration test groups can only be run with 
+Druid Docker test clusters by passing -Dstart.hadoop.docker=true to start Hadoop container.
+You will also have to provide -Doverride.config.path=<PATH_TO_FILE> with your Druid's Hadoop configs set. 
+See integration-tests/docker/environment-configs/override-examples/hdfs directory for example.
+Note that if the integration test you are running also uses another cloud extension (S3, Azure, GCS), additional
+credentials/configs may need to be set in the same file as your Druid's Hadoop configs.
+
+Currently, ITHadoopIndexTest can only be run with your own Druid + Hadoop cluster by following the steps below:
 Create a directory called batchHadoop1 in the hadoop file system
-(anywhere you want) and put batch_hadoop.data into that directory
-(as its only file).
+(anywhere you want) and put batch_hadoop.data (integration-tests/src/test/resources/hadoop/batch_hadoop.data) 
+into that directory (as its only file).

 Add this keyword to the configuration file (see above):

--- a/integration-tests/docker/environment-configs/common
+++ b/integration-tests/docker/environment-configs/common
@ -23,7 +23,7 @@ LC_ALL=C.UTF-8

 # JAVA OPTS
 COMMON_DRUID_JAVA_OPTS=-Duser.timezone=UTC -Dfile.encoding=UTF-8 -Dlog4j.configurationFile=/shared/docker/lib/log4j2.xml
-DRUID_DEP_LIB_DIR=/shared/hadoop_xml/*:/shared/docker/lib/*:/usr/local/druid/lib/mysql-connector-java.jar
+DRUID_DEP_LIB_DIR=/shared/hadoop_xml:/shared/docker/lib/*:/usr/local/druid/lib/mysql-connector-java.jar

 # Druid configs
 druid_extensions_loadList=[]
--- a/integration-tests/docker/environment-configs/override-examples/hdfs
+++ b/integration-tests/docker/environment-configs/override-examples/hdfs
@ -0,0 +1,24 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+druid_storage_type=hdfs
+druid_storage_storageDirectory=/druid/segments
+# Depending on the test, additional extension(s) may be required.
+# Please refer to the other integration-tests/docker/environment-configs/override-examples/ files and Druid docs for
+# additional env vars to provide for each extension.
+druid_extensions_loadList=["druid-hdfs-storage"]
--- a/integration-tests/pom.xml
+++ b/integration-tests/pom.xml
@ -55,6 +55,12 @@
            <version>${project.parent.version}</version>
            <scope>runtime</scope>
        </dependency>
+        <dependency>
+            <groupId>org.apache.druid.extensions</groupId>
+            <artifactId>druid-hdfs-storage</artifactId>
+            <version>${project.parent.version}</version>
+            <scope>runtime</scope>
+        </dependency>
        <dependency>
            <groupId>org.apache.druid.extensions</groupId>
            <artifactId>druid-datasketches</artifactId>
--- a/integration-tests/run_cluster.sh
+++ b/integration-tests/run_cluster.sh
@ -68,6 +68,9 @@
  # For druid-google-extensions
  mkdir -p $SHARED_DIR/docker/extensions/druid-google-extensions
  mv $SHARED_DIR/docker/lib/druid-google-extensions-* $SHARED_DIR/docker/extensions/druid-google-extensions
+  # For druid-hdfs-storage
+  # Move the druid-hdfs-storage jars out of the shared lib dir into their own extension dir.
+  mkdir -p $SHARED_DIR/docker/extensions/druid-hdfs-storage
+  mv $SHARED_DIR/docker/lib/druid-hdfs-storage-* $SHARED_DIR/docker/extensions/druid-hdfs-storage

  # Pull Hadoop dependency if needed
  if [ -n "$DRUID_INTEGRATION_TEST_START_HADOOP_DOCKER" ] && [ "$DRUID_INTEGRATION_TEST_START_HADOOP_DOCKER" == true ]
@ -157,6 +160,42 @@ fi

 # Start docker containers for all Druid processes and dependencies
 {
+  # Start Hadoop docker if needed
+  if [ -n "$DRUID_INTEGRATION_TEST_START_HADOOP_DOCKER" ] && [ "$DRUID_INTEGRATION_TEST_START_HADOOP_DOCKER" == true ]
+  then
+    # Start Hadoop docker container
+    docker run -d --privileged --net druid-it-net --ip 172.172.172.13 -h druid-it-hadoop --name druid-it-hadoop -p 2049:2049 -p 2122:2122 -p 8020:8020 -p 8021:8021 -p 8030:8030 -p 8031:8031 -p 8032:8032 -p 8033:8033 -p 8040:8040 -p 8042:8042 -p 8088:8088 -p 8443:8443 -p 9000:9000 -p 10020:10020 -p 19888:19888 -p 34455:34455 -p 49707:49707 -p 50010:50010 -p 50020:50020 -p 50030:50030 -p 50060:50060 -p 50070:50070 -p 50075:50075 -p 50090:50090 -p 51111:51111 -v $RESOURCEDIR:/resources -v $SHARED_DIR:/shared druid-it/hadoop:2.8.5 sh -c "/etc/bootstrap.sh && tail -f /dev/null"
+
+    # wait for hadoop namenode to be up
+    echo "Waiting for hadoop namenode to be up"
+    docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -mkdir -p /druid"
+    while [ $? -ne 0 ]
+    do
+       sleep 2
+       docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -mkdir -p /druid"
+    done
+    echo "Finished waiting for Hadoop namenode"
+
+    # Setup hadoop druid dirs
+    echo "Setting up druid hadoop dirs"
+    docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -mkdir -p /druid"
+    docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -mkdir -p /druid/segments"
+    docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -mkdir -p /quickstart"
+    docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -chmod 777 /druid"
+    docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -chmod 777 /druid/segments"
+    docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -chmod 777 /quickstart"
+    docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -chmod -R 777 /tmp"
+    docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -chmod -R 777 /user"
+    # Copy data files to Hadoop container
+    docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -put /shared/wikiticker-it/wikiticker-2015-09-12-sampled.json.gz /quickstart/wikiticker-2015-09-12-sampled.json.gz"
+    docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -put /resources/data/batch_index /batch_index"
+    echo "Finished setting up druid hadoop dirs"
+
+    echo "Copying Hadoop XML files to shared"
+    docker exec -t druid-it-hadoop sh -c "cp /usr/local/hadoop/etc/hadoop/*.xml /shared/hadoop_xml"
+    echo "Copied Hadoop XML files to shared"
+  fi
+
  # Start zookeeper and kafka
  docker run -d --privileged --net druid-it-net --ip 172.172.172.2 ${COMMON_ENV} --name druid-zookeeper-kafka -p 2181:2181 -p 9092:9092 -p 9093:9093 -v $SHARED_DIR:/shared -v $SERVICE_SUPERVISORDS_DIR/zookeeper.conf:$SUPERVISORDIR/zookeeper.conf -v $SERVICE_SUPERVISORDS_DIR/kafka.conf:$SUPERVISORDIR/kafka.conf druid/cluster

@ -189,39 +228,4 @@ fi

  # Start Router with custom TLS cert checkers
  docker run -d --privileged --net druid-it-net --ip 172.172.172.12 ${COMMON_ENV} ${ROUTER_CUSTOM_CHECK_TLS_ENV} ${OVERRIDE_ENV} --hostname druid-router-custom-check-tls --name druid-router-custom-check-tls -p 8891:8891 -p 9091:9091 -v $SHARED_DIR:/shared -v $SERVICE_SUPERVISORDS_DIR/druid.conf:$SUPERVISORDIR/druid.conf --link druid-zookeeper-kafka:druid-zookeeper-kafka --link druid-coordinator:druid-coordinator --link druid-broker:druid-broker druid/cluster
-
-  # Start Hadoop docker if needed
-  if [ -n "$DRUID_INTEGRATION_TEST_START_HADOOP_DOCKER" ] && [ "$DRUID_INTEGRATION_TEST_START_HADOOP_DOCKER" == true ]
-  then
-    # Start Hadoop docker container
-    docker run -d --privileged --net druid-it-net --ip 172.172.172.13 -h druid-it-hadoop --name druid-it-hadoop -p 2049:2049 -p 2122:2122 -p 8020:8020 -p 8021:8021 -p 8030:8030 -p 8031:8031 -p 8032:8032 -p 8033:8033 -p 8040:8040 -p 8042:8042 -p 8088:8088 -p 8443:8443 -p 9000:9000 -p 10020:10020 -p 19888:19888 -p 34455:34455 -p 49707:49707 -p 50010:50010 -p 50020:50020 -p 50030:50030 -p 50060:50060 -p 50070:50070 -p 50075:50075 -p 50090:50090 -p 51111:51111 -v $SHARED_DIR:/shared druid-it/hadoop:2.8.5 sh -c "/etc/bootstrap.sh && tail -f /dev/null"
-
-    # wait for hadoop namenode to be up
-    echo "Waiting for hadoop namenode to be up"
-    docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -mkdir -p /druid"
-    while [ $? -ne 0 ]
-    do
-       sleep 2
-       docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -mkdir -p /druid"
-    done
-    echo "Finished waiting for Hadoop namenode"
-
-    # Setup hadoop druid dirs
-    echo "Setting up druid hadoop dirs"
-    docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -mkdir -p /druid"
-    docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -mkdir -p /druid/segments"
-    docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -mkdir -p /quickstart"
-    docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -chmod 777 /druid"
-    docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -chmod 777 /druid/segments"
-    docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -chmod 777 /quickstart"
-    docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -chmod -R 777 /tmp"
-    docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -chmod -R 777 /user"
-    # Copy data files to Hadoop container
-    docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -put /shared/wikiticker-it/wikiticker-2015-09-12-sampled.json.gz /quickstart/wikiticker-2015-09-12-sampled.json.gz"
-    echo "Finished setting up druid hadoop dirs"
-
-    echo "Copying Hadoop XML files to shared"
-    docker exec -t druid-it-hadoop sh -c "cp /usr/local/hadoop/etc/hadoop/*.xml /shared/hadoop_xml"
-    echo "Copied Hadoop XML files to shared"
-  fi
 }
--- a/integration-tests/src/test/java/org/apache/druid/tests/TestNGGroup.java
+++ b/integration-tests/src/test/java/org/apache/druid/tests/TestNGGroup.java
@ -36,17 +36,23 @@ public class TestNGGroup
  // This group can only be run individually using -Dgroups=security since it requires specific test data setup.
  public static final String SECURITY = "security";
  // This group is not part of CI. To run this group, s3 configs/credentials for your s3 must be provided in a file.
-  // The path of the file must then we pass to mvn with -Doverride.config.path=<PATH_TO_FILE>
+  // The path of the file must then be passed to mvn with -Doverride.config.path=<PATH_TO_FILE>
  // See integration-tests/docker/environment-configs/override-examples/s3 for env vars to provide.
  public static final String S3_DEEP_STORAGE = "s3-deep-storage";
  // This group is not part of CI. To run this group, gcs configs/credentials for your gcs must be provided in a file.
-  // The path of the file must then we pass to mvn with -Doverride.config.path=<PATH_TO_FILE>
+  // The path of the file must then be passed to mvn with -Doverride.config.path=<PATH_TO_FILE>
  // See integration-tests/docker/environment-configs/override-examples/gcs for env vars to provide.
  // The path to the folder that contains your GOOGLE_APPLICATION_CREDENTIALS file must also be passed
  // to mvn with -Dresource.file.dir.path=<PATH_TO_FOLDER>
  public static final String GCS_DEEP_STORAGE = "gcs-deep-storage";
  // This group is not part of CI. To run this group, azure configs/credentials for your azure must be provided in a file.
-  // The path of the file must then we pass to mvn with -Doverride.config.path=<PATH_TO_FILE>
+  // The path of the file must then be passed to mvn with -Doverride.config.path=<PATH_TO_FILE>
  // See integration-tests/docker/environment-configs/override-examples/azure for env vars to provide.
  public static final String AZURE_DEEP_STORAGE = "azure-deep-storage";
+  // This group is not part of CI. To run this group, hadoop configs must be provided in a file. The path of the file
+  // must then be passed to mvn with -Doverride.config.path=<PATH_TO_FILE>
+  // See integration-tests/docker/environment-configs/override-examples/hdfs for env vars to provide.
+  // Additionally, hadoop docker must be started by passing -Dstart.hadoop.docker=true to mvn.
+  public static final String HDFS_DEEP_STORAGE = "hdfs-deep-storage";
+
 }
--- a/integration-tests/src/test/java/org/apache/druid/tests/indexer/AbstractAzureInputSourceSimpleIndexTest.java
+++ b/integration-tests/src/test/java/org/apache/druid/tests/indexer/AbstractAzureInputSourceSimpleIndexTest.java
@ -24,30 +24,14 @@ import com.google.common.collect.ImmutableMap;
 import org.apache.druid.indexer.partitions.DynamicPartitionsSpec;
 import org.apache.druid.java.util.common.Pair;
 import org.apache.druid.java.util.common.StringUtils;
-import org.apache.druid.testing.guice.DruidTestModuleFactory;
-import org.apache.druid.tests.TestNGGroup;
 import org.testng.annotations.DataProvider;
-import org.testng.annotations.Guice;
-import org.testng.annotations.Test;

 import java.io.Closeable;
 import java.util.List;
 import java.util.UUID;
 import java.util.function.Function;

-/**
- * IMPORTANT:
- * To run this test, you must:
- * 1) Set the bucket and path for your data. This can be done by setting -Ddruid.test.config.cloudBucket and
- *    -Ddruid.test.config.cloudPath or setting "cloud_bucket" and "cloud_path" in the config file.
- * 2) Copy wikipedia_index_data1.json, wikipedia_index_data2.json, and wikipedia_index_data3.json
- *    located in integration-tests/src/test/resources/data/batch_index to your Azure at the location set in step 1.
- * 3) Provide -Doverride.config.path=<PATH_TO_FILE> with Azure credentials/configs set. See
- *    integration-tests/docker/environment-configs/override-examples/azure for env vars to provide.
- */
-@Test(groups = TestNGGroup.AZURE_DEEP_STORAGE)
-@Guice(moduleFactory = DruidTestModuleFactory.class)
-public class ITAzureParallelIndexTest extends AbstractITBatchIndexTest
+public abstract class AbstractAzureInputSourceSimpleIndexTest extends AbstractITBatchIndexTest
 {
  private static final String INDEX_TASK = "/indexer/wikipedia_cloud_index_task.json";
  private static final String INDEX_QUERIES_RESOURCE = "/indexer/wikipedia_index_queries.json";
@ -85,8 +69,7 @@ public class ITAzureParallelIndexTest extends AbstractITBatchIndexTest
    };
  }

-  @Test(dataProvider = "resources")
-  public void testAzureIndexData(Pair<String, List> azureInputSource) throws Exception
+  void doTest(Pair<String, List> azureInputSource) throws Exception
  {
    try (
        final Closeable ignored1 = unloader(INDEX_DATASOURCE + config.getExtraDatasourceNameSuffix());
--- a/integration-tests/src/test/java/org/apache/druid/tests/indexer/AbstractGcsInputSourceSimpleIndexTest.java
+++ b/integration-tests/src/test/java/org/apache/druid/tests/indexer/AbstractGcsInputSourceSimpleIndexTest.java
@ -24,31 +24,14 @@ import com.google.common.collect.ImmutableMap;
 import org.apache.druid.indexer.partitions.DynamicPartitionsSpec;
 import org.apache.druid.java.util.common.Pair;
 import org.apache.druid.java.util.common.StringUtils;
-import org.apache.druid.testing.guice.DruidTestModuleFactory;
-import org.apache.druid.tests.TestNGGroup;
 import org.testng.annotations.DataProvider;
-import org.testng.annotations.Guice;
-import org.testng.annotations.Test;

 import java.io.Closeable;
 import java.util.List;
 import java.util.UUID;
 import java.util.function.Function;

-/**
- * IMPORTANT:
- * To run this test, you must:
- * 1) Set the bucket and path for your data. This can be done by setting -Ddruid.test.config.cloudBucket and
- *    -Ddruid.test.config.cloudPath or setting "cloud_bucket" and "cloud_path" in the config file.
- * 2) Copy wikipedia_index_data1.json, wikipedia_index_data2.json, and wikipedia_index_data3.json
- *    located in integration-tests/src/test/resources/data/batch_index to your GCS at the location set in step 1.
- * 3) Provide -Doverride.config.path=<PATH_TO_FILE> with gcs configs set. See
- *    integration-tests/docker/environment-configs/override-examples/gcs for env vars to provide.
- * 4) Provide -Dresource.file.dir.path=<PATH_TO_FOLDER> with folder that contains GOOGLE_APPLICATION_CREDENTIALS file
- */
-@Test(groups = TestNGGroup.GCS_DEEP_STORAGE)
-@Guice(moduleFactory = DruidTestModuleFactory.class)
-public class ITGcsParallelIndexTest extends AbstractITBatchIndexTest
+public abstract class AbstractGcsInputSourceSimpleIndexTest extends AbstractITBatchIndexTest
 {
  private static final String INDEX_TASK = "/indexer/wikipedia_cloud_index_task.json";
  private static final String INDEX_QUERIES_RESOURCE = "/indexer/wikipedia_index_queries.json";
@ -86,8 +69,7 @@ public class ITGcsParallelIndexTest extends AbstractITBatchIndexTest
    };
  }

-  @Test(dataProvider = "resources")
-  public void testGcsIndexData(Pair<String, List> gcsInputSource) throws Exception
+  void doTest(Pair<String, List> gcsInputSource) throws Exception
  {
    try (
        final Closeable ignored1 = unloader(INDEX_DATASOURCE + config.getExtraDatasourceNameSuffix());
--- a/integration-tests/src/test/java/org/apache/druid/tests/indexer/AbstractHdfsInputSourceSimpleIndexTest.java
+++ b/integration-tests/src/test/java/org/apache/druid/tests/indexer/AbstractHdfsInputSourceSimpleIndexTest.java
@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.tests.indexer;
+
+import com.google.common.collect.ImmutableList;
+import org.apache.druid.java.util.common.Pair;
+import org.apache.druid.java.util.common.StringUtils;
+import org.testng.annotations.DataProvider;
+
+import java.io.Closeable;
+import java.util.List;
+import java.util.UUID;
+import java.util.function.Function;
+
+/**
+ * Base class for integration tests that run a simple index task reading the wikipedia batch data from HDFS.
+ *
+ * The class carries no TestNG {@code @Test} annotation itself; it only provides the "resources" data provider and
+ * the {@link #doTest} helper. Concrete subclasses are expected to declare the test group and call {@link #doTest}.
+ *
+ * All input paths point at hdfs://druid-it-hadoop:9000/batch_index, i.e. the Hadoop docker container started by
+ * integration-tests/run_cluster.sh, which uploads /resources/data/batch_index to /batch_index in HDFS.
+ */
+public abstract class AbstractHdfsInputSourceSimpleIndexTest extends AbstractITBatchIndexTest
+{
+  // Task spec template; contains the %%INPUT_SOURCE_*%% placeholders substituted in doTest.
+  private static final String INDEX_TASK = "/indexer/wikipedia_cloud_simple_index_task.json";
+  private static final String INDEX_QUERIES_RESOURCE = "/indexer/wikipedia_index_queries.json";
+  // Random suffix so that each JVM run indexes into its own datasource name.
+  private static final String INDEX_DATASOURCE = "wikipedia_index_test_" + UUID.randomUUID();
+  private static final String INPUT_SOURCE_PATHS_KEY = "paths";
+
+  /**
+   * Supplies three variants of the "paths" input source property:
+   * a single directory path as a plain String, a one-element list holding the same directory,
+   * and a list of the three individual wikipedia data files.
+   *
+   * Note that the first row's value is a String even though {@link #doTest} declares {@code Pair<String, List>}.
+   *
+   * @return one row per variant, each holding a single {@link Pair} of property key and property value
+   */
+  @DataProvider
+  public static Object[][] resources()
+  {
+    return new Object[][]{
+        {new Pair<>(INPUT_SOURCE_PATHS_KEY,
+                    "hdfs://druid-it-hadoop:9000/batch_index"
+        )},
+        {new Pair<>(INPUT_SOURCE_PATHS_KEY,
+                    ImmutableList.of(
+                        "hdfs://druid-it-hadoop:9000/batch_index"
+                    )
+        )},
+        {new Pair<>(INPUT_SOURCE_PATHS_KEY,
+                    ImmutableList.of(
+                        "hdfs://druid-it-hadoop:9000/batch_index/wikipedia_index_data1.json",
+                        "hdfs://druid-it-hadoop:9000/batch_index/wikipedia_index_data2.json",
+                        "hdfs://druid-it-hadoop:9000/batch_index/wikipedia_index_data3.json"
+                    )
+        )}
+    };
+  }
+
+  /**
+   * Fills the input source placeholders of the task spec template with an "hdfs" input source and runs the
+   * index test against it.
+   *
+   * @param hdfsInputSource lhs is the input source property key, rhs is the value that is serialized to JSON
+   *                        and substituted for %%INPUT_SOURCE_PROPERTY_VALUE%%
+   * @throws Exception if the index test fails; errors raised while transforming the spec are rethrown wrapped
+   *                   in a RuntimeException
+   */
+  void doTest(Pair<String, List> hdfsInputSource) throws Exception
+  {
+    try (
+        // NOTE(review): presumably unloads the test datasource when the try block exits - confirm in
+        // AbstractITBatchIndexTest.
+        final Closeable ignored1 = unloader(INDEX_DATASOURCE + config.getExtraDatasourceNameSuffix());
+    ) {
+      // Applied to the raw task spec text before submission; replaces the three input source placeholders.
+      final Function<String, String> hdfsPropsTransform = spec -> {
+        try {
+          spec = StringUtils.replace(
+              spec,
+              "%%INPUT_SOURCE_TYPE%%",
+              "hdfs"
+          );
+          spec = StringUtils.replace(
+              spec,
+              "%%INPUT_SOURCE_PROPERTY_KEY%%",
+              hdfsInputSource.lhs
+          );
+          return StringUtils.replace(
+              spec,
+              "%%INPUT_SOURCE_PROPERTY_VALUE%%",
+              jsonMapper.writeValueAsString(hdfsInputSource.rhs)
+          );
+        }
+        catch (Exception e) {
+          // The lambda is a java.util.function.Function and cannot throw checked exceptions.
+          throw new RuntimeException(e);
+        }
+      };
+
+      // NOTE(review): the meaning of the three trailing boolean flags is defined by
+      // AbstractITBatchIndexTest.doIndexTest, which is not visible here - confirm before changing them.
+      doIndexTest(
+          INDEX_DATASOURCE,
+          INDEX_TASK,
+          hdfsPropsTransform,
+          INDEX_QUERIES_RESOURCE,
+          false,
+          true,
+          true
+      );
+    }
+  }
+}
--- a/integration-tests/src/test/java/org/apache/druid/tests/indexer/AbstractS3InputSourceSimpleIndexTest.java
+++ b/integration-tests/src/test/java/org/apache/druid/tests/indexer/AbstractS3InputSourceSimpleIndexTest.java
@ -24,30 +24,14 @@ import com.google.common.collect.ImmutableMap;
 import org.apache.druid.indexer.partitions.DynamicPartitionsSpec;
 import org.apache.druid.java.util.common.Pair;
 import org.apache.druid.java.util.common.StringUtils;
-import org.apache.druid.testing.guice.DruidTestModuleFactory;
-import org.apache.druid.tests.TestNGGroup;
 import org.testng.annotations.DataProvider;
-import org.testng.annotations.Guice;
-import org.testng.annotations.Test;

 import java.io.Closeable;
 import java.util.List;
 import java.util.UUID;
 import java.util.function.Function;

-/**
- * IMPORTANT:
- * To run this test, you must:
- * 1) Set the bucket and path for your data. This can be done by setting -Ddruid.test.config.cloudBucket and
- *    -Ddruid.test.config.cloudPath or setting "cloud_bucket" and "cloud_path" in the config file.
- * 2) Copy wikipedia_index_data1.json, wikipedia_index_data2.json, and wikipedia_index_data3.json
- *    located in integration-tests/src/test/resources/data/batch_index to your S3 at the location set in step 1.
- * 3) Provide -Doverride.config.path=<PATH_TO_FILE> with s3 credentials/configs set. See
- *    integration-tests/docker/environment-configs/override-examples/s3 for env vars to provide.
- */
-@Test(groups = TestNGGroup.S3_DEEP_STORAGE)
-@Guice(moduleFactory = DruidTestModuleFactory.class)
-public class ITS3ParallelIndexTest extends AbstractITBatchIndexTest
+public abstract class AbstractS3InputSourceSimpleIndexTest extends AbstractITBatchIndexTest
 {
  private static final String INDEX_TASK = "/indexer/wikipedia_cloud_index_task.json";
  private static final String INDEX_QUERIES_RESOURCE = "/indexer/wikipedia_index_queries.json";
@ -68,7 +52,7 @@ public class ITS3ParallelIndexTest extends AbstractITBatchIndexTest
                        "s3://%%BUCKET%%/%%PATH%%" + WIKIPEDIA_DATA_1,
                        "s3://%%BUCKET%%/%%PATH%%" + WIKIPEDIA_DATA_2,
                        "s3://%%BUCKET%%/%%PATH%%" + WIKIPEDIA_DATA_3
-                        )
+                    )
        )},
        {new Pair<>(INPUT_SOURCE_PREFIXES_KEY,
                    ImmutableList.of(
@ -85,8 +69,7 @@ public class ITS3ParallelIndexTest extends AbstractITBatchIndexTest
    };
  }

-  @Test(dataProvider = "resources")
-  public void testS3IndexData(Pair<String, List> s3InputSource) throws Exception
+  void doTest(Pair<String, List> s3InputSource) throws Exception
  {
    try (
        final Closeable ignored1 = unloader(INDEX_DATASOURCE + config.getExtraDatasourceNameSuffix());
--- a/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITAzureToAzureParallelIndexTest.java
+++ b/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITAzureToAzureParallelIndexTest.java
@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.tests.indexer;
+
+import org.apache.druid.java.util.common.Pair;
+import org.apache.druid.testing.guice.DruidTestModuleFactory;
+import org.apache.druid.tests.TestNGGroup;
+import org.testng.annotations.Guice;
+import org.testng.annotations.Test;
+
+import java.util.List;
+
+/**
+ * IMPORTANT:
+ * To run this test, you must:
+ * 1) Set the bucket and path for your data. This can be done by setting -Ddruid.test.config.cloudBucket and
+ *    -Ddruid.test.config.cloudPath or setting "cloud_bucket" and "cloud_path" in the config file.
+ * 2) Copy wikipedia_index_data1.json, wikipedia_index_data2.json, and wikipedia_index_data3.json
+ *    located in integration-tests/src/test/resources/data/batch_index to your Azure at the location set in step 1.
+ * 3) Provide -Doverride.config.path=<PATH_TO_FILE> with Azure credentials/configs set. See
+ *    integration-tests/docker/environment-configs/override-examples/azure for env vars to provide.
+ */
+@Test(groups = TestNGGroup.AZURE_DEEP_STORAGE)
+@Guice(moduleFactory = DruidTestModuleFactory.class)
+public class ITAzureToAzureParallelIndexTest extends AbstractAzureInputSourceSimpleIndexTest
+{
+  /**
+   * Runs once per row of the data provider named "resources" and delegates to the inherited doTest helper.
+   *
+   * @param azureInputSource forwarded unchanged to doTest; presumably an input source property key paired with
+   *                         its value(s) - confirm against AbstractAzureInputSourceSimpleIndexTest
+   * @throws Exception if the delegated index test fails
+   */
+  @Test(dataProvider = "resources")
+  public void testAzureIndexData(Pair<String, List> azureInputSource) throws Exception
+  {
+    doTest(azureInputSource);
+  }
+}
--- a/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITAzureToHdfsParallelIndexTest.java
+++ b/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITAzureToHdfsParallelIndexTest.java
@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.tests.indexer;
+
+import org.apache.druid.java.util.common.Pair;
+import org.apache.druid.testing.guice.DruidTestModuleFactory;
+import org.apache.druid.tests.TestNGGroup;
+import org.testng.annotations.Guice;
+import org.testng.annotations.Test;
+
+import java.util.List;
+
+/**
+ * IMPORTANT:
+ * To run this test, you must:
+ * 1) Set the bucket and path for your data. This can be done by setting -Ddruid.test.config.cloudBucket and
+ *    -Ddruid.test.config.cloudPath or setting "cloud_bucket" and "cloud_path" in the config file.
+ * 2) Copy wikipedia_index_data1.json, wikipedia_index_data2.json, and wikipedia_index_data3.json
+ *    located in integration-tests/src/test/resources/data/batch_index to your Azure at the location set in step 1.
+ * 3) Provide -Doverride.config.path=<PATH_TO_FILE> with Azure credentials and hdfs deep storage configs set. See
+ *    integration-tests/docker/environment-configs/override-examples/azure and
+ *    integration-tests/docker/environment-configs/override-examples/hdfs for env vars to provide.
+ * 4) Run the test with -Dstart.hadoop.docker=true in the mvn command
+ */
+@Test(groups = TestNGGroup.HDFS_DEEP_STORAGE)
+@Guice(moduleFactory = DruidTestModuleFactory.class)
+public class ITAzureToHdfsParallelIndexTest extends AbstractAzureInputSourceSimpleIndexTest
+{
+  /**
+   * Runs once per row of the data provider named "resources" and delegates to the inherited doTest helper.
+   * The test body is the same as the Azure-to-Azure variant; only the TestNG group (hdfs-deep-storage) and the
+   * cluster configuration described in the class comment differ.
+   *
+   * @param azureInputSource forwarded unchanged to doTest; presumably an input source property key paired with
+   *                         its value(s) - confirm against AbstractAzureInputSourceSimpleIndexTest
+   * @throws Exception if the delegated index test fails
+   */
+  @Test(dataProvider = "resources")
+  public void testAzureIndexData(Pair<String, List> azureInputSource) throws Exception
+  {
+    doTest(azureInputSource);
+  }
+}
--- a/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITGcsToGcsParallelIndexTest.java
+++ b/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITGcsToGcsParallelIndexTest.java
@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.tests.indexer;
+
+import org.apache.druid.java.util.common.Pair;
+import org.apache.druid.testing.guice.DruidTestModuleFactory;
+import org.apache.druid.tests.TestNGGroup;
+import org.testng.annotations.Guice;
+import org.testng.annotations.Test;
+
+import java.util.List;
+
+/**
+ * Runs doTest() for each GCS input source from the "resources" data provider, with GCS as deep storage.
+ * IMPORTANT: To run this test, you must:
+ * 1) Set the bucket and path for your data. This can be done by setting -Ddruid.test.config.cloudBucket and
+ *    -Ddruid.test.config.cloudPath or setting "cloud_bucket" and "cloud_path" in the config file.
+ * 2) Copy wikipedia_index_data1.json, wikipedia_index_data2.json, and wikipedia_index_data3.json
+ *    located in integration-tests/src/test/resources/data/batch_index to GCS at the location set in step 1.
+ * 3) Provide -Doverride.config.path=<PATH_TO_FILE> with GCS configs set. See
+ *    integration-tests/docker/environment-configs/override-examples/gcs for env vars to provide.
+ * 4) Provide -Dresource.file.dir.path=<PATH_TO_FOLDER> with folder that contains GOOGLE_APPLICATION_CREDENTIALS file.
+ */
+@Test(groups = TestNGGroup.GCS_DEEP_STORAGE)
+@Guice(moduleFactory = DruidTestModuleFactory.class)
+public class ITGcsToGcsParallelIndexTest extends AbstractGcsInputSourceSimpleIndexTest
+{
+  @Test(dataProvider = "resources")
+  public void testGcsIndexData(Pair<String, List> gcsInputSource) throws Exception
+  {
+    doTest(gcsInputSource);
+  }
+}
--- a/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITGcsToHdfsParallelIndexTest.java
+++ b/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITGcsToHdfsParallelIndexTest.java
@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.tests.indexer;
+
+import org.apache.druid.java.util.common.Pair;
+import org.apache.druid.testing.guice.DruidTestModuleFactory;
+import org.apache.druid.tests.TestNGGroup;
+import org.testng.annotations.Guice;
+import org.testng.annotations.Test;
+
+import java.util.List;
+
+/**
+ * Runs doTest() for each GCS input source from the "resources" data provider, with HDFS as deep storage.
+ * IMPORTANT: To run this test, you must:
+ * 1) Set the bucket and path for your data. This can be done by setting -Ddruid.test.config.cloudBucket and
+ *    -Ddruid.test.config.cloudPath or setting "cloud_bucket" and "cloud_path" in the config file.
+ * 2) Copy wikipedia_index_data1.json, wikipedia_index_data2.json, and wikipedia_index_data3.json
+ *    located in integration-tests/src/test/resources/data/batch_index to GCS at the location set in step 1.
+ * 3) Provide -Doverride.config.path=<PATH_TO_FILE> with GCS configs and HDFS deep storage configs set. See
+ *    integration-tests/docker/environment-configs/override-examples/gcs and
+ *    integration-tests/docker/environment-configs/override-examples/hdfs for env vars to provide.
+ * 4) Provide -Dresource.file.dir.path=<PATH_TO_FOLDER> with folder that contains GOOGLE_APPLICATION_CREDENTIALS file.
+ * 5) Run the test with -Dstart.hadoop.docker=true in the mvn command.
+ */
+@Test(groups = TestNGGroup.HDFS_DEEP_STORAGE)
+@Guice(moduleFactory = DruidTestModuleFactory.class)
+public class ITGcsToHdfsParallelIndexTest extends AbstractGcsInputSourceSimpleIndexTest
+{
+  @Test(dataProvider = "resources")
+  public void testGcsIndexData(Pair<String, List> gcsInputSource) throws Exception
+  {
+    doTest(gcsInputSource);
+  }
+}
--- a/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITHdfsToAzureSimpleIndexTest.java
+++ b/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITHdfsToAzureSimpleIndexTest.java
@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.tests.indexer;
+
+import org.apache.druid.java.util.common.Pair;
+import org.apache.druid.testing.guice.DruidTestModuleFactory;
+import org.apache.druid.tests.TestNGGroup;
+import org.testng.annotations.Guice;
+import org.testng.annotations.Test;
+
+import java.util.List;
+
+/**
+ * Runs doTest() for each HDFS input source from the "resources" data provider, with Azure as deep storage.
+ * IMPORTANT: To run this test, you must:
+ * 1) Run the test with -Dstart.hadoop.docker=true in the mvn command.
+ * 2) Provide -Doverride.config.path=<PATH_TO_FILE> with Azure credentials/configs set. See
+ *    integration-tests/docker/environment-configs/override-examples/azure for env vars to provide.
+ *    You will also need to add "druid-hdfs-storage" to druid_extensions_loadList in this file.
+ */
+@Test(groups = TestNGGroup.AZURE_DEEP_STORAGE)
+@Guice(moduleFactory = DruidTestModuleFactory.class)
+public class ITHdfsToAzureSimpleIndexTest extends AbstractHdfsInputSourceSimpleIndexTest
+{
+  @Test(dataProvider = "resources")
+  public void testHdfsIndexData(Pair<String, List> hdfsInputSource) throws Exception
+  {
+    doTest(hdfsInputSource);
+  }
+}
--- a/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITHdfsToGcsSimpleIndexTest.java
+++ b/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITHdfsToGcsSimpleIndexTest.java
@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.tests.indexer;
+
+import org.apache.druid.java.util.common.Pair;
+import org.apache.druid.testing.guice.DruidTestModuleFactory;
+import org.apache.druid.tests.TestNGGroup;
+import org.testng.annotations.Guice;
+import org.testng.annotations.Test;
+
+import java.util.List;
+
+/**
+ * Runs doTest() for each HDFS input source from the "resources" data provider, with GCS as deep storage.
+ * IMPORTANT: To run this test, you must:
+ * 1) Run the test with -Dstart.hadoop.docker=true in the mvn command.
+ * 2) Provide -Doverride.config.path=<PATH_TO_FILE> with GCS configs set. See
+ *    integration-tests/docker/environment-configs/override-examples/gcs for env vars to provide.
+ *    You will also need to add "druid-hdfs-storage" to druid_extensions_loadList in this file.
+ * 3) Provide -Dresource.file.dir.path=<PATH_TO_FOLDER> with folder that contains GOOGLE_APPLICATION_CREDENTIALS file.
+ */
+@Test(groups = TestNGGroup.GCS_DEEP_STORAGE)
+@Guice(moduleFactory = DruidTestModuleFactory.class)
+public class ITHdfsToGcsSimpleIndexTest extends AbstractHdfsInputSourceSimpleIndexTest
+{
+  @Test(dataProvider = "resources")
+  public void testHdfsIndexData(Pair<String, List> hdfsInputSource) throws Exception
+  {
+    doTest(hdfsInputSource);
+  }
+}
--- a/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITHdfsToHdfsSimpleIndexTest.java
+++ b/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITHdfsToHdfsSimpleIndexTest.java
@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.tests.indexer;
+
+import org.apache.druid.java.util.common.Pair;
+import org.apache.druid.testing.guice.DruidTestModuleFactory;
+import org.apache.druid.tests.TestNGGroup;
+import org.testng.annotations.Guice;
+import org.testng.annotations.Test;
+
+import java.util.List;
+
+/**
+ * Runs doTest() for each HDFS input source from the "resources" data provider, with HDFS as deep storage.
+ * IMPORTANT: To run this test, you must:
+ * 1) Run the test with -Dstart.hadoop.docker=true in the mvn command.
+ * 2) Provide -Doverride.config.path=<PATH_TO_FILE> with HDFS configs set. See
+ *    integration-tests/docker/environment-configs/override-examples/hdfs for env vars to provide.
+ */
+@Test(groups = TestNGGroup.HDFS_DEEP_STORAGE)
+@Guice(moduleFactory = DruidTestModuleFactory.class)
+public class ITHdfsToHdfsSimpleIndexTest extends AbstractHdfsInputSourceSimpleIndexTest
+{
+  @Test(dataProvider = "resources")
+  public void testHdfsIndexData(Pair<String, List> hdfsInputSource) throws Exception
+  {
+    doTest(hdfsInputSource);
+  }
+}
--- a/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITHdfsToS3SimpleIndexTest.java
+++ b/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITHdfsToS3SimpleIndexTest.java
@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.tests.indexer;
+
+import org.apache.druid.java.util.common.Pair;
+import org.apache.druid.testing.guice.DruidTestModuleFactory;
+import org.apache.druid.tests.TestNGGroup;
+import org.testng.annotations.Guice;
+import org.testng.annotations.Test;
+
+import java.util.List;
+
+/**
+ * Runs doTest() for each HDFS input source from the "resources" data provider, with S3 as deep storage.
+ * IMPORTANT: To run this test, you must:
+ * 1) Run the test with -Dstart.hadoop.docker=true in the mvn command.
+ * 2) Provide -Doverride.config.path=<PATH_TO_FILE> with S3 credentials/configs set. See
+ *    integration-tests/docker/environment-configs/override-examples/s3 for env vars to provide.
+ *    You will also need to add "druid-hdfs-storage" to druid_extensions_loadList in this file.
+ */
+@Test(groups = TestNGGroup.S3_DEEP_STORAGE)
+@Guice(moduleFactory = DruidTestModuleFactory.class)
+public class ITHdfsToS3SimpleIndexTest extends AbstractHdfsInputSourceSimpleIndexTest
+{
+  @Test(dataProvider = "resources")
+  public void testHdfsIndexData(Pair<String, List> hdfsInputSource) throws Exception
+  {
+    doTest(hdfsInputSource);
+  }
+}
--- a/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITS3ToHdfsParallelIndexTest.java
+++ b/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITS3ToHdfsParallelIndexTest.java
@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.tests.indexer;
+
+import org.apache.druid.java.util.common.Pair;
+import org.apache.druid.testing.guice.DruidTestModuleFactory;
+import org.apache.druid.tests.TestNGGroup;
+import org.testng.annotations.Guice;
+import org.testng.annotations.Test;
+
+import java.util.List;
+
+/**
+ * Runs doTest() for each S3 input source from the "resources" data provider, with HDFS as deep storage.
+ * IMPORTANT: To run this test, you must:
+ * 1) Set the bucket and path for your data. This can be done by setting -Ddruid.test.config.cloudBucket and
+ *    -Ddruid.test.config.cloudPath or setting "cloud_bucket" and "cloud_path" in the config file.
+ * 2) Copy wikipedia_index_data1.json, wikipedia_index_data2.json, and wikipedia_index_data3.json
+ *    located in integration-tests/src/test/resources/data/batch_index to S3 at the location set in step 1.
+ * 3) Provide -Doverride.config.path=<PATH_TO_FILE> with S3 credentials and HDFS deep storage configs set. See
+ *    integration-tests/docker/environment-configs/override-examples/s3 and
+ *    integration-tests/docker/environment-configs/override-examples/hdfs for env vars to provide.
+ * 4) Run the test with -Dstart.hadoop.docker=true in the mvn command.
+ */
+@Test(groups = TestNGGroup.HDFS_DEEP_STORAGE)
+@Guice(moduleFactory = DruidTestModuleFactory.class)
+public class ITS3ToHdfsParallelIndexTest extends AbstractS3InputSourceSimpleIndexTest
+{
+  @Test(dataProvider = "resources")
+  public void testS3IndexData(Pair<String, List> s3InputSource) throws Exception
+  {
+    doTest(s3InputSource);
+  }
+}
--- a/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITS3ToS3ParallelIndexTest.java
+++ b/integration-tests/src/test/java/org/apache/druid/tests/indexer/ITS3ToS3ParallelIndexTest.java
@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.tests.indexer;
+
+import org.apache.druid.java.util.common.Pair;
+import org.apache.druid.testing.guice.DruidTestModuleFactory;
+import org.apache.druid.tests.TestNGGroup;
+import org.testng.annotations.Guice;
+import org.testng.annotations.Test;
+
+import java.util.List;
+
+/**
+ * Runs doTest() for each S3 input source from the "resources" data provider, with S3 as deep storage.
+ * IMPORTANT: To run this test, you must:
+ * 1) Set the bucket and path for your data. This can be done by setting -Ddruid.test.config.cloudBucket and
+ *    -Ddruid.test.config.cloudPath or setting "cloud_bucket" and "cloud_path" in the config file.
+ * 2) Copy wikipedia_index_data1.json, wikipedia_index_data2.json, and wikipedia_index_data3.json
+ *    located in integration-tests/src/test/resources/data/batch_index to S3 at the location set in step 1.
+ * 3) Provide -Doverride.config.path=<PATH_TO_FILE> with S3 credentials/configs set. See
+ *    integration-tests/docker/environment-configs/override-examples/s3 for env vars to provide.
+ */
+@Test(groups = TestNGGroup.S3_DEEP_STORAGE)
+@Guice(moduleFactory = DruidTestModuleFactory.class)
+public class ITS3ToS3ParallelIndexTest extends AbstractS3InputSourceSimpleIndexTest
+{
+  @Test(dataProvider = "resources")
+  public void testS3IndexData(Pair<String, List> s3InputSource) throws Exception
+  {
+    doTest(s3InputSource);
+  }
+}
--- a/integration-tests/src/test/resources/indexer/wikipedia_cloud_simple_index_task.json
+++ b/integration-tests/src/test/resources/indexer/wikipedia_cloud_simple_index_task.json
@ -0,0 +1,81 @@
+{
+  "type": "index",
+  "spec": {
+    "dataSchema": {
+      "dataSource": "%%DATASOURCE%%",
+      "timestampSpec": {
+        "column": "timestamp"
+      },
+      "dimensionsSpec": {
+        "dimensions": [
+          "page",
+          {"type": "string", "name": "language", "createBitmapIndex": false},
+          "user",
+          "unpatrolled",
+          "newPage",
+          "robot",
+          "anonymous",
+          "namespace",
+          "continent",
+          "country",
+          "region",
+          "city"
+        ]
+      },
+      "metricsSpec": [
+        {
+          "type": "count",
+          "name": "count"
+        },
+        {
+          "type": "doubleSum",
+          "name": "added",
+          "fieldName": "added"
+        },
+        {
+          "type": "doubleSum",
+          "name": "deleted",
+          "fieldName": "deleted"
+        },
+        {
+          "type": "doubleSum",
+          "name": "delta",
+          "fieldName": "delta"
+        },
+        {
+          "name": "thetaSketch",
+          "type": "thetaSketch",
+          "fieldName": "user"
+        },
+        {
+          "name": "quantilesDoublesSketch",
+          "type": "quantilesDoublesSketch",
+          "fieldName": "delta"
+        },
+        {
+          "name": "HLLSketchBuild",
+          "type": "HLLSketchBuild",
+          "fieldName": "user"
+        }
+      ],
+      "granularitySpec": {
+        "segmentGranularity": "DAY",
+        "queryGranularity": "second",
+        "intervals" : [ "2013-08-31/2013-09-02" ]
+      }
+    },
+    "ioConfig": {
+      "type": "index",
+      "inputSource": {
+        "type": "%%INPUT_SOURCE_TYPE%%",
+        "%%INPUT_SOURCE_PROPERTY_KEY%%": %%INPUT_SOURCE_PROPERTY_VALUE%%
+      },
+      "inputFormat": {
+        "type": "json"
+      }
+    },
+    "tuningConfig": {
+      "type": "index"
+    }
+  }
+}