Add integration tests for HDFS (#9542)

* HDFS IT

* HDFS IT

* HDFS IT

* fix checkstyle
Maytas Monsereenusorn 2020-03-20 15:46:08 -07:00 committed by GitHub
parent 4870ad7b56
commit 5f127a1829
22 changed files with 773 additions and 107 deletions


@ -344,7 +344,7 @@ jobs:
name: "(Compile=openjdk8, Run=openjdk8) other integration test"
jdk: openjdk8
services: *integration_test_services
env: TESTNG_GROUPS='-DexcludedGroups=batch-index,perfect-rollup-parallel-batch-index,kafka-index,query,realtime-index,security,s3-deep-storage,gcs-deep-storage,azure-deep-storage' JVM_RUNTIME='-Djvm.runtime=8'
env: TESTNG_GROUPS='-DexcludedGroups=batch-index,perfect-rollup-parallel-batch-index,kafka-index,query,realtime-index,security,s3-deep-storage,gcs-deep-storage,azure-deep-storage,hdfs-deep-storage' JVM_RUNTIME='-Djvm.runtime=8'
script: *run_integration_test
after_failure: *integration_test_diags
# END - Integration tests for Compile with Java 8 and Run with Java 8
@ -383,7 +383,7 @@ jobs:
- <<: *integration_tests
name: "(Compile=openjdk8, Run=openjdk11) other integration test"
jdk: openjdk8
env: TESTNG_GROUPS='-DexcludedGroups=batch-index,perfect-rollup-parallel-batch-index,kafka-index,query,realtime-index,security,s3-deep-storage,gcs-deep-storage,azure-deep-storage' JVM_RUNTIME='-Djvm.runtime=11'
env: TESTNG_GROUPS='-DexcludedGroups=batch-index,perfect-rollup-parallel-batch-index,kafka-index,query,realtime-index,security,s3-deep-storage,gcs-deep-storage,azure-deep-storage,hdfs-deep-storage' JVM_RUNTIME='-Djvm.runtime=11'
# END - Integration tests for Compile with Java 8 and Run with Java 11
- name: "security vulnerabilities"


@ -140,15 +140,22 @@ Running a Test That Uses Hadoop
The integration test that indexes from hadoop is not run as part
of the integration test run discussed above. This is because druid
test clusters might not, in general, have access to hadoop.
That's the case (for now, at least) when using the docker cluster set
up by the integration-tests profile, so the hadoop test
has to be run using a cluster specified in a configuration file.
This also applies to integration tests that use Hadoop HDFS as an inputSource or as deep storage.
To run an integration test that uses Hadoop, you will have to run a Hadoop cluster. This can be done in two ways:
1) Run your own Druid + Hadoop cluster and specify the Hadoop configs in the configuration file (CONFIG_FILE).
2) Run the Druid Docker test clusters with a Hadoop container by passing -Dstart.hadoop.docker=true to the mvn command.
The data file is
integration-tests/src/test/resources/hadoop/batch_hadoop.data.
Currently, hdfs-deep-storage and other <cloud>-deep-storage integration test groups can only be run with
Druid Docker test clusters by passing -Dstart.hadoop.docker=true to start the Hadoop container.
You will also have to provide -Doverride.config.path=<PATH_TO_FILE> with your Druid's Hadoop configs set.
See the integration-tests/docker/environment-configs/override-examples/hdfs directory for an example.
Note that if the integration test you are running also uses another cloud extension (S3, Azure, GCS), additional
credentials/configs may need to be set in the same file as your Druid's Hadoop configs.
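For example, a run of only the hdfs-deep-storage group against the Docker cluster would look roughly like the following
(the -P integration-tests profile flag follows the module's usual invocation and is shown here as an illustration;
point -Doverride.config.path at a file modeled on the hdfs override example):

  mvn verify -P integration-tests -Dgroups=hdfs-deep-storage -Dstart.hadoop.docker=true -Doverride.config.path=<PATH_TO_FILE>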
Currently, ITHadoopIndexTest can only be run with your own Druid + Hadoop cluster by following the steps below:
Create a directory called batchHadoop1 in the hadoop file system
(anywhere you want) and put batch_hadoop.data into that directory
(as its only file).
(anywhere you want) and put batch_hadoop.data (integration-tests/src/test/resources/hadoop/batch_hadoop.data)
into that directory (as its only file).
Add this keyword to the configuration file (see above):


@ -23,7 +23,7 @@ LC_ALL=C.UTF-8
# JAVA OPTS
COMMON_DRUID_JAVA_OPTS=-Duser.timezone=UTC -Dfile.encoding=UTF-8 -Dlog4j.configurationFile=/shared/docker/lib/log4j2.xml
DRUID_DEP_LIB_DIR=/shared/hadoop_xml/*:/shared/docker/lib/*:/usr/local/druid/lib/mysql-connector-java.jar
DRUID_DEP_LIB_DIR=/shared/hadoop_xml:/shared/docker/lib/*:/usr/local/druid/lib/mysql-connector-java.jar
# Druid configs
druid_extensions_loadList=[]


@ -0,0 +1,24 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
druid_storage_type=hdfs
druid_storage_storageDirectory=/druid/segments
# Depending on the test, additional extension(s) may be required.
# Please refer to the other integration-tests/docker/environment-configs/override-examples/ files and Druid docs for
# additional env vars to provide for each extension.
druid_extensions_loadList=["druid-hdfs-storage"]
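As a sketch of that, a test that also reads its input from S3 would extend the load list along the following lines
(druid-s3-extensions is Druid's standard S3 extension; the S3 credential variables themselves are listed in the s3
override example and are omitted here):

  druid_extensions_loadList=["druid-hdfs-storage","druid-s3-extensions"]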


@ -55,6 +55,12 @@
<version>${project.parent.version}</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>org.apache.druid.extensions</groupId>
<artifactId>druid-hdfs-storage</artifactId>
<version>${project.parent.version}</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>org.apache.druid.extensions</groupId>
<artifactId>druid-datasketches</artifactId>


@ -68,6 +68,9 @@
# For druid-google-extensions
mkdir -p $SHARED_DIR/docker/extensions/druid-google-extensions
mv $SHARED_DIR/docker/lib/druid-google-extensions-* $SHARED_DIR/docker/extensions/druid-google-extensions
# For druid-hdfs-storage
mkdir -p $SHARED_DIR/docker/extensions/druid-hdfs-storage
mv $SHARED_DIR/docker/lib/druid-hdfs-storage-* $SHARED_DIR/docker/extensions/druid-hdfs-storage
# Pull Hadoop dependency if needed
if [ -n "$DRUID_INTEGRATION_TEST_START_HADOOP_DOCKER" ] && [ "$DRUID_INTEGRATION_TEST_START_HADOOP_DOCKER" == true ]
@ -157,6 +160,42 @@ fi
# Start docker containers for all Druid processes and dependencies
{
# Start Hadoop docker if needed
if [ -n "$DRUID_INTEGRATION_TEST_START_HADOOP_DOCKER" ] && [ "$DRUID_INTEGRATION_TEST_START_HADOOP_DOCKER" == true ]
then
# Start Hadoop docker container
docker run -d --privileged --net druid-it-net --ip 172.172.172.13 -h druid-it-hadoop --name druid-it-hadoop -p 2049:2049 -p 2122:2122 -p 8020:8020 -p 8021:8021 -p 8030:8030 -p 8031:8031 -p 8032:8032 -p 8033:8033 -p 8040:8040 -p 8042:8042 -p 8088:8088 -p 8443:8443 -p 9000:9000 -p 10020:10020 -p 19888:19888 -p 34455:34455 -p 49707:49707 -p 50010:50010 -p 50020:50020 -p 50030:50030 -p 50060:50060 -p 50070:50070 -p 50075:50075 -p 50090:50090 -p 51111:51111 -v $RESOURCEDIR:/resources -v $SHARED_DIR:/shared druid-it/hadoop:2.8.5 sh -c "/etc/bootstrap.sh && tail -f /dev/null"
# wait for hadoop namenode to be up
echo "Waiting for hadoop namenode to be up"
docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -mkdir -p /druid"
while [ $? -ne 0 ]
do
sleep 2
docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -mkdir -p /druid"
done
echo "Finished waiting for Hadoop namenode"
# Setup hadoop druid dirs
echo "Setting up druid hadoop dirs"
docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -mkdir -p /druid"
docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -mkdir -p /druid/segments"
docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -mkdir -p /quickstart"
docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -chmod 777 /druid"
docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -chmod 777 /druid/segments"
docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -chmod 777 /quickstart"
docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -chmod -R 777 /tmp"
docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -chmod -R 777 /user"
# Copy data files to Hadoop container
docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -put /shared/wikiticker-it/wikiticker-2015-09-12-sampled.json.gz /quickstart/wikiticker-2015-09-12-sampled.json.gz"
docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -put /resources/data/batch_index /batch_index"
echo "Finished setting up druid hadoop dirs"
echo "Copying Hadoop XML files to shared"
docker exec -t druid-it-hadoop sh -c "cp /usr/local/hadoop/etc/hadoop/*.xml /shared/hadoop_xml"
echo "Copied Hadoop XML files to shared"
fi
# Start zookeeper and kafka
docker run -d --privileged --net druid-it-net --ip 172.172.172.2 ${COMMON_ENV} --name druid-zookeeper-kafka -p 2181:2181 -p 9092:9092 -p 9093:9093 -v $SHARED_DIR:/shared -v $SERVICE_SUPERVISORDS_DIR/zookeeper.conf:$SUPERVISORDIR/zookeeper.conf -v $SERVICE_SUPERVISORDS_DIR/kafka.conf:$SUPERVISORDIR/kafka.conf druid/cluster
@ -189,39 +228,4 @@ fi
# Start Router with custom TLS cert checkers
docker run -d --privileged --net druid-it-net --ip 172.172.172.12 ${COMMON_ENV} ${ROUTER_CUSTOM_CHECK_TLS_ENV} ${OVERRIDE_ENV} --hostname druid-router-custom-check-tls --name druid-router-custom-check-tls -p 8891:8891 -p 9091:9091 -v $SHARED_DIR:/shared -v $SERVICE_SUPERVISORDS_DIR/druid.conf:$SUPERVISORDIR/druid.conf --link druid-zookeeper-kafka:druid-zookeeper-kafka --link druid-coordinator:druid-coordinator --link druid-broker:druid-broker druid/cluster
# Start Hadoop docker if needed
if [ -n "$DRUID_INTEGRATION_TEST_START_HADOOP_DOCKER" ] && [ "$DRUID_INTEGRATION_TEST_START_HADOOP_DOCKER" == true ]
then
# Start Hadoop docker container
docker run -d --privileged --net druid-it-net --ip 172.172.172.13 -h druid-it-hadoop --name druid-it-hadoop -p 2049:2049 -p 2122:2122 -p 8020:8020 -p 8021:8021 -p 8030:8030 -p 8031:8031 -p 8032:8032 -p 8033:8033 -p 8040:8040 -p 8042:8042 -p 8088:8088 -p 8443:8443 -p 9000:9000 -p 10020:10020 -p 19888:19888 -p 34455:34455 -p 49707:49707 -p 50010:50010 -p 50020:50020 -p 50030:50030 -p 50060:50060 -p 50070:50070 -p 50075:50075 -p 50090:50090 -p 51111:51111 -v $SHARED_DIR:/shared druid-it/hadoop:2.8.5 sh -c "/etc/bootstrap.sh && tail -f /dev/null"
# wait for hadoop namenode to be up
echo "Waiting for hadoop namenode to be up"
docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -mkdir -p /druid"
while [ $? -ne 0 ]
do
sleep 2
docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -mkdir -p /druid"
done
echo "Finished waiting for Hadoop namenode"
# Setup hadoop druid dirs
echo "Setting up druid hadoop dirs"
docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -mkdir -p /druid"
docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -mkdir -p /druid/segments"
docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -mkdir -p /quickstart"
docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -chmod 777 /druid"
docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -chmod 777 /druid/segments"
docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -chmod 777 /quickstart"
docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -chmod -R 777 /tmp"
docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -chmod -R 777 /user"
# Copy data files to Hadoop container
docker exec -t druid-it-hadoop sh -c "./usr/local/hadoop/bin/hdfs dfs -put /shared/wikiticker-it/wikiticker-2015-09-12-sampled.json.gz /quickstart/wikiticker-2015-09-12-sampled.json.gz"
echo "Finished setting up druid hadoop dirs"
echo "Copying Hadoop XML files to shared"
docker exec -t druid-it-hadoop sh -c "cp /usr/local/hadoop/etc/hadoop/*.xml /shared/hadoop_xml"
echo "Copied Hadoop XML files to shared"
fi
}


@ -36,17 +36,23 @@ public class TestNGGroup
// This group can only be run individually using -Dgroups=security since it requires specific test data setup.
public static final String SECURITY = "security";
// This group is not part of CI. To run this group, s3 configs/credentials for your s3 must be provided in a file.
// The path of the file must then we pass to mvn with -Doverride.config.path=<PATH_TO_FILE>
// The path of the file must then be passed to mvn with -Doverride.config.path=<PATH_TO_FILE>
// See integration-tests/docker/environment-configs/override-examples/s3 for env vars to provide.
public static final String S3_DEEP_STORAGE = "s3-deep-storage";
// This group is not part of CI. To run this group, gcs configs/credentials for your gcs must be provided in a file.
// The path of the file must then we pass to mvn with -Doverride.config.path=<PATH_TO_FILE>
// The path of the file must then be passed to mvn with -Doverride.config.path=<PATH_TO_FILE>
// See integration-tests/docker/environment-configs/override-examples/gcs for env vars to provide.
// The path to the folder that contains your GOOGLE_APPLICATION_CREDENTIALS file must also be passed
// to mvn with -Dresource.file.dir.path=<PATH_TO_FOLDER>
public static final String GCS_DEEP_STORAGE = "gcs-deep-storage";
// This group is not part of CI. To run this group, azure configs/credentials for your azure must be provided in a file.
// The path of the file must then we pass to mvn with -Doverride.config.path=<PATH_TO_FILE>
// The path of the file must then be passed to mvn with -Doverride.config.path=<PATH_TO_FILE>
// See integration-tests/docker/environment-configs/override-examples/azures for env vars to provide.
public static final String AZURE_DEEP_STORAGE = "azure-deep-storage";
// This group is not part of CI. To run this group, hadoop configs must be provided in a file. The path of the file
// must then be passed to mvn with -Doverride.config.path=<PATH_TO_FILE>
// See integration-tests/docker/environment-configs/override-examples/hdfs for env vars to provide.
// Additionally, hadoop docker must be started by passing -Dstart.hadoop.docker=true to mvn.
public static final String HDFS_DEEP_STORAGE = "hdfs-deep-storage";
}


@ -24,30 +24,14 @@ import com.google.common.collect.ImmutableMap;
import org.apache.druid.indexer.partitions.DynamicPartitionsSpec;
import org.apache.druid.java.util.common.Pair;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.testing.guice.DruidTestModuleFactory;
import org.apache.druid.tests.TestNGGroup;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Guice;
import org.testng.annotations.Test;
import java.io.Closeable;
import java.util.List;
import java.util.UUID;
import java.util.function.Function;
/**
* IMPORTANT:
* To run this test, you must:
* 1) Set the bucket and path for your data. This can be done by setting -Ddruid.test.config.cloudBucket and
* -Ddruid.test.config.cloudPath or setting "cloud_bucket" and "cloud_path" in the config file.
* 2) Copy wikipedia_index_data1.json, wikipedia_index_data2.json, and wikipedia_index_data3.json
* located in integration-tests/src/test/resources/data/batch_index to your Azure at the location set in step 1.
* 3) Provide -Doverride.config.path=<PATH_TO_FILE> with Azure credentials/configs set. See
* integration-tests/docker/environment-configs/override-examples/azure for env vars to provide.
*/
@Test(groups = TestNGGroup.AZURE_DEEP_STORAGE)
@Guice(moduleFactory = DruidTestModuleFactory.class)
public class ITAzureParallelIndexTest extends AbstractITBatchIndexTest
public abstract class AbstractAzureInputSourceSimpleIndexTest extends AbstractITBatchIndexTest
{
private static final String INDEX_TASK = "/indexer/wikipedia_cloud_index_task.json";
private static final String INDEX_QUERIES_RESOURCE = "/indexer/wikipedia_index_queries.json";
@ -85,8 +69,7 @@ public class ITAzureParallelIndexTest extends AbstractITBatchIndexTest
};
}
@Test(dataProvider = "resources")
public void testAzureIndexData(Pair<String, List> azureInputSource) throws Exception
void doTest(Pair<String, List> azureInputSource) throws Exception
{
try (
final Closeable ignored1 = unloader(INDEX_DATASOURCE + config.getExtraDatasourceNameSuffix());


@ -24,31 +24,14 @@ import com.google.common.collect.ImmutableMap;
import org.apache.druid.indexer.partitions.DynamicPartitionsSpec;
import org.apache.druid.java.util.common.Pair;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.testing.guice.DruidTestModuleFactory;
import org.apache.druid.tests.TestNGGroup;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Guice;
import org.testng.annotations.Test;
import java.io.Closeable;
import java.util.List;
import java.util.UUID;
import java.util.function.Function;
/**
* IMPORTANT:
* To run this test, you must:
* 1) Set the bucket and path for your data. This can be done by setting -Ddruid.test.config.cloudBucket and
* -Ddruid.test.config.cloudPath or setting "cloud_bucket" and "cloud_path" in the config file.
* 2) Copy wikipedia_index_data1.json, wikipedia_index_data2.json, and wikipedia_index_data3.json
* located in integration-tests/src/test/resources/data/batch_index to your GCS at the location set in step 1.
* 3) Provide -Doverride.config.path=<PATH_TO_FILE> with gcs configs set. See
* integration-tests/docker/environment-configs/override-examples/gcs for env vars to provide.
* 4) Provide -Dresource.file.dir.path=<PATH_TO_FOLDER> with folder that contains GOOGLE_APPLICATION_CREDENTIALS file
*/
@Test(groups = TestNGGroup.GCS_DEEP_STORAGE)
@Guice(moduleFactory = DruidTestModuleFactory.class)
public class ITGcsParallelIndexTest extends AbstractITBatchIndexTest
public abstract class AbstractGcsInputSourceSimpleIndexTest extends AbstractITBatchIndexTest
{
private static final String INDEX_TASK = "/indexer/wikipedia_cloud_index_task.json";
private static final String INDEX_QUERIES_RESOURCE = "/indexer/wikipedia_index_queries.json";
@ -86,8 +69,7 @@ public class ITGcsParallelIndexTest extends AbstractITBatchIndexTest
};
}
@Test(dataProvider = "resources")
public void testGcsIndexData(Pair<String, List> gcsInputSource) throws Exception
void doTest(Pair<String, List> gcsInputSource) throws Exception
{
try (
final Closeable ignored1 = unloader(INDEX_DATASOURCE + config.getExtraDatasourceNameSuffix());


@ -0,0 +1,100 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.tests.indexer;
import com.google.common.collect.ImmutableList;
import org.apache.druid.java.util.common.Pair;
import org.apache.druid.java.util.common.StringUtils;
import org.testng.annotations.DataProvider;
import java.io.Closeable;
import java.util.List;
import java.util.UUID;
import java.util.function.Function;
public abstract class AbstractHdfsInputSourceSimpleIndexTest extends AbstractITBatchIndexTest
{
private static final String INDEX_TASK = "/indexer/wikipedia_cloud_simple_index_task.json";
private static final String INDEX_QUERIES_RESOURCE = "/indexer/wikipedia_index_queries.json";
private static final String INDEX_DATASOURCE = "wikipedia_index_test_" + UUID.randomUUID();
private static final String INPUT_SOURCE_PATHS_KEY = "paths";
@DataProvider
public static Object[][] resources()
{
return new Object[][]{
{new Pair<>(INPUT_SOURCE_PATHS_KEY,
"hdfs://druid-it-hadoop:9000/batch_index"
)},
{new Pair<>(INPUT_SOURCE_PATHS_KEY,
ImmutableList.of(
"hdfs://druid-it-hadoop:9000/batch_index"
)
)},
{new Pair<>(INPUT_SOURCE_PATHS_KEY,
ImmutableList.of(
"hdfs://druid-it-hadoop:9000/batch_index/wikipedia_index_data1.json",
"hdfs://druid-it-hadoop:9000/batch_index/wikipedia_index_data2.json",
"hdfs://druid-it-hadoop:9000/batch_index/wikipedia_index_data3.json"
)
)}
};
}
void doTest(Pair<String, List> hdfsInputSource) throws Exception
{
try (
final Closeable ignored1 = unloader(INDEX_DATASOURCE + config.getExtraDatasourceNameSuffix());
) {
final Function<String, String> hdfsPropsTransform = spec -> {
try {
spec = StringUtils.replace(
spec,
"%%INPUT_SOURCE_TYPE%%",
"hdfs"
);
spec = StringUtils.replace(
spec,
"%%INPUT_SOURCE_PROPERTY_KEY%%",
hdfsInputSource.lhs
);
return StringUtils.replace(
spec,
"%%INPUT_SOURCE_PROPERTY_VALUE%%",
jsonMapper.writeValueAsString(hdfsInputSource.rhs)
);
}
catch (Exception e) {
throw new RuntimeException(e);
}
};
doIndexTest(
INDEX_DATASOURCE,
INDEX_TASK,
hdfsPropsTransform,
INDEX_QUERIES_RESOURCE,
false,
true,
true
);
}
}
}


@ -24,30 +24,14 @@ import com.google.common.collect.ImmutableMap;
import org.apache.druid.indexer.partitions.DynamicPartitionsSpec;
import org.apache.druid.java.util.common.Pair;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.testing.guice.DruidTestModuleFactory;
import org.apache.druid.tests.TestNGGroup;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Guice;
import org.testng.annotations.Test;
import java.io.Closeable;
import java.util.List;
import java.util.UUID;
import java.util.function.Function;
/**
* IMPORTANT:
* To run this test, you must:
* 1) Set the bucket and path for your data. This can be done by setting -Ddruid.test.config.cloudBucket and
* -Ddruid.test.config.cloudPath or setting "cloud_bucket" and "cloud_path" in the config file.
* 2) Copy wikipedia_index_data1.json, wikipedia_index_data2.json, and wikipedia_index_data3.json
* located in integration-tests/src/test/resources/data/batch_index to your S3 at the location set in step 1.
* 3) Provide -Doverride.config.path=<PATH_TO_FILE> with s3 credentials/configs set. See
* integration-tests/docker/environment-configs/override-examples/s3 for env vars to provide.
*/
@Test(groups = TestNGGroup.S3_DEEP_STORAGE)
@Guice(moduleFactory = DruidTestModuleFactory.class)
public class ITS3ParallelIndexTest extends AbstractITBatchIndexTest
public abstract class AbstractS3InputSourceSimpleIndexTest extends AbstractITBatchIndexTest
{
private static final String INDEX_TASK = "/indexer/wikipedia_cloud_index_task.json";
private static final String INDEX_QUERIES_RESOURCE = "/indexer/wikipedia_index_queries.json";
@ -85,8 +69,7 @@ public class ITS3ParallelIndexTest extends AbstractITBatchIndexTest
};
}
@Test(dataProvider = "resources")
public void testS3IndexData(Pair<String, List> s3InputSource) throws Exception
void doTest(Pair<String, List> s3InputSource) throws Exception
{
try (
final Closeable ignored1 = unloader(INDEX_DATASOURCE + config.getExtraDatasourceNameSuffix());


@ -0,0 +1,49 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.tests.indexer;
import org.apache.druid.java.util.common.Pair;
import org.apache.druid.testing.guice.DruidTestModuleFactory;
import org.apache.druid.tests.TestNGGroup;
import org.testng.annotations.Guice;
import org.testng.annotations.Test;
import java.util.List;
/**
* IMPORTANT:
* To run this test, you must:
* 1) Set the bucket and path for your data. This can be done by setting -Ddruid.test.config.cloudBucket and
* -Ddruid.test.config.cloudPath or setting "cloud_bucket" and "cloud_path" in the config file.
* 2) Copy wikipedia_index_data1.json, wikipedia_index_data2.json, and wikipedia_index_data3.json
* located in integration-tests/src/test/resources/data/batch_index to your Azure at the location set in step 1.
* 3) Provide -Doverride.config.path=<PATH_TO_FILE> with Azure credentials/configs set. See
* integration-tests/docker/environment-configs/override-examples/azure for env vars to provide.
*/
@Test(groups = TestNGGroup.AZURE_DEEP_STORAGE)
@Guice(moduleFactory = DruidTestModuleFactory.class)
public class ITAzureToAzureParallelIndexTest extends AbstractAzureInputSourceSimpleIndexTest
{
@Test(dataProvider = "resources")
public void testAzureIndexData(Pair<String, List> azureInputSource) throws Exception
{
doTest(azureInputSource);
}
}


@ -0,0 +1,51 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.tests.indexer;
import org.apache.druid.java.util.common.Pair;
import org.apache.druid.testing.guice.DruidTestModuleFactory;
import org.apache.druid.tests.TestNGGroup;
import org.testng.annotations.Guice;
import org.testng.annotations.Test;
import java.util.List;
/**
* IMPORTANT:
* To run this test, you must:
* 1) Set the bucket and path for your data. This can be done by setting -Ddruid.test.config.cloudBucket and
* -Ddruid.test.config.cloudPath or setting "cloud_bucket" and "cloud_path" in the config file.
* 2) Copy wikipedia_index_data1.json, wikipedia_index_data2.json, and wikipedia_index_data3.json
* located in integration-tests/src/test/resources/data/batch_index to your Azure at the location set in step 1.
* 3) Provide -Doverride.config.path=<PATH_TO_FILE> with Azure credentials and hdfs deep storage configs set. See
* integration-tests/docker/environment-configs/override-examples/azure and
* integration-tests/docker/environment-configs/override-examples/hdfs for env vars to provide.
* 4) Run the test with -Dstart.hadoop.docker=true in the mvn command
*/
@Test(groups = TestNGGroup.HDFS_DEEP_STORAGE)
@Guice(moduleFactory = DruidTestModuleFactory.class)
public class ITAzureToHdfsParallelIndexTest extends AbstractAzureInputSourceSimpleIndexTest
{
@Test(dataProvider = "resources")
public void testAzureIndexData(Pair<String, List> azureInputSource) throws Exception
{
doTest(azureInputSource);
}
}


@ -0,0 +1,50 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.tests.indexer;
import org.apache.druid.java.util.common.Pair;
import org.apache.druid.testing.guice.DruidTestModuleFactory;
import org.apache.druid.tests.TestNGGroup;
import org.testng.annotations.Guice;
import org.testng.annotations.Test;
import java.util.List;
/**
* IMPORTANT:
* To run this test, you must:
* 1) Set the bucket and path for your data. This can be done by setting -Ddruid.test.config.cloudBucket and
* -Ddruid.test.config.cloudPath or setting "cloud_bucket" and "cloud_path" in the config file.
* 2) Copy wikipedia_index_data1.json, wikipedia_index_data2.json, and wikipedia_index_data3.json
* located in integration-tests/src/test/resources/data/batch_index to your GCS at the location set in step 1.
* 3) Provide -Doverride.config.path=<PATH_TO_FILE> with gcs configs set. See
* integration-tests/docker/environment-configs/override-examples/gcs for env vars to provide.
* 4) Provide -Dresource.file.dir.path=<PATH_TO_FOLDER> with the folder that contains the GOOGLE_APPLICATION_CREDENTIALS file
*/
@Test(groups = TestNGGroup.GCS_DEEP_STORAGE)
@Guice(moduleFactory = DruidTestModuleFactory.class)
public class ITGcsToGcsParallelIndexTest extends AbstractGcsInputSourceSimpleIndexTest
{
@Test(dataProvider = "resources")
public void testGcsIndexData(Pair<String, List> gcsInputSource) throws Exception
{
doTest(gcsInputSource);
}
}


@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.tests.indexer;
import org.apache.druid.java.util.common.Pair;
import org.apache.druid.testing.guice.DruidTestModuleFactory;
import org.apache.druid.tests.TestNGGroup;
import org.testng.annotations.Guice;
import org.testng.annotations.Test;
import java.util.List;
/**
* IMPORTANT:
* To run this test, you must:
* 1) Set the bucket and path for your data. This can be done by setting -Ddruid.test.config.cloudBucket and
* -Ddruid.test.config.cloudPath or setting "cloud_bucket" and "cloud_path" in the config file.
* 2) Copy wikipedia_index_data1.json, wikipedia_index_data2.json, and wikipedia_index_data3.json
* located in integration-tests/src/test/resources/data/batch_index to your GCS at the location set in step 1.
* 3) Provide -Doverride.config.path=<PATH_TO_FILE> with gcs configs and hdfs deep storage configs set. See
* integration-tests/docker/environment-configs/override-examples/gcs and
* integration-tests/docker/environment-configs/override-examples/hdfs for env vars to provide.
* 4) Provide -Dresource.file.dir.path=<PATH_TO_FOLDER> with the folder that contains the GOOGLE_APPLICATION_CREDENTIALS file
* 5) Run the test with -Dstart.hadoop.docker=true in the mvn command
*/
@Test(groups = TestNGGroup.HDFS_DEEP_STORAGE)
@Guice(moduleFactory = DruidTestModuleFactory.class)
public class ITGcsToHdfsParallelIndexTest extends AbstractGcsInputSourceSimpleIndexTest
{
@Test(dataProvider = "resources")
public void testGcsIndexData(Pair<String, List> gcsInputSource) throws Exception
{
doTest(gcsInputSource);
}
}


@ -0,0 +1,47 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.tests.indexer;
import org.apache.druid.java.util.common.Pair;
import org.apache.druid.testing.guice.DruidTestModuleFactory;
import org.apache.druid.tests.TestNGGroup;
import org.testng.annotations.Guice;
import org.testng.annotations.Test;
import java.util.List;
/**
* IMPORTANT:
* To run this test, you must:
* 1) Run the test with -Dstart.hadoop.docker=true in the mvn command
* 2) Provide -Doverride.config.path=<PATH_TO_FILE> with Azure credentials/configs set. See
* integration-tests/docker/environment-configs/override-examples/azure for env vars to provide.
* You will also need to add "druid-hdfs-storage" to druid_extensions_loadList in this file.
*/
@Test(groups = TestNGGroup.AZURE_DEEP_STORAGE)
@Guice(moduleFactory = DruidTestModuleFactory.class)
public class ITHdfsToAzureSimpleIndexTest extends AbstractHdfsInputSourceSimpleIndexTest
{
@Test(dataProvider = "resources")
public void testHdfsIndexData(Pair<String, List> hdfsInputSource) throws Exception
{
doTest(hdfsInputSource);
}
}


@ -0,0 +1,48 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.tests.indexer;
import org.apache.druid.java.util.common.Pair;
import org.apache.druid.testing.guice.DruidTestModuleFactory;
import org.apache.druid.tests.TestNGGroup;
import org.testng.annotations.Guice;
import org.testng.annotations.Test;
import java.util.List;
/**
* IMPORTANT:
* To run this test, you must:
* 1) Run the test with -Dstart.hadoop.docker=true in the mvn command
* 2) Provide -Doverride.config.path=<PATH_TO_FILE> with gcs configs set. See
* integration-tests/docker/environment-configs/override-examples/gcs for env vars to provide.
* You will also need to add "druid-hdfs-storage" to druid_extensions_loadList in this file.
* 3) Provide -Dresource.file.dir.path=<PATH_TO_FOLDER> with the folder that contains the GOOGLE_APPLICATION_CREDENTIALS file
*/
@Test(groups = TestNGGroup.GCS_DEEP_STORAGE)
@Guice(moduleFactory = DruidTestModuleFactory.class)
public class ITHdfsToGcsSimpleIndexTest extends AbstractHdfsInputSourceSimpleIndexTest
{
@Test(dataProvider = "resources")
public void testHdfsIndexData(Pair<String, List> hdfsInputSource) throws Exception
{
doTest(hdfsInputSource);
}
}


@ -0,0 +1,46 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.tests.indexer;
import org.apache.druid.java.util.common.Pair;
import org.apache.druid.testing.guice.DruidTestModuleFactory;
import org.apache.druid.tests.TestNGGroup;
import org.testng.annotations.Guice;
import org.testng.annotations.Test;
import java.util.List;
/**
* IMPORTANT:
* To run this test, you must:
* 1) Run the test with -Dstart.hadoop.docker=true in the mvn command
* 2) Provide -Doverride.config.path=<PATH_TO_FILE> with hdfs configs set. See
* integration-tests/docker/environment-configs/override-examples/hdfs for env vars to provide.
*/
@Test(groups = TestNGGroup.HDFS_DEEP_STORAGE)
@Guice(moduleFactory = DruidTestModuleFactory.class)
public class ITHdfsToHdfsSimpleIndexTest extends AbstractHdfsInputSourceSimpleIndexTest
{
@Test(dataProvider = "resources")
public void testHdfsIndexData(Pair<String, List> hdfsInputSource) throws Exception
{
doTest(hdfsInputSource);
}
}


@ -0,0 +1,47 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.tests.indexer;
import org.apache.druid.java.util.common.Pair;
import org.apache.druid.testing.guice.DruidTestModuleFactory;
import org.apache.druid.tests.TestNGGroup;
import org.testng.annotations.Guice;
import org.testng.annotations.Test;
import java.util.List;
/**
* IMPORTANT:
* To run this test, you must:
* 1) Run the test with -Dstart.hadoop.docker=true in the mvn command
* 2) Provide -Doverride.config.path=<PATH_TO_FILE> with s3 credentials/configs set. See
* integration-tests/docker/environment-configs/override-examples/s3 for env vars to provide.
* You will also need to add "druid-hdfs-storage" to druid_extensions_loadList in this file.
*/
@Test(groups = TestNGGroup.S3_DEEP_STORAGE)
@Guice(moduleFactory = DruidTestModuleFactory.class)
public class ITHdfsToS3SimpleIndexTest extends AbstractHdfsInputSourceSimpleIndexTest
{
@Test(dataProvider = "resources")
public void testHdfsIndexData(Pair<String, List> hdfsInputSource) throws Exception
{
doTest(hdfsInputSource);
}
}


@ -0,0 +1,51 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.tests.indexer;
import org.apache.druid.java.util.common.Pair;
import org.apache.druid.testing.guice.DruidTestModuleFactory;
import org.apache.druid.tests.TestNGGroup;
import org.testng.annotations.Guice;
import org.testng.annotations.Test;
import java.util.List;
/**
* IMPORTANT:
* To run this test, you must:
* 1) Set the bucket and path for your data. This can be done by setting -Ddruid.test.config.cloudBucket and
* -Ddruid.test.config.cloudPath or setting "cloud_bucket" and "cloud_path" in the config file.
* 2) Copy wikipedia_index_data1.json, wikipedia_index_data2.json, and wikipedia_index_data3.json
* located in integration-tests/src/test/resources/data/batch_index to your S3 at the location set in step 1.
* 3) Provide -Doverride.config.path=<PATH_TO_FILE> with s3 credentials and hdfs deep storage configs set. See
* integration-tests/docker/environment-configs/override-examples/s3 and
* integration-tests/docker/environment-configs/override-examples/hdfs for env vars to provide.
* 4) Run the test with -Dstart.hadoop.docker=true in the mvn command
*/
@Test(groups = TestNGGroup.HDFS_DEEP_STORAGE)
@Guice(moduleFactory = DruidTestModuleFactory.class)
public class ITS3ToHdfsParallelIndexTest extends AbstractS3InputSourceSimpleIndexTest
{
@Test(dataProvider = "resources")
public void testS3IndexData(Pair<String, List> s3InputSource) throws Exception
{
doTest(s3InputSource);
}
}


@ -0,0 +1,49 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.tests.indexer;
import org.apache.druid.java.util.common.Pair;
import org.apache.druid.testing.guice.DruidTestModuleFactory;
import org.apache.druid.tests.TestNGGroup;
import org.testng.annotations.Guice;
import org.testng.annotations.Test;
import java.util.List;
/**
* IMPORTANT:
* To run this test, you must:
* 1) Set the bucket and path for your data. This can be done by setting -Ddruid.test.config.cloudBucket and
* -Ddruid.test.config.cloudPath or setting "cloud_bucket" and "cloud_path" in the config file.
* 2) Copy wikipedia_index_data1.json, wikipedia_index_data2.json, and wikipedia_index_data3.json
* located in integration-tests/src/test/resources/data/batch_index to your S3 at the location set in step 1.
* 3) Provide -Doverride.config.path=<PATH_TO_FILE> with s3 credentials/configs set. See
* integration-tests/docker/environment-configs/override-examples/s3 for env vars to provide.
*/
@Test(groups = TestNGGroup.S3_DEEP_STORAGE)
@Guice(moduleFactory = DruidTestModuleFactory.class)
public class ITS3ToS3ParallelIndexTest extends AbstractS3InputSourceSimpleIndexTest
{
@Test(dataProvider = "resources")
public void testS3IndexData(Pair<String, List> s3InputSource) throws Exception
{
doTest(s3InputSource);
}
}


@ -0,0 +1,81 @@
{
"type": "index",
"spec": {
"dataSchema": {
"dataSource": "%%DATASOURCE%%",
"timestampSpec": {
"column": "timestamp"
},
"dimensionsSpec": {
"dimensions": [
"page",
{"type": "string", "name": "language", "createBitmapIndex": false},
"user",
"unpatrolled",
"newPage",
"robot",
"anonymous",
"namespace",
"continent",
"country",
"region",
"city"
]
},
"metricsSpec": [
{
"type": "count",
"name": "count"
},
{
"type": "doubleSum",
"name": "added",
"fieldName": "added"
},
{
"type": "doubleSum",
"name": "deleted",
"fieldName": "deleted"
},
{
"type": "doubleSum",
"name": "delta",
"fieldName": "delta"
},
{
"name": "thetaSketch",
"type": "thetaSketch",
"fieldName": "user"
},
{
"name": "quantilesDoublesSketch",
"type": "quantilesDoublesSketch",
"fieldName": "delta"
},
{
"name": "HLLSketchBuild",
"type": "HLLSketchBuild",
"fieldName": "user"
}
],
"granularitySpec": {
"segmentGranularity": "DAY",
"queryGranularity": "second",
"intervals" : [ "2013-08-31/2013-09-02" ]
}
},
"ioConfig": {
"type": "index",
"inputSource": {
"type": "%%INPUT_SOURCE_TYPE%%",
"%%INPUT_SOURCE_PROPERTY_KEY%%": %%INPUT_SOURCE_PROPERTY_VALUE%%
},
"inputFormat": {
"type": "json"
}
},
"tuningConfig": {
"type": "index"
}
}
}
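The %%INPUT_SOURCE_TYPE%%, %%INPUT_SOURCE_PROPERTY_KEY%%, and %%INPUT_SOURCE_PROPERTY_VALUE%% placeholders above are
filled in by the abstract input source tests added in this commit. For the HDFS list-of-paths case (type "hdfs",
key "paths", value serialized with jsonMapper), the rendered ioConfig would look roughly like this:

  "ioConfig": {
    "type": "index",
    "inputSource": {
      "type": "hdfs",
      "paths": ["hdfs://druid-it-hadoop:9000/batch_index"]
    },
    "inputFormat": {
      "type": "json"
    }
  }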