2012-10-24 03:39:51 -04:00
|
|
|
<?xml version="1.0" encoding="UTF-8"?>
|
2012-10-24 05:09:43 -04:00
|
|
|
<!--
|
2018-07-11 12:55:18 -04:00
|
|
|
~ Licensed to the Apache Software Foundation (ASF) under one
|
|
|
|
~ or more contributor license agreements. See the NOTICE file
|
|
|
|
~ distributed with this work for additional information
|
|
|
|
~ regarding copyright ownership. The ASF licenses this file
|
|
|
|
~ to you under the Apache License, Version 2.0 (the
|
|
|
|
~ "License"); you may not use this file except in compliance
|
|
|
|
~ with the License. You may obtain a copy of the License at
|
2012-10-24 05:09:43 -04:00
|
|
|
~
|
2018-07-11 12:55:18 -04:00
|
|
|
~ http://www.apache.org/licenses/LICENSE-2.0
|
2012-10-24 05:09:43 -04:00
|
|
|
~
|
2018-07-11 12:55:18 -04:00
|
|
|
~ Unless required by applicable law or agreed to in writing,
|
|
|
|
~ software distributed under the License is distributed on an
|
|
|
|
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
|
|
~ KIND, either express or implied. See the License for the
|
|
|
|
~ specific language governing permissions and limitations
|
|
|
|
~ under the License.
|
2012-10-24 05:09:43 -04:00
|
|
|
-->
|
|
|
|
|
Async task client for SeekableStreamSupervisors. (#13354)
Main changes:
1) Convert SeekableStreamIndexTaskClient to an interface, move old code
to SeekableStreamIndexTaskClientSyncImpl, and add new implementation
SeekableStreamIndexTaskClientAsyncImpl that uses ServiceClient.
2) Add "chatAsync" parameter to seekable stream supervisors that causes
the supervisor to use an async task client.
3) In SeekableStreamSupervisor.discoverTasks, adjust logic to avoid making
blocking RPC calls in workerExec threads.
4) In SeekableStreamSupervisor generally, switch from Futures.successfulAsList
to FutureUtils.coalesce, so we can better capture the errors that occurred
with contacting individual tasks.
Other, related changes:
1) Add ServiceRetryPolicy.retryNotAvailable, which controls whether
ServiceClient retries unavailable services. Useful since we do not
want to retry calls to unavailable tasks within the service client. (The
supervisor does its own higher-level retries.)
2) Add FutureUtils.transformAsync, a more lambda friendly version of
Futures.transform(f, AsyncFunction).
3) Add FutureUtils.coalesce. Similar to Futures.successfulAsList, but
returns Either instead of using null on error.
4) Add JacksonUtils.readValue overloads for JavaType and TypeReference.
2022-11-21 08:50:26 -05:00
|
|
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
|
|
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
2015-02-03 19:48:00 -05:00
|
|
|
<modelVersion>4.0.0</modelVersion>
|
2012-10-24 03:39:51 -04:00
|
|
|
|
2015-02-03 19:48:00 -05:00
|
|
|
<artifactId>druid-indexing-service</artifactId>
|
|
|
|
<name>druid-indexing-service</name>
|
|
|
|
<description>druid-indexing-service</description>
|
2012-10-24 03:39:51 -04:00
|
|
|
|
2015-02-03 19:48:00 -05:00
|
|
|
<parent>
|
2018-08-30 12:56:26 -04:00
|
|
|
<groupId>org.apache.druid</groupId>
|
2015-02-03 19:48:00 -05:00
|
|
|
<artifactId>druid</artifactId>
|
2024-09-10 01:31:20 -04:00
|
|
|
<version>32.0.0-SNAPSHOT</version>
|
2015-02-03 19:48:00 -05:00
|
|
|
</parent>
|
2012-10-31 19:10:07 -04:00
|
|
|
|
2015-02-03 19:48:00 -05:00
|
|
|
<dependencies>
|
|
|
|
<dependency>
|
2018-08-30 12:56:26 -04:00
|
|
|
<groupId>org.apache.druid</groupId>
|
2023-02-17 17:27:41 -05:00
|
|
|
<artifactId>druid-processing</artifactId>
|
2015-02-03 19:48:00 -05:00
|
|
|
<version>${project.parent.version}</version>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
2018-08-30 12:56:26 -04:00
|
|
|
<groupId>org.apache.druid</groupId>
|
2015-02-03 19:48:00 -05:00
|
|
|
<artifactId>druid-server</artifactId>
|
|
|
|
<version>${project.parent.version}</version>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
2018-08-30 12:56:26 -04:00
|
|
|
<groupId>org.apache.druid</groupId>
|
2015-02-03 19:48:00 -05:00
|
|
|
<artifactId>druid-indexing-hadoop</artifactId>
|
|
|
|
<version>${project.parent.version}</version>
|
|
|
|
</dependency>
|
2019-09-09 17:37:21 -04:00
|
|
|
<dependency>
|
|
|
|
<groupId>io.dropwizard.metrics</groupId>
|
|
|
|
<artifactId>metrics-core</artifactId>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>com.google.code.findbugs</groupId>
|
|
|
|
<artifactId>jsr305</artifactId>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>commons-io</groupId>
|
|
|
|
<artifactId>commons-io</artifactId>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>com.fasterxml.jackson.core</groupId>
|
|
|
|
<artifactId>jackson-annotations</artifactId>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>org.apache.curator</groupId>
|
|
|
|
<artifactId>curator-framework</artifactId>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>org.apache.curator</groupId>
|
|
|
|
<artifactId>curator-client</artifactId>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>joda-time</groupId>
|
|
|
|
<artifactId>joda-time</artifactId>
|
|
|
|
</dependency>
|
2022-02-27 18:19:28 -05:00
|
|
|
<dependency>
|
|
|
|
<groupId>jakarta.inject</groupId>
|
|
|
|
<artifactId>jakarta.inject-api</artifactId>
|
|
|
|
</dependency>
|
2019-09-09 17:37:21 -04:00
|
|
|
<dependency>
|
|
|
|
<groupId>com.google.inject</groupId>
|
|
|
|
<artifactId>guice</artifactId>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>com.fasterxml.jackson.core</groupId>
|
|
|
|
<artifactId>jackson-databind</artifactId>
|
|
|
|
</dependency>
|
2024-12-18 22:38:20 -05:00
|
|
|
|
2019-09-09 17:37:21 -04:00
|
|
|
<dependency>
|
|
|
|
<groupId>javax.ws.rs</groupId>
|
|
|
|
<artifactId>jsr311-api</artifactId>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>io.netty</groupId>
|
|
|
|
<artifactId>netty</artifactId>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>org.apache.zookeeper</groupId>
|
|
|
|
<artifactId>zookeeper</artifactId>
|
|
|
|
</dependency>
|
2021-05-25 15:49:49 -04:00
|
|
|
<dependency>
|
|
|
|
<groupId>org.apache.zookeeper</groupId>
|
|
|
|
<artifactId>zookeeper-jute</artifactId>
|
|
|
|
</dependency>
|
2019-09-09 17:37:21 -04:00
|
|
|
<dependency>
|
|
|
|
<groupId>javax.servlet</groupId>
|
|
|
|
<artifactId>javax.servlet-api</artifactId>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>com.fasterxml.jackson.core</groupId>
|
|
|
|
<artifactId>jackson-core</artifactId>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>com.sun.jersey</groupId>
|
|
|
|
<artifactId>jersey-server</artifactId>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>com.google.guava</groupId>
|
|
|
|
<artifactId>guava</artifactId>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>org.apache.curator</groupId>
|
|
|
|
<artifactId>curator-recipes</artifactId>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
2023-08-14 02:20:51 -04:00
|
|
|
<groupId>jakarta.validation</groupId>
|
|
|
|
<artifactId>jakarta.validation-api</artifactId>
|
2019-09-09 17:37:21 -04:00
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>javax.servlet</groupId>
|
|
|
|
<artifactId>servlet-api</artifactId>
|
|
|
|
<scope>provided</scope>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>io.netty</groupId>
|
|
|
|
<artifactId>netty-handler</artifactId>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>org.apache.commons</groupId>
|
|
|
|
<artifactId>commons-lang3</artifactId>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>commons-codec</groupId>
|
|
|
|
<artifactId>commons-codec</artifactId>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>org.eclipse.jetty</groupId>
|
|
|
|
<artifactId>jetty-util</artifactId>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>com.fasterxml.jackson.jaxrs</groupId>
|
|
|
|
<artifactId>jackson-jaxrs-smile-provider</artifactId>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>com.google.errorprone</groupId>
|
|
|
|
<artifactId>error_prone_annotations</artifactId>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>it.unimi.dsi</groupId>
|
2021-06-10 10:43:18 -04:00
|
|
|
<artifactId>fastutil-core</artifactId>
|
2019-09-09 17:37:21 -04:00
|
|
|
</dependency>
|
Parallel indexing single dim partitions (#8925)
* Parallel indexing single dim partitions
Implements single dimension range partitioning for native parallel batch
indexing as described in #8769. This initial version requires the
druid-datasketches extension to be loaded.
The algorithm has 5 phases that are orchestrated by the supervisor in
`ParallelIndexSupervisorTask#runRangePartitionMultiPhaseParallel()`.
These phases and the main classes involved are described below:
1) In parallel, determine the distribution of dimension values for each
input source split.
`PartialDimensionDistributionTask` uses `StringSketch` to generate
the approximate distribution of dimension values for each input
source split. If the rows are ungrouped,
`PartialDimensionDistributionTask.UngroupedRowDimensionValueFilter`
uses a Bloom filter to skip rows that would be grouped. The final
distribution is sent back to the supervisor via
`DimensionDistributionReport`.
2) The range partitions are determined.
In `ParallelIndexSupervisorTask#determineAllRangePartitions()`, the
supervisor uses `StringSketchMerger` to merge the individual
`StringSketch`es created in the preceding phase. The merged sketch is
then used to create the range partitions.
3) In parallel, generate partial range-partitioned segments.
`PartialRangeSegmentGenerateTask` uses the range partitions
determined in the preceding phase and
`RangePartitionCachingLocalSegmentAllocator` to generate
`SingleDimensionShardSpec`s. The partition information is sent back
to the supervisor via `GeneratedGenericPartitionsReport`.
4) The partial range segments are grouped.
In `ParallelIndexSupervisorTask#groupGenericPartitionLocationsPerPartition()`,
the supervisor creates the `PartialGenericSegmentMergeIOConfig`s
necessary for the next phase.
5) In parallel, merge partial range-partitioned segments.
`PartialGenericSegmentMergeTask` uses `GenericPartitionLocation` to
retrieve the partial range-partitioned segments generated earlier and
then merges and publishes them.
* Fix dependencies & forbidden apis
* Fixes for integration test
* Address review comments
* Fix docs, strict compile, sketch check, rollup check
* Fix first shard spec, partition serde, single subtask
* Fix first partition check in test
* Misc rewording/refactoring to address code review
* Fix doc link
* Split batch index integration test
* Do not run parallel-batch-index twice
* Adjust last partition
* Split ITParallelIndexTest to reduce runtime
* Rename test class
* Allow null values in range partitions
* Indicate which phase failed
* Improve asserts in tests
2019-12-10 02:05:49 -05:00
|
|
|
<dependency>
|
|
|
|
<groupId>org.apache.logging.log4j</groupId>
|
|
|
|
<artifactId>log4j-core</artifactId>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>org.apache.logging.log4j</groupId>
|
|
|
|
<artifactId>log4j-api</artifactId>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<!-- Used in native parallel batch indexing to determine distribution of dimension values -->
|
|
|
|
<groupId>org.apache.datasketches</groupId>
|
|
|
|
<artifactId>datasketches-java</artifactId>
|
|
|
|
<scope>provided</scope>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<!-- Used in native parallel batch indexing to determine distribution of dimension values -->
|
|
|
|
<groupId>org.apache.datasketches</groupId>
|
|
|
|
<artifactId>datasketches-memory</artifactId>
|
|
|
|
<scope>provided</scope>
|
|
|
|
</dependency>
|
2021-07-09 03:10:29 -04:00
|
|
|
<dependency>
|
|
|
|
<groupId>org.apache.commons</groupId>
|
|
|
|
<artifactId>commons-collections4</artifactId>
|
|
|
|
<scope>provided</scope>
|
|
|
|
</dependency>
|
2015-02-03 19:48:00 -05:00
|
|
|
<!-- Tests -->
|
|
|
|
<dependency>
|
|
|
|
<groupId>junit</groupId>
|
|
|
|
<artifactId>junit</artifactId>
|
|
|
|
<scope>test</scope>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>org.easymock</groupId>
|
|
|
|
<artifactId>easymock</artifactId>
|
|
|
|
<scope>test</scope>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>org.apache.curator</groupId>
|
|
|
|
<artifactId>curator-test</artifactId>
|
|
|
|
<scope>test</scope>
|
|
|
|
</dependency>
|
2018-10-02 13:50:22 -04:00
|
|
|
<dependency>
|
|
|
|
<groupId>org.apache.druid</groupId>
|
2017-06-08 09:32:10 -04:00
|
|
|
<artifactId>druid-processing</artifactId>
|
|
|
|
<version>${project.parent.version}</version>
|
|
|
|
<type>test-jar</type>
|
|
|
|
<scope>test</scope>
|
|
|
|
</dependency>
|
2015-06-26 11:07:58 -04:00
|
|
|
<dependency>
|
2018-08-30 12:56:26 -04:00
|
|
|
<groupId>org.apache.druid</groupId>
|
2015-06-26 11:07:58 -04:00
|
|
|
<artifactId>druid-server</artifactId>
|
|
|
|
<version>${project.parent.version}</version>
|
|
|
|
<type>test-jar</type>
|
|
|
|
<scope>test</scope>
|
|
|
|
</dependency>
|
2019-11-20 20:24:12 -05:00
|
|
|
<dependency>
|
|
|
|
<groupId>org.hamcrest</groupId>
|
|
|
|
<artifactId>hamcrest-all</artifactId>
|
|
|
|
<scope>test</scope>
|
|
|
|
</dependency>
|
2019-09-09 17:37:21 -04:00
|
|
|
<dependency>
|
|
|
|
<groupId>org.hamcrest</groupId>
|
|
|
|
<artifactId>hamcrest-core</artifactId>
|
|
|
|
<scope>test</scope>
|
|
|
|
</dependency>
|
2019-11-06 14:07:04 -05:00
|
|
|
<dependency>
|
|
|
|
<groupId>org.assertj</groupId>
|
|
|
|
<artifactId>assertj-core</artifactId>
|
|
|
|
<scope>test</scope>
|
|
|
|
</dependency>
|
2020-01-21 15:59:43 -05:00
|
|
|
<dependency>
|
|
|
|
<groupId>nl.jqno.equalsverifier</groupId>
|
|
|
|
<artifactId>equalsverifier</artifactId>
|
|
|
|
<scope>test</scope>
|
|
|
|
</dependency>
|
2020-02-10 18:17:54 -05:00
|
|
|
<dependency>
|
|
|
|
<groupId>com.github.stefanbirkner</groupId>
|
|
|
|
<artifactId>system-rules</artifactId>
|
|
|
|
<scope>test</scope>
|
|
|
|
</dependency>
|
2020-10-10 22:35:17 -04:00
|
|
|
<dependency>
|
|
|
|
<groupId>org.mockito</groupId>
|
|
|
|
<artifactId>mockito-core</artifactId>
|
|
|
|
<scope>test</scope>
|
|
|
|
</dependency>
|
2024-09-16 06:40:25 -04:00
|
|
|
<dependency>
|
|
|
|
<!-- This is the only dependency in this file with a literal version number; the others either omit the version element or use a property. It also declares no scope. NOTE(review): confirm the pin and the default scope are intentional. -->
<groupId>org.apache.maven.resolver</groupId>
|
|
|
|
<artifactId>maven-resolver-api</artifactId>
|
|
|
|
<version>1.3.1</version>
|
|
|
|
</dependency>
|
2015-02-03 19:48:00 -05:00
|
|
|
</dependencies>
|
2013-12-03 23:51:19 -05:00
|
|
|
|
2021-10-30 13:16:24 -04:00
|
|
|
<profiles>
|
|
|
|
<profile>
|
|
|
|
<id>hadoop3</id>
|
|
|
|
<activation>
|
2023-04-26 03:22:51 -04:00
|
|
|
<activeByDefault>true</activeByDefault>
|
2021-10-30 13:16:24 -04:00
|
|
|
</activation>
|
|
|
|
<dependencies>
|
|
|
|
<dependency>
|
|
|
|
<groupId>org.apache.hadoop</groupId>
|
|
|
|
<artifactId>hadoop-client-api</artifactId>
|
|
|
|
<version>${hadoop.compile.version}</version>
|
|
|
|
<scope>provided</scope>
|
|
|
|
</dependency>
|
|
|
|
</dependencies>
|
|
|
|
<properties>
|
|
|
|
<!-- Comma-separated Maven coordinates (groupId:artifactId:version) of the Hadoop client API and client runtime artifacts, both versioned by the hadoop.compile.version property. -->
<!-- NOTE(review): the value is written on its own line, so the surrounding line breaks are part of the property text; presumably whatever reads it trims whitespace. Confirm before reformatting. -->
<hadoop-task-libs>
|
|
|
|
org.apache.hadoop:hadoop-client-api:${hadoop.compile.version},org.apache.hadoop:hadoop-client-runtime:${hadoop.compile.version}
|
|
|
|
</hadoop-task-libs>
|
|
|
|
</properties>
|
|
|
|
</profile>
|
|
|
|
</profiles>
|
|
|
|
|
2016-01-07 01:33:07 -05:00
|
|
|
<build>
|
|
|
|
<plugins>
|
|
|
|
<plugin>
|
|
|
|
<groupId>org.apache.maven.plugins</groupId>
|
|
|
|
<artifactId>maven-jar-plugin</artifactId>
|
|
|
|
<executions>
|
|
|
|
<execution>
|
|
|
|
<goals>
|
|
|
|
<goal>test-jar</goal>
|
|
|
|
</goals>
|
|
|
|
</execution>
|
|
|
|
</executions>
|
|
|
|
</plugin>
|
2021-10-30 13:16:24 -04:00
|
|
|
<plugin>
|
|
|
|
<!-- Resource handling for this module: within src/main/resources, includes only hadoop.indexer.libs.version, with property filtering turned on, and writes to the project's build output directory. -->
|
|
|
|
<groupId>org.apache.maven.plugins</groupId>
|
|
|
|
<artifactId>maven-resources-plugin</artifactId>
|
|
|
|
<configuration>
|
|
|
|
<outputDirectory>${project.build.outputDirectory}</outputDirectory>
|
|
|
|
<resources>
|
|
|
|
<resource>
|
|
|
|
<directory>src/main/resources</directory>
|
|
|
|
<!-- Each pattern gets its own include element, the documented form for resource includes. -->
|
|
|
|
<includes>
|
|
|
|
<include>hadoop.indexer.libs.version</include>
|
|
|
|
</includes>
|
|
|
|
<filtering>true</filtering>
|
|
|
|
</resource>
|
|
|
|
</resources>
|
|
|
|
</configuration>
|
|
|
|
</plugin>
|
Async task client for SeekableStreamSupervisors. (#13354)
Main changes:
1) Convert SeekableStreamIndexTaskClient to an interface, move old code
to SeekableStreamIndexTaskClientSyncImpl, and add new implementation
SeekableStreamIndexTaskClientAsyncImpl that uses ServiceClient.
2) Add "chatAsync" parameter to seekable stream supervisors that causes
the supervisor to use an async task client.
3) In SeekableStreamSupervisor.discoverTasks, adjust logic to avoid making
blocking RPC calls in workerExec threads.
4) In SeekableStreamSupervisor generally, switch from Futures.successfulAsList
to FutureUtils.coalesce, so we can better capture the errors that occurred
with contacting individual tasks.
Other, related changes:
1) Add ServiceRetryPolicy.retryNotAvailable, which controls whether
ServiceClient retries unavailable services. Useful since we do not
want to retry calls to unavailable tasks within the service client. (The
supervisor does its own higher-level retries.)
2) Add FutureUtils.transformAsync, a more lambda friendly version of
Futures.transform(f, AsyncFunction).
3) Add FutureUtils.coalesce. Similar to Futures.successfulAsList, but
returns Either instead of using null on error.
4) Add JacksonUtils.readValue overloads for JavaType and TypeReference.
2022-11-21 08:50:26 -05:00
|
|
|
<plugin>
|
|
|
|
<groupId>org.jacoco</groupId>
|
|
|
|
<artifactId>jacoco-maven-plugin</artifactId>
|
|
|
|
<configuration>
|
|
|
|
<excludes>
|
|
|
|
<!-- Tested in integration tests, but we lack unit tests.
|
|
|
|
(The newer async implementation does have unit tests.) -->
|
|
|
|
<exclude>org/apache/druid/indexing/seekablestream/SeekableStreamIndexTaskClientSyncImpl.class</exclude>
|
|
|
|
</excludes>
|
|
|
|
</configuration>
|
|
|
|
</plugin>
|
2016-01-07 01:33:07 -05:00
|
|
|
</plugins>
|
|
|
|
</build>
|
|
|
|
|
2012-10-24 03:39:51 -04:00
|
|
|
</project>
|