<?xml version="1.0" encoding="UTF-8"?>
<!--
  ~ Licensed to the Apache Software Foundation (ASF) under one
  ~ or more contributor license agreements. See the NOTICE file
  ~ distributed with this work for additional information
  ~ regarding copyright ownership. The ASF licenses this file
  ~ to you under the Apache License, Version 2.0 (the
  ~ "License"); you may not use this file except in compliance
  ~ with the License. You may obtain a copy of the License at
  ~
  ~ http://www.apache.org/licenses/LICENSE-2.0
  ~
  ~ Unless required by applicable law or agreed to in writing,
  ~ software distributed under the License is distributed on an
  ~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  ~ KIND, either express or implied. See the License for the
  ~ specific language governing permissions and limitations
  ~ under the License.
  -->

<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <!-- No groupId or version is declared for this module; Maven inherits both from <parent>. -->
  <artifactId>druid-indexing-service</artifactId>
  <name>druid-indexing-service</name>
  <description>druid-indexing-service</description>

  <parent>
    <groupId>org.apache.druid</groupId>
    <artifactId>druid</artifactId>
    <version>25.0.0-SNAPSHOT</version>
  </parent>

  <dependencies>
    <!-- Sibling Druid modules, pinned to the same version as the parent POM. -->
    <dependency>
      <groupId>org.apache.druid</groupId>
      <artifactId>druid-core</artifactId>
      <version>${project.parent.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.druid</groupId>
      <artifactId>druid-server</artifactId>
      <version>${project.parent.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.druid</groupId>
      <artifactId>druid-indexing-hadoop</artifactId>
      <version>${project.parent.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.druid</groupId>
      <artifactId>druid-processing</artifactId>
      <version>${project.parent.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.druid</groupId>
      <artifactId>druid-hll</artifactId>
      <version>${project.parent.version}</version>
    </dependency>

    <!--
      Third-party libraries. Most declare no <version> here; presumably each is
      pinned by the parent POM's dependencyManagement (verify there before adding one).
    -->
    <dependency>
      <groupId>io.dropwizard.metrics</groupId>
      <artifactId>metrics-core</artifactId>
    </dependency>
    <dependency>
      <groupId>com.google.code.findbugs</groupId>
      <artifactId>jsr305</artifactId>
    </dependency>
    <dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
    </dependency>
    <dependency>
      <groupId>com.fasterxml.jackson.core</groupId>
      <artifactId>jackson-annotations</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.curator</groupId>
      <artifactId>curator-framework</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.curator</groupId>
      <artifactId>curator-client</artifactId>
    </dependency>
    <dependency>
      <groupId>joda-time</groupId>
      <artifactId>joda-time</artifactId>
    </dependency>
    <dependency>
      <groupId>jakarta.inject</groupId>
      <artifactId>jakarta.inject-api</artifactId>
    </dependency>
    <dependency>
      <groupId>com.google.inject</groupId>
      <artifactId>guice</artifactId>
    </dependency>
    <dependency>
      <groupId>com.fasterxml.jackson.core</groupId>
      <artifactId>jackson-databind</artifactId>
    </dependency>
    <dependency>
      <groupId>com.google.inject.extensions</groupId>
      <artifactId>guice-multibindings</artifactId>
    </dependency>
    <dependency>
      <groupId>commons-lang</groupId>
      <artifactId>commons-lang</artifactId>
    </dependency>
    <dependency>
      <groupId>javax.ws.rs</groupId>
      <artifactId>jsr311-api</artifactId>
    </dependency>
    <dependency>
      <groupId>io.netty</groupId>
      <artifactId>netty</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.zookeeper</groupId>
      <artifactId>zookeeper</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.zookeeper</groupId>
      <artifactId>zookeeper-jute</artifactId>
    </dependency>
    <dependency>
      <groupId>javax.servlet</groupId>
      <artifactId>javax.servlet-api</artifactId>
    </dependency>
    <dependency>
      <groupId>com.fasterxml.jackson.core</groupId>
      <artifactId>jackson-core</artifactId>
    </dependency>
    <dependency>
      <groupId>com.sun.jersey</groupId>
      <artifactId>jersey-server</artifactId>
    </dependency>
    <dependency>
      <groupId>com.google.guava</groupId>
      <artifactId>guava</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.curator</groupId>
      <artifactId>curator-recipes</artifactId>
    </dependency>
    <dependency>
      <groupId>javax.validation</groupId>
      <artifactId>validation-api</artifactId>
    </dependency>
    <dependency>
      <!-- NOTE(review): javax.servlet:javax.servlet-api is also declared above; confirm both artifacts are needed. -->
      <groupId>javax.servlet</groupId>
      <artifactId>servlet-api</artifactId>
      <scope>provided</scope>
    </dependency>
    <dependency>
      <groupId>io.netty</groupId>
      <artifactId>netty-handler</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-lang3</artifactId>
    </dependency>
    <dependency>
      <groupId>commons-codec</groupId>
      <artifactId>commons-codec</artifactId>
    </dependency>
    <dependency>
      <groupId>org.eclipse.jetty</groupId>
      <artifactId>jetty-util</artifactId>
    </dependency>
    <dependency>
      <groupId>com.fasterxml.jackson.jaxrs</groupId>
      <artifactId>jackson-jaxrs-smile-provider</artifactId>
    </dependency>
    <dependency>
      <groupId>com.google.errorprone</groupId>
      <artifactId>error_prone_annotations</artifactId>
    </dependency>
    <dependency>
      <groupId>it.unimi.dsi</groupId>
      <artifactId>fastutil-core</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.logging.log4j</groupId>
      <artifactId>log4j-core</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.logging.log4j</groupId>
      <artifactId>log4j-api</artifactId>
    </dependency>
    <dependency>
      <!-- Used in native parallel batch indexing to determine distribution of dimension values -->
      <groupId>org.apache.datasketches</groupId>
      <artifactId>datasketches-java</artifactId>
      <scope>provided</scope>
    </dependency>
    <dependency>
      <!-- Used in native parallel batch indexing to determine distribution of dimension values -->
      <groupId>org.apache.datasketches</groupId>
      <artifactId>datasketches-memory</artifactId>
      <scope>provided</scope>
    </dependency>
    <dependency>
      <groupId>net.thisptr</groupId>
      <artifactId>jackson-jq</artifactId>
    </dependency>
    <dependency>
      <groupId>org.codehaus.jackson</groupId>
      <artifactId>jackson-core-asl</artifactId>
      <scope>provided</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-collections4</artifactId>
      <scope>provided</scope>
    </dependency>
    <dependency>
      <groupId>org.eclipse.aether</groupId>
      <artifactId>aether-api</artifactId>
    </dependency>

    <!-- Tests -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.easymock</groupId>
      <artifactId>easymock</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.curator</groupId>
      <artifactId>curator-test</artifactId>
      <scope>test</scope>
    </dependency>
    <!-- test-jar artifacts of sibling Druid modules, on the test classpath only. -->
    <dependency>
      <groupId>org.apache.druid</groupId>
      <artifactId>druid-core</artifactId>
      <version>${project.parent.version}</version>
      <type>test-jar</type>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.druid</groupId>
      <artifactId>druid-processing</artifactId>
      <version>${project.parent.version}</version>
      <type>test-jar</type>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.druid</groupId>
      <artifactId>druid-server</artifactId>
      <version>${project.parent.version}</version>
      <type>test-jar</type>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.hamcrest</groupId>
      <artifactId>hamcrest-all</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.hamcrest</groupId>
      <artifactId>hamcrest-core</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.assertj</groupId>
      <artifactId>assertj-core</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>nl.jqno.equalsverifier</groupId>
      <artifactId>equalsverifier</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>com.github.stefanbirkner</groupId>
      <artifactId>system-rules</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <!-- Unlike the other test libraries, this one names its version explicitly via ${mockito.version}. -->
      <groupId>org.mockito</groupId>
      <artifactId>mockito-core</artifactId>
      <version>${mockito.version}</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <profiles>
    <!-- Hadoop 2 build: active unless another profile is explicitly activated. -->
    <profile>
      <id>hadoop2</id>
      <activation>
        <activeByDefault>true</activeByDefault>
      </activation>
      <dependencies>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-mapreduce-client-core</artifactId>
          <scope>provided</scope>
          <exclusions>
            <exclusion>
              <groupId>javax.servlet</groupId>
              <artifactId>servlet-api</artifactId>
            </exclusion>
          </exclusions>
        </dependency>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-common</artifactId>
          <scope>provided</scope>
        </dependency>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-client</artifactId>
          <version>${hadoop.compile.version}</version>
          <scope>provided</scope>
          <exclusions>
            <exclusion>
              <groupId>org.apache.avro</groupId>
              <artifactId>avro</artifactId>
            </exclusion>
          </exclusions>
        </dependency>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-yarn-common</artifactId>
          <scope>provided</scope>
        </dependency>
      </dependencies>
      <properties>
        <hadoop-task-libs>org.apache.hadoop:hadoop-client:${hadoop.compile.version}</hadoop-task-libs>
      </properties>
    </profile>
    <!-- Hadoop 3 build: activated by setting the property hadoop3.enabled=true. -->
    <profile>
      <id>hadoop3</id>
      <activation>
        <property>
          <name>hadoop3.enabled</name>
          <value>true</value>
        </property>
      </activation>
      <dependencies>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-client-api</artifactId>
          <version>${hadoop.compile.version}</version>
          <scope>provided</scope>
        </dependency>
      </dependencies>
      <properties>
        <hadoop-task-libs>
          org.apache.hadoop:hadoop-client-api:${hadoop.compile.version},org.apache.hadoop:hadoop-client-runtime:${hadoop.compile.version}
        </hadoop-task-libs>
      </properties>
    </profile>
  </profiles>

  <build>
    <plugins>
      <!-- Also package this module's test classes as a test-jar artifact. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-jar-plugin</artifactId>
        <executions>
          <execution>
            <goals>
              <goal>test-jar</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <!--
        Copy hadoop.indexer.libs.version from src/main/resources with property
        filtering enabled. Presumably this is how the profile-specific
        hadoop-task-libs value reaches the packaged file; verify against the resource.
      -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-resources-plugin</artifactId>
        <configuration>
          <outputDirectory>${project.build.outputDirectory}</outputDirectory>
          <resources>
            <resource>
              <directory>src/main/resources</directory>
              <includes>
                <include>hadoop.indexer.libs.version</include>
              </includes>
              <filtering>true</filtering>
            </resource>
          </resources>
        </configuration>
      </plugin>
    </plugins>
  </build>

</project>