<?xml version="1.0" encoding="UTF-8"?>
<!-- druid/indexing-service/pom.xml -->
<!--
~ Licensed to the Apache Software Foundation (ASF) under one
~ or more contributor license agreements. See the NOTICE file
~ distributed with this work for additional information
~ regarding copyright ownership. The ASF licenses this file
~ to you under the Apache License, Version 2.0 (the
~ "License"); you may not use this file except in compliance
~ with the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing,
~ software distributed under the License is distributed on an
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
~ KIND, either express or implied. See the License for the
~ specific language governing permissions and limitations
~ under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <!-- Maven build descriptor for the druid-indexing-service module. -->
    <modelVersion>4.0.0</modelVersion>

    <!-- Module coordinates. No groupId or version is declared here; Maven inherits both from <parent>. -->
    <artifactId>druid-indexing-service</artifactId>
    <name>druid-indexing-service</name>
    <description>druid-indexing-service</description>

    <parent>
        <groupId>org.apache.druid</groupId>
        <artifactId>druid</artifactId>
        <version>26.0.0-SNAPSHOT</version>
    </parent>

    <!--
      Dependencies that omit <version> rely on managed versions (presumably declared in the parent POM;
      confirm there). Declaration order is kept as-is because it influences classpath ordering.
    -->
    <dependencies>
        <!-- Sibling Druid modules, pinned to the parent's version. -->
        <dependency>
            <groupId>org.apache.druid</groupId>
            <artifactId>druid-core</artifactId>
            <version>${project.parent.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.druid</groupId>
            <artifactId>druid-server</artifactId>
            <version>${project.parent.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.druid</groupId>
            <artifactId>druid-indexing-hadoop</artifactId>
            <version>${project.parent.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.druid</groupId>
            <artifactId>druid-processing</artifactId>
            <version>${project.parent.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.druid</groupId>
            <artifactId>druid-hll</artifactId>
            <version>${project.parent.version}</version>
        </dependency>

        <!-- Third-party libraries. -->
        <dependency>
            <groupId>io.dropwizard.metrics</groupId>
            <artifactId>metrics-core</artifactId>
        </dependency>
        <dependency>
            <groupId>com.google.code.findbugs</groupId>
            <artifactId>jsr305</artifactId>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-annotations</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.curator</groupId>
            <artifactId>curator-framework</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.curator</groupId>
            <artifactId>curator-client</artifactId>
        </dependency>
        <dependency>
            <groupId>joda-time</groupId>
            <artifactId>joda-time</artifactId>
        </dependency>
        <dependency>
            <groupId>jakarta.inject</groupId>
            <artifactId>jakarta.inject-api</artifactId>
        </dependency>
        <dependency>
            <groupId>com.google.inject</groupId>
            <artifactId>guice</artifactId>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
        </dependency>
        <dependency>
            <groupId>com.google.inject.extensions</groupId>
            <artifactId>guice-multibindings</artifactId>
        </dependency>
        <dependency>
            <groupId>commons-lang</groupId>
            <artifactId>commons-lang</artifactId>
        </dependency>
        <dependency>
            <groupId>javax.ws.rs</groupId>
            <artifactId>jsr311-api</artifactId>
        </dependency>
        <dependency>
            <groupId>io.netty</groupId>
            <artifactId>netty</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.zookeeper</groupId>
            <artifactId>zookeeper</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.zookeeper</groupId>
            <artifactId>zookeeper-jute</artifactId>
        </dependency>
        <dependency>
            <groupId>javax.servlet</groupId>
            <artifactId>javax.servlet-api</artifactId>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-core</artifactId>
        </dependency>
        <dependency>
            <groupId>com.sun.jersey</groupId>
            <artifactId>jersey-server</artifactId>
        </dependency>
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.curator</groupId>
            <artifactId>curator-recipes</artifactId>
        </dependency>
        <dependency>
            <groupId>javax.validation</groupId>
            <artifactId>validation-api</artifactId>
        </dependency>
        <dependency>
            <groupId>javax.servlet</groupId>
            <artifactId>servlet-api</artifactId>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>io.netty</groupId>
            <artifactId>netty-handler</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
        </dependency>
        <dependency>
            <groupId>commons-codec</groupId>
            <artifactId>commons-codec</artifactId>
        </dependency>
        <dependency>
            <groupId>org.eclipse.jetty</groupId>
            <artifactId>jetty-util</artifactId>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.jaxrs</groupId>
            <artifactId>jackson-jaxrs-smile-provider</artifactId>
        </dependency>
        <dependency>
            <groupId>com.google.errorprone</groupId>
            <artifactId>error_prone_annotations</artifactId>
        </dependency>
        <dependency>
            <groupId>it.unimi.dsi</groupId>
            <artifactId>fastutil-core</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-api</artifactId>
        </dependency>
        <dependency>
            <!-- Used in native parallel batch indexing to determine distribution of dimension values -->
            <groupId>org.apache.datasketches</groupId>
            <artifactId>datasketches-java</artifactId>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <!-- Used in native parallel batch indexing to determine distribution of dimension values -->
            <groupId>org.apache.datasketches</groupId>
            <artifactId>datasketches-memory</artifactId>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>net.thisptr</groupId>
            <artifactId>jackson-jq</artifactId>
        </dependency>
        <dependency>
            <groupId>org.codehaus.jackson</groupId>
            <artifactId>jackson-core-asl</artifactId>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-collections4</artifactId>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.eclipse.aether</groupId>
            <artifactId>aether-api</artifactId>
        </dependency>

        <!-- Tests -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.easymock</groupId>
            <artifactId>easymock</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.curator</groupId>
            <artifactId>curator-test</artifactId>
            <scope>test</scope>
        </dependency>
        <!-- test-jar artifacts of sibling Druid modules, so their test classes are available to this module's tests. -->
        <dependency>
            <groupId>org.apache.druid</groupId>
            <artifactId>druid-core</artifactId>
            <version>${project.parent.version}</version>
            <type>test-jar</type>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.druid</groupId>
            <artifactId>druid-processing</artifactId>
            <version>${project.parent.version}</version>
            <type>test-jar</type>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.druid</groupId>
            <artifactId>druid-server</artifactId>
            <version>${project.parent.version}</version>
            <type>test-jar</type>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.hamcrest</groupId>
            <artifactId>hamcrest-all</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.hamcrest</groupId>
            <artifactId>hamcrest-core</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.assertj</groupId>
            <artifactId>assertj-core</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>nl.jqno.equalsverifier</groupId>
            <artifactId>equalsverifier</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>com.github.stefanbirkner</groupId>
            <artifactId>system-rules</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.mockito</groupId>
            <artifactId>mockito-core</artifactId>
            <version>${mockito.version}</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <!--
      Hadoop flavour selection. Each profile adds provided-scope Hadoop client artifacts and sets the
      hadoop-task-libs property to the matching Maven coordinates.
    -->
    <profiles>
        <!--
          Default profile (activeByDefault). Per Maven's profile rules it is switched off automatically
          when another profile in this POM, such as hadoop3, is activated.
        -->
        <profile>
            <id>hadoop2</id>
            <activation>
                <activeByDefault>true</activeByDefault>
            </activation>
            <dependencies>
                <dependency>
                    <groupId>org.apache.hadoop</groupId>
                    <artifactId>hadoop-mapreduce-client-core</artifactId>
                    <scope>provided</scope>
                    <exclusions>
                        <exclusion>
                            <groupId>javax.servlet</groupId>
                            <artifactId>servlet-api</artifactId>
                        </exclusion>
                    </exclusions>
                </dependency>
                <dependency>
                    <groupId>org.apache.hadoop</groupId>
                    <artifactId>hadoop-common</artifactId>
                    <scope>provided</scope>
                </dependency>
                <dependency>
                    <groupId>org.apache.hadoop</groupId>
                    <artifactId>hadoop-client</artifactId>
                    <version>${hadoop.compile.version}</version>
                    <scope>provided</scope>
                    <exclusions>
                        <exclusion>
                            <groupId>org.apache.avro</groupId>
                            <artifactId>avro</artifactId>
                        </exclusion>
                    </exclusions>
                </dependency>
                <dependency>
                    <groupId>org.apache.hadoop</groupId>
                    <artifactId>hadoop-yarn-common</artifactId>
                    <scope>provided</scope>
                </dependency>
            </dependencies>
            <properties>
                <hadoop-task-libs>org.apache.hadoop:hadoop-client:${hadoop.compile.version}</hadoop-task-libs>
            </properties>
        </profile>
        <!-- Activated when the property hadoop3.enabled is set to true (e.g. -Dhadoop3.enabled=true). -->
        <profile>
            <id>hadoop3</id>
            <activation>
                <property>
                    <name>hadoop3.enabled</name>
                    <value>true</value>
                </property>
            </activation>
            <dependencies>
                <dependency>
                    <groupId>org.apache.hadoop</groupId>
                    <artifactId>hadoop-client-api</artifactId>
                    <version>${hadoop.compile.version}</version>
                    <scope>provided</scope>
                </dependency>
            </dependencies>
            <properties>
                <!-- Comma-separated list of coordinates: the client API plus the client runtime. -->
                <hadoop-task-libs>
                    org.apache.hadoop:hadoop-client-api:${hadoop.compile.version},org.apache.hadoop:hadoop-client-runtime:${hadoop.compile.version}
                </hadoop-task-libs>
            </properties>
        </profile>
    </profiles>

    <build>
        <plugins>
            <!-- Additionally packages this module's test classes as a test-jar. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-jar-plugin</artifactId>
                <executions>
                    <execution>
                        <goals>
                            <goal>test-jar</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <!--
              Copies src/main/resources/hadoop.indexer.libs.version into the build output with filtering
              enabled. Presumably that file references hadoop-task-libs, set by the active Hadoop
              profile; confirm against the resource file.
            -->
            <plugin>
                <artifactId>maven-resources-plugin</artifactId>
                <groupId>org.apache.maven.plugins</groupId>
                <configuration>
                    <outputDirectory>${project.build.outputDirectory}</outputDirectory>
                    <resources>
                        <resource>
                            <directory>src/main/resources</directory>
                            <includes>hadoop.indexer.libs.version</includes>
                            <filtering>true</filtering>
                        </resource>
                    </resources>
                </configuration>
            </plugin>
            <!-- Code coverage; the class listed below is excluded from coverage accounting. -->
            <plugin>
                <groupId>org.jacoco</groupId>
                <artifactId>jacoco-maven-plugin</artifactId>
                <version>${jacoco.version}</version>
                <configuration>
                    <excludes>
                        <!-- Tested in integration tests, but we lack unit tests.
                             (The newer async implementation does have unit tests.) -->
                        <exclude>org/apache/druid/indexing/seekablestream/SeekableStreamIndexTaskClientSyncImpl.class</exclude>
                    </excludes>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>