Abhishek Radhakrishnan 9f95a691f7
Extension to read and ingest Delta Lake tables (#15755)
* something

* test commit

* compilation fix

* more compilation fixes (fixme placeholders)

* Comment out druid-kereberos build since it conflicts with newly added transitive deps from delta-lake

Will need to sort out the dependencies later.

* checkpoint

* remove snapshot schema since we can get schema from the row

* iterator bug fix

* json json json

* sampler flow

* empty impls for read(InputStats) and sample()

* conversion?

* conversion, without timestamp

* Web console changes to show Delta Lake

* Asset bug fix and tile load

* Add missing pieces to input source info, etc.

* fix stuff

* Use a different delta lake asset

* Delta lake extension dependencies

* Cleanup

* Add InputSource, module init and helper code to process delta files.

* Test init

* Checkpoint changes

* Test resources and updates

* some fixes

* move to the correct package

* More tests

* Test cleanup

* TODOs

* Test updates

* requirements and javadocs

* Adjust dependencies

* Update readme

* Bump up version

* fixup typo in deps

* forbidden api and checkstyle checks

* Trim down dependencies

* new lines

* Fixup Intellij inspections.

* Add equals() and hashCode()

* chain splits, intellij inspections

* review comments and todo placeholder

* fix up some docs

* null table path and test dependencies. Fixup broken link.

* run prettify

* Different test; fixes

* Upgrade pyspark and delta-spark to latest (3.5.0 and 3.0.0) and regenerate tests

* yank the old test resource.

* add a couple of sad path tests

* Updates to readme based on latest.

* Version support

* Extract Delta DateTime converstions to DeltaTimeUtils class and add test

* More comprehensive split tests.

* Some test renames.

* Cleanup and update instructions.

* add pruneSchema() optimization for table scans.

* Oops, missed the parquet files.

* Update default table and rename schema constants.

* Test setup and misc changes.

* Add class loader logic as the context class loader is unaware about extension classes

* change some table client creation logic.

* Add hadoop-aws, hadoop-common and related exclusions.

* Remove org.apache.hadoop:hadoop-common

* Apply suggestions from code review

Co-authored-by: Victoria Lim <vtlim@users.noreply.github.com>

* Add entry to .spelling to fix docs static check

---------

Co-authored-by: abhishekagarwal87 <1477457+abhishekagarwal87@users.noreply.github.com>
Co-authored-by: Laksh Singla <lakshsingla@gmail.com>
Co-authored-by: Victoria Lim <vtlim@users.noreply.github.com>
2024-01-30 21:53:50 -08:00

580 lines
35 KiB
XML

<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Licensed to the Apache Software Foundation (ASF) under one
~ or more contributor license agreements. See the NOTICE file
~ distributed with this work for additional information
~ regarding copyright ownership. The ASF licenses this file
~ to you under the Apache License, Version 2.0 (the
~ "License"); you may not use this file except in compliance
~ with the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing,
~ software distributed under the License is distributed on an
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
~ KIND, either express or implied. See the License for the
~ specific language governing permissions and limitations
~ under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging>
<artifactId>distribution</artifactId>
<name>distribution</name>
<description>distribution</description>
<parent>
<artifactId>druid</artifactId>
<groupId>org.apache.druid</groupId>
<version>30.0.0-SNAPSHOT</version>
</parent>
<dependencies>
<dependency>
<groupId>org.apache.druid</groupId>
<artifactId>druid-services</artifactId>
<version>${project.parent.version}</version>
</dependency>
<dependency>
<!-- This is needed to bundle the web console -->
<groupId>org.apache.druid</groupId>
<artifactId>web-console</artifactId>
<version>${project.parent.version}</version>
</dependency>
</dependencies>
<properties>
<!-- the default value is a repeated flag from the command line, since blank value is not allowed -->
<druid.distribution.pulldeps.opts>--clean</druid.distribution.pulldeps.opts>
</properties>
<build>
<plugins>
<plugin>
<groupId>net.nicoulaj.maven.plugins</groupId>
<artifactId>checksum-maven-plugin</artifactId>
<version>1.7</version>
<executions>
<execution>
<id>dist-checksum</id>
<goals>
<goal>files</goal>
</goals>
</execution>
</executions>
<configuration>
<algorithms>
<algorithm>SHA-512</algorithm>
</algorithms>
<csvSummary>false</csvSummary>
<fileSets>
<fileSet>
<directory>${project.build.directory}</directory>
<includes>
<include>*-src.tar.gz</include>
<include>*-bin.tar.gz</include>
</includes>
</fileSet>
</fileSets>
<failIfNoFiles>false</failIfNoFiles>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-antrun-plugin</artifactId>
<version>3.0.0</version>
<executions>
<execution>
<phase>prepare-package</phase>
<configuration>
<target>
<concat destfile="${project.build.directory}/../../README.BINARY">
<fileset file="${project.build.directory}/../../README" />
<fileset file="${project.build.directory}/../../LABELS" />
</concat>
</target>
</configuration>
<goals>
<goal>run</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.owasp</groupId>
<artifactId>dependency-check-maven</artifactId>
<configuration>
<skip>true</skip>
</configuration>
</plugin>
</plugins>
</build>
<profiles>
<profile>
<id>dist</id>
<activation>
<activeByDefault>false</activeByDefault>
</activation>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<executions>
<execution>
<id>generate-readme</id>
<phase>initialize</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>${project.basedir}/bin/build-textfile-readme.sh</executable>
<arguments>
<argument>${project.basedir}/../</argument>
<argument>${project.parent.version}</argument>
</arguments>
</configuration>
</execution>
<execution>
<id>generate-binary-license</id>
<phase>initialize</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>${project.basedir}/bin/generate-binary-license.py</executable>
<arguments>
<argument>${project.parent.basedir}/licenses/APACHE2</argument>
<argument>${project.parent.basedir}/licenses.yaml</argument>
<argument>${project.parent.basedir}/LICENSE.BINARY</argument>
</arguments>
</configuration>
</execution>
<execution>
<id>generate-binary-notice</id>
<phase>initialize</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>${project.basedir}/bin/generate-binary-notice.py</executable>
<arguments>
<argument>${project.parent.basedir}/NOTICE</argument>
<argument>${project.parent.basedir}/licenses.yaml</argument>
<argument>${project.parent.basedir}/NOTICE.BINARY</argument>
</arguments>
</configuration>
</execution>
<execution>
<id>pull-deps</id>
<phase>package</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>${project.parent.basedir}/examples/bin/run-java</executable>
<arguments>
<argument>-classpath</argument>
<classpath />
<argument>-Ddruid.extensions.loadList=[]</argument>
<argument>-Ddruid.extensions.directory=${project.build.directory}/extensions
</argument>
<argument>
-Ddruid.extensions.hadoopDependenciesDir=${project.build.directory}/hadoop-dependencies
</argument>
<argument>org.apache.druid.cli.Main</argument>
<argument>tools</argument>
<argument>pull-deps</argument>
<argument>--clean</argument>
<argument>--defaultVersion</argument>
<argument>${project.parent.version}</argument>
<argument>-l</argument>
<argument>${settings.localRepository}</argument>
<argument>-h</argument>
<argument>org.apache.hadoop:hadoop-client-api:${hadoop.compile.version}</argument>
<argument>-h</argument>
<argument>org.apache.hadoop:hadoop-client-runtime:${hadoop.compile.version}</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-avro-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-azure-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-bloom-filter</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-datasketches</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-hdfs-storage</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-histogram</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-kafka-extraction-namespace</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-kafka-indexing-service</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-kinesis-indexing-service</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-lookups-cached-global</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-lookups-cached-single</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-multi-stage-query</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-protobuf-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:mysql-metadata-storage</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-orc-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-parquet-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:postgresql-metadata-storage</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-kerberos</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-s3-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-aws-rds-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-ec2-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-google-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-stats</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:simple-client-sslcontext</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-basic-security</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-pac4j</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-ranger-security</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-kubernetes-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-catalog</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-deltalake-extensions</argument>
<argument>${druid.distribution.pulldeps.opts}</argument>
</arguments>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<executions>
<execution>
<id>distro-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
<configuration>
<finalName>apache-druid-${project.parent.version}</finalName>
<tarLongFileMode>posix</tarLongFileMode>
<descriptors>
<descriptor>src/assembly/assembly.xml</descriptor>
</descriptors>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>license-maven-plugin</artifactId>
<executions>
<execution>
<id>download-licenses</id>
<goals>
<goal>download-licenses</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>apache-release</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<executions>
<execution>
<id>generate-licenses-report</id>
<phase>initialize</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>${project.basedir}/bin/generate-license-dependency-reports.py</executable>
<arguments>
<argument>${project.basedir}/../</argument>
<argument>${project.basedir}/target</argument>
<argument>--clean-maven-artifact-transfer</argument>
<argument>--parallel</argument>
<argument>2</argument>
</arguments>
</configuration>
</execution>
<execution>
<id>check-licenses</id>
<phase>test</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>${project.basedir}/bin/check-licenses.py</executable>
<arguments>
<argument>${project.parent.basedir}/licenses.yaml</argument>
<argument>${project.basedir}/target/license-reports</argument>
</arguments>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<executions>
<execution>
<id>source-release-assembly-druid</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
<configuration>
<finalName>apache-druid-${project.version}-src</finalName>
<tarLongFileMode>posix</tarLongFileMode>
<descriptors>
<descriptor>src/assembly/source-assembly.xml</descriptor>
</descriptors>
<appendAssemblyId>false</appendAssemblyId>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>bundle-contrib-exts</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<executions>
<execution>
<id>pull-deps-contrib-exts</id>
<phase>package</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>${project.parent.basedir}/examples/bin/run-java</executable>
<arguments>
<argument>-classpath</argument>
<classpath />
<argument>-Ddruid.extensions.loadList=[]</argument>
<argument>-Ddruid.extensions.directory=${project.build.directory}/extensions
</argument>
<argument>
-Ddruid.extensions.hadoopDependenciesDir=${project.build.directory}/hadoop-dependencies
</argument>
<argument>org.apache.druid.cli.Main</argument>
<argument>tools</argument>
<argument>pull-deps</argument>
<argument>--defaultVersion</argument>
<argument>${project.parent.version}</argument>
<argument>-l</argument>
<argument>${settings.localRepository}</argument>
<argument>--no-default-hadoop</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:ambari-metrics-emitter</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:dropwizard-emitter</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:druid-cassandra-storage</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:druid-cloudfiles-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:druid-distinctcount</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:graphite-emitter</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:druid-influx-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:druid-influxdb-emitter</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:kafka-emitter</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:druid-kubernetes-overlord-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:materialized-view-maintenance</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:materialized-view-selection</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:druid-opentsdb-emitter</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:druid-redis-cache</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:sqlserver-metadata-storage</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:statsd-emitter</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:prometheus-emitter</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:druid-thrift-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:druid-time-min-max</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:druid-virtual-columns</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:druid-moving-average-query</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:druid-momentsketch</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:druid-tdigestsketch</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:druid-ddsketch</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:gce-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:aliyun-oss-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:opentelemetry-emitter</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-iceberg-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-deltalake-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:druid-spectator-histogram</argument>
</arguments>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>integration-test</id>
<activation>
<activeByDefault>false</activeByDefault>
</activation>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<executions>
<execution>
<id>pull-deps</id>
<phase>package</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>${project.parent.basedir}/examples/bin/run-java</executable>
<arguments>
<argument>-classpath</argument>
<classpath />
<argument>-Ddruid.extensions.loadList=[]</argument>
<argument>-Ddruid.extensions.directory=${project.build.directory}/extensions
</argument>
<argument>
-Ddruid.extensions.hadoopDependenciesDir=${project.build.directory}/hadoop-dependencies
</argument>
<argument>org.apache.druid.cli.Main</argument>
<argument>tools</argument>
<argument>pull-deps</argument>
<argument>--clean</argument>
<argument>--defaultVersion</argument>
<argument>${project.parent.version}</argument>
<argument>-l</argument>
<argument>${settings.localRepository}</argument>
<argument>-h</argument>
<argument>org.apache.hadoop:hadoop-client:${hadoop.compile.version}</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-avro-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-azure-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-datasketches</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-hdfs-storage</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-histogram</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-kafka-indexing-service</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-kinesis-indexing-service</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-lookups-cached-global</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-multi-stage-query</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-protobuf-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:mysql-metadata-storage</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-orc-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-parquet-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:postgresql-metadata-storage</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-s3-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-ec2-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-google-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:simple-client-sslcontext</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-basic-security</argument>
<argument>-c</argument>
<argument>org.apache.druid:druid-integration-tests</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-testing-tools</argument>
<argument>${druid.distribution.pulldeps.opts}</argument>
</arguments>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<executions>
<execution>
<id>distro-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
<configuration>
<finalName>apache-druid-${project.version}-integration-test</finalName>
<tarLongFileMode>posix</tarLongFileMode>
<descriptors>
<descriptor>src/assembly/integration-test-assembly.xml</descriptor>
</descriptors>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
</profiles>
</project>