Update Hadoop3 as default build version (#14005)

Hadoop 2 often causes red security scans on the Druid distribution because of the dependencies it brings in. We want to move away from Hadoop 2 and make a Hadoop 3 distribution available. This switches Druid to building with Hadoop 3 by default. Druid will still be compatible with Hadoop 2, and users can build a Hadoop 2–compatible distribution using the hadoop2 profile.
This commit is contained in:
Tejaswini Bandlamudi 2023-04-26 12:52:51 +05:30 committed by GitHub
parent 752475b799
commit 774073b2e7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 171 additions and 136 deletions

View File

@ -85,21 +85,15 @@ jobs:
if: ${{ matrix.java == 'jdk8' }}
run: ./.github/scripts/license_checks_script.sh
- name: license checks for hadoop3
if: ${{ matrix.java == 'jdk8' }}
env:
HADOOP_PROFILE: -Phadoop3
run: ./.github/scripts/license_checks_script.sh
- name: analyze dependencies
if: ${{ matrix.java == 'jdk8' }}
run: |
./.github/scripts/analyze_dependencies_script.sh
- name: analyze dependencies for hadoop3
- name: analyze dependencies for hadoop2
if: ${{ matrix.java == 'jdk8' }}
env:
HADOOP_PROFILE: -Phadoop3
HADOOP_PROFILE: -Phadoop2
run: |
./.github/scripts/analyze_dependencies_script.sh

View File

@ -22,7 +22,7 @@ import sys
# this script does some primitive examination of git diff to determine if a test suite needs to be run or not
# these jobs should always be run, no matter what
always_run_jobs = ['license checks', 'license checks with Hadoop3', '(openjdk8) packaging check', '(openjdk11) packaging check']
always_run_jobs = ['license checks', '(openjdk8) packaging check', '(openjdk11) packaging check']
# ignore changes to these files completely since they don't impact CI, if the changes are only to these files then all
# of CI can be skipped. however, jobs which are always run will still be run even if only these files are changed

View File

@ -328,30 +328,6 @@ apache-druid-0.17.0-src.tar.gz.asc
apache-druid-0.17.0-src.tar.gz.sha512
```
#### Build artifacts for Hadoop-3
```bash
$ mvn clean install -Phadoop3,apache-release,dist,rat -DskipTests -Dgpg.keyname=<your GPG key fingerprint>
```
This should produce the following artifacts:
```plaintext
apache-druid-0.17.0-bin.tar.gz
apache-druid-0.17.0-bin.tar.gz.asc
apache-druid-0.17.0-bin.tar.gz.sha512
apache-druid-0.17.0-src.tar.gz
apache-druid-0.17.0-src.tar.gz.asc
apache-druid-0.17.0-src.tar.gz.sha512
```
You can ignore the src artifacts as they are the same as the main profile. The binary artifacts should be renamed to include the suffix `hadoop3`. So the final artifacts would be as follows:
```plaintext
apache-druid-0.17.0-hadoop3-bin.tar.gz
apache-druid-0.17.0-hadoop3-bin.tar.gz.asc
apache-druid-0.17.0-hadoop3-bin.tar.gz.sha512
```
Ensure that the GPG key fingerprint used in the `mvn install` command matches your release signing key in https://dist.apache.org/repos/dist/release/druid/KEYS.
### Verify checksums
@ -361,8 +337,6 @@ $ diff <(shasum -a512 apache-druid-0.17.0-bin.tar.gz | cut -d ' ' -f1) <(cat apa
...
$ diff <(shasum -a512 apache-druid-0.17.0-src.tar.gz | cut -d ' ' -f1) <(cat apache-druid-0.17.0-src.tar.gz.sha512 ; echo)
...
$ diff <(shasum -a512 apache-druid-0.17.0-hadoop3-bin.tar.gz | cut -d ' ' -f1) <(cat apache-druid-0.17.0-hadoop3-bin.tar.gz.sha512 ; echo)
...
```
### Verify GPG signatures
@ -372,8 +346,6 @@ $ gpg --verify apache-druid-0.17.0-bin.tar.gz.asc apache-druid-0.17.0-bin.tar.gz
...
$ gpg --verify apache-druid-0.17.0-src.tar.gz.asc apache-druid-0.17.0-src.tar.gz
...
$ gpg --verify apache-druid-0.17.0-hadoop3-bin.tar.gz.asc apache-druid-0.17.0-hadoop3-bin.tar.gz
...
```
### Commit artifacts to SVN repo

View File

@ -293,6 +293,8 @@ def build_compatible_license_names():
compatible_licenses['The MIT License (MIT)'] = 'MIT License'
compatible_licenses['Bouncy Castle Licence'] = 'MIT License'
compatible_licenses['The Go license'] = 'The Go license'
compatible_licenses['-'] = '-'
return compatible_licenses

View File

@ -116,7 +116,7 @@
<profiles>
<profile>
<id>dist</id>
<id>dist-hadoop2</id>
<activation>
<activeByDefault>false</activeByDefault>
<property>
@ -190,6 +190,7 @@
<argument>
-Ddruid.extensions.hadoopDependenciesDir=${project.build.directory}/hadoop-dependencies
</argument>
<argument>-Dhadoop2.enabled=true</argument>
<argument>org.apache.druid.cli.Main</argument>
<argument>tools</argument>
<argument>pull-deps</argument>
@ -225,8 +226,6 @@
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-multi-stage-query</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-catalog</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-protobuf-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:mysql-metadata-storage</argument>
@ -302,7 +301,7 @@
</build>
</profile>
<profile>
<id>dist-hadoop3</id>
<id>dist</id>
<activation>
<activeByDefault>false</activeByDefault>
</activation>
@ -373,7 +372,6 @@
<argument>
-Ddruid.extensions.hadoopDependenciesDir=${project.build.directory}/hadoop-dependencies
</argument>
<argument>-Dhadoop3.enabled=true</argument>
<argument>org.apache.druid.cli.Main</argument>
<argument>tools</argument>
<argument>pull-deps</argument>
@ -442,6 +440,8 @@
<argument>org.apache.druid.extensions:druid-ranger-security</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-kubernetes-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-catalog</argument>
<argument>${druid.distribution.pulldeps.opts}</argument>
</arguments>
</configuration>

View File

@ -72,24 +72,6 @@ Putting these together, if you wish to build the source and binary distributions
mvn clean install -Papache-release,dist,rat -DskipTests
```
### Building hadoop 3 distribution
By default, druid ships hadoop 2.x.x jars along with the distribution. Exact version can be found in the
main [pom](https://github.com/apache/druid/blob/master/pom.xml). To build druid with hadoop 3.x.x jars, hadoop3 profile
needs to be activated.
To generate build with hadoop 3 dependencies, run:
```bash
mvn clean install -Phadoop3
```
To generate distribution with hadoop3 dependencies, run :
```bash
mvn clean install -Papache-release,dist-hadoop3,rat,hadoop3 -DskipTests
```
#### Potential issues
##### Missing `pyyaml`

View File

@ -136,7 +136,10 @@
<profile>
<id>hadoop2</id>
<activation>
<activeByDefault>true</activeByDefault>
<property>
<name>hadoop2.enabled</name>
<value>true</value>
</property>
</activation>
<dependencies>
<dependency>
@ -161,23 +164,20 @@
<profile>
<id>hadoop3</id>
<activation>
<property>
<name>hadoop3.enabled</name>
<value>true</value>
</property>
<activeByDefault>true</activeByDefault>
</activation>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client-api</artifactId>
<version>${hadoop.compile.version}</version>
<scope>provided</scope>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client-runtime</artifactId>
<version>${hadoop.compile.version}</version>
<scope>test</scope>
<scope>runtime</scope>
</dependency>
</dependencies>
</profile>

View File

@ -271,7 +271,10 @@
<profile>
<id>hadoop2</id>
<activation>
<activeByDefault>true</activeByDefault>
<property>
<name>hadoop2.enabled</name>
<value>true</value>
</property>
</activation>
<dependencies>
<dependency>
@ -313,17 +316,14 @@
<profile>
<id>hadoop3</id>
<activation>
<property>
<name>hadoop3.enabled</name>
<value>true</value>
</property>
<activeByDefault>true</activeByDefault>
</activation>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client-api</artifactId>
<version>${hadoop.compile.version}</version>
<scope>provided</scope>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>com.sun.jersey</groupId>

View File

@ -136,10 +136,19 @@
<groupId>com.google.code.findbugs</groupId>
<artifactId>jsr305</artifactId>
</exclusion>
<!-- multiple jetty-util exclusions because of different Hadoop profiles (2.x, 3.x) -->
<exclusion>
<groupId>org.mortbay.jetty</groupId>
<artifactId>jetty-util</artifactId>
</exclusion>
<exclusion>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-webapp</artifactId>
</exclusion>
<exclusion>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-xml</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-annotations</artifactId>
@ -193,7 +202,7 @@
<artifactId>json-smart</artifactId>
</exclusion>
<exclusion>
<groupId>com.sun.jersey</groupId>
<groupId>com.github.pjfanning</groupId>
<artifactId>jersey-json</artifactId>
</exclusion>
<exclusion>
@ -277,6 +286,26 @@
<artifactId>jetty-client</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-server</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-util</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-io</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-servlet</artifactId>
<scope>provided</scope>
</dependency>
<!-- we require not shaded version of hadoop deps in case of hadoop3 as we are extending
org.apache.hadoop.security.authentication.server.AuthenticationHandler -->
<dependency>

View File

@ -172,7 +172,10 @@
<profile>
<id>hadoop2</id>
<activation>
<activeByDefault>true</activeByDefault>
<property>
<name>hadoop2.enabled</name>
<value>true</value>
</property>
</activation>
<dependencies>
<dependency>
@ -401,10 +404,7 @@
<profile>
<id>hadoop3</id>
<activation>
<property>
<name>hadoop3.enabled</name>
<value>true</value>
</property>
<activeByDefault>true</activeByDefault>
</activation>
<dependencies>
<dependency>
@ -417,7 +417,7 @@
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client-runtime</artifactId>
<version>${hadoop.compile.version}</version>
<scope>test</scope>
<scope>runtime</scope>
</dependency>
</dependencies>
</profile>

View File

@ -45,6 +45,12 @@
<artifactId>hadoop-aws</artifactId>
<version>${hadoop.compile.version}</version>
<scope>runtime</scope>
<exclusions>
<exclusion>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-bundle</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>commons-io</groupId>
@ -137,7 +143,10 @@
<profile>
<id>hadoop2</id>
<activation>
<activeByDefault>true</activeByDefault>
<property>
<name>hadoop2.enabled</name>
<value>true</value>
</property>
</activation>
<dependencies>
<dependency>
@ -423,10 +432,7 @@
<profile>
<id>hadoop3</id>
<activation>
<property>
<name>hadoop3.enabled</name>
<value>true</value>
</property>
<activeByDefault>true</activeByDefault>
</activation>
<dependencies>
<dependency>
@ -447,6 +453,12 @@
<version>${hadoop.compile.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-s3</artifactId>
<version>${aws.sdk.version}</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>

View File

@ -235,7 +235,10 @@
<profile>
<id>hadoop2</id>
<activation>
<activeByDefault>true</activeByDefault>
<property>
<name>hadoop2.enabled</name>
<value>true</value>
</property>
</activation>
<dependencies>
<dependency>
@ -491,10 +494,7 @@
<profile>
<id>hadoop3</id>
<activation>
<property>
<name>hadoop3.enabled</name>
<value>true</value>
</property>
<activeByDefault>true</activeByDefault>
</activation>
<dependencies>
<dependency>
@ -512,7 +512,7 @@
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client-runtime</artifactId>
<version>${hadoop.compile.version}</version>
<scope>test</scope>
<scope>runtime</scope>
</dependency>
</dependencies>
</profile>

View File

@ -171,7 +171,10 @@
<profile>
<id>hadoop2</id>
<activation>
<activeByDefault>true</activeByDefault>
<property>
<name>hadoop2.enabled</name>
<value>true</value>
</property>
</activation>
<dependencies>
<!-- needed if using native batch with hdfs input source -->
@ -419,23 +422,20 @@
<profile>
<id>hadoop3</id>
<activation>
<property>
<name>hadoop3.enabled</name>
<value>true</value>
</property>
<activeByDefault>true</activeByDefault>
</activation>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client-api</artifactId>
<version>${hadoop.compile.version}</version>
<scope>provided</scope>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client-runtime</artifactId>
<version>${hadoop.compile.version}</version>
<scope>test</scope>
<scope>runtime</scope>
</dependency>
</dependencies>
</profile>

View File

@ -155,7 +155,10 @@
<profile>
<id>hadoop2</id>
<activation>
<activeByDefault>true</activeByDefault>
<property>
<name>hadoop2.enabled</name>
<value>true</value>
</property>
</activation>
<dependencies>
<dependency>
@ -210,10 +213,7 @@
<profile>
<id>hadoop3</id>
<activation>
<property>
<name>hadoop3.enabled</name>
<value>true</value>
</property>
<activeByDefault>true</activeByDefault>
</activation>
<dependencies>
<dependency>

View File

@ -274,7 +274,10 @@
<profile>
<id>hadoop2</id>
<activation>
<activeByDefault>true</activeByDefault>
<property>
<name>hadoop2.enabled</name>
<value>true</value>
</property>
</activation>
<dependencies>
<dependency>
@ -318,10 +321,7 @@
<profile>
<id>hadoop3</id>
<activation>
<property>
<name>hadoop3.enabled</name>
<value>true</value>
</property>
<activeByDefault>true</activeByDefault>
</activation>
<dependencies>
<dependency>

View File

@ -88,7 +88,7 @@ fi
# Assemble Java options
JAVA_OPTS="$DRUID_SERVICE_JAVA_OPTS $DRUID_COMMON_JAVA_OPTS -XX:HeapDumpPath=$LOG_DIR/$INSTANCE_NAME $DEBUG_OPTS"
LOG4J_CONFIG=$SHARED_DIR/conf/log4j2.xml
LOG4J_CONFIG=$SHARED_DIR/resources/log4j2.xml
if [ -f $LOG4J_CONFIG ]; then
JAVA_OPTS="$JAVA_OPTS -Dlog4j.configurationFile=$LOG4J_CONFIG"
fi

View File

@ -39,8 +39,8 @@
</repositories>
<properties>
<hadoop.integ.libs>"org.apache.hadoop:hadoop-client:${hadoop.compile.version}", "org.apache.hadoop:hadoop-azure:${hadoop.compile.version}"</hadoop.integ.libs>
<hadoop.s3.impl>org.apache.hadoop.fs.s3native.NativeS3FileSystem</hadoop.s3.impl>
<hadoop.integ.libs>"org.apache.hadoop:hadoop-client-api:${hadoop.compile.version}", "org.apache.hadoop:hadoop-client-runtime:${hadoop.compile.version}", "org.apache.hadoop:hadoop-azure:${hadoop.compile.version}"</hadoop.integ.libs>
<hadoop.s3.impl>org.apache.hadoop.fs.s3a.S3AFileSystem</hadoop.s3.impl>
</properties>
<dependencies>
@ -554,10 +554,10 @@
<profiles>
<profile>
<id>hadoop3</id>
<id>hadoop2</id>
<properties>
<hadoop.integ.libs>"org.apache.hadoop:hadoop-client-api:${hadoop.compile.version}", "org.apache.hadoop:hadoop-client-runtime:${hadoop.compile.version}", "org.apache.hadoop:hadoop-azure:${hadoop.compile.version}"</hadoop.integ.libs>
<hadoop.s3.impl>org.apache.hadoop.fs.s3a.S3AFileSystem</hadoop.s3.impl>
<hadoop.integ.libs>"org.apache.hadoop:hadoop-client:${hadoop.compile.version}", "org.apache.hadoop:hadoop-azure:${hadoop.compile.version}"</hadoop.integ.libs>
<hadoop.s3.impl>org.apache.hadoop.fs.s3native.NativeS3FileSystem</hadoop.s3.impl>
</properties>
</profile>
<profile>

View File

@ -644,7 +644,7 @@ name: Apache Commons Configuration
license_category: binary
module: java-core
license_name: Apache License version 2.0
version: 2.1.1
version: 2.8.0
libraries:
- org.apache.commons: commons-configuration2
@ -2913,22 +2913,10 @@ name: Apache Hadoop
license_category: binary
module: hadoop-client
license_name: Apache License version 2.0
version: 3.3.1
version: 3.3.5
libraries:
- org.apache.hadoop: hadoop-annotations
- org.apache.hadoop: hadoop-auth
- org.apache.hadoop: hadoop-client
- org.apache.hadoop: hadoop-common
- org.apache.hadoop: hadoop-hdfs-client
- org.apache.hadoop: hadoop-mapreduce-client-app
- org.apache.hadoop: hadoop-mapreduce-client-common
- org.apache.hadoop: hadoop-mapreduce-client-core
- org.apache.hadoop: hadoop-mapreduce-client-jobclient
- org.apache.hadoop: hadoop-mapreduce-client-shuffle
- org.apache.hadoop: hadoop-yarn-api
- org.apache.hadoop: hadoop-yarn-client
- org.apache.hadoop: hadoop-yarn-common
- org.apache.hadoop: hadoop-yarn-server-common
---
@ -3485,6 +3473,43 @@ notices:
---
name: reload4j
license_category: binary
module: hadoop-common
license_name: Apache License version 2.0
version: 1.2.22
libraries:
- ch.qos.reload4j: reload4j
notices:
- reload4j: |
Apache log4j
Copyright 2010 The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
---
name: slf4j-reload4j
license_category: binary
module: hadoop-common
license_name: MIT License
version: 1.7.36
libraries:
- org.slf4j: slf4j-reload4j
---
name: com.github.pjfanning jersey-json
license_category: binary
module: druid-kerberos
license_name: CDDL 1.1
version: 1.20
libraries:
- com.github.pjfanning: jersey-json
---
name: Kafka Schema Registry Client
version: 5.5.1
license_category: binary
@ -3786,6 +3811,16 @@ libraries:
---
name: Hadoop Client API
license_category: binary
module: extensions/druid-hdfs-storage
license_name: Apache License version 2.0
version: 3.3.5
libraries:
- org.apache.hadoop: hadoop-client-api
---
name: xmlenc
license_category: binary
module: extensions/druid-hdfs-storage
@ -5037,7 +5072,7 @@ name: Woodstox
license_category: binary
module: java-core
license_name: Apache License version 2.0
version: 5.3.0
version: 5.4.0
libraries:
- com.fasterxml.woodstox: woodstox-core
@ -5064,6 +5099,16 @@ libraries:
---
name: RE2/J
license_category: binary
module: java-core
license_name: The Go license
version: 1.1
license_file_path: licenses/bin/re2j.GO
libraries:
- com.google.re2j: re2j
---
name: jakarta.activation
license_category: binary
module: extensions/druid-avro-extensions

13
pom.xml
View File

@ -110,12 +110,12 @@
<resilience4j.version>1.3.1</resilience4j.version>
<slf4j.version>1.7.36</slf4j.version>
<!-- If compiling with different hadoop version also modify default hadoop coordinates in TaskConfig.java -->
<hadoop.compile.version>2.8.5</hadoop.compile.version>
<hadoop.compile.version>3.3.5</hadoop.compile.version>
<mockito.version>4.3.1</mockito.version>
<aws.sdk.version>1.12.317</aws.sdk.version>
<caffeine.version>2.8.0</caffeine.version>
<jacoco.version>0.8.7</jacoco.version>
<hibernate-validator.version>5.2.5.Final</hibernate-validator.version>
<hibernate-validator.version>5.3.6.Final</hibernate-validator.version>
<httpclient.version>4.5.13</httpclient.version>
<!-- When upgrading ZK, edit docs and integration tests as well (integration-tests/docker-base/setup.sh) -->
<zookeeper.version>3.5.10</zookeeper.version>
@ -2033,17 +2033,16 @@
</properties>
</profile>
<profile>
<id>hadoop3</id>
<id>hadoop2</id>
<activation>
<property>
<name>hadoop3.enabled</name>
<name>hadoop2.enabled</name>
<value>true</value>
</property>
</activation>
<properties>
<hadoop.compile.version>3.3.1</hadoop.compile.version>
<hibernate-validator.version>5.3.6.Final</hibernate-validator.version>
<httpclient.version>4.5.13</httpclient.version>
<hadoop.compile.version>2.8.5</hadoop.compile.version>
<hibernate-validator.version>5.2.5.Final</hibernate-validator.version>
</properties>
</profile>
</profiles>