From 774073b2e7f3ab47a0b1fd3c45fb03bb2281038c Mon Sep 17 00:00:00 2001 From: Tejaswini Bandlamudi <96047043+tejaswini-imply@users.noreply.github.com> Date: Wed, 26 Apr 2023 12:52:51 +0530 Subject: [PATCH] Update Hadoop3 as default build version (#14005) Hadoop 2 often causes red security scans on the Druid distribution because of the dependencies it brings. We want to move away from Hadoop 2 and make a Hadoop 3 distribution available. Switch Druid to building with Hadoop 3 by default. Druid will still be compatible with Hadoop 2, and users can build a Hadoop 2 compatible distribution using the hadoop2 profile. --- .github/workflows/static-checks.yml | 10 +-- check_test_suite.py | 2 +- distribution/asf-release-process-guide.md | 28 ------- distribution/bin/check-licenses.py | 2 + distribution/pom.xml | 10 +-- docs/development/build.md | 18 ----- extensions-contrib/thrift-extensions/pom.xml | 14 ++-- extensions-core/avro-extensions/pom.xml | 12 +-- extensions-core/druid-kerberos/pom.xml | 31 +++++++- extensions-core/druid-ranger-security/pom.xml | 12 +-- extensions-core/hdfs-storage/pom.xml | 22 ++++-- extensions-core/orc-extensions/pom.xml | 12 +-- extensions-core/parquet-extensions/pom.xml | 14 ++-- indexing-hadoop/pom.xml | 10 +-- indexing-service/pom.xml | 10 +-- integration-tests-ex/image/docker/launch.sh | 2 +- integration-tests/pom.xml | 10 +-- licenses.yaml | 75 +++++++++++++++---- pom.xml | 13 ++-- 19 files changed, 171 insertions(+), 136 deletions(-) diff --git a/.github/workflows/static-checks.yml b/.github/workflows/static-checks.yml index f72e65b60d2..d731098192b 100644 --- a/.github/workflows/static-checks.yml +++ b/.github/workflows/static-checks.yml @@ -85,21 +85,15 @@ jobs: if: ${{ matrix.java == 'jdk8' }} run: ./.github/scripts/license_checks_script.sh - - name: license checks for hadoop3 - if: ${{ matrix.java == 'jdk8' }} - env: - HADOOP_PROFILE: -Phadoop3 - run: ./.github/scripts/license_checks_script.sh - - name: analyze dependencies if: ${{ matrix.java == 
'jdk8' }} run: | ./.github/scripts/analyze_dependencies_script.sh - - name: analyze dependencies for hadoop3 + - name: analyze dependencies for hadoop2 if: ${{ matrix.java == 'jdk8' }} env: - HADOOP_PROFILE: -Phadoop3 + HADOOP_PROFILE: -Phadoop2 run: | ./.github/scripts/analyze_dependencies_script.sh diff --git a/check_test_suite.py b/check_test_suite.py index 5c7040e555f..d55c52b0b73 100755 --- a/check_test_suite.py +++ b/check_test_suite.py @@ -22,7 +22,7 @@ import sys # this script does some primitive examination of git diff to determine if a test suite needs to be run or not # these jobs should always be run, no matter what -always_run_jobs = ['license checks', 'license checks with Hadoop3', '(openjdk8) packaging check', '(openjdk11) packaging check'] +always_run_jobs = ['license checks', '(openjdk8) packaging check', '(openjdk11) packaging check'] # ignore changes to these files completely since they don't impact CI, if the changes are only to these files then all # of CI can be skipped. however, jobs which are always run will still be run even if only these files are changed diff --git a/distribution/asf-release-process-guide.md b/distribution/asf-release-process-guide.md index d513caef6bd..16f1670d1fa 100644 --- a/distribution/asf-release-process-guide.md +++ b/distribution/asf-release-process-guide.md @@ -328,30 +328,6 @@ apache-druid-0.17.0-src.tar.gz.asc apache-druid-0.17.0-src.tar.gz.sha512 ``` -#### Build artifacts for Hadoop-3 - -```bash -$ mvn clean install -Phadoop3,apache-release,dist,rat -DskipTests -Dgpg.keyname= -``` - -This should produce the following artifacts: - -```plaintext -apache-druid-0.17.0-bin.tar.gz -apache-druid-0.17.0-bin.tar.gz.asc -apache-druid-0.17.0-bin.tar.gz.sha512 -apache-druid-0.17.0-src.tar.gz -apache-druid-0.17.0-src.tar.gz.asc -apache-druid-0.17.0-src.tar.gz.sha512 -``` - -You can ignore the src artifacts as they are the same as the main profile. The binary artifacts should be renamed to include the suffix `hadoop3`. 
So the final artifacts would be as follows: -```plaintext -apache-druid-0.17.0-hadoop3-bin.tar.gz -apache-druid-0.17.0-hadoop3-bin.tar.gz.asc -apache-druid-0.17.0-hadoop3-bin.tar.gz.sha512 -``` - Ensure that the GPG key fingerprint used in the `mvn install` command matches your release signing key in https://dist.apache.org/repos/dist/release/druid/KEYS. ### Verify checksums @@ -361,8 +337,6 @@ $ diff <(shasum -a512 apache-druid-0.17.0-bin.tar.gz | cut -d ' ' -f1) <(cat apa ... $ diff <(shasum -a512 apache-druid-0.17.0-src.tar.gz | cut -d ' ' -f1) <(cat apache-druid-0.17.0-src.tar.gz.sha512 ; echo) ... -$ diff <(shasum -a512 apache-druid-0.17.0-hadoop3-bin.tar.gz | cut -d ' ' -f1) <(cat apache-druid-0.17.0-hadoop3-bin.tar.gz.sha512 ; echo) -... ``` ### Verify GPG signatures @@ -372,8 +346,6 @@ $ gpg --verify apache-druid-0.17.0-bin.tar.gz.asc apache-druid-0.17.0-bin.tar.gz ... $ gpg --verify apache-druid-0.17.0-src.tar.gz.asc apache-druid-0.17.0-src.tar.gz ... -$ gpg --verify apache-druid-0.17.0-hadoop3-bin.tar.gz.asc apache-druid-0.17.0-hadoop3-bin.tar.gz -... 
``` ### Commit artifacts to SVN repo diff --git a/distribution/bin/check-licenses.py b/distribution/bin/check-licenses.py index b5a2c2e933d..ff77eeace0b 100755 --- a/distribution/bin/check-licenses.py +++ b/distribution/bin/check-licenses.py @@ -293,6 +293,8 @@ def build_compatible_license_names(): compatible_licenses['The MIT License (MIT)'] = 'MIT License' compatible_licenses['Bouncy Castle Licence'] = 'MIT License' + compatible_licenses['The Go license'] = 'The Go license' + compatible_licenses['-'] = '-' return compatible_licenses diff --git a/distribution/pom.xml b/distribution/pom.xml index f041399c6a5..c2aa28f75d3 100644 --- a/distribution/pom.xml +++ b/distribution/pom.xml @@ -116,7 +116,7 @@ - dist + dist-hadoop2 false @@ -190,6 +190,7 @@ -Ddruid.extensions.hadoopDependenciesDir=${project.build.directory}/hadoop-dependencies + -Dhadoop2.enabled=true org.apache.druid.cli.Main tools pull-deps @@ -225,8 +226,6 @@ -c org.apache.druid.extensions:druid-multi-stage-query -c - org.apache.druid.extensions:druid-catalog - -c org.apache.druid.extensions:druid-protobuf-extensions -c org.apache.druid.extensions:mysql-metadata-storage @@ -302,7 +301,7 @@ - dist-hadoop3 + dist false @@ -373,7 +372,6 @@ -Ddruid.extensions.hadoopDependenciesDir=${project.build.directory}/hadoop-dependencies - -Dhadoop3.enabled=true org.apache.druid.cli.Main tools pull-deps @@ -442,6 +440,8 @@ org.apache.druid.extensions:druid-ranger-security -c org.apache.druid.extensions:druid-kubernetes-extensions + -c + org.apache.druid.extensions:druid-catalog ${druid.distribution.pulldeps.opts} diff --git a/docs/development/build.md b/docs/development/build.md index 15f0689631a..911f25aabbf 100644 --- a/docs/development/build.md +++ b/docs/development/build.md @@ -72,24 +72,6 @@ Putting these together, if you wish to build the source and binary distributions mvn clean install -Papache-release,dist,rat -DskipTests ``` -### Building hadoop 3 distribution - -By default, druid ships hadoop 2.x.x jars 
along with the distribution. Exact version can be found in the -main [pom](https://github.com/apache/druid/blob/master/pom.xml). To build druid with hadoop 3.x.x jars, hadoop3 profile -needs to be activated. - -To generate build with hadoop 3 dependencies, run: - -```bash -mvn clean install -Phadoop3 -``` - -To generate distribution with hadoop3 dependencies, run : - -```bash -mvn clean install -Papache-release,dist-hadoop3,rat,hadoop3 -DskipTests -``` - #### Potential issues ##### Missing `pyyaml` diff --git a/extensions-contrib/thrift-extensions/pom.xml b/extensions-contrib/thrift-extensions/pom.xml index f9ae98cf2cf..ce791089077 100644 --- a/extensions-contrib/thrift-extensions/pom.xml +++ b/extensions-contrib/thrift-extensions/pom.xml @@ -136,7 +136,10 @@ hadoop2 - true + + hadoop2.enabled + true + @@ -161,23 +164,20 @@ hadoop3 - - hadoop3.enabled - true - + true org.apache.hadoop hadoop-client-api ${hadoop.compile.version} - provided + compile org.apache.hadoop hadoop-client-runtime ${hadoop.compile.version} - test + runtime diff --git a/extensions-core/avro-extensions/pom.xml b/extensions-core/avro-extensions/pom.xml index f3fc2293f72..ebaccea3410 100644 --- a/extensions-core/avro-extensions/pom.xml +++ b/extensions-core/avro-extensions/pom.xml @@ -271,7 +271,10 @@ hadoop2 - true + + hadoop2.enabled + true + @@ -313,17 +316,14 @@ hadoop3 - - hadoop3.enabled - true - + true org.apache.hadoop hadoop-client-api ${hadoop.compile.version} - provided + compile com.sun.jersey diff --git a/extensions-core/druid-kerberos/pom.xml b/extensions-core/druid-kerberos/pom.xml index 77fc92cb98c..dfffd516d12 100644 --- a/extensions-core/druid-kerberos/pom.xml +++ b/extensions-core/druid-kerberos/pom.xml @@ -136,10 +136,19 @@ com.google.code.findbugs jsr305 + org.mortbay.jetty jetty-util + + org.eclipse.jetty + jetty-webapp + + + org.eclipse.jetty + jetty-xml + org.apache.hadoop hadoop-annotations @@ -193,7 +202,7 @@ json-smart - com.sun.jersey + com.github.pjfanning 
jersey-json @@ -277,6 +286,26 @@ jetty-client provided + + org.eclipse.jetty + jetty-server + provided + + + org.eclipse.jetty + jetty-util + provided + + + org.eclipse.jetty + jetty-io + provided + + + org.eclipse.jetty + jetty-servlet + provided + diff --git a/extensions-core/druid-ranger-security/pom.xml b/extensions-core/druid-ranger-security/pom.xml index a7062bfed18..39300380b42 100644 --- a/extensions-core/druid-ranger-security/pom.xml +++ b/extensions-core/druid-ranger-security/pom.xml @@ -172,7 +172,10 @@ hadoop2 - true + + hadoop2.enabled + true + @@ -401,10 +404,7 @@ hadoop3 - - hadoop3.enabled - true - + true @@ -417,7 +417,7 @@ org.apache.hadoop hadoop-client-runtime ${hadoop.compile.version} - test + runtime diff --git a/extensions-core/hdfs-storage/pom.xml b/extensions-core/hdfs-storage/pom.xml index 27241c6df49..89494185ead 100644 --- a/extensions-core/hdfs-storage/pom.xml +++ b/extensions-core/hdfs-storage/pom.xml @@ -45,6 +45,12 @@ hadoop-aws ${hadoop.compile.version} runtime + + + com.amazonaws + aws-java-sdk-bundle + + commons-io @@ -137,7 +143,10 @@ hadoop2 - true + + hadoop2.enabled + true + @@ -423,10 +432,7 @@ hadoop3 - - hadoop3.enabled - true - + true @@ -447,6 +453,12 @@ ${hadoop.compile.version} test + + com.amazonaws + aws-java-sdk-s3 + ${aws.sdk.version} + runtime + log4j log4j diff --git a/extensions-core/orc-extensions/pom.xml b/extensions-core/orc-extensions/pom.xml index f16d6913666..245fdce5ab8 100644 --- a/extensions-core/orc-extensions/pom.xml +++ b/extensions-core/orc-extensions/pom.xml @@ -235,7 +235,10 @@ hadoop2 - true + + hadoop2.enabled + true + @@ -491,10 +494,7 @@ hadoop3 - - hadoop3.enabled - true - + true @@ -512,7 +512,7 @@ org.apache.hadoop hadoop-client-runtime ${hadoop.compile.version} - test + runtime diff --git a/extensions-core/parquet-extensions/pom.xml b/extensions-core/parquet-extensions/pom.xml index dbb8b5d3c77..02a7d451ff5 100644 --- a/extensions-core/parquet-extensions/pom.xml +++ 
b/extensions-core/parquet-extensions/pom.xml @@ -171,7 +171,10 @@ hadoop2 - true + + hadoop2.enabled + true + @@ -419,23 +422,20 @@ hadoop3 - - hadoop3.enabled - true - + true org.apache.hadoop hadoop-client-api ${hadoop.compile.version} - provided + compile org.apache.hadoop hadoop-client-runtime ${hadoop.compile.version} - test + runtime diff --git a/indexing-hadoop/pom.xml b/indexing-hadoop/pom.xml index b97d1bc3775..edf7af761cf 100644 --- a/indexing-hadoop/pom.xml +++ b/indexing-hadoop/pom.xml @@ -155,7 +155,10 @@ hadoop2 - true + + hadoop2.enabled + true + @@ -210,10 +213,7 @@ hadoop3 - - hadoop3.enabled - true - + true diff --git a/indexing-service/pom.xml b/indexing-service/pom.xml index 9a9bbc4da4a..82603e83a20 100644 --- a/indexing-service/pom.xml +++ b/indexing-service/pom.xml @@ -274,7 +274,10 @@ hadoop2 - true + + hadoop2.enabled + true + @@ -318,10 +321,7 @@ hadoop3 - - hadoop3.enabled - true - + true diff --git a/integration-tests-ex/image/docker/launch.sh b/integration-tests-ex/image/docker/launch.sh index 8116ab96323..13fcc98a4de 100644 --- a/integration-tests-ex/image/docker/launch.sh +++ b/integration-tests-ex/image/docker/launch.sh @@ -88,7 +88,7 @@ fi # Assemble Java options JAVA_OPTS="$DRUID_SERVICE_JAVA_OPTS $DRUID_COMMON_JAVA_OPTS -XX:HeapDumpPath=$LOG_DIR/$INSTANCE_NAME $DEBUG_OPTS" -LOG4J_CONFIG=$SHARED_DIR/conf/log4j2.xml +LOG4J_CONFIG=$SHARED_DIR/resources/log4j2.xml if [ -f $LOG4J_CONFIG ]; then JAVA_OPTS="$JAVA_OPTS -Dlog4j.configurationFile=$LOG4J_CONFIG" fi diff --git a/integration-tests/pom.xml b/integration-tests/pom.xml index 02faa0cb2cb..18fd9958775 100644 --- a/integration-tests/pom.xml +++ b/integration-tests/pom.xml @@ -39,8 +39,8 @@ - "org.apache.hadoop:hadoop-client:${hadoop.compile.version}", "org.apache.hadoop:hadoop-azure:${hadoop.compile.version}" - org.apache.hadoop.fs.s3native.NativeS3FileSystem + "org.apache.hadoop:hadoop-client-api:${hadoop.compile.version}", 
"org.apache.hadoop:hadoop-client-runtime:${hadoop.compile.version}", "org.apache.hadoop:hadoop-azure:${hadoop.compile.version}" + org.apache.hadoop.fs.s3a.S3AFileSystem @@ -554,10 +554,10 @@ - hadoop3 + hadoop2 - "org.apache.hadoop:hadoop-client-api:${hadoop.compile.version}", "org.apache.hadoop:hadoop-client-runtime:${hadoop.compile.version}", "org.apache.hadoop:hadoop-azure:${hadoop.compile.version}" - org.apache.hadoop.fs.s3a.S3AFileSystem + "org.apache.hadoop:hadoop-client:${hadoop.compile.version}", "org.apache.hadoop:hadoop-azure:${hadoop.compile.version}" + org.apache.hadoop.fs.s3native.NativeS3FileSystem diff --git a/licenses.yaml b/licenses.yaml index 416e3e0bffd..ba0e3ca2171 100644 --- a/licenses.yaml +++ b/licenses.yaml @@ -644,7 +644,7 @@ name: Apache Commons Configuration license_category: binary module: java-core license_name: Apache License version 2.0 -version: 2.1.1 +version: 2.8.0 libraries: - org.apache.commons: commons-configuration2 @@ -2913,22 +2913,10 @@ name: Apache Hadoop license_category: binary module: hadoop-client license_name: Apache License version 2.0 -version: 3.3.1 +version: 3.3.5 libraries: - - org.apache.hadoop: hadoop-annotations - org.apache.hadoop: hadoop-auth - - org.apache.hadoop: hadoop-client - org.apache.hadoop: hadoop-common - - org.apache.hadoop: hadoop-hdfs-client - - org.apache.hadoop: hadoop-mapreduce-client-app - - org.apache.hadoop: hadoop-mapreduce-client-common - - org.apache.hadoop: hadoop-mapreduce-client-core - - org.apache.hadoop: hadoop-mapreduce-client-jobclient - - org.apache.hadoop: hadoop-mapreduce-client-shuffle - - org.apache.hadoop: hadoop-yarn-api - - org.apache.hadoop: hadoop-yarn-client - - org.apache.hadoop: hadoop-yarn-common - - org.apache.hadoop: hadoop-yarn-server-common --- @@ -3485,6 +3473,43 @@ notices: --- +name: reload4j +license_category: binary +module: hadoop-common +license_name: Apache License version 2.0 +version: 1.2.22 +libraries: + - ch.qos.reload4j: reload4j +notices: + - 
reload4j: | + Apache log4j + Copyright 2010 The Apache Software Foundation + + This product includes software developed at + The Apache Software Foundation (http://www.apache.org/). + +--- + +name: slf4j-reload4j +license_category: binary +module: hadoop-common +license_name: MIT License +version: 1.7.36 +libraries: + - org.slf4j: slf4j-reload4j + +--- + +name: com.github.pjfanning jersey-json +license_category: binary +module: druid-kerberos +license_name: CDDL 1.1 +version: 1.20 +libraries: + - com.github.pjfanning: jersey-json + +--- + name: Kafka Schema Registry Client version: 5.5.1 license_category: binary @@ -3786,6 +3811,16 @@ libraries: --- +name: Hadoop Client API +license_category: binary +module: extensions/druid-hdfs-storage +license_name: Apache License version 2.0 +version: 3.3.5 +libraries: + - org.apache.hadoop: hadoop-client-api + +--- + name: xmlenc license_category: binary module: extensions/druid-hdfs-storage @@ -5037,7 +5072,7 @@ name: Woodstox license_category: binary module: java-core license_name: Apache License version 2.0 -version: 5.3.0 +version: 5.4.0 libraries: - com.fasterxml.woodstox: woodstox-core @@ -5064,6 +5099,16 @@ libraries: --- +name: RE2/J +license_category: binary +module: java-core +license_name: The Go license +version: 1.1 +license_file_path: licenses/bin/re2j.GO +libraries: + - com.google.re2j: re2j + +--- name: jakarta.activation license_category: binary module: extensions/druid-avro-extensions diff --git a/pom.xml b/pom.xml index 574743b12d8..0baa3b90b3a 100644 --- a/pom.xml +++ b/pom.xml @@ -110,12 +110,12 @@ 1.3.1 1.7.36 - 2.8.5 + 3.3.5 4.3.1 1.12.317 2.8.0 0.8.7 - 5.2.5.Final + 5.3.6.Final 4.5.13 3.5.10 @@ -2033,17 +2033,16 @@ - hadoop3 + hadoop2 - hadoop3.enabled + hadoop2.enabled true - 3.3.1 - 5.3.6.Final - 4.5.13 + 2.8.5 + 5.2.5.Final