From ddfc31d7ed0dbd9c0eb3c5c84956d5fb27880427 Mon Sep 17 00:00:00 2001
From: Abhishek Agarwal <1477457+abhishekagarwal87@users.noreply.github.com>
Date: Mon, 26 Feb 2024 21:18:55 +0530
Subject: [PATCH] Reduce the size of distribution docker image (#15968)

This PR creates symlinks when there are duplicate jars present in the extension. Docker image includes contrib extensions, too, and the size of the image has bloated up quite a lot of late.

This change also fixes "ITNestedQueryPushDownTest integration test"
---
 .github/workflows/standard-its.yml            |  3 +-
 distribution/docker/Dockerfile                | 12 +++--
 distribution/docker/deduplicate_jars.sh       | 80 +++++++++++++++++++
 .../script/setup_druid_on_k8s.sh              |  2 +-
 4 files changed, 91 insertions(+), 6 deletions(-)
 create mode 100755 distribution/docker/deduplicate_jars.sh

diff --git a/.github/workflows/standard-its.yml b/.github/workflows/standard-its.yml
index a0d4f856e62..8709b09e596 100644
--- a/.github/workflows/standard-its.yml
+++ b/.github/workflows/standard-its.yml
@@ -183,7 +183,8 @@ jobs:
         run: |
           for v in broker middlemanager router coordinator historical ; do
           echo "------------------------druid-tiny-cluster-"$v"s-0-------------------------";
-          sudo /usr/local/bin/kubectl logs --tail 1000 druid-tiny-cluster-"$v"s-0 ||:;
+          /usr/local/bin/kubectl logs --tail 1000 druid-tiny-cluster-"$v"s-0 ||:;
+          /usr/local/bin/kubectl get events | grep druid-tiny-cluster-"$v"s-0 ||:;
           done

   integration-other-tests:
diff --git a/distribution/docker/Dockerfile b/distribution/docker/Dockerfile
index bebd0a1c6e0..230f09acaac 100644
--- a/distribution/docker/Dockerfile
+++ b/distribution/docker/Dockerfile
@@ -40,7 +40,7 @@ RUN --mount=type=cache,target=/root/.m2 if [ "$BUILD_FROM_SOURCE" = "true" ]; th
       install \
       -Pdist,bundle-contrib-exts \
       -Pskip-static-checks,skip-tests \
-      -Dmaven.javadoc.skip=true \
+      -Dmaven.javadoc.skip=true -T1C \
       ; fi

 RUN --mount=type=cache,target=/root/.m2 VERSION=$(mvn -B -q org.apache.maven.plugins:maven-help-plugin:3.2.0:evaluate \
@@ -82,15 +82,19 @@ RUN addgroup -S -g 1000 druid \
 COPY --from=bash-static /bin/bash /bin/bash
 RUN chmod 755 /bin/bash

-COPY --chown=druid:druid --from=builder /opt /opt
 COPY distribution/docker/druid.sh /druid.sh
 COPY distribution/docker/peon.sh /peon.sh
+COPY distribution/docker/deduplicate_jars.sh /deduplicate_jars.sh

 # create necessary directories which could be mounted as volume
+# copy and de-duplicate jars from builder in same layer to reduce image size
 #   /opt/druid/var is used to keep individual files(e.g. log) of each Druid service
 #   /opt/shared is used to keep segments and task logs shared among Druid services
-RUN mkdir /opt/druid/var /opt/shared \
- && chown druid:druid /opt/druid/var /opt/shared \
+RUN --mount=type=bind,from=builder,source=/opt,target=/builder/opt \
+ mkdir -p /opt/druid/var /opt/shared \
+ && cp -r /builder/opt/druid /opt/ \
+ && /deduplicate_jars.sh /opt/druid \
+ && chown -R druid:druid /opt/druid /opt/shared \
  && chmod 775 /opt/druid/var /opt/shared

 USER druid
diff --git a/distribution/docker/deduplicate_jars.sh b/distribution/docker/deduplicate_jars.sh
new file mode 100755
index 00000000000..336f955a54b
--- /dev/null
+++ b/distribution/docker/deduplicate_jars.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# Replaces duplicate jars under a directory with symlinks to one canonical copy,
+# which reduces the size of the distribution docker image.
+#
+# Usage: deduplicate_jars.sh <directory>
+#
+# Jars are treated as duplicates when they share the same file name; contents
+# are NOT compared, so this relies on versioned jar names identifying identical
+# artifacts. The first occurrence is kept as a regular file. Jars under lib/ are
+# visited first, then extensions/, then hadoop-dependencies/, so core libs are
+# retained as the original jars.
+
+set -euo pipefail
+
+# Check if an argument is provided
+if [ "$#" -ne 1 ]; then
+  echo "Usage: $0 <directory>" >&2
+  exit 1
+fi
+
+if [ ! -d "$1" ]; then
+  echo "Not a directory: $1" >&2
+  exit 1
+fi
+
+# Directory containing JAR files. Resolved to an absolute path because symlink
+# targets are interpreted relative to the directory holding the link, so
+# relative targets would dangle.
+JAR_DIR="$(CDPATH= cd -- "$1" && pwd)"
+
+# Declare an associative array to hold the canonical filenames. Works on bash version >= 4
+declare -A canonical
+
+# Print the sorted paths of the regular jar files under JAR_DIR matching the
+# path pattern $1. Symlinks are skipped so already de-duplicated jars are left alone.
+list_jars() {
+  find "$JAR_DIR" -type f -wholename "$1" | sort
+}
+
+# Walk all JAR files; sorting ensures a deterministic choice of canonical file
+while IFS= read -r jar; do
+  # The file name is the key that identifies duplicates
+  key=$(basename "$jar")
+  original="${canonical[$key]:-}"
+
+  if [ -z "$original" ]; then
+    # First occurrence: mark this file as the canonical one for this basename
+    canonical[$key]="$jar"
+  elif [ "$original" != "$jar" ]; then
+    # A path can match more than one pattern (e.g. .../extensions/x/lib/a.jar) and
+    # be listed twice; never link a file to itself, that would destroy the jar.
+    ln -sf "$original" "$jar"
+    echo "Replaced duplicate $jar with symlink to $original"
+  fi
+# Read in an order that retains core libs as original jars
+done < <(
+  list_jars '*/lib/*.jar'
+  list_jars '*/extensions/*.jar'
+  list_jars '*/hadoop-dependencies/*.jar'
+)
diff --git a/integration-tests/script/setup_druid_on_k8s.sh b/integration-tests/script/setup_druid_on_k8s.sh
index 665850a62d2..960eedfe5fc 100755
--- a/integration-tests/script/setup_druid_on_k8s.sh
+++ b/integration-tests/script/setup_druid_on_k8s.sh
@@ -30,7 +30,7 @@ mvn -B -ff -q dependency:go-offline \
       install \
       -Pdist,bundle-contrib-exts \
       -Pskip-static-checks,skip-tests \
-      -Dmaven.javadoc.skip=true
+      -Dmaven.javadoc.skip=true -T1C

 DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FROM_SOURCE=0 -t druid/base:v1 -f distribution/docker/Dockerfile .
 DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE=druid/base:v1 -t druid/cluster:v1 -f distribution/docker/DockerfileBuildTarAdvanced .