From b5dcf64f979f0225a394b5c928f92cb5f539bc6d Mon Sep 17 00:00:00 2001 From: Zhankun Tang Date: Thu, 25 Apr 2019 17:56:31 +0800 Subject: [PATCH] SUBMARINE-44. Upgrade Docker image's tensorflow version to 1.13.1. Contributed by Zac Zhou. --- .../ubuntu-16.04/Dockerfile.cpu.tf_1.13.1 | 71 ++++++ .../base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 | 69 ------ .../ubuntu-16.04/Dockerfile.gpu.tf_1.13.1 | 85 +++++++ .../base/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0 | 67 ------ .../src/main/docker/build-all.sh | 8 +- ....cpu.tf_1.8.0 => Dockerfile.cpu.tf_1.13.1} | 4 +- ....gpu.tf_1.8.0 => Dockerfile.gpu.tf_1.13.1} | 4 +- .../README.md | 0 .../cifar10.py | 2 +- .../cifar10_main.py | 4 +- .../cifar10_model.py | 0 .../cifar10_utils.py | 5 +- .../generate_cifar10_tfrecords.py | 10 +- .../model_base.py | 0 .../src/site/markdown/InstallationGuide.md | 214 ++++++++---------- .../InstallationGuideChineseVersion.md | 197 ++++++---------- .../src/site/markdown/QuickStart.md | 39 +++- .../RunningDistributedCifar10TFJobs.md | 12 +- .../site/markdown/TestAndTroubleshooting.md | 27 +-- .../src/site/markdown/WriteDockerfile.md | 34 +-- 20 files changed, 395 insertions(+), 457 deletions(-) create mode 100644 hadoop-submarine/hadoop-submarine-core/src/main/docker/base/ubuntu-16.04/Dockerfile.cpu.tf_1.13.1 delete mode 100644 hadoop-submarine/hadoop-submarine-core/src/main/docker/base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 create mode 100644 hadoop-submarine/hadoop-submarine-core/src/main/docker/base/ubuntu-16.04/Dockerfile.gpu.tf_1.13.1 delete mode 100644 hadoop-submarine/hadoop-submarine-core/src/main/docker/base/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0 rename hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/{Dockerfile.cpu.tf_1.8.0 => Dockerfile.cpu.tf_1.13.1} (90%) rename hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/{Dockerfile.gpu.tf_1.8.0 => Dockerfile.gpu.tf_1.13.1} (90%) rename 
hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/{cifar10_estimator_tf_1.8.0 => cifar10_estimator_tf_1.13.1}/README.md (100%) rename hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/{cifar10_estimator_tf_1.8.0 => cifar10_estimator_tf_1.13.1}/cifar10.py (98%) rename hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/{cifar10_estimator_tf_1.8.0 => cifar10_estimator_tf_1.13.1}/cifar10_main.py (99%) rename hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/{cifar10_estimator_tf_1.8.0 => cifar10_estimator_tf_1.13.1}/cifar10_model.py (100%) rename hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/{cifar10_estimator_tf_1.8.0 => cifar10_estimator_tf_1.13.1}/cifar10_utils.py (99%) rename hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/{cifar10_estimator_tf_1.8.0 => cifar10_estimator_tf_1.13.1}/generate_cifar10_tfrecords.py (94%) rename hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/{cifar10_estimator_tf_1.8.0 => cifar10_estimator_tf_1.13.1}/model_base.py (100%) diff --git a/hadoop-submarine/hadoop-submarine-core/src/main/docker/base/ubuntu-16.04/Dockerfile.cpu.tf_1.13.1 b/hadoop-submarine/hadoop-submarine-core/src/main/docker/base/ubuntu-16.04/Dockerfile.cpu.tf_1.13.1 new file mode 100644 index 00000000000..b32cb4161bd --- /dev/null +++ b/hadoop-submarine/hadoop-submarine-core/src/main/docker/base/ubuntu-16.04/Dockerfile.cpu.tf_1.13.1 @@ -0,0 +1,71 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM ubuntu:16.04 + +# Pick up some TF dependencies +RUN apt-get update && apt-get install -y --allow-downgrades --no-install-recommends \ + --allow-change-held-packages --allow-unauthenticated \ + build-essential libfreetype6-dev libpng12-dev \ + libzmq3-dev pkg-config python python-dev \ + rsync software-properties-common curl unzip wget grep sed vim iputils-ping net-tools gdb python2.7-dbg tzdata && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +RUN export DEBIAN_FRONTEND=noninteractive && apt-get update && apt-get install -yq --no-install-recommends \ + krb5-user libpam-krb5 && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +RUN wget https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py && \ + rm get-pip.py + +RUN echo "Install python related packages" && \ + apt-get update && \ + apt-get install -y --no-install-recommends gfortran \ + # numerical/algebra packages + libblas-dev libatlas-dev liblapack-dev \ + # font, image for matplotlib + libpng-dev libxft-dev \ + # for tkinter + python-tk libxml2-dev libxslt-dev zlib1g-dev && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +RUN pip --no-cache-dir install Pillow h5py ipykernel jupyter matplotlib numpy pandas scipy sklearn && \ + python -m ipykernel.kernelspec + +# Install TensorFlow CPU version. 
+ENV TENSORFLOW_VERSION="1.13.1" +RUN pip --no-cache-dir install \ + http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-${TENSORFLOW_VERSION}-cp27-none-linux_x86_64.whl +RUN apt-get update && apt-get install -y --no-install-recommends git && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +# Install hadoop +ENV HADOOP_VERSION="3.1.2" +RUN wget http://mirrors.shu.edu.cn/apache/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz +RUN tar zxf hadoop-${HADOOP_VERSION}.tar.gz +RUN ln -s hadoop-${HADOOP_VERSION} hadoop-current +RUN rm hadoop-${HADOOP_VERSION}.tar.gz + +ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 +RUN echo "$LOG_TAG Install java8" && \ + apt-get update && \ + apt-get install -y --no-install-recommends openjdk-8-jdk && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +# Set the locale to fix bash warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8) +RUN apt-get update && apt-get install -y --no-install-recommends locales && \ + apt-get clean && rm -rf /var/lib/apt/lists/* +RUN locale-gen en_US.UTF-8 diff --git a/hadoop-submarine/hadoop-submarine-core/src/main/docker/base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 b/hadoop-submarine/hadoop-submarine-core/src/main/docker/base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 deleted file mode 100644 index f2446a745c5..00000000000 --- a/hadoop-submarine/hadoop-submarine-core/src/main/docker/base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 +++ /dev/null @@ -1,69 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -FROM ubuntu:16.04 - -LABEL maintainer="Craig Citro " - -# Pick up some TF dependencies -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - curl \ - libfreetype6-dev \ - libpng12-dev \ - libzmq3-dev \ - pkg-config \ - python \ - python-dev \ - rsync \ - software-properties-common \ - unzip \ - && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ - python get-pip.py && \ - rm get-pip.py - -RUN pip --no-cache-dir install \ - Pillow \ - h5py \ - ipykernel \ - jupyter \ - matplotlib \ - numpy \ - pandas \ - scipy \ - sklearn \ - && \ - python -m ipykernel.kernelspec - -# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- # -# These lines will be edited automatically by parameterized_docker_build.sh. 
# -# COPY _PIP_FILE_ / -# RUN pip --no-cache-dir install /_PIP_FILE_ -# RUN rm -f /_PIP_FILE_ - -# Install TensorFlow CPU version from central repo -RUN pip --no-cache-dir install \ - http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl - -RUN apt-get update && apt-get install git -y - -RUN apt-get update && apt-get install -y openjdk-8-jdk wget -RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.1/hadoop-3.1.1.tar.gz -RUN tar zxf hadoop-3.1.1.tar.gz \ No newline at end of file diff --git a/hadoop-submarine/hadoop-submarine-core/src/main/docker/base/ubuntu-16.04/Dockerfile.gpu.tf_1.13.1 b/hadoop-submarine/hadoop-submarine-core/src/main/docker/base/ubuntu-16.04/Dockerfile.gpu.tf_1.13.1 new file mode 100644 index 00000000000..85f5ea1e55d --- /dev/null +++ b/hadoop-submarine/hadoop-submarine-core/src/main/docker/base/ubuntu-16.04/Dockerfile.gpu.tf_1.13.1 @@ -0,0 +1,85 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04 + +# Pick up some TF dependencies +RUN apt-get update && apt-get install -y --allow-downgrades --no-install-recommends \ + --allow-change-held-packages --allow-unauthenticated \ + build-essential libfreetype6-dev libpng12-dev \ + libzmq3-dev pkg-config python python-dev \ + rsync software-properties-common curl unzip wget grep sed vim \ + iputils-ping net-tools gdb python2.7-dbg tzdata \ + cuda-command-line-tools-10-0 cuda-cublas-10-0 \ + cuda-cufft-10-0 cuda-curand-10-0 cuda-cusolver-10-0 \ + cuda-cusparse-10-0 libcudnn7=7.4.1.5-1+cuda10.0 && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +# Install TensorRT +RUN apt-get update && \ + apt-get install -y --allow-unauthenticated --no-install-recommends \ + nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + libnvinfer5=5.0.2-1+cuda10.0 && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + + +RUN export DEBIAN_FRONTEND=noninteractive && apt-get update && \ + apt-get install -yq --no-install-recommends krb5-user libpam-krb5 \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +RUN wget https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py && \ + rm get-pip.py + +RUN echo "Install python related packages" && \ + apt-get -y update && \ + apt-get install -y --no-install-recommends gfortran \ + # numerical/algebra packages + libblas-dev libatlas-dev liblapack-dev \ + # font, image for matplotlib + libpng-dev libxft-dev \ + # for tkinter + python-tk libxml2-dev libxslt-dev zlib1g-dev && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +RUN pip --no-cache-dir install Pillow h5py ipykernel jupyter matplotlib numpy pandas scipy sklearn && \ + python -m ipykernel.kernelspec + +# Install TensorFlow GPU version. 
+ENV TENSORFLOW_VERSION="1.13.1" +RUN pip --no-cache-dir install \ + http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-${TENSORFLOW_VERSION}-cp27-none-linux_x86_64.whl +RUN apt-get update && apt-get install -y --no-install-recommends git && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +# Install hadoop +ENV HADOOP_VERSION="3.1.2" +RUN wget http://mirrors.shu.edu.cn/apache/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz +RUN tar zxf hadoop-${HADOOP_VERSION}.tar.gz +RUN ln -s hadoop-${HADOOP_VERSION} hadoop-current +RUN rm hadoop-${HADOOP_VERSION}.tar.gz + +ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 +RUN echo "$LOG_TAG Install java8" && \ + apt-get -y update && \ + apt-get install -y --no-install-recommends openjdk-8-jdk && \ + rm -rf /var/lib/apt/lists/* + +# Set the locale to fix bash warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8) +RUN apt-get update && apt-get install -y --no-install-recommends locales && \ + apt-get clean && rm -rf /var/lib/apt/lists/* +RUN locale-gen en_US.UTF-8 diff --git a/hadoop-submarine/hadoop-submarine-core/src/main/docker/base/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0 b/hadoop-submarine/hadoop-submarine-core/src/main/docker/base/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0 deleted file mode 100644 index dee6e195717..00000000000 --- a/hadoop-submarine/hadoop-submarine-core/src/main/docker/base/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0 +++ /dev/null @@ -1,67 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 - -# Pick up some TF dependencies -RUN apt-get update && apt-get install -y --allow-downgrades --no-install-recommends \ - build-essential \ - cuda-command-line-tools-9-0 \ - cuda-cublas-9-0 \ - cuda-cufft-9-0 \ - cuda-curand-9-0 \ - cuda-cusolver-9-0 \ - cuda-cusparse-9-0 \ - curl \ - libcudnn7=7.0.5.15-1+cuda9.0 \ - libfreetype6-dev \ - libpng12-dev \ - libzmq3-dev \ - pkg-config \ - python \ - python-dev \ - rsync \ - software-properties-common \ - unzip \ - && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ - python get-pip.py && \ - rm get-pip.py - -RUN pip --no-cache-dir install \ - Pillow \ - h5py \ - ipykernel \ - jupyter \ - matplotlib \ - numpy \ - pandas \ - scipy \ - sklearn \ - && \ - python -m ipykernel.kernelspec - -# Install TensorFlow GPU version. 
-RUN pip --no-cache-dir install \ - http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp27-none-linux_x86_64.whl -RUN apt-get update && apt-get install git -y - -RUN apt-get update && apt-get install -y openjdk-8-jdk wget -RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz -RUN tar zxf hadoop-3.1.0.tar.gz \ No newline at end of file diff --git a/hadoop-submarine/hadoop-submarine-core/src/main/docker/build-all.sh b/hadoop-submarine/hadoop-submarine-core/src/main/docker/build-all.sh index ad3a935438e..1e9848fc4c1 100755 --- a/hadoop-submarine/hadoop-submarine-core/src/main/docker/build-all.sh +++ b/hadoop-submarine/hadoop-submarine-core/src/main/docker/build-all.sh @@ -21,12 +21,12 @@ set -e cd base/ubuntu-16.04 -docker build . -f Dockerfile.cpu.tf_1.8.0 -t tf-1.8.0-cpu-base:0.0.1 -docker build . -f Dockerfile.gpu.tf_1.8.0 -t tf-1.8.0-gpu-base:0.0.1 +docker build . -f Dockerfile.cpu.tf_1.13.1 -t tf-1.13.1-cpu-base:0.0.1 +docker build . -f Dockerfile.gpu.tf_1.13.1 -t tf-1.13.1-gpu-base:0.0.1 echo "Finished building base images" cd ../../with-cifar10-models/ubuntu-16.04 -docker build . -f Dockerfile.cpu.tf_1.8.0 -t tf-1.8.0-cpu:0.0.1 -docker build . -f Dockerfile.gpu.tf_1.8.0 -t tf-1.8.0-gpu:0.0.1 +docker build . -f Dockerfile.cpu.tf_1.13.1 -t tf-1.13.1-cpu:0.0.1 +docker build . 
-f Dockerfile.gpu.tf_1.13.1 -t tf-1.13.1-gpu:0.0.1 diff --git a/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 b/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.cpu.tf_1.13.1 similarity index 90% rename from hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 rename to hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.cpu.tf_1.13.1 index 1087d61982b..188e4878b9b 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 +++ b/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.cpu.tf_1.13.1 @@ -14,9 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -FROM tf-1.8.0-cpu-base:0.0.1 +FROM tf-1.13.1-cpu-base:0.0.1 # Include models RUN mkdir /test -ADD cifar10_estimator_tf_1.8.0 /test/cifar10_estimator +ADD cifar10_estimator_tf_1.13.1 /test/cifar10_estimator RUN chown -R nobody /test \ No newline at end of file diff --git a/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0 b/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.gpu.tf_1.13.1 similarity index 90% rename from hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0 rename to hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.gpu.tf_1.13.1 index d1f829f023b..8819fa619f0 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0 +++ b/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.gpu.tf_1.13.1 
@@ -14,9 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -FROM tf-1.8.0-gpu-base:0.0.1 +FROM tf-1.13.1-gpu-base:0.0.1 # Include models RUN mkdir /test -ADD cifar10_estimator_tf_1.8.0 /test/cifar10_estimator +ADD cifar10_estimator_tf_1.13.1 /test/cifar10_estimator RUN chown -R nobody /test \ No newline at end of file diff --git a/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/README.md b/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.13.1/README.md similarity index 100% rename from hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/README.md rename to hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.13.1/README.md diff --git a/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10.py b/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.13.1/cifar10.py similarity index 98% rename from hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10.py rename to hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.13.1/cifar10.py index 6903e8d93de..5e1a70895ad 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10.py +++ b/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.13.1/cifar10.py @@ -75,7 +75,7 @@ class Cifar10DataSet(object): # Parse records. dataset = dataset.map( - self.parser) + self.parser, num_parallel_calls=batch_size) # Potentially shuffle records. 
if self.subset == 'train': diff --git a/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_main.py b/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.13.1/cifar10_main.py similarity index 99% rename from hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_main.py rename to hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.13.1/cifar10_main.py index 086c95b4e45..51da6b94fa2 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_main.py +++ b/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.13.1/cifar10_main.py @@ -344,7 +344,7 @@ def get_experiment_fn(data_dir, train_steps = hparams.train_steps eval_steps = num_eval_examples // hparams.eval_batch_size - + classifier = tf.estimator.Estimator( model_fn=get_model_fn(num_gpus, variable_strategy, run_config.num_worker_replicas or 1), @@ -483,7 +483,7 @@ if __name__ == '__main__': type=str, default=None, help="""\ - If not set, the data format best for the training device is used. + If not set, the data format best for the training device is used. 
Allowed values: channels_first (NCHW) channels_last (NHWC).\ """) parser.add_argument( diff --git a/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_model.py b/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.13.1/cifar10_model.py similarity index 100% rename from hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_model.py rename to hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.13.1/cifar10_model.py diff --git a/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_utils.py b/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.13.1/cifar10_utils.py similarity index 99% rename from hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_utils.py rename to hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.13.1/cifar10_utils.py index 7ecb50a3c5b..5eb2c3f62fe 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_utils.py +++ b/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.13.1/cifar10_utils.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== - import collections import six @@ -29,7 +28,7 @@ from tensorflow.contrib.learn.python.learn import run_config # TODO(b/64848083) Remove once uid bug is fixed -class RunConfig(tf.contrib.learn.RunConfig): +class RunConfig(tf.contrib.learn.RunConfig): def uid(self, whitelist=None): """Generates a 'Unique Identifier' based on all internal fields. Caller should use the uid string to check `RunConfig` instance integrity @@ -60,7 +59,7 @@ class RunConfig(tf.contrib.learn.RunConfig): key=lambda t: t[0]) ) return ', '.join( - '%s=%r' % (k, v) for (k, v) in six.iteritems(ordered_state)) + '%s=%r' % (k, v) for (k, v) in six.iteritems(ordered_state)) class ExamplesPerSecondHook(session_run_hook.SessionRunHook): diff --git a/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/generate_cifar10_tfrecords.py b/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.13.1/generate_cifar10_tfrecords.py similarity index 94% rename from hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/generate_cifar10_tfrecords.py rename to hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.13.1/generate_cifar10_tfrecords.py index 409cee4eaec..d1a599c31bf 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/generate_cifar10_tfrecords.py +++ b/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.13.1/generate_cifar10_tfrecords.py @@ -25,6 +25,7 @@ from __future__ import print_function import argparse import os +import sys import tarfile from six.moves import cPickle as pickle @@ -63,7 +64,10 @@ def _get_file_names(): def 
read_pickle_from_file(filename): with tf.gfile.Open(filename, 'rb') as f: - data_dict = pickle.load(f) + if sys.version_info >= (3, 0): + data_dict = pickle.load(f, encoding='bytes') + else: + data_dict = pickle.load(f) return data_dict @@ -73,8 +77,8 @@ def convert_to_tfrecord(input_files, output_file): with tf.python_io.TFRecordWriter(output_file) as record_writer: for input_file in input_files: data_dict = read_pickle_from_file(input_file) - data = data_dict['data'] - labels = data_dict['labels'] + data = data_dict[b'data'] + labels = data_dict[b'labels'] num_entries_in_batch = len(labels) for i in range(num_entries_in_batch): example = tf.train.Example(features=tf.train.Features( diff --git a/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/model_base.py b/hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.13.1/model_base.py similarity index 100% rename from hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/model_base.py rename to hadoop-submarine/hadoop-submarine-core/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.13.1/model_base.py diff --git a/hadoop-submarine/hadoop-submarine-core/src/site/markdown/InstallationGuide.md b/hadoop-submarine/hadoop-submarine-core/src/site/markdown/InstallationGuide.md index 4ef2bdab0c2..1c7812ba8e9 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/site/markdown/InstallationGuide.md +++ b/hadoop-submarine/hadoop-submarine-core/src/site/markdown/InstallationGuide.md @@ -24,8 +24,8 @@ The operating system and kernel versions we have tested are as shown in the foll | Enviroment | Verion | | ------ | ------ | -| Operating System | centos-release-7-3.1611.el7.centos.x86_64 | -| Kernal | 3.10.0-514.el7.x86_64 | +| Operating System | centos-release-7-5.1804.el7.centos.x86_64 | +| Kernal | 3.10.0-862.el7.x86_64 | ### 
User & Group @@ -62,8 +62,8 @@ yum install gcc make g++ # Approach 1: yum install kernel-devel-$(uname -r) kernel-headers-$(uname -r) # Approach 2: -wget http://vault.centos.org/7.3.1611/os/x86_64/Packages/kernel-headers-3.10.0-514.el7.x86_64.rpm -rpm -ivh kernel-headers-3.10.0-514.el7.x86_64.rpm +wget http://vault.centos.org/7.5.1804/os/x86_64/Packages/kernel-headers-3.10.0-862.el7.x86_64.rpm +rpm -ivh kernel-headers-3.10.0-862.el7.x86_64.rpm ``` ### GPU Servers (Only for Nvidia GPU equipped nodes) @@ -165,26 +165,43 @@ https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html ### Docker Installation -We recommend to use Docker version >= 1.12.5, following steps are just for your reference. You can always to choose other approaches to install Docker. +The following steps show how to install docker 18.06.1.ce. You can choose other approaches to install Docker. ``` -yum -y update -yum -y install yum-utils -yum-config-manager --add-repo https://yum.dockerproject.org/repo/main/centos/7 -yum -y update +# Remove old version docker +sudo yum remove docker \ + docker-client \ + docker-client-latest \ + docker-common \ + docker-latest \ + docker-latest-logrotate \ + docker-logrotate \ + docker-engine -# Show available packages -yum search --showduplicates docker-engine +# Docker version +export DOCKER_VERSION="18.06.1.ce" +# Setup the repository +sudo yum install -y yum-utils \ + device-mapper-persistent-data \ + lvm2 +sudo yum-config-manager \ + --add-repo \ + https://download.docker.com/linux/centos/docker-ce.repo -# Install docker 1.12.5 -yum -y --nogpgcheck install docker-engine-1.12.5* +# Check docker version +yum list docker-ce --showduplicates | sort -r + +# Install docker with specified DOCKER_VERSION +sudo yum install -y docker-ce-${DOCKER_VERSION} docker-ce-cli-${DOCKER_VERSION} containerd.io + +# Start docker systemctl start docker chown hadoop:netease /var/run/docker.sock chown hadoop:netease /usr/bin/docker ``` 
-Reference:https://docs.docker.com/cs-engine/1.12/ +Reference:https://docs.docker.com/install/linux/docker-ce/centos/ ### Docker Configuration @@ -208,46 +225,40 @@ sudo systemctl restart docker -### Docker EE version +### Check docker version ```bash $ docker version Client: - Version: 1.12.5 - API version: 1.24 - Go version: go1.6.4 - Git commit: 7392c3b - Built: Fri Dec 16 02:23:59 2016 + Version: 18.06.1-ce + API version: 1.38 + Go version: go1.10.3 + Git commit: e68fc7a + Built: Tue Aug 21 17:23:03 2018 OS/Arch: linux/amd64 + Experimental: false Server: - Version: 1.12.5 - API version: 1.24 - Go version: go1.6.4 - Git commit: 7392c3b - Built: Fri Dec 16 02:23:59 2016 + Version: 18.06.1-ce + API version: 1.38 (minimum version 1.12) + Go version: go1.10.3 + Git commit: e68fc7a + Built: Tue Aug 21 17:23:03 2018 OS/Arch: linux/amd64 + Experimental: false ``` ### Nvidia-docker Installation (Only for Nvidia GPU equipped nodes) -Submarine depends on nvidia-docker 1.0 version +Submarine has already supported nvidia-docker V2 ``` -wget -P /tmp https://github.com/NVIDIA/nvidia-docker/releases/download/v1.0.1/nvidia-docker-1.0.1-1.x86_64.rpm -sudo rpm -i /tmp/nvidia-docker*.rpm -# Start nvidia-docker -sudo systemctl start nvidia-docker - -# Check nvidia-docker status: -systemctl status nvidia-docker - -# Check nvidia-docker log: -journalctl -u nvidia-docker - -# Test nvidia-docker-plugin -curl http://localhost:3476/v1.0/docker/cli +# Add the package repositories +distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) +curl -s -L https://nvidia.github.io/nvidia-container-runtime/$distribution/nvidia-container-runtime.repo | \ + sudo tee /etc/yum.repos.d/nvidia-container-runtime.repo +sudo yum install -y nvidia-docker2-2.0.3-1.docker18.06.1.ce ``` According to `nvidia-driver` version, add folders under the path of `/var/lib/nvidia-docker/volumes/nvidia_driver/` @@ -264,7 +275,7 @@ cp /usr/lib64/libcuda* /var/lib/nvidia-docker/volumes/nvidia_driver/390.87/lib64 cp /usr/lib64/libnvidia* /var/lib/nvidia-docker/volumes/nvidia_driver/390.87/lib64 # Test with nvidia-smi -nvidia-docker run --rm nvidia/cuda:9.0-devel nvidia-smi +nvidia-docker run --rm nvidia/cuda:10.0-devel nvidia-smi ``` Test docker, nvidia-docker, nvidia-driver installation @@ -283,89 +294,17 @@ import tensorflow as tf tf.test.is_gpu_available() ``` -[The way to uninstall nvidia-docker 1.0](https://github.com/nvidia/nvidia-docker/wiki/Installation-(version-2.0)) +The way to uninstall nvidia-docker V2 +``` +sudo yum remove -y nvidia-docker2-2.0.3-1.docker18.06.1.ce +``` Reference: -https://github.com/NVIDIA/nvidia-docker/tree/1.0 - +https://github.com/NVIDIA/nvidia-docker ### Tensorflow Image -There is no need to install CUDNN and CUDA on the servers, because CUDNN and CUDA can be added in the docker images. we can get basic docker images by following WriteDockerfile.md. - - -The basic Dockerfile doesn't support kerberos security. 
if you need kerberos, you can get write a Dockerfile like this - - -```shell -FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 - -# Pick up some TF dependencies -RUN apt-get update && apt-get install -y --allow-downgrades --no-install-recommends \ - build-essential \ - cuda-command-line-tools-9-0 \ - cuda-cublas-9-0 \ - cuda-cufft-9-0 \ - cuda-curand-9-0 \ - cuda-cusolver-9-0 \ - cuda-cusparse-9-0 \ - curl \ - libcudnn7=7.0.5.15-1+cuda9.0 \ - libfreetype6-dev \ - libpng12-dev \ - libzmq3-dev \ - pkg-config \ - python \ - python-dev \ - rsync \ - software-properties-common \ - unzip \ - && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -RUN export DEBIAN_FRONTEND=noninteractive && apt-get update && apt-get install -yq krb5-user libpam-krb5 && apt-get clean - -RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ - python get-pip.py && \ - rm get-pip.py - -RUN pip --no-cache-dir install \ - Pillow \ - h5py \ - ipykernel \ - jupyter \ - matplotlib \ - numpy \ - pandas \ - scipy \ - sklearn \ - && \ - python -m ipykernel.kernelspec - -# Install TensorFlow GPU version. 
-RUN pip --no-cache-dir install \ - http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp27-none-linux_x86_64.whl -RUN apt-get update && apt-get install git -y - -RUN apt-get update && apt-get install -y openjdk-8-jdk wget -# Downloadhadoop-3.1.1.tar.gz -RUN wget http://mirrors.hust.edu.cn/apache/hadoop/common/hadoop-3.1.1/hadoop-3.1.1.tar.gz -RUN tar zxf hadoop-3.1.1.tar.gz -RUN mv hadoop-3.1.1 hadoop-3.1.0 - -# Download jdk which supports kerberos -RUN wget -qO jdk8.tar.gz 'http://${kerberos_jdk_url}/jdk-8u152-linux-x64.tar.gz' -RUN tar xzf jdk8.tar.gz -C /opt -RUN mv /opt/jdk* /opt/java -RUN rm jdk8.tar.gz -RUN update-alternatives --install /usr/bin/java java /opt/java/bin/java 100 -RUN update-alternatives --install /usr/bin/javac javac /opt/java/bin/javac 100 - -ENV JAVA_HOME /opt/java -ENV PATH $PATH:$JAVA_HOME/bin -``` - +There is no need to install CUDNN and CUDA on the servers, because CUDNN and CUDA can be added in the docker images. We can get basic docker images by referring to WriteDockerfile.md.
### Test tensorflow in a docker container @@ -515,12 +454,12 @@ where ${dfs_name_service} is the hdfs name service you use ```bash ./bin/yarn jar /home/hadoop/hadoop-current/share/hadoop/yarn/hadoop-yarn-submarine-3.2.0-SNAPSHOT.jar job run \ --env DOCKER_JAVA_HOME=/opt/java \ - --env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 --name standalone-tf \ - --docker_image dockerfile-cpu-tf1.8.0-with-models \ + --env DOCKER_HADOOP_HDFS_HOME=/hadoop-current --name standalone-tf \ + --docker_image tf-1.13.1-cpu:0.0.1 \ --input_path hdfs://${dfs_name_service}/tmp/cifar-10-data \ --checkpoint_path hdfs://${dfs_name_service}/user/hadoop/tf-checkpoint \ --worker_resources memory=4G,vcores=2 --verbose \ - --worker_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=hdfs://${dfs_name_service}/tmp/cifar-10-data --job-dir=hdfs://${dfs_name_service}/tmp/cifar-10-jobdir --train-steps=500 --eval-batch-size=16 --train-batch-size=16 --num-gpus=0" + --worker_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --train-steps=500 --eval-batch-size=16 --train-batch-size=16 --num-gpus=0" ``` ### Distributed Mode @@ -537,17 +476,17 @@ where ${dfs_name_service} is the hdfs name service you use ```bash ./bin/yarn jar /home/hadoop/hadoop-current/share/hadoop/yarn/hadoop-yarn-submarine-3.2.0-SNAPSHOT.jar job run \ --env DOCKER_JAVA_HOME=/opt/java \ - --env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 --name distributed-tf \ + --env DOCKER_HADOOP_HDFS_HOME=/hadoop-current --name distributed-tf \ --env YARN_CONTAINER_RUNTIME_DOCKER_CONTAINER_NETWORK=calico-network \ - --docker_image dockerfile-cpu-tf1.8.0-with-models \ + --docker_image tf-1.13.1-cpu:0.0.1 \ --input_path hdfs://${dfs_name_service}/tmp/cifar-10-data \ --checkpoint_path hdfs://${dfs_name_service}/user/hadoop/tf-distributed-checkpoint \ --worker_resources memory=4G,vcores=2 --verbose \ --num_ps 1 \ --ps_resources memory=4G,vcores=2 \ - --ps_launch_cmd "python 
/test/cifar10_estimator/cifar10_main.py --data-dir=hdfs://${dfs_name_service}/tmp/cifar-10-data --job-dir=hdfs://${dfs_name_service}/tmp/cifar-10-jobdir --num-gpus=0" \ + --ps_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --num-gpus=0" \ --num_workers 4 \ - --worker_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=hdfs://${dfs_name_service}/tmp/cifar-10-data --job-dir=hdfs://${dfs_name_service}/tmp/cifar-10-jobdir --train-steps=500 --eval-batch-size=16 --train-batch-size=16 --sync --num-gpus=0" + --worker_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --train-steps=500 --eval-batch-size=16 --train-batch-size=16 --sync --num-gpus=0" ``` @@ -589,6 +528,11 @@ Add configurations in yarn-site.xml yarn.nodemanager.resource-plugins yarn.io/gpu + + + yarn.nodemanager.resource-plugins.gpu.docker-plugin + nvidia-docker-v2 + ``` @@ -603,6 +547,8 @@ Add configurations in container-executor.cfg docker.allowed.volume-drivers=/usr/bin/nvidia-docker docker.allowed.devices=/dev/nvidiactl,/dev/nvidia-uvm,/dev/nvidia-uvm-tools,/dev/nvidia1,/dev/nvidia0 docker.allowed.ro-mounts=nvidia_driver_ + # Use nvidia docker v2 + docker.allowed.runtimes=nvidia [gpu] module.enabled=true @@ -613,3 +559,21 @@ Add configurations in container-executor.cfg root=/sys/fs/cgroup yarn-hierarchy=/hadoop-yarn ``` + +### Run a distributed tensorflow gpu job + +```bash + ./yarn jar /home/hadoop/hadoop-current/share/hadoop/yarn/hadoop-yarn-submarine-3.2.0-SNAPSHOT.jar job run \ + --env DOCKER_JAVA_HOME=/opt/java \ + --env DOCKER_HADOOP_HDFS_HOME=/hadoop-current --name distributed-tf-gpu \ + --env YARN_CONTAINER_RUNTIME_DOCKER_CONTAINER_NETWORK=calico-network \ + --docker_image tf-1.13.1-gpu:0.0.1 \ + --input_path hdfs://${dfs_name_service}/tmp/cifar-10-data \ + --checkpoint_path hdfs://${dfs_name_service}/user/hadoop/tf-distributed-checkpoint \ + --num_ps 0 \ + 
--ps_resources memory=4G,vcores=2,gpu=0 \ + --ps_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --num-gpus=0" \ + --worker_resources memory=4G,vcores=2,gpu=1 --verbose \ + --num_workers 1 \ + --worker_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --train-steps=500 --eval-batch-size=16 --train-batch-size=16 --sync --num-gpus=1" +``` diff --git a/hadoop-submarine/hadoop-submarine-core/src/site/markdown/InstallationGuideChineseVersion.md b/hadoop-submarine/hadoop-submarine-core/src/site/markdown/InstallationGuideChineseVersion.md index 471b8fcd956..ba996e8d211 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/site/markdown/InstallationGuideChineseVersion.md +++ b/hadoop-submarine/hadoop-submarine-core/src/site/markdown/InstallationGuideChineseVersion.md @@ -18,12 +18,12 @@ ### 操作系统 -我们使用的操作系统版本是 centos-release-7-3.1611.el7.centos.x86_64, 内核版本是 3.10.0-514.el7.x86_64 ,应该是最低版本了。 +我们使用的操作系统版本是 centos-release-7-5.1804.el7.centos.x86_64, 内核版本是 3.10.0-862.el7.x86_64。 | Enviroment | Verion | | ------ | ------ | -| Operating System | centos-release-7-3.1611.el7.centos.x86_64 | -| Kernal | 3.10.0-514.el7.x86_64 | +| Operating System | centos-release-7-5.1804.el7.centos.x86_64 | +| Kernal | 3.10.0-862.el7.x86_64 | ### User & Group @@ -58,8 +58,8 @@ yum install gcc make g++ # 方法一: yum install kernel-devel-$(uname -r) kernel-headers-$(uname -r) # 方法二: -wget http://vault.centos.org/7.3.1611/os/x86_64/Packages/kernel-headers-3.10.0-514.el7.x86_64.rpm -rpm -ivh kernel-headers-3.10.0-514.el7.x86_64.rpm +wget http://vault.centos.org/7.5.1804/os/x86_64/Packages/kernel-headers-3.10.0-862.el7.x86_64.rpm +rpm -ivh kernel-headers-3.10.0-862.el7.x86_64.rpm ``` ### 检查 GPU 版本 @@ -155,23 +155,40 @@ https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html ### 安装 Docker ``` -yum -y update -yum -y install yum-utils -yum-config-manager --add-repo
https://yum.dockerproject.org/repo/main/centos/7 -yum -y update +# Remove old version docker +sudo yum remove docker \ + docker-client \ + docker-client-latest \ + docker-common \ + docker-latest \ + docker-latest-logrotate \ + docker-logrotate \ + docker-engine -# 显示 available 的安装包 -yum search --showduplicates docker-engine +# Docker version +export DOCKER_VERSION="18.06.1.ce" +# Setup the repository +sudo yum install -y yum-utils \ + device-mapper-persistent-data \ + lvm2 +sudo yum-config-manager \ + --add-repo \ + https://download.docker.com/linux/centos/docker-ce.repo -# 安装 1.12.5 版本 docker -yum -y --nogpgcheck install docker-engine-1.12.5* +# Check docker version +yum list docker-ce --showduplicates | sort -r + +# Install docker with specified DOCKER_VERSION +sudo yum install -y docker-ce-${DOCKER_VERSION} docker-ce-cli-${DOCKER_VERSION} containerd.io + +# Start docker systemctl start docker chown hadoop:netease /var/run/docker.sock chown hadoop:netease /usr/bin/docker ``` -Reference:https://docs.docker.com/cs-engine/1.12/ +Reference:https://docs.docker.com/install/linux/docker-ce/centos/ ### 配置 Docker @@ -195,46 +212,40 @@ sudo systemctl restart docker -### Docker EE version +### 检查 Docker version ```bash $ docker version Client: - Version: 1.12.5 - API version: 1.24 - Go version: go1.6.4 - Git commit: 7392c3b - Built: Fri Dec 16 02:23:59 2016 + Version: 18.06.1-ce + API version: 1.38 + Go version: go1.10.3 + Git commit: e68fc7a + Built: Tue Aug 21 17:23:03 2018 OS/Arch: linux/amd64 + Experimental: false Server: - Version: 1.12.5 - API version: 1.24 - Go version: go1.6.4 - Git commit: 7392c3b - Built: Fri Dec 16 02:23:59 2016 + Version: 18.06.1-ce + API version: 1.38 (minimum version 1.12) + Go version: go1.10.3 + Git commit: e68fc7a + Built: Tue Aug 21 17:23:03 2018 OS/Arch: linux/amd64 + Experimental: false ``` ### 安装 nvidia-docker -Hadoop-3.2 的 submarine 使用的是 1.0 版本的 nvidia-docker +Hadoop-3.2 的 submarine 已支持 V2 版本的 nvidia-docker ``` -wget -P /tmp 
https://github.com/NVIDIA/nvidia-docker/releases/download/v1.0.1/nvidia-docker-1.0.1-1.x86_64.rpm -sudo rpm -i /tmp/nvidia-docker*.rpm -# 启动 nvidia-docker -sudo systemctl start nvidia-docker - -# 查看 nvidia-docker 状态: -systemctl status nvidia-docker - -# 查看 nvidia-docker 日志: -journalctl -u nvidia-docker - -# 查看 nvidia-docker-plugin 是否正常 -curl http://localhost:3476/v1.0/docker/cli +# Add the package repositories +distribution=$(. /etc/os-release;echo $ID$VERSION_ID) +curl -s -L https://nvidia.github.io/nvidia-container-runtime/$distribution/nvidia-container-runtime.repo | \ + sudo tee /etc/yum.repos.d/nvidia-container-runtime.repo +sudo yum install -y nvidia-docker2-2.0.3-1.docker18.06.1.ce ``` 在 `/var/lib/nvidia-docker/volumes/nvidia_driver/` 路径下,根据 `nvidia-driver` 的版本创建文件夹: @@ -251,7 +262,7 @@ cp /usr/lib64/libcuda* /var/lib/nvidia-docker/volumes/nvidia_driver/390.87/lib64 cp /usr/lib64/libnvidia* /var/lib/nvidia-docker/volumes/nvidia_driver/390.87/lib64 # Test nvidia-smi -nvidia-docker run --rm nvidia/cuda:9.0-devel nvidia-smi +nvidia-docker run --rm nvidia/cuda:10.0-devel nvidia-smi ``` 测试 docker, nvidia-docker, nvidia-driver 安装 @@ -270,11 +281,13 @@ import tensorflow as tf tf.test.is_gpu_available() ``` -卸载 nvidia-docker 1.0 的方法: -https://github.com/nvidia/nvidia-docker/wiki/Installation-(version-2.0) +卸载 nvidia-docker V2 的方法: +``` +sudo yum remove -y nvidia-docker2-2.0.3-1.docker18.06.1.ce +``` reference: -https://github.com/NVIDIA/nvidia-docker/tree/1.0 +https://github.com/NVIDIA/nvidia-docker @@ -282,79 +295,6 @@ https://github.com/NVIDIA/nvidia-docker/tree/1.0 CUDNN 和 CUDA 其实不需要在物理机上安装,因为 Sumbmarine 中提供了已经包含了CUDNN 和 CUDA 的镜像文件,基础的Dockfile可参见WriteDockerfile.md - -上述images无法支持kerberos环境,如果需要kerberos可以使用如下Dockfile - -```shell -FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 - -# Pick up some TF dependencies -RUN apt-get update && apt-get install -y --allow-downgrades --no-install-recommends \ - build-essential \ - cuda-command-line-tools-9-0 \ - cuda-cublas-9-0 \ 
- cuda-cufft-9-0 \ - cuda-curand-9-0 \ - cuda-cusolver-9-0 \ - cuda-cusparse-9-0 \ - curl \ - libcudnn7=7.0.5.15-1+cuda9.0 \ - libfreetype6-dev \ - libpng12-dev \ - libzmq3-dev \ - pkg-config \ - python \ - python-dev \ - rsync \ - software-properties-common \ - unzip \ - && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -RUN export DEBIAN_FRONTEND=noninteractive && apt-get update && apt-get install -yq krb5-user libpam-krb5 && apt-get clean - -RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ - python get-pip.py && \ - rm get-pip.py - -RUN pip --no-cache-dir install \ - Pillow \ - h5py \ - ipykernel \ - jupyter \ - matplotlib \ - numpy \ - pandas \ - scipy \ - sklearn \ - && \ - python -m ipykernel.kernelspec - -# Install TensorFlow GPU version. -RUN pip --no-cache-dir install \ - http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp27-none-linux_x86_64.whl -RUN apt-get update && apt-get install git -y - -RUN apt-get update && apt-get install -y openjdk-8-jdk wget -# 下载 hadoop-3.1.1.tar.gz -RUN wget http://mirrors.hust.edu.cn/apache/hadoop/common/hadoop-3.1.1/hadoop-3.1.1.tar.gz -RUN tar zxf hadoop-3.1.1.tar.gz -RUN mv hadoop-3.1.1 hadoop-3.1.0 - -# 下载支持kerberos的jdk安装包 -RUN wget -qO jdk8.tar.gz 'http://${kerberos_jdk_url}/jdk-8u152-linux-x64.tar.gz' -RUN tar xzf jdk8.tar.gz -C /opt -RUN mv /opt/jdk* /opt/java -RUN rm jdk8.tar.gz -RUN update-alternatives --install /usr/bin/java java /opt/java/bin/java 100 -RUN update-alternatives --install /usr/bin/javac javac /opt/java/bin/javac 100 - -ENV JAVA_HOME /opt/java -ENV PATH $PATH:$JAVA_HOME/bin -``` - - ### 测试 TF 环境 创建好 docker 镜像后,需要先手动检查 TensorFlow 是否可以正常使用,避免通过 YARN 调度后出现问题,可以执行以下命令 @@ -505,12 +445,12 @@ sudo YARN_LOGFILE=registrydns.log ./yarn-daemon.sh start registrydns ```bash ./bin/yarn jar /home/hadoop/hadoop-current/share/hadoop/yarn/hadoop-yarn-submarine-3.2.0-SNAPSHOT.jar job run \ --env DOCKER_JAVA_HOME=/opt/java \ - --env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 --name 
standalone-tf \ - --docker_image dockerfile-cpu-tf1.8.0-with-models \ + --env DOCKER_HADOOP_HDFS_HOME=/hadoop-current --name standalone-tf \ + --docker_image tf-1.13.1-cpu:0.0.1 \ --input_path hdfs://${dfs_name_service}/tmp/cifar-10-data \ --checkpoint_path hdfs://${dfs_name_service}/user/hadoop/tf-checkpoint \ --worker_resources memory=4G,vcores=2 --verbose \ - --worker_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=hdfs://${dfs_name_service}/tmp/cifar-10-data --job-dir=hdfs://${dfs_name_service}/tmp/cifar-10-jobdir --train-steps=500 --eval-batch-size=16 --train-batch-size=16 --num-gpus=0" + --worker_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --train-steps=500 --eval-batch-size=16 --train-batch-size=16 --num-gpus=0" ``` @@ -531,17 +471,17 @@ sudo YARN_LOGFILE=registrydns.log ./yarn-daemon.sh start registrydns ```bash ./bin/yarn jar /home/hadoop/hadoop-current/share/hadoop/yarn/hadoop-yarn-submarine-3.2.0-SNAPSHOT.jar job run \ --env DOCKER_JAVA_HOME=/opt/java \ - --env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 --name distributed-tf \ + --env DOCKER_HADOOP_HDFS_HOME=/hadoop-current --name distributed-tf \ --env YARN_CONTAINER_RUNTIME_DOCKER_CONTAINER_NETWORK=calico-network \ - --docker_image dockerfile-cpu-tf1.8.0-with-models \ + --docker_image tf-1.13.1-cpu:0.0.1 \ --input_path hdfs://${dfs_name_service}/tmp/cifar-10-data \ --checkpoint_path hdfs://${dfs_name_service}/user/hadoop/tf-distributed-checkpoint \ --worker_resources memory=4G,vcores=2 --verbose \ --num_ps 1 \ --ps_resources memory=4G,vcores=2 \ - --ps_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=hdfs://${dfs_name_service}/tmp/cifar-10-data --job-dir=hdfs://${dfs_name_service}/tmp/cifar-10-jobdir --num-gpus=0" \ + --ps_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --num-gpus=0" \ --num_workers 4 \ - --worker_launch_cmd "python 
/test/cifar10_estimator/cifar10_main.py --data-dir=hdfs://${dfs_name_service}/tmp/cifar-10-data --job-dir=hdfs://${${dfs_name_service}}/tmp/cifar-10-jobdir --train-steps=500 --eval-batch-size=16 --train-batch-size=16 --sync --num-gpus=0" + --worker_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --train-steps=500 --eval-batch-size=16 --train-batch-size=16 --sync --num-gpus=0" ``` @@ -583,6 +523,11 @@ resourcemanager 使用的 scheduler 必须是 capacity scheduler,在 capacity- yarn.nodemanager.resource-plugins yarn.io/gpu + + + yarn.nodemanager.resource-plugins.gpu.docker-plugin + nvidia-docker-v2 + ``` @@ -597,6 +542,8 @@ resourcemanager 使用的 scheduler 必须是 capacity scheduler,在 capacity- docker.allowed.volume-drivers=/usr/bin/nvidia-docker docker.allowed.devices=/dev/nvidiactl,/dev/nvidia-uvm,/dev/nvidia-uvm-tools,/dev/nvidia1,/dev/nvidia0 docker.allowed.ro-mounts=nvidia_driver_375.26 + # Use nvidia docker v2 + docker.allowed.runtimes=nvidia [gpu] module.enabled=true @@ -615,9 +562,9 @@ Distributed-shell + GPU + cgroup ```bash ./yarn jar /home/hadoop/hadoop-current/share/hadoop/yarn/hadoop-yarn-submarine-3.2.0-SNAPSHOT.jar job run \ --env DOCKER_JAVA_HOME=/opt/java \ - --env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 --name distributed-tf-gpu \ + --env DOCKER_HADOOP_HDFS_HOME=/hadoop-current --name distributed-tf-gpu \ --env YARN_CONTAINER_RUNTIME_DOCKER_CONTAINER_NETWORK=calico-network \ - --docker_image gpu-cuda9.0-tf1.8.0-with-models \ + --docker_image tf-1.13.1-gpu:0.0.1 \ --input_path hdfs://${dfs_name_service}/tmp/cifar-10-data \ --checkpoint_path hdfs://${dfs_name_service}/user/hadoop/tf-distributed-checkpoint \ --num_ps 0 \ diff --git a/hadoop-submarine/hadoop-submarine-core/src/site/markdown/QuickStart.md b/hadoop-submarine/hadoop-submarine-core/src/site/markdown/QuickStart.md index 071e1a8db3f..6577f8357ad 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/site/markdown/QuickStart.md +++ 
b/hadoop-submarine/hadoop-submarine-core/src/site/markdown/QuickStart.md @@ -22,8 +22,17 @@ Must: Optional: -- Enable YARN DNS. (When distributed training is required.) +- Enable YARN DNS. (When yarn service runtime is required.) - Enable GPU on YARN support. (When GPU-based training is required.) +- Docker images for submarine jobs. (When docker container is required.) +``` + # Get prebuilt docker images (No liability) + docker pull hadoopsubmarine/tf-1.13.1-gpu:0.0.1 + # Or build your own docker images + docker build . -f Dockerfile.gpu.tf_1.13.1 -t tf-1.13.1-gpu-base:0.0.1 +``` +For more details, please refer to +[How to write Dockerfile for Submarine jobs](WriteDockerfile.html) ## Run jobs @@ -122,7 +131,7 @@ For submarine internal configuration, please create a `submarine.xml` which shou ``` yarn jar path-to/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar job run \ --env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \ - --env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 --name tf-job-001 \ + --env DOCKER_HADOOP_HDFS_HOME=/hadoop-current --name tf-job-001 \ --docker_image \ --input_path hdfs://default/dataset/cifar-10-data \ --checkpoint_path hdfs://default/tmp/cifar-10-jobdir \ @@ -153,11 +162,11 @@ See below screenshot: ``` yarn jar hadoop-yarn-applications-submarine-.jar job run \ - --name tf-job-001 --docker_image \ + --name tf-job-001 --docker_image \ --input_path hdfs://default/dataset/cifar-10-data \ --checkpoint_path hdfs://default/tmp/cifar-10-jobdir \ --env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \ - --env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 \ + --env DOCKER_HADOOP_HDFS_HOME=/hadoop-current \ --num_workers 2 \ --worker_resources memory=8G,vcores=2,gpu=1 --worker_launch_cmd "cmd for worker ..."
\ --num_ps 2 \ @@ -183,11 +192,11 @@ yarn jar hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar job show --name t Output looks like: ``` Job Meta Info: - Application Id: application_1532131617202_0005 - Input Path: hdfs://default/dataset/cifar-10-data - Checkpoint Path: hdfs://default/tmp/cifar-10-jobdir - Run Parameters: --name tf-job-001 --docker_image wtan/tf-1.8.0-gpu:0.0.3 - (... all your commandline before run the job) + Application Id: application_1532131617202_0005 + Input Path: hdfs://default/dataset/cifar-10-data + Checkpoint Path: hdfs://default/tmp/cifar-10-jobdir + Run Parameters: --name tf-job-001 --docker_image + (... all your commandline before run the job) ``` After that, you can run ```tensorboard --logdir=``` to view Tensorboard of the job. @@ -198,9 +207,9 @@ After that, you can run ```tensorboard --logdir=``` to view Ten # Cleanup previous service if needed yarn app -destroy tensorboard-service; \ yarn jar /tmp/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar \ - job run --name tensorboard-service --verbose --docker_image wtan/tf-1.8.0-cpu:0.0.3 \ + job run --name tensorboard-service --verbose --docker_image \ --env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \ - --env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 \ + --env DOCKER_HADOOP_HDFS_HOME=/hadoop-current \ --num_workers 0 --tensorboard ``` @@ -216,3 +225,11 @@ There're two ways to get training job logs, one is from YARN UI (new or old): ![alt text](./images/job-logs-ui.png "Job logs UI") Or you can use `yarn logs -applicationId ` to get logs from CLI + +## Build from source code + +If you want to build submarine project by yourself, you can follow the steps: + +- Run 'mvn install -DskipTests' from Hadoop source top level once. + +- Navigate to hadoop-submarine folder and run 'mvn clean package'. 
\ No newline at end of file diff --git a/hadoop-submarine/hadoop-submarine-core/src/site/markdown/RunningDistributedCifar10TFJobs.md b/hadoop-submarine/hadoop-submarine-core/src/site/markdown/RunningDistributedCifar10TFJobs.md index 127c80f075a..7da98d55cfb 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/site/markdown/RunningDistributedCifar10TFJobs.md +++ b/hadoop-submarine/hadoop-submarine-core/src/site/markdown/RunningDistributedCifar10TFJobs.md @@ -57,13 +57,13 @@ Refer to [Write Dockerfile](WriteDockerfile.md) to build a Docker image or use p ``` yarn jar path/to/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar \ - job run --name tf-job-001 --verbose --docker_image hadoopsubmarine/tf-1.8.0-gpu:0.0.1 \ + job run --name tf-job-001 --verbose --docker_image tf-1.13.1-gpu:0.0.1 \ --input_path hdfs://default/dataset/cifar-10-data \ --env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ - --env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 + --env DOCKER_HADOOP_HDFS_HOME=/hadoop-current --num_workers 1 --worker_resources memory=8G,vcores=2,gpu=1 \ --worker_launch_cmd "cd /test/models/tutorials/image/cifar10_estimator && python cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --train-steps=10000 --eval-batch-size=16 --train-batch-size=16 --num-gpus=2 --sync" \ - --tensorboard --tensorboard_docker_image wtan/tf-1.8.0-cpu:0.0.3 + --tensorboard --tensorboard_docker_image tf-1.13.1-cpu:0.0.1 ``` Explanations: @@ -75,16 +75,16 @@ Explanations: ``` yarn jar path/to/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar \ - job run --name tf-job-001 --verbose --docker_image hadoopsubmarine/tf-1.8.0-gpu:0.0.1 \ + job run --name tf-job-001 --verbose --docker_image tf-1.13.1-gpu:0.0.1 \ --input_path hdfs://default/dataset/cifar-10-data \ --env(s) (same as standalone) --num_workers 2 \ --worker_resources memory=8G,vcores=2,gpu=1 \ --worker_launch_cmd "cd /test/models/tutorials/image/cifar10_estimator && python cifar10_main.py --data-dir=%input_path% 
--job-dir=%checkpoint_path% --train-steps=10000 --eval-batch-size=16 --train-batch-size=16 --num-gpus=2 --sync" \ - --ps_docker_image wtan/tf-1.8.0-cpu:0.0.3 \ + --ps_docker_image tf-1.13.1-cpu:0.0.1 \ --num_ps 1 --ps_resources memory=4G,vcores=2,gpu=0 \ --ps_launch_cmd "cd /test/models/tutorials/image/cifar10_estimator && python cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --num-gpus=0" \ - --tensorboard --tensorboard_docker_image wtan/tf-1.8.0-cpu:0.0.3 + --tensorboard --tensorboard_docker_image tf-1.13.1-cpu:0.0.1 ``` Explanations: diff --git a/hadoop-submarine/hadoop-submarine-core/src/site/markdown/TestAndTroubleshooting.md b/hadoop-submarine/hadoop-submarine-core/src/site/markdown/TestAndTroubleshooting.md index 3acf81a324f..8fd43f39ce6 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/site/markdown/TestAndTroubleshooting.md +++ b/hadoop-submarine/hadoop-submarine-core/src/site/markdown/TestAndTroubleshooting.md @@ -19,10 +19,10 @@ Distributed-shell + GPU + cgroup ```bash ./yarn jar /home/hadoop/hadoop-current/share/hadoop/yarn/hadoop-yarn-submarine-3.2.0-SNAPSHOT.jar job run \ --env DOCKER_JAVA_HOME=/opt/java \ - --env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 --name distributed-tf-gpu \ + --env DOCKER_HADOOP_HDFS_HOME=/hadoop-current --name distributed-tf-gpu \ --env YARN_CONTAINER_RUNTIME_DOCKER_CONTAINER_NETWORK=calico-network \ - --worker_docker_image gpu-cuda9.0-tf1.8.0-with-models \ - --ps_docker_image dockerfile-cpu-tf1.8.0-with-models \ + --worker_docker_image tf-1.13.1-gpu:0.0.1 \ + --ps_docker_image tf-1.13.1-cpu:0.0.1 \ --input_path hdfs://${dfs_name_service}/tmp/cifar-10-data \ --checkpoint_path hdfs://${dfs_name_service}/user/hadoop/tf-distributed-checkpoint \ --num_ps 0 \ @@ -140,26 +140,7 @@ $ chmod +x find-busy-mnt.sh $ kill -9 5007 ``` - -### Issue 5:Failed to execute `sudo nvidia-docker run` - -``` -docker: Error response from daemon: create nvidia_driver_361.42: VolumeDriver.Create: internal error, check logs for 
details. -See 'docker run --help'. -``` - -Solution: - -``` -#check nvidia-docker status -$ systemctl status nvidia-docker -$ journalctl -n -u nvidia-docker -#restart nvidia-docker -systemctl stop nvidia-docker -systemctl start nvidia-docker -``` - -### Issue 6:Yarn failed to start containers +### Issue 5:Yarn failed to start containers if the number of GPUs required by applications is larger than the number of GPUs in the cluster, there would be some containers can't be created. diff --git a/hadoop-submarine/hadoop-submarine-core/src/site/markdown/WriteDockerfile.md b/hadoop-submarine/hadoop-submarine-core/src/site/markdown/WriteDockerfile.md index 79aac8dc092..0d4c6c1fdae 100644 --- a/hadoop-submarine/hadoop-submarine-core/src/site/markdown/WriteDockerfile.md +++ b/hadoop-submarine/hadoop-submarine-core/src/site/markdown/WriteDockerfile.md @@ -56,6 +56,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ apt-get clean && \ rm -rf /var/lib/apt/lists/* +RUN export DEBIAN_FRONTEND=noninteractive && apt-get update && apt-get install -yq krb5-user libpam-krb5 && apt-get clean + RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ python get-pip.py && \ rm get-pip.py @@ -74,14 +76,18 @@ RUN pip --no-cache-dir install \ python -m ipykernel.kernelspec RUN pip --no-cache-dir install \ - http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl + http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.13.1-cp27-none-linux_x86_64.whl ``` On top of above image, add files, install packages to access HDFS ``` RUN apt-get update && apt-get install -y openjdk-8-jdk wget -RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz -RUN tar zxf hadoop-3.1.0.tar.gz +# Install hadoop +ENV HADOOP_VERSION="3.1.2" +RUN wget http://mirrors.hust.edu.cn/apache/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz +RUN tar zxf hadoop-${HADOOP_VERSION}.tar.gz +RUN ln -s 
hadoop-${HADOOP_VERSION} hadoop-current +RUN rm hadoop-${HADOOP_VERSION}.tar.gz ``` Build and push to your own docker registry: Use ```docker build ... ``` and ```docker push ...``` to finish this step. @@ -90,12 +96,12 @@ Build and push to your own docker registry: Use ```docker build ... ``` and ```d We provided following examples for you to build tensorflow docker images. -For Tensorflow 1.8.0 (Precompiled to CUDA 9.x) +For Tensorflow 1.13.1 (Precompiled to CUDA 10.x) -- *docker/base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0*: Tensorflow 1.8.0 supports CPU only. -- *docker/with-cifar10-models/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0*: Tensorflow 1.8.0 supports CPU only, and included models -- *docker/base/ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0*: Tensorflow 1.8.0 supports GPU, which is prebuilt to CUDA9. -- *docker/with-cifar10-models/ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.8.0*: Tensorflow 1.8.0 supports GPU, which is prebuilt to CUDA9, with models. +- *docker/base/ubuntu-16.04/Dockerfile.cpu.tf_1.13.1*: Tensorflow 1.13.1 supports CPU only. +- *docker/with-cifar10-models/ubuntu-16.04/Dockerfile.cpu.tf_1.13.1*: Tensorflow 1.13.1 supports CPU only, and included models +- *docker/base/ubuntu-16.04/Dockerfile.gpu.tf_1.13.1*: Tensorflow 1.13.1 supports GPU, which is prebuilt to CUDA10. +- *docker/with-cifar10-models/ubuntu-16.04/Dockerfile.gpu.tf_1.13.1*: Tensorflow 1.13.1 supports GPU, which is prebuilt to CUDA10, with models. ## Build Docker images @@ -103,15 +109,15 @@ For Tensorflow 1.8.0 (Precompiled to CUDA 9.x) Under `docker/` directory, run `build-all.sh` to build Docker images. It will build following images: -- `tf-1.8.0-gpu-base:0.0.1` for base Docker image which includes Hadoop, Tensorflow, GPU base libraries. -- `tf-1.8.0-gpu-base:0.0.1` for base Docker image which includes Hadoop. Tensorflow. -- `tf-1.8.0-gpu:0.0.1` which includes cifar10 model -- `tf-1.8.0-cpu:0.0.1` which inclues cifar10 model (cpu only). 
+- `tf-1.13.1-gpu-base:0.0.1` for base Docker image which includes Hadoop, Tensorflow, GPU base libraries. +- `tf-1.13.1-cpu-base:0.0.1` for base Docker image which includes Hadoop, Tensorflow. +- `tf-1.13.1-gpu:0.0.1` which includes cifar10 model +- `tf-1.13.1-cpu:0.0.1` which includes cifar10 model (cpu only). ### Use prebuilt images (No liability) You can also use prebuilt images for convenience: -- hadoopsubmarine/tf-1.8.0-gpu:0.0.1 -- hadoopsubmarine/tf-1.8.0-cpu:0.0.1 +- hadoopsubmarine/tf-1.13.1-gpu:0.0.1 +- hadoopsubmarine/tf-1.13.1-cpu:0.0.1