SUBMARINE-44. Upgrade Docker image's tensorflow version to 1.13.1. Contributed by Zac Zhou.

This commit is contained in:
Zhankun Tang 2019-04-25 17:56:31 +08:00
parent 0b3d41bdee
commit b5dcf64f97
20 changed files with 395 additions and 457 deletions

View File

@ -0,0 +1,71 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Base image for the TensorFlow 1.13.1 CPU examples (Python 2.7 on Ubuntu 16.04).
# Build order: OS/TF build dependencies -> Kerberos client -> pip ->
# Python scientific stack -> TensorFlow wheel -> git -> Hadoop -> Java 8 -> locale.
FROM ubuntu:16.04

# Pick up some TF dependencies, plus a few debugging/network utilities.
# NOTE(review): --allow-unauthenticated turns off APT signature verification;
# it looks unnecessary on a stock ubuntu:16.04 base - confirm and drop if possible.
RUN apt-get update && apt-get install -y --allow-downgrades --no-install-recommends \
        --allow-change-held-packages --allow-unauthenticated \
        build-essential libfreetype6-dev libpng12-dev \
        libzmq3-dev pkg-config python python-dev \
        rsync software-properties-common curl unzip wget grep sed vim iputils-ping net-tools gdb python2.7-dbg tzdata && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Kerberos client packages. DEBIAN_FRONTEND is exported only inside this RUN so
# the install cannot block on a prompt and the setting does not leak into the
# runtime environment.
RUN export DEBIAN_FRONTEND=noninteractive && apt-get update && apt-get install -yq --no-install-recommends \
        krb5-user libpam-krb5 && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Bootstrap pip. The image runs Python 2.7, which the top-level get-pip.py no
# longer supports, so fetch the script that is published for 2.7.
RUN wget https://bootstrap.pypa.io/pip/2.7/get-pip.py && \
    python get-pip.py && \
    rm get-pip.py

RUN echo "Install python related packages" && \
    apt-get update && \
    apt-get install -y --no-install-recommends gfortran \
        # numerical/algebra packages
        libblas-dev libatlas-dev liblapack-dev \
        # font, image for matplotlib
        libpng-dev libxft-dev \
        # for tkinter
        python-tk libxml2-dev libxslt-dev zlib1g-dev && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# "scikit-learn" is the real distribution name; "sklearn" on PyPI is only a
# deprecated alias for it.
RUN pip --no-cache-dir install Pillow h5py ipykernel jupyter matplotlib numpy pandas scipy scikit-learn && \
    python -m ipykernel.kernelspec

# Install TensorFlow CPU version (wheel fetched over HTTPS).
ENV TENSORFLOW_VERSION="1.13.1"
RUN pip --no-cache-dir install \
    https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-${TENSORFLOW_VERSION}-cp27-none-linux_x86_64.whl

RUN apt-get update && apt-get install -y --no-install-recommends git && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Install hadoop.
# Download, unpack, symlink and delete the tarball in ONE layer: removing the
# archive in a later RUN would leave it stored in the earlier layer.
# The Apache archive keeps every release, whereas regional mirrors only carry
# the current ones. No WORKDIR is set, so the tree ends up in the build's
# default directory as hadoop-${HADOOP_VERSION} with a hadoop-current symlink.
ENV HADOOP_VERSION="3.1.2"
RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz && \
    tar zxf hadoop-${HADOOP_VERSION}.tar.gz && \
    ln -s hadoop-${HADOOP_VERSION} hadoop-current && \
    rm hadoop-${HADOOP_VERSION}.tar.gz

# Install Java 8. JAVA_HOME points at the directory the openjdk-8-jdk package
# is expected to populate on amd64.
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
RUN echo "Install java8" && \
    apt-get update && \
    apt-get install -y --no-install-recommends openjdk-8-jdk && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Set the locale to fix bash warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8)
RUN apt-get update && apt-get install -y --no-install-recommends locales && \
    locale-gen en_US.UTF-8 && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

View File

@ -1,69 +0,0 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TensorFlow 1.8.0 CPU image: Ubuntu 16.04 + Python 2 + pip packages +
# TensorFlow wheel + git + OpenJDK 8 + an unpacked Hadoop 3.1.1 tree.
FROM ubuntu:16.04
LABEL maintainer="Craig Citro <craigcitro@google.com>"
# Pick up some TF dependencies
# (apt lists are removed in the same layer that fetched them)
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
curl \
libfreetype6-dev \
libpng12-dev \
libzmq3-dev \
pkg-config \
python \
python-dev \
rsync \
software-properties-common \
unzip \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Bootstrap pip with get-pip.py and delete the installer script in the same layer.
RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
python get-pip.py && \
rm get-pip.py
# Python packages used by the examples, followed by a run of the
# ipykernel.kernelspec module.
RUN pip --no-cache-dir install \
Pillow \
h5py \
ipykernel \
jupyter \
matplotlib \
numpy \
pandas \
scipy \
sklearn \
&& \
python -m ipykernel.kernelspec
# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
# These lines will be edited automatically by parameterized_docker_build.sh. #
# COPY _PIP_FILE_ /
# RUN pip --no-cache-dir install /_PIP_FILE_
# RUN rm -f /_PIP_FILE_
# Install TensorFlow CPU version from central repo
RUN pip --no-cache-dir install \
http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl
# git, then OpenJDK 8 and wget, each in its own layer.
# NOTE(review): unlike the first apt step, these two do not clean the apt
# caches/lists afterwards.
RUN apt-get update && apt-get install git -y
RUN apt-get update && apt-get install -y openjdk-8-jdk wget
# Download and unpack Hadoop 3.1.1 into the build's working directory.
# NOTE(review): no step shown here removes the downloaded tarball.
RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.1/hadoop-3.1.1.tar.gz
RUN tar zxf hadoop-3.1.1.tar.gz

View File

@ -0,0 +1,85 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Base image for the TensorFlow 1.13.1 GPU examples (Python 2.7, CUDA 10.0,
# cuDNN 7 on Ubuntu 16.04).
# Build order: OS/TF/CUDA dependencies -> TensorRT -> Kerberos client -> pip ->
# Python scientific stack -> TensorFlow GPU wheel -> git -> Hadoop -> Java 8 -> locale.
FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04

# Pick up some TF dependencies, the CUDA 10.0 libraries and a pinned
# libcudnn7 build, plus a few debugging/network utilities.
# NOTE(review): --allow-unauthenticated turns off APT signature verification;
# confirm whether it is still required for the packages installed here.
RUN apt-get update && apt-get install -y --allow-downgrades --no-install-recommends \
        --allow-change-held-packages --allow-unauthenticated \
        build-essential libfreetype6-dev libpng12-dev \
        libzmq3-dev pkg-config python python-dev \
        rsync software-properties-common curl unzip wget grep sed vim \
        iputils-ping net-tools gdb python2.7-dbg tzdata \
        cuda-command-line-tools-10-0 cuda-cublas-10-0 \
        cuda-cufft-10-0 cuda-curand-10-0 cuda-cusolver-10-0 \
        cuda-cusparse-10-0 libcudnn7=7.4.1.5-1+cuda10.0 && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Install TensorRT: the first install adds the TensorRT repo package, the
# second `apt-get update` picks up that repo before libnvinfer5 is installed.
RUN apt-get update && \
    apt-get install -y --allow-unauthenticated --no-install-recommends \
        nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
        libnvinfer5=5.0.2-1+cuda10.0 && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Kerberos client packages. DEBIAN_FRONTEND is exported only inside this RUN so
# the install cannot block on a prompt and the setting does not leak into the
# runtime environment.
RUN export DEBIAN_FRONTEND=noninteractive && apt-get update && \
    apt-get install -yq --no-install-recommends krb5-user libpam-krb5 && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Bootstrap pip. The image runs Python 2.7, which the top-level get-pip.py no
# longer supports, so fetch the script that is published for 2.7.
RUN wget https://bootstrap.pypa.io/pip/2.7/get-pip.py && \
    python get-pip.py && \
    rm get-pip.py

RUN echo "Install python related packages" && \
    apt-get update && \
    apt-get install -y --no-install-recommends gfortran \
        # numerical/algebra packages
        libblas-dev libatlas-dev liblapack-dev \
        # font, image for matplotlib
        libpng-dev libxft-dev \
        # for tkinter
        python-tk libxml2-dev libxslt-dev zlib1g-dev && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# "scikit-learn" is the real distribution name; "sklearn" on PyPI is only a
# deprecated alias for it.
RUN pip --no-cache-dir install Pillow h5py ipykernel jupyter matplotlib numpy pandas scipy scikit-learn && \
    python -m ipykernel.kernelspec

# Install TensorFlow GPU version (wheel fetched over HTTPS).
ENV TENSORFLOW_VERSION="1.13.1"
RUN pip --no-cache-dir install \
    https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-${TENSORFLOW_VERSION}-cp27-none-linux_x86_64.whl

RUN apt-get update && apt-get install -y --no-install-recommends git && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Install hadoop.
# Download, unpack, symlink and delete the tarball in ONE layer: removing the
# archive in a later RUN would leave it stored in the earlier layer.
# The Apache archive keeps every release, whereas regional mirrors only carry
# the current ones. No WORKDIR is set, so the tree ends up in the build's
# default directory as hadoop-${HADOOP_VERSION} with a hadoop-current symlink.
ENV HADOOP_VERSION="3.1.2"
RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz && \
    tar zxf hadoop-${HADOOP_VERSION}.tar.gz && \
    ln -s hadoop-${HADOOP_VERSION} hadoop-current && \
    rm hadoop-${HADOOP_VERSION}.tar.gz

# Install Java 8. JAVA_HOME points at the directory the openjdk-8-jdk package
# is expected to populate on amd64.
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
RUN echo "Install java8" && \
    apt-get update && \
    apt-get install -y --no-install-recommends openjdk-8-jdk && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Set the locale to fix bash warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8)
RUN apt-get update && apt-get install -y --no-install-recommends locales && \
    locale-gen en_US.UTF-8 && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

View File

@ -1,67 +0,0 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TensorFlow 1.8.0 GPU image: CUDA 9.0 / cuDNN 7 on Ubuntu 16.04 with Python 2,
# the TensorFlow GPU wheel, git, OpenJDK 8 and an unpacked Hadoop 3.1.0 tree.
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04

# OS-level build dependencies for TensorFlow together with the CUDA 9.0
# libraries and a pinned libcudnn7 build; apt caches are dropped in this layer.
RUN apt-get update \
 && apt-get install -y --allow-downgrades --no-install-recommends \
      build-essential cuda-command-line-tools-9-0 \
      cuda-cublas-9-0 cuda-cufft-9-0 cuda-curand-9-0 \
      cuda-cusolver-9-0 cuda-cusparse-9-0 \
      curl libcudnn7=7.0.5.15-1+cuda9.0 \
      libfreetype6-dev libpng12-dev libzmq3-dev \
      pkg-config python python-dev \
      rsync software-properties-common unzip \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

# Fetch the pip bootstrap script, run it, and remove it again.
RUN curl -O https://bootstrap.pypa.io/get-pip.py \
 && python get-pip.py \
 && rm get-pip.py

# Python packages for the examples, then run the ipykernel.kernelspec module.
RUN pip --no-cache-dir install \
      Pillow h5py ipykernel jupyter matplotlib \
      numpy pandas scipy sklearn \
 && python -m ipykernel.kernelspec

# TensorFlow 1.8.0 GPU wheel for CPython 2.7.
RUN pip --no-cache-dir install http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp27-none-linux_x86_64.whl

# git, then the Java 8 JDK and wget, each kept in its own layer.
RUN apt-get update \
 && apt-get install -y git
RUN apt-get update \
 && apt-get install -y openjdk-8-jdk wget

# Download Hadoop 3.1.0 and unpack it in the build's working directory.
RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz
RUN tar zxf hadoop-3.1.0.tar.gz

View File

@ -21,12 +21,12 @@ set -e
cd base/ubuntu-16.04
docker build . -f Dockerfile.cpu.tf_1.8.0 -t tf-1.8.0-cpu-base:0.0.1
docker build . -f Dockerfile.gpu.tf_1.8.0 -t tf-1.8.0-gpu-base:0.0.1
docker build . -f Dockerfile.cpu.tf_1.13.1 -t tf-1.13.1-cpu-base:0.0.1
docker build . -f Dockerfile.gpu.tf_1.13.1 -t tf-1.13.1-gpu-base:0.0.1
echo "Finished building base images"
cd ../../with-cifar10-models/ubuntu-16.04
docker build . -f Dockerfile.cpu.tf_1.8.0 -t tf-1.8.0-cpu:0.0.1
docker build . -f Dockerfile.gpu.tf_1.8.0 -t tf-1.8.0-gpu:0.0.1
docker build . -f Dockerfile.cpu.tf_1.13.1 -t tf-1.13.1-cpu:0.0.1
docker build . -f Dockerfile.gpu.tf_1.13.1 -t tf-1.13.1-gpu:0.0.1

View File

@ -14,9 +14,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
FROM tf-1.8.0-cpu-base:0.0.1
FROM tf-1.13.1-cpu-base:0.0.1
# Include models
RUN mkdir /test
ADD cifar10_estimator_tf_1.8.0 /test/cifar10_estimator
ADD cifar10_estimator_tf_1.13.1 /test/cifar10_estimator
RUN chown -R nobody /test

View File

@ -14,9 +14,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
FROM tf-1.8.0-gpu-base:0.0.1
FROM tf-1.13.1-gpu-base:0.0.1
# Include models
RUN mkdir /test
ADD cifar10_estimator_tf_1.8.0 /test/cifar10_estimator
ADD cifar10_estimator_tf_1.13.1 /test/cifar10_estimator
RUN chown -R nobody /test

View File

@ -75,7 +75,7 @@ class Cifar10DataSet(object):
# Parse records.
dataset = dataset.map(
self.parser)
self.parser, num_parallel_calls=batch_size)
# Potentially shuffle records.
if self.subset == 'train':

View File

@ -344,7 +344,7 @@ def get_experiment_fn(data_dir,
train_steps = hparams.train_steps
eval_steps = num_eval_examples // hparams.eval_batch_size
classifier = tf.estimator.Estimator(
model_fn=get_model_fn(num_gpus, variable_strategy,
run_config.num_worker_replicas or 1),
@ -483,7 +483,7 @@ if __name__ == '__main__':
type=str,
default=None,
help="""\
If not set, the data format best for the training device is used.
If not set, the data format best for the training device is used.
Allowed values: channels_first (NCHW) channels_last (NHWC).\
""")
parser.add_argument(

View File

@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import collections
import six
@ -29,7 +28,7 @@ from tensorflow.contrib.learn.python.learn import run_config
# TODO(b/64848083) Remove once uid bug is fixed
class RunConfig(tf.contrib.learn.RunConfig):
class RunConfig(tf.contrib.learn.RunConfig):
def uid(self, whitelist=None):
"""Generates a 'Unique Identifier' based on all internal fields.
Caller should use the uid string to check `RunConfig` instance integrity
@ -60,7 +59,7 @@ class RunConfig(tf.contrib.learn.RunConfig):
key=lambda t: t[0])
)
return ', '.join(
'%s=%r' % (k, v) for (k, v) in six.iteritems(ordered_state))
'%s=%r' % (k, v) for (k, v) in six.iteritems(ordered_state))
class ExamplesPerSecondHook(session_run_hook.SessionRunHook):

View File

@ -25,6 +25,7 @@ from __future__ import print_function
import argparse
import os
import sys
import tarfile
from six.moves import cPickle as pickle
@ -63,7 +64,10 @@ def _get_file_names():
def read_pickle_from_file(filename):
with tf.gfile.Open(filename, 'rb') as f:
data_dict = pickle.load(f)
if sys.version_info >= (3, 0):
data_dict = pickle.load(f, encoding='bytes')
else:
data_dict = pickle.load(f)
return data_dict
@ -73,8 +77,8 @@ def convert_to_tfrecord(input_files, output_file):
with tf.python_io.TFRecordWriter(output_file) as record_writer:
for input_file in input_files:
data_dict = read_pickle_from_file(input_file)
data = data_dict['data']
labels = data_dict['labels']
data = data_dict[b'data']
labels = data_dict[b'labels']
num_entries_in_batch = len(labels)
for i in range(num_entries_in_batch):
example = tf.train.Example(features=tf.train.Features(

View File

@ -24,8 +24,8 @@ The operating system and kernel versions we have tested are as shown in the foll
| Environment | Version |
| ------ | ------ |
| Operating System | centos-release-7-3.1611.el7.centos.x86_64 |
| Kernal | 3.10.0-514.el7.x86_64 |
| Operating System | centos-release-7-5.1804.el7.centos.x86_64 |
| Kernel | 3.10.0-862.el7.x86_64 |
### User & Group
@ -62,8 +62,8 @@ yum install gcc make g++
# Approach 1
yum install kernel-devel-$(uname -r) kernel-headers-$(uname -r)
# Approach 2
wget http://vault.centos.org/7.3.1611/os/x86_64/Packages/kernel-headers-3.10.0-514.el7.x86_64.rpm
rpm -ivh kernel-headers-3.10.0-514.el7.x86_64.rpm
wget http://vault.centos.org/7.5.1804/os/x86_64/Packages/kernel-headers-3.10.0-862.el7.x86_64.rpm
rpm -ivh kernel-headers-3.10.0-862.el7.x86_64.rpm
```
### GPU Servers (Only for Nvidia GPU equipped nodes)
@ -165,26 +165,43 @@ https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html
### Docker Installation
We recommend to use Docker version >= 1.12.5, following steps are just for your reference. You can always to choose other approaches to install Docker.
The following steps show how to install docker 18.06.1.ce. You can choose other approaches to install Docker.
```
yum -y update
yum -y install yum-utils
yum-config-manager --add-repo https://yum.dockerproject.org/repo/main/centos/7
yum -y update
# Remove old version docker
sudo yum remove docker \
docker-client \
docker-client-latest \
docker-common \
docker-latest \
docker-latest-logrotate \
docker-logrotate \
docker-engine
# Show available packages
yum search --showduplicates docker-engine
# Docker version
export DOCKER_VERSION="18.06.1.ce"
# Setup the repository
sudo yum install -y yum-utils \
device-mapper-persistent-data \
lvm2
sudo yum-config-manager \
--add-repo \
https://download.docker.com/linux/centos/docker-ce.repo
# Install docker 1.12.5
yum -y --nogpgcheck install docker-engine-1.12.5*
# Check docker version
yum list docker-ce --showduplicates | sort -r
# Install docker with specified DOCKER_VERSION
sudo yum install -y docker-ce-${DOCKER_VERSION} docker-ce-cli-${DOCKER_VERSION} containerd.io
# Start docker
systemctl start docker
chown hadoop:netease /var/run/docker.sock
chown hadoop:netease /usr/bin/docker
```
Referencehttps://docs.docker.com/cs-engine/1.12/
Reference: https://docs.docker.com/install/linux/docker-ce/centos/
### Docker Configuration
@ -208,46 +225,40 @@ sudo systemctl restart docker
### Docker EE version
### Check docker version
```bash
$ docker version
Client:
Version: 1.12.5
API version: 1.24
Go version: go1.6.4
Git commit: 7392c3b
Built: Fri Dec 16 02:23:59 2016
Version: 18.06.1-ce
API version: 1.38
Go version: go1.10.3
Git commit: e68fc7a
Built: Tue Aug 21 17:23:03 2018
OS/Arch: linux/amd64
Experimental: false
Server:
Version: 1.12.5
API version: 1.24
Go version: go1.6.4
Git commit: 7392c3b
Built: Fri Dec 16 02:23:59 2016
Version: 18.06.1-ce
API version: 1.38 (minimum version 1.12)
Go version: go1.10.3
Git commit: e68fc7a
Built: Tue Aug 21 17:23:03 2018
OS/Arch: linux/amd64
Experimental: false
```
### Nvidia-docker Installation (Only for Nvidia GPU equipped nodes)
Submarine depends on nvidia-docker 1.0 version
Submarine supports nvidia-docker V2.
```
wget -P /tmp https://github.com/NVIDIA/nvidia-docker/releases/download/v1.0.1/nvidia-docker-1.0.1-1.x86_64.rpm
sudo rpm -i /tmp/nvidia-docker*.rpm
# Start nvidia-docker
sudo systemctl start nvidia-docker
# Check nvidia-docker status
systemctl status nvidia-docker
# Check nvidia-docker log
journalctl -u nvidia-docker
# Test nvidia-docker-plugin
curl http://localhost:3476/v1.0/docker/cli
# Add the package repositories
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-container-runtime/$distribution/nvidia-container-runtime.repo | \
sudo tee /etc/yum.repos.d/nvidia-container-runtime.repo
sudo yum install -y nvidia-docker2-2.0.3-1.docker18.06.1.ce
```
According to `nvidia-driver` version, add folders under the path of `/var/lib/nvidia-docker/volumes/nvidia_driver/`
@ -264,7 +275,7 @@ cp /usr/lib64/libcuda* /var/lib/nvidia-docker/volumes/nvidia_driver/390.87/lib64
cp /usr/lib64/libnvidia* /var/lib/nvidia-docker/volumes/nvidia_driver/390.87/lib64
# Test with nvidia-smi
nvidia-docker run --rm nvidia/cuda:9.0-devel nvidia-smi
nvidia-docker run --rm nvidia/cuda:10.0-devel nvidia-smi
```
Test docker, nvidia-docker, nvidia-driver installation
@ -283,89 +294,17 @@ import tensorflow as tf
tf.test.is_gpu_available()
```
[The way to uninstall nvidia-docker 1.0](https://github.com/nvidia/nvidia-docker/wiki/Installation-(version-2.0))
The way to uninstall nvidia-docker V2
```
sudo yum remove -y nvidia-docker2-2.0.3-1.docker18.06.1.ce
```
Reference:
https://github.com/NVIDIA/nvidia-docker/tree/1.0
https://github.com/NVIDIA/nvidia-docker
### Tensorflow Image
There is no need to install CUDNN and CUDA on the servers, because CUDNN and CUDA can be added in the docker images. we can get basic docker images by following WriteDockerfile.md.
The basic Dockerfile doesn't support kerberos security. if you need kerberos, you can get write a Dockerfile like this
```shell
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
# Pick up some TF dependencies
RUN apt-get update && apt-get install -y --allow-downgrades --no-install-recommends \
build-essential \
cuda-command-line-tools-9-0 \
cuda-cublas-9-0 \
cuda-cufft-9-0 \
cuda-curand-9-0 \
cuda-cusolver-9-0 \
cuda-cusparse-9-0 \
curl \
libcudnn7=7.0.5.15-1+cuda9.0 \
libfreetype6-dev \
libpng12-dev \
libzmq3-dev \
pkg-config \
python \
python-dev \
rsync \
software-properties-common \
unzip \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN export DEBIAN_FRONTEND=noninteractive && apt-get update && apt-get install -yq krb5-user libpam-krb5 && apt-get clean
RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
python get-pip.py && \
rm get-pip.py
RUN pip --no-cache-dir install \
Pillow \
h5py \
ipykernel \
jupyter \
matplotlib \
numpy \
pandas \
scipy \
sklearn \
&& \
python -m ipykernel.kernelspec
# Install TensorFlow GPU version.
RUN pip --no-cache-dir install \
http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp27-none-linux_x86_64.whl
RUN apt-get update && apt-get install git -y
RUN apt-get update && apt-get install -y openjdk-8-jdk wget
# Downloadhadoop-3.1.1.tar.gz
RUN wget http://mirrors.hust.edu.cn/apache/hadoop/common/hadoop-3.1.1/hadoop-3.1.1.tar.gz
RUN tar zxf hadoop-3.1.1.tar.gz
RUN mv hadoop-3.1.1 hadoop-3.1.0
# Download jdk which supports kerberos
RUN wget -qO jdk8.tar.gz 'http://${kerberos_jdk_url}/jdk-8u152-linux-x64.tar.gz'
RUN tar xzf jdk8.tar.gz -C /opt
RUN mv /opt/jdk* /opt/java
RUN rm jdk8.tar.gz
RUN update-alternatives --install /usr/bin/java java /opt/java/bin/java 100
RUN update-alternatives --install /usr/bin/javac javac /opt/java/bin/javac 100
ENV JAVA_HOME /opt/java
ENV PATH $PATH:$JAVA_HOME/bin
```
There is no need to install CUDNN and CUDA on the servers, because both can be provided by the docker images. Basic docker images can be built by following WriteDockerfile.md.
### Test tensorflow in a docker container
@ -515,12 +454,12 @@ where ${dfs_name_service} is the hdfs name service you use
```bash
./bin/yarn jar /home/hadoop/hadoop-current/share/hadoop/yarn/hadoop-yarn-submarine-3.2.0-SNAPSHOT.jar job run \
--env DOCKER_JAVA_HOME=/opt/java \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 --name standalone-tf \
--docker_image dockerfile-cpu-tf1.8.0-with-models \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-current --name standalone-tf \
--docker_image tf-1.13.1-cpu:0.0.1 \
--input_path hdfs://${dfs_name_service}/tmp/cifar-10-data \
--checkpoint_path hdfs://${dfs_name_service}/user/hadoop/tf-checkpoint \
--worker_resources memory=4G,vcores=2 --verbose \
--worker_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=hdfs://${dfs_name_service}/tmp/cifar-10-data --job-dir=hdfs://${dfs_name_service}/tmp/cifar-10-jobdir --train-steps=500 --eval-batch-size=16 --train-batch-size=16 --num-gpus=0"
--worker_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --train-steps=500 --eval-batch-size=16 --train-batch-size=16 --num-gpus=0"
```
### Distributed Mode
@ -537,17 +476,17 @@ where ${dfs_name_service} is the hdfs name service you use
```bash
./bin/yarn jar /home/hadoop/hadoop-current/share/hadoop/yarn/hadoop-yarn-submarine-3.2.0-SNAPSHOT.jar job run \
--env DOCKER_JAVA_HOME=/opt/java \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 --name distributed-tf \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-current --name distributed-tf \
--env YARN_CONTAINER_RUNTIME_DOCKER_CONTAINER_NETWORK=calico-network \
--docker_image dockerfile-cpu-tf1.8.0-with-models \
--docker_image tf-1.13.1-cpu:0.0.1 \
--input_path hdfs://${dfs_name_service}/tmp/cifar-10-data \
--checkpoint_path hdfs://${dfs_name_service}/user/hadoop/tf-distributed-checkpoint \
--worker_resources memory=4G,vcores=2 --verbose \
--num_ps 1 \
--ps_resources memory=4G,vcores=2 \
--ps_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=hdfs://${dfs_name_service}/tmp/cifar-10-data --job-dir=hdfs://${dfs_name_service}/tmp/cifar-10-jobdir --num-gpus=0" \
--ps_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --num-gpus=0" \
--num_workers 4 \
--worker_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=hdfs://${dfs_name_service}/tmp/cifar-10-data --job-dir=hdfs://${dfs_name_service}/tmp/cifar-10-jobdir --train-steps=500 --eval-batch-size=16 --train-batch-size=16 --sync --num-gpus=0"
--worker_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --train-steps=500 --eval-batch-size=16 --train-batch-size=16 --sync --num-gpus=0"
```
@ -589,6 +528,11 @@ Add configurations in yarn-site.xml
<name>yarn.nodemanager.resource-plugins</name>
<value>yarn.io/gpu</value>
</property>
<!--Use nvidia docker v2-->
<property>
<name>yarn.nodemanager.resource-plugins.gpu.docker-plugin</name>
<value>nvidia-docker-v2</value>
</property>
</configuration>
```
@ -603,6 +547,8 @@ Add configurations in container-executor.cfg
docker.allowed.volume-drivers=/usr/bin/nvidia-docker
docker.allowed.devices=/dev/nvidiactl,/dev/nvidia-uvm,/dev/nvidia-uvm-tools,/dev/nvidia1,/dev/nvidia0
docker.allowed.ro-mounts=nvidia_driver_<version>
# Use nvidia docker v2
docker.allowed.runtimes=nvidia
[gpu]
module.enabled=true
@ -613,3 +559,21 @@ Add configurations in container-executor.cfg
root=/sys/fs/cgroup
yarn-hierarchy=/hadoop-yarn
```
### Run a distributed tensorflow gpu job
```bash
./yarn jar /home/hadoop/hadoop-current/share/hadoop/yarn/hadoop-yarn-submarine-3.2.0-SNAPSHOT.jar job run \
--env DOCKER_JAVA_HOME=/opt/java \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-current --name distributed-tf-gpu \
--env YARN_CONTAINER_RUNTIME_DOCKER_CONTAINER_NETWORK=calico-network \
--docker_image tf-1.13.1-gpu:0.0.1 \
--input_path hdfs://${dfs_name_service}/tmp/cifar-10-data \
--checkpoint_path hdfs://${dfs_name_service}/user/hadoop/tf-distributed-checkpoint \
--num_ps 0 \
--ps_resources memory=4G,vcores=2,gpu=0 \
--ps_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --num-gpus=0" \
--worker_resources memory=4G,vcores=2,gpu=1 --verbose \
--num_workers 1 \
--worker_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --train-steps=500 --eval-batch-size=16 --train-batch-size=16 --sync --num-gpus=1"
```

View File

@ -18,12 +18,12 @@
### 操作系统
我们使用的操作系统版本是 centos-release-7-3.1611.el7.centos.x86_64, 内核版本是 3.10.0-514.el7.x86_64 ,应该是最低版本了
我们使用的操作系统版本是 centos-release-7-5.1804.el7.centos.x86_64, 内核版本是 3.10.0-862.el7.x86_64
| Environment | Version |
| ------ | ------ |
| Operating System | centos-release-7-3.1611.el7.centos.x86_64 |
| Kernal | 3.10.0-514.el7.x86_64 |
| Operating System | centos-release-7-5.1804.el7.centos.x86_64 |
| Kernel | 3.10.0-862.el7.x86_64 |
### User & Group
@ -58,8 +58,8 @@ yum install gcc make g++
# 方法一:
yum install kernel-devel-$(uname -r) kernel-headers-$(uname -r)
# 方法二:
wget http://vault.centos.org/7.3.1611/os/x86_64/Packages/kernel-headers-3.10.0-514.el7.x86_64.rpm
rpm -ivh kernel-headers-3.10.0-514.el7.x86_64.rpm
wget http://vault.centos.org/7.5.1804/os/x86_64/Packages/kernel-headers-3.10.0-862.el7.x86_64.rpm
rpm -ivh kernel-headers-3.10.0-862.el7.x86_64.rpm
```
### 检查 GPU 版本
@ -155,23 +155,40 @@ https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html
### 安装 Docker
```
yum -y update
yum -y install yum-utils
yum-config-manager --add-repo https://yum.dockerproject.org/repo/main/centos/7
yum -y update
# Remove old version docker
sudo yum remove docker \
docker-client \
docker-client-latest \
docker-common \
docker-latest \
docker-latest-logrotate \
docker-logrotate \
docker-engine
# 显示 available 的安装包
yum search --showduplicates docker-engine
# Docker version
export DOCKER_VERSION="18.06.1.ce"
# Setup the repository
sudo yum install -y yum-utils \
device-mapper-persistent-data \
lvm2
sudo yum-config-manager \
--add-repo \
https://download.docker.com/linux/centos/docker-ce.repo
# 安装 1.12.5 版本 docker
yum -y --nogpgcheck install docker-engine-1.12.5*
# Check docker version
yum list docker-ce --showduplicates | sort -r
# Install docker with specified DOCKER_VERSION
sudo yum install -y docker-ce-${DOCKER_VERSION} docker-ce-cli-${DOCKER_VERSION} containerd.io
# Start docker
systemctl start docker
chown hadoop:netease /var/run/docker.sock
chown hadoop:netease /usr/bin/docker
```
Referencehttps://docs.docker.com/cs-engine/1.12/
Reference: https://docs.docker.com/install/linux/docker-ce/centos/
### 配置 Docker
@ -195,46 +212,40 @@ sudo systemctl restart docker
### Docker EE version
### 检查 Docker version
```bash
$ docker version
Client:
Version: 1.12.5
API version: 1.24
Go version: go1.6.4
Git commit: 7392c3b
Built: Fri Dec 16 02:23:59 2016
Version: 18.06.1-ce
API version: 1.38
Go version: go1.10.3
Git commit: e68fc7a
Built: Tue Aug 21 17:23:03 2018
OS/Arch: linux/amd64
Experimental: false
Server:
Version: 1.12.5
API version: 1.24
Go version: go1.6.4
Git commit: 7392c3b
Built: Fri Dec 16 02:23:59 2016
Version: 18.06.1-ce
API version: 1.38 (minimum version 1.12)
Go version: go1.10.3
Git commit: e68fc7a
Built: Tue Aug 21 17:23:03 2018
OS/Arch: linux/amd64
Experimental: false
```
### 安装 nvidia-docker
Hadoop-3.2 的 submarine 使用的是 1.0 版本的 nvidia-docker
Hadoop-3.2 的 submarine 已支持 V2 版本的 nvidia-docker
```
wget -P /tmp https://github.com/NVIDIA/nvidia-docker/releases/download/v1.0.1/nvidia-docker-1.0.1-1.x86_64.rpm
sudo rpm -i /tmp/nvidia-docker*.rpm
# 启动 nvidia-docker
sudo systemctl start nvidia-docker
# 查看 nvidia-docker 状态:
systemctl status nvidia-docker
# 查看 nvidia-docker 日志:
journalctl -u nvidia-docker
# 查看 nvidia-docker-plugin 是否正常
curl http://localhost:3476/v1.0/docker/cli
# Add the package repositories
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-container-runtime/$distribution/nvidia-container-runtime.repo | \
sudo tee /etc/yum.repos.d/nvidia-container-runtime.repo
sudo yum install -y nvidia-docker2-2.0.3-1.docker18.06.1.ce
```
`/var/lib/nvidia-docker/volumes/nvidia_driver/` 路径下,根据 `nvidia-driver` 的版本创建文件夹:
@ -251,7 +262,7 @@ cp /usr/lib64/libcuda* /var/lib/nvidia-docker/volumes/nvidia_driver/390.87/lib64
cp /usr/lib64/libnvidia* /var/lib/nvidia-docker/volumes/nvidia_driver/390.87/lib64
# Test nvidia-smi
nvidia-docker run --rm nvidia/cuda:9.0-devel nvidia-smi
nvidia-docker run --rm nvidia/cuda:10.0-devel nvidia-smi
```
测试 docker, nvidia-docker, nvidia-driver 安装
@ -270,11 +281,13 @@ import tensorflow as tf
tf.test.is_gpu_available()
```
卸载 nvidia-docker 1.0 的方法:
https://github.com/nvidia/nvidia-docker/wiki/Installation-(version-2.0)
卸载 nvidia-docker V2 的方法:
```
sudo yum remove -y nvidia-docker2-2.0.3-1.docker18.06.1.ce
```
reference:
https://github.com/NVIDIA/nvidia-docker/tree/1.0
https://github.com/NVIDIA/nvidia-docker
@ -282,79 +295,6 @@ https://github.com/NVIDIA/nvidia-docker/tree/1.0
CUDNN 和 CUDA 其实不需要在物理机上安装，因为 Submarine 中提供了已经包含 CUDNN 和 CUDA 的镜像文件，基础的 Dockerfile 可参见 WriteDockerfile.md。
上述images无法支持kerberos环境如果需要kerberos可以使用如下Dockfile
```shell
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
# Pick up some TF dependencies
RUN apt-get update && apt-get install -y --allow-downgrades --no-install-recommends \
build-essential \
cuda-command-line-tools-9-0 \
cuda-cublas-9-0 \
cuda-cufft-9-0 \
cuda-curand-9-0 \
cuda-cusolver-9-0 \
cuda-cusparse-9-0 \
curl \
libcudnn7=7.0.5.15-1+cuda9.0 \
libfreetype6-dev \
libpng12-dev \
libzmq3-dev \
pkg-config \
python \
python-dev \
rsync \
software-properties-common \
unzip \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN export DEBIAN_FRONTEND=noninteractive && apt-get update && apt-get install -yq krb5-user libpam-krb5 && apt-get clean
RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
python get-pip.py && \
rm get-pip.py
RUN pip --no-cache-dir install \
Pillow \
h5py \
ipykernel \
jupyter \
matplotlib \
numpy \
pandas \
scipy \
sklearn \
&& \
python -m ipykernel.kernelspec
# Install TensorFlow GPU version.
RUN pip --no-cache-dir install \
http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp27-none-linux_x86_64.whl
RUN apt-get update && apt-get install git -y
RUN apt-get update && apt-get install -y openjdk-8-jdk wget
# 下载 hadoop-3.1.1.tar.gz
RUN wget http://mirrors.hust.edu.cn/apache/hadoop/common/hadoop-3.1.1/hadoop-3.1.1.tar.gz
RUN tar zxf hadoop-3.1.1.tar.gz
RUN mv hadoop-3.1.1 hadoop-3.1.0
# 下载支持kerberos的jdk安装包
RUN wget -qO jdk8.tar.gz 'http://${kerberos_jdk_url}/jdk-8u152-linux-x64.tar.gz'
RUN tar xzf jdk8.tar.gz -C /opt
RUN mv /opt/jdk* /opt/java
RUN rm jdk8.tar.gz
RUN update-alternatives --install /usr/bin/java java /opt/java/bin/java 100
RUN update-alternatives --install /usr/bin/javac javac /opt/java/bin/javac 100
ENV JAVA_HOME /opt/java
ENV PATH $PATH:$JAVA_HOME/bin
```
### 测试 TF 环境
创建好 docker 镜像后,需要先手动检查 TensorFlow 是否可以正常使用,避免通过 YARN 调度后出现问题,可以执行以下命令
@ -505,12 +445,12 @@ sudo YARN_LOGFILE=registrydns.log ./yarn-daemon.sh start registrydns
```bash
./bin/yarn jar /home/hadoop/hadoop-current/share/hadoop/yarn/hadoop-yarn-submarine-3.2.0-SNAPSHOT.jar job run \
--env DOCKER_JAVA_HOME=/opt/java \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 --name standalone-tf \
--docker_image dockerfile-cpu-tf1.8.0-with-models \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-current --name standalone-tf \
--docker_image tf-1.13.1-cpu:0.0.1 \
--input_path hdfs://${dfs_name_service}/tmp/cifar-10-data \
--checkpoint_path hdfs://${dfs_name_service}/user/hadoop/tf-checkpoint \
--worker_resources memory=4G,vcores=2 --verbose \
--worker_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=hdfs://${dfs_name_service}/tmp/cifar-10-data --job-dir=hdfs://${dfs_name_service}/tmp/cifar-10-jobdir --train-steps=500 --eval-batch-size=16 --train-batch-size=16 --num-gpus=0"
--worker_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --train-steps=500 --eval-batch-size=16 --train-batch-size=16 --num-gpus=0"
```
@ -531,17 +471,17 @@ sudo YARN_LOGFILE=registrydns.log ./yarn-daemon.sh start registrydns
```bash
./bin/yarn jar /home/hadoop/hadoop-current/share/hadoop/yarn/hadoop-yarn-submarine-3.2.0-SNAPSHOT.jar job run \
--env DOCKER_JAVA_HOME=/opt/java \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 --name distributed-tf \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-current --name distributed-tf \
--env YARN_CONTAINER_RUNTIME_DOCKER_CONTAINER_NETWORK=calico-network \
--docker_image dockerfile-cpu-tf1.8.0-with-models \
--docker_image tf-1.13.1-cpu:0.0.1 \
--input_path hdfs://${dfs_name_service}/tmp/cifar-10-data \
--checkpoint_path hdfs://${dfs_name_service}/user/hadoop/tf-distributed-checkpoint \
--worker_resources memory=4G,vcores=2 --verbose \
--num_ps 1 \
--ps_resources memory=4G,vcores=2 \
--ps_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=hdfs://${dfs_name_service}/tmp/cifar-10-data --job-dir=hdfs://${dfs_name_service}/tmp/cifar-10-jobdir --num-gpus=0" \
--ps_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --num-gpus=0" \
--num_workers 4 \
--worker_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=hdfs://${dfs_name_service}/tmp/cifar-10-data --job-dir=hdfs://${${dfs_name_service}}/tmp/cifar-10-jobdir --train-steps=500 --eval-batch-size=16 --train-batch-size=16 --sync --num-gpus=0"
--worker_launch_cmd "python /test/cifar10_estimator/cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --train-steps=500 --eval-batch-size=16 --train-batch-size=16 --sync --num-gpus=0"
```
@ -583,6 +523,11 @@ resourcemanager 使用的 scheduler 必须是 capacity scheduler在 capacity-
<name>yarn.nodemanager.resource-plugins</name>
<value>yarn.io/gpu</value>
</property>
<!--Use nvidia docker v2-->
<property>
<name>yarn.nodemanager.resource-plugins.gpu.docker-plugin</name>
<value>nvidia-docker-v2</value>
</property>
</configuration>
```
@ -597,6 +542,8 @@ resourcemanager 使用的 scheduler 必须是 capacity scheduler在 capacity-
docker.allowed.volume-drivers=/usr/bin/nvidia-docker
docker.allowed.devices=/dev/nvidiactl,/dev/nvidia-uvm,/dev/nvidia-uvm-tools,/dev/nvidia1,/dev/nvidia0
docker.allowed.ro-mounts=nvidia_driver_375.26
# Use nvidia docker v2
docker.allowed.runtimes=nvidia
[gpu]
module.enabled=true
@ -615,9 +562,9 @@ Distributed-shell + GPU + cgroup
```bash
./yarn jar /home/hadoop/hadoop-current/share/hadoop/yarn/hadoop-yarn-submarine-3.2.0-SNAPSHOT.jar job run \
--env DOCKER_JAVA_HOME=/opt/java \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 --name distributed-tf-gpu \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-current --name distributed-tf-gpu \
--env YARN_CONTAINER_RUNTIME_DOCKER_CONTAINER_NETWORK=calico-network \
--docker_image gpu-cuda9.0-tf1.8.0-with-models \
--docker_image tf-1.13.1-gpu:0.0.1 \
--input_path hdfs://${dfs_name_service}/tmp/cifar-10-data \
--checkpoint_path hdfs://${dfs_name_service}/user/hadoop/tf-distributed-checkpoint \
--num_ps 0 \

View File

@ -22,8 +22,17 @@ Must:
Optional:
- Enable YARN DNS. (When distributed training is required.)
- Enable YARN DNS. (When yarn service runtime is required.)
- Enable GPU on YARN support. (When GPU-based training is required.)
- Docker images for submarine jobs. (When docker container is required.)
```
# Get prebuilt docker images (No liability)
docker pull hadoopsubmarine/tf-1.13.1-gpu:0.0.1
# Or build your own docker images
docker build . -f Dockerfile.gpu.tf_1.13.1 -t tf-1.13.1-gpu-base:0.0.1
```
For more details, please refer to
[How to write Dockerfile for Submarine jobs](WriteDockerfile.html)
## Run jobs
@ -122,7 +131,7 @@ For submarine internal configuration, please create a `submarine.xml` which shou
```
yarn jar path-to/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar job run \
--env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 --name tf-job-001 \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-current --name tf-job-001 \
--docker_image <your-docker-image> \
--input_path hdfs://default/dataset/cifar-10-data \
--checkpoint_path hdfs://default/tmp/cifar-10-jobdir \
@ -153,11 +162,11 @@ See below screenshot:
```
yarn jar hadoop-yarn-applications-submarine-<version>.jar job run \
--name tf-job-001 --docker_image <your docker image> \
--name tf-job-001 --docker_image <your-docker-image> \
--input_path hdfs://default/dataset/cifar-10-data \
--checkpoint_path hdfs://default/tmp/cifar-10-jobdir \
--env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-current \
--num_workers 2 \
--worker_resources memory=8G,vcores=2,gpu=1 --worker_launch_cmd "cmd for worker ..." \
--num_ps 2 \
@ -183,11 +192,11 @@ yarn jar hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar job show --name t
Output looks like:
```
Job Meta Info:
Application Id: application_1532131617202_0005
Input Path: hdfs://default/dataset/cifar-10-data
Checkpoint Path: hdfs://default/tmp/cifar-10-jobdir
Run Parameters: --name tf-job-001 --docker_image wtan/tf-1.8.0-gpu:0.0.3
(... all your commandline before run the job)
Application Id: application_1532131617202_0005
Input Path: hdfs://default/dataset/cifar-10-data
Checkpoint Path: hdfs://default/tmp/cifar-10-jobdir
Run Parameters: --name tf-job-001 --docker_image <your-docker-image>
(... all your commandline before run the job)
```
After that, you can run ```tensorboard --logdir=<checkpoint-path>``` to view Tensorboard of the job.
@ -198,9 +207,9 @@ After that, you can run ```tensorboard --logdir=<checkpoint-path>``` to view Ten
# Cleanup previous service if needed
yarn app -destroy tensorboard-service; \
yarn jar /tmp/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar \
job run --name tensorboard-service --verbose --docker_image wtan/tf-1.8.0-cpu:0.0.3 \
job run --name tensorboard-service --verbose --docker_image <your-docker-image> \
--env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-current \
--num_workers 0 --tensorboard
```
@ -216,3 +225,11 @@ There're two ways to get training job logs, one is from YARN UI (new or old):
![alt text](./images/job-logs-ui.png "Job logs UI")
Or you can use `yarn logs -applicationId <applicationId>` to get logs from CLI
## Build from source code
If you want to build the Submarine project yourself, follow these steps:
- Run 'mvn install -DskipTests' from Hadoop source top level once.
- Navigate to hadoop-submarine folder and run 'mvn clean package'.

View File

@ -57,13 +57,13 @@ Refer to [Write Dockerfile](WriteDockerfile.md) to build a Docker image or use p
```
# Submit a single-worker TensorFlow job (one GPU worker) with TensorBoard enabled.
# Every option line except the last must end with a backslash so the shell
# treats the whole invocation as one command.
yarn jar path/to/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar \
job run --name tf-job-001 --verbose --docker_image hadoopsubmarine/tf-1.8.0-gpu:0.0.1 \
job run --name tf-job-001 --verbose --docker_image tf-1.13.1-gpu:0.0.1 \
--input_path hdfs://default/dataset/cifar-10-data \
--env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-current \
--num_workers 1 --worker_resources memory=8G,vcores=2,gpu=1 \
--worker_launch_cmd "cd /test/models/tutorials/image/cifar10_estimator && python cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --train-steps=10000 --eval-batch-size=16 --train-batch-size=16 --num-gpus=2 --sync" \
--tensorboard --tensorboard_docker_image wtan/tf-1.8.0-cpu:0.0.3
--tensorboard --tensorboard_docker_image tf-1.13.1-cpu:0.0.1
```
Explanations:
@ -75,16 +75,16 @@ Explanations:
```
yarn jar path/to/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar \
job run --name tf-job-001 --verbose --docker_image hadoopsubmarine/tf-1.8.0-gpu:0.0.1 \
job run --name tf-job-001 --verbose --docker_image tf-1.13.1-gpu:0.0.1 \
--input_path hdfs://default/dataset/cifar-10-data \
--env(s) (same as standalone)
--num_workers 2 \
--worker_resources memory=8G,vcores=2,gpu=1 \
--worker_launch_cmd "cd /test/models/tutorials/image/cifar10_estimator && python cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --train-steps=10000 --eval-batch-size=16 --train-batch-size=16 --num-gpus=2 --sync" \
--ps_docker_image wtan/tf-1.8.0-cpu:0.0.3 \
--ps_docker_image tf-1.13.1-cpu:0.0.1 \
--num_ps 1 --ps_resources memory=4G,vcores=2,gpu=0 \
--ps_launch_cmd "cd /test/models/tutorials/image/cifar10_estimator && python cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --num-gpus=0" \
--tensorboard --tensorboard_docker_image wtan/tf-1.8.0-cpu:0.0.3
--tensorboard --tensorboard_docker_image tf-1.13.1-cpu:0.0.1
```
Explanations:

View File

@ -19,10 +19,10 @@ Distributed-shell + GPU + cgroup
```bash
./yarn jar /home/hadoop/hadoop-current/share/hadoop/yarn/hadoop-yarn-submarine-3.2.0-SNAPSHOT.jar job run \
--env DOCKER_JAVA_HOME=/opt/java \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 --name distributed-tf-gpu \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-current --name distributed-tf-gpu \
--env YARN_CONTAINER_RUNTIME_DOCKER_CONTAINER_NETWORK=calico-network \
--worker_docker_image gpu-cuda9.0-tf1.8.0-with-models \
--ps_docker_image dockerfile-cpu-tf1.8.0-with-models \
--worker_docker_image tf-1.13.1-gpu:0.0.1 \
--ps_docker_image tf-1.13.1-cpu:0.0.1 \
--input_path hdfs://${dfs_name_service}/tmp/cifar-10-data \
--checkpoint_path hdfs://${dfs_name_service}/user/hadoop/tf-distributed-checkpoint \
--num_ps 0 \
@ -140,26 +140,7 @@ $ chmod +x find-busy-mnt.sh
$ kill -9 5007
```
### Issue 5: Failed to execute `sudo nvidia-docker run`
```
docker: Error response from daemon: create nvidia_driver_361.42: VolumeDriver.Create: internal error, check logs for details.
See 'docker run --help'.
```
Solution:
```
#check nvidia-docker status
$ systemctl status nvidia-docker
$ journalctl -n -u nvidia-docker
#restart nvidia-docker
systemctl stop nvidia-docker
systemctl start nvidia-docker
```
### Issue 6: Yarn failed to start containers
### Issue 5: Yarn failed to start containers
If the number of GPUs required by applications is larger than the number of GPUs in the cluster, some containers cannot be created.

View File

@ -56,6 +56,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN export DEBIAN_FRONTEND=noninteractive && apt-get update && apt-get install -yq krb5-user libpam-krb5 && apt-get clean
RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
python get-pip.py && \
rm get-pip.py
@ -74,14 +76,18 @@ RUN pip --no-cache-dir install \
python -m ipykernel.kernelspec
RUN pip --no-cache-dir install \
http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl
http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.13.1-cp27-none-linux_x86_64.whl
```
On top of above image, add files, install packages to access HDFS
```
RUN apt-get update && apt-get install -y openjdk-8-jdk wget
RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz
RUN tar zxf hadoop-3.1.0.tar.gz
# Install hadoop
ENV HADOOP_VERSION="3.1.2"
RUN wget http://mirrors.hust.edu.cn/apache/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz
RUN tar zxf hadoop-${HADOOP_VERSION}.tar.gz
RUN ln -s hadoop-${HADOOP_VERSION} hadoop-current
RUN rm hadoop-${HADOOP_VERSION}.tar.gz
```
Build and push to your own docker registry: Use ```docker build ... ``` and ```docker push ...``` to finish this step.
@ -90,12 +96,12 @@ Build and push to your own docker registry: Use ```docker build ... ``` and ```d
We provided following examples for you to build tensorflow docker images.
For Tensorflow 1.8.0 (Precompiled to CUDA 9.x)
For Tensorflow 1.13.1 (Precompiled to CUDA 10.x)
- *docker/base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0*: Tensorflow 1.8.0 supports CPU only.
- *docker/with-cifar10-models/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0*: Tensorflow 1.8.0 supports CPU only, and included models
- *docker/base/ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0*: Tensorflow 1.8.0 supports GPU, which is prebuilt to CUDA9.
- *docker/with-cifar10-models/ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.8.0*: Tensorflow 1.8.0 supports GPU, which is prebuilt to CUDA9, with models.
- *docker/base/ubuntu-16.04/Dockerfile.cpu.tf_1.13.1*: Tensorflow 1.13.1 supports CPU only.
- *docker/with-cifar10-models/ubuntu-16.04/Dockerfile.cpu.tf_1.13.1*: Tensorflow 1.13.1 supports CPU only, and includes models
- *docker/base/ubuntu-16.04/Dockerfile.gpu.tf_1.13.1*: Tensorflow 1.13.1 supports GPU, which is prebuilt to CUDA10.
- *docker/with-cifar10-models/ubuntu-16.04/Dockerfile.gpu.tf_1.13.1*: Tensorflow 1.13.1 supports GPU, which is prebuilt to CUDA10, with models.
## Build Docker images
@ -103,15 +109,15 @@ For Tensorflow 1.8.0 (Precompiled to CUDA 9.x)
Under `docker/` directory, run `build-all.sh` to build Docker images. It will build following images:
- `tf-1.8.0-gpu-base:0.0.1` for base Docker image which includes Hadoop, Tensorflow, GPU base libraries.
- `tf-1.8.0-gpu-base:0.0.1` for base Docker image which includes Hadoop and Tensorflow.
- `tf-1.8.0-gpu:0.0.1` which includes cifar10 model
- `tf-1.8.0-cpu:0.0.1` which includes cifar10 model (cpu only).
- `tf-1.13.1-gpu-base:0.0.1` for base Docker image which includes Hadoop, Tensorflow, GPU base libraries.
- `tf-1.13.1-gpu-base:0.0.1` for base Docker image which includes Hadoop and Tensorflow.
- `tf-1.13.1-gpu:0.0.1` which includes cifar10 model
- `tf-1.13.1-cpu:0.0.1` which includes cifar10 model (cpu only).
### Use prebuilt images
(No liability)
You can also use prebuilt images for convenience:
- hadoopsubmarine/tf-1.8.0-gpu:0.0.1
- hadoopsubmarine/tf-1.8.0-cpu:0.0.1
- hadoopsubmarine/tf-1.13.1-gpu:0.0.1
- hadoopsubmarine/tf-1.13.1-cpu:0.0.1