# Based on the SequenceIQ hadoop-docker project hosted at
# https://github.com/sequenceiq/hadoop-docker, and modified at
# the Apache Software Foundation (ASF).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Creates a pseudo-distributed Hadoop 2.8.5 image with Java 8
FROM centos:7
USER root
# install base utilities, ssh and rsync
RUN yum clean all \
&& rpm --rebuilddb \
&& yum install -y curl which tar sudo openssh-server openssh-clients rsync yum-plugin-ovl \
&& yum clean all \
&& yum update -y libselinux \
&& yum update -y nss \
&& yum clean all
# update libselinux. see https://github.com/sequenceiq/hadoop-docker/issues/14
# update nss. see https://unix.stackexchange.com/questions/280548/curl-doesnt-connect-to-https-while-wget-does-nss-error-12286
# passwordless ssh
RUN ssh-keygen -q -N "" -t dsa -f /etc/ssh/ssh_host_dsa_key
RUN ssh-keygen -q -N "" -t rsa -f /etc/ssh/ssh_host_rsa_key
RUN ssh-keygen -q -N "" -t rsa -f /root/.ssh/id_rsa
RUN cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys
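# hadoop's start-dfs.sh/start-yarn.sh scripts ssh into localhost as root, so the
# generated key pair plus authorized_keys makes those logins non-interactive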
# zulu java 8
COPY setup-zulu-repo.sh /root/setup-zulu-repo.sh
RUN /root/setup-zulu-repo.sh
RUN yum install -y zulu-8
ENV JAVA_HOME /usr/lib/jvm/zulu-8
ENV PATH $PATH:$JAVA_HOME/bin
# hadoop
ARG APACHE_ARCHIVE_MIRROR_HOST=https://archive.apache.org
RUN curl -s ${APACHE_ARCHIVE_MIRROR_HOST}/dist/hadoop/core/hadoop-2.8.5/hadoop-2.8.5.tar.gz | tar -xz -C /usr/local/
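# the mirror can be overridden at build time, e.g. (the host below is only an example):
#   docker build --build-arg APACHE_ARCHIVE_MIRROR_HOST=https://downloads.apache.org .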
RUN cd /usr/local && ln -s ./hadoop-2.8.5 hadoop
ENV HADOOP_PREFIX /usr/local/hadoop
ENV HADOOP_COMMON_HOME /usr/local/hadoop
ENV HADOOP_HDFS_HOME /usr/local/hadoop
ENV HADOOP_MAPRED_HOME /usr/local/hadoop
ENV HADOOP_YARN_HOME /usr/local/hadoop
ENV HADOOP_CONF_DIR /usr/local/hadoop/etc/hadoop
ENV YARN_CONF_DIR $HADOOP_PREFIX/etc/hadoop
RUN sed -i '/^export JAVA_HOME/ s:.*:export JAVA_HOME=/usr/lib/jvm/zulu-8\nexport HADOOP_PREFIX=/usr/local/hadoop\nexport HADOOP_HOME=/usr/local/hadoop\n:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
RUN sed -i '/^export HADOOP_CONF_DIR/ s:.*:export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop/:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
RUN mkdir $HADOOP_PREFIX/input
RUN cp $HADOOP_PREFIX/etc/hadoop/*.xml $HADOOP_PREFIX/input
# pseudo distributed
ADD core-site.xml.template $HADOOP_PREFIX/etc/hadoop/core-site.xml.template
RUN sed s/HOSTNAME/localhost/ /usr/local/hadoop/etc/hadoop/core-site.xml.template > /usr/local/hadoop/etc/hadoop/core-site.xml
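# HOSTNAME is fixed to localhost here for the pseudo-distributed build; the template
# presumably also lets the bootstrap script substitute the container's real hostname at run time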
ADD hdfs-site.xml $HADOOP_PREFIX/etc/hadoop/hdfs-site.xml
ADD mapred-site.xml $HADOOP_PREFIX/etc/hadoop/mapred-site.xml
ADD yarn-site.xml $HADOOP_PREFIX/etc/hadoop/yarn-site.xml
RUN $HADOOP_PREFIX/bin/hdfs namenode -format
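# formatting the namenode at build time bakes an initialized HDFS metadata
# directory into the image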
ADD ssh_config /root/.ssh/config
RUN chmod 600 /root/.ssh/config
RUN chown root:root /root/.ssh/config
# # installing supervisord
# RUN yum install -y python-setuptools
# RUN easy_install pip
# RUN curl https://bitbucket.org/pypa/setuptools/raw/bootstrap/ez_setup.py -o - | python
# RUN pip install supervisor
#
# ADD supervisord.conf /etc/supervisord.conf
ADD bootstrap.sh /etc/bootstrap.sh
RUN chown root:root /etc/bootstrap.sh
RUN chmod 700 /etc/bootstrap.sh
ENV BOOTSTRAP /etc/bootstrap.sh
# working around docker.io build error
RUN ls -la /usr/local/hadoop/etc/hadoop/*-env.sh
RUN chmod +x /usr/local/hadoop/etc/hadoop/*-env.sh
RUN ls -la /usr/local/hadoop/etc/hadoop/*-env.sh
# Copy additional .jars to classpath
RUN cp /usr/local/hadoop/share/hadoop/tools/lib/*.jar /usr/local/hadoop/share/hadoop/common/lib/
# fix sshd's 254 exit code: disable PAM and move sshd to port 2122
RUN sed -i "/^[^#]*UsePAM/ s/.*/#&/" /etc/ssh/sshd_config
RUN echo "UsePAM no" >> /etc/ssh/sshd_config
RUN echo "Port 2122" >> /etc/ssh/sshd_config
# helper script that starts sshd and waits (up to 10s) for it to accept connections on port 2122
RUN echo -e \
'#!/bin/bash\n/usr/sbin/sshd\ntimeout 10 bash -c "until printf \"\" 2>>/dev/null >>/dev/tcp/127.0.0.1/2122; do sleep 0.5; done"' > \
/usr/local/bin/start_sshd && \
chmod a+x /usr/local/bin/start_sshd
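# bring HDFS up once during the build to create /user/root and seed it with the
# hadoop config files as sample input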
RUN start_sshd && $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh && $HADOOP_PREFIX/sbin/start-dfs.sh && $HADOOP_PREFIX/bin/hdfs dfs -mkdir -p /user/root
RUN start_sshd && $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh && $HADOOP_PREFIX/sbin/start-dfs.sh && $HADOOP_PREFIX/bin/hdfs dfs -put $HADOOP_PREFIX/etc/hadoop/ input
CMD ["/etc/bootstrap.sh", "-d"]
# HDFS ports
EXPOSE 50010 50020 50070 50075 50090 8020 9000
# Mapred ports
EXPOSE 10020 19888
# YARN ports
EXPOSE 8030 8031 8032 8033 8040 8042 8088
# Other ports
EXPOSE 49707 2122
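# example usage (the image tag is arbitrary):
#   docker build -t hadoop-docker:2.8.5 .
#   docker run -it -p 50070:50070 -p 8088:8088 hadoop-docker:2.8.5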