HADOOP-7599. Script improvements to setup a secure Hadoop cluster. Contributed by Eric Yang.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1169986 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Devaraj Das 2011-09-13 00:31:29 +00:00
parent 6165875dc6
commit 468775a21d
23 changed files with 1486 additions and 200 deletions

View File

@ -78,6 +78,13 @@
<include>*-site.xml</include> <include>*-site.xml</include>
</includes> </includes>
</fileSet> </fileSet>
<fileSet>
<directory>${basedir}/src/main/packages/templates/conf</directory>
<outputDirectory>/share/hadoop/${hadoop.component}/templates/conf</outputDirectory>
<includes>
<include>*</include>
</includes>
</fileSet>
<fileSet> <fileSet>
<directory>${project.build.directory}</directory> <directory>${project.build.directory}</directory>
<outputDirectory>/share/hadoop/${hadoop.component}</outputDirectory> <outputDirectory>/share/hadoop/${hadoop.component}</outputDirectory>

View File

@ -380,6 +380,9 @@ Release 0.23.0 - Unreleased
HADOOP-7612. Change test-patch to run tests for all nested modules. HADOOP-7612. Change test-patch to run tests for all nested modules.
(tomwhite) (tomwhite)
HADOOP-7599. Script improvements to setup a secure Hadoop cluster
(Eric Yang via ddas)
OPTIMIZATIONS OPTIMIZATIONS
HADOOP-7333. Performance improvement in PureJavaCrc32. (Eric Caspole HADOOP-7333. Performance improvement in PureJavaCrc32. (Eric Caspole

View File

@ -39,6 +39,14 @@ fi
. /lib/lsb/init-functions . /lib/lsb/init-functions
if [ -n "$HADOOP_SECURE_DN_USER" ]; then
DN_USER="root"
IDENT_USER=${HADOOP_SECURE_DN_USER}
else
DN_USER="hdfs"
IDENT_USER=${DN_USER}
fi
# Are we running from init? # Are we running from init?
run_by_init() { run_by_init() {
([ "$previous" ] && [ "$runlevel" ]) || [ "$runlevel" = S ] ([ "$previous" ] && [ "$runlevel" ]) || [ "$runlevel" = S ]
@ -67,13 +75,14 @@ check_privsep_dir() {
} }
export PATH="${PATH:+$PATH:}/usr/sbin:/usr/bin" export PATH="${PATH:+$PATH:}/usr/sbin:/usr/bin"
export HADOOP_PREFIX="/usr"
case "$1" in case "$1" in
start) start)
check_privsep_dir check_privsep_dir
check_for_no_start check_for_no_start
log_daemon_msg "Starting Apache Hadoop Data Node server" "hadoop-datanode" log_daemon_msg "Starting Apache Hadoop Data Node server" "hadoop-datanode"
if start-stop-daemon --start --quiet --oknodo --pidfile ${HADOOP_PID_DIR}/hadoop-hdfs-datanode.pid -c hdfs -x ${HADOOP_PREFIX}/sbin/hadoop-daemon.sh -- --config ${HADOOP_CONF_DIR} start datanode; then if start-stop-daemon --start --quiet --oknodo --pidfile ${HADOOP_PID_DIR}/hadoop-${IDENT_USER}-datanode.pid -c ${DN_USER} -x ${HADOOP_PREFIX}/sbin/hadoop-daemon.sh -- --config ${HADOOP_CONF_DIR} start datanode; then
log_end_msg 0 log_end_msg 0
else else
log_end_msg 1 log_end_msg 1
@ -81,7 +90,7 @@ case "$1" in
;; ;;
stop) stop)
log_daemon_msg "Stopping Apache Hadoop Data Node server" "hadoop-datanode" log_daemon_msg "Stopping Apache Hadoop Data Node server" "hadoop-datanode"
if start-stop-daemon --stop --quiet --oknodo --pidfile ${HADOOP_PID_DIR}/hadoop-hdfs-datanode.pid; then if start-stop-daemon --stop --quiet --oknodo --pidfile ${HADOOP_PID_DIR}/hadoop-${IDENT_USER}-datanode.pid; then
log_end_msg 0 log_end_msg 0
else else
log_end_msg 1 log_end_msg 1
@ -91,9 +100,9 @@ case "$1" in
restart) restart)
check_privsep_dir check_privsep_dir
log_daemon_msg "Restarting Apache Hadoop Data Node server" "hadoop-datanode" log_daemon_msg "Restarting Apache Hadoop Data Node server" "hadoop-datanode"
start-stop-daemon --stop --quiet --oknodo --retry 30 --pidfile ${HADOOP_PID_DIR}/hadoop-hdfs-datanode.pid start-stop-daemon --stop --quiet --oknodo --retry 30 --pidfile ${HADOOP_PID_DIR}/hadoop-${IDENT_USER}-datanode.pid
check_for_no_start log_end_msg check_for_no_start log_end_msg
if start-stop-daemon --start --quiet --oknodo --pidfile ${HADOOP_PID_DIR}/hadoop-hdfs-datanode.pid -c hdfs -x ${HADOOP_PREFIX}/sbin/hadoop-daemon.sh -- --config ${HADOOP_CONF_DIR} start datanode; then if start-stop-daemon --start --quiet --oknodo --pidfile ${HADOOP_PID_DIR}/hadoop-${IDENT_USER}-datanode.pid -c ${DN_USER} -x ${HADOOP_PREFIX}/sbin/hadoop-daemon.sh -- --config ${HADOOP_CONF_DIR} start datanode; then
log_end_msg 0 log_end_msg 0
else else
log_end_msg 1 log_end_msg 1
@ -104,14 +113,14 @@ case "$1" in
check_privsep_dir check_privsep_dir
log_daemon_msg "Restarting Apache Hadoop Data Node server" "hadoop-datanode" log_daemon_msg "Restarting Apache Hadoop Data Node server" "hadoop-datanode"
set +e set +e
start-stop-daemon --stop --quiet --retry 30 --pidfile ${HADOOP_PID_DIR}/hadoop-hdfs-datanode.pid start-stop-daemon --stop --quiet --retry 30 --pidfile ${HADOOP_PID_DIR}/hadoop-${IDENT_USER}-datanode.pid
RET="$?" RET="$?"
set -e set -e
case $RET in case $RET in
0) 0)
# old daemon stopped # old daemon stopped
check_for_no_start log_end_msg check_for_no_start log_end_msg
if start-stop-daemon --start --quiet --oknodo --pidfile ${HADOOP_PID_DIR}/hadoop-hdfs-datanode.pid -c hdfs -x ${HADOOP_PREFIX}/sbin/hadoop-daemon.sh -- --config ${HADOOP_CONF_DIR} start datanode; then if start-stop-daemon --start --quiet --oknodo --pidfile ${HADOOP_PID_DIR}/hadoop-${IDENT_USER}-datanode.pid -c ${DN_USER} -x ${HADOOP_PREFIX}/sbin/hadoop-daemon.sh -- --config ${HADOOP_CONF_DIR} start datanode; then
log_end_msg 0 log_end_msg 0
else else
log_end_msg 1 log_end_msg 1
@ -131,7 +140,7 @@ case "$1" in
;; ;;
status) status)
status_of_proc -p ${HADOOP_PID_DIR}/hadoop-hdfs-datanode.pid ${JAVA_HOME}/bin/java hadoop-datanode && exit 0 || exit $? status_of_proc -p ${HADOOP_PID_DIR}/hadoop-${IDENT_USER}-datanode.pid ${JAVA_HOME}/bin/java hadoop-datanode && exit 0 || exit $?
;; ;;
*) *)

View File

@ -67,6 +67,7 @@ check_privsep_dir() {
} }
export PATH="${PATH:+$PATH:}/usr/sbin:/usr/bin" export PATH="${PATH:+$PATH:}/usr/sbin:/usr/bin"
export HADOOP_PREFIX="/usr"
case "$1" in case "$1" in
start) start)

View File

@ -67,10 +67,11 @@ check_privsep_dir() {
} }
format() { format() {
su -c '${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} namenode -format' hdfs sudo -u hdfs ${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} namenode -format
} }
export PATH="${PATH:+$PATH:}/usr/sbin:/usr/bin" export PATH="${PATH:+$PATH:}/usr/sbin:/usr/bin"
export HADOOP_PREFIX="/usr"
case "$1" in case "$1" in
start) start)

View File

@ -67,6 +67,7 @@ check_privsep_dir() {
} }
export PATH="${PATH:+$PATH:}/usr/sbin:/usr/bin" export PATH="${PATH:+$PATH:}/usr/sbin:/usr/bin"
export HADOOP_PREFIX="/usr"
case "$1" in case "$1" in
start) start)

View File

@ -14,9 +14,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
this="${BASH_SOURCE-$0}"
bin=`dirname "$0"` bin=$(cd -P -- "$(dirname -- "$this")" && pwd -P)
bin=`cd "$bin"; pwd` script="$(basename -- "$this")"
this="$bin/$script"
if [ "$HADOOP_HOME" != "" ]; then if [ "$HADOOP_HOME" != "" ]; then
echo "Warning: \$HADOOP_HOME is deprecated." echo "Warning: \$HADOOP_HOME is deprecated."
@ -29,30 +30,86 @@ usage() {
echo " echo "
usage: $0 <parameters> usage: $0 <parameters>
Require parameter: Require parameter:
-u <username> Create user on HDFS --config /etc/hadoop Location of Hadoop configuration file
-u <username> Create user on HDFS
Optional parameters: Optional parameters:
-h Display this message -h Display this message
--kerberos-realm=KERBEROS.EXAMPLE.COM Set Kerberos realm
--super-user=hdfs Set super user id
--super-user-keytab=/etc/security/keytabs/hdfs.keytab Set super user keytab location
" "
exit 1 exit 1
} }
# Parse script parameters OPTS=$(getopt \
if [ $# != 2 ] ; then -n $0 \
-o '' \
-l 'kerberos-realm:' \
-l 'super-user:' \
-l 'super-user-keytab:' \
-o 'h' \
-o 'u' \
-- "$@")
if [ $? != 0 ] ; then
usage usage
exit 1 exit 1
fi fi
while getopts "hu:" OPTION create_user() {
do if [ "${SETUP_USER}" = "" ]; then
case $OPTION in break
u) fi
SETUP_USER=$2; shift 2 HADOOP_HDFS_USER=${HADOOP_HDFS_USER:-hdfs}
;; export HADOOP_PREFIX
h) export HADOOP_CONF_DIR
export JAVA_HOME
export SETUP_USER=${SETUP_USER}
export SETUP_PATH=/user/${SETUP_USER}
if [ ! "${KERBEROS_REALM}" = "" ]; then
# locate kinit cmd
if [ -e /etc/lsb-release ]; then
KINIT_CMD="/usr/bin/kinit -kt ${HDFS_USER_KEYTAB} ${HADOOP_HDFS_USER}"
else
KINIT_CMD="/usr/kerberos/bin/kinit -kt ${HDFS_USER_KEYTAB} ${HADOOP_HDFS_USER}"
fi
su -c "${KINIT_CMD}" ${HADOOP_HDFS_USER}
fi
su -c "${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} fs -mkdir ${SETUP_PATH}" ${HADOOP_HDFS_USER}
su -c "${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} fs -chown ${SETUP_USER}:${SETUP_USER} ${SETUP_PATH}" ${HADOOP_HDFS_USER}
su -c "${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} fs -chmod 711 ${SETUP_PATH}" ${HADOOP_HDFS_USER}
if [ "$?" == "0" ]; then
echo "User directory has been setup: ${SETUP_PATH}"
fi
}
eval set -- "${OPTS}"
while true; do
case "$1" in
-u)
shift
;;
--kerberos-realm)
KERBEROS_REALM=$2; shift 2
;;
--super-user)
HADOOP_HDFS_USER=$2; shift 2
;;
--super-user-keytab)
HDFS_USER_KEYTAB=$2; shift 2
;;
-h)
usage usage
;; ;;
--) --)
shift ; break while shift; do
SETUP_USER=$1
create_user
done
break
;; ;;
*) *)
echo "Unknown option: $1" echo "Unknown option: $1"
@ -62,15 +119,3 @@ do
esac esac
done done
# Create user directory on HDFS
export SETUP_USER
export SETUP_PATH=/user/${SETUP_USER}
export HADOOP_PREFIX
export HADOOP_CONF_DIR
su -c '${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} fs -mkdir ${SETUP_PATH}' hdfs
su -c '${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} fs -chown ${SETUP_USER}:${SETUP_USER} ${SETUP_PATH}' hdfs
if [ "$?" == "0" ]; then
echo "User directory has been setup: ${SETUP_PATH}"
fi

View File

@ -18,35 +18,60 @@
bin=`dirname "$0"` bin=`dirname "$0"`
bin=`cd "$bin"; pwd` bin=`cd "$bin"; pwd`
if [ "$HADOOP_HOME" != "" ]; then this="${BASH_SOURCE-$0}"
echo "Warning: \$HADOOP_HOME is deprecated." export HADOOP_PREFIX=`dirname "$this"`/..
echo
fi
. "$bin"/../libexec/hadoop-config.sh
usage() { usage() {
echo " echo "
usage: $0 <parameters> usage: $0 <parameters>
Optional parameters: Optional parameters:
--auto Setup automatically --auto Setup path and configuration automatically
--default Generate default config --default Setup configuration as default
--conf-dir=/etc/hadoop Set config directory --conf-dir=/etc/hadoop Set configuration directory
--datanode-dir=/var/lib/hadoop/hdfs/datanode Set datanode directory --datanode-dir=/var/lib/hadoop/hdfs/datanode Set datanode directory
--group=hadoop Set Hadoop group name
-h Display this message -h Display this message
--jobtracker-url=hostname:9001 Set jobtracker url --hdfs-user=hdfs Set HDFS user
--jobtracker-host=hostname Set jobtracker host
--namenode-host=hostname Set namenode host
--secondarynamenode-host=hostname Set secondary namenode host
--kerberos-realm=KERBEROS.EXAMPLE.COM Set Kerberos realm
--kinit-location=/usr/kerberos/bin/kinit Set kinit location
--keytab-dir=/etc/security/keytabs Set keytab directory
--log-dir=/var/log/hadoop Set log directory --log-dir=/var/log/hadoop Set log directory
--hdfs-dir=/var/lib/hadoop/hdfs Set hdfs directory --pid-dir=/var/run/hadoop Set pid directory
--hdfs-dir=/var/lib/hadoop/hdfs Set HDFS directory
--hdfs-user-keytab=/home/hdfs/hdfs.keytab Set HDFS user key tab
--mapred-dir=/var/lib/hadoop/mapred Set mapreduce directory --mapred-dir=/var/lib/hadoop/mapred Set mapreduce directory
--mapreduce-user=mr Set mapreduce user
--mapreduce-user-keytab=/home/mr/hdfs.keytab Set mapreduce user key tab
--namenode-dir=/var/lib/hadoop/hdfs/namenode Set namenode directory --namenode-dir=/var/lib/hadoop/hdfs/namenode Set namenode directory
--namenode-url=hdfs://hostname:9000/ Set namenode url
--replication=3 Set replication factor --replication=3 Set replication factor
--taskscheduler=org.apache.hadoop.mapred.JobQueueTaskScheduler Set task scheduler --taskscheduler=org.apache.hadoop.mapred.JobQueueTaskScheduler Set task scheduler
--datanodes=hostname1,hostname2,... SET the datanodes
--tasktrackers=hostname1,hostname2,... SET the tasktrackers
" "
exit 1 exit 1
} }
check_permission() {
TARGET=$1
OWNER="0"
RESULT=0
while [ "$TARGET" != "/" ]; do
PARENT=`dirname $TARGET`
NAME=`basename $TARGET`
OWNER=`ls -ln $PARENT | grep $NAME| awk '{print $3}'`
if [ "$OWNER" != "0" ]; then
RESULT=1
break
fi
TARGET=`dirname $TARGET`
done
return $RESULT
}
template_generator() { template_generator() {
REGEX='(\$\{[a-zA-Z_][a-zA-Z_0-9]*\})' REGEX='(\$\{[a-zA-Z_][a-zA-Z_0-9]*\})'
cat $1 | cat $1 |
@ -64,17 +89,30 @@ OPTS=$(getopt \
-n $0 \ -n $0 \
-o '' \ -o '' \
-l 'auto' \ -l 'auto' \
-l 'java-home:' \
-l 'conf-dir:' \ -l 'conf-dir:' \
-l 'default' \ -l 'default' \
-l 'group:' \
-l 'hdfs-dir:' \ -l 'hdfs-dir:' \
-l 'namenode-dir:' \ -l 'namenode-dir:' \
-l 'datanode-dir:' \ -l 'datanode-dir:' \
-l 'mapred-dir:' \ -l 'mapred-dir:' \
-l 'namenode-url:' \ -l 'namenode-host:' \
-l 'jobtracker-url:' \ -l 'secondarynamenode-host:' \
-l 'jobtracker-host:' \
-l 'log-dir:' \ -l 'log-dir:' \
-l 'pid-dir:' \
-l 'replication:' \ -l 'replication:' \
-l 'taskscheduler:' \ -l 'taskscheduler:' \
-l 'hdfs-user:' \
-l 'hdfs-user-keytab:' \
-l 'mapreduce-user:' \
-l 'mapreduce-user-keytab:' \
-l 'keytab-dir:' \
-l 'kerberos-realm:' \
-l 'kinit-location:' \
-l 'datanodes:' \
-l 'tasktrackers:' \
-o 'h' \ -o 'h' \
-- "$@") -- "$@")
@ -84,8 +122,7 @@ fi
# Make sure the HADOOP_LOG_DIR is not picked up from user environment. # Make sure the HADOOP_LOG_DIR is not picked up from user environment.
unset HADOOP_LOG_DIR unset HADOOP_LOG_DIR
# Parse script parameters
eval set -- "${OPTS}" eval set -- "${OPTS}"
while true ; do while true ; do
case "$1" in case "$1" in
@ -94,6 +131,10 @@ while true ; do
AUTOMATED=1 AUTOMATED=1
shift shift
;; ;;
--java-home)
JAVA_HOME=$2; shift 2
AUTOMATED=1
;;
--conf-dir) --conf-dir)
HADOOP_CONF_DIR=$2; shift 2 HADOOP_CONF_DIR=$2; shift 2
AUTOMATED=1 AUTOMATED=1
@ -101,6 +142,10 @@ while true ; do
--default) --default)
AUTOMATED=1; shift AUTOMATED=1; shift
;; ;;
--group)
HADOOP_GROUP=$2; shift 2
AUTOMATED=1
;;
-h) -h)
usage usage
;; ;;
@ -120,11 +165,15 @@ while true ; do
HADOOP_MAPRED_DIR=$2; shift 2 HADOOP_MAPRED_DIR=$2; shift 2
AUTOMATED=1 AUTOMATED=1
;; ;;
--namenode-url) --namenode-host)
HADOOP_NN_HOST=$2; shift 2 HADOOP_NN_HOST=$2; shift 2
AUTOMATED=1 AUTOMATED=1
;; ;;
--jobtracker-url) --secondarynamenode-host)
HADOOP_SNN_HOST=$2; shift 2
AUTOMATED=1
;;
--jobtracker-host)
HADOOP_JT_HOST=$2; shift 2 HADOOP_JT_HOST=$2; shift 2
AUTOMATED=1 AUTOMATED=1
;; ;;
@ -132,6 +181,10 @@ while true ; do
HADOOP_LOG_DIR=$2; shift 2 HADOOP_LOG_DIR=$2; shift 2
AUTOMATED=1 AUTOMATED=1
;; ;;
--pid-dir)
HADOOP_PID_DIR=$2; shift 2
AUTOMATED=1
;;
--replication) --replication)
HADOOP_REPLICATION=$2; shift 2 HADOOP_REPLICATION=$2; shift 2
AUTOMATED=1 AUTOMATED=1
@ -139,7 +192,46 @@ while true ; do
--taskscheduler) --taskscheduler)
HADOOP_TASK_SCHEDULER=$2; shift 2 HADOOP_TASK_SCHEDULER=$2; shift 2
AUTOMATED=1 AUTOMATED=1
;; ;;
--hdfs-user)
HADOOP_HDFS_USER=$2; shift 2
AUTOMATED=1
;;
--mapreduce-user)
HADOOP_MR_USER=$2; shift 2
AUTOMATED=1
;;
--keytab-dir)
KEYTAB_DIR=$2; shift 2
AUTOMATED=1
;;
--hdfs-user-keytab)
HDFS_KEYTAB=$2; shift 2
AUTOMATED=1
;;
--mapreduce-user-keytab)
MR_KEYTAB=$2; shift 2
AUTOMATED=1
;;
--kerberos-realm)
KERBEROS_REALM=$2; shift 2
SECURITY_TYPE="kerberos"
AUTOMATED=1
;;
--kinit-location)
KINIT=$2; shift 2
AUTOMATED=1
;;
--datanodes)
DATANODES=$2; shift 2
AUTOMATED=1
DATANODES=$(echo $DATANODES | tr ',' ' ')
;;
--tasktrackers)
TASKTRACKERS=$2; shift 2
AUTOMATED=1
TASKTRACKERS=$(echo $TASKTRACKERS | tr ',' ' ')
;;
--) --)
shift ; break shift ; break
;; ;;
@ -151,21 +243,40 @@ while true ; do
esac esac
done done
# Fill in default values, if parameters have not been defined.
AUTOSETUP=${AUTOSETUP:-1} AUTOSETUP=${AUTOSETUP:-1}
JAVA_HOME=${JAVA_HOME:-/usr/java/default} JAVA_HOME=${JAVA_HOME:-/usr/java/default}
HADOOP_NN_HOST=${HADOOP_NN_HOST:-hdfs://`hostname`:9000/} HADOOP_GROUP=${HADOOP_GROUP:-hadoop}
HADOOP_NN_HOST=${HADOOP_NN_HOST:-`hostname`}
HADOOP_NN_DIR=${HADOOP_NN_DIR:-/var/lib/hadoop/hdfs/namenode} HADOOP_NN_DIR=${HADOOP_NN_DIR:-/var/lib/hadoop/hdfs/namenode}
HADOOP_DN_DIR=${HADOOP_DN_DIR:-/var/lib/hadoop/hdfs/datanode} HADOOP_DN_DIR=${HADOOP_DN_DIR:-/var/lib/hadoop/hdfs/datanode}
HADOOP_JT_HOST=${HADOOP_JT_HOST:-`hostname`:9001} HADOOP_JT_HOST=${HADOOP_JT_HOST:-`hostname`}
HADOOP_HDFS_DIR=${HADOOP_HDFS_DIR:-/var/lib/hadoop/hdfs} HADOOP_HDFS_DIR=${HADOOP_HDFS_DIR:-/var/lib/hadoop/hdfs}
HADOOP_MAPRED_DIR=${HADOOP_MAPRED_DIR:-/var/lib/hadoop/mapred} HADOOP_MAPRED_DIR=${HADOOP_MAPRED_DIR:-/var/lib/hadoop/mapred}
HADOOP_LOG_DIR=${HADOOP_LOG_DIR:-/var/log/hadoop} HADOOP_LOG_DIR=${HADOOP_LOG_DIR:-/var/log/hadoop}
HADOOP_PID_DIR=${HADOOP_PID_DIR:-/var/log/hadoop}
HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-/etc/hadoop} HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-/etc/hadoop}
HADOOP_REPLICATION=${HADOOP_RELICATION:-3} HADOOP_REPLICATION=${HADOOP_RELICATION:-3}
HADOOP_TASK_SCHEDULER=${HADOOP_TASK_SCHEDULER:-org.apache.hadoop.mapred.JobQueueTaskScheduler} HADOOP_TASK_SCHEDULER=${HADOOP_TASK_SCHEDULER:-org.apache.hadoop.mapred.JobQueueTaskScheduler}
HADOOP_HDFS_USER=${HADOOP_HDFS_USER:-hdfs}
HADOOP_MR_USER=${HADOOP_MR_USER:-mr}
KEYTAB_DIR=${KEYTAB_DIR:-/etc/security/keytabs}
HDFS_KEYTAB=${HDFS_KEYTAB:-/home/hdfs/hdfs.keytab}
MR_KEYTAB=${MR_KEYTAB:-/home/mr/mr.keytab}
KERBEROS_REALM=${KERBEROS_REALM:-KERBEROS.EXAMPLE.COM}
SECURITY_TYPE=${SECURITY_TYPE:-simple}
KINIT=${KINIT:-/usr/kerberos/bin/kinit}
if [ "${SECURITY_TYPE}" = "kerberos" ]; then
TASK_CONTROLLER="org.apache.hadoop.mapred.LinuxTaskController"
HADOOP_DN_ADDR="0.0.0.0:1019"
HADOOP_DN_HTTP_ADDR="0.0.0.0:1022"
SECURITY="true"
else
TASK_CONTROLLER="org.apache.hadoop.mapred.DefaultTaskController"
HADDOP_DN_ADDR="0.0.0.0:50010"
HADOOP_DN_HTTP_ADDR="0.0.0.0:50075"
SECURITY="false"
fi
# Interactive setup wizard
if [ "${AUTOMATED}" != "1" ]; then if [ "${AUTOMATED}" != "1" ]; then
echo "Setup Hadoop Configuration" echo "Setup Hadoop Configuration"
echo echo
@ -173,18 +284,18 @@ if [ "${AUTOMATED}" != "1" ]; then
read USER_HADOOP_CONF_DIR read USER_HADOOP_CONF_DIR
echo -n "Where would you like to put log directory? (${HADOOP_LOG_DIR}) " echo -n "Where would you like to put log directory? (${HADOOP_LOG_DIR}) "
read USER_HADOOP_LOG_DIR read USER_HADOOP_LOG_DIR
echo -n "What is the url of the namenode? (${HADOOP_NN_HOST}) " echo -n "Where would you like to put pid directory? (${HADOOP_PID_DIR}) "
read USER_HADOOP_PID_DIR
echo -n "What is the host of the namenode? (${HADOOP_NN_HOST}) "
read USER_HADOOP_NN_HOST read USER_HADOOP_NN_HOST
echo -n "Where would you like to put namenode data directory? (${HADOOP_NN_DIR}) " echo -n "Where would you like to put namenode data directory? (${HADOOP_NN_DIR}) "
read USER_HADOOP_NN_DIR read USER_HADOOP_NN_DIR
echo -n "Where would you like to put datanode data directory? (${HADOOP_DN_DIR}) " echo -n "Where would you like to put datanode data directory? (${HADOOP_DN_DIR}) "
read USER_HADOOP_DN_DIR read USER_HADOOP_DN_DIR
echo -n "What is the url of the jobtracker? (${HADOOP_JT_HOST}) " echo -n "What is the host of the jobtracker? (${HADOOP_JT_HOST}) "
read USER_HADOOP_JT_HOST read USER_HADOOP_JT_HOST
echo -n "Where would you like to put jobtracker/tasktracker data directory? (${HADOOP_MAPRED_DIR}) " echo -n "Where would you like to put jobtracker/tasktracker data directory? (${HADOOP_MAPRED_DIR}) "
read USER_HADOOP_MAPRED_DIR read USER_HADOOP_MAPRED_DIR
echo -n "Which taskscheduler would you like? (${HADOOP_TASK_SCHEDULER}) "
read USER_HADOOP_TASK_SCHEDULER
echo -n "Where is JAVA_HOME directory? (${JAVA_HOME}) " echo -n "Where is JAVA_HOME directory? (${JAVA_HOME}) "
read USER_JAVA_HOME read USER_JAVA_HOME
echo -n "Would you like to create directories/copy conf files to localhost? (Y/n) " echo -n "Would you like to create directories/copy conf files to localhost? (Y/n) "
@ -199,16 +310,18 @@ if [ "${AUTOMATED}" != "1" ]; then
HADOOP_MAPRED_DIR=${USER_HADOOP_MAPRED_DIR:-$HADOOP_MAPRED_DIR} HADOOP_MAPRED_DIR=${USER_HADOOP_MAPRED_DIR:-$HADOOP_MAPRED_DIR}
HADOOP_TASK_SCHEDULER=${HADOOP_TASK_SCHEDULER:-org.apache.hadoop.mapred.JobQueueTaskScheduler} HADOOP_TASK_SCHEDULER=${HADOOP_TASK_SCHEDULER:-org.apache.hadoop.mapred.JobQueueTaskScheduler}
HADOOP_LOG_DIR=${USER_HADOOP_LOG_DIR:-$HADOOP_LOG_DIR} HADOOP_LOG_DIR=${USER_HADOOP_LOG_DIR:-$HADOOP_LOG_DIR}
HADOOP_PID_DIR=${USER_HADOOP_PID_DIR:-$HADOOP_PID_DIR}
HADOOP_CONF_DIR=${USER_HADOOP_CONF_DIR:-$HADOOP_CONF_DIR} HADOOP_CONF_DIR=${USER_HADOOP_CONF_DIR:-$HADOOP_CONF_DIR}
AUTOSETUP=${USER_AUTOSETUP:-y} AUTOSETUP=${USER_AUTOSETUP:-y}
echo "Review your choices:" echo "Review your choices:"
echo echo
echo "Config directory : ${HADOOP_CONF_DIR}" echo "Config directory : ${HADOOP_CONF_DIR}"
echo "Log directory : ${HADOOP_LOG_DIR}" echo "Log directory : ${HADOOP_LOG_DIR}"
echo "Namenode url : ${HADOOP_NN_HOST}" echo "PID directory : ${HADOOP_PID_DIR}"
echo "Namenode host : ${HADOOP_NN_HOST}"
echo "Namenode directory : ${HADOOP_NN_DIR}" echo "Namenode directory : ${HADOOP_NN_DIR}"
echo "Datanode directory : ${HADOOP_DN_DIR}" echo "Datanode directory : ${HADOOP_DN_DIR}"
echo "Jobtracker url : ${HADOOP_JT_HOST}" echo "Jobtracker host : ${HADOOP_JT_HOST}"
echo "Mapreduce directory : ${HADOOP_MAPRED_DIR}" echo "Mapreduce directory : ${HADOOP_MAPRED_DIR}"
echo "Task scheduler : ${HADOOP_TASK_SCHEDULER}" echo "Task scheduler : ${HADOOP_TASK_SCHEDULER}"
echo "JAVA_HOME directory : ${JAVA_HOME}" echo "JAVA_HOME directory : ${JAVA_HOME}"
@ -222,61 +335,180 @@ if [ "${AUTOMATED}" != "1" ]; then
fi fi
fi fi
if [ "${AUTOSETUP}" == "1" ]; then
# If user wants to setup local system automatically,
# set config file generation location to HADOOP_CONF_DIR.
DEST=${HADOOP_CONF_DIR}
else
# If user is only interested to generate config file locally,
# place config files in the current working directory.
DEST=`pwd`
fi
# remove existing config file, they are existed in current directory.
rm -f ${DEST}/core-site.xml >/dev/null
rm -f ${DEST}/hdfs-site.xml >/dev/null
rm -f ${DEST}/mapred-site.xml >/dev/null
rm -f ${DEST}/hadoop-env.sh >/dev/null
# Generate config file with specified parameters.
template_generator ${HADOOP_PREFIX}/share/hadoop/common/templates/core-site.xml ${DEST}/core-site.xml
template_generator ${HADOOP_PREFIX}/share/hadoop/hdfs/templates/hdfs-site.xml ${DEST}/hdfs-site.xml
template_generator ${HADOOP_PREFIX}/share/hadoop/mapreduce/templates/mapred-site.xml ${DEST}/mapred-site.xml
template_generator ${HADOOP_CONF_DIR}/hadoop-env.sh.template ${DEST}/hadoop-env.sh
chown root:hadoop ${DEST}/hadoop-env.sh
chmod 755 ${DEST}/hadoop-env.sh
# Setup directory path and copy config files, if AUTOSETUP is chosen.
if [ "${AUTOSETUP}" == "1" -o "${AUTOSETUP}" == "y" ]; then if [ "${AUTOSETUP}" == "1" -o "${AUTOSETUP}" == "y" ]; then
mkdir -p ${HADOOP_HDFS_DIR} if [ -d ${KEYTAB_DIR} ]; then
mkdir -p ${HADOOP_NN_DIR} chmod 700 ${KEYTAB_DIR}/*
mkdir -p ${HADOOP_DN_DIR} chown ${HADOOP_MR_USER}:${HADOOP_GROUP} ${KEYTAB_DIR}/[jt]t.service.keytab
mkdir -p ${HADOOP_MAPRED_DIR} chown ${HADOOP_HDFS_USER}:${HADOOP_GROUP} ${KEYTAB_DIR}/[dns]n.service.keytab
fi
chmod 755 -R ${HADOOP_PREFIX}/sbin/*hadoop*
chmod 755 -R ${HADOOP_PREFIX}/bin/hadoop
chmod 755 -R ${HADOOP_PREFIX}/libexec/hadoop-config.sh
mkdir -p /home/${HADOOP_MR_USER}
chown ${HADOOP_MR_USER}:${HADOOP_GROUP} /home/${HADOOP_MR_USER}
HDFS_DIR=`echo ${HADOOP_HDFS_DIR} | sed -e 's/,/ /g'`
mkdir -p ${HDFS_DIR}
if [ -e ${HADOOP_NN_DIR} ]; then
rm -rf ${HADOOP_NN_DIR}
fi
DATANODE_DIR=`echo ${HADOOP_DN_DIR} | sed -e 's/,/ /g'`
mkdir -p ${DATANODE_DIR}
MAPRED_DIR=`echo ${HADOOP_MAPRED_DIR} | sed -e 's/,/ /g'`
mkdir -p ${MAPRED_DIR}
mkdir -p ${HADOOP_CONF_DIR} mkdir -p ${HADOOP_CONF_DIR}
check_permission ${HADOOP_CONF_DIR}
if [ $? == 1 ]; then
echo "Full path to ${HADOOP_CONF_DIR} should be owned by root."
exit 1
fi
mkdir -p ${HADOOP_LOG_DIR} mkdir -p ${HADOOP_LOG_DIR}
mkdir -p ${HADOOP_LOG_DIR}/hdfs #create the log sub dir for diff users
mkdir -p ${HADOOP_LOG_DIR}/mapred mkdir -p ${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER}
chown hdfs:hadoop ${HADOOP_HDFS_DIR} mkdir -p ${HADOOP_LOG_DIR}/${HADOOP_MR_USER}
chown hdfs:hadoop ${HADOOP_NN_DIR}
chown hdfs:hadoop ${HADOOP_DN_DIR} mkdir -p ${HADOOP_PID_DIR}
chown mapred:hadoop ${HADOOP_MAPRED_DIR} chown ${HADOOP_HDFS_USER}:${HADOOP_GROUP} ${HDFS_DIR}
chown root:hadoop ${HADOOP_LOG_DIR} chown ${HADOOP_HDFS_USER}:${HADOOP_GROUP} ${DATANODE_DIR}
chmod 700 -R ${DATANODE_DIR}
chown ${HADOOP_MR_USER}:${HADOOP_GROUP} ${MAPRED_DIR}
chown ${HADOOP_HDFS_USER}:${HADOOP_GROUP} ${HADOOP_LOG_DIR}
chmod 775 ${HADOOP_LOG_DIR} chmod 775 ${HADOOP_LOG_DIR}
chown hdfs:hadoop ${HADOOP_LOG_DIR}/hdfs chmod 775 ${HADOOP_PID_DIR}
chown mapred:hadoop ${HADOOP_LOG_DIR}/mapred chown root:${HADOOP_GROUP} ${HADOOP_PID_DIR}
#change the permission and the owner
chmod 755 ${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER}
chown ${HADOOP_HDFS_USER}:${HADOOP_GROUP} ${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER}
chmod 755 ${HADOOP_LOG_DIR}/${HADOOP_MR_USER}
chown ${HADOOP_MR_USER}:${HADOOP_GROUP} ${HADOOP_LOG_DIR}/${HADOOP_MR_USER}
if [ -e ${HADOOP_CONF_DIR}/core-site.xml ]; then
mv -f ${HADOOP_CONF_DIR}/core-site.xml ${HADOOP_CONF_DIR}/core-site.xml.bak
fi
if [ -e ${HADOOP_CONF_DIR}/hdfs-site.xml ]; then
mv -f ${HADOOP_CONF_DIR}/hdfs-site.xml ${HADOOP_CONF_DIR}/hdfs-site.xml.bak
fi
if [ -e ${HADOOP_CONF_DIR}/mapred-site.xml ]; then
mv -f ${HADOOP_CONF_DIR}/mapred-site.xml ${HADOOP_CONF_DIR}/mapred-site.xml.bak
fi
if [ -e ${HADOOP_CONF_DIR}/hadoop-env.sh ]; then
mv -f ${HADOOP_CONF_DIR}/hadoop-env.sh ${HADOOP_CONF_DIR}/hadoop-env.sh.bak
fi
if [ -e ${HADOOP_CONF_DIR}/hadoop-policy.xml ]; then
mv -f ${HADOOP_CONF_DIR}/hadoop-policy.xml ${HADOOP_CONF_DIR}/hadoop-policy.xml.bak
fi
if [ -e ${HADOOP_CONF_DIR}/mapred-queue-acls.xml ]; then
mv -f ${HADOOP_CONF_DIR}/mapred-queue-acls.xml ${HADOOP_CONF_DIR}/mapred-queue-acls.xml.bak
fi
if [ -e ${HADOOP_CONF_DIR}/commons-logging.properties ]; then
mv -f ${HADOOP_CONF_DIR}/commons-logging.properties ${HADOOP_CONF_DIR}/commons-logging.properties.bak
fi
if [ -e ${HADOOP_CONF_DIR}/taskcontroller.cfg ]; then
mv -f ${HADOOP_CONF_DIR}/taskcontroller.cfg ${HADOOP_CONF_DIR}/taskcontroller.cfg.bak
fi
if [ -e ${HADOOP_CONF_DIR}/slaves ]; then
mv -f ${HADOOP_CONF_DIR}/slaves ${HADOOP_CONF_DIR}/slaves.bak
fi
if [ -e ${HADOOP_CONF_DIR}/dfs.include ]; then
mv -f ${HADOOP_CONF_DIR}/dfs.include ${HADOOP_CONF_DIR}/dfs.include.bak
fi
if [ -e ${HADOOP_CONF_DIR}/dfs.exclude ]; then
mv -f ${HADOOP_CONF_DIR}/dfs.exclude ${HADOOP_CONF_DIR}/dfs.exclude.bak
fi
if [ -e ${HADOOP_CONF_DIR}/mapred.include ]; then
mv -f ${HADOOP_CONF_DIR}/mapred.include ${HADOOP_CONF_DIR}/mapred.include.bak
fi
if [ -e ${HADOOP_CONF_DIR}/mapred.exclude ]; then
mv -f ${HADOOP_CONF_DIR}/mapred.exclude ${HADOOP_CONF_DIR}/mapred.exclude.bak
fi
template_generator ${HADOOP_PREFIX}/share/hadoop/common/templates/conf/core-site.xml ${HADOOP_CONF_DIR}/core-site.xml
template_generator ${HADOOP_PREFIX}/share/hadoop/common/templates/conf/hdfs-site.xml ${HADOOP_CONF_DIR}/hdfs-site.xml
template_generator ${HADOOP_PREFIX}/share/hadoop/common/templates/conf/mapred-site.xml ${HADOOP_CONF_DIR}/mapred-site.xml
template_generator ${HADOOP_PREFIX}/share/hadoop/common/templates/conf/hadoop-env.sh ${HADOOP_CONF_DIR}/hadoop-env.sh
template_generator ${HADOOP_PREFIX}/share/hadoop/common/templates/conf/hadoop-policy.xml ${HADOOP_CONF_DIR}/hadoop-policy.xml
template_generator ${HADOOP_PREFIX}/share/hadoop/common/templates/conf/commons-logging.properties ${HADOOP_CONF_DIR}/commons-logging.properties
template_generator ${HADOOP_PREFIX}/share/hadoop/common/templates/conf/mapred-queue-acls.xml ${HADOOP_CONF_DIR}/mapred-queue-acls.xml
template_generator ${HADOOP_PREFIX}/share/hadoop/common/templates/conf/taskcontroller.cfg ${HADOOP_CONF_DIR}/taskcontroller.cfg
#set the owner of the hadoop dir to root
chown root ${HADOOP_PREFIX}
chown root:${HADOOP_GROUP} ${HADOOP_CONF_DIR}/hadoop-env.sh
chmod 755 ${HADOOP_CONF_DIR}/hadoop-env.sh
#set taskcontroller
chown root:${HADOOP_GROUP} ${HADOOP_CONF_DIR}/taskcontroller.cfg
chmod 400 ${HADOOP_CONF_DIR}/taskcontroller.cfg
chown root:${HADOOP_GROUP} ${HADOOP_PREFIX}/bin/task-controller
chmod 6050 ${HADOOP_PREFIX}/bin/task-controller
#generate the slaves file and include and exclude files for hdfs and mapred
echo '' > ${HADOOP_CONF_DIR}/slaves
echo '' > ${HADOOP_CONF_DIR}/dfs.include
echo '' > ${HADOOP_CONF_DIR}/dfs.exclude
echo '' > ${HADOOP_CONF_DIR}/mapred.include
echo '' > ${HADOOP_CONF_DIR}/mapred.exclude
for dn in $DATANODES
do
echo $dn >> ${HADOOP_CONF_DIR}/slaves
echo $dn >> ${HADOOP_CONF_DIR}/dfs.include
done
for tt in $TASKTRACKERS
do
echo $tt >> ${HADOOP_CONF_DIR}/mapred.include
done
echo "Configuration setup is completed." echo "Configuration setup is completed."
if [[ "$HADOOP_NN_HOST" =~ "`hostname`" ]]; then if [[ "$HADOOP_NN_HOST" =~ "`hostname`" ]]; then
echo "Proceed to run hadoop-setup-hdfs.sh on namenode." echo "Proceed to run hadoop-setup-hdfs.sh on namenode."
fi fi
else else
template_generator ${HADOOP_PREFIX}/share/hadoop/common/templates/conf/core-site.xml ${HADOOP_CONF_DIR}/core-site.xml
template_generator ${HADOOP_PREFIX}/share/hadoop/common/templates/conf/hdfs-site.xml ${HADOOP_CONF_DIR}/hdfs-site.xml
template_generator ${HADOOP_PREFIX}/share/hadoop/common/templates/conf/mapred-site.xml ${HADOOP_CONF_DIR}/mapred-site.xml
template_generator ${HADOOP_PREFIX}/share/hadoop/common/templates/conf/hadoop-env.sh ${HADOOP_CONF_DIR}/hadoop-env.sh
template_generator ${HADOOP_PREFIX}/share/hadoop/common/templates/conf/hadoop-policy.xml ${HADOOP_CONF_DIR}/hadoop-policy.xml
template_generator ${HADOOP_PREFIX}/share/hadoop/common/templates/conf/commons-logging.properties ${HADOOP_CONF_DIR}/commons-logging.properties
template_generator ${HADOOP_PREFIX}/share/hadoop/common/templates/conf/mapred-queue-acls.xml ${HADOOP_CONF_DIR}/mapred-queue-acls.xml
template_generator ${HADOOP_PREFIX}/share/hadoop/common/templates/conf/taskcontroller.cfg ${HADOOP_CONF_DIR}/taskcontroller.cfg
template_generator ${HADOOP_PREFIX}/share/hadoop/common/templates/conf/hadoop-metrics2.properties ${HADOOP_CONF_DIR}/hadoop-metrics2.properties
if [ ! -e ${HADOOP_CONF_DIR}/capacity-scheduler.xml ]; then
template_generator ${HADOOP_PREFIX}/share/hadoop/templates/conf/capacity-scheduler.xml ${HADOOP_CONF_DIR}/capacity-scheduler.xml
fi
chown root:${HADOOP_GROUP} ${HADOOP_CONF_DIR}/hadoop-env.sh
chmod 755 ${HADOOP_CONF_DIR}/hadoop-env.sh
#set taskcontroller
chown root:${HADOOP_GROUP} ${HADOOP_CONF_DIR}/taskcontroller.cfg
chmod 400 ${HADOOP_CONF_DIR}/taskcontroller.cfg
chown root:${HADOOP_GROUP} ${HADOOP_PREFIX}/bin/task-controller
chmod 6050 ${HADOOP_PREFIX}/bin/task-controller
#generate the slaves file and include and exclude files for hdfs and mapred
echo '' > ${HADOOP_CONF_DIR}/slaves
echo '' > ${HADOOP_CONF_DIR}/dfs.include
echo '' > ${HADOOP_CONF_DIR}/dfs.exclude
echo '' > ${HADOOP_CONF_DIR}/mapred.include
echo '' > ${HADOOP_CONF_DIR}/mapred.exclude
for dn in $DATANODES
do
echo $dn >> ${HADOOP_CONF_DIR}/slaves
echo $dn >> ${HADOOP_CONF_DIR}/dfs.include
done
for tt in $TASKTRACKERS
do
echo $tt >> ${HADOOP_CONF_DIR}/mapred.include
done
echo echo
echo "Configuration file has been generated, please copy:" echo "Configuration file has been generated in:"
echo echo
echo "core-site.xml" echo "${HADOOP_CONF_DIR}/core-site.xml"
echo "hdfs-site.xml" echo "${HADOOP_CONF_DIR}/hdfs-site.xml"
echo "mapred-site.xml" echo "${HADOOP_CONF_DIR}/mapred-site.xml"
echo "hadoop-env.sh" echo "${HADOOP_CONF_DIR}/hadoop-env.sh"
echo echo
echo " to ${HADOOP_CONF_DIR} on all nodes, and proceed to run hadoop-setup-hdfs.sh on namenode." echo " to ${HADOOP_CONF_DIR} on all nodes, and proceed to run hadoop-setup-hdfs.sh on namenode."
fi fi

View File

@ -18,37 +18,65 @@
bin=`dirname "$0"` bin=`dirname "$0"`
bin=`cd "$bin"; pwd` bin=`cd "$bin"; pwd`
if [ "$HADOOP_HOME" != "" ]; then
echo "Warning: \$HADOOP_HOME is deprecated."
echo
fi
. "$bin"/../libexec/hadoop-config.sh . "$bin"/../libexec/hadoop-config.sh
usage() { usage() {
echo " echo "
usage: $0 <parameters> usage: $0 <parameters>
Require parameter:
-c <clusterid> Set cluster identifier for HDFS
Optional parameters: Optional parameters:
-h Display this message --format Force namenode format
--group=hadoop Set Hadoop group
-h Display this message
--hdfs-user=hdfs Set HDFS user
--kerberos-realm=KERBEROS.EXAMPLE.COM Set Kerberos realm
--hdfs-user-keytab=/home/hdfs/hdfs.keytab Set HDFS user key tab
--mapreduce-user=mr Set mapreduce user
" "
exit 1 exit 1
} }
if [ $# != 2 ] ; then OPTS=$(getopt \
-n $0 \
-o '' \
-l 'format' \
-l 'hdfs-user:' \
-l 'hdfs-user-keytab:' \
-l 'mapreduce-user:' \
-l 'kerberos-realm:' \
-o 'h' \
-- "$@")
if [ $? != 0 ] ; then
usage usage
exit 1
fi fi
while getopts "hc:" OPTION eval set -- "${OPTS}"
do while true ; do
case $OPTION in case "$1" in
c) --format)
SETUP_CLUSTER=$2; shift 2 FORMAT_NAMENODE=1; shift
AUTOMATED=1
;; ;;
h) --group)
usage HADOOP_GROUP=$2; shift 2
AUTOMATED=1
;;
--hdfs-user)
HADOOP_HDFS_USER=$2; shift 2
AUTOMATED=1
;;
--mapreduce-user)
HADOOP_MR_USER=$2; shift 2
AUTOMATED=1
;;
--hdfs-user-keytab)
HDFS_KEYTAB=$2; shift 2
AUTOMATED=1
;;
--kerberos-realm)
KERBEROS_REALM=$2; shift 2
AUTOMATED=1
;; ;;
--) --)
shift ; break shift ; break
@ -61,30 +89,56 @@ do
esac esac
done done
export HADOOP_PREFIX HADOOP_GROUP=${HADOOP_GROUP:-hadoop}
export HADOOP_CONF_DIR HADOOP_HDFS_USER=${HADOOP_HDFS_USER:-hdfs}
export SETUP_CLUSTER HADOOP_MAPREDUCE_USER=${HADOOP_MR_USER:-mapred}
if [ "${KERBEROS_REALM}" != "" ]; then
# Determine kerberos location base on Linux distro.
if [ -e /etc/lsb-release ]; then
KERBEROS_BIN=/usr/bin
else
KERBEROS_BIN=/usr/kerberos/bin
fi
kinit_cmd="${KERBEROS_BIN}/kinit -k -t ${HDFS_KEYTAB} ${HADOOP_HDFS_USER}"
su -c "${kinit_cmd}" ${HADOOP_HDFS_USER}
fi
# Start namenode and initialize file system structure
echo "Setup Hadoop Distributed File System" echo "Setup Hadoop Distributed File System"
echo echo
echo "Formatting namenode"
echo # Format namenode
su -c '${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} namenode -format -clusterid ${SETUP_CLUSTER}' hdfs if [ "${FORMAT_NAMENODE}" == "1" ]; then
echo echo "Formatting namenode"
echo
su -c "echo Y | ${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} namenode -format" ${HADOOP_HDFS_USER}
echo
fi
# Start namenode process
echo "Starting namenode process" echo "Starting namenode process"
echo echo
/etc/init.d/hadoop-namenode start if [ -e ${HADOOP_PREFIX}/sbin/hadoop-daemon.sh ]; then
DAEMON_PATH=${HADOOP_PREFIX}/sbin
else
DAEMON_PATH=${HADOOP_PREFIX}/bin
fi
su -c "${DAEMON_PATH}/hadoop-daemon.sh --config ${HADOOP_CONF_DIR} start namenode" ${HADOOP_HDFS_USER}
echo echo
echo "Initialize HDFS file system: " echo "Initialize HDFS file system: "
echo echo
su -c '${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} fs -mkdir /jobtracker' hdfs #create the /user dir
su -c '${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} fs -chown mapred:mapred /jobtracker' hdfs su -c "${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} dfs -mkdir /user" ${HADOOP_HDFS_USER}
su -c '${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} fs -mkdir /user/mapred' hdfs
su -c '${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} fs -chown mapred:mapred /user/mapred' hdfs #create /tmp and give it 777
su -c '${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} fs -mkdir /tmp' hdfs su -c "${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} dfs -mkdir /tmp" ${HADOOP_HDFS_USER}
su -c '${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} fs -chmod 777 /tmp' hdfs su -c "${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} dfs -chmod 777 /tmp" ${HADOOP_HDFS_USER}
#create /mapred
su -c "${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} dfs -mkdir /mapred" ${HADOOP_HDFS_USER}
su -c "${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} dfs -chmod 700 /mapred" ${HADOOP_HDFS_USER}
su -c "${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} dfs -chown ${HADOOP_MAPREDUCE_USER}:system /mapred" ${HADOOP_HDFS_USER}
if [ $? -eq 0 ]; then if [ $? -eq 0 ]; then
echo "Completed." echo "Completed."

View File

@ -17,16 +17,16 @@
# Script for setup HDFS file system for single node deployment # Script for setup HDFS file system for single node deployment
bin=`which $0` bin=`dirname "$0"`
bin=`dirname ${bin}`
bin=`cd "$bin"; pwd` bin=`cd "$bin"; pwd`
export HADOOP_PREFIX=${bin}/.. if [ "$HADOOP_HOME" != "" ]; then
echo "Warning: \$HADOOP_HOME is deprecated."
if [ -e /etc/hadoop/hadoop-env.sh ]; then echo
. /etc/hadoop/hadoop-env.sh
fi fi
. "$bin"/../libexec/hadoop-config.sh
usage() { usage() {
echo " echo "
usage: $0 <parameters> usage: $0 <parameters>
@ -38,7 +38,19 @@ usage: $0 <parameters>
exit 1 exit 1
} }
# Parse script parameters template_generator() {
REGEX='(\$\{[a-zA-Z_][a-zA-Z_0-9]*\})'
cat $1 |
while read line ; do
while [[ "$line" =~ $REGEX ]] ; do
LHS=${BASH_REMATCH[1]}
RHS="$(eval echo "\"$LHS\"")"
line=${line//$LHS/$RHS}
done
echo $line >> $2
done
}
OPTS=$(getopt \ OPTS=$(getopt \
-n $0 \ -n $0 \
-o '' \ -o '' \
@ -49,6 +61,10 @@ if [ $? != 0 ] ; then
usage usage
fi fi
if [ -e /etc/hadoop/hadoop-env.sh ]; then
. /etc/hadoop/hadoop-env.sh
fi
eval set -- "${OPTS}" eval set -- "${OPTS}"
while true ; do while true ; do
case "$1" in case "$1" in
@ -69,7 +85,6 @@ while true ; do
esac esac
done done
# Interactive setup wizard
if [ "${AUTOMATED}" != "1" ]; then if [ "${AUTOMATED}" != "1" ]; then
echo "Welcome to Hadoop single node setup wizard" echo "Welcome to Hadoop single node setup wizard"
echo echo
@ -119,68 +134,59 @@ SET_REBOOT=${SET_REBOOT:-y}
/etc/init.d/hadoop-jobtracker stop 2>/dev/null >/dev/null /etc/init.d/hadoop-jobtracker stop 2>/dev/null >/dev/null
/etc/init.d/hadoop-tasktracker stop 2>/dev/null >/dev/null /etc/init.d/hadoop-tasktracker stop 2>/dev/null >/dev/null
# Default settings
JAVA_HOME=${JAVA_HOME:-/usr/java/default}
HADOOP_NN_HOST=${HADOOP_NN_HOST:-hdfs://localhost:9000/}
HADOOP_NN_DIR=${HADOOP_NN_DIR:-/var/lib/hadoop/hdfs/namenode}
HADOOP_DN_DIR=${HADOOP_DN_DIR:-/var/lib/hadoop/hdfs/datanode}
HADOOP_JT_HOST=${HADOOP_JT_HOST:-localhost:9001}
HADOOP_HDFS_DIR=${HADOOP_MAPRED_DIR:-/var/lib/hadoop/hdfs}
HADOOP_MAPRED_DIR=${HADOOP_MAPRED_DIR:-/var/lib/hadoop/mapred}
HADOOP_LOG_DIR="/var/log/hadoop"
HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-/etc/hadoop}
HADOOP_REPLICATION=${HADOOP_RELICATION:-1}
HADOOP_TASK_SCHEDULER=${HADOOP_TASK_SCHEDULER:-org.apache.hadoop.mapred.JobQueueTaskScheduler}
# Setup config files
if [ "${SET_CONFIG}" == "y" ]; then if [ "${SET_CONFIG}" == "y" ]; then
JAVA_HOME=${JAVA_HOME:-/usr/java/default}
HADOOP_NN_HOST=${HADOOP_NN_HOST:-localhost}
HADOOP_NN_DIR=${HADOOP_NN_DIR:-/var/lib/hadoop/hdfs/namenode}
HADOOP_DN_DIR=${HADOOP_DN_DIR:-/var/lib/hadoop/hdfs/datanode}
HADOOP_JT_HOST=${HADOOP_JT_HOST:-localhost}
HADOOP_HDFS_DIR=${HADOOP_MAPRED_DIR:-/var/lib/hadoop/hdfs}
HADOOP_MAPRED_DIR=${HADOOP_MAPRED_DIR:-/var/lib/hadoop/mapred}
HADOOP_PID_DIR=${HADOOP_PID_DIR:-/var/run/hadoop}
HADOOP_LOG_DIR="/var/log/hadoop"
HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-/etc/hadoop}
HADOOP_REPLICATION=${HADOOP_RELICATION:-1}
${HADOOP_PREFIX}/sbin/hadoop-setup-conf.sh --auto \ ${HADOOP_PREFIX}/sbin/hadoop-setup-conf.sh --auto \
--hdfs-user=hdfs \
--mapreduce-user=mapred \
--conf-dir=${HADOOP_CONF_DIR} \ --conf-dir=${HADOOP_CONF_DIR} \
--datanode-dir=${HADOOP_DN_DIR} \ --datanode-dir=${HADOOP_DN_DIR} \
--hdfs-dir=${HADOOP_HDFS_DIR} \ --hdfs-dir=${HADOOP_HDFS_DIR} \
--jobtracker-url=${HADOOP_JT_HOST} \ --jobtracker-host=${HADOOP_JT_HOST} \
--log-dir=${HADOOP_LOG_DIR} \ --log-dir=${HADOOP_LOG_DIR} \
--pid-dir=${HADOOP_PID_DIR} \
--mapred-dir=${HADOOP_MAPRED_DIR} \ --mapred-dir=${HADOOP_MAPRED_DIR} \
--namenode-dir=${HADOOP_NN_DIR} \ --namenode-dir=${HADOOP_NN_DIR} \
--namenode-url=${HADOOP_NN_HOST} \ --namenode-host=${HADOOP_NN_HOST} \
--replication=${HADOOP_REPLICATION} --replication=${HADOOP_REPLICATION}
fi fi
export HADOOP_CONF_DIR
# Format namenode
if [ ! -e ${HADOOP_NN_DIR} ]; then if [ ! -e ${HADOOP_NN_DIR} ]; then
rm -rf ${HADOOP_HDFS_DIR} 2>/dev/null >/dev/null rm -rf ${HADOOP_HDFS_DIR} 2>/dev/null >/dev/null
mkdir -p ${HADOOP_HDFS_DIR} mkdir -p ${HADOOP_HDFS_DIR}
chmod 755 ${HADOOP_HDFS_DIR} chmod 755 ${HADOOP_HDFS_DIR}
chown hdfs:hadoop ${HADOOP_HDFS_DIR} chown hdfs:hadoop ${HADOOP_HDFS_DIR}
su -c '${HADOOP_PREFIX}/bin/hdfs --config ${HADOOP_CONF_DIR} namenode -format -clusterid hadoop' hdfs /etc/init.d/hadoop-namenode format
elif [ "${SET_FORMAT}" == "y" ]; then elif [ "${SET_FORMAT}" == "y" ]; then
rm -rf ${HADOOP_HDFS_DIR} 2>/dev/null >/dev/null rm -rf ${HADOOP_HDFS_DIR} 2>/dev/null >/dev/null
mkdir -p ${HADOOP_HDFS_DIR} mkdir -p ${HADOOP_HDFS_DIR}
chmod 755 ${HADOOP_HDFS_DIR} chmod 755 ${HADOOP_HDFS_DIR}
chown hdfs:hadoop ${HADOOP_HDFS_DIR} chown hdfs:hadoop ${HADOOP_HDFS_DIR}
rm -rf /var/lib/hadoop/hdfs/namenode rm -rf ${HADOOP_NN_DIR}
su -c '${HADOOP_PREFIX}/bin/hdfs --config ${HADOOP_CONF_DIR} namenode -format -clusterid hadoop' hdfs /etc/init.d/hadoop-namenode format
fi fi
# Start hdfs service
/etc/init.d/hadoop-namenode start /etc/init.d/hadoop-namenode start
/etc/init.d/hadoop-datanode start /etc/init.d/hadoop-datanode start
# Initialize file system structure su -c '${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} dfs -mkdir /user/mapred' hdfs
su -c '${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} fs -mkdir /user/mapred' hdfs su -c '${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} dfs -chown mapred:mapred /user/mapred' hdfs
su -c '${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} fs -chown mapred:mapred /user/mapred' hdfs su -c '${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} dfs -mkdir /tmp' hdfs
su -c '${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} fs -mkdir /tmp' hdfs su -c '${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} dfs -chmod 777 /tmp' hdfs
su -c '${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} fs -chmod 777 /tmp' hdfs
su -c '${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} fs -mkdir /jobtracker' hdfs
su -c '${HADOOP_PREFIX}/bin/hadoop --config ${HADOOP_CONF_DIR} fs -chown mapred:mapred /jobtracker' hdfs
# Start mapreduce service
/etc/init.d/hadoop-jobtracker start /etc/init.d/hadoop-jobtracker start
/etc/init.d/hadoop-tasktracker start /etc/init.d/hadoop-tasktracker start
# Toggle service startup on reboot
if [ "${SET_REBOOT}" == "y" ]; then if [ "${SET_REBOOT}" == "y" ]; then
if [ -e /etc/debian_version ]; then if [ -e /etc/debian_version ]; then
ln -sf ../init.d/hadoop-namenode /etc/rc2.d/S90hadoop-namenode ln -sf ../init.d/hadoop-namenode /etc/rc2.d/S90hadoop-namenode
@ -203,7 +209,6 @@ if [ "${SET_REBOOT}" == "y" ]; then
fi fi
fi fi
# Shutdown service, if user choose to stop services after setup
if [ "${STARTUP}" != "y" ]; then if [ "${STARTUP}" != "y" ]; then
/etc/init.d/hadoop-namenode stop /etc/init.d/hadoop-namenode stop
/etc/init.d/hadoop-datanode stop /etc/init.d/hadoop-datanode stop

View File

@ -27,10 +27,15 @@ source /etc/default/hadoop-env.sh
RETVAL=0 RETVAL=0
PIDFILE="${HADOOP_PID_DIR}/hadoop-hdfs-datanode.pid" PIDFILE="${HADOOP_PID_DIR}/hadoop-hdfs-datanode.pid"
desc="Hadoop datanode daemon" desc="Hadoop datanode daemon"
HADOOP_PREFIX="/usr"
start() { start() {
echo -n $"Starting $desc (hadoop-datanode): " echo -n $"Starting $desc (hadoop-datanode): "
daemon --user hdfs ${HADOOP_PREFIX}/sbin/hadoop-daemon.sh --config "${HADOOP_CONF_DIR}" start datanode if [ -n "$HADOOP_SECURE_DN_USER" ]; then
daemon ${HADOOP_PREFIX}/sbin/hadoop-daemon.sh --config "${HADOOP_CONF_DIR}" start datanode
else
daemon --user hdfs ${HADOOP_PREFIX}/sbin/hadoop-daemon.sh --config "${HADOOP_CONF_DIR}" start datanode
fi
RETVAL=$? RETVAL=$?
echo echo
[ $RETVAL -eq 0 ] && touch /var/lock/subsys/hadoop-datanode [ $RETVAL -eq 0 ] && touch /var/lock/subsys/hadoop-datanode
@ -39,7 +44,11 @@ start() {
stop() { stop() {
echo -n $"Stopping $desc (hadoop-datanode): " echo -n $"Stopping $desc (hadoop-datanode): "
daemon --user hdfs ${HADOOP_PREFIX}/sbin/hadoop-daemon.sh --config "${HADOOP_CONF_DIR}" stop datanode if [ -n "$HADOOP_SECURE_DN_USER" ]; then
daemon ${HADOOP_PREFIX}/sbin/hadoop-daemon.sh --config "${HADOOP_CONF_DIR}" stop datanode
else
daemon --user hdfs ${HADOOP_PREFIX}/sbin/hadoop-daemon.sh --config "${HADOOP_CONF_DIR}" stop datanode
fi
RETVAL=$? RETVAL=$?
sleep 5 sleep 5
echo echo

View File

@ -27,6 +27,7 @@ source /etc/default/hadoop-env.sh
RETVAL=0 RETVAL=0
PIDFILE="${HADOOP_PID_DIR}/hadoop-mapred-jobtracker.pid" PIDFILE="${HADOOP_PID_DIR}/hadoop-mapred-jobtracker.pid"
desc="Hadoop jobtracker daemon" desc="Hadoop jobtracker daemon"
export HADOOP_PREFIX="/usr"
start() { start() {
echo -n $"Starting $desc (hadoop-jobtracker): " echo -n $"Starting $desc (hadoop-jobtracker): "

View File

@ -27,6 +27,7 @@ source /etc/default/hadoop-env.sh
RETVAL=0 RETVAL=0
PIDFILE="${HADOOP_PID_DIR}/hadoop-hdfs-namenode.pid" PIDFILE="${HADOOP_PID_DIR}/hadoop-hdfs-namenode.pid"
desc="Hadoop namenode daemon" desc="Hadoop namenode daemon"
export HADOOP_PREFIX="/usr"
start() { start() {
echo -n $"Starting $desc (hadoop-namenode): " echo -n $"Starting $desc (hadoop-namenode): "

View File

@ -27,6 +27,7 @@ source /etc/default/hadoop-env.sh
RETVAL=0 RETVAL=0
PIDFILE="${HADOOP_PID_DIR}/hadoop-mapred-tasktracker.pid" PIDFILE="${HADOOP_PID_DIR}/hadoop-mapred-tasktracker.pid"
desc="Hadoop tasktracker daemon" desc="Hadoop tasktracker daemon"
export HADOOP_PREFIX="/usr"
start() { start() {
echo -n $"Starting $desc (hadoop-tasktracker): " echo -n $"Starting $desc (hadoop-tasktracker): "

View File

@ -0,0 +1,178 @@
<?xml version="1.0"?>
<!-- This is the configuration file for the resource manager in Hadoop. -->
<!-- You can configure various scheduling parameters related to queues. -->
<!-- The properties for a queue follow a naming convention,such as, -->
<!-- mapred.capacity-scheduler.queue.<queue-name>.property-name. -->
<configuration>
<property>
<name>mapred.capacity-scheduler.maximum-system-jobs</name>
<value>3000</value>
<description>Maximum number of jobs in the system which can be initialized,
concurrently, by the CapacityScheduler.
</description>
</property>
<property>
<name>mapred.capacity-scheduler.queue.default.capacity</name>
<value>100</value>
<description>Percentage of the number of slots in the cluster that are
to be available for jobs in this queue.
</description>
</property>
<property>
<name>mapred.capacity-scheduler.queue.default.maximum-capacity</name>
<value>-1</value>
<description>
maximum-capacity defines a limit beyond which a queue cannot use the capacity of the cluster.
This provides a means to limit how much excess capacity a queue can use. By default, there is no limit.
The maximum-capacity of a queue can only be greater than or equal to its minimum capacity.
Default value of -1 implies a queue can use complete capacity of the cluster.
This property could be to curtail certain jobs which are long running in nature from occupying more than a
certain percentage of the cluster, which in the absence of pre-emption, could lead to capacity guarantees of
other queues being affected.
One important thing to note is that maximum-capacity is a percentage , so based on the cluster's capacity
the max capacity would change. So if large no of nodes or racks get added to the cluster , max Capacity in
absolute terms would increase accordingly.
</description>
</property>
<property>
<name>mapred.capacity-scheduler.queue.default.supports-priority</name>
<value>false</value>
<description>If true, priorities of jobs will be taken into
account in scheduling decisions.
</description>
</property>
<property>
<name>mapred.capacity-scheduler.queue.default.minimum-user-limit-percent</name>
<value>100</value>
<description> Each queue enforces a limit on the percentage of resources
allocated to a user at any given time, if there is competition for them.
This user limit can vary between a minimum and maximum value. The former
depends on the number of users who have submitted jobs, and the latter is
set to this property value. For example, suppose the value of this
property is 25. If two users have submitted jobs to a queue, no single
user can use more than 50% of the queue resources. If a third user submits
a job, no single user can use more than 33% of the queue resources. With 4
or more users, no user can use more than 25% of the queue's resources. A
value of 100 implies no user limits are imposed.
</description>
</property>
<property>
<name>mapred.capacity-scheduler.queue.default.user-limit-factor</name>
<value>1</value>
<description>The multiple of the queue capacity which can be configured to
allow a single user to acquire more slots.
</description>
</property>
<property>
<name>mapred.capacity-scheduler.queue.default.maximum-initialized-active-tasks</name>
<value>200000</value>
<description>The maximum number of tasks, across all jobs in the queue,
which can be initialized concurrently. Once the queue's jobs exceed this
limit they will be queued on disk.
</description>
</property>
<property>
<name>mapred.capacity-scheduler.queue.default.maximum-initialized-active-tasks-per-user</name>
<value>100000</value>
<description>The maximum number of tasks per-user, across all the of the
user's jobs in the queue, which can be initialized concurrently. Once the
user's jobs exceed this limit they will be queued on disk.
</description>
</property>
<property>
<name>mapred.capacity-scheduler.queue.default.init-accept-jobs-factor</name>
<value>10</value>
<description>The multipe of (maximum-system-jobs * queue-capacity) used to
determine the number of jobs which are accepted by the scheduler.
</description>
</property>
<!-- The default configuration settings for the capacity task scheduler -->
<!-- The default values would be applied to all the queues which don't have -->
<!-- the appropriate property for the particular queue -->
<property>
<name>mapred.capacity-scheduler.default-supports-priority</name>
<value>false</value>
<description>If true, priorities of jobs will be taken into
account in scheduling decisions by default in a job queue.
</description>
</property>
<property>
<name>mapred.capacity-scheduler.default-minimum-user-limit-percent</name>
<value>100</value>
<description>The percentage of the resources limited to a particular user
for the job queue at any given point of time by default.
</description>
</property>
<property>
<name>mapred.capacity-scheduler.default-user-limit-factor</name>
<value>1</value>
<description>The default multiple of queue-capacity which is used to
determine the amount of slots a single user can consume concurrently.
</description>
</property>
<property>
<name>mapred.capacity-scheduler.default-maximum-active-tasks-per-queue</name>
<value>200000</value>
<description>The default maximum number of tasks, across all jobs in the
queue, which can be initialized concurrently. Once the queue's jobs exceed
this limit they will be queued on disk.
</description>
</property>
<property>
<name>mapred.capacity-scheduler.default-maximum-active-tasks-per-user</name>
<value>100000</value>
<description>The default maximum number of tasks per-user, across all the of
the user's jobs in the queue, which can be initialized concurrently. Once
the user's jobs exceed this limit they will be queued on disk.
</description>
</property>
<property>
<name>mapred.capacity-scheduler.default-init-accept-jobs-factor</name>
<value>10</value>
<description>The default multipe of (maximum-system-jobs * queue-capacity)
used to determine the number of jobs which are accepted by the scheduler.
</description>
</property>
<!-- Capacity scheduler Job Initialization configuration parameters -->
<property>
<name>mapred.capacity-scheduler.init-poll-interval</name>
<value>5000</value>
<description>The amount of time in miliseconds which is used to poll
the job queues for jobs to initialize.
</description>
</property>
<property>
<name>mapred.capacity-scheduler.init-worker-threads</name>
<value>5</value>
<description>Number of worker threads which would be used by
Initialization poller to initialize jobs in a set of queue.
If number mentioned in property is equal to number of job queues
then a single thread would initialize jobs in a queue. If lesser
then a thread would get a set of queues assigned. If the number
is greater then number of threads would be equal to number of
job queues.
</description>
</property>
</configuration>

View File

@ -0,0 +1,7 @@
#Logging Implementation
#Log4J
org.apache.commons.logging.Log=org.apache.commons.logging.impl.Log4JLogger
#JDK Logger
#org.apache.commons.logging.Log=org.apache.commons.logging.impl.Jdk14Logger

View File

@ -1,27 +1,78 @@
<?xml version="1.0"?> <?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Put site-specific property overrides in this file. --> <!-- Put site-specific property overrides in this file. -->
<configuration> <configuration>
<property>
<name>local.realm</name>
<value>${KERBEROS_REALM}</value>
</property>
<!-- file system properties -->
<property> <property>
<name>fs.default.name</name> <name>fs.default.name</name>
<value>${HADOOP_NN_HOST}</value> <value>hdfs://${HADOOP_NN_HOST}:8020</value>
<description>The name of the default file system. Either the
literal string "local" or a host:port for NDFS.
</description>
<final>true</final>
</property> </property>
<property>
<name>fs.trash.interval</name>
<value>360</value>
<description>Number of minutes between trash checkpoints.
If zero, the trash feature is disabled.
</description>
</property>
<property>
<name>hadoop.security.auth_to_local</name>
<value>
RULE:[2:$1@$0]([jt]t@.*${KERBEROS_REALM})s/.*/${HADOOP_MR_USER}/
RULE:[2:$1@$0]([nd]n@.*${KERBEROS_REALM})s/.*/${HADOOP_HDFS_USER}/
RULE:[2:$1@$0](mapred@.*${KERBEROS_REALM})s/.*/${HADOOP_MR_USER}/
RULE:[2:$1@$0](hdfs@.*${KERBEROS_REALM})s/.*/${HADOOP_HDFS_USER}/
RULE:[2:$1@$0](mapredqa@.*${KERBEROS_REALM})s/.*/${HADOOP_MR_USER}/
RULE:[2:$1@$0](hdfsqa@.*${KERBEROS_REALM})s/.*/${HADOOP_HDFS_USER}/
DEFAULT
</value>
<description></description>
</property>
<property>
<name>hadoop.security.authentication</name>
<value>${SECURITY_TYPE}</value>
<description>
Set the authentication for the cluster. Valid values are: simple or
kerberos.
</description>
</property>
<property>
<name>hadoop.security.authorization</name>
<value>${SECURITY}</value>
<description>
Enable authorization for different protocols.
</description>
</property>
<property>
<name>hadoop.security.groups.cache.secs</name>
<value>14400</value>
</property>
<property>
<name>hadoop.kerberos.kinit.command</name>
<value>${KINIT}</value>
</property>
<property>
<name>hadoop.http.filter.initializers</name>
<value>org.apache.hadoop.http.lib.StaticUserWebFilter</value>
</property>
</configuration> </configuration>

View File

@ -0,0 +1,54 @@
# Set Hadoop-specific environment variables here.
# The only required environment variable is JAVA_HOME. All others are
# optional. When running a distributed configuration it is best to
# set JAVA_HOME in this file, so that it is correctly defined on
# remote nodes.
# The java implementation to use.
export JAVA_HOME=${JAVA_HOME}
export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"/etc/hadoop"}
# Extra Java CLASSPATH elements. Automatically insert capacity-scheduler.
for f in $HADOOP_HOME/contrib/capacity-scheduler/*.jar; do
if [ "$HADOOP_CLASSPATH" ]; then
export HADOOP_CLASSPATH=${HADOOP_CLASSPATH}:$f
else
export HADOOP_CLASSPATH=$f
fi
done
# The maximum amount of heap to use, in MB. Default is 1000.
#export HADOOP_HEAPSIZE=
#export HADOOP_NAMENODE_INIT_HEAPSIZE=""
# Extra Java runtime options. Empty by default.
export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true ${HADOOP_OPTS}"
# Command specific options appended to HADOOP_OPTS when specified
export HADOOP_NAMENODE_OPTS="-Dsecurity.audit.logger=INFO,DRFAS -Dhdfs.audit.logger=INFO,DRFAAUDIT ${HADOOP_NAMENODE_OPTS}"
HADOOP_JOBTRACKER_OPTS="-Dsecurity.audit.logger=INFO,DRFAS -Dmapred.audit.logger=INFO,MRAUDIT -Dmapred.jobsummary.logger=INFO,JSA ${HADOOP_JOBTRACKER_OPTS}"
HADOOP_TASKTRACKER_OPTS="-Dsecurity.audit.logger=ERROR,console -Dmapred.audit.logger=ERROR,console ${HADOOP_TASKTRACKER_OPTS}"
HADOOP_DATANODE_OPTS="-Dsecurity.audit.logger=ERROR,DRFAS ${HADOOP_DATANODE_OPTS}"
export HADOOP_SECONDARYNAMENODE_OPTS="-Dsecurity.audit.logger=INFO,DRFAS -Dhdfs.audit.logger=INFO,DRFAAUDIT ${HADOOP_SECONDARYNAMENODE_OPTS}"
# The following applies to multiple commands (fs, dfs, fsck, distcp etc)
export HADOOP_CLIENT_OPTS="-Xmx128m ${HADOOP_CLIENT_OPTS}"
#HADOOP_JAVA_PLATFORM_OPTS="-XX:-UsePerfData ${HADOOP_JAVA_PLATFORM_OPTS}"
# On secure datanodes, user to run the datanode as after dropping privileges
export HADOOP_SECURE_DN_USER=${HADOOP_HDFS_USER}
# Where log files are stored. $HADOOP_HOME/logs by default.
export HADOOP_LOG_DIR=${HADOOP_LOG_DIR}/$USER
# Where log files are stored in the secure data environment.
export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER}
# The directory where pid files are stored. /tmp by default.
export HADOOP_PID_DIR=${HADOOP_PID_DIR}
export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR}
# A string representing this instance of hadoop. $USER by default.
export HADOOP_IDENT_STRING=$USER

View File

@ -0,0 +1,118 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>security.client.protocol.acl</name>
<value>*</value>
<description>ACL for ClientProtocol, which is used by user code
via the DistributedFileSystem.
The ACL is a comma-separated list of user and group names. The user and
group list is separated by a blank. For e.g. "alice,bob users,wheel".
A special value of "*" means all users are allowed.</description>
</property>
<property>
<name>security.client.datanode.protocol.acl</name>
<value>*</value>
<description>ACL for ClientDatanodeProtocol, the client-to-datanode protocol
for block recovery.
The ACL is a comma-separated list of user and group names. The user and
group list is separated by a blank. For e.g. "alice,bob users,wheel".
A special value of "*" means all users are allowed.</description>
</property>
<property>
<name>security.datanode.protocol.acl</name>
<value>*</value>
<description>ACL for DatanodeProtocol, which is used by datanodes to
communicate with the namenode.
The ACL is a comma-separated list of user and group names. The user and
group list is separated by a blank. For e.g. "alice,bob users,wheel".
A special value of "*" means all users are allowed.</description>
</property>
<property>
<name>security.inter.datanode.protocol.acl</name>
<value>*</value>
<description>ACL for InterDatanodeProtocol, the inter-datanode protocol
for updating generation timestamp.
The ACL is a comma-separated list of user and group names. The user and
group list is separated by a blank. For e.g. "alice,bob users,wheel".
A special value of "*" means all users are allowed.</description>
</property>
<property>
<name>security.namenode.protocol.acl</name>
<value>*</value>
<description>ACL for NamenodeProtocol, the protocol used by the secondary
namenode to communicate with the namenode.
The ACL is a comma-separated list of user and group names. The user and
group list is separated by a blank. For e.g. "alice,bob users,wheel".
A special value of "*" means all users are allowed.</description>
</property>
<property>
<name>security.inter.tracker.protocol.acl</name>
<value>*</value>
<description>ACL for InterTrackerProtocol, used by the tasktrackers to
communicate with the jobtracker.
The ACL is a comma-separated list of user and group names. The user and
group list is separated by a blank. For e.g. "alice,bob users,wheel".
A special value of "*" means all users are allowed.</description>
</property>
<property>
<name>security.job.submission.protocol.acl</name>
<value>*</value>
<description>ACL for JobSubmissionProtocol, used by job clients to
communciate with the jobtracker for job submission, querying job status etc.
The ACL is a comma-separated list of user and group names. The user and
group list is separated by a blank. For e.g. "alice,bob users,wheel".
A special value of "*" means all users are allowed.</description>
</property>
<property>
<name>security.task.umbilical.protocol.acl</name>
<value>*</value>
<description>ACL for TaskUmbilicalProtocol, used by the map and reduce
tasks to communicate with the parent tasktracker.
The ACL is a comma-separated list of user and group names. The user and
group list is separated by a blank. For e.g. "alice,bob users,wheel".
A special value of "*" means all users are allowed.</description>
</property>
<property>
<name>security.admin.operations.protocol.acl</name>
<value>${HADOOP_HDFS_USER}</value>
<description>ACL for AdminOperationsProtocol. Used for admin commands.
The ACL is a comma-separated list of user and group names. The user and
group list is separated by a blank. For e.g. "alice,bob users,wheel".
A special value of "*" means all users are allowed.</description>
</property>
<property>
<name>security.refresh.usertogroups.mappings.protocol.acl</name>
<value>${HADOOP_HDFS_USER}</value>
<description>ACL for RefreshUserMappingsProtocol. Used to refresh
users mappings. The ACL is a comma-separated list of user and
group names. The user and group list is separated by a blank. For
e.g. "alice,bob users,wheel". A special value of "*" means all
users are allowed.</description>
</property>
<property>
<name>security.refresh.policy.protocol.acl</name>
<value>${HADOOP_HDFS_USER}</value>
<description>ACL for RefreshAuthorizationPolicyProtocol, used by the
dfsadmin and mradmin commands to refresh the security policy in-effect.
The ACL is a comma-separated list of user and group names. The user and
group list is separated by a blank. For e.g. "alice,bob users,wheel".
A special value of "*" means all users are allowed.</description>
</property>
</configuration>

View File

@ -0,0 +1,225 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<!-- file system properties -->
<property>
<name>dfs.name.dir</name>
<value>${HADOOP_NN_DIR}</value>
<description>Determines where on the local filesystem the DFS name node
should store the name table. If this is a comma-delimited list
of directories then the name table is replicated in all of the
directories, for redundancy. </description>
<final>true</final>
</property>
<property>
<name>dfs.data.dir</name>
<value>${HADOOP_DN_DIR}</value>
<description>Determines where on the local filesystem an DFS data node
should store its blocks. If this is a comma-delimited
list of directories, then data will be stored in all named
directories, typically on different devices.
Directories that do not exist are ignored.
</description>
<final>true</final>
</property>
<property>
<name>dfs.safemode.threshold.pct</name>
<value>1.0f</value>
<description>
Specifies the percentage of blocks that should satisfy
the minimal replication requirement defined by dfs.replication.min.
Values less than or equal to 0 mean not to start in safe mode.
Values greater than 1 will make safe mode permanent.
</description>
</property>
<property>
<name>dfs.datanode.address</name>
<value>${HADOOP_DN_ADDR}</value>
</property>
<property>
<name>dfs.datanode.http.address</name>
<value>${HADOOP_DN_HTTP_ADDR}</value>
</property>
<property>
<name>dfs.http.address</name>
<value>${HADOOP_NN_HOST}:50070</value>
<description>The name of the default file system. Either the
literal string "local" or a host:port for NDFS.
</description>
<final>true</final>
</property>
<!-- Permissions configuration -->
<property>
<name>dfs.umaskmode</name>
<value>077</value>
<description>
The octal umask used when creating files and directories.
</description>
</property>
<property>
<name>dfs.block.access.token.enable</name>
<value>${SECURITY}</value>
<description>
Are access tokens are used as capabilities for accessing datanodes.
</description>
</property>
<property>
<name>dfs.namenode.kerberos.principal</name>
<value>nn/_HOST@${local.realm}</value>
<description>
Kerberos principal name for the NameNode
</description>
</property>
<property>
<name>dfs.secondary.namenode.kerberos.principal</name>
<value>nn/_HOST@${local.realm}</value>
<description>
Kerberos principal name for the secondary NameNode.
</description>
</property>
<property>
<name>dfs.namenode.kerberos.https.principal</name>
<value>host/_HOST@${local.realm}</value>
<description>
The Kerberos principal for the host that the NameNode runs on.
</description>
</property>
<property>
<name>dfs.secondary.namenode.kerberos.https.principal</name>
<value>host/_HOST@${local.realm}</value>
<description>
The Kerberos principal for the hostthat the secondary NameNode runs on.
</description>
</property>
<property>
<name>dfs.secondary.https.port</name>
<value>50490</value>
<description>The https port where secondary-namenode binds</description>
</property>
<property>
<name>dfs.datanode.kerberos.principal</name>
<value>dn/_HOST@${local.realm}</value>
<description>
The Kerberos principal that the DataNode runs as. "_HOST" is replaced by
the real host name.
</description>
</property>
<property>
<name>dfs.namenode.keytab.file</name>
<value>/etc/security/keytabs/nn.service.keytab</value>
<description>
Combined keytab file containing the namenode service and host principals.
</description>
</property>
<property>
<name>dfs.secondary.namenode.keytab.file</name>
<value>/etc/security/keytabs/nn.service.keytab</value>
<description>
Combined keytab file containing the namenode service and host principals.
</description>
</property>
<property>
<name>dfs.datanode.keytab.file</name>
<value>/etc/security/keytabs/dn.service.keytab</value>
<description>
The filename of the keytab file for the DataNode.
</description>
</property>
<property>
<name>dfs.https.port</name>
<value>50470</value>
<description>The https port where namenode binds</description>
</property>
<property>
<name>dfs.https.address</name>
<value>${HADOOP_NN_HOST}:50470</value>
<description>The https address where namenode binds</description>
</property>
<property>
<name>dfs.datanode.data.dir.perm</name>
<value>700</value>
<description>The permissions that should be there on dfs.data.dir
directories. The datanode will not come up if the permissions are
different on existing dfs.data.dir directories. If the directories
don't exist, they will be created with this permission.
</description>
</property>
<property>
<name>dfs.cluster.administrators</name>
<value>${HADOOP_HDFS_USER}</value>
<description>ACL for who all can view the default servlets in the HDFS</description>
</property>
<property>
<name>dfs.permissions.superusergroup</name>
<value>${HADOOP_GROUP}</value>
<description>The name of the group of super-users.</description>
</property>
<property>
<name>dfs.namenode.http-address</name>
<value>${HADOOP_NN_HOST}:50070</value>
<description>
The address and the base port where the dfs namenode web ui will listen on.
If the port is 0 then the server will start on a free port.
</description>
</property>
<property>
<name>dfs.namenode.https-address</name>
<value>${HADOOP_NN_HOST}:50470</value>
</property>
<property>
<name>dfs.secondary.http.address</name>
<value>${HADOOP_SNN_HOST}:50090</value>
<description>
The secondary namenode http server address and port.
If the port is 0 then the server will start on a free port.
</description>
</property>
<property>
<name>dfs.hosts</name>
<value>${HADOOP_CONF_DIR}/dfs.include</value>
<description>Names a file that contains a list of hosts that are
permitted to connect to the namenode. The full pathname of the file
must be specified. If the value is empty, all hosts are
permitted.</description>
</property>
<property>
<name>dfs.hosts.exclude</name>
<value>${HADOOP_CONF_DIR}/dfs.exclude</value>
<description>Names a file that contains a list of hosts that are
not permitted to connect to the namenode. The full pathname of the
file must be specified. If the value is empty, no hosts are
excluded.
</description>
</property>
</configuration>

View File

@ -0,0 +1,12 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>mapred.queue.default.acl-submit-job</name>
<value>*</value>
</property>
<property>
<name>mapred.queue.default.acl-administer-jobs</name>
<value>*</value>
</property>
</configuration>

View File

@ -0,0 +1,268 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>mapred.tasktracker.tasks.sleeptime-before-sigkill</name>
<value>250</value>
<description>Normally, this is the amount of time before killing
processes, and the recommended-default is 5.000 seconds - a value of
5000 here. In this case, we are using it solely to blast tasks before
killing them, and killing them very quickly (1/4 second) to guarantee
that we do not leave VMs around for later jobs.
</description>
</property>
<property>
<name>mapred.system.dir</name>
<value>/mapred/mapredsystem</value>
<final>true</final>
</property>
<property>
<name>mapred.job.tracker</name>
<value>${HADOOP_JT_HOST}:9000</value>
<final>true</final>
</property>
<property>
<name>mapred.job.tracker.http.address</name>
<value>${HADOOP_JT_HOST}:50030</value>
<final>true</final>
</property>
<property>
<name>mapred.local.dir</name>
<value>${HADOOP_MAPRED_DIR}</value>
<final>true</final>
</property>
<property>
<name>mapreduce.cluster.administrators</name>
<value>${HADOOP_MR_USER}</value>
</property>
<property>
<name>mapred.map.tasks.speculative.execution</name>
<value>false</value>
<description>If true, then multiple instances of some map tasks
may be executed in parallel.</description>
</property>
<property>
<name>mapred.reduce.tasks.speculative.execution</name>
<value>false</value>
<description>If true, then multiple instances of some reduce tasks
may be executed in parallel.</description>
</property>
<property>
<name>mapred.output.compression.type</name>
<value>BLOCK</value>
<description>If the job outputs are to compressed as SequenceFiles, how
should they be compressed? Should be one of NONE, RECORD or BLOCK.
</description>
</property>
<property>
<name>jetty.connector</name>
<value>org.mortbay.jetty.nio.SelectChannelConnector</value>
</property>
<property>
<name>mapred.task.tracker.task-controller</name>
<value>${TASK_CONTROLLER}</value>
</property>
<property>
<name>mapred.child.root.logger</name>
<value>INFO,TLA</value>
</property>
<property>
<name>stream.tmpdir</name>
<value>${mapred.temp.dir}</value>
</property>
<property>
<name>mapred.child.java.opts</name>
<value>-server -Xmx640m -Djava.net.preferIPv4Stack=true</value>
</property>
<property>
<name>mapred.child.ulimit</name>
<value>8388608</value>
</property>
<property>
<name>mapred.job.tracker.persist.jobstatus.active</name>
<value>true</value>
<description>Indicates if persistency of job status information is
active or not.
</description>
</property>
<property>
<name>mapred.job.tracker.persist.jobstatus.dir</name>
<value>file:///${HADOOP_LOG_DIR}/${HADOOP_MR_USER}/jobstatus</value>
<description>The directory where the job status information is persisted
in a file system to be available after it drops of the memory queue and
between jobtracker restarts.
</description>
</property>
<property>
<name>mapred.job.tracker.history.completed.location</name>
<value>/mapred/history/done</value>
</property>
<property>
<name>mapred.heartbeats.in.second</name>
<value>200</value>
<description>to enable HADOOP:5784</description>
</property>
<property>
<name>mapreduce.tasktracker.outofband.heartbeat</name>
<value>true</value>
<description>to enable MAPREDUCE:270</description>
</property>
<property>
<name>mapred.jobtracker.maxtasks.per.job</name>
<value>200000</value>
<final>true</final>
<description>The maximum number of tasks for a single job.
A value of -1 indicates that there is no maximum.
</description>
</property>
<property>
<name>mapreduce.jobtracker.kerberos.principal</name>
<value>jt/_HOST@${local.realm}</value>
<description>
JT principal
</description>
</property>
<property>
<name>mapreduce.tasktracker.kerberos.principal</name>
<value>tt/_HOST@${local.realm}</value>
<description>
TT principal.
</description>
</property>
<property>
<name>hadoop.job.history.user.location</name>
<value>none</value>
</property>
<property>
<name>mapreduce.jobtracker.keytab.file</name>
<value>/etc/security/keytabs/jt.service.keytab</value>
<description>
The keytab for the jobtracker principal.
</description>
</property>
<property>
<name>mapreduce.tasktracker.keytab.file</name>
<value>/etc/security/keytabs/tt.service.keytab</value>
<description>The filename of the keytab for the task tracker</description>
</property>
<property>
<name>mapreduce.jobtracker.staging.root.dir</name>
<value>/user</value>
<description>The Path prefix for where the staging directories should be
placed. The next level is always the user's
name. It is a path in the default file system.
</description>
</property>
<property>
<name>mapreduce.job.acl-modify-job</name>
<value></value>
</property>
<property>
<name>mapreduce.job.acl-view-job</name>
<value>Dr.Who</value>
</property>
<property>
<name>mapreduce.tasktracker.group</name>
<value>${HADOOP_GROUP}</value>
<description>The group that the task controller uses for accessing the
task controller. The mapred user must be a member and users should *not*
be members.
</description>
</property>
<property>
<name>mapred.acls.enabled</name>
<value>true</value>
</property>
<property>
<name>mapred.jobtracker.taskScheduler</name>
<value>org.apache.hadoop.mapred.CapacityTaskScheduler</value>
</property>
<property>
<name>mapred.queue.names</name>
<value>default</value>
</property>
<!-- settings for the history server -->
<property>
<name>mapreduce.history.server.embedded</name>
<value>false</value>
</property>
<property>
<name>mapreduce.history.server.http.address</name>
<value>${HADOOP_JT_HOST}:51111</value>
</property>
<property>
<name>mapreduce.jobhistory.kerberos.principal</name>
<value>jt/_HOST@${local.realm}</value>
<description>history server principal</description>
</property>
<property>
<name>mapreduce.jobhistory.keytab.file</name>
<value>/etc/security/keytabs/jt.service.keytab</value>
<description>
The keytab for the jobtracker principal.
</description>
</property>
<property>
<name>mapred.hosts</name>
<value>${HADOOP_CONF_DIR}/mapred.include</value>
<description>Names a file that contains the list of nodes that may
connect to the jobtracker. If the value is empty, all hosts are
permitted.</description>
</property>
<property>
<name>mapred.hosts.exclude</name>
<value>${HADOOP_CONF_DIR}/mapred.exclude</value>
<description>Names a file that contains the list of hosts that
should be excluded by the jobtracker. If the value is empty, no
hosts are excluded.</description>
</property>
<property>
<name>mapred.jobtracker.retirejob.check</name>
<value>10000</value>
</property>
<property>
<name>mapred.jobtracker.retirejob.interval</name>
<value>0</value>
</property>
</configuration>

View File

@ -0,0 +1,3 @@
mapreduce.cluster.local.dir=${HADOOP_MAPRED_DIR}
mapreduce.tasktracker.group=${HADOOP_GROUP}
hadoop.log.dir=${HADOOP_LOG_DIR}/${HADOOP_MR_USER}