hbase/dev-support/zombie-detector.sh

167 lines
5.0 KiB
Bash
Raw Permalink Normal View History

#!/usr/bin/env bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Looks for any running zombies left over from old build runs.
# Reports stale processes and tries to take a stack trace of each so we can
# figure out how they are hung. Echoes state to STDERR as the script runs,
# but prints the final output on STDOUT, formatted so that it
# folds into the test result formatting done by test-patch.sh.
# This script is called from test-patch.sh but also after tests
# have run up on builds.apache.org.
# TODO: format output to suit context -- test-patch, jenkins or dev env
#set -x
# printenv
### Setup some variables.
# Directory holding this script. "$0" is quoted so that a path containing
# spaces or glob characters reaches dirname as a single argument.
bindir=$(dirname "$0")
# This key is set by our surefire configuration up in the main pom.xml
# This key needs to match the key we set up there.
HBASE_BUILD_ID_KEY="hbase.build.id="
# Empty unless the --jenkins option switches it to "true".
JENKINS=
# External commands. Each keeps a value already present in the environment
# and otherwise falls back to the plain command name; the matching
# --*-cmd option can override it later.
PS=${PS:-ps}
AWK=${AWK:-awk}
WGET=${WGET:-wget}
GREP=${GREP:-grep}
# Note: the environment override for the jira client is read from JIRA,
# not from JIRACLI.
JIRACLI=${JIRA:-jira}
###############################################################################
# Print the command-line help text on STDOUT.
# "$0" (the path the script was invoked with) is substituted into the
# first line; every other line is fixed text.
printUsage() {
  # One argument per output line; empty strings produce the blank lines.
  printf '%s\n' \
    "Usage: $0 [options] BUILD_ID" \
    "" \
    "Where:" \
    " BUILD_ID is build id to look for in process listing" \
    "" \
    "Options:" \
    "--ps-cmd=<cmd> The 'ps' command to use (default 'ps')" \
    "--awk-cmd=<cmd> The 'awk' command to use (default 'awk')" \
    "--grep-cmd=<cmd> The 'grep' command to use (default 'grep')" \
    "" \
    "Jenkins-only options:" \
    "--jenkins Run by Jenkins (runs tests and posts results to JIRA)" \
    "--wget-cmd=<cmd> The 'wget' command to use (default 'wget')" \
    "--jira-cmd=<cmd> The 'jira' command to use (default 'jira')"
}
###############################################################################
# Parse the script's command-line arguments into global settings.
#
# Globals written: JENKINS, PS, AWK, WGET, GREP, JIRACLI, BUILD_ID
# Arguments: the script's arguments. "--jenkins" sets JENKINS=true, each
#   "--<tool>-cmd=<value>" replaces the matching command variable with
#   everything after the first '=', and any other word is stored as
#   BUILD_ID (the last such word wins).
# Exits: prints the usage text and exits with status 1 when BUILD_ID is
#   empty after parsing. BUILD_ID is not cleared first, so a value that is
#   already set (for example in the environment) satisfies the check.
parseArgs() {
  local arg
  # Iterate over "$@" rather than $* so that each argument stays one word:
  # values with spaces are not split and glob characters are not expanded.
  for arg in "$@"; do
    case "${arg}" in
      --jenkins)
        JENKINS=true
        ;;
      --ps-cmd=*)
        PS=${arg#*=}
        ;;
      --awk-cmd=*)
        AWK=${arg#*=}
        ;;
      --wget-cmd=*)
        WGET=${arg#*=}
        ;;
      --grep-cmd=*)
        GREP=${arg#*=}
        ;;
      --jira-cmd=*)
        JIRACLI=${arg#*=}
        ;;
      *)
        BUILD_ID=${arg}
        ;;
    esac
  done
  if [[ -z "${BUILD_ID}" ]]; then
    printUsage
    exit 1
  fi
}
### Return list of the processes found with passed build id.
#
# Filters the output of `jps -v`, keeping lines that contain
# "surefirebooter" and also match ${HBASE_BUILD_TAG}.
# Globals read: HBASE_BUILD_TAG, GREP (grep command, 'grep' when unset)
# Outputs: the matching `jps -v` lines on STDOUT
# Returns: the status of the last grep (non-zero when nothing matched)
find_processes () {
  # Use the configurable grep (--grep-cmd / GREP) instead of a hard-coded
  # one. The expansion is left unquoted on purpose so that a value carrying
  # extra arguments, such as "grep -a", still runs as a command plus options.
  jps -v \
    | ${GREP:-grep} surefirebooter \
    | ${GREP:-grep} -e "${HBASE_BUILD_TAG}"
}
### Look for zombies
#
# Collects candidate processes with find_processes, waits 30 seconds to give
# slow ones a chance to stop, then re-checks every candidate PID. A PID that
# is still running with this build's tag counts as a zombie and a short
# jstack excerpt is recorded for it.
#
# Globals read:    HBASE_BUILD_TAG, JIRA_COMMENT, PS, AWK, GREP
#                  (PS/AWK/GREP fall back to ps/awk/grep when unset)
# Globals written: ZOMBIES, ZOMBIE_TESTS_COUNT, PIDS, PS_OUTPUT, PS_STACK,
#                  ZB_STACK
# Outputs: progress messages on STDERR; the JIRA-formatted verdict, prefixed
#          with ${JIRA_COMMENT}, on STDOUT.
# Exits:   calls `exit 1` (ending the whole script) when zombies remain;
#          otherwise returns 0.
zombies () {
  local grace_period=30
  local pid

  ZOMBIES=$(find_processes)
  if [[ -z "${ZOMBIES}" ]]; then
    ZOMBIE_TESTS_COUNT=0
  else
    # $(( )) strips the padding some wc implementations put around the count.
    ZOMBIE_TESTS_COUNT=$(( $(printf '%s\n' "${ZOMBIES}" | wc -l) ))
  fi

  if (( ZOMBIE_TESTS_COUNT == 0 )); then
    echo "$(date) We're ok: there is no zombie test" >&2
    echo "$JIRA_COMMENT
{color:green}+1 zombies{color}. No zombie tests found running at the end of the build."
    return 0
  fi

  echo "$(date) Found ${ZOMBIE_TESTS_COUNT} suspicious java process(es) listed below; waiting ${grace_period}s to see if just slow to stop" >&2
  # Quoted so each candidate stays on its own line in the log.
  echo "${ZOMBIES}" >&2
  sleep "${grace_period}"

  PIDS=$(printf '%s\n' "${ZOMBIES}" | ${AWK:-awk} '{print $1}')
  ZOMBIE_TESTS_COUNT=0
  # Start from a clean report instead of appending to an inherited value.
  ZB_STACK=""
  for pid in ${PIDS}; do
    # Test our zombie is still running (and that it is still an hbase build
    # item). The full argument list is requested explicitly (-o args=, -ww
    # for unlimited width): on Linux (procps) the default `ps -p` columns
    # show only the executable name, so the build tag could never match.
    # The empty "=" headers suppress the header line, which replaces the
    # obsolete `tail +2`.
    PS_OUTPUT=$(${PS:-ps} -ww -o pid= -o args= -p "${pid}" \
      | ${GREP:-grep} -e "${HBASE_BUILD_TAG}")
    if [[ -n "${PS_OUTPUT}" ]]; then
      echo "$(date) Zombie: $PS_OUTPUT" >&2
      ZOMBIE_TESTS_COUNT=$((ZOMBIE_TESTS_COUNT + 1))
      # Keep at most three stack lines that mention a .Test* class in a
      # .java file.
      PS_STACK=$(jstack "${pid}" \
        | ${GREP:-grep} -e "\.Test" \
        | ${GREP:-grep} -e "\.java" \
        | head -n 3)
      echo "${PS_STACK}" >&2
      # The literal \n is expanded by `echo -e` when the report is printed.
      ZB_STACK="${ZB_STACK}\nPID=${pid} ${PS_STACK}"
    fi
  done

  if (( ZOMBIE_TESTS_COUNT == 0 )); then
    echo "$(date) We're ok: there was a zombie candidate but it went away" >&2
    echo "$JIRA_COMMENT
{color:green}+1 zombies{color}. No zombie tests found running at the end of the build (There were candidates but they seem to have gone away)."
    return 0
  fi

  echo "$(date) There are ${ZOMBIE_TESTS_COUNT} possible zombie test(s)." >&2
  # If JIRA_COMMENT in environment, append our findings to it.
  # Failure is reported as a red -1 and the markup is closed with {color}
  # (the previous "{red}" closing tag was not valid JIRA markup).
  echo -e "$JIRA_COMMENT
{color:red}-1 zombies{color}. There are ${ZOMBIE_TESTS_COUNT} possible zombie test(s)
${ZB_STACK}"
  # Exit with exit code of 1. This ends the script here, so code after the
  # call to zombies does not run on this path.
  exit 1
}
### Check if arguments to the script have been specified properly or not
# "$@" is quoted so every argument reaches parseArgs as a single word.
parseArgs "$@"
# The tag searched for in process listings: surefire key plus the build id.
HBASE_BUILD_TAG="${HBASE_BUILD_ID_KEY}${BUILD_ID}"
zombies
RESULT=$?
# Under Jenkins a failed check is reported with exit status 100.
# NOTE(review): zombies calls `exit 1` itself when it finds zombies, so this
# branch is only reached if zombies returns a non-zero status instead of
# exiting -- confirm which exit code Jenkins jobs are meant to see.
if [[ "${JENKINS}" == "true" && "${RESULT}" -ne 0 ]]; then
  exit 100
fi
# Propagate the status of zombies instead of overwriting RESULT with the
# status of the `if` above.
exit "${RESULT}"