HADOOP-11045. Introducing a tool to detect flaky tests of hadoop jenkins testing job. Contributed by Yongjun Zhang and Todd Lipcon.

(cherry picked from commit 80705e034b)
This commit is contained in:
Tsuyoshi Ozawa 2015-02-04 01:26:31 +09:00
parent 3828fb5fab
commit b396d2e8d8
2 changed files with 207 additions and 0 deletions

View File

@ -0,0 +1,204 @@
#!/usr/bin/env python
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Given a jenkins test job, this script examines all runs of the job done
# within specified period of time (number of days prior to the execution
# time of this script), and reports all failed tests.
#
# The output of this script includes a section for each run that has failed
# tests, with each failed test name listed.
#
# More importantly, at the end, it outputs a summary section to list all failed
# tests within all examined runs, and indicate how many runs a same test
# failed, and sorted all failed tests by how many runs each test failed.
#
# This way, when we see failed tests in PreCommit build, we can quickly tell
# whether a failed test is a new failure, or it failed before and how often it
# failed, so to have idea whether it may just be a flaky test.
#
# Of course, to be 100% sure about the reason of a test failure, closer look
# at the failed test for the specific run is necessary.
#
import sys
import platform
sysversion = sys.hexversion
onward30 = False
if sysversion < 0x020600F0:
sys.exit("Minimum supported python version is 2.6, the current version is " +
"Python" + platform.python_version())
if sysversion == 0x030000F0:
sys.exit("There is a known bug with Python" + platform.python_version() +
", please try a different version");
if sysversion < 0x03000000:
import urllib2
else:
onward30 = True
import urllib.request
import datetime
import json as simplejson
import logging
from optparse import OptionParser
import time
# Configuration
DEFAULT_JENKINS_URL = "https://builds.apache.org"
DEFAULT_JOB_NAME = "Hadoop-Common-trunk"
DEFAULT_NUM_PREVIOUS_DAYS = 14
SECONDS_PER_DAY = 86400
# total number of runs to examine
numRunsToExamine = 0
""" Parse arguments """
def parse_args():
parser = OptionParser()
parser.add_option("-J", "--jenkins-url", type="string",
dest="jenkins_url", help="Jenkins URL",
default=DEFAULT_JENKINS_URL)
parser.add_option("-j", "--job-name", type="string",
dest="job_name", help="Job name to look at",
default=DEFAULT_JOB_NAME)
parser.add_option("-n", "--num-days", type="int",
dest="num_prev_days", help="Number of days to examine",
default=DEFAULT_NUM_PREVIOUS_DAYS)
(options, args) = parser.parse_args()
if args:
parser.error("unexpected arguments: " + repr(args))
return options
""" Load data from specified url """
def load_url_data(url):
if onward30:
ourl = urllib.request.urlopen(url)
codec = ourl.info().get_param('charset')
content = ourl.read().decode(codec)
data = simplejson.loads(content)
else:
ourl = urllib2.urlopen(url)
data = simplejson.load(ourl)
return data
""" List all builds of the target project. """
def list_builds(jenkins_url, job_name):
url = "%(jenkins)s/job/%(job_name)s/api/json?tree=builds[url,result,timestamp]" % dict(
jenkins=jenkins_url,
job_name=job_name)
try:
data = load_url_data(url)
except:
logging.error("Could not fetch: %s" % url)
raise
return data['builds']
""" Find the names of any tests which failed in the given build output URL. """
def find_failing_tests(testReportApiJson, jobConsoleOutput):
ret = set()
try:
data = load_url_data(testReportApiJson)
except:
logging.error(" Could not open testReport, check " +
jobConsoleOutput + " for why it was reported failed")
return ret
for suite in data['suites']:
for cs in suite['cases']:
status = cs['status']
errDetails = cs['errorDetails']
if (status == 'REGRESSION' or status == 'FAILED' or (errDetails is not None)):
ret.add(cs['className'] + "." + cs['name'])
if len(ret) == 0:
logging.info(" No failed tests in testReport, check " +
jobConsoleOutput + " for why it was reported failed.")
return ret
""" Iterate runs of specfied job within num_prev_days and collect results """
def find_flaky_tests(jenkins_url, job_name, num_prev_days):
global numRunsToExamine
all_failing = dict()
# First list all builds
builds = list_builds(jenkins_url, job_name)
# Select only those in the last N days
min_time = int(time.time()) - SECONDS_PER_DAY * num_prev_days
builds = [b for b in builds if (int(b['timestamp']) / 1000) > min_time]
# Filter out only those that failed
failing_build_urls = [(b['url'] , b['timestamp']) for b in builds
if (b['result'] in ('UNSTABLE', 'FAILURE'))]
tnum = len(builds)
num = len(failing_build_urls)
numRunsToExamine = tnum
logging.info(" THERE ARE " + str(num) + " builds (out of " + str(tnum)
+ ") that have failed tests in the past " + str(num_prev_days) + " days"
+ ((".", ", as listed below:\n")[num > 0]))
for failed_build_with_time in failing_build_urls:
failed_build = failed_build_with_time[0];
jobConsoleOutput = failed_build + "Console";
testReport = failed_build + "testReport";
testReportApiJson = testReport + "/api/json";
ts = float(failed_build_with_time[1]) / 1000.
st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
logging.info("===>%s" % str(testReport) + " (" + st + ")")
failing = find_failing_tests(testReportApiJson, jobConsoleOutput)
if failing:
for ftest in failing:
logging.info(" Failed test: %s" % ftest)
all_failing[ftest] = all_failing.get(ftest,0)+1
return all_failing
def main():
global numRunsToExamine
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
# set up logger to write to stdout
soh = logging.StreamHandler(sys.stdout)
soh.setLevel(logging.INFO)
logger = logging.getLogger()
logger.removeHandler(logger.handlers[0])
logger.addHandler(soh)
opts = parse_args()
logging.info("****Recently FAILED builds in url: " + opts.jenkins_url
+ "/job/" + opts.job_name + "")
all_failing = find_flaky_tests(opts.jenkins_url, opts.job_name,
opts.num_prev_days)
if len(all_failing) == 0:
raise SystemExit(0)
logging.info("\nAmong " + str(numRunsToExamine) + " runs examined, all failed "
+ "tests <#failedRuns: testName>:")
# print summary section: all failed tests sorted by how many times they failed
for tn in sorted(all_failing, key=all_failing.get, reverse=True):
logging.info(" " + str(all_failing[tn])+ ": " + tn)
if __name__ == "__main__":
main()

View File

@ -28,6 +28,9 @@ Release 2.7.0 - UNRELEASED
HADOOP-11490. Expose truncate API via FileSystem and shell command. HADOOP-11490. Expose truncate API via FileSystem and shell command.
(Milan Desai via shv) (Milan Desai via shv)
HADOOP-11045. Introducing a tool to detect flaky tests of hadoop jenkins testing
job. (Yongjun Zhang and Todd Lipcon via ozawa)
IMPROVEMENTS IMPROVEMENTS
HADOOP-11483. HardLink.java should use the jdk7 createLink method (aajisaka) HADOOP-11483. HardLink.java should use the jdk7 createLink method (aajisaka)