MAPREDUCE-5066. Added a timeout for the job.end.notification.url. Contributed by Ivan Mitic.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1470216 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Arun Murthy 2013-04-20 19:18:24 +00:00
parent a91067fc5e
commit 794f9bb3e4
6 changed files with 246 additions and 14 deletions

View File

@ -327,6 +327,9 @@ Release 2.0.5-beta - UNRELEASED
MAPREDUCE-5163. Update MR App to not use API utility methods for collections
after YARN-441. (Xuan Gong via vinodkv)
MAPREDUCE-5066. Added a timeout for the job.end.notification.url. (Ivan
Mitic via acmurthy)
Release 2.0.4-alpha - UNRELEASED
INCOMPATIBLE CHANGES

View File

@ -27,6 +27,7 @@
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobContext;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.v2.api.records.JobReport;
import org.mortbay.log.Log;
@ -54,6 +55,7 @@ public class JobEndNotifier implements Configurable {
protected String proxyConf;
protected int numTries; //Number of tries to attempt notification
protected int waitInterval; //Time (ms) to wait between retrying notification
protected int timeout; // Timeout (ms) on the connection and notification
protected URL urlToNotify; //URL to notify read from the config
protected Proxy proxyToUse = Proxy.NO_PROXY; //Proxy to use for notification
@ -76,6 +78,9 @@ public void setConf(Configuration conf) {
);
waitInterval = (waitInterval < 0) ? 5000 : waitInterval;
timeout = conf.getInt(JobContext.MR_JOB_END_NOTIFICATION_TIMEOUT,
JobContext.DEFAULT_MR_JOB_END_NOTIFICATION_TIMEOUT);
userUrl = conf.get(MRJobConfig.MR_JOB_END_NOTIFICATION_URL);
proxyConf = conf.get(MRJobConfig.MR_JOB_END_NOTIFICATION_PROXY);
@ -112,8 +117,7 @@ public Configuration getConf() {
}
/**
* Notify the URL just once. Use best effort. Timeout hard coded to 5
* seconds.
* Notify the URL just once. Use best effort.
*/
protected boolean notifyURLOnce() {
boolean success = false;
@ -121,8 +125,8 @@ protected boolean notifyURLOnce() {
Log.info("Job end notification trying " + urlToNotify);
HttpURLConnection conn =
(HttpURLConnection) urlToNotify.openConnection(proxyToUse);
conn.setConnectTimeout(5*1000);
conn.setReadTimeout(5*1000);
conn.setConnectTimeout(timeout);
conn.setReadTimeout(timeout);
conn.setAllowUserInteraction(false);
if(conn.getResponseCode() != HttpURLConnection.HTTP_OK) {
Log.warn("Job end notification to " + urlToNotify +" failed with code: "

View File

@ -73,6 +73,13 @@ private void testWaitInterval(Configuration conf) {
+ waitInterval, waitInterval == 5000);
}
private void testTimeout(Configuration conf) {
conf.set(MRJobConfig.MR_JOB_END_NOTIFICATION_TIMEOUT, "1000");
setConf(conf);
Assert.assertTrue("Expected timeout to be 1000, but was "
+ timeout, timeout == 1000);
}
private void testProxyConfiguration(Configuration conf) {
conf.set(MRJobConfig.MR_JOB_END_NOTIFICATION_PROXY, "somehost");
setConf(conf);
@ -109,6 +116,7 @@ public void checkConfiguration() {
Configuration conf = new Configuration();
testNumRetries(conf);
testWaitInterval(conf);
testTimeout(conf);
testProxyConfiguration(conf);
}

View File

@ -44,9 +44,10 @@ private static JobEndStatusInfo createNotification(JobConf conf,
JobEndStatusInfo notification = null;
String uri = conf.getJobEndNotificationURI();
if (uri != null) {
// +1 to make logic for first notification identical to a retry
int retryAttempts = conf.getInt(JobContext.MR_JOB_END_RETRY_ATTEMPTS, 0) + 1;
int retryAttempts = conf.getInt(JobContext.MR_JOB_END_RETRY_ATTEMPTS, 0);
long retryInterval = conf.getInt(JobContext.MR_JOB_END_RETRY_INTERVAL, 30000);
int timeout = conf.getInt(JobContext.MR_JOB_END_NOTIFICATION_TIMEOUT,
JobContext.DEFAULT_MR_JOB_END_NOTIFICATION_TIMEOUT);
if (uri.contains("$jobId")) {
uri = uri.replace("$jobId", status.getJobID().toString());
}
@ -56,17 +57,22 @@ private static JobEndStatusInfo createNotification(JobConf conf,
(status.getRunState() == JobStatus.FAILED) ? "FAILED" : "KILLED";
uri = uri.replace("$jobStatus", statusStr);
}
notification = new JobEndStatusInfo(uri, retryAttempts, retryInterval);
notification = new JobEndStatusInfo(
uri, retryAttempts, retryInterval, timeout);
}
return notification;
}
private static int httpNotification(String uri) throws IOException {
private static int httpNotification(String uri, int timeout)
throws IOException {
URI url = new URI(uri, false);
HttpClient m_client = new HttpClient();
HttpClient httpClient = new HttpClient();
httpClient.getParams().setSoTimeout(timeout);
httpClient.getParams().setConnectionManagerTimeout(timeout);
HttpMethod method = new GetMethod(url.getEscapedURI());
method.setRequestHeader("Accept", "*/*");
return m_client.executeMethod(method);
return httpClient.executeMethod(method);
}
// for use by the LocalJobRunner, without using a thread&queue,
@ -74,9 +80,10 @@ private static int httpNotification(String uri) throws IOException {
public static void localRunnerNotification(JobConf conf, JobStatus status) {
JobEndStatusInfo notification = createNotification(conf, status);
if (notification != null) {
while (notification.configureForRetry()) {
do {
try {
int code = httpNotification(notification.getUri());
int code = httpNotification(notification.getUri(),
notification.getTimeout());
if (code != 200) {
throw new IOException("Invalid response status code: " + code);
}
@ -96,7 +103,7 @@ public static void localRunnerNotification(JobConf conf, JobStatus status) {
catch (InterruptedException iex) {
LOG.error("Notification retry error [" + notification + "]", iex);
}
}
} while (notification.configureForRetry());
}
}
@ -105,12 +112,15 @@ private static class JobEndStatusInfo implements Delayed {
private int retryAttempts;
private long retryInterval;
private long delayTime;
private int timeout;
JobEndStatusInfo(String uri, int retryAttempts, long retryInterval) {
JobEndStatusInfo(String uri, int retryAttempts, long retryInterval,
int timeout) {
this.uri = uri;
this.retryAttempts = retryAttempts;
this.retryInterval = retryInterval;
this.delayTime = System.currentTimeMillis();
this.timeout = timeout;
}
public String getUri() {
@ -125,6 +135,10 @@ public long getRetryInterval() {
return retryInterval;
}
public int getTimeout() {
return timeout;
}
public boolean configureForRetry() {
boolean retry = false;
if (getRetryAttempts() > 0) {

View File

@ -616,6 +616,9 @@ public interface MRJobConfig {
public static final String MR_JOB_END_NOTIFICATION_PROXY =
"mapreduce.job.end-notification.proxy";
public static final String MR_JOB_END_NOTIFICATION_TIMEOUT =
"mapreduce.job.end-notification.timeout";
public static final String MR_JOB_END_RETRY_ATTEMPTS =
"mapreduce.job.end-notification.retry.attempts";
@ -628,6 +631,9 @@ public interface MRJobConfig {
public static final String MR_JOB_END_NOTIFICATION_MAX_RETRY_INTERVAL =
"mapreduce.job.end-notification.max.retry.interval";
public static final int DEFAULT_MR_JOB_END_NOTIFICATION_TIMEOUT =
5000;
/*
* MR AM Service Authorization
*/

View File

@ -0,0 +1,197 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapred;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import junit.framework.TestCase;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.http.HttpServer;
public class TestJobEndNotifier extends TestCase {
HttpServer server;
URL baseUrl;
@SuppressWarnings("serial")
public static class JobEndServlet extends HttpServlet {
public static volatile int calledTimes = 0;
public static URI requestUri;
@Override
public void doGet(HttpServletRequest request,
HttpServletResponse response
) throws ServletException, IOException {
InputStreamReader in = new InputStreamReader(request.getInputStream());
PrintStream out = new PrintStream(response.getOutputStream());
calledTimes++;
try {
requestUri = new URI(null, null,
request.getRequestURI(), request.getQueryString(), null);
} catch (URISyntaxException e) {
}
in.close();
out.close();
}
}
// Servlet that delays requests for a long time
@SuppressWarnings("serial")
public static class DelayServlet extends HttpServlet {
public static volatile int calledTimes = 0;
@Override
public void doGet(HttpServletRequest request,
HttpServletResponse response
) throws ServletException, IOException {
boolean timedOut = false;
calledTimes++;
try {
// Sleep for a long time
Thread.sleep(1000000);
} catch (InterruptedException e) {
timedOut = true;
}
assertTrue("DelayServlet should be interrupted", timedOut);
}
}
// Servlet that fails all requests into it
@SuppressWarnings("serial")
public static class FailServlet extends HttpServlet {
public static volatile int calledTimes = 0;
@Override
public void doGet(HttpServletRequest request,
HttpServletResponse response
) throws ServletException, IOException {
calledTimes++;
throw new IOException("I am failing!");
}
}
public void setUp() throws Exception {
new File(System.getProperty("build.webapps", "build/webapps") + "/test"
).mkdirs();
server = new HttpServer("test", "0.0.0.0", 0, true);
server.addServlet("delay", "/delay", DelayServlet.class);
server.addServlet("jobend", "/jobend", JobEndServlet.class);
server.addServlet("fail", "/fail", FailServlet.class);
server.start();
int port = server.getPort();
baseUrl = new URL("http://localhost:" + port + "/");
JobEndServlet.calledTimes = 0;
JobEndServlet.requestUri = null;
DelayServlet.calledTimes = 0;
FailServlet.calledTimes = 0;
}
public void tearDown() throws Exception {
server.stop();
}
/**
* Basic validation for localRunnerNotification.
*/
public void testLocalJobRunnerUriSubstitution() throws InterruptedException {
JobStatus jobStatus = createTestJobStatus(
"job_20130313155005308_0001", JobStatus.SUCCEEDED);
JobConf jobConf = createTestJobConf(
new Configuration(), 0,
baseUrl + "jobend?jobid=$jobId&status=$jobStatus");
JobEndNotifier.localRunnerNotification(jobConf, jobStatus);
// No need to wait for the notification to go thru since calls are
// synchronous
// Validate params
assertEquals(1, JobEndServlet.calledTimes);
assertEquals("jobid=job_20130313155005308_0001&status=SUCCEEDED",
JobEndServlet.requestUri.getQuery());
}
/**
* Validate job.end.retry.attempts for the localJobRunner.
*/
public void testLocalJobRunnerRetryCount() throws InterruptedException {
int retryAttempts = 3;
JobStatus jobStatus = createTestJobStatus(
"job_20130313155005308_0001", JobStatus.SUCCEEDED);
JobConf jobConf = createTestJobConf(
new Configuration(), retryAttempts, baseUrl + "fail");
JobEndNotifier.localRunnerNotification(jobConf, jobStatus);
// Validate params
assertEquals(retryAttempts + 1, FailServlet.calledTimes);
}
/**
* Validate that the notification times out after reaching
* mapreduce.job.end-notification.timeout.
*/
public void testNotificationTimeout() throws InterruptedException {
Configuration conf = new Configuration();
// Reduce the timeout to 1 second
conf.setInt("mapreduce.job.end-notification.timeout", 1000);
JobStatus jobStatus = createTestJobStatus(
"job_20130313155005308_0001", JobStatus.SUCCEEDED);
JobConf jobConf = createTestJobConf(
conf, 0,
baseUrl + "delay");
long startTime = System.currentTimeMillis();
JobEndNotifier.localRunnerNotification(jobConf, jobStatus);
long elapsedTime = System.currentTimeMillis() - startTime;
// Validate params
assertEquals(1, DelayServlet.calledTimes);
// Make sure we timed out with time slightly above 1 second
// (default timeout is in terms of minutes, so we'll catch the problem)
assertTrue(elapsedTime < 2000);
}
private static JobStatus createTestJobStatus(String jobId, int state) {
return new JobStatus(
JobID.forName(jobId), 0.5f, 0.0f,
state, "root", "TestJobEndNotifier", null, null);
}
private static JobConf createTestJobConf(
Configuration conf, int retryAttempts, String notificationUri) {
JobConf jobConf = new JobConf(conf);
jobConf.setInt("job.end.retry.attempts", retryAttempts);
jobConf.set("job.end.retry.interval", "0");
jobConf.setJobEndNotificationURI(notificationUri);
return jobConf;
}
}