MAPREDUCE-6024. Shortened the time when Fetcher is stuck in retrying before concluding the failure by configuration. Contributed by Yunjiong Zhao.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1618677 13f79535-47bb-0310-9956-ffa450edef68
parent 4a978bff3e
commit f8e871d01b
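In short, the patch replaces two hardcoded thresholds in JobImpl (fetch-failure fraction 0.5 and notification count 3) with configurable ones, and teaches the shuffle scheduler to escalate fetch failures to the application master as soon as a host has failed more than a configurable number of times, so a reducer no longer sits out the full retry back-off against a dead node. A minimal sketch of tuning the new keys per job (property names and defaults come from the diff below; the chosen values are illustrative, not recommendations):

import org.apache.hadoop.mapred.JobConf;

public class ShuffleFailureTuning {
  public static void main(String[] args) {
    JobConf conf = new JobConf();
    // Fraction of shuffling reducers that must report fetch failures before a
    // map output is declared faulty (default 0.5).
    conf.setFloat("mapreduce.reduce.shuffle.max-fetch-failures-fraction", 0.25f);
    // Number of fetch-failure notifications after which a map task is failed
    // (default 3).
    conf.setInt("mapreduce.reduce.shuffle.max-fetch-failures-notifications", 2);
    // Per-host failure budget before every further failure is reported
    // immediately (default 5).
    conf.setInt("mapreduce.reduce.shuffle.max-host-failures", 3);
  }
}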
CHANGES.txt
@@ -227,6 +227,9 @@ Release 2.6.0 - UNRELEASED
     MAPREDUCE-6032. Made MR jobs write job history files on the default FS when
     the current context's FS is different. (Benjamin Zhitomirsky via zjshen)
 
+    MAPREDUCE-6024. Shortened the time when Fetcher is stuck in retrying before
+    concluding the failure by configuration. (Yunjiong Zhao via zjshen)
+
 Release 2.5.0 - UNRELEASED
 
   INCOMPATIBLE CHANGES
JobImpl.java
@@ -148,10 +148,10 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
   private static final Log LOG = LogFactory.getLog(JobImpl.class);
 
   //The maximum fraction of fetch failures allowed for a map
-  private static final double MAX_ALLOWED_FETCH_FAILURES_FRACTION = 0.5;
+  private float maxAllowedFetchFailuresFraction;
 
-  // Maximum no. of fetch-failure notifications after which map task is failed
-  private static final int MAX_FETCH_FAILURES_NOTIFICATIONS = 3;
+  //Maximum no. of fetch-failure notifications after which map task is failed
+  private int maxFetchFailuresNotifications;
 
   public static final String JOB_KILLED_DIAG =
       "Job received Kill while in RUNNING state.";
@@ -704,6 +704,13 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
     if(forcedDiagnostic != null) {
       this.diagnostics.add(forcedDiagnostic);
     }
+
+    this.maxAllowedFetchFailuresFraction = conf.getFloat(
+        MRJobConfig.MAX_ALLOWED_FETCH_FAILURES_FRACTION,
+        MRJobConfig.DEFAULT_MAX_ALLOWED_FETCH_FAILURES_FRACTION);
+    this.maxFetchFailuresNotifications = conf.getInt(
+        MRJobConfig.MAX_FETCH_FAILURES_NOTIFICATIONS,
+        MRJobConfig.DEFAULT_MAX_FETCH_FAILURES_NOTIFICATIONS);
   }
 
   protected StateMachine<JobStateInternal, JobEventType, JobEvent> getStateMachine() {
@@ -1900,9 +1907,8 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
       float failureRate = shufflingReduceTasks == 0 ? 1.0f :
           (float) fetchFailures / shufflingReduceTasks;
       // declare faulty if fetch-failures >= max-allowed-failures
-      boolean isMapFaulty =
-          (failureRate >= MAX_ALLOWED_FETCH_FAILURES_FRACTION);
-      if (fetchFailures >= MAX_FETCH_FAILURES_NOTIFICATIONS && isMapFaulty) {
+      if (fetchFailures >= job.getMaxFetchFailuresNotifications()
+          && failureRate >= job.getMaxAllowedFetchFailuresFraction()) {
         LOG.info("Too many fetch-failures for output of task attempt: " +
             mapId + " ... raising fetch failure to map");
         job.eventHandler.handle(new TaskAttemptEvent(mapId,
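Read as a pure function, the rewritten check above is equivalent to the following sketch (method and parameter names are mine, the logic is copied from the hunk). With the defaults (notifications = 3, fraction = 0.5) and, say, 10 reducers still shuffling, a map output is declared faulty on the fifth distinct fetch-failure report:

public class FaultyMapCheck {
  static boolean isMapFaulty(int fetchFailures, int shufflingReduceTasks,
      int maxFetchFailuresNotifications, float maxAllowedFetchFailuresFraction) {
    float failureRate = shufflingReduceTasks == 0
        ? 1.0f : (float) fetchFailures / shufflingReduceTasks;
    return fetchFailures >= maxFetchFailuresNotifications
        && failureRate >= maxAllowedFetchFailuresFraction;
  }

  public static void main(String[] args) {
    System.out.println(isMapFaulty(4, 10, 3, 0.5f)); // false: rate 0.4 < 0.5
    System.out.println(isMapFaulty(5, 10, 3, 0.5f)); // true: 5 >= 3, 0.5 >= 0.5
  }
}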
@@ -2185,4 +2191,12 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
     jobConf.addResource(fc.open(confPath), confPath.toString());
     return jobConf;
   }
+
+  public float getMaxAllowedFetchFailuresFraction() {
+    return maxAllowedFetchFailuresFraction;
+  }
+
+  public int getMaxFetchFailuresNotifications() {
+    return maxFetchFailuresNotifications;
+  }
 }
MRJobConfig.java
@@ -293,11 +293,19 @@ public interface MRJobConfig {
   public static final String SHUFFLE_READ_TIMEOUT = "mapreduce.reduce.shuffle.read.timeout";
 
   public static final String SHUFFLE_FETCH_FAILURES = "mapreduce.reduce.shuffle.maxfetchfailures";
+  public static final String MAX_ALLOWED_FETCH_FAILURES_FRACTION = "mapreduce.reduce.shuffle.max-fetch-failures-fraction";
+  public static final float DEFAULT_MAX_ALLOWED_FETCH_FAILURES_FRACTION = 0.5f;
+
+  public static final String MAX_FETCH_FAILURES_NOTIFICATIONS = "mapreduce.reduce.shuffle.max-fetch-failures-notifications";
+  public static final int DEFAULT_MAX_FETCH_FAILURES_NOTIFICATIONS = 3;
 
   public static final String SHUFFLE_NOTIFY_READERROR = "mapreduce.reduce.shuffle.notify.readerror";
 
   public static final String MAX_SHUFFLE_FETCH_RETRY_DELAY = "mapreduce.reduce.shuffle.retry-delay.max.ms";
   public static final long DEFAULT_MAX_SHUFFLE_FETCH_RETRY_DELAY = 60000;
 
+  public static final String MAX_SHUFFLE_FETCH_HOST_FAILURES = "mapreduce.reduce.shuffle.max-host-failures";
+  public static final int DEFAULT_MAX_SHUFFLE_FETCH_HOST_FAILURES = 5;
+
   public static final String REDUCE_SKIP_INCR_PROC_COUNT = "mapreduce.reduce.skip.proc-count.auto-incr";
 
Fetcher.java
@@ -319,6 +319,7 @@ class Fetcher<K,V> extends Thread {
 
       // If connect did not succeed, just mark all the maps as failed,
       // indirectly penalizing the host
+      scheduler.hostFailed(host.getHostName());
       for(TaskAttemptID left: remaining) {
         scheduler.copyFailed(left, host, false, connectExcpt);
       }
@@ -343,6 +344,7 @@ class Fetcher<K,V> extends Thread {
 
     if(failedTasks != null && failedTasks.length > 0) {
       LOG.warn("copyMapOutput failed for tasks "+Arrays.toString(failedTasks));
+      scheduler.hostFailed(host.getHostName());
       for(TaskAttemptID left: failedTasks) {
         scheduler.copyFailed(left, host, true, false);
       }
ShuffleSchedulerImpl.java
@@ -18,7 +18,6 @@
 package org.apache.hadoop.mapreduce.task.reduce;
 
 import java.io.IOException;
-
 import java.net.InetAddress;
 import java.net.URI;
 import java.net.UnknownHostException;
@@ -101,6 +100,7 @@ public class ShuffleSchedulerImpl<K,V> implements ShuffleScheduler<K,V> {
 
   private final boolean reportReadErrorImmediately;
   private long maxDelay = MRJobConfig.DEFAULT_MAX_SHUFFLE_FETCH_RETRY_DELAY;
+  private int maxHostFailures;
 
   public ShuffleSchedulerImpl(JobConf job, TaskStatus status,
                               TaskAttemptID reduceId,
@@ -132,6 +132,9 @@ public class ShuffleSchedulerImpl<K,V> implements ShuffleScheduler<K,V> {
 
     this.maxDelay = job.getLong(MRJobConfig.MAX_SHUFFLE_FETCH_RETRY_DELAY,
         MRJobConfig.DEFAULT_MAX_SHUFFLE_FETCH_RETRY_DELAY);
+    this.maxHostFailures = job.getInt(
+        MRJobConfig.MAX_SHUFFLE_FETCH_HOST_FAILURES,
+        MRJobConfig.DEFAULT_MAX_SHUFFLE_FETCH_HOST_FAILURES);
   }
 
   @Override
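The budget is resolved once per ShuffleSchedulerImpl instance, i.e. once per reduce attempt; the lookup itself is the ordinary Configuration pattern, shown standalone below (key and default copied from the hunks above):

import org.apache.hadoop.conf.Configuration;

public class ReadHostFailureBudget {
  public static void main(String[] args) {
    // Same resolution the constructor performs: configured value, else the
    // default of 5 from MRJobConfig.
    Configuration job = new Configuration();
    int maxHostFailures = job.getInt(
        "mapreduce.reduce.shuffle.max-host-failures", 5);
    System.out.println("max host failures = " + maxHostFailures);
  }
}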
@@ -213,9 +216,18 @@ public class ShuffleSchedulerImpl<K,V> implements ShuffleScheduler<K,V> {
       progress.setStatus("copy(" + mapsDone + " of " + totalMaps + " at "
           + mbpsFormat.format(transferRate) + " MB/s)");
     }
 
+  public synchronized void hostFailed(String hostname) {
+    if (hostFailures.containsKey(hostname)) {
+      IntWritable x = hostFailures.get(hostname);
+      x.set(x.get() + 1);
+    } else {
+      hostFailures.put(hostname, new IntWritable(1));
+    }
+  }
+
   public synchronized void copyFailed(TaskAttemptID mapId, MapHost host,
       boolean readError, boolean connectExcpt) {
     host.penalize();
     int failures = 1;
     if (failureCounts.containsKey(mapId)) {
@@ -226,12 +238,9 @@ public class ShuffleSchedulerImpl<K,V> implements ShuffleScheduler<K,V> {
       failureCounts.put(mapId, new IntWritable(1));
     }
     String hostname = host.getHostName();
-    if (hostFailures.containsKey(hostname)) {
-      IntWritable x = hostFailures.get(hostname);
-      x.set(x.get() + 1);
-    } else {
-      hostFailures.put(hostname, new IntWritable(1));
-    }
+    //report failure if already retried maxHostFailures times
+    boolean hostFail = hostFailures.get(hostname).get() > getMaxHostFailures() ? true : false;
+
     if (failures >= abortFailureLimit) {
       try {
         throw new IOException(failures + " failures downloading " + mapId);
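Note the ordering assumption here: copyFailed() now dereferences hostFailures.get(hostname) without a containsKey guard, which is safe only because the Fetcher changes above always call scheduler.hostFailed() for the same host first (the `? true : false` is redundant but harmless). A standalone sketch of the bookkeeping, simplified to plain int counters and without the synchronization of the real class:

import java.util.HashMap;
import java.util.Map;

// Simplified model of the per-host failure accounting added in this patch.
public class HostFailureTracker {
  private final Map<String, Integer> hostFailures = new HashMap<>();
  private final int maxHostFailures;

  public HostFailureTracker(int maxHostFailures) {
    this.maxHostFailures = maxHostFailures;
  }

  // Mirrors ShuffleSchedulerImpl.hostFailed(): bump the per-host counter.
  public void hostFailed(String hostname) {
    hostFailures.merge(hostname, 1, Integer::sum);
  }

  // Mirrors the new check in copyFailed(): the host is treated as failed once
  // its counter exceeds the configured budget.
  public boolean isHostFailed(String hostname) {
    return hostFailures.getOrDefault(hostname, 0) > maxHostFailures;
  }
}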
@@ -240,7 +249,7 @@ public class ShuffleSchedulerImpl<K,V> implements ShuffleScheduler<K,V> {
       }
     }
 
-    checkAndInformJobTracker(failures, mapId, readError, connectExcpt);
+    checkAndInformJobTracker(failures, mapId, readError, connectExcpt, hostFail);
 
     checkReducerHealth();
 
@@ -270,9 +279,9 @@ public class ShuffleSchedulerImpl<K,V> implements ShuffleScheduler<K,V> {
   // after every 'maxFetchFailuresBeforeReporting' failures
   private void checkAndInformJobTracker(
       int failures, TaskAttemptID mapId, boolean readError,
-      boolean connectExcpt) {
+      boolean connectExcpt, boolean hostFailed) {
     if (connectExcpt || (reportReadErrorImmediately && readError)
-        || ((failures % maxFetchFailuresBeforeReporting) == 0)) {
+        || ((failures % maxFetchFailuresBeforeReporting) == 0) || hostFailed) {
       LOG.info("Reporting fetch failure for " + mapId + " to jobtracker.");
       status.addFetchFailedMap((org.apache.hadoop.mapred.TaskAttemptID) mapId);
     }
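The widened predicate is the heart of the speed-up: once a host exhausts its budget, every subsequent fetch failure on it is reported to the AM immediately rather than only on every maxFetchFailuresBeforeReporting-th failure. As a standalone sketch (parameter names copied from the method above, wrapper names assumed):

public class ReportDecision {
  static boolean shouldReport(boolean connectExcpt, boolean readError,
      boolean reportReadErrorImmediately, int failures,
      int maxFetchFailuresBeforeReporting, boolean hostFailed) {
    return connectExcpt
        || (reportReadErrorImmediately && readError)
        || (failures % maxFetchFailuresBeforeReporting) == 0
        || hostFailed;
  }

  public static void main(String[] args) {
    // A 2nd quiet failure (of 10 allowed) used not to be reported; with
    // hostFailed == true it now is.
    System.out.println(shouldReport(false, false, false, 2, 10, false)); // false
    System.out.println(shouldReport(false, false, false, 2, 10, true));  // true
  }
}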
@@ -507,4 +516,7 @@ public class ShuffleSchedulerImpl<K,V> implements ShuffleScheduler<K,V> {
     referee.join();
   }
 
+  public int getMaxHostFailures() {
+    return maxHostFailures;
+  }
 }