MAPREDUCE-5476. Changed MR AM recovery code to cleanup staging-directory only after unregistering from the RM. Contributed by Jian He.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1516660 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
200220e8f3
commit
740f4cb97a
|
@ -237,6 +237,9 @@ Release 2.1.1-beta - UNRELEASED
|
||||||
MAPREDUCE-5470. LocalJobRunner does not work on Windows. (Sandy Ryza via
|
MAPREDUCE-5470. LocalJobRunner does not work on Windows. (Sandy Ryza via
|
||||||
cnauroth)
|
cnauroth)
|
||||||
|
|
||||||
|
MAPREDUCE-5476. Changed MR AM recovery code to cleanup staging-directory
|
||||||
|
only after unregistering from the RM. (Jian He via vinodkv)
|
||||||
|
|
||||||
Release 2.1.0-beta - 2013-08-22
|
Release 2.1.0-beta - 2013-08-22
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
|
|
@ -326,17 +326,22 @@ public class MRAppMaster extends CompositeService {
|
||||||
eater);
|
eater);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (copyHistory) {
|
||||||
|
// Now that there's a FINISHING state for application on RM to give AMs
|
||||||
|
// plenty of time to clean up after unregister it's safe to clean staging
|
||||||
|
// directory after unregistering with RM. So, we start the staging-dir
|
||||||
|
// cleaner BEFORE the ContainerAllocator so that on shut-down,
|
||||||
|
// ContainerAllocator unregisters first and then the staging-dir cleaner
|
||||||
|
// deletes staging directory.
|
||||||
|
addService(createStagingDirCleaningService());
|
||||||
|
}
|
||||||
|
|
||||||
// service to allocate containers from RM (if non-uber) or to fake it (uber)
|
// service to allocate containers from RM (if non-uber) or to fake it (uber)
|
||||||
containerAllocator = createContainerAllocator(null, context);
|
containerAllocator = createContainerAllocator(null, context);
|
||||||
addIfService(containerAllocator);
|
addIfService(containerAllocator);
|
||||||
dispatcher.register(ContainerAllocator.EventType.class, containerAllocator);
|
dispatcher.register(ContainerAllocator.EventType.class, containerAllocator);
|
||||||
|
|
||||||
if (copyHistory) {
|
if (copyHistory) {
|
||||||
// Add the staging directory cleaner before the history server but after
|
|
||||||
// the container allocator so the staging directory is cleaned after
|
|
||||||
// the history has been flushed but before unregistering with the RM.
|
|
||||||
addService(createStagingDirCleaningService());
|
|
||||||
|
|
||||||
// Add the JobHistoryEventHandler last so that it is properly stopped first.
|
// Add the JobHistoryEventHandler last so that it is properly stopped first.
|
||||||
// This will guarantee that all history-events are flushed before AM goes
|
// This will guarantee that all history-events are flushed before AM goes
|
||||||
// ahead with shutdown.
|
// ahead with shutdown.
|
||||||
|
@ -345,7 +350,6 @@ public class MRAppMaster extends CompositeService {
|
||||||
// queued inside the JobHistoryEventHandler
|
// queued inside the JobHistoryEventHandler
|
||||||
addIfService(historyService);
|
addIfService(historyService);
|
||||||
|
|
||||||
|
|
||||||
JobHistoryCopyService cpHist = new JobHistoryCopyService(appAttemptID,
|
JobHistoryCopyService cpHist = new JobHistoryCopyService(appAttemptID,
|
||||||
dispatcher.getEventHandler());
|
dispatcher.getEventHandler());
|
||||||
addIfService(cpHist);
|
addIfService(cpHist);
|
||||||
|
@ -396,6 +400,14 @@ public class MRAppMaster extends CompositeService {
|
||||||
dispatcher.register(Speculator.EventType.class,
|
dispatcher.register(Speculator.EventType.class,
|
||||||
speculatorEventDispatcher);
|
speculatorEventDispatcher);
|
||||||
|
|
||||||
|
// Now that there's a FINISHING state for application on RM to give AMs
|
||||||
|
// plenty of time to clean up after unregister it's safe to clean staging
|
||||||
|
// directory after unregistering with RM. So, we start the staging-dir
|
||||||
|
// cleaner BEFORE the ContainerAllocator so that on shut-down,
|
||||||
|
// ContainerAllocator unregisters first and then the staging-dir cleaner
|
||||||
|
// deletes staging directory.
|
||||||
|
addService(createStagingDirCleaningService());
|
||||||
|
|
||||||
// service to allocate containers from RM (if non-uber) or to fake it (uber)
|
// service to allocate containers from RM (if non-uber) or to fake it (uber)
|
||||||
addIfService(containerAllocator);
|
addIfService(containerAllocator);
|
||||||
dispatcher.register(ContainerAllocator.EventType.class, containerAllocator);
|
dispatcher.register(ContainerAllocator.EventType.class, containerAllocator);
|
||||||
|
@ -405,11 +417,6 @@ public class MRAppMaster extends CompositeService {
|
||||||
addIfService(containerLauncher);
|
addIfService(containerLauncher);
|
||||||
dispatcher.register(ContainerLauncher.EventType.class, containerLauncher);
|
dispatcher.register(ContainerLauncher.EventType.class, containerLauncher);
|
||||||
|
|
||||||
// Add the staging directory cleaner before the history server but after
|
|
||||||
// the container allocator so the staging directory is cleaned after
|
|
||||||
// the history has been flushed but before unregistering with the RM.
|
|
||||||
addService(createStagingDirCleaningService());
|
|
||||||
|
|
||||||
// Add the JobHistoryEventHandler last so that it is properly stopped first.
|
// Add the JobHistoryEventHandler last so that it is properly stopped first.
|
||||||
// This will guarantee that all history-events are flushed before AM goes
|
// This will guarantee that all history-events are flushed before AM goes
|
||||||
// ahead with shutdown.
|
// ahead with shutdown.
|
||||||
|
|
|
@ -36,6 +36,8 @@ import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.hadoop.mapreduce.JobID;
|
import org.apache.hadoop.mapreduce.JobID;
|
||||||
import org.apache.hadoop.mapreduce.MRJobConfig;
|
import org.apache.hadoop.mapreduce.MRJobConfig;
|
||||||
import org.apache.hadoop.mapreduce.TypeConverter;
|
import org.apache.hadoop.mapreduce.TypeConverter;
|
||||||
|
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryEvent;
|
||||||
|
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryEventHandler;
|
||||||
import org.apache.hadoop.mapreduce.v2.api.records.JobId;
|
import org.apache.hadoop.mapreduce.v2.api.records.JobId;
|
||||||
import org.apache.hadoop.mapreduce.v2.api.records.JobState;
|
import org.apache.hadoop.mapreduce.v2.api.records.JobState;
|
||||||
import org.apache.hadoop.mapreduce.v2.app.client.ClientService;
|
import org.apache.hadoop.mapreduce.v2.app.client.ClientService;
|
||||||
|
@ -54,6 +56,7 @@ import org.apache.hadoop.service.Service;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||||
|
import org.apache.hadoop.yarn.event.EventHandler;
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
||||||
import org.apache.hadoop.yarn.factories.RecordFactory;
|
import org.apache.hadoop.yarn.factories.RecordFactory;
|
||||||
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
|
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
|
||||||
|
@ -279,14 +282,17 @@ import org.junit.Test;
|
||||||
}
|
}
|
||||||
|
|
||||||
private final class MRAppTestCleanup extends MRApp {
|
private final class MRAppTestCleanup extends MRApp {
|
||||||
boolean stoppedContainerAllocator;
|
int stagingDirCleanedup;
|
||||||
boolean cleanedBeforeContainerAllocatorStopped;
|
int ContainerAllocatorStopped;
|
||||||
|
int JobHistoryEventHandlerStopped;
|
||||||
|
int numStops;
|
||||||
public MRAppTestCleanup(int maps, int reduces, boolean autoComplete,
|
public MRAppTestCleanup(int maps, int reduces, boolean autoComplete,
|
||||||
String testName, boolean cleanOnStart) {
|
String testName, boolean cleanOnStart) {
|
||||||
super(maps, reduces, autoComplete, testName, cleanOnStart);
|
super(maps, reduces, autoComplete, testName, cleanOnStart);
|
||||||
stoppedContainerAllocator = false;
|
stagingDirCleanedup = 0;
|
||||||
cleanedBeforeContainerAllocatorStopped = false;
|
ContainerAllocatorStopped = 0;
|
||||||
|
JobHistoryEventHandlerStopped = 0;
|
||||||
|
numStops = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -312,6 +318,26 @@ import org.junit.Test;
|
||||||
return newJob;
|
return newJob;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected EventHandler<JobHistoryEvent> createJobHistoryHandler(
|
||||||
|
AppContext context) {
|
||||||
|
return new TestJobHistoryEventHandler(context, getStartCount());
|
||||||
|
}
|
||||||
|
|
||||||
|
private class TestJobHistoryEventHandler extends JobHistoryEventHandler {
|
||||||
|
|
||||||
|
public TestJobHistoryEventHandler(AppContext context, int startCount) {
|
||||||
|
super(context, startCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void serviceStop() throws Exception {
|
||||||
|
numStops++;
|
||||||
|
JobHistoryEventHandlerStopped = numStops;
|
||||||
|
super.serviceStop();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected ContainerAllocator createContainerAllocator(
|
protected ContainerAllocator createContainerAllocator(
|
||||||
ClientService clientService, AppContext context) {
|
ClientService clientService, AppContext context) {
|
||||||
|
@ -334,7 +360,8 @@ import org.junit.Test;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void serviceStop() throws Exception {
|
protected void serviceStop() throws Exception {
|
||||||
stoppedContainerAllocator = true;
|
numStops++;
|
||||||
|
ContainerAllocatorStopped = numStops;
|
||||||
super.serviceStop();
|
super.serviceStop();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -346,7 +373,8 @@ import org.junit.Test;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void cleanupStagingDir() throws IOException {
|
public void cleanupStagingDir() throws IOException {
|
||||||
cleanedBeforeContainerAllocatorStopped = !stoppedContainerAllocator;
|
numStops++;
|
||||||
|
stagingDirCleanedup = numStops;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -377,11 +405,15 @@ import org.junit.Test;
|
||||||
app.verifyCompleted();
|
app.verifyCompleted();
|
||||||
|
|
||||||
int waitTime = 20 * 1000;
|
int waitTime = 20 * 1000;
|
||||||
while (waitTime > 0 && !app.cleanedBeforeContainerAllocatorStopped) {
|
while (waitTime > 0 && app.numStops < 3 ) {
|
||||||
Thread.sleep(100);
|
Thread.sleep(100);
|
||||||
waitTime -= 100;
|
waitTime -= 100;
|
||||||
}
|
}
|
||||||
Assert.assertTrue("Staging directory not cleaned before notifying RM",
|
|
||||||
app.cleanedBeforeContainerAllocatorStopped);
|
// assert JobHistoryEventHandlerStopped first, then
|
||||||
|
// ContainerAllocatorStopped, and then stagingDirCleanedup
|
||||||
|
Assert.assertEquals(1, app.JobHistoryEventHandlerStopped);
|
||||||
|
Assert.assertEquals(2, app.ContainerAllocatorStopped);
|
||||||
|
Assert.assertEquals(3, app.stagingDirCleanedup);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue