YARN-4325. Nodemanager log handlers fail to send finished/failed events in some cases. Contributed by Junping Du

(cherry picked from commit 81effb7dcd)
This commit is contained in:
Jason Lowe 2016-05-16 15:40:23 +00:00
parent a37b3694ea
commit 575c056357
6 changed files with 155 additions and 8 deletions

View File

@ -207,18 +207,18 @@ ApplicationEventType.INIT_APPLICATION, new AppInitTransition())
ApplicationEventType.APPLICATION_LOG_HANDLING_FINISHED,
ApplicationEventType.APPLICATION_INITED,
ApplicationEventType.FINISH_APPLICATION))
// Transitions from FINISHED state
.addTransition(ApplicationState.FINISHED,
ApplicationState.FINISHED,
ApplicationEventType.APPLICATION_LOG_HANDLING_FINISHED,
EnumSet.of(
ApplicationEventType.APPLICATION_LOG_HANDLING_FINISHED,
ApplicationEventType.APPLICATION_LOG_HANDLING_FAILED),
new AppLogsAggregatedTransition())
.addTransition(ApplicationState.FINISHED, ApplicationState.FINISHED,
EnumSet.of(
ApplicationEventType.APPLICATION_LOG_HANDLING_INITED,
ApplicationEventType.APPLICATION_LOG_HANDLING_FAILED,
ApplicationEventType.FINISH_APPLICATION))
// create the topology tables
.installTopology();

View File

@ -501,6 +501,7 @@ public Object run() throws Exception {
}
}
@SuppressWarnings("unchecked")
@Override
public void run() {
try {
@ -513,6 +514,9 @@ public void run() {
} finally {
if (!this.appAggregationFinished.get()) {
LOG.warn("Aggregation did not complete for application " + appId);
this.dispatcher.getEventHandler().handle(
new ApplicationEvent(this.appId,
ApplicationEventType.APPLICATION_LOG_HANDLING_FAILED));
}
this.appAggregationFinished.set(true);
}

View File

@ -416,7 +416,6 @@ private void stopContainer(ContainerId containerId, int exitCode) {
// A container is complete. Put this containers' logs up for aggregation if
// this containers' logs are needed.
AppLogAggregator aggregator = this.appLogAggregators.get(
containerId.getApplicationAttemptId().getApplicationId());
if (aggregator == null) {
@ -436,6 +435,7 @@ private void stopContainer(ContainerId containerId, int exitCode) {
new ContainerLogContext(containerId, containerType, exitCode));
}
@SuppressWarnings("unchecked")
private void stopApp(ApplicationId appId) {
// App is complete. Finish up any containers' pending log aggregation and
@ -445,6 +445,9 @@ private void stopApp(ApplicationId appId) {
if (aggregator == null) {
LOG.warn("Log aggregation is not initialized for " + appId
+ ", did it fail to start?");
this.dispatcher.getEventHandler().handle(
new ApplicationEvent(appId,
ApplicationEventType.APPLICATION_LOG_HANDLING_FAILED));
return;
}
aggregator.finishLogAggregation();

View File

@ -171,6 +171,10 @@ public void handle(LogHandlerEvent event) {
String user = appOwners.remove(appId);
if (user == null) {
LOG.error("Unable to locate user for " + appId);
// send LOG_HANDLING_FAILED out
NonAggregatingLogHandler.this.dispatcher.getEventHandler().handle(
new ApplicationEvent(appId,
ApplicationEventType.APPLICATION_LOG_HANDLING_FAILED));
break;
}
LogDeleterRunnable logDeleter = new LogDeleterRunnable(user, appId);

View File

@ -294,6 +294,90 @@ public void testApplicationRecovery() throws Exception {
cm.stop();
}
@Test
public void testNMRecoveryForAppFinishedWithLogAggregationFailure()
throws Exception {
conf.setBoolean(YarnConfiguration.NM_RECOVERY_ENABLED, true);
conf.setBoolean(YarnConfiguration.NM_RECOVERY_SUPERVISED, true);
NMStateStoreService stateStore = new NMMemoryStateStoreService();
stateStore.init(conf);
stateStore.start();
Context context = createContext(conf, stateStore);
ContainerManagerImpl cm = createContainerManager(context);
cm.init(conf);
cm.start();
// add an application by starting a container
ApplicationId appId = ApplicationId.newInstance(0, 1);
ApplicationAttemptId attemptId =
ApplicationAttemptId.newInstance(appId, 1);
ContainerId cid = ContainerId.newContainerId(attemptId, 1);
Map<String, LocalResource> localResources = Collections.emptyMap();
Map<String, String> containerEnv = Collections.emptyMap();
List<String> containerCmds = Collections.emptyList();
Map<String, ByteBuffer> serviceData = Collections.emptyMap();
ContainerLaunchContext clc = ContainerLaunchContext.newInstance(
localResources, containerEnv, containerCmds, serviceData,
null, null);
StartContainersResponse startResponse = startContainer(context, cm, cid,
clc, null);
assertTrue(startResponse.getFailedRequests().isEmpty());
assertEquals(1, context.getApplications().size());
Application app = context.getApplications().get(appId);
assertNotNull(app);
waitForAppState(app, ApplicationState.INITING);
// simulate application completion
List<ApplicationId> finishedApps = new ArrayList<ApplicationId>();
finishedApps.add(appId);
cm.handle(new CMgrCompletedAppsEvent(finishedApps,
CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER));
waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP);
app.handle(new ApplicationEvent(app.getAppId(),
ApplicationEventType.APPLICATION_RESOURCES_CLEANEDUP));
assertEquals(app.getApplicationState(), ApplicationState.FINISHED);
// application is still in NM context.
assertEquals(1, context.getApplications().size());
// restart and verify app is still there and marked as finished.
cm.stop();
context = createContext(conf, stateStore);
cm = createContainerManager(context);
cm.init(conf);
cm.start();
assertEquals(1, context.getApplications().size());
app = context.getApplications().get(appId);
assertNotNull(app);
// no longer saving FINISH_APP event in NM stateStore,
// simulate by resending FINISH_APP event
cm.handle(new CMgrCompletedAppsEvent(finishedApps,
CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER));
waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP);
// TODO need to figure out why additional APPLICATION_RESOURCES_CLEANEDUP
// is needed.
app.handle(new ApplicationEvent(app.getAppId(),
ApplicationEventType.APPLICATION_RESOURCES_CLEANEDUP));
assertEquals(app.getApplicationState(), ApplicationState.FINISHED);
// simulate log aggregation failed.
app.handle(new ApplicationEvent(app.getAppId(),
ApplicationEventType.APPLICATION_LOG_HANDLING_FAILED));
// restart and verify app is no longer present after recovery
cm.stop();
context = createContext(conf, stateStore);
cm = createContainerManager(context);
cm.init(conf);
cm.start();
assertTrue(context.getApplications().isEmpty());
cm.stop();
}
@Test
public void testContainerResizeRecovery() throws Exception {
conf.setBoolean(YarnConfiguration.NM_RECOVERY_ENABLED, true);

View File

@ -17,6 +17,9 @@
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.assertFalse;
import static org.mockito.Matchers.any;
import static org.mockito.Matchers.anyLong;
import static org.mockito.Matchers.eq;
@ -84,7 +87,7 @@ public class TestNonAggregatingLogHandler {
DeletionService mockDelService;
Configuration conf;
DrainDispatcher dispatcher;
EventHandler<ApplicationEvent> appEventHandler;
private ApplicationEventHandler appEventHandler;
String user = "testuser";
ApplicationId appId;
ApplicationAttemptId appAttemptId;
@ -97,7 +100,7 @@ public void setup() {
mockDelService = mock(DeletionService.class);
conf = new YarnConfiguration();
dispatcher = createDispatcher(conf);
appEventHandler = mock(EventHandler.class);
appEventHandler = new ApplicationEventHandler();
dispatcher.register(ApplicationEventType.class, appEventHandler);
appId = BuilderUtils.newApplicationId(1234, 1);
appAttemptId = BuilderUtils.newApplicationAttemptId(appId, 1);
@ -345,6 +348,9 @@ public void testRecovery() throws Exception {
dirsHandler.init(conf);
appEventHandler.resetLogHandlingEvent();
assertFalse(appEventHandler.receiveLogHandlingFinishEvent());
NMStateStoreService stateStore = new NMMemoryStateStoreService();
stateStore.init(conf);
stateStore.start();
@ -377,8 +383,21 @@ public void testRecovery() throws Exception {
logHandler.start();
verify(logHandler.mockSched, never()).schedule(any(Runnable.class),
anyLong(), any(TimeUnit.class));
// wait events get drained.
this.dispatcher.await();
assertTrue(appEventHandler.receiveLogHandlingFinishEvent());
appEventHandler.resetLogHandlingEvent();
assertFalse(appEventHandler.receiveLogHandlingFailedEvent());
// send an app finish event against a removed app
logHandler.handle(new LogHandlerAppFinishedEvent(appId));
this.dispatcher.await();
// verify to receive a log failed event.
assertTrue(appEventHandler.receiveLogHandlingFailedEvent());
assertFalse(appEventHandler.receiveLogHandlingFinishEvent());
logHandler.close();
}
}
/**
* Function to run a log handler with directories failing the getFileStatus
@ -536,4 +555,37 @@ public static File[] getLocalLogDirFiles(String name, int number) {
}
return dirs;
}
class ApplicationEventHandler implements EventHandler<ApplicationEvent> {
private boolean logHandlingFinished = false;
private boolean logHandlingFailed = false;
@Override
public void handle(ApplicationEvent event) {
switch (event.getType()) {
case APPLICATION_LOG_HANDLING_FINISHED:
logHandlingFinished = true;
break;
case APPLICATION_LOG_HANDLING_FAILED:
logHandlingFailed = true;
default:
// do nothing.
}
}
public boolean receiveLogHandlingFinishEvent() {
return logHandlingFinished;
}
public boolean receiveLogHandlingFailedEvent() {
return logHandlingFailed;
}
public void resetLogHandlingEvent() {
logHandlingFinished = false;
logHandlingFailed = false;
}
}
}