YARN-4325. Nodemanager log handlers fail to send finished/failed events in some cases. Contributed by Junping Du
(cherry picked from commit 81effb7dcd
)
This commit is contained in:
parent
a37b3694ea
commit
575c056357
|
@ -207,18 +207,18 @@ public class ApplicationImpl implements Application {
|
|||
ApplicationEventType.APPLICATION_LOG_HANDLING_FINISHED,
|
||||
ApplicationEventType.APPLICATION_INITED,
|
||||
ApplicationEventType.FINISH_APPLICATION))
|
||||
|
||||
|
||||
// Transitions from FINISHED state
|
||||
.addTransition(ApplicationState.FINISHED,
|
||||
ApplicationState.FINISHED,
|
||||
ApplicationEventType.APPLICATION_LOG_HANDLING_FINISHED,
|
||||
EnumSet.of(
|
||||
ApplicationEventType.APPLICATION_LOG_HANDLING_FINISHED,
|
||||
ApplicationEventType.APPLICATION_LOG_HANDLING_FAILED),
|
||||
new AppLogsAggregatedTransition())
|
||||
.addTransition(ApplicationState.FINISHED, ApplicationState.FINISHED,
|
||||
EnumSet.of(
|
||||
ApplicationEventType.APPLICATION_LOG_HANDLING_INITED,
|
||||
ApplicationEventType.APPLICATION_LOG_HANDLING_FAILED,
|
||||
ApplicationEventType.FINISH_APPLICATION))
|
||||
|
||||
// create the topology tables
|
||||
.installTopology();
|
||||
|
||||
|
|
|
@ -501,6 +501,7 @@ public class AppLogAggregatorImpl implements AppLogAggregator {
|
|||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
@Override
|
||||
public void run() {
|
||||
try {
|
||||
|
@ -513,6 +514,9 @@ public class AppLogAggregatorImpl implements AppLogAggregator {
|
|||
} finally {
|
||||
if (!this.appAggregationFinished.get()) {
|
||||
LOG.warn("Aggregation did not complete for application " + appId);
|
||||
this.dispatcher.getEventHandler().handle(
|
||||
new ApplicationEvent(this.appId,
|
||||
ApplicationEventType.APPLICATION_LOG_HANDLING_FAILED));
|
||||
}
|
||||
this.appAggregationFinished.set(true);
|
||||
}
|
||||
|
|
|
@ -416,7 +416,6 @@ public class LogAggregationService extends AbstractService implements
|
|||
|
||||
// A container is complete. Put this containers' logs up for aggregation if
|
||||
// this containers' logs are needed.
|
||||
|
||||
AppLogAggregator aggregator = this.appLogAggregators.get(
|
||||
containerId.getApplicationAttemptId().getApplicationId());
|
||||
if (aggregator == null) {
|
||||
|
@ -436,6 +435,7 @@ public class LogAggregationService extends AbstractService implements
|
|||
new ContainerLogContext(containerId, containerType, exitCode));
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private void stopApp(ApplicationId appId) {
|
||||
|
||||
// App is complete. Finish up any containers' pending log aggregation and
|
||||
|
@ -445,6 +445,9 @@ public class LogAggregationService extends AbstractService implements
|
|||
if (aggregator == null) {
|
||||
LOG.warn("Log aggregation is not initialized for " + appId
|
||||
+ ", did it fail to start?");
|
||||
this.dispatcher.getEventHandler().handle(
|
||||
new ApplicationEvent(appId,
|
||||
ApplicationEventType.APPLICATION_LOG_HANDLING_FAILED));
|
||||
return;
|
||||
}
|
||||
aggregator.finishLogAggregation();
|
||||
|
|
|
@ -171,6 +171,10 @@ public class NonAggregatingLogHandler extends AbstractService implements
|
|||
String user = appOwners.remove(appId);
|
||||
if (user == null) {
|
||||
LOG.error("Unable to locate user for " + appId);
|
||||
// send LOG_HANDLING_FAILED out
|
||||
NonAggregatingLogHandler.this.dispatcher.getEventHandler().handle(
|
||||
new ApplicationEvent(appId,
|
||||
ApplicationEventType.APPLICATION_LOG_HANDLING_FAILED));
|
||||
break;
|
||||
}
|
||||
LogDeleterRunnable logDeleter = new LogDeleterRunnable(user, appId);
|
||||
|
|
|
@ -294,6 +294,90 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest {
|
|||
cm.stop();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNMRecoveryForAppFinishedWithLogAggregationFailure()
|
||||
throws Exception {
|
||||
conf.setBoolean(YarnConfiguration.NM_RECOVERY_ENABLED, true);
|
||||
conf.setBoolean(YarnConfiguration.NM_RECOVERY_SUPERVISED, true);
|
||||
|
||||
NMStateStoreService stateStore = new NMMemoryStateStoreService();
|
||||
stateStore.init(conf);
|
||||
stateStore.start();
|
||||
Context context = createContext(conf, stateStore);
|
||||
ContainerManagerImpl cm = createContainerManager(context);
|
||||
cm.init(conf);
|
||||
cm.start();
|
||||
|
||||
// add an application by starting a container
|
||||
ApplicationId appId = ApplicationId.newInstance(0, 1);
|
||||
ApplicationAttemptId attemptId =
|
||||
ApplicationAttemptId.newInstance(appId, 1);
|
||||
ContainerId cid = ContainerId.newContainerId(attemptId, 1);
|
||||
Map<String, LocalResource> localResources = Collections.emptyMap();
|
||||
Map<String, String> containerEnv = Collections.emptyMap();
|
||||
List<String> containerCmds = Collections.emptyList();
|
||||
Map<String, ByteBuffer> serviceData = Collections.emptyMap();
|
||||
|
||||
ContainerLaunchContext clc = ContainerLaunchContext.newInstance(
|
||||
localResources, containerEnv, containerCmds, serviceData,
|
||||
null, null);
|
||||
|
||||
StartContainersResponse startResponse = startContainer(context, cm, cid,
|
||||
clc, null);
|
||||
assertTrue(startResponse.getFailedRequests().isEmpty());
|
||||
assertEquals(1, context.getApplications().size());
|
||||
Application app = context.getApplications().get(appId);
|
||||
assertNotNull(app);
|
||||
waitForAppState(app, ApplicationState.INITING);
|
||||
|
||||
// simulate application completion
|
||||
List<ApplicationId> finishedApps = new ArrayList<ApplicationId>();
|
||||
finishedApps.add(appId);
|
||||
cm.handle(new CMgrCompletedAppsEvent(finishedApps,
|
||||
CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER));
|
||||
waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP);
|
||||
|
||||
app.handle(new ApplicationEvent(app.getAppId(),
|
||||
ApplicationEventType.APPLICATION_RESOURCES_CLEANEDUP));
|
||||
assertEquals(app.getApplicationState(), ApplicationState.FINISHED);
|
||||
// application is still in NM context.
|
||||
assertEquals(1, context.getApplications().size());
|
||||
|
||||
// restart and verify app is still there and marked as finished.
|
||||
cm.stop();
|
||||
context = createContext(conf, stateStore);
|
||||
cm = createContainerManager(context);
|
||||
cm.init(conf);
|
||||
cm.start();
|
||||
assertEquals(1, context.getApplications().size());
|
||||
app = context.getApplications().get(appId);
|
||||
assertNotNull(app);
|
||||
|
||||
// no longer saving FINISH_APP event in NM stateStore,
|
||||
// simulate by resending FINISH_APP event
|
||||
cm.handle(new CMgrCompletedAppsEvent(finishedApps,
|
||||
CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER));
|
||||
waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP);
|
||||
// TODO need to figure out why additional APPLICATION_RESOURCES_CLEANEDUP
|
||||
// is needed.
|
||||
app.handle(new ApplicationEvent(app.getAppId(),
|
||||
ApplicationEventType.APPLICATION_RESOURCES_CLEANEDUP));
|
||||
assertEquals(app.getApplicationState(), ApplicationState.FINISHED);
|
||||
|
||||
// simulate log aggregation failed.
|
||||
app.handle(new ApplicationEvent(app.getAppId(),
|
||||
ApplicationEventType.APPLICATION_LOG_HANDLING_FAILED));
|
||||
|
||||
// restart and verify app is no longer present after recovery
|
||||
cm.stop();
|
||||
context = createContext(conf, stateStore);
|
||||
cm = createContainerManager(context);
|
||||
cm.init(conf);
|
||||
cm.start();
|
||||
assertTrue(context.getApplications().isEmpty());
|
||||
cm.stop();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testContainerResizeRecovery() throws Exception {
|
||||
conf.setBoolean(YarnConfiguration.NM_RECOVERY_ENABLED, true);
|
||||
|
|
|
@ -17,6 +17,9 @@
|
|||
*/
|
||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler;
|
||||
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
|
||||
import static org.mockito.Matchers.any;
|
||||
import static org.mockito.Matchers.anyLong;
|
||||
import static org.mockito.Matchers.eq;
|
||||
|
@ -84,7 +87,7 @@ public class TestNonAggregatingLogHandler {
|
|||
DeletionService mockDelService;
|
||||
Configuration conf;
|
||||
DrainDispatcher dispatcher;
|
||||
EventHandler<ApplicationEvent> appEventHandler;
|
||||
private ApplicationEventHandler appEventHandler;
|
||||
String user = "testuser";
|
||||
ApplicationId appId;
|
||||
ApplicationAttemptId appAttemptId;
|
||||
|
@ -97,7 +100,7 @@ public class TestNonAggregatingLogHandler {
|
|||
mockDelService = mock(DeletionService.class);
|
||||
conf = new YarnConfiguration();
|
||||
dispatcher = createDispatcher(conf);
|
||||
appEventHandler = mock(EventHandler.class);
|
||||
appEventHandler = new ApplicationEventHandler();
|
||||
dispatcher.register(ApplicationEventType.class, appEventHandler);
|
||||
appId = BuilderUtils.newApplicationId(1234, 1);
|
||||
appAttemptId = BuilderUtils.newApplicationAttemptId(appId, 1);
|
||||
|
@ -345,6 +348,9 @@ public class TestNonAggregatingLogHandler {
|
|||
|
||||
dirsHandler.init(conf);
|
||||
|
||||
appEventHandler.resetLogHandlingEvent();
|
||||
assertFalse(appEventHandler.receiveLogHandlingFinishEvent());
|
||||
|
||||
NMStateStoreService stateStore = new NMMemoryStateStoreService();
|
||||
stateStore.init(conf);
|
||||
stateStore.start();
|
||||
|
@ -377,8 +383,21 @@ public class TestNonAggregatingLogHandler {
|
|||
logHandler.start();
|
||||
verify(logHandler.mockSched, never()).schedule(any(Runnable.class),
|
||||
anyLong(), any(TimeUnit.class));
|
||||
|
||||
// wait events get drained.
|
||||
this.dispatcher.await();
|
||||
assertTrue(appEventHandler.receiveLogHandlingFinishEvent());
|
||||
|
||||
appEventHandler.resetLogHandlingEvent();
|
||||
assertFalse(appEventHandler.receiveLogHandlingFailedEvent());
|
||||
// send an app finish event against a removed app
|
||||
logHandler.handle(new LogHandlerAppFinishedEvent(appId));
|
||||
this.dispatcher.await();
|
||||
// verify to receive a log failed event.
|
||||
assertTrue(appEventHandler.receiveLogHandlingFailedEvent());
|
||||
assertFalse(appEventHandler.receiveLogHandlingFinishEvent());
|
||||
logHandler.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Function to run a log handler with directories failing the getFileStatus
|
||||
|
@ -536,4 +555,37 @@ public class TestNonAggregatingLogHandler {
|
|||
}
|
||||
return dirs;
|
||||
}
|
||||
|
||||
class ApplicationEventHandler implements EventHandler<ApplicationEvent> {
|
||||
|
||||
private boolean logHandlingFinished = false;
|
||||
private boolean logHandlingFailed = false;
|
||||
|
||||
@Override
|
||||
public void handle(ApplicationEvent event) {
|
||||
switch (event.getType()) {
|
||||
case APPLICATION_LOG_HANDLING_FINISHED:
|
||||
logHandlingFinished = true;
|
||||
break;
|
||||
case APPLICATION_LOG_HANDLING_FAILED:
|
||||
logHandlingFailed = true;
|
||||
default:
|
||||
// do nothing.
|
||||
}
|
||||
}
|
||||
|
||||
public boolean receiveLogHandlingFinishEvent() {
|
||||
return logHandlingFinished;
|
||||
}
|
||||
|
||||
public boolean receiveLogHandlingFailedEvent() {
|
||||
return logHandlingFailed;
|
||||
}
|
||||
|
||||
public void resetLogHandlingEvent() {
|
||||
logHandlingFinished = false;
|
||||
logHandlingFailed = false;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue