YARN-875. Application can hang if AMRMClientAsync callback thread has exception (Xuan Gong via bikas)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1506750 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Bikas Saha 2013-07-24 22:13:17 +00:00
parent ed6598791e
commit 9c453d4432
5 changed files with 149 additions and 57 deletions

View File

@ -743,6 +743,9 @@ Release 2.1.0-beta - 2013-07-02
YARN-873. YARNClient.getApplicationReport(unknownAppId) returns a null YARN-873. YARNClient.getApplicationReport(unknownAppId) returns a null
report (Xuan Gong via bikas) report (Xuan Gong via bikas)
YARN-875. Application can hang if AMRMClientAsync callback thread has
exception (Xuan Gong via bikas)
BREAKDOWN OF HADOOP-8562/YARN-191 SUBTASKS AND RELATED JIRAS BREAKDOWN OF HADOOP-8562/YARN-191 SUBTASKS AND RELATED JIRAS
YARN-158. Yarn creating package-info.java must not depend on sh. YARN-158. Yarn creating package-info.java must not depend on sh.

View File

@ -649,8 +649,9 @@ public class ApplicationMaster {
} }
@Override @Override
public void onError(Exception e) { public void onError(Throwable e) {
done = true; done = true;
resourceManager.stop();
} }
} }

View File

@ -220,6 +220,13 @@ extends AbstractService {
public float getProgress(); public float getProgress();
public void onError(Exception e); /**
* Called when error comes from RM communications as well as from errors in
* the callback itself from the app. Calling
* stop() is the recommended action.
*
* @param e
*/
public void onError(Throwable e);
} }
} }

View File

@ -217,7 +217,7 @@ extends AMRMClientAsync<T> {
// synchronization ensures we don't send heartbeats after unregistering // synchronization ensures we don't send heartbeats after unregistering
synchronized (unregisterHeartbeatLock) { synchronized (unregisterHeartbeatLock) {
if (!keepRunning) { if (!keepRunning) {
break; return;
} }
try { try {
@ -227,13 +227,13 @@ extends AMRMClientAsync<T> {
savedException = ex; savedException = ex;
// interrupt handler thread in case it waiting on the queue // interrupt handler thread in case it waiting on the queue
handlerThread.interrupt(); handlerThread.interrupt();
break; return;
} catch (IOException e) { } catch (IOException e) {
LOG.error("IO exception on heartbeat", e); LOG.error("IO exception on heartbeat", e);
savedException = e; savedException = e;
// interrupt handler thread in case it waiting on the queue // interrupt handler thread in case it waiting on the queue
handlerThread.interrupt(); handlerThread.interrupt();
break; return;
} }
} }
if (response != null) { if (response != null) {
@ -266,51 +266,60 @@ extends AMRMClientAsync<T> {
} }
public void run() { public void run() {
while (keepRunning) { while (true) {
AllocateResponse response; if (!keepRunning) {
return;
}
try { try {
AllocateResponse response;
if(savedException != null) { if(savedException != null) {
LOG.error("Stopping callback due to: ", savedException); LOG.error("Stopping callback due to: ", savedException);
handler.onError(savedException); handler.onError(savedException);
break;
}
response = responseQueue.take();
} catch (InterruptedException ex) {
LOG.info("Interrupted while waiting for queue", ex);
continue;
}
if (response.getAMCommand() != null) {
switch(response.getAMCommand()) {
case AM_RESYNC:
case AM_SHUTDOWN:
handler.onShutdownRequest();
LOG.info("Shutdown requested. Stopping callback.");
return; return;
default:
String msg =
"Unhandled value of AMCommand: " + response.getAMCommand();
LOG.error(msg);
throw new YarnRuntimeException(msg);
} }
} try {
List<NodeReport> updatedNodes = response.getUpdatedNodes(); response = responseQueue.take();
if (!updatedNodes.isEmpty()) { } catch (InterruptedException ex) {
handler.onNodesUpdated(updatedNodes); LOG.info("Interrupted while waiting for queue", ex);
} continue;
}
List<ContainerStatus> completed =
response.getCompletedContainersStatuses();
if (!completed.isEmpty()) {
handler.onContainersCompleted(completed);
}
List<Container> allocated = response.getAllocatedContainers(); if (response.getAMCommand() != null) {
if (!allocated.isEmpty()) { switch(response.getAMCommand()) {
handler.onContainersAllocated(allocated); case AM_RESYNC:
case AM_SHUTDOWN:
handler.onShutdownRequest();
LOG.info("Shutdown requested. Stopping callback.");
return;
default:
String msg =
"Unhandled value of RM AMCommand: " + response.getAMCommand();
LOG.error(msg);
throw new YarnRuntimeException(msg);
}
}
List<NodeReport> updatedNodes = response.getUpdatedNodes();
if (!updatedNodes.isEmpty()) {
handler.onNodesUpdated(updatedNodes);
}
List<ContainerStatus> completed =
response.getCompletedContainersStatuses();
if (!completed.isEmpty()) {
handler.onContainersCompleted(completed);
}
List<Container> allocated = response.getAllocatedContainers();
if (!allocated.isEmpty()) {
handler.onContainersAllocated(allocated);
}
progress = handler.getProgress();
} catch (Throwable ex) {
handler.onError(ex);
// re-throw exception to end the thread
throw new YarnRuntimeException(ex);
} }
progress = handler.getProgress();
} }
} }
} }

View File

@ -21,10 +21,12 @@ package org.apache.hadoop.yarn.client.api.async.impl;
import static org.mockito.Matchers.anyFloat; import static org.mockito.Matchers.anyFloat;
import static org.mockito.Matchers.anyInt; import static org.mockito.Matchers.anyInt;
import static org.mockito.Matchers.anyString; import static org.mockito.Matchers.anyString;
import static org.mockito.Matchers.any;
import static org.mockito.Mockito.mock; import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when; import static org.mockito.Mockito.when;
import static org.mockito.Mockito.times; import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify; import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.spy;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
@ -54,6 +56,7 @@ import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest;
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync; import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
import org.apache.hadoop.yarn.client.api.impl.AMRMClientImpl; import org.apache.hadoop.yarn.client.api.impl.AMRMClientImpl;
import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
import org.junit.Test; import org.junit.Test;
import org.mockito.invocation.InvocationOnMock; import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer; import org.mockito.stubbing.Answer;
@ -264,13 +267,13 @@ public class TestAMRMClientAsync {
AMRMClientAsync<ContainerRequest> asyncClient = AMRMClientAsync<ContainerRequest> asyncClient =
AMRMClientAsync.createAMRMClientAsync(client, 20, callbackHandler); AMRMClientAsync.createAMRMClientAsync(client, 20, callbackHandler);
callbackHandler.registerAsyncClient(asyncClient); callbackHandler.asynClient = asyncClient;
asyncClient.init(conf); asyncClient.init(conf);
asyncClient.start(); asyncClient.start();
synchronized (callbackHandler.notifier) { synchronized (callbackHandler.notifier) {
asyncClient.registerApplicationMaster("localhost", 1234, null); asyncClient.registerApplicationMaster("localhost", 1234, null);
while(callbackHandler.stop == false) { while(callbackHandler.notify == false) {
try { try {
callbackHandler.notifier.wait(); callbackHandler.notifier.wait();
} catch (InterruptedException e) { } catch (InterruptedException e) {
@ -280,6 +283,65 @@ public class TestAMRMClientAsync {
} }
} }
void runCallBackThrowOutException(TestCallbackHandler2 callbackHandler) throws
InterruptedException, YarnException, IOException {
Configuration conf = new Configuration();
@SuppressWarnings("unchecked")
AMRMClient<ContainerRequest> client = mock(AMRMClientImpl.class);
List<ContainerStatus> completed = Arrays.asList(
ContainerStatus.newInstance(newContainerId(0, 0, 0, 0),
ContainerState.COMPLETE, "", 0));
final AllocateResponse response = createAllocateResponse(completed,
new ArrayList<Container>(), null);
when(client.allocate(anyFloat())).thenReturn(response);
AMRMClientAsync<ContainerRequest> asyncClient =
AMRMClientAsync.createAMRMClientAsync(client, 20, callbackHandler);
callbackHandler.asynClient = asyncClient;
callbackHandler.throwOutException = true;
asyncClient.init(conf);
asyncClient.start();
// call register and wait for error callback and stop
synchronized (callbackHandler.notifier) {
asyncClient.registerApplicationMaster("localhost", 1234, null);
while(callbackHandler.notify == false) {
try {
callbackHandler.notifier.wait();
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
// verify error invoked
verify(callbackHandler, times(0)).getProgress();
verify(callbackHandler, times(1)).onError(any(Exception.class));
// sleep to wait for a few heartbeat calls that can trigger callbacks
Thread.sleep(50);
// verify no more invocations after the first one.
// ie. callback thread has stopped
verify(callbackHandler, times(0)).getProgress();
verify(callbackHandler, times(1)).onError(any(Exception.class));
}
@Test (timeout = 5000)
public void testCallBackThrowOutException() throws YarnException,
IOException, InterruptedException {
// test exception in callback with app calling stop() on app.onError()
TestCallbackHandler2 callbackHandler = spy(new TestCallbackHandler2());
runCallBackThrowOutException(callbackHandler);
}
@Test (timeout = 5000)
public void testCallBackThrowOutExceptionNoStop() throws YarnException,
IOException, InterruptedException {
// test exception in callback with app not calling stop() on app.onError()
TestCallbackHandler2 callbackHandler = spy(new TestCallbackHandler2());
callbackHandler.stop = false;
runCallBackThrowOutException(callbackHandler);
}
private AllocateResponse createAllocateResponse( private AllocateResponse createAllocateResponse(
List<ContainerStatus> completed, List<Container> allocated, List<ContainerStatus> completed, List<Container> allocated,
List<NMToken> nmTokens) { List<NMToken> nmTokens) {
@ -378,8 +440,8 @@ public class TestAMRMClientAsync {
} }
@Override @Override
public void onError(Exception e) { public void onError(Throwable e) {
savedException = e; savedException = new Exception(e.getMessage());
synchronized (notifier) { synchronized (notifier) {
notifier.notifyAll(); notifier.notifyAll();
} }
@ -390,10 +452,16 @@ public class TestAMRMClientAsync {
Object notifier = new Object(); Object notifier = new Object();
@SuppressWarnings("rawtypes") @SuppressWarnings("rawtypes")
AMRMClientAsync asynClient; AMRMClientAsync asynClient;
boolean stop = false; boolean stop = true;
boolean notify = false;
boolean throwOutException = false;
@Override @Override
public void onContainersCompleted(List<ContainerStatus> statuses) {} public void onContainersCompleted(List<ContainerStatus> statuses) {
if (throwOutException) {
throw new YarnRuntimeException("Exception from callback handler");
}
}
@Override @Override
public void onContainersAllocated(List<Container> containers) {} public void onContainersAllocated(List<Container> containers) {}
@ -406,20 +474,24 @@ public class TestAMRMClientAsync {
@Override @Override
public float getProgress() { public float getProgress() {
asynClient.stop(); callStopAndNotify();
stop = true;
synchronized (notifier) {
notifier.notifyAll();
}
return 0; return 0;
} }
@Override @Override
public void onError(Exception e) {} public void onError(Throwable e) {
Assert.assertEquals(e.getMessage(), "Exception from callback handler");
callStopAndNotify();
}
public void registerAsyncClient( void callStopAndNotify() {
AMRMClientAsync<ContainerRequest> asyncClient) { if(stop) {
this.asynClient = asyncClient; asynClient.stop();
}
notify = true;
synchronized (notifier) {
notifier.notifyAll();
}
} }
} }
} }