YARN-875. Application can hang if AMRMClientAsync callback thread has exception (Xuan Gong via bikas)
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1506750 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ed6598791e
commit
9c453d4432
|
@ -743,6 +743,9 @@ Release 2.1.0-beta - 2013-07-02
|
||||||
YARN-873. YARNClient.getApplicationReport(unknownAppId) returns a null
|
YARN-873. YARNClient.getApplicationReport(unknownAppId) returns a null
|
||||||
report (Xuan Gong via bikas)
|
report (Xuan Gong via bikas)
|
||||||
|
|
||||||
|
YARN-875. Application can hang if AMRMClientAsync callback thread has
|
||||||
|
exception (Xuan Gong via bikas)
|
||||||
|
|
||||||
BREAKDOWN OF HADOOP-8562/YARN-191 SUBTASKS AND RELATED JIRAS
|
BREAKDOWN OF HADOOP-8562/YARN-191 SUBTASKS AND RELATED JIRAS
|
||||||
|
|
||||||
YARN-158. Yarn creating package-info.java must not depend on sh.
|
YARN-158. Yarn creating package-info.java must not depend on sh.
|
||||||
|
|
|
@ -649,8 +649,9 @@ public class ApplicationMaster {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void onError(Exception e) {
|
public void onError(Throwable e) {
|
||||||
done = true;
|
done = true;
|
||||||
|
resourceManager.stop();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -220,6 +220,13 @@ extends AbstractService {
|
||||||
|
|
||||||
public float getProgress();
|
public float getProgress();
|
||||||
|
|
||||||
public void onError(Exception e);
|
/**
|
||||||
|
* Called when error comes from RM communications as well as from errors in
|
||||||
|
* the callback itself from the app. Calling
|
||||||
|
* stop() is the recommended action.
|
||||||
|
*
|
||||||
|
* @param e
|
||||||
|
*/
|
||||||
|
public void onError(Throwable e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -217,7 +217,7 @@ extends AMRMClientAsync<T> {
|
||||||
// synchronization ensures we don't send heartbeats after unregistering
|
// synchronization ensures we don't send heartbeats after unregistering
|
||||||
synchronized (unregisterHeartbeatLock) {
|
synchronized (unregisterHeartbeatLock) {
|
||||||
if (!keepRunning) {
|
if (!keepRunning) {
|
||||||
break;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
@ -227,13 +227,13 @@ extends AMRMClientAsync<T> {
|
||||||
savedException = ex;
|
savedException = ex;
|
||||||
// interrupt handler thread in case it waiting on the queue
|
// interrupt handler thread in case it waiting on the queue
|
||||||
handlerThread.interrupt();
|
handlerThread.interrupt();
|
||||||
break;
|
return;
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
LOG.error("IO exception on heartbeat", e);
|
LOG.error("IO exception on heartbeat", e);
|
||||||
savedException = e;
|
savedException = e;
|
||||||
// interrupt handler thread in case it waiting on the queue
|
// interrupt handler thread in case it waiting on the queue
|
||||||
handlerThread.interrupt();
|
handlerThread.interrupt();
|
||||||
break;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (response != null) {
|
if (response != null) {
|
||||||
|
@ -266,51 +266,60 @@ extends AMRMClientAsync<T> {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void run() {
|
public void run() {
|
||||||
while (keepRunning) {
|
while (true) {
|
||||||
AllocateResponse response;
|
if (!keepRunning) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
try {
|
try {
|
||||||
|
AllocateResponse response;
|
||||||
if(savedException != null) {
|
if(savedException != null) {
|
||||||
LOG.error("Stopping callback due to: ", savedException);
|
LOG.error("Stopping callback due to: ", savedException);
|
||||||
handler.onError(savedException);
|
handler.onError(savedException);
|
||||||
break;
|
|
||||||
}
|
|
||||||
response = responseQueue.take();
|
|
||||||
} catch (InterruptedException ex) {
|
|
||||||
LOG.info("Interrupted while waiting for queue", ex);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (response.getAMCommand() != null) {
|
|
||||||
switch(response.getAMCommand()) {
|
|
||||||
case AM_RESYNC:
|
|
||||||
case AM_SHUTDOWN:
|
|
||||||
handler.onShutdownRequest();
|
|
||||||
LOG.info("Shutdown requested. Stopping callback.");
|
|
||||||
return;
|
return;
|
||||||
default:
|
|
||||||
String msg =
|
|
||||||
"Unhandled value of AMCommand: " + response.getAMCommand();
|
|
||||||
LOG.error(msg);
|
|
||||||
throw new YarnRuntimeException(msg);
|
|
||||||
}
|
}
|
||||||
}
|
try {
|
||||||
List<NodeReport> updatedNodes = response.getUpdatedNodes();
|
response = responseQueue.take();
|
||||||
if (!updatedNodes.isEmpty()) {
|
} catch (InterruptedException ex) {
|
||||||
handler.onNodesUpdated(updatedNodes);
|
LOG.info("Interrupted while waiting for queue", ex);
|
||||||
}
|
continue;
|
||||||
|
}
|
||||||
List<ContainerStatus> completed =
|
|
||||||
response.getCompletedContainersStatuses();
|
|
||||||
if (!completed.isEmpty()) {
|
|
||||||
handler.onContainersCompleted(completed);
|
|
||||||
}
|
|
||||||
|
|
||||||
List<Container> allocated = response.getAllocatedContainers();
|
if (response.getAMCommand() != null) {
|
||||||
if (!allocated.isEmpty()) {
|
switch(response.getAMCommand()) {
|
||||||
handler.onContainersAllocated(allocated);
|
case AM_RESYNC:
|
||||||
|
case AM_SHUTDOWN:
|
||||||
|
handler.onShutdownRequest();
|
||||||
|
LOG.info("Shutdown requested. Stopping callback.");
|
||||||
|
return;
|
||||||
|
default:
|
||||||
|
String msg =
|
||||||
|
"Unhandled value of RM AMCommand: " + response.getAMCommand();
|
||||||
|
LOG.error(msg);
|
||||||
|
throw new YarnRuntimeException(msg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
List<NodeReport> updatedNodes = response.getUpdatedNodes();
|
||||||
|
if (!updatedNodes.isEmpty()) {
|
||||||
|
handler.onNodesUpdated(updatedNodes);
|
||||||
|
}
|
||||||
|
|
||||||
|
List<ContainerStatus> completed =
|
||||||
|
response.getCompletedContainersStatuses();
|
||||||
|
if (!completed.isEmpty()) {
|
||||||
|
handler.onContainersCompleted(completed);
|
||||||
|
}
|
||||||
|
|
||||||
|
List<Container> allocated = response.getAllocatedContainers();
|
||||||
|
if (!allocated.isEmpty()) {
|
||||||
|
handler.onContainersAllocated(allocated);
|
||||||
|
}
|
||||||
|
|
||||||
|
progress = handler.getProgress();
|
||||||
|
} catch (Throwable ex) {
|
||||||
|
handler.onError(ex);
|
||||||
|
// re-throw exception to end the thread
|
||||||
|
throw new YarnRuntimeException(ex);
|
||||||
}
|
}
|
||||||
|
|
||||||
progress = handler.getProgress();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,10 +21,12 @@ package org.apache.hadoop.yarn.client.api.async.impl;
|
||||||
import static org.mockito.Matchers.anyFloat;
|
import static org.mockito.Matchers.anyFloat;
|
||||||
import static org.mockito.Matchers.anyInt;
|
import static org.mockito.Matchers.anyInt;
|
||||||
import static org.mockito.Matchers.anyString;
|
import static org.mockito.Matchers.anyString;
|
||||||
|
import static org.mockito.Matchers.any;
|
||||||
import static org.mockito.Mockito.mock;
|
import static org.mockito.Mockito.mock;
|
||||||
import static org.mockito.Mockito.when;
|
import static org.mockito.Mockito.when;
|
||||||
import static org.mockito.Mockito.times;
|
import static org.mockito.Mockito.times;
|
||||||
import static org.mockito.Mockito.verify;
|
import static org.mockito.Mockito.verify;
|
||||||
|
import static org.mockito.Mockito.spy;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
@ -54,6 +56,7 @@ import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest;
|
||||||
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
|
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
|
||||||
import org.apache.hadoop.yarn.client.api.impl.AMRMClientImpl;
|
import org.apache.hadoop.yarn.client.api.impl.AMRMClientImpl;
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import org.mockito.invocation.InvocationOnMock;
|
import org.mockito.invocation.InvocationOnMock;
|
||||||
import org.mockito.stubbing.Answer;
|
import org.mockito.stubbing.Answer;
|
||||||
|
@ -264,13 +267,13 @@ public class TestAMRMClientAsync {
|
||||||
|
|
||||||
AMRMClientAsync<ContainerRequest> asyncClient =
|
AMRMClientAsync<ContainerRequest> asyncClient =
|
||||||
AMRMClientAsync.createAMRMClientAsync(client, 20, callbackHandler);
|
AMRMClientAsync.createAMRMClientAsync(client, 20, callbackHandler);
|
||||||
callbackHandler.registerAsyncClient(asyncClient);
|
callbackHandler.asynClient = asyncClient;
|
||||||
asyncClient.init(conf);
|
asyncClient.init(conf);
|
||||||
asyncClient.start();
|
asyncClient.start();
|
||||||
|
|
||||||
synchronized (callbackHandler.notifier) {
|
synchronized (callbackHandler.notifier) {
|
||||||
asyncClient.registerApplicationMaster("localhost", 1234, null);
|
asyncClient.registerApplicationMaster("localhost", 1234, null);
|
||||||
while(callbackHandler.stop == false) {
|
while(callbackHandler.notify == false) {
|
||||||
try {
|
try {
|
||||||
callbackHandler.notifier.wait();
|
callbackHandler.notifier.wait();
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
|
@ -280,6 +283,65 @@ public class TestAMRMClientAsync {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void runCallBackThrowOutException(TestCallbackHandler2 callbackHandler) throws
|
||||||
|
InterruptedException, YarnException, IOException {
|
||||||
|
Configuration conf = new Configuration();
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
AMRMClient<ContainerRequest> client = mock(AMRMClientImpl.class);
|
||||||
|
|
||||||
|
List<ContainerStatus> completed = Arrays.asList(
|
||||||
|
ContainerStatus.newInstance(newContainerId(0, 0, 0, 0),
|
||||||
|
ContainerState.COMPLETE, "", 0));
|
||||||
|
final AllocateResponse response = createAllocateResponse(completed,
|
||||||
|
new ArrayList<Container>(), null);
|
||||||
|
|
||||||
|
when(client.allocate(anyFloat())).thenReturn(response);
|
||||||
|
AMRMClientAsync<ContainerRequest> asyncClient =
|
||||||
|
AMRMClientAsync.createAMRMClientAsync(client, 20, callbackHandler);
|
||||||
|
callbackHandler.asynClient = asyncClient;
|
||||||
|
callbackHandler.throwOutException = true;
|
||||||
|
asyncClient.init(conf);
|
||||||
|
asyncClient.start();
|
||||||
|
|
||||||
|
// call register and wait for error callback and stop
|
||||||
|
synchronized (callbackHandler.notifier) {
|
||||||
|
asyncClient.registerApplicationMaster("localhost", 1234, null);
|
||||||
|
while(callbackHandler.notify == false) {
|
||||||
|
try {
|
||||||
|
callbackHandler.notifier.wait();
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// verify error invoked
|
||||||
|
verify(callbackHandler, times(0)).getProgress();
|
||||||
|
verify(callbackHandler, times(1)).onError(any(Exception.class));
|
||||||
|
// sleep to wait for a few heartbeat calls that can trigger callbacks
|
||||||
|
Thread.sleep(50);
|
||||||
|
// verify no more invocations after the first one.
|
||||||
|
// ie. callback thread has stopped
|
||||||
|
verify(callbackHandler, times(0)).getProgress();
|
||||||
|
verify(callbackHandler, times(1)).onError(any(Exception.class));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test (timeout = 5000)
|
||||||
|
public void testCallBackThrowOutException() throws YarnException,
|
||||||
|
IOException, InterruptedException {
|
||||||
|
// test exception in callback with app calling stop() on app.onError()
|
||||||
|
TestCallbackHandler2 callbackHandler = spy(new TestCallbackHandler2());
|
||||||
|
runCallBackThrowOutException(callbackHandler);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test (timeout = 5000)
|
||||||
|
public void testCallBackThrowOutExceptionNoStop() throws YarnException,
|
||||||
|
IOException, InterruptedException {
|
||||||
|
// test exception in callback with app not calling stop() on app.onError()
|
||||||
|
TestCallbackHandler2 callbackHandler = spy(new TestCallbackHandler2());
|
||||||
|
callbackHandler.stop = false;
|
||||||
|
runCallBackThrowOutException(callbackHandler);
|
||||||
|
}
|
||||||
|
|
||||||
private AllocateResponse createAllocateResponse(
|
private AllocateResponse createAllocateResponse(
|
||||||
List<ContainerStatus> completed, List<Container> allocated,
|
List<ContainerStatus> completed, List<Container> allocated,
|
||||||
List<NMToken> nmTokens) {
|
List<NMToken> nmTokens) {
|
||||||
|
@ -378,8 +440,8 @@ public class TestAMRMClientAsync {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void onError(Exception e) {
|
public void onError(Throwable e) {
|
||||||
savedException = e;
|
savedException = new Exception(e.getMessage());
|
||||||
synchronized (notifier) {
|
synchronized (notifier) {
|
||||||
notifier.notifyAll();
|
notifier.notifyAll();
|
||||||
}
|
}
|
||||||
|
@ -390,10 +452,16 @@ public class TestAMRMClientAsync {
|
||||||
Object notifier = new Object();
|
Object notifier = new Object();
|
||||||
@SuppressWarnings("rawtypes")
|
@SuppressWarnings("rawtypes")
|
||||||
AMRMClientAsync asynClient;
|
AMRMClientAsync asynClient;
|
||||||
boolean stop = false;
|
boolean stop = true;
|
||||||
|
boolean notify = false;
|
||||||
|
boolean throwOutException = false;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void onContainersCompleted(List<ContainerStatus> statuses) {}
|
public void onContainersCompleted(List<ContainerStatus> statuses) {
|
||||||
|
if (throwOutException) {
|
||||||
|
throw new YarnRuntimeException("Exception from callback handler");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void onContainersAllocated(List<Container> containers) {}
|
public void onContainersAllocated(List<Container> containers) {}
|
||||||
|
@ -406,20 +474,24 @@ public class TestAMRMClientAsync {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public float getProgress() {
|
public float getProgress() {
|
||||||
asynClient.stop();
|
callStopAndNotify();
|
||||||
stop = true;
|
|
||||||
synchronized (notifier) {
|
|
||||||
notifier.notifyAll();
|
|
||||||
}
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void onError(Exception e) {}
|
public void onError(Throwable e) {
|
||||||
|
Assert.assertEquals(e.getMessage(), "Exception from callback handler");
|
||||||
|
callStopAndNotify();
|
||||||
|
}
|
||||||
|
|
||||||
public void registerAsyncClient(
|
void callStopAndNotify() {
|
||||||
AMRMClientAsync<ContainerRequest> asyncClient) {
|
if(stop) {
|
||||||
this.asynClient = asyncClient;
|
asynClient.stop();
|
||||||
|
}
|
||||||
|
notify = true;
|
||||||
|
synchronized (notifier) {
|
||||||
|
notifier.notifyAll();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue