YARN-8893. [AMRMProxy] Fix thread leak in AMRMClientRelayer and UAM client. Contributed by Botong Huang.

This commit is contained in:
Giovanni Matteo Fumarola 2018-11-02 15:30:08 -07:00
parent aed836efbf
commit 989715ec50
9 changed files with 103 additions and 77 deletions

View File

@ -27,9 +27,8 @@
import java.util.Set; import java.util.Set;
import java.util.TreeSet; import java.util.TreeSet;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.HadoopIllegalArgumentException;
import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.service.AbstractService;
import org.apache.hadoop.yarn.api.ApplicationMasterProtocol; import org.apache.hadoop.yarn.api.ApplicationMasterProtocol;
import org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest; import org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest;
import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse; import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse;
@ -47,8 +46,6 @@
import org.apache.hadoop.yarn.api.records.UpdateContainerRequest; import org.apache.hadoop.yarn.api.records.UpdateContainerRequest;
import org.apache.hadoop.yarn.api.records.UpdatedContainer; import org.apache.hadoop.yarn.api.records.UpdatedContainer;
import org.apache.hadoop.yarn.client.AMRMClientUtils; import org.apache.hadoop.yarn.client.AMRMClientUtils;
import org.apache.hadoop.yarn.client.ClientRMProxy;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.ApplicationMasterNotRegisteredException; import org.apache.hadoop.yarn.exceptions.ApplicationMasterNotRegisteredException;
import org.apache.hadoop.yarn.exceptions.InvalidApplicationMasterRequestException; import org.apache.hadoop.yarn.exceptions.InvalidApplicationMasterRequestException;
import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.exceptions.YarnException;
@ -66,8 +63,7 @@
* pending requests similar to AMRMClient, and handles RM re-sync automatically * pending requests similar to AMRMClient, and handles RM re-sync automatically
* without propagate the re-sync exception back to AMRMClient. * without propagate the re-sync exception back to AMRMClient.
*/ */
public class AMRMClientRelayer extends AbstractService public class AMRMClientRelayer implements ApplicationMasterProtocol {
implements ApplicationMasterProtocol {
private static final Logger LOG = private static final Logger LOG =
LoggerFactory.getLogger(AMRMClientRelayer.class); LoggerFactory.getLogger(AMRMClientRelayer.class);
@ -136,51 +132,16 @@ public class AMRMClientRelayer extends AbstractService
private AMRMClientRelayerMetrics metrics; private AMRMClientRelayerMetrics metrics;
public AMRMClientRelayer() {
super(AMRMClientRelayer.class.getName());
this.resetResponseId = -1;
this.metrics = AMRMClientRelayerMetrics.getInstance();
this.rmClient = null;
this.appId = null;
this.rmId = "";
}
public AMRMClientRelayer(ApplicationMasterProtocol rmClient, public AMRMClientRelayer(ApplicationMasterProtocol rmClient,
ApplicationId appId, String rmId) { ApplicationId appId, String rmId) {
this(); this.resetResponseId = -1;
this.metrics = AMRMClientRelayerMetrics.getInstance();
this.rmId = "";
this.rmClient = rmClient; this.rmClient = rmClient;
this.appId = appId; this.appId = appId;
this.rmId = rmId; this.rmId = rmId;
} }
@Override
protected void serviceInit(Configuration conf) throws Exception {
super.serviceInit(conf);
}
@Override
protected void serviceStart() throws Exception {
final YarnConfiguration conf = new YarnConfiguration(getConfig());
try {
if (this.rmClient == null) {
this.rmClient =
ClientRMProxy.createRMProxy(conf, ApplicationMasterProtocol.class);
}
} catch (IOException e) {
throw new YarnRuntimeException(e);
}
super.serviceStart();
}
@Override
protected void serviceStop() throws Exception {
if (this.rmClient != null) {
RPC.stopProxy(this.rmClient);
}
shutdown();
super.serviceStop();
}
public void setAMRegistrationRequest( public void setAMRegistrationRequest(
RegisterApplicationMasterRequest registerRequest) { RegisterApplicationMasterRequest registerRequest) {
this.amRegistrationRequest = registerRequest; this.amRegistrationRequest = registerRequest;
@ -231,6 +192,14 @@ public void shutdown() {
.decrClientPending(rmId, req.getContainerUpdateType(), 1); .decrClientPending(rmId, req.getContainerUpdateType(), 1);
} }
} }
if (this.rmClient != null) {
try {
RPC.stopProxy(this.rmClient);
this.rmClient = null;
} catch (HadoopIllegalArgumentException e) {
}
}
} }
@Override @Override

View File

@ -371,6 +371,34 @@ public FinishApplicationMasterResponse finishApplicationMaster(String uamId,
return response; return response;
} }
/**
* Shutdown an UAM client without killing it in YarnRM.
*
* @param uamId uam Id
* @throws YarnException if fails
*/
public void shutDownConnections(String uamId)
throws YarnException {
if (!this.unmanagedAppMasterMap.containsKey(uamId)) {
throw new YarnException("UAM " + uamId + " does not exist");
}
LOG.info(
"Shutting down UAM id {} for application {} without killing the UAM",
uamId, this.appIdMap.get(uamId));
this.unmanagedAppMasterMap.remove(uamId).shutDownConnections();
}
/**
* Shutdown all UAM clients without killing them in YarnRM.
*
* @throws YarnException if fails
*/
public void shutDownConnections() throws YarnException {
for (String uamId : this.unmanagedAppMasterMap.keySet()) {
shutDownConnections(uamId);
}
}
/** /**
* Get the id of all running UAMs. * Get the id of all running UAMs.
* *

View File

@ -255,9 +255,6 @@ public RegisterApplicationMasterResponse registerApplicationMaster(
public FinishApplicationMasterResponse finishApplicationMaster( public FinishApplicationMasterResponse finishApplicationMaster(
FinishApplicationMasterRequest request) FinishApplicationMasterRequest request)
throws YarnException, IOException { throws YarnException, IOException {
this.heartbeatHandler.shutdown();
if (this.userUgi == null) { if (this.userUgi == null) {
if (this.connectionInitiated) { if (this.connectionInitiated) {
// This is possible if the async launchUAM is still // This is possible if the async launchUAM is still
@ -270,7 +267,12 @@ public FinishApplicationMasterResponse finishApplicationMaster(
+ "be called before createAndRegister"); + "be called before createAndRegister");
} }
} }
return this.rmProxyRelayer.finishApplicationMaster(request); FinishApplicationMasterResponse response =
this.rmProxyRelayer.finishApplicationMaster(request);
if (response.getIsUnregistered()) {
shutDownConnections();
}
return response;
} }
/** /**
@ -282,11 +284,10 @@ public FinishApplicationMasterResponse finishApplicationMaster(
*/ */
public KillApplicationResponse forceKillApplication() public KillApplicationResponse forceKillApplication()
throws IOException, YarnException { throws IOException, YarnException {
shutDownConnections();
KillApplicationRequest request = KillApplicationRequest request =
KillApplicationRequest.newInstance(this.applicationId); KillApplicationRequest.newInstance(this.applicationId);
this.heartbeatHandler.shutdown();
if (this.rmClient == null) { if (this.rmClient == null) {
this.rmClient = createRMProxy(ApplicationClientProtocol.class, this.conf, this.rmClient = createRMProxy(ApplicationClientProtocol.class, this.conf,
UserGroupInformation.createRemoteUser(this.submitter), null); UserGroupInformation.createRemoteUser(this.submitter), null);
@ -323,6 +324,14 @@ public void allocateAsync(AllocateRequest request,
} }
} }
/**
* Shutdown this UAM client, without killing the UAM in the YarnRM side.
*/
public void shutDownConnections() {
this.heartbeatHandler.shutdown();
this.rmProxyRelayer.shutdown();
}
/** /**
* Returns the application id of the UAM. * Returns the application id of the UAM.
* *
@ -532,4 +541,9 @@ public int getRequestQueueSize() {
protected void drainHeartbeatThread() { protected void drainHeartbeatThread() {
this.heartbeatHandler.drainHeartbeatThread(); this.heartbeatHandler.drainHeartbeatThread();
} }
@VisibleForTesting
protected boolean isHeartbeatThreadAlive() {
return this.heartbeatHandler.isAlive();
}
} }

View File

@ -120,7 +120,6 @@
import org.apache.hadoop.yarn.api.records.ContainerReport; import org.apache.hadoop.yarn.api.records.ContainerReport;
import org.apache.hadoop.yarn.api.records.ContainerState; import org.apache.hadoop.yarn.api.records.ContainerState;
import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.NMToken; import org.apache.hadoop.yarn.api.records.NMToken;
import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.NodeLabel; import org.apache.hadoop.yarn.api.records.NodeLabel;
@ -338,9 +337,7 @@ public FinishApplicationMasterResponse finishApplicationMaster(
applicationContainerIdMap.remove(appId); applicationContainerIdMap.remove(appId);
} }
return FinishApplicationMasterResponse.newInstance( return FinishApplicationMasterResponse.newInstance(true);
request.getFinalApplicationStatus() == FinalApplicationStatus.SUCCEEDED
? true : false);
} }
protected ApplicationId getApplicationId(int id) { protected ApplicationId getApplicationId(int id) {

View File

@ -46,6 +46,7 @@
import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.scheduler.ResourceRequestSet; import org.apache.hadoop.yarn.server.scheduler.ResourceRequestSet;
import org.apache.hadoop.yarn.util.Records; import org.apache.hadoop.yarn.util.Records;
import org.junit.After;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
@ -155,16 +156,17 @@ public void setup() throws YarnException, IOException {
this.mockAMS = new MockApplicationMasterService(); this.mockAMS = new MockApplicationMasterService();
this.relayer = new AMRMClientRelayer(this.mockAMS, null, "TEST"); this.relayer = new AMRMClientRelayer(this.mockAMS, null, "TEST");
this.relayer.init(conf);
this.relayer.start();
this.relayer.registerApplicationMaster( this.relayer.registerApplicationMaster(
RegisterApplicationMasterRequest.newInstance("", 0, "")); RegisterApplicationMasterRequest.newInstance("", 0, ""));
clearAllocateRequestLists(); clearAllocateRequestLists();
} }
@After
public void cleanup() {
this.relayer.shutdown();
}
private void assertAsksAndReleases(int expectedAsk, int expectedRelease) { private void assertAsksAndReleases(int expectedAsk, int expectedRelease) {
Assert.assertEquals(expectedAsk, this.mockAMS.lastAsk.size()); Assert.assertEquals(expectedAsk, this.mockAMS.lastAsk.size());
Assert.assertEquals(expectedRelease, this.mockAMS.lastRelease.size()); Assert.assertEquals(expectedRelease, this.mockAMS.lastRelease.size());

View File

@ -141,17 +141,11 @@ public void setup() throws YarnException, IOException {
this.homeRelayer = new AMRMClientRelayer(this.mockAMS, this.homeRelayer = new AMRMClientRelayer(this.mockAMS,
ApplicationId.newInstance(0, 0), this.homeID); ApplicationId.newInstance(0, 0), this.homeID);
this.homeRelayer.init(conf);
this.homeRelayer.start();
this.homeRelayer.registerApplicationMaster( this.homeRelayer.registerApplicationMaster(
RegisterApplicationMasterRequest.newInstance("", 0, "")); RegisterApplicationMasterRequest.newInstance("", 0, ""));
this.uamRelayer = new AMRMClientRelayer(this.mockAMS, this.uamRelayer = new AMRMClientRelayer(this.mockAMS,
ApplicationId.newInstance(0, 0), this.uamID); ApplicationId.newInstance(0, 0), this.uamID);
this.uamRelayer.init(conf);
this.uamRelayer.start();
this.uamRelayer.registerApplicationMaster( this.uamRelayer.registerApplicationMaster(
RegisterApplicationMasterRequest.newInstance("", 0, "")); RegisterApplicationMasterRequest.newInstance("", 0, ""));

View File

@ -87,7 +87,7 @@ protected void waitForCallBackCountAndCheckZeroPending(
} }
} }
@Test(timeout = 5000) @Test(timeout = 10000)
public void testBasicUsage() public void testBasicUsage()
throws YarnException, IOException, InterruptedException { throws YarnException, IOException, InterruptedException {
@ -104,6 +104,11 @@ public void testBasicUsage()
finishApplicationMaster( finishApplicationMaster(
FinishApplicationMasterRequest.newInstance(null, null, null), FinishApplicationMasterRequest.newInstance(null, null, null),
attemptId); attemptId);
while (uam.isHeartbeatThreadAlive()) {
LOG.info("waiting for heartbeat thread to finish");
Thread.sleep(100);
}
} }
/* /*
@ -261,7 +266,7 @@ public void testFinishWithoutRegister()
attemptId); attemptId);
} }
@Test @Test(timeout = 10000)
public void testForceKill() public void testForceKill()
throws YarnException, IOException, InterruptedException { throws YarnException, IOException, InterruptedException {
launchUAM(attemptId); launchUAM(attemptId);
@ -269,6 +274,11 @@ public void testForceKill()
RegisterApplicationMasterRequest.newInstance(null, 0, null), attemptId); RegisterApplicationMasterRequest.newInstance(null, 0, null), attemptId);
uam.forceKillApplication(); uam.forceKillApplication();
while (uam.isHeartbeatThreadAlive()) {
LOG.info("waiting for heartbeat thread to finish");
Thread.sleep(100);
}
try { try {
uam.forceKillApplication(); uam.forceKillApplication();
Assert.fail("Should fail because application is already killed"); Assert.fail("Should fail because application is already killed");
@ -276,6 +286,19 @@ public void testForceKill()
} }
} }
@Test(timeout = 10000)
public void testShutDownConnections()
throws YarnException, IOException, InterruptedException {
launchUAM(attemptId);
registerApplicationMaster(
RegisterApplicationMasterRequest.newInstance(null, 0, null), attemptId);
uam.shutDownConnections();
while (uam.isHeartbeatThreadAlive()) {
LOG.info("waiting for heartbeat thread to finish");
Thread.sleep(100);
}
}
protected UserGroupInformation getUGIWithToken( protected UserGroupInformation getUGIWithToken(
ApplicationAttemptId appAttemptId) { ApplicationAttemptId appAttemptId) {
UserGroupInformation ugi = UserGroupInformation ugi =

View File

@ -716,12 +716,7 @@ public FinishApplicationMasterResponseInfo call() throws Exception {
uamPool.finishApplicationMaster(subClusterId, finishRequest); uamPool.finishApplicationMaster(subClusterId, finishRequest);
if (uamResponse.getIsUnregistered()) { if (uamResponse.getIsUnregistered()) {
AMRMClientRelayer relayer =
secondaryRelayers.remove(subClusterId); secondaryRelayers.remove(subClusterId);
if(relayer != null) {
relayer.shutdown();
}
if (getNMStateStore() != null) { if (getNMStateStore() != null) {
getNMStateStore().removeAMRMProxyAppContextEntry(attemptId, getNMStateStore().removeAMRMProxyAppContextEntry(attemptId,
NMSS_SECONDARY_SC_PREFIX + subClusterId); NMSS_SECONDARY_SC_PREFIX + subClusterId);
@ -801,8 +796,16 @@ public void setNextInterceptor(RequestInterceptor next) {
*/ */
@Override @Override
public void shutdown() { public void shutdown() {
LOG.info("Shutting down FederationInterceptor for {}", this.attemptId);
// Do not stop uamPool service and kill UAMs here because of possible second // Do not stop uamPool service and kill UAMs here because of possible second
// app attempt // app attempt
try {
this.uamPool.shutDownConnections();
} catch (YarnException e) {
LOG.error("Error shutting down all UAM clients without killing them", e);
}
if (this.threadpool != null) { if (this.threadpool != null) {
try { try {
this.threadpool.shutdown(); this.threadpool.shutdown();
@ -814,9 +817,6 @@ public void shutdown() {
// Stop the home heartbeat thread // Stop the home heartbeat thread
this.homeHeartbeartHandler.shutdown(); this.homeHeartbeartHandler.shutdown();
this.homeRMRelayer.shutdown(); this.homeRMRelayer.shutdown();
for (AMRMClientRelayer relayer : this.secondaryRelayers.values()) {
relayer.shutdown();
}
super.shutdown(); super.shutdown();
} }

View File

@ -206,7 +206,6 @@ public void testFinishOneApplicationMasterWithFailure() throws Exception {
finishApplicationMaster(testAppId, FinalApplicationStatus.FAILED); finishApplicationMaster(testAppId, FinalApplicationStatus.FAILED);
Assert.assertNotNull(finshResponse); Assert.assertNotNull(finshResponse);
Assert.assertEquals(false, finshResponse.getIsUnregistered());
try { try {
// Try to finish an application master that is already finished. // Try to finish an application master that is already finished.