YARN-9013. [GPG] fix order of steps cleaning Registry entries in ApplicationCleaner. Contributed by Botong Huang.

This commit is contained in:
Botong Huang 2018-12-05 20:58:22 -08:00
parent d93507ef15
commit 6e70802532
3 changed files with 67 additions and 22 deletions

View File

@ -19,7 +19,6 @@
package org.apache.hadoop.yarn.server.globalpolicygenerator.applicationcleaner; package org.apache.hadoop.yarn.server.globalpolicygenerator.applicationcleaner;
import java.util.HashSet; import java.util.HashSet;
import java.util.List;
import java.util.Set; import java.util.Set;
import org.apache.commons.lang3.time.DurationFormatUtils; import org.apache.commons.lang3.time.DurationFormatUtils;
@ -96,6 +95,10 @@ public abstract class ApplicationCleaner implements Runnable {
return this.gpgContext; return this.gpgContext;
} }
public FederationRegistryClient getRegistryClient() {
return this.registryClient;
}
/** /**
* Query router for applications. * Query router for applications.
* *
@ -154,18 +157,6 @@ public abstract class ApplicationCleaner implements Runnable {
+ " success Router queries after " + totalAttemptCount + " retries"); + " success Router queries after " + totalAttemptCount + " retries");
} }
protected void cleanupAppRecordInRegistry(Set<ApplicationId> knownApps) {
List<String> allApps = this.registryClient.getAllApplications();
LOG.info("Got " + allApps.size() + " existing apps in registry");
for (String app : allApps) {
ApplicationId appId = ApplicationId.fromString(app);
if (!knownApps.contains(appId)) {
LOG.info("removing finished application entry for {}", app);
this.registryClient.removeAppFromRegistry(appId, true);
}
}
}
@Override @Override
public abstract void run(); public abstract void run();
} }

View File

@ -46,26 +46,37 @@ public class DefaultApplicationCleaner extends ApplicationCleaner {
LOG.info("Application cleaner run at time {}", now); LOG.info("Application cleaner run at time {}", now);
FederationStateStoreFacade facade = getGPGContext().getStateStoreFacade(); FederationStateStoreFacade facade = getGPGContext().getStateStoreFacade();
Set<ApplicationId> candidates = new HashSet<ApplicationId>();
try { try {
// Get the candidate list from StateStore before calling router
Set<ApplicationId> allStateStoreApps = new HashSet<ApplicationId>();
List<ApplicationHomeSubCluster> response = List<ApplicationHomeSubCluster> response =
facade.getApplicationsHomeSubCluster(); facade.getApplicationsHomeSubCluster();
for (ApplicationHomeSubCluster app : response) { for (ApplicationHomeSubCluster app : response) {
candidates.add(app.getApplicationId()); allStateStoreApps.add(app.getApplicationId());
} }
LOG.info("{} app entries in FederationStateStore", candidates.size()); LOG.info("{} app entries in FederationStateStore",
allStateStoreApps.size());
// Get the candidate list from Registry before calling router
List<String> allRegistryApps = getRegistryClient().getAllApplications();
LOG.info("{} app entries in FederationRegistry",
allStateStoreApps.size());
// Get the list of known apps from Router
Set<ApplicationId> routerApps = getRouterKnownApplications(); Set<ApplicationId> routerApps = getRouterKnownApplications();
LOG.info("{} known applications from Router", routerApps.size()); LOG.info("{} known applications from Router", routerApps.size());
candidates = Sets.difference(candidates, routerApps); // Clean up StateStore entries
LOG.info("Deleting {} applications from statestore", candidates.size()); Set<ApplicationId> toDelete =
Sets.difference(allStateStoreApps, routerApps);
LOG.info("Deleting {} applications from statestore", toDelete.size());
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
LOG.debug("Apps to delete: ", candidates.stream().map(Object::toString) LOG.debug("Apps to delete: ", toDelete.stream().map(Object::toString)
.collect(Collectors.joining(","))); .collect(Collectors.joining(",")));
} }
for (ApplicationId appId : candidates) { for (ApplicationId appId : toDelete) {
try { try {
LOG.debug("Deleting {} from statestore ", appId);
facade.deleteApplicationHomeSubCluster(appId); facade.deleteApplicationHomeSubCluster(appId);
} catch (Exception e) { } catch (Exception e) {
LOG.error( LOG.error(
@ -74,8 +85,15 @@ public class DefaultApplicationCleaner extends ApplicationCleaner {
} }
} }
// Clean up registry entries // Clean up Registry entries
cleanupAppRecordInRegistry(routerApps); for (String app : allRegistryApps) {
ApplicationId appId = ApplicationId.fromString(app);
if (!routerApps.contains(appId)) {
LOG.debug("removing finished application entry for {}", app);
getRegistryClient().removeAppFromRegistry(appId, true);
}
}
} catch (Throwable e) { } catch (Throwable e) {
LOG.error("Application cleaner started at time " + now + " fails: ", e); LOG.error("Application cleaner started at time " + now + " fails: ", e);
} }

View File

@ -63,6 +63,8 @@ public class TestDefaultApplicationCleaner {
// The list of applications returned by mocked router // The list of applications returned by mocked router
private Set<ApplicationId> routerAppIds; private Set<ApplicationId> routerAppIds;
private ApplicationId appIdToAddConcurrently;
@Before @Before
public void setup() throws Exception { public void setup() throws Exception {
conf = new YarnConfiguration(); conf = new YarnConfiguration();
@ -111,6 +113,8 @@ public class TestDefaultApplicationCleaner {
new Token<AMRMTokenIdentifier>()); new Token<AMRMTokenIdentifier>());
} }
Assert.assertEquals(3, registryClient.getAllApplications().size()); Assert.assertEquals(3, registryClient.getAllApplications().size());
appIdToAddConcurrently = null;
} }
@After @After
@ -149,13 +153,45 @@ public class TestDefaultApplicationCleaner {
Assert.assertEquals(1, registryClient.getAllApplications().size()); Assert.assertEquals(1, registryClient.getAllApplications().size());
} }
@Test
public void testConcurrentNewApp() throws YarnException {
appIdToAddConcurrently = ApplicationId.newInstance(1, 1);
appCleaner.run();
// The concurrently added app should be still there
Assert.assertEquals(1,
stateStore
.getApplicationsHomeSubCluster(
GetApplicationsHomeSubClusterRequest.newInstance())
.getAppsHomeSubClusters().size());
// The concurrently added app should be still there
Assert.assertEquals(1, registryClient.getAllApplications().size());
}
/** /**
* Testable version of DefaultApplicationCleaner. * Testable version of DefaultApplicationCleaner.
*/ */
public class TestableDefaultApplicationCleaner public class TestableDefaultApplicationCleaner
extends DefaultApplicationCleaner { extends DefaultApplicationCleaner {
@Override @Override
public Set<ApplicationId> getAppsFromRouter() throws YarnRuntimeException { public Set<ApplicationId> getAppsFromRouter() throws YarnRuntimeException {
if (appIdToAddConcurrently != null) {
SubClusterId scId = SubClusterId.newInstance("MySubClusterId");
try {
stateStore
.addApplicationHomeSubCluster(AddApplicationHomeSubClusterRequest
.newInstance(ApplicationHomeSubCluster
.newInstance(appIdToAddConcurrently, scId)));
} catch (YarnException e) {
throw new YarnRuntimeException(e);
}
registryClient.writeAMRMTokenForUAM(appIdToAddConcurrently,
scId.toString(), new Token<AMRMTokenIdentifier>());
}
return routerAppIds; return routerAppIds;
} }
} }