YARN-4924. NM recovery race can lead to container not cleaned up. Contributed by sandflee
(cherry picked from commit 3150ae8108
)
Conflicts:
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
This commit is contained in:
parent
7dd8798933
commit
9b5c5bd42f
|
@ -292,12 +292,6 @@ public class ContainerManagerImpl extends CompositeService implements
|
||||||
for (RecoveredContainerState rcs : stateStore.loadContainersState()) {
|
for (RecoveredContainerState rcs : stateStore.loadContainersState()) {
|
||||||
recoverContainer(rcs);
|
recoverContainer(rcs);
|
||||||
}
|
}
|
||||||
|
|
||||||
String diagnostic = "Application marked finished during recovery";
|
|
||||||
for (ApplicationId appId : appsState.getFinishedApplications()) {
|
|
||||||
dispatcher.getEventHandler().handle(
|
|
||||||
new ApplicationFinishEvent(appId, diagnostic));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1318,11 +1312,6 @@ public class ContainerManagerImpl extends CompositeService implements
|
||||||
} else if (appsFinishedEvent.getReason() == CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER) {
|
} else if (appsFinishedEvent.getReason() == CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER) {
|
||||||
diagnostic = "Application killed by ResourceManager";
|
diagnostic = "Application killed by ResourceManager";
|
||||||
}
|
}
|
||||||
try {
|
|
||||||
this.context.getNMStateStore().storeFinishedApplication(appID);
|
|
||||||
} catch (IOException e) {
|
|
||||||
LOG.error("Unable to update application state in store", e);
|
|
||||||
}
|
|
||||||
this.dispatcher.getEventHandler().handle(
|
this.dispatcher.getEventHandler().handle(
|
||||||
new ApplicationFinishEvent(appID,
|
new ApplicationFinishEvent(appID,
|
||||||
diagnostic));
|
diagnostic));
|
||||||
|
|
|
@ -84,6 +84,7 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
|
||||||
|
|
||||||
private static final String APPLICATIONS_KEY_PREFIX =
|
private static final String APPLICATIONS_KEY_PREFIX =
|
||||||
"ContainerManager/applications/";
|
"ContainerManager/applications/";
|
||||||
|
@Deprecated
|
||||||
private static final String FINISHED_APPS_KEY_PREFIX =
|
private static final String FINISHED_APPS_KEY_PREFIX =
|
||||||
"ContainerManager/finishedApps/";
|
"ContainerManager/finishedApps/";
|
||||||
|
|
||||||
|
@ -361,20 +362,6 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
|
||||||
state.applications.add(
|
state.applications.add(
|
||||||
ContainerManagerApplicationProto.parseFrom(entry.getValue()));
|
ContainerManagerApplicationProto.parseFrom(entry.getValue()));
|
||||||
}
|
}
|
||||||
|
|
||||||
state.finishedApplications = new ArrayList<ApplicationId>();
|
|
||||||
keyPrefix = FINISHED_APPS_KEY_PREFIX;
|
|
||||||
iter.seek(bytes(keyPrefix));
|
|
||||||
while (iter.hasNext()) {
|
|
||||||
Entry<byte[], byte[]> entry = iter.next();
|
|
||||||
String key = asString(entry.getKey());
|
|
||||||
if (!key.startsWith(keyPrefix)) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
ApplicationId appId =
|
|
||||||
ConverterUtils.toApplicationId(key.substring(keyPrefix.length()));
|
|
||||||
state.finishedApplications.add(appId);
|
|
||||||
}
|
|
||||||
} catch (DBException e) {
|
} catch (DBException e) {
|
||||||
throw new IOException(e);
|
throw new IOException(e);
|
||||||
} finally {
|
} finally {
|
||||||
|
@ -383,6 +370,8 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cleanupDeprecatedFinishedApps();
|
||||||
|
|
||||||
return state;
|
return state;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -397,17 +386,6 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public void storeFinishedApplication(ApplicationId appId)
|
|
||||||
throws IOException {
|
|
||||||
String key = FINISHED_APPS_KEY_PREFIX + appId;
|
|
||||||
try {
|
|
||||||
db.put(bytes(key), new byte[0]);
|
|
||||||
} catch (DBException e) {
|
|
||||||
throw new IOException(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void removeApplication(ApplicationId appId)
|
public void removeApplication(ApplicationId appId)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
@ -416,8 +394,6 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
|
||||||
try {
|
try {
|
||||||
String key = APPLICATIONS_KEY_PREFIX + appId;
|
String key = APPLICATIONS_KEY_PREFIX + appId;
|
||||||
batch.delete(bytes(key));
|
batch.delete(bytes(key));
|
||||||
key = FINISHED_APPS_KEY_PREFIX + appId;
|
|
||||||
batch.delete(bytes(key));
|
|
||||||
db.write(batch);
|
db.write(batch);
|
||||||
} finally {
|
} finally {
|
||||||
batch.close();
|
batch.close();
|
||||||
|
@ -935,6 +911,52 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings("deprecation")
|
||||||
|
private void cleanupDeprecatedFinishedApps() {
|
||||||
|
try {
|
||||||
|
cleanupKeysWithPrefix(FINISHED_APPS_KEY_PREFIX);
|
||||||
|
} catch (Exception e) {
|
||||||
|
LOG.warn("cleanup keys with prefix " + FINISHED_APPS_KEY_PREFIX +
|
||||||
|
" from leveldb failed", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void cleanupKeysWithPrefix(String prefix) throws IOException {
|
||||||
|
WriteBatch batch = null;
|
||||||
|
LeveldbIterator iter = null;
|
||||||
|
try {
|
||||||
|
iter = new LeveldbIterator(db);
|
||||||
|
try {
|
||||||
|
batch = db.createWriteBatch();
|
||||||
|
iter.seek(bytes(prefix));
|
||||||
|
while (iter.hasNext()) {
|
||||||
|
byte[] key = iter.next().getKey();
|
||||||
|
String keyStr = asString(key);
|
||||||
|
if (!keyStr.startsWith(prefix)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
batch.delete(key);
|
||||||
|
if (LOG.isDebugEnabled()) {
|
||||||
|
LOG.debug("cleanup " + keyStr + " from leveldb");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
db.write(batch);
|
||||||
|
} catch (DBException e) {
|
||||||
|
throw new IOException(e);
|
||||||
|
} finally {
|
||||||
|
if (batch != null) {
|
||||||
|
batch.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (DBException e) {
|
||||||
|
throw new IOException(e);
|
||||||
|
} finally {
|
||||||
|
if (iter != null) {
|
||||||
|
iter.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private String getLogDeleterKey(ApplicationId appId) {
|
private String getLogDeleterKey(ApplicationId appId) {
|
||||||
return LOG_DELETER_KEY_PREFIX + appId;
|
return LOG_DELETER_KEY_PREFIX + appId;
|
||||||
}
|
}
|
||||||
|
|
|
@ -58,10 +58,6 @@ public class NMNullStateStoreService extends NMStateStoreService {
|
||||||
ContainerManagerApplicationProto p) throws IOException {
|
ContainerManagerApplicationProto p) throws IOException {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public void storeFinishedApplication(ApplicationId appId) {
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void removeApplication(ApplicationId appId) throws IOException {
|
public void removeApplication(ApplicationId appId) throws IOException {
|
||||||
}
|
}
|
||||||
|
|
|
@ -52,15 +52,11 @@ public abstract class NMStateStoreService extends AbstractService {
|
||||||
|
|
||||||
public static class RecoveredApplicationsState {
|
public static class RecoveredApplicationsState {
|
||||||
List<ContainerManagerApplicationProto> applications;
|
List<ContainerManagerApplicationProto> applications;
|
||||||
List<ApplicationId> finishedApplications;
|
|
||||||
|
|
||||||
public List<ContainerManagerApplicationProto> getApplications() {
|
public List<ContainerManagerApplicationProto> getApplications() {
|
||||||
return applications;
|
return applications;
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<ApplicationId> getFinishedApplications() {
|
|
||||||
return finishedApplications;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public enum RecoveredContainerStatus {
|
public enum RecoveredContainerStatus {
|
||||||
|
@ -247,14 +243,6 @@ public abstract class NMStateStoreService extends AbstractService {
|
||||||
public abstract void storeApplication(ApplicationId appId,
|
public abstract void storeApplication(ApplicationId appId,
|
||||||
ContainerManagerApplicationProto p) throws IOException;
|
ContainerManagerApplicationProto p) throws IOException;
|
||||||
|
|
||||||
/**
|
|
||||||
* Record that an application has finished
|
|
||||||
* @param appId the application ID
|
|
||||||
* @throws IOException
|
|
||||||
*/
|
|
||||||
public abstract void storeFinishedApplication(ApplicationId appId)
|
|
||||||
throws IOException;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Remove records corresponding to an application
|
* Remove records corresponding to an application
|
||||||
* @param appId the application ID
|
* @param appId the application ID
|
||||||
|
|
|
@ -259,6 +259,10 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest {
|
||||||
assertEquals(1, context.getApplications().size());
|
assertEquals(1, context.getApplications().size());
|
||||||
app = context.getApplications().get(appId);
|
app = context.getApplications().get(appId);
|
||||||
assertNotNull(app);
|
assertNotNull(app);
|
||||||
|
// no longer saving FINISH_APP event in NM stateStore,
|
||||||
|
// simulate by resending FINISH_APP event
|
||||||
|
cm.handle(new CMgrCompletedAppsEvent(finishedApps,
|
||||||
|
CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER));
|
||||||
waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP);
|
waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP);
|
||||||
assertTrue(context.getApplicationACLsManager().checkAccess(
|
assertTrue(context.getApplicationACLsManager().checkAccess(
|
||||||
UserGroupInformation.createRemoteUser(modUser),
|
UserGroupInformation.createRemoteUser(modUser),
|
||||||
|
|
|
@ -44,7 +44,6 @@ import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl;
|
||||||
|
|
||||||
public class NMMemoryStateStoreService extends NMStateStoreService {
|
public class NMMemoryStateStoreService extends NMStateStoreService {
|
||||||
private Map<ApplicationId, ContainerManagerApplicationProto> apps;
|
private Map<ApplicationId, ContainerManagerApplicationProto> apps;
|
||||||
private Set<ApplicationId> finishedApps;
|
|
||||||
private Map<ContainerId, RecoveredContainerState> containerStates;
|
private Map<ContainerId, RecoveredContainerState> containerStates;
|
||||||
private Map<TrackerKey, TrackerState> trackerStates;
|
private Map<TrackerKey, TrackerState> trackerStates;
|
||||||
private Map<Integer, DeletionServiceDeleteTaskProto> deleteTasks;
|
private Map<Integer, DeletionServiceDeleteTaskProto> deleteTasks;
|
||||||
|
@ -59,7 +58,6 @@ public class NMMemoryStateStoreService extends NMStateStoreService {
|
||||||
@Override
|
@Override
|
||||||
protected void initStorage(Configuration conf) {
|
protected void initStorage(Configuration conf) {
|
||||||
apps = new HashMap<ApplicationId, ContainerManagerApplicationProto>();
|
apps = new HashMap<ApplicationId, ContainerManagerApplicationProto>();
|
||||||
finishedApps = new HashSet<ApplicationId>();
|
|
||||||
containerStates = new HashMap<ContainerId, RecoveredContainerState>();
|
containerStates = new HashMap<ContainerId, RecoveredContainerState>();
|
||||||
nmTokenState = new RecoveredNMTokensState();
|
nmTokenState = new RecoveredNMTokensState();
|
||||||
nmTokenState.applicationMasterKeys =
|
nmTokenState.applicationMasterKeys =
|
||||||
|
@ -86,7 +84,6 @@ public class NMMemoryStateStoreService extends NMStateStoreService {
|
||||||
RecoveredApplicationsState state = new RecoveredApplicationsState();
|
RecoveredApplicationsState state = new RecoveredApplicationsState();
|
||||||
state.applications = new ArrayList<ContainerManagerApplicationProto>(
|
state.applications = new ArrayList<ContainerManagerApplicationProto>(
|
||||||
apps.values());
|
apps.values());
|
||||||
state.finishedApplications = new ArrayList<ApplicationId>(finishedApps);
|
|
||||||
return state;
|
return state;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -98,16 +95,10 @@ public class NMMemoryStateStoreService extends NMStateStoreService {
|
||||||
apps.put(appId, protoCopy);
|
apps.put(appId, protoCopy);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public synchronized void storeFinishedApplication(ApplicationId appId) {
|
|
||||||
finishedApps.add(appId);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public synchronized void removeApplication(ApplicationId appId)
|
public synchronized void removeApplication(ApplicationId appId)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
apps.remove(appId);
|
apps.remove(appId);
|
||||||
finishedApps.remove(appId);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -393,7 +384,6 @@ public class NMMemoryStateStoreService extends NMStateStoreService {
|
||||||
logDeleterState.remove(appId);
|
logDeleterState.remove(appId);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static class TrackerState {
|
private static class TrackerState {
|
||||||
Map<Path, LocalResourceProto> inProgressMap =
|
Map<Path, LocalResourceProto> inProgressMap =
|
||||||
new HashMap<Path, LocalResourceProto>();
|
new HashMap<Path, LocalResourceProto>();
|
||||||
|
|
|
@ -174,7 +174,6 @@ public class TestNMLeveldbStateStoreService {
|
||||||
// test empty when no state
|
// test empty when no state
|
||||||
RecoveredApplicationsState state = stateStore.loadApplicationsState();
|
RecoveredApplicationsState state = stateStore.loadApplicationsState();
|
||||||
assertTrue(state.getApplications().isEmpty());
|
assertTrue(state.getApplications().isEmpty());
|
||||||
assertTrue(state.getFinishedApplications().isEmpty());
|
|
||||||
|
|
||||||
// store an application and verify recovered
|
// store an application and verify recovered
|
||||||
final ApplicationId appId1 = ApplicationId.newInstance(1234, 1);
|
final ApplicationId appId1 = ApplicationId.newInstance(1234, 1);
|
||||||
|
@ -188,10 +187,8 @@ public class TestNMLeveldbStateStoreService {
|
||||||
state = stateStore.loadApplicationsState();
|
state = stateStore.loadApplicationsState();
|
||||||
assertEquals(1, state.getApplications().size());
|
assertEquals(1, state.getApplications().size());
|
||||||
assertEquals(appProto1, state.getApplications().get(0));
|
assertEquals(appProto1, state.getApplications().get(0));
|
||||||
assertTrue(state.getFinishedApplications().isEmpty());
|
|
||||||
|
|
||||||
// finish an application and add a new one
|
// add a new app
|
||||||
stateStore.storeFinishedApplication(appId1);
|
|
||||||
final ApplicationId appId2 = ApplicationId.newInstance(1234, 2);
|
final ApplicationId appId2 = ApplicationId.newInstance(1234, 2);
|
||||||
builder = ContainerManagerApplicationProto.newBuilder();
|
builder = ContainerManagerApplicationProto.newBuilder();
|
||||||
builder.setId(((ApplicationIdPBImpl) appId2).getProto());
|
builder.setId(((ApplicationIdPBImpl) appId2).getProto());
|
||||||
|
@ -203,18 +200,13 @@ public class TestNMLeveldbStateStoreService {
|
||||||
assertEquals(2, state.getApplications().size());
|
assertEquals(2, state.getApplications().size());
|
||||||
assertTrue(state.getApplications().contains(appProto1));
|
assertTrue(state.getApplications().contains(appProto1));
|
||||||
assertTrue(state.getApplications().contains(appProto2));
|
assertTrue(state.getApplications().contains(appProto2));
|
||||||
assertEquals(1, state.getFinishedApplications().size());
|
|
||||||
assertEquals(appId1, state.getFinishedApplications().get(0));
|
|
||||||
|
|
||||||
// test removing an application
|
// test removing an application
|
||||||
stateStore.storeFinishedApplication(appId2);
|
|
||||||
stateStore.removeApplication(appId2);
|
stateStore.removeApplication(appId2);
|
||||||
restartStateStore();
|
restartStateStore();
|
||||||
state = stateStore.loadApplicationsState();
|
state = stateStore.loadApplicationsState();
|
||||||
assertEquals(1, state.getApplications().size());
|
assertEquals(1, state.getApplications().size());
|
||||||
assertEquals(appProto1, state.getApplications().get(0));
|
assertEquals(appProto1, state.getApplications().get(0));
|
||||||
assertEquals(1, state.getFinishedApplications().size());
|
|
||||||
assertEquals(appId1, state.getFinishedApplications().get(0));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
Loading…
Reference in New Issue