YARN-4924. NM recovery race can lead to container not cleaned up. Contributed by sandflee

(cherry picked from commit 3150ae8108)

Conflicts:

	hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
	hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
This commit is contained in:
Jason Lowe 2016-04-14 19:39:32 +00:00
parent 7dd8798933
commit 9b5c5bd42f
7 changed files with 54 additions and 73 deletions

View File

@ -292,12 +292,6 @@ public class ContainerManagerImpl extends CompositeService implements
for (RecoveredContainerState rcs : stateStore.loadContainersState()) { for (RecoveredContainerState rcs : stateStore.loadContainersState()) {
recoverContainer(rcs); recoverContainer(rcs);
} }
String diagnostic = "Application marked finished during recovery";
for (ApplicationId appId : appsState.getFinishedApplications()) {
dispatcher.getEventHandler().handle(
new ApplicationFinishEvent(appId, diagnostic));
}
} }
} }
@ -1318,11 +1312,6 @@ public class ContainerManagerImpl extends CompositeService implements
} else if (appsFinishedEvent.getReason() == CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER) { } else if (appsFinishedEvent.getReason() == CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER) {
diagnostic = "Application killed by ResourceManager"; diagnostic = "Application killed by ResourceManager";
} }
try {
this.context.getNMStateStore().storeFinishedApplication(appID);
} catch (IOException e) {
LOG.error("Unable to update application state in store", e);
}
this.dispatcher.getEventHandler().handle( this.dispatcher.getEventHandler().handle(
new ApplicationFinishEvent(appID, new ApplicationFinishEvent(appID,
diagnostic)); diagnostic));

View File

@ -84,6 +84,7 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
private static final String APPLICATIONS_KEY_PREFIX = private static final String APPLICATIONS_KEY_PREFIX =
"ContainerManager/applications/"; "ContainerManager/applications/";
@Deprecated
private static final String FINISHED_APPS_KEY_PREFIX = private static final String FINISHED_APPS_KEY_PREFIX =
"ContainerManager/finishedApps/"; "ContainerManager/finishedApps/";
@ -361,20 +362,6 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
state.applications.add( state.applications.add(
ContainerManagerApplicationProto.parseFrom(entry.getValue())); ContainerManagerApplicationProto.parseFrom(entry.getValue()));
} }
state.finishedApplications = new ArrayList<ApplicationId>();
keyPrefix = FINISHED_APPS_KEY_PREFIX;
iter.seek(bytes(keyPrefix));
while (iter.hasNext()) {
Entry<byte[], byte[]> entry = iter.next();
String key = asString(entry.getKey());
if (!key.startsWith(keyPrefix)) {
break;
}
ApplicationId appId =
ConverterUtils.toApplicationId(key.substring(keyPrefix.length()));
state.finishedApplications.add(appId);
}
} catch (DBException e) { } catch (DBException e) {
throw new IOException(e); throw new IOException(e);
} finally { } finally {
@ -383,6 +370,8 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
} }
} }
cleanupDeprecatedFinishedApps();
return state; return state;
} }
@ -397,17 +386,6 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
} }
} }
@Override
public void storeFinishedApplication(ApplicationId appId)
throws IOException {
String key = FINISHED_APPS_KEY_PREFIX + appId;
try {
db.put(bytes(key), new byte[0]);
} catch (DBException e) {
throw new IOException(e);
}
}
@Override @Override
public void removeApplication(ApplicationId appId) public void removeApplication(ApplicationId appId)
throws IOException { throws IOException {
@ -416,8 +394,6 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
try { try {
String key = APPLICATIONS_KEY_PREFIX + appId; String key = APPLICATIONS_KEY_PREFIX + appId;
batch.delete(bytes(key)); batch.delete(bytes(key));
key = FINISHED_APPS_KEY_PREFIX + appId;
batch.delete(bytes(key));
db.write(batch); db.write(batch);
} finally { } finally {
batch.close(); batch.close();
@ -935,6 +911,52 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
} }
} }
@SuppressWarnings("deprecation")
private void cleanupDeprecatedFinishedApps() {
try {
cleanupKeysWithPrefix(FINISHED_APPS_KEY_PREFIX);
} catch (Exception e) {
LOG.warn("cleanup keys with prefix " + FINISHED_APPS_KEY_PREFIX +
" from leveldb failed", e);
}
}
private void cleanupKeysWithPrefix(String prefix) throws IOException {
WriteBatch batch = null;
LeveldbIterator iter = null;
try {
iter = new LeveldbIterator(db);
try {
batch = db.createWriteBatch();
iter.seek(bytes(prefix));
while (iter.hasNext()) {
byte[] key = iter.next().getKey();
String keyStr = asString(key);
if (!keyStr.startsWith(prefix)) {
break;
}
batch.delete(key);
if (LOG.isDebugEnabled()) {
LOG.debug("cleanup " + keyStr + " from leveldb");
}
}
db.write(batch);
} catch (DBException e) {
throw new IOException(e);
} finally {
if (batch != null) {
batch.close();
}
}
} catch (DBException e) {
throw new IOException(e);
} finally {
if (iter != null) {
iter.close();
}
}
}
private String getLogDeleterKey(ApplicationId appId) { private String getLogDeleterKey(ApplicationId appId) {
return LOG_DELETER_KEY_PREFIX + appId; return LOG_DELETER_KEY_PREFIX + appId;
} }

View File

@ -58,10 +58,6 @@ public class NMNullStateStoreService extends NMStateStoreService {
ContainerManagerApplicationProto p) throws IOException { ContainerManagerApplicationProto p) throws IOException {
} }
@Override
public void storeFinishedApplication(ApplicationId appId) {
}
@Override @Override
public void removeApplication(ApplicationId appId) throws IOException { public void removeApplication(ApplicationId appId) throws IOException {
} }

View File

@ -52,15 +52,11 @@ public abstract class NMStateStoreService extends AbstractService {
public static class RecoveredApplicationsState { public static class RecoveredApplicationsState {
List<ContainerManagerApplicationProto> applications; List<ContainerManagerApplicationProto> applications;
List<ApplicationId> finishedApplications;
public List<ContainerManagerApplicationProto> getApplications() { public List<ContainerManagerApplicationProto> getApplications() {
return applications; return applications;
} }
public List<ApplicationId> getFinishedApplications() {
return finishedApplications;
}
} }
public enum RecoveredContainerStatus { public enum RecoveredContainerStatus {
@ -247,14 +243,6 @@ public abstract class NMStateStoreService extends AbstractService {
public abstract void storeApplication(ApplicationId appId, public abstract void storeApplication(ApplicationId appId,
ContainerManagerApplicationProto p) throws IOException; ContainerManagerApplicationProto p) throws IOException;
/**
* Record that an application has finished
* @param appId the application ID
* @throws IOException
*/
public abstract void storeFinishedApplication(ApplicationId appId)
throws IOException;
/** /**
* Remove records corresponding to an application * Remove records corresponding to an application
* @param appId the application ID * @param appId the application ID

View File

@ -259,6 +259,10 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest {
assertEquals(1, context.getApplications().size()); assertEquals(1, context.getApplications().size());
app = context.getApplications().get(appId); app = context.getApplications().get(appId);
assertNotNull(app); assertNotNull(app);
// no longer saving FINISH_APP event in NM stateStore,
// simulate by resending FINISH_APP event
cm.handle(new CMgrCompletedAppsEvent(finishedApps,
CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER));
waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP); waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP);
assertTrue(context.getApplicationACLsManager().checkAccess( assertTrue(context.getApplicationACLsManager().checkAccess(
UserGroupInformation.createRemoteUser(modUser), UserGroupInformation.createRemoteUser(modUser),

View File

@ -44,7 +44,6 @@ import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl;
public class NMMemoryStateStoreService extends NMStateStoreService { public class NMMemoryStateStoreService extends NMStateStoreService {
private Map<ApplicationId, ContainerManagerApplicationProto> apps; private Map<ApplicationId, ContainerManagerApplicationProto> apps;
private Set<ApplicationId> finishedApps;
private Map<ContainerId, RecoveredContainerState> containerStates; private Map<ContainerId, RecoveredContainerState> containerStates;
private Map<TrackerKey, TrackerState> trackerStates; private Map<TrackerKey, TrackerState> trackerStates;
private Map<Integer, DeletionServiceDeleteTaskProto> deleteTasks; private Map<Integer, DeletionServiceDeleteTaskProto> deleteTasks;
@ -59,7 +58,6 @@ public class NMMemoryStateStoreService extends NMStateStoreService {
@Override @Override
protected void initStorage(Configuration conf) { protected void initStorage(Configuration conf) {
apps = new HashMap<ApplicationId, ContainerManagerApplicationProto>(); apps = new HashMap<ApplicationId, ContainerManagerApplicationProto>();
finishedApps = new HashSet<ApplicationId>();
containerStates = new HashMap<ContainerId, RecoveredContainerState>(); containerStates = new HashMap<ContainerId, RecoveredContainerState>();
nmTokenState = new RecoveredNMTokensState(); nmTokenState = new RecoveredNMTokensState();
nmTokenState.applicationMasterKeys = nmTokenState.applicationMasterKeys =
@ -86,7 +84,6 @@ public class NMMemoryStateStoreService extends NMStateStoreService {
RecoveredApplicationsState state = new RecoveredApplicationsState(); RecoveredApplicationsState state = new RecoveredApplicationsState();
state.applications = new ArrayList<ContainerManagerApplicationProto>( state.applications = new ArrayList<ContainerManagerApplicationProto>(
apps.values()); apps.values());
state.finishedApplications = new ArrayList<ApplicationId>(finishedApps);
return state; return state;
} }
@ -98,16 +95,10 @@ public class NMMemoryStateStoreService extends NMStateStoreService {
apps.put(appId, protoCopy); apps.put(appId, protoCopy);
} }
@Override
public synchronized void storeFinishedApplication(ApplicationId appId) {
finishedApps.add(appId);
}
@Override @Override
public synchronized void removeApplication(ApplicationId appId) public synchronized void removeApplication(ApplicationId appId)
throws IOException { throws IOException {
apps.remove(appId); apps.remove(appId);
finishedApps.remove(appId);
} }
@Override @Override
@ -393,7 +384,6 @@ public class NMMemoryStateStoreService extends NMStateStoreService {
logDeleterState.remove(appId); logDeleterState.remove(appId);
} }
private static class TrackerState { private static class TrackerState {
Map<Path, LocalResourceProto> inProgressMap = Map<Path, LocalResourceProto> inProgressMap =
new HashMap<Path, LocalResourceProto>(); new HashMap<Path, LocalResourceProto>();

View File

@ -174,7 +174,6 @@ public class TestNMLeveldbStateStoreService {
// test empty when no state // test empty when no state
RecoveredApplicationsState state = stateStore.loadApplicationsState(); RecoveredApplicationsState state = stateStore.loadApplicationsState();
assertTrue(state.getApplications().isEmpty()); assertTrue(state.getApplications().isEmpty());
assertTrue(state.getFinishedApplications().isEmpty());
// store an application and verify recovered // store an application and verify recovered
final ApplicationId appId1 = ApplicationId.newInstance(1234, 1); final ApplicationId appId1 = ApplicationId.newInstance(1234, 1);
@ -188,10 +187,8 @@ public class TestNMLeveldbStateStoreService {
state = stateStore.loadApplicationsState(); state = stateStore.loadApplicationsState();
assertEquals(1, state.getApplications().size()); assertEquals(1, state.getApplications().size());
assertEquals(appProto1, state.getApplications().get(0)); assertEquals(appProto1, state.getApplications().get(0));
assertTrue(state.getFinishedApplications().isEmpty());
// finish an application and add a new one // add a new app
stateStore.storeFinishedApplication(appId1);
final ApplicationId appId2 = ApplicationId.newInstance(1234, 2); final ApplicationId appId2 = ApplicationId.newInstance(1234, 2);
builder = ContainerManagerApplicationProto.newBuilder(); builder = ContainerManagerApplicationProto.newBuilder();
builder.setId(((ApplicationIdPBImpl) appId2).getProto()); builder.setId(((ApplicationIdPBImpl) appId2).getProto());
@ -203,18 +200,13 @@ public class TestNMLeveldbStateStoreService {
assertEquals(2, state.getApplications().size()); assertEquals(2, state.getApplications().size());
assertTrue(state.getApplications().contains(appProto1)); assertTrue(state.getApplications().contains(appProto1));
assertTrue(state.getApplications().contains(appProto2)); assertTrue(state.getApplications().contains(appProto2));
assertEquals(1, state.getFinishedApplications().size());
assertEquals(appId1, state.getFinishedApplications().get(0));
// test removing an application // test removing an application
stateStore.storeFinishedApplication(appId2);
stateStore.removeApplication(appId2); stateStore.removeApplication(appId2);
restartStateStore(); restartStateStore();
state = stateStore.loadApplicationsState(); state = stateStore.loadApplicationsState();
assertEquals(1, state.getApplications().size()); assertEquals(1, state.getApplications().size());
assertEquals(appProto1, state.getApplications().get(0)); assertEquals(appProto1, state.getApplications().get(0));
assertEquals(1, state.getFinishedApplications().size());
assertEquals(appId1, state.getFinishedApplications().get(0));
} }
@Test @Test