YARN-6920. Fix resource leak that happens during container re-initialization. (asuresh)
This commit is contained in:
parent
c61f2c4198
commit
8d3fd81980
|
@ -398,6 +398,8 @@ public class TestNMClient {
|
|||
"will be Rolled-back", Arrays.asList(new Integer[] {-1000}));
|
||||
testCommitContainer(container.getId(), true);
|
||||
testReInitializeContainer(container.getId(), clc, false);
|
||||
testGetContainerStatus(container, i, ContainerState.RUNNING,
|
||||
"will be Re-initialized", Arrays.asList(new Integer[] {-1000}));
|
||||
testCommitContainer(container.getId(), false);
|
||||
} else {
|
||||
testReInitializeContainer(container.getId(), clc, true);
|
||||
|
@ -449,24 +451,21 @@ public class TestNMClient {
|
|||
ContainerState state, String diagnostics, List<Integer> exitStatuses)
|
||||
throws YarnException, IOException {
|
||||
while (true) {
|
||||
try {
|
||||
ContainerStatus status = nmClient.getContainerStatus(
|
||||
container.getId(), container.getNodeId());
|
||||
// NodeManager may still need some time to get the stable
|
||||
// container status
|
||||
if (status.getState() == state) {
|
||||
assertEquals(container.getId(), status.getContainerId());
|
||||
assertTrue("" + index + ": " + status.getDiagnostics(),
|
||||
status.getDiagnostics().contains(diagnostics));
|
||||
|
||||
assertTrue("Exit Statuses are supposed to be in: " + exitStatuses +
|
||||
", but the actual exit status code is: " + status.getExitStatus(),
|
||||
exitStatuses.contains(status.getExitStatus()));
|
||||
break;
|
||||
}
|
||||
Thread.sleep(100);
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
sleep(250);
|
||||
ContainerStatus status = nmClient.getContainerStatus(
|
||||
container.getId(), container.getNodeId());
|
||||
// NodeManager may still need some time to get the stable
|
||||
// container status
|
||||
if (status.getState() == state) {
|
||||
assertEquals(container.getId(), status.getContainerId());
|
||||
assertTrue("" + index + ": " + status.getDiagnostics(),
|
||||
status.getDiagnostics().contains(diagnostics));
|
||||
|
||||
assertTrue("Exit Statuses are supposed to be in: " + exitStatuses +
|
||||
", but the actual exit status code is: " +
|
||||
status.getExitStatus(),
|
||||
exitStatuses.contains(status.getExitStatus()));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -559,9 +558,7 @@ public class TestNMClient {
|
|||
ContainerLaunchContext clc, boolean autoCommit)
|
||||
throws YarnException, IOException {
|
||||
try {
|
||||
sleep(250);
|
||||
nmClient.reInitializeContainer(containerId, clc, autoCommit);
|
||||
sleep(250);
|
||||
} catch (YarnException e) {
|
||||
// NM container will only be in SCHEDULED state, so expect the increase
|
||||
// action to fail.
|
||||
|
|
|
@ -1397,6 +1397,10 @@ public class ContainerImpl implements Container {
|
|||
container.resourceSet =
|
||||
container.reInitContext.mergedResourceSet(container.resourceSet);
|
||||
container.isMarkeForKilling = false;
|
||||
// Ensure Resources are decremented.
|
||||
container.dispatcher.getEventHandler().handle(
|
||||
new ContainerSchedulerEvent(container,
|
||||
ContainerSchedulerEventType.CONTAINER_COMPLETED));
|
||||
container.sendScheduleEvent();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -466,4 +466,8 @@ public class ContainerScheduler extends AbstractService implements
|
|||
return this.context.getContainerManager().getContainersMonitor();
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public ResourceUtilization getCurrentUtilization() {
|
||||
return this.utilizationTracker.getCurrentUtilization();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -74,6 +74,7 @@ import org.apache.hadoop.yarn.api.records.LocalResource;
|
|||
import org.apache.hadoop.yarn.api.records.LocalResourceType;
|
||||
import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.api.records.ResourceUtilization;
|
||||
import org.apache.hadoop.yarn.api.records.SerializedException;
|
||||
import org.apache.hadoop.yarn.api.records.SignalContainerCommand;
|
||||
import org.apache.hadoop.yarn.api.records.Token;
|
||||
|
@ -437,7 +438,15 @@ public class TestContainerManager extends BaseContainerManagerTest {
|
|||
|
||||
File newStartFile = new File(tmpDir, "start_file_n.txt").getAbsoluteFile();
|
||||
|
||||
ResourceUtilization beforeUpgrade =
|
||||
ResourceUtilization.newInstance(
|
||||
containerManager.getContainerScheduler().getCurrentUtilization());
|
||||
prepareContainerUpgrade(autoCommit, false, false, cId, newStartFile);
|
||||
ResourceUtilization afterUpgrade =
|
||||
ResourceUtilization.newInstance(
|
||||
containerManager.getContainerScheduler().getCurrentUtilization());
|
||||
Assert.assertEquals("Possible resource leak detected !!",
|
||||
beforeUpgrade, afterUpgrade);
|
||||
|
||||
// Assert that the First process is not alive anymore
|
||||
Assert.assertFalse("Process is still alive!",
|
||||
|
|
Loading…
Reference in New Issue