YARN-3464. Race condition in LocalizerRunner kills localizer before localizing all resources. (Zhihai Xu via kasha)
(cherry picked from commit 47279c3228)
(cherry picked from commit 4045c41afe)
(cherry picked from commit 6f2cc0dfa8f21984ecdab59dc087ccf525934930)
This commit is contained in:
parent
9af5b1dcd0
commit
6ade6b5051
|
@ -132,6 +132,9 @@ Release 2.6.1 - UNRELEASED
|
||||||
YARN-3024. LocalizerRunner should give DIE action when all resources are
|
YARN-3024. LocalizerRunner should give DIE action when all resources are
|
||||||
localized. (Chengbing Liu via xgong)
|
localized. (Chengbing Liu via xgong)
|
||||||
|
|
||||||
|
YARN-3464. Race condition in LocalizerRunner kills localizer before
|
||||||
|
localizing all resources. (Zhihai Xu via kasha)
|
||||||
|
|
||||||
Release 2.6.0 - 2014-11-18
|
Release 2.6.0 - 2014-11-18
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
|
|
@ -58,7 +58,9 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.Conta
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainersLauncherEventType;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainersLauncherEventType;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.LocalResourceRequest;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.LocalResourceRequest;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ContainerLocalizationCleanupEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ContainerLocalizationCleanupEvent;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ContainerLocalizationEvent;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ContainerLocalizationRequestEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ContainerLocalizationRequestEvent;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.LocalizationEventType;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerContainerFinishedEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerContainerFinishedEvent;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainerStartMonitoringEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainerStartMonitoringEvent;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainerStopMonitoringEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainerStopMonitoringEvent;
|
||||||
|
@ -709,6 +711,10 @@ public class ContainerImpl implements Container {
|
||||||
return ContainerState.LOCALIZING;
|
return ContainerState.LOCALIZING;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
container.dispatcher.getEventHandler().handle(
|
||||||
|
new ContainerLocalizationEvent(LocalizationEventType.
|
||||||
|
CONTAINER_RESOURCES_LOCALIZED, container));
|
||||||
|
|
||||||
container.sendLaunchEvent();
|
container.sendLaunchEvent();
|
||||||
container.metrics.endInitingContainer();
|
container.metrics.endInitingContainer();
|
||||||
return ContainerState.LOCALIZED;
|
return ContainerState.LOCALIZED;
|
||||||
|
|
|
@ -49,6 +49,7 @@ import java.util.concurrent.ScheduledExecutorService;
|
||||||
import java.util.concurrent.ScheduledThreadPoolExecutor;
|
import java.util.concurrent.ScheduledThreadPoolExecutor;
|
||||||
import java.util.concurrent.ThreadFactory;
|
import java.util.concurrent.ThreadFactory;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
|
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
@ -107,6 +108,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Cont
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerResourceFailedEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerResourceFailedEvent;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ApplicationLocalizationEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ApplicationLocalizationEvent;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ContainerLocalizationCleanupEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ContainerLocalizationCleanupEvent;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ContainerLocalizationEvent;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ContainerLocalizationRequestEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ContainerLocalizationRequestEvent;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.LocalizationEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.LocalizationEvent;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.LocalizationEventType;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.LocalizationEventType;
|
||||||
|
@ -388,6 +390,9 @@ public class ResourceLocalizationService extends CompositeService
|
||||||
case INIT_CONTAINER_RESOURCES:
|
case INIT_CONTAINER_RESOURCES:
|
||||||
handleInitContainerResources((ContainerLocalizationRequestEvent) event);
|
handleInitContainerResources((ContainerLocalizationRequestEvent) event);
|
||||||
break;
|
break;
|
||||||
|
case CONTAINER_RESOURCES_LOCALIZED:
|
||||||
|
handleContainerResourcesLocalized((ContainerLocalizationEvent) event);
|
||||||
|
break;
|
||||||
case CACHE_CLEANUP:
|
case CACHE_CLEANUP:
|
||||||
handleCacheCleanup(event);
|
handleCacheCleanup(event);
|
||||||
break;
|
break;
|
||||||
|
@ -451,6 +456,17 @@ public class ResourceLocalizationService extends CompositeService
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Once a container's resources are localized, kill the corresponding
|
||||||
|
* {@link ContainerLocalizer}
|
||||||
|
*/
|
||||||
|
private void handleContainerResourcesLocalized(
|
||||||
|
ContainerLocalizationEvent event) {
|
||||||
|
Container c = event.getContainer();
|
||||||
|
String locId = ConverterUtils.toString(c.getContainerId());
|
||||||
|
localizerTracker.endContainerLocalization(locId);
|
||||||
|
}
|
||||||
|
|
||||||
private void handleCacheCleanup(LocalizationEvent event) {
|
private void handleCacheCleanup(LocalizationEvent event) {
|
||||||
ResourceRetentionSet retain =
|
ResourceRetentionSet retain =
|
||||||
new ResourceRetentionSet(delService, cacheTargetSize);
|
new ResourceRetentionSet(delService, cacheTargetSize);
|
||||||
|
@ -661,7 +677,7 @@ public class ResourceLocalizationService extends CompositeService
|
||||||
response.setLocalizerAction(LocalizerAction.DIE);
|
response.setLocalizerAction(LocalizerAction.DIE);
|
||||||
return response;
|
return response;
|
||||||
}
|
}
|
||||||
return localizer.update(status.getResources());
|
return localizer.processHeartbeat(status.getResources());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -715,6 +731,17 @@ public class ResourceLocalizationService extends CompositeService
|
||||||
localizer.interrupt();
|
localizer.interrupt();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void endContainerLocalization(String locId) {
|
||||||
|
LocalizerRunner localizer;
|
||||||
|
synchronized (privLocalizers) {
|
||||||
|
localizer = privLocalizers.get(locId);
|
||||||
|
if (null == localizer) {
|
||||||
|
return; // ignore
|
||||||
|
}
|
||||||
|
}
|
||||||
|
localizer.endContainerLocalization();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -863,6 +890,7 @@ public class ResourceLocalizationService extends CompositeService
|
||||||
final Map<LocalResourceRequest,LocalizerResourceRequestEvent> scheduled;
|
final Map<LocalResourceRequest,LocalizerResourceRequestEvent> scheduled;
|
||||||
// Its a shared list between Private Localizer and dispatcher thread.
|
// Its a shared list between Private Localizer and dispatcher thread.
|
||||||
final List<LocalizerResourceRequestEvent> pending;
|
final List<LocalizerResourceRequestEvent> pending;
|
||||||
|
private AtomicBoolean killContainerLocalizer = new AtomicBoolean(false);
|
||||||
|
|
||||||
// TODO: threadsafe, use outer?
|
// TODO: threadsafe, use outer?
|
||||||
private final RecordFactory recordFactory =
|
private final RecordFactory recordFactory =
|
||||||
|
@ -883,6 +911,10 @@ public class ResourceLocalizationService extends CompositeService
|
||||||
pending.add(request);
|
pending.add(request);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void endContainerLocalization() {
|
||||||
|
killContainerLocalizer.set(true);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Find next resource to be given to a spawned localizer.
|
* Find next resource to be given to a spawned localizer.
|
||||||
*
|
*
|
||||||
|
@ -929,7 +961,7 @@ public class ResourceLocalizationService extends CompositeService
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
LocalizerHeartbeatResponse update(
|
LocalizerHeartbeatResponse processHeartbeat(
|
||||||
List<LocalResourceStatus> remoteResourceStatuses) {
|
List<LocalResourceStatus> remoteResourceStatuses) {
|
||||||
LocalizerHeartbeatResponse response =
|
LocalizerHeartbeatResponse response =
|
||||||
recordFactory.newRecordInstance(LocalizerHeartbeatResponse.class);
|
recordFactory.newRecordInstance(LocalizerHeartbeatResponse.class);
|
||||||
|
@ -938,7 +970,7 @@ public class ResourceLocalizationService extends CompositeService
|
||||||
ApplicationId applicationId =
|
ApplicationId applicationId =
|
||||||
context.getContainerId().getApplicationAttemptId().getApplicationId();
|
context.getContainerId().getApplicationAttemptId().getApplicationId();
|
||||||
|
|
||||||
LocalizerAction action = LocalizerAction.LIVE;
|
boolean fetchFailed = false;
|
||||||
// Update resource statuses.
|
// Update resource statuses.
|
||||||
for (LocalResourceStatus stat : remoteResourceStatuses) {
|
for (LocalResourceStatus stat : remoteResourceStatuses) {
|
||||||
LocalResource rsrc = stat.getResource();
|
LocalResource rsrc = stat.getResource();
|
||||||
|
@ -974,7 +1006,7 @@ public class ResourceLocalizationService extends CompositeService
|
||||||
case FETCH_FAILURE:
|
case FETCH_FAILURE:
|
||||||
final String diagnostics = stat.getException().toString();
|
final String diagnostics = stat.getException().toString();
|
||||||
LOG.warn(req + " failed: " + diagnostics);
|
LOG.warn(req + " failed: " + diagnostics);
|
||||||
response.setLocalizerAction(LocalizerAction.DIE);
|
fetchFailed = true;
|
||||||
getLocalResourcesTracker(req.getVisibility(), user, applicationId)
|
getLocalResourcesTracker(req.getVisibility(), user, applicationId)
|
||||||
.handle(new ResourceFailedLocalizationEvent(
|
.handle(new ResourceFailedLocalizationEvent(
|
||||||
req, diagnostics));
|
req, diagnostics));
|
||||||
|
@ -986,15 +1018,15 @@ public class ResourceLocalizationService extends CompositeService
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
LOG.info("Unknown status: " + stat.getStatus());
|
LOG.info("Unknown status: " + stat.getStatus());
|
||||||
action = LocalizerAction.DIE;
|
fetchFailed = true;
|
||||||
getLocalResourcesTracker(req.getVisibility(), user, applicationId)
|
getLocalResourcesTracker(req.getVisibility(), user, applicationId)
|
||||||
.handle(new ResourceFailedLocalizationEvent(
|
.handle(new ResourceFailedLocalizationEvent(
|
||||||
req, stat.getException().getMessage()));
|
req, stat.getException().getMessage()));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (action == LocalizerAction.DIE) {
|
if (fetchFailed || killContainerLocalizer.get()) {
|
||||||
response.setLocalizerAction(action);
|
response.setLocalizerAction(LocalizerAction.DIE);
|
||||||
return response;
|
return response;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1022,12 +1054,9 @@ public class ResourceLocalizationService extends CompositeService
|
||||||
} catch (URISyntaxException e) {
|
} catch (URISyntaxException e) {
|
||||||
//TODO fail? Already translated several times...
|
//TODO fail? Already translated several times...
|
||||||
}
|
}
|
||||||
} else if (pending.isEmpty()) {
|
|
||||||
// TODO: Synchronization
|
|
||||||
action = LocalizerAction.DIE;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
response.setLocalizerAction(action);
|
response.setLocalizerAction(LocalizerAction.LIVE);
|
||||||
response.setResourceSpecs(rsrcs);
|
response.setResourceSpecs(rsrcs);
|
||||||
return response;
|
return response;
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,4 +23,5 @@ public enum LocalizationEventType {
|
||||||
CACHE_CLEANUP,
|
CACHE_CLEANUP,
|
||||||
CLEANUP_CONTAINER_RESOURCES,
|
CLEANUP_CONTAINER_RESOURCES,
|
||||||
DESTROY_APPLICATION_RESOURCES,
|
DESTROY_APPLICATION_RESOURCES,
|
||||||
|
CONTAINER_RESOURCES_LOCALIZED,
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,28 +22,25 @@ import static org.junit.Assert.assertEquals;
|
||||||
import static org.junit.Assert.assertNotNull;
|
import static org.junit.Assert.assertNotNull;
|
||||||
import static org.junit.Assert.assertNull;
|
import static org.junit.Assert.assertNull;
|
||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
|
import static org.mockito.Matchers.any;
|
||||||
import static org.mockito.Matchers.anyBoolean;
|
import static org.mockito.Matchers.anyBoolean;
|
||||||
import static org.mockito.Matchers.anyInt;
|
import static org.mockito.Matchers.anyInt;
|
||||||
import static org.mockito.Matchers.anyLong;
|
import static org.mockito.Matchers.anyLong;
|
||||||
import static org.mockito.Matchers.anyShort;
|
import static org.mockito.Matchers.anyShort;
|
||||||
import static org.mockito.Matchers.any;
|
|
||||||
import static org.mockito.Matchers.argThat;
|
import static org.mockito.Matchers.argThat;
|
||||||
import static org.mockito.Matchers.eq;
|
import static org.mockito.Matchers.eq;
|
||||||
import static org.mockito.Matchers.isA;
|
import static org.mockito.Matchers.isA;
|
||||||
import static org.mockito.Matchers.isNull;
|
import static org.mockito.Matchers.isNull;
|
||||||
import static org.mockito.Mockito.doAnswer;
|
import static org.mockito.Mockito.doAnswer;
|
||||||
import static org.mockito.Mockito.doNothing;
|
|
||||||
import static org.mockito.Mockito.doReturn;
|
import static org.mockito.Mockito.doReturn;
|
||||||
import static org.mockito.Mockito.mock;
|
import static org.mockito.Mockito.mock;
|
||||||
import static org.mockito.Mockito.spy;
|
import static org.mockito.Mockito.spy;
|
||||||
import static org.mockito.Mockito.timeout;
|
import static org.mockito.Mockito.timeout;
|
||||||
import static org.mockito.Mockito.times;
|
import static org.mockito.Mockito.times;
|
||||||
import static org.mockito.Mockito.verify;
|
import static org.mockito.Mockito.verify;
|
||||||
import static org.mockito.Mockito.verifyNoMoreInteractions;
|
|
||||||
import static org.mockito.Mockito.when;
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileNotFoundException;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.net.InetSocketAddress;
|
import java.net.InetSocketAddress;
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
|
@ -63,12 +60,6 @@ import java.util.concurrent.BrokenBarrierException;
|
||||||
import java.util.concurrent.CyclicBarrier;
|
import java.util.concurrent.CyclicBarrier;
|
||||||
import java.util.concurrent.Future;
|
import java.util.concurrent.Future;
|
||||||
|
|
||||||
import org.apache.hadoop.fs.FileAlreadyExistsException;
|
|
||||||
import org.apache.hadoop.fs.Options;
|
|
||||||
import org.apache.hadoop.fs.UnresolvedLinkException;
|
|
||||||
import org.apache.hadoop.security.AccessControlException;
|
|
||||||
import org.junit.Assert;
|
|
||||||
|
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.fs.AbstractFileSystem;
|
import org.apache.hadoop.fs.AbstractFileSystem;
|
||||||
|
@ -76,6 +67,7 @@ import org.apache.hadoop.fs.CommonConfigurationKeys;
|
||||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||||
import org.apache.hadoop.fs.FileContext;
|
import org.apache.hadoop.fs.FileContext;
|
||||||
import org.apache.hadoop.fs.FileStatus;
|
import org.apache.hadoop.fs.FileStatus;
|
||||||
|
import org.apache.hadoop.fs.Options;
|
||||||
import org.apache.hadoop.fs.Options.ChecksumOpt;
|
import org.apache.hadoop.fs.Options.ChecksumOpt;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.hadoop.fs.UnsupportedFileSystemException;
|
import org.apache.hadoop.fs.UnsupportedFileSystemException;
|
||||||
|
@ -128,6 +120,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.Reso
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService.PublicLocalizer;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService.PublicLocalizer;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ApplicationLocalizationEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ApplicationLocalizationEvent;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ContainerLocalizationCleanupEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ContainerLocalizationCleanupEvent;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ContainerLocalizationEvent;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ContainerLocalizationRequestEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ContainerLocalizationRequestEvent;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.LocalizationEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.LocalizationEvent;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.LocalizationEventType;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.LocalizationEventType;
|
||||||
|
@ -145,6 +138,7 @@ import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
|
||||||
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
|
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
|
||||||
import org.apache.hadoop.yarn.util.ConverterUtils;
|
import org.apache.hadoop.yarn.util.ConverterUtils;
|
||||||
import org.junit.After;
|
import org.junit.After;
|
||||||
|
import org.junit.Assert;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.BeforeClass;
|
import org.junit.BeforeClass;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
@ -942,7 +936,13 @@ public class TestResourceLocalizationService {
|
||||||
assertTrue(localizedPath.getFile().endsWith(
|
assertTrue(localizedPath.getFile().endsWith(
|
||||||
localPath + Path.SEPARATOR + "1" + Path.SEPARATOR + "12"));
|
localPath + Path.SEPARATOR + "1" + Path.SEPARATOR + "12"));
|
||||||
|
|
||||||
// get shutdown
|
response = spyService.heartbeat(stat);
|
||||||
|
assertEquals(LocalizerAction.LIVE, response.getLocalizerAction());
|
||||||
|
|
||||||
|
spyService.handle(new ContainerLocalizationEvent(
|
||||||
|
LocalizationEventType.CONTAINER_RESOURCES_LOCALIZED, c));
|
||||||
|
|
||||||
|
// get shutdown after receive CONTAINER_RESOURCES_LOCALIZED event
|
||||||
response = spyService.heartbeat(stat);
|
response = spyService.heartbeat(stat);
|
||||||
assertEquals(LocalizerAction.DIE, response.getLocalizerAction());
|
assertEquals(LocalizerAction.DIE, response.getLocalizerAction());
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue