YARN-1800. Fixed NodeManager to gracefully handle RejectedExecutionException in the public-localizer thread-pool. Contributed by Varun Vasudev.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1576545 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a3616c58dd
commit
8aab8533a1
|
@ -446,6 +446,9 @@ Release 2.4.0 - UNRELEASED
|
||||||
YARN-1821. NPE on registerNodeManager if the request has containers for
|
YARN-1821. NPE on registerNodeManager if the request has containers for
|
||||||
UnmanagedAMs. (kasha)
|
UnmanagedAMs. (kasha)
|
||||||
|
|
||||||
|
YARN-1800. Fixed NodeManager to gracefully handle RejectedExecutionException
|
||||||
|
in the public-localizer thread-pool. (Varun Vasudev via vinodkv)
|
||||||
|
|
||||||
Release 2.3.1 - UNRELEASED
|
Release 2.3.1 - UNRELEASED
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
|
|
@ -44,6 +44,7 @@ import java.util.concurrent.ExecutorCompletionService;
|
||||||
import java.util.concurrent.ExecutorService;
|
import java.util.concurrent.ExecutorService;
|
||||||
import java.util.concurrent.Executors;
|
import java.util.concurrent.Executors;
|
||||||
import java.util.concurrent.Future;
|
import java.util.concurrent.Future;
|
||||||
|
import java.util.concurrent.RejectedExecutionException;
|
||||||
import java.util.concurrent.ScheduledExecutorService;
|
import java.util.concurrent.ScheduledExecutorService;
|
||||||
import java.util.concurrent.ScheduledThreadPoolExecutor;
|
import java.util.concurrent.ScheduledThreadPoolExecutor;
|
||||||
import java.util.concurrent.ThreadFactory;
|
import java.util.concurrent.ThreadFactory;
|
||||||
|
@ -683,9 +684,16 @@ public class ResourceLocalizationService extends CompositeService
|
||||||
}
|
}
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
rsrc.unlock();
|
rsrc.unlock();
|
||||||
// TODO Need to Fix IO Exceptions - Notifying resource
|
publicRsrc.handle(new ResourceFailedLocalizationEvent(request
|
||||||
|
.getResource().getRequest(), e.getMessage()));
|
||||||
LOG.error("Local path for public localization is not found. "
|
LOG.error("Local path for public localization is not found. "
|
||||||
+ " May be disks failed.", e);
|
+ " May be disks failed.", e);
|
||||||
|
} catch (RejectedExecutionException re) {
|
||||||
|
rsrc.unlock();
|
||||||
|
publicRsrc.handle(new ResourceFailedLocalizationEvent(request
|
||||||
|
.getResource().getRequest(), re.getMessage()));
|
||||||
|
LOG.error("Failed to submit rsrc " + rsrc + " for download."
|
||||||
|
+ " Either queue is full or threadpool is shutdown.", re);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
rsrc.unlock();
|
rsrc.unlock();
|
||||||
|
|
|
@ -84,6 +84,7 @@ import org.apache.hadoop.yarn.api.records.SerializedException;
|
||||||
import org.apache.hadoop.yarn.api.records.URL;
|
import org.apache.hadoop.yarn.api.records.URL;
|
||||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
import org.apache.hadoop.yarn.event.AsyncDispatcher;
|
import org.apache.hadoop.yarn.event.AsyncDispatcher;
|
||||||
|
import org.apache.hadoop.yarn.event.Dispatcher;
|
||||||
import org.apache.hadoop.yarn.event.DrainDispatcher;
|
import org.apache.hadoop.yarn.event.DrainDispatcher;
|
||||||
import org.apache.hadoop.yarn.event.EventHandler;
|
import org.apache.hadoop.yarn.event.EventHandler;
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
@ -109,6 +110,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Cont
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerResourceFailedEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerResourceFailedEvent;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService.LocalizerRunner;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService.LocalizerRunner;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService.LocalizerTracker;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService.LocalizerTracker;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService.PublicLocalizer;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ApplicationLocalizationEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ApplicationLocalizationEvent;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ContainerLocalizationCleanupEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ContainerLocalizationCleanupEvent;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ContainerLocalizationRequestEvent;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.ContainerLocalizationRequestEvent;
|
||||||
|
@ -126,6 +128,7 @@ import org.junit.BeforeClass;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import org.mockito.ArgumentCaptor;
|
import org.mockito.ArgumentCaptor;
|
||||||
import org.mockito.ArgumentMatcher;
|
import org.mockito.ArgumentMatcher;
|
||||||
|
import org.mockito.Mockito;
|
||||||
import org.mockito.invocation.InvocationOnMock;
|
import org.mockito.invocation.InvocationOnMock;
|
||||||
import org.mockito.stubbing.Answer;
|
import org.mockito.stubbing.Answer;
|
||||||
|
|
||||||
|
@ -682,6 +685,121 @@ public class TestResourceLocalizationService {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Test case for handling RejectedExecutionException and IOException which can
|
||||||
|
* be thrown when adding public resources to the pending queue.
|
||||||
|
* RejectedExecutionException can be thrown either due to the incoming queue
|
||||||
|
* being full or if the ExecutorCompletionService threadpool is shutdown.
|
||||||
|
* Since it's hard to simulate the queue being full, this test just shuts down
|
||||||
|
* the threadpool and makes sure the exception is handled. If anything is
|
||||||
|
* messed up the async dispatcher thread will cause a system exit causing the
|
||||||
|
* test to fail.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
public void testPublicResourceAddResourceExceptions() throws Exception {
|
||||||
|
List<Path> localDirs = new ArrayList<Path>();
|
||||||
|
String[] sDirs = new String[4];
|
||||||
|
for (int i = 0; i < 4; ++i) {
|
||||||
|
localDirs.add(lfs.makeQualified(new Path(basedir, i + "")));
|
||||||
|
sDirs[i] = localDirs.get(i).toString();
|
||||||
|
}
|
||||||
|
conf.setStrings(YarnConfiguration.NM_LOCAL_DIRS, sDirs);
|
||||||
|
conf.setBoolean(Dispatcher.DISPATCHER_EXIT_ON_ERROR_KEY, true);
|
||||||
|
|
||||||
|
DrainDispatcher dispatcher = new DrainDispatcher();
|
||||||
|
EventHandler<ApplicationEvent> applicationBus = mock(EventHandler.class);
|
||||||
|
dispatcher.register(ApplicationEventType.class, applicationBus);
|
||||||
|
EventHandler<ContainerEvent> containerBus = mock(EventHandler.class);
|
||||||
|
dispatcher.register(ContainerEventType.class, containerBus);
|
||||||
|
|
||||||
|
ContainerExecutor exec = mock(ContainerExecutor.class);
|
||||||
|
DeletionService delService = mock(DeletionService.class);
|
||||||
|
LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService();
|
||||||
|
LocalDirsHandlerService dirsHandlerSpy = spy(dirsHandler);
|
||||||
|
dirsHandlerSpy.init(conf);
|
||||||
|
|
||||||
|
dispatcher.init(conf);
|
||||||
|
dispatcher.start();
|
||||||
|
|
||||||
|
try {
|
||||||
|
ResourceLocalizationService rawService =
|
||||||
|
new ResourceLocalizationService(dispatcher, exec, delService,
|
||||||
|
dirsHandlerSpy);
|
||||||
|
ResourceLocalizationService spyService = spy(rawService);
|
||||||
|
doReturn(mockServer).when(spyService).createServer();
|
||||||
|
doReturn(lfs).when(spyService).getLocalFileContext(
|
||||||
|
isA(Configuration.class));
|
||||||
|
|
||||||
|
spyService.init(conf);
|
||||||
|
spyService.start();
|
||||||
|
|
||||||
|
final String user = "user0";
|
||||||
|
// init application
|
||||||
|
final Application app = mock(Application.class);
|
||||||
|
final ApplicationId appId =
|
||||||
|
BuilderUtils.newApplicationId(314159265358979L, 3);
|
||||||
|
when(app.getUser()).thenReturn(user);
|
||||||
|
when(app.getAppId()).thenReturn(appId);
|
||||||
|
spyService.handle(new ApplicationLocalizationEvent(
|
||||||
|
LocalizationEventType.INIT_APPLICATION_RESOURCES, app));
|
||||||
|
dispatcher.await();
|
||||||
|
|
||||||
|
// init resources
|
||||||
|
Random r = new Random();
|
||||||
|
r.setSeed(r.nextLong());
|
||||||
|
|
||||||
|
// Queue localization request for the public resource
|
||||||
|
final LocalResource pubResource = getPublicMockedResource(r);
|
||||||
|
final LocalResourceRequest pubReq = new LocalResourceRequest(pubResource);
|
||||||
|
Map<LocalResourceVisibility, Collection<LocalResourceRequest>> req =
|
||||||
|
new HashMap<LocalResourceVisibility, Collection<LocalResourceRequest>>();
|
||||||
|
req
|
||||||
|
.put(LocalResourceVisibility.PUBLIC, Collections.singletonList(pubReq));
|
||||||
|
|
||||||
|
// init container.
|
||||||
|
final Container c = getMockContainer(appId, 42);
|
||||||
|
|
||||||
|
// first test ioexception
|
||||||
|
Mockito
|
||||||
|
.doThrow(new IOException())
|
||||||
|
.when(dirsHandlerSpy)
|
||||||
|
.getLocalPathForWrite(isA(String.class), Mockito.anyLong(),
|
||||||
|
Mockito.anyBoolean());
|
||||||
|
// send request
|
||||||
|
spyService.handle(new ContainerLocalizationRequestEvent(c, req));
|
||||||
|
dispatcher.await();
|
||||||
|
LocalResourcesTracker tracker =
|
||||||
|
spyService.getLocalResourcesTracker(LocalResourceVisibility.PUBLIC,
|
||||||
|
user, appId);
|
||||||
|
Assert.assertNull(tracker.getLocalizedResource(pubReq));
|
||||||
|
|
||||||
|
// test RejectedExecutionException
|
||||||
|
Mockito
|
||||||
|
.doCallRealMethod()
|
||||||
|
.when(dirsHandlerSpy)
|
||||||
|
.getLocalPathForWrite(isA(String.class), Mockito.anyLong(),
|
||||||
|
Mockito.anyBoolean());
|
||||||
|
|
||||||
|
// shutdown the thread pool
|
||||||
|
PublicLocalizer publicLocalizer = spyService.getPublicLocalizer();
|
||||||
|
publicLocalizer.threadPool.shutdown();
|
||||||
|
|
||||||
|
spyService.handle(new ContainerLocalizationRequestEvent(c, req));
|
||||||
|
dispatcher.await();
|
||||||
|
tracker =
|
||||||
|
spyService.getLocalResourcesTracker(LocalResourceVisibility.PUBLIC,
|
||||||
|
user, appId);
|
||||||
|
Assert.assertNull(tracker.getLocalizedResource(pubReq));
|
||||||
|
|
||||||
|
} finally {
|
||||||
|
// if we call stop with events in the queue, an InterruptedException gets
|
||||||
|
// thrown resulting in the dispatcher thread causing a system exit
|
||||||
|
dispatcher.await();
|
||||||
|
dispatcher.stop();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Test(timeout = 100000)
|
@Test(timeout = 100000)
|
||||||
@SuppressWarnings("unchecked")
|
@SuppressWarnings("unchecked")
|
||||||
public void testParallelDownloadAttemptsForPrivateResource() throws Exception {
|
public void testParallelDownloadAttemptsForPrivateResource() throws Exception {
|
||||||
|
@ -830,6 +948,8 @@ public class TestResourceLocalizationService {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Test(timeout = 10000)
|
@Test(timeout = 10000)
|
||||||
@SuppressWarnings("unchecked")
|
@SuppressWarnings("unchecked")
|
||||||
public void testLocalResourcePath() throws Exception {
|
public void testLocalResourcePath() throws Exception {
|
||||||
|
|
Loading…
Reference in New Issue