From 4722d76b4c8c1557f49505585f164c77ba54c0a4 Mon Sep 17 00:00:00 2001 From: Vinod Kumar Vavilapalli Date: Thu, 1 Nov 2012 00:33:32 +0000 Subject: [PATCH] YARN-189. Fixed a deadlock between RM's ApplicationMasterService and the dispatcher. Contributed by Thomas Graves. svn merge --ignore-ancestry -c 1404431 ../../trunk/ git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1404432 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-yarn-project/CHANGES.txt | 3 ++ .../ApplicationMasterService.java | 29 ++++++++++++------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 3fbc65ec06f..750dd091768 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -179,6 +179,9 @@ Release 0.23.5 - UNRELEASED YARN-166. capacity scheduler doesn't allow capacity < 1.0 (tgraves via bobby) + YARN-189. Fixed a deadlock between RM's ApplicationMasterService and the + dispatcher. (Thomas Graves via vinodkv) + Release 0.23.4 - UNRELEASED INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java index 849111e54b5..96ee551e205 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java @@ -265,10 +265,10 @@ public class ApplicationMasterService extends AbstractService implements // Oh damn! Sending reboot isn't enough. RM state is corrupted. TODO: allocateResponse.setAMResponse(reboot); return allocateResponse; - } - + } + // Allow only one thread in AM to do heartbeat at a time. - synchronized (lastResponse) { // BUG TODO: Locking order is screwed. + synchronized (lastResponse) { // Send the status update to the appAttempt. this.rmContext.getDispatcher().getEventHandler().handle( @@ -282,7 +282,8 @@ public class ApplicationMasterService extends AbstractService implements Allocation allocation = this.rScheduler.allocate(appAttemptId, ask, release); - RMApp app = this.rmContext.getRMApps().get(appAttemptId.getApplicationId()); + RMApp app = this.rmContext.getRMApps().get( + appAttemptId.getApplicationId()); RMAppAttempt appAttempt = app.getRMAppAttempt(appAttemptId); AMResponse response = recordFactory.newRecordInstance(AMResponse.class); @@ -316,7 +317,18 @@ public class ApplicationMasterService extends AbstractService implements .pullJustFinishedContainers()); response.setResponseId(lastResponse.getResponseId() + 1); response.setAvailableResources(allocation.getResourceLimit()); - responseMap.put(appAttemptId, response); + + AMResponse oldResponse = responseMap.put(appAttemptId, response); + if (oldResponse == null) { + // appAttempt got unregistered, remove it back out + responseMap.remove(appAttemptId); + String message = "App Attempt removed from the cache during allocate" + + appAttemptId; + LOG.error(message); + allocateResponse.setAMResponse(reboot); + return allocateResponse; + } + allocateResponse.setAMResponse(response); allocateResponse.setNumClusterNodes(this.rScheduler.getNumClusterNodes()); return allocateResponse; @@ -331,12 +343,7 @@ public class ApplicationMasterService extends AbstractService implements } public void unregisterAttempt(ApplicationAttemptId attemptId) { - AMResponse lastResponse = responseMap.get(attemptId); - if (lastResponse != null) { - synchronized (lastResponse) { - responseMap.remove(attemptId); - } - } + responseMap.remove(attemptId); } public void refreshServiceAcls(Configuration configuration,