From d841a0f47461af485d6c8320b54e761006c619a7 Mon Sep 17 00:00:00 2001 From: Karthik Kambatla Date: Thu, 26 Dec 2013 00:21:56 +0000 Subject: [PATCH 01/42] YARN-1172. Convert SecretManagers in RM to services (Tsuyoshi OZAWA via kasha) git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1553431 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-yarn-project/CHANGES.txt | 2 + .../RMSecretManagerService.java | 143 ++++++++++++++++++ .../resourcemanager/ResourceManager.java | 102 +++---------- .../yarn/server/resourcemanager/MockRM.java | 17 ++- .../resourcemanager/QueueACLsTestBase.java | 2 +- .../resourcemanager/TestClientRMService.java | 2 +- .../server/resourcemanager/TestRMRestart.java | 2 +- .../security/TestClientToAMTokens.java | 2 +- .../security/TestRMDelegationTokens.java | 21 ++- 9 files changed, 193 insertions(+), 100 deletions(-) create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMSecretManagerService.java diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index c623dc2fb15..33185796466 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -187,6 +187,8 @@ Release 2.4.0 - UNRELEASED YARN-1307. Redesign znode structure for Zookeeper based RM state-store for better organization and scalability. (Tsuyoshi OZAWA via vinodkv) + YARN-1172. Convert SecretManagers in RM to services (Tsuyoshi OZAWA via kasha) + OPTIMIZATIONS BUG FIXES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMSecretManagerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMSecretManagerService.java new file mode 100644 index 00000000000..9fdde6589a3 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMSecretManagerService.java @@ -0,0 +1,143 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.yarn.server.resourcemanager; + +import com.google.common.annotations.VisibleForTesting; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.service.AbstractService; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; +import org.apache.hadoop.yarn.server.resourcemanager.security.AMRMTokenSecretManager; +import org.apache.hadoop.yarn.server.resourcemanager.security.ClientToAMTokenSecretManagerInRM; +import org.apache.hadoop.yarn.server.resourcemanager.security.NMTokenSecretManagerInRM; +import org.apache.hadoop.yarn.server.resourcemanager.security.RMContainerTokenSecretManager; +import org.apache.hadoop.yarn.server.resourcemanager.security.RMDelegationTokenSecretManager; + +import java.io.IOException; + +public class RMSecretManagerService extends AbstractService { + + AMRMTokenSecretManager amRmTokenSecretManager; + NMTokenSecretManagerInRM nmTokenSecretManager; + ClientToAMTokenSecretManagerInRM clientToAMSecretManager; + RMContainerTokenSecretManager containerTokenSecretManager; + RMDelegationTokenSecretManager rmDTSecretManager; + + RMContextImpl rmContext; + + /** + * Construct the service. + * + */ + public RMSecretManagerService(Configuration conf, RMContextImpl rmContext) { + super(RMSecretManagerService.class.getName()); + this.rmContext = rmContext; + + // To initialize correctly, these managers should be created before + // being called serviceInit(). + nmTokenSecretManager = createNMTokenSecretManager(conf); + rmContext.setNMTokenSecretManager(nmTokenSecretManager); + + containerTokenSecretManager = createContainerTokenSecretManager(conf); + rmContext.setContainerTokenSecretManager(containerTokenSecretManager); + + clientToAMSecretManager = createClientToAMTokenSecretManager(); + rmContext.setClientToAMTokenSecretManager(clientToAMSecretManager); + + amRmTokenSecretManager = createAMRMTokenSecretManager(conf); + rmContext.setAMRMTokenSecretManager(amRmTokenSecretManager); + + rmDTSecretManager = + createRMDelegationTokenSecretManager(conf, rmContext); + rmContext.setRMDelegationTokenSecretManager(rmDTSecretManager); + } + + @Override + public void serviceInit(Configuration conf) throws Exception { + super.serviceInit(conf); + } + + @Override + public void serviceStart() throws Exception { + amRmTokenSecretManager.start(); + containerTokenSecretManager.start(); + nmTokenSecretManager.start(); + + try { + rmDTSecretManager.startThreads(); + } catch(IOException ie) { + throw new YarnRuntimeException("Failed to start secret manager threads", ie); + } + super.serviceStart(); + } + + @Override + public void serviceStop() throws Exception { + if (rmDTSecretManager != null) { + rmDTSecretManager.stopThreads(); + } + if (amRmTokenSecretManager != null) { + amRmTokenSecretManager.stop(); + } + if (containerTokenSecretManager != null) { + containerTokenSecretManager.stop(); + } + if(nmTokenSecretManager != null) { + nmTokenSecretManager.stop(); + } + super.serviceStop(); + } + + protected RMContainerTokenSecretManager createContainerTokenSecretManager( + Configuration conf) { + return new RMContainerTokenSecretManager(conf); + } + + protected NMTokenSecretManagerInRM createNMTokenSecretManager( + Configuration conf) { + return new NMTokenSecretManagerInRM(conf); + } + + protected AMRMTokenSecretManager createAMRMTokenSecretManager( + Configuration conf) { + return new AMRMTokenSecretManager(conf); + } + + protected ClientToAMTokenSecretManagerInRM 
createClientToAMTokenSecretManager() { + return new ClientToAMTokenSecretManagerInRM(); + } + + @VisibleForTesting + protected RMDelegationTokenSecretManager createRMDelegationTokenSecretManager( + Configuration conf, RMContext rmContext) { + long secretKeyInterval = + conf.getLong(YarnConfiguration.DELEGATION_KEY_UPDATE_INTERVAL_KEY, + YarnConfiguration.DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT); + long tokenMaxLifetime = + conf.getLong(YarnConfiguration.DELEGATION_TOKEN_MAX_LIFETIME_KEY, + YarnConfiguration.DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT); + long tokenRenewInterval = + conf.getLong(YarnConfiguration.DELEGATION_TOKEN_RENEW_INTERVAL_KEY, + YarnConfiguration.DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT); + + return new RMDelegationTokenSecretManager(secretKeyInterval, + tokenMaxLifetime, tokenRenewInterval, 3600000, rmContext); + } + +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java index e44207796ea..d7f51e94532 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java @@ -81,7 +81,6 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEventType; import org.apache.hadoop.yarn.server.resourcemanager.security.AMRMTokenSecretManager; -import org.apache.hadoop.yarn.server.resourcemanager.security.ClientToAMTokenSecretManagerInRM; import org.apache.hadoop.yarn.server.resourcemanager.security.DelegationTokenRenewer; import org.apache.hadoop.yarn.server.resourcemanager.security.NMTokenSecretManagerInRM; import org.apache.hadoop.yarn.server.resourcemanager.security.QueueACLsManager; @@ -134,13 +133,7 @@ public class ResourceManager extends CompositeService implements Recoverable { * in Active state. 
*/ protected RMActiveServices activeServices; - protected ClientToAMTokenSecretManagerInRM clientToAMSecretManager; - - protected RMContainerTokenSecretManager containerTokenSecretManager; - protected NMTokenSecretManagerInRM nmTokenSecretManager; - - protected AMRMTokenSecretManager amRmTokenSecretManager; - + protected RMSecretManagerService rmSecretManagerService; private Dispatcher rmDispatcher; protected ResourceScheduler scheduler; @@ -154,7 +147,6 @@ public class ResourceManager extends CompositeService implements Recoverable { protected RMAppManager rmAppManager; protected ApplicationACLsManager applicationACLsManager; protected QueueACLsManager queueACLsManager; - protected RMDelegationTokenSecretManager rmDTSecretManager; private DelegationTokenRenewer delegationTokenRenewer; private WebApp webApp; protected ResourceTrackerService resourceTracker; @@ -211,16 +203,6 @@ protected void setRMStateStore(RMStateStore rmStore) { rmContext.setStateStore(rmStore); } - protected RMContainerTokenSecretManager createContainerTokenSecretManager( - Configuration conf) { - return new RMContainerTokenSecretManager(conf); - } - - protected NMTokenSecretManagerInRM createNMTokenSecretManager( - Configuration conf) { - return new NMTokenSecretManagerInRM(conf); - } - protected EventHandler createSchedulerEventDispatcher() { return new SchedulerEventDispatcher(this.scheduler); } @@ -234,11 +216,6 @@ protected Dispatcher createDispatcher() { return new AsyncDispatcher(); } - protected AMRMTokenSecretManager createAMRMTokenSecretManager( - Configuration conf) { - return new AMRMTokenSecretManager(conf); - } - protected ResourceScheduler createScheduler() { String schedulerClassName = conf.get(YarnConfiguration.RM_SCHEDULER, YarnConfiguration.DEFAULT_RM_SCHEDULER); @@ -324,11 +301,8 @@ protected void serviceInit(Configuration configuration) throws Exception { addIfService(rmDispatcher); rmContext.setDispatcher(rmDispatcher); - clientToAMSecretManager = new ClientToAMTokenSecretManagerInRM(); - rmContext.setClientToAMTokenSecretManager(clientToAMSecretManager); - - amRmTokenSecretManager = createAMRMTokenSecretManager(conf); - rmContext.setAMRMTokenSecretManager(amRmTokenSecretManager); + rmSecretManagerService = createRMSecretManagerService(); + addService(rmSecretManagerService); containerAllocationExpirer = new ContainerAllocationExpirer(rmDispatcher); addService(containerAllocationExpirer); @@ -342,12 +316,6 @@ protected void serviceInit(Configuration configuration) throws Exception { addService(amFinishingMonitor); rmContext.setAMFinishingMonitor(amFinishingMonitor); - containerTokenSecretManager = createContainerTokenSecretManager(conf); - rmContext.setContainerTokenSecretManager(containerTokenSecretManager); - - nmTokenSecretManager = createNMTokenSecretManager(conf); - rmContext.setNMTokenSecretManager(nmTokenSecretManager); - boolean isRecoveryEnabled = conf.getBoolean( YarnConfiguration.RECOVERY_ENABLED, YarnConfiguration.DEFAULT_RM_RECOVERY_ENABLED); @@ -435,8 +403,6 @@ protected void serviceInit(Configuration configuration) throws Exception { rmAppManager = createRMAppManager(); // Register event handler for RMAppManagerEvents rmDispatcher.register(RMAppManagerEventType.class, rmAppManager); - rmDTSecretManager = createRMDelegationTokenSecretManager(rmContext); - rmContext.setRMDelegationTokenSecretManager(rmDTSecretManager); clientRM = createClientRMService(); rmContext.setClientRMService(clientRM); @@ -460,10 +426,6 @@ protected void serviceInit(Configuration configuration) throws 
Exception { @Override protected void serviceStart() throws Exception { - amRmTokenSecretManager.start(); - containerTokenSecretManager.start(); - nmTokenSecretManager.start(); - RMStateStore rmStore = rmContext.getStateStore(); // The state store needs to start irrespective of recoveryEnabled as apps // need events to move to further states. @@ -481,13 +443,7 @@ protected void serviceStart() throws Exception { throw e; } } - startWepApp(); - try { - rmDTSecretManager.startThreads(); - } catch(IOException ie) { - throw new YarnRuntimeException("Failed to start secret manager threads", ie); - } if (getConfig().getBoolean(YarnConfiguration.IS_MINI_YARN_CLUSTER, false)) { int port = webApp.port(); @@ -502,19 +458,7 @@ protected void serviceStop() throws Exception { if (webApp != null) { webApp.stop(); } - if (rmDTSecretManager != null) { - rmDTSecretManager.stopThreads(); - } - if (amRmTokenSecretManager != null) { - amRmTokenSecretManager.stop(); - } - if (containerTokenSecretManager != null) { - containerTokenSecretManager.stop(); - } - if(nmTokenSecretManager != null) { - nmTokenSecretManager.stop(); - } DefaultMetricsSystem.shutdown(); @@ -939,30 +883,15 @@ protected void serviceStop() throws Exception { protected ResourceTrackerService createResourceTrackerService() { return new ResourceTrackerService(this.rmContext, this.nodesListManager, - this.nmLivelinessMonitor, this.containerTokenSecretManager, - this.nmTokenSecretManager); - } - - protected RMDelegationTokenSecretManager - createRMDelegationTokenSecretManager(RMContext rmContext) { - long secretKeyInterval = - conf.getLong(YarnConfiguration.DELEGATION_KEY_UPDATE_INTERVAL_KEY, - YarnConfiguration.DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT); - long tokenMaxLifetime = - conf.getLong(YarnConfiguration.DELEGATION_TOKEN_MAX_LIFETIME_KEY, - YarnConfiguration.DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT); - long tokenRenewInterval = - conf.getLong(YarnConfiguration.DELEGATION_TOKEN_RENEW_INTERVAL_KEY, - YarnConfiguration.DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT); - - return new RMDelegationTokenSecretManager(secretKeyInterval, - tokenMaxLifetime, tokenRenewInterval, 3600000, rmContext); + this.nmLivelinessMonitor, + this.rmContext.getContainerTokenSecretManager(), + this.rmContext.getNMTokenSecretManager()); } protected ClientRMService createClientRMService() { return new ClientRMService(this.rmContext, scheduler, this.rmAppManager, this.applicationACLsManager, this.queueACLsManager, - this.rmDTSecretManager); + getRMDTSecretManager()); } protected ApplicationMasterService createApplicationMasterService() { @@ -973,6 +902,10 @@ protected AdminService createAdminService() { return new AdminService(this, rmContext); } + protected RMSecretManagerService createRMSecretManagerService() { + return new RMSecretManagerService(conf, rmContext); + } + @Private public ClientRMService getClientRMService() { return this.clientRM; @@ -1013,23 +946,28 @@ public QueueACLsManager getQueueACLsManager() { @Private public RMContainerTokenSecretManager getRMContainerTokenSecretManager() { - return this.containerTokenSecretManager; + return this.rmContext.getContainerTokenSecretManager(); } @Private public NMTokenSecretManagerInRM getRMNMTokenSecretManager() { - return this.nmTokenSecretManager; + return this.rmContext.getNMTokenSecretManager(); } @Private public AMRMTokenSecretManager getAMRMTokenSecretManager(){ - return this.amRmTokenSecretManager; + return this.rmContext.getAMRMTokenSecretManager(); + } + + @Private + public RMDelegationTokenSecretManager 
getRMDTSecretManager(){ + return this.rmContext.getRMDelegationTokenSecretManager(); } @Override public void recover(RMState state) throws Exception { // recover RMdelegationTokenSecretManager - rmDTSecretManager.recover(state); + getRMDTSecretManager().recover(state); // recover applications rmAppManager.recover(state); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java index 69453780fc5..76d8b1ac165 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java @@ -61,7 +61,8 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeEventType; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl; import org.apache.hadoop.yarn.server.resourcemanager.security.ClientToAMTokenSecretManagerInRM; -import org.apache.hadoop.yarn.server.resourcemanager.security.RMDelegationTokenSecretManager; +import org.apache.hadoop.yarn.server.resourcemanager.security.NMTokenSecretManagerInRM; +import org.apache.hadoop.yarn.server.resourcemanager.security.RMContainerTokenSecretManager; import org.apache.hadoop.yarn.util.Records; import org.apache.log4j.Level; import org.apache.log4j.LogManager; @@ -309,7 +310,7 @@ public void sendAMLaunchFailed(ApplicationAttemptId appAttemptId) protected ClientRMService createClientRMService() { return new ClientRMService(getRMContext(), getResourceScheduler(), rmAppManager, applicationACLsManager, queueACLsManager, - rmDTSecretManager) { + getRMDTSecretManager()) { @Override protected void serviceStart() { // override to not start rpc handler @@ -325,8 +326,12 @@ protected void serviceStop() { @Override protected ResourceTrackerService createResourceTrackerService() { Configuration conf = new Configuration(); - + + RMContainerTokenSecretManager containerTokenSecretManager = + getRMContainerTokenSecretManager(); containerTokenSecretManager.rollMasterKey(); + NMTokenSecretManagerInRM nmTokenSecretManager = + getRMNMTokenSecretManager(); nmTokenSecretManager.rollMasterKey(); return new ResourceTrackerService(getRMContext(), nodesListManager, this.nmLivelinessMonitor, containerTokenSecretManager, @@ -398,12 +403,8 @@ public NodesListManager getNodesListManager() { return this.nodesListManager; } - public RMDelegationTokenSecretManager getRMDTSecretManager() { - return this.rmDTSecretManager; - } - public ClientToAMTokenSecretManagerInRM getClientToAMTokenSecretManager() { - return this.clientToAMSecretManager; + return this.getRMContext().getClientToAMTokenSecretManager(); } public RMAppManager getRMAppManager() { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/QueueACLsTestBase.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/QueueACLsTestBase.java index 4760dba4455..b400e4f6c71 100644 --- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/QueueACLsTestBase.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/QueueACLsTestBase.java @@ -86,7 +86,7 @@ public void setup() throws InterruptedException, IOException { protected ClientRMService createClientRMService() { return new ClientRMService(getRMContext(), this.scheduler, this.rmAppManager, this.applicationACLsManager, - this.queueACLsManager, this.rmDTSecretManager); + this.queueACLsManager, getRMDTSecretManager()); }; @Override diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestClientRMService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestClientRMService.java index ca6dc3e037f..d425dda2aba 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestClientRMService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestClientRMService.java @@ -133,7 +133,7 @@ public void testGetClusterNodes() throws Exception { protected ClientRMService createClientRMService() { return new ClientRMService(this.rmContext, scheduler, this.rmAppManager, this.applicationACLsManager, this.queueACLsManager, - this.rmDTSecretManager); + this.getRMDTSecretManager()); }; }; rm.start(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java index fe220c07ebd..c7ef857cc6b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java @@ -1567,7 +1567,7 @@ public TestSecurityMockRM(Configuration conf, RMStateStore store) { @Override protected ClientRMService createClientRMService() { return new ClientRMService(getRMContext(), getResourceScheduler(), - rmAppManager, applicationACLsManager, null, rmDTSecretManager){ + rmAppManager, applicationACLsManager, null, getRMDTSecretManager()){ @Override protected void serviceStart() throws Exception { // do nothing diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/security/TestClientToAMTokens.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/security/TestClientToAMTokens.java index f7fd55e25bf..6a209e745cb 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/security/TestClientToAMTokens.java +++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/security/TestClientToAMTokens.java @@ -168,7 +168,7 @@ public void testClientToAMTokens() throws Exception { protected ClientRMService createClientRMService() { return new ClientRMService(this.rmContext, scheduler, this.rmAppManager, this.applicationACLsManager, this.queueACLsManager, - this.rmDTSecretManager); + getRMDTSecretManager()); }; @Override diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/security/TestRMDelegationTokens.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/security/TestRMDelegationTokens.java index 6cc0a18a6f3..3b5add8012d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/security/TestRMDelegationTokens.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/security/TestRMDelegationTokens.java @@ -37,6 +37,7 @@ import org.apache.hadoop.yarn.security.client.RMDelegationTokenIdentifier; import org.apache.hadoop.yarn.server.resourcemanager.MockRM; import org.apache.hadoop.yarn.server.resourcemanager.RMContext; +import org.apache.hadoop.yarn.server.resourcemanager.RMSecretManagerService; import org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM; import org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore; import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore; @@ -166,13 +167,21 @@ public MyMockRM(Configuration conf, RMStateStore store) { } @Override - protected RMDelegationTokenSecretManager - createRMDelegationTokenSecretManager(RMContext rmContext) { - // KeyUpdateInterval-> 1 seconds - // TokenMaxLifetime-> 2 seconds. - return new TestRMDelegationTokenSecretManager(1000, 1000, 2000, 1000, - rmContext); + protected RMSecretManagerService createRMSecretManagerService() { + return new RMSecretManagerService(conf, rmContext) { + + @Override + protected RMDelegationTokenSecretManager + createRMDelegationTokenSecretManager(Configuration conf, + RMContext rmContext) { + // KeyUpdateInterval-> 1 seconds + // TokenMaxLifetime-> 2 seconds. + return new TestRMDelegationTokenSecretManager(1000, 1000, 2000, 1000, + rmContext); + } + }; } + } public class TestRMDelegationTokenSecretManager extends From 53bc888f0b6d2b1cc635d6d2e21900008a5fadb9 Mon Sep 17 00:00:00 2001 From: Ivan Mitic Date: Thu, 26 Dec 2013 19:42:24 +0000 Subject: [PATCH 02/42] HADOOP-10090. Jobtracker metrics not updated properly after execution of a mapreduce job. Contributed by Ivan Mitic. 
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1553561 13f79535-47bb-0310-9956-ffa450edef68 --- .../hadoop-common/CHANGES.txt | 3 + .../impl/TestMetricsSourceAdapter.java | 87 +++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/metrics2/impl/TestMetricsSourceAdapter.java diff --git a/hadoop-common-project/hadoop-common/CHANGES.txt b/hadoop-common-project/hadoop-common/CHANGES.txt index dcc8f5bde61..11b567e6989 100644 --- a/hadoop-common-project/hadoop-common/CHANGES.txt +++ b/hadoop-common-project/hadoop-common/CHANGES.txt @@ -568,6 +568,9 @@ Release 2.3.0 - UNRELEASED HADOOP-10175. Har files system authority should preserve userinfo. (Chuan Liu via cnauroth) + HADOOP-10090. Jobtracker metrics not updated properly after execution + of a mapreduce job. (ivanmi) + Release 2.2.0 - 2013-10-13 INCOMPATIBLE CHANGES diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/metrics2/impl/TestMetricsSourceAdapter.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/metrics2/impl/TestMetricsSourceAdapter.java new file mode 100644 index 00000000000..724d449fccd --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/metrics2/impl/TestMetricsSourceAdapter.java @@ -0,0 +1,87 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.metrics2.impl; + +import static org.junit.Assert.*; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.metrics2.MetricsSource; +import org.apache.hadoop.metrics2.MetricsTag; +import org.apache.hadoop.metrics2.annotation.Metric; +import org.apache.hadoop.metrics2.annotation.Metrics; +import org.apache.hadoop.metrics2.lib.MetricsAnnotations; +import org.apache.hadoop.metrics2.lib.MetricsRegistry; +import org.apache.hadoop.metrics2.lib.MetricsSourceBuilder; +import org.apache.hadoop.metrics2.lib.MutableCounterLong; +import org.junit.Test; + +public class TestMetricsSourceAdapter { + + @Test + public void testGetMetricsAndJmx() throws Exception { + // create test source with a single metric counter of value 0 + TestSource source = new TestSource("test"); + MetricsSourceBuilder sb = MetricsAnnotations.newSourceBuilder(source); + final MetricsSource s = sb.build(); + + List injectedTags = new ArrayList(); + MetricsSourceAdapter sa = new MetricsSourceAdapter( + "test", "test", "test desc", s, injectedTags, null, null, 1, false); + + // all metrics are initially assumed to have changed + MetricsCollectorImpl builder = new MetricsCollectorImpl(); + Iterable metricsRecords = sa.getMetrics(builder, true); + + // Validate getMetrics and JMX initial values + MetricsRecordImpl metricsRecord = metricsRecords.iterator().next(); + assertEquals(0L, + metricsRecord.metrics().iterator().next().value().longValue()); + + Thread.sleep(100); // skip JMX cache TTL + assertEquals(0L, (Number)sa.getAttribute("C1")); + + // change metric value + source.incrementCnt(); + + // validate getMetrics and JMX + builder = new MetricsCollectorImpl(); + metricsRecords = sa.getMetrics(builder, true); + metricsRecord = metricsRecords.iterator().next(); + assertTrue(metricsRecord.metrics().iterator().hasNext()); + Thread.sleep(100); // skip JMX cache TTL + assertEquals(1L, (Number)sa.getAttribute("C1")); + } + + @SuppressWarnings("unused") + @Metrics(context="test") + private static class TestSource { + @Metric("C1 desc") MutableCounterLong c1; + final MetricsRegistry registry; + + TestSource(String recName) { + registry = new MetricsRegistry(recName); + } + + public void incrementCnt() { + c1.incr(); + } + } +} From 3ee7f20c330df92fdeb472d2402134484d5aac6d Mon Sep 17 00:00:00 2001 From: Jian He Date: Thu, 26 Dec 2013 22:05:16 +0000 Subject: [PATCH 03/42] YARN-1527. Fixed yarn rmadmin command to print the correct usage info. Contributed by Akira AJISAKA. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1553596 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-yarn-project/CHANGES.txt | 3 +++ .../hadoop/yarn/client/cli/RMAdminCLI.java | 4 +-- .../hadoop/yarn/client/TestRMAdminCLI.java | 26 +++++++++---------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 33185796466..ae4c4c19b84 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -269,6 +269,9 @@ Release 2.4.0 - UNRELEASED YARN-1451. TestResourceManager relies on the scheduler assigning multiple containers in a single node update. (Sandy Ryza via kasha) + YARN-1527. Fix yarn rmadmin command to print the correct usage info. 
+ (Akira AJISAKA via jianhe) + Release 2.3.0 - UNRELEASED INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/cli/RMAdminCLI.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/cli/RMAdminCLI.java index 807841b2cf4..258650673f9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/cli/RMAdminCLI.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/cli/RMAdminCLI.java @@ -122,13 +122,13 @@ private static void buildIndividualUsageMsg(String cmd, } } String space = (usageInfo.args == "") ? "" : " "; - builder.append("Usage: java RMAdmin [" + builder.append("Usage: yarn rmadmin [" + cmd + space + usageInfo.args + "]\n"); } private static void buildUsageMsg(StringBuilder builder) { - builder.append("Usage: java RMAdmin"); + builder.append("Usage: yarn rmadmin\n"); for (String cmdKey : ADMIN_USAGE.keySet()) { UsageInfo usageInfo = ADMIN_USAGE.get(cmdKey); builder.append(" " + cmdKey + " " + usageInfo.args + "\n"); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestRMAdminCLI.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestRMAdminCLI.java index 94c87515f09..cec8fcc557a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestRMAdminCLI.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestRMAdminCLI.java @@ -241,37 +241,37 @@ public void testHelp() throws Exception { "commands if none")); testError(new String[] { "-help", "-refreshQueues" }, - "Usage: java RMAdmin [-refreshQueues]", dataErr, 0); + "Usage: yarn rmadmin [-refreshQueues]", dataErr, 0); testError(new String[] { "-help", "-refreshNodes" }, - "Usage: java RMAdmin [-refreshNodes]", dataErr, 0); + "Usage: yarn rmadmin [-refreshNodes]", dataErr, 0); testError(new String[] { "-help", "-refreshUserToGroupsMappings" }, - "Usage: java RMAdmin [-refreshUserToGroupsMappings]", dataErr, 0); + "Usage: yarn rmadmin [-refreshUserToGroupsMappings]", dataErr, 0); testError( new String[] { "-help", "-refreshSuperUserGroupsConfiguration" }, - "Usage: java RMAdmin [-refreshSuperUserGroupsConfiguration]", + "Usage: yarn rmadmin [-refreshSuperUserGroupsConfiguration]", dataErr, 0); testError(new String[] { "-help", "-refreshAdminAcls" }, - "Usage: java RMAdmin [-refreshAdminAcls]", dataErr, 0); + "Usage: yarn rmadmin [-refreshAdminAcls]", dataErr, 0); testError(new String[] { "-help", "-refreshServiceAcl" }, - "Usage: java RMAdmin [-refreshServiceAcl]", dataErr, 0); + "Usage: yarn rmadmin [-refreshServiceAcl]", dataErr, 0); testError(new String[] { "-help", "-getGroups" }, - "Usage: java RMAdmin [-getGroups [username]]", dataErr, 0); + "Usage: yarn rmadmin [-getGroups [username]]", dataErr, 0); testError(new String[] { "-help", "-transitionToActive" }, - "Usage: java RMAdmin [-transitionToActive ]", dataErr, 0); + "Usage: yarn rmadmin [-transitionToActive ]", dataErr, 0); testError(new String[] { "-help", "-transitionToStandby" }, - "Usage: java RMAdmin [-transitionToStandby ]", dataErr, 0); + "Usage: yarn rmadmin [-transitionToStandby ]", dataErr, 0); testError(new String[] { "-help", "-getServiceState" }, - "Usage: java RMAdmin [-getServiceState ]", dataErr, 0); + "Usage: yarn rmadmin 
[-getServiceState ]", dataErr, 0); testError(new String[] { "-help", "-checkHealth" }, - "Usage: java RMAdmin [-checkHealth ]", dataErr, 0); + "Usage: yarn rmadmin [-checkHealth ]", dataErr, 0); testError(new String[] { "-help", "-failover" }, - "Usage: java RMAdmin " + + "Usage: yarn rmadmin " + "[-failover [--forcefence] [--forceactive] " + " ]", dataErr, 0); testError(new String[] { "-help", "-badParameter" }, - "Usage: java RMAdmin", dataErr, 0); + "Usage: yarn rmadmin", dataErr, 0); testError(new String[] { "-badParameter" }, "badParameter: Unknown command", dataErr, -1); } finally { From 1c9ac9adeb886f62d8d5f1f8aa8d6a12c51420ac Mon Sep 17 00:00:00 2001 From: Karthik Kambatla Date: Fri, 27 Dec 2013 03:28:23 +0000 Subject: [PATCH 04/42] YARN-1523. Use StandbyException instead of RMNotYetReadyException (kasha) git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1553616 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-yarn-project/CHANGES.txt | 2 ++ .../exceptions/RMNotYetActiveException.java | 36 ------------------- ...ResourceManagerAdministrationProtocol.java | 10 +++--- .../server/resourcemanager/AdminService.java | 25 ++++++++----- 4 files changed, 23 insertions(+), 50 deletions(-) delete mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/exceptions/RMNotYetActiveException.java diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index ae4c4c19b84..bb5b895428b 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -189,6 +189,8 @@ Release 2.4.0 - UNRELEASED YARN-1172. Convert SecretManagers in RM to services (Tsuyoshi OZAWA via kasha) + YARN-1523. Use StandbyException instead of RMNotYetReadyException (kasha) + OPTIMIZATIONS BUG FIXES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/exceptions/RMNotYetActiveException.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/exceptions/RMNotYetActiveException.java deleted file mode 100644 index 4aac61eaa93..00000000000 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/exceptions/RMNotYetActiveException.java +++ /dev/null @@ -1,36 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.yarn.exceptions; - -import org.apache.hadoop.classification.InterfaceAudience; -import org.apache.hadoop.classification.InterfaceStability; - -/** - * Exception to be thrown when an Active-Only operation is attempted on a - * ResourceManager that is not Active. 
- */ -@InterfaceAudience.Private -@InterfaceStability.Evolving -public class RMNotYetActiveException extends YarnException { - private static final long serialVersionUID = 1L; - - public RMNotYetActiveException() { - super("ResourceManager is not yet Active!"); - } -} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/server/api/ResourceManagerAdministrationProtocol.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/server/api/ResourceManagerAdministrationProtocol.java index e768847e850..26415992ac0 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/server/api/ResourceManagerAdministrationProtocol.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/server/api/ResourceManagerAdministrationProtocol.java @@ -24,10 +24,10 @@ import org.apache.hadoop.classification.InterfaceAudience.Public; import org.apache.hadoop.classification.InterfaceStability.Evolving; import org.apache.hadoop.classification.InterfaceStability.Stable; +import org.apache.hadoop.ipc.StandbyException; import org.apache.hadoop.tools.GetUserMappingsProtocol; import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.ResourceOption; -import org.apache.hadoop.yarn.exceptions.RMNotYetActiveException; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.server.api.protocolrecords.RefreshAdminAclsRequest; import org.apache.hadoop.yarn.server.api.protocolrecords.RefreshAdminAclsResponse; @@ -51,25 +51,25 @@ public interface ResourceManagerAdministrationProtocol extends GetUserMappingsPr @Public @Stable public RefreshQueuesResponse refreshQueues(RefreshQueuesRequest request) - throws RMNotYetActiveException, YarnException, IOException; + throws StandbyException, YarnException, IOException; @Public @Stable public RefreshNodesResponse refreshNodes(RefreshNodesRequest request) - throws RMNotYetActiveException, YarnException, IOException; + throws StandbyException, YarnException, IOException; @Public @Stable public RefreshSuperUserGroupsConfigurationResponse refreshSuperUserGroupsConfiguration( RefreshSuperUserGroupsConfigurationRequest request) - throws RMNotYetActiveException, YarnException, IOException; + throws StandbyException, YarnException, IOException; @Public @Stable public RefreshUserToGroupsMappingsResponse refreshUserToGroupsMappings( RefreshUserToGroupsMappingsRequest request) - throws RMNotYetActiveException, YarnException, IOException; + throws StandbyException, YarnException, IOException; @Public @Stable diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java index 0d0e5481be5..33230d86f9b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java @@ -40,6 +40,7 @@ import org.apache.hadoop.ipc.ProtobufRpcEngine; import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.ipc.RPC.Server; +import 
org.apache.hadoop.ipc.StandbyException; import org.apache.hadoop.security.Groups; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.authorize.AccessControlList; @@ -49,7 +50,6 @@ import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.ResourceOption; import org.apache.hadoop.yarn.conf.YarnConfiguration; -import org.apache.hadoop.yarn.exceptions.RMNotYetActiveException; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.factories.RecordFactory; import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; @@ -82,6 +82,7 @@ public class AdminService extends AbstractService implements private final RMContext rmContext; private final ResourceManager rm; + private String rmId; private Server server; private InetSocketAddress masterServiceAddress; @@ -105,6 +106,7 @@ public synchronized void serviceInit(Configuration conf) throws Exception { adminAcl = new AccessControlList(conf.get( YarnConfiguration.YARN_ADMIN_ACL, YarnConfiguration.DEFAULT_YARN_ADMIN_ACL)); + rmId = conf.get(YarnConfiguration.RM_HA_ID); super.serviceInit(conf); } @@ -176,6 +178,10 @@ private synchronized boolean isRMActive() { return HAServiceState.ACTIVE == rmContext.getHAServiceState(); } + private void throwStandbyException() throws StandbyException { + throw new StandbyException("ResourceManager " + rmId + " is not Active!"); + } + @Override public synchronized void monitorHealth() throws IOException { @@ -239,14 +245,14 @@ public synchronized HAServiceStatus getServiceStatus() throws IOException { @Override public RefreshQueuesResponse refreshQueues(RefreshQueuesRequest request) - throws YarnException { + throws YarnException, StandbyException { UserGroupInformation user = checkAcls("refreshQueues"); if (!isRMActive()) { RMAuditLogger.logFailure(user.getShortUserName(), "refreshQueues", adminAcl.toString(), "AdminService", "ResourceManager is not active. Can not refresh queues."); - throw new RMNotYetActiveException(); + throwStandbyException(); } try { @@ -265,14 +271,14 @@ public RefreshQueuesResponse refreshQueues(RefreshQueuesRequest request) @Override public RefreshNodesResponse refreshNodes(RefreshNodesRequest request) - throws YarnException { + throws YarnException, StandbyException { UserGroupInformation user = checkAcls("refreshNodes"); if (!isRMActive()) { RMAuditLogger.logFailure(user.getShortUserName(), "refreshNodes", adminAcl.toString(), "AdminService", "ResourceManager is not active. Can not refresh nodes."); - throw new RMNotYetActiveException(); + throwStandbyException(); } try { @@ -291,7 +297,7 @@ public RefreshNodesResponse refreshNodes(RefreshNodesRequest request) @Override public RefreshSuperUserGroupsConfigurationResponse refreshSuperUserGroupsConfiguration( RefreshSuperUserGroupsConfigurationRequest request) - throws YarnException { + throws YarnException, StandbyException { UserGroupInformation user = checkAcls("refreshSuperUserGroupsConfiguration"); // TODO (YARN-1459): Revisit handling super-user-groups on Standby RM @@ -300,7 +306,7 @@ public RefreshSuperUserGroupsConfigurationResponse refreshSuperUserGroupsConfigu "refreshSuperUserGroupsConfiguration", adminAcl.toString(), "AdminService", "ResourceManager is not active. 
Can not refresh super-user-groups."); - throw new RMNotYetActiveException(); + throwStandbyException(); } ProxyUsers.refreshSuperUserGroupsConfiguration(new Configuration()); @@ -313,7 +319,8 @@ public RefreshSuperUserGroupsConfigurationResponse refreshSuperUserGroupsConfigu @Override public RefreshUserToGroupsMappingsResponse refreshUserToGroupsMappings( - RefreshUserToGroupsMappingsRequest request) throws YarnException { + RefreshUserToGroupsMappingsRequest request) + throws YarnException, StandbyException { UserGroupInformation user = checkAcls("refreshUserToGroupsMappings"); // TODO (YARN-1459): Revisit handling user-groups on Standby RM @@ -322,7 +329,7 @@ public RefreshUserToGroupsMappingsResponse refreshUserToGroupsMappings( "refreshUserToGroupsMapping", adminAcl.toString(), "AdminService", "ResourceManager is not active. Can not refresh user-groups."); - throw new RMNotYetActiveException(); + throwStandbyException(); } Groups.getUserToGroupsMappingService().refresh(); From defeef6fe43de476fc3ff08660feaa17a16931cd Mon Sep 17 00:00:00 2001 From: Karthik Kambatla Date: Fri, 27 Dec 2013 21:10:14 +0000 Subject: [PATCH 05/42] YARN-1481. Addendum patch to fix synchronization in AdminService git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1553738 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/hadoop/yarn/server/resourcemanager/AdminService.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java index 33230d86f9b..10e73267680 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java @@ -174,7 +174,7 @@ private UserGroupInformation checkAcls(String method) throws YarnException { } } - private synchronized boolean isRMActive() { + private boolean isRMActive() { return HAServiceState.ACTIVE == rmContext.getHAServiceState(); } From 8f0bf54d3442e6beedfaeaf3b53c5769019ca9d1 Mon Sep 17 00:00:00 2001 From: Vinod Kumar Vavilapalli Date: Sat, 28 Dec 2013 01:09:07 +0000 Subject: [PATCH 06/42] YARN-1541. Changed ResourceManager to invalidate ApplicationMaster host/port information once an AM crashes. Contributed by Jian He. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1553772 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-yarn-project/CHANGES.txt | 3 + .../rmapp/attempt/RMAppAttemptImpl.java | 10 +- .../yarn/server/resourcemanager/TestRM.java | 116 +++++++++++++++++- .../attempt/TestRMAppAttemptTransitions.java | 8 ++ 4 files changed, 133 insertions(+), 4 deletions(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index bb5b895428b..f593ac8a895 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -191,6 +191,9 @@ Release 2.4.0 - UNRELEASED YARN-1523. Use StandbyException instead of RMNotYetReadyException (kasha) + YARN-1541. Changed ResourceManager to invalidate ApplicationMaster host/port + information once an AM crashes. 
(Jian He via vinodkv) + OPTIMIZATIONS BUG FIXES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java index 8823952dfa7..647bc59c9dd 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java @@ -139,7 +139,7 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable { private float progress = 0; private String host = "N/A"; - private int rpcPort; + private int rpcPort = -1; private String originalTrackingUrl = "N/A"; private String proxiedTrackingUrl = "N/A"; private long startTime = 0; @@ -526,6 +526,11 @@ private void setTrackingUrlToRMAppPage() { proxiedTrackingUrl = originalTrackingUrl; } + private void invalidateAMHostAndPort() { + this.host = "N/A"; + this.rpcPort = -1; + } + // This is only used for RMStateStore. Normal operation must invoke the secret // manager to get the key and not use the local key directly. @Override @@ -1033,6 +1038,7 @@ public void transition(RMAppAttemptImpl appAttempt, { // don't leave the tracking URL pointing to a non-existent AM appAttempt.setTrackingUrlToRMAppPage(); + appAttempt.invalidateAMHostAndPort(); appEvent = new RMAppFailedAttemptEvent(applicationId, RMAppEventType.ATTEMPT_KILLED, @@ -1043,6 +1049,7 @@ public void transition(RMAppAttemptImpl appAttempt, { // don't leave the tracking URL pointing to a non-existent AM appAttempt.setTrackingUrlToRMAppPage(); + appAttempt.invalidateAMHostAndPort(); appEvent = new RMAppFailedAttemptEvent(applicationId, RMAppEventType.ATTEMPT_FAILED, @@ -1059,7 +1066,6 @@ public void transition(RMAppAttemptImpl appAttempt, appAttempt.eventHandler.handle(appEvent); appAttempt.eventHandler.handle(new AppAttemptRemovedSchedulerEvent( appAttemptId, finalAttemptState)); - appAttempt.removeCredentials(appAttempt); } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRM.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRM.java index 205846a11de..a2bf4ae97bf 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRM.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRM.java @@ -19,26 +19,33 @@ package org.apache.hadoop.yarn.server.resourcemanager; import java.util.ArrayList; +import java.util.EnumSet; import java.util.HashMap; import java.util.List; -import javax.security.auth.login.Configuration; - import junit.framework.Assert; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse; +import 
org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterRequest; +import org.apache.hadoop.yarn.api.protocolrecords.GetApplicationReportRequest; +import org.apache.hadoop.yarn.api.protocolrecords.GetApplicationsRequest; +import org.apache.hadoop.yarn.api.protocolrecords.GetApplicationsResponse; import org.apache.hadoop.yarn.api.protocolrecords.GetNewApplicationResponse; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; +import org.apache.hadoop.yarn.api.records.ApplicationReport; import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerState; +import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; import org.apache.hadoop.yarn.api.records.NMToken; import org.apache.hadoop.yarn.api.records.ResourceRequest; import org.apache.hadoop.yarn.api.records.Token; +import org.apache.hadoop.yarn.api.records.YarnApplicationState; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; import org.apache.hadoop.yarn.server.resourcemanager.security.NMTokenSecretManagerInRM; @@ -368,6 +375,111 @@ public void testActivatingApplicationAfterAddingNM() throws Exception { rm1.stop(); } + // This is to test AM Host and rpc port are invalidated after the am attempt + // is killed or failed, so that client doesn't get the wrong information. + @Test (timeout = 80000) + public void testInvalidateAMHostPortWhenAMFailedOrKilled() throws Exception { + YarnConfiguration conf = new YarnConfiguration(); + conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1); + MockRM rm1 = new MockRM(conf); + rm1.start(); + + // a succeeded app + RMApp app1 = rm1.submitApp(200); + MockNM nm1 = + new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService()); + nm1.registerNode(); + MockAM am1 = launchAM(app1, rm1, nm1); + finishApplicationMaster(app1, rm1, nm1, am1); + + // a failed app + RMApp app2 = rm1.submitApp(200); + MockAM am2 = launchAM(app2, rm1, nm1); + nm1.nodeHeartbeat(am2.getApplicationAttemptId(), 1, ContainerState.COMPLETE); + am2.waitForState(RMAppAttemptState.FAILED); + rm1.waitForState(app2.getApplicationId(), RMAppState.FAILED); + + // a killed app + RMApp app3 = rm1.submitApp(200); + MockAM am3 = launchAM(app3, rm1, nm1); + rm1.killApp(app3.getApplicationId()); + rm1.waitForState(app3.getApplicationId(), RMAppState.KILLED); + rm1.waitForState(am3.getApplicationAttemptId(), RMAppAttemptState.KILLED); + + GetApplicationsRequest request1 = + GetApplicationsRequest.newInstance(EnumSet.of( + YarnApplicationState.FINISHED, YarnApplicationState.KILLED, + YarnApplicationState.FAILED)); + GetApplicationsResponse response1 = + rm1.getClientRMService().getApplications(request1); + List appList1 = response1.getApplicationList(); + + Assert.assertEquals(3, appList1.size()); + for (ApplicationReport report : appList1) { + // killed/failed apps host and rpc port are invalidated. 
+ if (report.getApplicationId().equals(app2.getApplicationId()) + || report.getApplicationId().equals(app3.getApplicationId())) { + Assert.assertEquals("N/A", report.getHost()); + Assert.assertEquals(-1, report.getRpcPort()); + } + // succeeded app's host and rpc port is not invalidated + if (report.getApplicationId().equals(app1.getApplicationId())) { + Assert.assertFalse(report.getHost().equals("N/A")); + Assert.assertTrue(report.getRpcPort() != -1); + } + } + } + + @Test (timeout = 60000) + public void testInvalidatedAMHostPortOnAMRestart() throws Exception { + YarnConfiguration conf = new YarnConfiguration(); + MockRM rm1 = new MockRM(conf); + rm1.start(); + MockNM nm1 = + new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService()); + nm1.registerNode(); + + // a failed app + RMApp app2 = rm1.submitApp(200); + MockAM am2 = launchAM(app2, rm1, nm1); + nm1 + .nodeHeartbeat(am2.getApplicationAttemptId(), 1, ContainerState.COMPLETE); + am2.waitForState(RMAppAttemptState.FAILED); + rm1.waitForState(app2.getApplicationId(), RMAppState.ACCEPTED); + + // before new attempt is launched, the app report returns the invalid AM + // host and port. + GetApplicationReportRequest request1 = + GetApplicationReportRequest.newInstance(app2.getApplicationId()); + ApplicationReport report1 = + rm1.getClientRMService().getApplicationReport(request1) + .getApplicationReport(); + Assert.assertEquals("N/A", report1.getHost()); + Assert.assertEquals(-1, report1.getRpcPort()); + } + + private MockAM launchAM(RMApp app, MockRM rm, MockNM nm) + throws Exception { + RMAppAttempt attempt = app.getCurrentAppAttempt(); + nm.nodeHeartbeat(true); + MockAM am = rm.sendAMLaunched(attempt.getAppAttemptId()); + am.registerAppAttempt(); + rm.waitForState(app.getApplicationId(), RMAppState.RUNNING); + return am; + } + + private void finishApplicationMaster(RMApp rmApp, MockRM rm, MockNM nm, + MockAM am) throws Exception { + FinishApplicationMasterRequest req = + FinishApplicationMasterRequest.newInstance( + FinalApplicationStatus.SUCCEEDED, "", ""); + am.unregisterAppAttempt(req); + am.waitForState(RMAppAttemptState.FINISHING); + nm.nodeHeartbeat(am.getApplicationAttemptId(), 1, ContainerState.COMPLETE); + am.waitForState(RMAppAttemptState.FINISHED); + rm.waitForState(rmApp.getApplicationId(), RMAppState.FINISHED); + } + public static void main(String[] args) throws Exception { TestRM t = new TestRM(); t.testGetNewAppId(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/TestRMAppAttemptTransitions.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/TestRMAppAttemptTransitions.java index 6c72d97080d..0ad2f2a0370 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/TestRMAppAttemptTransitions.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/TestRMAppAttemptTransitions.java @@ -806,6 +806,7 @@ public void testRunningToFailed() { applicationAttempt.getAppAttemptId().getApplicationId()); assertEquals(rmAppPageUrl, applicationAttempt.getOriginalTrackingUrl()); assertEquals(rmAppPageUrl, applicationAttempt.getTrackingUrl()); + 
verifyAMHostAndPortInvalidated(); } @Test @@ -841,6 +842,7 @@ public void testRunningToKilled() { assertEquals(rmAppPageUrl, applicationAttempt.getOriginalTrackingUrl()); assertEquals(rmAppPageUrl, applicationAttempt.getTrackingUrl()); verifyTokenCount(applicationAttempt.getAppAttemptId(), 1); + verifyAMHostAndPortInvalidated(); } @Test(timeout=10000) @@ -878,6 +880,7 @@ public void testRunningExpire() { assertEquals(rmAppPageUrl, applicationAttempt.getOriginalTrackingUrl()); assertEquals(rmAppPageUrl, applicationAttempt.getTrackingUrl()); verifyTokenCount(applicationAttempt.getAppAttemptId(), 1); + verifyAMHostAndPortInvalidated(); } @Test @@ -1125,4 +1128,9 @@ private void verifyAttemptFinalStateSaved() { verify(store, times(1)).updateApplicationAttemptState( any(ApplicationAttemptState.class)); } + + private void verifyAMHostAndPortInvalidated() { + assertEquals("N/A", applicationAttempt.getHost()); + assertEquals(-1, applicationAttempt.getRpcPort()); + } } From 1152e23ed03e8831a3167a729503aad3cbcb4ee7 Mon Sep 17 00:00:00 2001 From: Karthik Kambatla Date: Sat, 28 Dec 2013 06:55:30 +0000 Subject: [PATCH 07/42] YARN-1463. Tests should avoid starting http-server where possible or creates spnego keytab/principals (vinodkv via kasha) git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1553790 13f79535-47bb-0310-9956-ffa450edef68 --- .../mapreduce/v2/hs/HistoryClientService.java | 5 ++- .../mapreduce/v2/hs/JobHistoryServer.java | 15 ++++++-- .../mapreduce/security/TestJHSSecurity.java | 12 ++++++ hadoop-yarn-project/CHANGES.txt | 3 ++ .../apache/hadoop/yarn/webapp/WebApps.java | 4 +- .../hadoop-yarn-server-tests/pom.xml | 5 +++ .../hadoop/yarn/server/MiniYARNCluster.java | 10 +++-- .../server/TestContainerManagerSecurity.java | 37 +++++++++++++++++-- .../yarn/server/TestRMNMSecretKeys.java | 4 ++ 9 files changed, 82 insertions(+), 13 deletions(-) diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryClientService.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryClientService.java index f27124cdf23..7bdb7dbd431 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryClientService.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryClientService.java @@ -88,6 +88,8 @@ import org.apache.hadoop.yarn.webapp.WebApp; import org.apache.hadoop.yarn.webapp.WebApps; +import com.google.common.annotations.VisibleForTesting; + /** * This module is responsible for talking to the * JobClient (user facing). 
@@ -142,7 +144,8 @@ protected void serviceStart() throws Exception { super.serviceStart(); } - private void initializeWebApp(Configuration conf) { + @VisibleForTesting + protected void initializeWebApp(Configuration conf) { webApp = new HsWebApp(history); InetSocketAddress bindAddress = MRWebAppUtil.getJHSWebBindAddress(conf); // NOTE: there should be a .at(InetSocketAddress) diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/JobHistoryServer.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/JobHistoryServer.java index 4fc84c96fae..9c92bed6acb 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/JobHistoryServer.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/JobHistoryServer.java @@ -45,6 +45,8 @@ import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; import org.apache.hadoop.yarn.logaggregation.AggregatedLogDeletionService; +import com.google.common.annotations.VisibleForTesting; + /****************************************************************** * {@link JobHistoryServer} is responsible for servicing all job history * related requests from client. @@ -60,10 +62,10 @@ public class JobHistoryServer extends CompositeService { public static final long historyServerTimeStamp = System.currentTimeMillis(); private static final Log LOG = LogFactory.getLog(JobHistoryServer.class); - private HistoryContext historyContext; + protected HistoryContext historyContext; private HistoryClientService clientService; private JobHistory jobHistoryService; - private JHSDelegationTokenSecretManager jhsDTSecretManager; + protected JHSDelegationTokenSecretManager jhsDTSecretManager; private AggregatedLogDeletionService aggLogDelService; private HSAdminServer hsAdminServer; private HistoryServerStateStoreService stateStore; @@ -129,8 +131,7 @@ protected void serviceInit(Configuration conf) throws Exception { historyContext = (HistoryContext)jobHistoryService; stateStore = createStateStore(conf); this.jhsDTSecretManager = createJHSSecretManager(conf, stateStore); - clientService = new HistoryClientService(historyContext, - this.jhsDTSecretManager); + clientService = createHistoryClientService(); aggLogDelService = new AggregatedLogDeletionService(); hsAdminServer = new HSAdminServer(aggLogDelService, jobHistoryService); addService(stateStore); @@ -142,6 +143,12 @@ protected void serviceInit(Configuration conf) throws Exception { super.serviceInit(config); } + @VisibleForTesting + protected HistoryClientService createHistoryClientService() { + return new HistoryClientService(historyContext, + this.jhsDTSecretManager); + } + protected JHSDelegationTokenSecretManager createJHSSecretManager( Configuration conf, HistoryServerStateStoreService store) { long secretKeyInterval = diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/security/TestJHSSecurity.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/security/TestJHSSecurity.java index f3b9821f85d..e159652653c 100644 --- 
a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/security/TestJHSSecurity.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/security/TestJHSSecurity.java @@ -39,6 +39,7 @@ import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetDelegationTokenRequest; import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetJobReportRequest; import org.apache.hadoop.mapreduce.v2.api.protocolrecords.RenewDelegationTokenRequest; +import org.apache.hadoop.mapreduce.v2.hs.HistoryClientService; import org.apache.hadoop.mapreduce.v2.hs.HistoryServerStateStoreService; import org.apache.hadoop.mapreduce.v2.hs.JHSDelegationTokenSecretManager; import org.apache.hadoop.mapreduce.v2.hs.JobHistoryServer; @@ -94,6 +95,17 @@ protected JHSDelegationTokenSecretManager createJHSSecretManager( return new JHSDelegationTokenSecretManager(initialInterval, maxLifetime, renewInterval, 3600000, store); } + + @Override + protected HistoryClientService createHistoryClientService() { + return new HistoryClientService(historyContext, + this.jhsDTSecretManager) { + @Override + protected void initializeWebApp(Configuration conf) { + // Don't need it, skip.; + } + }; + } }; // final JobHistoryServer jobHistoryServer = jhServer; jobHistoryServer.init(conf); diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index f593ac8a895..491277dd275 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -483,6 +483,9 @@ Release 2.2.0 - 2013-10-13 YARN-1278. Fixed NodeManager to not delete local resources for apps on resync command from RM - a bug caused by YARN-1149. (Hitesh Shah via vinodkv) + YARN-1463. 
Tests should avoid starting http-server where possible or creates + spnego keytab/principals (vinodkv via kasha) + Release 2.1.1-beta - 2013-09-23 INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/webapp/WebApps.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/webapp/WebApps.java index 7f70d98ded8..e0a37ea6dac 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/webapp/WebApps.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/webapp/WebApps.java @@ -225,8 +225,8 @@ public void setup() { boolean hasSpnegoConf = spnegoPrincipalKey != null && spnegoKeytabKey != null; if (hasSpnegoConf) { - builder.setUsernameConfKey(conf.get(spnegoPrincipalKey)) - .setKeytabConfKey(conf.get(spnegoKeytabKey)) + builder.setUsernameConfKey(spnegoPrincipalKey) + .setKeytabConfKey(spnegoKeytabKey) .setSecurityEnabled(UserGroupInformation.isSecurityEnabled()); } HttpServer server = builder.build(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/pom.xml index 600c647f9f7..330a5685c1e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/pom.xml @@ -50,6 +50,11 @@ test-jar test + + org.apache.hadoop + hadoop-minikdc + test + diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/MiniYARNCluster.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/MiniYARNCluster.java index 78bbea43852..60f1e079f5b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/MiniYARNCluster.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/MiniYARNCluster.java @@ -26,7 +26,6 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; -import com.google.common.annotations.VisibleForTesting; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.classification.InterfaceAudience; @@ -67,7 +66,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptUnregistrationEvent; import org.apache.hadoop.yarn.webapp.util.WebAppUtils; -import static org.junit.Assert.fail; +import com.google.common.annotations.VisibleForTesting; /** * Embedded Yarn minicluster for testcases that need to interact with a cluster. @@ -213,7 +212,12 @@ public void serviceInit(Configuration conf) throws Exception { } for (int i = 0; i < resourceManagers.length; i++) { - resourceManagers[i] = new ResourceManager(); + resourceManagers[i] = new ResourceManager() { + @Override + protected void doSecureLogin() throws IOException { + // Don't try to login using keytab in the testcases. 
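+          // MiniYARNCluster tests run without a real RM keytab; secure tests
+          // that need SPNEGO create a principal/keytab with the test KDC and
+          // wire it in through the web-app configuration keys instead.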
+ } + }; addService(new ResourceManagerWrapper(i)); } for(int index = 0; index < nodeManagers.length; index++) { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestContainerManagerSecurity.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestContainerManagerSecurity.java index 0a62f36ef7b..34582a7e667 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestContainerManagerSecurity.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestContainerManagerSecurity.java @@ -20,6 +20,7 @@ import static org.junit.Assert.fail; +import java.io.File; import java.io.IOException; import java.net.InetSocketAddress; import java.security.PrivilegedAction; @@ -34,6 +35,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.CommonConfigurationKeysPublic; +import org.apache.hadoop.minikdc.KerberosSecurityTestcase; import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.token.SecretManager.InvalidToken; @@ -66,21 +68,40 @@ import org.apache.hadoop.yarn.server.resourcemanager.security.RMContainerTokenSecretManager; import org.apache.hadoop.yarn.util.ConverterUtils; import org.apache.hadoop.yarn.util.Records; +import org.junit.After; +import org.junit.Before; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; import org.junit.runners.Parameterized.Parameters; @RunWith(Parameterized.class) -public class TestContainerManagerSecurity { +public class TestContainerManagerSecurity extends KerberosSecurityTestcase { static Log LOG = LogFactory.getLog(TestContainerManagerSecurity.class); static final RecordFactory recordFactory = RecordFactoryProvider .getRecordFactory(null); private static MiniYARNCluster yarnCluster; + private static final File testRootDir = new File("target", + TestContainerManagerSecurity.class.getName() + "-root"); + private static File httpSpnegoKeytabFile = new File(testRootDir, + "httpSpnegoKeytabFile.keytab"); + private static String httpSpnegoPrincipal = "HTTP/localhost@EXAMPLE.COM"; private Configuration conf; + @Before + public void setUp() throws Exception { + testRootDir.mkdirs(); + httpSpnegoKeytabFile.deleteOnExit(); + getKdc().createPrincipal(httpSpnegoKeytabFile, httpSpnegoPrincipal); + } + + @After + public void tearDown() { + testRootDir.delete(); + } + @Parameters public static Collection configs() { Configuration configurationWithoutSecurity = new Configuration(); @@ -89,8 +110,18 @@ public static Collection configs() { Configuration configurationWithSecurity = new Configuration(); configurationWithSecurity.set( - CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION, - "kerberos"); + CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION, "kerberos"); + configurationWithSecurity.set( + YarnConfiguration.RM_WEBAPP_SPNEGO_USER_NAME_KEY, httpSpnegoPrincipal); + configurationWithSecurity.set( + YarnConfiguration.RM_WEBAPP_SPNEGO_KEYTAB_FILE_KEY, + httpSpnegoKeytabFile.getAbsolutePath()); + configurationWithSecurity.set( + YarnConfiguration.NM_WEBAPP_SPNEGO_USER_NAME_KEY, httpSpnegoPrincipal); + configurationWithSecurity.set( + 
YarnConfiguration.NM_WEBAPP_SPNEGO_KEYTAB_FILE_KEY, + httpSpnegoKeytabFile.getAbsolutePath()); + return Arrays.asList(new Object[][] { { configurationWithoutSecurity }, { configurationWithSecurity } }); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestRMNMSecretKeys.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestRMNMSecretKeys.java index 56784a54dc5..dbeb49d8f6a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestRMNMSecretKeys.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestRMNMSecretKeys.java @@ -64,6 +64,10 @@ protected void doSecureLogin() throws IOException { protected Dispatcher createDispatcher() { return dispatcher; } + @Override + protected void startWepApp() { + // Don't need it, skip. + } }; rm.init(conf); rm.start(); From cc4c74be09ed54e7541b7e2ae8cebc42c2c130fe Mon Sep 17 00:00:00 2001 From: Vinod Kumar Vavilapalli Date: Sat, 28 Dec 2013 17:51:03 +0000 Subject: [PATCH 08/42] MAPREDUCE-5694. Fixed MR AppMaster to shutdown the LogManager so as to avoid losing syslog in some conditions. Contributed by Mohammad Kamrul Islam. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1553879 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-mapreduce-project/CHANGES.txt | 3 +++ .../java/org/apache/hadoop/mapreduce/v2/app/MRAppMaster.java | 3 +++ 2 files changed, 6 insertions(+) diff --git a/hadoop-mapreduce-project/CHANGES.txt b/hadoop-mapreduce-project/CHANGES.txt index 4dcc7f37d9b..b9d167b77a3 100644 --- a/hadoop-mapreduce-project/CHANGES.txt +++ b/hadoop-mapreduce-project/CHANGES.txt @@ -258,6 +258,9 @@ Release 2.4.0 - UNRELEASED MAPREDUCE-5687. Fixed failure in TestYARNRunner caused by YARN-1446. (Jian He via vinodkv) + MAPREDUCE-5694. Fixed MR AppMaster to shutdown the LogManager so as to avoid + losing syslog in some conditions. (Mohammad Kamrul Islam via vinodkv) + Release 2.3.0 - UNRELEASED INCOMPATIBLE CHANGES diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/MRAppMaster.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/MRAppMaster.java index ca6aadfb1cc..6fafdb5aa23 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/MRAppMaster.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/MRAppMaster.java @@ -139,6 +139,7 @@ import org.apache.hadoop.yarn.util.Clock; import org.apache.hadoop.yarn.util.ConverterUtils; import org.apache.hadoop.yarn.util.SystemClock; +import org.apache.log4j.LogManager; import com.google.common.annotations.VisibleForTesting; @@ -1395,6 +1396,8 @@ public static void main(String[] args) { } catch (Throwable t) { LOG.fatal("Error starting MRAppMaster", t); System.exit(1); + } finally { + LogManager.shutdown(); } } From 47cca0cb6d1f4e5979d11d9a624b005e6e666f2f Mon Sep 17 00:00:00 2001 From: Christopher Douglas Date: Sat, 28 Dec 2013 21:58:33 +0000 Subject: [PATCH 09/42] MAPREDUCE-5196. Add bookkeeping for managing checkpoints of task state. 
Contributed by Carlo Curino git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1553939 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-mapreduce-project/CHANGES.txt | 3 + .../mapred/TaskAttemptListenerImpl.java | 76 +++- .../app/job/event/TaskAttemptEventType.java | 1 + .../v2/app/job/impl/TaskAttemptImpl.java | 25 ++ .../v2/app/rm/RMContainerAllocator.java | 2 +- .../app/rm/preemption/AMPreemptionPolicy.java | 13 +- .../CheckpointAMPreemptionPolicy.java | 290 +++++++++++++++ .../rm/preemption/KillAMPreemptionPolicy.java | 9 +- .../rm/preemption/NoopAMPreemptionPolicy.java | 9 +- .../mapred/TestTaskAttemptListenerImpl.java | 157 +++++++-- .../app/TestCheckpointPreemptionPolicy.java | 329 ++++++++++++++++++ .../apache/hadoop/mapred/LocalJobRunner.java | 42 ++- .../org/apache/hadoop/mapred/AMFeedback.java | 63 ++++ .../java/org/apache/hadoop/mapred/Task.java | 38 +- .../org/apache/hadoop/mapred/TaskStatus.java | 2 +- .../hadoop/mapred/TaskUmbilicalProtocol.java | 45 ++- .../checkpoint/TaskCheckpointID.java | 47 ++- .../apache/hadoop/mapred/TestMapProgress.java | 32 +- .../apache/hadoop/mapred/TestTaskCommit.java | 31 +- .../TestUmbilicalProtocolWithJobToken.java | 2 +- 20 files changed, 1098 insertions(+), 118 deletions(-) create mode 100644 hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/preemption/CheckpointAMPreemptionPolicy.java create mode 100644 hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/TestCheckpointPreemptionPolicy.java create mode 100644 hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/AMFeedback.java diff --git a/hadoop-mapreduce-project/CHANGES.txt b/hadoop-mapreduce-project/CHANGES.txt index b9d167b77a3..c7aace0597f 100644 --- a/hadoop-mapreduce-project/CHANGES.txt +++ b/hadoop-mapreduce-project/CHANGES.txt @@ -77,6 +77,9 @@ Trunk (Unreleased) MAPREDUCE-5189. Add policies and wiring to respond to preemption requests from YARN. (Carlo Curino via cdouglas) + MAPREDUCE-5196. Add bookkeeping for managing checkpoints of task state. + (Carlo Curino via cdouglas) + BUG FIXES MAPREDUCE-4272. SortedRanges.Range#compareTo is not spec compliant. 
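One core piece of this change is the task-umbilical heartbeat: the old ping()/boolean statusUpdate() pair is collapsed into a single statusUpdate() that returns an AMFeedback record, and the preemption bit on that record is how the AM asks a running attempt to checkpoint and exit. The following is a condensed, illustrative sketch of the AM-side handler, simplified from the TaskAttemptListenerImpl diff below; status processing and error handling are omitted, and the local variable names are not from the patch:

    // Sketch (condensed): statusUpdate now doubles as the old ping() and
    // returns AMFeedback instead of a boolean.
    public AMFeedback statusUpdate(TaskAttemptID id, TaskStatus status)
        throws IOException, InterruptedException {
      TaskAttemptId yarnId = TypeConverter.toYarn(id);
      AMFeedback feedback = new AMFeedback();
      feedback.setTaskFound(true);
      // Ask the attempt to checkpoint and exit only when the job opted in.
      if (getConfig().getBoolean(MRJobConfig.TASK_PREEMPTION, false)
          && preemptionPolicy.isPreempted(yarnId)) {
        feedback.setPreemption(true);
      }
      taskHeartbeatHandler.progressing(yarnId);
      // A null status is treated as a plain ping; otherwise the status is
      // forwarded as a TaskAttemptStatusUpdateEvent (omitted here).
      return feedback;
    }
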
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapred/TaskAttemptListenerImpl.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapred/TaskAttemptListenerImpl.java index 8af7e379873..9f4b9c7ec63 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapred/TaskAttemptListenerImpl.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapred/TaskAttemptListenerImpl.java @@ -36,7 +36,9 @@ import org.apache.hadoop.mapred.SortedRanges.Range; import org.apache.hadoop.mapreduce.MRJobConfig; import org.apache.hadoop.mapreduce.TypeConverter; +import org.apache.hadoop.mapreduce.checkpoint.TaskCheckpointID; import org.apache.hadoop.mapreduce.security.token.JobTokenSecretManager; +import org.apache.hadoop.mapreduce.v2.api.records.TaskId; import org.apache.hadoop.mapreduce.v2.app.AppContext; import org.apache.hadoop.mapreduce.v2.app.TaskAttemptListener; import org.apache.hadoop.mapreduce.v2.app.TaskHeartbeatHandler; @@ -45,8 +47,8 @@ import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptDiagnosticsUpdateEvent; import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEvent; import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEventType; -import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptStatusUpdateEvent; import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptStatusUpdateEvent.TaskAttemptStatus; +import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptStatusUpdateEvent; import org.apache.hadoop.mapreduce.v2.app.rm.RMHeartbeatHandler; import org.apache.hadoop.mapreduce.v2.app.rm.preemption.AMPreemptionPolicy; import org.apache.hadoop.mapreduce.v2.app.security.authorize.MRAMPolicyProvider; @@ -228,6 +230,22 @@ public void commitPending(TaskAttemptID taskAttemptID, TaskStatus taskStatsu) TaskAttemptEventType.TA_COMMIT_PENDING)); } + @Override + public void preempted(TaskAttemptID taskAttemptID, TaskStatus taskStatus) + throws IOException, InterruptedException { + LOG.info("Preempted state update from " + taskAttemptID.toString()); + // An attempt is telling us that it got preempted. 
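+    // Record the preemption with the policy, keep the attempt's heartbeat
+    // alive, and drive the attempt state machine with TA_PREEMPTED; the
+    // RUNNING state maps that event to a kill (PreemptedTransition) rather
+    // than a failure.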
+ org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId attemptID = + TypeConverter.toYarn(taskAttemptID); + + preemptionPolicy.reportSuccessfulPreemption(attemptID); + taskHeartbeatHandler.progressing(attemptID); + + context.getEventHandler().handle( + new TaskAttemptEvent(attemptID, + TaskAttemptEventType.TA_PREEMPTED)); + } + @Override public void done(TaskAttemptID taskAttemptID) throws IOException { LOG.info("Done acknowledgement from " + taskAttemptID.toString()); @@ -250,6 +268,10 @@ public void fatalError(TaskAttemptID taskAttemptID, String msg) org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId attemptID = TypeConverter.toYarn(taskAttemptID); + + // handling checkpoints + preemptionPolicy.handleFailedContainer(attemptID); + context.getEventHandler().handle( new TaskAttemptEvent(attemptID, TaskAttemptEventType.TA_FAILMSG)); } @@ -264,6 +286,10 @@ public void fsError(TaskAttemptID taskAttemptID, String message) org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId attemptID = TypeConverter.toYarn(taskAttemptID); + + // handling checkpoints + preemptionPolicy.handleFailedContainer(attemptID); + context.getEventHandler().handle( new TaskAttemptEvent(attemptID, TaskAttemptEventType.TA_FAILMSG)); } @@ -293,12 +319,6 @@ public MapTaskCompletionEventsUpdate getMapCompletionEvents( return new MapTaskCompletionEventsUpdate(events, shouldReset); } - @Override - public boolean ping(TaskAttemptID taskAttemptID) throws IOException { - LOG.info("Ping from " + taskAttemptID.toString()); - return true; - } - @Override public void reportDiagnosticInfo(TaskAttemptID taskAttemptID, String diagnosticInfo) throws IOException { @@ -321,11 +341,33 @@ public void reportDiagnosticInfo(TaskAttemptID taskAttemptID, String diagnosticI } @Override - public boolean statusUpdate(TaskAttemptID taskAttemptID, + public AMFeedback statusUpdate(TaskAttemptID taskAttemptID, TaskStatus taskStatus) throws IOException, InterruptedException { - LOG.info("Status update from " + taskAttemptID.toString()); + org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId yarnAttemptID = TypeConverter.toYarn(taskAttemptID); + + AMFeedback feedback = new AMFeedback(); + feedback.setTaskFound(true); + + // Propagating preemption to the task if TASK_PREEMPTION is enabled + if (getConfig().getBoolean(MRJobConfig.TASK_PREEMPTION, false) + && preemptionPolicy.isPreempted(yarnAttemptID)) { + feedback.setPreemption(true); + LOG.info("Setting preemption bit for task: "+ yarnAttemptID + + " of type " + yarnAttemptID.getTaskId().getTaskType()); + } + + if (taskStatus == null) { + //We are using statusUpdate only as a simple ping + LOG.info("Ping from " + taskAttemptID.toString()); + taskHeartbeatHandler.progressing(yarnAttemptID); + return feedback; + } + + // if we are here there is an actual status update to be processed + LOG.info("Status update from " + taskAttemptID.toString()); + taskHeartbeatHandler.progressing(yarnAttemptID); TaskAttemptStatus taskAttemptStatus = new TaskAttemptStatus(); @@ -386,7 +428,7 @@ public boolean statusUpdate(TaskAttemptID taskAttemptID, context.getEventHandler().handle( new TaskAttemptStatusUpdateEvent(taskAttemptStatus.id, taskAttemptStatus)); - return true; + return feedback; } @Override @@ -494,4 +536,18 @@ public ProtocolSignature getProtocolSignature(String protocol, return ProtocolSignature.getProtocolSignature(this, protocol, clientVersion, clientMethodsHash); } + + // task checkpoint bookeeping + @Override + public TaskCheckpointID getCheckpointID(TaskID taskId) { + TaskId tid = 
TypeConverter.toYarn(taskId); + return preemptionPolicy.getCheckpointID(tid); + } + + @Override + public void setCheckpointID(TaskID taskId, TaskCheckpointID cid) { + TaskId tid = TypeConverter.toYarn(taskId); + preemptionPolicy.setCheckpointID(tid, cid); + } + } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/event/TaskAttemptEventType.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/event/TaskAttemptEventType.java index a43263264e9..1f05ac30aaf 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/event/TaskAttemptEventType.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/event/TaskAttemptEventType.java @@ -47,6 +47,7 @@ public enum TaskAttemptEventType { TA_FAILMSG, TA_UPDATE, TA_TIMED_OUT, + TA_PREEMPTED, //Producer:TaskCleaner TA_CLEANUP_DONE, diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TaskAttemptImpl.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TaskAttemptImpl.java index 5e14ce1cb52..37c5064b182 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TaskAttemptImpl.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TaskAttemptImpl.java @@ -304,6 +304,9 @@ TaskAttemptEventType.TA_CONTAINER_CLEANED, new KilledTransition()) .addTransition(TaskAttemptStateInternal.RUNNING, TaskAttemptStateInternal.KILL_CONTAINER_CLEANUP, TaskAttemptEventType.TA_KILL, CLEANUP_CONTAINER_TRANSITION) + .addTransition(TaskAttemptStateInternal.RUNNING, + TaskAttemptStateInternal.KILLED, + TaskAttemptEventType.TA_PREEMPTED, new PreemptedTransition()) // Transitions from COMMIT_PENDING state .addTransition(TaskAttemptStateInternal.COMMIT_PENDING, @@ -437,6 +440,7 @@ TaskAttemptEventType.TA_CONTAINER_CLEANED, new TaskCleanupTransition()) TaskAttemptEventType.TA_DONE, TaskAttemptEventType.TA_FAILMSG, TaskAttemptEventType.TA_CONTAINER_CLEANED, + TaskAttemptEventType.TA_PREEMPTED, // Container launch events can arrive late TaskAttemptEventType.TA_CONTAINER_LAUNCHED, TaskAttemptEventType.TA_CONTAINER_LAUNCH_FAILED)) @@ -1874,6 +1878,27 @@ public void transition(TaskAttemptImpl taskAttempt, } } + private static class PreemptedTransition implements + SingleArcTransition { + @SuppressWarnings("unchecked") + @Override + public void transition(TaskAttemptImpl taskAttempt, + TaskAttemptEvent event) { + taskAttempt.setFinishTime(); + taskAttempt.taskAttemptListener.unregister( + taskAttempt.attemptId, taskAttempt.jvmID); + taskAttempt.eventHandler.handle(new ContainerLauncherEvent( + taskAttempt.attemptId, + taskAttempt.getAssignedContainerID(), taskAttempt.getAssignedContainerMgrAddress(), + taskAttempt.container.getContainerToken(), + ContainerLauncher.EventType.CONTAINER_REMOTE_CLEANUP)); + taskAttempt.eventHandler.handle(new TaskTAttemptEvent( + taskAttempt.attemptId, + TaskEventType.T_ATTEMPT_KILLED)); + + } + } + private static class CleanupContainerTransition implements 
SingleArcTransition { @SuppressWarnings("unchecked") diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerAllocator.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerAllocator.java index dd739f2b7c3..18491fdbf1d 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerAllocator.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerAllocator.java @@ -347,7 +347,7 @@ protected synchronized void handleEvent(ContainerAllocatorEvent event) { } } else if ( - event.getType() == ContainerAllocator.EventType.CONTAINER_DEALLOCATE) { + event.getType() == ContainerAllocator.EventType.CONTAINER_DEALLOCATE) { LOG.info("Processing the event " + event.toString()); diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/preemption/AMPreemptionPolicy.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/preemption/AMPreemptionPolicy.java index 0bbe75bdea3..85211f958d6 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/preemption/AMPreemptionPolicy.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/preemption/AMPreemptionPolicy.java @@ -19,10 +19,9 @@ import java.util.List; -import org.apache.hadoop.mapred.TaskAttemptID; -import org.apache.hadoop.mapred.TaskID; import org.apache.hadoop.mapreduce.checkpoint.TaskCheckpointID; import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId; +import org.apache.hadoop.mapreduce.v2.api.records.TaskId; import org.apache.hadoop.mapreduce.v2.api.records.TaskType; import org.apache.hadoop.mapreduce.v2.app.AppContext; import org.apache.hadoop.yarn.api.records.Container; @@ -81,7 +80,7 @@ public abstract class Context { * successfully preempted (for bookeeping, counters, etc..) * @param attemptID Task attempt that preempted */ - public void reportSuccessfulPreemption(TaskAttemptID attemptID); + public void reportSuccessfulPreemption(TaskAttemptId attemptID); /** * Callback informing the policy of containers exiting with a failure. This @@ -98,20 +97,20 @@ public abstract class Context { public void handleCompletedContainer(TaskAttemptId attemptID); /** - * Method to retrieve the latest checkpoint for a given {@link TaskID} + * Method to retrieve the latest checkpoint for a given {@link TaskId} * @param taskId TaskID * @return CheckpointID associated with this task or null */ - public TaskCheckpointID getCheckpointID(TaskID taskId); + public TaskCheckpointID getCheckpointID(TaskId taskId); /** * Method to store the latest {@link * org.apache.hadoop.mapreduce.checkpoint.CheckpointID} for a given {@link - * TaskID}. Assigning a null is akin to remove all previous checkpoints for + * TaskId}. Assigning a null is akin to remove all previous checkpoints for * this task. * @param taskId TaskID * @param cid Checkpoint to assign or null to remove it. 
*/ - public void setCheckpointID(TaskID taskId, TaskCheckpointID cid); + public void setCheckpointID(TaskId taskId, TaskCheckpointID cid); } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/preemption/CheckpointAMPreemptionPolicy.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/preemption/CheckpointAMPreemptionPolicy.java new file mode 100644 index 00000000000..57e0bce1ff7 --- /dev/null +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/preemption/CheckpointAMPreemptionPolicy.java @@ -0,0 +1,290 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.mapreduce.v2.app.rm.preemption; + +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.mapreduce.JobCounter; +import org.apache.hadoop.mapreduce.checkpoint.TaskCheckpointID; +import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId; +import org.apache.hadoop.mapreduce.v2.api.records.TaskId; +import org.apache.hadoop.mapreduce.v2.api.records.TaskType; +import org.apache.hadoop.mapreduce.v2.app.AppContext; +import org.apache.hadoop.mapreduce.v2.app.job.event.JobCounterUpdateEvent; +import org.apache.hadoop.yarn.api.records.Container; +import org.apache.hadoop.yarn.api.records.ContainerId; +import org.apache.hadoop.yarn.api.records.PreemptionContainer; +import org.apache.hadoop.yarn.api.records.PreemptionContract; +import org.apache.hadoop.yarn.api.records.PreemptionMessage; +import org.apache.hadoop.yarn.api.records.PreemptionResourceRequest; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.api.records.ResourceRequest; +import org.apache.hadoop.yarn.api.records.StrictPreemptionContract; +import org.apache.hadoop.yarn.event.EventHandler; + +/** + * This policy works in combination with an implementation of task + * checkpointing. It computes the tasks to be preempted in response to the RM + * request for preemption. For strict requests, it maps containers to + * corresponding tasks; for fungible requests, it attempts to pick the best + * containers to preempt (reducers in reverse allocation order). The + * TaskAttemptListener will interrogate this policy when handling a task + * heartbeat to check whether the task should be preempted or not. 
When handling + * fungible requests, the policy discount the RM ask by the amount of currently + * in-flight preemptions (i.e., tasks that are checkpointing). + * + * This class it is also used to maintain the list of checkpoints for existing + * tasks. Centralizing this functionality here, allows us to have visibility on + * preemption and checkpoints in a single location, thus coordinating preemption + * and checkpoint management decisions in a single policy. + */ +public class CheckpointAMPreemptionPolicy implements AMPreemptionPolicy { + + // task attempts flagged for preemption + private final Set toBePreempted; + + private final Set countedPreemptions; + + private final Map checkpoints; + + private final Map pendingFlexiblePreemptions; + + @SuppressWarnings("rawtypes") + private EventHandler eventHandler; + + static final Log LOG = LogFactory + .getLog(CheckpointAMPreemptionPolicy.class); + + public CheckpointAMPreemptionPolicy() { + this(Collections.synchronizedSet(new HashSet()), + Collections.synchronizedSet(new HashSet()), + Collections.synchronizedMap(new HashMap()), + Collections.synchronizedMap(new HashMap())); + } + + CheckpointAMPreemptionPolicy(Set toBePreempted, + Set countedPreemptions, + Map checkpoints, + Map pendingFlexiblePreemptions) { + this.toBePreempted = toBePreempted; + this.countedPreemptions = countedPreemptions; + this.checkpoints = checkpoints; + this.pendingFlexiblePreemptions = pendingFlexiblePreemptions; + } + + @Override + public void init(AppContext context) { + this.eventHandler = context.getEventHandler(); + } + + @Override + public void preempt(Context ctxt, PreemptionMessage preemptionRequests) { + + if (preemptionRequests != null) { + + // handling non-negotiable preemption + + StrictPreemptionContract cStrict = preemptionRequests.getStrictContract(); + if (cStrict != null + && cStrict.getContainers() != null + && cStrict.getContainers().size() > 0) { + LOG.info("strict preemption :" + + preemptionRequests.getStrictContract().getContainers().size() + + " containers to kill"); + + // handle strict preemptions. These containers are non-negotiable + for (PreemptionContainer c : + preemptionRequests.getStrictContract().getContainers()) { + ContainerId reqCont = c.getId(); + TaskAttemptId reqTask = ctxt.getTaskAttempt(reqCont); + if (reqTask != null) { + // ignore requests for preempting containers running maps + if (org.apache.hadoop.mapreduce.v2.api.records.TaskType.REDUCE + .equals(reqTask.getTaskId().getTaskType())) { + toBePreempted.add(reqTask); + LOG.info("preempting " + reqCont + " running task:" + reqTask); + } else { + LOG.info("NOT preempting " + reqCont + " running task:" + reqTask); + } + } + } + } + + // handling negotiable preemption + PreemptionContract cNegot = preemptionRequests.getContract(); + if (cNegot != null + && cNegot.getResourceRequest() != null + && cNegot.getResourceRequest().size() > 0 + && cNegot.getContainers() != null + && cNegot.getContainers().size() > 0) { + + LOG.info("negotiable preemption :" + + preemptionRequests.getContract().getResourceRequest().size() + + " resourceReq, " + + preemptionRequests.getContract().getContainers().size() + + " containers"); + // handle fungible preemption. Here we only look at the total amount of + // resources to be preempted and pick enough of our containers to + // satisfy that. We only support checkpointing for reducers for now. 
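+        // Worked example (illustrative): if the contract asks for 4 containers
+        // of 2048 MB (8192 MB total) and 3072 MB of reducers are already
+        // flagged and checkpointing, only 8192 - 3072 = 5120 MB of additional
+        // reducers need to be selected below.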
+ List reqResources = + preemptionRequests.getContract().getResourceRequest(); + + // compute the total amount of pending preemptions (to be discounted + // from current request) + int pendingPreemptionRam = 0; + int pendingPreemptionCores = 0; + for (Resource r : pendingFlexiblePreemptions.values()) { + pendingPreemptionRam += r.getMemory(); + pendingPreemptionCores += r.getVirtualCores(); + } + + // discount preemption request based on currently pending preemption + for (PreemptionResourceRequest rr : reqResources) { + ResourceRequest reqRsrc = rr.getResourceRequest(); + if (!ResourceRequest.ANY.equals(reqRsrc.getResourceName())) { + // For now, only respond to aggregate requests and ignore locality + continue; + } + + LOG.info("ResourceRequest:" + reqRsrc); + int reqCont = reqRsrc.getNumContainers(); + int reqMem = reqRsrc.getCapability().getMemory(); + int totalMemoryToRelease = reqCont * reqMem; + int reqCores = reqRsrc.getCapability().getVirtualCores(); + int totalCoresToRelease = reqCont * reqCores; + + // remove + if (pendingPreemptionRam > 0) { + // if goes negative we simply exit + totalMemoryToRelease -= pendingPreemptionRam; + // decrement pending resources if zero or negatve we will + // ignore it while processing next PreemptionResourceRequest + pendingPreemptionRam -= totalMemoryToRelease; + } + if (pendingPreemptionCores > 0) { + totalCoresToRelease -= pendingPreemptionCores; + pendingPreemptionCores -= totalCoresToRelease; + } + + // reverse order of allocation (for now) + List listOfCont = ctxt.getContainers(TaskType.REDUCE); + Collections.sort(listOfCont, new Comparator() { + @Override + public int compare(final Container o1, final Container o2) { + return o2.getId().getId() - o1.getId().getId(); + } + }); + + // preempt reducers first + for (Container cont : listOfCont) { + if (totalMemoryToRelease <= 0 && totalCoresToRelease<=0) { + break; + } + TaskAttemptId reduceId = ctxt.getTaskAttempt(cont.getId()); + int cMem = cont.getResource().getMemory(); + int cCores = cont.getResource().getVirtualCores(); + + if (!toBePreempted.contains(reduceId)) { + totalMemoryToRelease -= cMem; + totalCoresToRelease -= cCores; + toBePreempted.add(reduceId); + pendingFlexiblePreemptions.put(reduceId, cont.getResource()); + } + LOG.info("ResourceRequest:" + reqRsrc + " satisfied preempting " + + reduceId); + } + // if map was preemptable we would do add them to toBePreempted here + } + } + } + } + + @Override + public void handleFailedContainer(TaskAttemptId attemptID) { + toBePreempted.remove(attemptID); + checkpoints.remove(attemptID.getTaskId()); + } + + @Override + public void handleCompletedContainer(TaskAttemptId attemptID){ + LOG.info(" task completed:" + attemptID); + toBePreempted.remove(attemptID); + pendingFlexiblePreemptions.remove(attemptID); + } + + @Override + public boolean isPreempted(TaskAttemptId yarnAttemptID) { + if (toBePreempted.contains(yarnAttemptID)) { + updatePreemptionCounters(yarnAttemptID); + return true; + } + return false; + } + + @Override + public void reportSuccessfulPreemption(TaskAttemptId taskAttemptID) { + // ignore + } + + @Override + public TaskCheckpointID getCheckpointID(TaskId taskId) { + return checkpoints.get(taskId); + } + + @Override + public void setCheckpointID(TaskId taskId, TaskCheckpointID cid) { + checkpoints.put(taskId, cid); + if (cid != null) { + updateCheckpointCounters(taskId, cid); + } + } + + @SuppressWarnings({ "unchecked" }) + private void updateCheckpointCounters(TaskId taskId, TaskCheckpointID cid) { + JobCounterUpdateEvent 
jce = new JobCounterUpdateEvent(taskId.getJobId()); + jce.addCounterUpdate(JobCounter.CHECKPOINTS, 1); + eventHandler.handle(jce); + jce = new JobCounterUpdateEvent(taskId.getJobId()); + jce.addCounterUpdate(JobCounter.CHECKPOINT_BYTES, cid.getCheckpointBytes()); + eventHandler.handle(jce); + jce = new JobCounterUpdateEvent(taskId.getJobId()); + jce.addCounterUpdate(JobCounter.CHECKPOINT_TIME, cid.getCheckpointTime()); + eventHandler.handle(jce); + + } + + @SuppressWarnings({ "unchecked" }) + private void updatePreemptionCounters(TaskAttemptId yarnAttemptID) { + if (!countedPreemptions.contains(yarnAttemptID)) { + countedPreemptions.add(yarnAttemptID); + JobCounterUpdateEvent jce = new JobCounterUpdateEvent(yarnAttemptID + .getTaskId().getJobId()); + jce.addCounterUpdate(JobCounter.TASKS_REQ_PREEMPT, 1); + eventHandler.handle(jce); + } + } + +} diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/preemption/KillAMPreemptionPolicy.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/preemption/KillAMPreemptionPolicy.java index 100ef4f7af4..daf737a154c 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/preemption/KillAMPreemptionPolicy.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/preemption/KillAMPreemptionPolicy.java @@ -19,11 +19,10 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.mapred.TaskAttemptID; -import org.apache.hadoop.mapred.TaskID; import org.apache.hadoop.mapreduce.JobCounter; import org.apache.hadoop.mapreduce.checkpoint.TaskCheckpointID; import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId; +import org.apache.hadoop.mapreduce.v2.api.records.TaskId; import org.apache.hadoop.mapreduce.v2.app.AppContext; import org.apache.hadoop.mapreduce.v2.app.job.event.JobCounterUpdateEvent; import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEvent; @@ -89,17 +88,17 @@ public boolean isPreempted(TaskAttemptId yarnAttemptID) { } @Override - public void reportSuccessfulPreemption(TaskAttemptID taskAttemptID) { + public void reportSuccessfulPreemption(TaskAttemptId taskAttemptID) { // ignore } @Override - public TaskCheckpointID getCheckpointID(TaskID taskId) { + public TaskCheckpointID getCheckpointID(TaskId taskId) { return null; } @Override - public void setCheckpointID(TaskID taskId, TaskCheckpointID cid) { + public void setCheckpointID(TaskId taskId, TaskCheckpointID cid) { // ignore } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/preemption/NoopAMPreemptionPolicy.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/preemption/NoopAMPreemptionPolicy.java index 0c020aca22b..f6cc7b1d918 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/preemption/NoopAMPreemptionPolicy.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/preemption/NoopAMPreemptionPolicy.java @@ -17,10 +17,9 @@ */ package 
org.apache.hadoop.mapreduce.v2.app.rm.preemption; -import org.apache.hadoop.mapred.TaskAttemptID; -import org.apache.hadoop.mapred.TaskID; import org.apache.hadoop.mapreduce.checkpoint.TaskCheckpointID; import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId; +import org.apache.hadoop.mapreduce.v2.api.records.TaskId; import org.apache.hadoop.mapreduce.v2.app.AppContext; import org.apache.hadoop.yarn.api.records.PreemptionMessage; @@ -50,17 +49,17 @@ public boolean isPreempted(TaskAttemptId yarnAttemptID) { } @Override - public void reportSuccessfulPreemption(TaskAttemptID taskAttemptID) { + public void reportSuccessfulPreemption(TaskAttemptId taskAttemptID) { // ignore } @Override - public TaskCheckpointID getCheckpointID(TaskID taskId) { + public TaskCheckpointID getCheckpointID(TaskId taskId) { return null; } @Override - public void setCheckpointID(TaskID taskId, TaskCheckpointID cid) { + public void setCheckpointID(TaskId taskId, TaskCheckpointID cid) { // ignore } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapred/TestTaskAttemptListenerImpl.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapred/TestTaskAttemptListenerImpl.java index ba8e3d30261..6563cda9aec 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapred/TestTaskAttemptListenerImpl.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapred/TestTaskAttemptListenerImpl.java @@ -17,26 +17,23 @@ */ package org.apache.hadoop.mapred; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertNull; -import static org.junit.Assert.assertTrue; -import static org.mockito.Matchers.any; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.never; -import static org.mockito.Mockito.times; -import static org.mockito.Mockito.verify; -import static org.mockito.Mockito.when; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.Counters.Counter; +import org.apache.hadoop.mapreduce.checkpoint.EnumCounter; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; - -import junit.framework.Assert; +import java.util.List; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapreduce.MRJobConfig; import org.apache.hadoop.mapreduce.TaskType; import org.apache.hadoop.mapreduce.TypeConverter; +import org.apache.hadoop.mapreduce.checkpoint.CheckpointID; +import org.apache.hadoop.mapreduce.checkpoint.FSCheckpointID; +import org.apache.hadoop.mapreduce.checkpoint.TaskCheckpointID; import org.apache.hadoop.mapreduce.security.token.JobTokenSecretManager; import org.apache.hadoop.mapreduce.v2.api.records.JobId; import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptCompletionEvent; @@ -46,21 +43,31 @@ import org.apache.hadoop.mapreduce.v2.app.AppContext; import org.apache.hadoop.mapreduce.v2.app.TaskHeartbeatHandler; import org.apache.hadoop.mapreduce.v2.app.job.Job; +import org.apache.hadoop.mapreduce.v2.app.rm.preemption.AMPreemptionPolicy; +import org.apache.hadoop.mapreduce.v2.app.rm.preemption.CheckpointAMPreemptionPolicy; import org.apache.hadoop.mapreduce.v2.app.rm.RMHeartbeatHandler; import 
org.apache.hadoop.mapreduce.v2.util.MRBuilderUtils; +import org.apache.hadoop.yarn.event.Dispatcher; +import org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.factories.RecordFactory; import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.util.SystemClock; + import org.junit.Test; +import static org.junit.Assert.*; +import static org.mockito.Mockito.*; public class TestTaskAttemptListenerImpl { - public static class MockTaskAttemptListenerImpl extends TaskAttemptListenerImpl { + public static class MockTaskAttemptListenerImpl + extends TaskAttemptListenerImpl { public MockTaskAttemptListenerImpl(AppContext context, JobTokenSecretManager jobTokenSecretManager, RMHeartbeatHandler rmHeartbeatHandler, - TaskHeartbeatHandler hbHandler) { - super(context, jobTokenSecretManager, rmHeartbeatHandler, null); + TaskHeartbeatHandler hbHandler, + AMPreemptionPolicy policy) { + + super(context, jobTokenSecretManager, rmHeartbeatHandler, policy); this.taskHeartbeatHandler = hbHandler; } @@ -87,9 +94,16 @@ public void testGetTask() throws IOException { RMHeartbeatHandler rmHeartbeatHandler = mock(RMHeartbeatHandler.class); TaskHeartbeatHandler hbHandler = mock(TaskHeartbeatHandler.class); + Dispatcher dispatcher = mock(Dispatcher.class); + EventHandler ea = mock(EventHandler.class); + when(dispatcher.getEventHandler()).thenReturn(ea); + + when(appCtx.getEventHandler()).thenReturn(ea); + CheckpointAMPreemptionPolicy policy = new CheckpointAMPreemptionPolicy(); + policy.init(appCtx); MockTaskAttemptListenerImpl listener = new MockTaskAttemptListenerImpl(appCtx, secret, - rmHeartbeatHandler, hbHandler); + rmHeartbeatHandler, hbHandler, policy); Configuration conf = new Configuration(); listener.init(conf); listener.start(); @@ -144,7 +158,7 @@ public void testGetTask() throws IOException { assertNotNull(jvmid); try { JVMId.forName("jvm_001_002_m_004_006"); - Assert.fail(); + fail(); } catch (IllegalArgumentException e) { assertEquals(e.getMessage(), "TaskId string : jvm_001_002_m_004_006 is not properly formed"); @@ -190,8 +204,14 @@ public void testGetMapCompletionEvents() throws IOException { RMHeartbeatHandler rmHeartbeatHandler = mock(RMHeartbeatHandler.class); final TaskHeartbeatHandler hbHandler = mock(TaskHeartbeatHandler.class); - TaskAttemptListenerImpl listener = - new TaskAttemptListenerImpl(appCtx, secret, rmHeartbeatHandler, null) { + Dispatcher dispatcher = mock(Dispatcher.class); + EventHandler ea = mock(EventHandler.class); + when(dispatcher.getEventHandler()).thenReturn(ea); + when(appCtx.getEventHandler()).thenReturn(ea); + CheckpointAMPreemptionPolicy policy = new CheckpointAMPreemptionPolicy(); + policy.init(appCtx); + TaskAttemptListenerImpl listener = new TaskAttemptListenerImpl( + appCtx, secret, rmHeartbeatHandler, policy) { @Override protected void registerHeartbeatHandler(Configuration conf) { taskHeartbeatHandler = hbHandler; @@ -219,7 +239,8 @@ private static TaskAttemptCompletionEvent createTce(int eventId, isMap ? 
org.apache.hadoop.mapreduce.v2.api.records.TaskType.MAP : org.apache.hadoop.mapreduce.v2.api.records.TaskType.REDUCE); TaskAttemptId attemptId = MRBuilderUtils.newTaskAttemptId(tid, 0); - RecordFactory recordFactory = RecordFactoryProvider.getRecordFactory(null); + RecordFactory recordFactory = + RecordFactoryProvider.getRecordFactory(null); TaskAttemptCompletionEvent tce = recordFactory .newRecordInstance(TaskAttemptCompletionEvent.class); tce.setEventId(eventId); @@ -244,8 +265,14 @@ public void testCommitWindow() throws IOException { RMHeartbeatHandler rmHeartbeatHandler = mock(RMHeartbeatHandler.class); final TaskHeartbeatHandler hbHandler = mock(TaskHeartbeatHandler.class); - TaskAttemptListenerImpl listener = - new TaskAttemptListenerImpl(appCtx, secret, rmHeartbeatHandler, null) { + Dispatcher dispatcher = mock(Dispatcher.class); + EventHandler ea = mock(EventHandler.class); + when(dispatcher.getEventHandler()).thenReturn(ea); + when(appCtx.getEventHandler()).thenReturn(ea); + CheckpointAMPreemptionPolicy policy = new CheckpointAMPreemptionPolicy(); + policy.init(appCtx); + TaskAttemptListenerImpl listener = new TaskAttemptListenerImpl( + appCtx, secret, rmHeartbeatHandler, policy) { @Override protected void registerHeartbeatHandler(Configuration conf) { taskHeartbeatHandler = hbHandler; @@ -270,4 +297,88 @@ protected void registerHeartbeatHandler(Configuration conf) { listener.stop(); } + + @Test + public void testCheckpointIDTracking() + throws IOException, InterruptedException{ + + SystemClock clock = new SystemClock(); + + org.apache.hadoop.mapreduce.v2.app.job.Task mockTask = + mock(org.apache.hadoop.mapreduce.v2.app.job.Task.class); + when(mockTask.canCommit(any(TaskAttemptId.class))).thenReturn(true); + Job mockJob = mock(Job.class); + when(mockJob.getTask(any(TaskId.class))).thenReturn(mockTask); + + Dispatcher dispatcher = mock(Dispatcher.class); + EventHandler ea = mock(EventHandler.class); + when(dispatcher.getEventHandler()).thenReturn(ea); + + RMHeartbeatHandler rmHeartbeatHandler = + mock(RMHeartbeatHandler.class); + + AppContext appCtx = mock(AppContext.class); + when(appCtx.getJob(any(JobId.class))).thenReturn(mockJob); + when(appCtx.getClock()).thenReturn(clock); + when(appCtx.getEventHandler()).thenReturn(ea); + JobTokenSecretManager secret = mock(JobTokenSecretManager.class); + final TaskHeartbeatHandler hbHandler = mock(TaskHeartbeatHandler.class); + when(appCtx.getEventHandler()).thenReturn(ea); + CheckpointAMPreemptionPolicy policy = new CheckpointAMPreemptionPolicy(); + policy.init(appCtx); + TaskAttemptListenerImpl listener = new TaskAttemptListenerImpl( + appCtx, secret, rmHeartbeatHandler, policy) { + @Override + protected void registerHeartbeatHandler(Configuration conf) { + taskHeartbeatHandler = hbHandler; + } + }; + + Configuration conf = new Configuration(); + conf.setBoolean(MRJobConfig.TASK_PREEMPTION, true); + //conf.setBoolean("preemption.reduce", true); + + listener.init(conf); + listener.start(); + + TaskAttemptID tid = new TaskAttemptID("12345", 1, TaskType.REDUCE, 1, 0); + + List partialOut = new ArrayList(); + partialOut.add(new Path("/prev1")); + partialOut.add(new Path("/prev2")); + + Counters counters = mock(Counters.class); + final long CBYTES = 64L * 1024 * 1024; + final long CTIME = 4344L; + final Path CLOC = new Path("/test/1"); + Counter cbytes = mock(Counter.class); + when(cbytes.getValue()).thenReturn(CBYTES); + Counter ctime = mock(Counter.class); + when(ctime.getValue()).thenReturn(CTIME); + 
when(counters.findCounter(eq(EnumCounter.CHECKPOINT_BYTES))) + .thenReturn(cbytes); + when(counters.findCounter(eq(EnumCounter.CHECKPOINT_MS))) + .thenReturn(ctime); + + // propagating a taskstatus that contains a checkpoint id + TaskCheckpointID incid = new TaskCheckpointID(new FSCheckpointID( + CLOC), partialOut, counters); + listener.setCheckpointID( + org.apache.hadoop.mapred.TaskID.downgrade(tid.getTaskID()), incid); + + // and try to get it back + CheckpointID outcid = listener.getCheckpointID(tid.getTaskID()); + TaskCheckpointID tcid = (TaskCheckpointID) outcid; + assertEquals(CBYTES, tcid.getCheckpointBytes()); + assertEquals(CTIME, tcid.getCheckpointTime()); + assertTrue(partialOut.containsAll(tcid.getPartialCommittedOutput())); + assertTrue(tcid.getPartialCommittedOutput().containsAll(partialOut)); + + //assert it worked + assert outcid == incid; + + listener.stop(); + + } + } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/TestCheckpointPreemptionPolicy.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/TestCheckpointPreemptionPolicy.java new file mode 100644 index 00000000000..b62c1c9cd80 --- /dev/null +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/TestCheckpointPreemptionPolicy.java @@ -0,0 +1,329 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.mapreduce.v2.app; + +import org.apache.hadoop.yarn.api.records.PreemptionContract; +import org.apache.hadoop.yarn.api.records.PreemptionMessage; +import org.apache.hadoop.yarn.api.records.Priority; +import org.apache.hadoop.yarn.util.resource.Resources; + +import static org.junit.Assert.*; +import static org.mockito.Mockito.*; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.hadoop.mapred.TaskAttemptListenerImpl; +import org.apache.hadoop.mapreduce.v2.api.records.JobId; +import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId; +import org.apache.hadoop.mapreduce.v2.api.records.TaskId; +import org.apache.hadoop.mapreduce.v2.api.records.TaskType; +import org.apache.hadoop.mapreduce.v2.app.MRAppMaster.RunningAppContext; +import org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator; +import org.apache.hadoop.mapreduce.v2.app.rm.preemption.AMPreemptionPolicy; +import org.apache.hadoop.mapreduce.v2.app.rm.preemption.CheckpointAMPreemptionPolicy; +import org.apache.hadoop.mapreduce.v2.util.MRBuilderUtils; +import org.apache.hadoop.yarn.api.records.PreemptionContainer; +import org.apache.hadoop.yarn.api.records.PreemptionResourceRequest; +import org.apache.hadoop.yarn.api.records.StrictPreemptionContract; +import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; +import org.apache.hadoop.yarn.api.records.ApplicationId; +import org.apache.hadoop.yarn.api.records.Container; +import org.apache.hadoop.yarn.api.records.ContainerId; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.api.records.ResourceRequest; +import org.apache.hadoop.yarn.event.EventHandler; +import org.apache.hadoop.yarn.factories.RecordFactory; +import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Allocation; +import org.junit.Before; +import org.junit.Test; + +public class TestCheckpointPreemptionPolicy { + + TaskAttemptListenerImpl pel= null; + RMContainerAllocator r; + JobId jid; + RunningAppContext mActxt; + Set<ContainerId> preemptedContainers = new HashSet<ContainerId>(); + Map<ContainerId, TaskAttemptId> assignedContainers = + new HashMap<ContainerId, TaskAttemptId>(); + private final RecordFactory recordFactory = + RecordFactoryProvider.getRecordFactory(null); + HashMap<ContainerId, Resource> contToResourceMap = + new HashMap<ContainerId, Resource>(); + + private int minAlloc = 1024; + + @Before + @SuppressWarnings("rawtypes") // mocked generics + public void setup() { + ApplicationId appId = ApplicationId.newInstance(200, 1); + ApplicationAttemptId appAttemptId = + ApplicationAttemptId.newInstance(appId, 1); + jid = MRBuilderUtils.newJobId(appId, 1); + + mActxt = mock(RunningAppContext.class); + EventHandler ea = mock(EventHandler.class); + when(mActxt.getEventHandler()).thenReturn(ea); + for (int i = 0; i < 40; ++i) { + ContainerId cId = ContainerId.newInstance(appAttemptId, i); + if (0 == i % 7) { + preemptedContainers.add(cId); + } + TaskId tId = 0 == i % 2 + ?
MRBuilderUtils.newTaskId(jid, i / 2, TaskType.MAP) + : MRBuilderUtils.newTaskId(jid, i / 2 + 1, TaskType.REDUCE); + assignedContainers.put(cId, MRBuilderUtils.newTaskAttemptId(tId, 0)); + contToResourceMap.put(cId, Resource.newInstance(2 * minAlloc, 2)); + } + + for (Map.Entry ent : + assignedContainers.entrySet()) { + System.out.println("cont:" + ent.getKey().getId() + + " type:" + ent.getValue().getTaskId().getTaskType() + + " res:" + contToResourceMap.get(ent.getKey()).getMemory() + "MB" ); + } + } + + @Test + public void testStrictPreemptionContract() { + + final Map containers = assignedContainers; + AMPreemptionPolicy.Context mPctxt = new AMPreemptionPolicy.Context() { + @Override + public TaskAttemptId getTaskAttempt(ContainerId cId) { + return containers.get(cId); + } + @Override + public List getContainers(TaskType t) { + List p = new ArrayList(); + for (Map.Entry ent : + assignedContainers.entrySet()) { + if (ent.getValue().getTaskId().getTaskType().equals(t)) { + p.add(Container.newInstance(ent.getKey(), null, null, + contToResourceMap.get(ent.getKey()), + Priority.newInstance(0), null)); + } + } + return p; + } + }; + + PreemptionMessage pM = generatePreemptionMessage(preemptedContainers, + contToResourceMap, Resource.newInstance(1024, 1), true); + + CheckpointAMPreemptionPolicy policy = new CheckpointAMPreemptionPolicy(); + policy.init(mActxt); + policy.preempt(mPctxt, pM); + + + for (ContainerId c : preemptedContainers) { + TaskAttemptId t = assignedContainers.get(c); + if (TaskType.MAP.equals(t.getTaskId().getTaskType())) { + assert policy.isPreempted(t) == false; + } else { + assert policy.isPreempted(t); + } + } + } + + + @Test + public void testPreemptionContract() { + final Map containers = assignedContainers; + AMPreemptionPolicy.Context mPctxt = new AMPreemptionPolicy.Context() { + @Override + public TaskAttemptId getTaskAttempt(ContainerId cId) { + return containers.get(cId); + } + + @Override + public List getContainers(TaskType t) { + List p = new ArrayList(); + for (Map.Entry ent : + assignedContainers.entrySet()){ + if(ent.getValue().getTaskId().getTaskType().equals(t)){ + p.add(Container.newInstance(ent.getKey(), null, null, + contToResourceMap.get(ent.getKey()), + Priority.newInstance(0), null)); + } + } + return p; + } + }; + + PreemptionMessage pM = generatePreemptionMessage(preemptedContainers, + contToResourceMap, Resource.newInstance(minAlloc, 1), false); + + CheckpointAMPreemptionPolicy policy = new CheckpointAMPreemptionPolicy(); + policy.init(mActxt); + + int supposedMemPreemption = pM.getContract().getResourceRequest() + .get(0).getResourceRequest().getCapability().getMemory() + * pM.getContract().getResourceRequest().get(0).getResourceRequest() + .getNumContainers(); + + // first round of preemption + policy.preempt(mPctxt, pM); + List preempting = + validatePreemption(pM, policy, supposedMemPreemption); + + // redundant message + policy.preempt(mPctxt, pM); + List preempting2 = + validatePreemption(pM, policy, supposedMemPreemption); + + // check that nothing got added + assert preempting2.equals(preempting); + + // simulate 2 task completions/successful preemption + policy.handleCompletedContainer(preempting.get(0)); + policy.handleCompletedContainer(preempting.get(1)); + + // remove from assignedContainers + Iterator> it = + assignedContainers.entrySet().iterator(); + while (it.hasNext()) { + Map.Entry ent = it.next(); + if (ent.getValue().equals(preempting.get(0)) || + ent.getValue().equals(preempting.get(1))) + it.remove(); + } + + // one more 
message asking for preemption + policy.preempt(mPctxt, pM); + + // triggers preemption of 2 more containers (i.e., the preemption set changes) + List<TaskAttemptId> preempting3 = + validatePreemption(pM, policy, supposedMemPreemption); + assert preempting3.equals(preempting2) == false; + } + + private List<TaskAttemptId> validatePreemption(PreemptionMessage pM, + CheckpointAMPreemptionPolicy policy, int supposedMemPreemption) { + Resource effectivelyPreempted = Resource.newInstance(0, 0); + + List<TaskAttemptId> preempting = new ArrayList<TaskAttemptId>(); + + for (Map.Entry<ContainerId, TaskAttemptId> ent : + assignedContainers.entrySet()) { + if (policy.isPreempted(ent.getValue())) { + Resources.addTo(effectivelyPreempted,contToResourceMap.get(ent.getKey())); + // preempt only reducers + if (policy.isPreempted(ent.getValue())){ + assertEquals(TaskType.REDUCE, ent.getValue().getTaskId().getTaskType()); + preempting.add(ent.getValue()); + } + } + } + + // preempt enough + assert (effectivelyPreempted.getMemory() >= supposedMemPreemption) + : " preempted: " + effectivelyPreempted.getMemory(); + + // but do not preempt much more than needed + assert effectivelyPreempted.getMemory() <= supposedMemPreemption + minAlloc; + return preempting; + } + + private PreemptionMessage generatePreemptionMessage( + Set<ContainerId> containerToPreempt, + HashMap<ContainerId, Resource> resPerCont, + Resource minimumAllocation, boolean strict) { + + Set<ContainerId> currentContPreemption = Collections.unmodifiableSet( + new HashSet<ContainerId>(containerToPreempt)); + containerToPreempt.clear(); + Resource tot = Resource.newInstance(0, 0); + for(ContainerId c : currentContPreemption){ + Resources.addTo(tot, + resPerCont.get(c)); + } + int numCont = (int) Math.ceil(tot.getMemory() / + (double) minimumAllocation.getMemory()); + ResourceRequest rr = ResourceRequest.newInstance( + Priority.newInstance(0), ResourceRequest.ANY, + minimumAllocation, numCont); + if (strict) { + return generatePreemptionMessage(new Allocation(null, null, + currentContPreemption, null, null)); + } + return generatePreemptionMessage(new Allocation(null, null, + null, currentContPreemption, + Collections.singletonList(rr))); + } + + + private PreemptionMessage generatePreemptionMessage(Allocation allocation) { + PreemptionMessage pMsg = null; + // assemble strict preemption request + if (allocation.getStrictContainerPreemptions() != null) { + pMsg = recordFactory.newRecordInstance(PreemptionMessage.class); + StrictPreemptionContract pStrict = + recordFactory.newRecordInstance(StrictPreemptionContract.class); + Set<PreemptionContainer> pCont = new HashSet<PreemptionContainer>(); + for (ContainerId cId : allocation.getStrictContainerPreemptions()) { + PreemptionContainer pc = + recordFactory.newRecordInstance(PreemptionContainer.class); + pc.setId(cId); + pCont.add(pc); + } + pStrict.setContainers(pCont); + pMsg.setStrictContract(pStrict); + } + + // assemble negotiable preemption request + if (allocation.getResourcePreemptions() != null && + allocation.getResourcePreemptions().size() > 0 && + allocation.getContainerPreemptions() != null && + allocation.getContainerPreemptions().size() > 0) { + if (pMsg == null) { + pMsg = recordFactory.newRecordInstance(PreemptionMessage.class); + } + PreemptionContract contract = + recordFactory.newRecordInstance(PreemptionContract.class); + Set<PreemptionContainer> pCont = new HashSet<PreemptionContainer>(); + for (ContainerId cId : allocation.getContainerPreemptions()) { + PreemptionContainer pc = + recordFactory.newRecordInstance(PreemptionContainer.class); + pc.setId(cId); + pCont.add(pc); + } + List<PreemptionResourceRequest> pRes = + new ArrayList<PreemptionResourceRequest>(); + for (ResourceRequest crr : allocation.getResourcePreemptions()) { + PreemptionResourceRequest prr = +
recordFactory.newRecordInstance(PreemptionResourceRequest.class); + prr.setResourceRequest(crr); + pRes.add(prr); + } + contract.setContainers(pCont); + contract.setResourceRequest(pRes); + pMsg.setContract(contract); + } + return pMsg; + } + +} diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-common/src/main/java/org/apache/hadoop/mapred/LocalJobRunner.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-common/src/main/java/org/apache/hadoop/mapred/LocalJobRunner.java index 2bb7dc83655..b6855024f36 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-common/src/main/java/org/apache/hadoop/mapred/LocalJobRunner.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-common/src/main/java/org/apache/hadoop/mapred/LocalJobRunner.java @@ -53,6 +53,7 @@ import org.apache.hadoop.mapreduce.TaskCompletionEvent; import org.apache.hadoop.mapreduce.TaskTrackerInfo; import org.apache.hadoop.mapreduce.TaskType; +import org.apache.hadoop.mapreduce.checkpoint.TaskCheckpointID; import org.apache.hadoop.mapreduce.protocol.ClientProtocol; import org.apache.hadoop.mapreduce.security.token.delegation.DelegationTokenIdentifier; import org.apache.hadoop.mapreduce.server.jobtracker.JTConfig; @@ -575,10 +576,17 @@ public void run() { // TaskUmbilicalProtocol methods + @Override public JvmTask getTask(JvmContext context) { return null; } - public synchronized boolean statusUpdate(TaskAttemptID taskId, + @Override + public synchronized AMFeedback statusUpdate(TaskAttemptID taskId, TaskStatus taskStatus) throws IOException, InterruptedException { + AMFeedback feedback = new AMFeedback(); + feedback.setTaskFound(true); + if (null == taskStatus) { + return feedback; + } // Serialize as we would if distributed in order to make deep copy ByteArrayOutputStream baos = new ByteArrayOutputStream(); DataOutputStream dos = new DataOutputStream(baos); @@ -618,7 +626,7 @@ public synchronized boolean statusUpdate(TaskAttemptID taskId, } // ignore phase - return true; + return feedback; } /** Return the current values of the counters for this job, @@ -654,24 +662,24 @@ public void commitPending(TaskAttemptID taskid, statusUpdate(taskid, taskStatus); } + @Override public void reportDiagnosticInfo(TaskAttemptID taskid, String trace) { // Ignore for now } + @Override public void reportNextRecordRange(TaskAttemptID taskid, SortedRanges.Range range) throws IOException { LOG.info("Task " + taskid + " reportedNextRecordRange " + range); } - public boolean ping(TaskAttemptID taskid) throws IOException { - return true; - } - + @Override public boolean canCommit(TaskAttemptID taskid) throws IOException { return true; } + @Override public void done(TaskAttemptID taskId) throws IOException { int taskIndex = mapIds.indexOf(taskId); if (taskIndex >= 0) { // mapping @@ -681,11 +689,13 @@ public void done(TaskAttemptID taskId) throws IOException { } } + @Override public synchronized void fsError(TaskAttemptID taskId, String message) throws IOException { LOG.fatal("FSError: "+ message + "from task: " + taskId); } + @Override public void shuffleError(TaskAttemptID taskId, String message) throws IOException { LOG.fatal("shuffleError: "+ message + "from task: " + taskId); } @@ -695,12 +705,30 @@ public synchronized void fatalError(TaskAttemptID taskId, String msg) LOG.fatal("Fatal: "+ msg + "from task: " + taskId); } + @Override public MapTaskCompletionEventsUpdate getMapCompletionEvents(JobID jobId, int fromEventId, int 
maxLocs, TaskAttemptID id) throws IOException { return new MapTaskCompletionEventsUpdate( org.apache.hadoop.mapred.TaskCompletionEvent.EMPTY_ARRAY, false); } - + + @Override + public void preempted(TaskAttemptID taskId, TaskStatus taskStatus) + throws IOException, InterruptedException { + // ignore + } + + @Override + public TaskCheckpointID getCheckpointID(TaskID taskId) { + // ignore + return null; + } + + @Override + public void setCheckpointID(TaskID downgrade, TaskCheckpointID cid) { + // ignore + } + } public LocalJobRunner(Configuration conf) throws IOException { diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/AMFeedback.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/AMFeedback.java new file mode 100644 index 00000000000..210ac959c43 --- /dev/null +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/AMFeedback.java @@ -0,0 +1,63 @@ +/** +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +package org.apache.hadoop.mapred; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.hadoop.io.Writable; + +/** + * This class is a simple struct to include both the taskFound information and + * a possible preemption request coming from the AM. 
+ */ +public class AMFeedback implements Writable { + + boolean taskFound; + boolean preemption; + + public void setTaskFound(boolean t){ + taskFound=t; + } + + public boolean getTaskFound(){ + return taskFound; + } + + public void setPreemption(boolean preemption) { + this.preemption=preemption; + } + + public boolean getPreemption() { + return preemption; + } + + @Override + public void write(DataOutput out) throws IOException { + out.writeBoolean(taskFound); + out.writeBoolean(preemption); + } + + @Override + public void readFields(DataInput in) throws IOException { + taskFound = in.readBoolean(); + preemption = in.readBoolean(); + } + +} diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/Task.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/Task.java index 685e61cfb63..660ffc65ad3 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/Task.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/Task.java @@ -187,6 +187,7 @@ static synchronized String getOutputName(int partition) { protected SecretKey tokenSecret; protected SecretKey shuffleSecret; protected GcTimeUpdater gcUpdater; + final AtomicBoolean mustPreempt = new AtomicBoolean(false); //////////////////////////////////////////// // Constructors @@ -711,6 +712,7 @@ public void run() { } try { boolean taskFound = true; // whether TT knows about this task + AMFeedback amFeedback = null; // sleep for a bit synchronized(lock) { if (taskDone.get()) { @@ -728,12 +730,14 @@ public void run() { taskStatus.statusUpdate(taskProgress.get(), taskProgress.toString(), counters); - taskFound = umbilical.statusUpdate(taskId, taskStatus); + amFeedback = umbilical.statusUpdate(taskId, taskStatus); + taskFound = amFeedback.getTaskFound(); taskStatus.clearStatus(); } else { // send ping - taskFound = umbilical.ping(taskId); + amFeedback = umbilical.statusUpdate(taskId, null); + taskFound = amFeedback.getTaskFound(); } // if Task Tracker is not aware of our task ID (probably because it died and @@ -744,6 +748,17 @@ public void run() { System.exit(66); } + // Set a flag that says we should preempt this is read by + // ReduceTasks in places of the execution where it is + // safe/easy to preempt + boolean lastPreempt = mustPreempt.get(); + mustPreempt.set(mustPreempt.get() || amFeedback.getPreemption()); + + if (lastPreempt ^ mustPreempt.get()) { + LOG.info("PREEMPTION TASK: setting mustPreempt to " + + mustPreempt.get() + " given " + amFeedback.getPreemption() + + " for "+ taskId + " task status: " +taskStatus.getPhase()); + } sendProgress = resetProgressFlag(); remainingRetries = MAX_RETRIES; } @@ -992,10 +1007,17 @@ private void updateHeapUsageCounter() { public void done(TaskUmbilicalProtocol umbilical, TaskReporter reporter ) throws IOException, InterruptedException { - LOG.info("Task:" + taskId + " is done." - + " And is in the process of committing"); updateCounters(); - + if (taskStatus.getRunState() == TaskStatus.State.PREEMPTED ) { + // If we are preempted, do no output promotion; signal done and exit + committer.commitTask(taskContext); + umbilical.preempted(taskId, taskStatus); + taskDone.set(true); + reporter.stopCommunicationThread(); + return; + } + LOG.info("Task:" + taskId + " is done." 
+ + " And is in the process of committing"); boolean commitRequired = isCommitRequired(); if (commitRequired) { int retries = MAX_RETRIES; @@ -1054,7 +1076,7 @@ public void statusUpdate(TaskUmbilicalProtocol umbilical) int retries = MAX_RETRIES; while (true) { try { - if (!umbilical.statusUpdate(getTaskID(), taskStatus)) { + if (!umbilical.statusUpdate(getTaskID(), taskStatus).getTaskFound()) { LOG.warn("Parent died. Exiting "+taskId); System.exit(66); } @@ -1098,8 +1120,8 @@ private long calculateOutputSize() throws IOException { if (isMapTask() && conf.getNumReduceTasks() > 0) { try { Path mapOutput = mapOutputFile.getOutputFile(); - FileSystem localFS = FileSystem.getLocal(conf); - return localFS.getFileStatus(mapOutput).getLen(); + FileSystem fs = mapOutput.getFileSystem(conf); + return fs.getFileStatus(mapOutput).getLen(); } catch (IOException e) { LOG.warn ("Could not find output size " , e); } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/TaskStatus.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/TaskStatus.java index 7bd5eb90616..a5c12de2627 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/TaskStatus.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/TaskStatus.java @@ -51,7 +51,7 @@ public static enum Phase{STARTING, MAP, SHUFFLE, SORT, REDUCE, CLEANUP} @InterfaceAudience.Private @InterfaceStability.Unstable public static enum State {RUNNING, SUCCEEDED, FAILED, UNASSIGNED, KILLED, - COMMIT_PENDING, FAILED_UNCLEAN, KILLED_UNCLEAN} + COMMIT_PENDING, FAILED_UNCLEAN, KILLED_UNCLEAN, PREEMPTED} private final TaskAttemptID taskid; private float progress; diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/TaskUmbilicalProtocol.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/TaskUmbilicalProtocol.java index 425c3b87dad..5df02c7b5b1 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/TaskUmbilicalProtocol.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/TaskUmbilicalProtocol.java @@ -24,6 +24,9 @@ import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.ipc.VersionedProtocol; import org.apache.hadoop.mapred.JvmTask; +import org.apache.hadoop.mapreduce.checkpoint.CheckpointID; +import org.apache.hadoop.mapreduce.checkpoint.FSCheckpointID; +import org.apache.hadoop.mapreduce.checkpoint.TaskCheckpointID; import org.apache.hadoop.mapreduce.security.token.JobTokenSelector; import org.apache.hadoop.security.token.TokenInfo; @@ -64,9 +67,10 @@ public interface TaskUmbilicalProtocol extends VersionedProtocol { * Version 17 Modified TaskID to be aware of the new TaskTypes * Version 18 Added numRequiredSlots to TaskStatus for MAPREDUCE-516 * Version 19 Added fatalError for child to communicate fatal errors to TT + * Version 20 Added methods to manage checkpoints * */ - public static final long versionID = 19L; + public static final long versionID = 20L; /** * Called when a child task process starts, to get its task. 
@@ -78,7 +82,8 @@ public interface TaskUmbilicalProtocol extends VersionedProtocol { JvmTask getTask(JvmContext context) throws IOException; /** - * Report child's progress to parent. + * Report child's progress to parent. Also invoked to report still alive (used + * to be in ping). It reports an AMFeedback used to propagate preemption requests. * * @param taskId task-id of the child * @param taskStatus status of the child * @throws InterruptedException * @return True if the task is known */ - boolean statusUpdate(TaskAttemptID taskId, TaskStatus taskStatus) + AMFeedback statusUpdate(TaskAttemptID taskId, TaskStatus taskStatus) throws IOException, InterruptedException; /** Report error messages back to parent. Calls should be sparing, since all @@ -105,11 +110,6 @@ boolean statusUpdate(TaskAttemptID taskId, TaskStatus taskStatus) void reportNextRecordRange(TaskAttemptID taskid, SortedRanges.Range range) throws IOException; - /** Periodically called by child to check if parent is still alive. - * @return True if the task is known - */ - boolean ping(TaskAttemptID taskid) throws IOException; - /** Report that the task is successfully completed. Failure is assumed if * the task process exits without calling this. * @param taskid task's id @@ -161,4 +161,33 @@ MapTaskCompletionEventsUpdate getMapCompletionEvents(JobID jobId, TaskAttemptID id) throws IOException; + /** + * Report to the AM that the task has been successfully preempted. + * + * @param taskId task's id + * @param taskStatus status of the child + * @throws IOException + */ + void preempted(TaskAttemptID taskId, TaskStatus taskStatus) + throws IOException, InterruptedException; + + /** + * Return the latest CheckpointID for the given TaskID. This provides + * the task with a way to locate the checkpointed data and restart from + * that point in the computation. + * + * @param taskID task's id + * @return the most recent checkpoint (if any) for this task + * @throws IOException + */ + TaskCheckpointID getCheckpointID(TaskID taskID); + + /** + * Send a CheckpointID for a given TaskID to be stored in the AM, + * to later restart a task from this checkpoint. + * @param tid + * @param cid + */ + void setCheckpointID(TaskID tid, TaskCheckpointID cid); + } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/checkpoint/TaskCheckpointID.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/checkpoint/TaskCheckpointID.java index 102b84f2483..17e6922fc3b 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/checkpoint/TaskCheckpointID.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/checkpoint/TaskCheckpointID.java @@ -34,37 +34,31 @@ * cost of checkpoints and other counters. This is sent by the task to the AM * to be stored and provided to the next execution of the same task.
*/ -public class TaskCheckpointID implements CheckpointID{ +public class TaskCheckpointID implements CheckpointID { - FSCheckpointID rawId; - private List partialOutput; - private Counters counters; + final FSCheckpointID rawId; + private final List partialOutput; + private final Counters counters; public TaskCheckpointID() { - this.rawId = new FSCheckpointID(); - this.partialOutput = new ArrayList(); + this(new FSCheckpointID(), new ArrayList(), new Counters()); } public TaskCheckpointID(FSCheckpointID rawId, List partialOutput, Counters counters) { this.rawId = rawId; this.counters = counters; - if(partialOutput == null) - this.partialOutput = new ArrayList(); - else - this.partialOutput = partialOutput; + this.partialOutput = null == partialOutput + ? new ArrayList() + : partialOutput; } @Override public void write(DataOutput out) throws IOException { counters.write(out); - if (partialOutput == null) { - WritableUtils.writeVLong(out, 0L); - } else { - WritableUtils.writeVLong(out, partialOutput.size()); - for(Path p:partialOutput){ - Text.writeString(out, p.toString()); - } + WritableUtils.writeVLong(out, partialOutput.size()); + for (Path p : partialOutput) { + Text.writeString(out, p.toString()); } rawId.write(out); } @@ -74,21 +68,22 @@ public void readFields(DataInput in) throws IOException { partialOutput.clear(); counters.readFields(in); long numPout = WritableUtils.readVLong(in); - for(int i=0;i Date: Sun, 29 Dec 2013 05:43:31 +0000 Subject: [PATCH 10/42] YARN-1481. Reverting addendum patch git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1553994 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/hadoop/yarn/server/resourcemanager/AdminService.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java index 10e73267680..33230d86f9b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java @@ -174,7 +174,7 @@ private UserGroupInformation checkAcls(String method) throws YarnException { } } - private boolean isRMActive() { + private synchronized boolean isRMActive() { return HAServiceState.ACTIVE == rmContext.getHAServiceState(); } From 5a54b91df9183095e725e6c6927f44d6ac262069 Mon Sep 17 00:00:00 2001 From: Konstantin Shvachko Date: Sun, 29 Dec 2013 21:22:04 +0000 Subject: [PATCH 11/42] HDFS-5675. Add Mkdirs operation to NNThroughputBenchmark. Contributed by Plamen Jeliazkov. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1554071 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 9 +- .../namenode/NNThroughputBenchmark.java | 97 +++++++++++++++++++ 2 files changed, 103 insertions(+), 3 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 44956404f1e..7597f49129b 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -618,9 +618,6 @@ Release 2.4.0 - UNRELEASED HDFS-5004. 
Add additional JMX bean for NameNode status data (Trevor Lorimer via cos) - HDFS-5068. Convert NNThroughputBenchmark to a Tool to allow generic options. - (shv) - HDFS-4994. Audit log getContentSummary() calls. (Robert Parker via kihwal) HDFS-5144. Document time unit to NameNodeMetrics. (Akira Ajisaka via @@ -866,6 +863,12 @@ Release 2.3.0 - UNRELEASED HDFS-5662. Can't decommission a DataNode due to file's replication factor larger than the rest of the cluster size. (brandonli) + HDFS-5068. Convert NNThroughputBenchmark to a Tool to allow generic options. + (shv) + + HDFS-5675. Add Mkdirs operation to NNThroughputBenchmark. + (Plamen Jeliazkov via shv) + OPTIMIZATIONS BUG FIXES diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NNThroughputBenchmark.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NNThroughputBenchmark.java index 94b625e2d71..7aef8e513a0 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NNThroughputBenchmark.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NNThroughputBenchmark.java @@ -605,6 +605,98 @@ void printResults() { } } + /** + * Directory creation statistics. + * + * Each thread creates the same (+ or -1) number of directories. + * Directory names are pre-generated during initialization. + */ + class MkdirsStats extends OperationStatsBase { + // Operation types + static final String OP_MKDIRS_NAME = "mkdirs"; + static final String OP_MKDIRS_USAGE = "-op mkdirs [-threads T] [-dirs N] " + + "[-dirsPerDir P]"; + + protected FileNameGenerator nameGenerator; + protected String[][] dirPaths; + + MkdirsStats(List args) { + super(); + parseArguments(args); + } + + @Override + String getOpName() { + return OP_MKDIRS_NAME; + } + + @Override + void parseArguments(List args) { + boolean ignoreUnrelatedOptions = verifyOpArgument(args); + int nrDirsPerDir = 2; + for (int i = 2; i < args.size(); i++) { // parse command line + if(args.get(i).equals("-dirs")) { + if(i+1 == args.size()) printUsage(); + numOpsRequired = Integer.parseInt(args.get(++i)); + } else if(args.get(i).equals("-threads")) { + if(i+1 == args.size()) printUsage(); + numThreads = Integer.parseInt(args.get(++i)); + } else if(args.get(i).equals("-dirsPerDir")) { + if(i+1 == args.size()) printUsage(); + nrDirsPerDir = Integer.parseInt(args.get(++i)); + } else if(!ignoreUnrelatedOptions) + printUsage(); + } + nameGenerator = new FileNameGenerator(getBaseDir(), nrDirsPerDir); + } + + @Override + void generateInputs(int[] opsPerThread) throws IOException { + assert opsPerThread.length == numThreads : "Error opsPerThread.length"; + nameNodeProto.setSafeMode(HdfsConstants.SafeModeAction.SAFEMODE_LEAVE, + false); + LOG.info("Generate " + numOpsRequired + " inputs for " + getOpName()); + dirPaths = new String[numThreads][]; + for(int idx=0; idx < numThreads; idx++) { + int threadOps = opsPerThread[idx]; + dirPaths[idx] = new String[threadOps]; + for(int jdx=0; jdx < threadOps; jdx++) + dirPaths[idx][jdx] = nameGenerator. + getNextFileName("ThroughputBench"); + } + } + + /** + * returns client name + */ + @Override + String getExecutionArgument(int daemonId) { + return getClientName(daemonId); + } + + /** + * Do mkdirs operation. 
+ */ + @Override + long executeOp(int daemonId, int inputIdx, String clientName) + throws IOException { + long start = Time.now(); + nameNodeProto.mkdirs(dirPaths[daemonId][inputIdx], + FsPermission.getDefault(), true); + long end = Time.now(); + return end-start; + } + + @Override + void printResults() { + LOG.info("--- " + getOpName() + " inputs ---"); + LOG.info("nrDirs = " + numOpsRequired); + LOG.info("nrThreads = " + numThreads); + LOG.info("nrDirsPerDir = " + nameGenerator.getFilesPerDirectory()); + printStats(); + } + } + /** * Open file statistics. * @@ -1279,6 +1371,7 @@ static void printUsage() { System.err.println("Usage: NNThroughputBenchmark" + "\n\t" + OperationStatsBase.OP_ALL_USAGE + " | \n\t" + CreateFileStats.OP_CREATE_USAGE + + " | \n\t" + MkdirsStats.OP_MKDIRS_USAGE + " | \n\t" + OpenFileStats.OP_OPEN_USAGE + " | \n\t" + DeleteFileStats.OP_DELETE_USAGE + " | \n\t" + FileStatusStats.OP_FILE_STATUS_USAGE @@ -1328,6 +1421,10 @@ public int run(String[] aArgs) throws Exception { opStat = new CreateFileStats(args); ops.add(opStat); } + if(runAll || MkdirsStats.OP_MKDIRS_NAME.equals(type)) { + opStat = new MkdirsStats(args); + ops.add(opStat); + } if(runAll || OpenFileStats.OP_OPEN_NAME.equals(type)) { opStat = new OpenFileStats(args); ops.add(opStat); From c179d58eeeef5947e71a7550ff61cd421924ce96 Mon Sep 17 00:00:00 2001 From: Andrew Wang Date: Mon, 30 Dec 2013 19:02:08 +0000 Subject: [PATCH 12/42] Add updated editsStored files missing from initial HDFS-5636 commit. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1554293 13f79535-47bb-0310-9956-ffa450edef68 --- .../src/test/resources/editsStored | Bin 4599 -> 4712 bytes .../src/test/resources/editsStored.xml | 164 ++++++++++-------- 2 files changed, 90 insertions(+), 74 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/editsStored b/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/editsStored index dc60e3424bd9bfe74f33d61b79a8fe23ad55aef6..c68762ae3f5b628ff57a93a9de952415dcf3ea34 100644 GIT binary patch delta 1502 zcmeya{6b}doPM6*%3T~+y1nhPoC~T2Zm*jy$^Ze(OkIr5K-q&FN!xxs+y7HyCBNOK zz=^)vyv{E7&MPo5FoNmH3s@ByRVFJ4N*Ed%Tbdb|8JioL#~U(m8XB4F833UngTVe5 z((_^j{3k4ZxSjo0_4Y=f1?-C&7rdJI4q})Al3@xg0Sh-vPWEEd3kR8RfmJu>hk`f1 zn4k`2XW-UP%gjlQ&&@AOO@SKagjFBcDgUpHlg~4%f&JhMRqH!(qA=^*S!!wilckti zC9t@bUw*@b*ORw187pFu6wu~)+8_;e4BzAiCK+iXthNhw2~1L$?8FT5bPLq>7BWLn zs4aQgg~^Yhru871rohr5dnaVGJqs(N3MdS(1Kq~RfYnKooCZ1TCuguiob()O$a6BC zB(3^s;`_~@yX|ST;)OD zeUCMS?av*GzBt*4*8&_Q|Df9c0h64{f2V-* zhjB9klRcxe7poq3n9cL~lkf4$W0$sdIq`~nvN)f-7OHNIKODQ2A%-*9GH@2;=jRk8 z7L`;2h2$qE@JT7j|Azpqj&o7HIJ^Vd875PlA6x0AY_ZhUf1Uh$}pfcl^ Vi4#QxkC@uc5r-=DRXfUN4FE=`@_qmS delta 1423 zcmaE%@?Cj?oW5ka&_<5xt!|Z5%VtO3^?F$+$^Ze(OkIr5K-n!EOBXLHjWcVic3-$l zd!ny4uhRz?QxgUTMld~Do<)I)L22?lW(j>$BLhQ214GMrLk3PmBXd0iAT(qU2xv;) zovG(y*>3T_bH@ftWuW=&ZzhF$Py7HeD+k#umW!)Cnojm%)C&h0UxrmTC+FI2HcU`= zvNLe&r)B1(#^>gjrKUiQYR0OM%a6@%{p5>`YG5x+fU2D^aiTEmTb2N$Ba>yAS|zZ! 
zmVc_jL%zv7nT)YX3M}k+k~*1%Sp%D-Aj5>oM<=^5Lp*vGYU5e*0!`>*eKp(U=TO70 zARERzuc$eJ1sWoPlNlMsCnvK!RRIQ}8Az0o0c&_jil6@!HaUkC>^d_(h=FE&zIq|ol?!6*kwy| z&6&Y6dr79xUYJP0QiQU|F@P_~qmqJg)K}&%VbR zy!NvmPi&ZM!)pN!kY7;kza}^GieXDD4qcfM`jgl5%1Z%_GzaBAk$~MyjKFlx==3#f zGE;_~xBcW7xTIADs?s7SEAq)}q3S;OHTVfA$AN$?17|^geojGRQAs6`%Rf1bPf7}_ z(_OxL@?V&|jL!s{LtRT>r~6O-#HXS39}2LV;a0k9Psd~%ez|%qS;~FsRqg*uU{^6P zI6`Sgtmb$uZ;)G|3RU8S>;-Hki>GPh#6qZI24^s1@=bnaP!yO~L88FCioEpT<#)it zaB_*jenxO{$Au{FCg0(gnS5T5k5{y-tc??DyVvAbf*K%u9r_UV3Mn&wo;XoNkcs8f OS8=F9@AJzR^8o2 1 - 1388171826188 - c7d869c22c8afce1 + 1389121087930 + d48b4b3e6a43707b @@ -24,8 +24,8 @@ 3 2 - 1388171826191 - a3c41446507dfca9 + 1389121087937 + 62b6fae6bff918a9 @@ -37,17 +37,17 @@ 16386 /file_create_u\0001;F431 1 - 1387480626844 - 1387480626844 + 1388429889312 + 1388429889312 512 - DFSClient_NONMAPREDUCE_1147796111_1 + DFSClient_NONMAPREDUCE_-1396063717_1 127.0.0.1 andrew supergroup 420 - a90261a0-3759-4480-ba80-e10c9ae331e6 + bfe81b9e-5c10-4f90-a5e1-b707da7bb781 7 @@ -59,8 +59,8 @@ 0 /file_create_u\0001;F431 1 - 1387480626885 - 1387480626844 + 1388429889328 + 1388429889312 512 @@ -78,8 +78,8 @@ 0 /file_create_u\0001;F431 /file_moved - 1387480626894 - a90261a0-3759-4480-ba80-e10c9ae331e6 + 1388429889336 + bfe81b9e-5c10-4f90-a5e1-b707da7bb781 9 @@ -89,8 +89,8 @@ 7 0 /file_moved - 1387480626905 - a90261a0-3759-4480-ba80-e10c9ae331e6 + 1388429889346 + bfe81b9e-5c10-4f90-a5e1-b707da7bb781 10 @@ -101,7 +101,7 @@ 0 16387 /directory_mkdir - 1387480626917 + 1388429889357 andrew supergroup @@ -136,7 +136,7 @@ 12 /directory_mkdir snapshot1 - a90261a0-3759-4480-ba80-e10c9ae331e6 + bfe81b9e-5c10-4f90-a5e1-b707da7bb781 15 @@ -147,7 +147,7 @@ /directory_mkdir snapshot1 snapshot2 - a90261a0-3759-4480-ba80-e10c9ae331e6 + bfe81b9e-5c10-4f90-a5e1-b707da7bb781 16 @@ -157,7 +157,7 @@ 14 /directory_mkdir snapshot2 - a90261a0-3759-4480-ba80-e10c9ae331e6 + bfe81b9e-5c10-4f90-a5e1-b707da7bb781 17 @@ -169,17 +169,17 @@ 16388 /file_create_u\0001;F431 1 - 1387480626978 - 1387480626978 + 1388429889412 + 1388429889412 512 - DFSClient_NONMAPREDUCE_1147796111_1 + DFSClient_NONMAPREDUCE_-1396063717_1 127.0.0.1 andrew supergroup 420 - a90261a0-3759-4480-ba80-e10c9ae331e6 + bfe81b9e-5c10-4f90-a5e1-b707da7bb781 18 @@ -191,8 +191,8 @@ 0 /file_create_u\0001;F431 1 - 1387480626985 - 1387480626978 + 1388429889420 + 1388429889412 512 @@ -253,9 +253,9 @@ 0 /file_create_u\0001;F431 /file_moved - 1387480627035 + 1388429889495 NONE - a90261a0-3759-4480-ba80-e10c9ae331e6 + bfe81b9e-5c10-4f90-a5e1-b707da7bb781 25 @@ -267,17 +267,17 @@ 16389 /file_concat_target 1 - 1387480627043 - 1387480627043 + 1388429889511 + 1388429889511 512 - DFSClient_NONMAPREDUCE_1147796111_1 + DFSClient_NONMAPREDUCE_-1396063717_1 127.0.0.1 andrew supergroup 420 - a90261a0-3759-4480-ba80-e10c9ae331e6 + bfe81b9e-5c10-4f90-a5e1-b707da7bb781 27 @@ -388,8 +388,8 @@ 0 /file_concat_target 1 - 1387480627148 - 1387480627043 + 1388429889812 + 1388429889511 512 @@ -423,17 +423,17 @@ 16390 /file_concat_0 1 - 1387480627155 - 1387480627155 + 1388429889825 + 1388429889825 512 - DFSClient_NONMAPREDUCE_1147796111_1 + DFSClient_NONMAPREDUCE_-1396063717_1 127.0.0.1 andrew supergroup 420 - a90261a0-3759-4480-ba80-e10c9ae331e6 + bfe81b9e-5c10-4f90-a5e1-b707da7bb781 40 @@ -544,8 +544,8 @@ 0 /file_concat_0 1 - 1387480627193 - 1387480627155 + 1388429889909 + 1388429889825 512 @@ -579,17 +579,17 @@ 16391 /file_concat_1 1 - 1387480627200 - 1387480627200 + 1388429889920 + 1388429889920 512 - DFSClient_NONMAPREDUCE_1147796111_1 + 
DFSClient_NONMAPREDUCE_-1396063717_1 127.0.0.1 andrew supergroup 420 - a90261a0-3759-4480-ba80-e10c9ae331e6 + bfe81b9e-5c10-4f90-a5e1-b707da7bb781 52 @@ -700,8 +700,8 @@ 0 /file_concat_1 1 - 1387480627238 - 1387480627200 + 1388429890016 + 1388429889920 512 @@ -733,12 +733,12 @@ 56 0 /file_concat_target - 1387480627246 + 1388429890031 /file_concat_0 /file_concat_1 - a90261a0-3759-4480-ba80-e10c9ae331e6 + bfe81b9e-5c10-4f90-a5e1-b707da7bb781 63 @@ -750,14 +750,14 @@ 16392 /file_symlink /file_concat_target - 1387480627255 - 1387480627255 + 1388429890046 + 1388429890046 andrew supergroup 511 - a90261a0-3759-4480-ba80-e10c9ae331e6 + bfe81b9e-5c10-4f90-a5e1-b707da7bb781 64 @@ -771,11 +771,11 @@ andrew JobTracker - 1387480627262 - 1388085427262 + 1388429890059 + 1389034690059 2 - 1387567027262 + 1388516290059 @@ -788,11 +788,11 @@ andrew JobTracker - 1387480627262 - 1388085427262 + 1388429890059 + 1389034690059 2 - 1387567027281 + 1388516290109 @@ -805,8 +805,8 @@ andrew JobTracker - 1387480627262 - 1388085427262 + 1388429890059 + 1389034690059 2 @@ -821,7 +821,7 @@ 493 9223372036854775807 2305843009213693951 - a90261a0-3759-4480-ba80-e10c9ae331e6 + bfe81b9e-5c10-4f90-a5e1-b707da7bb781 68 @@ -834,7 +834,7 @@ party 448 1989 - a90261a0-3759-4480-ba80-e10c9ae331e6 + bfe81b9e-5c10-4f90-a5e1-b707da7bb781 69 @@ -846,8 +846,8 @@ /bar 1 poolparty - 2305844396694321272 - a90261a0-3759-4480-ba80-e10c9ae331e6 + 2305844397643584141 + bfe81b9e-5c10-4f90-a5e1-b707da7bb781 70 @@ -857,7 +857,7 @@ 64 1 /bar2 - a90261a0-3759-4480-ba80-e10c9ae331e6 + bfe81b9e-5c10-4f90-a5e1-b707da7bb781 71 @@ -866,7 +866,7 @@ 65 1 - a90261a0-3759-4480-ba80-e10c9ae331e6 + bfe81b9e-5c10-4f90-a5e1-b707da7bb781 72 @@ -875,7 +875,7 @@ 66 poolparty - a90261a0-3759-4480-ba80-e10c9ae331e6 + bfe81b9e-5c10-4f90-a5e1-b707da7bb781 73 @@ -887,17 +887,17 @@ 16393 /hard-lease-recovery-test 1 - 1387480627356 - 1387480627356 + 1388429890261 + 1388429890261 512 - DFSClient_NONMAPREDUCE_1147796111_1 + DFSClient_NONMAPREDUCE_-1396063717_1 127.0.0.1 andrew supergroup 420 - a90261a0-3759-4480-ba80-e10c9ae331e6 + bfe81b9e-5c10-4f90-a5e1-b707da7bb781 74 @@ -954,7 +954,23 @@ OP_REASSIGN_LEASE 73 - DFSClient_NONMAPREDUCE_1147796111_1 + DFSClient_NONMAPREDUCE_-1396063717_1 + /hard-lease-recovery-test + HDFS_NameNode + + + + OP_SET_GENSTAMP_V2 + + 74 + 1012 + + + + OP_REASSIGN_LEASE + + 75 + HDFS_NameNode /hard-lease-recovery-test HDFS_NameNode @@ -962,20 +978,20 @@ OP_CLOSE - 74 + 76 0 0 /hard-lease-recovery-test 1 - 1387480629729 - 1387480627356 + 1388429895216 + 1388429890261 512 1073741834 11 - 1011 + 1012 andrew @@ -987,7 +1003,7 @@ OP_END_LOG_SEGMENT - 75 + 77 From e596f17734899b5398a51c8988598ea1cca97a84 Mon Sep 17 00:00:00 2001 From: Colin McCabe Date: Mon, 30 Dec 2013 19:35:43 +0000 Subject: [PATCH 13/42] HDFS-5582. hdfs getconf -excludeFile or -includeFile always failed (satish via cmccabe) git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1554295 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 3 + .../org/apache/hadoop/hdfs/tools/GetConf.java | 5 +- .../apache/hadoop/hdfs/tools/TestGetConf.java | 73 ++++++++++++++++++- 3 files changed, 78 insertions(+), 3 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 7597f49129b..8822e366655 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -1019,6 +1019,9 @@ Release 2.3.0 - UNRELEASED HDFS-5661. 
Browsing FileSystem via web ui, should use datanode's fqdn instead of ip address. (Benoy Antony via jing9) + HDFS-5582. hdfs getconf -excludeFile or -includeFile always failed (sathish + via cmccabe) + Release 2.2.0 - 2013-10-13 INCOMPATIBLE CHANGES diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/GetConf.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/GetConf.java index 778ac59ee25..92a3864a675 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/GetConf.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/GetConf.java @@ -29,6 +29,7 @@ import org.apache.hadoop.HadoopIllegalArgumentException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.DFSUtil; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.DFSUtil.ConfiguredNNAddress; @@ -85,9 +86,9 @@ enum Command { map.put(BACKUP.getName().toLowerCase(), new BackupNodesCommandHandler()); map.put(INCLUDE_FILE.getName().toLowerCase(), - new CommandHandler("DFSConfigKeys.DFS_HOSTS")); + new CommandHandler(DFSConfigKeys.DFS_HOSTS)); map.put(EXCLUDE_FILE.getName().toLowerCase(), - new CommandHandler("DFSConfigKeys.DFS_HOSTS_EXCLUDE")); + new CommandHandler(DFSConfigKeys.DFS_HOSTS_EXCLUDE)); map.put(NNRPCADDRESSES.getName().toLowerCase(), new NNRpcAddressesCommandHandler()); map.put(CONFKEY.getName().toLowerCase(), diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestGetConf.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestGetConf.java index 7a17cea2d65..80b176f4bd5 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestGetConf.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestGetConf.java @@ -33,10 +33,15 @@ import java.net.InetSocketAddress; import java.util.ArrayList; import java.util.Arrays; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.StringTokenizer; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.DFSUtil; import org.apache.hadoop.hdfs.DFSUtil.ConfiguredNNAddress; import org.apache.hadoop.hdfs.HdfsConfiguration; @@ -55,7 +60,7 @@ public class TestGetConf { enum TestType { NAMENODE, BACKUP, SECONDARY, NNRPCADDRESSES } - + FileSystem localFileSys; /** Setup federation nameServiceIds in the configuration */ private void setupNameServices(HdfsConfiguration conf, int nameServiceIdCount) { StringBuilder nsList = new StringBuilder(); @@ -379,4 +384,70 @@ public void testTool() throws Exception { } } } + @Test + public void TestGetConfExcludeCommand() throws Exception{ + HdfsConfiguration conf = new HdfsConfiguration(); + // Set up the hosts/exclude files. 
+ localFileSys = FileSystem.getLocal(conf); + Path workingDir = localFileSys.getWorkingDirectory(); + Path dir = new Path(workingDir, System.getProperty("test.build.data", "target/test/data") + "/Getconf/"); + Path hostsFile = new Path(dir, "hosts"); + Path excludeFile = new Path(dir, "exclude"); + + // Setup conf + conf.set(DFSConfigKeys.DFS_HOSTS, hostsFile.toUri().getPath()); + conf.set(DFSConfigKeys.DFS_HOSTS_EXCLUDE, excludeFile.toUri().getPath()); + writeConfigFile(hostsFile, null); + writeConfigFile(excludeFile, null); + String[] args = {"-excludeFile"}; + String ret = runTool(conf, args, true); + assertEquals(excludeFile.toUri().getPath(),ret.trim()); + cleanupFile(localFileSys, excludeFile.getParent()); + } + + @Test + public void TestGetConfIncludeCommand() throws Exception{ + HdfsConfiguration conf = new HdfsConfiguration(); + // Set up the hosts/exclude files. + localFileSys = FileSystem.getLocal(conf); + Path workingDir = localFileSys.getWorkingDirectory(); + Path dir = new Path(workingDir, System.getProperty("test.build.data", "target/test/data") + "/Getconf/"); + Path hostsFile = new Path(dir, "hosts"); + Path excludeFile = new Path(dir, "exclude"); + + // Setup conf + conf.set(DFSConfigKeys.DFS_HOSTS, hostsFile.toUri().getPath()); + conf.set(DFSConfigKeys.DFS_HOSTS_EXCLUDE, excludeFile.toUri().getPath()); + writeConfigFile(hostsFile, null); + writeConfigFile(excludeFile, null); + String[] args = {"-includeFile"}; + String ret = runTool(conf, args, true); + assertEquals(hostsFile.toUri().getPath(),ret.trim()); + cleanupFile(localFileSys, excludeFile.getParent()); + } + + private void writeConfigFile(Path name, ArrayList nodes) + throws IOException { + // delete if it already exists + if (localFileSys.exists(name)) { + localFileSys.delete(name, true); + } + + FSDataOutputStream stm = localFileSys.create(name); + + if (nodes != null) { + for (Iterator it = nodes.iterator(); it.hasNext();) { + String node = it.next(); + stm.writeBytes(node); + stm.writeBytes("\n"); + } + } + stm.close(); + } + + private void cleanupFile(FileSystem fileSys, Path name) throws IOException { + assertTrue(fileSys.exists(name)); + fileSys.delete(name, true); + assertTrue(!fileSys.exists(name)); + } } From 7f86c8114ec98f8a38a690bc1304c2cfc41d093e Mon Sep 17 00:00:00 2001 From: Andrew Wang Date: Mon, 30 Dec 2013 20:02:36 +0000 Subject: [PATCH 14/42] HDFS-5701. Fix the CacheAdmin -addPool -maxTtl option name. Contributed by Stephen Chu. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1554305 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 3 +++ .../src/main/java/org/apache/hadoop/hdfs/tools/CacheAdmin.java | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 8822e366655..303e64d3d61 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -453,6 +453,9 @@ Trunk (Unreleased) HDFS-5679. TestCacheDirectives should handle the case where native code is not available. (wang) + HDFS-5701. Fix the CacheAdmin -addPool -maxTtl option name. + (Stephen Chu via wang) + BREAKDOWN OF HDFS-2832 SUBTASKS AND RELATED JIRAS HDFS-4985. 
Add storage type to the protocol and expose it in block report diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/CacheAdmin.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/CacheAdmin.java index 2766b382580..41c43fae188 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/CacheAdmin.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/CacheAdmin.java @@ -578,7 +578,7 @@ public String getName() { public String getShortUsage() { return "[" + NAME + " [-owner ] " + "[-group ] [-mode ] [-limit ] " + - "[-maxttl ]\n"; + "[-maxTtl ]\n"; } @Override From b524501d4f4b48edeb02901114087f3b5f57691f Mon Sep 17 00:00:00 2001 From: Vinod Kumar Vavilapalli Date: Mon, 30 Dec 2013 21:14:46 +0000 Subject: [PATCH 15/42] MAPREDUCE-5685. Fixed a bug with JobContext getCacheFiles API inside the WrappedReducer class. Contributed by Yi Song. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1554320 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-mapreduce-project/CHANGES.txt | 3 ++ .../mapred/TestMRWithDistributedCache.java | 40 +++++++++++++++---- .../mapreduce/lib/reduce/WrappedReducer.java | 2 +- 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/hadoop-mapreduce-project/CHANGES.txt b/hadoop-mapreduce-project/CHANGES.txt index c7aace0597f..52aefce295a 100644 --- a/hadoop-mapreduce-project/CHANGES.txt +++ b/hadoop-mapreduce-project/CHANGES.txt @@ -264,6 +264,9 @@ Release 2.4.0 - UNRELEASED MAPREDUCE-5694. Fixed MR AppMaster to shutdown the LogManager so as to avoid losing syslog in some conditions. (Mohammad Kamrul Islam via vinodkv) + MAPREDUCE-5685. Fixed a bug with JobContext getCacheFiles API inside the + WrappedReducer class. 
(Yi Song via vinodkv) + Release 2.3.0 - UNRELEASED INCOMPATIBLE CHANGES diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-common/src/test/java/org/apache/hadoop/mapred/TestMRWithDistributedCache.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-common/src/test/java/org/apache/hadoop/mapred/TestMRWithDistributedCache.java index 3bd7f052fc1..c73465e52ca 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-common/src/test/java/org/apache/hadoop/mapred/TestMRWithDistributedCache.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-common/src/test/java/org/apache/hadoop/mapred/TestMRWithDistributedCache.java @@ -44,6 +44,8 @@ import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.Reducer; +import org.apache.hadoop.mapreduce.TaskInputOutputContext; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat; import org.apache.hadoop.mapreduce.server.jobtracker.JTConfig; @@ -82,12 +84,11 @@ public class TestMRWithDistributedCache extends TestCase { private static final Log LOG = LogFactory.getLog(TestMRWithDistributedCache.class); + + private static class DistributedCacheChecker { - public static class DistributedCacheChecker extends - Mapper { - - @Override - public void setup(Context context) throws IOException { + public void setup(TaskInputOutputContext context) + throws IOException { Configuration conf = context.getConfiguration(); Path[] localFiles = context.getLocalCacheFiles(); URI[] files = context.getCacheFiles(); @@ -101,6 +102,10 @@ public void setup(Context context) throws IOException { TestCase.assertEquals(2, files.length); TestCase.assertEquals(2, archives.length); + // Check the file name + TestCase.assertTrue(files[0].getPath().endsWith("distributed.first")); + TestCase.assertTrue(files[1].getPath().endsWith("distributed.second.jar")); + // Check lengths of the files TestCase.assertEquals(1, fs.getFileStatus(localFiles[0]).getLen()); TestCase.assertTrue(fs.getFileStatus(localFiles[1]).getLen() > 1); @@ -130,8 +135,28 @@ public void setup(Context context) throws IOException { TestCase.assertTrue("second file should be symlinked too", expectedAbsentSymlinkFile.exists()); } + } - + + public static class DistributedCacheCheckerMapper extends + Mapper { + + @Override + protected void setup(Context context) throws IOException, + InterruptedException { + new DistributedCacheChecker().setup(context); + } + } + + public static class DistributedCacheCheckerReducer extends + Reducer { + + @Override + public void setup(Context context) throws IOException { + new DistributedCacheChecker().setup(context); + } + } + private void testWithConf(Configuration conf) throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException { // Create a temporary file of length 1. 
@@ -146,7 +171,8 @@ private void testWithConf(Configuration conf) throws IOException, Job job = Job.getInstance(conf); - job.setMapperClass(DistributedCacheChecker.class); + job.setMapperClass(DistributedCacheCheckerMapper.class); + job.setReducerClass(DistributedCacheCheckerReducer.class); job.setOutputFormatClass(NullOutputFormat.class); FileInputFormat.setInputPaths(job, first); // Creates the Job Configuration diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/reduce/WrappedReducer.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/reduce/WrappedReducer.java index 5be02cb03cf..39178642f24 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/reduce/WrappedReducer.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/reduce/WrappedReducer.java @@ -137,7 +137,7 @@ public URI[] getCacheArchives() throws IOException { @Override public URI[] getCacheFiles() throws IOException { - return reduceContext.getCacheArchives(); + return reduceContext.getCacheFiles(); } @Override From 50480f892a9175e34e8aeae2466c6e748affb5d4 Mon Sep 17 00:00:00 2001 From: Vinod Kumar Vavilapalli Date: Mon, 30 Dec 2013 22:21:55 +0000 Subject: [PATCH 16/42] YARN-1522. Fixed a race condition in the test TestApplicationCleanup that was causing it to randomly fail. Contributed by Liyin Liang. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1554328 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-yarn-project/CHANGES.txt | 3 ++ .../TestApplicationCleanup.java | 34 +++++++++++-------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 491277dd275..7d8439534ad 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -277,6 +277,9 @@ Release 2.4.0 - UNRELEASED YARN-1527. Fix yarn rmadmin command to print the correct usage info. (Akira AJISAKA via jianhe) + YARN-1522. Fixed a race condition in the test TestApplicationCleanup that was + causing it to randomly fail. 
(Liyin Liang via vinodkv) + Release 2.3.0 - UNRELEASED INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestApplicationCleanup.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestApplicationCleanup.java index 5eecae0987f..75bb225d4eb 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestApplicationCleanup.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestApplicationCleanup.java @@ -100,26 +100,32 @@ public void testAppCleanup() throws Exception { //currently only containers are cleaned via this //AM container is cleaned via container launcher resp = nm1.nodeHeartbeat(true); - List contsToClean = resp.getContainersToCleanup(); - List apps = resp.getApplicationsToCleanup(); - int cleanedConts = contsToClean.size(); - int cleanedApps = apps.size(); + List containersToCleanup = resp.getContainersToCleanup(); + List appsToCleanup = resp.getApplicationsToCleanup(); + int numCleanedContainers = containersToCleanup.size(); + int numCleanedApps = appsToCleanup.size(); waitCount = 0; - while ((cleanedConts < 2 || cleanedApps < 1) && waitCount++ < 200) { + while ((numCleanedContainers < 2 || numCleanedApps < 1) + && waitCount++ < 200) { LOG.info("Waiting to get cleanup events.. cleanedConts: " - + cleanedConts + " cleanedApps: " + cleanedApps); + + numCleanedContainers + " cleanedApps: " + numCleanedApps); Thread.sleep(100); resp = nm1.nodeHeartbeat(true); - contsToClean = resp.getContainersToCleanup(); - apps = resp.getApplicationsToCleanup(); - cleanedConts += contsToClean.size(); - cleanedApps += apps.size(); + List deltaContainersToCleanup = + resp.getContainersToCleanup(); + List deltaAppsToCleanup = resp.getApplicationsToCleanup(); + // Add the deltas to the global list + containersToCleanup.addAll(deltaContainersToCleanup); + appsToCleanup.addAll(deltaAppsToCleanup); + // Update counts now + numCleanedContainers = containersToCleanup.size(); + numCleanedApps = appsToCleanup.size(); } - Assert.assertEquals(1, apps.size()); - Assert.assertEquals(app.getApplicationId(), apps.get(0)); - Assert.assertEquals(1, cleanedApps); - Assert.assertEquals(2, cleanedConts); + Assert.assertEquals(1, appsToCleanup.size()); + Assert.assertEquals(app.getApplicationId(), appsToCleanup.get(0)); + Assert.assertEquals(1, numCleanedApps); + Assert.assertEquals(2, numCleanedContainers); rm.stop(); } From 460ac8cb50e024b60e02a96c2ab27368dfe0944d Mon Sep 17 00:00:00 2001 From: Vinod Kumar Vavilapalli Date: Tue, 31 Dec 2013 01:10:01 +0000 Subject: [PATCH 17/42] YARN-1121. Addendum patch. Fixed AsyncDispatcher hang issue during stop due to a race condition caused by the previous patch. Contributed by Jian He. 
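The addendum's diff below replaces the serviceStop busy-wait (spinning on Thread.yield until the queue drains) with a monitor: once new events are blocked, the dispatcher thread notifies a waitForDrained lock whenever it observes an empty queue, and serviceStop waits on that lock with a timeout, also giving up if the handling thread has died. Here is a compact, self-contained sketch of that drain-on-stop pattern in isolation; the Drainer class and its method names are illustrative, not the dispatcher's own code, and the queue holds plain Runnables instead of YARN events.

  import java.util.concurrent.BlockingQueue;
  import java.util.concurrent.LinkedBlockingQueue;

  public class Drainer {
    private final BlockingQueue<Runnable> queue = new LinkedBlockingQueue<Runnable>();
    private final Object waitForDrained = new Object();
    private volatile boolean drained = true;
    private volatile boolean blockNewEvents = false;
    private volatile boolean stopped = false;

    private final Thread worker = new Thread() {
      @Override
      public void run() {
        while (!stopped && !Thread.currentThread().isInterrupted()) {
          drained = queue.isEmpty();
          // Only touch the lock while a stop is in progress, keeping the
          // normal dispatch path free of synchronization.
          if (blockNewEvents && drained) {
            synchronized (waitForDrained) {
              waitForDrained.notify();
            }
          }
          Runnable event;
          try {
            event = queue.take();
          } catch (InterruptedException e) {
            return;
          }
          event.run();
        }
      }
    };

    public void start() {
      worker.start();
    }

    public void dispatch(Runnable event) {
      if (!blockNewEvents) {   // refuse new events once a drain has started
        queue.add(event);
      }
    }

    public void stop() throws InterruptedException {
      blockNewEvents = true;
      synchronized (waitForDrained) {
        while (!drained && worker.isAlive()) {
          waitForDrained.wait(1000);  // woken by the worker once the queue empties
        }
      }
      stopped = true;
      worker.interrupt();
      worker.join();
    }
  }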
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1554344 13f79535-47bb-0310-9956-ffa450edef68 --- .../hadoop/yarn/event/AsyncDispatcher.java | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/event/AsyncDispatcher.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/event/AsyncDispatcher.java index bf5058a9d13..733f0eaabc5 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/event/AsyncDispatcher.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/event/AsyncDispatcher.java @@ -56,6 +56,7 @@ public class AsyncDispatcher extends AbstractService implements Dispatcher { // Indicates all the remaining dispatcher's events on stop have been drained // and processed. private volatile boolean drained = true; + private Object waitForDrained = new Object(); // For drainEventsOnStop enabled only, block newly coming events into the // queue while stopping. @@ -82,6 +83,16 @@ Runnable createThread() { public void run() { while (!stopped && !Thread.currentThread().isInterrupted()) { drained = eventQueue.isEmpty(); + // blockNewEvents is only set when dispatcher is draining to stop, + // adding this check is to avoid the overhead of acquiring the lock + // and calling notify every time in the normal run of the loop. + if (blockNewEvents) { + synchronized (waitForDrained) { + if (drained) { + waitForDrained.notify(); + } + } + } Event event; try { event = eventQueue.take(); @@ -125,8 +136,11 @@ protected void serviceStop() throws Exception { if (drainEventsOnStop) { blockNewEvents = true; LOG.info("AsyncDispatcher is draining to stop, igonring any new events."); - while(!drained) { - Thread.yield(); + synchronized (waitForDrained) { + while (!drained && eventHandlingThread.isAlive()) { + waitForDrained.wait(1000); + LOG.info("Waiting for AsyncDispatcher to drain."); + } } } stopped = true; From 97e881b955b91b07ac7b6fbc0718f0ecf009dc84 Mon Sep 17 00:00:00 2001 From: Uma Maheswara Rao G Date: Tue, 31 Dec 2013 16:24:02 +0000 Subject: [PATCH 18/42] HDFS-5671. Fix socket leak in DFSInputStream#getBlockReader. Contributed by JamesLi git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1554553 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 2 ++ .../apache/hadoop/hdfs/DFSInputStream.java | 20 ++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 303e64d3d61..55c0a8b612e 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -1025,6 +1025,8 @@ Release 2.3.0 - UNRELEASED HDFS-5582. hdfs getconf -excludeFile or -includeFile always failed (sathish via cmccabe) + HDFS-5671. Fix socket leak in DFSInputStream#getBlockReader. 
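The DFSInputStream change that follows wraps BlockReaderFactory.newBlockReader in try/finally so the freshly opened TCP peer is closed whenever the reader could not be constructed; before this, an exception from the factory leaked the socket. The same close-on-failure ownership pattern in miniature, using only the JDK and a generic Closeable resource rather than the HDFS Peer (the CloseOnFailure and Builder names are illustrative):

  import java.io.Closeable;
  import java.io.IOException;

  public class CloseOnFailure {
    interface Builder<R, C> {
      C build(R resource) throws IOException;
    }

    /**
     * Hand the resource to a consumer; if the consumer could not be built,
     * close the resource here so the underlying descriptor is not leaked.
     */
    static <R extends Closeable, C> C buildConsumer(R resource, Builder<R, C> builder)
        throws IOException {
      C consumer = null;
      try {
        consumer = builder.build(resource);   // may throw
        return consumer;
      } finally {
        if (consumer == null) {
          try {
            resource.close();                 // close quietly on failure
          } catch (IOException ignored) {
            // the original exception from build() is the one worth propagating
          }
        }
      }
    }
  }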
(JamesLi via umamahesh) + Release 2.2.0 - 2013-10-13 INCOMPATIBLE CHANGES diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java index 3d26a98b56d..47efcf339a3 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java @@ -1188,11 +1188,21 @@ protected BlockReader getBlockReader(InetSocketAddress dnAddr, } // Try to create a new remote peer. Peer peer = newTcpPeer(dnAddr); - return BlockReaderFactory.newBlockReader( - dfsClient.getConf(), file, block, blockToken, startOffset, - len, verifyChecksum, clientName, peer, chosenNode, - dsFactory, peerCache, fileInputStreamCache, false, - curCachingStrategy); + try { + reader = BlockReaderFactory.newBlockReader(dfsClient.getConf(), file, + block, blockToken, startOffset, len, verifyChecksum, clientName, + peer, chosenNode, dsFactory, peerCache, fileInputStreamCache, false, + curCachingStrategy); + return reader; + } catch (IOException ex) { + DFSClient.LOG.debug( + "Exception while getting block reader, closing stale " + peer, ex); + throw ex; + } finally { + if (reader == null) { + IOUtils.closeQuietly(peer); + } + } } From 07e4fb1455abc33584fc666ef745abe256ebd7d1 Mon Sep 17 00:00:00 2001 From: Andrew Wang Date: Wed, 1 Jan 2014 00:01:12 +0000 Subject: [PATCH 19/42] HDFS-5708. The CacheManager throws a NPE in the DataNode logs when processing cache reports that refer to a block not known to the BlockManager. Contributed by Colin Patrick McCabe. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1554594 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 6 ++ .../CacheReplicationMonitor.java | 73 +++++++++++++++---- .../hdfs/server/namenode/CacheManager.java | 38 ++++------ .../server/namenode/TestCacheDirectives.java | 9 +++ 4 files changed, 87 insertions(+), 39 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 55c0a8b612e..b1bcdf81d36 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -244,12 +244,14 @@ Trunk (Unreleased) HDFS-5636. Enforce a max TTL per cache pool. (awang via cmccabe) OPTIMIZATIONS + HDFS-5349. DNA_CACHE and DNA_UNCACHE should be by blockId only. (cmccabe) HDFS-5665. Remove the unnecessary writeLock while initializing CacheManager in FsNameSystem Ctor. (Uma Maheswara Rao G via Andrew Wang) BUG FIXES + HADOOP-9635 Fix potential Stack Overflow in DomainSocket.c (V. Karthik Kumar via cmccabe) @@ -456,6 +458,10 @@ Trunk (Unreleased) HDFS-5701. Fix the CacheAdmin -addPool -maxTtl option name. (Stephen Chu via wang) + HDFS-5708. The CacheManager throws a NPE in the DataNode logs when + processing cache reports that refer to a block not known to the + BlockManager. (cmccabe via wang) + BREAKDOWN OF HDFS-2832 SUBTASKS AND RELATED JIRAS HDFS-4985. 
Add storage type to the protocol and expose it in block report diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CacheReplicationMonitor.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CacheReplicationMonitor.java index 5aa440fb6f6..e86f345a499 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CacheReplicationMonitor.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CacheReplicationMonitor.java @@ -460,14 +460,21 @@ private void rescanFile(CacheDirective directive, INodeFile file) { directive.getReplication()) * blockInfo.getNumBytes(); cachedTotal += cachedByBlock; - if (mark != ocblock.getMark()) { - // Mark hasn't been set in this scan, so update replication and mark. + if ((mark != ocblock.getMark()) || + (ocblock.getReplication() < directive.getReplication())) { + // + // Overwrite the block's replication and mark in two cases: + // + // 1. If the mark on the CachedBlock is different from the mark for + // this scan, that means the block hasn't been updated during this + // scan, and we should overwrite whatever is there, since it is no + // longer valid. + // + // 2. If the replication in the CachedBlock is less than what the + // directive asks for, we want to increase the block's replication + // field to what the directive asks for. + // ocblock.setReplicationAndMark(directive.getReplication(), mark); - } else { - // Mark already set in this scan. Set replication to highest value in - // any CacheDirective that covers this file. - ocblock.setReplicationAndMark((short)Math.max( - directive.getReplication(), ocblock.getReplication()), mark); } } } @@ -483,6 +490,36 @@ private void rescanFile(CacheDirective directive, INodeFile file) { } } + private String findReasonForNotCaching(CachedBlock cblock, + BlockInfo blockInfo) { + if (blockInfo == null) { + // Somehow, a cache report with the block arrived, but the block + // reports from the DataNode haven't (yet?) described such a block. + // Alternately, the NameNode might have invalidated the block, but the + // DataNode hasn't caught up. In any case, we want to tell the DN + // to uncache this. + return "not tracked by the BlockManager"; + } else if (!blockInfo.isComplete()) { + // When a cached block changes state from complete to some other state + // on the DataNode (perhaps because of append), it will begin the + // uncaching process. However, the uncaching process is not + // instantaneous, especially if clients have pinned the block. So + // there may be a period of time when incomplete blocks remain cached + // on the DataNodes. + return "not complete"; + } else if (cblock.getReplication() == 0) { + // Since 0 is not a valid value for a cache directive's replication + // field, seeing a replication of 0 on a CacheBlock means that it + // has never been reached by any sweep. + return "not needed by any directives"; + } else if (cblock.getMark() != mark) { + // Although the block was needed in the past, we didn't reach it during + // the current sweep. Therefore, it doesn't need to be cached any more. + return "no longer needed by any directives"; + } + return null; + } + /** * Scan through the cached block map. * Any blocks which are under-replicated should be assigned new Datanodes. 
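findReasonForNotCaching above collapses the old mark-comparison logic into an ordered list of reasons: a block the BlockManager does not track, an incomplete block, a replication of zero (never reached by any sweep), or a stale mark all mean the block should no longer be cached, and rescanCachedBlockMap in the next hunk treats any non-null reason as neededCached = 0. A simplified restatement of that precedence, with primitive parameters standing in for CachedBlock and BlockInfo (the CachingDecision class is illustrative only):

  public class CachingDecision {
    /**
     * Returns how many cached replicas a block still needs. Mirrors the
     * ordering used by the monitor: untracked, then incomplete, then never
     * reached, then stale mark, and only then keep the directive's replication.
     */
    static short neededCached(boolean trackedByBlockManager, boolean complete,
        short directiveReplication, boolean markMatchesCurrentSweep) {
      if (!trackedByBlockManager) {
        return 0;   // block unknown to the NameNode: tell the DataNode to uncache
      }
      if (!complete) {
        return 0;   // block went back to under-construction (e.g. append)
      }
      if (directiveReplication == 0) {
        return 0;   // never reached by any sweep, so no directive needs it
      }
      if (!markMatchesCurrentSweep) {
        return 0;   // needed in the past, but not reached during this sweep
      }
      return directiveReplication;
    }
  }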
@@ -508,11 +545,17 @@ private void rescanCachedBlockMap() { iter.remove(); } } - // If the block's mark doesn't match with the mark of this scan, that - // means that this block couldn't be reached during this scan. That means - // it doesn't need to be cached any more. - int neededCached = (cblock.getMark() != mark) ? - 0 : cblock.getReplication(); + BlockInfo blockInfo = blockManager. + getStoredBlock(new Block(cblock.getBlockId())); + String reason = findReasonForNotCaching(cblock, blockInfo); + int neededCached = 0; + if (reason != null) { + if (LOG.isDebugEnabled()) { + LOG.debug("not caching " + cblock + " because it is " + reason); + } + } else { + neededCached = cblock.getReplication(); + } int numCached = cached.size(); if (numCached >= neededCached) { // If we have enough replicas, drop all pending cached. @@ -612,8 +655,10 @@ private void addNewPendingCached(int neededCached, BlockInfo blockInfo = blockManager. getStoredBlock(new Block(cachedBlock.getBlockId())); if (blockInfo == null) { - LOG.debug("Not caching block " + cachedBlock + " because it " + - "was deleted from all DataNodes."); + if (LOG.isDebugEnabled()) { + LOG.debug("Not caching block " + cachedBlock + " because there " + + "is no record of it on the NameNode."); + } return; } if (!blockInfo.isComplete()) { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CacheManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CacheManager.java index e25913d9cb9..f24b386df16 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CacheManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CacheManager.java @@ -62,7 +62,6 @@ import org.apache.hadoop.hdfs.protocol.CachePoolInfo; import org.apache.hadoop.hdfs.protocol.DatanodeID; import org.apache.hadoop.hdfs.protocol.LocatedBlock; -import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo; import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager; import org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor; import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor; @@ -940,39 +939,28 @@ private void processCacheReportImpl(final DatanodeDescriptor datanode, final List blockIds) { CachedBlocksList cached = datanode.getCached(); cached.clear(); + CachedBlocksList cachedList = datanode.getCached(); + CachedBlocksList pendingCachedList = datanode.getPendingCached(); for (Iterator iter = blockIds.iterator(); iter.hasNext(); ) { - Block block = new Block(iter.next()); - BlockInfo blockInfo = blockManager.getStoredBlock(block); - if (!blockInfo.isComplete()) { - LOG.warn("Ignoring block id " + block.getBlockId() + ", because " + - "it is in not complete yet. It is in state " + - blockInfo.getBlockUCState()); - continue; - } - Collection corruptReplicas = - blockManager.getCorruptReplicas(blockInfo); - if ((corruptReplicas != null) && corruptReplicas.contains(datanode)) { - // The NameNode will eventually remove or update the corrupt block. - // Until then, we pretend that it isn't cached. 
- LOG.warn("Ignoring cached replica on " + datanode + " of " + block + - " because it is corrupt."); - continue; - } + long blockId = iter.next(); CachedBlock cachedBlock = - new CachedBlock(block.getBlockId(), (short)0, false); + new CachedBlock(blockId, (short)0, false); CachedBlock prevCachedBlock = cachedBlocks.get(cachedBlock); - // Use the existing CachedBlock if it's present; otherwise, - // insert a new one. + // Add the block ID from the cache report to the cachedBlocks map + // if it's not already there. if (prevCachedBlock != null) { cachedBlock = prevCachedBlock; } else { cachedBlocks.put(cachedBlock); } - if (!cachedBlock.isPresent(datanode.getCached())) { - datanode.getCached().add(cachedBlock); + // Add the block to the datanode's implicit cached block list + // if it's not already there. Similarly, remove it from the pending + // cached block list if it exists there. + if (!cachedBlock.isPresent(cachedList)) { + cachedList.add(cachedBlock); } - if (cachedBlock.isPresent(datanode.getPendingCached())) { - datanode.getPendingCached().remove(cachedBlock); + if (cachedBlock.isPresent(pendingCachedList)) { + pendingCachedList.remove(cachedBlock); } } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCacheDirectives.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCacheDirectives.java index 916e1fa9829..6ab808ea167 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCacheDirectives.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCacheDirectives.java @@ -69,6 +69,7 @@ import org.apache.hadoop.hdfs.protocol.CachePoolStats; import org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor; import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor.CachedBlocksList.Type; +import org.apache.hadoop.hdfs.server.datanode.DataNode; import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols; import org.apache.hadoop.io.nativeio.NativeIO; import org.apache.hadoop.io.nativeio.NativeIO.POSIX.CacheManipulator; @@ -796,7 +797,15 @@ public Boolean get() { } }, 500, 60000); + // Send a cache report referring to a bogus block. It is important that + // the NameNode be robust against this. 
NamenodeProtocols nnRpc = namenode.getRpcServer(); + DataNode dn0 = cluster.getDataNodes().get(0); + String bpid = cluster.getNamesystem().getBlockPoolId(); + LinkedList bogusBlockIds = new LinkedList (); + bogusBlockIds.add(999999L); + nnRpc.cacheReport(dn0.getDNRegistrationForBP(bpid), bpid, bogusBlockIds); + Path rootDir = helper.getDefaultWorkingDirectory(dfs); // Create the pool final String pool = "friendlyPool"; From 75d882ba00080cf2d416ac5f7ebbdeace2ee9f05 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Thu, 2 Jan 2014 13:39:37 +0000 Subject: [PATCH 20/42] HADOOP-10147 HDFS-5678 Upgrade to commons-logging 1.1.3 to avoid potential deadlock in MiniDFSCluster git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1554803 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-common-project/hadoop-common/CHANGES.txt | 3 +++ hadoop-project/pom.xml | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/hadoop-common-project/hadoop-common/CHANGES.txt b/hadoop-common-project/hadoop-common/CHANGES.txt index 11b567e6989..ee6ee6ddb54 100644 --- a/hadoop-common-project/hadoop-common/CHANGES.txt +++ b/hadoop-common-project/hadoop-common/CHANGES.txt @@ -489,6 +489,9 @@ Release 2.4.0 - UNRELEASED HADOOP-10171. TestRPC fails intermittently on jkd7 (Mit Desai via jeagles) + HADOOP-10147 HDFS-5678 Upgrade to commons-logging 1.1.3 to avoid potential + deadlock in MiniDFSCluster (stevel) + Release 2.3.0 - UNRELEASED INCOMPATIBLE CHANGES diff --git a/hadoop-project/pom.xml b/hadoop-project/pom.xml index b296a294e2d..e1ff1eba5bf 100644 --- a/hadoop-project/pom.xml +++ b/hadoop-project/pom.xml @@ -519,7 +519,7 @@ commons-logging commons-logging - 1.1.1 + 1.1.3 avalon-framework From fe458a1e3a172bf1745b2c4d555ad9a065fa47b2 Mon Sep 17 00:00:00 2001 From: Kihwal Lee Date: Thu, 2 Jan 2014 14:58:04 +0000 Subject: [PATCH 21/42] HADOOP-10173. Remove UGI from DIGEST-MD5 SASL server creation. Contributed by Daryn Sharp. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1554815 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-common-project/hadoop-common/CHANGES.txt | 3 +++ .../org/apache/hadoop/security/SaslRpcServer.java | 11 +++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/hadoop-common-project/hadoop-common/CHANGES.txt b/hadoop-common-project/hadoop-common/CHANGES.txt index ee6ee6ddb54..345f09e1789 100644 --- a/hadoop-common-project/hadoop-common/CHANGES.txt +++ b/hadoop-common-project/hadoop-common/CHANGES.txt @@ -416,6 +416,9 @@ Release 2.4.0 - UNRELEASED HADOOP-10172. Cache SASL server factories (daryn) + HADOOP-10173. Remove UGI from DIGEST-MD5 SASL server creation (daryn via + kihwal) + BUG FIXES HADOOP-9964. 
Fix deadlocks in TestHttpServer by synchronize diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SaslRpcServer.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SaslRpcServer.java index bbabd887a2f..b2db83670b0 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SaslRpcServer.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SaslRpcServer.java @@ -131,7 +131,7 @@ public SaslRpcServer(AuthMethod authMethod) throws IOException { public SaslServer create(Connection connection, SecretManager secretManager ) throws IOException, InterruptedException { - UserGroupInformation ugi = UserGroupInformation.getCurrentUser(); + UserGroupInformation ugi = null; final CallbackHandler callback; switch (authMethod) { case TOKEN: { @@ -139,6 +139,7 @@ public SaslServer create(Connection connection, break; } case KERBEROS: { + ugi = UserGroupInformation.getCurrentUser(); if (serverId.isEmpty()) { throw new AccessControlException( "Kerberos principal name does NOT have the expected " @@ -153,7 +154,9 @@ public SaslServer create(Connection connection, "Server does not support SASL " + authMethod); } - SaslServer saslServer = ugi.doAs( + final SaslServer saslServer; + if (ugi != null) { + saslServer = ugi.doAs( new PrivilegedExceptionAction() { @Override public SaslServer run() throws SaslException { @@ -161,6 +164,10 @@ public SaslServer run() throws SaslException { SaslRpcServer.SASL_PROPS, callback); } }); + } else { + saslServer = saslFactory.createSaslServer(mechanism, protocol, serverId, + SaslRpcServer.SASL_PROPS, callback); + } if (saslServer == null) { throw new AccessControlException( "Unable to find SASL server implementation for " + mechanism); From bb2e2fee6071233fa3f708c04c58091f4b8b0f99 Mon Sep 17 00:00:00 2001 From: Vinod Kumar Vavilapalli Date: Thu, 2 Jan 2014 19:54:07 +0000 Subject: [PATCH 22/42] YARN-1549. Fixed a bug in ResourceManager's ApplicationMasterService that was causing unamanged AMs to not finish correctly. Contributed by haosdent. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1554886 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-yarn-project/CHANGES.txt | 3 +++ .../TestUnmanagedAMLauncher.java | 8 ++++++-- .../ApplicationMasterService.java | 17 +++++++++++++++-- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 7d8439534ad..b351ba343a4 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -280,6 +280,9 @@ Release 2.4.0 - UNRELEASED YARN-1522. Fixed a race condition in the test TestApplicationCleanup that was causing it to randomly fail. (Liyin Liang via vinodkv) + YARN-1549. Fixed a bug in ResourceManager's ApplicationMasterService that + was causing unamanged AMs to not finish correctly. 
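For YARN-1549, the changes that follow have the unmanaged AM inspect FinishApplicationMasterResponse.getIsUnregistered() instead of assuming success, and ApplicationMasterService short-circuits unmanaged AMs so that flag comes back true without waiting on the state store. A hedged sketch of the client-side loop an AM can build on that flag, using only the YARN API types visible in the diff; the AmUnregister class, diagnostics string, and retry interval are illustrative assumptions:

  import java.io.IOException;
  import org.apache.hadoop.yarn.api.ApplicationMasterProtocol;
  import org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterRequest;
  import org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterResponse;
  import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
  import org.apache.hadoop.yarn.exceptions.YarnException;

  public class AmUnregister {
    /** Keep calling finishApplicationMaster until the RM confirms the
     *  unregistration; unmanaged AMs now get true on the first call. */
    static void unregister(ApplicationMasterProtocol client)
        throws YarnException, IOException, InterruptedException {
      FinishApplicationMasterRequest req = FinishApplicationMasterRequest
          .newInstance(FinalApplicationStatus.SUCCEEDED, "done", null);
      FinishApplicationMasterResponse resp = client.finishApplicationMaster(req);
      while (!resp.getIsUnregistered()) {
        Thread.sleep(100);                       // RM has not finished persisting yet
        resp = client.finishApplicationMaster(req);
      }
    }
  }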
(haosdent via vinodkv) + Release 2.3.0 - UNRELEASED INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-unmanaged-am-launcher/src/test/java/org/apache/hadoop/yarn/applications/unmanagedamlauncher/TestUnmanagedAMLauncher.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-unmanaged-am-launcher/src/test/java/org/apache/hadoop/yarn/applications/unmanagedamlauncher/TestUnmanagedAMLauncher.java index 0cc4b8e72a8..73833379028 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-unmanaged-am-launcher/src/test/java/org/apache/hadoop/yarn/applications/unmanagedamlauncher/TestUnmanagedAMLauncher.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-unmanaged-am-launcher/src/test/java/org/apache/hadoop/yarn/applications/unmanagedamlauncher/TestUnmanagedAMLauncher.java @@ -29,12 +29,14 @@ import java.net.URL; import junit.framework.Assert; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.yarn.api.ApplicationMasterProtocol; import org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterRequest; +import org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterResponse; import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterRequest; import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; import org.apache.hadoop.yarn.client.ClientRMProxy; @@ -193,8 +195,10 @@ public static void main(String[] args) throws Exception { client.registerApplicationMaster(RegisterApplicationMasterRequest .newInstance(NetUtils.getHostname(), -1, "")); Thread.sleep(1000); - client.finishApplicationMaster(FinishApplicationMasterRequest - .newInstance(FinalApplicationStatus.SUCCEEDED, "success", null)); + FinishApplicationMasterResponse resp = + client.finishApplicationMaster(FinishApplicationMasterRequest + .newInstance(FinalApplicationStatus.SUCCEEDED, "success", null)); + assertTrue(resp.getIsUnregistered()); System.exit(0); } else { System.exit(1); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java index 787ed9fa656..29f0ebe7e97 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java @@ -292,8 +292,21 @@ public FinishApplicationMasterResponse finishApplicationMaster( this.amLivelinessMonitor.receivedPing(applicationAttemptId); - if (rmContext.getRMApps().get(applicationAttemptId.getApplicationId()) - .isAppSafeToTerminate()) { + RMApp rmApp = + rmContext.getRMApps().get(applicationAttemptId.getApplicationId()); + + if (rmApp.getApplicationSubmissionContext().getUnmanagedAM()) { + // No recovery supported yet for unmanaged AM. Send the unregister event + // and (falsely) acknowledge state-store write immediately. 
+ rmContext.getDispatcher().getEventHandler().handle( + new RMAppAttemptUnregistrationEvent(applicationAttemptId, request + .getTrackingUrl(), request.getFinalApplicationStatus(), request + .getDiagnostics())); + return FinishApplicationMasterResponse.newInstance(true); + } + + // Not an unmanaged-AM. + if (rmApp.isAppSafeToTerminate()) { return FinishApplicationMasterResponse.newInstance(true); } else { // keep sending the unregister event as RM may crash in the meanwhile. From 3a299fd7bbacc69d8316a445fdf8c8bcbb79847f Mon Sep 17 00:00:00 2001 From: Andrew Wang Date: Thu, 2 Jan 2014 20:00:07 +0000 Subject: [PATCH 23/42] HADOOP-10198. DomainSocket: add support for socketpair. Contributed by Colin Patrick McCabe. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1554888 13f79535-47bb-0310-9956-ffa450edef68 --- .../hadoop-common/CHANGES.txt | 3 ++ .../apache/hadoop/net/unix/DomainSocket.java | 18 ++++++++ .../org/apache/hadoop/net/unix/DomainSocket.c | 44 +++++++++++++++++++ .../hadoop/net/unix/TestDomainSocket.java | 40 +++++++++++++---- 4 files changed, 97 insertions(+), 8 deletions(-) diff --git a/hadoop-common-project/hadoop-common/CHANGES.txt b/hadoop-common-project/hadoop-common/CHANGES.txt index 345f09e1789..029a5efb8fa 100644 --- a/hadoop-common-project/hadoop-common/CHANGES.txt +++ b/hadoop-common-project/hadoop-common/CHANGES.txt @@ -108,6 +108,9 @@ Trunk (Unreleased) HADOOP-10141. Create KeyProvider API to separate encryption key storage from the applications. (omalley) + HADOOP-19198. DomainSocket: add support for socketpair. + (Colin Patrick McCabe via wang) + BUG FIXES HADOOP-9451. Fault single-layer config if node group topology is enabled. diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/unix/DomainSocket.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/unix/DomainSocket.java index 4c6ae0592c2..bdf4d67a1af 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/unix/DomainSocket.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/unix/DomainSocket.java @@ -276,6 +276,24 @@ public static DomainSocket bindAndListen(String path) throws IOException { return new DomainSocket(path, fd); } + /** + * Create a pair of UNIX domain sockets which are connected to each other + * by calling socketpair(2). + * + * @return An array of two UNIX domain sockets connected to + * each other. + * @throws IOException on error. 
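The socketpair API documented above (its method body and JNI socketpair0 implementation follow) returns two already-connected DomainSocket instances, so tests and other in-process callers can skip bindAndListen/connect entirely. A small usage sketch, assuming the stream accessors DomainSocket already exposes elsewhere (getOutputStream/getInputStream) and that native libhadoop support is available; the SocketpairDemo class is illustrative:

  import org.apache.hadoop.net.unix.DomainSocket;

  public class SocketpairDemo {
    public static void main(String[] args) throws Exception {
      DomainSocket[] pair = DomainSocket.socketpair();  // two ends, already connected
      try {
        pair[0].getOutputStream().write(42);            // write on one end...
        int read = pair[1].getInputStream().read();     // ...read it on the other
        System.out.println("received " + read);
      } finally {
        pair[0].close();
        pair[1].close();
      }
    }
  }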
+ */ + public static DomainSocket[] socketpair() throws IOException { + int fds[] = socketpair0(); + return new DomainSocket[] { + new DomainSocket("(anonymous0)", fds[0]), + new DomainSocket("(anonymous1)", fds[1]) + }; + } + + private static native int[] socketpair0() throws IOException; + private static native int accept0(int fd) throws IOException; /** diff --git a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/net/unix/DomainSocket.c b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/net/unix/DomainSocket.c index 26423f8d836..48c4252fe7c 100644 --- a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/net/unix/DomainSocket.c +++ b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/net/unix/DomainSocket.c @@ -364,6 +364,50 @@ JNIEnv *env, jclass clazz, jstring path) return fd; } +#define SOCKETPAIR_ARRAY_LEN 2 + +JNIEXPORT jarray JNICALL +Java_org_apache_hadoop_net_unix_DomainSocket_socketpair0( +JNIEnv *env, jclass clazz) +{ + jarray arr = NULL; + int idx, err, fds[SOCKETPAIR_ARRAY_LEN] = { -1, -1 }; + jthrowable jthr = NULL; + + arr = (*env)->NewIntArray(env, SOCKETPAIR_ARRAY_LEN); + jthr = (*env)->ExceptionOccurred(env); + if (jthr) { + (*env)->ExceptionClear(env); + goto done; + } + if (socketpair(PF_UNIX, SOCK_STREAM, 0, fds) < 0) { + err = errno; + jthr = newSocketException(env, err, + "socketpair(2) error: %s", terror(err)); + goto done; + } + (*env)->SetIntArrayRegion(env, arr, 0, SOCKETPAIR_ARRAY_LEN, fds); + jthr = (*env)->ExceptionOccurred(env); + if (jthr) { + (*env)->ExceptionClear(env); + goto done; + } + +done: + if (jthr) { + (*env)->DeleteLocalRef(env, arr); + arr = NULL; + for (idx = 0; idx < SOCKETPAIR_ARRAY_LEN; idx++) { + if (fds[idx] >= 0) { + close(fds[idx]); + fds[idx] = -1; + } + } + (*env)->Throw(env, jthr); + } + return arr; +} + JNIEXPORT jint JNICALL Java_org_apache_hadoop_net_unix_DomainSocket_accept0( JNIEnv *env, jclass clazz, jint fd) diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/net/unix/TestDomainSocket.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/net/unix/TestDomainSocket.java index d512027d45d..d6d9591ddd0 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/net/unix/TestDomainSocket.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/net/unix/TestDomainSocket.java @@ -420,7 +420,8 @@ public int read(byte b[], int off, int length) throws IOException { * @throws IOException */ void testClientServer1(final Class writeStrategyClass, - final Class readStrategyClass) throws Exception { + final Class readStrategyClass, + final DomainSocket preConnectedSockets[]) throws Exception { final String TEST_PATH = new File(sockDir.getDir(), "test_sock_client_server1").getAbsolutePath(); final byte clientMsg1[] = new byte[] { 0x1, 0x2, 0x3, 0x4, 0x5, 0x6 }; @@ -428,13 +429,15 @@ void testClientServer1(final Class writeStrategyClass, final byte clientMsg2 = 0x45; final ArrayBlockingQueue threadResults = new ArrayBlockingQueue(2); - final DomainSocket serv = DomainSocket.bindAndListen(TEST_PATH); + final DomainSocket serv = (preConnectedSockets != null) ? + null : DomainSocket.bindAndListen(TEST_PATH); Thread serverThread = new Thread() { public void run(){ // Run server DomainSocket conn = null; try { - conn = serv.accept(); + conn = preConnectedSockets != null ? 
+ preConnectedSockets[0] : serv.accept(); byte in1[] = new byte[clientMsg1.length]; ReadStrategy reader = readStrategyClass.newInstance(); reader.init(conn); @@ -459,7 +462,8 @@ public void run(){ Thread clientThread = new Thread() { public void run(){ try { - DomainSocket client = DomainSocket.connect(TEST_PATH); + DomainSocket client = preConnectedSockets != null ? + preConnectedSockets[1] : DomainSocket.connect(TEST_PATH); WriteStrategy writer = writeStrategyClass.newInstance(); writer.init(client); writer.write(clientMsg1); @@ -487,25 +491,45 @@ public void run(){ } serverThread.join(120000); clientThread.join(120000); - serv.close(); + if (serv != null) { + serv.close(); + } } @Test(timeout=180000) public void testClientServerOutStreamInStream() throws Exception { testClientServer1(OutputStreamWriteStrategy.class, - InputStreamReadStrategy.class); + InputStreamReadStrategy.class, null); + } + + @Test(timeout=180000) + public void testClientServerOutStreamInStreamWithSocketpair() throws Exception { + testClientServer1(OutputStreamWriteStrategy.class, + InputStreamReadStrategy.class, DomainSocket.socketpair()); } @Test(timeout=180000) public void testClientServerOutStreamInDbb() throws Exception { testClientServer1(OutputStreamWriteStrategy.class, - DirectByteBufferReadStrategy.class); + DirectByteBufferReadStrategy.class, null); + } + + @Test(timeout=180000) + public void testClientServerOutStreamInDbbWithSocketpair() throws Exception { + testClientServer1(OutputStreamWriteStrategy.class, + DirectByteBufferReadStrategy.class, DomainSocket.socketpair()); } @Test(timeout=180000) public void testClientServerOutStreamInAbb() throws Exception { testClientServer1(OutputStreamWriteStrategy.class, - ArrayBackedByteBufferReadStrategy.class); + ArrayBackedByteBufferReadStrategy.class, null); + } + + @Test(timeout=180000) + public void testClientServerOutStreamInAbbWithSocketpair() throws Exception { + testClientServer1(OutputStreamWriteStrategy.class, + ArrayBackedByteBufferReadStrategy.class, DomainSocket.socketpair()); } static private class PassedFile { From a44ddd674a2db7d349c5e380e11dff20bd3bc401 Mon Sep 17 00:00:00 2001 From: Andrew Wang Date: Thu, 2 Jan 2014 20:07:02 +0000 Subject: [PATCH 24/42] Amend CHANGES.txt for HADOOP-10198 git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1554890 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-common-project/hadoop-common/CHANGES.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hadoop-common-project/hadoop-common/CHANGES.txt b/hadoop-common-project/hadoop-common/CHANGES.txt index 029a5efb8fa..cdf3b7a1e58 100644 --- a/hadoop-common-project/hadoop-common/CHANGES.txt +++ b/hadoop-common-project/hadoop-common/CHANGES.txt @@ -108,9 +108,6 @@ Trunk (Unreleased) HADOOP-10141. Create KeyProvider API to separate encryption key storage from the applications. (omalley) - HADOOP-19198. DomainSocket: add support for socketpair. - (Colin Patrick McCabe via wang) - BUG FIXES HADOOP-9451. Fault single-layer config if node group topology is enabled. @@ -410,6 +407,9 @@ Release 2.4.0 - UNRELEASED HADOOP-10169. Remove the unnecessary synchronized in JvmMetrics class. (Liang Xie via jing9) + HADOOP-10198. DomainSocket: add support for socketpair. + (Colin Patrick McCabe via wang) + OPTIMIZATIONS HADOOP-9748. Reduce blocking on UGI.ensureInitialized (daryn) From b4eb963c3c3e0b123003d7b32cdf7c9202cfb441 Mon Sep 17 00:00:00 2001 From: Andrew Wang Date: Thu, 2 Jan 2014 20:11:41 +0000 Subject: [PATCH 25/42] HDFS-5659. 
dfsadmin -report doesn't output cache information properly. Contributed by Andrew Wang. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1554893 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/hadoop/util/StringUtils.java | 6 +- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 3 + .../hadoop/hdfs/protocolPB/PBHelper.java | 33 ++++------- .../apache/hadoop/hdfs/tools/CacheAdmin.java | 59 ++++++++++++------- .../server/datanode/TestFsDatasetCache.java | 30 +++++++++- .../server/namenode/TestCacheDirectives.java | 23 +++++++- 6 files changed, 108 insertions(+), 46 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/StringUtils.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/StringUtils.java index d1c428e0f42..6c890d54f81 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/StringUtils.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/StringUtils.java @@ -928,8 +928,10 @@ public static String getStackTrace(Thread t) { * @param args List of arguments. * @return null if the option was not found; the value of the * option otherwise. + * @throws IllegalArgumentException if the option's argument is not present */ - public static String popOptionWithArgument(String name, List args) { + public static String popOptionWithArgument(String name, List args) + throws IllegalArgumentException { String val = null; for (Iterator iter = args.iterator(); iter.hasNext(); ) { String cur = iter.next(); @@ -939,7 +941,7 @@ public static String popOptionWithArgument(String name, List args) { } else if (cur.equals(name)) { iter.remove(); if (!iter.hasNext()) { - throw new RuntimeException("option " + name + " requires 1 " + + throw new IllegalArgumentException("option " + name + " requires 1 " + "argument."); } val = iter.next(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index b1bcdf81d36..4dd7f018b3b 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -462,6 +462,9 @@ Trunk (Unreleased) processing cache reports that refer to a block not known to the BlockManager. (cmccabe via wang) + HDFS-5659. dfsadmin -report doesn't output cache information properly. + (wang) + BREAKDOWN OF HDFS-2832 SUBTASKS AND RELATED JIRAS HDFS-4985. Add storage type to the protocol and expose it in block report diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelper.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelper.java index 3b4c82b287e..10be9062842 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelper.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelper.java @@ -511,21 +511,7 @@ static public DatanodeInfo convert(DatanodeInfoProto di) { static public DatanodeInfoProto convertDatanodeInfo(DatanodeInfo di) { if (di == null) return null; - DatanodeInfoProto.Builder builder = DatanodeInfoProto.newBuilder(); - if (di.getNetworkLocation() != null) { - builder.setLocation(di.getNetworkLocation()); - } - - return builder. - setId(PBHelper.convert((DatanodeID) di)). - setCapacity(di.getCapacity()). - setDfsUsed(di.getDfsUsed()). - setRemaining(di.getRemaining()). - setBlockPoolUsed(di.getBlockPoolUsed()). - setLastUpdate(di.getLastUpdate()). - setXceiverCount(di.getXceiverCount()). 
- setAdminState(PBHelper.convert(di.getAdminState())). - build(); + return convert(di); } @@ -569,15 +555,20 @@ public static DatanodeInfo[] convert(List list) { public static DatanodeInfoProto convert(DatanodeInfo info) { DatanodeInfoProto.Builder builder = DatanodeInfoProto.newBuilder(); - builder.setBlockPoolUsed(info.getBlockPoolUsed()); - builder.setAdminState(PBHelper.convert(info.getAdminState())); - builder.setCapacity(info.getCapacity()) - .setDfsUsed(info.getDfsUsed()) + if (info.getNetworkLocation() != null) { + builder.setLocation(info.getNetworkLocation()); + } + builder .setId(PBHelper.convert((DatanodeID)info)) - .setLastUpdate(info.getLastUpdate()) - .setLocation(info.getNetworkLocation()) + .setCapacity(info.getCapacity()) + .setDfsUsed(info.getDfsUsed()) .setRemaining(info.getRemaining()) + .setBlockPoolUsed(info.getBlockPoolUsed()) + .setCacheCapacity(info.getCacheCapacity()) + .setCacheUsed(info.getCacheUsed()) + .setLastUpdate(info.getLastUpdate()) .setXceiverCount(info.getXceiverCount()) + .setAdminState(PBHelper.convert(info.getAdminState())) .build(); return builder.build(); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/CacheAdmin.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/CacheAdmin.java index 41c43fae188..341d8b19273 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/CacheAdmin.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/CacheAdmin.java @@ -84,7 +84,12 @@ public int run(String[] args) throws IOException { for (int j = 1; j < args.length; j++) { argsList.add(args[j]); } - return command.run(getConf(), argsList); + try { + return command.run(getConf(), argsList); + } catch (IllegalArgumentException e) { + System.err.println(prettifyException(e)); + return -1; + } } public static void main(String[] argsArray) throws IOException { @@ -135,6 +140,20 @@ private static Long parseTtlString(String maxTtlString) throws IOException { return maxTtl; } + private static Expiration parseExpirationString(String ttlString) + throws IOException { + Expiration ex = null; + if (ttlString != null) { + if (ttlString.equalsIgnoreCase("never")) { + ex = CacheDirectiveInfo.Expiration.NEVER; + } else { + long ttl = DFSUtil.parseRelativeTime(ttlString); + ex = CacheDirectiveInfo.Expiration.newRelative(ttl); + } + } + return ex; + } + interface Command { String getName(); String getShortUsage(); @@ -171,6 +190,7 @@ public String getLongUsage() { listing.addRow("", "How long the directive is " + "valid. Can be specified in minutes, hours, and days, e.g. " + "30m, 4h, 2d. Valid units are [smhd]." + + " \"never\" indicates a directive that never expires." 
+ " If unspecified, the directive never expires."); return getShortUsage() + "\n" + "Add a new cache directive.\n\n" + @@ -203,15 +223,15 @@ public int run(Configuration conf, List args) throws IOException { } String ttlString = StringUtils.popOptionWithArgument("-ttl", args); - if (ttlString != null) { - try { - long ttl = DFSUtil.parseRelativeTime(ttlString); - builder.setExpiration(CacheDirectiveInfo.Expiration.newRelative(ttl)); - } catch (IOException e) { - System.err.println( - "Error while parsing ttl value: " + e.getMessage()); - return 1; + try { + Expiration ex = parseExpirationString(ttlString); + if (ex != null) { + builder.setExpiration(ex); } + } catch (IOException e) { + System.err.println( + "Error while parsing ttl value: " + e.getMessage()); + return 1; } if (!args.isEmpty()) { @@ -326,7 +346,7 @@ public String getLongUsage() { listing.addRow("", "How long the directive is " + "valid. Can be specified in minutes, hours, and days, e.g. " + "30m, 4h, 2d. Valid units are [smhd]." + - " If unspecified, the directive never expires."); + " \"never\" indicates a directive that never expires."); return getShortUsage() + "\n" + "Modify a cache directive.\n\n" + listing.toString(); @@ -362,17 +382,16 @@ public int run(Configuration conf, List args) throws IOException { modified = true; } String ttlString = StringUtils.popOptionWithArgument("-ttl", args); - if (ttlString != null) { - long ttl; - try { - ttl = DFSUtil.parseRelativeTime(ttlString); - } catch (IOException e) { - System.err.println( - "Error while parsing ttl value: " + e.getMessage()); - return 1; + try { + Expiration ex = parseExpirationString(ttlString); + if (ex != null) { + builder.setExpiration(ex); + modified = true; } - builder.setExpiration(CacheDirectiveInfo.Expiration.newRelative(ttl)); - modified = true; + } catch (IOException e) { + System.err.println( + "Error while parsing ttl value: " + e.getMessage()); + return 1; } if (!args.isEmpty()) { System.err.println("Can't understand argument: " + args.get(0)); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestFsDatasetCache.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestFsDatasetCache.java index 7c5ab7dbd57..85605ddb044 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestFsDatasetCache.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestFsDatasetCache.java @@ -36,6 +36,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.HdfsBlockLocation; import org.apache.hadoop.fs.Path; @@ -82,7 +83,11 @@ public class TestFsDatasetCache { // Most Linux installs allow a default of 64KB locked memory private static final long CACHE_CAPACITY = 64 * 1024; - private static final long BLOCK_SIZE = 4096; + // mlock always locks the entire page. So we don't need to deal with this + // rounding, use the OS page size for the block size. 
+ private static final long PAGE_SIZE = + NativeIO.POSIX.getCacheManipulator().getOperatingSystemPageSize(); + private static final long BLOCK_SIZE = PAGE_SIZE; private static Configuration conf; private static MiniDFSCluster cluster = null; @@ -451,4 +456,27 @@ public Boolean get() { } }, 100, 10000); } + + @Test(timeout=60000) + public void testPageRounder() throws Exception { + // Write a small file + Path fileName = new Path("/testPageRounder"); + final int smallBlocks = 512; // This should be smaller than the page size + assertTrue("Page size should be greater than smallBlocks!", + PAGE_SIZE > smallBlocks); + final int numBlocks = 5; + final int fileLen = smallBlocks * numBlocks; + FSDataOutputStream out = + fs.create(fileName, false, 4096, (short)1, smallBlocks); + out.write(new byte[fileLen]); + out.close(); + HdfsBlockLocation[] locs = (HdfsBlockLocation[])fs.getFileBlockLocations( + fileName, 0, fileLen); + // Cache the file and check the sizes match the page size + setHeartbeatResponse(cacheBlocks(locs)); + verifyExpectedCacheUsage(PAGE_SIZE * numBlocks, numBlocks); + // Uncache and check that it decrements by the page size too + setHeartbeatResponse(uncacheBlocks(locs)); + verifyExpectedCacheUsage(0, 0); + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCacheDirectives.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCacheDirectives.java index 6ab808ea167..4be876e985f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCacheDirectives.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCacheDirectives.java @@ -65,7 +65,9 @@ import org.apache.hadoop.hdfs.protocol.CacheDirectiveStats; import org.apache.hadoop.hdfs.protocol.CachePoolEntry; import org.apache.hadoop.hdfs.protocol.CachePoolInfo; +import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo.Expiration; +import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType; import org.apache.hadoop.hdfs.protocol.CachePoolStats; import org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor; import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor.CachedBlocksList.Type; @@ -105,7 +107,7 @@ public class TestCacheDirectives { EditLogFileOutputStream.setShouldSkipFsyncForTesting(false); } - private static final long BLOCK_SIZE = 512; + private static final long BLOCK_SIZE = 4096; private static final int NUM_DATANODES = 4; // Most Linux installs will allow non-root users to lock 64KB. // In this test though, we stub out mlock so this doesn't matter. 
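The testPageRounder test above writes five 512-byte blocks yet expects cache usage of PAGE_SIZE * numBlocks, because mlock pins whole pages: the DataNode rounds every cached block length up to a multiple of the operating-system page size. The rounding itself is the usual round-up-to-multiple arithmetic; a small sketch of that calculation (the PageRounding class and method name are illustrative, not the DataNode's, and a 4 KB page is assumed for the printed numbers):

  public class PageRounding {
    /** Round length up to the next multiple of pageSize (pageSize > 0). */
    static long roundUpToPageSize(long length, long pageSize) {
      return ((length + pageSize - 1) / pageSize) * pageSize;
    }

    public static void main(String[] args) {
      long pageSize = 4096;
      long smallBlock = 512;
      int numBlocks = 5;
      // Naive byte count vs. what actually gets locked in memory:
      System.out.println("bytes written: " + smallBlock * numBlocks);          // 2560
      System.out.println("bytes mlocked: "
          + roundUpToPageSize(smallBlock, pageSize) * numBlocks);              // 20480
    }
  }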
@@ -835,6 +837,24 @@ public Boolean get() { waitForCachedBlocks(namenode, expected, expected, "testWaitForCachedReplicas:1"); } + + // Check that the datanodes have the right cache values + DatanodeInfo[] live = dfs.getDataNodeStats(DatanodeReportType.LIVE); + assertEquals("Unexpected number of live nodes", NUM_DATANODES, live.length); + long totalUsed = 0; + for (DatanodeInfo dn : live) { + final long cacheCapacity = dn.getCacheCapacity(); + final long cacheUsed = dn.getCacheUsed(); + final long cacheRemaining = dn.getCacheRemaining(); + assertEquals("Unexpected cache capacity", CACHE_CAPACITY, cacheCapacity); + assertEquals("Capacity not equal to used + remaining", + cacheCapacity, cacheUsed + cacheRemaining); + assertEquals("Remaining not equal to capacity - used", + cacheCapacity - cacheUsed, cacheRemaining); + totalUsed += cacheUsed; + } + assertEquals(expected*BLOCK_SIZE, totalUsed); + // Uncache and check each path in sequence RemoteIterator entries = new CacheDirectiveIterator(nnRpc, null); @@ -974,7 +994,6 @@ public void testWaitForCachedReplicasInDirectory() throws Exception { (4+3) * numBlocksPerFile * BLOCK_SIZE, 3, 2, poolInfo, "testWaitForCachedReplicasInDirectory:2:pool"); - // remove and watch numCached go to 0 dfs.removeCacheDirective(id); dfs.removeCacheDirective(id2); From a45017a5f17ec11814db7e206d1e80aaa4dd8d8f Mon Sep 17 00:00:00 2001 From: Vinod Kumar Vavilapalli Date: Thu, 2 Jan 2014 20:19:45 +0000 Subject: [PATCH 26/42] YARN-1493. Changed ResourceManager and Scheduler interfacing to recognize app-attempts separately from apps. Contributed by Jian He. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1554896 13f79535-47bb-0310-9956-ffa450edef68 --- .../scheduler/ResourceSchedulerWrapper.java | 37 +- .../sls/scheduler/SLSCapacityScheduler.java | 11 +- hadoop-yarn-project/CHANGES.txt | 3 + .../resourcemanager/rmapp/RMAppEventType.java | 4 +- .../resourcemanager/rmapp/RMAppImpl.java | 82 +++- .../rmapp/attempt/RMAppAttemptEventType.java | 3 +- .../rmapp/attempt/RMAppAttemptImpl.java | 66 +-- .../scheduler/ActiveUsersManager.java | 6 +- .../scheduler/SchedulerAppReport.java | 2 +- .../scheduler/SchedulerAppUtils.java | 2 +- .../scheduler/SchedulerApplication.java | 381 +--------------- .../SchedulerApplicationAttempt.java | 410 ++++++++++++++++++ .../scheduler/capacity/CSQueue.java | 31 +- .../scheduler/capacity/CapacityScheduler.java | 128 ++++-- .../scheduler/capacity/LeafQueue.java | 79 ++-- .../scheduler/capacity/ParentQueue.java | 41 +- .../common/fica/FiCaSchedulerApp.java | 4 +- .../common/fica/FiCaSchedulerNode.java | 6 +- .../event/AppAddedSchedulerEvent.java} | 34 +- .../event/AppAttemptAddedSchedulerEvent.java | 15 +- .../event/AppRemovedSchedulerEvent.java | 43 ++ .../scheduler/event/SchedulerEventType.java | 6 +- .../scheduler/fair/FSParentQueue.java | 2 +- .../scheduler/fair/FSSchedulerApp.java | 4 +- .../scheduler/fair/FairScheduler.java | 125 ++++-- .../scheduler/fifo/FifoScheduler.java | 96 ++-- .../server/resourcemanager/Application.java | 12 +- .../resourcemanager/TestClientRMService.java | 2 +- .../resourcemanager/TestFifoScheduler.java | 28 +- .../server/resourcemanager/TestRMRestart.java | 27 +- .../rmapp/TestRMAppTransitions.java | 12 +- .../attempt/TestRMAppAttemptTransitions.java | 20 +- .../scheduler/TestSchedulerUtils.java | 25 ++ .../capacity/TestApplicationLimits.java | 38 +- .../capacity/TestCapacityScheduler.java | 35 +- .../scheduler/capacity/TestLeafQueue.java | 70 +-- .../scheduler/fair/TestFairScheduler.java | 210 
+++++---- .../scheduler/fifo/TestFifoScheduler.java | 63 ++- .../webapp/TestRMWebServicesApps.java | 5 + 39 files changed, 1268 insertions(+), 900 deletions(-) create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java rename hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/{rmapp/attempt/event/RMAppAttemptRejectedEvent.java => scheduler/event/AppAddedSchedulerEvent.java} (54%) create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/event/AppRemovedSchedulerEvent.java diff --git a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/ResourceSchedulerWrapper.java b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/ResourceSchedulerWrapper.java index bc7f7a086ad..b24e20eb1d0 100644 --- a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/ResourceSchedulerWrapper.java +++ b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/ResourceSchedulerWrapper.java @@ -64,8 +64,9 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerAppReport; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNodeReport; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler; -import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptAddedSchedulerEvent; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptRemovedSchedulerEvent; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppRemovedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEventType; @@ -105,8 +106,8 @@ public class ResourceSchedulerWrapper implements private Configuration conf; private ResourceScheduler scheduler; - private Map appQueueMap = - new ConcurrentHashMap(); + private Map appQueueMap = + new ConcurrentHashMap(); private BufferedWriter jobRuntimeLogBW; // Priority of the ResourceSchedulerWrapper shutdown hook. @@ -240,7 +241,7 @@ public void handle(SchedulerEvent schedulerEvent) { (AppAttemptRemovedSchedulerEvent) schedulerEvent; ApplicationAttemptId appAttemptId = appRemoveEvent.getApplicationAttemptID(); - String queue = appQueueMap.get(appAttemptId); + String queue = appQueueMap.get(appAttemptId.getApplicationId()); SchedulerAppReport app = scheduler.getSchedulerAppInfo(appAttemptId); if (! 
app.getLiveContainers().isEmpty()) { // have 0 or 1 // should have one container which is AM container @@ -262,20 +263,18 @@ public void handle(SchedulerEvent schedulerEvent) { schedulerHandleCounter.inc(); schedulerHandleCounterMap.get(schedulerEvent.getType()).inc(); - if (schedulerEvent.getType() == SchedulerEventType.APP_ATTEMPT_REMOVED - && schedulerEvent instanceof AppAttemptRemovedSchedulerEvent) { + if (schedulerEvent.getType() == SchedulerEventType.APP_REMOVED + && schedulerEvent instanceof AppRemovedSchedulerEvent) { SLSRunner.decreaseRemainingApps(); - AppAttemptRemovedSchedulerEvent appRemoveEvent = - (AppAttemptRemovedSchedulerEvent) schedulerEvent; - ApplicationAttemptId appAttemptId = - appRemoveEvent.getApplicationAttemptID(); - appQueueMap.remove(appRemoveEvent.getApplicationAttemptID()); - } else if (schedulerEvent.getType() == SchedulerEventType.APP_ATTEMPT_ADDED - && schedulerEvent instanceof AppAttemptAddedSchedulerEvent) { - AppAttemptAddedSchedulerEvent appAddEvent = - (AppAttemptAddedSchedulerEvent) schedulerEvent; + AppRemovedSchedulerEvent appRemoveEvent = + (AppRemovedSchedulerEvent) schedulerEvent; + appQueueMap.remove(appRemoveEvent.getApplicationID()); + } else if (schedulerEvent.getType() == SchedulerEventType.APP_ADDED + && schedulerEvent instanceof AppAddedSchedulerEvent) { + AppAddedSchedulerEvent appAddEvent = + (AppAddedSchedulerEvent) schedulerEvent; String queueName = appAddEvent.getQueue(); - appQueueMap.put(appAddEvent.getApplicationAttemptId(), queueName); + appQueueMap.put(appAddEvent.getApplicationId(), queueName); } } } @@ -297,7 +296,9 @@ private void updateQueueWithNodeUpdate( continue; } - String queue = appQueueMap.get(containerId.getApplicationAttemptId()); + String queue = + appQueueMap.get(containerId.getApplicationAttemptId() + .getApplicationId()); int releasedMemory = 0, releasedVCores = 0; if (status.getExitStatus() == ContainerExitStatus.SUCCESS) { for (RMContainer rmc : app.getLiveContainers()) { @@ -329,7 +330,7 @@ private void updateQueueWithAllocateRequest(Allocation allocation, // update queue information Resource pendingResource = Resources.createResource(0, 0); Resource allocatedResource = Resources.createResource(0, 0); - String queueName = appQueueMap.get(attemptId); + String queueName = appQueueMap.get(attemptId.getApplicationId()); // container requested for (ResourceRequest request : resourceRequests) { if (request.getResourceName().equals(ResourceRequest.ANY)) { diff --git a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/SLSCapacityScheduler.java b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/SLSCapacityScheduler.java index 1b304de79af..6a84e5838c4 100644 --- a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/SLSCapacityScheduler.java +++ b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/SLSCapacityScheduler.java @@ -283,10 +283,13 @@ public void handle(SchedulerEvent schedulerEvent) { appQueueMap.remove(appRemoveEvent.getApplicationAttemptID()); } else if (schedulerEvent.getType() == SchedulerEventType.APP_ATTEMPT_ADDED && schedulerEvent instanceof AppAttemptAddedSchedulerEvent) { - AppAttemptAddedSchedulerEvent appAddEvent = - (AppAttemptAddedSchedulerEvent) schedulerEvent; - String queueName = appAddEvent.getQueue(); - appQueueMap.put(appAddEvent.getApplicationAttemptId(), queueName); + AppAttemptAddedSchedulerEvent appAddEvent = + (AppAttemptAddedSchedulerEvent) schedulerEvent; + SchedulerApplication app = + 
applications.get(appAddEvent.getApplicationAttemptId() + .getApplicationId()); + appQueueMap.put(appAddEvent.getApplicationAttemptId(), app.getQueue() + .getQueueName()); } } } diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index b351ba343a4..33225f06dbb 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -194,6 +194,9 @@ Release 2.4.0 - UNRELEASED YARN-1541. Changed ResourceManager to invalidate ApplicationMaster host/port information once an AM crashes. (Jian He via vinodkv) + YARN-1493. Changed ResourceManager and Scheduler interfacing to recognize + app-attempts separately from apps. (Jian He via vinodkv) + OPTIMIZATIONS BUG FIXES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppEventType.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppEventType.java index ad3f20d23d9..bddcb352bb8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppEventType.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppEventType.java @@ -24,9 +24,11 @@ public enum RMAppEventType { RECOVER, KILL, - // Source: RMAppAttempt + // Source: Scheduler APP_REJECTED, APP_ACCEPTED, + + // Source: RMAppAttempt ATTEMPT_REGISTERED, ATTEMPT_UNREGISTERED, ATTEMPT_FINISHED, // Will send the final state diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java index 0bf7c817454..1d451fbc85b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java @@ -66,6 +66,8 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeCleanAppEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAddedSchedulerEvent; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppRemovedSchedulerEvent; import org.apache.hadoop.yarn.server.utils.BuilderUtils; import org.apache.hadoop.yarn.state.InvalidStateTransitonException; import org.apache.hadoop.yarn.state.MultipleArcTransition; @@ -136,7 +138,7 @@ RMAppEventType.NODE_UPDATE, new RMAppNodeUpdateTransition()) .addTransition(RMAppState.NEW, RMAppState.NEW_SAVING, RMAppEventType.START, new RMAppNewlySavingTransition()) .addTransition(RMAppState.NEW, EnumSet.of(RMAppState.SUBMITTED, - RMAppState.RUNNING, RMAppState.FINISHED, RMAppState.FAILED, + RMAppState.ACCEPTED, RMAppState.FINISHED, RMAppState.FAILED, RMAppState.KILLED, RMAppState.FINAL_SAVING), RMAppEventType.RECOVER, new RMAppRecoveredTransition()) 
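The SLS hunks above re-key appQueueMap by ApplicationId so that attempt-level and container-level callers can both resolve a queue from whatever id they hold. Roughly, the lookups reduce to the following sketch (the QueueLookupSketch class and its method names are illustrative; appQueueMap and the YARN id types come from the patch):

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId;

// Hypothetical helper; only the ApplicationId-keyed appQueueMap idea is from the patch.
class QueueLookupSketch {
  private final Map<ApplicationId, String> appQueueMap =
      new ConcurrentHashMap<ApplicationId, String>();

  // Called when the scheduler accepts an application into a queue.
  void appAdded(ApplicationId appId, String queue) {
    appQueueMap.put(appId, queue);
  }

  // Attempt-scoped callers: every ApplicationAttemptId carries its ApplicationId.
  String queueOfAttempt(ApplicationAttemptId attemptId) {
    return appQueueMap.get(attemptId.getApplicationId());
  }

  // Container-scoped callers: container -> attempt -> application,
  // mirroring the lookup in updateQueueWithNodeUpdate().
  String queueOfContainer(ContainerId containerId) {
    return appQueueMap.get(
        containerId.getApplicationAttemptId().getApplicationId());
  }

  // Called once the application (not each attempt) leaves the scheduler.
  void appRemoved(ApplicationId appId) {
    appQueueMap.remove(appId);
  }
}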
.addTransition(RMAppState.NEW, RMAppState.FINAL_SAVING, RMAppEventType.KILL, @@ -151,7 +153,7 @@ RMAppEventType.RECOVER, new RMAppRecoveredTransition()) .addTransition(RMAppState.NEW_SAVING, RMAppState.NEW_SAVING, RMAppEventType.NODE_UPDATE, new RMAppNodeUpdateTransition()) .addTransition(RMAppState.NEW_SAVING, RMAppState.SUBMITTED, - RMAppEventType.APP_NEW_SAVED, new StartAppAttemptTransition()) + RMAppEventType.APP_NEW_SAVED, new AddApplicationToSchedulerTransition()) .addTransition(RMAppState.NEW_SAVING, RMAppState.FINAL_SAVING, RMAppEventType.KILL, new FinalSavingTransition( @@ -169,9 +171,12 @@ RMAppEventType.NODE_UPDATE, new RMAppNodeUpdateTransition()) new FinalSavingTransition( new AppRejectedTransition(), RMAppState.FAILED)) .addTransition(RMAppState.SUBMITTED, RMAppState.ACCEPTED, - RMAppEventType.APP_ACCEPTED) - .addTransition(RMAppState.SUBMITTED, RMAppState.KILLING, - RMAppEventType.KILL,new KillAttemptTransition()) + RMAppEventType.APP_ACCEPTED, new StartAppAttemptTransition()) + .addTransition(RMAppState.SUBMITTED, RMAppState.FINAL_SAVING, + RMAppEventType.KILL, + new FinalSavingTransition( + new AppKilledTransition(), RMAppState.KILLED)) + // Transitions from ACCEPTED state .addTransition(RMAppState.ACCEPTED, RMAppState.ACCEPTED, @@ -179,11 +184,22 @@ RMAppEventType.NODE_UPDATE, new RMAppNodeUpdateTransition()) .addTransition(RMAppState.ACCEPTED, RMAppState.RUNNING, RMAppEventType.ATTEMPT_REGISTERED) .addTransition(RMAppState.ACCEPTED, - EnumSet.of(RMAppState.SUBMITTED, RMAppState.FINAL_SAVING), + EnumSet.of(RMAppState.ACCEPTED, RMAppState.FINAL_SAVING), + // ACCEPTED state is possible to receive ATTEMPT_FAILED event because + // RMAppRecoveredTransition is returning ACCEPTED state directly and + // waiting for the previous AM to exit. RMAppEventType.ATTEMPT_FAILED, - new AttemptFailedTransition(RMAppState.SUBMITTED)) - .addTransition(RMAppState.ACCEPTED, RMAppState.KILLING, - RMAppEventType.KILL,new KillAttemptTransition()) + new AttemptFailedTransition(RMAppState.ACCEPTED)) + .addTransition(RMAppState.ACCEPTED, RMAppState.FINAL_SAVING, + RMAppEventType.KILL, + new FinalSavingTransition( + new AppKilledTransition(), RMAppState.KILLED)) + // ACCEPTED state can once again receive APP_ACCEPTED event, because on + // recovery the app returns ACCEPTED state and the app once again goes + // through the scheduler and triggers one more APP_ACCEPTED event at + // ACCEPTED state.
+ .addTransition(RMAppState.ACCEPTED, RMAppState.ACCEPTED, + RMAppEventType.APP_ACCEPTED) // Transitions from RUNNING state .addTransition(RMAppState.RUNNING, RMAppState.RUNNING, @@ -197,9 +213,9 @@ RMAppEventType.NODE_UPDATE, new RMAppNodeUpdateTransition()) // UnManagedAM directly jumps to finished RMAppEventType.ATTEMPT_FINISHED, FINISHED_TRANSITION) .addTransition(RMAppState.RUNNING, - EnumSet.of(RMAppState.SUBMITTED, RMAppState.FINAL_SAVING), + EnumSet.of(RMAppState.ACCEPTED, RMAppState.FINAL_SAVING), RMAppEventType.ATTEMPT_FAILED, - new AttemptFailedTransition(RMAppState.SUBMITTED)) + new AttemptFailedTransition(RMAppState.ACCEPTED)) .addTransition(RMAppState.RUNNING, RMAppState.KILLING, RMAppEventType.KILL, new KillAttemptTransition()) @@ -641,7 +657,7 @@ private void createNewAttempt(boolean startAttempt) { ApplicationAttemptId.newInstance(applicationId, attempts.size() + 1); RMAppAttempt attempt = new RMAppAttemptImpl(appAttemptId, rmContext, scheduler, masterService, - submissionContext, conf, user); + submissionContext, conf); attempts.put(appAttemptId, attempt); currentAttempt = attempt; if(startAttempt) { @@ -695,29 +711,46 @@ public RMAppState transition(RMAppImpl app, RMAppEvent event) { return app.recoveredFinalState; } + // Notify scheduler about the app on recovery + new AddApplicationToSchedulerTransition().transition(app, event); + // No existent attempts means the attempt associated with this app was not // started or started but not yet saved. if (app.attempts.isEmpty()) { - app.createNewAttempt(true); return RMAppState.SUBMITTED; } - return RMAppState.RUNNING; + // YARN-1507 is saving the application state after the application is + // accepted. So after YARN-1507, an app is saved meaning it is accepted. + // Thus we return ACCEPTED state on recovery.
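With this change the scheduler is told about an application and about each of its attempts as two separate notifications. A condensed sketch of that two-step registration, assuming the dispatcher wiring used in this patch (the SchedulerNotificationSketch wrapper is illustrative; the event classes and their constructors are the ones introduced here):

import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAddedSchedulerEvent;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptAddedSchedulerEvent;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent;

// Illustrative wrapper around the two-step registration an RMApp/RMAppAttempt now performs.
class SchedulerNotificationSketch {
  private final EventHandler<SchedulerEvent> schedulerDispatcher;

  SchedulerNotificationSketch(EventHandler<SchedulerEvent> schedulerDispatcher) {
    this.schedulerDispatcher = schedulerDispatcher;
  }

  // Step 1: once the submission is persisted (or on recovery), register the
  // application itself. The scheduler replies with RMAppEventType.APP_ACCEPTED,
  // which drives the SUBMITTED -> ACCEPTED transition above.
  void registerApplication(ApplicationId appId, String queue, String user) {
    schedulerDispatcher.handle(new AppAddedSchedulerEvent(appId, queue, user));
  }

  // Step 2: only after APP_ACCEPTED does the app create an attempt, which
  // registers separately; the scheduler replies with
  // RMAppAttemptEventType.ATTEMPT_ADDED.
  void registerAttempt(ApplicationAttemptId attemptId) {
    schedulerDispatcher.handle(new AppAttemptAddedSchedulerEvent(attemptId));
  }
}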
+ return RMAppState.ACCEPTED; + } + } + + private static final class AddApplicationToSchedulerTransition extends + RMAppTransition { + @SuppressWarnings("unchecked") + @Override + public void transition(RMAppImpl app, RMAppEvent event) { + if (event instanceof RMAppNewSavedEvent) { + RMAppNewSavedEvent storeEvent = (RMAppNewSavedEvent) event; + // For HA this exception needs to be handled by giving up + // master status if we got fenced + if (((RMAppNewSavedEvent) event).getStoredException() != null) { + LOG.error( + "Failed to store application: " + storeEvent.getApplicationId(), + storeEvent.getStoredException()); + ExitUtil.terminate(1, storeEvent.getStoredException()); + } + } + app.handler.handle(new AppAddedSchedulerEvent(app.applicationId, + app.submissionContext.getQueue(), app.user)); } } private static final class StartAppAttemptTransition extends RMAppTransition { @Override public void transition(RMAppImpl app, RMAppEvent event) { - RMAppNewSavedEvent storeEvent = (RMAppNewSavedEvent) event; - if (storeEvent.getStoredException() != null) { - // For HA this exception needs to be handled by giving up - // master status if we got fenced - LOG.error( - "Failed to store application: " + storeEvent.getApplicationId(), - storeEvent.getStoredException()); - ExitUtil.terminate(1, storeEvent.getStoredException()); - } app.createNewAttempt(true); }; } @@ -965,6 +998,8 @@ public void transition(RMAppImpl app, RMAppEvent event) { if (app.finishTime == 0 ) { app.finishTime = System.currentTimeMillis(); } + app.handler.handle(new AppRemovedSchedulerEvent(app.applicationId, app + .getState())); app.handler.handle( new RMAppManagerEvent(app.applicationId, RMAppManagerEventType.APP_COMPLETED)); @@ -993,7 +1028,6 @@ public RMAppState transition(RMAppImpl app, RMAppEvent event) { return RMAppState.FINAL_SAVING; } } - } @Override diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptEventType.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptEventType.java index bac27139647..e1522f1bf73 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptEventType.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptEventType.java @@ -45,8 +45,7 @@ public enum RMAppAttemptEventType { ATTEMPT_UPDATE_SAVED, // Source: Scheduler - APP_REJECTED, - APP_ACCEPTED, + ATTEMPT_ADDED, // Source: RMAttemptImpl.recover RECOVER diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java index 647bc59c9dd..f805f423ee2 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java +++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java @@ -75,13 +75,11 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppFailedAttemptEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppFinishedAttemptEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl; -import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppRejectedEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptContainerAcquiredEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptContainerFinishedEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptLaunchFailedEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptNewSavedEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptRegistrationEvent; -import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptRejectedEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptStatusupdateEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptUnregistrationEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptUpdateSavedEvent; @@ -150,7 +148,6 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable { private final StringBuilder diagnostics = new StringBuilder(); private Configuration conf; - private String user; private static final ExpiredTransition EXPIRED_TRANSITION = new ExpiredTransition(); @@ -186,14 +183,10 @@ RMAppAttemptEventType.START, new AttemptStartedTransition()) RMAppAttemptEventType.RECOVER, new AttemptRecoveredTransition()) // Transitions from SUBMITTED state - .addTransition(RMAppAttemptState.SUBMITTED, RMAppAttemptState.FINAL_SAVING, - RMAppAttemptEventType.APP_REJECTED, - new FinalSavingTransition(new AppRejectedTransition(), - RMAppAttemptState.FAILED)) .addTransition(RMAppAttemptState.SUBMITTED, EnumSet.of(RMAppAttemptState.LAUNCHED_UNMANAGED_SAVING, RMAppAttemptState.SCHEDULED), - RMAppAttemptEventType.APP_ACCEPTED, + RMAppAttemptEventType.ATTEMPT_ADDED, new ScheduleTransition()) .addTransition(RMAppAttemptState.SUBMITTED, RMAppAttemptState.FINAL_SAVING, RMAppAttemptEventType.KILL, @@ -380,8 +373,7 @@ RMAppAttemptEventType.STATUS_UPDATE, new StatusUpdateTransition()) .addTransition( RMAppAttemptState.KILLED, RMAppAttemptState.KILLED, - EnumSet.of(RMAppAttemptEventType.APP_ACCEPTED, - RMAppAttemptEventType.APP_REJECTED, + EnumSet.of(RMAppAttemptEventType.ATTEMPT_ADDED, RMAppAttemptEventType.EXPIRE, RMAppAttemptEventType.LAUNCHED, RMAppAttemptEventType.LAUNCH_FAILED, @@ -398,7 +390,7 @@ public RMAppAttemptImpl(ApplicationAttemptId appAttemptId, RMContext rmContext, YarnScheduler scheduler, ApplicationMasterService masterService, ApplicationSubmissionContext submissionContext, - Configuration conf, String user) { + Configuration conf) { this.conf = conf; this.applicationAttemptId = appAttemptId; this.rmContext = rmContext; @@ -414,7 +406,6 @@ public RMAppAttemptImpl(ApplicationAttemptId appAttemptId, this.proxiedTrackingUrl = generateProxyUriWithScheme(null); this.stateMachine = stateMachineFactory.make(this); - this.user = user; } @Override @@ -750,35 +741,8 @@ public void transition(RMAppAttemptImpl appAttempt, 
appAttempt.rmContext.getAMRMTokenSecretManager()); // Add the applicationAttempt to the scheduler - appAttempt.eventHandler.handle( - new AppAttemptAddedSchedulerEvent(appAttempt.applicationAttemptId, - appAttempt.submissionContext.getQueue(), appAttempt.user)); - } - } - - private static final class AppRejectedTransition extends BaseTransition { - @Override - public void transition(RMAppAttemptImpl appAttempt, - RMAppAttemptEvent event) { - - RMAppAttemptRejectedEvent rejectedEvent = (RMAppAttemptRejectedEvent) event; - - // Tell the AMS. Unregister from the ApplicationMasterService - appAttempt.masterService - .unregisterAttempt(appAttempt.applicationAttemptId); - - // Save the diagnostic message - String message = rejectedEvent.getMessage(); - appAttempt.diagnostics.append(message); - - // Send the rejection event to app - appAttempt.eventHandler.handle( - new RMAppRejectedEvent( - rejectedEvent.getApplicationAttemptId().getApplicationId(), - message) - ); - - appAttempt.removeCredentials(appAttempt); + appAttempt.eventHandler.handle(new AppAttemptAddedSchedulerEvent( + appAttempt.applicationAttemptId)); } } @@ -794,11 +758,6 @@ private static final class ScheduleTransition public RMAppAttemptState transition(RMAppAttemptImpl appAttempt, RMAppAttemptEvent event) { if (!appAttempt.submissionContext.getUnmanagedAM()) { - // Send the acceptance to the app - appAttempt.eventHandler.handle(new RMAppEvent(event - .getApplicationAttemptId().getApplicationId(), - RMAppEventType.APP_ACCEPTED)); - // Request a container for the AM. ResourceRequest request = BuilderUtils.newResourceRequest( @@ -918,11 +877,6 @@ private void rememberTargetTransitionsAndStoreState(RMAppAttemptEvent event, FinalApplicationStatus finalStatus = null; switch (event.getType()) { - case APP_REJECTED: - RMAppAttemptRejectedEvent rejectedEvent = - (RMAppAttemptRejectedEvent) event; - diags = rejectedEvent.getMessage(); - break; case LAUNCH_FAILED: RMAppAttemptLaunchFailedEvent launchFaileEvent = (RMAppAttemptLaunchFailedEvent) event; @@ -1091,16 +1045,6 @@ private static final class UnmanagedAMAttemptSavedTransition public void transition(RMAppAttemptImpl appAttempt, RMAppAttemptEvent event) { appAttempt.checkAttemptStoreError(event); - // Send the acceptance to the app - // Ideally this should have been done when the scheduler accepted the app. - // But its here because until the attempt is saved the client should not - // launch the unmanaged AM. Client waits for the app status to be accepted - // before doing so. 
So we have to delay the accepted state until we have - // completed storing the attempt - appAttempt.eventHandler.handle(new RMAppEvent(event - .getApplicationAttemptId().getApplicationId(), - RMAppEventType.APP_ACCEPTED)); - super.transition(appAttempt, event); } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/ActiveUsersManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/ActiveUsersManager.java index e9c5c5ae53a..36e68583857 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/ActiveUsersManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/ActiveUsersManager.java @@ -56,7 +56,7 @@ public ActiveUsersManager(QueueMetrics metrics) { * @param user application user * @param applicationId activated application */ - @Lock({Queue.class, SchedulerApplication.class}) + @Lock({Queue.class, SchedulerApplicationAttempt.class}) synchronized public void activateApplication( String user, ApplicationId applicationId) { Set userApps = usersApplications.get(user); @@ -79,7 +79,7 @@ synchronized public void activateApplication( * @param user application user * @param applicationId deactivated application */ - @Lock({Queue.class, SchedulerApplication.class}) + @Lock({Queue.class, SchedulerApplicationAttempt.class}) synchronized public void deactivateApplication( String user, ApplicationId applicationId) { Set userApps = usersApplications.get(user); @@ -102,7 +102,7 @@ synchronized public void deactivateApplication( * resource requests. 
* @return number of active users */ - @Lock({Queue.class, SchedulerApplication.class}) + @Lock({Queue.class, SchedulerApplicationAttempt.class}) synchronized public int getNumActiveUsers() { return activeUsers; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerAppReport.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerAppReport.java index f1dc9d2ae37..669b97a841b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerAppReport.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerAppReport.java @@ -36,7 +36,7 @@ public class SchedulerAppReport { private final Collection reserved; private final boolean pending; - public SchedulerAppReport(SchedulerApplication app) { + public SchedulerAppReport(SchedulerApplicationAttempt app) { this.live = app.getLiveContainers(); this.reserved = app.getReservedContainers(); this.pending = app.isPending(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerAppUtils.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerAppUtils.java index be68fe2e28f..36a124421d0 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerAppUtils.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerAppUtils.java @@ -22,7 +22,7 @@ public class SchedulerAppUtils { - public static boolean isBlacklisted(SchedulerApplication application, + public static boolean isBlacklisted(SchedulerApplicationAttempt application, SchedulerNode node, Log LOG) { if (application.isBlacklisted(node.getNodeName())) { if (LOG.isDebugEnabled()) { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplication.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplication.java index 0fb8acbfbc1..48e3ee85f76 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplication.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplication.java @@ -17,393 +17,26 @@ */ package org.apache.hadoop.yarn.server.resourcemanager.scheduler; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import 
org.apache.hadoop.classification.InterfaceAudience.Private; -import org.apache.hadoop.classification.InterfaceStability.Stable; import org.apache.hadoop.classification.InterfaceStability.Unstable; -import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; -import org.apache.hadoop.yarn.api.records.ApplicationId; -import org.apache.hadoop.yarn.api.records.ApplicationResourceUsageReport; -import org.apache.hadoop.yarn.api.records.Container; -import org.apache.hadoop.yarn.api.records.ContainerId; -import org.apache.hadoop.yarn.api.records.NodeId; -import org.apache.hadoop.yarn.api.records.Priority; -import org.apache.hadoop.yarn.api.records.Resource; -import org.apache.hadoop.yarn.api.records.ResourceRequest; -import org.apache.hadoop.yarn.server.resourcemanager.RMContext; -import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; -import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; -import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEvent; -import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType; -import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl; -import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerReservedEvent; -import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeCleanContainerEvent; -import org.apache.hadoop.yarn.util.resource.Resources; -import com.google.common.collect.HashMultiset; -import com.google.common.collect.Multiset; - -/** - * Represents an application attempt from the viewpoint of the scheduler. - * Each running app attempt in the RM corresponds to one instance - * of this class. - */ @Private @Unstable -public abstract class SchedulerApplication { - - private static final Log LOG = LogFactory.getLog(SchedulerApplication.class); +public class SchedulerApplication { - protected final AppSchedulingInfo appSchedulingInfo; - - protected final Map liveContainers = - new HashMap(); - protected final Map> reservedContainers = - new HashMap>(); + private final Queue queue; + private final String user; - private final Multiset reReservations = HashMultiset.create(); - - protected final Resource currentReservation = Resource.newInstance(0, 0); - private Resource resourceLimit = Resource.newInstance(0, 0); - protected final Resource currentConsumption = Resource.newInstance(0, 0); - - protected List newlyAllocatedContainers = - new ArrayList(); - - /** - * Count how many times the application has been given an opportunity - * to schedule a task at each priority. Each time the scheduler - * asks the application for a task at this priority, it is incremented, - * and each time the application successfully schedules a task, it - * is reset to 0. - */ - Multiset schedulingOpportunities = HashMultiset.create(); - - // Time of the last container scheduled at the current allowed level - protected Map lastScheduledContainer = - new HashMap(); - - protected final Queue queue; - protected boolean isStopped = false; - - protected final RMContext rmContext; - - public SchedulerApplication(ApplicationAttemptId applicationAttemptId, - String user, Queue queue, ActiveUsersManager activeUsersManager, - RMContext rmContext) { - this.rmContext = rmContext; - this.appSchedulingInfo = - new AppSchedulingInfo(applicationAttemptId, user, queue, - activeUsersManager); + public SchedulerApplication(Queue queue, String user) { this.queue = queue; - } - - /** - * Get the live containers of the application. 
- * @return live containers of the application - */ - public synchronized Collection getLiveContainers() { - return new ArrayList(liveContainers.values()); - } - - /** - * Is this application pending? - * @return true if it is else false. - */ - public boolean isPending() { - return appSchedulingInfo.isPending(); - } - - /** - * Get {@link ApplicationAttemptId} of the application master. - * @return ApplicationAttemptId of the application master - */ - public ApplicationAttemptId getApplicationAttemptId() { - return appSchedulingInfo.getApplicationAttemptId(); - } - - public ApplicationId getApplicationId() { - return appSchedulingInfo.getApplicationId(); - } - - public String getUser() { - return appSchedulingInfo.getUser(); + this.user = user; } - public Map getResourceRequests(Priority priority) { - return appSchedulingInfo.getResourceRequests(priority); - } - - public int getNewContainerId() { - return appSchedulingInfo.getNewContainerId(); - } - - public Collection getPriorities() { - return appSchedulingInfo.getPriorities(); - } - - public ResourceRequest getResourceRequest(Priority priority, String resourceName) { - return this.appSchedulingInfo.getResourceRequest(priority, resourceName); - } - - public synchronized int getTotalRequiredResources(Priority priority) { - return getResourceRequest(priority, ResourceRequest.ANY).getNumContainers(); - } - - public Resource getResource(Priority priority) { - return appSchedulingInfo.getResource(priority); - } - - public String getQueueName() { - return appSchedulingInfo.getQueueName(); - } - - public synchronized RMContainer getRMContainer(ContainerId id) { - return liveContainers.get(id); - } - - protected synchronized void resetReReservations(Priority priority) { - reReservations.setCount(priority, 0); - } - - protected synchronized void addReReservation(Priority priority) { - reReservations.add(priority); - } - - public synchronized int getReReservations(Priority priority) { - return reReservations.count(priority); - } - - /** - * Get total current reservations. - * Used only by unit tests - * @return total current reservations - */ - @Stable - @Private - public synchronized Resource getCurrentReservation() { - return currentReservation; - } - public Queue getQueue() { return queue; } - - public synchronized void updateResourceRequests( - List requests) { - if (!isStopped) { - appSchedulingInfo.updateResourceRequests(requests); - } - } - - public synchronized void stop(RMAppAttemptState rmAppAttemptFinalState) { - // Cleanup all scheduling information - isStopped = true; - appSchedulingInfo.stop(rmAppAttemptFinalState); - } - public synchronized boolean isStopped() { - return isStopped; + public String getUser() { + return user; } - - /** - * Get the list of reserved containers - * @return All of the reserved containers. 
- */ - public synchronized List getReservedContainers() { - List reservedContainers = new ArrayList(); - for (Map.Entry> e : - this.reservedContainers.entrySet()) { - reservedContainers.addAll(e.getValue().values()); - } - return reservedContainers; - } - - public synchronized RMContainer reserve(SchedulerNode node, Priority priority, - RMContainer rmContainer, Container container) { - // Create RMContainer if necessary - if (rmContainer == null) { - rmContainer = - new RMContainerImpl(container, getApplicationAttemptId(), - node.getNodeID(), rmContext.getDispatcher().getEventHandler(), - rmContext.getContainerAllocationExpirer()); - - Resources.addTo(currentReservation, container.getResource()); - - // Reset the re-reservation count - resetReReservations(priority); - } else { - // Note down the re-reservation - addReReservation(priority); - } - rmContainer.handle(new RMContainerReservedEvent(container.getId(), - container.getResource(), node.getNodeID(), priority)); - - Map reservedContainers = - this.reservedContainers.get(priority); - if (reservedContainers == null) { - reservedContainers = new HashMap(); - this.reservedContainers.put(priority, reservedContainers); - } - reservedContainers.put(node.getNodeID(), rmContainer); - - LOG.info("Application " + getApplicationId() - + " reserved container " + rmContainer - + " on node " + node + ", currently has " + reservedContainers.size() - + " at priority " + priority - + "; currentReservation " + currentReservation.getMemory()); - - return rmContainer; - } - - /** - * Has the application reserved the given node at the - * given priority? - * @param node node to be checked - * @param priority priority of reserved container - * @return true is reserved, false if not - */ - public synchronized boolean isReserved(SchedulerNode node, Priority priority) { - Map reservedContainers = - this.reservedContainers.get(priority); - if (reservedContainers != null) { - return reservedContainers.containsKey(node.getNodeID()); - } - return false; - } - - public synchronized void setHeadroom(Resource globalLimit) { - this.resourceLimit = globalLimit; - } - - /** - * Get available headroom in terms of resources for the application's user. - * @return available resource headroom - */ - public synchronized Resource getHeadroom() { - // Corner case to deal with applications being slightly over-limit - if (resourceLimit.getMemory() < 0) { - resourceLimit.setMemory(0); - } - - return resourceLimit; - } - - public synchronized int getNumReservedContainers(Priority priority) { - Map reservedContainers = - this.reservedContainers.get(priority); - return (reservedContainers == null) ? 0 : reservedContainers.size(); - } - - @SuppressWarnings("unchecked") - public synchronized void containerLaunchedOnNode(ContainerId containerId, - NodeId nodeId) { - // Inform the container - RMContainer rmContainer = getRMContainer(containerId); - if (rmContainer == null) { - // Some unknown container sneaked into the system. Kill it. 
- rmContext.getDispatcher().getEventHandler() - .handle(new RMNodeCleanContainerEvent(nodeId, containerId)); - return; - } - - rmContainer.handle(new RMContainerEvent(containerId, - RMContainerEventType.LAUNCHED)); - } - - public synchronized void showRequests() { - if (LOG.isDebugEnabled()) { - for (Priority priority : getPriorities()) { - Map requests = getResourceRequests(priority); - if (requests != null) { - LOG.debug("showRequests:" + " application=" + getApplicationId() + - " headRoom=" + getHeadroom() + - " currentConsumption=" + currentConsumption.getMemory()); - for (ResourceRequest request : requests.values()) { - LOG.debug("showRequests:" + " application=" + getApplicationId() - + " request=" + request); - } - } - } - } - } - - public Resource getCurrentConsumption() { - return currentConsumption; - } - - public synchronized List pullNewlyAllocatedContainers() { - List returnContainerList = new ArrayList( - newlyAllocatedContainers.size()); - for (RMContainer rmContainer : newlyAllocatedContainers) { - rmContainer.handle(new RMContainerEvent(rmContainer.getContainerId(), - RMContainerEventType.ACQUIRED)); - returnContainerList.add(rmContainer.getContainer()); - } - newlyAllocatedContainers.clear(); - return returnContainerList; - } - - public synchronized void updateBlacklist( - List blacklistAdditions, List blacklistRemovals) { - if (!isStopped) { - this.appSchedulingInfo.updateBlacklist( - blacklistAdditions, blacklistRemovals); - } - } - - public boolean isBlacklisted(String resourceName) { - return this.appSchedulingInfo.isBlacklisted(resourceName); - } - - public synchronized void addSchedulingOpportunity(Priority priority) { - schedulingOpportunities.setCount(priority, - schedulingOpportunities.count(priority) + 1); - } - - public synchronized void subtractSchedulingOpportunity(Priority priority) { - int count = schedulingOpportunities.count(priority) - 1; - this.schedulingOpportunities.setCount(priority, Math.max(count, 0)); - } - - /** - * Return the number of times the application has been given an opportunity - * to schedule a task at the given priority since the last time it - * successfully did so. - */ - public synchronized int getSchedulingOpportunities(Priority priority) { - return schedulingOpportunities.count(priority); - } - - /** - * Should be called when an application has successfully scheduled a container, - * or when the scheduling locality threshold is relaxed. - * Reset various internal counters which affect delay scheduling - * - * @param priority The priority of the container scheduled. 
- */ - public synchronized void resetSchedulingOpportunities(Priority priority) { - resetSchedulingOpportunities(priority, System.currentTimeMillis()); - } - // used for continuous scheduling - public synchronized void resetSchedulingOpportunities(Priority priority, - long currentTimeMs) { - lastScheduledContainer.put(priority, currentTimeMs); - schedulingOpportunities.setCount(priority, 0); - } - - public synchronized ApplicationResourceUsageReport getResourceUsageReport() { - return ApplicationResourceUsageReport.newInstance(liveContainers.size(), - reservedContainers.size(), Resources.clone(currentConsumption), - Resources.clone(currentReservation), - Resources.add(currentConsumption, currentReservation)); - } - } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java new file mode 100644 index 00000000000..c601ceef03c --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java @@ -0,0 +1,410 @@ +/** +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ +package org.apache.hadoop.yarn.server.resourcemanager.scheduler; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.classification.InterfaceAudience.Private; +import org.apache.hadoop.classification.InterfaceStability.Stable; +import org.apache.hadoop.classification.InterfaceStability.Unstable; +import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; +import org.apache.hadoop.yarn.api.records.ApplicationId; +import org.apache.hadoop.yarn.api.records.ApplicationResourceUsageReport; +import org.apache.hadoop.yarn.api.records.Container; +import org.apache.hadoop.yarn.api.records.ContainerId; +import org.apache.hadoop.yarn.api.records.NodeId; +import org.apache.hadoop.yarn.api.records.Priority; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.api.records.ResourceRequest; +import org.apache.hadoop.yarn.server.resourcemanager.RMContext; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; +import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; +import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEvent; +import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType; +import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl; +import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerReservedEvent; +import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeCleanContainerEvent; +import org.apache.hadoop.yarn.util.resource.Resources; + +import com.google.common.collect.HashMultiset; +import com.google.common.collect.Multiset; + +/** + * Represents an application attempt from the viewpoint of the scheduler. + * Each running app attempt in the RM corresponds to one instance + * of this class. + */ +@Private +@Unstable +public abstract class SchedulerApplicationAttempt { + + private static final Log LOG = LogFactory + .getLog(SchedulerApplicationAttempt.class); + + protected final AppSchedulingInfo appSchedulingInfo; + + protected final Map liveContainers = + new HashMap(); + protected final Map> reservedContainers = + new HashMap>(); + + private final Multiset reReservations = HashMultiset.create(); + + protected final Resource currentReservation = Resource.newInstance(0, 0); + private Resource resourceLimit = Resource.newInstance(0, 0); + protected final Resource currentConsumption = Resource.newInstance(0, 0); + + protected List newlyAllocatedContainers = + new ArrayList(); + + /** + * Count how many times the application has been given an opportunity + * to schedule a task at each priority. Each time the scheduler + * asks the application for a task at this priority, it is incremented, + * and each time the application successfully schedules a task, it + * is reset to 0. 
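The counter described just above is what drives delay scheduling: an attempt is skipped on non-local nodes until it has been passed over enough times. A minimal usage sketch follows (DelaySchedulingSketch and its threshold policy are hypothetical; only the counter methods belong to SchedulerApplicationAttempt):

import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt;

// Hypothetical caller of the opportunity counters; the locality-delay policy is illustrative.
class DelaySchedulingSketch {
  private final int localityDelay; // how many skips to tolerate before relaxing locality

  DelaySchedulingSketch(int localityDelay) {
    this.localityDelay = localityDelay;
  }

  // Each time the attempt is offered a node it cannot use locally, the counter grows;
  // once it exceeds the delay, the scheduler may assign off-switch.
  boolean shouldRelaxLocality(SchedulerApplicationAttempt attempt, Priority priority) {
    attempt.addSchedulingOpportunity(priority);
    return attempt.getSchedulingOpportunities(priority) > localityDelay;
  }

  // After a successful assignment at this priority, the delay clock starts over.
  void onContainerAssigned(SchedulerApplicationAttempt attempt, Priority priority) {
    attempt.resetSchedulingOpportunities(priority);
  }
}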
+ */ + Multiset schedulingOpportunities = HashMultiset.create(); + + // Time of the last container scheduled at the current allowed level + protected Map lastScheduledContainer = + new HashMap(); + + protected final Queue queue; + protected boolean isStopped = false; + + protected final RMContext rmContext; + + public SchedulerApplicationAttempt(ApplicationAttemptId applicationAttemptId, + String user, Queue queue, ActiveUsersManager activeUsersManager, + RMContext rmContext) { + this.rmContext = rmContext; + this.appSchedulingInfo = + new AppSchedulingInfo(applicationAttemptId, user, queue, + activeUsersManager); + this.queue = queue; + } + + /** + * Get the live containers of the application. + * @return live containers of the application + */ + public synchronized Collection getLiveContainers() { + return new ArrayList(liveContainers.values()); + } + + /** + * Is this application pending? + * @return true if it is else false. + */ + public boolean isPending() { + return appSchedulingInfo.isPending(); + } + + /** + * Get {@link ApplicationAttemptId} of the application master. + * @return ApplicationAttemptId of the application master + */ + public ApplicationAttemptId getApplicationAttemptId() { + return appSchedulingInfo.getApplicationAttemptId(); + } + + public ApplicationId getApplicationId() { + return appSchedulingInfo.getApplicationId(); + } + + public String getUser() { + return appSchedulingInfo.getUser(); + } + + public Map getResourceRequests(Priority priority) { + return appSchedulingInfo.getResourceRequests(priority); + } + + public int getNewContainerId() { + return appSchedulingInfo.getNewContainerId(); + } + + public Collection getPriorities() { + return appSchedulingInfo.getPriorities(); + } + + public ResourceRequest getResourceRequest(Priority priority, String resourceName) { + return this.appSchedulingInfo.getResourceRequest(priority, resourceName); + } + + public synchronized int getTotalRequiredResources(Priority priority) { + return getResourceRequest(priority, ResourceRequest.ANY).getNumContainers(); + } + + public Resource getResource(Priority priority) { + return appSchedulingInfo.getResource(priority); + } + + public String getQueueName() { + return appSchedulingInfo.getQueueName(); + } + + public synchronized RMContainer getRMContainer(ContainerId id) { + return liveContainers.get(id); + } + + protected synchronized void resetReReservations(Priority priority) { + reReservations.setCount(priority, 0); + } + + protected synchronized void addReReservation(Priority priority) { + reReservations.add(priority); + } + + public synchronized int getReReservations(Priority priority) { + return reReservations.count(priority); + } + + /** + * Get total current reservations. + * Used only by unit tests + * @return total current reservations + */ + @Stable + @Private + public synchronized Resource getCurrentReservation() { + return currentReservation; + } + + public Queue getQueue() { + return queue; + } + + public synchronized void updateResourceRequests( + List requests) { + if (!isStopped) { + appSchedulingInfo.updateResourceRequests(requests); + } + } + + public synchronized void stop(RMAppAttemptState rmAppAttemptFinalState) { + // Cleanup all scheduling information + isStopped = true; + appSchedulingInfo.stop(rmAppAttemptFinalState); + } + + public synchronized boolean isStopped() { + return isStopped; + } + + /** + * Get the list of reserved containers + * @return All of the reserved containers. 
+ */ + public synchronized List getReservedContainers() { + List reservedContainers = new ArrayList(); + for (Map.Entry> e : + this.reservedContainers.entrySet()) { + reservedContainers.addAll(e.getValue().values()); + } + return reservedContainers; + } + + public synchronized RMContainer reserve(SchedulerNode node, Priority priority, + RMContainer rmContainer, Container container) { + // Create RMContainer if necessary + if (rmContainer == null) { + rmContainer = + new RMContainerImpl(container, getApplicationAttemptId(), + node.getNodeID(), rmContext.getDispatcher().getEventHandler(), + rmContext.getContainerAllocationExpirer()); + + Resources.addTo(currentReservation, container.getResource()); + + // Reset the re-reservation count + resetReReservations(priority); + } else { + // Note down the re-reservation + addReReservation(priority); + } + rmContainer.handle(new RMContainerReservedEvent(container.getId(), + container.getResource(), node.getNodeID(), priority)); + + Map reservedContainers = + this.reservedContainers.get(priority); + if (reservedContainers == null) { + reservedContainers = new HashMap(); + this.reservedContainers.put(priority, reservedContainers); + } + reservedContainers.put(node.getNodeID(), rmContainer); + + LOG.info("Application " + getApplicationId() + + " reserved container " + rmContainer + + " on node " + node + ", currently has " + reservedContainers.size() + + " at priority " + priority + + "; currentReservation " + currentReservation.getMemory()); + + return rmContainer; + } + + /** + * Has the application reserved the given node at the + * given priority? + * @param node node to be checked + * @param priority priority of reserved container + * @return true is reserved, false if not + */ + public synchronized boolean isReserved(SchedulerNode node, Priority priority) { + Map reservedContainers = + this.reservedContainers.get(priority); + if (reservedContainers != null) { + return reservedContainers.containsKey(node.getNodeID()); + } + return false; + } + + public synchronized void setHeadroom(Resource globalLimit) { + this.resourceLimit = globalLimit; + } + + /** + * Get available headroom in terms of resources for the application's user. + * @return available resource headroom + */ + public synchronized Resource getHeadroom() { + // Corner case to deal with applications being slightly over-limit + if (resourceLimit.getMemory() < 0) { + resourceLimit.setMemory(0); + } + + return resourceLimit; + } + + public synchronized int getNumReservedContainers(Priority priority) { + Map reservedContainers = + this.reservedContainers.get(priority); + return (reservedContainers == null) ? 0 : reservedContainers.size(); + } + + @SuppressWarnings("unchecked") + public synchronized void containerLaunchedOnNode(ContainerId containerId, + NodeId nodeId) { + // Inform the container + RMContainer rmContainer = getRMContainer(containerId); + if (rmContainer == null) { + // Some unknown container sneaked into the system. Kill it. 
+ rmContext.getDispatcher().getEventHandler() + .handle(new RMNodeCleanContainerEvent(nodeId, containerId)); + return; + } + + rmContainer.handle(new RMContainerEvent(containerId, + RMContainerEventType.LAUNCHED)); + } + + public synchronized void showRequests() { + if (LOG.isDebugEnabled()) { + for (Priority priority : getPriorities()) { + Map requests = getResourceRequests(priority); + if (requests != null) { + LOG.debug("showRequests:" + " application=" + getApplicationId() + + " headRoom=" + getHeadroom() + + " currentConsumption=" + currentConsumption.getMemory()); + for (ResourceRequest request : requests.values()) { + LOG.debug("showRequests:" + " application=" + getApplicationId() + + " request=" + request); + } + } + } + } + } + + public Resource getCurrentConsumption() { + return currentConsumption; + } + + public synchronized List pullNewlyAllocatedContainers() { + List returnContainerList = new ArrayList( + newlyAllocatedContainers.size()); + for (RMContainer rmContainer : newlyAllocatedContainers) { + rmContainer.handle(new RMContainerEvent(rmContainer.getContainerId(), + RMContainerEventType.ACQUIRED)); + returnContainerList.add(rmContainer.getContainer()); + } + newlyAllocatedContainers.clear(); + return returnContainerList; + } + + public synchronized void updateBlacklist( + List blacklistAdditions, List blacklistRemovals) { + if (!isStopped) { + this.appSchedulingInfo.updateBlacklist( + blacklistAdditions, blacklistRemovals); + } + } + + public boolean isBlacklisted(String resourceName) { + return this.appSchedulingInfo.isBlacklisted(resourceName); + } + + public synchronized void addSchedulingOpportunity(Priority priority) { + schedulingOpportunities.setCount(priority, + schedulingOpportunities.count(priority) + 1); + } + + public synchronized void subtractSchedulingOpportunity(Priority priority) { + int count = schedulingOpportunities.count(priority) - 1; + this.schedulingOpportunities.setCount(priority, Math.max(count, 0)); + } + + /** + * Return the number of times the application has been given an opportunity + * to schedule a task at the given priority since the last time it + * successfully did so. + */ + public synchronized int getSchedulingOpportunities(Priority priority) { + return schedulingOpportunities.count(priority); + } + + /** + * Should be called when an application has successfully scheduled a container, + * or when the scheduling locality threshold is relaxed. + * Reset various internal counters which affect delay scheduling + * + * @param priority The priority of the container scheduled. 
+ */ + public synchronized void resetSchedulingOpportunities(Priority priority) { + resetSchedulingOpportunities(priority, System.currentTimeMillis()); + } + // used for continuous scheduling + public synchronized void resetSchedulingOpportunities(Priority priority, + long currentTimeMs) { + lastScheduledContainer.put(priority, currentTimeMs); + schedulingOpportunities.setCount(priority, 0); + } + + public synchronized ApplicationResourceUsageReport getResourceUsageReport() { + return ApplicationResourceUsageReport.newInstance(liveContainers.size(), + reservedContainers.size(), Resources.clone(currentConsumption), + Resources.clone(currentReservation), + Resources.add(currentConsumption, currentReservation)); + } + +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueue.java index c317df51a66..f5090ba699e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueue.java @@ -27,6 +27,7 @@ import org.apache.hadoop.security.AccessControlException; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; +import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.QueueACL; @@ -35,7 +36,6 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ActiveUsersManager; -import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplication; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode; @@ -155,21 +155,32 @@ public interface CSQueue /** * Submit a new application to the queue. - * @param application application being submitted + * @param applicationId the applicationId of the application being submitted * @param user user who submitted the application * @param queue queue to which the application is submitted */ - public void submitApplication(FiCaSchedulerApp application, String user, - String queue) - throws AccessControlException; - + public void submitApplication(ApplicationId applicationId, String user, + String queue) throws AccessControlException; + + /** + * Submit an application attempt to the queue. + */ + public void submitApplicationAttempt(FiCaSchedulerApp application, + String userName); + /** * An application submitted to this queue has finished. 
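Taken together, the split CSQueue methods give a leaf queue four calls over a single-attempt application's life instead of two. The expected ordering, sketched with an illustrative driver (QueueLifecycleSketch is hypothetical; the CSQueue signatures are the ones declared in this hunk):

import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSQueue;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;

// Illustrative driver only; the CSQueue signatures are the ones declared in this hunk.
class QueueLifecycleSketch {
  void runSingleAttemptApp(CSQueue leaf, ApplicationId appId, FiCaSchedulerApp attempt,
      String user, String queueName) throws AccessControlException {
    leaf.submitApplication(appId, user, queueName);     // app-level admission (ACLs, limits)
    leaf.submitApplicationAttempt(attempt, user);       // per-attempt scheduling state
    // ... the attempt runs and eventually completes ...
    leaf.finishApplicationAttempt(attempt, queueName);  // tear down attempt state
    leaf.finishApplication(appId, user);                // release app-level bookkeeping
  }
}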
- * @param application - * @param queue application queue + * @param applicationId + * @param user user who submitted the application */ - public void finishApplication(FiCaSchedulerApp application, String queue); - + public void finishApplication(ApplicationId applicationId, String user); + + /** + * An application attempt submitted to this queue has finished. + */ + public void finishApplicationAttempt(FiCaSchedulerApp application, + String queue); + /** * Assign containers to applications in the queue or it's children (if any). * @param clusterResource the resource of the cluster. diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java index 60256398461..5f341089ba8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java @@ -36,6 +36,7 @@ import org.apache.hadoop.security.AccessControlException; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; +import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationResourceUsageReport; import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.api.records.ContainerId; @@ -53,10 +54,13 @@ import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants; import org.apache.hadoop.yarn.server.resourcemanager.RMContext; import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEvent; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEventType; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppRejectedEvent; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptEventType; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; -import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptRejectedEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; @@ -65,14 +69,16 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Allocation; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.PreemptableResourceScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics; -import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerAppReport; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplication; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerAppReport; import 
org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNodeReport; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptRemovedSchedulerEvent; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppRemovedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.ContainerExpiredSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeRemovedSchedulerEvent; @@ -185,7 +191,11 @@ public Configuration getConf() { private Resource maximumAllocation; @VisibleForTesting - protected Map applications = + protected Map applications = + new ConcurrentHashMap(); + + @VisibleForTesting + protected Map appAttempts = new ConcurrentHashMap(); private boolean initialized = false; @@ -415,61 +425,84 @@ static CSQueue parseQueue( synchronized CSQueue getQueue(String queueName) { return queues.get(queueName); } - - private synchronized void - addApplicationAttempt(ApplicationAttemptId applicationAttemptId, - String queueName, String user) { - // Sanity checks + private synchronized void addApplication(ApplicationId applicationId, + String queueName, String user) { + // sanity checks. CSQueue queue = getQueue(queueName); if (queue == null) { - String message = "Application " + applicationAttemptId + + String message = "Application " + applicationId + " submitted by user " + user + " to unknown queue: " + queueName; - this.rmContext.getDispatcher().getEventHandler().handle( - new RMAppAttemptRejectedEvent(applicationAttemptId, message)); + this.rmContext.getDispatcher().getEventHandler() + .handle(new RMAppRejectedEvent(applicationId, message)); return; } if (!(queue instanceof LeafQueue)) { - String message = "Application " + applicationAttemptId + + String message = "Application " + applicationId + " submitted by user " + user + " to non-leaf queue: " + queueName; - this.rmContext.getDispatcher().getEventHandler().handle( - new RMAppAttemptRejectedEvent(applicationAttemptId, message)); + this.rmContext.getDispatcher().getEventHandler() + .handle(new RMAppRejectedEvent(applicationId, message)); return; } - - // TODO: Fix store - FiCaSchedulerApp SchedulerApp = - new FiCaSchedulerApp(applicationAttemptId, user, queue, - queue.getActiveUsersManager(), rmContext); - // Submit to the queue try { - queue.submitApplication(SchedulerApp, user, queueName); + queue.submitApplication(applicationId, user, queueName); } catch (AccessControlException ace) { - LOG.info("Failed to submit application " + applicationAttemptId + - " to queue " + queueName + " from user " + user, ace); - this.rmContext.getDispatcher().getEventHandler().handle( - new RMAppAttemptRejectedEvent(applicationAttemptId, - ace.toString())); + LOG.info("Failed to submit application " + applicationId + " to queue " + + queueName + " from user " + user, ace); + this.rmContext.getDispatcher().getEventHandler() + .handle(new RMAppRejectedEvent(applicationId, ace.toString())); return; } + 
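The CapacityScheduler changes in this region build the same split on the scheduler side: a lightweight per-application record keyed by ApplicationId next to a separate per-attempt map. A rough sketch of that bookkeeping shape (TwoLevelBookkeepingSketch is hypothetical; SchedulerApplication, Queue and the id types are the real classes):

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Queue;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplication;

// Hypothetical container for the two-level bookkeeping; AttemptState stands in for
// FiCaSchedulerApp / SchedulerApplicationAttempt.
class TwoLevelBookkeepingSketch<AttemptState> {
  private final Map<ApplicationId, SchedulerApplication> applications =
      new ConcurrentHashMap<ApplicationId, SchedulerApplication>();
  private final Map<ApplicationAttemptId, AttemptState> appAttempts =
      new ConcurrentHashMap<ApplicationAttemptId, AttemptState>();

  // App level: remember only queue and user; no scheduling state yet.
  void addApplication(ApplicationId appId, Queue queue, String user) {
    applications.put(appId, new SchedulerApplication(queue, user));
  }

  // Attempt level: one entry of real scheduling state per attempt.
  void addApplicationAttempt(ApplicationAttemptId attemptId, AttemptState attempt) {
    appAttempts.put(attemptId, attempt);
  }

  // Later attempts of the same app reuse the stored queue and user.
  String queueNameOf(ApplicationId appId) {
    return applications.get(appId).getQueue().getQueueName();
  }

  void removeAttempt(ApplicationAttemptId attemptId) {
    appAttempts.remove(attemptId);
  }

  void removeApplication(ApplicationId appId) {
    applications.remove(appId);
  }
}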
SchedulerApplication application = + new SchedulerApplication(queue, user); + applications.put(applicationId, application); + LOG.info("Accepted application " + applicationId + " from user: " + user + + ", in queue: " + queueName); + rmContext.getDispatcher().getEventHandler() + .handle(new RMAppEvent(applicationId, RMAppEventType.APP_ACCEPTED)); + } - applications.put(applicationAttemptId, SchedulerApp); - - LOG.info("Application Submission: " + applicationAttemptId + - ", user: " + user + - " queue: " + queue + - ", currently active: " + applications.size()); + private synchronized void addApplicationAttempt( + ApplicationAttemptId applicationAttemptId) { + SchedulerApplication application = + applications.get(applicationAttemptId.getApplicationId()); + CSQueue queue = (CSQueue) application.getQueue(); + FiCaSchedulerApp SchedulerApp = + new FiCaSchedulerApp(applicationAttemptId, application.getUser(), + queue, queue.getActiveUsersManager(), rmContext); + appAttempts.put(applicationAttemptId, SchedulerApp); + queue.submitApplicationAttempt(SchedulerApp, application.getUser()); + LOG.info("Added Application Attempt " + applicationAttemptId + + " to scheduler from user " + application.getUser() + " in queue " + + queue.getQueueName()); rmContext.getDispatcher().getEventHandler().handle( - new RMAppAttemptEvent(applicationAttemptId, - RMAppAttemptEventType.APP_ACCEPTED)); + new RMAppAttemptEvent(applicationAttemptId, + RMAppAttemptEventType.ATTEMPT_ADDED)); + } + + private synchronized void doneApplication(ApplicationId applicationId, + RMAppState finalState) { + SchedulerApplication application = applications.get(applicationId); + if (application == null){ + // The AppRemovedSchedulerEvent maybe sent on recovery for completed apps. + return; + } + CSQueue queue = (CSQueue) application.getQueue(); + if (!(queue instanceof LeafQueue)) { + LOG.error("Cannot finish application " + "from non-leaf queue: " + + queue.getQueueName()); + } else { + queue.finishApplication(applicationId, application.getUser()); + } + applications.remove(applicationId); } private synchronized void doneApplicationAttempt( ApplicationAttemptId applicationAttemptId, RMAppAttemptState rmAppAttemptFinalState) { - LOG.info("Application " + applicationAttemptId + " is done." + + LOG.info("Application Attempt " + applicationAttemptId + " is done." 
+ " finalState=" + rmAppAttemptFinalState); FiCaSchedulerApp application = getApplication(applicationAttemptId); @@ -509,11 +542,11 @@ private synchronized void doneApplicationAttempt( LOG.error("Cannot finish application " + "from non-leaf queue: " + queueName); } else { - queue.finishApplication(application, queue.getQueueName()); + queue.finishApplicationAttempt(application, queue.getQueueName()); } // Remove from our data-structure - applications.remove(applicationAttemptId); + appAttempts.remove(applicationAttemptId); } private static final Allocation EMPTY_ALLOCATION = @@ -740,12 +773,25 @@ public void handle(SchedulerEvent event) { nodeUpdate(nodeUpdatedEvent.getRMNode()); } break; + case APP_ADDED: + { + AppAddedSchedulerEvent appAddedEvent = (AppAddedSchedulerEvent) event; + addApplication(appAddedEvent.getApplicationId(), + appAddedEvent.getQueue(), appAddedEvent.getUser()); + } + break; + case APP_REMOVED: + { + AppRemovedSchedulerEvent appRemovedEvent = (AppRemovedSchedulerEvent)event; + doneApplication(appRemovedEvent.getApplicationID(), + appRemovedEvent.getFinalState()); + } + break; case APP_ATTEMPT_ADDED: { AppAttemptAddedSchedulerEvent appAttemptAddedEvent = (AppAttemptAddedSchedulerEvent) event; - addApplicationAttempt(appAttemptAddedEvent.getApplicationAttemptId(), - appAttemptAddedEvent.getQueue(), appAttemptAddedEvent.getUser()); + addApplicationAttempt(appAttemptAddedEvent.getApplicationAttemptId()); } break; case APP_ATTEMPT_REMOVED: @@ -854,7 +900,7 @@ private synchronized void completedContainer(RMContainer rmContainer, @Lock(Lock.NoLock.class) FiCaSchedulerApp getApplication(ApplicationAttemptId applicationAttemptId) { - return applications.get(applicationAttemptId); + return appAttempts.get(applicationAttemptId); } @Override @@ -912,7 +958,7 @@ public void preemptContainer(ApplicationAttemptId aid, RMContainer cont) { LOG.debug("PREEMPT_CONTAINER: application:" + aid.toString() + " container: " + cont.toString()); } - FiCaSchedulerApp app = applications.get(aid); + FiCaSchedulerApp app = appAttempts.get(aid); if (app != null) { app.addPreemptContainer(cont.getContainerId()); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java index db7db607ba9..a8581a0a8d3 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java @@ -38,6 +38,7 @@ import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.authorize.AccessControlList; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; +import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerStatus; @@ -59,7 +60,6 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics; import 
org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerAppUtils; -import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplication; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode; import org.apache.hadoop.yarn.server.resourcemanager.security.RMContainerTokenSecretManager; @@ -99,7 +99,7 @@ public class LeafQueue implements CSQueue { private volatile int numContainers; Set activeApplications; - Map applicationsMap = + Map applicationAttemptMap = new HashMap(); Set pendingApplications; @@ -635,7 +635,22 @@ public boolean hasAccess(QueueACL acl, UserGroupInformation user) { } @Override - public void submitApplication(FiCaSchedulerApp application, String userName, + public void submitApplicationAttempt(FiCaSchedulerApp application, + String userName) { + // Careful! Locking order is important! + synchronized (this) { + User user = getUser(userName); + // Add the attempt to our data-structures + addApplicationAttempt(application, user); + } + + int attemptId = application.getApplicationAttemptId().getAttemptId(); + metrics.submitApp(userName, attemptId); + getParent().submitApplicationAttempt(application, userName); + } + + @Override + public void submitApplication(ApplicationId applicationId, String userName, String queue) throws AccessControlException { // Careful! Locking order is important! @@ -653,8 +668,7 @@ public void submitApplication(FiCaSchedulerApp application, String userName, // Check if the queue is accepting jobs if (getState() != QueueState.RUNNING) { String msg = "Queue " + getQueuePath() + - " is STOPPED. Cannot accept submission of application: " + - application.getApplicationId(); + " is STOPPED. 
Cannot accept submission of application: " + applicationId; LOG.info(msg); throw new AccessControlException(msg); } @@ -663,8 +677,7 @@ public void submitApplication(FiCaSchedulerApp application, String userName, if (getNumApplications() >= getMaxApplications()) { String msg = "Queue " + getQueuePath() + " already has " + getNumApplications() + " applications," + - " cannot accept submission of application: " + - application.getApplicationId(); + " cannot accept submission of application: " + applicationId; LOG.info(msg); throw new AccessControlException(msg); } @@ -675,26 +688,18 @@ public void submitApplication(FiCaSchedulerApp application, String userName, String msg = "Queue " + getQueuePath() + " already has " + user.getTotalApplications() + " applications from user " + userName + - " cannot accept submission of application: " + - application.getApplicationId(); + " cannot accept submission of application: " + applicationId; LOG.info(msg); throw new AccessControlException(msg); } - - // Add the application to our data-structures - addApplication(application, user); } - int attemptId = application.getApplicationAttemptId().getAttemptId(); - metrics.submitApp(userName, attemptId); - // Inform the parent queue try { - getParent().submitApplication(application, userName, queue); + getParent().submitApplication(applicationId, userName, queue); } catch (AccessControlException ace) { LOG.info("Failed to submit application to parent-queue: " + getParent().getQueuePath(), ace); - removeApplication(application, user); throw ace; } } @@ -722,11 +727,11 @@ private synchronized void activateApplications() { } } - private synchronized void addApplication(FiCaSchedulerApp application, User user) { + private synchronized void addApplicationAttempt(FiCaSchedulerApp application, User user) { // Accept user.submitApplication(); pendingApplications.add(application); - applicationsMap.put(application.getApplicationAttemptId(), application); + applicationAttemptMap.put(application.getApplicationAttemptId(), application); // Activate applications activateApplications(); @@ -742,22 +747,28 @@ private synchronized void addApplication(FiCaSchedulerApp application, User user } @Override - public void finishApplication(FiCaSchedulerApp application, String queue) { - // Careful! Locking order is important! - synchronized (this) { - removeApplication(application, getUser(application.getUser())); - } - + public void finishApplication(ApplicationId application, String user) { + // Inform the activeUsersManager + activeUsersManager.deactivateApplication(user, application); // Inform the parent queue - getParent().finishApplication(application, queue); + getParent().finishApplication(application, user); } - public synchronized void removeApplication(FiCaSchedulerApp application, User user) { + @Override + public void finishApplicationAttempt(FiCaSchedulerApp application, String queue) { + // Careful! Locking order is important! 
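+    // The attempt is removed while holding this queue's lock; the parent
+    // queue is notified only after the lock is released.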
+ synchronized (this) { + removeApplicationAttempt(application, getUser(application.getUser())); + } + getParent().finishApplicationAttempt(application, queue); + } + + public synchronized void removeApplicationAttempt(FiCaSchedulerApp application, User user) { boolean wasActive = activeApplications.remove(application); if (!wasActive) { pendingApplications.remove(application); } - applicationsMap.remove(application.getApplicationAttemptId()); + applicationAttemptMap.remove(application.getApplicationAttemptId()); user.finishApplication(wasActive); if (user.getTotalApplications() == 0) { @@ -766,13 +777,7 @@ public synchronized void removeApplication(FiCaSchedulerApp application, User us // Check if we can activate more applications activateApplications(); - - // Inform the activeUsersManager - synchronized (application) { - activeUsersManager.deactivateApplication( - application.getUser(), application.getApplicationId()); - } - + LOG.info("Application removed -" + " appId: " + application.getApplicationId() + " user: " + application.getUser() + @@ -783,10 +788,10 @@ public synchronized void removeApplication(FiCaSchedulerApp application, User us " #queue-active-applications: " + getNumActiveApplications() ); } - + private synchronized FiCaSchedulerApp getApplication( ApplicationAttemptId applicationAttemptId) { - return applicationsMap.get(applicationAttemptId); + return applicationAttemptMap.get(applicationAttemptId); } private static final CSAssignment NULL_ASSIGNMENT = diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java index b22b24ed4ea..1f094759a4b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java @@ -37,6 +37,7 @@ import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.authorize.AccessControlList; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; +import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.QueueACL; @@ -51,7 +52,6 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ActiveUsersManager; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics; -import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplication; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode; import org.apache.hadoop.yarn.util.resource.ResourceCalculator; @@ -442,7 +442,7 @@ public boolean hasAccess(QueueACL acl, UserGroupInformation user) { } @Override - public void submitApplication(FiCaSchedulerApp application, String user, + public void submitApplication(ApplicationId applicationId, String user, 
String queue) throws AccessControlException { synchronized (this) { @@ -455,57 +455,70 @@ public void submitApplication(FiCaSchedulerApp application, String user, if (state != QueueState.RUNNING) { throw new AccessControlException("Queue " + getQueuePath() + " is STOPPED. Cannot accept submission of application: " + - application.getApplicationId()); + applicationId); } - addApplication(application, user); + addApplication(applicationId, user); } // Inform the parent queue if (parent != null) { try { - parent.submitApplication(application, user, queue); + parent.submitApplication(applicationId, user, queue); } catch (AccessControlException ace) { LOG.info("Failed to submit application to parent-queue: " + parent.getQueuePath(), ace); - removeApplication(application, user); + removeApplication(applicationId, user); throw ace; } } } - private synchronized void addApplication(FiCaSchedulerApp application, + + @Override + public void submitApplicationAttempt(FiCaSchedulerApp application, + String userName) { + // submit attempt logic. + } + + @Override + public void finishApplicationAttempt(FiCaSchedulerApp application, + String queue) { + // finish attempt logic. + } + + private synchronized void addApplication(ApplicationId applicationId, String user) { - + ++numApplications; LOG.info("Application added -" + - " appId: " + application.getApplicationId() + + " appId: " + applicationId + " user: " + user + " leaf-queue of parent: " + getQueueName() + " #applications: " + getNumApplications()); } @Override - public void finishApplication(FiCaSchedulerApp application, String queue) { + public void finishApplication(ApplicationId application, String user) { synchronized (this) { - removeApplication(application, application.getUser()); + removeApplication(application, user); } // Inform the parent queue if (parent != null) { - parent.finishApplication(application, queue); + parent.finishApplication(application, user); } } - public synchronized void removeApplication(FiCaSchedulerApp application, + public synchronized void removeApplication(ApplicationId applicationId, String user) { --numApplications; LOG.info("Application removed -" + - " appId: " + application.getApplicationId() + + " appId: " + applicationId + " user: " + user + " leaf-queue of parent: " + getQueueName() + " #applications: " + getNumApplications()); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java index 7f51126fec8..dcbc5ad7a46 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java @@ -47,7 +47,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Allocation; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Queue; -import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplication; +import 
org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt; import org.apache.hadoop.yarn.util.resource.Resources; import org.apache.hadoop.yarn.util.resource.ResourceCalculator; @@ -57,7 +57,7 @@ */ @Private @Unstable -public class FiCaSchedulerApp extends SchedulerApplication { +public class FiCaSchedulerApp extends SchedulerApplicationAttempt { private static final Log LOG = LogFactory.getLog(FiCaSchedulerApp.class); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerNode.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerNode.java index 23068fefde3..9c5a6062094 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerNode.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerNode.java @@ -36,7 +36,7 @@ import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; -import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplication; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; import org.apache.hadoop.yarn.util.resource.Resources; @@ -206,7 +206,7 @@ public synchronized List getRunningContainers() { } public synchronized void reserveResource( - SchedulerApplication application, Priority priority, + SchedulerApplicationAttempt application, Priority priority, RMContainer reservedContainer) { // Check if it's already reserved if (this.reservedContainer != null) { @@ -241,7 +241,7 @@ public synchronized void reserveResource( } public synchronized void unreserveResource( - SchedulerApplication application) { + SchedulerApplicationAttempt application) { // adding NP checks as this can now be called for preemption if (reservedContainer != null diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/event/RMAppAttemptRejectedEvent.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/event/AppAddedSchedulerEvent.java similarity index 54% rename from hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/event/RMAppAttemptRejectedEvent.java rename to hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/event/AppAddedSchedulerEvent.java index 8f795a2fa9e..d6fb36df78b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/event/RMAppAttemptRejectedEvent.java +++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/event/AppAddedSchedulerEvent.java @@ -16,22 +16,34 @@ * limitations under the License. */ -package org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event; +package org.apache.hadoop.yarn.server.resourcemanager.scheduler.event; -import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; -import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptEvent; -import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptEventType; +import org.apache.hadoop.yarn.api.records.ApplicationId; -public class RMAppAttemptRejectedEvent extends RMAppAttemptEvent { +public class AppAddedSchedulerEvent extends SchedulerEvent { - private final String message; + private final ApplicationId applicationId; + private final String queue; + private final String user; - public RMAppAttemptRejectedEvent(ApplicationAttemptId appAttemptId, String message) { - super(appAttemptId, RMAppAttemptEventType.APP_REJECTED); - this.message = message; + public AppAddedSchedulerEvent( + ApplicationId applicationId, String queue, String user) { + super(SchedulerEventType.APP_ADDED); + this.applicationId = applicationId; + this.queue = queue; + this.user = user; } - public String getMessage() { - return this.message; + public ApplicationId getApplicationId() { + return applicationId; } + + public String getQueue() { + return queue; + } + + public String getUser() { + return user; + } + } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/event/AppAttemptAddedSchedulerEvent.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/event/AppAttemptAddedSchedulerEvent.java index 7b9ffff4b53..d50c1570e06 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/event/AppAttemptAddedSchedulerEvent.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/event/AppAttemptAddedSchedulerEvent.java @@ -23,27 +23,14 @@ public class AppAttemptAddedSchedulerEvent extends SchedulerEvent { private final ApplicationAttemptId applicationAttemptId; - private final String queue; - private final String user; public AppAttemptAddedSchedulerEvent( - ApplicationAttemptId applicationAttemptId, String queue, String user) { + ApplicationAttemptId applicationAttemptId) { super(SchedulerEventType.APP_ATTEMPT_ADDED); this.applicationAttemptId = applicationAttemptId; - this.queue = queue; - this.user = user; } public ApplicationAttemptId getApplicationAttemptId() { return applicationAttemptId; } - - public String getQueue() { - return queue; - } - - public String getUser() { - return user; - } - } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/event/AppRemovedSchedulerEvent.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/event/AppRemovedSchedulerEvent.java new file mode 100644 index 
00000000000..9842bed00b2 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/event/AppRemovedSchedulerEvent.java @@ -0,0 +1,43 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.resourcemanager.scheduler.event; + +import org.apache.hadoop.yarn.api.records.ApplicationId; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState; + +public class AppRemovedSchedulerEvent extends SchedulerEvent { + + private final ApplicationId applicationId; + private final RMAppState finalState; + + public AppRemovedSchedulerEvent(ApplicationId applicationId, + RMAppState finalState) { + super(SchedulerEventType.APP_REMOVED); + this.applicationId = applicationId; + this.finalState = finalState; + } + + public ApplicationId getApplicationID() { + return this.applicationId; + } + + public RMAppState getFinalState() { + return this.finalState; + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/event/SchedulerEventType.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/event/SchedulerEventType.java index dd1aec71bff..243c72ba676 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/event/SchedulerEventType.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/event/SchedulerEventType.java @@ -24,7 +24,11 @@ public enum SchedulerEventType { NODE_ADDED, NODE_REMOVED, NODE_UPDATE, - + + // Source: RMApp + APP_ADDED, + APP_REMOVED, + // Source: RMAppAttempt APP_ATTEMPT_ADDED, APP_ATTEMPT_REMOVED, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSParentQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSParentQueue.java index 90a87416180..7f7d26487ff 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSParentQueue.java +++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSParentQueue.java @@ -33,7 +33,7 @@ import org.apache.hadoop.yarn.api.records.QueueUserACLInfo; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.util.resource.Resources; -import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplication; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt; @Private @Unstable diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSSchedulerApp.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSSchedulerApp.java index 10913b17ea9..0bdac8c3652 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSSchedulerApp.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSSchedulerApp.java @@ -44,7 +44,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ActiveUsersManager; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType; -import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplication; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt; import org.apache.hadoop.yarn.util.resource.Resources; /** @@ -52,7 +52,7 @@ */ @Private @Unstable -public class FSSchedulerApp extends SchedulerApplication { +public class FSSchedulerApp extends SchedulerApplicationAttempt { private static final Log LOG = LogFactory.getLog(FSSchedulerApp.class); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java index bc716c1401f..bdfbcabe312 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java @@ -38,6 +38,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; +import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationResourceUsageReport; import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.api.records.ContainerId; @@ -58,10 +59,13 @@ import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState; import org.apache.hadoop.yarn.server.resourcemanager.resource.ResourceWeights; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp; +import 
org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEvent; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEventType; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppRejectedEvent; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptEventType; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; -import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptRejectedEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState; @@ -75,8 +79,10 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplication; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNodeReport; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptRemovedSchedulerEvent; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppRemovedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.ContainerExpiredSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeRemovedSchedulerEvent; @@ -151,10 +157,15 @@ public class FairScheduler implements ResourceScheduler { // Time we last ran preemptTasksIfNecessary private long lastPreemptCheckTime; - // This stores per-application scheduling information, indexed by + // This stores per-application scheduling information, + @VisibleForTesting + protected Map applications = + new ConcurrentHashMap(); + + // This stores per-application-attempt scheduling information, indexed by // attempt ID's for fast lookup. @VisibleForTesting - protected Map applications = + protected Map appAttempts = new ConcurrentHashMap(); // Nodes in the cluster, indexed by NodeId @@ -253,7 +264,7 @@ public QueueManager getQueueManager() { private RMContainer getRMContainer(ContainerId containerId) { FSSchedulerApp application = - applications.get(containerId.getApplicationAttemptId()); + appAttempts.get(containerId.getApplicationAttemptId()); return (application == null) ? null : application.getRMContainer(containerId); } @@ -591,44 +602,63 @@ public FairSchedulerEventLog getEventLog() { * user. This will accept a new app even if the user or queue is above * configured limits, but the app will not be marked as runnable. 
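+   * The attempt itself is registered separately, via addApplicationAttempt.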
*/ - protected synchronized void addApplicationAttempt( - ApplicationAttemptId applicationAttemptId, String queueName, String user) { + protected synchronized void addApplication(ApplicationId applicationId, + String queueName, String user) { if (queueName == null || queueName.isEmpty()) { - String message = "Reject application " + applicationAttemptId + + String message = "Reject application " + applicationId + " submitted by user " + user + " with an empty queue name."; LOG.info(message); - rmContext.getDispatcher().getEventHandler().handle( - new RMAppAttemptRejectedEvent(applicationAttemptId, message)); + rmContext.getDispatcher().getEventHandler() + .handle(new RMAppRejectedEvent(applicationId, message)); return; } - RMApp rmApp = rmContext.getRMApps().get( - applicationAttemptId.getApplicationId()); + RMApp rmApp = rmContext.getRMApps().get(applicationId); FSLeafQueue queue = assignToQueue(rmApp, queueName, user); if (queue == null) { rmContext.getDispatcher().getEventHandler().handle( - new RMAppAttemptRejectedEvent(applicationAttemptId, + new RMAppRejectedEvent(applicationId, "Application rejected by queue placement policy")); return; } - FSSchedulerApp schedulerApp = - new FSSchedulerApp(applicationAttemptId, user, - queue, new ActiveUsersManager(getRootQueueMetrics()), - rmContext); - // Enforce ACLs UserGroupInformation userUgi = UserGroupInformation.createRemoteUser(user); if (!queue.hasAccess(QueueACL.SUBMIT_APPLICATIONS, userUgi) && !queue.hasAccess(QueueACL.ADMINISTER_QUEUE, userUgi)) { String msg = "User " + userUgi.getUserName() + - " cannot submit applications to queue " + queue.getName(); + " cannot submit applications to queue " + queue.getName(); LOG.info(msg); - rmContext.getDispatcher().getEventHandler().handle( - new RMAppAttemptRejectedEvent(applicationAttemptId, msg)); + rmContext.getDispatcher().getEventHandler() + .handle(new RMAppRejectedEvent(applicationId, msg)); return; } + + SchedulerApplication application = + new SchedulerApplication(queue, user); + applications.put(applicationId, application); + + LOG.info("Accepted application " + applicationId + " from user: " + user + + ", in queue: " + queueName); + rmContext.getDispatcher().getEventHandler() + .handle(new RMAppEvent(applicationId, RMAppEventType.APP_ACCEPTED)); + } + + /** + * Add a new application attempt to the scheduler. 
+ */ + protected synchronized void addApplicationAttempt( + ApplicationAttemptId applicationAttemptId) { + SchedulerApplication application = + applications.get(applicationAttemptId.getApplicationId()); + String user = application.getUser(); + FSLeafQueue queue = (FSLeafQueue) application.getQueue(); + + FSSchedulerApp schedulerApp = + new FSSchedulerApp(applicationAttemptId, user, + queue, new ActiveUsersManager(getRootQueueMetrics()), + rmContext); boolean runnable = maxRunningEnforcer.canAppBeRunnable(queue, user); queue.addApp(schedulerApp, runnable); @@ -639,16 +669,14 @@ queue, new ActiveUsersManager(getRootQueueMetrics()), } queue.getMetrics().submitApp(user, applicationAttemptId.getAttemptId()); + appAttempts.put(applicationAttemptId, schedulerApp); - applications.put(applicationAttemptId, schedulerApp); - - LOG.info("Application Submission: " + applicationAttemptId + - ", user: "+ user + - ", currently active: " + applications.size()); - + LOG.info("Added Application Attempt " + applicationAttemptId + + " to scheduler from user: " + user + ", currently active: " + + appAttempts.size()); rmContext.getDispatcher().getEventHandler().handle( new RMAppAttemptEvent(applicationAttemptId, - RMAppAttemptEventType.APP_ACCEPTED)); + RMAppAttemptEventType.ATTEMPT_ADDED)); } @VisibleForTesting @@ -674,13 +702,18 @@ FSLeafQueue assignToQueue(RMApp rmApp, String queueName, String user) { return queue; } + private synchronized void removeApplication(ApplicationId applicationId, + RMAppState finalState) { + applications.remove(applicationId); + } + private synchronized void removeApplicationAttempt( ApplicationAttemptId applicationAttemptId, RMAppAttemptState rmAppAttemptFinalState) { LOG.info("Application " + applicationAttemptId + " is done." + " finalState=" + rmAppAttemptFinalState); - FSSchedulerApp application = applications.get(applicationAttemptId); + FSSchedulerApp application = appAttempts.get(applicationAttemptId); if (application == null) { LOG.info("Unknown application " + applicationAttemptId + " has completed!"); @@ -720,7 +753,7 @@ private synchronized void removeApplicationAttempt( } // Remove from our data-structure - applications.remove(applicationAttemptId); + appAttempts.remove(applicationAttemptId); } /** @@ -737,7 +770,7 @@ private synchronized void completedContainer(RMContainer rmContainer, // Get the application for the finished container ApplicationAttemptId applicationAttemptId = container.getId().getApplicationAttemptId(); - FSSchedulerApp application = applications.get(applicationAttemptId); + FSSchedulerApp application = appAttempts.get(applicationAttemptId); if (application == null) { LOG.info("Container " + container + " of" + " unknown application " + applicationAttemptId + @@ -811,7 +844,7 @@ public Allocation allocate(ApplicationAttemptId appAttemptId, List ask, List release, List blacklistAdditions, List blacklistRemovals) { // Make sure this application exists - FSSchedulerApp application = applications.get(appAttemptId); + FSSchedulerApp application = appAttempts.get(appAttemptId); if (application == null) { LOG.info("Calling allocate on removed " + "or non existant application " + appAttemptId); @@ -882,7 +915,7 @@ public Allocation allocate(ApplicationAttemptId appAttemptId, private void containerLaunchedOnNode(ContainerId containerId, FSSchedulerNode node) { // Get the application for the finished container ApplicationAttemptId applicationAttemptId = containerId.getApplicationAttemptId(); - FSSchedulerApp application = 
applications.get(applicationAttemptId); + FSSchedulerApp application = appAttempts.get(applicationAttemptId); if (application == null) { LOG.info("Unknown application: " + applicationAttemptId + " launched container " + containerId + @@ -1025,23 +1058,23 @@ public SchedulerNodeReport getNodeReport(NodeId nodeId) { } public FSSchedulerApp getSchedulerApp(ApplicationAttemptId appAttemptId) { - return applications.get(appAttemptId); + return appAttempts.get(appAttemptId); } @Override public SchedulerAppReport getSchedulerAppInfo( ApplicationAttemptId appAttemptId) { - if (!applications.containsKey(appAttemptId)) { + if (!appAttempts.containsKey(appAttemptId)) { LOG.error("Request for appInfo of unknown attempt" + appAttemptId); return null; } - return new SchedulerAppReport(applications.get(appAttemptId)); + return new SchedulerAppReport(appAttempts.get(appAttemptId)); } @Override public ApplicationResourceUsageReport getAppResourceUsageReport( ApplicationAttemptId appAttemptId) { - FSSchedulerApp app = applications.get(appAttemptId); + FSSchedulerApp app = appAttempts.get(appAttemptId); if (app == null) { LOG.error("Request for appInfo of unknown attempt" + appAttemptId); return null; @@ -1090,15 +1123,29 @@ public void handle(SchedulerEvent event) { NodeUpdateSchedulerEvent nodeUpdatedEvent = (NodeUpdateSchedulerEvent)event; nodeUpdate(nodeUpdatedEvent.getRMNode()); break; + case APP_ADDED: + if (!(event instanceof AppAddedSchedulerEvent)) { + throw new RuntimeException("Unexpected event type: " + event); + } + AppAddedSchedulerEvent appAddedEvent = (AppAddedSchedulerEvent) event; + addApplication(appAddedEvent.getApplicationId(), + appAddedEvent.getQueue(), appAddedEvent.getUser()); + break; + case APP_REMOVED: + if (!(event instanceof AppRemovedSchedulerEvent)) { + throw new RuntimeException("Unexpected event type: " + event); + } + AppRemovedSchedulerEvent appRemovedEvent = (AppRemovedSchedulerEvent)event; + removeApplication(appRemovedEvent.getApplicationID(), + appRemovedEvent.getFinalState()); + break; case APP_ATTEMPT_ADDED: if (!(event instanceof AppAttemptAddedSchedulerEvent)) { throw new RuntimeException("Unexpected event type: " + event); } AppAttemptAddedSchedulerEvent appAttemptAddedEvent = (AppAttemptAddedSchedulerEvent) event; - String queue = appAttemptAddedEvent.getQueue(); - addApplicationAttempt(appAttemptAddedEvent.getApplicationAttemptId(), - queue, appAttemptAddedEvent.getUser()); + addApplicationAttempt(appAttemptAddedEvent.getApplicationAttemptId()); break; case APP_ATTEMPT_REMOVED: if (!(event instanceof AppAttemptRemovedSchedulerEvent)) { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java index c5f0bd67d92..9d429136db9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java @@ -37,6 +37,7 @@ import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.authorize.AccessControlList; import 
org.apache.hadoop.yarn.api.records.ApplicationAttemptId; +import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationResourceUsageReport; import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.api.records.ContainerId; @@ -58,6 +59,9 @@ import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants; import org.apache.hadoop.yarn.server.resourcemanager.RMContext; import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEvent; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEventType; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptEventType; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; @@ -74,12 +78,15 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerAppReport; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerAppUtils; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplication; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNodeReport; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptRemovedSchedulerEvent; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppRemovedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.ContainerExpiredSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeRemovedSchedulerEvent; @@ -116,11 +123,15 @@ public class FifoScheduler implements ResourceScheduler, Configurable { private Resource maximumAllocation; private boolean usePortForNodeName; + @VisibleForTesting + protected Map applications = + new ConcurrentSkipListMap(); + // Use ConcurrentSkipListMap because applications need to be ordered @VisibleForTesting - protected Map applications + protected Map appAttempts = new ConcurrentSkipListMap(); - + private ActiveUsersManager activeUsersManager; private static final String DEFAULT_QUEUE_NAME = "default"; @@ -327,7 +338,7 @@ public Allocation allocate( @VisibleForTesting FiCaSchedulerApp getApplication( ApplicationAttemptId applicationAttemptId) { - return applications.get(applicationAttemptId); + return appAttempts.get(applicationAttemptId); } @Override @@ -347,20 +358,44 @@ public ApplicationResourceUsageReport getAppResourceUsageReport( private FiCaSchedulerNode getNode(NodeId nodeId) { return nodes.get(nodeId); } - - private synchronized void addApplicationAttempt(ApplicationAttemptId appAttemptId, - String user) { + + private synchronized void addApplication(ApplicationId applicationId, + 
String queue, String user) { + SchedulerApplication application = + new SchedulerApplication(null, user); + applications.put(applicationId, application); + LOG.info("Accepted application " + applicationId + " from user: " + user); + rmContext.getDispatcher().getEventHandler() + .handle(new RMAppEvent(applicationId, RMAppEventType.APP_ACCEPTED)); + } + + private synchronized void addApplicationAttempt( + ApplicationAttemptId appAttemptId) { + SchedulerApplication application = + applications.get(appAttemptId.getApplicationId()); + String user = application.getUser(); // TODO: Fix store - FiCaSchedulerApp schedulerApp = - new FiCaSchedulerApp(appAttemptId, user, DEFAULT_QUEUE, activeUsersManager, - this.rmContext); - applications.put(appAttemptId, schedulerApp); + FiCaSchedulerApp schedulerApp = + new FiCaSchedulerApp(appAttemptId, user, DEFAULT_QUEUE, + activeUsersManager, this.rmContext); + appAttempts.put(appAttemptId, schedulerApp); metrics.submitApp(user, appAttemptId.getAttemptId()); - LOG.info("Application Submission: " + appAttemptId.getApplicationId() + - " from " + user + ", currently active: " + applications.size()); + LOG.info("Added Application Attempt " + appAttemptId + + " to scheduler from user " + application.getUser() + + ", currently active: " + appAttempts.size()); rmContext.getDispatcher().getEventHandler().handle( new RMAppAttemptEvent(appAttemptId, - RMAppAttemptEventType.APP_ACCEPTED)); + RMAppAttemptEventType.ATTEMPT_ADDED)); + } + + private synchronized void doneApplication(ApplicationId applicationId, + RMAppState finalState) { + SchedulerApplication application = applications.get(applicationId); + + // Inform the activeUsersManager + activeUsersManager.deactivateApplication(application.getUser(), + applicationId); + applications.remove(applicationId); } private synchronized void doneApplicationAttempt( @@ -382,17 +417,11 @@ private synchronized void doneApplicationAttempt( RMContainerEventType.KILL); } - // Inform the activeUsersManager - synchronized (application) { - activeUsersManager.deactivateApplication( - application.getUser(), application.getApplicationId()); - } - // Clean up pending requests, metrics etc. application.stop(rmAppAttemptFinalState); // Remove the application - applications.remove(applicationAttemptId); + appAttempts.remove(applicationAttemptId); } /** @@ -403,10 +432,10 @@ private synchronized void doneApplicationAttempt( private void assignContainers(FiCaSchedulerNode node) { LOG.debug("assignContainers:" + " node=" + node.getRMNode().getNodeAddress() + - " #applications=" + applications.size()); + " #applications=" + appAttempts.size()); // Try to assign containers to applications in fifo order - for (Map.Entry e : applications + for (Map.Entry e : appAttempts .entrySet()) { FiCaSchedulerApp application = e.getValue(); LOG.debug("pre-assignContainers"); @@ -445,7 +474,7 @@ private void assignContainers(FiCaSchedulerNode node) { // Update the applications' headroom to correctly take into // account the containers assigned in this update. 
- for (FiCaSchedulerApp application : applications.values()) { + for (FiCaSchedulerApp application : appAttempts.values()) { application.setHeadroom(Resources.subtract(clusterResource, usedResource)); } } @@ -697,12 +726,25 @@ public void handle(SchedulerEvent event) { nodeUpdate(nodeUpdatedEvent.getRMNode()); } break; + case APP_ADDED: + { + AppAddedSchedulerEvent appAddedEvent = (AppAddedSchedulerEvent) event; + addApplication(appAddedEvent.getApplicationId(), + appAddedEvent.getQueue(), appAddedEvent.getUser()); + } + break; + case APP_REMOVED: + { + AppRemovedSchedulerEvent appRemovedEvent = (AppRemovedSchedulerEvent)event; + doneApplication(appRemovedEvent.getApplicationID(), + appRemovedEvent.getFinalState()); + } + break; case APP_ATTEMPT_ADDED: { AppAttemptAddedSchedulerEvent appAttemptAddedEvent = (AppAttemptAddedSchedulerEvent) event; - addApplicationAttempt(appAttemptAddedEvent.getApplicationAttemptId(), - appAttemptAddedEvent.getUser()); + addApplicationAttempt(appAttemptAddedEvent.getApplicationAttemptId()); } break; case APP_ATTEMPT_REMOVED: @@ -867,8 +909,8 @@ public synchronized boolean checkAccess(UserGroupInformation callerUGI, public synchronized List getAppsInQueue(String queueName) { if (queueName.equals(DEFAULT_QUEUE.getQueueName())) { List apps = new ArrayList( - applications.size()); - for (FiCaSchedulerApp app : applications.values()) { + appAttempts.size()); + for (FiCaSchedulerApp app : appAttempts.values()) { apps.add(app.getApplicationAttemptId()); } return apps; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/Application.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/Application.java index 6767180b62a..1192c30774e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/Application.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/Application.java @@ -57,6 +57,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Allocation; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptAddedSchedulerEvent; import org.apache.hadoop.yarn.util.Records; import org.apache.hadoop.yarn.util.resource.Resources; @@ -164,11 +165,14 @@ public synchronized void submit() throws IOException, YarnException { final ResourceScheduler scheduler = resourceManager.getResourceScheduler(); resourceManager.getClientRMService().submitApplication(request); - + // Notify scheduler - AppAttemptAddedSchedulerEvent appAddedEvent1 = new AppAttemptAddedSchedulerEvent( - this.applicationAttemptId, this.queue, this.user); - scheduler.handle(appAddedEvent1); + AppAddedSchedulerEvent addAppEvent = + new AppAddedSchedulerEvent(this.applicationId, this.queue, "user"); + scheduler.handle(addAppEvent); + AppAttemptAddedSchedulerEvent addAttemptEvent = + new AppAttemptAddedSchedulerEvent(this.applicationAttemptId); + scheduler.handle(addAttemptEvent); } public synchronized void addResourceRequestSpec( diff 
--git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestClientRMService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestClientRMService.java index d425dda2aba..cbb70d57a4a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestClientRMService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestClientRMService.java @@ -649,7 +649,7 @@ private RMAppImpl getRMApp(RMContext rmContext, YarnScheduler yarnScheduler, .currentTimeMillis(), "YARN")); ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(applicationId3, 1); RMAppAttemptImpl rmAppAttemptImpl = new RMAppAttemptImpl(attemptId, - rmContext, yarnScheduler, null, asContext, config, null); + rmContext, yarnScheduler, null, asContext, config); when(app.getCurrentAppAttempt()).thenReturn(rmAppAttemptImpl); return app; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestFifoScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestFifoScheduler.java index 2d370fcfb46..4bf0c449598 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestFifoScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestFifoScheduler.java @@ -44,6 +44,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNodeReport; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeRemovedSchedulerEvent; @@ -297,9 +298,12 @@ public void testBlackListNodes() throws Exception { ApplicationId appId1 = BuilderUtils.newApplicationId(100, 1); ApplicationAttemptId appAttemptId1 = BuilderUtils.newApplicationAttemptId( appId1, 1); - SchedulerEvent event1 = - new AppAttemptAddedSchedulerEvent(appAttemptId1, "queue", "user"); - fs.handle(event1); + SchedulerEvent appEvent = + new AppAddedSchedulerEvent(appId1, "queue", "user"); + fs.handle(appEvent); + SchedulerEvent attemptEvent = + new AppAttemptAddedSchedulerEvent(appAttemptId1); + fs.handle(attemptEvent); List emptyId = new ArrayList(); List emptyAsk = new ArrayList(); @@ -388,16 +392,22 @@ public void testHeadroom() throws Exception { ApplicationId appId1 = BuilderUtils.newApplicationId(100, 1); ApplicationAttemptId appAttemptId1 = BuilderUtils.newApplicationAttemptId( appId1, 1); - SchedulerEvent event1 = - new AppAttemptAddedSchedulerEvent(appAttemptId1, "queue", 
"user"); - fs.handle(event1); + SchedulerEvent appEvent = + new AppAddedSchedulerEvent(appId1, "queue", "user"); + fs.handle(appEvent); + SchedulerEvent attemptEvent = + new AppAttemptAddedSchedulerEvent(appAttemptId1); + fs.handle(attemptEvent); ApplicationId appId2 = BuilderUtils.newApplicationId(200, 2); ApplicationAttemptId appAttemptId2 = BuilderUtils.newApplicationAttemptId( appId2, 1); - SchedulerEvent event2 = - new AppAttemptAddedSchedulerEvent(appAttemptId2, "queue", "user"); - fs.handle(event2); + SchedulerEvent appEvent2 = + new AppAddedSchedulerEvent(appId2, "queue", "user"); + fs.handle(appEvent2); + SchedulerEvent attemptEvent2 = + new AppAttemptAddedSchedulerEvent(appAttemptId2); + fs.handle(attemptEvent2); List emptyId = new ArrayList(); List emptyAsk = new ArrayList(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java index c7ef857cc6b..440bddc510c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java @@ -248,7 +248,7 @@ public void testRMRestart() throws Exception { // verify correct number of attempts and other data RMApp loadedApp1 = rm2.getRMContext().getRMApps().get(app1.getApplicationId()); Assert.assertNotNull(loadedApp1); - //Assert.assertEquals(1, loadedApp1.getAppAttempts().size()); + Assert.assertEquals(1, loadedApp1.getAppAttempts().size()); Assert.assertEquals(app1.getApplicationSubmissionContext() .getApplicationId(), loadedApp1.getApplicationSubmissionContext() .getApplicationId()); @@ -261,7 +261,7 @@ public void testRMRestart() throws Exception { .getApplicationId()); // verify state machine kicked into expected states - rm2.waitForState(loadedApp1.getApplicationId(), RMAppState.RUNNING); + rm2.waitForState(loadedApp1.getApplicationId(), RMAppState.ACCEPTED); rm2.waitForState(loadedApp2.getApplicationId(), RMAppState.ACCEPTED); // verify attempts for apps @@ -299,7 +299,11 @@ public void testRMRestart() throws Exception { nm2.registerNode(); rm2.waitForState(loadedApp1.getApplicationId(), RMAppState.ACCEPTED); - Assert.assertEquals(2, loadedApp1.getAppAttempts().size()); + // wait for the 2nd attempt to be started. 
+ int timeoutSecs = 0; + while (loadedApp1.getAppAttempts().size() != 2 && timeoutSecs++ < 40) {; + Thread.sleep(200); + } // verify no more reboot response sent hbResponse = nm1.nodeHeartbeat(true); @@ -476,10 +480,10 @@ public void testRMRestartWaitForPreviousAMToFinish() throws Exception { Assert.assertEquals(NodeAction.RESYNC, res.getNodeAction()); RMApp rmApp = rm2.getRMContext().getRMApps().get(app1.getApplicationId()); - // application should be in running state - rm2.waitForState(app1.getApplicationId(), RMAppState.RUNNING); + // application should be in ACCEPTED state + rm2.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED); - Assert.assertEquals(RMAppState.RUNNING, rmApp.getState()); + Assert.assertEquals(RMAppState.ACCEPTED, rmApp.getState()); // new attempt should not be started Assert.assertEquals(2, rmApp.getAppAttempts().size()); // am1 attempt should be in FAILED state where as am2 attempt should be in @@ -516,9 +520,9 @@ public void testRMRestartWaitForPreviousAMToFinish() throws Exception { nm1.setResourceTrackerService(rm3.getResourceTrackerService()); rmApp = rm3.getRMContext().getRMApps().get(app1.getApplicationId()); - // application should be in running state - rm3.waitForState(app1.getApplicationId(), RMAppState.RUNNING); - Assert.assertEquals(rmApp.getState(), RMAppState.RUNNING); + // application should be in ACCEPTED state + rm3.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED); + Assert.assertEquals(rmApp.getState(), RMAppState.ACCEPTED); // new attempt should not be started Assert.assertEquals(3, rmApp.getAppAttempts().size()); // am1 and am2 attempts should be in FAILED state where as am3 should be @@ -562,6 +566,11 @@ public void testRMRestartWaitForPreviousAMToFinish() throws Exception { rmApp = rm4.getRMContext().getRMApps().get(app1.getApplicationId()); rm4.waitForState(rmApp.getApplicationId(), RMAppState.ACCEPTED); + // wait for the attempt to be created. 
+ int timeoutSecs = 0; + while (rmApp.getAppAttempts().size() != 2 && timeoutSecs++ < 40) { + Thread.sleep(200); + } Assert.assertEquals(4, rmApp.getAppAttempts().size()); Assert.assertEquals(RMAppState.ACCEPTED, rmApp.getState()); rm4.waitForState(latestAppAttemptId, RMAppAttemptState.SCHEDULED); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/TestRMAppTransitions.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/TestRMAppTransitions.java index ba255d339ef..e6e19eaec3b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/TestRMAppTransitions.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/TestRMAppTransitions.java @@ -567,7 +567,9 @@ public void testAppSubmittedKill() throws IOException, InterruptedException { RMAppEventType.KILL); application.handle(event); rmDispatcher.await(); - assertAppAndAttemptKilled(application); + sendAppUpdateSavedEvent(application); + assertKilled(application); + assertAppFinalStateSaved(application); } @Test @@ -582,7 +584,7 @@ public void testAppAcceptedFailed() throws IOException { new RMAppFailedAttemptEvent(application.getApplicationId(), RMAppEventType.ATTEMPT_FAILED, ""); application.handle(event); - assertAppState(RMAppState.SUBMITTED, application); + assertAppState(RMAppState.ACCEPTED, application); event = new RMAppEvent(application.getApplicationId(), RMAppEventType.APP_ACCEPTED); @@ -612,7 +614,9 @@ public void testAppAcceptedKill() throws IOException, InterruptedException { RMAppEventType.KILL); application.handle(event); rmDispatcher.await(); - assertAppAndAttemptKilled(application); + sendAppUpdateSavedEvent(application); + assertKilled(application); + assertAppFinalStateSaved(application); } @Test @@ -654,7 +658,7 @@ public void testAppRunningFailed() throws IOException { RMAppEventType.ATTEMPT_FAILED, ""); application.handle(event); rmDispatcher.await(); - assertAppState(RMAppState.SUBMITTED, application); + assertAppState(RMAppState.ACCEPTED, application); appAttempt = application.getCurrentAppAttempt(); Assert.assertEquals(++expectedAttemptId, appAttempt.getAppAttemptId().getAttemptId()); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/TestRMAppAttemptTransitions.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/TestRMAppAttemptTransitions.java index 0ad2f2a0370..5bea03b6218 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/TestRMAppAttemptTransitions.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/TestRMAppAttemptTransitions.java @@ -79,7 +79,6 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptLaunchFailedEvent; import 
org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptNewSavedEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptRegistrationEvent; -import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptRejectedEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptUnregistrationEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptUpdateSavedEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.ContainerAllocationExpirer; @@ -258,7 +257,7 @@ public void setUp() throws Exception { application = mock(RMApp.class); applicationAttempt = new RMAppAttemptImpl(applicationAttemptId, rmContext, scheduler, - masterService, submissionContext, new Configuration(), user); + masterService, submissionContext, new Configuration()); when(application.getCurrentAppAttempt()).thenReturn(applicationAttempt); when(application.getApplicationId()).thenReturn(applicationId); @@ -408,9 +407,6 @@ private void testAppAttemptScheduledState() { assertEquals(0.0, (double)applicationAttempt.getProgress(), 0.0001); assertEquals(0, applicationAttempt.getRanNodes().size()); assertNull(applicationAttempt.getFinalApplicationStatus()); - - // Check events - verify(application).handle(any(RMAppEvent.class)); } /** @@ -446,7 +442,7 @@ private void testAppAttemptFailedState(Container container, assertEquals(0, applicationAttempt.getRanNodes().size()); // Check events - verify(application, times(2)).handle(any(RMAppFailedAttemptEvent.class)); + verify(application, times(1)).handle(any(RMAppFailedAttemptEvent.class)); verifyTokenCount(applicationAttempt.getAppAttemptId(), 1); verifyAttemptFinalStateSaved(); } @@ -544,7 +540,7 @@ private void scheduleApplicationAttempt() { applicationAttempt.handle( new RMAppAttemptEvent( applicationAttempt.getAppAttemptId(), - RMAppAttemptEventType.APP_ACCEPTED)); + RMAppAttemptEventType.ATTEMPT_ADDED)); if(unmanagedAM){ assertEquals(RMAppAttemptState.LAUNCHED_UNMANAGED_SAVING, @@ -703,16 +699,6 @@ public void testNewToRecovered() { RMAppAttemptEventType.RECOVER)); testAppAttemptRecoveredState(); } - - @Test - public void testSubmittedToFailed() { - submitApplicationAttempt(); - String message = "Rejected"; - applicationAttempt.handle( - new RMAppAttemptRejectedEvent( - applicationAttempt.getAppAttemptId(), message)); - testAppAttemptSubmittedToFailedState(message); - } @Test public void testSubmittedToKilled() { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestSchedulerUtils.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestSchedulerUtils.java index 9969db5a5e3..8fcbf54b6cd 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestSchedulerUtils.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestSchedulerUtils.java @@ -51,6 +51,7 @@ import org.apache.hadoop.yarn.api.records.ResourceRequest; import org.apache.hadoop.yarn.api.records.impl.pb.ResourceRequestPBImpl; import org.apache.hadoop.yarn.conf.YarnConfiguration; +import 
org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.exceptions.InvalidResourceBlacklistRequestException; import org.apache.hadoop.yarn.exceptions.InvalidResourceRequestException; import org.apache.hadoop.yarn.ipc.YarnRPC; @@ -58,8 +59,12 @@ import org.apache.hadoop.yarn.server.resourcemanager.TestAMAuthorization.MockRMWithAMS; import org.apache.hadoop.yarn.server.resourcemanager.TestAMAuthorization.MyContainerManager; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAddedSchedulerEvent; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppRemovedSchedulerEvent; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent; import org.apache.hadoop.yarn.server.utils.BuilderUtils; import org.apache.hadoop.yarn.util.Records; import org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator; @@ -378,4 +383,24 @@ public void testCreatePreemptedContainerStatus() { ApplicationId.newInstance(System.currentTimeMillis(), 1), 1), 1), "x"); Assert.assertEquals(ContainerExitStatus.PREEMPTED, cd.getExitStatus()); } + + public static SchedulerApplication verifyAppAddedAndRemovedFromScheduler( + final Map applications, + EventHandler handler, String queueName) throws Exception { + ApplicationId appId = + ApplicationId.newInstance(System.currentTimeMillis(), 1); + AppAddedSchedulerEvent appAddedEvent = + new AppAddedSchedulerEvent(appId, queueName, "user"); + handler.handle(appAddedEvent); + SchedulerApplication app = applications.get(appId); + // verify application is added. 
+ Assert.assertNotNull(app); + Assert.assertEquals("user", app.getUser()); + + AppRemovedSchedulerEvent appRemoveEvent = + new AppRemovedSchedulerEvent(appId, RMAppState.FINISHED); + handler.handle(appRemoveEvent); + Assert.assertNull(applications.get(appId)); + return app; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestApplicationLimits.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestApplicationLimits.java index f343bd546c6..2b548ef4607 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestApplicationLimits.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestApplicationLimits.java @@ -304,7 +304,7 @@ public void testActiveApplicationLimits() throws Exception { int APPLICATION_ID = 0; // Submit first application FiCaSchedulerApp app_0 = getMockApplication(APPLICATION_ID++, user_0); - queue.submitApplication(app_0, user_0, A); + queue.submitApplicationAttempt(app_0, user_0); assertEquals(1, queue.getNumActiveApplications()); assertEquals(0, queue.getNumPendingApplications()); assertEquals(1, queue.getNumActiveApplications(user_0)); @@ -312,7 +312,7 @@ public void testActiveApplicationLimits() throws Exception { // Submit second application FiCaSchedulerApp app_1 = getMockApplication(APPLICATION_ID++, user_0); - queue.submitApplication(app_1, user_0, A); + queue.submitApplicationAttempt(app_1, user_0); assertEquals(2, queue.getNumActiveApplications()); assertEquals(0, queue.getNumPendingApplications()); assertEquals(2, queue.getNumActiveApplications(user_0)); @@ -320,14 +320,14 @@ public void testActiveApplicationLimits() throws Exception { // Submit third application, should remain pending FiCaSchedulerApp app_2 = getMockApplication(APPLICATION_ID++, user_0); - queue.submitApplication(app_2, user_0, A); + queue.submitApplicationAttempt(app_2, user_0); assertEquals(2, queue.getNumActiveApplications()); assertEquals(1, queue.getNumPendingApplications()); assertEquals(2, queue.getNumActiveApplications(user_0)); assertEquals(1, queue.getNumPendingApplications(user_0)); // Finish one application, app_2 should be activated - queue.finishApplication(app_0, A); + queue.finishApplicationAttempt(app_0, A); assertEquals(2, queue.getNumActiveApplications()); assertEquals(0, queue.getNumPendingApplications()); assertEquals(2, queue.getNumActiveApplications(user_0)); @@ -335,7 +335,7 @@ public void testActiveApplicationLimits() throws Exception { // Submit another one for user_0 FiCaSchedulerApp app_3 = getMockApplication(APPLICATION_ID++, user_0); - queue.submitApplication(app_3, user_0, A); + queue.submitApplicationAttempt(app_3, user_0); assertEquals(2, queue.getNumActiveApplications()); assertEquals(1, queue.getNumPendingApplications()); assertEquals(2, queue.getNumActiveApplications(user_0)); @@ -346,7 +346,7 @@ public void testActiveApplicationLimits() throws Exception { // Submit first app for user_1 FiCaSchedulerApp app_4 = getMockApplication(APPLICATION_ID++, user_1); - queue.submitApplication(app_4, user_1, A); + queue.submitApplicationAttempt(app_4, user_1); assertEquals(3, 
queue.getNumActiveApplications()); assertEquals(1, queue.getNumPendingApplications()); assertEquals(2, queue.getNumActiveApplications(user_0)); @@ -356,7 +356,7 @@ public void testActiveApplicationLimits() throws Exception { // Submit second app for user_1, should block due to queue-limit FiCaSchedulerApp app_5 = getMockApplication(APPLICATION_ID++, user_1); - queue.submitApplication(app_5, user_1, A); + queue.submitApplicationAttempt(app_5, user_1); assertEquals(3, queue.getNumActiveApplications()); assertEquals(2, queue.getNumPendingApplications()); assertEquals(2, queue.getNumActiveApplications(user_0)); @@ -365,7 +365,7 @@ public void testActiveApplicationLimits() throws Exception { assertEquals(1, queue.getNumPendingApplications(user_1)); // Now finish one app of user_1 so app_5 should be activated - queue.finishApplication(app_4, A); + queue.finishApplicationAttempt(app_4, A); assertEquals(3, queue.getNumActiveApplications()); assertEquals(1, queue.getNumPendingApplications()); assertEquals(2, queue.getNumActiveApplications(user_0)); @@ -385,7 +385,7 @@ public void testActiveLimitsWithKilledApps() throws Exception { // Submit first application FiCaSchedulerApp app_0 = getMockApplication(APPLICATION_ID++, user_0); - queue.submitApplication(app_0, user_0, A); + queue.submitApplicationAttempt(app_0, user_0); assertEquals(1, queue.getNumActiveApplications()); assertEquals(0, queue.getNumPendingApplications()); assertEquals(1, queue.getNumActiveApplications(user_0)); @@ -394,7 +394,7 @@ public void testActiveLimitsWithKilledApps() throws Exception { // Submit second application FiCaSchedulerApp app_1 = getMockApplication(APPLICATION_ID++, user_0); - queue.submitApplication(app_1, user_0, A); + queue.submitApplicationAttempt(app_1, user_0); assertEquals(2, queue.getNumActiveApplications()); assertEquals(0, queue.getNumPendingApplications()); assertEquals(2, queue.getNumActiveApplications(user_0)); @@ -403,7 +403,7 @@ public void testActiveLimitsWithKilledApps() throws Exception { // Submit third application, should remain pending FiCaSchedulerApp app_2 = getMockApplication(APPLICATION_ID++, user_0); - queue.submitApplication(app_2, user_0, A); + queue.submitApplicationAttempt(app_2, user_0); assertEquals(2, queue.getNumActiveApplications()); assertEquals(1, queue.getNumPendingApplications()); assertEquals(2, queue.getNumActiveApplications(user_0)); @@ -412,7 +412,7 @@ public void testActiveLimitsWithKilledApps() throws Exception { // Submit fourth application, should remain pending FiCaSchedulerApp app_3 = getMockApplication(APPLICATION_ID++, user_0); - queue.submitApplication(app_3, user_0, A); + queue.submitApplicationAttempt(app_3, user_0); assertEquals(2, queue.getNumActiveApplications()); assertEquals(2, queue.getNumPendingApplications()); assertEquals(2, queue.getNumActiveApplications(user_0)); @@ -420,7 +420,7 @@ public void testActiveLimitsWithKilledApps() throws Exception { assertTrue(queue.pendingApplications.contains(app_3)); // Kill 3rd pending application - queue.finishApplication(app_2, A); + queue.finishApplicationAttempt(app_2, A); assertEquals(2, queue.getNumActiveApplications()); assertEquals(1, queue.getNumPendingApplications()); assertEquals(2, queue.getNumActiveApplications(user_0)); @@ -429,7 +429,7 @@ public void testActiveLimitsWithKilledApps() throws Exception { assertFalse(queue.activeApplications.contains(app_2)); // Finish 1st application, app_3 should become active - queue.finishApplication(app_0, A); + queue.finishApplicationAttempt(app_0, A); assertEquals(2, 
queue.getNumActiveApplications()); assertEquals(0, queue.getNumPendingApplications()); assertEquals(2, queue.getNumActiveApplications(user_0)); @@ -439,7 +439,7 @@ public void testActiveLimitsWithKilledApps() throws Exception { assertFalse(queue.activeApplications.contains(app_0)); // Finish 2nd application - queue.finishApplication(app_1, A); + queue.finishApplicationAttempt(app_1, A); assertEquals(1, queue.getNumActiveApplications()); assertEquals(0, queue.getNumPendingApplications()); assertEquals(1, queue.getNumActiveApplications(user_0)); @@ -447,7 +447,7 @@ public void testActiveLimitsWithKilledApps() throws Exception { assertFalse(queue.activeApplications.contains(app_1)); // Finish 4th application - queue.finishApplication(app_3, A); + queue.finishApplicationAttempt(app_3, A); assertEquals(0, queue.getNumActiveApplications()); assertEquals(0, queue.getNumPendingApplications()); assertEquals(0, queue.getNumActiveApplications(user_0)); @@ -507,7 +507,7 @@ public void testHeadroom() throws Exception { FiCaSchedulerApp app_0_0 = spy(new FiCaSchedulerApp(appAttemptId_0_0, user_0, queue, queue.getActiveUsersManager(), rmContext)); - queue.submitApplication(app_0_0, user_0, A); + queue.submitApplicationAttempt(app_0_0, user_0); List app_0_0_requests = new ArrayList(); app_0_0_requests.add( @@ -526,7 +526,7 @@ public void testHeadroom() throws Exception { FiCaSchedulerApp app_0_1 = spy(new FiCaSchedulerApp(appAttemptId_0_1, user_0, queue, queue.getActiveUsersManager(), rmContext)); - queue.submitApplication(app_0_1, user_0, A); + queue.submitApplicationAttempt(app_0_1, user_0); List app_0_1_requests = new ArrayList(); app_0_1_requests.add( @@ -545,7 +545,7 @@ public void testHeadroom() throws Exception { FiCaSchedulerApp app_1_0 = spy(new FiCaSchedulerApp(appAttemptId_1_0, user_1, queue, queue.getActiveUsersManager(), rmContext)); - queue.submitApplication(app_1_0, user_1, A); + queue.submitApplicationAttempt(app_1_0, user_1); List app_1_0_requests = new ArrayList(); app_1_0_requests.add( diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java index c2f2f88dc46..3d49d86a37d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java @@ -64,7 +64,10 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Queue; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplication; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.TestSchedulerUtils; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAddedSchedulerEvent; import 
org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeRemovedSchedulerEvent; @@ -555,9 +558,12 @@ public void testBlackListNodes() throws Exception { ApplicationId appId = BuilderUtils.newApplicationId(100, 1); ApplicationAttemptId appAttemptId = BuilderUtils.newApplicationAttemptId( appId, 1); - SchedulerEvent event = - new AppAttemptAddedSchedulerEvent(appAttemptId, "default", "user"); - cs.handle(event); + SchedulerEvent addAppEvent = + new AppAddedSchedulerEvent(appId, "default", "user"); + cs.handle(addAppEvent); + SchedulerEvent addAttemptEvent = + new AppAttemptAddedSchedulerEvent(appAttemptId); + cs.handle(addAttemptEvent); // Verify the blacklist can be updated independent of requesting containers cs.allocate(appAttemptId, Collections.emptyList(), @@ -596,10 +602,10 @@ public void testApplicationComparator() public void testConcurrentAccessOnApplications() throws Exception { CapacityScheduler cs = new CapacityScheduler(); verifyConcurrentAccessOnApplications( - cs.applications, FiCaSchedulerApp.class, Queue.class); + cs.appAttempts, FiCaSchedulerApp.class, Queue.class); } - public static + public static void verifyConcurrentAccessOnApplications( final Map applications, Class appClazz, final Class queueClazz) @@ -682,4 +688,21 @@ public void testGetAppsInQueue() throws Exception { Assert.assertNull(scheduler.getAppsInQueue("nonexistentqueue")); } -} + @Test + public void testAddAndRemoveAppFromCapacityScheduler() throws Exception { + + AsyncDispatcher rmDispatcher = new AsyncDispatcher(); + CapacityScheduler cs = new CapacityScheduler(); + CapacitySchedulerConfiguration conf = new CapacitySchedulerConfiguration(); + setupQueueConfiguration(conf); + cs.reinitialize(conf, new RMContextImpl(rmDispatcher, null, null, null, + null, null, new RMContainerTokenSecretManager(conf), + new NMTokenSecretManagerInRM(conf), + new ClientToAMTokenSecretManagerInRM())); + + SchedulerApplication app = + TestSchedulerUtils.verifyAppAddedAndRemovedFromScheduler( + cs.applications, cs, "a1"); + Assert.assertEquals("a1", app.getQueue().getQueueName()); + } + } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java index 73eb697e0ec..5e272debf7c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java @@ -271,14 +271,14 @@ public void testSingleQueueOneUserMetrics() throws Exception { FiCaSchedulerApp app_0 = new FiCaSchedulerApp(appAttemptId_0, user_0, a, mock(ActiveUsersManager.class), rmContext); - a.submitApplication(app_0, user_0, B); + a.submitApplicationAttempt(app_0, user_0); final ApplicationAttemptId appAttemptId_1 = TestUtils.getMockApplicationAttemptId(1, 0); FiCaSchedulerApp app_1 = new FiCaSchedulerApp(appAttemptId_1, user_0, a, 
mock(ActiveUsersManager.class), rmContext); - a.submitApplication(app_1, user_0, B); // same user + a.submitApplicationAttempt(app_1, user_0); // same user // Setup some nodes @@ -320,14 +320,14 @@ public void testUserQueueAcl() throws Exception { .getMockApplicationAttemptId(0, 1); FiCaSchedulerApp app_0 = new FiCaSchedulerApp(appAttemptId_0, user_d, d, null, rmContext); - d.submitApplication(app_0, user_d, D); + d.submitApplicationAttempt(app_0, user_d); // Attempt the same application again final ApplicationAttemptId appAttemptId_1 = TestUtils .getMockApplicationAttemptId(0, 2); FiCaSchedulerApp app_1 = new FiCaSchedulerApp(appAttemptId_1, user_d, d, null, rmContext); - d.submitApplication(app_1, user_d, D); // same user + d.submitApplicationAttempt(app_1, user_d); // same user } @@ -345,7 +345,7 @@ public void testAppAttemptMetrics() throws Exception { .getMockApplicationAttemptId(0, 1); FiCaSchedulerApp app_0 = new FiCaSchedulerApp(appAttemptId_0, user_0, a, null, rmContext); - a.submitApplication(app_0, user_0, B); + a.submitApplicationAttempt(app_0, user_0); when(cs.getApplication(appAttemptId_0)).thenReturn(app_0); AppAttemptRemovedSchedulerEvent event = new AppAttemptRemovedSchedulerEvent( @@ -360,7 +360,7 @@ public void testAppAttemptMetrics() throws Exception { .getMockApplicationAttemptId(0, 2); FiCaSchedulerApp app_1 = new FiCaSchedulerApp(appAttemptId_1, user_0, a, null, rmContext); - a.submitApplication(app_1, user_0, B); // same user + a.submitApplicationAttempt(app_1, user_0); // same user assertEquals(1, a.getMetrics().getAppsSubmitted()); assertEquals(1, a.getMetrics().getAppsPending()); @@ -396,14 +396,14 @@ public void testSingleQueueWithOneUser() throws Exception { FiCaSchedulerApp app_0 = new FiCaSchedulerApp(appAttemptId_0, user_0, a, mock(ActiveUsersManager.class), rmContext); - a.submitApplication(app_0, user_0, A); + a.submitApplicationAttempt(app_0, user_0); final ApplicationAttemptId appAttemptId_1 = TestUtils.getMockApplicationAttemptId(1, 0); FiCaSchedulerApp app_1 = new FiCaSchedulerApp(appAttemptId_1, user_0, a, mock(ActiveUsersManager.class), rmContext); - a.submitApplication(app_1, user_0, A); // same user + a.submitApplicationAttempt(app_1, user_0); // same user // Setup some nodes @@ -524,21 +524,21 @@ public void testUserLimits() throws Exception { FiCaSchedulerApp app_0 = new FiCaSchedulerApp(appAttemptId_0, user_0, a, a.getActiveUsersManager(), rmContext); - a.submitApplication(app_0, user_0, A); + a.submitApplicationAttempt(app_0, user_0); final ApplicationAttemptId appAttemptId_1 = TestUtils.getMockApplicationAttemptId(1, 0); FiCaSchedulerApp app_1 = new FiCaSchedulerApp(appAttemptId_1, user_0, a, a.getActiveUsersManager(), rmContext); - a.submitApplication(app_1, user_0, A); // same user + a.submitApplicationAttempt(app_1, user_0); // same user final ApplicationAttemptId appAttemptId_2 = TestUtils.getMockApplicationAttemptId(2, 0); FiCaSchedulerApp app_2 = new FiCaSchedulerApp(appAttemptId_2, user_1, a, a.getActiveUsersManager(), rmContext); - a.submitApplication(app_2, user_1, A); + a.submitApplicationAttempt(app_2, user_1); // Setup some nodes String host_0 = "127.0.0.1"; @@ -618,21 +618,21 @@ public void testHeadroomWithMaxCap() throws Exception { FiCaSchedulerApp app_0 = new FiCaSchedulerApp(appAttemptId_0, user_0, a, a.getActiveUsersManager(), rmContext); - a.submitApplication(app_0, user_0, A); + a.submitApplicationAttempt(app_0, user_0); final ApplicationAttemptId appAttemptId_1 = TestUtils.getMockApplicationAttemptId(1, 0); 
FiCaSchedulerApp app_1 = new FiCaSchedulerApp(appAttemptId_1, user_0, a, a.getActiveUsersManager(), rmContext); - a.submitApplication(app_1, user_0, A); // same user + a.submitApplicationAttempt(app_1, user_0); // same user final ApplicationAttemptId appAttemptId_2 = TestUtils.getMockApplicationAttemptId(2, 0); FiCaSchedulerApp app_2 = new FiCaSchedulerApp(appAttemptId_2, user_1, a, a.getActiveUsersManager(), rmContext); - a.submitApplication(app_2, user_1, A); + a.submitApplicationAttempt(app_2, user_1); // Setup some nodes String host_0 = "127.0.0.1"; @@ -729,28 +729,28 @@ public void testSingleQueueWithMultipleUsers() throws Exception { FiCaSchedulerApp app_0 = new FiCaSchedulerApp(appAttemptId_0, user_0, a, a.getActiveUsersManager(), rmContext); - a.submitApplication(app_0, user_0, A); + a.submitApplicationAttempt(app_0, user_0); final ApplicationAttemptId appAttemptId_1 = TestUtils.getMockApplicationAttemptId(1, 0); FiCaSchedulerApp app_1 = new FiCaSchedulerApp(appAttemptId_1, user_0, a, a.getActiveUsersManager(), rmContext); - a.submitApplication(app_1, user_0, A); // same user + a.submitApplicationAttempt(app_1, user_0); // same user final ApplicationAttemptId appAttemptId_2 = TestUtils.getMockApplicationAttemptId(2, 0); FiCaSchedulerApp app_2 = new FiCaSchedulerApp(appAttemptId_2, user_1, a, a.getActiveUsersManager(), rmContext); - a.submitApplication(app_2, user_1, A); + a.submitApplicationAttempt(app_2, user_1); final ApplicationAttemptId appAttemptId_3 = TestUtils.getMockApplicationAttemptId(3, 0); FiCaSchedulerApp app_3 = new FiCaSchedulerApp(appAttemptId_3, user_2, a, a.getActiveUsersManager(), rmContext); - a.submitApplication(app_3, user_2, A); + a.submitApplicationAttempt(app_3, user_2); // Setup some nodes String host_0 = "127.0.0.1"; @@ -905,14 +905,14 @@ public void testReservation() throws Exception { FiCaSchedulerApp app_0 = new FiCaSchedulerApp(appAttemptId_0, user_0, a, mock(ActiveUsersManager.class), rmContext); - a.submitApplication(app_0, user_0, A); + a.submitApplicationAttempt(app_0, user_0); final ApplicationAttemptId appAttemptId_1 = TestUtils.getMockApplicationAttemptId(1, 0); FiCaSchedulerApp app_1 = new FiCaSchedulerApp(appAttemptId_1, user_1, a, mock(ActiveUsersManager.class), rmContext); - a.submitApplication(app_1, user_1, A); + a.submitApplicationAttempt(app_1, user_1); // Setup some nodes String host_0 = "127.0.0.1"; @@ -1007,14 +1007,14 @@ public void testStolenReservedContainer() throws Exception { FiCaSchedulerApp app_0 = new FiCaSchedulerApp(appAttemptId_0, user_0, a, mock(ActiveUsersManager.class), rmContext); - a.submitApplication(app_0, user_0, A); + a.submitApplicationAttempt(app_0, user_0); final ApplicationAttemptId appAttemptId_1 = TestUtils.getMockApplicationAttemptId(1, 0); FiCaSchedulerApp app_1 = new FiCaSchedulerApp(appAttemptId_1, user_1, a, mock(ActiveUsersManager.class), rmContext); - a.submitApplication(app_1, user_1, A); + a.submitApplicationAttempt(app_1, user_1); // Setup some nodes String host_0 = "127.0.0.1"; @@ -1111,14 +1111,14 @@ public void testReservationExchange() throws Exception { FiCaSchedulerApp app_0 = new FiCaSchedulerApp(appAttemptId_0, user_0, a, mock(ActiveUsersManager.class), rmContext); - a.submitApplication(app_0, user_0, A); + a.submitApplicationAttempt(app_0, user_0); final ApplicationAttemptId appAttemptId_1 = TestUtils.getMockApplicationAttemptId(1, 0); FiCaSchedulerApp app_1 = new FiCaSchedulerApp(appAttemptId_1, user_1, a, mock(ActiveUsersManager.class), rmContext); - a.submitApplication(app_1, user_1, 
A); + a.submitApplicationAttempt(app_1, user_1); // Setup some nodes String host_0 = "127.0.0.1"; @@ -1232,7 +1232,7 @@ public void testLocalityScheduling() throws Exception { FiCaSchedulerApp app_0 = spy(new FiCaSchedulerApp(appAttemptId_0, user_0, a, mock(ActiveUsersManager.class), rmContext)); - a.submitApplication(app_0, user_0, A); + a.submitApplicationAttempt(app_0, user_0); // Setup some nodes and racks String host_0 = "127.0.0.1"; @@ -1373,7 +1373,7 @@ public void testApplicationPriorityScheduling() throws Exception { FiCaSchedulerApp app_0 = spy(new FiCaSchedulerApp(appAttemptId_0, user_0, a, mock(ActiveUsersManager.class), rmContext)); - a.submitApplication(app_0, user_0, A); + a.submitApplicationAttempt(app_0, user_0); // Setup some nodes and racks String host_0 = "127.0.0.1"; @@ -1504,7 +1504,7 @@ public void testSchedulingConstraints() throws Exception { FiCaSchedulerApp app_0 = spy(new FiCaSchedulerApp(appAttemptId_0, user_0, a, mock(ActiveUsersManager.class), rmContext)); - a.submitApplication(app_0, user_0, A); + a.submitApplicationAttempt(app_0, user_0); // Setup some nodes and racks String host_0_0 = "127.0.0.1"; @@ -1607,21 +1607,21 @@ public void testActivateApplicationAfterQueueRefresh() throws Exception { FiCaSchedulerApp app_0 = new FiCaSchedulerApp(appAttemptId_0, user_e, e, mock(ActiveUsersManager.class), rmContext); - e.submitApplication(app_0, user_e, E); + e.submitApplicationAttempt(app_0, user_e); final ApplicationAttemptId appAttemptId_1 = TestUtils.getMockApplicationAttemptId(1, 0); FiCaSchedulerApp app_1 = new FiCaSchedulerApp(appAttemptId_1, user_e, e, mock(ActiveUsersManager.class), rmContext); - e.submitApplication(app_1, user_e, E); // same user + e.submitApplicationAttempt(app_1, user_e); // same user final ApplicationAttemptId appAttemptId_2 = TestUtils.getMockApplicationAttemptId(2, 0); FiCaSchedulerApp app_2 = new FiCaSchedulerApp(appAttemptId_2, user_e, e, mock(ActiveUsersManager.class), rmContext); - e.submitApplication(app_2, user_e, E); // same user + e.submitApplicationAttempt(app_2, user_e); // same user // before reinitialization assertEquals(2, e.activeApplications.size()); @@ -1685,21 +1685,21 @@ public void testActivateApplicationByUpdatingClusterResource() FiCaSchedulerApp app_0 = new FiCaSchedulerApp(appAttemptId_0, user_e, e, mock(ActiveUsersManager.class), rmContext); - e.submitApplication(app_0, user_e, E); + e.submitApplicationAttempt(app_0, user_e); final ApplicationAttemptId appAttemptId_1 = TestUtils.getMockApplicationAttemptId(1, 0); FiCaSchedulerApp app_1 = new FiCaSchedulerApp(appAttemptId_1, user_e, e, mock(ActiveUsersManager.class), rmContext); - e.submitApplication(app_1, user_e, E); // same user + e.submitApplicationAttempt(app_1, user_e); // same user final ApplicationAttemptId appAttemptId_2 = TestUtils.getMockApplicationAttemptId(2, 0); FiCaSchedulerApp app_2 = new FiCaSchedulerApp(appAttemptId_2, user_e, e, mock(ActiveUsersManager.class), rmContext); - e.submitApplication(app_2, user_e, E); // same user + e.submitApplicationAttempt(app_2, user_e); // same user // before updating cluster resource assertEquals(2, e.activeApplications.size()); @@ -1762,14 +1762,14 @@ public void testLocalityConstraints() throws Exception { FiCaSchedulerApp app_0 = spy(new FiCaSchedulerApp(appAttemptId_0, user_0, a, mock(ActiveUsersManager.class), rmContext)); - a.submitApplication(app_0, user_0, A); + a.submitApplicationAttempt(app_0, user_0); final ApplicationAttemptId appAttemptId_1 = TestUtils.getMockApplicationAttemptId(1, 0); 
FiCaSchedulerApp app_1 = spy(new FiCaSchedulerApp(appAttemptId_1, user_0, a, mock(ActiveUsersManager.class), rmContext)); - a.submitApplication(app_1, user_0, A); + a.submitApplicationAttempt(app_1, user_0); // Setup some nodes and racks String host_0_0 = "127.0.0.1"; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java index b601b3883e9..a7ad979ba20 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairScheduler.java @@ -78,7 +78,9 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.TestSchedulerUtils; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.TestCapacityScheduler; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptRemovedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeAddedSchedulerEvent; @@ -255,7 +257,12 @@ private ApplicationAttemptId createSchedulingRequest(int memory, String queueId, private ApplicationAttemptId createSchedulingRequest(int memory, int vcores, String queueId, String userId, int numContainers, int priority) { ApplicationAttemptId id = createAppAttemptId(this.APP_ID++, this.ATTEMPT_ID++); - scheduler.addApplicationAttempt(id, queueId, userId); + scheduler.addApplication(id.getApplicationId(), queueId, userId); + // This conditional is for testAclSubmitApplication where app is rejected + // and no app is added. + if (scheduler.applications.containsKey(id.getApplicationId())) { + scheduler.addApplicationAttempt(id); + } List ask = new ArrayList(); ResourceRequest request = createResourceRequest(memory, vcores, ResourceRequest.ANY, priority, numContainers, true); @@ -583,7 +590,7 @@ public void testSimpleContainerReservation() throws Exception { // Make sure queue 2 is waiting with a reservation assertEquals(0, scheduler.getQueueManager().getQueue("queue2"). getResourceUsage().getMemory()); - assertEquals(1024, scheduler.applications.get(attId).getCurrentReservation().getMemory()); + assertEquals(1024, scheduler.appAttempts.get(attId).getCurrentReservation().getMemory()); // Now another node checks in with capacity RMNode node2 = @@ -599,10 +606,10 @@ public void testSimpleContainerReservation() throws Exception { getResourceUsage().getMemory()); // The old reservation should still be there... - assertEquals(1024, scheduler.applications.get(attId).getCurrentReservation().getMemory()); + assertEquals(1024, scheduler.appAttempts.get(attId).getCurrentReservation().getMemory()); // ... 
but it should disappear when we update the first node. scheduler.handle(updateEvent); - assertEquals(0, scheduler.applications.get(attId).getCurrentReservation().getMemory()); + assertEquals(0, scheduler.appAttempts.get(attId).getCurrentReservation().getMemory()); } @@ -618,9 +625,13 @@ public void testUserAsDefaultQueue() throws Exception { null, null, null, false, false, 0, null, null), null, null, 0, null); appsMap.put(appAttemptId.getApplicationId(), rmApp); - AppAttemptAddedSchedulerEvent appAddedEvent = - new AppAttemptAddedSchedulerEvent(appAttemptId, "default", "user1"); + AppAddedSchedulerEvent appAddedEvent = + new AppAddedSchedulerEvent(appAttemptId.getApplicationId(), "default", + "user1"); scheduler.handle(appAddedEvent); + AppAttemptAddedSchedulerEvent attempAddedEvent = + new AppAttemptAddedSchedulerEvent(appAttemptId); + scheduler.handle(attempAddedEvent); assertEquals(1, scheduler.getQueueManager().getLeafQueue("user1", true) .getRunnableAppSchedulables().size()); assertEquals(0, scheduler.getQueueManager().getLeafQueue("default", true) @@ -639,10 +650,14 @@ public void testNotUserAsDefaultQueue() throws Exception { null, null, null, ApplicationSubmissionContext.newInstance(null, null, null, null, null, false, false, 0, null, null), null, null, 0, null); appsMap.put(appAttemptId.getApplicationId(), rmApp); - - AppAttemptAddedSchedulerEvent appAddedEvent2 = - new AppAttemptAddedSchedulerEvent(appAttemptId, "default", "user2"); - scheduler.handle(appAddedEvent2); + + AppAddedSchedulerEvent appAddedEvent = + new AppAddedSchedulerEvent(appAttemptId.getApplicationId(), "default", + "user2"); + scheduler.handle(appAddedEvent); + AppAttemptAddedSchedulerEvent attempAddedEvent = + new AppAttemptAddedSchedulerEvent(appAttemptId); + scheduler.handle(attempAddedEvent); assertEquals(0, scheduler.getQueueManager().getLeafQueue("user1", true) .getRunnableAppSchedulables().size()); assertEquals(1, scheduler.getQueueManager().getLeafQueue("default", true) @@ -660,8 +675,8 @@ public void testEmptyQueueName() throws Exception { // submit app with empty queue ApplicationAttemptId appAttemptId = createAppAttemptId(1, 1); - AppAttemptAddedSchedulerEvent appAddedEvent = - new AppAttemptAddedSchedulerEvent(appAttemptId, "", "user1"); + AppAddedSchedulerEvent appAddedEvent = + new AppAddedSchedulerEvent(appAttemptId.getApplicationId(), "", "user1"); scheduler.handle(appAddedEvent); // submission rejected @@ -695,7 +710,7 @@ public void testQueuePlacementWithPolicy() throws Exception { scheduler.reinitialize(conf, resourceManager.getRMContext()); ApplicationAttemptId appId; - Map apps = scheduler.applications; + Map apps = scheduler.appAttempts; List rules = new ArrayList(); rules.add(new QueuePlacementRule.Specified().initialize(true, null)); @@ -786,11 +801,14 @@ public void testQueueDemandCalculation() throws Exception { scheduler.reinitialize(conf, resourceManager.getRMContext()); ApplicationAttemptId id11 = createAppAttemptId(1, 1); - scheduler.addApplicationAttempt(id11, "root.queue1", "user1"); + scheduler.addApplication(id11.getApplicationId(), "root.queue1", "user1"); + scheduler.addApplicationAttempt(id11); ApplicationAttemptId id21 = createAppAttemptId(2, 1); - scheduler.addApplicationAttempt(id21, "root.queue2", "user1"); + scheduler.addApplication(id21.getApplicationId(), "root.queue2", "user1"); + scheduler.addApplicationAttempt(id21); ApplicationAttemptId id22 = createAppAttemptId(2, 2); - scheduler.addApplicationAttempt(id22, "root.queue2", "user1"); + 
scheduler.addApplication(id22.getApplicationId(), "root.queue2", "user1"); + scheduler.addApplicationAttempt(id22); int minReqSize = FairSchedulerConfiguration.DEFAULT_RM_SCHEDULER_INCREMENT_ALLOCATION_MB; @@ -831,11 +849,13 @@ public void testQueueDemandCalculation() throws Exception { @Test public void testAppAdditionAndRemoval() throws Exception { scheduler.reinitialize(conf, resourceManager.getRMContext()); - - AppAttemptAddedSchedulerEvent appAddedEvent1 = - new AppAttemptAddedSchedulerEvent(createAppAttemptId(1, 1), "default", - "user1"); - scheduler.handle(appAddedEvent1); + ApplicationAttemptId attemptId =createAppAttemptId(1, 1); + AppAddedSchedulerEvent appAddedEvent = new AppAddedSchedulerEvent(attemptId.getApplicationId(), "default", + "user1"); + scheduler.handle(appAddedEvent); + AppAttemptAddedSchedulerEvent attemptAddedEvent = + new AppAttemptAddedSchedulerEvent(createAppAttemptId(1, 1)); + scheduler.handle(attemptAddedEvent); // Scheduler should have two queues (the default and the one created for user1) assertEquals(2, scheduler.getQueueManager().getLeafQueues().size()); @@ -1118,12 +1138,12 @@ public void testChoiceOfPreemptedContainers() throws Exception { scheduler.handle(nodeUpdate3); } - assertEquals(1, scheduler.applications.get(app1).getLiveContainers().size()); - assertEquals(1, scheduler.applications.get(app2).getLiveContainers().size()); - assertEquals(1, scheduler.applications.get(app3).getLiveContainers().size()); - assertEquals(1, scheduler.applications.get(app4).getLiveContainers().size()); - assertEquals(1, scheduler.applications.get(app5).getLiveContainers().size()); - assertEquals(1, scheduler.applications.get(app6).getLiveContainers().size()); + assertEquals(1, scheduler.appAttempts.get(app1).getLiveContainers().size()); + assertEquals(1, scheduler.appAttempts.get(app2).getLiveContainers().size()); + assertEquals(1, scheduler.appAttempts.get(app3).getLiveContainers().size()); + assertEquals(1, scheduler.appAttempts.get(app4).getLiveContainers().size()); + assertEquals(1, scheduler.appAttempts.get(app5).getLiveContainers().size()); + assertEquals(1, scheduler.appAttempts.get(app6).getLiveContainers().size()); // Now new requests arrive from queues C and D ApplicationAttemptId app7 = @@ -1146,16 +1166,16 @@ public void testChoiceOfPreemptedContainers() throws Exception { // Make sure it is lowest priority container. 
scheduler.preemptResources(scheduler.getQueueManager().getLeafQueues(), Resources.createResource(2 * 1024)); - assertEquals(1, scheduler.applications.get(app1).getLiveContainers().size()); - assertEquals(1, scheduler.applications.get(app2).getLiveContainers().size()); - assertEquals(1, scheduler.applications.get(app4).getLiveContainers().size()); - assertEquals(1, scheduler.applications.get(app5).getLiveContainers().size()); + assertEquals(1, scheduler.appAttempts.get(app1).getLiveContainers().size()); + assertEquals(1, scheduler.appAttempts.get(app2).getLiveContainers().size()); + assertEquals(1, scheduler.appAttempts.get(app4).getLiveContainers().size()); + assertEquals(1, scheduler.appAttempts.get(app5).getLiveContainers().size()); // First verify we are adding containers to preemption list for the application - assertTrue(!Collections.disjoint(scheduler.applications.get(app3).getLiveContainers(), - scheduler.applications.get(app3).getPreemptionContainers())); - assertTrue(!Collections.disjoint(scheduler.applications.get(app6).getLiveContainers(), - scheduler.applications.get(app6).getPreemptionContainers())); + assertTrue(!Collections.disjoint(scheduler.appAttempts.get(app3).getLiveContainers(), + scheduler.appAttempts.get(app3).getPreemptionContainers())); + assertTrue(!Collections.disjoint(scheduler.appAttempts.get(app6).getLiveContainers(), + scheduler.appAttempts.get(app6).getPreemptionContainers())); // Pretend 15 seconds have passed clock.tick(15); @@ -1165,8 +1185,8 @@ public void testChoiceOfPreemptedContainers() throws Exception { Resources.createResource(2 * 1024)); // At this point the containers should have been killed (since we are not simulating AM) - assertEquals(0, scheduler.applications.get(app6).getLiveContainers().size()); - assertEquals(0, scheduler.applications.get(app3).getLiveContainers().size()); + assertEquals(0, scheduler.appAttempts.get(app6).getLiveContainers().size()); + assertEquals(0, scheduler.appAttempts.get(app3).getLiveContainers().size()); // Trigger a kill by insisting we want containers back scheduler.preemptResources(scheduler.getQueueManager().getLeafQueues(), @@ -1180,22 +1200,22 @@ public void testChoiceOfPreemptedContainers() throws Exception { scheduler.preemptResources(scheduler.getQueueManager().getLeafQueues(), Resources.createResource(2 * 1024)); - assertEquals(1, scheduler.applications.get(app1).getLiveContainers().size()); - assertEquals(0, scheduler.applications.get(app2).getLiveContainers().size()); - assertEquals(0, scheduler.applications.get(app3).getLiveContainers().size()); - assertEquals(1, scheduler.applications.get(app4).getLiveContainers().size()); - assertEquals(0, scheduler.applications.get(app5).getLiveContainers().size()); - assertEquals(0, scheduler.applications.get(app6).getLiveContainers().size()); + assertEquals(1, scheduler.appAttempts.get(app1).getLiveContainers().size()); + assertEquals(0, scheduler.appAttempts.get(app2).getLiveContainers().size()); + assertEquals(0, scheduler.appAttempts.get(app3).getLiveContainers().size()); + assertEquals(1, scheduler.appAttempts.get(app4).getLiveContainers().size()); + assertEquals(0, scheduler.appAttempts.get(app5).getLiveContainers().size()); + assertEquals(0, scheduler.appAttempts.get(app6).getLiveContainers().size()); // Now A and B are below fair share, so preemption shouldn't do anything scheduler.preemptResources(scheduler.getQueueManager().getLeafQueues(), Resources.createResource(2 * 1024)); - assertEquals(1, scheduler.applications.get(app1).getLiveContainers().size()); 
- assertEquals(0, scheduler.applications.get(app2).getLiveContainers().size()); - assertEquals(0, scheduler.applications.get(app3).getLiveContainers().size()); - assertEquals(1, scheduler.applications.get(app4).getLiveContainers().size()); - assertEquals(0, scheduler.applications.get(app5).getLiveContainers().size()); - assertEquals(0, scheduler.applications.get(app6).getLiveContainers().size()); + assertEquals(1, scheduler.appAttempts.get(app1).getLiveContainers().size()); + assertEquals(0, scheduler.appAttempts.get(app2).getLiveContainers().size()); + assertEquals(0, scheduler.appAttempts.get(app3).getLiveContainers().size()); + assertEquals(1, scheduler.appAttempts.get(app4).getLiveContainers().size()); + assertEquals(0, scheduler.appAttempts.get(app5).getLiveContainers().size()); + assertEquals(0, scheduler.appAttempts.get(app6).getLiveContainers().size()); } @Test (timeout = 5000) @@ -1354,9 +1374,9 @@ public void testMultipleContainersWaitingForReservation() throws IOException { // One container should get reservation and the other should get nothing assertEquals(1024, - scheduler.applications.get(attId1).getCurrentReservation().getMemory()); + scheduler.appAttempts.get(attId1).getCurrentReservation().getMemory()); assertEquals(0, - scheduler.applications.get(attId2).getCurrentReservation().getMemory()); + scheduler.appAttempts.get(attId2).getCurrentReservation().getMemory()); } @Test (timeout = 5000) @@ -1391,7 +1411,7 @@ public void testUserMaxRunningApps() throws Exception { scheduler.handle(updateEvent); // App 1 should be running - assertEquals(1, scheduler.applications.get(attId1).getLiveContainers().size()); + assertEquals(1, scheduler.appAttempts.get(attId1).getLiveContainers().size()); ApplicationAttemptId attId2 = createSchedulingRequest(1024, "queue1", "user1", 1); @@ -1400,7 +1420,7 @@ public void testUserMaxRunningApps() throws Exception { scheduler.handle(updateEvent); // App 2 should not be running - assertEquals(0, scheduler.applications.get(attId2).getLiveContainers().size()); + assertEquals(0, scheduler.appAttempts.get(attId2).getLiveContainers().size()); // Request another container for app 1 createSchedulingRequestExistingApplication(1024, 1, attId1); @@ -1409,7 +1429,7 @@ public void testUserMaxRunningApps() throws Exception { scheduler.handle(updateEvent); // Request should be fulfilled - assertEquals(2, scheduler.applications.get(attId1).getLiveContainers().size()); + assertEquals(2, scheduler.appAttempts.get(attId1).getLiveContainers().size()); } @Test (timeout = 5000) @@ -1429,10 +1449,10 @@ public void testReservationWhileMultiplePriorities() throws IOException { NodeUpdateSchedulerEvent updateEvent = new NodeUpdateSchedulerEvent(node1); scheduler.handle(updateEvent); - FSSchedulerApp app = scheduler.applications.get(attId); + FSSchedulerApp app = scheduler.appAttempts.get(attId); assertEquals(1, app.getLiveContainers().size()); - ContainerId containerId = scheduler.applications.get(attId) + ContainerId containerId = scheduler.appAttempts.get(attId) .getLiveContainers().iterator().next().getContainerId(); // Cause reservation to be created @@ -1501,9 +1521,9 @@ public void testAclSubmitApplication() throws Exception { ApplicationAttemptId attId2 = createSchedulingRequest(1024, "queue1", "norealuserhasthisname2", 1); - FSSchedulerApp app1 = scheduler.applications.get(attId1); + FSSchedulerApp app1 = scheduler.appAttempts.get(attId1); assertNotNull("The application was not allowed", app1); - FSSchedulerApp app2 = scheduler.applications.get(attId2); + 
FSSchedulerApp app2 = scheduler.appAttempts.get(attId2); assertNull("The application was allowed", app2); } @@ -1526,7 +1546,8 @@ public void testMultipleNodesSingleRackRequest() throws Exception { scheduler.handle(nodeEvent2); ApplicationAttemptId appId = createAppAttemptId(this.APP_ID++, this.ATTEMPT_ID++); - scheduler.addApplicationAttempt(appId, "queue1", "user1"); + scheduler.addApplication(appId.getApplicationId(), "queue1", "user1"); + scheduler.addApplicationAttempt(appId); // 1 request with 2 nodes on the same rack. another request with 1 node on // a different rack @@ -1545,14 +1566,14 @@ public void testMultipleNodesSingleRackRequest() throws Exception { NodeUpdateSchedulerEvent updateEvent1 = new NodeUpdateSchedulerEvent(node1); scheduler.handle(updateEvent1); // should assign node local - assertEquals(1, scheduler.applications.get(appId).getLiveContainers().size()); + assertEquals(1, scheduler.appAttempts.get(appId).getLiveContainers().size()); // node 2 checks in scheduler.update(); NodeUpdateSchedulerEvent updateEvent2 = new NodeUpdateSchedulerEvent(node2); scheduler.handle(updateEvent2); // should assign rack local - assertEquals(2, scheduler.applications.get(appId).getLiveContainers().size()); + assertEquals(2, scheduler.appAttempts.get(appId).getLiveContainers().size()); } @Test (timeout = 5000) @@ -1571,8 +1592,8 @@ public void testFifoWithinQueue() throws Exception { "user1", 2); ApplicationAttemptId attId2 = createSchedulingRequest(1024, "queue1", "user1", 2); - FSSchedulerApp app1 = scheduler.applications.get(attId1); - FSSchedulerApp app2 = scheduler.applications.get(attId2); + FSSchedulerApp app1 = scheduler.appAttempts.get(attId1); + FSSchedulerApp app2 = scheduler.appAttempts.get(attId2); FSLeafQueue queue1 = scheduler.getQueueManager().getLeafQueue("queue1", true); queue1.setPolicy(new FifoPolicy()); @@ -1612,7 +1633,7 @@ public void testMaxAssign() throws Exception { ApplicationAttemptId attId = createSchedulingRequest(1024, "root.default", "user", 8); - FSSchedulerApp app = scheduler.applications.get(attId); + FSSchedulerApp app = scheduler.appAttempts.get(attId); // set maxAssign to 2: only 2 containers should be allocated scheduler.maxAssign = 2; @@ -1674,10 +1695,10 @@ public void testAssignContainer() throws Exception { ApplicationAttemptId attId4 = createSchedulingRequest(1024, fifoQueue, user, 4); - FSSchedulerApp app1 = scheduler.applications.get(attId1); - FSSchedulerApp app2 = scheduler.applications.get(attId2); - FSSchedulerApp app3 = scheduler.applications.get(attId3); - FSSchedulerApp app4 = scheduler.applications.get(attId4); + FSSchedulerApp app1 = scheduler.appAttempts.get(attId1); + FSSchedulerApp app2 = scheduler.appAttempts.get(attId2); + FSSchedulerApp app3 = scheduler.appAttempts.get(attId3); + FSSchedulerApp app4 = scheduler.appAttempts.get(attId4); scheduler.getQueueManager().getLeafQueue(fifoQueue, true) .setPolicy(SchedulingPolicy.parse("fifo")); @@ -1764,7 +1785,7 @@ public void testNotAllowSubmitApplication() throws Exception { ApplicationAttemptId attId = ApplicationAttemptId.newInstance(applicationId, this.ATTEMPT_ID++); - scheduler.addApplicationAttempt(attId, queue, user); + scheduler.addApplication(attId.getApplicationId(), queue, user); numTries = 0; while (application.getFinishTime() == 0 && numTries < MAX_TRIES) { @@ -1792,7 +1813,7 @@ public void testReservationThatDoesntFit() throws IOException { NodeUpdateSchedulerEvent updateEvent = new NodeUpdateSchedulerEvent(node1); scheduler.handle(updateEvent); - FSSchedulerApp app = 
scheduler.applications.get(attId); + FSSchedulerApp app = scheduler.appAttempts.get(attId); assertEquals(0, app.getLiveContainers().size()); assertEquals(0, app.getReservedContainers().size()); @@ -1861,7 +1882,7 @@ public void testStrictLocality() throws IOException { NodeUpdateSchedulerEvent node2UpdateEvent = new NodeUpdateSchedulerEvent(node2); // no matter how many heartbeats, node2 should never get a container - FSSchedulerApp app = scheduler.applications.get(attId1); + FSSchedulerApp app = scheduler.appAttempts.get(attId1); for (int i = 0; i < 10; i++) { scheduler.handle(node2UpdateEvent); assertEquals(0, app.getLiveContainers().size()); @@ -1900,7 +1921,7 @@ public void testCancelStrictLocality() throws IOException { NodeUpdateSchedulerEvent node2UpdateEvent = new NodeUpdateSchedulerEvent(node2); // no matter how many heartbeats, node2 should never get a container - FSSchedulerApp app = scheduler.applications.get(attId1); + FSSchedulerApp app = scheduler.appAttempts.get(attId1); for (int i = 0; i < 10; i++) { scheduler.handle(node2UpdateEvent); assertEquals(0, app.getLiveContainers().size()); @@ -1933,7 +1954,7 @@ public void testReservationsStrictLocality() throws IOException { ApplicationAttemptId attId = createSchedulingRequest(1024, "queue1", "user1", 0); - FSSchedulerApp app = scheduler.applications.get(attId); + FSSchedulerApp app = scheduler.appAttempts.get(attId); ResourceRequest nodeRequest = createResourceRequest(1024, node2.getHostName(), 1, 2, true); ResourceRequest rackRequest = createResourceRequest(1024, "rack1", 1, 2, true); @@ -1973,7 +1994,7 @@ public void testNoMoreCpuOnNode() throws IOException { ApplicationAttemptId attId = createSchedulingRequest(1024, 1, "default", "user1", 2); - FSSchedulerApp app = scheduler.applications.get(attId); + FSSchedulerApp app = scheduler.appAttempts.get(attId); scheduler.update(); NodeUpdateSchedulerEvent updateEvent = new NodeUpdateSchedulerEvent(node1); @@ -1993,10 +2014,10 @@ public void testBasicDRFAssignment() throws Exception { ApplicationAttemptId appAttId1 = createSchedulingRequest(2048, 1, "queue1", "user1", 2); - FSSchedulerApp app1 = scheduler.applications.get(appAttId1); + FSSchedulerApp app1 = scheduler.appAttempts.get(appAttId1); ApplicationAttemptId appAttId2 = createSchedulingRequest(1024, 2, "queue1", "user1", 2); - FSSchedulerApp app2 = scheduler.applications.get(appAttId2); + FSSchedulerApp app2 = scheduler.appAttempts.get(appAttId2); DominantResourceFairnessPolicy drfPolicy = new DominantResourceFairnessPolicy(); drfPolicy.initialize(scheduler.getClusterCapacity()); @@ -2034,13 +2055,13 @@ public void testBasicDRFWithQueues() throws Exception { ApplicationAttemptId appAttId1 = createSchedulingRequest(3072, 1, "queue1", "user1", 2); - FSSchedulerApp app1 = scheduler.applications.get(appAttId1); + FSSchedulerApp app1 = scheduler.appAttempts.get(appAttId1); ApplicationAttemptId appAttId2 = createSchedulingRequest(2048, 2, "queue1", "user1", 2); - FSSchedulerApp app2 = scheduler.applications.get(appAttId2); + FSSchedulerApp app2 = scheduler.appAttempts.get(appAttId2); ApplicationAttemptId appAttId3 = createSchedulingRequest(1024, 2, "queue2", "user1", 2); - FSSchedulerApp app3 = scheduler.applications.get(appAttId3); + FSSchedulerApp app3 = scheduler.appAttempts.get(appAttId3); DominantResourceFairnessPolicy drfPolicy = new DominantResourceFairnessPolicy(); drfPolicy.initialize(scheduler.getClusterCapacity()); @@ -2071,19 +2092,19 @@ public void testDRFHierarchicalQueues() throws Exception { ApplicationAttemptId 
appAttId1 = createSchedulingRequest(3074, 1, "queue1.subqueue1", "user1", 2); Thread.sleep(3); // so that start times will be different - FSSchedulerApp app1 = scheduler.applications.get(appAttId1); + FSSchedulerApp app1 = scheduler.appAttempts.get(appAttId1); ApplicationAttemptId appAttId2 = createSchedulingRequest(1024, 3, "queue1.subqueue1", "user1", 2); Thread.sleep(3); // so that start times will be different - FSSchedulerApp app2 = scheduler.applications.get(appAttId2); + FSSchedulerApp app2 = scheduler.appAttempts.get(appAttId2); ApplicationAttemptId appAttId3 = createSchedulingRequest(2048, 2, "queue1.subqueue2", "user1", 2); Thread.sleep(3); // so that start times will be different - FSSchedulerApp app3 = scheduler.applications.get(appAttId3); + FSSchedulerApp app3 = scheduler.appAttempts.get(appAttId3); ApplicationAttemptId appAttId4 = createSchedulingRequest(1024, 2, "queue2", "user1", 2); Thread.sleep(3); // so that start times will be different - FSSchedulerApp app4 = scheduler.applications.get(appAttId4); + FSSchedulerApp app4 = scheduler.appAttempts.get(appAttId4); DominantResourceFairnessPolicy drfPolicy = new DominantResourceFairnessPolicy(); drfPolicy.initialize(scheduler.getClusterCapacity()); @@ -2163,7 +2184,7 @@ public void testHostPortNodeName() throws Exception { NodeUpdateSchedulerEvent(node2); // no matter how many heartbeats, node2 should never get a container - FSSchedulerApp app = scheduler.applications.get(attId1); + FSSchedulerApp app = scheduler.appAttempts.get(attId1); for (int i = 0; i < 10; i++) { scheduler.handle(node2UpdateEvent); assertEquals(0, app.getLiveContainers().size()); @@ -2178,12 +2199,12 @@ public void testHostPortNodeName() throws Exception { public void testConcurrentAccessOnApplications() throws Exception { FairScheduler fs = new FairScheduler(); TestCapacityScheduler.verifyConcurrentAccessOnApplications( - fs.applications, FSSchedulerApp.class, FSLeafQueue.class); + fs.appAttempts, FSSchedulerApp.class, FSLeafQueue.class); } private void verifyAppRunnable(ApplicationAttemptId attId, boolean runnable) { - FSSchedulerApp app = scheduler.applications.get(attId); + FSSchedulerApp app = scheduler.appAttempts.get(attId); FSLeafQueue queue = app.getQueue(); Collection runnableApps = queue.getRunnableAppSchedulables(); @@ -2356,7 +2377,8 @@ public void testContinuousScheduling() throws Exception { // send application request ApplicationAttemptId appAttemptId = createAppAttemptId(this.APP_ID++, this.ATTEMPT_ID++); - fs.addApplicationAttempt(appAttemptId, "queue11", "user11"); + fs.addApplication(appAttemptId.getApplicationId(), "queue11", "user11"); + fs.addApplicationAttempt(appAttemptId); List ask = new ArrayList(); ResourceRequest request = createResourceRequest(1024, 1, ResourceRequest.ANY, 1, 1, true); @@ -2367,7 +2389,7 @@ public void testContinuousScheduling() throws Exception { // at least one pass Thread.sleep(fs.getConf().getContinuousSchedulingSleepMs() + 500); - FSSchedulerApp app = fs.applications.get(appAttemptId); + FSSchedulerApp app = fs.appAttempts.get(appAttemptId); // Wait until app gets resources. 
while (app.getCurrentConsumption().equals(Resources.none())) { } @@ -2455,7 +2477,7 @@ public void testBlacklistNodes() throws Exception { ApplicationAttemptId appAttemptId = createSchedulingRequest(GB, "root.default", "user", 1); - FSSchedulerApp app = scheduler.applications.get(appAttemptId); + FSSchedulerApp app = scheduler.appAttempts.get(appAttemptId); // Verify the blacklist can be updated independent of requesting containers scheduler.allocate(appAttemptId, Collections.emptyList(), @@ -2465,7 +2487,7 @@ public void testBlacklistNodes() throws Exception { scheduler.allocate(appAttemptId, Collections.emptyList(), Collections.emptyList(), null, Collections.singletonList(host)); - assertFalse(scheduler.applications.get(appAttemptId).isBlacklisted(host)); + assertFalse(scheduler.appAttempts.get(appAttemptId).isBlacklisted(host)); List update = Arrays.asList( createResourceRequest(GB, node.getHostName(), 1, 0, true)); @@ -2527,4 +2549,12 @@ public void testGetAppsInQueue() throws Exception { assertTrue(appAttIds.contains(appAttId1)); assertTrue(appAttIds.contains(appAttId2)); } + + @Test + public void testAddAndRemoveAppFromFairScheduler() throws Exception { + FairScheduler scheduler = + (FairScheduler) resourceManager.getResourceScheduler(); + TestSchedulerUtils.verifyAppAddedAndRemovedFromScheduler( + scheduler.applications, scheduler, "default"); + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/TestFifoScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/TestFifoScheduler.java index 7ce7e42bc67..30578265fe2 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/TestFifoScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/TestFifoScheduler.java @@ -61,13 +61,16 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerAppReport; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.TestSchedulerUtils; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.TestCapacityScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent; +import org.apache.hadoop.yarn.server.resourcemanager.security.ClientToAMTokenSecretManagerInRM; import org.apache.hadoop.yarn.server.resourcemanager.security.NMTokenSecretManagerInRM; import org.apache.hadoop.yarn.server.resourcemanager.security.RMContainerTokenSecretManager; 
import org.apache.hadoop.yarn.server.utils.BuilderUtils; @@ -150,14 +153,21 @@ public void testAppAttemptMetrics() throws Exception { ApplicationAttemptId appAttemptId = BuilderUtils.newApplicationAttemptId( appId, 1); - SchedulerEvent event = - new AppAttemptAddedSchedulerEvent(appAttemptId, "queue", "user"); - schedular.handle(event); + SchedulerEvent appEvent = new AppAddedSchedulerEvent(appId, "queue", "user"); + schedular.handle(appEvent); + SchedulerEvent attemptEvent = + new AppAttemptAddedSchedulerEvent(appAttemptId); + schedular.handle(attemptEvent); appAttemptId = BuilderUtils.newApplicationAttemptId(appId, 2); - event = new AppAttemptAddedSchedulerEvent(appAttemptId, "queue", "user"); - schedular.handle(event); + SchedulerEvent appEvent2 = + new AppAddedSchedulerEvent(appAttemptId.getApplicationId(), "queue", + "user"); + schedular.handle(appEvent2); + SchedulerEvent attemptEvent2 = + new AppAttemptAddedSchedulerEvent(appAttemptId); + schedular.handle(attemptEvent2); int afterAppsSubmitted = metrics.getAppsSubmitted(); Assert.assertEquals(1, afterAppsSubmitted - beforeAppsSubmitted); @@ -188,9 +198,13 @@ public void testNodeLocalAssignment() throws Exception { int _appAttemptId = 1; ApplicationAttemptId appAttemptId = createAppAttemptId(_appId, _appAttemptId); - AppAttemptAddedSchedulerEvent appEvent1 = - new AppAttemptAddedSchedulerEvent(appAttemptId, "queue1", "user1"); - scheduler.handle(appEvent1); + AppAddedSchedulerEvent appEvent = + new AppAddedSchedulerEvent(appAttemptId.getApplicationId(), "queue1", + "user1"); + scheduler.handle(appEvent); + AppAttemptAddedSchedulerEvent attemptEvent = + new AppAttemptAddedSchedulerEvent(appAttemptId); + scheduler.handle(attemptEvent); int memory = 64; int nConts = 3; @@ -274,9 +288,13 @@ public Map getNodes(){ int _appAttemptId = 1; ApplicationAttemptId appAttemptId = createAppAttemptId(_appId, _appAttemptId); - AppAttemptAddedSchedulerEvent appEvent1 = - new AppAttemptAddedSchedulerEvent(appAttemptId, "queue1", "user1"); - scheduler.handle(appEvent1); + AppAddedSchedulerEvent appEvent = + new AppAddedSchedulerEvent(appAttemptId.getApplicationId(), "queue1", + "user1"); + scheduler.handle(appEvent); + AppAttemptAddedSchedulerEvent attemptEvent = + new AppAttemptAddedSchedulerEvent(appAttemptId); + scheduler.handle(attemptEvent); int memory = 1024; int priority = 1; @@ -520,7 +538,7 @@ public void testFifoScheduler() throws Exception { public void testConcurrentAccessOnApplications() throws Exception { FifoScheduler fs = new FifoScheduler(); TestCapacityScheduler.verifyConcurrentAccessOnApplications( - fs.applications, FiCaSchedulerApp.class, Queue.class); + fs.appAttempts, FiCaSchedulerApp.class, Queue.class); } @SuppressWarnings("resource") @@ -541,9 +559,13 @@ public void testBlackListNodes() throws Exception { ApplicationId appId = BuilderUtils.newApplicationId(100, 1); ApplicationAttemptId appAttemptId = BuilderUtils.newApplicationAttemptId( appId, 1); - SchedulerEvent event = - new AppAttemptAddedSchedulerEvent(appAttemptId, "default", "user"); - fs.handle(event); + SchedulerEvent appEvent = + new AppAddedSchedulerEvent(appId, "default", + "user"); + fs.handle(appEvent); + SchedulerEvent attemptEvent = + new AppAttemptAddedSchedulerEvent(appAttemptId); + fs.handle(attemptEvent); // Verify the blacklist can be updated independent of requesting containers fs.allocate(appAttemptId, Collections.emptyList(), @@ -575,6 +597,17 @@ public void testGetAppsInQueue() throws Exception { 
Assert.assertNull(scheduler.getAppsInQueue("someotherqueue")); } + @Test + public void testAddAndRemoveAppFromFiFoScheduler() throws Exception { + Configuration conf = new Configuration(); + conf.setClass(YarnConfiguration.RM_SCHEDULER, FifoScheduler.class, + ResourceScheduler.class); + MockRM rm = new MockRM(conf); + FifoScheduler fs = (FifoScheduler)rm.getResourceScheduler(); + TestSchedulerUtils.verifyAppAddedAndRemovedFromScheduler(fs.applications, + fs, "queue"); + } + private void checkApplicationResourceUsage(int expected, Application application) { Assert.assertEquals(expected, application.getUsedResources().getMemory()); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServicesApps.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServicesApps.java index 6d1d30d84a3..58170efaff2 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServicesApps.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServicesApps.java @@ -46,6 +46,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppFailedAttemptEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fifo.FifoScheduler; import org.apache.hadoop.yarn.server.resourcemanager.security.QueueACLsManager; @@ -1392,6 +1393,8 @@ public void testMultipleAppAttempts() throws JSONException, Exception { MockNM amNodeManager = rm.registerNode("127.0.0.1:1234", 2048); RMApp app1 = rm.submitApp(CONTAINER_MB, "testwordcount", "user1"); amNodeManager.nodeHeartbeat(true); + rm.waitForState(app1.getCurrentAppAttempt().getAppAttemptId(), + RMAppAttemptState.ALLOCATED); int maxAppAttempts = rm.getConfig().getInt( YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS); @@ -1405,6 +1408,8 @@ public void testMultipleAppAttempts() throws JSONException, Exception { rm.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED); amNodeManager.nodeHeartbeat(true); } + rm.waitForState(app1.getCurrentAppAttempt().getAppAttemptId(), + RMAppAttemptState.ALLOCATED); assertEquals("incorrect number of attempts", maxAppAttempts, app1.getAppAttempts().values().size()); testAppAttemptsHelper(app1.getApplicationId().toString(), app1, From d85c017d0488930d806f267141057fc73e68c728 Mon Sep 17 00:00:00 2001 From: Andrew Wang Date: Fri, 3 Jan 2014 02:45:53 +0000 Subject: [PATCH 27/42] HDFS-5651. Remove dfs.namenode.caching.enabled and improve CRM locking. Contributed by Colin Patrick McCabe. 
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1555002 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 3 + .../org/apache/hadoop/hdfs/DFSConfigKeys.java | 5 +- .../CacheReplicationMonitor.java | 150 +++++--------- .../blockmanagement/DatanodeManager.java | 7 + .../hdfs/server/namenode/CacheManager.java | 185 ++++++++---------- .../hdfs/server/namenode/FSNamesystem.java | 15 +- .../src/main/resources/hdfs-default.xml | 12 +- .../apt/CentralizedCacheManagement.apt.vm | 13 +- .../server/datanode/TestFsDatasetCache.java | 2 - .../server/namenode/TestCacheDirectives.java | 51 ----- .../namenode/ha/TestHAStateTransitions.java | 25 +++ 11 files changed, 197 insertions(+), 271 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 4dd7f018b3b..c7fa1945a17 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -243,6 +243,9 @@ Trunk (Unreleased) HDFS-5636. Enforce a max TTL per cache pool. (awang via cmccabe) + HDFS-5651. Remove dfs.namenode.caching.enabled and improve CRM locking. + (cmccabe via wang) + OPTIMIZATIONS HDFS-5349. DNA_CACHE and DNA_UNCACHE should be by blockId only. (cmccabe) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java index dce98d54e69..6acb0168bf2 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java @@ -108,8 +108,9 @@ public class DFSConfigKeys extends CommonConfigurationKeys { public static final long DFS_DATANODE_MAX_LOCKED_MEMORY_DEFAULT = 0; public static final String DFS_DATANODE_FSDATASETCACHE_MAX_THREADS_PER_VOLUME_KEY = "dfs.datanode.fsdatasetcache.max.threads.per.volume"; public static final int DFS_DATANODE_FSDATASETCACHE_MAX_THREADS_PER_VOLUME_DEFAULT = 4; - public static final String DFS_NAMENODE_CACHING_ENABLED_KEY = "dfs.namenode.caching.enabled"; - public static final boolean DFS_NAMENODE_CACHING_ENABLED_DEFAULT = false; + public static final String DFS_NAMENODE_PATH_BASED_CACHE_BLOCK_MAP_ALLOCATION_PERCENT = + "dfs.namenode.path.based.cache.block.map.allocation.percent"; + public static final float DFS_NAMENODE_PATH_BASED_CACHE_BLOCK_MAP_ALLOCATION_PERCENT_DEFAULT = 0.25f; public static final String DFS_NAMENODE_HTTP_PORT_KEY = "dfs.http.port"; public static final int DFS_NAMENODE_HTTP_PORT_DEFAULT = 50070; diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CacheReplicationMonitor.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CacheReplicationMonitor.java index e86f345a499..6e6e44b5008 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CacheReplicationMonitor.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CacheReplicationMonitor.java @@ -87,17 +87,17 @@ public class CacheReplicationMonitor extends Thread implements Closeable { * The CacheReplicationMonitor (CRM) lock. Used to synchronize starting and * waiting for rescan operations. */ - private final ReentrantLock lock = new ReentrantLock(); + private final ReentrantLock lock; /** * Notifies the scan thread that an immediate rescan is needed. 
*/ - private final Condition doRescan = lock.newCondition(); + private final Condition doRescan; /** * Notifies waiting threads that a rescan has finished. */ - private final Condition scanFinished = lock.newCondition(); + private final Condition scanFinished; /** * Whether there are pending CacheManager operations that necessitate a @@ -121,11 +121,6 @@ public class CacheReplicationMonitor extends Thread implements Closeable { */ private boolean shutdown = false; - /** - * The monotonic time at which the current scan started. - */ - private long startTimeMs; - /** * Mark status of the current scan. */ @@ -142,24 +137,27 @@ public class CacheReplicationMonitor extends Thread implements Closeable { private long scannedBlocks; public CacheReplicationMonitor(FSNamesystem namesystem, - CacheManager cacheManager, long intervalMs) { + CacheManager cacheManager, long intervalMs, ReentrantLock lock) { this.namesystem = namesystem; this.blockManager = namesystem.getBlockManager(); this.cacheManager = cacheManager; this.cachedBlocks = cacheManager.getCachedBlocks(); this.intervalMs = intervalMs; + this.lock = lock; + this.doRescan = this.lock.newCondition(); + this.scanFinished = this.lock.newCondition(); } @Override public void run() { - startTimeMs = 0; + long startTimeMs = 0; + Thread.currentThread().setName("CacheReplicationMonitor(" + + System.identityHashCode(this) + ")"); LOG.info("Starting CacheReplicationMonitor with interval " + intervalMs + " milliseconds"); try { long curTimeMs = Time.monotonicNow(); while (true) { - // Not all of the variables accessed here need the CRM lock, but take - // it anyway for simplicity lock.lock(); try { while (true) { @@ -180,12 +178,6 @@ public void run() { doRescan.await(delta, TimeUnit.MILLISECONDS); curTimeMs = Time.monotonicNow(); } - } finally { - lock.unlock(); - } - // Mark scan as started, clear needsRescan - lock.lock(); - try { isScanning = true; needsRescan = false; } finally { @@ -195,7 +187,7 @@ public void run() { mark = !mark; rescan(); curTimeMs = Time.monotonicNow(); - // Retake the CRM lock to update synchronization-related variables + // Update synchronization-related variables. lock.lock(); try { isScanning = false; @@ -208,32 +200,15 @@ public void run() { scannedBlocks + " block(s) in " + (curTimeMs - startTimeMs) + " " + "millisecond(s)."); } + } catch (InterruptedException e) { + LOG.info("Shutting down CacheReplicationMonitor."); + return; } catch (Throwable t) { LOG.fatal("Thread exiting", t); terminate(1, t); } } - /** - * Similar to {@link CacheReplicationMonitor#waitForRescan()}, except it only - * waits if there are pending operations that necessitate a rescan as - * indicated by {@link #setNeedsRescan()}. - *

- * Note that this call may release the FSN lock, so operations before and - * after are not necessarily atomic. - */ - public void waitForRescanIfNeeded() { - lock.lock(); - try { - if (!needsRescan) { - return; - } - } finally { - lock.unlock(); - } - waitForRescan(); - } - /** * Waits for a rescan to complete. This doesn't guarantee consistency with * pending operations, only relative recency, since it will not force a new @@ -242,49 +217,27 @@ public void waitForRescanIfNeeded() { * Note that this call will release the FSN lock, so operations before and * after are not atomic. */ - public void waitForRescan() { - // Drop the FSN lock temporarily and retake it after we finish waiting - // Need to handle both the read lock and the write lock - boolean retakeWriteLock = false; - if (namesystem.hasWriteLock()) { - namesystem.writeUnlock(); - retakeWriteLock = true; - } else if (namesystem.hasReadLock()) { - namesystem.readUnlock(); - } else { - // Expected to have at least one of the locks - Preconditions.checkState(false, - "Need to be holding either the read or write lock"); + public void waitForRescanIfNeeded() { + Preconditions.checkArgument(!namesystem.hasWriteLock(), + "Must not hold the FSN write lock when waiting for a rescan."); + Preconditions.checkArgument(lock.isHeldByCurrentThread(), + "Must hold the CRM lock when waiting for a rescan."); + if (!needsRescan) { + return; } - // try/finally for retaking FSN lock - try { - lock.lock(); - // try/finally for releasing CRM lock + // If no scan is already ongoing, mark the CRM as dirty and kick + if (!isScanning) { + doRescan.signal(); + } + // Wait until the scan finishes and the count advances + final long startCount = scanCount; + while ((!shutdown) && (startCount >= scanCount)) { try { - // If no scan is already ongoing, mark the CRM as dirty and kick - if (!isScanning) { - needsRescan = true; - doRescan.signal(); - } - // Wait until the scan finishes and the count advances - final long startCount = scanCount; - while (startCount >= scanCount) { - try { - scanFinished.await(); - } catch (InterruptedException e) { - LOG.warn("Interrupted while waiting for CacheReplicationMonitor" - + " rescan", e); - break; - } - } - } finally { - lock.unlock(); - } - } finally { - if (retakeWriteLock) { - namesystem.writeLock(); - } else { - namesystem.readLock(); + scanFinished.await(); + } catch (InterruptedException e) { + LOG.warn("Interrupted while waiting for CacheReplicationMonitor" + + " rescan", e); + break; } } } @@ -294,42 +247,43 @@ public void waitForRescan() { * changes that require a rescan. */ public void setNeedsRescan() { - lock.lock(); - try { - this.needsRescan = true; - } finally { - lock.unlock(); - } + Preconditions.checkArgument(lock.isHeldByCurrentThread(), + "Must hold the CRM lock when setting the needsRescan bit."); + this.needsRescan = true; } /** - * Shut down and join the monitor thread. + * Shut down the monitor thread. */ @Override public void close() throws IOException { + Preconditions.checkArgument(namesystem.hasWriteLock()); lock.lock(); try { if (shutdown) return; + // Since we hold both the FSN write lock and the CRM lock here, + // we know that the CRM thread cannot be currently modifying + // the cache manager state while we're closing it. + // Since the CRM thread checks the value of 'shutdown' after waiting + // for a lock, we know that the thread will not modify the cache + // manager state after this point. 
shutdown = true; doRescan.signalAll(); scanFinished.signalAll(); } finally { lock.unlock(); } - try { - if (this.isAlive()) { - this.join(60000); - } - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - } } - private void rescan() { + private void rescan() throws InterruptedException { scannedDirectives = 0; scannedBlocks = 0; namesystem.writeLock(); try { + if (shutdown) { + throw new InterruptedException("CacheReplicationMonitor was " + + "shut down."); + } resetStatistics(); rescanCacheDirectives(); rescanCachedBlockMap(); @@ -609,9 +563,6 @@ private void rescanCachedBlockMap() { private void addNewPendingUncached(int neededUncached, CachedBlock cachedBlock, List cached, List pendingUncached) { - if (!cacheManager.isActive()) { - return; - } // Figure out which replicas can be uncached. LinkedList possibilities = new LinkedList(); @@ -647,9 +598,6 @@ private void addNewPendingUncached(int neededUncached, private void addNewPendingCached(int neededCached, CachedBlock cachedBlock, List cached, List pendingCached) { - if (!cacheManager.isActive()) { - return; - } // To figure out which replicas can be cached, we consult the // blocksMap. We don't want to try to cache a corrupt replica, though. BlockInfo blockInfo = blockManager. diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java index 3bb1f150bcb..41be0f5660c 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java @@ -1443,6 +1443,13 @@ public String toString() { return getClass().getSimpleName() + ": " + host2DatanodeMap; } + public void clearPendingCachingCommands() { + for (DatanodeDescriptor dn : datanodeMap.values()) { + dn.getPendingCached().clear(); + dn.getPendingUncached().clear(); + } + } + public void setShouldSendCachingCommands(boolean shouldSendCachingCommands) { this.shouldSendCachingCommands = shouldSendCachingCommands; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CacheManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CacheManager.java index f24b386df16..b3ff8dfef59 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CacheManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CacheManager.java @@ -17,8 +17,8 @@ */ package org.apache.hadoop.hdfs.server.namenode; -import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CACHING_ENABLED_DEFAULT; -import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CACHING_ENABLED_KEY; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_PATH_BASED_CACHE_BLOCK_MAP_ALLOCATION_PERCENT; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_PATH_BASED_CACHE_BLOCK_MAP_ALLOCATION_PERCENT_DEFAULT; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LIST_CACHE_DIRECTIVES_NUM_RESPONSES; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LIST_CACHE_DIRECTIVES_NUM_RESPONSES_DEFAULT; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LIST_CACHE_POOLS_NUM_RESPONSES; @@ -40,6 +40,7 @@ import java.util.Map.Entry; import 
java.util.SortedMap; import java.util.TreeMap; +import java.util.concurrent.locks.ReentrantLock; import org.apache.commons.io.IOUtils; import org.apache.commons.logging.Log; @@ -84,7 +85,7 @@ /** * The Cache Manager handles caching on DataNodes. * - * This class is instantiated by the FSNamesystem when caching is enabled. + * This class is instantiated by the FSNamesystem. * It maintains the mapping of cached blocks to datanodes via processing * datanode cache reports. Based on these reports and addition and removal of * caching directives, we will schedule caching and uncaching work. @@ -93,6 +94,8 @@ public final class CacheManager { public static final Log LOG = LogFactory.getLog(CacheManager.class); + private static final float MIN_CACHED_BLOCKS_PERCENT = 0.001f; + // TODO: add pending / underCached / schedule cached blocks stats. /** @@ -147,34 +150,16 @@ public final class CacheManager { */ private final long scanIntervalMs; - /** - * Whether caching is enabled. - * - * If caching is disabled, we will not process cache reports or store - * information about what is cached where. We also do not start the - * CacheReplicationMonitor thread. This will save resources, but provide - * less functionality. - * - * Even when caching is disabled, we still store path-based cache - * information. This information is stored in the edit log and fsimage. We - * don't want to lose it just because a configuration setting was turned off. - * However, we will not act on this information if caching is disabled. - */ - private final boolean enabled; - - /** - * Whether the CacheManager is active. - * - * When the CacheManager is active, it tells the DataNodes what to cache - * and uncache. The CacheManager cannot become active if enabled = false. - */ - private boolean active = false; - /** * All cached blocks. */ private final GSet cachedBlocks; + /** + * Lock which protects the CacheReplicationMonitor. + */ + private final ReentrantLock crmLock = new ReentrantLock(); + /** * The CacheReplicationMonitor. */ @@ -194,54 +179,51 @@ public final class CacheManager { scanIntervalMs = conf.getLong( DFS_NAMENODE_PATH_BASED_CACHE_REFRESH_INTERVAL_MS, DFS_NAMENODE_PATH_BASED_CACHE_REFRESH_INTERVAL_MS_DEFAULT); - this.enabled = conf.getBoolean(DFS_NAMENODE_CACHING_ENABLED_KEY, - DFS_NAMENODE_CACHING_ENABLED_DEFAULT); - this.cachedBlocks = !enabled ? null : - new LightWeightGSet( - LightWeightGSet.computeCapacity(0.25, "cachedBlocks")); + float cachedBlocksPercent = conf.getFloat( + DFS_NAMENODE_PATH_BASED_CACHE_BLOCK_MAP_ALLOCATION_PERCENT, + DFS_NAMENODE_PATH_BASED_CACHE_BLOCK_MAP_ALLOCATION_PERCENT_DEFAULT); + if (cachedBlocksPercent < MIN_CACHED_BLOCKS_PERCENT) { + LOG.info("Using minimum value " + MIN_CACHED_BLOCKS_PERCENT + + " for " + DFS_NAMENODE_PATH_BASED_CACHE_BLOCK_MAP_ALLOCATION_PERCENT); + cachedBlocksPercent = MIN_CACHED_BLOCKS_PERCENT; + } + this.cachedBlocks = new LightWeightGSet( + LightWeightGSet.computeCapacity(cachedBlocksPercent, + "cachedBlocks")); + } - /** - * Activate the cache manager. - * - * When the cache manager is active, tell the datanodes where to cache files. - */ - public void activate() { - assert namesystem.hasWriteLock(); - if (enabled && (!active)) { - LOG.info("Activating CacheManager. 
" + - "Starting replication monitor thread..."); - active = true; - monitor = new CacheReplicationMonitor(namesystem, this, - scanIntervalMs); - monitor.start(); + public void startMonitorThread() { + crmLock.lock(); + try { + if (this.monitor == null) { + this.monitor = new CacheReplicationMonitor(namesystem, this, + scanIntervalMs, crmLock); + this.monitor.start(); + } + } finally { + crmLock.unlock(); } } - /** - * Deactivate the cache manager. - * - * When the cache manager is inactive, it does not tell the datanodes where to - * cache files. - */ - public void deactivate() { - assert namesystem.hasWriteLock(); - if (active) { - LOG.info("Deactivating CacheManager. " + - "stopping CacheReplicationMonitor thread..."); - active = false; - IOUtils.closeQuietly(monitor); - monitor = null; - LOG.info("CacheReplicationMonitor thread stopped and deactivated."); + public void stopMonitorThread() { + crmLock.lock(); + try { + if (this.monitor != null) { + CacheReplicationMonitor prevMonitor = this.monitor; + this.monitor = null; + IOUtils.closeQuietly(prevMonitor); + } + } finally { + crmLock.unlock(); } } - /** - * Return true only if the cache manager is active. - * Must be called under the FSN read or write lock. - */ - public boolean isActive() { - return active; + public void clearDirectiveStats() { + assert namesystem.hasWriteLock(); + for (CacheDirective directive : directivesById.values()) { + directive.resetStatistics(); + } } /** @@ -480,9 +462,7 @@ private void addInternal(CacheDirective directive, CachePool pool) { directive.addBytesNeeded(stats.getBytesNeeded()); directive.addFilesNeeded(directive.getFilesNeeded()); - if (monitor != null) { - monitor.setNeedsRescan(); - } + setNeedsRescan(); } /** @@ -514,10 +494,6 @@ public CacheDirectiveInfo addDirective( long expiryTime = validateExpiryTime(info, pool.getMaxRelativeExpiryMs()); // Do quota validation if required if (!flags.contains(CacheFlag.FORCE)) { - // Can't kick and wait if caching is disabled - if (monitor != null) { - monitor.waitForRescan(); - } checkLimit(pool, path, replication); } // All validation passed @@ -622,9 +598,7 @@ public void modifyDirective(CacheDirectiveInfo info, validateExpiryTime(infoWithDefaults, destPool.getMaxRelativeExpiryMs()); // Indicate changes to the CRM - if (monitor != null) { - monitor.setNeedsRescan(); - } + setNeedsRescan(); // Validation passed removeInternal(prevEntry); @@ -659,9 +633,7 @@ private void removeInternal(CacheDirective directive) pool.getDirectiveList().remove(directive); assert directive.getPool() == null; - if (monitor != null) { - monitor.setNeedsRescan(); - } + setNeedsRescan(); } public void removeDirective(long id, FSPermissionChecker pc) @@ -694,9 +666,6 @@ public void removeDirective(long id, FSPermissionChecker pc) if (filter.getReplication() != null) { throw new IOException("Filtering by replication is unsupported."); } - if (monitor != null) { - monitor.waitForRescanIfNeeded(); - } ArrayList replies = new ArrayList(NUM_PRE_ALLOCATED_ENTRIES); int numReplies = 0; @@ -805,9 +774,7 @@ public void modifyCachePool(CachePoolInfo info) bld.append(prefix).append("set limit to " + info.getLimit()); prefix = "; "; // New limit changes stats, need to set needs refresh - if (monitor != null) { - monitor.setNeedsRescan(); - } + setNeedsRescan(); } if (info.getMaxRelativeExpiryMs() != null) { final Long maxRelativeExpiry = info.getMaxRelativeExpiryMs(); @@ -853,9 +820,7 @@ public void removeCachePool(String poolName) directivesById.remove(directive.getId()); iter.remove(); } - 
if (monitor != null) { - monitor.setNeedsRescan(); - } + setNeedsRescan(); } catch (IOException e) { LOG.info("removeCachePool of " + poolName + " failed: ", e); throw e; @@ -866,9 +831,6 @@ public void removeCachePool(String poolName) public BatchedListEntries listCachePools(FSPermissionChecker pc, String prevKey) { assert namesystem.hasReadLock(); - if (monitor != null) { - monitor.waitForRescanIfNeeded(); - } final int NUM_PRE_ALLOCATED_ENTRIES = 16; ArrayList results = new ArrayList(NUM_PRE_ALLOCATED_ENTRIES); @@ -884,9 +846,6 @@ public void removeCachePool(String poolName) } public void setCachedLocations(LocatedBlock block) { - if (!enabled) { - return; - } CachedBlock cachedBlock = new CachedBlock(block.getBlock().getBlockId(), (short)0, false); @@ -902,12 +861,6 @@ public void setCachedLocations(LocatedBlock block) { public final void processCacheReport(final DatanodeID datanodeID, final List blockIds) throws IOException { - if (!enabled) { - LOG.info("Ignoring cache report from " + datanodeID + - " because " + DFS_NAMENODE_CACHING_ENABLED_KEY + " = false. " + - "number of blocks: " + blockIds.size()); - return; - } namesystem.writeLock(); final long startTime = Time.monotonicNow(); final long endTime; @@ -1085,4 +1038,36 @@ private void loadDirectives(DataInput in) throws IOException { } prog.endStep(Phase.LOADING_FSIMAGE, step); } + + public void waitForRescanIfNeeded() { + crmLock.lock(); + try { + if (monitor != null) { + monitor.waitForRescanIfNeeded(); + } + } finally { + crmLock.unlock(); + } + } + + private void setNeedsRescan() { + crmLock.lock(); + try { + if (monitor != null) { + monitor.setNeedsRescan(); + } + } finally { + crmLock.unlock(); + } + } + + @VisibleForTesting + public Thread getCacheReplicationMonitor() { + crmLock.lock(); + try { + return monitor; + } finally { + crmLock.unlock(); + } + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java index 83b9b21ab20..101eecec6f6 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java @@ -929,7 +929,6 @@ void stopCommonServices() { writeLock(); try { if (blockManager != null) blockManager.close(); - cacheManager.deactivate(); } finally { writeUnlock(); } @@ -999,7 +998,7 @@ void startActiveServices() throws IOException { editLogRollerThreshold, editLogRollerInterval)); nnEditLogRoller.start(); - cacheManager.activate(); + cacheManager.startMonitorThread(); blockManager.getDatanodeManager().setShouldSendCachingCommands(true); } finally { writeUnlock(); @@ -1050,7 +1049,9 @@ void stopActiveServices() { // so that the tailer starts from the right spot. 
dir.fsImage.updateLastAppliedTxIdFromWritten(); } - cacheManager.deactivate(); + cacheManager.stopMonitorThread(); + cacheManager.clearDirectiveStats(); + blockManager.getDatanodeManager().clearPendingCachingCommands(); blockManager.getDatanodeManager().setShouldSendCachingCommands(false); } finally { writeUnlock(); @@ -7064,6 +7065,9 @@ long addCacheDirective(CacheDirectiveInfo directive, EnumSet flags) return (Long) cacheEntry.getPayload(); } boolean success = false; + if (!flags.contains(CacheFlag.FORCE)) { + cacheManager.waitForRescanIfNeeded(); + } writeLock(); Long result = null; try { @@ -7105,6 +7109,9 @@ void modifyCacheDirective(CacheDirectiveInfo directive, if (cacheEntry != null && cacheEntry.isSuccess()) { return; } + if (!flags.contains(CacheFlag.FORCE)) { + cacheManager.waitForRescanIfNeeded(); + } writeLock(); try { checkOperation(OperationCategory.WRITE); @@ -7164,6 +7171,7 @@ BatchedListEntries listCacheDirectives( final FSPermissionChecker pc = isPermissionEnabled ? getPermissionChecker() : null; BatchedListEntries results; + cacheManager.waitForRescanIfNeeded(); readLock(); boolean success = false; try { @@ -7287,6 +7295,7 @@ public BatchedListEntries listCachePools(String prevKey) BatchedListEntries results; checkOperation(OperationCategory.READ); boolean success = false; + cacheManager.waitForRescanIfNeeded(); readLock(); try { checkOperation(OperationCategory.READ); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml index b961c32bb72..24f0b03c0b1 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml @@ -1476,13 +1476,13 @@ - dfs.namenode.caching.enabled - false + dfs.namenode.path.based.cache.block.map.allocation.percent + 0.25 - Set to true to enable block caching. This flag enables the NameNode to - maintain a mapping of cached blocks to DataNodes via processing DataNode - cache reports. Based on these reports and addition and removal of caching - directives, the NameNode will schedule caching and uncaching work. + The percentage of the Java heap which we will allocate to the cached blocks + map. The cached blocks map is a hash map which uses chained hashing. + Smaller maps may be accessed more slowly if the number of cached blocks is + large; larger maps will consume more memory. diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/site/apt/CentralizedCacheManagement.apt.vm b/hadoop-hdfs-project/hadoop-hdfs/src/site/apt/CentralizedCacheManagement.apt.vm index d1db4af12bb..30ddf68a52f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/site/apt/CentralizedCacheManagement.apt.vm +++ b/hadoop-hdfs-project/hadoop-hdfs/src/site/apt/CentralizedCacheManagement.apt.vm @@ -242,12 +242,6 @@ Centralized Cache Management in HDFS Be sure to configure the following: - * dfs.namenode.caching.enabled - - This must be set to true to enable caching. If this is false, the NameNode - will ignore cache reports, and will not ask DataNodes to cache - blocks. - * dfs.datanode.max.locked.memory The DataNode will treat this as the maximum amount of memory it can use for @@ -281,6 +275,13 @@ Centralized Cache Management in HDFS By default, this parameter is set to 10000, which is 10 seconds. + * dfs.namenode.path.based.cache.block.map.allocation.percent + + The percentage of the Java heap which we will allocate to the cached blocks + map. 
The cached blocks map is a hash map which uses chained hashing. + Smaller maps may be accessed more slowly if the number of cached blocks is + large; larger maps will consume more memory. The default is 0.25 percent. + ** {OS Limits} If you get the error "Cannot start datanode because the configured max diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestFsDatasetCache.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestFsDatasetCache.java index 85605ddb044..b6aac810db0 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestFsDatasetCache.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestFsDatasetCache.java @@ -109,14 +109,12 @@ public class TestFsDatasetCache { public void setUp() throws Exception { assumeTrue(!Path.WINDOWS); conf = new HdfsConfiguration(); - conf.setBoolean(DFSConfigKeys.DFS_NAMENODE_CACHING_ENABLED_KEY, true); conf.setLong(DFSConfigKeys.DFS_NAMENODE_PATH_BASED_CACHE_RETRY_INTERVAL_MS, 500); conf.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE); conf.setLong(DFSConfigKeys.DFS_DATANODE_MAX_LOCKED_MEMORY_KEY, CACHE_CAPACITY); conf.setLong(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1); - conf.setBoolean(DFSConfigKeys.DFS_NAMENODE_CACHING_ENABLED_KEY, true); prevCacheManipulator = NativeIO.POSIX.getCacheManipulator(); NativeIO.POSIX.setCacheManipulator(new NoMlockCacheManipulator()); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCacheDirectives.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCacheDirectives.java index 4be876e985f..b81fde32ada 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCacheDirectives.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCacheDirectives.java @@ -21,7 +21,6 @@ import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CACHEREPORT_INTERVAL_MSEC_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_MAX_LOCKED_MEMORY_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY; -import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CACHING_ENABLED_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_PATH_BASED_CACHE_REFRESH_INTERVAL_MS; import static org.apache.hadoop.hdfs.protocol.CachePoolInfo.RELATIVE_EXPIRY_NEVER; import static org.apache.hadoop.test.GenericTestUtils.assertExceptionContains; @@ -118,7 +117,6 @@ private static HdfsConfiguration createCachingConf() { conf.setLong(DFS_BLOCK_SIZE_KEY, BLOCK_SIZE); conf.setLong(DFS_DATANODE_MAX_LOCKED_MEMORY_KEY, CACHE_CAPACITY); conf.setLong(DFS_HEARTBEAT_INTERVAL_KEY, 1); - conf.setBoolean(DFS_NAMENODE_CACHING_ENABLED_KEY, true); conf.setLong(DFS_CACHEREPORT_INTERVAL_MSEC_KEY, 1000); conf.setLong(DFS_NAMENODE_PATH_BASED_CACHE_REFRESH_INTERVAL_MS, 1000); // set low limits here for testing purposes @@ -867,55 +865,6 @@ public Boolean get() { } } - @Test(timeout=120000) - public void testAddingCacheDirectiveInfosWhenCachingIsDisabled() - throws Exception { - cluster.shutdown(); - HdfsConfiguration conf = createCachingConf(); - conf.setBoolean(DFS_NAMENODE_CACHING_ENABLED_KEY, false); - MiniDFSCluster cluster = - new MiniDFSCluster.Builder(conf).numDataNodes(NUM_DATANODES).build(); - - try { - cluster.waitActive(); - DistributedFileSystem dfs = 
cluster.getFileSystem(); - NameNode namenode = cluster.getNameNode(); - // Create the pool - String pool = "pool1"; - namenode.getRpcServer().addCachePool(new CachePoolInfo(pool)); - // Create some test files - final int numFiles = 2; - final int numBlocksPerFile = 2; - final List paths = new ArrayList(numFiles); - for (int i=0; istandby, and manual failover * and failback between two namenodes. @@ -124,6 +127,17 @@ public void testTransitionActiveToStandby() throws Exception { } } + private void addCrmThreads(MiniDFSCluster cluster, + LinkedList crmThreads) { + for (int nn = 0; nn <= 1; nn++) { + Thread thread = cluster.getNameNode(nn).getNamesystem(). + getCacheManager().getCacheReplicationMonitor(); + if (thread != null) { + crmThreads.add(thread); + } + } + } + /** * Test that transitioning a service to the state that it is already * in is a nop, specifically, an exception is not thrown. @@ -131,19 +145,30 @@ public void testTransitionActiveToStandby() throws Exception { @Test public void testTransitionToCurrentStateIsANop() throws Exception { Configuration conf = new Configuration(); + conf.setLong(DFSConfigKeys.DFS_NAMENODE_PATH_BASED_CACHE_REFRESH_INTERVAL_MS, 1L); MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf) .nnTopology(MiniDFSNNTopology.simpleHATopology()) .numDataNodes(1) .build(); + LinkedList crmThreads = new LinkedList(); try { cluster.waitActive(); + addCrmThreads(cluster, crmThreads); cluster.transitionToActive(0); + addCrmThreads(cluster, crmThreads); cluster.transitionToActive(0); + addCrmThreads(cluster, crmThreads); cluster.transitionToStandby(0); + addCrmThreads(cluster, crmThreads); cluster.transitionToStandby(0); + addCrmThreads(cluster, crmThreads); } finally { cluster.shutdown(); } + // Verify that all cacheReplicationMonitor threads shut down + for (Thread thread : crmThreads) { + Uninterruptibles.joinUninterruptibly(thread); + } } /** From 1a8781f1f9d4916369be8090690fd97131443159 Mon Sep 17 00:00:00 2001 From: Karthik Kambatla Date: Fri, 3 Jan 2014 17:27:36 +0000 Subject: [PATCH 28/42] MAPREDUCE-5689. MRAppMaster does not preempt reducers when scheduled maps cannot be fulfilled. (lohit via kasha) git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1555161 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-mapreduce-project/CHANGES.txt | 3 +++ .../mapreduce/v2/app/rm/RMContainerAllocator.java | 3 ++- .../v2/app/TestRMContainerAllocator.java | 15 +++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/hadoop-mapreduce-project/CHANGES.txt b/hadoop-mapreduce-project/CHANGES.txt index 52aefce295a..3fb2dd6156c 100644 --- a/hadoop-mapreduce-project/CHANGES.txt +++ b/hadoop-mapreduce-project/CHANGES.txt @@ -267,6 +267,9 @@ Release 2.4.0 - UNRELEASED MAPREDUCE-5685. Fixed a bug with JobContext getCacheFiles API inside the WrappedReducer class. (Yi Song via vinodkv) + MAPREDUCE-5689. MRAppMaster does not preempt reducers when scheduled maps + cannot be fulfilled. 
(lohit via kasha) + Release 2.3.0 - UNRELEASED INCOMPATIBLE CHANGES diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerAllocator.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerAllocator.java index 18491fdbf1d..a8ee06b3b48 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerAllocator.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerAllocator.java @@ -229,7 +229,8 @@ protected synchronized void heartbeat() throws Exception { int completedMaps = getJob().getCompletedMaps(); int completedTasks = completedMaps + getJob().getCompletedReduces(); - if (lastCompletedTasks != completedTasks) { + if ((lastCompletedTasks != completedTasks) || + (scheduledRequests.maps.size() > 0)) { lastCompletedTasks = completedTasks; recalculateReduceSchedule = true; } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/TestRMContainerAllocator.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/TestRMContainerAllocator.java index 3a6644e4349..3eb5222865c 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/TestRMContainerAllocator.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/TestRMContainerAllocator.java @@ -1604,6 +1604,21 @@ public void testReduceScheduling() throws Exception { numPendingReduces, maxReduceRampupLimit, reduceSlowStart); verify(allocator).rampDownReduces(anyInt()); + + // Test reduce ramp-down for when there are scheduled maps + // Since we have two scheduled Maps, rampDownReducers + // should be invoked twice. + scheduledMaps = 2; + assignedReduces = 2; + doReturn(10 * 1024).when(allocator).getMemLimit(); + allocator.scheduleReduces( + totalMaps, succeededMaps, + scheduledMaps, scheduledReduces, + assignedMaps, assignedReduces, + mapResourceReqt, reduceResourceReqt, + numPendingReduces, + maxReduceRampupLimit, reduceSlowStart); + verify(allocator, times(2)).rampDownReduces(anyInt()); } private static class RecalculateContainerAllocator extends MyContainerAllocator { From 71e6ea4be10860a03eb946b7f935eb8abc9090a2 Mon Sep 17 00:00:00 2001 From: Jing Zhao Date: Fri, 3 Jan 2014 17:36:16 +0000 Subject: [PATCH 29/42] HDFS-5695. Clean up TestOfflineEditsViewer and OfflineEditsViewerHelper. Contributed by Haohui Mai. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1555164 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 3 + .../namenode/OfflineEditsViewerHelper.java | 159 +------------ .../TestOfflineEditsViewer.java | 217 ++++++++---------- 3 files changed, 99 insertions(+), 280 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index c7fa1945a17..40ee26e9bc0 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -780,6 +780,9 @@ Release 2.4.0 - UNRELEASED HDFS-2933. 
Improve DataNode Web UI Index Page. (Vivek Ganesan via Arpit Agarwal) + HDFS-5695. Clean up TestOfflineEditsViewer and OfflineEditsViewerHelper. + (Haohui Mai via jing9) + OPTIMIZATIONS HDFS-5239. Allow FSNamesystem lock fairness to be configurable (daryn) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/OfflineEditsViewerHelper.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/OfflineEditsViewerHelper.java index 43a4af1fe81..b3bf40ea9ce 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/OfflineEditsViewerHelper.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/OfflineEditsViewerHelper.java @@ -20,7 +20,6 @@ import java.io.File; import java.io.IOException; -import java.security.PrivilegedExceptionAction; import java.util.Iterator; import org.apache.commons.logging.Log; @@ -29,25 +28,13 @@ import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.CommonConfigurationKeysPublic; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileContext; -import org.apache.hadoop.fs.Options.Rename; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.permission.FsPermission; -import org.apache.hadoop.hdfs.DFSClientAdapter; import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.DFSTestUtil; import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.hdfs.MiniDFSCluster; -import org.apache.hadoop.hdfs.protocol.CachePoolInfo; -import org.apache.hadoop.hdfs.protocol.HdfsConstants; -import org.apache.hadoop.hdfs.protocol.LocatedBlocks; -import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo; import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory; import org.apache.hadoop.hdfs.server.common.Util; import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType; -import org.apache.hadoop.security.UserGroupInformation; -import org.apache.hadoop.security.token.Token; /** * OfflineEditsViewerHelper is a helper class for TestOfflineEditsViewer, @@ -135,151 +122,11 @@ public void shutdownCluster() throws IOException { * OP_CLEAR_NS_QUOTA (12) */ private CheckpointSignature runOperations() throws IOException { - LOG.info("Creating edits by performing fs operations"); // no check, if it's not it throws an exception which is what we want - DistributedFileSystem dfs = - (DistributedFileSystem)cluster.getFileSystem(); - FileContext fc = FileContext.getFileContext(cluster.getURI(0), config); - // OP_ADD 0 - Path pathFileCreate = new Path("/file_create_u\1F431"); - FSDataOutputStream s = dfs.create(pathFileCreate); - // OP_CLOSE 9 - s.close(); - // OP_RENAME_OLD 1 - Path pathFileMoved = new Path("/file_moved"); - dfs.rename(pathFileCreate, pathFileMoved); - // OP_DELETE 2 - dfs.delete(pathFileMoved, false); - // OP_MKDIR 3 - Path pathDirectoryMkdir = new Path("/directory_mkdir"); - dfs.mkdirs(pathDirectoryMkdir); - // OP_ALLOW_SNAPSHOT 29 - dfs.allowSnapshot(pathDirectoryMkdir); - // OP_DISALLOW_SNAPSHOT 30 - dfs.disallowSnapshot(pathDirectoryMkdir); - // OP_CREATE_SNAPSHOT 26 - String ssName = "snapshot1"; - dfs.allowSnapshot(pathDirectoryMkdir); - dfs.createSnapshot(pathDirectoryMkdir, ssName); - // OP_RENAME_SNAPSHOT 28 - String ssNewName = "snapshot2"; - dfs.renameSnapshot(pathDirectoryMkdir, ssName, ssNewName); - // OP_DELETE_SNAPSHOT 27 - 
dfs.deleteSnapshot(pathDirectoryMkdir, ssNewName); - // OP_SET_REPLICATION 4 - s = dfs.create(pathFileCreate); - s.close(); - dfs.setReplication(pathFileCreate, (short)1); - // OP_SET_PERMISSIONS 7 - Short permission = 0777; - dfs.setPermission(pathFileCreate, new FsPermission(permission)); - // OP_SET_OWNER 8 - dfs.setOwner(pathFileCreate, new String("newOwner"), null); - // OP_CLOSE 9 see above - // OP_SET_GENSTAMP 10 see above - // OP_SET_NS_QUOTA 11 obsolete - // OP_CLEAR_NS_QUOTA 12 obsolete - // OP_TIMES 13 - long mtime = 1285195527000L; // Wed, 22 Sep 2010 22:45:27 GMT - long atime = mtime; - dfs.setTimes(pathFileCreate, mtime, atime); - // OP_SET_QUOTA 14 - dfs.setQuota(pathDirectoryMkdir, 1000L, HdfsConstants.QUOTA_DONT_SET); - // OP_RENAME 15 - fc.rename(pathFileCreate, pathFileMoved, Rename.NONE); - // OP_CONCAT_DELETE 16 - Path pathConcatTarget = new Path("/file_concat_target"); - Path[] pathConcatFiles = new Path[2]; - pathConcatFiles[0] = new Path("/file_concat_0"); - pathConcatFiles[1] = new Path("/file_concat_1"); - - long length = blockSize * 3; // multiple of blocksize for concat - short replication = 1; - long seed = 1; - - DFSTestUtil.createFile(dfs, pathConcatTarget, length, replication, seed); - DFSTestUtil.createFile(dfs, pathConcatFiles[0], length, replication, seed); - DFSTestUtil.createFile(dfs, pathConcatFiles[1], length, replication, seed); - dfs.concat(pathConcatTarget, pathConcatFiles); - // OP_SYMLINK 17 - Path pathSymlink = new Path("/file_symlink"); - fc.createSymlink(pathConcatTarget, pathSymlink, false); - // OP_GET_DELEGATION_TOKEN 18 - // OP_RENEW_DELEGATION_TOKEN 19 - // OP_CANCEL_DELEGATION_TOKEN 20 - // see TestDelegationToken.java - // fake the user to renew token for - final Token[] tokens = dfs.addDelegationTokens("JobTracker", null); - UserGroupInformation longUgi = UserGroupInformation.createRemoteUser( - "JobTracker/foo.com@FOO.COM"); - try { - longUgi.doAs(new PrivilegedExceptionAction() { - @Override - public Object run() throws IOException, InterruptedException { - for (Token token : tokens) { - token.renew(config); - token.cancel(config); - } - return null; - } - }); - } catch(InterruptedException e) { - throw new IOException( - "renewDelegationToken threw InterruptedException", e); - } - // OP_UPDATE_MASTER_KEY 21 - // done by getDelegationTokenSecretManager().startThreads(); - - // OP_ADD_CACHE_POOL 35 - final String pool = "poolparty"; - dfs.addCachePool(new CachePoolInfo(pool)); - // OP_MODIFY_CACHE_POOL 36 - dfs.modifyCachePool(new CachePoolInfo(pool) - .setOwnerName("carlton") - .setGroupName("party") - .setMode(new FsPermission((short)0700)) - .setLimit(1989l)); - // OP_ADD_PATH_BASED_CACHE_DIRECTIVE 33 - long id = dfs.addCacheDirective( - new CacheDirectiveInfo.Builder(). - setPath(new Path("/bar")). - setReplication((short)1). - setPool(pool). - build()); - // OP_MODIFY_PATH_BASED_CACHE_DIRECTIVE 38 - dfs.modifyCacheDirective( - new CacheDirectiveInfo.Builder(). - setId(id). - setPath(new Path("/bar2")). 
- build()); - // OP_REMOVE_PATH_BASED_CACHE_DIRECTIVE 34 - dfs.removeCacheDirective(id); - // OP_REMOVE_CACHE_POOL 37 - dfs.removeCachePool(pool); - // sync to disk, otherwise we parse partial edits - cluster.getNameNode().getFSImage().getEditLog().logSync(); - - // OP_REASSIGN_LEASE 22 - String filePath = "/hard-lease-recovery-test"; - byte[] bytes = "foo-bar-baz".getBytes(); - DFSClientAdapter.stopLeaseRenewer(dfs); - FSDataOutputStream leaseRecoveryPath = dfs.create(new Path(filePath)); - leaseRecoveryPath.write(bytes); - leaseRecoveryPath.hflush(); - // Set the hard lease timeout to 1 second. - cluster.setLeasePeriod(60 * 1000, 1000); - // wait for lease recovery to complete - LocatedBlocks locatedBlocks; - do { - try { - Thread.sleep(1000); - } catch (InterruptedException e) { - LOG.info("Innocuous exception", e); - } - locatedBlocks = DFSClientAdapter.callGetBlockLocations( - cluster.getNameNodeRpc(), filePath, 0L, bytes.length); - } while (locatedBlocks.isUnderConstruction()); + DistributedFileSystem dfs = (DistributedFileSystem) cluster.getFileSystem(); + DFSTestUtil.runOperations(cluster, dfs, cluster.getConfiguration(0), + dfs.getDefaultBlockSize(), 0); // Force a roll so we get an OP_END_LOG_SEGMENT txn return cluster.getNameNodeRpc().rollEditLog(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/TestOfflineEditsViewer.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/TestOfflineEditsViewer.java index e6c9a3f3967..46f31026823 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/TestOfflineEditsViewer.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/TestOfflineEditsViewer.java @@ -26,8 +26,6 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; -import java.util.HashMap; -import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -36,168 +34,142 @@ import org.apache.hadoop.hdfs.server.namenode.OfflineEditsViewerHelper; import org.apache.hadoop.hdfs.tools.offlineEditsViewer.OfflineEditsViewer.Flags; import org.apache.hadoop.test.PathUtils; +import org.junit.After; import org.junit.Before; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import com.google.common.collect.ImmutableSet; public class TestOfflineEditsViewer { - private static final Log LOG = LogFactory.getLog(TestOfflineEditsViewer.class); + private static final Log LOG = LogFactory + .getLog(TestOfflineEditsViewer.class); - private static final Map obsoleteOpCodes = - new HashMap(); - - private static final Map missingOpCodes = - new HashMap(); - - static { - initializeObsoleteOpCodes(); - initializeMissingOpCodes(); - } - - private static String buildDir = - PathUtils.getTestDirName(TestOfflineEditsViewer.class); - - private static String cacheDir = - System.getProperty("test.cache.data", "build/test/cache"); + private static String buildDir = PathUtils + .getTestDirName(TestOfflineEditsViewer.class); // to create edits and get edits filename - private static final OfflineEditsViewerHelper nnHelper - = new OfflineEditsViewerHelper(); + private static final OfflineEditsViewerHelper nnHelper = new OfflineEditsViewerHelper(); + private static final ImmutableSet skippedOps = skippedOps(); - /** - * Initialize obsoleteOpCodes - * - * Reason for suppressing "deprecation" 
warnings: - * - * These are the opcodes that are not used anymore, some - * are marked deprecated, we need to include them here to make - * sure we exclude them when checking for completeness of testing, - * that's why the "deprecation" warnings are suppressed. - */ @SuppressWarnings("deprecation") - private static void initializeObsoleteOpCodes() { - obsoleteOpCodes.put(FSEditLogOpCodes.OP_DATANODE_ADD, true); - obsoleteOpCodes.put(FSEditLogOpCodes.OP_DATANODE_REMOVE, true); - obsoleteOpCodes.put(FSEditLogOpCodes.OP_SET_NS_QUOTA, true); - obsoleteOpCodes.put(FSEditLogOpCodes.OP_CLEAR_NS_QUOTA, true); + private static ImmutableSet skippedOps() { + ImmutableSet.Builder b = ImmutableSet + . builder(); + + // Deprecated opcodes + b.add(FSEditLogOpCodes.OP_DATANODE_ADD) + .add(FSEditLogOpCodes.OP_DATANODE_REMOVE) + .add(FSEditLogOpCodes.OP_SET_NS_QUOTA) + .add(FSEditLogOpCodes.OP_CLEAR_NS_QUOTA) + .add(FSEditLogOpCodes.OP_SET_GENSTAMP_V1); + + // Cannot test delegation token related code in insecure set up + b.add(FSEditLogOpCodes.OP_GET_DELEGATION_TOKEN) + .add(FSEditLogOpCodes.OP_RENEW_DELEGATION_TOKEN) + .add(FSEditLogOpCodes.OP_CANCEL_DELEGATION_TOKEN); + + // Skip invalid opcode + b.add(FSEditLogOpCodes.OP_INVALID); + return b.build(); } - /** - * Initialize missingOpcodes - * - * Opcodes that are not available except after uprade from - * an older version. We don't test these here. - */ - private static void initializeMissingOpCodes() { - obsoleteOpCodes.put(FSEditLogOpCodes.OP_SET_GENSTAMP_V1, true); - } + @Rule + public final TemporaryFolder folder = new TemporaryFolder(); @Before - public void setup() { - new File(cacheDir).mkdirs(); + public void setUp() throws IOException { + nnHelper.startCluster(buildDir + "/dfs/"); } - + + @After + public void tearDown() throws IOException { + nnHelper.shutdownCluster(); + } + /** * Test the OfflineEditsViewer */ @Test public void testGenerated() throws IOException { - - LOG.info("START - testing with generated edits"); - - nnHelper.startCluster(buildDir + "/dfs/"); - // edits generated by nnHelper (MiniDFSCluster), should have all op codes // binary, XML, reparsed binary - String edits = nnHelper.generateEdits(); - String editsParsedXml = cacheDir + "/editsParsed.xml"; - String editsReparsed = cacheDir + "/editsReparsed"; + String edits = nnHelper.generateEdits(); + String editsParsedXml = folder.newFile("editsParsed.xml").getAbsolutePath(); + String editsReparsed = folder.newFile("editsParsed").getAbsolutePath(); // parse to XML then back to binary assertEquals(0, runOev(edits, editsParsedXml, "xml", false)); assertEquals(0, runOev(editsParsedXml, editsReparsed, "binary", false)); // judgment time + assertTrue("Edits " + edits + " should have all op codes", + hasAllOpCodes(edits)); + LOG.info("Comparing generated file " + editsReparsed + + " with reference file " + edits); assertTrue( - "Edits " + edits + " should have all op codes", - hasAllOpCodes(edits)); - LOG.info("Comparing generated file " + editsReparsed + - " with reference file " + edits); - assertTrue( - "Generated edits and reparsed (bin to XML to bin) should be same", - filesEqualIgnoreTrailingZeros(edits, editsReparsed)); - - // removes edits so do this at the end - nnHelper.shutdownCluster(); - - LOG.info("END"); + "Generated edits and reparsed (bin to XML to bin) should be same", + filesEqualIgnoreTrailingZeros(edits, editsReparsed)); } @Test public void testRecoveryMode() throws IOException { - LOG.info("START - testing with generated edits"); - - nnHelper.startCluster(buildDir 
+ "/dfs/"); - // edits generated by nnHelper (MiniDFSCluster), should have all op codes // binary, XML, reparsed binary - String edits = nnHelper.generateEdits(); - + String edits = nnHelper.generateEdits(); + FileOutputStream os = new FileOutputStream(edits, true); // Corrupt the file by truncating the end - FileChannel editsFile = new FileOutputStream(edits, true).getChannel(); + FileChannel editsFile = os.getChannel(); editsFile.truncate(editsFile.size() - 5); - - String editsParsedXml = cacheDir + "/editsRecoveredParsed.xml"; - String editsReparsed = cacheDir + "/editsRecoveredReparsed"; - String editsParsedXml2 = cacheDir + "/editsRecoveredParsed2.xml"; + + String editsParsedXml = folder.newFile("editsRecoveredParsed.xml") + .getAbsolutePath(); + String editsReparsed = folder.newFile("editsRecoveredReparsed") + .getAbsolutePath(); + String editsParsedXml2 = folder.newFile("editsRecoveredParsed2.xml") + .getAbsolutePath(); // Can't read the corrupted file without recovery mode assertEquals(-1, runOev(edits, editsParsedXml, "xml", false)); - + // parse to XML then back to binary assertEquals(0, runOev(edits, editsParsedXml, "xml", true)); - assertEquals(0, runOev(editsParsedXml, editsReparsed, "binary", false)); + assertEquals(0, runOev(editsParsedXml, editsReparsed, "binary", false)); assertEquals(0, runOev(editsReparsed, editsParsedXml2, "xml", false)); // judgment time assertTrue("Test round trip", - filesEqualIgnoreTrailingZeros(editsParsedXml, editsParsedXml2)); + filesEqualIgnoreTrailingZeros(editsParsedXml, editsParsedXml2)); - // removes edits so do this at the end - nnHelper.shutdownCluster(); - - LOG.info("END"); + os.close(); } @Test public void testStored() throws IOException { - - LOG.info("START - testing with stored reference edits"); - // reference edits stored with source code (see build.xml) + final String cacheDir = System.getProperty("test.cache.data", + "build/test/cache"); // binary, XML, reparsed binary - String editsStored = cacheDir + "/editsStored"; - String editsStoredParsedXml = cacheDir + "/editsStoredParsed.xml"; - String editsStoredReparsed = cacheDir + "/editsStoredReparsed"; + String editsStored = cacheDir + "/editsStored"; + String editsStoredParsedXml = cacheDir + "/editsStoredParsed.xml"; + String editsStoredReparsed = cacheDir + "/editsStoredReparsed"; // reference XML version of editsStored (see build.xml) - String editsStoredXml = cacheDir + "/editsStored.xml"; - + String editsStoredXml = cacheDir + "/editsStored.xml"; + // parse to XML then back to binary assertEquals(0, runOev(editsStored, editsStoredParsedXml, "xml", false)); - assertEquals(0, runOev(editsStoredParsedXml, editsStoredReparsed, - "binary", false)); + assertEquals(0, + runOev(editsStoredParsedXml, editsStoredReparsed, "binary", false)); // judgement time + assertTrue("Edits " + editsStored + " should have all op codes", + hasAllOpCodes(editsStored)); + assertTrue("Reference XML edits and parsed to XML should be same", + filesEqual(editsStoredXml, editsStoredParsedXml)); assertTrue( - "Edits " + editsStored + " should have all op codes", - hasAllOpCodes(editsStored)); - assertTrue( - "Reference XML edits and parsed to XML should be same", - filesEqual(editsStoredXml, editsStoredParsedXml)); - assertTrue( - "Reference edits and reparsed (bin to XML to bin) should be same", - filesEqualIgnoreTrailingZeros(editsStored, editsStoredReparsed)); - - LOG.info("END"); + "Reference edits and reparsed (bin to XML to bin) should be same", + filesEqualIgnoreTrailingZeros(editsStored, 
editsStoredReparsed)); } /** @@ -233,22 +205,17 @@ private boolean hasAllOpCodes(String inFilename) throws IOException { OfflineEditsViewer oev = new OfflineEditsViewer(); if (oev.go(inFilename, outFilename, "stats", new Flags(), visitor) != 0) return false; - LOG.info("Statistics for " + inFilename + "\n" + - visitor.getStatisticsString()); - + LOG.info("Statistics for " + inFilename + "\n" + + visitor.getStatisticsString()); + boolean hasAllOpCodes = true; - for(FSEditLogOpCodes opCode : FSEditLogOpCodes.values()) { + for (FSEditLogOpCodes opCode : FSEditLogOpCodes.values()) { // don't need to test obsolete opCodes - if(obsoleteOpCodes.containsKey(opCode)) { + if (skippedOps.contains(opCode)) continue; - } else if (missingOpCodes.containsKey(opCode)) { - continue; - } else if (opCode == FSEditLogOpCodes.OP_INVALID) { - continue; - } Long count = visitor.getStatistics().get(opCode); - if((count == null) || (count == 0)) { + if ((count == null) || (count == 0)) { hasAllOpCodes = false; LOG.info("Opcode " + opCode + " not tested in " + inFilename); } @@ -257,9 +224,9 @@ private boolean hasAllOpCodes(String inFilename) throws IOException { } /** - * Compare two files, ignore trailing zeros at the end, - * for edits log the trailing zeros do not make any difference, - * throw exception is the files are not same + * Compare two files, ignore trailing zeros at the end, for edits log the + * trailing zeros do not make any difference, throw exception is the files are + * not same * * @param filenameSmall first file to compare (doesn't have to be smaller) * @param filenameLarge second file to compare (doesn't have to be larger) @@ -271,7 +238,7 @@ private boolean filesEqualIgnoreTrailingZeros(String filenameSmall, ByteBuffer large = ByteBuffer.wrap(DFSTestUtil.loadFile(filenameLarge)); // now correct if it's otherwise - if(small.capacity() > large.capacity()) { + if (small.capacity() > large.capacity()) { ByteBuffer tmpByteBuffer = small; small = large; large = tmpByteBuffer; @@ -288,13 +255,15 @@ private boolean filesEqualIgnoreTrailingZeros(String filenameSmall, large.limit(small.capacity()); // compares position to limit - if(!small.equals(large)) { return false; } + if (!small.equals(large)) { + return false; + } // everything after limit should be 0xFF int i = large.limit(); large.clear(); - for(; i < large.capacity(); i++) { - if(large.get(i) != FSEditLogOpCodes.OP_INVALID.getOpCode()) { + for (; i < large.capacity(); i++) { + if (large.get(i) != FSEditLogOpCodes.OP_INVALID.getOpCode()) { return false; } } From b80343a55f0c34459d4aca4b800869b93c4d2e2c Mon Sep 17 00:00:00 2001 From: Brandon Li Date: Fri, 3 Jan 2014 18:29:44 +0000 Subject: [PATCH 30/42] HDFS-5705. TestSecondaryNameNodeUpgrade#testChangeNsIDFails may fail due to ConcurrentModificationException. Contributed by Ted Yu git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1555190 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 3 +++ .../hdfs/server/datanode/fsdataset/impl/FsVolumeImpl.java | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 40ee26e9bc0..2267b0ac6e3 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -468,6 +468,9 @@ Trunk (Unreleased) HDFS-5659. dfsadmin -report doesn't output cache information properly. (wang) + HDFS-5705. 
TestSecondaryNameNodeUpgrade#testChangeNsIDFails may fail due + to ConcurrentModificationException. (Ted Yu via brandonli) + BREAKDOWN OF HDFS-2832 SUBTASKS AND RELATED JIRAS HDFS-4985. Add storage type to the protocol and expose it in block report diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeImpl.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeImpl.java index 9e5b0ebee4f..795fab1f3a5 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeImpl.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeImpl.java @@ -19,10 +19,10 @@ import java.io.File; import java.io.IOException; -import java.util.HashMap; import java.util.Map; import java.util.Map.Entry; import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.Executor; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.ThreadFactory; @@ -54,7 +54,7 @@ class FsVolumeImpl implements FsVolumeSpi { private final String storageID; private final StorageType storageType; private final Map bpSlices - = new HashMap(); + = new ConcurrentHashMap(); private final File currentDir; // /current private final DF usage; private final long reserved; From f8a9329f2b8e768fe6730fc05436e973344b9132 Mon Sep 17 00:00:00 2001 From: Arpit Agarwal Date: Mon, 6 Jan 2014 17:28:23 +0000 Subject: [PATCH 31/42] HDFS-5667. Include DatanodeStorage in StorageReport. (Arpit Agarwal) git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1555929 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 14 +++--- .../hadoop/hdfs/protocolPB/PBHelper.java | 10 ++-- .../blockmanagement/DatanodeDescriptor.java | 4 +- .../fsdataset/impl/FsDatasetImpl.java | 7 +-- .../hdfs/server/protocol/StorageReport.java | 12 ++--- .../src/main/proto/DatanodeProtocol.proto | 3 +- .../apache/hadoop/hdfs/MiniDFSCluster.java | 46 ++++++++++++------- .../hdfs/MiniDFSClusterWithNodeGroup.java | 18 ++------ .../blockmanagement/BlockManagerTestUtil.java | 4 +- .../hdfs/server/common/TestJspHelper.java | 7 ++- .../server/datanode/SimulatedFSDataset.java | 5 +- .../hdfs/server/datanode/TestDiskError.java | 6 +-- .../namenode/NNThroughputBenchmark.java | 21 +++++---- .../server/namenode/TestDeadDatanode.java | 5 +- 14 files changed, 91 insertions(+), 71 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 2267b0ac6e3..07e4b27b278 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -449,12 +449,6 @@ Trunk (Unreleased) HDFS-5626. dfsadmin -report shows incorrect cache values. (cmccabe) - HDFS-5406. Send incremental block reports for all storages in a - single call. (Arpit Agarwal) - - HDFS-5454. DataNode UUID should be assigned prior to FsDataset - initialization. (Arpit Agarwal) - HDFS-5679. TestCacheDirectives should handle the case where native code is not available. (wang) @@ -596,6 +590,14 @@ Trunk (Unreleased) HDFS-5648. Get rid of FsDatasetImpl#perVolumeReplicaMap. (Arpit Agarwal) + HDFS-5406. Send incremental block reports for all storages in a + single call. (Arpit Agarwal) + + HDFS-5454. DataNode UUID should be assigned prior to FsDataset + initialization. (Arpit Agarwal) + + HDFS-5667. 
Include DatanodeStorage in StorageReport. (Arpit Agarwal) + Release 2.4.0 - UNRELEASED INCOMPATIBLE CHANGES diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelper.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelper.java index 10be9062842..b0db32aaa62 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelper.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelper.java @@ -1559,13 +1559,17 @@ public static StorageReportProto convert(StorageReport r) { StorageReportProto.Builder builder = StorageReportProto.newBuilder() .setBlockPoolUsed(r.getBlockPoolUsed()).setCapacity(r.getCapacity()) .setDfsUsed(r.getDfsUsed()).setRemaining(r.getRemaining()) - .setStorageUuid(r.getStorageID()); + .setStorageUuid(r.getStorage().getStorageID()) + .setStorage(convert(r.getStorage())); return builder.build(); } public static StorageReport convert(StorageReportProto p) { - return new StorageReport(p.getStorageUuid(), p.getFailed(), - p.getCapacity(), p.getDfsUsed(), p.getRemaining(), + return new StorageReport( + p.hasStorage() ? + convert(p.getStorage()) : + new DatanodeStorage(p.getStorageUuid()), + p.getFailed(), p.getCapacity(), p.getDfsUsed(), p.getRemaining(), p.getBlockPoolUsed()); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java index 5621c17400f..fc4bf885092 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java @@ -355,11 +355,11 @@ public void updateHeartbeat(StorageReport[] reports, long cacheCapacity, setLastUpdate(Time.now()); this.volumeFailures = volFailures; for (StorageReport report : reports) { - DatanodeStorageInfo storage = storageMap.get(report.getStorageID()); + DatanodeStorageInfo storage = storageMap.get(report.getStorage().getStorageID()); if (storage == null) { // This is seen during cluster initialization when the heartbeat // is received before the initial block reports from each storage. 
- storage = updateStorage(new DatanodeStorage(report.getStorageID())); + storage = updateStorage(report.getStorage()); } storage.receivedHeartbeat(report); totalCapacity += report.getCapacity(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java index b81235625e6..53386bcbbf8 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java @@ -121,7 +121,7 @@ public StorageReport[] getStorageReports(String bpid) reports = new StorageReport[volumes.volumes.size()]; int i = 0; for (FsVolumeImpl volume : volumes.volumes) { - reports[i++] = new StorageReport(volume.getStorageID(), + reports[i++] = new StorageReport(volume.toDatanodeStorage(), false, volume.getCapacity(), volume.getDfsUsed(), @@ -237,12 +237,9 @@ public LengthInputStream getMetaDataInputStream(ExtendedBlock b) final List volArray = new ArrayList( storage.getNumStorageDirs()); for (int idx = 0; idx < storage.getNumStorageDirs(); idx++) { - // TODO: getStorageTypeFromLocations() is only a temporary workaround and - // should be replaced with getting storage type from DataStorage (missing - // storage type now) directly. Storage.StorageDirectory sd = storage.getStorageDir(idx); final File dir = sd.getCurrentDir(); - final StorageType storageType = getStorageTypeFromLocations(dataLocations, dir); + final StorageType storageType = getStorageTypeFromLocations(dataLocations, sd.getRoot()); volArray.add(new FsVolumeImpl(this, sd.getStorageUuid(), dir, conf, storageType)); LOG.info("Added volume - " + dir + ", StorageType: " + storageType); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/StorageReport.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/StorageReport.java index c805f1ea455..5fd5733df92 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/StorageReport.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/StorageReport.java @@ -21,7 +21,7 @@ * Utilization report for a Datanode storage */ public class StorageReport { - private final String storageID; + private final DatanodeStorage storage; private final boolean failed; private final long capacity; private final long dfsUsed; @@ -30,9 +30,9 @@ public class StorageReport { public static final StorageReport[] EMPTY_ARRAY = {}; - public StorageReport(String sid, boolean failed, long capacity, long dfsUsed, - long remaining, long bpUsed) { - this.storageID = sid; + public StorageReport(DatanodeStorage storage, boolean failed, + long capacity, long dfsUsed, long remaining, long bpUsed) { + this.storage = storage; this.failed = failed; this.capacity = capacity; this.dfsUsed = dfsUsed; @@ -40,8 +40,8 @@ public StorageReport(String sid, boolean failed, long capacity, long dfsUsed, this.blockPoolUsed = bpUsed; } - public String getStorageID() { - return storageID; + public DatanodeStorage getStorage() { + return storage; } public boolean isFailed() { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/proto/DatanodeProtocol.proto b/hadoop-hdfs-project/hadoop-hdfs/src/main/proto/DatanodeProtocol.proto index 
236a7350705..0e6d59ae188 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/proto/DatanodeProtocol.proto +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/proto/DatanodeProtocol.proto @@ -196,12 +196,13 @@ message HeartbeatRequestProto { } message StorageReportProto { - required string storageUuid = 1; + required string storageUuid = 1 [ deprecated = true ]; optional bool failed = 2 [ default = false ]; optional uint64 capacity = 3 [ default = 0 ]; optional uint64 dfsUsed = 4 [ default = 0 ]; optional uint64 remaining = 5 [ default = 0 ]; optional uint64 blockPoolUsed = 6 [ default = 0 ]; + optional DatanodeStorageProto storage = 7; // supersedes StorageUuid } /** diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java index 1221a7f2a1f..ea78be1d150 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java @@ -140,6 +140,7 @@ public static class Builder { private int nameNodeHttpPort = 0; private final Configuration conf; private int numDataNodes = 1; + private StorageType storageType = StorageType.DEFAULT; private boolean format = true; private boolean manageNameDfsDirs = true; private boolean manageNameDfsSharedDirs = true; @@ -185,6 +186,14 @@ public Builder numDataNodes(int val) { return this; } + /** + * Default: StorageType.DEFAULT + */ + public Builder storageType(StorageType type) { + this.storageType = type; + return this; + } + /** * Default: true */ @@ -341,6 +350,7 @@ protected MiniDFSCluster(Builder builder) throws IOException { initMiniDFSCluster(builder.conf, builder.numDataNodes, + builder.storageType, builder.format, builder.manageNameDfsDirs, builder.manageNameDfsSharedDirs, @@ -592,7 +602,7 @@ public MiniDFSCluster(int nameNodePort, String[] racks, String hosts[], long[] simulatedCapacities) throws IOException { this.nameNodes = new NameNodeInfo[1]; // Single namenode in the cluster - initMiniDFSCluster(conf, numDataNodes, format, + initMiniDFSCluster(conf, numDataNodes, StorageType.DEFAULT, format, manageNameDfsDirs, true, manageDataDfsDirs, manageDataDfsDirs, operation, racks, hosts, simulatedCapacities, null, true, false, @@ -601,7 +611,7 @@ public MiniDFSCluster(int nameNodePort, private void initMiniDFSCluster( Configuration conf, - int numDataNodes, boolean format, boolean manageNameDfsDirs, + int numDataNodes, StorageType storageType, boolean format, boolean manageNameDfsDirs, boolean manageNameDfsSharedDirs, boolean enableManagedDfsDirsRedundancy, boolean manageDataDfsDirs, StartupOption operation, String[] racks, String[] hosts, long[] simulatedCapacities, String clusterId, @@ -670,7 +680,7 @@ private void initMiniDFSCluster( } // Start the DataNodes - startDataNodes(conf, numDataNodes, manageDataDfsDirs, operation, racks, + startDataNodes(conf, numDataNodes, storageType, manageDataDfsDirs, operation, racks, hosts, simulatedCapacities, setupHostsFile, checkDataNodeAddrConfig, checkDataNodeHostConfig); waitClusterUp(); //make sure ProxyUsers uses the latest conf @@ -990,6 +1000,19 @@ public void waitClusterUp() throws IOException { } } + String makeDataNodeDirs(int dnIndex, StorageType storageType) throws IOException { + StringBuilder sb = new StringBuilder(); + for (int j = 0; j < DIRS_PER_DATANODE; ++j) { + File dir = getInstanceStorageDir(dnIndex, j); + dir.mkdirs(); + if 
(!dir.isDirectory()) { + throw new IOException("Mkdirs failed to create directory for DataNode " + dir); + } + sb.append((j > 0 ? "," : "") + "[" + storageType + "]" + fileAsURI(dir)); + } + return sb.toString(); + } + /** * Modify the config and start up additional DataNodes. The info port for * DataNodes is guaranteed to use a free port. @@ -1052,7 +1075,7 @@ public synchronized void startDataNodes(Configuration conf, int numDataNodes, String[] racks, String[] hosts, long[] simulatedCapacities, boolean setupHostsFile) throws IOException { - startDataNodes(conf, numDataNodes, manageDfsDirs, operation, racks, hosts, + startDataNodes(conf, numDataNodes, StorageType.DEFAULT, manageDfsDirs, operation, racks, hosts, simulatedCapacities, setupHostsFile, false, false); } @@ -1066,7 +1089,7 @@ public synchronized void startDataNodes(Configuration conf, int numDataNodes, long[] simulatedCapacities, boolean setupHostsFile, boolean checkDataNodeAddrConfig) throws IOException { - startDataNodes(conf, numDataNodes, manageDfsDirs, operation, racks, hosts, + startDataNodes(conf, numDataNodes, StorageType.DEFAULT, manageDfsDirs, operation, racks, hosts, simulatedCapacities, setupHostsFile, checkDataNodeAddrConfig, false); } @@ -1098,7 +1121,7 @@ public synchronized void startDataNodes(Configuration conf, int numDataNodes, * @throws IllegalStateException if NameNode has been shutdown */ public synchronized void startDataNodes(Configuration conf, int numDataNodes, - boolean manageDfsDirs, StartupOption operation, + StorageType storageType, boolean manageDfsDirs, StartupOption operation, String[] racks, String[] hosts, long[] simulatedCapacities, boolean setupHostsFile, @@ -1154,16 +1177,7 @@ public synchronized void startDataNodes(Configuration conf, int numDataNodes, // Set up datanode address setupDatanodeAddress(dnConf, setupHostsFile, checkDataNodeAddrConfig); if (manageDfsDirs) { - StringBuilder sb = new StringBuilder(); - for (int j = 0; j < DIRS_PER_DATANODE; ++j) { - File dir = getInstanceStorageDir(i, j); - dir.mkdirs(); - if (!dir.isDirectory()) { - throw new IOException("Mkdirs failed to create directory for DataNode " + dir); - } - sb.append((j > 0 ? 
"," : "") + fileAsURI(dir)); - } - String dirs = sb.toString(); + String dirs = makeDataNodeDirs(i, storageType); dnConf.set(DFS_DATANODE_DATA_DIR_KEY, dirs); conf.set(DFS_DATANODE_DATA_DIR_KEY, dirs); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSClusterWithNodeGroup.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSClusterWithNodeGroup.java index 453ec223fe5..42ec9f84df1 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSClusterWithNodeGroup.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSClusterWithNodeGroup.java @@ -50,7 +50,7 @@ public static void setNodeGroups (String[] nodeGroups) { } public synchronized void startDataNodes(Configuration conf, int numDataNodes, - boolean manageDfsDirs, StartupOption operation, + StorageType storageType, boolean manageDfsDirs, StartupOption operation, String[] racks, String[] nodeGroups, String[] hosts, long[] simulatedCapacities, boolean setupHostsFile, @@ -112,15 +112,7 @@ public synchronized void startDataNodes(Configuration conf, int numDataNodes, // Set up datanode address setupDatanodeAddress(dnConf, setupHostsFile, checkDataNodeAddrConfig); if (manageDfsDirs) { - File dir1 = getInstanceStorageDir(i, 0); - File dir2 = getInstanceStorageDir(i, 1); - dir1.mkdirs(); - dir2.mkdirs(); - if (!dir1.isDirectory() || !dir2.isDirectory()) { - throw new IOException("Mkdirs failed to create directory for DataNode " - + i + ": " + dir1 + " or " + dir2); - } - String dirs = fileAsURI(dir1) + "," + fileAsURI(dir2); + String dirs = makeDataNodeDirs(i, storageType); dnConf.set(DFSConfigKeys.DFS_DATANODE_DATA_DIR_KEY, dirs); conf.set(DFSConfigKeys.DFS_DATANODE_DATA_DIR_KEY, dirs); } @@ -198,7 +190,7 @@ public synchronized void startDataNodes(Configuration conf, int numDataNodes, String[] racks, String[] nodeGroups, String[] hosts, long[] simulatedCapacities, boolean setupHostsFile) throws IOException { - startDataNodes(conf, numDataNodes, manageDfsDirs, operation, racks, nodeGroups, + startDataNodes(conf, numDataNodes, StorageType.DEFAULT, manageDfsDirs, operation, racks, nodeGroups, hosts, simulatedCapacities, setupHostsFile, false, false); } @@ -213,13 +205,13 @@ public void startDataNodes(Configuration conf, int numDataNodes, // This is for initialize from parent class. 
@Override public synchronized void startDataNodes(Configuration conf, int numDataNodes, - boolean manageDfsDirs, StartupOption operation, + StorageType storageType, boolean manageDfsDirs, StartupOption operation, String[] racks, String[] hosts, long[] simulatedCapacities, boolean setupHostsFile, boolean checkDataNodeAddrConfig, boolean checkDataNodeHostConfig) throws IOException { - startDataNodes(conf, numDataNodes, manageDfsDirs, operation, racks, + startDataNodes(conf, numDataNodes, storageType, manageDfsDirs, operation, racks, NODE_GROUPS, hosts, simulatedCapacities, setupHostsFile, checkDataNodeAddrConfig, checkDataNodeHostConfig); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java index cfc9750967b..fecc7be992b 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java @@ -257,8 +257,10 @@ public static StorageReport[] getStorageReportsForDatanode( DatanodeDescriptor dnd) { ArrayList reports = new ArrayList(); for (DatanodeStorageInfo storage : dnd.getStorageInfos()) { + DatanodeStorage dns = new DatanodeStorage( + storage.getStorageID(), storage.getState(), storage.getStorageType()); StorageReport report = new StorageReport( - storage.getStorageID(), false, storage.getCapacity(), + dns ,false, storage.getCapacity(), storage.getDfsUsed(), storage.getRemaining(), storage.getBlockPoolUsed()); reports.add(report); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/common/TestJspHelper.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/common/TestJspHelper.java index 37d594eef53..1f27a0b8428 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/common/TestJspHelper.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/common/TestJspHelper.java @@ -470,11 +470,14 @@ public void testSortNodeByFields() throws Exception { BlockManagerTestUtil.updateStorage(dnDesc1, new DatanodeStorage("dnStorage1")); BlockManagerTestUtil.updateStorage(dnDesc2, new DatanodeStorage("dnStorage2")); + DatanodeStorage dns1 = new DatanodeStorage("dnStorage1"); + DatanodeStorage dns2 = new DatanodeStorage("dnStorage2"); + StorageReport[] report1 = new StorageReport[] { - new StorageReport("dnStorage1", false, 1024, 100, 924, 100) + new StorageReport(dns1, false, 1024, 100, 924, 100) }; StorageReport[] report2 = new StorageReport[] { - new StorageReport("dnStorage2", false, 2500, 200, 1848, 200) + new StorageReport(dns2, false, 2500, 200, 1848, 200) }; dnDesc1.updateHeartbeat(report1, 5l, 3l, 10, 2); dnDesc2.updateHeartbeat(report2, 10l, 2l, 20, 1); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/SimulatedFSDataset.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/SimulatedFSDataset.java index f5b291e0ae6..a2e95a4d673 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/SimulatedFSDataset.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/SimulatedFSDataset.java @@ -394,8 +394,9 @@ String getStorageUuid() { } 
synchronized StorageReport getStorageReport(String bpid) { - return new StorageReport(getStorageUuid(), false, getCapacity(), - getUsed(), getFree(), map.get(bpid).getUsed()); + return new StorageReport(new DatanodeStorage(getStorageUuid()), + false, getCapacity(), getUsed(), getFree(), + map.get(bpid).getUsed()); } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDiskError.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDiskError.java index f67af62b0ce..6b70cbfc599 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDiskError.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDiskError.java @@ -40,6 +40,7 @@ import org.apache.hadoop.hdfs.protocol.datatransfer.BlockConstructionStage; import org.apache.hadoop.hdfs.protocol.datatransfer.Sender; import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager; +import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi; import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; import org.apache.hadoop.util.DataChecksum; import org.junit.After; @@ -186,9 +187,8 @@ public void testLocalDirs() throws Exception { // Check permissions on directories in 'dfs.datanode.data.dir' FileSystem localFS = FileSystem.getLocal(conf); for (DataNode dn : cluster.getDataNodes()) { - String[] dataDirs = - dn.getConf().getStrings(DFSConfigKeys.DFS_DATANODE_DATA_DIR_KEY); - for (String dir : dataDirs) { + for (FsVolumeSpi v : dn.getFSDataset().getVolumes()) { + String dir = v.getBasePath(); Path dataDir = new Path(dir); FsPermission actual = localFS.getFileStatus(dataDir).getPermission(); assertEquals("Permission for dir: " + dataDir + ", is " + actual + diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NNThroughputBenchmark.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NNThroughputBenchmark.java index 7aef8e513a0..b32aecdb6a5 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NNThroughputBenchmark.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NNThroughputBenchmark.java @@ -938,7 +938,7 @@ void register() throws IOException { // register datanode dnRegistration = nameNodeProto.registerDatanode(dnRegistration); //first block reports - storage = new DatanodeStorage(dnRegistration.getDatanodeUuid()); + storage = new DatanodeStorage(DatanodeStorage.generateUuid()); final StorageBlockReport[] reports = { new StorageBlockReport(storage, new BlockListAsLongs(null, null).getBlockListAsLongs()) @@ -954,8 +954,8 @@ void register() throws IOException { void sendHeartbeat() throws IOException { // register datanode // TODO:FEDERATION currently a single block pool is supported - StorageReport[] rep = { new StorageReport(dnRegistration.getDatanodeUuid(), - false, DF_CAPACITY, DF_USED, DF_CAPACITY - DF_USED, DF_USED) }; + StorageReport[] rep = { new StorageReport(storage, false, + DF_CAPACITY, DF_USED, DF_CAPACITY - DF_USED, DF_USED) }; DatanodeCommand[] cmds = nameNodeProto.sendHeartbeat(dnRegistration, rep, 0L, 0L, 0, 0, 0).getCommands(); if(cmds != null) { @@ -1001,7 +1001,7 @@ public int compareTo(String xferAddr) { @SuppressWarnings("unused") // keep it for future blockReceived benchmark int replicateBlocks() throws IOException { // register datanode - 
StorageReport[] rep = { new StorageReport(dnRegistration.getDatanodeUuid(), + StorageReport[] rep = { new StorageReport(storage, false, DF_CAPACITY, DF_USED, DF_CAPACITY - DF_USED, DF_USED) }; DatanodeCommand[] cmds = nameNodeProto.sendHeartbeat(dnRegistration, rep, 0L, 0L, 0, 0, 0).getCommands(); @@ -1010,7 +1010,8 @@ int replicateBlocks() throws IOException { if (cmd.getAction() == DatanodeProtocol.DNA_TRANSFER) { // Send a copy of a block to another datanode BlockCommand bcmd = (BlockCommand)cmd; - return transferBlocks(bcmd.getBlocks(), bcmd.getTargets()); + return transferBlocks(bcmd.getBlocks(), bcmd.getTargets(), + bcmd.getTargetStorageIDs()); } } } @@ -1023,12 +1024,14 @@ int replicateBlocks() throws IOException { * that the blocks have been received. */ private int transferBlocks( Block blocks[], - DatanodeInfo xferTargets[][] + DatanodeInfo xferTargets[][], + String targetStorageIDs[][] ) throws IOException { for(int i = 0; i < blocks.length; i++) { DatanodeInfo blockTargets[] = xferTargets[i]; for(int t = 0; t < blockTargets.length; t++) { DatanodeInfo dnInfo = blockTargets[t]; + String targetStorageID = targetStorageIDs[i][t]; DatanodeRegistration receivedDNReg; receivedDNReg = new DatanodeRegistration(dnInfo, new DataStorage(nsInfo), @@ -1038,7 +1041,7 @@ private int transferBlocks( Block blocks[], blocks[i], ReceivedDeletedBlockInfo.BlockStatus.RECEIVED_BLOCK, null) }; StorageReceivedDeletedBlocks[] report = { new StorageReceivedDeletedBlocks( - receivedDNReg.getDatanodeUuid(), rdBlocks) }; + targetStorageID, rdBlocks) }; nameNodeProto.blockReceivedAndDeleted(receivedDNReg, nameNode .getNamesystem().getBlockPoolId(), report); } @@ -1127,7 +1130,7 @@ void generateInputs(int[] ignore) throws IOException { } // create files - LOG.info("Creating " + nrFiles + " with " + blocksPerFile + " blocks each."); + LOG.info("Creating " + nrFiles + " files with " + blocksPerFile + " blocks each."); FileNameGenerator nameGenerator; nameGenerator = new FileNameGenerator(getBaseDir(), 100); String clientName = getClientName(007); @@ -1161,7 +1164,7 @@ private ExtendedBlock addBlocks(String fileName, String clientName) loc.getBlock().getLocalBlock(), ReceivedDeletedBlockInfo.BlockStatus.RECEIVED_BLOCK, null) }; StorageReceivedDeletedBlocks[] report = { new StorageReceivedDeletedBlocks( - datanodes[dnIdx].dnRegistration.getDatanodeUuid(), rdBlocks) }; + datanodes[dnIdx].storage.getStorageID(), rdBlocks) }; nameNodeProto.blockReceivedAndDeleted(datanodes[dnIdx].dnRegistration, loc .getBlock().getBlockPoolId(), report); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestDeadDatanode.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestDeadDatanode.java index b05248a9954..6c4bb169029 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestDeadDatanode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestDeadDatanode.java @@ -140,8 +140,9 @@ public void testDeadDatanode() throws Exception { // Ensure heartbeat from dead datanode is rejected with a command // that asks datanode to register again - StorageReport[] rep = { new StorageReport(reg.getDatanodeUuid(), false, 0, 0, - 0, 0) }; + StorageReport[] rep = { new StorageReport( + new DatanodeStorage(reg.getDatanodeUuid()), + false, 0, 0, 0, 0) }; DatanodeCommand[] cmd = dnp.sendHeartbeat(reg, rep, 0L, 0L, 0, 0, 0) .getCommands(); assertEquals(1, cmd.length); From 
fb2406a635263875103a06763614eebcf77d297a Mon Sep 17 00:00:00 2001 From: Alejandro Abdelnur Date: Mon, 6 Jan 2014 18:11:38 +0000 Subject: [PATCH 32/42] HADOOP-10193. hadoop-auth's PseudoAuthenticationHandler can consume getInputStream. (gchanan via tucu) git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1555955 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-common-project/hadoop-auth/pom.xml | 5 +++++ .../server/PseudoAuthenticationHandler.java | 20 ++++++++++++++++++- .../TestPseudoAuthenticationHandler.java | 2 +- .../hadoop-common/CHANGES.txt | 3 +++ 4 files changed, 28 insertions(+), 2 deletions(-) diff --git a/hadoop-common-project/hadoop-auth/pom.xml b/hadoop-common-project/hadoop-auth/pom.xml index dc841e5d527..1d913a637c1 100644 --- a/hadoop-common-project/hadoop-auth/pom.xml +++ b/hadoop-common-project/hadoop-auth/pom.xml @@ -92,6 +92,11 @@ hadoop-minikdc test + + org.apache.httpcomponents + httpclient + compile + diff --git a/hadoop-common-project/hadoop-auth/src/main/java/org/apache/hadoop/security/authentication/server/PseudoAuthenticationHandler.java b/hadoop-common-project/hadoop-auth/src/main/java/org/apache/hadoop/security/authentication/server/PseudoAuthenticationHandler.java index 1a2f98c1c95..235081b9618 100644 --- a/hadoop-common-project/hadoop-auth/src/main/java/org/apache/hadoop/security/authentication/server/PseudoAuthenticationHandler.java +++ b/hadoop-common-project/hadoop-auth/src/main/java/org/apache/hadoop/security/authentication/server/PseudoAuthenticationHandler.java @@ -16,10 +16,15 @@ import org.apache.hadoop.security.authentication.client.AuthenticationException; import org.apache.hadoop.security.authentication.client.PseudoAuthenticator; +import org.apache.http.client.utils.URLEncodedUtils; +import org.apache.http.NameValuePair; + import javax.servlet.ServletException; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import java.io.IOException; +import java.nio.charset.Charset; +import java.util.List; import java.util.Properties; /** @@ -48,6 +53,7 @@ public class PseudoAuthenticationHandler implements AuthenticationHandler { */ public static final String ANONYMOUS_ALLOWED = TYPE + ".anonymous.allowed"; + private static final Charset UTF8_CHARSET = Charset.forName("UTF-8"); private boolean acceptAnonymous; /** @@ -114,6 +120,18 @@ public boolean managementOperation(AuthenticationToken token, return true; } + private String getUserName(HttpServletRequest request) { + List list = URLEncodedUtils.parse(request.getQueryString(), UTF8_CHARSET); + if (list != null) { + for (NameValuePair nv : list) { + if (PseudoAuthenticator.USER_NAME.equals(nv.getName())) { + return nv.getValue(); + } + } + } + return null; + } + /** * Authenticates an HTTP client request. *

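The getUserName() helper added above avoids HttpServletRequest#getParameter(): on a form-encoded POST the container may have to read the request body to answer getParameter(), which consumes the stream that getInputStream() would otherwise hand to downstream code. A minimal standalone sketch of the contrast, assuming only the Servlet API and Apache HttpClient 4.x on the classpath (the class and method names below are illustrative, not the patched handler):

import java.nio.charset.Charset;
import java.util.List;

import javax.servlet.http.HttpServletRequest;

import org.apache.http.NameValuePair;
import org.apache.http.client.utils.URLEncodedUtils;

public class QueryStringUserLookup {
  private static final Charset UTF8 = Charset.forName("UTF-8");

  // May trigger parsing of a form-encoded POST body, consuming the
  // request's input stream for any later reader.
  static String viaGetParameter(HttpServletRequest request) {
    return request.getParameter("user.name");
  }

  // Looks only at the query string, leaving the request body untouched.
  static String viaQueryString(HttpServletRequest request) {
    List<NameValuePair> pairs =
        URLEncodedUtils.parse(request.getQueryString(), UTF8);
    if (pairs != null) {
      for (NameValuePair pair : pairs) {
        if ("user.name".equals(pair.getName())) {
          return pair.getValue();
        }
      }
    }
    return null;
  }
}

The hunk below switches authenticate() from the first lookup to the second.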
@@ -139,7 +157,7 @@ public boolean managementOperation(AuthenticationToken token, public AuthenticationToken authenticate(HttpServletRequest request, HttpServletResponse response) throws IOException, AuthenticationException { AuthenticationToken token; - String userName = request.getParameter(PseudoAuthenticator.USER_NAME); + String userName = getUserName(request); if (userName == null) { if (getAcceptAnonymous()) { token = AuthenticationToken.ANONYMOUS; diff --git a/hadoop-common-project/hadoop-auth/src/test/java/org/apache/hadoop/security/authentication/server/TestPseudoAuthenticationHandler.java b/hadoop-common-project/hadoop-auth/src/test/java/org/apache/hadoop/security/authentication/server/TestPseudoAuthenticationHandler.java index da7eda7bc8e..91c11031d9c 100644 --- a/hadoop-common-project/hadoop-auth/src/test/java/org/apache/hadoop/security/authentication/server/TestPseudoAuthenticationHandler.java +++ b/hadoop-common-project/hadoop-auth/src/test/java/org/apache/hadoop/security/authentication/server/TestPseudoAuthenticationHandler.java @@ -94,7 +94,7 @@ private void _testUserName(boolean anonymous) throws Exception { HttpServletRequest request = Mockito.mock(HttpServletRequest.class); HttpServletResponse response = Mockito.mock(HttpServletResponse.class); - Mockito.when(request.getParameter(PseudoAuthenticator.USER_NAME)).thenReturn("user"); + Mockito.when(request.getQueryString()).thenReturn(PseudoAuthenticator.USER_NAME + "=" + "user"); AuthenticationToken token = handler.authenticate(request, response); diff --git a/hadoop-common-project/hadoop-common/CHANGES.txt b/hadoop-common-project/hadoop-common/CHANGES.txt index cdf3b7a1e58..cdff25522ec 100644 --- a/hadoop-common-project/hadoop-common/CHANGES.txt +++ b/hadoop-common-project/hadoop-common/CHANGES.txt @@ -580,6 +580,9 @@ Release 2.3.0 - UNRELEASED HADOOP-10090. Jobtracker metrics not updated properly after execution of a mapreduce job. (ivanmi) + HADOOP-10193. hadoop-auth's PseudoAuthenticationHandler can consume getInputStream. + (gchanan via tucu) + Release 2.2.0 - 2013-10-13 INCOMPATIBLE CHANGES From 410f3a9f60026e6552a0711dc6154d2f407b1293 Mon Sep 17 00:00:00 2001 From: Arpit Agarwal Date: Mon, 6 Jan 2014 18:14:05 +0000 Subject: [PATCH 33/42] HDFS-5667. Add test missed in previous checkin git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1555956 13f79535-47bb-0310-9956-ffa450edef68 --- .../server/datanode/TestStorageReport.java | 113 ++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestStorageReport.java diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestStorageReport.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestStorageReport.java new file mode 100644 index 00000000000..b0c89d9397c --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestStorageReport.java @@ -0,0 +1,113 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.datanode; + +import java.io.IOException; + + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hdfs.*; +import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolClientSideTranslatorPB; +import org.apache.hadoop.hdfs.server.namenode.NameNode; +import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration; +import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage; +import org.apache.hadoop.hdfs.server.protocol.StorageReport; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.mockito.ArgumentCaptor; +import org.mockito.Mockito; + +import static org.hamcrest.core.Is.is; +import static org.junit.Assert.assertNotSame; +import static org.junit.Assert.assertThat; +import static org.mockito.Matchers.any; +import static org.mockito.Matchers.anyInt; +import static org.mockito.Matchers.anyLong; + +public class TestStorageReport { + public static final Log LOG = LogFactory.getLog(TestStorageReport.class); + + private static short REPL_FACTOR = 1; + private static final StorageType storageType = StorageType.SSD; // pick non-default. + + private static Configuration conf; + private MiniDFSCluster cluster; + private DistributedFileSystem fs; + static String bpid; + + @Before + public void startUpCluster() throws IOException { + conf = new HdfsConfiguration(); + cluster = new MiniDFSCluster.Builder(conf) + .numDataNodes(REPL_FACTOR) + .storageType(storageType) + .build(); + fs = cluster.getFileSystem(); + bpid = cluster.getNamesystem().getBlockPoolId(); + } + + @After + public void shutDownCluster() throws IOException { + if (cluster != null) { + fs.close(); + cluster.shutdown(); + cluster = null; + } + } + + /** + * Ensure that storage type and storage state are propagated + * in Storage Reports. + */ + @Test + public void testStorageReportHasStorageTypeAndState() throws IOException { + + // Make sure we are not testing with the default type, that would not + // be a very good test. + assertNotSame(storageType, StorageType.DEFAULT); + NameNode nn = cluster.getNameNode(); + DataNode dn = cluster.getDataNodes().get(0); + + // Insert a spy object for the NN RPC. + DatanodeProtocolClientSideTranslatorPB nnSpy = + DataNodeTestUtils.spyOnBposToNN(dn, nn); + + // Trigger a heartbeat so there is an interaction with the spy + // object. + DataNodeTestUtils.triggerHeartbeat(dn); + + // Verify that the callback passed in the expected parameters. 
+ ArgumentCaptor captor = + ArgumentCaptor.forClass(StorageReport[].class); + + Mockito.verify(nnSpy).sendHeartbeat( + any(DatanodeRegistration.class), + captor.capture(), + anyLong(), anyLong(), anyInt(), anyInt(), anyInt()); + + StorageReport[] reports = captor.getValue(); + + for (StorageReport report: reports) { + assertThat(report.getStorage().getStorageType(), is(storageType)); + assertThat(report.getStorage().getState(), is(DatanodeStorage.State.NORMAL)); + } + } +} From 76238b9722539b5fd4773129ecc31b11bd8255ef Mon Sep 17 00:00:00 2001 From: Alejandro Abdelnur Date: Mon, 6 Jan 2014 18:35:26 +0000 Subject: [PATCH 34/42] MAPREDUCE-3310. Custom grouping comparator cannot be set for Combiners (tucu) git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1555968 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-mapreduce-project/CHANGES.txt | 2 + .../org/apache/hadoop/mapred/JobConf.java | 52 ++++- .../java/org/apache/hadoop/mapred/Task.java | 5 +- .../java/org/apache/hadoop/mapreduce/Job.java | 18 ++ .../apache/hadoop/mapreduce/JobContext.java | 21 +- .../apache/hadoop/mapreduce/MRJobConfig.java | 2 + .../lib/chain/ChainMapContextImpl.java | 5 + .../lib/chain/ChainReduceContextImpl.java | 5 + .../mapreduce/lib/map/WrappedMapper.java | 5 + .../mapreduce/lib/reduce/WrappedReducer.java | 5 + .../hadoop/mapreduce/task/JobContextImpl.java | 11 + .../task/reduce/MergeManagerImpl.java | 2 +- .../mapred/TestOldCombinerGrouping.java | 191 ++++++++++++++++++ .../mapreduce/TestNewCombinerGrouping.java | 178 ++++++++++++++++ 14 files changed, 493 insertions(+), 9 deletions(-) create mode 100644 hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/TestOldCombinerGrouping.java create mode 100644 hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/TestNewCombinerGrouping.java diff --git a/hadoop-mapreduce-project/CHANGES.txt b/hadoop-mapreduce-project/CHANGES.txt index 3fb2dd6156c..668b4a893fc 100644 --- a/hadoop-mapreduce-project/CHANGES.txt +++ b/hadoop-mapreduce-project/CHANGES.txt @@ -196,6 +196,8 @@ Release 2.4.0 - UNRELEASED MAPREDUCE-5550. Task Status message (reporter.setStatus) not shown in UI with Hadoop 2.0 (Gera Shegalov via Sandy Ryza) + MAPREDUCE-3310. Custom grouping comparator cannot be set for Combiners (tucu) + OPTIMIZATIONS MAPREDUCE-5484. YarnChild unnecessarily loads job conf twice (Sandy Ryza) diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/JobConf.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/JobConf.java index 5bae686ab20..53159fbe592 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/JobConf.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/JobConf.java @@ -949,6 +949,23 @@ public String getKeyFieldPartitionerOption() { return get(KeyFieldBasedPartitioner.PARTITIONER_OPTIONS); } + /** + * Get the user defined {@link WritableComparable} comparator for + * grouping keys of inputs to the combiner. + * + * @return comparator set by the user for grouping values. + * @see #setCombinerKeyGroupingComparator(Class) for details. 
+ */ + public RawComparator getCombinerKeyGroupingComparator() { + Class theClass = getClass( + JobContext.COMBINER_GROUP_COMPARATOR_CLASS, null, RawComparator.class); + if (theClass == null) { + return getOutputKeyComparator(); + } + + return ReflectionUtils.newInstance(theClass, this); + } + /** * Get the user defined {@link WritableComparable} comparator for * grouping keys of inputs to the reduce. @@ -966,6 +983,37 @@ public RawComparator getOutputValueGroupingComparator() { return ReflectionUtils.newInstance(theClass, this); } + /** + * Set the user defined {@link RawComparator} comparator for + * grouping keys in the input to the combiner. + *
This comparator should be provided if the equivalence rules for keys + * for sorting the intermediates are different from those for grouping keys + * before each call to + * {@link Reducer#reduce(Object, java.util.Iterator, OutputCollector, Reporter)}.
+ *
+ * For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed + * in a single call to the reduce function if K1 and K2 compare as equal.
+ *
+ * Since {@link #setOutputKeyComparatorClass(Class)} can be used to control + * how keys are sorted, this can be used in conjunction to simulate + * secondary sort on values.
+ *
+ * Note: This is not a guarantee of the combiner sort being + * stable in any sense. (In any case, with the order of available + * map-outputs to the combiner being non-deterministic, it wouldn't make + * that much sense.)
+ * + * @param theClass the comparator class to be used for grouping keys for the + * combiner. It should implement RawComparator. + * @see #setOutputKeyComparatorClass(Class) + */ + public void setCombinerKeyGroupingComparator( + Class theClass) { + setClass(JobContext.COMBINER_GROUP_COMPARATOR_CLASS, + theClass, RawComparator.class); + } + /** * Set the user defined {@link RawComparator} comparator for * grouping keys in the input to the reduce. @@ -989,7 +1037,9 @@ public RawComparator getOutputValueGroupingComparator() { * * @param theClass the comparator class to be used for grouping keys. * It should implement RawComparator. - * @see #setOutputKeyComparatorClass(Class) + * @see #setOutputKeyComparatorClass(Class) + * @see {@link #setCombinerKeyGroupingComparator(Class)} for setting a + * comparator for the combiner. */ public void setOutputValueGroupingComparator( Class theClass) { diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/Task.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/Task.java index 660ffc65ad3..72cd41c9ea6 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/Task.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/Task.java @@ -1575,7 +1575,8 @@ protected OldCombinerRunner(Class> cls, combinerClass = cls; keyClass = (Class) job.getMapOutputKeyClass(); valueClass = (Class) job.getMapOutputValueClass(); - comparator = (RawComparator) job.getOutputKeyComparator(); + comparator = (RawComparator) + job.getCombinerKeyGroupingComparator(); } @SuppressWarnings("unchecked") @@ -1624,7 +1625,7 @@ protected static class NewCombinerRunner extends CombinerRunner { this.taskId = taskId; keyClass = (Class) context.getMapOutputKeyClass(); valueClass = (Class) context.getMapOutputValueClass(); - comparator = (RawComparator) context.getSortComparator(); + comparator = (RawComparator) context.getCombinerKeyGroupingComparator(); this.committer = committer; } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/Job.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/Job.java index 78c6b4b1a9c..4bb97e84d6c 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/Job.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/Job.java @@ -948,11 +948,27 @@ public void setOutputValueClass(Class theClass conf.setOutputValueClass(theClass); } + /** + * Define the comparator that controls which keys are grouped together + * for a single call to combiner, + * {@link Reducer#reduce(Object, Iterable, + * org.apache.hadoop.mapreduce.Reducer.Context)} + * + * @param cls the raw comparator to use + * @throws IllegalStateException if the job is submitted + */ + public void setCombinerKeyGroupingComparatorClass( + Class cls) throws IllegalStateException { + ensureState(JobState.DEFINE); + conf.setCombinerKeyGroupingComparator(cls); + } + /** * Define the comparator that controls how the keys are sorted before they * are passed to the {@link Reducer}. 
* @param cls the raw comparator * @throws IllegalStateException if the job is submitted + * @see {@link #setCombinerKeyGroupingComparatorClass(Class)} */ public void setSortComparatorClass(Class cls ) throws IllegalStateException { @@ -967,6 +983,8 @@ public void setSortComparatorClass(Class cls * org.apache.hadoop.mapreduce.Reducer.Context)} * @param cls the raw comparator to use * @throws IllegalStateException if the job is submitted + * @see {@link #setCombinerKeyGroupingComparatorClass(Class)} for setting a + * comparator for the combiner. */ public void setGroupingComparatorClass(Class cls ) throws IllegalStateException { diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/JobContext.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/JobContext.java index 4842e20b9c4..fa73a5f066f 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/JobContext.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/JobContext.java @@ -167,13 +167,24 @@ public Class> getPartitionerClass() */ public String getJar(); - /** - * Get the user defined {@link RawComparator} comparator for - * grouping keys of inputs to the reduce. - * + /** + * Get the user defined {@link RawComparator} comparator for + * grouping keys of inputs to the combiner. + * * @return comparator set by the user for grouping values. - * @see Job#setGroupingComparatorClass(Class) for details. + * @see Job#setCombinerKeyGroupingComparatorClass(Class) for details. */ + public RawComparator getCombinerKeyGroupingComparator(); + + /** + * Get the user defined {@link RawComparator} comparator for + * grouping keys of inputs to the reduce. + * + * @return comparator set by the user for grouping values. + * @see Job#setGroupingComparatorClass(Class) for details. + * @see {@link #getCombinerKeyGroupingComparator()} for setting a + * comparator for the combiner. 
+ */ public RawComparator getGroupingComparator(); /** diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/MRJobConfig.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/MRJobConfig.java index e696b865533..1be7ba3e3b9 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/MRJobConfig.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/MRJobConfig.java @@ -93,6 +93,8 @@ public interface MRJobConfig { public static final String KEY_COMPARATOR = "mapreduce.job.output.key.comparator.class"; + public static final String COMBINER_GROUP_COMPARATOR_CLASS = "mapreduce.job.combiner.group.comparator.class"; + public static final String GROUP_COMPARATOR_CLASS = "mapreduce.job.output.group.comparator.class"; public static final String WORKING_DIR = "mapreduce.job.working.dir"; diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/chain/ChainMapContextImpl.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/chain/ChainMapContextImpl.java index 598bb936060..ea2c77ace9b 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/chain/ChainMapContextImpl.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/chain/ChainMapContextImpl.java @@ -166,6 +166,11 @@ public String[] getFileTimestamps() { return base.getFileTimestamps(); } + @Override + public RawComparator getCombinerKeyGroupingComparator() { + return base.getCombinerKeyGroupingComparator(); + } + @Override public RawComparator getGroupingComparator() { return base.getGroupingComparator(); diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/chain/ChainReduceContextImpl.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/chain/ChainReduceContextImpl.java index 8d6648468e8..5e9a1add874 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/chain/ChainReduceContextImpl.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/chain/ChainReduceContextImpl.java @@ -159,6 +159,11 @@ public String[] getFileTimestamps() { return base.getFileTimestamps(); } + @Override + public RawComparator getCombinerKeyGroupingComparator() { + return base.getCombinerKeyGroupingComparator(); + } + @Override public RawComparator getGroupingComparator() { return base.getGroupingComparator(); diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/map/WrappedMapper.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/map/WrappedMapper.java index 95c4b90c0f9..8865a36c31d 100644 --- 
a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/map/WrappedMapper.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/map/WrappedMapper.java @@ -168,6 +168,11 @@ public String[] getFileTimestamps() { return mapContext.getFileTimestamps(); } + @Override + public RawComparator getCombinerKeyGroupingComparator() { + return mapContext.getCombinerKeyGroupingComparator(); + } + @Override public RawComparator getGroupingComparator() { return mapContext.getGroupingComparator(); diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/reduce/WrappedReducer.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/reduce/WrappedReducer.java index 39178642f24..185c135c2e1 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/reduce/WrappedReducer.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/reduce/WrappedReducer.java @@ -161,6 +161,11 @@ public String[] getFileTimestamps() { return reduceContext.getFileTimestamps(); } + @Override + public RawComparator getCombinerKeyGroupingComparator() { + return reduceContext.getCombinerKeyGroupingComparator(); + } + @Override public RawComparator getGroupingComparator() { return reduceContext.getGroupingComparator(); diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/task/JobContextImpl.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/task/JobContextImpl.java index b4c6dca5545..247c2f2029b 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/task/JobContextImpl.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/task/JobContextImpl.java @@ -252,6 +252,17 @@ public String getJar() { return conf.getJar(); } + /** + * Get the user defined {@link RawComparator} comparator for + * grouping keys of inputs to the combiner. + * + * @return comparator set by the user for grouping values. + * @see Job#setCombinerKeyGroupingComparatorClass(Class) for details. + */ + public RawComparator getCombinerKeyGroupingComparator() { + return conf.getCombinerKeyGroupingComparator(); + } + /** * Get the user defined {@link RawComparator} comparator for * grouping keys of inputs to the reduce. 
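For readers of the new API, a minimal driver sketch (not part of the patch) of how a job can opt into combiner-side grouping; the class name CombinerGroupingDriver and the Text/LongWritable output types are illustrative, chosen to mirror the tests added further down:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class CombinerGroupingDriver {
  /** Wires a job so the combiner groups keys the same way the reduce side does. */
  public static Job createJob(Configuration conf,
      Class<? extends Mapper> mapperClass,
      Class<? extends Reducer> reducerClass,
      Class<? extends RawComparator> groupComparatorClass) throws Exception {
    Job job = Job.getInstance(conf, "combiner-grouping-example");
    job.setMapperClass(mapperClass);
    job.setReducerClass(reducerClass);
    job.setCombinerClass(reducerClass);           // combiner reuses the reducer logic
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    // Added by this patch: group keys for the combiner without changing the sort
    // order, mirroring what setGroupingComparatorClass provides on the reduce side.
    job.setCombinerKeyGroupingComparatorClass(groupComparatorClass);
    job.setGroupingComparatorClass(groupComparatorClass);
    return job;
  }
}

TestOldCombinerGrouping and TestNewCombinerGrouping below exercise the same wiring through the old and new APIs respectively.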
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/task/reduce/MergeManagerImpl.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/task/reduce/MergeManagerImpl.java index ca3bed93998..a821e4d1b8a 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/task/reduce/MergeManagerImpl.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/task/reduce/MergeManagerImpl.java @@ -582,7 +582,7 @@ private void combineAndSpill( Class keyClass = (Class) job.getMapOutputKeyClass(); Class valClass = (Class) job.getMapOutputValueClass(); RawComparator comparator = - (RawComparator)job.getOutputKeyComparator(); + (RawComparator)job.getCombinerKeyGroupingComparator(); try { CombineValuesIterator values = new CombineValuesIterator( kvIter, comparator, keyClass, valClass, job, Reporter.NULL, diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/TestOldCombinerGrouping.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/TestOldCombinerGrouping.java new file mode 100644 index 00000000000..96919bef68f --- /dev/null +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/TestOldCombinerGrouping.java @@ -0,0 +1,191 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.mapred; + +import junit.framework.Assert; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.RawComparator; +import org.apache.hadoop.io.Text; +import org.junit.Test; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; +import java.util.UUID; + +public class TestOldCombinerGrouping { + private static String TEST_ROOT_DIR = + new File("build", UUID.randomUUID().toString()).getAbsolutePath(); + + public static class Map implements + Mapper { + @Override + public void map(LongWritable key, Text value, + OutputCollector output, Reporter reporter) + throws IOException { + String v = value.toString(); + String k = v.substring(0, v.indexOf(",")); + v = v.substring(v.indexOf(",") + 1); + output.collect(new Text(k), new LongWritable(Long.parseLong(v))); + } + + @Override + public void close() throws IOException { + } + + @Override + public void configure(JobConf job) { + } + } + + public static class Reduce implements + Reducer { + + @Override + public void reduce(Text key, Iterator values, + OutputCollector output, Reporter reporter) + throws IOException { + LongWritable maxValue = null; + while (values.hasNext()) { + LongWritable value = values.next(); + if (maxValue == null) { + maxValue = value; + } else if (value.compareTo(maxValue) > 0) { + maxValue = value; + } + } + output.collect(key, maxValue); + } + + @Override + public void close() throws IOException { + } + + @Override + public void configure(JobConf job) { + } + } + + public static class Combiner extends Reduce { + } + + public static class GroupComparator implements RawComparator { + @Override + public int compare(byte[] bytes, int i, int i2, byte[] bytes2, int i3, + int i4) { + byte[] b1 = new byte[i2]; + System.arraycopy(bytes, i, b1, 0, i2); + + byte[] b2 = new byte[i4]; + System.arraycopy(bytes2, i3, b2, 0, i4); + + return compare(new Text(new String(b1)), new Text(new String(b2))); + } + + @Override + public int compare(Text o1, Text o2) { + String s1 = o1.toString(); + String s2 = o2.toString(); + s1 = s1.substring(0, s1.indexOf("|")); + s2 = s2.substring(0, s2.indexOf("|")); + return s1.compareTo(s2); + } + + } + + @Test + public void testCombiner() throws Exception { + if (!new File(TEST_ROOT_DIR).mkdirs()) { + throw new RuntimeException("Could not create test dir: " + TEST_ROOT_DIR); + } + File in = new File(TEST_ROOT_DIR, "input"); + if (!in.mkdirs()) { + throw new RuntimeException("Could not create test dir: " + in); + } + File out = new File(TEST_ROOT_DIR, "output"); + PrintWriter pw = new PrintWriter(new FileWriter(new File(in, "data.txt"))); + pw.println("A|a,1"); + pw.println("A|b,2"); + pw.println("B|a,3"); + pw.println("B|b,4"); + pw.println("B|c,5"); + pw.close(); + JobConf job = new JobConf(); + job.set("mapreduce.framework.name", "local"); + TextInputFormat.setInputPaths(job, new Path(in.getPath())); + TextOutputFormat.setOutputPath(job, new Path(out.getPath())); + job.setMapperClass(Map.class); + job.setReducerClass(Reduce.class); + job.setInputFormat(TextInputFormat.class); + job.setMapOutputKeyClass(Text.class); + job.setMapOutputValueClass(LongWritable.class); + job.setOutputFormat(TextOutputFormat.class); + job.setOutputValueGroupingComparator(GroupComparator.class); + + job.setCombinerClass(Combiner.class); + 
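// Use the same grouping rules for the combiner as for the reduce side (added by this patch), + // so keys that differ only after the '|' separator are combined together. +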
job.setCombinerKeyGroupingComparator(GroupComparator.class); + job.setInt("min.num.spills.for.combine", 0); + + JobClient client = new JobClient(job); + RunningJob runningJob = client.submitJob(job); + runningJob.waitForCompletion(); + if (runningJob.isSuccessful()) { + Counters counters = runningJob.getCounters(); + + long combinerInputRecords = counters.getGroup( + "org.apache.hadoop.mapreduce.TaskCounter"). + getCounter("COMBINE_INPUT_RECORDS"); + long combinerOutputRecords = counters.getGroup( + "org.apache.hadoop.mapreduce.TaskCounter"). + getCounter("COMBINE_OUTPUT_RECORDS"); + Assert.assertTrue(combinerInputRecords > 0); + Assert.assertTrue(combinerInputRecords > combinerOutputRecords); + + BufferedReader br = new BufferedReader(new FileReader( + new File(out, "part-00000"))); + Set output = new HashSet(); + String line = br.readLine(); + Assert.assertNotNull(line); + output.add(line.substring(0, 1) + line.substring(4, 5)); + line = br.readLine(); + Assert.assertNotNull(line); + output.add(line.substring(0, 1) + line.substring(4, 5)); + line = br.readLine(); + Assert.assertNull(line); + br.close(); + + Set expected = new HashSet(); + expected.add("A2"); + expected.add("B5"); + + Assert.assertEquals(expected, output); + + } else { + Assert.fail("Job failed"); + } + } + +} diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/TestNewCombinerGrouping.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/TestNewCombinerGrouping.java new file mode 100644 index 00000000000..c4b734bdc5b --- /dev/null +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/TestNewCombinerGrouping.java @@ -0,0 +1,178 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.mapreduce; + +import junit.framework.Assert; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.RawComparator; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; +import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; +import org.junit.Test; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.HashSet; +import java.util.Set; +import java.util.UUID; + +public class TestNewCombinerGrouping { + private static String TEST_ROOT_DIR = + new File("build", UUID.randomUUID().toString()).getAbsolutePath(); + + public static class Map extends + Mapper { + + @Override + protected void map(LongWritable key, Text value, + Context context) + throws IOException, InterruptedException { + String v = value.toString(); + String k = v.substring(0, v.indexOf(",")); + v = v.substring(v.indexOf(",") + 1); + context.write(new Text(k), new LongWritable(Long.parseLong(v))); + } + } + + public static class Reduce extends + Reducer { + + @Override + protected void reduce(Text key, Iterable values, + Context context) + throws IOException, InterruptedException { + LongWritable maxValue = null; + for (LongWritable value : values) { + if (maxValue == null) { + maxValue = value; + } else if (value.compareTo(maxValue) > 0) { + maxValue = value; + } + } + context.write(key, maxValue); + } + } + + public static class Combiner extends Reduce { + } + + public static class GroupComparator implements RawComparator { + @Override + public int compare(byte[] bytes, int i, int i2, byte[] bytes2, int i3, + int i4) { + byte[] b1 = new byte[i2]; + System.arraycopy(bytes, i, b1, 0, i2); + + byte[] b2 = new byte[i4]; + System.arraycopy(bytes2, i3, b2, 0, i4); + + return compare(new Text(new String(b1)), new Text(new String(b2))); + } + + @Override + public int compare(Text o1, Text o2) { + String s1 = o1.toString(); + String s2 = o2.toString(); + s1 = s1.substring(0, s1.indexOf("|")); + s2 = s2.substring(0, s2.indexOf("|")); + return s1.compareTo(s2); + } + + } + + @Test + public void testCombiner() throws Exception { + if (!new File(TEST_ROOT_DIR).mkdirs()) { + throw new RuntimeException("Could not create test dir: " + TEST_ROOT_DIR); + } + File in = new File(TEST_ROOT_DIR, "input"); + if (!in.mkdirs()) { + throw new RuntimeException("Could not create test dir: " + in); + } + File out = new File(TEST_ROOT_DIR, "output"); + PrintWriter pw = new PrintWriter(new FileWriter(new File(in, "data.txt"))); + pw.println("A|a,1"); + pw.println("A|b,2"); + pw.println("B|a,3"); + pw.println("B|b,4"); + pw.println("B|c,5"); + pw.close(); + JobConf conf = new JobConf(); + conf.set("mapreduce.framework.name", "local"); + Job job = new Job(conf); + TextInputFormat.setInputPaths(job, new Path(in.getPath())); + TextOutputFormat.setOutputPath(job, new Path(out.getPath())); + + job.setMapperClass(Map.class); + job.setReducerClass(Reduce.class); + job.setInputFormatClass(TextInputFormat.class); + job.setMapOutputKeyClass(Text.class); + job.setMapOutputValueClass(LongWritable.class); + job.setOutputFormatClass(TextOutputFormat.class); + job.setGroupingComparatorClass(GroupComparator.class); + + job.setCombinerKeyGroupingComparatorClass(GroupComparator.class); + job.setCombinerClass(Combiner.class); + 
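// min.num.spills.for.combine is lowered so the combiner runs even with very few spill files. +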
job.getConfiguration().setInt("min.num.spills.for.combine", 0); + + job.submit(); + job.waitForCompletion(false); + if (job.isSuccessful()) { + Counters counters = job.getCounters(); + + long combinerInputRecords = counters.findCounter( + "org.apache.hadoop.mapreduce.TaskCounter", + "COMBINE_INPUT_RECORDS").getValue(); + long combinerOutputRecords = counters.findCounter( + "org.apache.hadoop.mapreduce.TaskCounter", + "COMBINE_OUTPUT_RECORDS").getValue(); + Assert.assertTrue(combinerInputRecords > 0); + Assert.assertTrue(combinerInputRecords > combinerOutputRecords); + + BufferedReader br = new BufferedReader(new FileReader( + new File(out, "part-r-00000"))); + Set output = new HashSet(); + String line = br.readLine(); + Assert.assertNotNull(line); + output.add(line.substring(0, 1) + line.substring(4, 5)); + line = br.readLine(); + Assert.assertNotNull(line); + output.add(line.substring(0, 1) + line.substring(4, 5)); + line = br.readLine(); + Assert.assertNull(line); + br.close(); + + Set expected = new HashSet(); + expected.add("A2"); + expected.add("B5"); + + Assert.assertEquals(expected, output); + + } else { + Assert.fail("Job failed"); + } + } + +} From 2cddd21db9ce2e896e11225af3ae50d547884ca4 Mon Sep 17 00:00:00 2001 From: Karthik Kambatla Date: Mon, 6 Jan 2014 18:40:15 +0000 Subject: [PATCH 35/42] YARN-1559. Race between ServerRMProxy and ClientRMProxy setting RMProxy#INSTANCE. (kasha and vinodkv via kasha) git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1555970 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-yarn-project/CHANGES.txt | 3 +++ .../dev-support/findbugs-exclude.xml | 9 ------- .../hadoop/yarn/client/ClientRMProxy.java | 9 ++----- .../apache/hadoop/yarn/client/RMProxy.java | 26 +++---------------- .../hadoop/yarn/server/api/ServerRMProxy.java | 11 +++----- 5 files changed, 11 insertions(+), 47 deletions(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 33225f06dbb..ef98e212f2b 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -286,6 +286,9 @@ Release 2.4.0 - UNRELEASED YARN-1549. Fixed a bug in ResourceManager's ApplicationMasterService that was causing unamanged AMs to not finish correctly. (haosdent via vinodkv) + YARN-1559. Race between ServerRMProxy and ClientRMProxy setting + RMProxy#INSTANCE. 
(kasha and vinodkv via kasha) + Release 2.3.0 - UNRELEASED INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml b/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml index 486bebfec50..74ca61b8578 100644 --- a/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml +++ b/hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml @@ -309,13 +309,4 @@ - - - - - - - - - diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/ClientRMProxy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/ClientRMProxy.java index 06bbc3555c4..91d0bf7fc92 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/ClientRMProxy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/ClientRMProxy.java @@ -39,16 +39,13 @@ public class ClientRMProxy extends RMProxy { private static final Log LOG = LogFactory.getLog(ClientRMProxy.class); + private static final ClientRMProxy INSTANCE = new ClientRMProxy(); private interface ClientRMProtocols extends ApplicationClientProtocol, ApplicationMasterProtocol, ResourceManagerAdministrationProtocol { // Add nothing } - static { - INSTANCE = new ClientRMProxy(); - } - private ClientRMProxy(){ super(); } @@ -63,9 +60,7 @@ private ClientRMProxy(){ */ public static T createRMProxy(final Configuration configuration, final Class protocol) throws IOException { - // This method exists only to initiate this class' static INSTANCE. TODO: - // FIX if possible - return RMProxy.createRMProxy(configuration, protocol); + return createRMProxy(configuration, protocol, INSTANCE); } private static void setupTokens(InetSocketAddress resourceManagerAddress) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/RMProxy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/RMProxy.java index 913eb04613c..c15018bde8a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/RMProxy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/RMProxy.java @@ -50,7 +50,6 @@ public class RMProxy { private static final Log LOG = LogFactory.getLog(RMProxy.class); - protected static RMProxy INSTANCE; protected RMProxy() {} @@ -79,17 +78,17 @@ protected InetSocketAddress getRMAddress( */ @Private protected static T createRMProxy(final Configuration configuration, - final Class protocol) throws IOException { + final Class protocol, RMProxy instance) throws IOException { YarnConfiguration conf = (configuration instanceof YarnConfiguration) ? 
(YarnConfiguration) configuration : new YarnConfiguration(configuration); RetryPolicy retryPolicy = createRetryPolicy(conf); if (HAUtil.isHAEnabled(conf)) { RMFailoverProxyProvider provider = - INSTANCE.createRMFailoverProxyProvider(conf, protocol); + instance.createRMFailoverProxyProvider(conf, protocol); return (T) RetryProxy.create(protocol, provider, retryPolicy); } else { - InetSocketAddress rmAddress = INSTANCE.getRMAddress(conf, protocol); + InetSocketAddress rmAddress = instance.getRMAddress(conf, protocol); LOG.info("Connecting to ResourceManager at " + rmAddress); T proxy = RMProxy.getProxy(conf, protocol, rmAddress); return (T) RetryProxy.create(protocol, proxy, retryPolicy); @@ -159,25 +158,6 @@ private RMFailoverProxyProvider createRMFailoverProxyProvider( return provider; } - /** - * A RetryPolicy to allow failing over upto the specified maximum time. - */ - private static class FailoverUptoMaximumTimePolicy implements RetryPolicy { - private long maxTime; - - FailoverUptoMaximumTimePolicy(long maxTime) { - this.maxTime = maxTime; - } - - @Override - public RetryAction shouldRetry(Exception e, int retries, int failovers, - boolean isIdempotentOrAtMostOnce) throws Exception { - return System.currentTimeMillis() < maxTime - ? RetryAction.FAILOVER_AND_RETRY - : RetryAction.FAIL; - } - } - /** * Fetch retry policy from Configuration */ diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/ServerRMProxy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/ServerRMProxy.java index 15a26e51260..5d4fc462c12 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/ServerRMProxy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/ServerRMProxy.java @@ -32,10 +32,7 @@ public class ServerRMProxy extends RMProxy { private static final Log LOG = LogFactory.getLog(ServerRMProxy.class); - - static { - INSTANCE = new ServerRMProxy(); - } + private static final ServerRMProxy INSTANCE = new ServerRMProxy(); private ServerRMProxy() { super(); @@ -51,10 +48,8 @@ private ServerRMProxy() { */ public static T createRMProxy(final Configuration configuration, final Class protocol) throws IOException { - // This method exists only to initiate this class' static INSTANCE. TODO: - // FIX if possible - return RMProxy.createRMProxy(configuration, protocol); - } + return createRMProxy(configuration, protocol, INSTANCE); +} @InterfaceAudience.Private @Override From f342dbcfc750d10028fdea0579a4822d5cf1e545 Mon Sep 17 00:00:00 2001 From: Jian He Date: Mon, 6 Jan 2014 18:57:23 +0000 Subject: [PATCH 36/42] YARN-1560. Fixed TestYarnClient#testAMMRTokens failure with null AMRM token. (Contributed by Ted Yu) git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1555975 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-yarn-project/CHANGES.txt | 3 +++ .../yarn/client/api/impl/TestYarnClient.java | 14 ++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index ef98e212f2b..69776c33486 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -289,6 +289,9 @@ Release 2.4.0 - UNRELEASED YARN-1559. Race between ServerRMProxy and ClientRMProxy setting RMProxy#INSTANCE. 
(kasha and vinodkv via kasha) + YARN-1560. Fixed TestYarnClient#testAMMRTokens failure with null AMRM token. + (Ted Yu via jianhe) + Release 2.3.0 - UNRELEASED INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestYarnClient.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestYarnClient.java index 966995c99ce..00ab7895d67 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestYarnClient.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestYarnClient.java @@ -378,6 +378,13 @@ public void testAMMRTokens() throws Exception { appId = createApp(rmClient, true); waitTillAccepted(rmClient, appId); + long start = System.currentTimeMillis(); + while (rmClient.getAMRMToken(appId) == null) { + if (System.currentTimeMillis() - start > 20 * 1000) { + Assert.fail("AMRM token is null"); + } + Thread.sleep(100); + } //unmanaged AMs do return AMRM token Assert.assertNotNull(rmClient.getAMRMToken(appId)); @@ -392,6 +399,13 @@ public ApplicationId run() throws Exception { rmClient.start(); ApplicationId appId = createApp(rmClient, true); waitTillAccepted(rmClient, appId); + long start = System.currentTimeMillis(); + while (rmClient.getAMRMToken(appId) == null) { + if (System.currentTimeMillis() - start > 20 * 1000) { + Assert.fail("AMRM token is null"); + } + Thread.sleep(100); + } //unmanaged AMs do return AMRM token Assert.assertNotNull(rmClient.getAMRMToken(appId)); return appId; From 2a1ecd00dadb1577da9e02822469e8194f1d3cee Mon Sep 17 00:00:00 2001 From: Colin McCabe Date: Mon, 6 Jan 2014 18:59:10 +0000 Subject: [PATCH 37/42] HDFS-5220. 
Expose group resolution time as metric (jxiang via cmccabe) git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1555976 13f79535-47bb-0310-9956-ffa450edef68 --- .../hadoop/fs/CommonConfigurationKeys.java | 4 ++- .../org/apache/hadoop/security/Groups.java | 1 + .../hadoop/security/UserGroupInformation.java | 30 +++++++++++++++++++ .../security/TestUserGroupInformation.java | 27 +++++++++++++++-- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 2 ++ .../hadoop/hdfs/server/namenode/NameNode.java | 8 +++++ .../namenode/metrics/TestNameNodeMetrics.java | 8 +++++ 7 files changed, 76 insertions(+), 4 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeys.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeys.java index 5e7fe93a7c7..3c3da625739 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeys.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeys.java @@ -21,7 +21,6 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.http.lib.StaticUserWebFilter; -import org.apache.hadoop.security.authorize.Service; /** * This class contains constants for configuration keys used @@ -240,4 +239,7 @@ public class CommonConfigurationKeys extends CommonConfigurationKeysPublic { /** Default value for IPC_SERVER_CONNECTION_IDLE_SCAN_INTERVAL_KEY */ public static final int IPC_CLIENT_CONNECTION_IDLESCANINTERVAL_DEFAULT = 10000; + + public static final String HADOOP_USER_GROUP_METRICS_PERCENTILES_INTERVALS = + "hadoop.user.group.metrics.percentiles.intervals"; } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/Groups.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/Groups.java index 33659c6fada..097bc30dfe3 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/Groups.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/Groups.java @@ -138,6 +138,7 @@ public List getGroups(String user) throws IOException { List groupList = impl.getGroups(user); long endMs = Time.monotonicNow(); long deltaMs = endMs - startMs ; + UserGroupInformation.metrics.addGetGroups(deltaMs); if (deltaMs > warningDeltaMs) { LOG.warn("Potential performance problem: getGroups(user=" + user +") " + "took " + deltaMs + " milliseconds."); diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java index 972fc780fcf..dde5dcd3b07 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java @@ -19,6 +19,7 @@ import static org.apache.hadoop.fs.CommonConfigurationKeys.HADOOP_KERBEROS_MIN_SECONDS_BEFORE_RELOGIN; import static org.apache.hadoop.fs.CommonConfigurationKeys.HADOOP_KERBEROS_MIN_SECONDS_BEFORE_RELOGIN_DEFAULT; +import static org.apache.hadoop.fs.CommonConfigurationKeys.HADOOP_USER_GROUP_METRICS_PERCENTILES_INTERVALS; import java.io.File; import java.io.IOException; @@ -58,6 +59,8 @@ import org.apache.hadoop.metrics2.annotation.Metric; import 
org.apache.hadoop.metrics2.annotation.Metrics; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; +import org.apache.hadoop.metrics2.lib.MetricsRegistry; +import org.apache.hadoop.metrics2.lib.MutableQuantiles; import org.apache.hadoop.metrics2.lib.MutableRate; import org.apache.hadoop.security.SaslRpcServer.AuthMethod; import org.apache.hadoop.security.authentication.util.KerberosUtil; @@ -92,14 +95,27 @@ public class UserGroupInformation { */ @Metrics(about="User and group related metrics", context="ugi") static class UgiMetrics { + final MetricsRegistry registry = new MetricsRegistry("UgiMetrics"); + @Metric("Rate of successful kerberos logins and latency (milliseconds)") MutableRate loginSuccess; @Metric("Rate of failed kerberos logins and latency (milliseconds)") MutableRate loginFailure; + @Metric("GetGroups") MutableRate getGroups; + MutableQuantiles[] getGroupsQuantiles; static UgiMetrics create() { return DefaultMetricsSystem.instance().register(new UgiMetrics()); } + + void addGetGroups(long latency) { + getGroups.add(latency); + if (getGroupsQuantiles != null) { + for (MutableQuantiles q : getGroupsQuantiles) { + q.add(latency); + } + } + } } /** @@ -250,6 +266,20 @@ private static synchronized void initialize(Configuration conf, groups = Groups.getUserToGroupsMappingService(conf); } UserGroupInformation.conf = conf; + + if (metrics.getGroupsQuantiles == null) { + int[] intervals = conf.getInts(HADOOP_USER_GROUP_METRICS_PERCENTILES_INTERVALS); + if (intervals != null && intervals.length > 0) { + final int length = intervals.length; + MutableQuantiles[] getGroupsQuantiles = new MutableQuantiles[length]; + for (int i = 0; i < length; i++) { + getGroupsQuantiles[i] = metrics.registry.newQuantiles( + "getGroups" + intervals[i] + "s", + "Get groups", "ops", "latency", intervals[i]); + } + metrics.getGroupsQuantiles = getGroupsQuantiles; + } + } } /** diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java index baa95b14fc1..0a303d0e885 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java @@ -19,7 +19,6 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.CommonConfigurationKeysPublic; import org.apache.hadoop.io.Text; -import org.apache.hadoop.ipc.TestSaslRPC; import org.apache.hadoop.metrics2.MetricsRecordBuilder; import org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod; import org.apache.hadoop.security.authentication.util.KerberosName; @@ -40,9 +39,9 @@ import java.util.LinkedHashSet; import java.util.Set; +import static org.apache.hadoop.fs.CommonConfigurationKeys.HADOOP_USER_GROUP_METRICS_PERCENTILES_INTERVALS; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTH_TO_LOCAL; import static org.apache.hadoop.ipc.TestSaslRPC.*; -import static org.apache.hadoop.security.token.delegation.TestDelegationToken.TestDelegationTokenIdentifier; import static org.apache.hadoop.test.MetricsAsserts.*; import static org.junit.Assert.*; import static org.mockito.Mockito.mock; @@ -55,6 +54,8 @@ public class TestUserGroupInformation { final private static String GROUP3_NAME = "group3"; final private static String[] GROUP_NAMES = new String[]{GROUP1_NAME, GROUP2_NAME, 
GROUP3_NAME}; + // Rollover interval of percentile metrics (in seconds) + private static final int PERCENTILES_INTERVAL = 1; private static Configuration conf; /** @@ -80,7 +81,8 @@ public static void setup() { // doesn't matter what it is, but getGroups needs it set... // use HADOOP_HOME environment variable to prevent interfering with logic // that finds winutils.exe - System.setProperty("hadoop.home.dir", System.getenv("HADOOP_HOME")); + String home = System.getenv("HADOOP_HOME"); + System.setProperty("hadoop.home.dir", (home != null ? home : ".")); // fake the realm is kerberos is enabled System.setProperty("java.security.krb5.kdc", ""); System.setProperty("java.security.krb5.realm", "DEFAULT.REALM"); @@ -150,11 +152,15 @@ public void testGetRealAuthenticationMethod() { /** Test login method */ @Test (timeout = 30000) public void testLogin() throws Exception { + conf.set(HADOOP_USER_GROUP_METRICS_PERCENTILES_INTERVALS, + String.valueOf(PERCENTILES_INTERVAL)); + UserGroupInformation.setConfiguration(conf); // login from unix UserGroupInformation ugi = UserGroupInformation.getCurrentUser(); assertEquals(UserGroupInformation.getCurrentUser(), UserGroupInformation.getLoginUser()); assertTrue(ugi.getGroupNames().length >= 1); + verifyGroupMetrics(1); // ensure that doAs works correctly UserGroupInformation userGroupInfo = @@ -728,6 +734,21 @@ public static void verifyLoginMetrics(long success, int failure) } } + private static void verifyGroupMetrics( + long groups) throws InterruptedException { + MetricsRecordBuilder rb = getMetrics("UgiMetrics"); + if (groups > 0) { + assertCounter("GetGroupsNumOps", groups, rb); + double avg = getDoubleGauge("GetGroupsAvgTime", rb); + assertTrue(avg >= 0.0); + + // Sleep for an interval+slop to let the percentiles rollover + Thread.sleep((PERCENTILES_INTERVAL+1)*1000); + // Check that the percentiles were updated + assertQuantileGauges("GetGroups1s", rb); + } + } + /** * Test for the case that UserGroupInformation.getCurrentUser() * is called when the AccessControlContext has a Subject associated diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 07e4b27b278..31b70279cd6 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -788,6 +788,8 @@ Release 2.4.0 - UNRELEASED HDFS-5695. Clean up TestOfflineEditsViewer and OfflineEditsViewerHelper. (Haohui Mai via jing9) + HDFS-5220. Expose group resolution time as metric (jxiang via cmccabe) + OPTIMIZATIONS HDFS-5239. 
Allow FSNamesystem lock fairness to be configurable (daryn) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java index 3b39bc3de99..4b2bd66d1ff 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java @@ -480,6 +480,14 @@ void loginAsNameNodeUser(Configuration conf) throws IOException { * @param conf the configuration */ protected void initialize(Configuration conf) throws IOException { + if (conf.get(HADOOP_USER_GROUP_METRICS_PERCENTILES_INTERVALS) == null) { + String intervals = conf.get(DFS_METRICS_PERCENTILES_INTERVALS_KEY); + if (intervals != null) { + conf.set(HADOOP_USER_GROUP_METRICS_PERCENTILES_INTERVALS, + intervals); + } + } + UserGroupInformation.setConfiguration(conf); loginAsNameNodeUser(conf); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java index b4f2a536dfb..67b6cce3b42 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java @@ -47,6 +47,8 @@ import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; import org.apache.hadoop.metrics2.MetricsRecordBuilder; +import org.apache.hadoop.metrics2.MetricsSource; +import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.hadoop.test.MetricsAsserts; import org.apache.hadoop.util.Time; import org.apache.log4j.Level; @@ -108,6 +110,12 @@ public void setUp() throws Exception { @After public void tearDown() throws Exception { + MetricsSource source = DefaultMetricsSystem.instance().getSource("UgiMetrics"); + if (source != null) { + // Run only once since the UGI metrics is cleaned up during teardown + MetricsRecordBuilder rb = getMetrics(source); + assertQuantileGauges("GetGroups1s", rb); + } cluster.shutdown(); } From 8deb7a60575ad33b78a5167673276275ba7bece5 Mon Sep 17 00:00:00 2001 From: Colin McCabe Date: Mon, 6 Jan 2014 19:45:02 +0000 Subject: [PATCH 38/42] HDFS-5589. Namenode loops caching and uncaching when data should be uncached. (awang via cmccabe) git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1555996 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 3 + .../CacheReplicationMonitor.java | 172 +++++++++++++++--- .../server/datanode/TestFsDatasetCache.java | 48 ++++- .../server/namenode/TestCacheDirectives.java | 67 +++++-- 4 files changed, 249 insertions(+), 41 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 31b70279cd6..82c104b4a7d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -598,6 +598,9 @@ Trunk (Unreleased) HDFS-5667. Include DatanodeStorage in StorageReport. (Arpit Agarwal) + HDFS-5589. 
Namenode loops caching and uncaching when data should be + uncached (awang via cmccabe) + Release 2.4.0 - UNRELEASED INCOMPATIBLE CHANGES diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CacheReplicationMonitor.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CacheReplicationMonitor.java index 6e6e44b5008..aef726fa9b9 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CacheReplicationMonitor.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CacheReplicationMonitor.java @@ -21,12 +21,14 @@ import java.io.Closeable; import java.io.IOException; +import java.util.ArrayList; import java.util.Collection; import java.util.Date; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Random; +import java.util.TreeMap; import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.ReentrantLock; @@ -76,7 +78,7 @@ public class CacheReplicationMonitor extends Thread implements Closeable { /** * Pseudorandom number source */ - private final Random random = new Random(); + private static final Random random = new Random(); /** * The interval at which we scan the namesystem for caching changes. @@ -310,8 +312,6 @@ private void rescanCacheDirectives() { FSDirectory fsDir = namesystem.getFSDirectory(); final long now = new Date().getTime(); for (CacheDirective directive : cacheManager.getCacheDirectives()) { - // Reset the directive's statistics - directive.resetStatistics(); // Skip processing this entry if it has expired if (LOG.isTraceEnabled()) { LOG.trace("Directive expiry is at " + directive.getExpiryTime()); @@ -461,7 +461,7 @@ private String findReasonForNotCaching(CachedBlock cblock, // there may be a period of time when incomplete blocks remain cached // on the DataNodes. return "not complete"; - } else if (cblock.getReplication() == 0) { + } else if (cblock.getReplication() == 0) { // Since 0 is not a valid value for a cache directive's replication // field, seeing a replication of 0 on a CacheBlock means that it // has never been reached by any sweep. @@ -469,6 +469,9 @@ private String findReasonForNotCaching(CachedBlock cblock, } else if (cblock.getMark() != mark) { // Although the block was needed in the past, we didn't reach it during // the current sweep. Therefore, it doesn't need to be cached any more. + // Need to set the replication to 0 so it doesn't flip back to cached + // when the mark flips on the next scan + cblock.setReplicationAndMark((short)0, mark); return "no longer needed by any directives"; } return null; @@ -595,7 +598,7 @@ private void addNewPendingUncached(int neededUncached, * @param pendingCached A list of DataNodes that will soon cache the * block. 
*/ - private void addNewPendingCached(int neededCached, + private void addNewPendingCached(final int neededCached, CachedBlock cachedBlock, List cached, List pendingCached) { // To figure out which replicas can be cached, we consult the @@ -616,35 +619,156 @@ private void addNewPendingCached(int neededCached, } return; } - List possibilities = new LinkedList(); + // Filter the list of replicas to only the valid targets + List possibilities = + new LinkedList(); int numReplicas = blockInfo.getCapacity(); Collection corrupt = blockManager.getCorruptReplicas(blockInfo); + int outOfCapacity = 0; for (int i = 0; i < numReplicas; i++) { DatanodeDescriptor datanode = blockInfo.getDatanode(i); - if ((datanode != null) && - ((!pendingCached.contains(datanode)) && - ((corrupt == null) || (!corrupt.contains(datanode))))) { - possibilities.add(datanode); + if (datanode == null) { + continue; } + if (datanode.isDecommissioned() || datanode.isDecommissionInProgress()) { + continue; + } + if (corrupt != null && corrupt.contains(datanode)) { + continue; + } + if (pendingCached.contains(datanode) || cached.contains(datanode)) { + continue; + } + long pendingCapacity = datanode.getCacheRemaining(); + // Subtract pending cached blocks from effective capacity + Iterator it = datanode.getPendingCached().iterator(); + while (it.hasNext()) { + CachedBlock cBlock = it.next(); + BlockInfo info = + blockManager.getStoredBlock(new Block(cBlock.getBlockId())); + if (info != null) { + pendingCapacity -= info.getNumBytes(); + } + } + it = datanode.getPendingUncached().iterator(); + // Add pending uncached blocks from effective capacity + while (it.hasNext()) { + CachedBlock cBlock = it.next(); + BlockInfo info = + blockManager.getStoredBlock(new Block(cBlock.getBlockId())); + if (info != null) { + pendingCapacity += info.getNumBytes(); + } + } + if (pendingCapacity < blockInfo.getNumBytes()) { + if (LOG.isTraceEnabled()) { + LOG.trace("Datanode " + datanode + " is not a valid possibility for" + + " block " + blockInfo.getBlockId() + " of size " + + blockInfo.getNumBytes() + " bytes, only has " + + datanode.getCacheRemaining() + " bytes of cache remaining."); + } + outOfCapacity++; + continue; + } + possibilities.add(datanode); } - while (neededCached > 0) { - if (possibilities.isEmpty()) { - LOG.warn("We need " + neededCached + " more replica(s) than " + - "actually exist to provide a cache replication of " + - cachedBlock.getReplication() + " for " + cachedBlock); - return; - } - DatanodeDescriptor datanode = - possibilities.remove(random.nextInt(possibilities.size())); - if (LOG.isDebugEnabled()) { - LOG.debug("AddNewPendingCached: datanode " + datanode + - " will now cache block " + cachedBlock); - } + List chosen = chooseDatanodesForCaching(possibilities, + neededCached, blockManager.getDatanodeManager().getStaleInterval()); + for (DatanodeDescriptor datanode : chosen) { pendingCached.add(datanode); boolean added = datanode.getPendingCached().add(cachedBlock); assert added; - neededCached--; + } + // We were unable to satisfy the requested replication factor + if (neededCached > chosen.size()) { + if (LOG.isDebugEnabled()) { + LOG.debug( + "Only have " + + (cachedBlock.getReplication() - neededCached + chosen.size()) + + " of " + cachedBlock.getReplication() + " cached replicas for " + + cachedBlock + " (" + outOfCapacity + " nodes have insufficient " + + "capacity)."); + } } } + + /** + * Chooses datanode locations for caching from a list of valid possibilities. + * Non-stale nodes are chosen before stale nodes. 
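+ * Within each group, a target is chosen by a random draw weighted by its + * percentage of remaining cache capacity.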
+ * + * @param possibilities List of candidate datanodes + * @param neededCached Number of replicas needed + * @param staleInterval Age of a stale datanode + * @return A list of chosen datanodes + */ + private static List chooseDatanodesForCaching( + final List possibilities, final int neededCached, + final long staleInterval) { + // Make a copy that we can modify + List targets = + new ArrayList(possibilities); + // Selected targets + List chosen = new LinkedList(); + + // Filter out stale datanodes + List stale = new LinkedList(); + Iterator it = targets.iterator(); + while (it.hasNext()) { + DatanodeDescriptor d = it.next(); + if (d.isStale(staleInterval)) { + it.remove(); + stale.add(d); + } + } + // Select targets + while (chosen.size() < neededCached) { + // Try to use stale nodes if we're out of non-stale nodes, else we're done + if (targets.isEmpty()) { + if (!stale.isEmpty()) { + targets = stale; + } else { + break; + } + } + // Select a random target + DatanodeDescriptor target = + chooseRandomDatanodeByRemainingCapacity(targets); + chosen.add(target); + targets.remove(target); + } + return chosen; + } + + /** + * Choose a single datanode from the provided list of possible + * targets, weighted by the percentage of free space remaining on the node. + * + * @return The chosen datanode + */ + private static DatanodeDescriptor chooseRandomDatanodeByRemainingCapacity( + final List targets) { + // Use a weighted probability to choose the target datanode + float total = 0; + for (DatanodeDescriptor d : targets) { + total += d.getCacheRemainingPercent(); + } + // Give each datanode a portion of keyspace equal to its relative weight + // [0, w1) selects d1, [w1, w2) selects d2, etc. + TreeMap lottery = + new TreeMap(); + int offset = 0; + for (DatanodeDescriptor d : targets) { + // Since we're using floats, be paranoid about negative values + int weight = + Math.max(1, (int)((d.getCacheRemainingPercent() / total) * 1000000)); + offset += weight; + lottery.put(offset, d); + } + // Choose a number from [0, offset), which is the total amount of weight, + // to select the winner + DatanodeDescriptor winner = + lottery.higherEntry(random.nextInt(offset)).getValue(); + return winner; + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestFsDatasetCache.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestFsDatasetCache.java index b6aac810db0..c0a93c4aef1 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestFsDatasetCache.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestFsDatasetCache.java @@ -43,10 +43,13 @@ import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState; import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.DFSTestUtil; +import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.LogVerificationAppender; import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hadoop.hdfs.protocol.Block; +import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo; +import org.apache.hadoop.hdfs.protocol.CachePoolInfo; import org.apache.hadoop.hdfs.protocol.ExtendedBlock; import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolClientSideTranslatorPB; import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi; @@ -109,8 +112,9 @@ public class TestFsDatasetCache { public void setUp() throws 
Exception { assumeTrue(!Path.WINDOWS); conf = new HdfsConfiguration(); - conf.setLong(DFSConfigKeys.DFS_NAMENODE_PATH_BASED_CACHE_RETRY_INTERVAL_MS, - 500); + conf.setLong( + DFSConfigKeys.DFS_NAMENODE_PATH_BASED_CACHE_REFRESH_INTERVAL_MS, 100); + conf.setLong(DFSConfigKeys.DFS_CACHEREPORT_INTERVAL_MSEC_KEY, 500); conf.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE); conf.setLong(DFSConfigKeys.DFS_DATANODE_MAX_LOCKED_MEMORY_KEY, CACHE_CAPACITY); @@ -328,7 +332,7 @@ public void testFilesExceedMaxLockedMemory() throws Exception { // Create some test files that will exceed total cache capacity final int numFiles = 5; - final long fileSize = 15000; + final long fileSize = CACHE_CAPACITY / (numFiles-1); final Path[] testFiles = new Path[numFiles]; final HdfsBlockLocation[][] fileLocs = new HdfsBlockLocation[numFiles][]; @@ -477,4 +481,42 @@ public void testPageRounder() throws Exception { setHeartbeatResponse(uncacheBlocks(locs)); verifyExpectedCacheUsage(0, 0); } + + @Test(timeout=60000) + public void testUncacheQuiesces() throws Exception { + // Create a file + Path fileName = new Path("/testUncacheQuiesces"); + int fileLen = 4096; + DFSTestUtil.createFile(fs, fileName, fileLen, (short)1, 0xFDFD); + // Cache it + DistributedFileSystem dfs = cluster.getFileSystem(); + dfs.addCachePool(new CachePoolInfo("pool")); + dfs.addCacheDirective(new CacheDirectiveInfo.Builder() + .setPool("pool").setPath(fileName).setReplication((short)3).build()); + GenericTestUtils.waitFor(new Supplier() { + @Override + public Boolean get() { + MetricsRecordBuilder dnMetrics = getMetrics(dn.getMetrics().name()); + long blocksCached = + MetricsAsserts.getLongCounter("BlocksCached", dnMetrics); + return blocksCached > 0; + } + }, 1000, 30000); + // Uncache it + dfs.removeCacheDirective(1); + GenericTestUtils.waitFor(new Supplier() { + @Override + public Boolean get() { + MetricsRecordBuilder dnMetrics = getMetrics(dn.getMetrics().name()); + long blocksUncached = + MetricsAsserts.getLongCounter("BlocksUncached", dnMetrics); + return blocksUncached > 0; + } + }, 1000, 30000); + // Make sure that no additional messages were sent + Thread.sleep(10000); + MetricsRecordBuilder dnMetrics = getMetrics(dn.getMetrics().name()); + MetricsAsserts.assertCounter("BlocksCached", 1l, dnMetrics); + MetricsAsserts.assertCounter("BlocksUncached", 1l, dnMetrics); + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCacheDirectives.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCacheDirectives.java index b81fde32ada..d47c275771f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCacheDirectives.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCacheDirectives.java @@ -57,17 +57,18 @@ import org.apache.hadoop.hdfs.DFSTestUtil; import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.hdfs.HdfsConfiguration; +import org.apache.hadoop.hdfs.LogVerificationAppender; import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hadoop.hdfs.protocol.CacheDirectiveEntry; import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo; +import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo.Expiration; import org.apache.hadoop.hdfs.protocol.CacheDirectiveIterator; import org.apache.hadoop.hdfs.protocol.CacheDirectiveStats; import org.apache.hadoop.hdfs.protocol.CachePoolEntry; import 
org.apache.hadoop.hdfs.protocol.CachePoolInfo; -import org.apache.hadoop.hdfs.protocol.DatanodeInfo; -import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo.Expiration; -import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType; import org.apache.hadoop.hdfs.protocol.CachePoolStats; +import org.apache.hadoop.hdfs.protocol.DatanodeInfo; +import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType; import org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor; import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor.CachedBlocksList.Type; import org.apache.hadoop.hdfs.server.datanode.DataNode; @@ -81,6 +82,7 @@ import org.apache.hadoop.util.GSet; import org.apache.log4j.Level; import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -603,8 +605,8 @@ public void testCacheManagerRestart() throws Exception { * Wait for the NameNode to have an expected number of cached blocks * and replicas. * @param nn NameNode - * @param expectedCachedBlocks - * @param expectedCachedReplicas + * @param expectedCachedBlocks if -1, treat as wildcard + * @param expectedCachedReplicas if -1, treat as wildcard * @throws Exception */ private static void waitForCachedBlocks(NameNode nn, @@ -633,16 +635,18 @@ public Boolean get() { } finally { namesystem.readUnlock(); } - if ((numCachedBlocks == expectedCachedBlocks) && - (numCachedReplicas == expectedCachedReplicas)) { - return true; - } else { - LOG.info(logString + " cached blocks: have " + numCachedBlocks + - " / " + expectedCachedBlocks + ". " + - "cached replicas: have " + numCachedReplicas + - " / " + expectedCachedReplicas); - return false; + if (expectedCachedBlocks == -1 || + numCachedBlocks == expectedCachedBlocks) { + if (expectedCachedReplicas == -1 || + numCachedReplicas == expectedCachedReplicas) { + return true; + } } + LOG.info(logString + " cached blocks: have " + numCachedBlocks + + " / " + expectedCachedBlocks + ". 
" + + "cached replicas: have " + numCachedReplicas + + " / " + expectedCachedReplicas); + return false; } }, 500, 60000); } @@ -1351,4 +1355,39 @@ public void testMaxRelativeExpiry() throws Exception { .setExpiration(Expiration.newRelative(RELATIVE_EXPIRY_NEVER - 1)) .build()); } + + @Test(timeout=60000) + public void testExceedsCapacity() throws Exception { + // Create a giant file + final Path fileName = new Path("/exceeds"); + final long fileLen = CACHE_CAPACITY * (NUM_DATANODES*2); + int numCachedReplicas = (int) ((CACHE_CAPACITY*NUM_DATANODES)/BLOCK_SIZE); + DFSTestUtil.createFile(dfs, fileName, fileLen, (short) NUM_DATANODES, + 0xFADED); + // Set up a log appender watcher + final LogVerificationAppender appender = new LogVerificationAppender(); + final Logger logger = Logger.getRootLogger(); + logger.addAppender(appender); + dfs.addCachePool(new CachePoolInfo("pool")); + dfs.addCacheDirective(new CacheDirectiveInfo.Builder().setPool("pool") + .setPath(fileName).setReplication((short) 1).build()); + waitForCachedBlocks(namenode, -1, numCachedReplicas, + "testExceeds:1"); + // Check that no DNs saw an excess CACHE message + int lines = appender.countLinesWithMessage( + "more bytes in the cache: " + + DFSConfigKeys.DFS_DATANODE_MAX_LOCKED_MEMORY_KEY); + assertEquals("Namenode should not send extra CACHE commands", 0, lines); + // Try creating a file with giant-sized blocks that exceed cache capacity + dfs.delete(fileName, false); + DFSTestUtil.createFile(dfs, fileName, 4096, fileLen, CACHE_CAPACITY * 2, + (short) 1, 0xFADED); + // Nothing will get cached, so just force sleep for a bit + Thread.sleep(4000); + // Still should not see any excess commands + lines = appender.countLinesWithMessage( + "more bytes in the cache: " + + DFSConfigKeys.DFS_DATANODE_MAX_LOCKED_MEMORY_KEY); + assertEquals("Namenode should not send extra CACHE commands", 0, lines); + } } From bfd227bf09c6b99c9d24d9df26fdaa502e9409a5 Mon Sep 17 00:00:00 2001 From: Brandon Li Date: Mon, 6 Jan 2014 22:58:18 +0000 Subject: [PATCH 39/42] HDFS-5719. FSImage#doRollback() should close prevState before return. Contributed by Ted Yu git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1556057 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 3 + .../hadoop/hdfs/server/namenode/FSImage.java | 102 +++++++++--------- 2 files changed, 56 insertions(+), 49 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 82c104b4a7d..5c665647d27 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -465,6 +465,9 @@ Trunk (Unreleased) HDFS-5705. TestSecondaryNameNodeUpgrade#testChangeNsIDFails may fail due to ConcurrentModificationException. (Ted Yu via brandonli) + HDFS-5719. FSImage#doRollback() should close prevState before return + (Ted Yu via brandonli) + BREAKDOWN OF HDFS-2832 SUBTASKS AND RELATED JIRAS HDFS-4985. 
Add storage type to the protocol and expose it in block report diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java index ee743fe65bb..14764e006ec 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java @@ -405,60 +405,64 @@ private void doRollback() throws IOException { // Directories that don't have previous state do not rollback boolean canRollback = false; FSImage prevState = new FSImage(conf); - prevState.getStorage().layoutVersion = HdfsConstants.LAYOUT_VERSION; - for (Iterator it = storage.dirIterator(); it.hasNext();) { - StorageDirectory sd = it.next(); - File prevDir = sd.getPreviousDir(); - if (!prevDir.exists()) { // use current directory then - LOG.info("Storage directory " + sd.getRoot() - + " does not contain previous fs state."); - // read and verify consistency with other directories - storage.readProperties(sd); - continue; + try { + prevState.getStorage().layoutVersion = HdfsConstants.LAYOUT_VERSION; + for (Iterator it = storage.dirIterator(); it.hasNext();) { + StorageDirectory sd = it.next(); + File prevDir = sd.getPreviousDir(); + if (!prevDir.exists()) { // use current directory then + LOG.info("Storage directory " + sd.getRoot() + + " does not contain previous fs state."); + // read and verify consistency with other directories + storage.readProperties(sd); + continue; + } + + // read and verify consistency of the prev dir + prevState.getStorage().readPreviousVersionProperties(sd); + + if (prevState.getLayoutVersion() != HdfsConstants.LAYOUT_VERSION) { + throw new IOException( + "Cannot rollback to storage version " + + prevState.getLayoutVersion() + + " using this version of the NameNode, which uses storage version " + + HdfsConstants.LAYOUT_VERSION + ". " + + "Please use the previous version of HDFS to perform the rollback."); + } + canRollback = true; } + if (!canRollback) + throw new IOException("Cannot rollback. None of the storage " + + "directories contain previous fs state."); - // read and verify consistency of the prev dir - prevState.getStorage().readPreviousVersionProperties(sd); + // Now that we know all directories are going to be consistent + // Do rollback for each directory containing previous state + for (Iterator it = storage.dirIterator(); it.hasNext();) { + StorageDirectory sd = it.next(); + File prevDir = sd.getPreviousDir(); + if (!prevDir.exists()) + continue; - if (prevState.getLayoutVersion() != HdfsConstants.LAYOUT_VERSION) { - throw new IOException( - "Cannot rollback to storage version " + - prevState.getLayoutVersion() + - " using this version of the NameNode, which uses storage version " + - HdfsConstants.LAYOUT_VERSION + ". 
" + - "Please use the previous version of HDFS to perform the rollback."); + LOG.info("Rolling back storage directory " + sd.getRoot() + + ".\n new LV = " + prevState.getStorage().getLayoutVersion() + + "; new CTime = " + prevState.getStorage().getCTime()); + File tmpDir = sd.getRemovedTmp(); + assert !tmpDir.exists() : "removed.tmp directory must not exist."; + // rename current to tmp + File curDir = sd.getCurrentDir(); + assert curDir.exists() : "Current directory must exist."; + NNStorage.rename(curDir, tmpDir); + // rename previous to current + NNStorage.rename(prevDir, curDir); + + // delete tmp dir + NNStorage.deleteDir(tmpDir); + LOG.info("Rollback of " + sd.getRoot()+ " is complete."); } - canRollback = true; + isUpgradeFinalized = true; + } finally { + prevState.close(); } - if (!canRollback) - throw new IOException("Cannot rollback. None of the storage " - + "directories contain previous fs state."); - - // Now that we know all directories are going to be consistent - // Do rollback for each directory containing previous state - for (Iterator it = storage.dirIterator(); it.hasNext();) { - StorageDirectory sd = it.next(); - File prevDir = sd.getPreviousDir(); - if (!prevDir.exists()) - continue; - - LOG.info("Rolling back storage directory " + sd.getRoot() - + ".\n new LV = " + prevState.getStorage().getLayoutVersion() - + "; new CTime = " + prevState.getStorage().getCTime()); - File tmpDir = sd.getRemovedTmp(); - assert !tmpDir.exists() : "removed.tmp directory must not exist."; - // rename current to tmp - File curDir = sd.getCurrentDir(); - assert curDir.exists() : "Current directory must exist."; - NNStorage.rename(curDir, tmpDir); - // rename previous to current - NNStorage.rename(prevDir, curDir); - - // delete tmp dir - NNStorage.deleteDir(tmpDir); - LOG.info("Rollback of " + sd.getRoot()+ " is complete."); - } - isUpgradeFinalized = true; } private void doFinalize(StorageDirectory sd) throws IOException { From 0c591a624dec3e0b2d932aa1afd1fe4e1306e6c8 Mon Sep 17 00:00:00 2001 From: Alejandro Abdelnur Date: Mon, 6 Jan 2014 23:21:08 +0000 Subject: [PATCH 40/42] MAPREDUCE-3310. Addendum fixing javadocs warnings (tucu) git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1556063 13f79535-47bb-0310-9956-ffa450edef68 --- .../src/main/java/org/apache/hadoop/mapred/JobConf.java | 5 ++--- .../src/main/java/org/apache/hadoop/mapreduce/Job.java | 5 ++--- .../main/java/org/apache/hadoop/mapreduce/JobContext.java | 7 +++---- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/JobConf.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/JobConf.java index 53159fbe592..fd9e95d4916 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/JobConf.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/JobConf.java @@ -971,7 +971,7 @@ public RawComparator getCombinerKeyGroupingComparator() { * grouping keys of inputs to the reduce. * * @return comparator set by the user for grouping values. - * @see #setOutputValueGroupingComparator(Class) for details. + * @see #setOutputValueGroupingComparator(Class) for details. 
*/ public RawComparator getOutputValueGroupingComparator() { Class theClass = getClass( @@ -1038,8 +1038,7 @@ public void setCombinerKeyGroupingComparator( * @param theClass the comparator class to be used for grouping keys. * It should implement RawComparator. * @see #setOutputKeyComparatorClass(Class) - * @see {@link #setCombinerKeyGroupingComparator(Class)} for setting a - * comparator for the combiner. + * @see #setCombinerKeyGroupingComparator(Class) */ public void setOutputValueGroupingComparator( Class theClass) { diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/Job.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/Job.java index 4bb97e84d6c..115a2b9fee3 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/Job.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/Job.java @@ -968,7 +968,7 @@ public void setCombinerKeyGroupingComparatorClass( * are passed to the {@link Reducer}. * @param cls the raw comparator * @throws IllegalStateException if the job is submitted - * @see {@link #setCombinerKeyGroupingComparatorClass(Class)} + * @see #setCombinerKeyGroupingComparatorClass(Class) */ public void setSortComparatorClass(Class cls ) throws IllegalStateException { @@ -983,8 +983,7 @@ public void setSortComparatorClass(Class cls * org.apache.hadoop.mapreduce.Reducer.Context)} * @param cls the raw comparator to use * @throws IllegalStateException if the job is submitted - * @see {@link #setCombinerKeyGroupingComparatorClass(Class)} for setting a - * comparator for the combiner. + * @see #setCombinerKeyGroupingComparatorClass(Class) */ public void setGroupingComparatorClass(Class cls ) throws IllegalStateException { diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/JobContext.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/JobContext.java index fa73a5f066f..836f1829079 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/JobContext.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/JobContext.java @@ -172,7 +172,7 @@ public Class> getPartitionerClass() * grouping keys of inputs to the combiner. * * @return comparator set by the user for grouping values. - * @see Job#setCombinerKeyGroupingComparatorClass(Class) for details. + * @see Job#setCombinerKeyGroupingComparatorClass(Class) */ public RawComparator getCombinerKeyGroupingComparator(); @@ -181,9 +181,8 @@ public Class> getPartitionerClass() * grouping keys of inputs to the reduce. * * @return comparator set by the user for grouping values. - * @see Job#setGroupingComparatorClass(Class) for details. - * @see {@link #getCombinerKeyGroupingComparator()} for setting a - * comparator for the combiner. + * @see Job#setGroupingComparatorClass(Class) + * @see #getCombinerKeyGroupingComparator() */ public RawComparator getGroupingComparator(); From cbdad3d47150ef01440515128241af6bfd47a3ec Mon Sep 17 00:00:00 2001 From: Owen O'Malley Date: Mon, 6 Jan 2014 23:43:08 +0000 Subject: [PATCH 41/42] HADOOP-10201. 
Add listing to KeyProvider API. (Larry McCay via omalley) git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1556072 13f79535-47bb-0310-9956-ffa450edef68 --- .../hadoop-common/CHANGES.txt | 2 + .../crypto/key/JavaKeyStoreProvider.java | 44 ++++++++++++++++++- .../apache/hadoop/crypto/key/KeyProvider.java | 14 ++++++ .../hadoop/crypto/key/UserProvider.java | 30 +++++++++++++ .../apache/hadoop/security/Credentials.java | 30 +++++++++---- .../crypto/key/TestKeyProviderFactory.java | 11 +++++ 6 files changed, 121 insertions(+), 10 deletions(-) diff --git a/hadoop-common-project/hadoop-common/CHANGES.txt b/hadoop-common-project/hadoop-common/CHANGES.txt index cdff25522ec..6dd74fbe20a 100644 --- a/hadoop-common-project/hadoop-common/CHANGES.txt +++ b/hadoop-common-project/hadoop-common/CHANGES.txt @@ -108,6 +108,8 @@ Trunk (Unreleased) HADOOP-10141. Create KeyProvider API to separate encryption key storage from the applications. (omalley) + HADOOP-10201. Add listing to KeyProvider API. (Larry McCay via omalley) + BUG FIXES HADOOP-9451. Fault single-layer config if node group topology is enabled. diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/JavaKeyStoreProvider.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/JavaKeyStoreProvider.java index 3c82563628e..93a47deaa73 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/JavaKeyStoreProvider.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/JavaKeyStoreProvider.java @@ -36,8 +36,11 @@ import java.security.NoSuchAlgorithmException; import java.security.UnrecoverableKeyException; import java.security.cert.CertificateException; +import java.util.ArrayList; import java.util.Date; +import java.util.Enumeration; import java.util.HashMap; +import java.util.List; import java.util.Map; /** @@ -56,6 +59,7 @@ */ @InterfaceAudience.Private public class JavaKeyStoreProvider extends KeyProvider { + private static final String KEY_METADATA = "KeyMetadata"; public static final String SCHEME_NAME = "jceks"; public static final String KEYSTORE_PASSWORD_NAME = "HADOOP_KEYSTORE_PASSWORD"; @@ -117,6 +121,44 @@ public KeyVersion getKeyVersion(String versionName) throws IOException { return new KeyVersion(versionName, key.getEncoded()); } + @Override + public List getKeys() throws IOException { + ArrayList list = new ArrayList(); + String alias = null; + try { + Enumeration e = keyStore.aliases(); + while (e.hasMoreElements()) { + alias = e.nextElement(); + // only include the metadata key names in the list of names + if (!alias.contains("@")) { + list.add(alias); + } + } + } catch (KeyStoreException e) { + throw new IOException("Can't get key " + alias + " from " + path, e); + } + return list; + } + + @Override + public List getKeyVersions(String name) throws IOException { + List list = new ArrayList(); + Metadata km = getMetadata(name); + if (km != null) { + int latestVersion = km.getVersions(); + KeyVersion v = null; + String versionName = null; + for (int i = 0; i < latestVersion; i++) { + versionName = buildVersionName(name, i); + v = getKeyVersion(versionName); + if (v != null) { + list.add(v); + } + } + } + return list; + } + @Override public Metadata getMetadata(String name) throws IOException { if (cache.containsKey(name)) { @@ -288,7 +330,7 @@ public String getAlgorithm() { @Override public String getFormat() { - return "KeyMetadata"; + return KEY_METADATA; } @Override diff --git 
a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/KeyProvider.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/KeyProvider.java index a8e95e5eb6e..6f9f016f62c 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/KeyProvider.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/KeyProvider.java @@ -254,6 +254,20 @@ public static Options options(Configuration conf) { public abstract KeyVersion getKeyVersion(String versionName ) throws IOException; + /** + * Get the key names for all keys. + * @return the list of key names + * @throws IOException + */ + public abstract List getKeys() throws IOException; + + /** + * Get the key material for all versions of a specific key name. + * @return the list of key material + * @throws IOException + */ + public abstract List getKeyVersions(String name) throws IOException; + /** * Get the current version of the key, which should be used for encrypting new * data. diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/UserProvider.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/UserProvider.java index 42ce69341d1..424e7ca8503 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/UserProvider.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/UserProvider.java @@ -20,8 +20,10 @@ import java.io.IOException; import java.net.URI; +import java.util.ArrayList; import java.util.Date; import java.util.HashMap; +import java.util.List; import java.util.Map; import org.apache.hadoop.classification.InterfaceAudience; @@ -142,4 +144,32 @@ public KeyProvider createProvider(URI providerName, return null; } } + + @Override + public List getKeys() throws IOException { + List list = new ArrayList(); + List keys = credentials.getAllSecretKeys(); + for (Text key : keys) { + if (key.find("@") == -1) { + list.add(key.toString()); + } + } + return list; + } + + @Override + public List getKeyVersions(String name) throws IOException { + List list = new ArrayList(); + Metadata km = getMetadata(name); + if (km != null) { + int latestVersion = km.getVersions(); + for (int i = 0; i < latestVersion; i++) { + KeyVersion v = getKeyVersion(buildVersionName(name, i)); + if (v != null) { + list.add(v); + } + } + } + return list; + } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/Credentials.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/Credentials.java index 88f54de61af..b796743eaa1 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/Credentials.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/Credentials.java @@ -29,7 +29,9 @@ import java.util.Arrays; import java.util.Collection; import java.util.HashMap; +import java.util.List; import java.util.Map; +import java.util.Map.Entry; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -73,15 +75,6 @@ public Credentials(Credentials credentials) { this.addAll(credentials); } - /** - * Returns the key bytes for the alias - * @param alias the alias for the key - * @return key for this alias - */ - public byte[] getSecretKey(Text alias) { - return secretKeysMap.get(alias); - } - /** * Returns the Token object for the alias * @param alias the alias for the Token @@ 
-117,6 +110,15 @@ public Collection> getAllTokens() { public int numberOfTokens() { return tokenMap.size(); } + + /** + * Returns the key bytes for the alias + * @param alias the alias for the key + * @return key for this alias + */ + public byte[] getSecretKey(Text alias) { + return secretKeysMap.get(alias); + } /** * @return number of keys in the in-memory map @@ -142,6 +144,16 @@ public void removeSecretKey(Text alias) { secretKeysMap.remove(alias); } + /** + * Return all the secret key entries in the in-memory map + */ + public List getAllSecretKeys() { + List list = new java.util.ArrayList(); + list.addAll(secretKeysMap.keySet()); + + return list; + } + /** * Convenience method for reading a token storage file, and loading the Tokens * therein in the passed UGI diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/key/TestKeyProviderFactory.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/key/TestKeyProviderFactory.java index 8d073f7d514..b2964af6f80 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/key/TestKeyProviderFactory.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/key/TestKeyProviderFactory.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.util.List; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.crypto.key.KeyProvider.KeyVersion; import org.apache.hadoop.io.Text; import org.apache.hadoop.security.Credentials; import org.apache.hadoop.security.UserGroupInformation; @@ -160,6 +161,16 @@ static void checkSpecificProvider(Configuration conf, provider.getCurrentKey("key4").getMaterial()); assertArrayEquals(key3, provider.getCurrentKey("key3").getMaterial()); assertEquals("key3@0", provider.getCurrentKey("key3").getVersionName()); + + List keys = provider.getKeys(); + assertTrue("Keys should have been returned.", keys.size() == 2); + assertTrue("Returned Keys should have included key3.", keys.contains("key3")); + assertTrue("Returned Keys should have included key4.", keys.contains("key4")); + + List kvl = provider.getKeyVersions("key3"); + assertTrue("KeyVersions should have been returned for key3.", kvl.size() == 1); + assertTrue("KeyVersions should have included key3@0.", kvl.get(0).getVersionName().equals("key3@0")); + assertArrayEquals(key3, kvl.get(0).getMaterial()); } @Test From 5241aa4cdd24b60f74f123e99bab788958186e09 Mon Sep 17 00:00:00 2001 From: Arpit Agarwal Date: Tue, 7 Jan 2014 00:40:59 +0000 Subject: [PATCH 42/42] HDFS-2832. Update CHANGES.txt to reflect merge to branch-2 git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1556088 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 274 ++++++++++---------- 1 file changed, 137 insertions(+), 137 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 5c665647d27..e21fcdff37f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -13,10 +13,6 @@ Trunk (Unreleased) HDFS-3125. Add JournalService to enable Journal Daemon. (suresh) - HDFS-2832. Heterogeneous Storages support in HDFS phase 1 - treat DataNode - as a collection of storages (see breakdown of tasks below for features and - contributors). - IMPROVEMENTS HDFS-4665. Move TestNetworkTopologyWithNodeGroup to common. @@ -468,139 +464,6 @@ Trunk (Unreleased) HDFS-5719. 
FSImage#doRollback() should close prevState before return (Ted Yu via brandonli) - BREAKDOWN OF HDFS-2832 SUBTASKS AND RELATED JIRAS - - HDFS-4985. Add storage type to the protocol and expose it in block report - and block locations. (Arpit Agarwal) - - HDFS-5115. Make StorageID a UUID. (Arpit Agarwal) - - HDFS-5000. DataNode configuration should allow specifying storage type. - (Arpit Agarwal) - - HDFS-4987. Namenode changes to track multiple storages per datanode. - (szetszwo) - - HDFS-5154. Fix TestBlockManager and TestDatanodeDescriptor after HDFS-4987. - (Junping Du via szetszwo) - - HDFS-5009. Include storage information in the LocatedBlock. (szetszwo) - - HDFS-5134. Move blockContentsStale, heartbeatedSinceFailover and - firstBlockReport from DatanodeDescriptor to DatanodeStorageInfo; and - fix a synchronization problem in DatanodeStorageInfo. (szetszwo) - - HDFS-5157. Add StorageType to FsVolume. (Junping Du via szetszwo) - - HDFS-4990. Change BlockPlacementPolicy to choose storages instead of - datanodes. (szetszwo) - - HDFS-5232. Protocol changes to transmit StorageUuid. (Arpit Agarwal) - - HDFS-5233. Use Datanode UUID to identify Datanodes. (Arpit Agarwal) - - HDFS-5222. Move block schedule information from DatanodeDescriptor to - DatanodeStorageInfo. (szetszwo) - - HDFS-4988. Datanode must support all the volumes as individual storages. - (Arpit Agarwal) - - HDFS-5377. Heartbeats from Datandode should include one storage report - per storage directory. (Arpit Agarwal) - - HDFS-5398. NameNode changes to process storage reports per storage - directory. (Arpit Agarwal) - - HDFS-5390. Send one incremental block report per storage directory. - (Arpit Agarwal) - - HDFS-5401. Fix NPE in Directory Scanner. (Arpit Agarwal) - - HDFS-5417. Fix storage IDs in PBHelper and UpgradeUtilities. (szetszwo) - - HDFS-5214. Fix NPEs in BlockManager and DirectoryScanner. (Arpit Agarwal) - - HDFS-5435. File append fails to initialize storageIDs. (Junping Du via - Arpit Agarwal) - - HDFS-5437. Fix TestBlockReport and TestBPOfferService failures. (Arpit - Agarwal) - - HDFS-5447. Fix TestJspHelper. (Arpit Agarwal) - - HDFS-5452. Fix TestReplicationPolicy and TestBlocksScheduledCounter. - - HDFS-5448. Datanode should generate its ID on first registration. (Arpit - Agarwal) - - HDFS-5448. Fix break caused by previous checkin for HDFS-5448. (Arpit - Agarwal) - - HDFS-5455. NN should update storageMap on first heartbeat. (Arpit Agarwal) - - HDFS-5457. Fix TestDatanodeRegistration, TestFsck and TestAddBlockRetry. - (Contributed by szetszwo) - - HDFS-5466. Update storage IDs when the pipeline is updated. (Contributed - by szetszwo) - - HDFS-5439. Fix TestPendingReplication. (Contributed by Junping Du, Arpit - Agarwal) - - HDFS-5470. Add back trunk's reportDiff algorithm to the branch. - (Contributed by szetszwo) - - HDFS-5472. Fix TestDatanodeManager, TestSafeMode and - TestNNThroughputBenchmark (Contributed by szetszwo) - - HDFS-5475. NN incorrectly tracks more than one replica per DN. (Arpit - Agarwal) - - HDFS-5481. Fix TestDataNodeVolumeFailure in branch HDFS-2832. (Contributed - by Junping Du) - - HDFS-5480. Update Balancer for HDFS-2832. (Contributed by szetszwo) - - HDFS-5486. Fix TestNameNodeMetrics for HDFS-2832. (Arpit Agarwal) - - HDFS-5491. Update editsStored for HDFS-2832. (Arpit Agarwal) - - HDFS-5494. Fix findbugs warnings for HDFS-2832. (Arpit Agarwal) - - HDFS-5508. Fix compilation error after merge. (Contributed by szetszwo) - - HDFS-5501. 
Fix pendingReceivedRequests tracking in BPServiceActor. (Arpit - Agarwal) - - HDFS-5510. Fix a findbug warning in DataStorage.java on HDFS-2832 branch. - (Junping Du via Arpit Agarwal) - - HDFS-5515. Fix TestDFSStartupVersions for HDFS-2832. (Arpit Agarwal) - - HDFS-5527. Fix TestUnderReplicatedBlocks on branch HDFS-2832. (Arpit - Agarwal) - - HDFS-5547. Fix build break after merge from trunk to HDFS-2832. (Arpit - Agarwal) - - HDFS-5542. Fix TODO and clean up the code in HDFS-2832. (Contributed by - szetszwo) - - HDFS-5559. Fix TestDatanodeConfig in HDFS-2832. (Contributed by szetszwo) - - HDFS-5484. StorageType and State in DatanodeStorageInfo in NameNode is - not accurate. (Eric Sirianni via Arpit Agarwal) - - HDFS-5648. Get rid of FsDatasetImpl#perVolumeReplicaMap. (Arpit Agarwal) - - HDFS-5406. Send incremental block reports for all storages in a - single call. (Arpit Agarwal) - - HDFS-5454. DataNode UUID should be assigned prior to FsDataset - initialization. (Arpit Agarwal) - - HDFS-5667. Include DatanodeStorage in StorageReport. (Arpit Agarwal) - HDFS-5589. Namenode loops caching and uncaching when data should be uncached (awang via cmccabe) @@ -634,6 +497,10 @@ Release 2.4.0 - UNRELEASED HDFS-5514. FSNamesystem's fsLock should allow custom implementation (daryn) + HDFS-2832. Heterogeneous Storages support in HDFS phase 1 - treat DataNode + as a collection of storages (see breakdown of tasks below for features and + contributors). + IMPROVEMENTS HDFS-5267. Remove volatile from LightWeightHashSet. (Junping Du via llu) @@ -861,6 +728,139 @@ Release 2.4.0 - UNRELEASED HDFS-5690. DataNode fails to start in secure mode when dfs.http.policy equals to HTTP_ONLY. (Haohui Mai via jing9) + BREAKDOWN OF HDFS-2832 SUBTASKS AND RELATED JIRAS + + HDFS-4985. Add storage type to the protocol and expose it in block report + and block locations. (Arpit Agarwal) + + HDFS-5115. Make StorageID a UUID. (Arpit Agarwal) + + HDFS-5000. DataNode configuration should allow specifying storage type. + (Arpit Agarwal) + + HDFS-4987. Namenode changes to track multiple storages per datanode. + (szetszwo) + + HDFS-5154. Fix TestBlockManager and TestDatanodeDescriptor after HDFS-4987. + (Junping Du via szetszwo) + + HDFS-5009. Include storage information in the LocatedBlock. (szetszwo) + + HDFS-5134. Move blockContentsStale, heartbeatedSinceFailover and + firstBlockReport from DatanodeDescriptor to DatanodeStorageInfo; and + fix a synchronization problem in DatanodeStorageInfo. (szetszwo) + + HDFS-5157. Add StorageType to FsVolume. (Junping Du via szetszwo) + + HDFS-4990. Change BlockPlacementPolicy to choose storages instead of + datanodes. (szetszwo) + + HDFS-5232. Protocol changes to transmit StorageUuid. (Arpit Agarwal) + + HDFS-5233. Use Datanode UUID to identify Datanodes. (Arpit Agarwal) + + HDFS-5222. Move block schedule information from DatanodeDescriptor to + DatanodeStorageInfo. (szetszwo) + + HDFS-4988. Datanode must support all the volumes as individual storages. + (Arpit Agarwal) + + HDFS-5377. Heartbeats from Datandode should include one storage report + per storage directory. (Arpit Agarwal) + + HDFS-5398. NameNode changes to process storage reports per storage + directory. (Arpit Agarwal) + + HDFS-5390. Send one incremental block report per storage directory. + (Arpit Agarwal) + + HDFS-5401. Fix NPE in Directory Scanner. (Arpit Agarwal) + + HDFS-5417. Fix storage IDs in PBHelper and UpgradeUtilities. (szetszwo) + + HDFS-5214. Fix NPEs in BlockManager and DirectoryScanner. 
(Arpit Agarwal) + + HDFS-5435. File append fails to initialize storageIDs. (Junping Du via + Arpit Agarwal) + + HDFS-5437. Fix TestBlockReport and TestBPOfferService failures. (Arpit + Agarwal) + + HDFS-5447. Fix TestJspHelper. (Arpit Agarwal) + + HDFS-5452. Fix TestReplicationPolicy and TestBlocksScheduledCounter. + + HDFS-5448. Datanode should generate its ID on first registration. (Arpit + Agarwal) + + HDFS-5448. Fix break caused by previous checkin for HDFS-5448. (Arpit + Agarwal) + + HDFS-5455. NN should update storageMap on first heartbeat. (Arpit Agarwal) + + HDFS-5457. Fix TestDatanodeRegistration, TestFsck and TestAddBlockRetry. + (Contributed by szetszwo) + + HDFS-5466. Update storage IDs when the pipeline is updated. (Contributed + by szetszwo) + + HDFS-5439. Fix TestPendingReplication. (Contributed by Junping Du, Arpit + Agarwal) + + HDFS-5470. Add back trunk's reportDiff algorithm to the branch. + (Contributed by szetszwo) + + HDFS-5472. Fix TestDatanodeManager, TestSafeMode and + TestNNThroughputBenchmark (Contributed by szetszwo) + + HDFS-5475. NN incorrectly tracks more than one replica per DN. (Arpit + Agarwal) + + HDFS-5481. Fix TestDataNodeVolumeFailure in branch HDFS-2832. (Contributed + by Junping Du) + + HDFS-5480. Update Balancer for HDFS-2832. (Contributed by szetszwo) + + HDFS-5486. Fix TestNameNodeMetrics for HDFS-2832. (Arpit Agarwal) + + HDFS-5491. Update editsStored for HDFS-2832. (Arpit Agarwal) + + HDFS-5494. Fix findbugs warnings for HDFS-2832. (Arpit Agarwal) + + HDFS-5508. Fix compilation error after merge. (Contributed by szetszwo) + + HDFS-5501. Fix pendingReceivedRequests tracking in BPServiceActor. (Arpit + Agarwal) + + HDFS-5510. Fix a findbug warning in DataStorage.java on HDFS-2832 branch. + (Junping Du via Arpit Agarwal) + + HDFS-5515. Fix TestDFSStartupVersions for HDFS-2832. (Arpit Agarwal) + + HDFS-5527. Fix TestUnderReplicatedBlocks on branch HDFS-2832. (Arpit + Agarwal) + + HDFS-5547. Fix build break after merge from trunk to HDFS-2832. (Arpit + Agarwal) + + HDFS-5542. Fix TODO and clean up the code in HDFS-2832. (Contributed by + szetszwo) + + HDFS-5559. Fix TestDatanodeConfig in HDFS-2832. (Contributed by szetszwo) + + HDFS-5484. StorageType and State in DatanodeStorageInfo in NameNode is + not accurate. (Eric Sirianni via Arpit Agarwal) + + HDFS-5648. Get rid of FsDatasetImpl#perVolumeReplicaMap. (Arpit Agarwal) + + HDFS-5406. Send incremental block reports for all storages in a + single call. (Arpit Agarwal) + + HDFS-5454. DataNode UUID should be assigned prior to FsDataset + initialization. (Arpit Agarwal) + + HDFS-5667. Include DatanodeStorage in StorageReport. (Arpit Agarwal) + Release 2.3.0 - UNRELEASED INCOMPATIBLE CHANGES
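[Note, not part of the patches] For readers of PATCH 41 (HADOOP-10201): the new KeyProvider listing methods can be exercised roughly as below. This is an illustrative sketch only; the helper name is hypothetical, and it assumes a KeyProvider instance obtained the same way the updated TestKeyProviderFactory obtains one.

    import java.io.IOException;
    import org.apache.hadoop.crypto.key.KeyProvider;

    public class KeyListingSketch {
      // Enumerate every key name known to a provider and, for each key, print the
      // version names of all stored versions along with the key material length.
      static void listAllKeys(KeyProvider provider) throws IOException {
        for (String name : provider.getKeys()) {
          for (KeyProvider.KeyVersion v : provider.getKeyVersions(name)) {
            System.out.println(name + " -> " + v.getVersionName()
                + " (" + v.getMaterial().length + " bytes)");
          }
        }
      }
    }

Both JavaKeyStoreProvider and UserProvider filter out aliases containing "@", so getKeys() returns only logical key names such as "key3", while getKeyVersions("key3") returns the per-version entries such as "key3@0".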