From 4af7d14ce29d1cb3ea098f968d28fb51dc3ca967 Mon Sep 17 00:00:00 2001 From: Eric Badger Date: Wed, 29 Jan 2020 22:44:23 +0000 Subject: [PATCH] YARN-10084. Allow inheritance of max app lifetime / default app lifetime. Contributed by Eric Payne. --- .../scheduler/capacity/AbstractCSQueue.java | 97 ++++++++++ .../scheduler/capacity/CSQueue.java | 22 +++ .../scheduler/capacity/LeafQueue.java | 32 +-- .../rmapp/TestApplicationLifetimeMonitor.java | 183 ++++++++++++++++++ .../src/site/markdown/CapacityScheduler.md | 4 +- 5 files changed, 307 insertions(+), 31 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/AbstractCSQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/AbstractCSQueue.java index 67b676bea57..88596eba809 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/AbstractCSQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/AbstractCSQueue.java @@ -46,6 +46,7 @@ import org.apache.hadoop.yarn.api.records.QueueStatistics; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; import org.apache.hadoop.yarn.factories.RecordFactory; import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.security.AccessRequest; @@ -110,6 +111,15 @@ public abstract class AbstractCSQueue implements CSQueue { QueueResourceQuotas queueResourceQuotas; + // -1 
indicates lifetime is disabled + private volatile long maxApplicationLifetime = -1; + + private volatile long defaultApplicationLifetime = -1; + + // Indicates if this queue's default lifetime was set by a config property, + // either at this level or anywhere in the queue's hierarchy. + private volatile boolean defaultAppLifetimeWasSpecifiedInConfig = false; + protected enum CapacityConfigType { NONE, PERCENTAGE, ABSOLUTE_RESOURCE }; @@ -415,6 +425,19 @@ public abstract class AbstractCSQueue implements CSQueue { getQueuePath()); this.userWeights = getUserWeightsFromHierarchy(configuration); + + maxApplicationLifetime = getInheritedMaxAppLifetime(this, configuration); + defaultApplicationLifetime = + getInheritedDefaultAppLifetime(this, configuration, + maxApplicationLifetime); + if (maxApplicationLifetime > 0 && + defaultApplicationLifetime > maxApplicationLifetime) { + throw new YarnRuntimeException( + "Default lifetime " + defaultApplicationLifetime + + " can't exceed maximum lifetime " + maxApplicationLifetime); + } + defaultApplicationLifetime = defaultApplicationLifetime > 0 + ? defaultApplicationLifetime : maxApplicationLifetime; } finally { writeLock.unlock(); } @@ -810,6 +833,68 @@ public abstract class AbstractCSQueue implements CSQueue { parentQ.getPreemptionDisabled()); } + private long getInheritedMaxAppLifetime(CSQueue q, + CapacitySchedulerConfiguration conf) { + CSQueue parentQ = q.getParent(); + long maxAppLifetime = conf.getMaximumLifetimePerQueue(q.getQueuePath()); + + // If q is the root queue, then get max app lifetime from conf. + if (parentQ == null) { + return maxAppLifetime; + } + + // If this is not the root queue, get this queue's max app lifetime + // from the conf. The parent's max app lifetime will be used if it's + // not set for this queue. + // A value of 0 will override the parent's value and means no max lifetime. + // A negative value means that the parent's max should be used. 
+ long parentsMaxAppLifetime = getParent().getMaximumApplicationLifetime(); + return (maxAppLifetime >= 0) ? maxAppLifetime : parentsMaxAppLifetime; + } + + private long getInheritedDefaultAppLifetime(CSQueue q, + CapacitySchedulerConfiguration conf, long myMaxAppLifetime) { + CSQueue parentQ = q.getParent(); + long defaultAppLifetime = conf.getDefaultLifetimePerQueue(getQueuePath()); + defaultAppLifetimeWasSpecifiedInConfig = + (defaultAppLifetime >= 0 + || (parentQ != null && + parentQ.getDefaultAppLifetimeWasSpecifiedInConfig())); + + // If q is the root queue, then get default app lifetime from conf. + if (parentQ == null) { + return defaultAppLifetime; + } + + // If this is not the root queue, get the parent's default app lifetime. The + // parent's default app lifetime will be used if not set for this queue. + long parentsDefaultAppLifetime = + getParent().getDefaultApplicationLifetime(); + + // Negative value indicates default lifetime was not set at this level. + // If default lifetime was not set at this level, calculate it based on + // parent's default lifetime or current queue's max lifetime. + if (defaultAppLifetime < 0) { + // If default lifetime was not set at this level but was set somewhere in + // the parent's hierarchy, set default lifetime to parent queue's default + // only if parent queue's lifetime is less than current queueu's max + // lifetime. Otherwise, use current queue's max lifetime value for its + // default lifetime. + if (defaultAppLifetimeWasSpecifiedInConfig) { + if (parentsDefaultAppLifetime <= myMaxAppLifetime) { + defaultAppLifetime = parentsDefaultAppLifetime; + } else { + defaultAppLifetime = myMaxAppLifetime; + } + } else { + // Default app lifetime value was not set anywhere in this queue's + // hierarchy. Use current queue's max lifetime as its default. + defaultAppLifetime = myMaxAppLifetime; + } + } // else if >= 0, default lifetime was set at this level. Just use it. 
+ return defaultAppLifetime; + } + /** * The specified queue is intra-queue preemptable if * 1) system-wide intra-queue preemption is turned on @@ -1259,4 +1344,16 @@ public abstract class AbstractCSQueue implements CSQueue { this.writeLock.unlock(); } } + + public long getMaximumApplicationLifetime() { + return maxApplicationLifetime; + } + + public long getDefaultApplicationLifetime() { + return defaultApplicationLifetime; + } + + public boolean getDefaultAppLifetimeWasSpecifiedInConfig() { + return defaultAppLifetimeWasSpecifiedInConfig; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueue.java index b6bf7436864..6a93d1d38e4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueue.java @@ -434,4 +434,26 @@ public interface CSQueue extends SchedulerQueue { * @return effective max queue capacity */ Resource getEffectiveMaxCapacityDown(String label, Resource factor); + + /** + * Get the maximum lifetime in seconds of an application which is submitted to + * this queue. Apps can set their own lifetime timeout up to this value. + * @return max lifetime in seconds + */ + long getMaximumApplicationLifetime(); + + /** + * Get the default lifetime in seconds of an application which is submitted to + * this queue. If an app doesn't specify its own timeout when submitted, this + * value will be used. 
+ * @return default app lifetime + */ + long getDefaultApplicationLifetime(); + + /** + * Get the indicator of whether or not the default application lifetime was + * set by a config property or was calculated by the capacity scheduler. + * @return indicator whether set or calculated + */ + boolean getDefaultAppLifetimeWasSpecifiedInConfig(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java index 81de14b1ee0..72dce8e4f2f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java @@ -44,7 +44,6 @@ import org.apache.hadoop.yarn.api.records.QueueInfo; import org.apache.hadoop.yarn.api.records.QueueState; import org.apache.hadoop.yarn.api.records.QueueUserACLInfo; import org.apache.hadoop.yarn.api.records.Resource; -import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; import org.apache.hadoop.yarn.factories.RecordFactory; import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.nodelabels.CommonNodeLabelsManager; @@ -132,10 +131,6 @@ public class LeafQueue extends AbstractCSQueue { List priorityAcls = new ArrayList(); - // -1 indicates lifetime is disabled - private volatile long maxApplicationLifetime = -1; - private volatile long defaultApplicationLifetime = -1; - @SuppressWarnings({ "unchecked", "rawtypes" }) public LeafQueue(CapacitySchedulerContext cs, String queueName, 
CSQueue parent, CSQueue old) throws IOException { @@ -261,19 +256,6 @@ public class LeafQueue extends AbstractCSQueue { defaultAppPriorityPerQueue = Priority.newInstance( conf.getDefaultApplicationPriorityConfPerQueue(getQueuePath())); - maxApplicationLifetime = - conf.getMaximumLifetimePerQueue((getQueuePath())); - defaultApplicationLifetime = - conf.getDefaultLifetimePerQueue((getQueuePath())); - if (maxApplicationLifetime > 0 && - defaultApplicationLifetime > maxApplicationLifetime) { - throw new YarnRuntimeException( - "Default lifetime" + defaultApplicationLifetime - + " can't exceed maximum lifetime " + maxApplicationLifetime); - } - defaultApplicationLifetime = defaultApplicationLifetime > 0 - ? defaultApplicationLifetime : maxApplicationLifetime; - // Validate leaf queue's user's weights. int queueUL = Math.min(100, conf.getUserLimit(getQueuePath())); for (Entry e : getUserWeights().entrySet()) { @@ -334,9 +316,9 @@ public class LeafQueue extends AbstractCSQueue { + reservationsContinueLooking + "\n" + "preemptionDisabled = " + getPreemptionDisabled() + "\n" + "defaultAppPriorityPerQueue = " + defaultAppPriorityPerQueue + "\npriority = " + priority - + "\nmaxLifetime = " + maxApplicationLifetime + " seconds" - + "\ndefaultLifetime = " - + defaultApplicationLifetime + " seconds"); + + "\nmaxLifetime = " + getMaximumApplicationLifetime() + + " seconds" + "\ndefaultLifetime = " + + getDefaultApplicationLifetime() + " seconds"); } finally { writeLock.unlock(); } @@ -2215,12 +2197,4 @@ public class LeafQueue extends AbstractCSQueue { this.userLimit = userLimit; } } - - public long getMaximumApplicationLifetime() { - return maxApplicationLifetime; - } - - public long getDefaultApplicationLifetime() { - return defaultApplicationLifetime; - } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/TestApplicationLifetimeMonitor.java 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/TestApplicationLifetimeMonitor.java index e1d87163b26..579674cf7fd 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/TestApplicationLifetimeMonitor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/TestApplicationLifetimeMonitor.java @@ -367,6 +367,189 @@ public class TestApplicationLifetimeMonitor { } } + static final String CQ1 = "child1"; + @Test(timeout = 120000) + public void testInheritAppLifetimeFromParentQueue() throws Exception { + YarnConfiguration yarnConf = conf; + long maxRootLifetime = 20L; + long defaultRootLifetime = 10L; + if (scheduler.equals(CapacityScheduler.class)) { + CapacitySchedulerConfiguration csConf = + new CapacitySchedulerConfiguration(); + csConf.setQueues(CapacitySchedulerConfiguration.ROOT, + new String[] {CQ1}); + csConf.setCapacity(CapacitySchedulerConfiguration.ROOT + "." + CQ1, 100); + csConf.setMaximumLifetimePerQueue( + CapacitySchedulerConfiguration.ROOT, maxRootLifetime); + csConf.setDefaultLifetimePerQueue( + CapacitySchedulerConfiguration.ROOT, defaultRootLifetime); + yarnConf = new YarnConfiguration(csConf); + } + + MockRM rm = null; + try { + rm = new MockRM(yarnConf); + rm.start(); + + Priority appPriority = Priority.newInstance(0); + MockNM nm1 = rm.registerNode("127.0.0.1:1234", 16 * 1024); + + // user not set lifetime, so queue max lifetime will be considered. 
+ RMApp app1 = rm.submitApp(200, CQ1, false, null, appPriority, null); + + nm1.nodeHeartbeat(true); + + if (scheduler.equals(CapacityScheduler.class)) { + // Supported only on capacity scheduler + CapacityScheduler csched = + (CapacityScheduler) rm.getResourceScheduler(); + + rm.waitForState(app1.getApplicationId(), RMAppState.KILLED); + long totalTimeRun = app1.getFinishTime() - app1.getSubmitTime(); + // Child queue should have inherited parent max and default lifetimes. + Assert.assertEquals("Child queue max lifetime should have overridden" + + " parent value", + maxRootLifetime, + csched.getQueue(CQ1).getMaximumApplicationLifetime()); + Assert.assertEquals("Child queue default lifetime should have" + + " overridden parent value", + defaultRootLifetime, + csched.getQueue(CQ1).getDefaultApplicationLifetime()); + // app1 (run in the 'child1' queue) should have run longer than the + // default lifetime but less than the max lifetime. + Assert.assertTrue("Application killed before default lifetime value", + totalTimeRun > (defaultRootLifetime * 1000)); + Assert.assertTrue( + "Application killed after max lifetime value " + totalTimeRun, + totalTimeRun < (maxRootLifetime * 1000)); + } + } finally { + stopRM(rm); + } + } + + @Test(timeout = 120000) + public void testOverrideParentQueueMaxAppLifetime() throws Exception { + YarnConfiguration yarnConf = conf; + long maxRootLifetime = 20L; + long maxChildLifetime = 40L; + long defaultRootLifetime = 10L; + if (scheduler.equals(CapacityScheduler.class)) { + CapacitySchedulerConfiguration csConf = + new CapacitySchedulerConfiguration(); + csConf.setQueues(CapacitySchedulerConfiguration.ROOT, + new String[] {CQ1}); + csConf.setCapacity(CapacitySchedulerConfiguration.ROOT + "." + CQ1, 100); + csConf.setMaximumLifetimePerQueue( + CapacitySchedulerConfiguration.ROOT, maxRootLifetime); + csConf.setMaximumLifetimePerQueue( + CapacitySchedulerConfiguration.ROOT + "." 
+ CQ1, maxChildLifetime); + csConf.setDefaultLifetimePerQueue( + CapacitySchedulerConfiguration.ROOT, defaultRootLifetime); + csConf.setDefaultLifetimePerQueue( + CapacitySchedulerConfiguration.ROOT + "." + CQ1, maxChildLifetime); + yarnConf = new YarnConfiguration(csConf); + } + + MockRM rm = null; + try { + rm = new MockRM(yarnConf); + rm.start(); + + Priority appPriority = Priority.newInstance(0); + MockNM nm1 = rm.registerNode("127.0.0.1:1234", 16 * 1024); + + // user not set lifetime, so queue max lifetime will be considered. + RMApp app1 = rm.submitApp(200, CQ1, false, null, appPriority, null); + + nm1.nodeHeartbeat(true); + + if (scheduler.equals(CapacityScheduler.class)) { + // Supported only on capacity scheduler + CapacityScheduler csched = + (CapacityScheduler) rm.getResourceScheduler(); + + rm.waitForState(app1.getApplicationId(), RMAppState.KILLED); + long totalTimeRun = app1.getFinishTime() - app1.getSubmitTime(); + // Child queue's max lifetime can override parent's and be larger. 
+ Assert.assertTrue("Application killed before default lifetime value", + (maxRootLifetime < maxChildLifetime) + && (totalTimeRun > (maxChildLifetime * 1000))); + Assert.assertEquals("Root queue max lifetime property set incorrectly", + maxRootLifetime, + csched.getRootQueue().getMaximumApplicationLifetime()); + Assert.assertEquals("Child queue max lifetime should have overridden" + + " parent value", maxChildLifetime, + csched.getQueue(CQ1).getMaximumApplicationLifetime()); + } + } finally { + stopRM(rm); + } + } + + @Test(timeout = 120000) + public void testOverrideParentQueueDefaultAppLifetime() throws Exception { + YarnConfiguration yarnConf = conf; + long maxRootLifetime = -1L; + long maxChildLifetime = -1L; + long defaultChildLifetime = 10L; + if (scheduler.equals(CapacityScheduler.class)) { + CapacitySchedulerConfiguration csConf = + new CapacitySchedulerConfiguration(); + csConf.setQueues(CapacitySchedulerConfiguration.ROOT, + new String[] {CQ1}); + csConf.setCapacity(CapacitySchedulerConfiguration.ROOT + "." + CQ1, 100); + csConf.setMaximumLifetimePerQueue( + CapacitySchedulerConfiguration.ROOT, maxRootLifetime); + csConf.setMaximumLifetimePerQueue( + CapacitySchedulerConfiguration.ROOT + "." + CQ1, maxChildLifetime); + csConf.setDefaultLifetimePerQueue( + CapacitySchedulerConfiguration.ROOT + "." + CQ1, + defaultChildLifetime); + yarnConf = new YarnConfiguration(csConf); + } + + MockRM rm = null; + try { + rm = new MockRM(yarnConf); + rm.start(); + + Priority appPriority = Priority.newInstance(0); + MockNM nm1 = rm.registerNode("127.0.0.1:1234", 16 * 1024); + + // user not set lifetime, so queue max lifetime will be considered. 
+ RMApp app1 = rm.submitApp(200, CQ1, false, null, appPriority, null); + + nm1.nodeHeartbeat(true); + + if (scheduler.equals(CapacityScheduler.class)) { + // Supported only on capacity scheduler + CapacityScheduler csched = + (CapacityScheduler) rm.getResourceScheduler(); + + rm.waitForState(app1.getApplicationId(), RMAppState.KILLED); + long totalTimeRun = app1.getFinishTime() - app1.getSubmitTime(); + // app1 (run in 'child1' queue) should have overridden the parent's + // default lifetime. + Assert.assertTrue("Application killed before default lifetime value", + totalTimeRun > (defaultChildLifetime * 1000)); + // Root and child queue's max lifetime should be -1. + Assert.assertEquals("Root queue max lifetime property set incorrectly", + maxRootLifetime, + csched.getRootQueue().getMaximumApplicationLifetime()); + Assert.assertEquals("Child queue max lifetime property set incorrectly", + maxChildLifetime, + csched.getQueue(CQ1).getMaximumApplicationLifetime()); + // 'child1' queue's default lifetime should have overridden parent's. 
+ Assert.assertEquals("Child queue default lifetime should have" + + " overridden parent value", defaultChildLifetime, + csched.getQueue(CQ1).getDefaultApplicationLifetime()); + } + } finally { + stopRM(rm); + } + } + private CapacitySchedulerConfiguration setUpCSQueue(long maxLifetime, long defaultLifetime) { CapacitySchedulerConfiguration csConf = diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/CapacityScheduler.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/CapacityScheduler.md index 33d2b13e06f..37eada9b0be 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/CapacityScheduler.md +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/CapacityScheduler.md @@ -196,8 +196,8 @@ Example: | Property | Description | |:---- |:---- | -| `yarn.scheduler.capacity..maximum-application-lifetime` | Maximum lifetime of an application which is submitted to a queue in seconds. Any value less than or equal to zero will be considered as disabled. This will be a hard time limit for all applications in this queue. If positive value is configured then any application submitted to this queue will be killed after exceeds the configured lifetime. User can also specify lifetime per application basis in application submission context. But user lifetime will be overridden if it exceeds queue maximum lifetime. It is point-in-time configuration. Note : Configuring too low value will result in killing application sooner. This feature is applicable only for leaf queue. | -| `yarn.scheduler.capacity.root..default-application-lifetime` | Default lifetime of an application which is submitted to a queue in seconds. Any value less than or equal to zero will be considered as disabled. If the user has not submitted application with lifetime value then this value will be taken. It is point-in-time configuration. Note : Default lifetime can't exceed maximum lifetime. 
This feature is applicable only for leaf queue.| +| `yarn.scheduler.capacity.<queue-path>.maximum-application-lifetime` | Maximum lifetime (in seconds) of an application which is submitted to a queue. Any value less than or equal to zero will be considered as disabled. The default is -1. If a positive value is configured, then any application submitted to this queue will be killed after it exceeds the configured lifetime. A user can also specify a lifetime per application in the application submission context. However, the user-specified lifetime will be overridden if it exceeds the queue's maximum lifetime. It is a point-in-time configuration. Note: This feature can be set at any level in the queue hierarchy. Child queues will inherit their parent's value unless overridden at the child level. A value of 0 means no max lifetime and will override a parent's max lifetime. If this property is not set or is set to a negative number, then this queue's max lifetime value will be inherited from its parent.| +| `yarn.scheduler.capacity.root.<queue-path>.default-application-lifetime` | Default lifetime (in seconds) of an application which is submitted to a queue. Any value less than or equal to zero will be considered as disabled. If the user has not submitted the application with a lifetime value, then this value will be used. It is a point-in-time configuration. This feature can be set at any level in the queue hierarchy. Child queues will inherit their parent's value unless overridden at the child level. If set to less than or equal to 0, the queue's max value must also be unlimited. Default lifetime can't exceed maximum lifetime. | ###Setup for application priority.