YARN-1273. Fixed Distributed-shell to account for containers that failed to start. Contributed by Hitesh Shah.

svn merge --ignore-ancestry -c 1529389 ../../trunk


git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1529390 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Vinod Kumar Vavilapalli 2013-10-05 06:10:00 +00:00
parent c91addb743
commit 5c246b999c
5 changed files with 149 additions and 15 deletions

View File

@ -159,6 +159,9 @@ Release 2.1.2 - UNRELEASED
YARN-1254. Fixed NodeManager to not pollute container's credentials. (Omkar YARN-1254. Fixed NodeManager to not pollute container's credentials. (Omkar
Vinit Joshi via vinodkv) Vinit Joshi via vinodkv)
YARN-1273. Fixed Distributed-shell to account for containers that failed
to start. (Hitesh Shah via vinodkv)
Release 2.1.1-beta - 2013-09-23 Release 2.1.1-beta - 2013-09-23
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES

View File

@ -34,6 +34,7 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser; import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.HelpFormatter;
@ -281,8 +282,8 @@ public class ApplicationMaster {
} }
} }
public ApplicationMaster() throws Exception { public ApplicationMaster() {
// Set up the configuration and RPC // Set up the configuration
conf = new YarnConfiguration(); conf = new YarnConfiguration();
} }
@ -470,7 +471,7 @@ public class ApplicationMaster {
amRMClient.init(conf); amRMClient.init(conf);
amRMClient.start(); amRMClient.start();
containerListener = new NMCallbackHandler(); containerListener = createNMCallbackHandler();
nmClientAsync = new NMClientAsyncImpl(containerListener); nmClientAsync = new NMClientAsyncImpl(containerListener);
nmClientAsync.init(conf); nmClientAsync.init(conf);
nmClientAsync.start(); nmClientAsync.start();
@ -500,7 +501,6 @@ public class ApplicationMaster {
containerMemory = maxMem; containerMemory = maxMem;
} }
// Setup ask for containers from RM // Setup ask for containers from RM
// Send request for containers to RM // Send request for containers to RM
// Until we get our fully allocated quota, we keep on polling RM for // Until we get our fully allocated quota, we keep on polling RM for
@ -513,7 +513,8 @@ public class ApplicationMaster {
} }
numRequestedContainers.set(numTotalContainers); numRequestedContainers.set(numTotalContainers);
while (!done) { while (!done
&& (numCompletedContainers.get() != numTotalContainers)) {
try { try {
Thread.sleep(200); Thread.sleep(200);
} catch (InterruptedException ex) {} } catch (InterruptedException ex) {}
@ -523,6 +524,11 @@ public class ApplicationMaster {
return success; return success;
} }
@VisibleForTesting
NMCallbackHandler createNMCallbackHandler() {
return new NMCallbackHandler(this);
}
private void finish() { private void finish() {
// Join all launched threads // Join all launched threads
// needed for when we time out // needed for when we time out
@ -566,7 +572,6 @@ public class ApplicationMaster {
LOG.error("Failed to unregister application", e); LOG.error("Failed to unregister application", e);
} }
done = true;
amRMClient.stop(); amRMClient.stop();
} }
@ -679,10 +684,17 @@ public class ApplicationMaster {
} }
} }
private class NMCallbackHandler implements NMClientAsync.CallbackHandler { @VisibleForTesting
static class NMCallbackHandler
implements NMClientAsync.CallbackHandler {
private ConcurrentMap<ContainerId, Container> containers = private ConcurrentMap<ContainerId, Container> containers =
new ConcurrentHashMap<ContainerId, Container>(); new ConcurrentHashMap<ContainerId, Container>();
private final ApplicationMaster applicationMaster;
public NMCallbackHandler(ApplicationMaster applicationMaster) {
this.applicationMaster = applicationMaster;
}
public void addContainer(ContainerId containerId, Container container) { public void addContainer(ContainerId containerId, Container container) {
containers.putIfAbsent(containerId, container); containers.putIfAbsent(containerId, container);
@ -713,7 +725,7 @@ public class ApplicationMaster {
} }
Container container = containers.get(containerId); Container container = containers.get(containerId);
if (container != null) { if (container != null) {
nmClientAsync.getContainerStatusAsync(containerId, container.getNodeId()); applicationMaster.nmClientAsync.getContainerStatusAsync(containerId, container.getNodeId());
} }
} }
@ -721,6 +733,8 @@ public class ApplicationMaster {
public void onStartContainerError(ContainerId containerId, Throwable t) { public void onStartContainerError(ContainerId containerId, Throwable t) {
LOG.error("Failed to start Container " + containerId); LOG.error("Failed to start Container " + containerId);
containers.remove(containerId); containers.remove(containerId);
applicationMaster.numCompletedContainers.incrementAndGet();
applicationMaster.numFailedContainers.incrementAndGet();
} }
@Override @Override
@ -847,7 +861,6 @@ public class ApplicationMaster {
/** /**
* Setup the request that will be sent to the RM for the container ask. * Setup the request that will be sent to the RM for the container ask.
* *
* @param numContainers Containers to ask for from RM
* @return the setup ResourceRequest to be sent to RM * @return the setup ResourceRequest to be sent to RM
*/ */
private ContainerRequest setupContainerAskForRM() { private ContainerRequest setupContainerAskForRM() {

View File

@ -125,8 +125,7 @@ public class Client {
// Application master jar file // Application master jar file
private String appMasterJar = ""; private String appMasterJar = "";
// Main class to invoke application master // Main class to invoke application master
private final String appMasterMainClass = private final String appMasterMainClass;
"org.apache.hadoop.yarn.applications.distributedshell.ApplicationMaster";
// Shell command to be executed // Shell command to be executed
private String shellCommand = ""; private String shellCommand = "";
@ -193,8 +192,14 @@ public class Client {
/** /**
*/ */
public Client(Configuration conf) throws Exception { public Client(Configuration conf) throws Exception {
this(
"org.apache.hadoop.yarn.applications.distributedshell.ApplicationMaster",
conf);
}
Client(String appMasterMainClass, Configuration conf) {
this.conf = conf; this.conf = conf;
this.appMasterMainClass = appMasterMainClass;
yarnClient = YarnClient.createYarnClient(); yarnClient = YarnClient.createYarnClient();
yarnClient.init(conf); yarnClient.init(conf);
opts = new Options(); opts = new Options();
@ -214,6 +219,7 @@ public class Client {
opts.addOption("log_properties", true, "log4j.properties file"); opts.addOption("log_properties", true, "log4j.properties file");
opts.addOption("debug", false, "Dump out debug information"); opts.addOption("debug", false, "Dump out debug information");
opts.addOption("help", false, "Print usage"); opts.addOption("help", false, "Print usage");
} }
/** /**

View File

@ -0,0 +1,83 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.applications.distributedshell;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.yarn.api.records.ContainerId;
import java.nio.ByteBuffer;
import java.util.Map;
public class ContainerLaunchFailAppMaster extends ApplicationMaster {
private static final Log LOG =
LogFactory.getLog(ContainerLaunchFailAppMaster.class);
public ContainerLaunchFailAppMaster() {
super();
}
@Override
NMCallbackHandler createNMCallbackHandler() {
return new FailContainerLaunchNMCallbackHandler(this);
}
class FailContainerLaunchNMCallbackHandler
extends ApplicationMaster.NMCallbackHandler {
public FailContainerLaunchNMCallbackHandler(
ApplicationMaster applicationMaster) {
super(applicationMaster);
}
@Override
public void onContainerStarted(ContainerId containerId,
Map<String, ByteBuffer> allServiceResponse) {
super.onStartContainerError(containerId,
new RuntimeException("Inject Container Launch failure"));
}
}
public static void main(String[] args) {
boolean result = false;
try {
ContainerLaunchFailAppMaster appMaster =
new ContainerLaunchFailAppMaster();
LOG.info("Initializing ApplicationMaster");
boolean doRun = appMaster.init(args);
if (!doRun) {
System.exit(0);
}
result = appMaster.run();
} catch (Throwable t) {
LOG.fatal("Error running ApplicationMaster", t);
System.exit(1);
}
if (result) {
LOG.info("Application Master completed successfully. exiting");
System.exit(0);
} else {
LOG.info("Application Master failed. exiting");
System.exit(2);
}
}
}

View File

@ -59,7 +59,7 @@ public class TestDistributedShell {
protected static String APPMASTER_JAR = JarFinder.getJar(ApplicationMaster.class); protected static String APPMASTER_JAR = JarFinder.getJar(ApplicationMaster.class);
@BeforeClass @BeforeClass
public static void setup() throws InterruptedException, Exception { public static void setup() throws Exception {
LOG.info("Starting up YARN cluster"); LOG.info("Starting up YARN cluster");
conf.setInt(YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, 128); conf.setInt(YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, 128);
conf.setClass(YarnConfiguration.RM_SCHEDULER, conf.setClass(YarnConfiguration.RM_SCHEDULER,
@ -135,7 +135,7 @@ public class TestDistributedShell {
} catch (Exception e) { } catch (Exception e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
}; }
}; };
t.start(); t.start();
@ -248,5 +248,34 @@ public class TestDistributedShell {
Thread.sleep(2000); Thread.sleep(2000);
} }
} }
@Test(timeout=90000)
public void testContainerLaunchFailureHandling() throws Exception {
String[] args = {
"--jar",
APPMASTER_JAR,
"--num_containers",
"2",
"--shell_command",
Shell.WINDOWS ? "dir" : "ls",
"--master_memory",
"512",
"--container_memory",
"128"
};
LOG.info("Initializing DS Client");
Client client = new Client(ContainerLaunchFailAppMaster.class.getName(),
new Configuration(yarnCluster.getConfig()));
boolean initSuccess = client.init(args);
Assert.assertTrue(initSuccess);
LOG.info("Running DS Client");
boolean result = client.run();
LOG.info("Client run completed. Result=" + result);
Assert.assertFalse(result);
}
} }