YARN-8769. [Submarine] Allow user to specify customized quicklink(s) when submit Submarine job. Contributed by Wangda Tan.

This commit is contained in:
Sunil G 2018-09-21 23:39:22 +05:30
parent a2752779ac
commit 0cd6346102
7 changed files with 303 additions and 35 deletions

View File

@ -49,6 +49,7 @@ public class CliConstants {
public static final String WAIT_JOB_FINISH = "wait_job_finish";
public static final String PS_DOCKER_IMAGE = "ps_docker_image";
public static final String WORKER_DOCKER_IMAGE = "worker_docker_image";
public static final String QUICKLINK = "quicklink";
public static final String TENSORBOARD_DOCKER_IMAGE =
"tensorboard_docker_image";
}

View File

@ -117,6 +117,14 @@ private Options generateOptions() {
options.addOption(CliConstants.WORKER_DOCKER_IMAGE, true,
"Specify docker image for WORKER, when this is not specified, WORKER "
+ "uses --" + CliConstants.DOCKER_IMAGE + " as default.");
options.addOption(CliConstants.QUICKLINK, true, "Specify quicklink so YARN"
+ "web UI shows link to given role instance and port. When "
+ "--tensorboard is speciied, quicklink to tensorboard instance will "
+ "be added automatically. The format of quick link is: "
+ "Quick_link_label=http(or https)://role-name:port. For example, "
+ "if want to link to first worker's 7070 port, and text of quicklink "
+ "is Notebook_UI, user need to specify --quicklink "
+ "Notebook_UI=https://master-0:7070");
options.addOption("h", "help", false, "Print help");
return options;
}

View File

@ -0,0 +1,71 @@
/**
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. See accompanying LICENSE file.
*/
package org.apache.hadoop.yarn.submarine.client.cli.param;
import org.apache.commons.cli.ParseException;
/**
* A class represents quick links to a web page.
*/
public class Quicklink {
private String label;
private String componentInstanceName;
private String protocol;
private int port;
public void parse(String quicklinkStr) throws ParseException {
if (!quicklinkStr.contains("=")) {
throw new ParseException("Should be <label>=<link> format for quicklink");
}
int index = quicklinkStr.indexOf("=");
label = quicklinkStr.substring(0, index);
quicklinkStr = quicklinkStr.substring(index + 1);
if (quicklinkStr.startsWith("http://")) {
protocol = "http://";
} else if (quicklinkStr.startsWith("https://")) {
protocol = "https://";
} else {
throw new ParseException("Quicklink should start with http or https");
}
quicklinkStr = quicklinkStr.substring(protocol.length());
index = quicklinkStr.indexOf(":");
if (index == -1) {
throw new ParseException("Quicklink should be componet-id:port form");
}
componentInstanceName = quicklinkStr.substring(0, index);
port = Integer.parseInt(quicklinkStr.substring(index + 1));
}
public String getLabel() {
return label;
}
public String getComponentInstanceName() {
return componentInstanceName;
}
public String getProtocol() {
return protocol;
}
public int getPort() {
return port;
}
}

View File

@ -24,6 +24,8 @@
import org.apache.hadoop.yarn.submarine.common.ClientContext;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* Parameters used to run a job
@ -41,6 +43,7 @@ public class RunJobParameters extends RunParameters {
private String tensorboardDockerImage;
private String workerLaunchCmd;
private String psLaunchCmd;
private List<Quicklink> quicklinks = new ArrayList<>();
private String psDockerImage = null;
private String workerDockerImage = null;
@ -119,6 +122,17 @@ public void updateParametersByParsedCommandline(CommandLine parsedCommandLine,
this.waitJobFinish = true;
}
// Quicklinks
String[] quicklinkStrs = parsedCommandLine.getOptionValues(
CliConstants.QUICKLINK);
if (quicklinkStrs != null) {
for (String ql : quicklinkStrs) {
Quicklink quicklink = new Quicklink();
quicklink.parse(ql);
quicklinks.add(quicklink);
}
}
psDockerImage = parsedCommandLine.getOptionValue(
CliConstants.PS_DOCKER_IMAGE);
workerDockerImage = parsedCommandLine.getOptionValue(
@ -247,4 +261,8 @@ public void setTensorboardResource(Resource tensorboardResource) {
public String getTensorboardDockerImage() {
return tensorboardDockerImage;
}
public List<Quicklink> getQuicklinks() {
return quicklinks;
}
}

View File

@ -15,7 +15,6 @@
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@ -29,6 +28,7 @@
import org.apache.hadoop.yarn.service.api.records.ResourceInformation;
import org.apache.hadoop.yarn.service.api.records.Service;
import org.apache.hadoop.yarn.service.client.ServiceClient;
import org.apache.hadoop.yarn.submarine.client.cli.param.Quicklink;
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
import org.apache.hadoop.yarn.submarine.common.ClientContext;
import org.apache.hadoop.yarn.submarine.common.Envs;
@ -40,10 +40,14 @@
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
@ -54,6 +58,7 @@
* Submit a job to cluster
*/
public class YarnServiceJobSubmitter implements JobSubmitter {
public static final String TENSORBOARD_QUICKLINK_LABEL = "Tensorboard";
private static final Logger LOG =
LoggerFactory.getLogger(YarnServiceJobSubmitter.class);
ClientContext clientContext;
@ -98,7 +103,7 @@ private String getValueOfEnvionment(String envar) {
}
private void addHdfsClassPathIfNeeded(RunJobParameters parameters,
FileWriter fw, Component comp) throws IOException {
PrintWriter fw, Component comp) throws IOException {
// Find envs to use HDFS
String hdfsHome = null;
String javaHome = null;
@ -191,7 +196,8 @@ private void addCommonEnvironments(Component component, TaskType taskType) {
envs.put(Envs.TASK_TYPE_ENV, taskType.name());
}
private String getUserName() {
@VisibleForTesting
protected String getUserName() {
return System.getProperty("user.name");
}
@ -205,18 +211,19 @@ private String getDNSDomain() {
private String generateCommandLaunchScript(RunJobParameters parameters,
TaskType taskType, Component comp) throws IOException {
File file = File.createTempFile(taskType.name() + "-launch-script", ".sh");
FileWriter fw = new FileWriter(file);
Writer w = new OutputStreamWriter(new FileOutputStream(file), "UTF-8");
PrintWriter pw = new PrintWriter(w);
try {
fw.append("#!/bin/bash\n");
pw.append("#!/bin/bash\n");
addHdfsClassPathIfNeeded(parameters, fw, comp);
addHdfsClassPathIfNeeded(parameters, pw, comp);
if (taskType.equals(TaskType.TENSORBOARD)) {
String tbCommand =
"export LC_ALL=C && tensorboard --logdir=" + parameters
.getCheckpointPath();
fw.append(tbCommand + "\n");
pw.append(tbCommand + "\n");
LOG.info("Tensorboard command=" + tbCommand);
} else{
// When distributed training is required
@ -226,20 +233,20 @@ private String generateCommandLaunchScript(RunJobParameters parameters,
taskType.getComponentName(), parameters.getNumWorkers(),
parameters.getNumPS(), parameters.getName(), getUserName(),
getDNSDomain());
fw.append("export TF_CONFIG=\"" + tfConfigEnv + "\"\n");
pw.append("export TF_CONFIG=\"" + tfConfigEnv + "\"\n");
}
// Print launch command
if (taskType.equals(TaskType.WORKER) || taskType.equals(
TaskType.PRIMARY_WORKER)) {
fw.append(parameters.getWorkerLaunchCmd() + '\n');
pw.append(parameters.getWorkerLaunchCmd() + '\n');
if (SubmarineLogs.isVerbose()) {
LOG.info(
"Worker command =[" + parameters.getWorkerLaunchCmd() + "]");
}
} else if (taskType.equals(TaskType.PS)) {
fw.append(parameters.getPSLaunchCmd() + '\n');
pw.append(parameters.getPSLaunchCmd() + '\n');
if (SubmarineLogs.isVerbose()) {
LOG.info("PS command =[" + parameters.getPSLaunchCmd() + "]");
@ -247,7 +254,7 @@ private String generateCommandLaunchScript(RunJobParameters parameters,
}
}
} finally {
fw.close();
pw.close();
}
return file.getAbsolutePath();
}
@ -421,18 +428,51 @@ private Artifact getDockerArtifact(String dockerImageName) {
return new Artifact().type(Artifact.TypeEnum.DOCKER).id(dockerImageName);
}
private void handleQuicklinks(RunJobParameters runJobParameters)
throws IOException {
List<Quicklink> quicklinks = runJobParameters.getQuicklinks();
if (null != quicklinks && !quicklinks.isEmpty()) {
for (Quicklink ql : quicklinks) {
// Make sure it is a valid instance name
String instanceName = ql.getComponentInstanceName();
boolean found = false;
for (Component comp : serviceSpec.getComponents()) {
for (int i = 0; i < comp.getNumberOfContainers(); i++) {
String possibleInstanceName = comp.getName() + "-" + i;
if (possibleInstanceName.equals(instanceName)) {
found = true;
break;
}
}
}
if (!found) {
throw new IOException(
"Couldn't find a component instance = " + instanceName
+ " while adding quicklink");
}
String link = ql.getProtocol() + YarnServiceUtils.getDNSName(
serviceSpec.getName(), instanceName, getUserName(), getDNSDomain(),
ql.getPort());
YarnServiceUtils.addQuicklink(serviceSpec, ql.getLabel(), link);
}
}
}
private Service createServiceByParameters(RunJobParameters parameters)
throws IOException {
componentToLocalLaunchScriptPath.clear();
Service service = new Service();
service.setName(parameters.getName());
service.setVersion(String.valueOf(System.currentTimeMillis()));
service.setArtifact(getDockerArtifact(parameters.getDockerImageName()));
serviceSpec = new Service();
serviceSpec.setName(parameters.getName());
serviceSpec.setVersion(String.valueOf(System.currentTimeMillis()));
serviceSpec.setArtifact(getDockerArtifact(parameters.getDockerImageName()));
handleServiceEnvs(service, parameters);
handleServiceEnvs(serviceSpec, parameters);
if (parameters.getNumWorkers() > 0) {
addWorkerComponents(service, parameters);
addWorkerComponents(serviceSpec, parameters);
}
if (parameters.getNumPS() > 0) {
@ -450,7 +490,7 @@ private Service createServiceByParameters(RunJobParameters parameters)
getDockerArtifact(parameters.getPsDockerImage()));
}
handleLaunchCommand(parameters, TaskType.PS, psComponent);
service.addComponent(psComponent);
serviceSpec.addComponent(psComponent);
}
if (parameters.isTensorboardEnabled()) {
@ -470,14 +510,20 @@ private Service createServiceByParameters(RunJobParameters parameters)
// Add tensorboard to quicklink
String tensorboardLink = "http://" + YarnServiceUtils.getDNSName(
parameters.getName(), TaskType.TENSORBOARD.getComponentName(), 0,
getUserName(), getDNSDomain(), 6006);
parameters.getName(),
TaskType.TENSORBOARD.getComponentName() + "-" + 0, getUserName(),
getDNSDomain(), 6006);
LOG.info("Link to tensorboard:" + tensorboardLink);
service.addComponent(tbComponent);
service.setQuicklinks(ImmutableMap.of("Tensorboard", tensorboardLink));
serviceSpec.addComponent(tbComponent);
YarnServiceUtils.addQuicklink(serviceSpec, TENSORBOARD_QUICKLINK_LABEL,
tensorboardLink);
}
return service;
// After all components added, handle quicklinks
handleQuicklinks(parameters);
return serviceSpec;
}
/**
@ -486,12 +532,11 @@ private Service createServiceByParameters(RunJobParameters parameters)
@Override
public ApplicationId submitJob(RunJobParameters parameters)
throws IOException, YarnException {
Service service = createServiceByParameters(parameters);
createServiceByParameters(parameters);
ServiceClient serviceClient = YarnServiceUtils.createServiceClient(
clientContext.getYarnConfig());
ApplicationId appid = serviceClient.actionCreate(service);
ApplicationId appid = serviceClient.actionCreate(serviceSpec);
serviceClient.stop();
this.serviceSpec = service;
return appid;
}

View File

@ -16,10 +16,20 @@
import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.service.api.records.Service;
import org.apache.hadoop.yarn.service.client.ServiceClient;
import org.apache.hadoop.yarn.submarine.common.Envs;
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashMap;
import java.util.Map;
public class YarnServiceUtils {
private static final Logger LOG =
LoggerFactory.getLogger(YarnServiceUtils.class);
// This will be true only in UT.
private static ServiceClient stubServiceClient = null;
@ -40,10 +50,10 @@ public static void setStubServiceClient(ServiceClient stubServiceClient) {
YarnServiceUtils.stubServiceClient = stubServiceClient;
}
public static String getDNSName(String serviceName, String componentName,
int index, String userName, String domain, int port) {
return componentName + "-" + index + getDNSNameCommonSuffix(serviceName,
userName, domain, port);
public static String getDNSName(String serviceName,
String componentInstanceName, String userName, String domain, int port) {
return componentInstanceName + getDNSNameCommonSuffix(serviceName, userName,
domain, port);
}
private static String getDNSNameCommonSuffix(String serviceName,
@ -66,12 +76,18 @@ public static String getTFConfigEnv(String curCommponentName, int nWorkers,
commonEndpointSuffix) + ",";
String ps = getComponentArrayJson("ps", nPs, commonEndpointSuffix) + "},";
String task =
"\\\"task\\\":{" + " \\\"type\\\":\\\"" + curCommponentName + "\\\","
+ " \\\"index\\\":" + '$' + Envs.TASK_INDEX_ENV + "},";
StringBuilder sb = new StringBuilder();
sb.append("\\\"task\\\":{");
sb.append(" \\\"type\\\":\\\"");
sb.append(curCommponentName);
sb.append("\\\",");
sb.append(" \\\"index\\\":");
sb.append('$');
sb.append(Envs.TASK_INDEX_ENV + "},");
String task = sb.toString();
String environment = "\\\"environment\\\":\\\"cloud\\\"}";
StringBuilder sb = new StringBuilder();
sb = new StringBuilder();
sb.append(json);
sb.append(master);
sb.append(worker);
@ -81,6 +97,21 @@ public static String getTFConfigEnv(String curCommponentName, int nWorkers,
return sb.toString();
}
public static void addQuicklink(Service serviceSpec, String label,
String link) {
Map<String, String> quicklinks = serviceSpec.getQuicklinks();
if (null == quicklinks) {
quicklinks = new HashMap<>();
serviceSpec.setQuicklinks(quicklinks);
}
if (SubmarineLogs.isVerbose()) {
LOG.info("Added quicklink, " + label + "=" + link);
}
quicklinks.put(label, link);
}
private static String getComponentArrayJson(String componentName, int count,
String endpointSuffix) {
String component = "\\\"" + componentName + "\\\":";

View File

@ -18,6 +18,7 @@
package org.apache.hadoop.yarn.submarine.client.cli.yarnservice;
import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.exceptions.YarnException;
@ -100,6 +101,32 @@ private void commonVerifyDistributedTrainingSpec(Service serviceSpec)
Assert.assertTrue(SubmarineLogs.isVerbose());
}
private void verifyQuicklink(Service serviceSpec,
Map<String, String> expectedQuicklinks) {
Map<String, String> actualQuicklinks = serviceSpec.getQuicklinks();
if (actualQuicklinks == null || actualQuicklinks.isEmpty()) {
Assert.assertTrue(
expectedQuicklinks == null || expectedQuicklinks.isEmpty());
return;
}
Assert.assertEquals(expectedQuicklinks.size(), actualQuicklinks.size());
for (Map.Entry<String, String> expectedEntry : expectedQuicklinks
.entrySet()) {
Assert.assertTrue(actualQuicklinks.containsKey(expectedEntry.getKey()));
// $USER could be changed in different environment. so replace $USER by
// "user"
String expectedValue = expectedEntry.getValue();
String actualValue = actualQuicklinks.get(expectedEntry.getKey());
String userName = System.getProperty("user.name");
actualValue = actualValue.replaceAll(userName, "username");
Assert.assertEquals(expectedValue, actualValue);
}
}
@Test
public void testBasicRunJobForDistributedTraining() throws Exception {
MockClientContext mockClientContext =
@ -120,6 +147,8 @@ public void testBasicRunJobForDistributedTraining() throws Exception {
Assert.assertEquals(3, serviceSpec.getComponents().size());
commonVerifyDistributedTrainingSpec(serviceSpec);
verifyQuicklink(serviceSpec, null);
}
@Test
@ -147,6 +176,10 @@ public void testBasicRunJobForDistributedTrainingWithTensorboard()
verifyTensorboardComponent(runJobCli, serviceSpec,
Resources.createResource(4096, 1));
verifyQuicklink(serviceSpec, ImmutableMap
.of(YarnServiceJobSubmitter.TENSORBOARD_QUICKLINK_LABEL,
"http://tensorboard-0.my-job.username.null:6006"));
}
@Test
@ -232,6 +265,9 @@ public void testTensorboardOnlyServiceWithCustomizedDockerImageAndResource()
verifyTensorboardComponent(runJobCli, serviceSpec,
Resources.createResource(2048, 2));
verifyQuicklink(serviceSpec, ImmutableMap
.of(YarnServiceJobSubmitter.TENSORBOARD_QUICKLINK_LABEL,
"http://tensorboard-0.my-job.username.null:6006"));
}
private void commonTestSingleNodeTraining(Service serviceSpec)
@ -372,4 +408,62 @@ public void testParameterStorageForTrainingJob() throws Exception {
Assert.assertEquals(jobInfo.get(StorageKeyConstants.INPUT_PATH),
"s3://input");
}
@Test
public void testAddQuicklinksWithoutTensorboard() throws Exception {
MockClientContext mockClientContext =
YarnServiceCliTestUtils.getMockClientContext();
RunJobCli runJobCli = new RunJobCli(mockClientContext);
Assert.assertFalse(SubmarineLogs.isVerbose());
runJobCli.run(
new String[] { "--name", "my-job", "--docker_image", "tf-docker:1.1.0",
"--input_path", "s3://input", "--checkpoint_path", "s3://output",
"--num_workers", "3", "--num_ps", "2", "--worker_launch_cmd",
"python run-job.py", "--worker_resources", "memory=2048M,vcores=2",
"--ps_resources", "memory=4096M,vcores=4", "--ps_docker_image",
"ps.image", "--worker_docker_image", "worker.image",
"--ps_launch_cmd", "python run-ps.py", "--verbose", "--quicklink",
"AAA=http://master-0:8321", "--quicklink",
"BBB=http://worker-0:1234" });
Service serviceSpec = getServiceSpecFromJobSubmitter(
runJobCli.getJobSubmitter());
Assert.assertEquals(3, serviceSpec.getComponents().size());
commonVerifyDistributedTrainingSpec(serviceSpec);
verifyQuicklink(serviceSpec, ImmutableMap
.of("AAA", "http://master-0.my-job.username.null:8321", "BBB",
"http://worker-0.my-job.username.null:1234"));
}
@Test
public void testAddQuicklinksWithTensorboard() throws Exception {
MockClientContext mockClientContext =
YarnServiceCliTestUtils.getMockClientContext();
RunJobCli runJobCli = new RunJobCli(mockClientContext);
Assert.assertFalse(SubmarineLogs.isVerbose());
runJobCli.run(
new String[] { "--name", "my-job", "--docker_image", "tf-docker:1.1.0",
"--input_path", "s3://input", "--checkpoint_path", "s3://output",
"--num_workers", "3", "--num_ps", "2", "--worker_launch_cmd",
"python run-job.py", "--worker_resources", "memory=2048M,vcores=2",
"--ps_resources", "memory=4096M,vcores=4", "--ps_docker_image",
"ps.image", "--worker_docker_image", "worker.image",
"--ps_launch_cmd", "python run-ps.py", "--verbose", "--quicklink",
"AAA=http://master-0:8321", "--quicklink",
"BBB=http://worker-0:1234", "--tensorboard" });
Service serviceSpec = getServiceSpecFromJobSubmitter(
runJobCli.getJobSubmitter());
Assert.assertEquals(4, serviceSpec.getComponents().size());
commonVerifyDistributedTrainingSpec(serviceSpec);
verifyQuicklink(serviceSpec, ImmutableMap
.of("AAA", "http://master-0.my-job.username.null:8321", "BBB",
"http://worker-0.my-job.username.null:1234",
YarnServiceJobSubmitter.TENSORBOARD_QUICKLINK_LABEL,
"http://tensorboard-0.my-job.username.null:6006"));
}
}