SUBMARINE-54. Add test coverage for YarnServiceJobSubmitter and make it ready for extension for PyTorch. Contributed by Szilard Nemeth.
This commit is contained in:
parent
afe6613ee6
commit
0b3d41bdee
|
@ -293,10 +293,18 @@ public class RunJobParameters extends RunParameters {
|
|||
return psDockerImage;
|
||||
}
|
||||
|
||||
public void setPsDockerImage(String psDockerImage) {
|
||||
this.psDockerImage = psDockerImage;
|
||||
}
|
||||
|
||||
public String getWorkerDockerImage() {
|
||||
return workerDockerImage;
|
||||
}
|
||||
|
||||
public void setWorkerDockerImage(String workerDockerImage) {
|
||||
this.workerDockerImage = workerDockerImage;
|
||||
}
|
||||
|
||||
public boolean isDistributed() {
|
||||
return distributed;
|
||||
}
|
||||
|
@ -313,6 +321,10 @@ public class RunJobParameters extends RunParameters {
|
|||
return tensorboardDockerImage;
|
||||
}
|
||||
|
||||
public void setTensorboardDockerImage(String tensorboardDockerImage) {
|
||||
this.tensorboardDockerImage = tensorboardDockerImage;
|
||||
}
|
||||
|
||||
public List<Quicklink> getQuicklinks() {
|
||||
return quicklinks;
|
||||
}
|
||||
|
@ -366,6 +378,10 @@ public class RunJobParameters extends RunParameters {
|
|||
return this;
|
||||
}
|
||||
|
||||
public void setDistributed(boolean distributed) {
|
||||
this.distributed = distributed;
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public static class UnderscoreConverterPropertyUtils extends PropertyUtils {
|
||||
@Override
|
||||
|
|
|
@ -177,6 +177,25 @@ public class TestRunJobCliParsing {
|
|||
Assert.assertTrue(success);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testJobWithoutName() throws Exception {
|
||||
RunJobCli runJobCli = new RunJobCli(getMockClientContext());
|
||||
String expectedErrorMessage =
|
||||
"--" + CliConstants.NAME + " is absent";
|
||||
String actualMessage = "";
|
||||
try {
|
||||
runJobCli.run(
|
||||
new String[]{"--docker_image", "tf-docker:1.1.0",
|
||||
"--num_workers", "0", "--tensorboard", "--verbose",
|
||||
"--tensorboard_resources", "memory=2G,vcores=2",
|
||||
"--tensorboard_docker_image", "tb_docker_image:001"});
|
||||
} catch (ParseException e) {
|
||||
actualMessage = e.getMessage();
|
||||
e.printStackTrace();
|
||||
}
|
||||
assertEquals(expectedErrorMessage, actualMessage);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLaunchCommandPatternReplace() throws Exception {
|
||||
RunJobCli runJobCli = new RunJobCli(getMockClientContext());
|
||||
|
|
|
@ -26,6 +26,7 @@ import org.apache.hadoop.fs.Path;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
|
||||
public class MockRemoteDirectoryManager implements RemoteDirectoryManager {
|
||||
private File jobsParentDir = null;
|
||||
|
@ -35,6 +36,7 @@ public class MockRemoteDirectoryManager implements RemoteDirectoryManager {
|
|||
@Override
|
||||
public Path getJobStagingArea(String jobName, boolean create)
|
||||
throws IOException {
|
||||
Objects.requireNonNull(jobName, "Job name must not be null!");
|
||||
if (jobsParentDir == null && create) {
|
||||
jobsParentDir = new File(
|
||||
"target/_staging_area_" + System.currentTimeMillis());
|
||||
|
|
|
@ -115,6 +115,12 @@
|
|||
<artifactId>hadoop-yarn-services-core</artifactId>
|
||||
<version>3.3.0-SNAPSHOT</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-yarn-common</artifactId>
|
||||
<type>test-jar</type>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
|
|
|
@ -0,0 +1,99 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
|
||||
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.AbstractLaunchCommand;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.LaunchCommandFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
|
||||
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.getScriptFileName;
|
||||
|
||||
/**
|
||||
* Abstract base class for Component classes.
|
||||
* The implementations of this class are act like factories for
|
||||
* {@link Component} instances.
|
||||
* All dependencies are passed to the constructor so that child classes
|
||||
* are obliged to provide matching constructors.
|
||||
*/
|
||||
public abstract class AbstractComponent {
|
||||
private final FileSystemOperations fsOperations;
|
||||
protected final RunJobParameters parameters;
|
||||
protected final TaskType taskType;
|
||||
private final RemoteDirectoryManager remoteDirectoryManager;
|
||||
protected final Configuration yarnConfig;
|
||||
private final LaunchCommandFactory launchCommandFactory;
|
||||
|
||||
/**
|
||||
* This is only required for testing.
|
||||
*/
|
||||
private String localScriptFile;
|
||||
|
||||
public AbstractComponent(FileSystemOperations fsOperations,
|
||||
RemoteDirectoryManager remoteDirectoryManager,
|
||||
RunJobParameters parameters, TaskType taskType,
|
||||
Configuration yarnConfig,
|
||||
LaunchCommandFactory launchCommandFactory) {
|
||||
this.fsOperations = fsOperations;
|
||||
this.remoteDirectoryManager = remoteDirectoryManager;
|
||||
this.parameters = parameters;
|
||||
this.taskType = taskType;
|
||||
this.launchCommandFactory = launchCommandFactory;
|
||||
this.yarnConfig = yarnConfig;
|
||||
}
|
||||
|
||||
protected abstract Component createComponent() throws IOException;
|
||||
|
||||
/**
|
||||
* Generates a command launch script on local disk,
|
||||
* returns path to the script.
|
||||
*/
|
||||
protected void generateLaunchCommand(Component component)
|
||||
throws IOException {
|
||||
AbstractLaunchCommand launchCommand =
|
||||
launchCommandFactory.createLaunchCommand(taskType, component);
|
||||
this.localScriptFile = launchCommand.generateLaunchScript();
|
||||
|
||||
String remoteLaunchCommand = uploadLaunchCommand(component);
|
||||
component.setLaunchCommand(remoteLaunchCommand);
|
||||
}
|
||||
|
||||
private String uploadLaunchCommand(Component component)
|
||||
throws IOException {
|
||||
Objects.requireNonNull(localScriptFile, "localScriptFile should be " +
|
||||
"set before calling this method!");
|
||||
Path stagingDir =
|
||||
remoteDirectoryManager.getJobStagingArea(parameters.getName(), true);
|
||||
|
||||
String destScriptFileName = getScriptFileName(taskType);
|
||||
fsOperations.uploadToRemoteFileAndLocalizeToContainerWorkDir(stagingDir,
|
||||
localScriptFile, destScriptFileName, component);
|
||||
|
||||
return "./" + destScriptFileName;
|
||||
}
|
||||
|
||||
String getLocalScriptFile() {
|
||||
return localScriptFile;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,201 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.FileUtil;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.permission.FsPermission;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.service.api.records.ConfigFile;
|
||||
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
||||
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineConfiguration;
|
||||
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
|
||||
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
|
||||
import org.apache.hadoop.yarn.submarine.utils.ZipUtilities;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Contains methods to perform file system operations. Almost all of the methods
|
||||
* are regular non-static methods as the operations are performed with the help
|
||||
* of a {@link RemoteDirectoryManager} instance passed in as a constructor
|
||||
* dependency. Please note that some operations require to read config settings
|
||||
* as well, so that we have Submarine and YARN config objects as dependencies as
|
||||
* well.
|
||||
*/
|
||||
public class FileSystemOperations {
|
||||
private static final Logger LOG =
|
||||
LoggerFactory.getLogger(FileSystemOperations.class);
|
||||
private final Configuration submarineConfig;
|
||||
private final Configuration yarnConfig;
|
||||
|
||||
private Set<Path> uploadedFiles = new HashSet<>();
|
||||
private RemoteDirectoryManager remoteDirectoryManager;
|
||||
|
||||
public FileSystemOperations(ClientContext clientContext) {
|
||||
this.remoteDirectoryManager = clientContext.getRemoteDirectoryManager();
|
||||
this.submarineConfig = clientContext.getSubmarineConfig();
|
||||
this.yarnConfig = clientContext.getYarnConfig();
|
||||
}
|
||||
|
||||
/**
|
||||
* May download a remote uri(file/dir) and zip.
|
||||
* Skip download if local dir
|
||||
* Remote uri can be a local dir(won't download)
|
||||
* or remote HDFS dir, s3 dir/file .etc
|
||||
* */
|
||||
public String downloadAndZip(String remoteDir, String zipFileName,
|
||||
boolean doZip)
|
||||
throws IOException {
|
||||
//Append original modification time and size to zip file name
|
||||
String suffix;
|
||||
String srcDir = remoteDir;
|
||||
String zipDirPath =
|
||||
System.getProperty("java.io.tmpdir") + "/" + zipFileName;
|
||||
boolean needDeleteTempDir = false;
|
||||
if (remoteDirectoryManager.isRemote(remoteDir)) {
|
||||
//Append original modification time and size to zip file name
|
||||
FileStatus status =
|
||||
remoteDirectoryManager.getRemoteFileStatus(new Path(remoteDir));
|
||||
suffix = "_" + status.getModificationTime()
|
||||
+ "-" + remoteDirectoryManager.getRemoteFileSize(remoteDir);
|
||||
// Download them to temp dir
|
||||
boolean downloaded =
|
||||
remoteDirectoryManager.copyRemoteToLocal(remoteDir, zipDirPath);
|
||||
if (!downloaded) {
|
||||
throw new IOException("Failed to download files from "
|
||||
+ remoteDir);
|
||||
}
|
||||
LOG.info("Downloaded remote: {} to local: {}", remoteDir, zipDirPath);
|
||||
srcDir = zipDirPath;
|
||||
needDeleteTempDir = true;
|
||||
} else {
|
||||
File localDir = new File(remoteDir);
|
||||
suffix = "_" + localDir.lastModified()
|
||||
+ "-" + localDir.length();
|
||||
}
|
||||
if (!doZip) {
|
||||
return srcDir;
|
||||
}
|
||||
// zip a local dir
|
||||
String zipFileUri =
|
||||
ZipUtilities.zipDir(srcDir, zipDirPath + suffix + ".zip");
|
||||
// delete downloaded temp dir
|
||||
if (needDeleteTempDir) {
|
||||
deleteFiles(srcDir);
|
||||
}
|
||||
return zipFileUri;
|
||||
}
|
||||
|
||||
public void deleteFiles(String localUri) {
|
||||
boolean success = FileUtil.fullyDelete(new File(localUri));
|
||||
if (!success) {
|
||||
LOG.warn("Failed to delete {}", localUri);
|
||||
}
|
||||
LOG.info("Deleted {}", localUri);
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public void uploadToRemoteFileAndLocalizeToContainerWorkDir(Path stagingDir,
|
||||
String fileToUpload, String destFilename, Component comp)
|
||||
throws IOException {
|
||||
Path uploadedFilePath = uploadToRemoteFile(stagingDir, fileToUpload);
|
||||
locateRemoteFileToContainerWorkDir(destFilename, comp, uploadedFilePath);
|
||||
}
|
||||
|
||||
private void locateRemoteFileToContainerWorkDir(String destFilename,
|
||||
Component comp, Path uploadedFilePath)
|
||||
throws IOException {
|
||||
FileSystem fs = FileSystem.get(yarnConfig);
|
||||
|
||||
FileStatus fileStatus = fs.getFileStatus(uploadedFilePath);
|
||||
LOG.info("Uploaded file path = " + fileStatus.getPath());
|
||||
|
||||
// Set it to component's files list
|
||||
comp.getConfiguration().getFiles().add(new ConfigFile().srcFile(
|
||||
fileStatus.getPath().toUri().toString()).destFile(destFilename)
|
||||
.type(ConfigFile.TypeEnum.STATIC));
|
||||
}
|
||||
|
||||
public Path uploadToRemoteFile(Path stagingDir, String fileToUpload) throws
|
||||
IOException {
|
||||
FileSystem fs = remoteDirectoryManager.getDefaultFileSystem();
|
||||
|
||||
// Upload to remote FS under staging area
|
||||
File localFile = new File(fileToUpload);
|
||||
if (!localFile.exists()) {
|
||||
throw new FileNotFoundException(
|
||||
"Trying to upload file=" + localFile.getAbsolutePath()
|
||||
+ " to remote, but couldn't find local file.");
|
||||
}
|
||||
String filename = new File(fileToUpload).getName();
|
||||
|
||||
Path uploadedFilePath = new Path(stagingDir, filename);
|
||||
if (!uploadedFiles.contains(uploadedFilePath)) {
|
||||
if (SubmarineLogs.isVerbose()) {
|
||||
LOG.info("Copying local file=" + fileToUpload + " to remote="
|
||||
+ uploadedFilePath);
|
||||
}
|
||||
fs.copyFromLocalFile(new Path(fileToUpload), uploadedFilePath);
|
||||
uploadedFiles.add(uploadedFilePath);
|
||||
}
|
||||
return uploadedFilePath;
|
||||
}
|
||||
|
||||
public void validFileSize(String uri) throws IOException {
|
||||
long actualSizeByte;
|
||||
String locationType = "Local";
|
||||
if (remoteDirectoryManager.isRemote(uri)) {
|
||||
actualSizeByte = remoteDirectoryManager.getRemoteFileSize(uri);
|
||||
locationType = "Remote";
|
||||
} else {
|
||||
actualSizeByte = FileUtil.getDU(new File(uri));
|
||||
}
|
||||
long maxFileSizeMB = submarineConfig
|
||||
.getLong(SubmarineConfiguration.LOCALIZATION_MAX_ALLOWED_FILE_SIZE_MB,
|
||||
SubmarineConfiguration.DEFAULT_MAX_ALLOWED_REMOTE_URI_SIZE_MB);
|
||||
LOG.info("{} fie/dir: {}, size(Byte):{},"
|
||||
+ " Allowed max file/dir size: {}",
|
||||
locationType, uri, actualSizeByte, maxFileSizeMB * 1024 * 1024);
|
||||
|
||||
if (actualSizeByte > maxFileSizeMB * 1024 * 1024) {
|
||||
throw new IOException(uri + " size(Byte): "
|
||||
+ actualSizeByte + " exceeds configured max size:"
|
||||
+ maxFileSizeMB * 1024 * 1024);
|
||||
}
|
||||
}
|
||||
|
||||
public void setPermission(Path destPath, FsPermission permission) throws
|
||||
IOException {
|
||||
FileSystem fs = FileSystem.get(yarnConfig);
|
||||
fs.setPermission(destPath, new FsPermission(permission));
|
||||
}
|
||||
|
||||
public static boolean needHdfs(String content) {
|
||||
return content != null && content.contains("hdfs://");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,161 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;
|
||||
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
||||
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
|
||||
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
|
||||
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations.needHdfs;
|
||||
import static org.apache.hadoop.yarn.submarine.utils.ClassPathUtilities.findFileOnClassPath;
|
||||
import static org.apache.hadoop.yarn.submarine.utils.EnvironmentUtilities.getValueOfEnvironment;
|
||||
|
||||
/**
|
||||
* This class contains helper methods to fill HDFS and Java environment
|
||||
* variables into scripts.
|
||||
*/
|
||||
public class HadoopEnvironmentSetup {
|
||||
private static final Logger LOG =
|
||||
LoggerFactory.getLogger(HadoopEnvironmentSetup.class);
|
||||
private static final String CORE_SITE_XML = "core-site.xml";
|
||||
private static final String HDFS_SITE_XML = "hdfs-site.xml";
|
||||
|
||||
public static final String DOCKER_HADOOP_HDFS_HOME =
|
||||
"DOCKER_HADOOP_HDFS_HOME";
|
||||
public static final String DOCKER_JAVA_HOME = "DOCKER_JAVA_HOME";
|
||||
private final RemoteDirectoryManager remoteDirectoryManager;
|
||||
private final FileSystemOperations fsOperations;
|
||||
|
||||
public HadoopEnvironmentSetup(ClientContext clientContext,
|
||||
FileSystemOperations fsOperations) {
|
||||
this.remoteDirectoryManager = clientContext.getRemoteDirectoryManager();
|
||||
this.fsOperations = fsOperations;
|
||||
}
|
||||
|
||||
public void addHdfsClassPath(RunJobParameters parameters,
|
||||
PrintWriter fw, Component comp) throws IOException {
|
||||
// Find envs to use HDFS
|
||||
String hdfsHome = null;
|
||||
String javaHome = null;
|
||||
|
||||
boolean hadoopEnv = false;
|
||||
|
||||
for (String envVar : parameters.getEnvars()) {
|
||||
if (envVar.startsWith(DOCKER_HADOOP_HDFS_HOME + "=")) {
|
||||
hdfsHome = getValueOfEnvironment(envVar);
|
||||
hadoopEnv = true;
|
||||
} else if (envVar.startsWith(DOCKER_JAVA_HOME + "=")) {
|
||||
javaHome = getValueOfEnvironment(envVar);
|
||||
}
|
||||
}
|
||||
|
||||
boolean hasHdfsEnvs = hdfsHome != null && javaHome != null;
|
||||
boolean needHdfs = doesNeedHdfs(parameters, hadoopEnv);
|
||||
if (needHdfs) {
|
||||
// HDFS is asked either in input or output, set LD_LIBRARY_PATH
|
||||
// and classpath
|
||||
if (hdfsHome != null) {
|
||||
appendHdfsHome(fw, hdfsHome);
|
||||
}
|
||||
|
||||
// hadoop confs will be uploaded to HDFS and localized to container's
|
||||
// local folder, so here set $HADOOP_CONF_DIR to $WORK_DIR.
|
||||
fw.append("export HADOOP_CONF_DIR=$WORK_DIR\n");
|
||||
if (javaHome != null) {
|
||||
appendJavaHome(fw, javaHome);
|
||||
}
|
||||
|
||||
fw.append(
|
||||
"export CLASSPATH=`$HADOOP_HDFS_HOME/bin/hadoop classpath --glob`\n");
|
||||
}
|
||||
|
||||
if (needHdfs && !hasHdfsEnvs) {
|
||||
LOG.error("When HDFS is being used to read/write models/data, " +
|
||||
"the following environment variables are required: " +
|
||||
"1) {}=<HDFS_HOME inside docker container> " +
|
||||
"2) {}=<JAVA_HOME inside docker container>. " +
|
||||
"You can use --env to pass these environment variables.",
|
||||
DOCKER_HADOOP_HDFS_HOME, DOCKER_JAVA_HOME);
|
||||
throw new IOException("Failed to detect HDFS-related environments.");
|
||||
}
|
||||
|
||||
// Trying to upload core-site.xml and hdfs-site.xml
|
||||
Path stagingDir =
|
||||
remoteDirectoryManager.getJobStagingArea(
|
||||
parameters.getName(), true);
|
||||
File coreSite = findFileOnClassPath(CORE_SITE_XML);
|
||||
File hdfsSite = findFileOnClassPath(HDFS_SITE_XML);
|
||||
if (coreSite == null || hdfsSite == null) {
|
||||
LOG.error("HDFS is being used, however we could not locate " +
|
||||
"{} nor {} on classpath! " +
|
||||
"Please double check your classpath setting and make sure these " +
|
||||
"setting files are included!", CORE_SITE_XML, HDFS_SITE_XML);
|
||||
throw new IOException(
|
||||
"Failed to locate core-site.xml / hdfs-site.xml on classpath!");
|
||||
}
|
||||
fsOperations.uploadToRemoteFileAndLocalizeToContainerWorkDir(stagingDir,
|
||||
coreSite.getAbsolutePath(), CORE_SITE_XML, comp);
|
||||
fsOperations.uploadToRemoteFileAndLocalizeToContainerWorkDir(stagingDir,
|
||||
hdfsSite.getAbsolutePath(), HDFS_SITE_XML, comp);
|
||||
|
||||
// DEBUG
|
||||
if (SubmarineLogs.isVerbose()) {
|
||||
appendEchoOfEnvVars(fw);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean doesNeedHdfs(RunJobParameters parameters, boolean hadoopEnv) {
|
||||
return needHdfs(parameters.getInputPath()) ||
|
||||
needHdfs(parameters.getPSLaunchCmd()) ||
|
||||
needHdfs(parameters.getWorkerLaunchCmd()) ||
|
||||
hadoopEnv;
|
||||
}
|
||||
|
||||
private void appendHdfsHome(PrintWriter fw, String hdfsHome) {
|
||||
// Unset HADOOP_HOME/HADOOP_YARN_HOME to make sure host machine's envs
|
||||
// won't pollute docker's env.
|
||||
fw.append("export HADOOP_HOME=\n");
|
||||
fw.append("export HADOOP_YARN_HOME=\n");
|
||||
fw.append("export HADOOP_HDFS_HOME=" + hdfsHome + "\n");
|
||||
fw.append("export HADOOP_COMMON_HOME=" + hdfsHome + "\n");
|
||||
}
|
||||
|
||||
private void appendJavaHome(PrintWriter fw, String javaHome) {
|
||||
fw.append("export JAVA_HOME=" + javaHome + "\n");
|
||||
fw.append("export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:"
|
||||
+ "$JAVA_HOME/lib/amd64/server\n");
|
||||
}
|
||||
|
||||
private void appendEchoOfEnvVars(PrintWriter fw) {
|
||||
fw.append("echo \"CLASSPATH:$CLASSPATH\"\n");
|
||||
fw.append("echo \"HADOOP_CONF_DIR:$HADOOP_CONF_DIR\"\n");
|
||||
fw.append(
|
||||
"echo \"HADOOP_TOKEN_FILE_LOCATION:$HADOOP_TOKEN_FILE_LOCATION\"\n");
|
||||
fw.append("echo \"JAVA_HOME:$JAVA_HOME\"\n");
|
||||
fw.append("echo \"LD_LIBRARY_PATH:$LD_LIBRARY_PATH\"\n");
|
||||
fw.append("echo \"HADOOP_HDFS_HOME:$HADOOP_HDFS_HOME\"\n");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,27 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* This interface is to provide means of creating wrappers around
|
||||
* {@link org.apache.hadoop.yarn.service.api.records.Service} instances.
|
||||
*/
|
||||
public interface ServiceSpec {
|
||||
ServiceWrapper create() throws IOException;
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;
|
||||
|
||||
import org.apache.hadoop.yarn.service.api.records.Service;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.PrintWriter;
|
||||
import java.io.Writer;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import static org.apache.hadoop.yarn.service.utils.ServiceApiUtil.jsonSerDeser;
|
||||
|
||||
/**
|
||||
* This class is merely responsible for creating Json representation of
|
||||
* {@link Service} instances.
|
||||
*/
|
||||
public final class ServiceSpecFileGenerator {
|
||||
private ServiceSpecFileGenerator() {
|
||||
throw new UnsupportedOperationException("This class should not be " +
|
||||
"instantiated!");
|
||||
}
|
||||
|
||||
static String generateJson(Service service) throws IOException {
|
||||
File serviceSpecFile = File.createTempFile(service.getName(), ".json");
|
||||
String buffer = jsonSerDeser.toJson(service);
|
||||
Writer w = new OutputStreamWriter(new FileOutputStream(serviceSpecFile),
|
||||
StandardCharsets.UTF_8);
|
||||
try (PrintWriter pw = new PrintWriter(w)) {
|
||||
pw.append(buffer);
|
||||
}
|
||||
return serviceSpecFile.getAbsolutePath();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,62 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.collect.Maps;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.service.api.records.Service;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* This class is only existing because we need a component name to
|
||||
* local launch command mapping from the test code.
|
||||
* Once this is solved in more clean or different way, we can delete this class.
|
||||
*/
|
||||
public class ServiceWrapper {
|
||||
private final Service service;
|
||||
|
||||
@VisibleForTesting
|
||||
private Map<String, String> componentToLocalLaunchCommand = Maps.newHashMap();
|
||||
|
||||
public ServiceWrapper(Service service) {
|
||||
this.service = service;
|
||||
}
|
||||
|
||||
public void addComponent(AbstractComponent abstractComponent)
|
||||
throws IOException {
|
||||
Component component = abstractComponent.createComponent();
|
||||
service.addComponent(component);
|
||||
storeComponentName(abstractComponent, component.getName());
|
||||
}
|
||||
|
||||
private void storeComponentName(
|
||||
AbstractComponent component, String name) {
|
||||
componentToLocalLaunchCommand.put(name,
|
||||
component.getLocalScriptFile());
|
||||
}
|
||||
|
||||
public Service getService() {
|
||||
return service;
|
||||
}
|
||||
|
||||
public String getLocalLaunchCommandPathForComponent(String componentName) {
|
||||
return componentToLocalLaunchCommand.get(componentName);
|
||||
}
|
||||
}
|
|
@ -15,858 +15,59 @@
|
|||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.FileUtil;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.permission.FsPermission;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||
import org.apache.hadoop.yarn.client.api.AppAdminClient;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.apache.hadoop.yarn.service.api.ServiceApiConstants;
|
||||
import org.apache.hadoop.yarn.service.api.records.Artifact;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.service.api.records.ConfigFile;
|
||||
import org.apache.hadoop.yarn.service.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.service.api.records.ResourceInformation;
|
||||
import org.apache.hadoop.yarn.service.api.records.Service;
|
||||
import org.apache.hadoop.yarn.service.api.records.KerberosPrincipal;
|
||||
import org.apache.hadoop.yarn.service.utils.ServiceApiUtil;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.Localization;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.Quicklink;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
||||
import org.apache.hadoop.yarn.submarine.common.Envs;
|
||||
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
|
||||
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineConfiguration;
|
||||
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
|
||||
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.common.JobSubmitter;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.LaunchCommandFactory;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowServiceSpec;
|
||||
import org.apache.hadoop.yarn.submarine.utils.Localizer;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.PrintWriter;
|
||||
import java.io.Writer;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.StringTokenizer;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipOutputStream;
|
||||
|
||||
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION;
|
||||
|
||||
import static org.apache.hadoop.yarn.service.conf.YarnServiceConstants
|
||||
.CONTAINER_STATE_REPORT_AS_SERVICE_STATE;
|
||||
import static org.apache.hadoop.yarn.service.exceptions.LauncherExitCodes.EXIT_SUCCESS;
|
||||
import static org.apache.hadoop.yarn.service.utils.ServiceApiUtil.jsonSerDeser;
|
||||
|
||||
/**
 * Submit a job to cluster.
 */
|
||||
public class YarnServiceJobSubmitter implements JobSubmitter {
|
||||
public static final String TENSORBOARD_QUICKLINK_LABEL = "Tensorboard";
|
||||
|
||||
private static final Logger LOG =
|
||||
LoggerFactory.getLogger(YarnServiceJobSubmitter.class);
|
||||
ClientContext clientContext;
|
||||
Service serviceSpec;
|
||||
private Set<Path> uploadedFiles = new HashSet<>();
|
||||
private ClientContext clientContext;
|
||||
private ServiceWrapper serviceWrapper;
|
||||
|
||||
// Used by testing
|
||||
private Map<String, String> componentToLocalLaunchScriptPath =
|
||||
new HashMap<>();
|
||||
|
||||
public YarnServiceJobSubmitter(ClientContext clientContext) {
|
||||
YarnServiceJobSubmitter(ClientContext clientContext) {
|
||||
this.clientContext = clientContext;
|
||||
}
|
||||
|
||||
private Resource getServiceResourceFromYarnResource(
|
||||
org.apache.hadoop.yarn.api.records.Resource yarnResource) {
|
||||
Resource serviceResource = new Resource();
|
||||
serviceResource.setCpus(yarnResource.getVirtualCores());
|
||||
serviceResource.setMemory(String.valueOf(yarnResource.getMemorySize()));
|
||||
|
||||
Map<String, ResourceInformation> riMap = new HashMap<>();
|
||||
for (org.apache.hadoop.yarn.api.records.ResourceInformation ri : yarnResource
|
||||
.getAllResourcesListCopy()) {
|
||||
ResourceInformation serviceRi =
|
||||
new ResourceInformation();
|
||||
serviceRi.setValue(ri.getValue());
|
||||
serviceRi.setUnit(ri.getUnits());
|
||||
riMap.put(ri.getName(), serviceRi);
|
||||
}
|
||||
serviceResource.setResourceInformations(riMap);
|
||||
|
||||
return serviceResource;
|
||||
}
|
||||
|
||||
private String getValueOfEnvironment(String envar) {
|
||||
// extract value from "key=value" form
|
||||
if (envar == null || !envar.contains("=")) {
|
||||
return "";
|
||||
} else {
|
||||
return envar.substring(envar.indexOf("=") + 1);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean needHdfs(String content) {
|
||||
return content != null && content.contains("hdfs://");
|
||||
}
|
||||
|
||||
/**
 * Appends HDFS-related environment setup to the launch script being
 * generated and localizes core-site.xml / hdfs-site.xml into the
 * container work dir of the given component.
 *
 * The DOCKER_HADOOP_HDFS_HOME and DOCKER_JAVA_HOME envars (passed via
 * --env) are required whenever HDFS paths appear in the input path or
 * launch commands; both missing cause an IOException.
 *
 * NOTE(review): the core-site/hdfs-site upload below runs even when no
 * HDFS usage was detected — presumably both files are always expected on
 * the client classpath; confirm this is intentional.
 *
 * @param parameters job parameters carrying envars and launch commands
 * @param fw writer of the launch script under construction
 * @param comp component whose config-file list receives the xml files
 * @throws IOException when required envars or the xml files are missing
 */
private void addHdfsClassPathIfNeeded(RunJobParameters parameters,
    PrintWriter fw, Component comp) throws IOException {
  // Find envs to use HDFS
  String hdfsHome = null;
  String javaHome = null;

  boolean hadoopEnv = false;

  for (String envar : parameters.getEnvars()) {
    if (envar.startsWith("DOCKER_HADOOP_HDFS_HOME=")) {
      hdfsHome = getValueOfEnvironment(envar);
      hadoopEnv = true;
    } else if (envar.startsWith("DOCKER_JAVA_HOME=")) {
      javaHome = getValueOfEnvironment(envar);
    }
  }

  boolean lackingEnvs = false;

  if (needHdfs(parameters.getInputPath()) || needHdfs(
      parameters.getPSLaunchCmd()) || needHdfs(
      parameters.getWorkerLaunchCmd()) || hadoopEnv) {
    // HDFS is asked either in input or output, set LD_LIBRARY_PATH
    // and classpath
    if (hdfsHome != null) {
      // Unset HADOOP_HOME/HADOOP_YARN_HOME to make sure host machine's envs
      // won't pollute docker's env.
      fw.append("export HADOOP_HOME=\n");
      fw.append("export HADOOP_YARN_HOME=\n");
      fw.append("export HADOOP_HDFS_HOME=" + hdfsHome + "\n");
      fw.append("export HADOOP_COMMON_HOME=" + hdfsHome + "\n");
    } else{
      lackingEnvs = true;
    }

    // hadoop confs will be uploaded to HDFS and localized to container's
    // local folder, so here set $HADOOP_CONF_DIR to $WORK_DIR.
    fw.append("export HADOOP_CONF_DIR=$WORK_DIR\n");
    if (javaHome != null) {
      fw.append("export JAVA_HOME=" + javaHome + "\n");
      fw.append("export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:"
          + "$JAVA_HOME/lib/amd64/server\n");
    } else {
      lackingEnvs = true;
    }
    fw.append("export CLASSPATH=`$HADOOP_HDFS_HOME/bin/hadoop classpath --glob`\n");
  }

  if (lackingEnvs) {
    LOG.error("When hdfs is being used to read/write models/data. Following"
        + "envs are required: 1) DOCKER_HADOOP_HDFS_HOME=<HDFS_HOME inside"
        + "docker container> 2) DOCKER_JAVA_HOME=<JAVA_HOME inside docker"
        + "container>. You can use --env to pass these envars.");
    throw new IOException("Failed to detect HDFS-related environments.");
  }

  // Trying to upload core-site.xml and hdfs-site.xml
  Path stagingDir =
      clientContext.getRemoteDirectoryManager().getJobStagingArea(
          parameters.getName(), true);
  File coreSite = findFileOnClassPath("core-site.xml");
  File hdfsSite = findFileOnClassPath("hdfs-site.xml");
  if (coreSite == null || hdfsSite == null) {
    LOG.error("hdfs is being used, however we couldn't locate core-site.xml/"
        + "hdfs-site.xml from classpath, please double check you classpath"
        + "setting and make sure they're included.");
    throw new IOException(
        "Failed to locate core-site.xml / hdfs-site.xml from class path");
  }
  uploadToRemoteFileAndLocalizeToContainerWorkDir(stagingDir,
      coreSite.getAbsolutePath(), "core-site.xml", comp);
  uploadToRemoteFileAndLocalizeToContainerWorkDir(stagingDir,
      hdfsSite.getAbsolutePath(), "hdfs-site.xml", comp);

  // DEBUG: echo the resolved environment at container startup when
  // verbose logging is requested.
  if (SubmarineLogs.isVerbose()) {
    fw.append("echo \"CLASSPATH:$CLASSPATH\"\n");
    fw.append("echo \"HADOOP_CONF_DIR:$HADOOP_CONF_DIR\"\n");
    fw.append("echo \"HADOOP_TOKEN_FILE_LOCATION:$HADOOP_TOKEN_FILE_LOCATION\"\n");
    fw.append("echo \"JAVA_HOME:$JAVA_HOME\"\n");
    fw.append("echo \"LD_LIBRARY_PATH:$LD_LIBRARY_PATH\"\n");
    fw.append("echo \"HADOOP_HDFS_HOME:$HADOOP_HDFS_HOME\"\n");
  }
}
|
||||
|
||||
private void addCommonEnvironments(Component component, TaskType taskType) {
|
||||
Map<String, String> envs = component.getConfiguration().getEnv();
|
||||
envs.put(Envs.TASK_INDEX_ENV, ServiceApiConstants.COMPONENT_ID);
|
||||
envs.put(Envs.TASK_TYPE_ENV, taskType.name());
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
protected String getUserName() {
|
||||
return System.getProperty("user.name");
|
||||
}
|
||||
|
||||
private String getDNSDomain() {
|
||||
return clientContext.getYarnConfig().get("hadoop.registry.dns.domain-name");
|
||||
}
|
||||
|
||||
/*
|
||||
* Generate a command launch script on local disk, returns patch to the script
|
||||
*/
|
||||
private String generateCommandLaunchScript(RunJobParameters parameters,
|
||||
TaskType taskType, Component comp) throws IOException {
|
||||
File file = File.createTempFile(taskType.name() + "-launch-script", ".sh");
|
||||
Writer w = new OutputStreamWriter(new FileOutputStream(file),
|
||||
StandardCharsets.UTF_8);
|
||||
PrintWriter pw = new PrintWriter(w);
|
||||
|
||||
try {
|
||||
pw.append("#!/bin/bash\n");
|
||||
|
||||
addHdfsClassPathIfNeeded(parameters, pw, comp);
|
||||
|
||||
if (taskType.equals(TaskType.TENSORBOARD)) {
|
||||
String tbCommand =
|
||||
"export LC_ALL=C && tensorboard --logdir=" + parameters
|
||||
.getCheckpointPath();
|
||||
pw.append(tbCommand + "\n");
|
||||
LOG.info("Tensorboard command=" + tbCommand);
|
||||
} else{
|
||||
// When distributed training is required
|
||||
if (parameters.isDistributed()) {
|
||||
// Generated TF_CONFIG
|
||||
String tfConfigEnv = YarnServiceUtils.getTFConfigEnv(
|
||||
taskType.getComponentName(), parameters.getNumWorkers(),
|
||||
parameters.getNumPS(), parameters.getName(), getUserName(),
|
||||
getDNSDomain());
|
||||
pw.append("export TF_CONFIG=\"" + tfConfigEnv + "\"\n");
|
||||
}
|
||||
|
||||
// Print launch command
|
||||
if (taskType.equals(TaskType.WORKER) || taskType.equals(
|
||||
TaskType.PRIMARY_WORKER)) {
|
||||
pw.append(parameters.getWorkerLaunchCmd() + '\n');
|
||||
|
||||
if (SubmarineLogs.isVerbose()) {
|
||||
LOG.info(
|
||||
"Worker command =[" + parameters.getWorkerLaunchCmd() + "]");
|
||||
}
|
||||
} else if (taskType.equals(TaskType.PS)) {
|
||||
pw.append(parameters.getPSLaunchCmd() + '\n');
|
||||
|
||||
if (SubmarineLogs.isVerbose()) {
|
||||
LOG.info("PS command =[" + parameters.getPSLaunchCmd() + "]");
|
||||
}
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
pw.close();
|
||||
}
|
||||
return file.getAbsolutePath();
|
||||
}
|
||||
|
||||
private String getScriptFileName(TaskType taskType) {
|
||||
return "run-" + taskType.name() + ".sh";
|
||||
}
|
||||
|
||||
private File findFileOnClassPath(final String fileName) {
|
||||
final String classpath = System.getProperty("java.class.path");
|
||||
final String pathSeparator = System.getProperty("path.separator");
|
||||
final StringTokenizer tokenizer = new StringTokenizer(classpath,
|
||||
pathSeparator);
|
||||
|
||||
while (tokenizer.hasMoreTokens()) {
|
||||
final String pathElement = tokenizer.nextToken();
|
||||
final File directoryOrJar = new File(pathElement);
|
||||
final File absoluteDirectoryOrJar = directoryOrJar.getAbsoluteFile();
|
||||
if (absoluteDirectoryOrJar.isFile()) {
|
||||
final File target = new File(absoluteDirectoryOrJar.getParent(),
|
||||
fileName);
|
||||
if (target.exists()) {
|
||||
return target;
|
||||
}
|
||||
} else{
|
||||
final File target = new File(directoryOrJar, fileName);
|
||||
if (target.exists()) {
|
||||
return target;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private void uploadToRemoteFileAndLocalizeToContainerWorkDir(Path stagingDir,
|
||||
String fileToUpload, String destFilename, Component comp)
|
||||
throws IOException {
|
||||
Path uploadedFilePath = uploadToRemoteFile(stagingDir, fileToUpload);
|
||||
locateRemoteFileToContainerWorkDir(destFilename, comp, uploadedFilePath);
|
||||
}
|
||||
|
||||
private void locateRemoteFileToContainerWorkDir(String destFilename,
|
||||
Component comp, Path uploadedFilePath)
|
||||
throws IOException {
|
||||
FileSystem fs = FileSystem.get(clientContext.getYarnConfig());
|
||||
|
||||
FileStatus fileStatus = fs.getFileStatus(uploadedFilePath);
|
||||
LOG.info("Uploaded file path = " + fileStatus.getPath());
|
||||
|
||||
// Set it to component's files list
|
||||
comp.getConfiguration().getFiles().add(new ConfigFile().srcFile(
|
||||
fileStatus.getPath().toUri().toString()).destFile(destFilename)
|
||||
.type(ConfigFile.TypeEnum.STATIC));
|
||||
}
|
||||
|
||||
private Path uploadToRemoteFile(Path stagingDir, String fileToUpload) throws
|
||||
IOException {
|
||||
FileSystem fs = clientContext.getRemoteDirectoryManager()
|
||||
.getDefaultFileSystem();
|
||||
|
||||
// Upload to remote FS under staging area
|
||||
File localFile = new File(fileToUpload);
|
||||
if (!localFile.exists()) {
|
||||
throw new FileNotFoundException(
|
||||
"Trying to upload file=" + localFile.getAbsolutePath()
|
||||
+ " to remote, but couldn't find local file.");
|
||||
}
|
||||
String filename = new File(fileToUpload).getName();
|
||||
|
||||
Path uploadedFilePath = new Path(stagingDir, filename);
|
||||
if (!uploadedFiles.contains(uploadedFilePath)) {
|
||||
if (SubmarineLogs.isVerbose()) {
|
||||
LOG.info("Copying local file=" + fileToUpload + " to remote="
|
||||
+ uploadedFilePath);
|
||||
}
|
||||
fs.copyFromLocalFile(new Path(fileToUpload), uploadedFilePath);
|
||||
uploadedFiles.add(uploadedFilePath);
|
||||
}
|
||||
return uploadedFilePath;
|
||||
}
|
||||
|
||||
private void setPermission(Path destPath, FsPermission permission) throws
|
||||
IOException {
|
||||
FileSystem fs = FileSystem.get(clientContext.getYarnConfig());
|
||||
fs.setPermission(destPath, new FsPermission(permission));
|
||||
}
|
||||
|
||||
private void handleLaunchCommand(RunJobParameters parameters,
|
||||
TaskType taskType, Component component) throws IOException {
|
||||
// Get staging area directory
|
||||
Path stagingDir =
|
||||
clientContext.getRemoteDirectoryManager().getJobStagingArea(
|
||||
parameters.getName(), true);
|
||||
|
||||
// Generate script file in the local disk
|
||||
String localScriptFile = generateCommandLaunchScript(parameters, taskType,
|
||||
component);
|
||||
String destScriptFileName = getScriptFileName(taskType);
|
||||
uploadToRemoteFileAndLocalizeToContainerWorkDir(stagingDir, localScriptFile,
|
||||
destScriptFileName, component);
|
||||
|
||||
component.setLaunchCommand("./" + destScriptFileName);
|
||||
componentToLocalLaunchScriptPath.put(taskType.getComponentName(),
|
||||
localScriptFile);
|
||||
}
|
||||
|
||||
private String getLastNameFromPath(String srcFileStr) {
|
||||
return new Path(srcFileStr).getName();
|
||||
}
|
||||
|
||||
/**
 * May download a remote uri (file/dir) and zip it.
 * The remote uri can be a local dir (no download happens) or a remote
 * HDFS/s3/etc. dir or file, which is first copied to the local tmp dir.
 * The original modification time and size are appended to the zip file
 * name so re-uploads of unchanged content are deduplicated by name.
 *
 * @param remoteDir source uri (local or remote)
 * @param zipFileName base name for the local temp/zip path
 * @param doZip when false, only the (possibly downloaded) dir path is
 *        returned and no zip is created
 * @return path of the created zip, or the source dir when doZip is false
 * @throws IOException when the remote download fails
 */
private String mayDownloadAndZipIt(String remoteDir, String zipFileName,
    boolean doZip)
    throws IOException {
  RemoteDirectoryManager rdm = clientContext.getRemoteDirectoryManager();
  //Append original modification time and size to zip file name
  String suffix;
  String srcDir = remoteDir;
  String zipDirPath =
      System.getProperty("java.io.tmpdir") + "/" + zipFileName;
  boolean needDeleteTempDir = false;
  if (rdm.isRemote(remoteDir)) {
    //Append original modification time and size to zip file name
    FileStatus status = rdm.getRemoteFileStatus(new Path(remoteDir));
    suffix = "_" + status.getModificationTime()
        + "-" + rdm.getRemoteFileSize(remoteDir);
    // Download them to temp dir
    boolean downloaded = rdm.copyRemoteToLocal(remoteDir, zipDirPath);
    if (!downloaded) {
      throw new IOException("Failed to download files from "
          + remoteDir);
    }
    LOG.info("Downloaded remote: {} to local: {}", remoteDir, zipDirPath);
    srcDir = zipDirPath;
    needDeleteTempDir = true;
  } else {
    File localDir = new File(remoteDir);
    suffix = "_" + localDir.lastModified()
        + "-" + localDir.length();
  }
  if (!doZip) {
    // Caller only wanted the local path; the downloaded temp dir (if
    // any) is left for the caller to clean up.
    return srcDir;
  }
  // zip a local dir
  String zipFileUri = zipDir(srcDir, zipDirPath + suffix + ".zip");
  // delete downloaded temp dir
  if (needDeleteTempDir) {
    deleteFiles(srcDir);
  }
  return zipFileUri;
}
|
||||
|
||||
@VisibleForTesting
|
||||
public String zipDir(String srcDir, String dstFile) throws IOException {
|
||||
FileOutputStream fos = new FileOutputStream(dstFile);
|
||||
ZipOutputStream zos = new ZipOutputStream(fos);
|
||||
File srcFile = new File(srcDir);
|
||||
LOG.info("Compressing {}", srcDir);
|
||||
addDirToZip(zos, srcFile, srcFile);
|
||||
// close the ZipOutputStream
|
||||
zos.close();
|
||||
LOG.info("Compressed {} to {}", srcDir, dstFile);
|
||||
return dstFile;
|
||||
}
|
||||
|
||||
private void deleteFiles(String localUri) {
|
||||
boolean success = FileUtil.fullyDelete(new File(localUri));
|
||||
if (!success) {
|
||||
LOG.warn("Fail to delete {}", localUri);
|
||||
}
|
||||
LOG.info("Deleted {}", localUri);
|
||||
}
|
||||
|
||||
private void addDirToZip(ZipOutputStream zos, File srcFile, File base)
|
||||
throws IOException {
|
||||
File[] files = srcFile.listFiles();
|
||||
if (null == files) {
|
||||
return;
|
||||
}
|
||||
FileInputStream fis = null;
|
||||
for (int i = 0; i < files.length; i++) {
|
||||
// if it's directory, add recursively
|
||||
if (files[i].isDirectory()) {
|
||||
addDirToZip(zos, files[i], base);
|
||||
continue;
|
||||
}
|
||||
byte[] buffer = new byte[1024];
|
||||
try {
|
||||
fis = new FileInputStream(files[i]);
|
||||
String name = base.toURI().relativize(files[i].toURI()).getPath();
|
||||
LOG.info(" Zip adding: " + name);
|
||||
zos.putNextEntry(new ZipEntry(name));
|
||||
int length;
|
||||
while ((length = fis.read(buffer)) > 0) {
|
||||
zos.write(buffer, 0, length);
|
||||
}
|
||||
zos.flush();
|
||||
} finally {
|
||||
if (fis != null) {
|
||||
fis.close();
|
||||
}
|
||||
zos.closeEntry();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void addWorkerComponent(Service service,
|
||||
RunJobParameters parameters, TaskType taskType) throws IOException {
|
||||
Component workerComponent = new Component();
|
||||
addCommonEnvironments(workerComponent, taskType);
|
||||
|
||||
workerComponent.setName(taskType.getComponentName());
|
||||
|
||||
if (taskType.equals(TaskType.PRIMARY_WORKER)) {
|
||||
workerComponent.setNumberOfContainers(1L);
|
||||
workerComponent.getConfiguration().setProperty(
|
||||
CONTAINER_STATE_REPORT_AS_SERVICE_STATE, "true");
|
||||
} else{
|
||||
workerComponent.setNumberOfContainers(
|
||||
(long) parameters.getNumWorkers() - 1);
|
||||
}
|
||||
|
||||
if (parameters.getWorkerDockerImage() != null) {
|
||||
workerComponent.setArtifact(
|
||||
getDockerArtifact(parameters.getWorkerDockerImage()));
|
||||
}
|
||||
|
||||
workerComponent.setResource(
|
||||
getServiceResourceFromYarnResource(parameters.getWorkerResource()));
|
||||
handleLaunchCommand(parameters, taskType, workerComponent);
|
||||
workerComponent.setRestartPolicy(Component.RestartPolicyEnum.NEVER);
|
||||
service.addComponent(workerComponent);
|
||||
}
|
||||
|
||||
// Handle worker and primary_worker.
|
||||
private void addWorkerComponents(Service service, RunJobParameters parameters)
|
||||
throws IOException {
|
||||
addWorkerComponent(service, parameters, TaskType.PRIMARY_WORKER);
|
||||
|
||||
if (parameters.getNumWorkers() > 1) {
|
||||
addWorkerComponent(service, parameters, TaskType.WORKER);
|
||||
}
|
||||
}
|
||||
|
||||
private void appendToEnv(Service service, String key, String value,
|
||||
String delim) {
|
||||
Map<String, String> env = service.getConfiguration().getEnv();
|
||||
if (!env.containsKey(key)) {
|
||||
env.put(key, value);
|
||||
} else {
|
||||
if (!value.isEmpty()) {
|
||||
String existingValue = env.get(key);
|
||||
if (!existingValue.endsWith(delim)) {
|
||||
env.put(key, existingValue + delim + value);
|
||||
} else {
|
||||
env.put(key, existingValue + value);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void handleServiceEnvs(Service service, RunJobParameters parameters) {
|
||||
if (parameters.getEnvars() != null) {
|
||||
for (String envarPair : parameters.getEnvars()) {
|
||||
String key, value;
|
||||
if (envarPair.contains("=")) {
|
||||
int idx = envarPair.indexOf('=');
|
||||
key = envarPair.substring(0, idx);
|
||||
value = envarPair.substring(idx + 1);
|
||||
} else{
|
||||
// No "=" found so use the whole key
|
||||
key = envarPair;
|
||||
value = "";
|
||||
}
|
||||
appendToEnv(service, key, value, ":");
|
||||
}
|
||||
}
|
||||
|
||||
// Append other configs like /etc/passwd, /etc/krb5.conf
|
||||
appendToEnv(service, "YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS",
|
||||
"/etc/passwd:/etc/passwd:ro", ",");
|
||||
|
||||
String authenication = clientContext.getYarnConfig().get(
|
||||
HADOOP_SECURITY_AUTHENTICATION);
|
||||
if (authenication != null && authenication.equals("kerberos")) {
|
||||
appendToEnv(service, "YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS",
|
||||
"/etc/krb5.conf:/etc/krb5.conf:ro", ",");
|
||||
}
|
||||
}
|
||||
|
||||
private Artifact getDockerArtifact(String dockerImageName) {
|
||||
return new Artifact().type(Artifact.TypeEnum.DOCKER).id(dockerImageName);
|
||||
}
|
||||
|
||||
private void handleQuicklinks(RunJobParameters runJobParameters)
|
||||
throws IOException {
|
||||
List<Quicklink> quicklinks = runJobParameters.getQuicklinks();
|
||||
if (null != quicklinks && !quicklinks.isEmpty()) {
|
||||
for (Quicklink ql : quicklinks) {
|
||||
// Make sure it is a valid instance name
|
||||
String instanceName = ql.getComponentInstanceName();
|
||||
boolean found = false;
|
||||
|
||||
for (Component comp : serviceSpec.getComponents()) {
|
||||
for (int i = 0; i < comp.getNumberOfContainers(); i++) {
|
||||
String possibleInstanceName = comp.getName() + "-" + i;
|
||||
if (possibleInstanceName.equals(instanceName)) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!found) {
|
||||
throw new IOException(
|
||||
"Couldn't find a component instance = " + instanceName
|
||||
+ " while adding quicklink");
|
||||
}
|
||||
|
||||
String link = ql.getProtocol() + YarnServiceUtils.getDNSName(
|
||||
serviceSpec.getName(), instanceName, getUserName(), getDNSDomain(),
|
||||
ql.getPort());
|
||||
YarnServiceUtils.addQuicklink(serviceSpec, ql.getLabel(), link);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Builds the YARN service spec for the job: sets name/version/artifact,
 * applies Kerberos, envs and localizations, then adds the worker, PS
 * and (optional) Tensorboard components. Stores the result in the
 * {@code serviceSpec} field and returns it.
 *
 * @param parameters parsed run-job parameters
 * @return the assembled service spec
 * @throws IOException on any upload/localization failure
 */
private Service createServiceByParameters(RunJobParameters parameters)
    throws IOException {
  // Reset test bookkeeping from any previous submission.
  componentToLocalLaunchScriptPath.clear();
  serviceSpec = new Service();
  serviceSpec.setName(parameters.getName());
  // Version is the submission timestamp so re-submissions are distinct.
  serviceSpec.setVersion(String.valueOf(System.currentTimeMillis()));
  serviceSpec.setArtifact(getDockerArtifact(parameters.getDockerImageName()));
  handleKerberosPrincipal(parameters);

  handleServiceEnvs(serviceSpec, parameters);

  handleLocalizations(parameters);

  if (parameters.getNumWorkers() > 0) {
    addWorkerComponents(serviceSpec, parameters);
  }

  if (parameters.getNumPS() > 0) {
    Component psComponent = new Component();
    psComponent.setName(TaskType.PS.getComponentName());
    addCommonEnvironments(psComponent, TaskType.PS);
    psComponent.setNumberOfContainers((long) parameters.getNumPS());
    psComponent.setRestartPolicy(Component.RestartPolicyEnum.NEVER);
    psComponent.setResource(
        getServiceResourceFromYarnResource(parameters.getPsResource()));

    // Override global docker image if needed.
    if (parameters.getPsDockerImage() != null) {
      psComponent.setArtifact(
          getDockerArtifact(parameters.getPsDockerImage()));
    }
    handleLaunchCommand(parameters, TaskType.PS, psComponent);
    serviceSpec.addComponent(psComponent);
  }

  if (parameters.isTensorboardEnabled()) {
    Component tbComponent = new Component();
    tbComponent.setName(TaskType.TENSORBOARD.getComponentName());
    addCommonEnvironments(tbComponent, TaskType.TENSORBOARD);
    tbComponent.setNumberOfContainers(1L);
    tbComponent.setRestartPolicy(Component.RestartPolicyEnum.NEVER);
    tbComponent.setResource(getServiceResourceFromYarnResource(
        parameters.getTensorboardResource()));
    // Override global docker image if needed.
    if (parameters.getTensorboardDockerImage() != null) {
      tbComponent.setArtifact(
          getDockerArtifact(parameters.getTensorboardDockerImage()));
    }

    handleLaunchCommand(parameters, TaskType.TENSORBOARD, tbComponent);

    // Add tensorboard to quicklink (instance 0, tensorboard's default
    // port 6006).
    String tensorboardLink = "http://" + YarnServiceUtils.getDNSName(
        parameters.getName(),
        TaskType.TENSORBOARD.getComponentName() + "-" + 0, getUserName(),
        getDNSDomain(), 6006);
    LOG.info("Link to tensorboard:" + tensorboardLink);
    serviceSpec.addComponent(tbComponent);

    YarnServiceUtils.addQuicklink(serviceSpec, TENSORBOARD_QUICKLINK_LABEL,
        tensorboardLink);
  }

  // After all components added, handle quicklinks
  handleQuicklinks(parameters);

  return serviceSpec;
}
|
||||
|
||||
/**
 * Localize dependencies for all containers.
 * If remoteUri is a local directory, we'll zip it and upload it to the
 * HDFS staging dir. If remoteUri is a remote directory, we'll download
 * it, zip it and upload to HDFS. If localFilePath is ".", we'll use
 * remoteUri's file/dir name as the localized name. Absolute local paths
 * additionally produce a docker bind-mount entry.
 *
 * @param parameters run-job parameters carrying the localization specs
 * @throws IOException when a source is missing, too large, or an
 *         upload/download fails
 */
private void handleLocalizations(RunJobParameters parameters)
    throws IOException {
  // Handle localizations
  Path stagingDir =
      clientContext.getRemoteDirectoryManager().getJobStagingArea(
          parameters.getName(), true);
  List<Localization> locs = parameters.getLocalizations();
  String remoteUri;
  String containerLocalPath;
  RemoteDirectoryManager rdm = clientContext.getRemoteDirectoryManager();

  // Check to fail fast: validate every source before transferring any.
  for (Localization loc : locs) {
    remoteUri = loc.getRemoteUri();
    Path resourceToLocalize = new Path(remoteUri);
    // Check if remoteUri exists
    if (rdm.isRemote(remoteUri)) {
      // check if exists
      if (!rdm.existsRemoteFile(resourceToLocalize)) {
        throw new FileNotFoundException(
            "File " + remoteUri + " doesn't exists.");
      }
    } else {
      // Check if exists
      File localFile = new File(remoteUri);
      if (!localFile.exists()) {
        throw new FileNotFoundException(
            "File " + remoteUri + " doesn't exists.");
      }
    }
    // check remote file size
    validFileSize(remoteUri);
  }
  // Start download remote if needed and upload to HDFS
  for (Localization loc : locs) {
    remoteUri = loc.getRemoteUri();
    containerLocalPath = loc.getLocalPath();
    String srcFileStr = remoteUri;
    ConfigFile.TypeEnum destFileType = ConfigFile.TypeEnum.STATIC;
    Path resourceToLocalize = new Path(remoteUri);
    boolean needUploadToHDFS = true;

    /**
     * Special handling for remoteUri directory: directories are zipped
     * and localized as ARCHIVE so YARN unpacks them in the container.
     * */
    boolean needDeleteTempFile = false;
    if (rdm.isDir(remoteUri)) {
      destFileType = ConfigFile.TypeEnum.ARCHIVE;
      srcFileStr = mayDownloadAndZipIt(
          remoteUri, getLastNameFromPath(srcFileStr), true);
    } else if (rdm.isRemote(remoteUri)) {
      if (!needHdfs(remoteUri)) {
        // Non HDFS remote uri. Non directory, no need to zip
        srcFileStr = mayDownloadAndZipIt(
            remoteUri, getLastNameFromPath(srcFileStr), false);
        needDeleteTempFile = true;
      } else {
        // HDFS file, no need to upload
        needUploadToHDFS = false;
      }
    }

    // Upload file to HDFS
    if (needUploadToHDFS) {
      resourceToLocalize = uploadToRemoteFile(stagingDir, srcFileStr);
    }
    if (needDeleteTempFile) {
      deleteFiles(srcFileStr);
    }
    // Remove .zip (and the "_mtime-size" suffix) from zipped dir name
    if (destFileType == ConfigFile.TypeEnum.ARCHIVE
        && srcFileStr.endsWith(".zip")) {
      // Delete local zip file
      deleteFiles(srcFileStr);
      int suffixIndex = srcFileStr.lastIndexOf('_');
      srcFileStr = srcFileStr.substring(0, suffixIndex);
    }
    // If provided, use the name of local uri
    if (!containerLocalPath.equals(".")
        && !containerLocalPath.equals("./")) {
      // Change the YARN localized file name to what'll used in container
      srcFileStr = getLastNameFromPath(containerLocalPath);
    }
    String localizedName = getLastNameFromPath(srcFileStr);
    LOG.info("The file/dir to be localized is {}",
        resourceToLocalize.toString());
    LOG.info("Its localized file name will be {}", localizedName);
    serviceSpec.getConfiguration().getFiles().add(new ConfigFile().srcFile(
        resourceToLocalize.toUri().toString()).destFile(localizedName)
        .type(destFileType));
    // set mounts
    // if mount path is absolute, just use it.
    // if relative, no need to mount explicitly
    if (containerLocalPath.startsWith("/")) {
      String mountStr = getLastNameFromPath(srcFileStr) + ":"
          + containerLocalPath + ":" + loc.getMountPermission();
      LOG.info("Add bind-mount string {}", mountStr);
      appendToEnv(serviceSpec, "YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS",
          mountStr, ",");
    }
  }
}
|
||||
|
||||
private void validFileSize(String uri) throws IOException {
|
||||
RemoteDirectoryManager rdm = clientContext.getRemoteDirectoryManager();
|
||||
long actualSizeByte;
|
||||
String locationType = "Local";
|
||||
if (rdm.isRemote(uri)) {
|
||||
actualSizeByte = clientContext.getRemoteDirectoryManager()
|
||||
.getRemoteFileSize(uri);
|
||||
locationType = "Remote";
|
||||
} else {
|
||||
actualSizeByte = FileUtil.getDU(new File(uri));
|
||||
}
|
||||
long maxFileSizeMB = clientContext.getSubmarineConfig()
|
||||
.getLong(SubmarineConfiguration.LOCALIZATION_MAX_ALLOWED_FILE_SIZE_MB,
|
||||
SubmarineConfiguration.DEFAULT_MAX_ALLOWED_REMOTE_URI_SIZE_MB);
|
||||
LOG.info("{} fie/dir: {}, size(Byte):{},"
|
||||
+ " Allowed max file/dir size: {}",
|
||||
locationType, uri, actualSizeByte, maxFileSizeMB * 1024 * 1024);
|
||||
|
||||
if (actualSizeByte > maxFileSizeMB * 1024 * 1024) {
|
||||
throw new IOException(uri + " size(Byte): "
|
||||
+ actualSizeByte + " exceeds configured max size:"
|
||||
+ maxFileSizeMB * 1024 * 1024);
|
||||
}
|
||||
}
|
||||
|
||||
private String generateServiceSpecFile(Service service) throws IOException {
|
||||
File serviceSpecFile = File.createTempFile(service.getName(), ".json");
|
||||
String buffer = jsonSerDeser.toJson(service);
|
||||
Writer w = new OutputStreamWriter(new FileOutputStream(serviceSpecFile),
|
||||
"UTF-8");
|
||||
PrintWriter pw = new PrintWriter(w);
|
||||
try {
|
||||
pw.append(buffer);
|
||||
} finally {
|
||||
pw.close();
|
||||
}
|
||||
return serviceSpecFile.getAbsolutePath();
|
||||
}
|
||||
|
||||
private void handleKerberosPrincipal(RunJobParameters parameters) throws
|
||||
IOException {
|
||||
if(StringUtils.isNotBlank(parameters.getKeytab()) && StringUtils
|
||||
.isNotBlank(parameters.getPrincipal())) {
|
||||
String keytab = parameters.getKeytab();
|
||||
String principal = parameters.getPrincipal();
|
||||
if(parameters.isDistributeKeytab()) {
|
||||
Path stagingDir =
|
||||
clientContext.getRemoteDirectoryManager().getJobStagingArea(
|
||||
parameters.getName(), true);
|
||||
Path remoteKeytabPath = uploadToRemoteFile(stagingDir, keytab);
|
||||
//only the owner has read access
|
||||
setPermission(remoteKeytabPath,
|
||||
FsPermission.createImmutable((short)Integer.parseInt("400", 8)));
|
||||
serviceSpec.setKerberosPrincipal(new KerberosPrincipal().keytab(
|
||||
remoteKeytabPath.toString()).principalName(principal));
|
||||
} else {
|
||||
if(!keytab.startsWith("file")) {
|
||||
keytab = "file://" + keytab;
|
||||
}
|
||||
serviceSpec.setKerberosPrincipal(new KerberosPrincipal().keytab(
|
||||
keytab).principalName(principal));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public ApplicationId submitJob(RunJobParameters parameters)
|
||||
throws IOException, YarnException {
|
||||
createServiceByParameters(parameters);
|
||||
String serviceSpecFile = generateServiceSpecFile(serviceSpec);
|
||||
FileSystemOperations fsOperations = new FileSystemOperations(clientContext);
|
||||
HadoopEnvironmentSetup hadoopEnvSetup =
|
||||
new HadoopEnvironmentSetup(clientContext, fsOperations);
|
||||
|
||||
AppAdminClient appAdminClient = YarnServiceUtils.createServiceClient(
|
||||
clientContext.getYarnConfig());
|
||||
Service serviceSpec = createTensorFlowServiceSpec(parameters,
|
||||
fsOperations, hadoopEnvSetup);
|
||||
String serviceSpecFile = ServiceSpecFileGenerator.generateJson(serviceSpec);
|
||||
|
||||
AppAdminClient appAdminClient =
|
||||
YarnServiceUtils.createServiceClient(clientContext.getYarnConfig());
|
||||
int code = appAdminClient.actionLaunch(serviceSpecFile,
|
||||
serviceSpec.getName(), null, null);
|
||||
if (code != EXIT_SUCCESS) {
|
||||
throw new YarnException("Fail to launch application with exit code:" +
|
||||
code);
|
||||
throw new YarnException(
|
||||
"Fail to launch application with exit code:" + code);
|
||||
}
|
||||
|
||||
String appStatus=appAdminClient.getStatusString(serviceSpec.getName());
|
||||
|
@ -896,13 +97,24 @@ public class YarnServiceJobSubmitter implements JobSubmitter {
|
|||
return appid;
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public Service getServiceSpec() {
|
||||
return serviceSpec;
|
||||
private Service createTensorFlowServiceSpec(RunJobParameters parameters,
|
||||
FileSystemOperations fsOperations, HadoopEnvironmentSetup hadoopEnvSetup)
|
||||
throws IOException {
|
||||
LaunchCommandFactory launchCommandFactory =
|
||||
new LaunchCommandFactory(hadoopEnvSetup, parameters,
|
||||
clientContext.getYarnConfig());
|
||||
Localizer localizer = new Localizer(fsOperations,
|
||||
clientContext.getRemoteDirectoryManager(), parameters);
|
||||
TensorFlowServiceSpec tensorFlowServiceSpec = new TensorFlowServiceSpec(
|
||||
parameters, this.clientContext, fsOperations, launchCommandFactory,
|
||||
localizer);
|
||||
|
||||
serviceWrapper = tensorFlowServiceSpec.create();
|
||||
return serviceWrapper.getService();
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public Map<String, String> getComponentToLocalLaunchScriptPath() {
|
||||
return componentToLocalLaunchScriptPath;
|
||||
public ServiceWrapper getServiceWrapper() {
|
||||
return serviceWrapper;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,33 +17,27 @@ package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;
|
|||
import com.google.common.annotations.VisibleForTesting;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.yarn.client.api.AppAdminClient;
|
||||
import org.apache.hadoop.yarn.service.api.records.Service;
|
||||
import org.apache.hadoop.yarn.submarine.common.Envs;
|
||||
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.apache.hadoop.yarn.client.api.AppAdminClient.DEFAULT_TYPE;
|
||||
|
||||
public class YarnServiceUtils {
|
||||
private static final Logger LOG =
|
||||
LoggerFactory.getLogger(YarnServiceUtils.class);
|
||||
/**
|
||||
* This class contains some static helper methods to query DNS data
|
||||
* based on the provided parameters.
|
||||
*/
|
||||
public final class YarnServiceUtils {
|
||||
private YarnServiceUtils() {
|
||||
}
|
||||
|
||||
// This will be true only in UT.
|
||||
private static AppAdminClient stubServiceClient = null;
|
||||
|
||||
public static AppAdminClient createServiceClient(
|
||||
static AppAdminClient createServiceClient(
|
||||
Configuration yarnConfiguration) {
|
||||
if (stubServiceClient != null) {
|
||||
return stubServiceClient;
|
||||
}
|
||||
|
||||
AppAdminClient serviceClient = AppAdminClient.createAppAdminClient(
|
||||
DEFAULT_TYPE, yarnConfiguration);
|
||||
return serviceClient;
|
||||
return AppAdminClient.createAppAdminClient(DEFAULT_TYPE, yarnConfiguration);
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
|
@ -57,77 +51,9 @@ public class YarnServiceUtils {
|
|||
domain, port);
|
||||
}
|
||||
|
||||
private static String getDNSNameCommonSuffix(String serviceName,
|
||||
public static String getDNSNameCommonSuffix(String serviceName,
|
||||
String userName, String domain, int port) {
|
||||
return "." + serviceName + "." + userName + "." + domain + ":" + port;
|
||||
}
|
||||
|
||||
public static String getTFConfigEnv(String curCommponentName, int nWorkers,
|
||||
int nPs, String serviceName, String userName, String domain) {
|
||||
String commonEndpointSuffix = getDNSNameCommonSuffix(serviceName, userName,
|
||||
domain, 8000);
|
||||
|
||||
String json = "{\\\"cluster\\\":{";
|
||||
|
||||
String master = getComponentArrayJson("master", 1, commonEndpointSuffix)
|
||||
+ ",";
|
||||
String worker = getComponentArrayJson("worker", nWorkers - 1,
|
||||
commonEndpointSuffix) + ",";
|
||||
String ps = getComponentArrayJson("ps", nPs, commonEndpointSuffix) + "},";
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("\\\"task\\\":{");
|
||||
sb.append(" \\\"type\\\":\\\"");
|
||||
sb.append(curCommponentName);
|
||||
sb.append("\\\",");
|
||||
sb.append(" \\\"index\\\":");
|
||||
sb.append('$');
|
||||
sb.append(Envs.TASK_INDEX_ENV + "},");
|
||||
String task = sb.toString();
|
||||
String environment = "\\\"environment\\\":\\\"cloud\\\"}";
|
||||
|
||||
sb = new StringBuilder();
|
||||
sb.append(json);
|
||||
sb.append(master);
|
||||
sb.append(worker);
|
||||
sb.append(ps);
|
||||
sb.append(task);
|
||||
sb.append(environment);
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public static void addQuicklink(Service serviceSpec, String label,
|
||||
String link) {
|
||||
Map<String, String> quicklinks = serviceSpec.getQuicklinks();
|
||||
if (null == quicklinks) {
|
||||
quicklinks = new HashMap<>();
|
||||
serviceSpec.setQuicklinks(quicklinks);
|
||||
}
|
||||
|
||||
if (SubmarineLogs.isVerbose()) {
|
||||
LOG.info("Added quicklink, " + label + "=" + link);
|
||||
}
|
||||
|
||||
quicklinks.put(label, link);
|
||||
}
|
||||
|
||||
private static String getComponentArrayJson(String componentName, int count,
|
||||
String endpointSuffix) {
|
||||
String component = "\\\"" + componentName + "\\\":";
|
||||
StringBuilder array = new StringBuilder();
|
||||
array.append("[");
|
||||
for (int i = 0; i < count; i++) {
|
||||
array.append("\\\"");
|
||||
array.append(componentName);
|
||||
array.append("-");
|
||||
array.append(i);
|
||||
array.append(endpointSuffix);
|
||||
array.append("\\\"");
|
||||
if (i != count - 1) {
|
||||
array.append(",");
|
||||
}
|
||||
}
|
||||
array.append("]");
|
||||
return component + array.toString();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,64 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command;
|
||||
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup;
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Abstract base class for Launch command implementations for Services.
|
||||
* Currently we have launch command implementations
|
||||
* for TensorFlow PS, worker and Tensorboard instances.
|
||||
*/
|
||||
public abstract class AbstractLaunchCommand {
|
||||
private final LaunchScriptBuilder builder;
|
||||
|
||||
public AbstractLaunchCommand(HadoopEnvironmentSetup hadoopEnvSetup,
|
||||
TaskType taskType, Component component, RunJobParameters parameters)
|
||||
throws IOException {
|
||||
Objects.requireNonNull(taskType, "TaskType must not be null!");
|
||||
this.builder = new LaunchScriptBuilder(taskType.name(), hadoopEnvSetup,
|
||||
parameters, component);
|
||||
}
|
||||
|
||||
protected LaunchScriptBuilder getBuilder() {
|
||||
return builder;
|
||||
}
|
||||
|
||||
/**
|
||||
* Subclasses need to defined this method and return a valid launch script.
|
||||
* Implementors can utilize the {@link LaunchScriptBuilder} using
|
||||
* the getBuilder method of this class.
|
||||
* @return The contents of a script.
|
||||
* @throws IOException If any IO issue happens.
|
||||
*/
|
||||
public abstract String generateLaunchScript() throws IOException;
|
||||
|
||||
/**
|
||||
* Subclasses need to provide a service-specific launch command
|
||||
* of the service.
|
||||
* Please note that this method should only return the launch command
|
||||
* but not the whole script.
|
||||
* @return The launch command
|
||||
*/
|
||||
public abstract String createLaunchCommand();
|
||||
|
||||
}
|
|
@ -0,0 +1,67 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command.TensorBoardLaunchCommand;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command.TensorFlowPsLaunchCommand;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command.TensorFlowWorkerLaunchCommand;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Simple factory to create instances of {@link AbstractLaunchCommand}
|
||||
* based on the {@link TaskType}.
|
||||
* All dependencies are passed to this factory that could be required
|
||||
* by any implementor of {@link AbstractLaunchCommand}.
|
||||
*/
|
||||
public class LaunchCommandFactory {
|
||||
private final HadoopEnvironmentSetup hadoopEnvSetup;
|
||||
private final RunJobParameters parameters;
|
||||
private final Configuration yarnConfig;
|
||||
|
||||
public LaunchCommandFactory(HadoopEnvironmentSetup hadoopEnvSetup,
|
||||
RunJobParameters parameters, Configuration yarnConfig) {
|
||||
this.hadoopEnvSetup = hadoopEnvSetup;
|
||||
this.parameters = parameters;
|
||||
this.yarnConfig = yarnConfig;
|
||||
}
|
||||
|
||||
public AbstractLaunchCommand createLaunchCommand(TaskType taskType,
|
||||
Component component) throws IOException {
|
||||
Objects.requireNonNull(taskType, "TaskType must not be null!");
|
||||
|
||||
if (taskType == TaskType.WORKER || taskType == TaskType.PRIMARY_WORKER) {
|
||||
return new TensorFlowWorkerLaunchCommand(hadoopEnvSetup, taskType,
|
||||
component, parameters, yarnConfig);
|
||||
|
||||
} else if (taskType == TaskType.PS) {
|
||||
return new TensorFlowPsLaunchCommand(hadoopEnvSetup, taskType, component,
|
||||
parameters, yarnConfig);
|
||||
|
||||
} else if (taskType == TaskType.TENSORBOARD) {
|
||||
return new TensorBoardLaunchCommand(hadoopEnvSetup, taskType, component,
|
||||
parameters);
|
||||
}
|
||||
throw new IllegalStateException("Unknown task type: " + taskType);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,107 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command;
|
||||
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.PrintWriter;
|
||||
|
||||
import static java.nio.charset.StandardCharsets.UTF_8;
|
||||
|
||||
/**
|
||||
* This class is a builder to conveniently create launch scripts.
|
||||
* All dependencies are provided with the constructor except
|
||||
* the launch command.
|
||||
*/
|
||||
public class LaunchScriptBuilder {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(
|
||||
LaunchScriptBuilder.class);
|
||||
|
||||
private final File file;
|
||||
private final HadoopEnvironmentSetup hadoopEnvSetup;
|
||||
private final RunJobParameters parameters;
|
||||
private final Component component;
|
||||
private final OutputStreamWriter writer;
|
||||
private final StringBuilder scriptBuffer;
|
||||
private String launchCommand;
|
||||
|
||||
LaunchScriptBuilder(String namePrefix,
|
||||
HadoopEnvironmentSetup hadoopEnvSetup, RunJobParameters parameters,
|
||||
Component component) throws IOException {
|
||||
this.file = File.createTempFile(namePrefix + "-launch-script", ".sh");
|
||||
this.hadoopEnvSetup = hadoopEnvSetup;
|
||||
this.parameters = parameters;
|
||||
this.component = component;
|
||||
this.writer = new OutputStreamWriter(new FileOutputStream(file), UTF_8);
|
||||
this.scriptBuffer = new StringBuilder();
|
||||
}
|
||||
|
||||
public void append(String s) {
|
||||
scriptBuffer.append(s);
|
||||
}
|
||||
|
||||
public LaunchScriptBuilder withLaunchCommand(String command) {
|
||||
this.launchCommand = command;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String build() throws IOException {
|
||||
if (launchCommand != null) {
|
||||
append(launchCommand);
|
||||
} else {
|
||||
LOG.warn("LaunchScript object was null!");
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("LaunchScript's Builder object: {}", this);
|
||||
}
|
||||
}
|
||||
|
||||
try (PrintWriter pw = new PrintWriter(writer)) {
|
||||
writeBashHeader(pw);
|
||||
hadoopEnvSetup.addHdfsClassPath(parameters, pw, component);
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("Appending command to launch script: {}", scriptBuffer);
|
||||
}
|
||||
pw.append(scriptBuffer);
|
||||
}
|
||||
return file.getAbsolutePath();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "LaunchScriptBuilder{" +
|
||||
"file=" + file +
|
||||
", hadoopEnvSetup=" + hadoopEnvSetup +
|
||||
", parameters=" + parameters +
|
||||
", component=" + component +
|
||||
", writer=" + writer +
|
||||
", scriptBuffer=" + scriptBuffer +
|
||||
", launchCommand='" + launchCommand + '\'' +
|
||||
'}';
|
||||
}
|
||||
|
||||
private void writeBashHeader(PrintWriter pw) {
|
||||
pw.append("#!/bin/bash\n");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,19 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
/**
|
||||
* This package contains classes to produce launch commands and scripts.
|
||||
*/
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command;
|
|
@ -0,0 +1,109 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.yarn.service.api.ServiceApiConstants;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.submarine.common.Envs;
|
||||
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.YarnServiceUtils;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* This class has common helper methods for TensorFlow.
|
||||
*/
|
||||
public final class TensorFlowCommons {
|
||||
private TensorFlowCommons() {
|
||||
throw new UnsupportedOperationException("This class should not be " +
|
||||
"instantiated!");
|
||||
}
|
||||
|
||||
public static void addCommonEnvironments(Component component,
|
||||
TaskType taskType) {
|
||||
Map<String, String> envs = component.getConfiguration().getEnv();
|
||||
envs.put(Envs.TASK_INDEX_ENV, ServiceApiConstants.COMPONENT_ID);
|
||||
envs.put(Envs.TASK_TYPE_ENV, taskType.name());
|
||||
}
|
||||
|
||||
public static String getUserName() {
|
||||
return System.getProperty("user.name");
|
||||
}
|
||||
|
||||
public static String getDNSDomain(Configuration yarnConfig) {
|
||||
return yarnConfig.get("hadoop.registry.dns.domain-name");
|
||||
}
|
||||
|
||||
public static String getScriptFileName(TaskType taskType) {
|
||||
return "run-" + taskType.name() + ".sh";
|
||||
}
|
||||
|
||||
public static String getTFConfigEnv(String componentName, int nWorkers,
|
||||
int nPs, String serviceName, String userName, String domain) {
|
||||
String commonEndpointSuffix = YarnServiceUtils
|
||||
.getDNSNameCommonSuffix(serviceName, userName, domain, 8000);
|
||||
|
||||
String json = "{\\\"cluster\\\":{";
|
||||
|
||||
String master = getComponentArrayJson("master", 1, commonEndpointSuffix)
|
||||
+ ",";
|
||||
String worker = getComponentArrayJson("worker", nWorkers - 1,
|
||||
commonEndpointSuffix) + ",";
|
||||
String ps = getComponentArrayJson("ps", nPs, commonEndpointSuffix) + "},";
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("\\\"task\\\":{");
|
||||
sb.append(" \\\"type\\\":\\\"");
|
||||
sb.append(componentName);
|
||||
sb.append("\\\",");
|
||||
sb.append(" \\\"index\\\":");
|
||||
sb.append('$');
|
||||
sb.append(Envs.TASK_INDEX_ENV + "},");
|
||||
String task = sb.toString();
|
||||
String environment = "\\\"environment\\\":\\\"cloud\\\"}";
|
||||
|
||||
sb = new StringBuilder();
|
||||
sb.append(json);
|
||||
sb.append(master);
|
||||
sb.append(worker);
|
||||
sb.append(ps);
|
||||
sb.append(task);
|
||||
sb.append(environment);
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private static String getComponentArrayJson(String componentName, int count,
|
||||
String endpointSuffix) {
|
||||
String component = "\\\"" + componentName + "\\\":";
|
||||
StringBuilder array = new StringBuilder();
|
||||
array.append("[");
|
||||
for (int i = 0; i < count; i++) {
|
||||
array.append("\\\"");
|
||||
array.append(componentName);
|
||||
array.append("-");
|
||||
array.append(i);
|
||||
array.append(endpointSuffix);
|
||||
array.append("\\\"");
|
||||
if (i != count - 1) {
|
||||
array.append(",");
|
||||
}
|
||||
}
|
||||
array.append("]");
|
||||
return component + array.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,203 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.service.api.records.KerberosPrincipal;
|
||||
import org.apache.hadoop.yarn.service.api.records.Service;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.Quicklink;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
||||
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
|
||||
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
|
||||
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.ServiceSpec;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.ServiceWrapper;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.YarnServiceUtils;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.LaunchCommandFactory;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.component.TensorBoardComponent;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.component.TensorFlowPsComponent;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.component.TensorFlowWorkerComponent;
|
||||
import org.apache.hadoop.yarn.submarine.utils.KerberosPrincipalFactory;
|
||||
import org.apache.hadoop.yarn.submarine.utils.Localizer;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.getDNSDomain;
|
||||
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.getUserName;
|
||||
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.component.TensorBoardComponent.TENSORBOARD_QUICKLINK_LABEL;
|
||||
import static org.apache.hadoop.yarn.submarine.utils.DockerUtilities.getDockerArtifact;
|
||||
import static org.apache.hadoop.yarn.submarine.utils.EnvironmentUtilities.handleServiceEnvs;
|
||||
|
||||
/**
|
||||
* This class contains all the logic to create an instance
|
||||
* of a {@link Service} object for TensorFlow.
|
||||
* Worker,PS and Tensorboard components are added to the Service
|
||||
* based on the value of the received {@link RunJobParameters}.
|
||||
*/
|
||||
public class TensorFlowServiceSpec implements ServiceSpec {
|
||||
private static final Logger LOG =
|
||||
LoggerFactory.getLogger(TensorFlowServiceSpec.class);
|
||||
|
||||
private final RemoteDirectoryManager remoteDirectoryManager;
|
||||
|
||||
private final RunJobParameters parameters;
|
||||
private final Configuration yarnConfig;
|
||||
private final FileSystemOperations fsOperations;
|
||||
private final LaunchCommandFactory launchCommandFactory;
|
||||
private final Localizer localizer;
|
||||
|
||||
public TensorFlowServiceSpec(RunJobParameters parameters,
|
||||
ClientContext clientContext, FileSystemOperations fsOperations,
|
||||
LaunchCommandFactory launchCommandFactory, Localizer localizer) {
|
||||
this.parameters = parameters;
|
||||
this.remoteDirectoryManager = clientContext.getRemoteDirectoryManager();
|
||||
this.yarnConfig = clientContext.getYarnConfig();
|
||||
this.fsOperations = fsOperations;
|
||||
this.launchCommandFactory = launchCommandFactory;
|
||||
this.localizer = localizer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ServiceWrapper create() throws IOException {
|
||||
ServiceWrapper serviceWrapper = createServiceSpecWrapper();
|
||||
|
||||
if (parameters.getNumWorkers() > 0) {
|
||||
addWorkerComponents(serviceWrapper);
|
||||
}
|
||||
|
||||
if (parameters.getNumPS() > 0) {
|
||||
addPsComponent(serviceWrapper);
|
||||
}
|
||||
|
||||
if (parameters.isTensorboardEnabled()) {
|
||||
createTensorBoardComponent(serviceWrapper);
|
||||
}
|
||||
|
||||
// After all components added, handle quicklinks
|
||||
handleQuicklinks(serviceWrapper.getService());
|
||||
|
||||
return serviceWrapper;
|
||||
}
|
||||
|
||||
private ServiceWrapper createServiceSpecWrapper() throws IOException {
|
||||
Service serviceSpec = new Service();
|
||||
serviceSpec.setName(parameters.getName());
|
||||
serviceSpec.setVersion(String.valueOf(System.currentTimeMillis()));
|
||||
serviceSpec.setArtifact(getDockerArtifact(parameters.getDockerImageName()));
|
||||
|
||||
KerberosPrincipal kerberosPrincipal = KerberosPrincipalFactory
|
||||
.create(fsOperations, remoteDirectoryManager, parameters);
|
||||
if (kerberosPrincipal != null) {
|
||||
serviceSpec.setKerberosPrincipal(kerberosPrincipal);
|
||||
}
|
||||
|
||||
handleServiceEnvs(serviceSpec, yarnConfig, parameters.getEnvars());
|
||||
localizer.handleLocalizations(serviceSpec);
|
||||
return new ServiceWrapper(serviceSpec);
|
||||
}
|
||||
|
||||
private void createTensorBoardComponent(ServiceWrapper serviceWrapper)
|
||||
throws IOException {
|
||||
TensorBoardComponent tbComponent = new TensorBoardComponent(fsOperations,
|
||||
remoteDirectoryManager, parameters, launchCommandFactory, yarnConfig);
|
||||
serviceWrapper.addComponent(tbComponent);
|
||||
|
||||
addQuicklink(serviceWrapper.getService(), TENSORBOARD_QUICKLINK_LABEL,
|
||||
tbComponent.getTensorboardLink());
|
||||
}
|
||||
|
||||
private static void addQuicklink(Service serviceSpec, String label,
|
||||
String link) {
|
||||
Map<String, String> quicklinks = serviceSpec.getQuicklinks();
|
||||
if (quicklinks == null) {
|
||||
quicklinks = new HashMap<>();
|
||||
serviceSpec.setQuicklinks(quicklinks);
|
||||
}
|
||||
|
||||
if (SubmarineLogs.isVerbose()) {
|
||||
LOG.info("Added quicklink, " + label + "=" + link);
|
||||
}
|
||||
|
||||
quicklinks.put(label, link);
|
||||
}
|
||||
|
||||
private void handleQuicklinks(Service serviceSpec)
|
||||
throws IOException {
|
||||
List<Quicklink> quicklinks = parameters.getQuicklinks();
|
||||
if (quicklinks != null && !quicklinks.isEmpty()) {
|
||||
for (Quicklink ql : quicklinks) {
|
||||
// Make sure it is a valid instance name
|
||||
String instanceName = ql.getComponentInstanceName();
|
||||
boolean found = false;
|
||||
|
||||
for (Component comp : serviceSpec.getComponents()) {
|
||||
for (int i = 0; i < comp.getNumberOfContainers(); i++) {
|
||||
String possibleInstanceName = comp.getName() + "-" + i;
|
||||
if (possibleInstanceName.equals(instanceName)) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!found) {
|
||||
throw new IOException(
|
||||
"Couldn't find a component instance = " + instanceName
|
||||
+ " while adding quicklink");
|
||||
}
|
||||
|
||||
String link = ql.getProtocol()
|
||||
+ YarnServiceUtils.getDNSName(serviceSpec.getName(), instanceName,
|
||||
getUserName(), getDNSDomain(yarnConfig), ql.getPort());
|
||||
addQuicklink(serviceSpec, ql.getLabel(), link);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Handle worker and primary_worker.
|
||||
|
||||
private void addWorkerComponents(ServiceWrapper serviceWrapper)
|
||||
throws IOException {
|
||||
addWorkerComponent(serviceWrapper, parameters, TaskType.PRIMARY_WORKER);
|
||||
|
||||
if (parameters.getNumWorkers() > 1) {
|
||||
addWorkerComponent(serviceWrapper, parameters, TaskType.WORKER);
|
||||
}
|
||||
}
|
||||
private void addWorkerComponent(ServiceWrapper serviceWrapper,
|
||||
RunJobParameters parameters, TaskType taskType) throws IOException {
|
||||
serviceWrapper.addComponent(
|
||||
new TensorFlowWorkerComponent(fsOperations, remoteDirectoryManager,
|
||||
parameters, taskType, launchCommandFactory, yarnConfig));
|
||||
}
|
||||
|
||||
private void addPsComponent(ServiceWrapper serviceWrapper)
|
||||
throws IOException {
|
||||
serviceWrapper.addComponent(
|
||||
new TensorFlowPsComponent(fsOperations, remoteDirectoryManager,
|
||||
launchCommandFactory, parameters, yarnConfig));
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,67 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.AbstractLaunchCommand;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Launch command implementation for Tensorboard.
|
||||
*/
|
||||
public class TensorBoardLaunchCommand extends AbstractLaunchCommand {
|
||||
private static final Logger LOG =
|
||||
LoggerFactory.getLogger(TensorBoardLaunchCommand.class);
|
||||
private final String checkpointPath;
|
||||
|
||||
public TensorBoardLaunchCommand(HadoopEnvironmentSetup hadoopEnvSetup,
|
||||
TaskType taskType, Component component, RunJobParameters parameters)
|
||||
throws IOException {
|
||||
super(hadoopEnvSetup, taskType, component, parameters);
|
||||
Objects.requireNonNull(parameters.getCheckpointPath(),
|
||||
"CheckpointPath must not be null as it is part "
|
||||
+ "of the tensorboard command!");
|
||||
if (StringUtils.isEmpty(parameters.getCheckpointPath())) {
|
||||
throw new IllegalArgumentException("CheckpointPath must not be empty!");
|
||||
}
|
||||
|
||||
this.checkpointPath = parameters.getCheckpointPath();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String generateLaunchScript() throws IOException {
|
||||
return getBuilder()
|
||||
.withLaunchCommand(createLaunchCommand())
|
||||
.build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String createLaunchCommand() {
|
||||
String tbCommand = String.format("export LC_ALL=C && tensorboard " +
|
||||
"--logdir=%s%n", checkpointPath);
|
||||
LOG.info("Tensorboard command=" + tbCommand);
|
||||
return tbCommand;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,87 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.AbstractLaunchCommand;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.LaunchScriptBuilder;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Launch command implementation for
|
||||
* TensorFlow PS and Worker Service components.
|
||||
*/
|
||||
public abstract class TensorFlowLaunchCommand extends AbstractLaunchCommand {
|
||||
private static final Logger LOG =
|
||||
LoggerFactory.getLogger(TensorFlowLaunchCommand.class);
|
||||
private final Configuration yarnConfig;
|
||||
private final boolean distributed;
|
||||
private final int numberOfWorkers;
|
||||
private final int numberOfPS;
|
||||
private final String name;
|
||||
private final TaskType taskType;
|
||||
|
||||
TensorFlowLaunchCommand(HadoopEnvironmentSetup hadoopEnvSetup,
|
||||
TaskType taskType, Component component, RunJobParameters parameters,
|
||||
Configuration yarnConfig) throws IOException {
|
||||
super(hadoopEnvSetup, taskType, component, parameters);
|
||||
this.taskType = taskType;
|
||||
this.name = parameters.getName();
|
||||
this.distributed = parameters.isDistributed();
|
||||
this.numberOfWorkers = parameters.getNumWorkers();
|
||||
this.numberOfPS = parameters.getNumPS();
|
||||
this.yarnConfig = yarnConfig;
|
||||
logReceivedParameters();
|
||||
}
|
||||
|
||||
private void logReceivedParameters() {
|
||||
if (this.numberOfWorkers <= 0) {
|
||||
LOG.warn("Received number of workers: {}", this.numberOfWorkers);
|
||||
}
|
||||
if (this.numberOfPS <= 0) {
|
||||
LOG.warn("Received number of PS: {}", this.numberOfPS);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String generateLaunchScript() throws IOException {
|
||||
LaunchScriptBuilder builder = getBuilder();
|
||||
|
||||
// When distributed training is required
|
||||
if (distributed) {
|
||||
String tfConfigEnvValue = TensorFlowCommons.getTFConfigEnv(
|
||||
taskType.getComponentName(), numberOfWorkers,
|
||||
numberOfPS, name,
|
||||
TensorFlowCommons.getUserName(),
|
||||
TensorFlowCommons.getDNSDomain(yarnConfig));
|
||||
String tfConfig = "export TF_CONFIG=\"" + tfConfigEnvValue + "\"\n";
|
||||
builder.append(tfConfig);
|
||||
}
|
||||
|
||||
return builder
|
||||
.withLaunchCommand(createLaunchCommand())
|
||||
.build();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,58 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
|
||||
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Launch command implementation for Tensorboard's PS component.
|
||||
*/
|
||||
public class TensorFlowPsLaunchCommand extends TensorFlowLaunchCommand {
|
||||
private static final Logger LOG =
|
||||
LoggerFactory.getLogger(TensorFlowPsLaunchCommand.class);
|
||||
private final String launchCommand;
|
||||
|
||||
public TensorFlowPsLaunchCommand(HadoopEnvironmentSetup hadoopEnvSetup,
|
||||
TaskType taskType, Component component, RunJobParameters parameters,
|
||||
Configuration yarnConfig) throws IOException {
|
||||
super(hadoopEnvSetup, taskType, component, parameters, yarnConfig);
|
||||
this.launchCommand = parameters.getPSLaunchCmd();
|
||||
|
||||
if (StringUtils.isEmpty(this.launchCommand)) {
|
||||
throw new IllegalArgumentException("LaunchCommand must not be null " +
|
||||
"or empty!");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String createLaunchCommand() {
|
||||
if (SubmarineLogs.isVerbose()) {
|
||||
LOG.info("PS command =[" + launchCommand + "]");
|
||||
}
|
||||
return launchCommand + '\n';
|
||||
}
|
||||
}
|
|
@ -0,0 +1,59 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
|
||||
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Launch command implementation for Tensorboard's Worker component.
|
||||
*/
|
||||
public class TensorFlowWorkerLaunchCommand extends TensorFlowLaunchCommand {
|
||||
private static final Logger LOG =
|
||||
LoggerFactory.getLogger(TensorFlowWorkerLaunchCommand.class);
|
||||
private final String launchCommand;
|
||||
|
||||
public TensorFlowWorkerLaunchCommand(
|
||||
HadoopEnvironmentSetup hadoopEnvSetup, TaskType taskType,
|
||||
Component component, RunJobParameters parameters,
|
||||
Configuration yarnConfig) throws IOException {
|
||||
super(hadoopEnvSetup, taskType, component, parameters, yarnConfig);
|
||||
this.launchCommand = parameters.getWorkerLaunchCmd();
|
||||
|
||||
if (StringUtils.isEmpty(this.launchCommand)) {
|
||||
throw new IllegalArgumentException("LaunchCommand must not be null " +
|
||||
"or empty!");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String createLaunchCommand() {
|
||||
if (SubmarineLogs.isVerbose()) {
|
||||
LOG.info("Worker command =[" + launchCommand + "]");
|
||||
}
|
||||
return launchCommand + '\n';
|
||||
}
|
||||
}
|
|
@ -0,0 +1,19 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
/**
|
||||
* This package contains classes to generate TensorFlow launch commands.
|
||||
*/
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command;
|
|
@ -0,0 +1,96 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.component;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component.RestartPolicyEnum;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
|
||||
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.AbstractComponent;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.YarnServiceUtils;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.LaunchCommandFactory;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
|
||||
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.addCommonEnvironments;
|
||||
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.getDNSDomain;
|
||||
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.getUserName;
|
||||
import static org.apache.hadoop.yarn.submarine.utils.DockerUtilities.getDockerArtifact;
|
||||
import static org.apache.hadoop.yarn.submarine.utils.SubmarineResourceUtils.convertYarnResourceToServiceResource;
|
||||
|
||||
/**
|
||||
* Component implementation for Tensorboard's Tensorboard.
|
||||
*/
|
||||
public class TensorBoardComponent extends AbstractComponent {
|
||||
private static final Logger LOG =
|
||||
LoggerFactory.getLogger(TensorBoardComponent.class);
|
||||
|
||||
public static final String TENSORBOARD_QUICKLINK_LABEL = "Tensorboard";
|
||||
private static final int DEFAULT_PORT = 6006;
|
||||
|
||||
//computed fields
|
||||
private String tensorboardLink;
|
||||
|
||||
public TensorBoardComponent(FileSystemOperations fsOperations,
|
||||
RemoteDirectoryManager remoteDirectoryManager,
|
||||
RunJobParameters parameters,
|
||||
LaunchCommandFactory launchCommandFactory,
|
||||
Configuration yarnConfig) {
|
||||
super(fsOperations, remoteDirectoryManager, parameters,
|
||||
TaskType.TENSORBOARD, yarnConfig, launchCommandFactory);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Component createComponent() throws IOException {
|
||||
Objects.requireNonNull(parameters.getTensorboardResource(),
|
||||
"TensorBoard resource must not be null!");
|
||||
|
||||
Component component = new Component();
|
||||
component.setName(taskType.getComponentName());
|
||||
component.setNumberOfContainers(1L);
|
||||
component.setRestartPolicy(RestartPolicyEnum.NEVER);
|
||||
component.setResource(convertYarnResourceToServiceResource(
|
||||
parameters.getTensorboardResource()));
|
||||
|
||||
if (parameters.getTensorboardDockerImage() != null) {
|
||||
component.setArtifact(
|
||||
getDockerArtifact(parameters.getTensorboardDockerImage()));
|
||||
}
|
||||
|
||||
addCommonEnvironments(component, taskType);
|
||||
generateLaunchCommand(component);
|
||||
|
||||
tensorboardLink = "http://" + YarnServiceUtils.getDNSName(
|
||||
parameters.getName(),
|
||||
taskType.getComponentName() + "-" + 0, getUserName(),
|
||||
getDNSDomain(yarnConfig), DEFAULT_PORT);
|
||||
LOG.info("Link to tensorboard:" + tensorboardLink);
|
||||
|
||||
return component;
|
||||
}
|
||||
|
||||
public String getTensorboardLink() {
|
||||
return tensorboardLink;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,73 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.component;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
|
||||
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.AbstractComponent;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.LaunchCommandFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
|
||||
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.addCommonEnvironments;
|
||||
import static org.apache.hadoop.yarn.submarine.utils.DockerUtilities.getDockerArtifact;
|
||||
import static org.apache.hadoop.yarn.submarine.utils.SubmarineResourceUtils.convertYarnResourceToServiceResource;
|
||||
|
||||
/**
|
||||
* Component implementation for TensorFlow's PS process.
|
||||
*/
|
||||
public class TensorFlowPsComponent extends AbstractComponent {
|
||||
public TensorFlowPsComponent(FileSystemOperations fsOperations,
|
||||
RemoteDirectoryManager remoteDirectoryManager,
|
||||
LaunchCommandFactory launchCommandFactory,
|
||||
RunJobParameters parameters,
|
||||
Configuration yarnConfig) {
|
||||
super(fsOperations, remoteDirectoryManager, parameters, TaskType.PS,
|
||||
yarnConfig, launchCommandFactory);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Component createComponent() throws IOException {
|
||||
Objects.requireNonNull(parameters.getPsResource(),
|
||||
"PS resource must not be null!");
|
||||
if (parameters.getNumPS() < 1) {
|
||||
throw new IllegalArgumentException("Number of PS should be at least 1!");
|
||||
}
|
||||
|
||||
Component component = new Component();
|
||||
component.setName(taskType.getComponentName());
|
||||
component.setNumberOfContainers((long) parameters.getNumPS());
|
||||
component.setRestartPolicy(Component.RestartPolicyEnum.NEVER);
|
||||
component.setResource(
|
||||
convertYarnResourceToServiceResource(parameters.getPsResource()));
|
||||
|
||||
// Override global docker image if needed.
|
||||
if (parameters.getPsDockerImage() != null) {
|
||||
component.setArtifact(
|
||||
getDockerArtifact(parameters.getPsDockerImage()));
|
||||
}
|
||||
addCommonEnvironments(component, taskType);
|
||||
generateLaunchCommand(component);
|
||||
|
||||
return component;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,82 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.component;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
|
||||
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.AbstractComponent;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.LaunchCommandFactory;
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
import static org.apache.hadoop.yarn.service.conf.YarnServiceConstants.CONTAINER_STATE_REPORT_AS_SERVICE_STATE;
|
||||
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.addCommonEnvironments;
|
||||
import static org.apache.hadoop.yarn.submarine.utils.DockerUtilities.getDockerArtifact;
|
||||
import static org.apache.hadoop.yarn.submarine.utils.SubmarineResourceUtils.convertYarnResourceToServiceResource;
|
||||
|
||||
/**
|
||||
* Component implementation for TensorFlow's Worker process.
|
||||
*/
|
||||
public class TensorFlowWorkerComponent extends AbstractComponent {
|
||||
public TensorFlowWorkerComponent(FileSystemOperations fsOperations,
|
||||
RemoteDirectoryManager remoteDirectoryManager,
|
||||
RunJobParameters parameters, TaskType taskType,
|
||||
LaunchCommandFactory launchCommandFactory,
|
||||
Configuration yarnConfig) {
|
||||
super(fsOperations, remoteDirectoryManager, parameters, taskType,
|
||||
yarnConfig, launchCommandFactory);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Component createComponent() throws IOException {
|
||||
Objects.requireNonNull(parameters.getWorkerResource(),
|
||||
"Worker resource must not be null!");
|
||||
if (parameters.getNumWorkers() < 1) {
|
||||
throw new IllegalArgumentException(
|
||||
"Number of workers should be at least 1!");
|
||||
}
|
||||
|
||||
Component component = new Component();
|
||||
component.setName(taskType.getComponentName());
|
||||
|
||||
if (taskType.equals(TaskType.PRIMARY_WORKER)) {
|
||||
component.setNumberOfContainers(1L);
|
||||
component.getConfiguration().setProperty(
|
||||
CONTAINER_STATE_REPORT_AS_SERVICE_STATE, "true");
|
||||
} else {
|
||||
component.setNumberOfContainers(
|
||||
(long) parameters.getNumWorkers() - 1);
|
||||
}
|
||||
|
||||
if (parameters.getWorkerDockerImage() != null) {
|
||||
component.setArtifact(
|
||||
getDockerArtifact(parameters.getWorkerDockerImage()));
|
||||
}
|
||||
|
||||
component.setResource(convertYarnResourceToServiceResource(
|
||||
parameters.getWorkerResource()));
|
||||
component.setRestartPolicy(Component.RestartPolicyEnum.NEVER);
|
||||
|
||||
addCommonEnvironments(component, taskType);
|
||||
generateLaunchCommand(component);
|
||||
|
||||
return component;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,20 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
/**
|
||||
* This package contains classes to generate
|
||||
* TensorFlow Native Service components.
|
||||
*/
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.component;
|
|
@ -0,0 +1,20 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
/**
|
||||
* This package contains classes to generate
|
||||
* TensorFlow-related Native Service runtime artifacts.
|
||||
*/
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow;
|
|
@ -0,0 +1,57 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.utils;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.StringTokenizer;
|
||||
|
||||
/**
 * Utilities for classpath operations.
 */
public final class ClassPathUtilities {
  private ClassPathUtilities() {
    throw new UnsupportedOperationException("This class should not be " +
        "instantiated!");
  }

  /**
   * Searches every entry of the JVM classpath for a file with the given name.
   * For a jar (or any plain-file) entry, the file is looked up next to the
   * entry (in its parent directory); for a directory entry, the file is
   * looked up inside the directory.
   *
   * @param fileName name of the file to look for
   * @return the first matching file (as an absolute file), or
   *         {@code null} if no classpath entry contains it
   */
  public static File findFileOnClassPath(final String fileName) {
    final String classpath = System.getProperty("java.class.path");
    // File.pathSeparator is identical to the "path.separator" property.
    final StringTokenizer tokenizer =
        new StringTokenizer(classpath, File.pathSeparator);

    while (tokenizer.hasMoreTokens()) {
      // Resolve against the absolute form for both file and directory
      // entries, so relative classpath entries behave consistently.
      final File entry = new File(tokenizer.nextToken()).getAbsoluteFile();
      final File searchDir = entry.isFile() ? entry.getParentFile() : entry;
      final File target = new File(searchDir, fileName);
      if (target.exists()) {
        return target;
      }
    }

    return null;
  }
}
|
|
@ -0,0 +1,33 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.utils;
|
||||
|
||||
import org.apache.hadoop.yarn.service.api.records.Artifact;
|
||||
|
||||
/**
|
||||
* Utilities for Docker-related operations.
|
||||
*/
|
||||
public final class DockerUtilities {
|
||||
private DockerUtilities() {
|
||||
throw new UnsupportedOperationException("This class should not be " +
|
||||
"instantiated!");
|
||||
}
|
||||
|
||||
public static Artifact getDockerArtifact(String dockerImageName) {
|
||||
return new Artifact().type(Artifact.TypeEnum.DOCKER).id(dockerImageName);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,120 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.utils;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.yarn.service.api.records.Service;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION;
|
||||
|
||||
/**
|
||||
* Utilities for environment variable related operations
|
||||
* for {@link Service} objects.
|
||||
*/
|
||||
public final class EnvironmentUtilities {
|
||||
private EnvironmentUtilities() {
|
||||
throw new UnsupportedOperationException("This class should not be " +
|
||||
"instantiated!");
|
||||
}
|
||||
|
||||
private static final Logger LOG =
|
||||
LoggerFactory.getLogger(EnvironmentUtilities.class);
|
||||
|
||||
static final String ENV_DOCKER_MOUNTS_FOR_CONTAINER_RUNTIME =
|
||||
"YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS";
|
||||
private static final String MOUNTS_DELIM = ",";
|
||||
private static final String ENV_SEPARATOR = "=";
|
||||
private static final String ETC_PASSWD_MOUNT_STRING =
|
||||
"/etc/passwd:/etc/passwd:ro";
|
||||
private static final String KERBEROS_CONF_MOUNT_STRING =
|
||||
"/etc/krb5.conf:/etc/krb5.conf:ro";
|
||||
private static final String ENV_VAR_DELIM = ":";
|
||||
|
||||
/**
|
||||
* Extracts value from a string representation of an environment variable.
|
||||
* @param envVar The environment variable in 'key=value' format.
|
||||
* @return The value of the environment variable
|
||||
*/
|
||||
public static String getValueOfEnvironment(String envVar) {
|
||||
if (envVar == null || !envVar.contains(ENV_SEPARATOR)) {
|
||||
return "";
|
||||
} else {
|
||||
return envVar.substring(envVar.indexOf(ENV_SEPARATOR) + 1);
|
||||
}
|
||||
}
|
||||
|
||||
public static void handleServiceEnvs(Service service,
|
||||
Configuration yarnConfig, List<String> envVars) {
|
||||
if (envVars != null) {
|
||||
for (String envVarPair : envVars) {
|
||||
String key, value;
|
||||
if (envVarPair.contains(ENV_SEPARATOR)) {
|
||||
int idx = envVarPair.indexOf(ENV_SEPARATOR);
|
||||
key = envVarPair.substring(0, idx);
|
||||
value = envVarPair.substring(idx + 1);
|
||||
} else {
|
||||
LOG.warn("Found environment variable with unusual format: '{}'",
|
||||
envVarPair);
|
||||
// No "=" found so use the whole key
|
||||
key = envVarPair;
|
||||
value = "";
|
||||
}
|
||||
appendToEnv(service, key, value, ENV_VAR_DELIM);
|
||||
}
|
||||
}
|
||||
appendOtherConfigs(service, yarnConfig);
|
||||
}
|
||||
|
||||
/**
|
||||
* Appends other configs like /etc/passwd, /etc/krb5.conf.
|
||||
* @param service
|
||||
* @param yarnConfig
|
||||
*/
|
||||
private static void appendOtherConfigs(Service service,
|
||||
Configuration yarnConfig) {
|
||||
appendToEnv(service, ENV_DOCKER_MOUNTS_FOR_CONTAINER_RUNTIME,
|
||||
ETC_PASSWD_MOUNT_STRING, MOUNTS_DELIM);
|
||||
|
||||
String authentication = yarnConfig.get(HADOOP_SECURITY_AUTHENTICATION);
|
||||
if (authentication != null && authentication.equals("kerberos")) {
|
||||
appendToEnv(service, ENV_DOCKER_MOUNTS_FOR_CONTAINER_RUNTIME,
|
||||
KERBEROS_CONF_MOUNT_STRING, MOUNTS_DELIM);
|
||||
}
|
||||
}
|
||||
|
||||
static void appendToEnv(Service service, String key, String value,
|
||||
String delim) {
|
||||
Map<String, String> env = service.getConfiguration().getEnv();
|
||||
if (!env.containsKey(key)) {
|
||||
env.put(key, value);
|
||||
} else {
|
||||
if (!value.isEmpty()) {
|
||||
String existingValue = env.get(key);
|
||||
if (!existingValue.endsWith(delim)) {
|
||||
env.put(key, existingValue + delim + value);
|
||||
} else {
|
||||
env.put(key, existingValue + value);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,95 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.utils;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.permission.FsPermission;
|
||||
import org.apache.hadoop.yarn.service.api.records.KerberosPrincipal;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Simple factory that creates a {@link KerberosPrincipal}.
|
||||
*/
|
||||
public final class KerberosPrincipalFactory {
|
||||
private KerberosPrincipalFactory() {
|
||||
throw new UnsupportedOperationException("This class should not be " +
|
||||
"instantiated!");
|
||||
}
|
||||
|
||||
private static final Logger LOG =
|
||||
LoggerFactory.getLogger(KerberosPrincipalFactory.class);
|
||||
|
||||
public static KerberosPrincipal create(FileSystemOperations fsOperations,
|
||||
RemoteDirectoryManager remoteDirectoryManager,
|
||||
RunJobParameters parameters) throws IOException {
|
||||
Objects.requireNonNull(fsOperations,
|
||||
"FileSystemOperations must not be null!");
|
||||
Objects.requireNonNull(remoteDirectoryManager,
|
||||
"RemoteDirectoryManager must not be null!");
|
||||
Objects.requireNonNull(parameters, "Parameters must not be null!");
|
||||
|
||||
if (StringUtils.isNotBlank(parameters.getKeytab()) && StringUtils
|
||||
.isNotBlank(parameters.getPrincipal())) {
|
||||
String keytab = parameters.getKeytab();
|
||||
String principal = parameters.getPrincipal();
|
||||
if (parameters.isDistributeKeytab()) {
|
||||
return handleDistributedKeytab(fsOperations, remoteDirectoryManager,
|
||||
parameters, keytab, principal);
|
||||
} else {
|
||||
return handleNormalKeytab(keytab, principal);
|
||||
}
|
||||
}
|
||||
LOG.debug("Principal and keytab was null or empty, " +
|
||||
"returning null KerberosPrincipal!");
|
||||
return null;
|
||||
}
|
||||
|
||||
private static KerberosPrincipal handleDistributedKeytab(
|
||||
FileSystemOperations fsOperations,
|
||||
RemoteDirectoryManager remoteDirectoryManager,
|
||||
RunJobParameters parameters, String keytab, String principal)
|
||||
throws IOException {
|
||||
Path stagingDir = remoteDirectoryManager
|
||||
.getJobStagingArea(parameters.getName(), true);
|
||||
Path remoteKeytabPath =
|
||||
fsOperations.uploadToRemoteFile(stagingDir, keytab);
|
||||
// Only the owner has read access
|
||||
fsOperations.setPermission(remoteKeytabPath,
|
||||
FsPermission.createImmutable((short)Integer.parseInt("400", 8)));
|
||||
return new KerberosPrincipal()
|
||||
.keytab(remoteKeytabPath.toString())
|
||||
.principalName(principal);
|
||||
}
|
||||
|
||||
private static KerberosPrincipal handleNormalKeytab(String keytab,
|
||||
String principal) {
|
||||
if(!keytab.startsWith("file")) {
|
||||
keytab = "file://" + keytab;
|
||||
}
|
||||
return new KerberosPrincipal()
|
||||
.keytab(keytab)
|
||||
.principalName(principal);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,170 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.utils;
|
||||
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.yarn.service.api.records.ConfigFile;
|
||||
import org.apache.hadoop.yarn.service.api.records.Service;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.Localization;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations.needHdfs;
|
||||
import static org.apache.hadoop.yarn.submarine.utils.EnvironmentUtilities.appendToEnv;
|
||||
|
||||
/**
|
||||
* This class holds all dependencies in order to localize dependencies
|
||||
* for containers.
|
||||
*/
|
||||
public class Localizer {
  private static final Logger LOG = LoggerFactory.getLogger(Localizer.class);

  // Collaborators injected by the submitter; all used read-only here.
  private final FileSystemOperations fsOperations;
  private final RemoteDirectoryManager remoteDirectoryManager;
  private final RunJobParameters parameters;

  /**
   * Creates a Localizer operating on the given parameters' localizations.
   *
   * @param fsOperations file system helper used for upload/download/zip
   * @param remoteDirectoryManager resolves staging dirs and remote URIs
   * @param parameters run job parameters holding the localization list
   */
  public Localizer(FileSystemOperations fsOperations,
      RemoteDirectoryManager remoteDirectoryManager,
      RunJobParameters parameters) {
    this.fsOperations = fsOperations;
    this.remoteDirectoryManager = remoteDirectoryManager;
    this.parameters = parameters;
  }

  /**
   * Localize dependencies for all containers.
   * If remoteUri is a local directory,
   * we'll zip it, upload to HDFS staging dir HDFS.
   * If remoteUri is directory, we'll download it, zip it and upload
   * to HDFS.
   * If localFilePath is ".", we'll use remoteUri's file/dir name
   *
   * @param service service whose config files and env mounts are extended
   * @throws FileNotFoundException if any localization source is missing
   * @throws IOException on download/zip/upload failures
   * */
  public void handleLocalizations(Service service)
      throws IOException {
    // Handle localizations
    Path stagingDir =
        remoteDirectoryManager.getJobStagingArea(
            parameters.getName(), true);
    List<Localization> localizations = parameters.getLocalizations();
    String remoteUri;
    String containerLocalPath;

    // First pass: validate every source before doing any expensive
    // transfer, so a bad entry fails fast without partial uploads.
    for (Localization loc : localizations) {
      remoteUri = loc.getRemoteUri();
      Path resourceToLocalize = new Path(remoteUri);
      // Check if remoteUri exists
      if (remoteDirectoryManager.isRemote(remoteUri)) {
        // check if exists
        if (!remoteDirectoryManager.existsRemoteFile(resourceToLocalize)) {
          throw new FileNotFoundException(
              "File " + remoteUri + " doesn't exists.");
        }
      } else {
        // Check if exists
        File localFile = new File(remoteUri);
        if (!localFile.exists()) {
          throw new FileNotFoundException(
              "File " + remoteUri + " doesn't exists.");
        }
      }
      // check remote file size (enforces the configured localization limit)
      fsOperations.validFileSize(remoteUri);
    }
    // Second pass: download remote sources if needed and upload to HDFS.
    for (Localization loc : localizations) {
      remoteUri = loc.getRemoteUri();
      containerLocalPath = loc.getLocalPath();
      String srcFileStr = remoteUri;
      ConfigFile.TypeEnum destFileType = ConfigFile.TypeEnum.STATIC;
      Path resourceToLocalize = new Path(remoteUri);
      boolean needUploadToHDFS = true;


      // Special handling of remoteUri directory: directories are zipped
      // and localized as ARCHIVE so YARN unpacks them in the container.
      boolean needDeleteTempFile = false;
      if (remoteDirectoryManager.isDir(remoteUri)) {
        destFileType = ConfigFile.TypeEnum.ARCHIVE;
        srcFileStr = fsOperations.downloadAndZip(
            remoteUri, getLastNameFromPath(srcFileStr), true);
      } else if (remoteDirectoryManager.isRemote(remoteUri)) {
        if (!needHdfs(remoteUri)) {
          // Non HDFS remote uri. Non directory, no need to zip
          srcFileStr = fsOperations.downloadAndZip(
              remoteUri, getLastNameFromPath(srcFileStr), false);
          needDeleteTempFile = true;
        } else {
          // HDFS file, no need to upload
          needUploadToHDFS = false;
        }
      }

      // Upload file to HDFS
      if (needUploadToHDFS) {
        resourceToLocalize =
            fsOperations.uploadToRemoteFile(stagingDir, srcFileStr);
      }
      if (needDeleteTempFile) {
        fsOperations.deleteFiles(srcFileStr);
      }
      // Remove .zip from zipped dir name
      if (destFileType == ConfigFile.TypeEnum.ARCHIVE
          && srcFileStr.endsWith(".zip")) {
        // Delete local zip file
        fsOperations.deleteFiles(srcFileStr);
        // NOTE(review): assumes downloadAndZip names the archive
        // "<name>_<suffix>.zip" — stripping at the last '_' recovers the
        // original name. Verify against FileSystemOperations.
        int suffixIndex = srcFileStr.lastIndexOf('_');
        srcFileStr = srcFileStr.substring(0, suffixIndex);
      }
      // If provided, use the name of local uri
      if (!containerLocalPath.equals(".")
          && !containerLocalPath.equals("./")) {
        // Change the YARN localized file name to the one that will be
        // used inside the container
        srcFileStr = getLastNameFromPath(containerLocalPath);
      }
      String localizedName = getLastNameFromPath(srcFileStr);
      LOG.info("The file/dir to be localized is {}",
          resourceToLocalize.toString());
      LOG.info("Its localized file name will be {}", localizedName);
      service.getConfiguration().getFiles().add(new ConfigFile().srcFile(
          resourceToLocalize.toUri().toString()).destFile(localizedName)
          .type(destFileType));
      // set mounts
      // if mount path is absolute, just use it.
      // if relative, no need to mount explicitly
      if (containerLocalPath.startsWith("/")) {
        String mountStr = getLastNameFromPath(srcFileStr) + ":"
            + containerLocalPath + ":" + loc.getMountPermission();
        LOG.info("Add bind-mount string {}", mountStr);
        appendToEnv(service,
            EnvironmentUtilities.ENV_DOCKER_MOUNTS_FOR_CONTAINER_RUNTIME,
            mountStr, ",");
      }
    }
  }

  // Returns the last path component (base name) of the given path string.
  private String getLastNameFromPath(String srcFileStr) {
    return new Path(srcFileStr).getName();
  }
}
|
|
@ -0,0 +1,51 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.utils;
|
||||
|
||||
import org.apache.hadoop.yarn.service.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.service.api.records.ResourceInformation;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Resource utilities for Submarine.
|
||||
*/
|
||||
public final class SubmarineResourceUtils {
|
||||
private SubmarineResourceUtils() {
|
||||
throw new UnsupportedOperationException("This class should not be " +
|
||||
"instantiated!");
|
||||
}
|
||||
|
||||
public static Resource convertYarnResourceToServiceResource(
|
||||
org.apache.hadoop.yarn.api.records.Resource yarnResource) {
|
||||
Resource serviceResource = new Resource();
|
||||
serviceResource.setCpus(yarnResource.getVirtualCores());
|
||||
serviceResource.setMemory(String.valueOf(yarnResource.getMemorySize()));
|
||||
|
||||
Map<String, ResourceInformation> riMap = new HashMap<>();
|
||||
for (org.apache.hadoop.yarn.api.records.ResourceInformation ri :
|
||||
yarnResource.getAllResourcesListCopy()) {
|
||||
ResourceInformation serviceRi = new ResourceInformation();
|
||||
serviceRi.setValue(ri.getValue());
|
||||
serviceRi.setUnit(ri.getUnits());
|
||||
riMap.put(ri.getName(), serviceRi);
|
||||
}
|
||||
serviceResource.setResourceInformations(riMap);
|
||||
|
||||
return serviceResource;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,82 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.utils;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipOutputStream;
|
||||
|
||||
/**
|
||||
* Utilities for zipping directories and adding existing directories to zips.
|
||||
*/
|
||||
public final class ZipUtilities {
|
||||
private ZipUtilities() {
|
||||
throw new UnsupportedOperationException("This class should not be " +
|
||||
"instantiated!");
|
||||
}
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(ZipUtilities.class);
|
||||
|
||||
@VisibleForTesting
|
||||
public static String zipDir(String srcDir, String dstFile)
|
||||
throws IOException {
|
||||
FileOutputStream fos = new FileOutputStream(dstFile);
|
||||
ZipOutputStream zos = new ZipOutputStream(fos);
|
||||
File srcFile = new File(srcDir);
|
||||
LOG.info("Compressing directory {}", srcDir);
|
||||
addDirToZip(zos, srcFile, srcFile);
|
||||
// close the ZipOutputStream
|
||||
zos.close();
|
||||
LOG.info("Compressed directory {} to file: {}", srcDir, dstFile);
|
||||
return dstFile;
|
||||
}
|
||||
|
||||
private static void addDirToZip(ZipOutputStream zos, File srcFile, File base)
|
||||
throws IOException {
|
||||
File[] files = srcFile.listFiles();
|
||||
if (files == null) {
|
||||
return;
|
||||
}
|
||||
for (File file : files) {
|
||||
// if it's directory, add recursively
|
||||
if (file.isDirectory()) {
|
||||
addDirToZip(zos, file, base);
|
||||
continue;
|
||||
}
|
||||
byte[] buffer = new byte[1024];
|
||||
try(FileInputStream fis = new FileInputStream(file)) {
|
||||
String name = base.toURI().relativize(file.toURI()).getPath();
|
||||
LOG.info("Adding file {} to zip", name);
|
||||
zos.putNextEntry(new ZipEntry(name));
|
||||
int length;
|
||||
while ((length = fis.read(buffer)) > 0) {
|
||||
zos.write(buffer, 0, length);
|
||||
}
|
||||
zos.flush();
|
||||
} finally {
|
||||
zos.closeEntry();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,19 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
/**
|
||||
 * This package contains utility classes.
|
||||
*/
|
||||
package org.apache.hadoop.yarn.submarine.utils;
|
|
@ -0,0 +1,146 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
/**
|
||||
* File utilities for tests.
|
||||
* Provides methods that can create, delete files or directories
|
||||
* in a temp directory, or any specified directory.
|
||||
*/
|
||||
public class FileUtilitiesForTests {
|
||||
private static final Logger LOG =
|
||||
LoggerFactory.getLogger(FileUtilitiesForTests.class);
|
||||
private String tempDir;
|
||||
private List<File> cleanupFiles;
|
||||
|
||||
public void setup() {
|
||||
cleanupFiles = Lists.newArrayList();
|
||||
tempDir = System.getProperty("java.io.tmpdir");
|
||||
}
|
||||
|
||||
public void teardown() throws IOException {
|
||||
LOG.info("About to clean up files: " + cleanupFiles);
|
||||
List<File> dirs = Lists.newArrayList();
|
||||
for (File cleanupFile : cleanupFiles) {
|
||||
if (cleanupFile.isDirectory()) {
|
||||
dirs.add(cleanupFile);
|
||||
} else {
|
||||
deleteFile(cleanupFile);
|
||||
}
|
||||
}
|
||||
|
||||
for (File dir : dirs) {
|
||||
deleteFile(dir);
|
||||
}
|
||||
}
|
||||
|
||||
public File createFileInTempDir(String filename) throws IOException {
|
||||
File file = new File(tempDir, new Path(filename).getName());
|
||||
createFile(file);
|
||||
return file;
|
||||
}
|
||||
|
||||
public File createDirInTempDir(String dirName) {
|
||||
File file = new File(tempDir, new Path(dirName).getName());
|
||||
createDirectory(file);
|
||||
return file;
|
||||
}
|
||||
|
||||
public File createFileInDir(Path dir, String filename) throws IOException {
|
||||
File dirTmp = new File(dir.toUri().getPath());
|
||||
if (!dirTmp.exists()) {
|
||||
createDirectory(dirTmp);
|
||||
}
|
||||
File file =
|
||||
new File(dir.toUri().getPath() + "/" + new Path(filename).getName());
|
||||
createFile(file);
|
||||
return file;
|
||||
}
|
||||
|
||||
public File createFileInDir(File dir, String filename) throws IOException {
|
||||
if (!dir.exists()) {
|
||||
createDirectory(dir);
|
||||
}
|
||||
File file = new File(dir, filename);
|
||||
createFile(file);
|
||||
return file;
|
||||
}
|
||||
|
||||
public File createDirectory(Path parent, String dirname) {
|
||||
File dir =
|
||||
new File(parent.toUri().getPath() + "/" + new Path(dirname).getName());
|
||||
createDirectory(dir);
|
||||
return dir;
|
||||
}
|
||||
|
||||
public File createDirectory(File parent, String dirname) {
|
||||
File dir =
|
||||
new File(parent.getPath() + "/" + new Path(dirname).getName());
|
||||
createDirectory(dir);
|
||||
return dir;
|
||||
}
|
||||
|
||||
private void createDirectory(File dir) {
|
||||
boolean result = dir.mkdir();
|
||||
assertTrue("Failed to create directory " + dir.getAbsolutePath(), result);
|
||||
assertTrue("Directory does not exist: " + dir.getAbsolutePath(),
|
||||
dir.exists());
|
||||
this.cleanupFiles.add(dir);
|
||||
}
|
||||
|
||||
private void createFile(File file) throws IOException {
|
||||
boolean result = file.createNewFile();
|
||||
assertTrue("Failed to create file " + file.getAbsolutePath(), result);
|
||||
assertTrue("File does not exist: " + file.getAbsolutePath(), file.exists());
|
||||
this.cleanupFiles.add(file);
|
||||
}
|
||||
|
||||
private static void deleteFile(File file) throws IOException {
|
||||
if (file.isDirectory()) {
|
||||
LOG.info("Removing directory: " + file.getAbsolutePath());
|
||||
FileUtils.deleteDirectory(file);
|
||||
}
|
||||
|
||||
if (file.exists()) {
|
||||
LOG.info("Removing file: " + file.getAbsolutePath());
|
||||
boolean result = file.delete();
|
||||
assertTrue("Deletion of file " + file.getAbsolutePath()
|
||||
+ " was not successful!", result);
|
||||
}
|
||||
}
|
||||
|
||||
public File getTempFileWithName(String filename) {
|
||||
return new File(tempDir + "/" + new Path(filename).getName());
|
||||
}
|
||||
|
||||
public static File getFilename(Path parent, String filename) {
|
||||
return new File(
|
||||
parent.toUri().getPath() + "/" + new Path(filename).getName());
|
||||
}
|
||||
}
|
|
@ -0,0 +1,139 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.client.cli.yarnservice;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
class ParamBuilderForTest {
|
||||
private final List<String> params = Lists.newArrayList();
|
||||
|
||||
static ParamBuilderForTest create() {
|
||||
return new ParamBuilderForTest();
|
||||
}
|
||||
|
||||
ParamBuilderForTest withJobName(String name) {
|
||||
params.add("--name");
|
||||
params.add(name);
|
||||
return this;
|
||||
}
|
||||
|
||||
ParamBuilderForTest withDockerImage(String dockerImage) {
|
||||
params.add("--docker_image");
|
||||
params.add(dockerImage);
|
||||
return this;
|
||||
}
|
||||
|
||||
ParamBuilderForTest withInputPath(String inputPath) {
|
||||
params.add("--input_path");
|
||||
params.add(inputPath);
|
||||
return this;
|
||||
}
|
||||
|
||||
ParamBuilderForTest withCheckpointPath(String checkpointPath) {
|
||||
params.add("--checkpoint_path");
|
||||
params.add(checkpointPath);
|
||||
return this;
|
||||
}
|
||||
|
||||
ParamBuilderForTest withNumberOfWorkers(int numWorkers) {
|
||||
params.add("--num_workers");
|
||||
params.add(String.valueOf(numWorkers));
|
||||
return this;
|
||||
}
|
||||
|
||||
ParamBuilderForTest withNumberOfPs(int numPs) {
|
||||
params.add("--num_ps");
|
||||
params.add(String.valueOf(numPs));
|
||||
return this;
|
||||
}
|
||||
|
||||
ParamBuilderForTest withWorkerLaunchCommand(String launchCommand) {
|
||||
params.add("--worker_launch_cmd");
|
||||
params.add(launchCommand);
|
||||
return this;
|
||||
}
|
||||
|
||||
ParamBuilderForTest withPsLaunchCommand(String launchCommand) {
|
||||
params.add("--ps_launch_cmd");
|
||||
params.add(launchCommand);
|
||||
return this;
|
||||
}
|
||||
|
||||
ParamBuilderForTest withWorkerResources(String workerResources) {
|
||||
params.add("--worker_resources");
|
||||
params.add(workerResources);
|
||||
return this;
|
||||
}
|
||||
|
||||
ParamBuilderForTest withPsResources(String psResources) {
|
||||
params.add("--ps_resources");
|
||||
params.add(psResources);
|
||||
return this;
|
||||
}
|
||||
|
||||
ParamBuilderForTest withWorkerDockerImage(String dockerImage) {
|
||||
params.add("--worker_docker_image");
|
||||
params.add(dockerImage);
|
||||
return this;
|
||||
}
|
||||
|
||||
ParamBuilderForTest withPsDockerImage(String dockerImage) {
|
||||
params.add("--ps_docker_image");
|
||||
params.add(dockerImage);
|
||||
return this;
|
||||
}
|
||||
|
||||
ParamBuilderForTest withVerbose() {
|
||||
params.add("--verbose");
|
||||
return this;
|
||||
}
|
||||
|
||||
ParamBuilderForTest withTensorboard() {
|
||||
params.add("--tensorboard");
|
||||
return this;
|
||||
}
|
||||
|
||||
ParamBuilderForTest withTensorboardResources(String resources) {
|
||||
params.add("--tensorboard_resources");
|
||||
params.add(resources);
|
||||
return this;
|
||||
}
|
||||
|
||||
ParamBuilderForTest withTensorboardDockerImage(String dockerImage) {
|
||||
params.add("--tensorboard_docker_image");
|
||||
params.add(dockerImage);
|
||||
return this;
|
||||
}
|
||||
|
||||
ParamBuilderForTest withQuickLink(String quickLink) {
|
||||
params.add("--quicklink");
|
||||
params.add(quickLink);
|
||||
return this;
|
||||
}
|
||||
|
||||
ParamBuilderForTest withLocalization(String remoteUrl, String localUrl) {
|
||||
params.add("--localization");
|
||||
params.add(remoteUrl + ":" + localUrl);
|
||||
return this;
|
||||
}
|
||||
|
||||
String[] build() {
|
||||
return params.toArray(new String[0]);
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,79 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.client.cli.yarnservice;
|
||||
|
||||
import org.apache.hadoop.yarn.client.api.AppAdminClient;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.apache.hadoop.yarn.service.api.records.Service;
|
||||
import org.apache.hadoop.yarn.submarine.FileUtilitiesForTests;
|
||||
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.common.JobSubmitter;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.YarnServiceJobSubmitter;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.YarnServiceUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import static org.apache.hadoop.yarn.service.exceptions.LauncherExitCodes.EXIT_SUCCESS;
|
||||
import static org.mockito.ArgumentMatchers.any;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
/**
|
||||
* Common operations shared with test classes using Run job-related actions.
|
||||
*/
|
||||
public class TestYarnServiceRunJobCliCommons {
|
||||
static final String DEFAULT_JOB_NAME = "my-job";
|
||||
static final String DEFAULT_DOCKER_IMAGE = "tf-docker:1.1.0";
|
||||
static final String DEFAULT_INPUT_PATH = "s3://input";
|
||||
static final String DEFAULT_CHECKPOINT_PATH = "s3://output";
|
||||
static final String DEFAULT_WORKER_DOCKER_IMAGE = "worker.image";
|
||||
static final String DEFAULT_PS_DOCKER_IMAGE = "ps.image";
|
||||
static final String DEFAULT_WORKER_LAUNCH_CMD = "python run-job.py";
|
||||
static final String DEFAULT_PS_LAUNCH_CMD = "python run-ps.py";
|
||||
static final String DEFAULT_TENSORBOARD_RESOURCES = "memory=2G,vcores=2";
|
||||
static final String DEFAULT_WORKER_RESOURCES = "memory=2048M,vcores=2";
|
||||
static final String DEFAULT_PS_RESOURCES = "memory=4096M,vcores=4";
|
||||
static final String DEFAULT_TENSORBOARD_DOCKER_IMAGE = "tb_docker_image:001";
|
||||
|
||||
private FileUtilitiesForTests fileUtils = new FileUtilitiesForTests();
|
||||
|
||||
void setup() throws IOException, YarnException {
|
||||
SubmarineLogs.verboseOff();
|
||||
AppAdminClient serviceClient = mock(AppAdminClient.class);
|
||||
when(serviceClient.actionLaunch(any(String.class), any(String.class),
|
||||
any(Long.class), any(String.class))).thenReturn(EXIT_SUCCESS);
|
||||
when(serviceClient.getStatusString(any(String.class))).thenReturn(
|
||||
"{\"id\": \"application_1234_1\"}");
|
||||
YarnServiceUtils.setStubServiceClient(serviceClient);
|
||||
|
||||
fileUtils.setup();
|
||||
}
|
||||
|
||||
void teardown() throws IOException {
|
||||
fileUtils.teardown();
|
||||
}
|
||||
|
||||
FileUtilitiesForTests getFileUtils() {
|
||||
return fileUtils;
|
||||
}
|
||||
|
||||
Service getServiceSpecFromJobSubmitter(JobSubmitter jobSubmitter) {
|
||||
return ((YarnServiceJobSubmitter) jobSubmitter).getServiceWrapper()
|
||||
.getService();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,599 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.client.cli.yarnservice;
|
||||
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.apache.hadoop.yarn.service.api.records.ConfigFile;
|
||||
import org.apache.hadoop.yarn.service.api.records.Service;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.RunJobCli;
|
||||
import org.apache.hadoop.yarn.submarine.common.MockClientContext;
|
||||
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineConfiguration;
|
||||
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
|
||||
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
|
||||
import static org.apache.hadoop.yarn.submarine.client.cli.yarnservice.TestYarnServiceRunJobCliCommons.*;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.junit.Assert.fail;
|
||||
import static org.mockito.ArgumentMatchers.anyString;
|
||||
import static org.mockito.Mockito.reset;
|
||||
import static org.mockito.Mockito.spy;
|
||||
import static org.mockito.Mockito.times;
|
||||
import static org.mockito.Mockito.verify;
|
||||
|
||||
/**
|
||||
* Class to test YarnService localization feature with the Run job CLI action.
|
||||
*/
|
||||
public class TestYarnServiceRunJobCliLocalization {
|
||||
private static final String ZIP_EXTENSION = ".zip";
|
||||
private TestYarnServiceRunJobCliCommons testCommons =
|
||||
new TestYarnServiceRunJobCliCommons();
|
||||
private MockClientContext mockClientContext;
|
||||
private RemoteDirectoryManager spyRdm;
|
||||
|
||||
@Before
|
||||
public void before() throws IOException, YarnException {
|
||||
testCommons.setup();
|
||||
mockClientContext = YarnServiceCliTestUtils.getMockClientContext();
|
||||
spyRdm = setupSpyRemoteDirManager();
|
||||
}
|
||||
|
||||
@After
|
||||
public void cleanup() throws IOException {
|
||||
testCommons.teardown();
|
||||
}
|
||||
|
||||
private ParamBuilderForTest createCommonParamsBuilder() {
|
||||
return ParamBuilderForTest.create()
|
||||
.withJobName(DEFAULT_JOB_NAME)
|
||||
.withDockerImage(DEFAULT_DOCKER_IMAGE)
|
||||
.withInputPath(DEFAULT_INPUT_PATH)
|
||||
.withCheckpointPath(DEFAULT_CHECKPOINT_PATH)
|
||||
.withNumberOfWorkers(3)
|
||||
.withWorkerDockerImage(DEFAULT_WORKER_DOCKER_IMAGE)
|
||||
.withWorkerLaunchCommand(DEFAULT_WORKER_LAUNCH_CMD)
|
||||
.withWorkerResources(DEFAULT_WORKER_RESOURCES)
|
||||
.withNumberOfPs(2)
|
||||
.withPsDockerImage(DEFAULT_PS_DOCKER_IMAGE)
|
||||
.withPsLaunchCommand(DEFAULT_PS_LAUNCH_CMD)
|
||||
.withPsResources(DEFAULT_PS_RESOURCES)
|
||||
.withVerbose();
|
||||
}
|
||||
|
||||
private void assertFilesAreDeleted(File... files) {
|
||||
for (File file : files) {
|
||||
assertFalse("File should be deleted: " + file.getAbsolutePath(),
|
||||
file.exists());
|
||||
}
|
||||
}
|
||||
|
||||
private RemoteDirectoryManager setupSpyRemoteDirManager() {
|
||||
RemoteDirectoryManager spyRdm =
|
||||
spy(mockClientContext.getRemoteDirectoryManager());
|
||||
mockClientContext.setRemoteDirectoryMgr(spyRdm);
|
||||
return spyRdm;
|
||||
}
|
||||
|
||||
private Path getStagingDir() throws IOException {
|
||||
return mockClientContext.getRemoteDirectoryManager()
|
||||
.getJobStagingArea(DEFAULT_JOB_NAME, true);
|
||||
}
|
||||
|
||||
private RunJobCli createRunJobCliWithoutVerboseAssertion() {
|
||||
return new RunJobCli(mockClientContext);
|
||||
}
|
||||
|
||||
private RunJobCli createRunJobCli() {
|
||||
RunJobCli runJobCli = new RunJobCli(mockClientContext);
|
||||
assertFalse(SubmarineLogs.isVerbose());
|
||||
return runJobCli;
|
||||
}
|
||||
|
||||
private String getFilePath(String localUrl, Path stagingDir) {
|
||||
return stagingDir.toUri().getPath()
|
||||
+ "/" + new Path(localUrl).getName();
|
||||
}
|
||||
|
||||
private String getFilePathWithSuffix(Path stagingDir, String localUrl,
|
||||
String suffix) {
|
||||
return stagingDir.toUri().getPath() + "/" + new Path(localUrl).getName()
|
||||
+ suffix;
|
||||
}
|
||||
|
||||
private void assertConfigFile(ConfigFile expected, ConfigFile actual) {
|
||||
assertEquals("ConfigFile does not equal to expected!", expected, actual);
|
||||
}
|
||||
|
||||
private void assertNumberOfLocalizations(List<ConfigFile> files,
|
||||
int expected) {
|
||||
assertEquals("Number of localizations is not the expected!", expected,
|
||||
files.size());
|
||||
}
|
||||
|
||||
private void verifyRdmCopyToRemoteLocalCalls(int expectedCalls)
|
||||
throws IOException {
|
||||
verify(spyRdm, times(expectedCalls)).copyRemoteToLocal(anyString(),
|
||||
anyString());
|
||||
}
|
||||
|
||||
/**
|
||||
* Basic test.
|
||||
* In one hand, create local temp file/dir for hdfs URI in
|
||||
* local staging dir.
|
||||
* In the other hand, use MockRemoteDirectoryManager mock
|
||||
* implementation when check FileStatus or exists of HDFS file/dir
|
||||
* --localization hdfs:///user/yarn/script1.py:.
|
||||
* --localization /temp/script2.py:./
|
||||
* --localization /temp/script2.py:/opt/script.py
|
||||
*/
|
||||
@Test
|
||||
public void testRunJobWithBasicLocalization() throws Exception {
|
||||
String remoteUrl = "hdfs:///user/yarn/script1.py";
|
||||
String containerLocal1 = ".";
|
||||
String localUrl = "/temp/script2.py";
|
||||
String containerLocal2 = "./";
|
||||
String containerLocal3 = "/opt/script.py";
|
||||
// Create local file, we need to put it under local temp dir
|
||||
File localFile1 = testCommons.getFileUtils().createFileInTempDir(localUrl);
|
||||
|
||||
// create remote file in local staging dir to simulate HDFS
|
||||
Path stagingDir = getStagingDir();
|
||||
testCommons.getFileUtils().createFileInDir(stagingDir, remoteUrl);
|
||||
|
||||
String[] params = createCommonParamsBuilder()
|
||||
.withLocalization(remoteUrl, containerLocal1)
|
||||
.withLocalization(localFile1.getAbsolutePath(), containerLocal2)
|
||||
.withLocalization(localFile1.getAbsolutePath(), containerLocal3)
|
||||
.build();
|
||||
RunJobCli runJobCli = createRunJobCli();
|
||||
runJobCli.run(params);
|
||||
Service serviceSpec = testCommons.getServiceSpecFromJobSubmitter(
|
||||
runJobCli.getJobSubmitter());
|
||||
assertNumberOfServiceComponents(serviceSpec, 3);
|
||||
|
||||
// No remote dir and HDFS file exists.
|
||||
// Ensure download never happened.
|
||||
verifyRdmCopyToRemoteLocalCalls(0);
|
||||
// Ensure local original files are not deleted
|
||||
assertTrue(localFile1.exists());
|
||||
|
||||
List<ConfigFile> files = serviceSpec.getConfiguration().getFiles();
|
||||
assertNumberOfLocalizations(files, 3);
|
||||
|
||||
ConfigFile expectedConfigFile = new ConfigFile();
|
||||
expectedConfigFile.setType(ConfigFile.TypeEnum.STATIC);
|
||||
expectedConfigFile.setSrcFile(remoteUrl);
|
||||
expectedConfigFile.setDestFile(new Path(remoteUrl).getName());
|
||||
assertConfigFile(expectedConfigFile, files.get(0));
|
||||
|
||||
expectedConfigFile = new ConfigFile();
|
||||
expectedConfigFile.setType(ConfigFile.TypeEnum.STATIC);
|
||||
expectedConfigFile.setSrcFile(getFilePath(localUrl, stagingDir));
|
||||
expectedConfigFile.setDestFile(new Path(localUrl).getName());
|
||||
assertConfigFile(expectedConfigFile, files.get(1));
|
||||
|
||||
expectedConfigFile = new ConfigFile();
|
||||
expectedConfigFile.setType(ConfigFile.TypeEnum.STATIC);
|
||||
expectedConfigFile.setSrcFile(getFilePath(localUrl, stagingDir));
|
||||
expectedConfigFile.setDestFile(new Path(containerLocal3).getName());
|
||||
assertConfigFile(expectedConfigFile, files.get(2));
|
||||
|
||||
// Ensure env value is correct
|
||||
String env = serviceSpec.getConfiguration().getEnv()
|
||||
.get("YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS");
|
||||
String expectedMounts = new Path(containerLocal3).getName()
|
||||
+ ":" + containerLocal3 + ":rw";
|
||||
assertTrue(env.contains(expectedMounts));
|
||||
}
|
||||
|
||||
private void assertNumberOfServiceComponents(Service serviceSpec,
|
||||
int expected) {
|
||||
assertEquals(expected, serviceSpec.getComponents().size());
|
||||
}
|
||||
|
||||
/**
|
||||
* Non HDFS remote URI test.
|
||||
* --localization https://a/b/1.patch:.
|
||||
* --localization s3a://a/dir:/opt/mys3dir
|
||||
*/
|
||||
@Test
|
||||
public void testRunJobWithNonHDFSRemoteLocalization() throws Exception {
|
||||
String remoteUri1 = "https://a/b/1.patch";
|
||||
String containerLocal1 = ".";
|
||||
String remoteUri2 = "s3a://a/s3dir";
|
||||
String containerLocal2 = "/opt/mys3dir";
|
||||
|
||||
// create remote file in local staging dir to simulate HDFS
|
||||
Path stagingDir = getStagingDir();
|
||||
testCommons.getFileUtils().createFileInDir(stagingDir, remoteUri1);
|
||||
File remoteDir1 =
|
||||
testCommons.getFileUtils().createDirectory(stagingDir, remoteUri2);
|
||||
testCommons.getFileUtils().createFileInDir(remoteDir1, "afile");
|
||||
|
||||
String suffix1 = "_" + remoteDir1.lastModified()
|
||||
+ "-" + mockClientContext.getRemoteDirectoryManager()
|
||||
.getRemoteFileSize(remoteUri2);
|
||||
|
||||
String[] params = createCommonParamsBuilder()
|
||||
.withLocalization(remoteUri1, containerLocal1)
|
||||
.withLocalization(remoteUri2, containerLocal2)
|
||||
.build();
|
||||
RunJobCli runJobCli = createRunJobCli();
|
||||
runJobCli.run(params);
|
||||
Service serviceSpec = testCommons.getServiceSpecFromJobSubmitter(
|
||||
runJobCli.getJobSubmitter());
|
||||
assertNumberOfServiceComponents(serviceSpec, 3);
|
||||
|
||||
// Ensure download remote dir 2 times
|
||||
verifyRdmCopyToRemoteLocalCalls(2);
|
||||
|
||||
// Ensure downloaded temp files are deleted
|
||||
assertFilesAreDeleted(
|
||||
testCommons.getFileUtils().getTempFileWithName(remoteUri1),
|
||||
testCommons.getFileUtils().getTempFileWithName(remoteUri2));
|
||||
|
||||
// Ensure zip file are deleted
|
||||
assertFilesAreDeleted(
|
||||
testCommons.getFileUtils()
|
||||
.getTempFileWithName(remoteUri2 + "_" + suffix1 + ZIP_EXTENSION));
|
||||
|
||||
List<ConfigFile> files = serviceSpec.getConfiguration().getFiles();
|
||||
assertNumberOfLocalizations(files, 2);
|
||||
|
||||
ConfigFile expectedConfigFile = new ConfigFile();
|
||||
expectedConfigFile.setType(ConfigFile.TypeEnum.STATIC);
|
||||
expectedConfigFile.setSrcFile(getFilePath(remoteUri1, stagingDir));
|
||||
expectedConfigFile.setDestFile(new Path(remoteUri1).getName());
|
||||
assertConfigFile(expectedConfigFile, files.get(0));
|
||||
|
||||
expectedConfigFile = new ConfigFile();
|
||||
expectedConfigFile.setType(ConfigFile.TypeEnum.ARCHIVE);
|
||||
expectedConfigFile.setSrcFile(
|
||||
getFilePathWithSuffix(stagingDir, remoteUri2, suffix1 + ZIP_EXTENSION));
|
||||
expectedConfigFile.setDestFile(new Path(containerLocal2).getName());
|
||||
assertConfigFile(expectedConfigFile, files.get(1));
|
||||
|
||||
// Ensure env value is correct
|
||||
String env = serviceSpec.getConfiguration().getEnv()
|
||||
.get("YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS");
|
||||
String expectedMounts = new Path(remoteUri2).getName()
|
||||
+ ":" + containerLocal2 + ":rw";
|
||||
assertTrue(env.contains(expectedMounts));
|
||||
}
|
||||
|
||||
/**
|
||||
* Test HDFS dir localization.
|
||||
* --localization hdfs:///user/yarn/mydir:./mydir1
|
||||
* --localization hdfs:///user/yarn/mydir2:/opt/dir2:rw
|
||||
* --localization hdfs:///user/yarn/mydir:.
|
||||
* --localization hdfs:///user/yarn/mydir2:./
|
||||
*/
|
||||
@Test
|
||||
public void testRunJobWithHdfsDirLocalization() throws Exception {
|
||||
String remoteUrl = "hdfs:///user/yarn/mydir";
|
||||
String containerPath = "./mydir1";
|
||||
String remoteUrl2 = "hdfs:///user/yarn/mydir2";
|
||||
String containerPath2 = "/opt/dir2";
|
||||
String containerPath3 = ".";
|
||||
String containerPath4 = "./";
|
||||
|
||||
// create remote file in local staging dir to simulate HDFS
|
||||
Path stagingDir = getStagingDir();
|
||||
File remoteDir1 =
|
||||
testCommons.getFileUtils().createDirectory(stagingDir, remoteUrl);
|
||||
testCommons.getFileUtils().createFileInDir(remoteDir1, "1.py");
|
||||
testCommons.getFileUtils().createFileInDir(remoteDir1, "2.py");
|
||||
|
||||
File remoteDir2 =
|
||||
testCommons.getFileUtils().createDirectory(stagingDir, remoteUrl2);
|
||||
testCommons.getFileUtils().createFileInDir(remoteDir2, "3.py");
|
||||
testCommons.getFileUtils().createFileInDir(remoteDir2, "4.py");
|
||||
|
||||
String suffix1 = "_" + remoteDir1.lastModified()
|
||||
+ "-" + mockClientContext.getRemoteDirectoryManager()
|
||||
.getRemoteFileSize(remoteUrl);
|
||||
String suffix2 = "_" + remoteDir2.lastModified()
|
||||
+ "-" + mockClientContext.getRemoteDirectoryManager()
|
||||
.getRemoteFileSize(remoteUrl2);
|
||||
|
||||
String[] params = createCommonParamsBuilder()
|
||||
.withLocalization(remoteUrl, containerPath)
|
||||
.withLocalization(remoteUrl2, containerPath2)
|
||||
.withLocalization(remoteUrl, containerPath3)
|
||||
.withLocalization(remoteUrl2, containerPath4)
|
||||
.build();
|
||||
RunJobCli runJobCli = createRunJobCli();
|
||||
runJobCli.run(params);
|
||||
Service serviceSpec = testCommons.getServiceSpecFromJobSubmitter(
|
||||
runJobCli.getJobSubmitter());
|
||||
assertNumberOfServiceComponents(serviceSpec, 3);
|
||||
|
||||
// Ensure download remote dir 4 times
|
||||
verifyRdmCopyToRemoteLocalCalls(4);
|
||||
|
||||
// Ensure downloaded temp files are deleted
|
||||
assertFilesAreDeleted(
|
||||
testCommons.getFileUtils().getTempFileWithName(remoteUrl),
|
||||
testCommons.getFileUtils().getTempFileWithName(remoteUrl2));
|
||||
|
||||
// Ensure zip file are deleted
|
||||
assertFilesAreDeleted(
|
||||
testCommons.getFileUtils()
|
||||
.getTempFileWithName(remoteUrl + suffix1 + ZIP_EXTENSION),
|
||||
testCommons.getFileUtils()
|
||||
.getTempFileWithName(remoteUrl2 + suffix2 + ZIP_EXTENSION));
|
||||
|
||||
// Ensure files will be localized
|
||||
List<ConfigFile> files = serviceSpec.getConfiguration().getFiles();
|
||||
assertNumberOfLocalizations(files, 4);
|
||||
|
||||
ConfigFile expectedConfigFile = new ConfigFile();
|
||||
// The hdfs dir should be download and compress and let YARN to uncompress
|
||||
expectedConfigFile.setType(ConfigFile.TypeEnum.ARCHIVE);
|
||||
expectedConfigFile.setSrcFile(
|
||||
getFilePathWithSuffix(stagingDir, remoteUrl, suffix1 + ZIP_EXTENSION));
|
||||
// Relative path in container, but not "." or "./". Use its own name
|
||||
expectedConfigFile.setDestFile(new Path(containerPath).getName());
|
||||
assertConfigFile(expectedConfigFile, files.get(0));
|
||||
|
||||
expectedConfigFile = new ConfigFile();
|
||||
expectedConfigFile.setType(ConfigFile.TypeEnum.ARCHIVE);
|
||||
expectedConfigFile.setSrcFile(
|
||||
getFilePathWithSuffix(stagingDir, remoteUrl2, suffix2 + ZIP_EXTENSION));
|
||||
expectedConfigFile.setDestFile(new Path(containerPath2).getName());
|
||||
assertConfigFile(expectedConfigFile, files.get(1));
|
||||
|
||||
expectedConfigFile = new ConfigFile();
|
||||
expectedConfigFile.setType(ConfigFile.TypeEnum.ARCHIVE);
|
||||
expectedConfigFile.setSrcFile(
|
||||
getFilePathWithSuffix(stagingDir, remoteUrl, suffix1 + ZIP_EXTENSION));
|
||||
// Relative path in container ".", use remote path name
|
||||
expectedConfigFile.setDestFile(new Path(remoteUrl).getName());
|
||||
assertConfigFile(expectedConfigFile, files.get(2));
|
||||
|
||||
expectedConfigFile = new ConfigFile();
|
||||
expectedConfigFile.setType(ConfigFile.TypeEnum.ARCHIVE);
|
||||
expectedConfigFile.setSrcFile(
|
||||
getFilePathWithSuffix(stagingDir, remoteUrl2, suffix2 + ZIP_EXTENSION));
|
||||
// Relative path in container ".", use remote path name
|
||||
expectedConfigFile.setDestFile(new Path(remoteUrl2).getName());
|
||||
assertConfigFile(expectedConfigFile, files.get(3));
|
||||
|
||||
// Ensure mounts env value is correct. Add one mount string
|
||||
String env = serviceSpec.getConfiguration().getEnv()
|
||||
.get("YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS");
|
||||
|
||||
String expectedMounts =
|
||||
new Path(containerPath2).getName() + ":" + containerPath2 + ":rw";
|
||||
assertTrue(env.contains(expectedMounts));
|
||||
}
|
||||
|
||||
/**
|
||||
* Test if file/dir to be localized whose size exceeds limit.
|
||||
* Max 10MB in configuration, mock remote will
|
||||
* always return file size 100MB.
|
||||
* This configuration will fail the job which has remoteUri
|
||||
* But don't impact local dir/file
|
||||
*
|
||||
* --localization https://a/b/1.patch:.
|
||||
* --localization s3a://a/dir:/opt/mys3dir
|
||||
* --localization /temp/script2.py:./
|
||||
*/
|
||||
@Test
|
||||
public void testRunJobRemoteUriExceedLocalizationSize() throws Exception {
|
||||
String remoteUri1 = "https://a/b/1.patch";
|
||||
String containerLocal1 = ".";
|
||||
String remoteUri2 = "s3a://a/s3dir";
|
||||
String containerLocal2 = "/opt/mys3dir";
|
||||
String localUri1 = "/temp/script2";
|
||||
String containerLocal3 = "./";
|
||||
|
||||
SubmarineConfiguration submarineConf = new SubmarineConfiguration();
|
||||
|
||||
// Max 10MB, mock remote will always return file size 100MB.
|
||||
submarineConf.set(
|
||||
SubmarineConfiguration.LOCALIZATION_MAX_ALLOWED_FILE_SIZE_MB,
|
||||
"10");
|
||||
mockClientContext.setSubmarineConfig(submarineConf);
|
||||
|
||||
assertFalse(SubmarineLogs.isVerbose());
|
||||
|
||||
// create remote file in local staging dir to simulate
|
||||
Path stagingDir = getStagingDir();
|
||||
testCommons.getFileUtils().createFileInDir(stagingDir, remoteUri1);
|
||||
File remoteDir1 =
|
||||
testCommons.getFileUtils().createDirectory(stagingDir, remoteUri2);
|
||||
testCommons.getFileUtils().createFileInDir(remoteDir1, "afile");
|
||||
|
||||
// create local file, we need to put it under local temp dir
|
||||
File localFile1 = testCommons.getFileUtils().createFileInTempDir(localUri1);
|
||||
|
||||
try {
|
||||
RunJobCli runJobCli = createRunJobCli();
|
||||
String[] params = createCommonParamsBuilder()
|
||||
.withLocalization(remoteUri1, containerLocal1)
|
||||
.build();
|
||||
runJobCli.run(params);
|
||||
} catch (IOException e) {
|
||||
// Shouldn't have exception because it's within file size limit
|
||||
fail();
|
||||
}
|
||||
// we should download because fail fast
|
||||
verifyRdmCopyToRemoteLocalCalls(1);
|
||||
try {
|
||||
String[] params = createCommonParamsBuilder()
|
||||
.withLocalization(remoteUri1, containerLocal1)
|
||||
.withLocalization(remoteUri2, containerLocal2)
|
||||
.withLocalization(localFile1.getAbsolutePath(), containerLocal3)
|
||||
.build();
|
||||
|
||||
reset(spyRdm);
|
||||
RunJobCli runJobCli = createRunJobCliWithoutVerboseAssertion();
|
||||
runJobCli.run(params);
|
||||
} catch (IOException e) {
|
||||
assertTrue(e.getMessage()
|
||||
.contains("104857600 exceeds configured max size:10485760"));
|
||||
// we shouldn't do any download because fail fast
|
||||
verifyRdmCopyToRemoteLocalCalls(0);
|
||||
}
|
||||
|
||||
try {
|
||||
String[] params = createCommonParamsBuilder()
|
||||
.withLocalization(localFile1.getAbsolutePath(), containerLocal3)
|
||||
.build();
|
||||
RunJobCli runJobCli = createRunJobCliWithoutVerboseAssertion();
|
||||
runJobCli.run(params);
|
||||
} catch (IOException e) {
|
||||
assertTrue(e.getMessage()
|
||||
.contains("104857600 exceeds configured max size:10485760"));
|
||||
// we shouldn't do any download because fail fast
|
||||
verifyRdmCopyToRemoteLocalCalls(0);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test remote Uri doesn't exist.
|
||||
* */
|
||||
@Test
|
||||
public void testRunJobWithNonExistRemoteUri() throws Exception {
|
||||
String remoteUri1 = "hdfs:///a/b/1.patch";
|
||||
String containerLocal1 = ".";
|
||||
String localUri1 = "/a/b/c";
|
||||
String containerLocal2 = "./";
|
||||
|
||||
try {
|
||||
String[] params = createCommonParamsBuilder()
|
||||
.withLocalization(remoteUri1, containerLocal1)
|
||||
.build();
|
||||
RunJobCli runJobCli = createRunJobCli();
|
||||
runJobCli.run(params);
|
||||
} catch (IOException e) {
|
||||
assertTrue(e.getMessage().contains("doesn't exists"));
|
||||
}
|
||||
|
||||
try {
|
||||
String[] params = createCommonParamsBuilder()
|
||||
.withLocalization(localUri1, containerLocal2)
|
||||
.build();
|
||||
RunJobCli runJobCli = createRunJobCliWithoutVerboseAssertion();
|
||||
runJobCli.run(params);
|
||||
} catch (IOException e) {
|
||||
assertTrue(e.getMessage().contains("doesn't exists"));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test local dir
|
||||
* --localization /user/yarn/mydir:./mydir1
|
||||
* --localization /user/yarn/mydir2:/opt/dir2:rw
|
||||
* --localization /user/yarn/mydir2:.
|
||||
*/
|
||||
@Test
|
||||
public void testRunJobWithLocalDirLocalization() throws Exception {
|
||||
String localUrl = "/user/yarn/mydir";
|
||||
String containerPath = "./mydir1";
|
||||
String localUrl2 = "/user/yarn/mydir2";
|
||||
String containerPath2 = "/opt/dir2";
|
||||
String containerPath3 = ".";
|
||||
|
||||
// create local file
|
||||
File localDir1 = testCommons.getFileUtils().createDirInTempDir(localUrl);
|
||||
testCommons.getFileUtils().createFileInDir(localDir1, "1.py");
|
||||
testCommons.getFileUtils().createFileInDir(localDir1, "2.py");
|
||||
|
||||
File localDir2 = testCommons.getFileUtils().createDirInTempDir(localUrl2);
|
||||
testCommons.getFileUtils().createFileInDir(localDir2, "3.py");
|
||||
testCommons.getFileUtils().createFileInDir(localDir2, "4.py");
|
||||
|
||||
String suffix1 = "_" + localDir1.lastModified()
|
||||
+ "-" + localDir1.length();
|
||||
String suffix2 = "_" + localDir2.lastModified()
|
||||
+ "-" + localDir2.length();
|
||||
|
||||
String[] params = createCommonParamsBuilder()
|
||||
.withLocalization(localDir1.getAbsolutePath(), containerPath)
|
||||
.withLocalization(localDir2.getAbsolutePath(), containerPath2)
|
||||
.withLocalization(localDir2.getAbsolutePath(), containerPath3)
|
||||
.build();
|
||||
RunJobCli runJobCli = createRunJobCli();
|
||||
runJobCli.run(params);
|
||||
|
||||
Service serviceSpec = testCommons.getServiceSpecFromJobSubmitter(
|
||||
runJobCli.getJobSubmitter());
|
||||
assertNumberOfServiceComponents(serviceSpec, 3);
|
||||
|
||||
// we shouldn't do any download
|
||||
verifyRdmCopyToRemoteLocalCalls(0);
|
||||
|
||||
// Ensure local original files are not deleted
|
||||
assertTrue(localDir1.exists());
|
||||
assertTrue(localDir2.exists());
|
||||
|
||||
// Ensure zip file are deleted
|
||||
assertFalse(
|
||||
testCommons.getFileUtils()
|
||||
.getTempFileWithName(localUrl + suffix1 + ZIP_EXTENSION)
|
||||
.exists());
|
||||
assertFalse(
|
||||
testCommons.getFileUtils()
|
||||
.getTempFileWithName(localUrl2 + suffix2 + ZIP_EXTENSION)
|
||||
.exists());
|
||||
|
||||
// Ensure dirs will be zipped and localized
|
||||
List<ConfigFile> files = serviceSpec.getConfiguration().getFiles();
|
||||
assertNumberOfLocalizations(files, 3);
|
||||
|
||||
Path stagingDir = getStagingDir();
|
||||
ConfigFile expectedConfigFile = new ConfigFile();
|
||||
expectedConfigFile.setType(ConfigFile.TypeEnum.ARCHIVE);
|
||||
expectedConfigFile.setSrcFile(
|
||||
getFilePathWithSuffix(stagingDir, localUrl, suffix1 + ZIP_EXTENSION));
|
||||
expectedConfigFile.setDestFile(new Path(containerPath).getName());
|
||||
assertConfigFile(expectedConfigFile, files.get(0));
|
||||
|
||||
expectedConfigFile = new ConfigFile();
|
||||
expectedConfigFile.setType(ConfigFile.TypeEnum.ARCHIVE);
|
||||
expectedConfigFile.setSrcFile(
|
||||
getFilePathWithSuffix(stagingDir, localUrl2, suffix2 + ZIP_EXTENSION));
|
||||
expectedConfigFile.setDestFile(new Path(containerPath2).getName());
|
||||
assertConfigFile(expectedConfigFile, files.get(1));
|
||||
|
||||
expectedConfigFile = new ConfigFile();
|
||||
expectedConfigFile.setType(ConfigFile.TypeEnum.ARCHIVE);
|
||||
expectedConfigFile.setSrcFile(
|
||||
getFilePathWithSuffix(stagingDir, localUrl2, suffix2 + ZIP_EXTENSION));
|
||||
expectedConfigFile.setDestFile(new Path(localUrl2).getName());
|
||||
assertConfigFile(expectedConfigFile, files.get(2));
|
||||
|
||||
// Ensure mounts env value is correct
|
||||
String env = serviceSpec.getConfiguration().getEnv()
|
||||
.get("YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS");
|
||||
String expectedMounts = new Path(containerPath2).getName()
|
||||
+ ":" + containerPath2 + ":rw";
|
||||
|
||||
assertTrue(env.contains(expectedMounts));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,95 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;
|
||||
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.service.api.records.Service;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
import static org.mockito.ArgumentMatchers.eq;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.verify;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
/**
|
||||
* Class to test the {@link ServiceWrapper}.
|
||||
*/
|
||||
public class TestServiceWrapper {
|
||||
private AbstractComponent createMockAbstractComponent(Component mockComponent,
|
||||
String componentName, String localScriptFile) throws IOException {
|
||||
when(mockComponent.getName()).thenReturn(componentName);
|
||||
|
||||
AbstractComponent mockAbstractComponent = mock(AbstractComponent.class);
|
||||
when(mockAbstractComponent.createComponent()).thenReturn(mockComponent);
|
||||
when(mockAbstractComponent.getLocalScriptFile())
|
||||
.thenReturn(localScriptFile);
|
||||
return mockAbstractComponent;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWithSingleComponent() throws IOException {
|
||||
Service mockService = mock(Service.class);
|
||||
ServiceWrapper serviceWrapper = new ServiceWrapper(mockService);
|
||||
|
||||
Component mockComponent = mock(Component.class);
|
||||
AbstractComponent mockAbstractComponent =
|
||||
createMockAbstractComponent(mockComponent, "testComponent",
|
||||
"testLocalScriptFile");
|
||||
serviceWrapper.addComponent(mockAbstractComponent);
|
||||
|
||||
verify(mockService).addComponent(eq(mockComponent));
|
||||
|
||||
String launchCommand =
|
||||
serviceWrapper.getLocalLaunchCommandPathForComponent("testComponent");
|
||||
assertEquals("testLocalScriptFile", launchCommand);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWithMultipleComponent() throws IOException {
|
||||
Service mockService = mock(Service.class);
|
||||
ServiceWrapper serviceWrapper = new ServiceWrapper(mockService);
|
||||
|
||||
Component mockComponent1 = mock(Component.class);
|
||||
AbstractComponent mockAbstractComponent1 =
|
||||
createMockAbstractComponent(mockComponent1, "testComponent1",
|
||||
"testLocalScriptFile1");
|
||||
|
||||
Component mockComponent2 = mock(Component.class);
|
||||
AbstractComponent mockAbstractComponent2 =
|
||||
createMockAbstractComponent(mockComponent2, "testComponent2",
|
||||
"testLocalScriptFile2");
|
||||
|
||||
serviceWrapper.addComponent(mockAbstractComponent1);
|
||||
serviceWrapper.addComponent(mockAbstractComponent2);
|
||||
|
||||
verify(mockService).addComponent(eq(mockComponent1));
|
||||
verify(mockService).addComponent(eq(mockComponent2));
|
||||
|
||||
String launchCommand1 =
|
||||
serviceWrapper.getLocalLaunchCommandPathForComponent("testComponent1");
|
||||
assertEquals("testLocalScriptFile1", launchCommand1);
|
||||
|
||||
String launchCommand2 =
|
||||
serviceWrapper.getLocalLaunchCommandPathForComponent("testComponent2");
|
||||
assertEquals("testLocalScriptFile2", launchCommand2);
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -14,26 +14,30 @@
|
|||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;
|
||||
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons;
|
||||
import org.codehaus.jettison.json.JSONException;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
* Class to test some functionality of {@link TensorFlowCommons}.
|
||||
*/
|
||||
public class TestTFConfigGenerator {
|
||||
@Test
|
||||
public void testSimpleDistributedTFConfigGenerator() throws JSONException {
|
||||
String json = YarnServiceUtils.getTFConfigEnv("worker", 5, 3, "wtan",
|
||||
String json = TensorFlowCommons.getTFConfigEnv("worker", 5, 3, "wtan",
|
||||
"tf-job-001", "example.com");
|
||||
String expected =
|
||||
"{\\\"cluster\\\":{\\\"master\\\":[\\\"master-0.wtan.tf-job-001.example.com:8000\\\"],\\\"worker\\\":[\\\"worker-0.wtan.tf-job-001.example.com:8000\\\",\\\"worker-1.wtan.tf-job-001.example.com:8000\\\",\\\"worker-2.wtan.tf-job-001.example.com:8000\\\",\\\"worker-3.wtan.tf-job-001.example.com:8000\\\"],\\\"ps\\\":[\\\"ps-0.wtan.tf-job-001.example.com:8000\\\",\\\"ps-1.wtan.tf-job-001.example.com:8000\\\",\\\"ps-2.wtan.tf-job-001.example.com:8000\\\"]},\\\"task\\\":{ \\\"type\\\":\\\"worker\\\", \\\"index\\\":$_TASK_INDEX},\\\"environment\\\":\\\"cloud\\\"}";
|
||||
Assert.assertEquals(expected, json);
|
||||
|
||||
json = YarnServiceUtils.getTFConfigEnv("ps", 5, 3, "wtan", "tf-job-001",
|
||||
json = TensorFlowCommons.getTFConfigEnv("ps", 5, 3, "wtan", "tf-job-001",
|
||||
"example.com");
|
||||
expected =
|
||||
"{\\\"cluster\\\":{\\\"master\\\":[\\\"master-0.wtan.tf-job-001.example.com:8000\\\"],\\\"worker\\\":[\\\"worker-0.wtan.tf-job-001.example.com:8000\\\",\\\"worker-1.wtan.tf-job-001.example.com:8000\\\",\\\"worker-2.wtan.tf-job-001.example.com:8000\\\",\\\"worker-3.wtan.tf-job-001.example.com:8000\\\"],\\\"ps\\\":[\\\"ps-0.wtan.tf-job-001.example.com:8000\\\",\\\"ps-1.wtan.tf-job-001.example.com:8000\\\",\\\"ps-2.wtan.tf-job-001.example.com:8000\\\"]},\\\"task\\\":{ \\\"type\\\":\\\"ps\\\", \\\"index\\\":$_TASK_INDEX},\\\"environment\\\":\\\"cloud\\\"}";
|
||||
Assert.assertEquals(expected, json);
|
||||
|
||||
json = YarnServiceUtils.getTFConfigEnv("master", 2, 1, "wtan", "tf-job-001",
|
||||
json = TensorFlowCommons.getTFConfigEnv("master", 2, 1, "wtan", "tf-job-001",
|
||||
"example.com");
|
||||
expected =
|
||||
"{\\\"cluster\\\":{\\\"master\\\":[\\\"master-0.wtan.tf-job-001.example.com:8000\\\"],\\\"worker\\\":[\\\"worker-0.wtan.tf-job-001.example.com:8000\\\"],\\\"ps\\\":[\\\"ps-0.wtan.tf-job-001.example.com:8000\\\"]},\\\"task\\\":{ \\\"type\\\":\\\"master\\\", \\\"index\\\":$_TASK_INDEX},\\\"environment\\\":\\\"cloud\\\"}";
|
||||
|
|
|
@ -0,0 +1,190 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||
import org.apache.hadoop.yarn.submarine.common.MockClientContext;
|
||||
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command.TensorBoardLaunchCommand;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command.TensorFlowPsLaunchCommand;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command.TensorFlowWorkerLaunchCommand;
|
||||
import org.junit.Rule;
|
||||
import org.junit.rules.ExpectedException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import static junit.framework.TestCase.assertEquals;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
/**
|
||||
* This class is an abstract base class for testing Tensorboard and TensorFlow
|
||||
* launch commands.
|
||||
*/
|
||||
public abstract class AbstractLaunchCommandTestHelper {
|
||||
private TaskType taskType;
|
||||
private boolean useTaskTypeOverride;
|
||||
|
||||
@Rule
|
||||
public ExpectedException expectedException = ExpectedException.none();
|
||||
|
||||
private void assertScriptContainsExportedEnvVar(List<String> fileContents,
|
||||
String varName) {
|
||||
String expected = String.format("export %s=", varName);
|
||||
assertScriptContainsLine(fileContents, expected);
|
||||
}
|
||||
|
||||
public static void assertScriptContainsExportedEnvVarWithValue(
|
||||
List<String> fileContents, String varName, String value) {
|
||||
String expected = String.format("export %s=%s", varName, value);
|
||||
assertScriptContainsLine(fileContents, expected);
|
||||
}
|
||||
|
||||
public static void assertScriptContainsLine(List<String> fileContents,
|
||||
String expected) {
|
||||
String message = String.format(
|
||||
"File does not contain expected line '%s'!" + " File contents: %s",
|
||||
expected, Arrays.toString(fileContents.toArray()));
|
||||
assertTrue(message, fileContents.contains(expected));
|
||||
}
|
||||
|
||||
public static void assertScriptContainsLineWithRegex(
|
||||
List<String> fileContents,
|
||||
String regex) {
|
||||
String message = String.format(
|
||||
"File does not contain expected line '%s'!" + " File contents: %s",
|
||||
regex, Arrays.toString(fileContents.toArray()));
|
||||
|
||||
for (String line : fileContents) {
|
||||
if (line.matches(regex)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
fail(message);
|
||||
}
|
||||
|
||||
public static void assertScriptDoesNotContainLine(List<String> fileContents,
|
||||
String expected) {
|
||||
String message = String.format(
|
||||
"File contains unexpected line '%s'!" + " File contents: %s",
|
||||
expected, Arrays.toString(fileContents.toArray()));
|
||||
assertFalse(message, fileContents.contains(expected));
|
||||
}
|
||||
|
||||
|
||||
private AbstractLaunchCommand createLaunchCommandByTaskType(TaskType taskType,
|
||||
RunJobParameters params) throws IOException {
|
||||
MockClientContext mockClientContext = new MockClientContext();
|
||||
FileSystemOperations fsOperations =
|
||||
new FileSystemOperations(mockClientContext);
|
||||
HadoopEnvironmentSetup hadoopEnvSetup =
|
||||
new HadoopEnvironmentSetup(mockClientContext, fsOperations);
|
||||
Component component = new Component();
|
||||
Configuration yarnConfig = new Configuration();
|
||||
|
||||
return createLaunchCommandByTaskTypeInternal(taskType, params,
|
||||
hadoopEnvSetup, component, yarnConfig);
|
||||
}
|
||||
|
||||
private AbstractLaunchCommand createLaunchCommandByTaskTypeInternal(
|
||||
TaskType taskType, RunJobParameters params,
|
||||
HadoopEnvironmentSetup hadoopEnvSetup, Component component,
|
||||
Configuration yarnConfig)
|
||||
throws IOException {
|
||||
if (taskType == TaskType.TENSORBOARD) {
|
||||
return new TensorBoardLaunchCommand(
|
||||
hadoopEnvSetup, getTaskType(taskType), component, params);
|
||||
} else if (taskType == TaskType.WORKER
|
||||
|| taskType == TaskType.PRIMARY_WORKER) {
|
||||
return new TensorFlowWorkerLaunchCommand(
|
||||
hadoopEnvSetup, getTaskType(taskType), component, params, yarnConfig);
|
||||
} else if (taskType == TaskType.PS) {
|
||||
return new TensorFlowPsLaunchCommand(
|
||||
hadoopEnvSetup, getTaskType(taskType), component, params, yarnConfig);
|
||||
}
|
||||
throw new IllegalStateException("Unknown taskType!");
|
||||
}
|
||||
|
||||
public void overrideTaskType(TaskType taskType) {
|
||||
this.taskType = taskType;
|
||||
this.useTaskTypeOverride = true;
|
||||
}
|
||||
|
||||
private TaskType getTaskType(TaskType taskType) {
|
||||
if (useTaskTypeOverride) {
|
||||
return this.taskType;
|
||||
}
|
||||
return taskType;
|
||||
}
|
||||
|
||||
public void testHdfsRelatedEnvironmentIsUndefined(TaskType taskType,
|
||||
RunJobParameters params) throws IOException {
|
||||
AbstractLaunchCommand launchCommand =
|
||||
createLaunchCommandByTaskType(taskType, params);
|
||||
|
||||
expectedException.expect(IOException.class);
|
||||
expectedException
|
||||
.expectMessage("Failed to detect HDFS-related environments.");
|
||||
launchCommand.generateLaunchScript();
|
||||
}
|
||||
|
||||
public List<String> testHdfsRelatedEnvironmentIsDefined(TaskType taskType,
|
||||
RunJobParameters params) throws IOException {
|
||||
AbstractLaunchCommand launchCommand =
|
||||
createLaunchCommandByTaskType(taskType, params);
|
||||
|
||||
String result = launchCommand.generateLaunchScript();
|
||||
assertNotNull(result);
|
||||
File resultFile = new File(result);
|
||||
assertTrue(resultFile.exists());
|
||||
|
||||
List<String> fileContents = Files.readAllLines(
|
||||
Paths.get(resultFile.toURI()),
|
||||
Charset.forName("UTF-8"));
|
||||
|
||||
assertEquals("#!/bin/bash", fileContents.get(0));
|
||||
assertScriptContainsExportedEnvVar(fileContents, "HADOOP_HOME");
|
||||
assertScriptContainsExportedEnvVar(fileContents, "HADOOP_YARN_HOME");
|
||||
assertScriptContainsExportedEnvVarWithValue(fileContents,
|
||||
"HADOOP_HDFS_HOME", "testHdfsHome");
|
||||
assertScriptContainsExportedEnvVarWithValue(fileContents,
|
||||
"HADOOP_COMMON_HOME", "testHdfsHome");
|
||||
assertScriptContainsExportedEnvVarWithValue(fileContents, "HADOOP_CONF_DIR",
|
||||
"$WORK_DIR");
|
||||
assertScriptContainsExportedEnvVarWithValue(fileContents, "JAVA_HOME",
|
||||
"testJavaHome");
|
||||
assertScriptContainsExportedEnvVarWithValue(fileContents, "LD_LIBRARY_PATH",
|
||||
"$LD_LIBRARY_PATH:$JAVA_HOME/lib/amd64/server");
|
||||
assertScriptContainsExportedEnvVarWithValue(fileContents, "CLASSPATH",
|
||||
"`$HADOOP_HDFS_HOME/bin/hadoop classpath --glob`");
|
||||
|
||||
return fileContents;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,97 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command.TensorBoardLaunchCommand;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command.TensorFlowPsLaunchCommand;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command.TensorFlowWorkerLaunchCommand;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.mockito.Mockito.mock;
|
||||
|
||||
/**
|
||||
* This class is to test the {@link LaunchCommandFactory}.
|
||||
*/
|
||||
public class TestLaunchCommandFactory {
|
||||
|
||||
private LaunchCommandFactory createLaunchCommandFactory(
|
||||
RunJobParameters parameters) {
|
||||
HadoopEnvironmentSetup hadoopEnvSetup = mock(HadoopEnvironmentSetup.class);
|
||||
Configuration configuration = mock(Configuration.class);
|
||||
return new LaunchCommandFactory(hadoopEnvSetup, parameters, configuration);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void createLaunchCommandWorkerAndPrimaryWorker() throws IOException {
|
||||
RunJobParameters parameters = new RunJobParameters();
|
||||
parameters.setWorkerLaunchCmd("testWorkerLaunchCommand");
|
||||
LaunchCommandFactory launchCommandFactory = createLaunchCommandFactory(
|
||||
parameters);
|
||||
Component mockComponent = mock(Component.class);
|
||||
|
||||
AbstractLaunchCommand launchCommand =
|
||||
launchCommandFactory.createLaunchCommand(TaskType.PRIMARY_WORKER,
|
||||
mockComponent);
|
||||
|
||||
assertTrue(launchCommand instanceof TensorFlowWorkerLaunchCommand);
|
||||
|
||||
launchCommand =
|
||||
launchCommandFactory.createLaunchCommand(TaskType.WORKER,
|
||||
mockComponent);
|
||||
assertTrue(launchCommand instanceof TensorFlowWorkerLaunchCommand);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void createLaunchCommandPs() throws IOException {
|
||||
RunJobParameters parameters = new RunJobParameters();
|
||||
parameters.setPSLaunchCmd("testPSLaunchCommand");
|
||||
LaunchCommandFactory launchCommandFactory = createLaunchCommandFactory(
|
||||
parameters);
|
||||
Component mockComponent = mock(Component.class);
|
||||
|
||||
AbstractLaunchCommand launchCommand =
|
||||
launchCommandFactory.createLaunchCommand(TaskType.PS,
|
||||
mockComponent);
|
||||
|
||||
assertTrue(launchCommand instanceof TensorFlowPsLaunchCommand);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void createLaunchCommandTensorboard() throws IOException {
|
||||
RunJobParameters parameters = new RunJobParameters();
|
||||
parameters.setCheckpointPath("testCheckpointPath");
|
||||
LaunchCommandFactory launchCommandFactory =
|
||||
createLaunchCommandFactory(parameters);
|
||||
Component mockComponent = mock(Component.class);
|
||||
|
||||
AbstractLaunchCommand launchCommand =
|
||||
launchCommandFactory.createLaunchCommand(TaskType.TENSORBOARD,
|
||||
mockComponent);
|
||||
|
||||
assertTrue(launchCommand instanceof TensorBoardLaunchCommand);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,104 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command;
|
||||
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||
import org.apache.hadoop.yarn.submarine.common.MockClientContext;
|
||||
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.AbstractLaunchCommandTestHelper;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup.DOCKER_HADOOP_HDFS_HOME;
|
||||
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup.DOCKER_JAVA_HOME;
|
||||
|
||||
/**
|
||||
* This class is to test the {@link TensorBoardLaunchCommand}.
|
||||
*/
|
||||
public class TestTensorBoardLaunchCommand extends
|
||||
AbstractLaunchCommandTestHelper {
|
||||
|
||||
@Test
|
||||
public void testHdfsRelatedEnvironmentIsUndefined() throws IOException {
|
||||
RunJobParameters params = new RunJobParameters();
|
||||
params.setInputPath("hdfs://bla");
|
||||
params.setName("testJobname");
|
||||
params.setCheckpointPath("something");
|
||||
|
||||
testHdfsRelatedEnvironmentIsUndefined(TaskType.TENSORBOARD,
|
||||
params);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHdfsRelatedEnvironmentIsDefined() throws IOException {
|
||||
RunJobParameters params = new RunJobParameters();
|
||||
params.setName("testName");
|
||||
params.setCheckpointPath("testCheckpointPath");
|
||||
params.setInputPath("hdfs://bla");
|
||||
params.setEnvars(ImmutableList.of(
|
||||
DOCKER_HADOOP_HDFS_HOME + "=" + "testHdfsHome",
|
||||
DOCKER_JAVA_HOME + "=" + "testJavaHome"));
|
||||
|
||||
List<String> fileContents =
|
||||
testHdfsRelatedEnvironmentIsDefined(TaskType.TENSORBOARD,
|
||||
params);
|
||||
assertScriptContainsExportedEnvVarWithValue(fileContents, "LC_ALL",
|
||||
"C && tensorboard --logdir=testCheckpointPath");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCheckpointPathUndefined() throws IOException {
|
||||
MockClientContext mockClientContext = new MockClientContext();
|
||||
FileSystemOperations fsOperations =
|
||||
new FileSystemOperations(mockClientContext);
|
||||
HadoopEnvironmentSetup hadoopEnvSetup =
|
||||
new HadoopEnvironmentSetup(mockClientContext, fsOperations);
|
||||
|
||||
Component component = new Component();
|
||||
RunJobParameters params = new RunJobParameters();
|
||||
params.setCheckpointPath(null);
|
||||
|
||||
expectedException.expect(NullPointerException.class);
|
||||
expectedException.expectMessage("CheckpointPath must not be null");
|
||||
new TensorBoardLaunchCommand(hadoopEnvSetup, TaskType.TENSORBOARD,
|
||||
component, params);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCheckpointPathEmptyString() throws IOException {
|
||||
MockClientContext mockClientContext = new MockClientContext();
|
||||
FileSystemOperations fsOperations =
|
||||
new FileSystemOperations(mockClientContext);
|
||||
HadoopEnvironmentSetup hadoopEnvSetup =
|
||||
new HadoopEnvironmentSetup(mockClientContext, fsOperations);
|
||||
|
||||
Component component = new Component();
|
||||
RunJobParameters params = new RunJobParameters();
|
||||
params.setCheckpointPath("");
|
||||
|
||||
expectedException.expect(IllegalArgumentException.class);
|
||||
expectedException.expectMessage("CheckpointPath must not be empty");
|
||||
new TensorBoardLaunchCommand(hadoopEnvSetup, TaskType.TENSORBOARD,
|
||||
component, params);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,251 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command;
|
||||
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||
import org.apache.hadoop.yarn.submarine.common.MockClientContext;
|
||||
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.AbstractLaunchCommandTestHelper;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.junit.runners.Parameterized;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup.DOCKER_HADOOP_HDFS_HOME;
|
||||
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup.DOCKER_JAVA_HOME;
|
||||
|
||||
/**
|
||||
* This class is to test the implementors of {@link TensorFlowLaunchCommand}.
|
||||
*/
|
||||
@RunWith(Parameterized.class)
|
||||
public class TestTensorFlowLaunchCommand
|
||||
extends AbstractLaunchCommandTestHelper {
|
||||
private TaskType taskType;
|
||||
|
||||
@Parameterized.Parameters
|
||||
public static Collection<Object[]> data() {
|
||||
Collection<Object[]> params = new ArrayList<>();
|
||||
params.add(new Object[]{TaskType.WORKER });
|
||||
params.add(new Object[]{TaskType.PS });
|
||||
return params;
|
||||
}
|
||||
|
||||
public TestTensorFlowLaunchCommand(TaskType taskType) {
|
||||
this.taskType = taskType;
|
||||
}
|
||||
|
||||
|
||||
private void assertScriptContainsLaunchCommand(List<String> fileContents,
|
||||
RunJobParameters params) {
|
||||
String launchCommand = null;
|
||||
if (taskType == TaskType.WORKER) {
|
||||
launchCommand = params.getWorkerLaunchCmd();
|
||||
} else if (taskType == TaskType.PS) {
|
||||
launchCommand = params.getPSLaunchCmd();
|
||||
}
|
||||
assertScriptContainsLine(fileContents, launchCommand);
|
||||
}
|
||||
|
||||
private void setLaunchCommandToParams(RunJobParameters params) {
|
||||
if (taskType == TaskType.WORKER) {
|
||||
params.setWorkerLaunchCmd("testWorkerLaunchCommand");
|
||||
} else if (taskType == TaskType.PS) {
|
||||
params.setPSLaunchCmd("testPsLaunchCommand");
|
||||
}
|
||||
}
|
||||
|
||||
private void setLaunchCommandToParams(RunJobParameters params, String value) {
|
||||
if (taskType == TaskType.WORKER) {
|
||||
params.setWorkerLaunchCmd(value);
|
||||
} else if (taskType == TaskType.PS) {
|
||||
params.setPSLaunchCmd(value);
|
||||
}
|
||||
}
|
||||
|
||||
private void assertTypeInJson(List<String> fileContents) {
|
||||
String expectedType = null;
|
||||
if (taskType == TaskType.WORKER) {
|
||||
expectedType = "worker";
|
||||
} else if (taskType == TaskType.PS) {
|
||||
expectedType = "ps";
|
||||
}
|
||||
assertScriptContainsLineWithRegex(fileContents, String.format(".*type.*:" +
|
||||
".*%s.*", expectedType));
|
||||
}
|
||||
|
||||
private TensorFlowLaunchCommand createTensorFlowLaunchCommandObject(
|
||||
HadoopEnvironmentSetup hadoopEnvSetup, Configuration yarnConfig,
|
||||
Component component, RunJobParameters params) throws IOException {
|
||||
if (taskType == TaskType.WORKER) {
|
||||
return new TensorFlowWorkerLaunchCommand(hadoopEnvSetup, taskType,
|
||||
component,
|
||||
params, yarnConfig);
|
||||
} else if (taskType == TaskType.PS) {
|
||||
return new TensorFlowPsLaunchCommand(hadoopEnvSetup, taskType, component,
|
||||
params, yarnConfig);
|
||||
}
|
||||
throw new IllegalStateException("Unknown tasktype!");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHdfsRelatedEnvironmentIsUndefined() throws IOException {
|
||||
RunJobParameters params = new RunJobParameters();
|
||||
params.setInputPath("hdfs://bla");
|
||||
params.setName("testJobname");
|
||||
setLaunchCommandToParams(params);
|
||||
|
||||
testHdfsRelatedEnvironmentIsUndefined(taskType, params);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHdfsRelatedEnvironmentIsDefined() throws IOException {
|
||||
RunJobParameters params = new RunJobParameters();
|
||||
params.setName("testName");
|
||||
params.setInputPath("hdfs://bla");
|
||||
params.setEnvars(ImmutableList.of(
|
||||
DOCKER_HADOOP_HDFS_HOME + "=" + "testHdfsHome",
|
||||
DOCKER_JAVA_HOME + "=" + "testJavaHome"));
|
||||
setLaunchCommandToParams(params);
|
||||
|
||||
List<String> fileContents =
|
||||
testHdfsRelatedEnvironmentIsDefined(taskType,
|
||||
params);
|
||||
assertScriptContainsLaunchCommand(fileContents, params);
|
||||
assertScriptDoesNotContainLine(fileContents, "export TF_CONFIG=");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLaunchCommandIsNull() throws IOException {
|
||||
MockClientContext mockClientContext = new MockClientContext();
|
||||
FileSystemOperations fsOperations =
|
||||
new FileSystemOperations(mockClientContext);
|
||||
HadoopEnvironmentSetup hadoopEnvSetup =
|
||||
new HadoopEnvironmentSetup(mockClientContext, fsOperations);
|
||||
Configuration yarnConfig = new Configuration();
|
||||
|
||||
Component component = new Component();
|
||||
RunJobParameters params = new RunJobParameters();
|
||||
params.setName("testName");
|
||||
setLaunchCommandToParams(params, null);
|
||||
|
||||
expectedException.expect(IllegalArgumentException.class);
|
||||
expectedException.expectMessage("LaunchCommand must not be null or empty");
|
||||
TensorFlowLaunchCommand launchCommand =
|
||||
createTensorFlowLaunchCommandObject(hadoopEnvSetup, yarnConfig,
|
||||
component,
|
||||
params);
|
||||
launchCommand.generateLaunchScript();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLaunchCommandIsEmpty() throws IOException {
|
||||
MockClientContext mockClientContext = new MockClientContext();
|
||||
FileSystemOperations fsOperations =
|
||||
new FileSystemOperations(mockClientContext);
|
||||
HadoopEnvironmentSetup hadoopEnvSetup =
|
||||
new HadoopEnvironmentSetup(mockClientContext, fsOperations);
|
||||
Configuration yarnConfig = new Configuration();
|
||||
|
||||
Component component = new Component();
|
||||
RunJobParameters params = new RunJobParameters();
|
||||
params.setName("testName");
|
||||
setLaunchCommandToParams(params, "");
|
||||
|
||||
expectedException.expect(IllegalArgumentException.class);
|
||||
expectedException.expectMessage("LaunchCommand must not be null or empty");
|
||||
TensorFlowLaunchCommand launchCommand =
|
||||
createTensorFlowLaunchCommandObject(hadoopEnvSetup, yarnConfig,
|
||||
component, params);
|
||||
launchCommand.generateLaunchScript();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDistributedTrainingMissingTaskType() throws IOException {
|
||||
overrideTaskType(null);
|
||||
|
||||
RunJobParameters params = new RunJobParameters();
|
||||
params.setDistributed(true);
|
||||
params.setName("testName");
|
||||
params.setInputPath("hdfs://bla");
|
||||
params.setEnvars(ImmutableList.of(
|
||||
DOCKER_HADOOP_HDFS_HOME + "=" + "testHdfsHome",
|
||||
DOCKER_JAVA_HOME + "=" + "testJavaHome"));
|
||||
setLaunchCommandToParams(params);
|
||||
|
||||
expectedException.expect(NullPointerException.class);
|
||||
expectedException.expectMessage("TaskType must not be null");
|
||||
testHdfsRelatedEnvironmentIsDefined(taskType, params);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDistributedTrainingNumberOfWorkersAndPsIsZero()
|
||||
throws IOException {
|
||||
RunJobParameters params = new RunJobParameters();
|
||||
params.setDistributed(true);
|
||||
params.setNumWorkers(0);
|
||||
params.setNumPS(0);
|
||||
params.setName("testName");
|
||||
params.setInputPath("hdfs://bla");
|
||||
params.setEnvars(ImmutableList.of(
|
||||
DOCKER_HADOOP_HDFS_HOME + "=" + "testHdfsHome",
|
||||
DOCKER_JAVA_HOME + "=" + "testJavaHome"));
|
||||
setLaunchCommandToParams(params);
|
||||
|
||||
List<String> fileContents =
|
||||
testHdfsRelatedEnvironmentIsDefined(taskType, params);
|
||||
|
||||
assertScriptDoesNotContainLine(fileContents, "export TF_CONFIG=");
|
||||
assertScriptContainsLineWithRegex(fileContents, ".*worker.*:\\[\\].*");
|
||||
assertScriptContainsLineWithRegex(fileContents, ".*ps.*:\\[\\].*");
|
||||
assertTypeInJson(fileContents);
|
||||
}
|
||||
|
||||
  /**
   * Tests the generated launch script of a distributed training job with
   * multiple workers (3) and multiple PS instances (2).
   * Expects the worker and PS host lists in the cluster spec JSON to each
   * contain at least two entries, and no TF_CONFIG export in the script.
   */
  @Test
  public void testDistributedTrainingNumberOfWorkersAndPsIsNonZero()
      throws IOException {
    RunJobParameters params = new RunJobParameters();
    params.setDistributed(true);
    params.setNumWorkers(3);
    params.setNumPS(2);
    params.setName("testName");
    params.setInputPath("hdfs://bla");
    // HDFS home and Java home env vars are needed for the HDFS-related
    // environment checks performed by the helper below.
    params.setEnvars(ImmutableList.of(
        DOCKER_HADOOP_HDFS_HOME + "=" + "testHdfsHome",
        DOCKER_JAVA_HOME + "=" + "testJavaHome"));
    setLaunchCommandToParams(params);

    List<String> fileContents =
        testHdfsRelatedEnvironmentIsDefined(taskType, params);

    //assert we have multiple PS and workers
    assertScriptDoesNotContainLine(fileContents, "export TF_CONFIG=");
    // The ".*,.*" inside the brackets asserts at least two list entries.
    assertScriptContainsLineWithRegex(fileContents, ".*worker.*:\\[.*,.*\\].*");
    assertScriptContainsLineWithRegex(fileContents, ".*ps.*:\\[.*,.*\\].*");
    assertTypeInJson(fileContents);
  }
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,90 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.component;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.service.api.ServiceApiConstants;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.submarine.common.Envs;
|
||||
import org.apache.hadoop.yarn.submarine.common.MockClientContext;
|
||||
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.AbstractLaunchCommand;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.LaunchCommandFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
import static org.mockito.ArgumentMatchers.any;
|
||||
import static org.mockito.ArgumentMatchers.eq;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
/**
|
||||
* This class has some helper methods and fields
|
||||
* in order to test TensorFlow-related Components easier.
|
||||
*/
|
||||
public class ComponentTestCommons {
|
||||
String userName;
|
||||
TaskType taskType;
|
||||
LaunchCommandFactory mockLaunchCommandFactory;
|
||||
FileSystemOperations fsOperations;
|
||||
MockClientContext mockClientContext;
|
||||
Configuration yarnConfig;
|
||||
Resource resource;
|
||||
|
||||
ComponentTestCommons(TaskType taskType) {
|
||||
this.taskType = taskType;
|
||||
}
|
||||
|
||||
public void setup() throws IOException {
|
||||
this.userName = System.getProperty("user.name");
|
||||
this.resource = Resource.newInstance(4000, 10);
|
||||
setupDependencies();
|
||||
}
|
||||
|
||||
private void setupDependencies() throws IOException {
|
||||
fsOperations = mock(FileSystemOperations.class);
|
||||
mockClientContext = new MockClientContext();
|
||||
mockLaunchCommandFactory = mock(LaunchCommandFactory.class);
|
||||
|
||||
AbstractLaunchCommand mockLaunchCommand = mock(AbstractLaunchCommand.class);
|
||||
when(mockLaunchCommand.generateLaunchScript()).thenReturn("mockScript");
|
||||
when(mockLaunchCommandFactory.createLaunchCommand(eq(taskType),
|
||||
any(Component.class))).thenReturn(mockLaunchCommand);
|
||||
|
||||
yarnConfig = new Configuration();
|
||||
}
|
||||
|
||||
void verifyCommonConfigEnvs(Component component) {
|
||||
assertNotNull(component.getConfiguration().getEnv());
|
||||
assertEquals(2, component.getConfiguration().getEnv().size());
|
||||
assertEquals(ServiceApiConstants.COMPONENT_ID,
|
||||
component.getConfiguration().getEnv().get(Envs.TASK_INDEX_ENV));
|
||||
assertEquals(taskType.name(),
|
||||
component.getConfiguration().getEnv().get(Envs.TASK_TYPE_ENV));
|
||||
}
|
||||
|
||||
void verifyResources(Component component) {
|
||||
assertNotNull(component.getResource());
|
||||
assertEquals(10, (int) component.getResource().getCpus());
|
||||
assertEquals(4000,
|
||||
(int) Integer.valueOf(component.getResource().getMemory()));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,125 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.component;
|
||||
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.yarn.service.api.records.Artifact;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component.RestartPolicyEnum;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
|
||||
import org.junit.Before;
|
||||
import org.junit.Rule;
|
||||
import org.junit.Test;
|
||||
import org.junit.rules.ExpectedException;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.mockito.ArgumentMatchers.any;
|
||||
import static org.mockito.ArgumentMatchers.eq;
|
||||
import static org.mockito.Mockito.verify;
|
||||
|
||||
/**
 * This class is to test {@link TensorBoardComponent}.
 */
public class TestTensorBoardComponent {

  @Rule
  public ExpectedException expectedException = ExpectedException.none();
  // Shared fixture (mocks + assertion helpers), parameterized for the
  // TENSORBOARD task type.
  private ComponentTestCommons testCommons =
      new ComponentTestCommons(TaskType.TENSORBOARD);

  @Before
  public void setUp() throws IOException {
    testCommons.setup();
  }

  // Creates the component under test, wired with the fixture's mocked
  // collaborators.
  private TensorBoardComponent createTensorBoardComponent(
      RunJobParameters parameters) {
    return new TensorBoardComponent(
        testCommons.fsOperations,
        testCommons.mockClientContext.getRemoteDirectoryManager(),
        parameters,
        testCommons.mockLaunchCommandFactory,
        testCommons.yarnConfig);
  }

  @Test
  public void testTensorBoardComponentWithNullResource() throws IOException {
    RunJobParameters parameters = new RunJobParameters();
    parameters.setTensorboardResource(null);

    TensorBoardComponent tensorBoardComponent =
        createTensorBoardComponent(parameters);

    // createComponent() must fail fast when no TensorBoard resource is set.
    expectedException.expect(NullPointerException.class);
    expectedException.expectMessage("TensorBoard resource must not be null");
    tensorBoardComponent.createComponent();
  }

  @Test
  public void testTensorBoardComponentWithNullJobName() throws IOException {
    RunJobParameters parameters = new RunJobParameters();
    parameters.setTensorboardResource(testCommons.resource);
    parameters.setName(null);

    TensorBoardComponent tensorBoardComponent =
        createTensorBoardComponent(parameters);

    // createComponent() must fail fast when the job name is missing.
    expectedException.expect(NullPointerException.class);
    expectedException.expectMessage("Job name must not be null");
    tensorBoardComponent.createComponent();
  }

  // Happy path: verifies component name, common env vars, container count,
  // restart policy, resources, docker artifact, the TensorBoard quicklink
  // and the launch script upload.
  @Test
  public void testTensorBoardComponent() throws IOException {
    // The DNS domain is used in the TensorBoard link asserted below.
    testCommons.yarnConfig.set("hadoop.registry.dns.domain-name", "testDomain");

    RunJobParameters parameters = new RunJobParameters();
    parameters.setTensorboardResource(testCommons.resource);
    parameters.setName("testJobName");
    parameters.setTensorboardDockerImage("testTBDockerImage");

    TensorBoardComponent tensorBoardComponent =
        createTensorBoardComponent(parameters);

    Component component = tensorBoardComponent.createComponent();

    assertEquals(testCommons.taskType.getComponentName(), component.getName());
    testCommons.verifyCommonConfigEnvs(component);

    // TensorBoard runs as a single container.
    assertEquals(1L, (long) component.getNumberOfContainers());
    assertEquals(RestartPolicyEnum.NEVER, component.getRestartPolicy());
    testCommons.verifyResources(component);
    assertEquals(
        new Artifact().type(Artifact.TypeEnum.DOCKER).id("testTBDockerImage"),
        component.getArtifact());

    // Link format: http://tensorboard-0.<jobName>.<user>.<domain>:6006
    assertEquals(String.format(
        "http://tensorboard-0.testJobName.%s" + ".testDomain:6006",
        testCommons.userName),
        tensorBoardComponent.getTensorboardLink());

    assertEquals("./run-TENSORBOARD.sh", component.getLaunchCommand());
    // The mocked launch script content is uploaded and localized into the
    // container work dir under the task-type-derived script name.
    verify(testCommons.fsOperations)
        .uploadToRemoteFileAndLocalizeToContainerWorkDir(
            any(Path.class), eq("mockScript"), eq("run-TENSORBOARD.sh"),
            eq(component));
  }

}
|
|
@ -0,0 +1,166 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.component;
|
||||
|
||||
import static junit.framework.TestCase.assertTrue;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.mockito.ArgumentMatchers.any;
|
||||
import static org.mockito.ArgumentMatchers.eq;
|
||||
import static org.mockito.Mockito.verify;
|
||||
import java.io.IOException;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.yarn.service.api.records.Artifact;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component.RestartPolicyEnum;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
|
||||
import org.junit.Before;
|
||||
import org.junit.Rule;
|
||||
import org.junit.Test;
|
||||
import org.junit.rules.ExpectedException;
|
||||
|
||||
/**
 * This class is to test {@link TensorFlowPsComponent}.
 */
public class TestTensorFlowPsComponent {

  @Rule
  public ExpectedException expectedException = ExpectedException.none();
  // Shared fixture (mocks + assertion helpers), parameterized for PS.
  private ComponentTestCommons testCommons =
      new ComponentTestCommons(TaskType.PS);

  @Before
  public void setUp() throws IOException {
    testCommons.setup();
  }

  // Creates the PS component under test, wired with the fixture's mocks.
  private TensorFlowPsComponent createPsComponent(RunJobParameters parameters) {
    return new TensorFlowPsComponent(
        testCommons.fsOperations,
        testCommons.mockClientContext.getRemoteDirectoryManager(),
        testCommons.mockLaunchCommandFactory,
        parameters,
        testCommons.yarnConfig);
  }

  // Common assertions for a successfully created PS component: name, env
  // vars, empty properties, restart policy, resources, docker artifact and
  // launch script handling.
  private void verifyCommons(Component component) throws IOException {
    assertEquals(testCommons.taskType.getComponentName(), component.getName());
    testCommons.verifyCommonConfigEnvs(component);

    assertTrue(component.getConfiguration().getProperties().isEmpty());

    assertEquals(RestartPolicyEnum.NEVER, component.getRestartPolicy());
    testCommons.verifyResources(component);
    assertEquals(
        new Artifact().type(Artifact.TypeEnum.DOCKER).id("testPSDockerImage"),
        component.getArtifact());

    // Launch command and uploaded script are named after the task type
    // ("run-PS.sh" here).
    String taskTypeUppercase = testCommons.taskType.name().toUpperCase();
    String expectedScriptName = String.format("run-%s.sh", taskTypeUppercase);
    assertEquals(String.format("./%s", expectedScriptName),
        component.getLaunchCommand());
    verify(testCommons.fsOperations)
        .uploadToRemoteFileAndLocalizeToContainerWorkDir(
            any(Path.class), eq("mockScript"), eq(expectedScriptName),
            eq(component));
  }

  @Test
  public void testPSComponentWithNullResource() throws IOException {
    RunJobParameters parameters = new RunJobParameters();
    parameters.setPsResource(null);

    TensorFlowPsComponent psComponent =
        createPsComponent(parameters);

    // createComponent() must fail fast when no PS resource is set.
    expectedException.expect(NullPointerException.class);
    expectedException.expectMessage("PS resource must not be null");
    psComponent.createComponent();
  }

  @Test
  public void testPSComponentWithNullJobName() throws IOException {
    RunJobParameters parameters = new RunJobParameters();
    parameters.setPsResource(testCommons.resource);
    parameters.setNumPS(1);
    parameters.setName(null);

    TensorFlowPsComponent psComponent =
        createPsComponent(parameters);

    // createComponent() must fail fast when the job name is missing.
    expectedException.expect(NullPointerException.class);
    expectedException.expectMessage("Job name must not be null");
    psComponent.createComponent();
  }

  @Test
  public void testPSComponentZeroNumberOfPS() throws IOException {
    testCommons.yarnConfig.set("hadoop.registry.dns.domain-name", "testDomain");

    RunJobParameters parameters = new RunJobParameters();
    parameters.setPsResource(testCommons.resource);
    parameters.setName("testJobName");
    parameters.setPsDockerImage("testPSDockerImage");
    parameters.setNumPS(0);

    TensorFlowPsComponent psComponent =
        createPsComponent(parameters);

    // Zero PS instances is rejected.
    expectedException.expect(IllegalArgumentException.class);
    expectedException.expectMessage("Number of PS should be at least 1!");
    psComponent.createComponent();
  }

  @Test
  public void testPSComponentNumPSIsOne() throws IOException {
    testCommons.yarnConfig.set("hadoop.registry.dns.domain-name", "testDomain");

    RunJobParameters parameters = new RunJobParameters();
    parameters.setPsResource(testCommons.resource);
    parameters.setName("testJobName");
    parameters.setNumPS(1);
    parameters.setPsDockerImage("testPSDockerImage");

    TensorFlowPsComponent psComponent =
        createPsComponent(parameters);

    Component component = psComponent.createComponent();

    // Container count equals the requested number of PS instances.
    assertEquals(1L, (long) component.getNumberOfContainers());
    verifyCommons(component);
  }

  @Test
  public void testPSComponentNumPSIsTwo() throws IOException {
    testCommons.yarnConfig.set("hadoop.registry.dns.domain-name", "testDomain");

    RunJobParameters parameters = new RunJobParameters();
    parameters.setPsResource(testCommons.resource);
    parameters.setName("testJobName");
    parameters.setNumPS(2);
    parameters.setPsDockerImage("testPSDockerImage");

    TensorFlowPsComponent psComponent =
        createPsComponent(parameters);

    Component component = psComponent.createComponent();

    // Container count equals the requested number of PS instances.
    assertEquals(2L, (long) component.getNumberOfContainers());
    verifyCommons(component);
  }

}
|
|
@ -0,0 +1,215 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.component;
|
||||
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.yarn.service.api.records.Artifact;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||
import org.apache.hadoop.yarn.service.api.records.Component.RestartPolicyEnum;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
|
||||
import org.junit.Before;
|
||||
import org.junit.Rule;
|
||||
import org.junit.Test;
|
||||
import org.junit.rules.ExpectedException;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
|
||||
import static junit.framework.TestCase.assertTrue;
|
||||
import static org.apache.hadoop.yarn.service.conf.YarnServiceConstants.CONTAINER_STATE_REPORT_AS_SERVICE_STATE;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.mockito.ArgumentMatchers.any;
|
||||
import static org.mockito.ArgumentMatchers.eq;
|
||||
import static org.mockito.Mockito.verify;
|
||||
|
||||
/**
|
||||
* This class is to test {@link TensorFlowWorkerComponent}.
|
||||
*/
|
||||
public class TestTensorFlowWorkerComponent {
|
||||
|
||||
@Rule
|
||||
public ExpectedException expectedException = ExpectedException.none();
|
||||
private ComponentTestCommons testCommons =
|
||||
new ComponentTestCommons(TaskType.TENSORBOARD);
|
||||
|
||||
@Before
|
||||
public void setUp() throws IOException {
|
||||
testCommons.setup();
|
||||
}
|
||||
|
||||
private TensorFlowWorkerComponent createWorkerComponent(
|
||||
RunJobParameters parameters) {
|
||||
return new TensorFlowWorkerComponent(
|
||||
testCommons.fsOperations,
|
||||
testCommons.mockClientContext.getRemoteDirectoryManager(),
|
||||
parameters, testCommons.taskType,
|
||||
testCommons.mockLaunchCommandFactory,
|
||||
testCommons.yarnConfig);
|
||||
}
|
||||
|
||||
private void verifyCommons(Component component) throws IOException {
|
||||
verifyCommonsInternal(component, ImmutableMap.of());
|
||||
}
|
||||
|
||||
private void verifyCommons(Component component,
|
||||
Map<String, String> expectedProperties) throws IOException {
|
||||
verifyCommonsInternal(component, expectedProperties);
|
||||
}
|
||||
|
||||
private void verifyCommonsInternal(Component component,
|
||||
Map<String, String> expectedProperties) throws IOException {
|
||||
assertEquals(testCommons.taskType.getComponentName(), component.getName());
|
||||
testCommons.verifyCommonConfigEnvs(component);
|
||||
|
||||
Map<String, String> actualProperties =
|
||||
component.getConfiguration().getProperties();
|
||||
if (!expectedProperties.isEmpty()) {
|
||||
assertFalse(actualProperties.isEmpty());
|
||||
expectedProperties.forEach(
|
||||
(k, v) -> assertEquals(v, actualProperties.get(k)));
|
||||
} else {
|
||||
assertTrue(actualProperties.isEmpty());
|
||||
}
|
||||
|
||||
assertEquals(RestartPolicyEnum.NEVER, component.getRestartPolicy());
|
||||
testCommons.verifyResources(component);
|
||||
assertEquals(
|
||||
new Artifact().type(Artifact.TypeEnum.DOCKER)
|
||||
.id("testWorkerDockerImage"),
|
||||
component.getArtifact());
|
||||
|
||||
String taskTypeUppercase = testCommons.taskType.name().toUpperCase();
|
||||
String expectedScriptName = String.format("run-%s.sh", taskTypeUppercase);
|
||||
assertEquals(String.format("./%s", expectedScriptName),
|
||||
component.getLaunchCommand());
|
||||
verify(testCommons.fsOperations)
|
||||
.uploadToRemoteFileAndLocalizeToContainerWorkDir(
|
||||
any(Path.class), eq("mockScript"), eq(expectedScriptName),
|
||||
eq(component));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWorkerComponentWithNullResource() throws IOException {
|
||||
RunJobParameters parameters = new RunJobParameters();
|
||||
parameters.setWorkerResource(null);
|
||||
|
||||
TensorFlowWorkerComponent workerComponent =
|
||||
createWorkerComponent(parameters);
|
||||
|
||||
expectedException.expect(NullPointerException.class);
|
||||
expectedException.expectMessage("Worker resource must not be null");
|
||||
workerComponent.createComponent();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWorkerComponentWithNullJobName() throws IOException {
|
||||
RunJobParameters parameters = new RunJobParameters();
|
||||
parameters.setWorkerResource(testCommons.resource);
|
||||
parameters.setNumWorkers(1);
|
||||
parameters.setName(null);
|
||||
|
||||
TensorFlowWorkerComponent workerComponent =
|
||||
createWorkerComponent(parameters);
|
||||
|
||||
expectedException.expect(NullPointerException.class);
|
||||
expectedException.expectMessage("Job name must not be null");
|
||||
workerComponent.createComponent();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNormalWorkerComponentZeroNumberOfWorkers()
|
||||
throws IOException {
|
||||
testCommons.yarnConfig.set("hadoop.registry.dns.domain-name", "testDomain");
|
||||
|
||||
RunJobParameters parameters = new RunJobParameters();
|
||||
parameters.setWorkerResource(testCommons.resource);
|
||||
parameters.setName("testJobName");
|
||||
parameters.setWorkerDockerImage("testWorkerDockerImage");
|
||||
parameters.setNumWorkers(0);
|
||||
|
||||
TensorFlowWorkerComponent workerComponent =
|
||||
createWorkerComponent(parameters);
|
||||
|
||||
expectedException.expect(IllegalArgumentException.class);
|
||||
expectedException.expectMessage("Number of workers should be at least 1!");
|
||||
workerComponent.createComponent();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNormalWorkerComponentNumWorkersIsOne() throws IOException {
|
||||
testCommons.yarnConfig.set("hadoop.registry.dns.domain-name", "testDomain");
|
||||
|
||||
RunJobParameters parameters = new RunJobParameters();
|
||||
parameters.setWorkerResource(testCommons.resource);
|
||||
parameters.setName("testJobName");
|
||||
parameters.setNumWorkers(1);
|
||||
parameters.setWorkerDockerImage("testWorkerDockerImage");
|
||||
|
||||
TensorFlowWorkerComponent workerComponent =
|
||||
createWorkerComponent(parameters);
|
||||
|
||||
Component component = workerComponent.createComponent();
|
||||
|
||||
assertEquals(0L, (long) component.getNumberOfContainers());
|
||||
verifyCommons(component);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNormalWorkerComponentNumWorkersIsTwo() throws IOException {
|
||||
testCommons.yarnConfig.set("hadoop.registry.dns.domain-name", "testDomain");
|
||||
|
||||
RunJobParameters parameters = new RunJobParameters();
|
||||
parameters.setWorkerResource(testCommons.resource);
|
||||
parameters.setName("testJobName");
|
||||
parameters.setNumWorkers(2);
|
||||
parameters.setWorkerDockerImage("testWorkerDockerImage");
|
||||
|
||||
TensorFlowWorkerComponent workerComponent =
|
||||
createWorkerComponent(parameters);
|
||||
|
||||
Component component = workerComponent.createComponent();
|
||||
|
||||
assertEquals(1L, (long) component.getNumberOfContainers());
|
||||
verifyCommons(component);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPrimaryWorkerComponentNumWorkersIsTwo() throws IOException {
|
||||
testCommons.yarnConfig.set("hadoop.registry.dns.domain-name", "testDomain");
|
||||
testCommons = new ComponentTestCommons(TaskType.PRIMARY_WORKER);
|
||||
testCommons.setup();
|
||||
|
||||
RunJobParameters parameters = new RunJobParameters();
|
||||
parameters.setWorkerResource(testCommons.resource);
|
||||
parameters.setName("testJobName");
|
||||
parameters.setNumWorkers(2);
|
||||
parameters.setWorkerDockerImage("testWorkerDockerImage");
|
||||
|
||||
TensorFlowWorkerComponent workerComponent =
|
||||
createWorkerComponent(parameters);
|
||||
|
||||
Component component = workerComponent.createComponent();
|
||||
|
||||
assertEquals(1L, (long) component.getNumberOfContainers());
|
||||
verifyCommons(component, ImmutableMap.of(
|
||||
CONTAINER_STATE_REPORT_AS_SERVICE_STATE, "true"));
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,91 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.utils;
|
||||
|
||||
import org.apache.hadoop.yarn.submarine.FileUtilitiesForTests;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
import static org.junit.Assert.assertNull;
|
||||
|
||||
/**
|
||||
* This class is to test {@link ClassPathUtilities}.
|
||||
*/
|
||||
public class TestClassPathUtilities {
|
||||
|
||||
private static final String CLASSPATH_KEY = "java.class.path";
|
||||
private FileUtilitiesForTests fileUtils = new FileUtilitiesForTests();
|
||||
private static String originalClasspath;
|
||||
|
||||
@BeforeClass
|
||||
public static void setUpClass() {
|
||||
originalClasspath = System.getProperty(CLASSPATH_KEY);
|
||||
}
|
||||
|
||||
@Before
|
||||
public void setUp() {
|
||||
fileUtils.setup();
|
||||
}
|
||||
|
||||
@After
|
||||
public void teardown() throws IOException {
|
||||
fileUtils.teardown();
|
||||
System.setProperty(CLASSPATH_KEY, originalClasspath);
|
||||
}
|
||||
|
||||
private static void addFileToClasspath(File file) {
|
||||
String newClasspath = originalClasspath + ":" + file.getAbsolutePath();
|
||||
System.setProperty(CLASSPATH_KEY, newClasspath);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void findFileNotInClasspath() {
|
||||
File resultFile = ClassPathUtilities.findFileOnClassPath("bla");
|
||||
assertNull(resultFile);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void findFileOnClasspath() throws Exception {
|
||||
File testFile = fileUtils.createFileInTempDir("testFile");
|
||||
|
||||
addFileToClasspath(testFile);
|
||||
File resultFile = ClassPathUtilities.findFileOnClassPath("testFile");
|
||||
|
||||
assertNotNull(resultFile);
|
||||
assertEquals(testFile.getAbsolutePath(), resultFile.getAbsolutePath());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void findDirectoryOnClasspath() throws Exception {
|
||||
File testDir = fileUtils.createDirInTempDir("testDir");
|
||||
File testFile = fileUtils.createFileInDir(testDir, "testFile");
|
||||
|
||||
addFileToClasspath(testDir);
|
||||
File resultFile = ClassPathUtilities.findFileOnClassPath("testFile");
|
||||
|
||||
assertNotNull(resultFile);
|
||||
assertEquals(testFile.getAbsolutePath(), resultFile.getAbsolutePath());
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,231 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.utils;
|
||||
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import com.google.common.collect.Maps;
|
||||
import org.apache.hadoop.yarn.service.api.records.Configuration;
|
||||
import org.apache.hadoop.yarn.service.api.records.Service;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION;
|
||||
import static org.apache.hadoop.yarn.submarine.utils.EnvironmentUtilities.ENV_DOCKER_MOUNTS_FOR_CONTAINER_RUNTIME;
|
||||
import static org.junit.Assert.*;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
/**
|
||||
* This class is to test {@link EnvironmentUtilities}.
|
||||
*/
|
||||
public class TestEnvironmentUtilities {
|
||||
  // Creates a mocked Service whose configuration holds an empty (mutable)
  // env var map.
  private Service createServiceWithEmptyEnvVars() {
    return createServiceWithEnvVars(Maps.newHashMap());
  }
|
||||
|
||||
  // Creates a mocked Service whose Configuration returns the given env var
  // map from getEnv().
  private Service createServiceWithEnvVars(Map<String, String> envVars) {
    Service service = mock(Service.class);
    Configuration config = mock(Configuration.class);
    when(config.getEnv()).thenReturn(envVars);
    when(service.getConfiguration()).thenReturn(config);

    return service;
  }
|
||||
|
||||
  // Asserts that the default docker mount env var (read-only /etc/passwd)
  // was added to the service's env map.
  private void validateDefaultEnvVars(Map<String, String> resultEnvs) {
    assertEquals("/etc/passwd:/etc/passwd:ro",
        resultEnvs.get(ENV_DOCKER_MOUNTS_FOR_CONTAINER_RUNTIME));
  }
|
||||
|
||||
  // Creates a mocked Hadoop Configuration that reports the given value for
  // hadoop.security.authentication. Fully-qualified name is used to avoid a
  // clash with the YARN service Configuration imported in this file.
  private org.apache.hadoop.conf.Configuration
      createYarnConfigWithSecurityValue(String value) {
    org.apache.hadoop.conf.Configuration mockConfig =
        mock(org.apache.hadoop.conf.Configuration.class);
    when(mockConfig.get(HADOOP_SECURITY_AUTHENTICATION)).thenReturn(value);
    return mockConfig;
  }
|
||||
|
||||
  // A null env var string resolves to an empty value.
  @Test
  public void testGetValueOfNullEnvVar() {
    assertEquals("", EnvironmentUtilities.getValueOfEnvironment(null));
  }
|
||||
|
||||
  // An empty env var string resolves to an empty value.
  @Test
  public void testGetValueOfEmptyEnvVar() {
    assertEquals("", EnvironmentUtilities.getValueOfEnvironment(""));
  }
|
||||
|
||||
@Test
|
||||
public void testGetValueOfEnvVarJustAnEqualsSign() {
|
||||
assertEquals("", EnvironmentUtilities.getValueOfEnvironment("="));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetValueOfEnvVarWithoutValue() {
|
||||
assertEquals("", EnvironmentUtilities.getValueOfEnvironment("a="));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetValueOfEnvVarValidFormat() {
|
||||
assertEquals("bbb", EnvironmentUtilities.getValueOfEnvironment("a=bbb"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHandleServiceEnvWithNullMap() {
|
||||
Service service = createServiceWithEmptyEnvVars();
|
||||
org.apache.hadoop.conf.Configuration yarnConfig =
|
||||
mock(org.apache.hadoop.conf.Configuration.class);
|
||||
EnvironmentUtilities.handleServiceEnvs(service, yarnConfig, null);
|
||||
|
||||
Map<String, String> resultEnvs = service.getConfiguration().getEnv();
|
||||
assertEquals(1, resultEnvs.size());
|
||||
validateDefaultEnvVars(resultEnvs);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHandleServiceEnvWithEmptyMap() {
|
||||
Service service = createServiceWithEmptyEnvVars();
|
||||
org.apache.hadoop.conf.Configuration yarnConfig =
|
||||
mock(org.apache.hadoop.conf.Configuration.class);
|
||||
EnvironmentUtilities.handleServiceEnvs(service, yarnConfig, null);
|
||||
|
||||
Map<String, String> resultEnvs = service.getConfiguration().getEnv();
|
||||
assertEquals(1, resultEnvs.size());
|
||||
validateDefaultEnvVars(resultEnvs);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHandleServiceEnvWithYarnConfigSecurityValueNonKerberos() {
|
||||
Service service = createServiceWithEmptyEnvVars();
|
||||
org.apache.hadoop.conf.Configuration yarnConfig =
|
||||
createYarnConfigWithSecurityValue("nonkerberos");
|
||||
EnvironmentUtilities.handleServiceEnvs(service, yarnConfig, null);
|
||||
|
||||
Map<String, String> resultEnvs = service.getConfiguration().getEnv();
|
||||
assertEquals(1, resultEnvs.size());
|
||||
validateDefaultEnvVars(resultEnvs);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHandleServiceEnvWithYarnConfigSecurityValueKerberos() {
|
||||
Service service = createServiceWithEmptyEnvVars();
|
||||
org.apache.hadoop.conf.Configuration yarnConfig =
|
||||
createYarnConfigWithSecurityValue("kerberos");
|
||||
EnvironmentUtilities.handleServiceEnvs(service, yarnConfig, null);
|
||||
|
||||
Map<String, String> resultEnvs = service.getConfiguration().getEnv();
|
||||
assertEquals(1, resultEnvs.size());
|
||||
assertEquals("/etc/passwd:/etc/passwd:ro,/etc/krb5.conf:/etc/krb5.conf:ro",
|
||||
resultEnvs.get(ENV_DOCKER_MOUNTS_FOR_CONTAINER_RUNTIME));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHandleServiceEnvWithExistingEnvsAndValidNewEnvs() {
|
||||
Map<String, String> existingEnvs = Maps.newHashMap(
|
||||
ImmutableMap.<String, String>builder().
|
||||
put("a", "1").
|
||||
put("b", "2").
|
||||
build());
|
||||
ImmutableList<String> newEnvs = ImmutableList.of("c=3", "d=4");
|
||||
|
||||
Service service = createServiceWithEnvVars(existingEnvs);
|
||||
org.apache.hadoop.conf.Configuration yarnConfig =
|
||||
createYarnConfigWithSecurityValue("kerberos");
|
||||
EnvironmentUtilities.handleServiceEnvs(service, yarnConfig, newEnvs);
|
||||
|
||||
Map<String, String> resultEnvs = service.getConfiguration().getEnv();
|
||||
assertEquals(5, resultEnvs.size());
|
||||
assertEquals("/etc/passwd:/etc/passwd:ro,/etc/krb5.conf:/etc/krb5.conf:ro",
|
||||
resultEnvs.get(ENV_DOCKER_MOUNTS_FOR_CONTAINER_RUNTIME));
|
||||
assertEquals("1", resultEnvs.get("a"));
|
||||
assertEquals("2", resultEnvs.get("b"));
|
||||
assertEquals("3", resultEnvs.get("c"));
|
||||
assertEquals("4", resultEnvs.get("d"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHandleServiceEnvWithExistingEnvsAndNewEnvsWithoutEquals() {
|
||||
Map<String, String> existingEnvs = Maps.newHashMap(
|
||||
ImmutableMap.<String, String>builder().
|
||||
put("a", "1").
|
||||
put("b", "2").
|
||||
build());
|
||||
ImmutableList<String> newEnvs = ImmutableList.of("c3", "d4");
|
||||
|
||||
Service service = createServiceWithEnvVars(existingEnvs);
|
||||
org.apache.hadoop.conf.Configuration yarnConfig =
|
||||
createYarnConfigWithSecurityValue("kerberos");
|
||||
EnvironmentUtilities.handleServiceEnvs(service, yarnConfig, newEnvs);
|
||||
|
||||
Map<String, String> resultEnvs = service.getConfiguration().getEnv();
|
||||
assertEquals(5, resultEnvs.size());
|
||||
assertEquals("/etc/passwd:/etc/passwd:ro,/etc/krb5.conf:/etc/krb5.conf:ro",
|
||||
resultEnvs.get(ENV_DOCKER_MOUNTS_FOR_CONTAINER_RUNTIME));
|
||||
assertEquals("1", resultEnvs.get("a"));
|
||||
assertEquals("2", resultEnvs.get("b"));
|
||||
assertEquals("", resultEnvs.get("c3"));
|
||||
assertEquals("", resultEnvs.get("d4"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHandleServiceEnvWithExistingEnvVarKey() {
|
||||
Map<String, String> existingEnvs = Maps.newHashMap(
|
||||
ImmutableMap.<String, String>builder().
|
||||
put("a", "1").
|
||||
put("b", "2").
|
||||
build());
|
||||
ImmutableList<String> newEnvs = ImmutableList.of("a=33", "c=44");
|
||||
|
||||
Service service = createServiceWithEnvVars(existingEnvs);
|
||||
org.apache.hadoop.conf.Configuration yarnConfig =
|
||||
createYarnConfigWithSecurityValue("kerberos");
|
||||
EnvironmentUtilities.handleServiceEnvs(service, yarnConfig, newEnvs);
|
||||
|
||||
Map<String, String> resultEnvs = service.getConfiguration().getEnv();
|
||||
assertEquals(4, resultEnvs.size());
|
||||
assertEquals("/etc/passwd:/etc/passwd:ro,/etc/krb5.conf:/etc/krb5.conf:ro",
|
||||
resultEnvs.get(ENV_DOCKER_MOUNTS_FOR_CONTAINER_RUNTIME));
|
||||
assertEquals("1:33", resultEnvs.get("a"));
|
||||
assertEquals("2", resultEnvs.get("b"));
|
||||
assertEquals("44", resultEnvs.get("c"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHandleServiceEnvWithExistingEnvVarKeyMultipleTimes() {
|
||||
Map<String, String> existingEnvs = Maps.newHashMap(
|
||||
ImmutableMap.<String, String>builder().
|
||||
put("a", "1").
|
||||
put("b", "2").
|
||||
build());
|
||||
ImmutableList<String> newEnvs = ImmutableList.of("a=33", "a=44");
|
||||
|
||||
Service service = createServiceWithEnvVars(existingEnvs);
|
||||
org.apache.hadoop.conf.Configuration yarnConfig =
|
||||
createYarnConfigWithSecurityValue("kerberos");
|
||||
EnvironmentUtilities.handleServiceEnvs(service, yarnConfig, newEnvs);
|
||||
|
||||
Map<String, String> resultEnvs = service.getConfiguration().getEnv();
|
||||
assertEquals(3, resultEnvs.size());
|
||||
assertEquals("/etc/passwd:/etc/passwd:ro,/etc/krb5.conf:/etc/krb5.conf:ro",
|
||||
resultEnvs.get(ENV_DOCKER_MOUNTS_FOR_CONTAINER_RUNTIME));
|
||||
assertEquals("1:33:44", resultEnvs.get("a"));
|
||||
assertEquals("2", resultEnvs.get("b"));
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,156 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.utils;
|
||||
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.yarn.service.api.records.KerberosPrincipal;
|
||||
import org.apache.hadoop.yarn.submarine.FileUtilitiesForTests;
|
||||
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||
import org.apache.hadoop.yarn.submarine.common.MockClientContext;
|
||||
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
/**
|
||||
* This class is to test {@link KerberosPrincipalFactory}.
|
||||
*/
|
||||
public class TestKerberosPrincipalFactory {
|
||||
private FileUtilitiesForTests fileUtils = new FileUtilitiesForTests();
|
||||
|
||||
@Before
|
||||
public void setUp() {
|
||||
fileUtils.setup();
|
||||
}
|
||||
|
||||
@After
|
||||
public void teardown() throws IOException {
|
||||
fileUtils.teardown();
|
||||
}
|
||||
|
||||
private File createKeytabFile(String keytabFileName) throws IOException {
|
||||
return fileUtils.createFileInTempDir(keytabFileName);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCreatePrincipalEmptyPrincipalAndKeytab() throws IOException {
|
||||
MockClientContext mockClientContext = new MockClientContext();
|
||||
|
||||
RunJobParameters parameters = mock(RunJobParameters.class);
|
||||
when(parameters.getPrincipal()).thenReturn("");
|
||||
when(parameters.getKeytab()).thenReturn("");
|
||||
|
||||
FileSystemOperations fsOperations =
|
||||
new FileSystemOperations(mockClientContext);
|
||||
KerberosPrincipal result =
|
||||
KerberosPrincipalFactory.create(fsOperations,
|
||||
mockClientContext.getRemoteDirectoryManager(), parameters);
|
||||
|
||||
assertNull(result);
|
||||
}
|
||||
@Test
|
||||
public void testCreatePrincipalEmptyPrincipalString() throws IOException {
|
||||
MockClientContext mockClientContext = new MockClientContext();
|
||||
|
||||
RunJobParameters parameters = mock(RunJobParameters.class);
|
||||
when(parameters.getPrincipal()).thenReturn("");
|
||||
when(parameters.getKeytab()).thenReturn("keytab");
|
||||
|
||||
FileSystemOperations fsOperations =
|
||||
new FileSystemOperations(mockClientContext);
|
||||
KerberosPrincipal result =
|
||||
KerberosPrincipalFactory.create(fsOperations,
|
||||
mockClientContext.getRemoteDirectoryManager(), parameters);
|
||||
|
||||
assertNull(result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCreatePrincipalEmptyKeyTabString() throws IOException {
|
||||
MockClientContext mockClientContext = new MockClientContext();
|
||||
|
||||
RunJobParameters parameters = mock(RunJobParameters.class);
|
||||
when(parameters.getPrincipal()).thenReturn("principal");
|
||||
when(parameters.getKeytab()).thenReturn("");
|
||||
|
||||
FileSystemOperations fsOperations =
|
||||
new FileSystemOperations(mockClientContext);
|
||||
KerberosPrincipal result =
|
||||
KerberosPrincipalFactory.create(fsOperations,
|
||||
mockClientContext.getRemoteDirectoryManager(), parameters);
|
||||
|
||||
assertNull(result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCreatePrincipalNonEmptyPrincipalAndKeytab()
|
||||
throws IOException {
|
||||
MockClientContext mockClientContext = new MockClientContext();
|
||||
|
||||
RunJobParameters parameters = mock(RunJobParameters.class);
|
||||
when(parameters.getPrincipal()).thenReturn("principal");
|
||||
when(parameters.getKeytab()).thenReturn("keytab");
|
||||
|
||||
FileSystemOperations fsOperations =
|
||||
new FileSystemOperations(mockClientContext);
|
||||
KerberosPrincipal result =
|
||||
KerberosPrincipalFactory.create(fsOperations,
|
||||
mockClientContext.getRemoteDirectoryManager(), parameters);
|
||||
|
||||
assertNotNull(result);
|
||||
assertEquals("file://keytab", result.getKeytab());
|
||||
assertEquals("principal", result.getPrincipalName());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCreatePrincipalDistributedKeytab() throws IOException {
|
||||
MockClientContext mockClientContext = new MockClientContext();
|
||||
String jobname = "testJobname";
|
||||
String keytab = "testKeytab";
|
||||
File keytabFile = createKeytabFile(keytab);
|
||||
|
||||
RunJobParameters parameters = mock(RunJobParameters.class);
|
||||
when(parameters.getPrincipal()).thenReturn("principal");
|
||||
when(parameters.getKeytab()).thenReturn(keytabFile.getAbsolutePath());
|
||||
when(parameters.getName()).thenReturn(jobname);
|
||||
when(parameters.isDistributeKeytab()).thenReturn(true);
|
||||
|
||||
FileSystemOperations fsOperations =
|
||||
new FileSystemOperations(mockClientContext);
|
||||
|
||||
KerberosPrincipal result =
|
||||
KerberosPrincipalFactory.create(fsOperations,
|
||||
mockClientContext.getRemoteDirectoryManager(), parameters);
|
||||
|
||||
Path stagingDir = mockClientContext.getRemoteDirectoryManager()
|
||||
.getJobStagingArea(parameters.getName(), true);
|
||||
String expectedKeytabFilePath =
|
||||
FileUtilitiesForTests.getFilename(stagingDir, keytab).getAbsolutePath();
|
||||
|
||||
assertNotNull(result);
|
||||
assertEquals(expectedKeytabFilePath, result.getKeytab());
|
||||
assertEquals("principal", result.getPrincipalName());
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,72 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.submarine.utils;
|
||||
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.service.api.records.ResourceInformation;
|
||||
import org.apache.hadoop.yarn.util.resource.CustomResourceTypesConfigurationProvider;
|
||||
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
|
||||
import org.junit.After;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
/**
|
||||
* This class is to test {@link SubmarineResourceUtils}.
|
||||
*/
|
||||
public class TestSubmarineResourceUtils {
|
||||
private static final String CUSTOM_RESOURCE_NAME = "a-custom-resource";
|
||||
|
||||
private void initResourceTypes() {
|
||||
CustomResourceTypesConfigurationProvider.initResourceTypes(
|
||||
ImmutableMap.<String, String>builder()
|
||||
.put(CUSTOM_RESOURCE_NAME, "G")
|
||||
.build());
|
||||
}
|
||||
|
||||
@After
|
||||
public void cleanup() {
|
||||
ResourceUtils.resetResourceTypes(new Configuration());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testConvertResourceWithCustomResource() {
|
||||
initResourceTypes();
|
||||
Resource res = Resource.newInstance(4096, 12,
|
||||
ImmutableMap.of(CUSTOM_RESOURCE_NAME, 20L));
|
||||
|
||||
org.apache.hadoop.yarn.service.api.records.Resource serviceResource =
|
||||
SubmarineResourceUtils.convertYarnResourceToServiceResource(res);
|
||||
|
||||
assertEquals(12, serviceResource.getCpus().intValue());
|
||||
assertEquals(4096, (int) Integer.valueOf(serviceResource.getMemory()));
|
||||
Map<String, ResourceInformation> additionalResources =
|
||||
serviceResource.getAdditional();
|
||||
|
||||
// Additional resources also includes vcores and memory
|
||||
assertEquals(3, additionalResources.size());
|
||||
ResourceInformation customResourceRI =
|
||||
additionalResources.get(CUSTOM_RESOURCE_NAME);
|
||||
assertEquals("G", customResourceRI.getUnit());
|
||||
assertEquals(20L, (long) customResourceRI.getValue());
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue