SUBMARINE-54. Add test coverage for YarnServiceJobSubmitter and make it ready for extension for PyTorch. Contributed by Szilard Nemeth.
parent afe6613ee6
commit 0b3d41bdee
@@ -293,10 +293,18 @@ public class RunJobParameters extends RunParameters {
     return psDockerImage;
   }
 
+  public void setPsDockerImage(String psDockerImage) {
+    this.psDockerImage = psDockerImage;
+  }
+
   public String getWorkerDockerImage() {
     return workerDockerImage;
   }
 
+  public void setWorkerDockerImage(String workerDockerImage) {
+    this.workerDockerImage = workerDockerImage;
+  }
+
   public boolean isDistributed() {
     return distributed;
   }
@@ -313,6 +321,10 @@ public class RunJobParameters extends RunParameters {
     return tensorboardDockerImage;
   }
 
+  public void setTensorboardDockerImage(String tensorboardDockerImage) {
+    this.tensorboardDockerImage = tensorboardDockerImage;
+  }
+
   public List<Quicklink> getQuicklinks() {
     return quicklinks;
   }
@@ -366,6 +378,10 @@ public class RunJobParameters extends RunParameters {
     return this;
   }
 
+  public void setDistributed(boolean distributed) {
+    this.distributed = distributed;
+  }
+
   @VisibleForTesting
   public static class UnderscoreConverterPropertyUtils extends PropertyUtils {
     @Override
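The setters added above pair with the existing getters so that tests can assemble a RunJobParameters instance directly instead of going through CLI parsing. A minimal sketch of such a test fixture (illustrative only, not part of this commit; it assumes the class can be instantiated with a default constructor, and the image names are made up):

    RunJobParameters parameters = new RunJobParameters();
    parameters.setPsDockerImage("ps-docker:0.0.1");
    parameters.setWorkerDockerImage("worker-docker:0.0.1");
    parameters.setTensorboardDockerImage("tb-docker:0.0.1");
    parameters.setDistributed(true);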
@@ -177,6 +177,25 @@ public class TestRunJobCliParsing {
     Assert.assertTrue(success);
   }
 
+  @Test
+  public void testJobWithoutName() throws Exception {
+    RunJobCli runJobCli = new RunJobCli(getMockClientContext());
+    String expectedErrorMessage =
+        "--" + CliConstants.NAME + " is absent";
+    String actualMessage = "";
+    try {
+      runJobCli.run(
+          new String[]{"--docker_image", "tf-docker:1.1.0",
+              "--num_workers", "0", "--tensorboard", "--verbose",
+              "--tensorboard_resources", "memory=2G,vcores=2",
+              "--tensorboard_docker_image", "tb_docker_image:001"});
+    } catch (ParseException e) {
+      actualMessage = e.getMessage();
+      e.printStackTrace();
+    }
+    assertEquals(expectedErrorMessage, actualMessage);
+  }
+
   @Test
   public void testLaunchCommandPatternReplace() throws Exception {
     RunJobCli runJobCli = new RunJobCli(getMockClientContext());
@@ -26,6 +26,7 @@ import org.apache.hadoop.fs.Path;
 
 import java.io.File;
 import java.io.IOException;
+import java.util.Objects;
 
 public class MockRemoteDirectoryManager implements RemoteDirectoryManager {
   private File jobsParentDir = null;
@@ -35,6 +36,7 @@ public class MockRemoteDirectoryManager implements RemoteDirectoryManager {
   @Override
   public Path getJobStagingArea(String jobName, boolean create)
       throws IOException {
+    Objects.requireNonNull(jobName, "Job name must not be null!");
    if (jobsParentDir == null && create) {
      jobsParentDir = new File(
          "target/_staging_area_" + System.currentTimeMillis());
@@ -115,6 +115,12 @@
       <artifactId>hadoop-yarn-services-core</artifactId>
       <version>3.3.0-SNAPSHOT</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-yarn-common</artifactId>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
 
   <build>
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.yarn.service.api.records.Component;
+import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
+import org.apache.hadoop.yarn.submarine.common.api.TaskType;
+import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
+import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.AbstractLaunchCommand;
+import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.LaunchCommandFactory;
+
+import java.io.IOException;
+import java.util.Objects;
+
+import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.getScriptFileName;
+
+/**
+ * Abstract base class for Component classes.
+ * The implementations of this class act like factories for
+ * {@link Component} instances.
+ * All dependencies are passed to the constructor so that child classes
+ * are obliged to provide matching constructors.
+ */
+public abstract class AbstractComponent {
+  private final FileSystemOperations fsOperations;
+  protected final RunJobParameters parameters;
+  protected final TaskType taskType;
+  private final RemoteDirectoryManager remoteDirectoryManager;
+  protected final Configuration yarnConfig;
+  private final LaunchCommandFactory launchCommandFactory;
+
+  /**
+   * This is only required for testing.
+   */
+  private String localScriptFile;
+
+  public AbstractComponent(FileSystemOperations fsOperations,
+      RemoteDirectoryManager remoteDirectoryManager,
+      RunJobParameters parameters, TaskType taskType,
+      Configuration yarnConfig,
+      LaunchCommandFactory launchCommandFactory) {
+    this.fsOperations = fsOperations;
+    this.remoteDirectoryManager = remoteDirectoryManager;
+    this.parameters = parameters;
+    this.taskType = taskType;
+    this.launchCommandFactory = launchCommandFactory;
+    this.yarnConfig = yarnConfig;
+  }
+
+  protected abstract Component createComponent() throws IOException;
+
+  /**
+   * Generates a command launch script on local disk and sets the uploaded
+   * script as the component's launch command.
+   */
+  protected void generateLaunchCommand(Component component)
+      throws IOException {
+    AbstractLaunchCommand launchCommand =
+        launchCommandFactory.createLaunchCommand(taskType, component);
+    this.localScriptFile = launchCommand.generateLaunchScript();
+
+    String remoteLaunchCommand = uploadLaunchCommand(component);
+    component.setLaunchCommand(remoteLaunchCommand);
+  }
+
+  private String uploadLaunchCommand(Component component)
+      throws IOException {
+    Objects.requireNonNull(localScriptFile, "localScriptFile should be " +
+        "set before calling this method!");
+    Path stagingDir =
+        remoteDirectoryManager.getJobStagingArea(parameters.getName(), true);
+
+    String destScriptFileName = getScriptFileName(taskType);
+    fsOperations.uploadToRemoteFileAndLocalizeToContainerWorkDir(stagingDir,
+        localScriptFile, destScriptFileName, component);
+
+    return "./" + destScriptFileName;
+  }
+
+  String getLocalScriptFile() {
+    return localScriptFile;
+  }
+}
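For illustration, a concrete subclass of AbstractComponent could look like the sketch below. The WorkerComponent name and body are hypothetical and not part of this commit; they only demonstrate the intended contract: pass all dependencies up to the base constructor, build the Component in createComponent(), and let generateLaunchCommand() produce and upload the launch script.

    // Hypothetical subclass, not part of this diff.
    public class WorkerComponent extends AbstractComponent {
      public WorkerComponent(FileSystemOperations fsOperations,
          RemoteDirectoryManager remoteDirectoryManager,
          RunJobParameters parameters, Configuration yarnConfig,
          LaunchCommandFactory launchCommandFactory) {
        super(fsOperations, remoteDirectoryManager, parameters,
            TaskType.WORKER, yarnConfig, launchCommandFactory);
      }

      @Override
      protected Component createComponent() throws IOException {
        Component component = new Component();
        component.setName(taskType.getComponentName());
        component.setNumberOfContainers((long) parameters.getNumWorkers());
        // Generates the local script, uploads it to the staging area and
        // sets "./run-WORKER.sh" as the component's launch command.
        generateLaunchCommand(component);
        return component;
      }
    }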
@@ -0,0 +1,201 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.yarn.service.api.records.Component;
+import org.apache.hadoop.yarn.service.api.records.ConfigFile;
+import org.apache.hadoop.yarn.submarine.common.ClientContext;
+import org.apache.hadoop.yarn.submarine.common.conf.SubmarineConfiguration;
+import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
+import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
+import org.apache.hadoop.yarn.submarine.utils.ZipUtilities;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Contains methods to perform file system operations. Almost all of the
+ * methods are regular non-static methods as the operations are performed with
+ * the help of a {@link RemoteDirectoryManager} instance passed in as a
+ * constructor dependency. Please note that some operations require reading
+ * config settings as well, so we also have Submarine and YARN config objects
+ * as dependencies.
+ */
+public class FileSystemOperations {
+  private static final Logger LOG =
+      LoggerFactory.getLogger(FileSystemOperations.class);
+  private final Configuration submarineConfig;
+  private final Configuration yarnConfig;
+
+  private Set<Path> uploadedFiles = new HashSet<>();
+  private RemoteDirectoryManager remoteDirectoryManager;
+
+  public FileSystemOperations(ClientContext clientContext) {
+    this.remoteDirectoryManager = clientContext.getRemoteDirectoryManager();
+    this.submarineConfig = clientContext.getSubmarineConfig();
+    this.yarnConfig = clientContext.getYarnConfig();
+  }
+
+  /**
+   * May download a remote URI (file/dir) and zip it.
+   * The download is skipped for local directories: the remote URI can be a
+   * local dir (won't download) or a remote HDFS or S3 dir/file, etc.
+   */
+  public String downloadAndZip(String remoteDir, String zipFileName,
+      boolean doZip)
+      throws IOException {
+    //Append original modification time and size to zip file name
+    String suffix;
+    String srcDir = remoteDir;
+    String zipDirPath =
+        System.getProperty("java.io.tmpdir") + "/" + zipFileName;
+    boolean needDeleteTempDir = false;
+    if (remoteDirectoryManager.isRemote(remoteDir)) {
+      //Append original modification time and size to zip file name
+      FileStatus status =
+          remoteDirectoryManager.getRemoteFileStatus(new Path(remoteDir));
+      suffix = "_" + status.getModificationTime()
+          + "-" + remoteDirectoryManager.getRemoteFileSize(remoteDir);
+      // Download them to temp dir
+      boolean downloaded =
+          remoteDirectoryManager.copyRemoteToLocal(remoteDir, zipDirPath);
+      if (!downloaded) {
+        throw new IOException("Failed to download files from "
+            + remoteDir);
+      }
+      LOG.info("Downloaded remote: {} to local: {}", remoteDir, zipDirPath);
+      srcDir = zipDirPath;
+      needDeleteTempDir = true;
+    } else {
+      File localDir = new File(remoteDir);
+      suffix = "_" + localDir.lastModified()
+          + "-" + localDir.length();
+    }
+    if (!doZip) {
+      return srcDir;
+    }
+    // zip a local dir
+    String zipFileUri =
+        ZipUtilities.zipDir(srcDir, zipDirPath + suffix + ".zip");
+    // delete downloaded temp dir
+    if (needDeleteTempDir) {
+      deleteFiles(srcDir);
+    }
+    return zipFileUri;
+  }
+
+  public void deleteFiles(String localUri) {
+    boolean success = FileUtil.fullyDelete(new File(localUri));
+    if (!success) {
+      LOG.warn("Failed to delete {}", localUri);
+    }
+    LOG.info("Deleted {}", localUri);
+  }
+
+  @VisibleForTesting
+  public void uploadToRemoteFileAndLocalizeToContainerWorkDir(Path stagingDir,
+      String fileToUpload, String destFilename, Component comp)
+      throws IOException {
+    Path uploadedFilePath = uploadToRemoteFile(stagingDir, fileToUpload);
+    locateRemoteFileToContainerWorkDir(destFilename, comp, uploadedFilePath);
+  }
+
+  private void locateRemoteFileToContainerWorkDir(String destFilename,
+      Component comp, Path uploadedFilePath)
+      throws IOException {
+    FileSystem fs = FileSystem.get(yarnConfig);
+
+    FileStatus fileStatus = fs.getFileStatus(uploadedFilePath);
+    LOG.info("Uploaded file path = " + fileStatus.getPath());
+
+    // Set it to component's files list
+    comp.getConfiguration().getFiles().add(new ConfigFile().srcFile(
+        fileStatus.getPath().toUri().toString()).destFile(destFilename)
+        .type(ConfigFile.TypeEnum.STATIC));
+  }
+
+  public Path uploadToRemoteFile(Path stagingDir, String fileToUpload) throws
+      IOException {
+    FileSystem fs = remoteDirectoryManager.getDefaultFileSystem();
+
+    // Upload to remote FS under staging area
+    File localFile = new File(fileToUpload);
+    if (!localFile.exists()) {
+      throw new FileNotFoundException(
+          "Trying to upload file=" + localFile.getAbsolutePath()
+              + " to remote, but couldn't find local file.");
+    }
+    String filename = new File(fileToUpload).getName();
+
+    Path uploadedFilePath = new Path(stagingDir, filename);
+    if (!uploadedFiles.contains(uploadedFilePath)) {
+      if (SubmarineLogs.isVerbose()) {
+        LOG.info("Copying local file=" + fileToUpload + " to remote="
+            + uploadedFilePath);
+      }
+      fs.copyFromLocalFile(new Path(fileToUpload), uploadedFilePath);
+      uploadedFiles.add(uploadedFilePath);
+    }
+    return uploadedFilePath;
+  }
+
+  public void validFileSize(String uri) throws IOException {
+    long actualSizeByte;
+    String locationType = "Local";
+    if (remoteDirectoryManager.isRemote(uri)) {
+      actualSizeByte = remoteDirectoryManager.getRemoteFileSize(uri);
+      locationType = "Remote";
+    } else {
+      actualSizeByte = FileUtil.getDU(new File(uri));
+    }
+    long maxFileSizeMB = submarineConfig
+        .getLong(SubmarineConfiguration.LOCALIZATION_MAX_ALLOWED_FILE_SIZE_MB,
+            SubmarineConfiguration.DEFAULT_MAX_ALLOWED_REMOTE_URI_SIZE_MB);
+    LOG.info("{} file/dir: {}, size(Byte): {},"
+        + " allowed max file/dir size: {}",
+        locationType, uri, actualSizeByte, maxFileSizeMB * 1024 * 1024);
+
+    if (actualSizeByte > maxFileSizeMB * 1024 * 1024) {
+      throw new IOException(uri + " size(Byte): "
+          + actualSizeByte + " exceeds configured max size: "
+          + maxFileSizeMB * 1024 * 1024);
+    }
+  }
+
+  public void setPermission(Path destPath, FsPermission permission) throws
+      IOException {
+    FileSystem fs = FileSystem.get(yarnConfig);
+    fs.setPermission(destPath, new FsPermission(permission));
+  }
+
+  public static boolean needHdfs(String content) {
+    return content != null && content.contains("hdfs://");
+  }
+}
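A quick sketch of how the new class is meant to be used (illustrative only, not part of this diff; clientContext stands for any ClientContext such as the mock used in the tests, and the HDFS URI is made up):

    FileSystemOperations fsOperations = new FileSystemOperations(clientContext);

    // Reject oversized localization sources before doing any work.
    fsOperations.validFileSize("hdfs://namenode:8020/user/submarine/data");

    // Download the remote dir to a temp dir and zip it; returns the zip URI.
    String zipUri = fsOperations.downloadAndZip(
        "hdfs://namenode:8020/user/submarine/data", "data", true);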
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.yarn.service.api.records.Component;
+import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
+import org.apache.hadoop.yarn.submarine.common.ClientContext;
+import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
+import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+
+import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations.needHdfs;
+import static org.apache.hadoop.yarn.submarine.utils.ClassPathUtilities.findFileOnClassPath;
+import static org.apache.hadoop.yarn.submarine.utils.EnvironmentUtilities.getValueOfEnvironment;
+
+/**
+ * This class contains helper methods to fill HDFS and Java environment
+ * variables into scripts.
+ */
+public class HadoopEnvironmentSetup {
+  private static final Logger LOG =
+      LoggerFactory.getLogger(HadoopEnvironmentSetup.class);
+  private static final String CORE_SITE_XML = "core-site.xml";
+  private static final String HDFS_SITE_XML = "hdfs-site.xml";
+
+  public static final String DOCKER_HADOOP_HDFS_HOME =
+      "DOCKER_HADOOP_HDFS_HOME";
+  public static final String DOCKER_JAVA_HOME = "DOCKER_JAVA_HOME";
+  private final RemoteDirectoryManager remoteDirectoryManager;
+  private final FileSystemOperations fsOperations;
+
+  public HadoopEnvironmentSetup(ClientContext clientContext,
+      FileSystemOperations fsOperations) {
+    this.remoteDirectoryManager = clientContext.getRemoteDirectoryManager();
+    this.fsOperations = fsOperations;
+  }
+
+  public void addHdfsClassPath(RunJobParameters parameters,
+      PrintWriter fw, Component comp) throws IOException {
+    // Find envs to use HDFS
+    String hdfsHome = null;
+    String javaHome = null;
+
+    boolean hadoopEnv = false;
+
+    for (String envVar : parameters.getEnvars()) {
+      if (envVar.startsWith(DOCKER_HADOOP_HDFS_HOME + "=")) {
+        hdfsHome = getValueOfEnvironment(envVar);
+        hadoopEnv = true;
+      } else if (envVar.startsWith(DOCKER_JAVA_HOME + "=")) {
+        javaHome = getValueOfEnvironment(envVar);
+      }
+    }
+
+    boolean hasHdfsEnvs = hdfsHome != null && javaHome != null;
+    boolean needHdfs = doesNeedHdfs(parameters, hadoopEnv);
+    if (needHdfs) {
+      // HDFS is asked either in input or output, set LD_LIBRARY_PATH
+      // and classpath
+      if (hdfsHome != null) {
+        appendHdfsHome(fw, hdfsHome);
+      }
+
+      // hadoop confs will be uploaded to HDFS and localized to container's
+      // local folder, so here set $HADOOP_CONF_DIR to $WORK_DIR.
+      fw.append("export HADOOP_CONF_DIR=$WORK_DIR\n");
+      if (javaHome != null) {
+        appendJavaHome(fw, javaHome);
+      }
+
+      fw.append(
+          "export CLASSPATH=`$HADOOP_HDFS_HOME/bin/hadoop classpath --glob`\n");
+    }
+
+    if (needHdfs && !hasHdfsEnvs) {
+      LOG.error("When HDFS is being used to read/write models/data, " +
+          "the following environment variables are required: " +
+          "1) {}=<HDFS_HOME inside docker container> " +
+          "2) {}=<JAVA_HOME inside docker container>. " +
+          "You can use --env to pass these environment variables.",
+          DOCKER_HADOOP_HDFS_HOME, DOCKER_JAVA_HOME);
+      throw new IOException("Failed to detect HDFS-related environments.");
+    }
+
+    // Trying to upload core-site.xml and hdfs-site.xml
+    Path stagingDir =
+        remoteDirectoryManager.getJobStagingArea(
+            parameters.getName(), true);
+    File coreSite = findFileOnClassPath(CORE_SITE_XML);
+    File hdfsSite = findFileOnClassPath(HDFS_SITE_XML);
+    if (coreSite == null || hdfsSite == null) {
+      LOG.error("HDFS is being used, however we could not locate " +
+          "{} nor {} on classpath! " +
+          "Please double check your classpath setting and make sure these " +
+          "setting files are included!", CORE_SITE_XML, HDFS_SITE_XML);
+      throw new IOException(
+          "Failed to locate core-site.xml / hdfs-site.xml on classpath!");
+    }
+    fsOperations.uploadToRemoteFileAndLocalizeToContainerWorkDir(stagingDir,
+        coreSite.getAbsolutePath(), CORE_SITE_XML, comp);
+    fsOperations.uploadToRemoteFileAndLocalizeToContainerWorkDir(stagingDir,
+        hdfsSite.getAbsolutePath(), HDFS_SITE_XML, comp);
+
+    // DEBUG
+    if (SubmarineLogs.isVerbose()) {
+      appendEchoOfEnvVars(fw);
+    }
+  }
+
+  private boolean doesNeedHdfs(RunJobParameters parameters, boolean hadoopEnv) {
+    return needHdfs(parameters.getInputPath()) ||
+        needHdfs(parameters.getPSLaunchCmd()) ||
+        needHdfs(parameters.getWorkerLaunchCmd()) ||
+        hadoopEnv;
+  }
+
+  private void appendHdfsHome(PrintWriter fw, String hdfsHome) {
+    // Unset HADOOP_HOME/HADOOP_YARN_HOME to make sure host machine's envs
+    // won't pollute docker's env.
+    fw.append("export HADOOP_HOME=\n");
+    fw.append("export HADOOP_YARN_HOME=\n");
+    fw.append("export HADOOP_HDFS_HOME=" + hdfsHome + "\n");
+    fw.append("export HADOOP_COMMON_HOME=" + hdfsHome + "\n");
+  }
+
+  private void appendJavaHome(PrintWriter fw, String javaHome) {
+    fw.append("export JAVA_HOME=" + javaHome + "\n");
+    fw.append("export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:"
+        + "$JAVA_HOME/lib/amd64/server\n");
+  }
+
+  private void appendEchoOfEnvVars(PrintWriter fw) {
+    fw.append("echo \"CLASSPATH:$CLASSPATH\"\n");
+    fw.append("echo \"HADOOP_CONF_DIR:$HADOOP_CONF_DIR\"\n");
+    fw.append(
+        "echo \"HADOOP_TOKEN_FILE_LOCATION:$HADOOP_TOKEN_FILE_LOCATION\"\n");
+    fw.append("echo \"JAVA_HOME:$JAVA_HOME\"\n");
+    fw.append("echo \"LD_LIBRARY_PATH:$LD_LIBRARY_PATH\"\n");
+    fw.append("echo \"HADOOP_HDFS_HOME:$HADOOP_HDFS_HOME\"\n");
+  }
+}
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;
+
+import java.io.IOException;
+
+/**
+ * This interface provides a means of creating wrappers around
+ * {@link org.apache.hadoop.yarn.service.api.records.Service} instances.
+ */
+public interface ServiceSpec {
+  ServiceWrapper create() throws IOException;
+}
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;
+
+import org.apache.hadoop.yarn.service.api.records.Service;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+
+import static org.apache.hadoop.yarn.service.utils.ServiceApiUtil.jsonSerDeser;
+
+/**
+ * This class is solely responsible for creating the JSON representation of
+ * {@link Service} instances.
+ */
+public final class ServiceSpecFileGenerator {
+  private ServiceSpecFileGenerator() {
+    throw new UnsupportedOperationException("This class should not be " +
+        "instantiated!");
+  }
+
+  static String generateJson(Service service) throws IOException {
+    File serviceSpecFile = File.createTempFile(service.getName(), ".json");
+    String buffer = jsonSerDeser.toJson(service);
+    Writer w = new OutputStreamWriter(new FileOutputStream(serviceSpecFile),
+        StandardCharsets.UTF_8);
+    try (PrintWriter pw = new PrintWriter(w)) {
+      pw.append(buffer);
+    }
+    return serviceSpecFile.getAbsolutePath();
+  }
+}
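Note that generateJson is package-private, so only callers within the yarnservice runtime package (such as the job submitter) can serialize a Service. A sketch of the intended call site from within that package (illustrative only; serviceWrapper stands for any ServiceWrapper instance):

    Service service = serviceWrapper.getService();
    // Writes <service-name>*.json to a temp file and returns its path,
    // which is then handed to YARN's AppAdminClient to launch the service.
    String specFilePath = ServiceSpecFileGenerator.generateJson(service);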
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.Maps;
+import org.apache.hadoop.yarn.service.api.records.Component;
+import org.apache.hadoop.yarn.service.api.records.Service;
+
+import java.io.IOException;
+import java.util.Map;
+
+/**
+ * This class only exists because we need a component name to
+ * local launch command mapping from the test code.
+ * Once this is solved in a cleaner or different way, we can delete this class.
+ */
+public class ServiceWrapper {
+  private final Service service;
+
+  @VisibleForTesting
+  private Map<String, String> componentToLocalLaunchCommand = Maps.newHashMap();
+
+  public ServiceWrapper(Service service) {
+    this.service = service;
+  }
+
+  public void addComponent(AbstractComponent abstractComponent)
+      throws IOException {
+    Component component = abstractComponent.createComponent();
+    service.addComponent(component);
+    storeComponentName(abstractComponent, component.getName());
+  }
+
+  private void storeComponentName(
+      AbstractComponent component, String name) {
+    componentToLocalLaunchCommand.put(name,
+        component.getLocalScriptFile());
+  }
+
+  public Service getService() {
+    return service;
+  }
+
+  public String getLocalLaunchCommandPathForComponent(String componentName) {
+    return componentToLocalLaunchCommand.get(componentName);
+  }
+}
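A sketch of what the test code gains from the wrapper (illustrative only, not part of this diff; workerComponent stands for any AbstractComponent whose createComponent() names the component "worker"):

    ServiceWrapper wrapper = new ServiceWrapper(service);
    wrapper.addComponent(workerComponent);
    // The test can now open the locally generated launch script and
    // assert on its contents.
    String localScript =
        wrapper.getLocalLaunchCommandPathForComponent("worker");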
@@ -15,858 +15,59 @@
 package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;
 
 import com.google.common.annotations.VisibleForTesting;
-import org.apache.commons.lang3.StringUtils;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.FileUtil;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.permission.FsPermission;
 import org.apache.hadoop.yarn.api.records.ApplicationId;
 import org.apache.hadoop.yarn.client.api.AppAdminClient;
 import org.apache.hadoop.yarn.exceptions.YarnException;
-import org.apache.hadoop.yarn.service.api.ServiceApiConstants;
-import org.apache.hadoop.yarn.service.api.records.Artifact;
-import org.apache.hadoop.yarn.service.api.records.Component;
-import org.apache.hadoop.yarn.service.api.records.ConfigFile;
-import org.apache.hadoop.yarn.service.api.records.Resource;
-import org.apache.hadoop.yarn.service.api.records.ResourceInformation;
 import org.apache.hadoop.yarn.service.api.records.Service;
-import org.apache.hadoop.yarn.service.api.records.KerberosPrincipal;
 import org.apache.hadoop.yarn.service.utils.ServiceApiUtil;
-import org.apache.hadoop.yarn.submarine.client.cli.param.Localization;
-import org.apache.hadoop.yarn.submarine.client.cli.param.Quicklink;
 import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
 import org.apache.hadoop.yarn.submarine.common.ClientContext;
-import org.apache.hadoop.yarn.submarine.common.Envs;
-import org.apache.hadoop.yarn.submarine.common.api.TaskType;
-import org.apache.hadoop.yarn.submarine.common.conf.SubmarineConfiguration;
-import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
-import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
 import org.apache.hadoop.yarn.submarine.runtimes.common.JobSubmitter;
+import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.LaunchCommandFactory;
+import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowServiceSpec;
+import org.apache.hadoop.yarn.submarine.utils.Localizer;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
 import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
-import java.io.Writer;
-import java.nio.charset.StandardCharsets;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.StringTokenizer;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipOutputStream;
 
-import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION;
-
-import static org.apache.hadoop.yarn.service.conf.YarnServiceConstants
-    .CONTAINER_STATE_REPORT_AS_SERVICE_STATE;
 import static org.apache.hadoop.yarn.service.exceptions.LauncherExitCodes.EXIT_SUCCESS;
-import static org.apache.hadoop.yarn.service.utils.ServiceApiUtil.jsonSerDeser;
 
 /**
- * Submit a job to cluster
+ * Submit a job to cluster.
  */
 public class YarnServiceJobSubmitter implements JobSubmitter {
-  public static final String TENSORBOARD_QUICKLINK_LABEL = "Tensorboard";
   private static final Logger LOG =
       LoggerFactory.getLogger(YarnServiceJobSubmitter.class);
-  ClientContext clientContext;
-  Service serviceSpec;
-  private Set<Path> uploadedFiles = new HashSet<>();
+  private ClientContext clientContext;
+  private ServiceWrapper serviceWrapper;
 
-  // Used by testing
-  private Map<String, String> componentToLocalLaunchScriptPath =
-      new HashMap<>();
-
-  public YarnServiceJobSubmitter(ClientContext clientContext) {
+  YarnServiceJobSubmitter(ClientContext clientContext) {
     this.clientContext = clientContext;
   }
 
-  private Resource getServiceResourceFromYarnResource(
-      org.apache.hadoop.yarn.api.records.Resource yarnResource) {
-    Resource serviceResource = new Resource();
-    serviceResource.setCpus(yarnResource.getVirtualCores());
-    serviceResource.setMemory(String.valueOf(yarnResource.getMemorySize()));
-
-    Map<String, ResourceInformation> riMap = new HashMap<>();
-    for (org.apache.hadoop.yarn.api.records.ResourceInformation ri : yarnResource
-        .getAllResourcesListCopy()) {
-      ResourceInformation serviceRi =
-          new ResourceInformation();
-      serviceRi.setValue(ri.getValue());
-      serviceRi.setUnit(ri.getUnits());
-      riMap.put(ri.getName(), serviceRi);
-    }
-    serviceResource.setResourceInformations(riMap);
-
-    return serviceResource;
-  }
-
-  private String getValueOfEnvironment(String envar) {
-    // extract value from "key=value" form
-    if (envar == null || !envar.contains("=")) {
-      return "";
-    } else {
-      return envar.substring(envar.indexOf("=") + 1);
-    }
-  }
-
-  private boolean needHdfs(String content) {
-    return content != null && content.contains("hdfs://");
-  }
-
-  private void addHdfsClassPathIfNeeded(RunJobParameters parameters,
-      PrintWriter fw, Component comp) throws IOException {
-    // Find envs to use HDFS
-    String hdfsHome = null;
-    String javaHome = null;
-
-    boolean hadoopEnv = false;
-
-    for (String envar : parameters.getEnvars()) {
-      if (envar.startsWith("DOCKER_HADOOP_HDFS_HOME=")) {
-        hdfsHome = getValueOfEnvironment(envar);
-        hadoopEnv = true;
-      } else if (envar.startsWith("DOCKER_JAVA_HOME=")) {
-        javaHome = getValueOfEnvironment(envar);
-      }
-    }
-
-    boolean lackingEnvs = false;
-
-    if (needHdfs(parameters.getInputPath()) || needHdfs(
-        parameters.getPSLaunchCmd()) || needHdfs(
-        parameters.getWorkerLaunchCmd()) || hadoopEnv) {
-      // HDFS is asked either in input or output, set LD_LIBRARY_PATH
-      // and classpath
-      if (hdfsHome != null) {
-        // Unset HADOOP_HOME/HADOOP_YARN_HOME to make sure host machine's envs
-        // won't pollute docker's env.
-        fw.append("export HADOOP_HOME=\n");
-        fw.append("export HADOOP_YARN_HOME=\n");
-        fw.append("export HADOOP_HDFS_HOME=" + hdfsHome + "\n");
-        fw.append("export HADOOP_COMMON_HOME=" + hdfsHome + "\n");
-      } else{
-        lackingEnvs = true;
-      }
-
-      // hadoop confs will be uploaded to HDFS and localized to container's
-      // local folder, so here set $HADOOP_CONF_DIR to $WORK_DIR.
-      fw.append("export HADOOP_CONF_DIR=$WORK_DIR\n");
-      if (javaHome != null) {
-        fw.append("export JAVA_HOME=" + javaHome + "\n");
-        fw.append("export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:"
-            + "$JAVA_HOME/lib/amd64/server\n");
-      } else {
-        lackingEnvs = true;
-      }
-      fw.append("export CLASSPATH=`$HADOOP_HDFS_HOME/bin/hadoop classpath --glob`\n");
-    }
-
-    if (lackingEnvs) {
-      LOG.error("When hdfs is being used to read/write models/data. Following"
-          + "envs are required: 1) DOCKER_HADOOP_HDFS_HOME=<HDFS_HOME inside"
-          + "docker container> 2) DOCKER_JAVA_HOME=<JAVA_HOME inside docker"
-          + "container>. You can use --env to pass these envars.");
-      throw new IOException("Failed to detect HDFS-related environments.");
-    }
-
-    // Trying to upload core-site.xml and hdfs-site.xml
-    Path stagingDir =
-        clientContext.getRemoteDirectoryManager().getJobStagingArea(
-            parameters.getName(), true);
-    File coreSite = findFileOnClassPath("core-site.xml");
-    File hdfsSite = findFileOnClassPath("hdfs-site.xml");
-    if (coreSite == null || hdfsSite == null) {
-      LOG.error("hdfs is being used, however we couldn't locate core-site.xml/"
-          + "hdfs-site.xml from classpath, please double check you classpath"
-          + "setting and make sure they're included.");
-      throw new IOException(
-          "Failed to locate core-site.xml / hdfs-site.xml from class path");
-    }
-    uploadToRemoteFileAndLocalizeToContainerWorkDir(stagingDir,
-        coreSite.getAbsolutePath(), "core-site.xml", comp);
-    uploadToRemoteFileAndLocalizeToContainerWorkDir(stagingDir,
-        hdfsSite.getAbsolutePath(), "hdfs-site.xml", comp);
-
-    // DEBUG
-    if (SubmarineLogs.isVerbose()) {
-      fw.append("echo \"CLASSPATH:$CLASSPATH\"\n");
-      fw.append("echo \"HADOOP_CONF_DIR:$HADOOP_CONF_DIR\"\n");
-      fw.append("echo \"HADOOP_TOKEN_FILE_LOCATION:$HADOOP_TOKEN_FILE_LOCATION\"\n");
-      fw.append("echo \"JAVA_HOME:$JAVA_HOME\"\n");
-      fw.append("echo \"LD_LIBRARY_PATH:$LD_LIBRARY_PATH\"\n");
-      fw.append("echo \"HADOOP_HDFS_HOME:$HADOOP_HDFS_HOME\"\n");
-    }
-  }
-
-  private void addCommonEnvironments(Component component, TaskType taskType) {
-    Map<String, String> envs = component.getConfiguration().getEnv();
-    envs.put(Envs.TASK_INDEX_ENV, ServiceApiConstants.COMPONENT_ID);
-    envs.put(Envs.TASK_TYPE_ENV, taskType.name());
-  }
-
-  @VisibleForTesting
-  protected String getUserName() {
-    return System.getProperty("user.name");
-  }
-
-  private String getDNSDomain() {
-    return clientContext.getYarnConfig().get("hadoop.registry.dns.domain-name");
-  }
-
-  /*
-   * Generate a command launch script on local disk, returns patch to the script
-   */
-  private String generateCommandLaunchScript(RunJobParameters parameters,
-      TaskType taskType, Component comp) throws IOException {
-    File file = File.createTempFile(taskType.name() + "-launch-script", ".sh");
-    Writer w = new OutputStreamWriter(new FileOutputStream(file),
-        StandardCharsets.UTF_8);
-    PrintWriter pw = new PrintWriter(w);
-
-    try {
-      pw.append("#!/bin/bash\n");
-
-      addHdfsClassPathIfNeeded(parameters, pw, comp);
-
-      if (taskType.equals(TaskType.TENSORBOARD)) {
-        String tbCommand =
-            "export LC_ALL=C && tensorboard --logdir=" + parameters
-                .getCheckpointPath();
-        pw.append(tbCommand + "\n");
-        LOG.info("Tensorboard command=" + tbCommand);
-      } else{
-        // When distributed training is required
-        if (parameters.isDistributed()) {
-          // Generated TF_CONFIG
-          String tfConfigEnv = YarnServiceUtils.getTFConfigEnv(
-              taskType.getComponentName(), parameters.getNumWorkers(),
-              parameters.getNumPS(), parameters.getName(), getUserName(),
-              getDNSDomain());
-          pw.append("export TF_CONFIG=\"" + tfConfigEnv + "\"\n");
-        }
-
-        // Print launch command
-        if (taskType.equals(TaskType.WORKER) || taskType.equals(
-            TaskType.PRIMARY_WORKER)) {
-          pw.append(parameters.getWorkerLaunchCmd() + '\n');
-
-          if (SubmarineLogs.isVerbose()) {
-            LOG.info(
-                "Worker command =[" + parameters.getWorkerLaunchCmd() + "]");
-          }
-        } else if (taskType.equals(TaskType.PS)) {
-          pw.append(parameters.getPSLaunchCmd() + '\n');
-
-          if (SubmarineLogs.isVerbose()) {
-            LOG.info("PS command =[" + parameters.getPSLaunchCmd() + "]");
-          }
-        }
-      }
-    } finally {
-      pw.close();
-    }
-    return file.getAbsolutePath();
-  }
-
-  private String getScriptFileName(TaskType taskType) {
-    return "run-" + taskType.name() + ".sh";
-  }
-
-  private File findFileOnClassPath(final String fileName) {
-    final String classpath = System.getProperty("java.class.path");
-    final String pathSeparator = System.getProperty("path.separator");
-    final StringTokenizer tokenizer = new StringTokenizer(classpath,
-        pathSeparator);
-
-    while (tokenizer.hasMoreTokens()) {
-      final String pathElement = tokenizer.nextToken();
-      final File directoryOrJar = new File(pathElement);
-      final File absoluteDirectoryOrJar = directoryOrJar.getAbsoluteFile();
-      if (absoluteDirectoryOrJar.isFile()) {
-        final File target = new File(absoluteDirectoryOrJar.getParent(),
-            fileName);
-        if (target.exists()) {
-          return target;
-        }
-      } else{
-        final File target = new File(directoryOrJar, fileName);
-        if (target.exists()) {
-          return target;
-        }
-      }
-    }
-
-    return null;
-  }
-
-  private void uploadToRemoteFileAndLocalizeToContainerWorkDir(Path stagingDir,
-      String fileToUpload, String destFilename, Component comp)
-      throws IOException {
-    Path uploadedFilePath = uploadToRemoteFile(stagingDir, fileToUpload);
-    locateRemoteFileToContainerWorkDir(destFilename, comp, uploadedFilePath);
-  }
-
-  private void locateRemoteFileToContainerWorkDir(String destFilename,
-      Component comp, Path uploadedFilePath)
-      throws IOException {
-    FileSystem fs = FileSystem.get(clientContext.getYarnConfig());
-
-    FileStatus fileStatus = fs.getFileStatus(uploadedFilePath);
-    LOG.info("Uploaded file path = " + fileStatus.getPath());
-
-    // Set it to component's files list
-    comp.getConfiguration().getFiles().add(new ConfigFile().srcFile(
-        fileStatus.getPath().toUri().toString()).destFile(destFilename)
-        .type(ConfigFile.TypeEnum.STATIC));
-  }
-
-  private Path uploadToRemoteFile(Path stagingDir, String fileToUpload) throws
-      IOException {
-    FileSystem fs = clientContext.getRemoteDirectoryManager()
-        .getDefaultFileSystem();
-
-    // Upload to remote FS under staging area
-    File localFile = new File(fileToUpload);
-    if (!localFile.exists()) {
-      throw new FileNotFoundException(
-          "Trying to upload file=" + localFile.getAbsolutePath()
-              + " to remote, but couldn't find local file.");
-    }
-    String filename = new File(fileToUpload).getName();
-
-    Path uploadedFilePath = new Path(stagingDir, filename);
-    if (!uploadedFiles.contains(uploadedFilePath)) {
-      if (SubmarineLogs.isVerbose()) {
-        LOG.info("Copying local file=" + fileToUpload + " to remote="
-            + uploadedFilePath);
-      }
-      fs.copyFromLocalFile(new Path(fileToUpload), uploadedFilePath);
-      uploadedFiles.add(uploadedFilePath);
-    }
-    return uploadedFilePath;
-  }
-
-  private void setPermission(Path destPath, FsPermission permission) throws
-      IOException {
-    FileSystem fs = FileSystem.get(clientContext.getYarnConfig());
-    fs.setPermission(destPath, new FsPermission(permission));
-  }
-
-  private void handleLaunchCommand(RunJobParameters parameters,
-      TaskType taskType, Component component) throws IOException {
-    // Get staging area directory
-    Path stagingDir =
-        clientContext.getRemoteDirectoryManager().getJobStagingArea(
-            parameters.getName(), true);
-
-    // Generate script file in the local disk
-    String localScriptFile = generateCommandLaunchScript(parameters, taskType,
-        component);
-    String destScriptFileName = getScriptFileName(taskType);
-    uploadToRemoteFileAndLocalizeToContainerWorkDir(stagingDir, localScriptFile,
-        destScriptFileName, component);
-
-    component.setLaunchCommand("./" + destScriptFileName);
-    componentToLocalLaunchScriptPath.put(taskType.getComponentName(),
-        localScriptFile);
-  }
-
-  private String getLastNameFromPath(String srcFileStr) {
-    return new Path(srcFileStr).getName();
-  }
-
-  /**
-   * May download a remote uri(file/dir) and zip.
-   * Skip download if local dir
-   * Remote uri can be a local dir(won't download)
-   * or remote HDFS dir, s3 dir/file .etc
-   * */
-  private String mayDownloadAndZipIt(String remoteDir, String zipFileName,
-      boolean doZip)
-      throws IOException {
-    RemoteDirectoryManager rdm = clientContext.getRemoteDirectoryManager();
-    //Append original modification time and size to zip file name
-    String suffix;
-    String srcDir = remoteDir;
-    String zipDirPath =
-        System.getProperty("java.io.tmpdir") + "/" + zipFileName;
-    boolean needDeleteTempDir = false;
-    if (rdm.isRemote(remoteDir)) {
-      //Append original modification time and size to zip file name
-      FileStatus status = rdm.getRemoteFileStatus(new Path(remoteDir));
-      suffix = "_" + status.getModificationTime()
-          + "-" + rdm.getRemoteFileSize(remoteDir);
-      // Download them to temp dir
-      boolean downloaded = rdm.copyRemoteToLocal(remoteDir, zipDirPath);
-      if (!downloaded) {
-        throw new IOException("Failed to download files from "
-            + remoteDir);
-      }
-      LOG.info("Downloaded remote: {} to local: {}", remoteDir, zipDirPath);
-      srcDir = zipDirPath;
-      needDeleteTempDir = true;
-    } else {
-      File localDir = new File(remoteDir);
-      suffix = "_" + localDir.lastModified()
-          + "-" + localDir.length();
-    }
-    if (!doZip) {
-      return srcDir;
-    }
-    // zip a local dir
-    String zipFileUri = zipDir(srcDir, zipDirPath + suffix + ".zip");
-    // delete downloaded temp dir
-    if (needDeleteTempDir) {
-      deleteFiles(srcDir);
-    }
-    return zipFileUri;
-  }
-
-  @VisibleForTesting
-  public String zipDir(String srcDir, String dstFile) throws IOException {
-    FileOutputStream fos = new FileOutputStream(dstFile);
-    ZipOutputStream zos = new ZipOutputStream(fos);
-    File srcFile = new File(srcDir);
-    LOG.info("Compressing {}", srcDir);
-    addDirToZip(zos, srcFile, srcFile);
-    // close the ZipOutputStream
-    zos.close();
-    LOG.info("Compressed {} to {}", srcDir, dstFile);
-    return dstFile;
-  }
-
-  private void deleteFiles(String localUri) {
-    boolean success = FileUtil.fullyDelete(new File(localUri));
-    if (!success) {
-      LOG.warn("Fail to delete {}", localUri);
-    }
-    LOG.info("Deleted {}", localUri);
-  }
-
-  private void addDirToZip(ZipOutputStream zos, File srcFile, File base)
-      throws IOException {
-    File[] files = srcFile.listFiles();
-    if (null == files) {
-      return;
-    }
-    FileInputStream fis = null;
-    for (int i = 0; i < files.length; i++) {
-      // if it's directory, add recursively
-      if (files[i].isDirectory()) {
-        addDirToZip(zos, files[i], base);
-        continue;
-      }
-      byte[] buffer = new byte[1024];
-      try {
-        fis = new FileInputStream(files[i]);
-        String name = base.toURI().relativize(files[i].toURI()).getPath();
-        LOG.info(" Zip adding: " + name);
-        zos.putNextEntry(new ZipEntry(name));
-        int length;
-        while ((length = fis.read(buffer)) > 0) {
-          zos.write(buffer, 0, length);
-        }
-        zos.flush();
-      } finally {
-        if (fis != null) {
-          fis.close();
-        }
-        zos.closeEntry();
-      }
-    }
-  }
-
-  private void addWorkerComponent(Service service,
-      RunJobParameters parameters, TaskType taskType) throws IOException {
-    Component workerComponent = new Component();
-    addCommonEnvironments(workerComponent, taskType);
-
-    workerComponent.setName(taskType.getComponentName());
-
-    if (taskType.equals(TaskType.PRIMARY_WORKER)) {
-      workerComponent.setNumberOfContainers(1L);
-      workerComponent.getConfiguration().setProperty(
-          CONTAINER_STATE_REPORT_AS_SERVICE_STATE, "true");
-    } else{
-      workerComponent.setNumberOfContainers(
-          (long) parameters.getNumWorkers() - 1);
-    }
-
-    if (parameters.getWorkerDockerImage() != null) {
-      workerComponent.setArtifact(
-          getDockerArtifact(parameters.getWorkerDockerImage()));
-    }
-
-    workerComponent.setResource(
-        getServiceResourceFromYarnResource(parameters.getWorkerResource()));
-    handleLaunchCommand(parameters, taskType, workerComponent);
-    workerComponent.setRestartPolicy(Component.RestartPolicyEnum.NEVER);
-    service.addComponent(workerComponent);
-  }
-
-  // Handle worker and primary_worker.
-  private void addWorkerComponents(Service service, RunJobParameters parameters)
-      throws IOException {
-    addWorkerComponent(service, parameters, TaskType.PRIMARY_WORKER);
-
-    if (parameters.getNumWorkers() > 1) {
-      addWorkerComponent(service, parameters, TaskType.WORKER);
-    }
-  }
-
-  private void appendToEnv(Service service, String key, String value,
-      String delim) {
-    Map<String, String> env = service.getConfiguration().getEnv();
-    if (!env.containsKey(key)) {
-      env.put(key, value);
-    } else {
-      if (!value.isEmpty()) {
-        String existingValue = env.get(key);
-        if (!existingValue.endsWith(delim)) {
-          env.put(key, existingValue + delim + value);
-        } else {
-          env.put(key, existingValue + value);
-        }
-      }
-    }
-  }
-
-  private void handleServiceEnvs(Service service, RunJobParameters parameters) {
-    if (parameters.getEnvars() != null) {
-      for (String envarPair : parameters.getEnvars()) {
-        String key, value;
-        if (envarPair.contains("=")) {
-          int idx = envarPair.indexOf('=');
-          key = envarPair.substring(0, idx);
-          value = envarPair.substring(idx + 1);
-        } else{
-          // No "=" found so use the whole key
-          key = envarPair;
-          value = "";
-        }
-        appendToEnv(service, key, value, ":");
-      }
-    }
-
-    // Append other configs like /etc/passwd, /etc/krb5.conf
-    appendToEnv(service, "YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS",
-        "/etc/passwd:/etc/passwd:ro", ",");
-
-    String authenication = clientContext.getYarnConfig().get(
-        HADOOP_SECURITY_AUTHENTICATION);
-    if (authenication != null && authenication.equals("kerberos")) {
-      appendToEnv(service, "YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS",
-          "/etc/krb5.conf:/etc/krb5.conf:ro", ",");
-    }
-  }
-
-  private Artifact getDockerArtifact(String dockerImageName) {
-    return new Artifact().type(Artifact.TypeEnum.DOCKER).id(dockerImageName);
-  }
-
-  private void handleQuicklinks(RunJobParameters runJobParameters)
-      throws IOException {
-    List<Quicklink> quicklinks = runJobParameters.getQuicklinks();
-    if (null != quicklinks && !quicklinks.isEmpty()) {
-      for (Quicklink ql : quicklinks) {
-        // Make sure it is a valid instance name
-        String instanceName = ql.getComponentInstanceName();
-        boolean found = false;
-
-        for (Component comp : serviceSpec.getComponents()) {
-          for (int i = 0; i < comp.getNumberOfContainers(); i++) {
-            String possibleInstanceName = comp.getName() + "-" + i;
-            if (possibleInstanceName.equals(instanceName)) {
-              found = true;
-              break;
-            }
-          }
-        }
-
-        if (!found) {
|
|
||||||
throw new IOException(
|
|
||||||
"Couldn't find a component instance = " + instanceName
|
|
||||||
+ " while adding quicklink");
|
|
||||||
}
|
|
||||||
|
|
||||||
String link = ql.getProtocol() + YarnServiceUtils.getDNSName(
|
|
||||||
serviceSpec.getName(), instanceName, getUserName(), getDNSDomain(),
|
|
||||||
ql.getPort());
|
|
||||||
YarnServiceUtils.addQuicklink(serviceSpec, ql.getLabel(), link);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private Service createServiceByParameters(RunJobParameters parameters)
|
|
||||||
throws IOException {
|
|
||||||
componentToLocalLaunchScriptPath.clear();
|
|
||||||
serviceSpec = new Service();
|
|
||||||
serviceSpec.setName(parameters.getName());
|
|
||||||
serviceSpec.setVersion(String.valueOf(System.currentTimeMillis()));
|
|
||||||
serviceSpec.setArtifact(getDockerArtifact(parameters.getDockerImageName()));
|
|
||||||
handleKerberosPrincipal(parameters);
|
|
||||||
|
|
||||||
handleServiceEnvs(serviceSpec, parameters);
|
|
||||||
|
|
||||||
handleLocalizations(parameters);
|
|
||||||
|
|
||||||
if (parameters.getNumWorkers() > 0) {
|
|
||||||
addWorkerComponents(serviceSpec, parameters);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (parameters.getNumPS() > 0) {
|
|
||||||
Component psComponent = new Component();
|
|
||||||
psComponent.setName(TaskType.PS.getComponentName());
|
|
||||||
addCommonEnvironments(psComponent, TaskType.PS);
|
|
||||||
psComponent.setNumberOfContainers((long) parameters.getNumPS());
|
|
||||||
psComponent.setRestartPolicy(Component.RestartPolicyEnum.NEVER);
|
|
||||||
psComponent.setResource(
|
|
||||||
getServiceResourceFromYarnResource(parameters.getPsResource()));
|
|
||||||
|
|
||||||
// Override global docker image if needed.
|
|
||||||
if (parameters.getPsDockerImage() != null) {
|
|
||||||
psComponent.setArtifact(
|
|
||||||
getDockerArtifact(parameters.getPsDockerImage()));
|
|
||||||
}
|
|
||||||
handleLaunchCommand(parameters, TaskType.PS, psComponent);
|
|
||||||
serviceSpec.addComponent(psComponent);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (parameters.isTensorboardEnabled()) {
|
|
||||||
Component tbComponent = new Component();
|
|
||||||
tbComponent.setName(TaskType.TENSORBOARD.getComponentName());
|
|
||||||
addCommonEnvironments(tbComponent, TaskType.TENSORBOARD);
|
|
||||||
tbComponent.setNumberOfContainers(1L);
|
|
||||||
tbComponent.setRestartPolicy(Component.RestartPolicyEnum.NEVER);
|
|
||||||
tbComponent.setResource(getServiceResourceFromYarnResource(
|
|
||||||
parameters.getTensorboardResource()));
|
|
||||||
if (parameters.getTensorboardDockerImage() != null) {
|
|
||||||
tbComponent.setArtifact(
|
|
||||||
getDockerArtifact(parameters.getTensorboardDockerImage()));
|
|
||||||
}
|
|
||||||
|
|
||||||
handleLaunchCommand(parameters, TaskType.TENSORBOARD, tbComponent);
|
|
||||||
|
|
||||||
// Add tensorboard to quicklink
|
|
||||||
String tensorboardLink = "http://" + YarnServiceUtils.getDNSName(
|
|
||||||
parameters.getName(),
|
|
||||||
TaskType.TENSORBOARD.getComponentName() + "-" + 0, getUserName(),
|
|
||||||
getDNSDomain(), 6006);
|
|
||||||
LOG.info("Link to tensorboard:" + tensorboardLink);
|
|
||||||
serviceSpec.addComponent(tbComponent);
|
|
||||||
|
|
||||||
YarnServiceUtils.addQuicklink(serviceSpec, TENSORBOARD_QUICKLINK_LABEL,
|
|
||||||
tensorboardLink);
|
|
||||||
}
|
|
||||||
|
|
||||||
// After all components added, handle quicklinks
|
|
||||||
handleQuicklinks(parameters);
|
|
||||||
|
|
||||||
return serviceSpec;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Localize dependencies for all containers.
|
|
||||||
* If remoteUri is a local directory,
|
|
||||||
* we'll zip it, upload to HDFS staging dir HDFS.
|
|
||||||
* If remoteUri is directory, we'll download it, zip it and upload
|
|
||||||
* to HDFS.
|
|
||||||
* If localFilePath is ".", we'll use remoteUri's file/dir name
|
|
||||||
* */
|
|
||||||
private void handleLocalizations(RunJobParameters parameters)
|
|
||||||
throws IOException {
|
|
||||||
// Handle localizations
|
|
||||||
Path stagingDir =
|
|
||||||
clientContext.getRemoteDirectoryManager().getJobStagingArea(
|
|
||||||
parameters.getName(), true);
|
|
||||||
List<Localization> locs = parameters.getLocalizations();
|
|
||||||
String remoteUri;
|
|
||||||
String containerLocalPath;
|
|
||||||
RemoteDirectoryManager rdm = clientContext.getRemoteDirectoryManager();
|
|
||||||
|
|
||||||
// Check to fail fast
|
|
||||||
for (Localization loc : locs) {
|
|
||||||
remoteUri = loc.getRemoteUri();
|
|
||||||
Path resourceToLocalize = new Path(remoteUri);
|
|
||||||
// Check if remoteUri exists
|
|
||||||
if (rdm.isRemote(remoteUri)) {
|
|
||||||
// check if exists
|
|
||||||
if (!rdm.existsRemoteFile(resourceToLocalize)) {
|
|
||||||
throw new FileNotFoundException(
|
|
||||||
"File " + remoteUri + " doesn't exists.");
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Check if exists
|
|
||||||
File localFile = new File(remoteUri);
|
|
||||||
if (!localFile.exists()) {
|
|
||||||
throw new FileNotFoundException(
|
|
||||||
"File " + remoteUri + " doesn't exists.");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// check remote file size
|
|
||||||
validFileSize(remoteUri);
|
|
||||||
}
|
|
||||||
// Start download remote if needed and upload to HDFS
|
|
||||||
for (Localization loc : locs) {
|
|
||||||
remoteUri = loc.getRemoteUri();
|
|
||||||
containerLocalPath = loc.getLocalPath();
|
|
||||||
String srcFileStr = remoteUri;
|
|
||||||
ConfigFile.TypeEnum destFileType = ConfigFile.TypeEnum.STATIC;
|
|
||||||
Path resourceToLocalize = new Path(remoteUri);
|
|
||||||
boolean needUploadToHDFS = true;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Special handling for remoteUri directory.
|
|
||||||
* */
|
|
||||||
boolean needDeleteTempFile = false;
|
|
||||||
if (rdm.isDir(remoteUri)) {
|
|
||||||
destFileType = ConfigFile.TypeEnum.ARCHIVE;
|
|
||||||
srcFileStr = mayDownloadAndZipIt(
|
|
||||||
remoteUri, getLastNameFromPath(srcFileStr), true);
|
|
||||||
} else if (rdm.isRemote(remoteUri)) {
|
|
||||||
if (!needHdfs(remoteUri)) {
|
|
||||||
// Non HDFS remote uri. Non directory, no need to zip
|
|
||||||
srcFileStr = mayDownloadAndZipIt(
|
|
||||||
remoteUri, getLastNameFromPath(srcFileStr), false);
|
|
||||||
needDeleteTempFile = true;
|
|
||||||
} else {
|
|
||||||
// HDFS file, no need to upload
|
|
||||||
needUploadToHDFS = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Upload file to HDFS
|
|
||||||
if (needUploadToHDFS) {
|
|
||||||
resourceToLocalize = uploadToRemoteFile(stagingDir, srcFileStr);
|
|
||||||
}
|
|
||||||
if (needDeleteTempFile) {
|
|
||||||
deleteFiles(srcFileStr);
|
|
||||||
}
|
|
||||||
// Remove .zip from zipped dir name
|
|
||||||
if (destFileType == ConfigFile.TypeEnum.ARCHIVE
|
|
||||||
&& srcFileStr.endsWith(".zip")) {
|
|
||||||
// Delete local zip file
|
|
||||||
deleteFiles(srcFileStr);
|
|
||||||
int suffixIndex = srcFileStr.lastIndexOf('_');
|
|
||||||
srcFileStr = srcFileStr.substring(0, suffixIndex);
|
|
||||||
}
|
|
||||||
// If provided, use the name of local uri
|
|
||||||
if (!containerLocalPath.equals(".")
|
|
||||||
&& !containerLocalPath.equals("./")) {
|
|
||||||
// Change the YARN localized file name to what'll used in container
|
|
||||||
srcFileStr = getLastNameFromPath(containerLocalPath);
|
|
||||||
}
|
|
||||||
String localizedName = getLastNameFromPath(srcFileStr);
|
|
||||||
LOG.info("The file/dir to be localized is {}",
|
|
||||||
resourceToLocalize.toString());
|
|
||||||
LOG.info("Its localized file name will be {}", localizedName);
|
|
||||||
serviceSpec.getConfiguration().getFiles().add(new ConfigFile().srcFile(
|
|
||||||
resourceToLocalize.toUri().toString()).destFile(localizedName)
|
|
||||||
.type(destFileType));
|
|
||||||
// set mounts
|
|
||||||
// if mount path is absolute, just use it.
|
|
||||||
// if relative, no need to mount explicitly
|
|
||||||
if (containerLocalPath.startsWith("/")) {
|
|
||||||
String mountStr = getLastNameFromPath(srcFileStr) + ":"
|
|
||||||
+ containerLocalPath + ":" + loc.getMountPermission();
|
|
||||||
LOG.info("Add bind-mount string {}", mountStr);
|
|
||||||
appendToEnv(serviceSpec, "YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS",
|
|
||||||
mountStr, ",");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void validFileSize(String uri) throws IOException {
|
|
||||||
RemoteDirectoryManager rdm = clientContext.getRemoteDirectoryManager();
|
|
||||||
long actualSizeByte;
|
|
||||||
String locationType = "Local";
|
|
||||||
if (rdm.isRemote(uri)) {
|
|
||||||
actualSizeByte = clientContext.getRemoteDirectoryManager()
|
|
||||||
.getRemoteFileSize(uri);
|
|
||||||
locationType = "Remote";
|
|
||||||
} else {
|
|
||||||
actualSizeByte = FileUtil.getDU(new File(uri));
|
|
||||||
}
|
|
||||||
long maxFileSizeMB = clientContext.getSubmarineConfig()
|
|
||||||
.getLong(SubmarineConfiguration.LOCALIZATION_MAX_ALLOWED_FILE_SIZE_MB,
|
|
||||||
SubmarineConfiguration.DEFAULT_MAX_ALLOWED_REMOTE_URI_SIZE_MB);
|
|
||||||
LOG.info("{} fie/dir: {}, size(Byte):{},"
|
|
||||||
+ " Allowed max file/dir size: {}",
|
|
||||||
locationType, uri, actualSizeByte, maxFileSizeMB * 1024 * 1024);
|
|
||||||
|
|
||||||
if (actualSizeByte > maxFileSizeMB * 1024 * 1024) {
|
|
||||||
throw new IOException(uri + " size(Byte): "
|
|
||||||
+ actualSizeByte + " exceeds configured max size:"
|
|
||||||
+ maxFileSizeMB * 1024 * 1024);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private String generateServiceSpecFile(Service service) throws IOException {
|
|
||||||
File serviceSpecFile = File.createTempFile(service.getName(), ".json");
|
|
||||||
String buffer = jsonSerDeser.toJson(service);
|
|
||||||
Writer w = new OutputStreamWriter(new FileOutputStream(serviceSpecFile),
|
|
||||||
"UTF-8");
|
|
||||||
PrintWriter pw = new PrintWriter(w);
|
|
||||||
try {
|
|
||||||
pw.append(buffer);
|
|
||||||
} finally {
|
|
||||||
pw.close();
|
|
||||||
}
|
|
||||||
return serviceSpecFile.getAbsolutePath();
|
|
||||||
}
|
|
||||||
|
|
||||||
private void handleKerberosPrincipal(RunJobParameters parameters) throws
|
|
||||||
IOException {
|
|
||||||
if(StringUtils.isNotBlank(parameters.getKeytab()) && StringUtils
|
|
||||||
.isNotBlank(parameters.getPrincipal())) {
|
|
||||||
String keytab = parameters.getKeytab();
|
|
||||||
String principal = parameters.getPrincipal();
|
|
||||||
if(parameters.isDistributeKeytab()) {
|
|
||||||
Path stagingDir =
|
|
||||||
clientContext.getRemoteDirectoryManager().getJobStagingArea(
|
|
||||||
parameters.getName(), true);
|
|
||||||
Path remoteKeytabPath = uploadToRemoteFile(stagingDir, keytab);
|
|
||||||
//only the owner has read access
|
|
||||||
setPermission(remoteKeytabPath,
|
|
||||||
FsPermission.createImmutable((short)Integer.parseInt("400", 8)));
|
|
||||||
serviceSpec.setKerberosPrincipal(new KerberosPrincipal().keytab(
|
|
||||||
remoteKeytabPath.toString()).principalName(principal));
|
|
||||||
} else {
|
|
||||||
if(!keytab.startsWith("file")) {
|
|
||||||
keytab = "file://" + keytab;
|
|
||||||
}
|
|
||||||
serviceSpec.setKerberosPrincipal(new KerberosPrincipal().keytab(
|
|
||||||
keytab).principalName(principal));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
   /**
    * {@inheritDoc}
    */
   @Override
   public ApplicationId submitJob(RunJobParameters parameters)
       throws IOException, YarnException {
-    createServiceByParameters(parameters);
-    String serviceSpecFile = generateServiceSpecFile(serviceSpec);
+    FileSystemOperations fsOperations = new FileSystemOperations(clientContext);
+    HadoopEnvironmentSetup hadoopEnvSetup =
+        new HadoopEnvironmentSetup(clientContext, fsOperations);

-    AppAdminClient appAdminClient = YarnServiceUtils.createServiceClient(
-        clientContext.getYarnConfig());
+    Service serviceSpec = createTensorFlowServiceSpec(parameters,
+        fsOperations, hadoopEnvSetup);
+    String serviceSpecFile = ServiceSpecFileGenerator.generateJson(serviceSpec);
+
+    AppAdminClient appAdminClient =
+        YarnServiceUtils.createServiceClient(clientContext.getYarnConfig());
     int code = appAdminClient.actionLaunch(serviceSpecFile,
         serviceSpec.getName(), null, null);
-    if(code != EXIT_SUCCESS) {
-      throw new YarnException("Fail to launch application with exit code:" +
-          code);
+    if (code != EXIT_SUCCESS) {
+      throw new YarnException(
+          "Fail to launch application with exit code:" + code);
     }

     String appStatus=appAdminClient.getStatusString(serviceSpec.getName());
@@ -896,13 +97,24 @@ public class YarnServiceJobSubmitter implements JobSubmitter {
     return appid;
   }

-  @VisibleForTesting
-  public Service getServiceSpec() {
-    return serviceSpec;
+  private Service createTensorFlowServiceSpec(RunJobParameters parameters,
+      FileSystemOperations fsOperations, HadoopEnvironmentSetup hadoopEnvSetup)
+      throws IOException {
+    LaunchCommandFactory launchCommandFactory =
+        new LaunchCommandFactory(hadoopEnvSetup, parameters,
+            clientContext.getYarnConfig());
+    Localizer localizer = new Localizer(fsOperations,
+        clientContext.getRemoteDirectoryManager(), parameters);
+    TensorFlowServiceSpec tensorFlowServiceSpec = new TensorFlowServiceSpec(
+        parameters, this.clientContext, fsOperations, launchCommandFactory,
+        localizer);
+
+    serviceWrapper = tensorFlowServiceSpec.create();
+    return serviceWrapper.getService();
   }

   @VisibleForTesting
-  public Map<String, String> getComponentToLocalLaunchScriptPath() {
-    return componentToLocalLaunchScriptPath;
+  public ServiceWrapper getServiceWrapper() {
+    return serviceWrapper;
   }
 }
@@ -17,33 +17,27 @@ package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;
 import com.google.common.annotations.VisibleForTesting;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.yarn.client.api.AppAdminClient;
-import org.apache.hadoop.yarn.service.api.records.Service;
-import org.apache.hadoop.yarn.submarine.common.Envs;
-import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.HashMap;
-import java.util.Map;

 import static org.apache.hadoop.yarn.client.api.AppAdminClient.DEFAULT_TYPE;

-public class YarnServiceUtils {
-  private static final Logger LOG =
-      LoggerFactory.getLogger(YarnServiceUtils.class);
+/**
+ * This class contains some static helper methods to query DNS data
+ * based on the provided parameters.
+ */
+public final class YarnServiceUtils {
+  private YarnServiceUtils() {
+  }

   // This will be true only in UT.
   private static AppAdminClient stubServiceClient = null;

-  public static AppAdminClient createServiceClient(
+  static AppAdminClient createServiceClient(
       Configuration yarnConfiguration) {
     if (stubServiceClient != null) {
       return stubServiceClient;
     }

-    AppAdminClient serviceClient = AppAdminClient.createAppAdminClient(
-        DEFAULT_TYPE, yarnConfiguration);
-    return serviceClient;
+    return AppAdminClient.createAppAdminClient(DEFAULT_TYPE, yarnConfiguration);
   }

   @VisibleForTesting
@@ -57,77 +51,9 @@ YarnServiceUtils {
         domain, port);
   }

-  private static String getDNSNameCommonSuffix(String serviceName,
+  public static String getDNSNameCommonSuffix(String serviceName,
       String userName, String domain, int port) {
     return "." + serviceName + "." + userName + "." + domain + ":" + port;
   }
-
-  public static String getTFConfigEnv(String curCommponentName, int nWorkers,
-      int nPs, String serviceName, String userName, String domain) {
-    String commonEndpointSuffix = getDNSNameCommonSuffix(serviceName, userName,
-        domain, 8000);
-
-    String json = "{\\\"cluster\\\":{";
-
-    String master = getComponentArrayJson("master", 1, commonEndpointSuffix)
-        + ",";
-    String worker = getComponentArrayJson("worker", nWorkers - 1,
-        commonEndpointSuffix) + ",";
-    String ps = getComponentArrayJson("ps", nPs, commonEndpointSuffix) + "},";
-
-    StringBuilder sb = new StringBuilder();
-    sb.append("\\\"task\\\":{");
-    sb.append(" \\\"type\\\":\\\"");
-    sb.append(curCommponentName);
-    sb.append("\\\",");
-    sb.append(" \\\"index\\\":");
-    sb.append('$');
-    sb.append(Envs.TASK_INDEX_ENV + "},");
-    String task = sb.toString();
-    String environment = "\\\"environment\\\":\\\"cloud\\\"}";
-
-    sb = new StringBuilder();
-    sb.append(json);
-    sb.append(master);
-    sb.append(worker);
-    sb.append(ps);
-    sb.append(task);
-    sb.append(environment);
-    return sb.toString();
-  }
-
-  public static void addQuicklink(Service serviceSpec, String label,
-      String link) {
-    Map<String, String> quicklinks = serviceSpec.getQuicklinks();
-    if (null == quicklinks) {
-      quicklinks = new HashMap<>();
-      serviceSpec.setQuicklinks(quicklinks);
-    }
-
-    if (SubmarineLogs.isVerbose()) {
-      LOG.info("Added quicklink, " + label + "=" + link);
-    }
-
-    quicklinks.put(label, link);
-  }
-
-  private static String getComponentArrayJson(String componentName, int count,
-      String endpointSuffix) {
-    String component = "\\\"" + componentName + "\\\":";
-    StringBuilder array = new StringBuilder();
-    array.append("[");
-    for (int i = 0; i < count; i++) {
-      array.append("\\\"");
-      array.append(componentName);
-      array.append("-");
-      array.append(i);
-      array.append(endpointSuffix);
-      array.append("\\\"");
-      if (i != count - 1) {
-        array.append(",");
-      }
-    }
-    array.append("]");
-    return component + array.toString();
-  }
 }
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command;
+
+import org.apache.hadoop.yarn.service.api.records.Component;
+import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
+import org.apache.hadoop.yarn.submarine.common.api.TaskType;
+import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup;
+
+import java.io.IOException;
+import java.util.Objects;
+
+/**
+ * Abstract base class for launch command implementations for Services.
+ * Currently we have launch command implementations
+ * for TensorFlow PS, worker and Tensorboard instances.
+ */
+public abstract class AbstractLaunchCommand {
+  private final LaunchScriptBuilder builder;
+
+  public AbstractLaunchCommand(HadoopEnvironmentSetup hadoopEnvSetup,
+      TaskType taskType, Component component, RunJobParameters parameters)
+      throws IOException {
+    Objects.requireNonNull(taskType, "TaskType must not be null!");
+    this.builder = new LaunchScriptBuilder(taskType.name(), hadoopEnvSetup,
+        parameters, component);
+  }
+
+  protected LaunchScriptBuilder getBuilder() {
+    return builder;
+  }
+
+  /**
+   * Subclasses need to define this method and return a valid launch script.
+   * Implementors can utilize the {@link LaunchScriptBuilder} using
+   * the getBuilder method of this class.
+   * @return The contents of a script.
+   * @throws IOException If any IO issue happens.
+   */
+  public abstract String generateLaunchScript() throws IOException;
+
+  /**
+   * Subclasses need to provide a service-specific launch command
+   * of the service.
+   * Please note that this method should only return the launch command
+   * but not the whole script.
+   * @return The launch command
+   */
+  public abstract String createLaunchCommand();
+}
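
To make the subclassing contract concrete, here is a minimal, hypothetical subclass (not part of this commit; the class name and the echoed command are invented) showing how an implementation plugs its command into the shared builder:

// Hypothetical example only.
import org.apache.hadoop.yarn.service.api.records.Component;
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup;
import java.io.IOException;

public class EchoLaunchCommand extends AbstractLaunchCommand {
  public EchoLaunchCommand(HadoopEnvironmentSetup hadoopEnvSetup,
      TaskType taskType, Component component, RunJobParameters parameters)
      throws IOException {
    super(hadoopEnvSetup, taskType, component, parameters);
  }

  @Override
  public String generateLaunchScript() throws IOException {
    // The builder writes the bash header and the HDFS classpath setup,
    // then appends the command supplied here.
    return getBuilder()
        .withLaunchCommand(createLaunchCommand())
        .build();
  }

  @Override
  public String createLaunchCommand() {
    return "echo 'launched'\n";
  }
}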
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.yarn.service.api.records.Component;
+import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
+import org.apache.hadoop.yarn.submarine.common.api.TaskType;
+import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup;
+import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command.TensorBoardLaunchCommand;
+import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command.TensorFlowPsLaunchCommand;
+import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command.TensorFlowWorkerLaunchCommand;
+
+import java.io.IOException;
+import java.util.Objects;
+
+/**
+ * Simple factory to create instances of {@link AbstractLaunchCommand}
+ * based on the {@link TaskType}.
+ * All dependencies are passed to this factory that could be required
+ * by any implementor of {@link AbstractLaunchCommand}.
+ */
+public class LaunchCommandFactory {
+  private final HadoopEnvironmentSetup hadoopEnvSetup;
+  private final RunJobParameters parameters;
+  private final Configuration yarnConfig;
+
+  public LaunchCommandFactory(HadoopEnvironmentSetup hadoopEnvSetup,
+      RunJobParameters parameters, Configuration yarnConfig) {
+    this.hadoopEnvSetup = hadoopEnvSetup;
+    this.parameters = parameters;
+    this.yarnConfig = yarnConfig;
+  }
+
+  public AbstractLaunchCommand createLaunchCommand(TaskType taskType,
+      Component component) throws IOException {
+    Objects.requireNonNull(taskType, "TaskType must not be null!");
+
+    if (taskType == TaskType.WORKER || taskType == TaskType.PRIMARY_WORKER) {
+      return new TensorFlowWorkerLaunchCommand(hadoopEnvSetup, taskType,
+          component, parameters, yarnConfig);
+
+    } else if (taskType == TaskType.PS) {
+      return new TensorFlowPsLaunchCommand(hadoopEnvSetup, taskType, component,
+          parameters, yarnConfig);
+
+    } else if (taskType == TaskType.TENSORBOARD) {
+      return new TensorBoardLaunchCommand(hadoopEnvSetup, taskType, component,
+          parameters);
+    }
+    throw new IllegalStateException("Unknown task type: " + taskType);
+  }
+}
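
A sketch of the intended call pattern (illustrative only; hadoopEnvSetup, parameters and yarnConfig are assumed to be in scope, as they are in YarnServiceJobSubmitter):

// Illustrative wiring, not part of the commit.
LaunchCommandFactory factory =
    new LaunchCommandFactory(hadoopEnvSetup, parameters, yarnConfig);
Component workerComponent = new Component();
AbstractLaunchCommand launchCommand =
    factory.createLaunchCommand(TaskType.PRIMARY_WORKER, workerComponent);
// Path of the generated local script, later localized into the container.
String localScriptPath = launchCommand.generateLaunchScript();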
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command;
+
+import org.apache.hadoop.yarn.service.api.records.Component;
+import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
+import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * This class is a builder to conveniently create launch scripts.
+ * All dependencies are provided with the constructor except
+ * the launch command.
+ */
+public class LaunchScriptBuilder {
+  private static final Logger LOG = LoggerFactory.getLogger(
+      LaunchScriptBuilder.class);
+
+  private final File file;
+  private final HadoopEnvironmentSetup hadoopEnvSetup;
+  private final RunJobParameters parameters;
+  private final Component component;
+  private final OutputStreamWriter writer;
+  private final StringBuilder scriptBuffer;
+  private String launchCommand;
+
+  LaunchScriptBuilder(String namePrefix,
+      HadoopEnvironmentSetup hadoopEnvSetup, RunJobParameters parameters,
+      Component component) throws IOException {
+    this.file = File.createTempFile(namePrefix + "-launch-script", ".sh");
+    this.hadoopEnvSetup = hadoopEnvSetup;
+    this.parameters = parameters;
+    this.component = component;
+    this.writer = new OutputStreamWriter(new FileOutputStream(file), UTF_8);
+    this.scriptBuffer = new StringBuilder();
+  }
+
+  public void append(String s) {
+    scriptBuffer.append(s);
+  }
+
+  public LaunchScriptBuilder withLaunchCommand(String command) {
+    this.launchCommand = command;
+    return this;
+  }
+
+  public String build() throws IOException {
+    if (launchCommand != null) {
+      append(launchCommand);
+    } else {
+      LOG.warn("LaunchScript object was null!");
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("LaunchScript's Builder object: {}", this);
+      }
+    }
+
+    try (PrintWriter pw = new PrintWriter(writer)) {
+      writeBashHeader(pw);
+      hadoopEnvSetup.addHdfsClassPath(parameters, pw, component);
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Appending command to launch script: {}", scriptBuffer);
+      }
+      pw.append(scriptBuffer);
+    }
+    return file.getAbsolutePath();
+  }
+
+  @Override
+  public String toString() {
+    return "LaunchScriptBuilder{" +
+        "file=" + file +
+        ", hadoopEnvSetup=" + hadoopEnvSetup +
+        ", parameters=" + parameters +
+        ", component=" + component +
+        ", writer=" + writer +
+        ", scriptBuffer=" + scriptBuffer +
+        ", launchCommand='" + launchCommand + '\'' +
+        '}';
+  }
+
+  private void writeBashHeader(PrintWriter pw) {
+    pw.append("#!/bin/bash\n");
+  }
+}
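
Sketch of how a launch command drives the builder (illustrative; the constructor is package-private, so this runs from the same package, and the python command is a placeholder):

// Illustrative only. Content appended before build() precedes the launch
// command in the generated script; build() writes the bash header, the
// HDFS classpath exports, then the buffered content.
LaunchScriptBuilder builder = new LaunchScriptBuilder("WORKER",
    hadoopEnvSetup, parameters, component);
builder.append("export EXAMPLE_VAR=1\n"); // optional extra exports
String scriptPath = builder
    .withLaunchCommand("python train.py\n")
    .build(); // absolute path of the generated temporary .sh file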
@@ -0,0 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * This package contains classes to produce launch commands and scripts.
+ */
+package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command;
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.yarn.service.api.ServiceApiConstants;
+import org.apache.hadoop.yarn.service.api.records.Component;
+import org.apache.hadoop.yarn.submarine.common.Envs;
+import org.apache.hadoop.yarn.submarine.common.api.TaskType;
+import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.YarnServiceUtils;
+
+import java.util.Map;
+
+/**
+ * This class has common helper methods for TensorFlow.
+ */
+public final class TensorFlowCommons {
+  private TensorFlowCommons() {
+    throw new UnsupportedOperationException("This class should not be " +
+        "instantiated!");
+  }
+
+  public static void addCommonEnvironments(Component component,
+      TaskType taskType) {
+    Map<String, String> envs = component.getConfiguration().getEnv();
+    envs.put(Envs.TASK_INDEX_ENV, ServiceApiConstants.COMPONENT_ID);
+    envs.put(Envs.TASK_TYPE_ENV, taskType.name());
+  }
+
+  public static String getUserName() {
+    return System.getProperty("user.name");
+  }
+
+  public static String getDNSDomain(Configuration yarnConfig) {
+    return yarnConfig.get("hadoop.registry.dns.domain-name");
+  }
+
+  public static String getScriptFileName(TaskType taskType) {
+    return "run-" + taskType.name() + ".sh";
+  }
+
+  public static String getTFConfigEnv(String componentName, int nWorkers,
+      int nPs, String serviceName, String userName, String domain) {
+    String commonEndpointSuffix = YarnServiceUtils
+        .getDNSNameCommonSuffix(serviceName, userName, domain, 8000);
+
+    String json = "{\\\"cluster\\\":{";
+
+    String master = getComponentArrayJson("master", 1, commonEndpointSuffix)
+        + ",";
+    String worker = getComponentArrayJson("worker", nWorkers - 1,
+        commonEndpointSuffix) + ",";
+    String ps = getComponentArrayJson("ps", nPs, commonEndpointSuffix) + "},";
+
+    StringBuilder sb = new StringBuilder();
+    sb.append("\\\"task\\\":{");
+    sb.append(" \\\"type\\\":\\\"");
+    sb.append(componentName);
+    sb.append("\\\",");
+    sb.append(" \\\"index\\\":");
+    sb.append('$');
+    sb.append(Envs.TASK_INDEX_ENV + "},");
+    String task = sb.toString();
+    String environment = "\\\"environment\\\":\\\"cloud\\\"}";
+
+    sb = new StringBuilder();
+    sb.append(json);
+    sb.append(master);
+    sb.append(worker);
+    sb.append(ps);
+    sb.append(task);
+    sb.append(environment);
+    return sb.toString();
+  }
+
+  private static String getComponentArrayJson(String componentName, int count,
+      String endpointSuffix) {
+    String component = "\\\"" + componentName + "\\\":";
+    StringBuilder array = new StringBuilder();
+    array.append("[");
+    for (int i = 0; i < count; i++) {
+      array.append("\\\"");
+      array.append(componentName);
+      array.append("-");
+      array.append(i);
+      array.append(endpointSuffix);
+      array.append("\\\"");
+      if (i != count - 1) {
+        array.append(",");
+      }
+    }
+    array.append("]");
+    return component + array.toString();
+  }
+}
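
For orientation, a hedged sketch of what getTFConfigEnv assembles: for a hypothetical job named tf-job run by user submarine with 2 workers, 1 PS and DNS domain example.com (all placeholders), the escaped JSON unescapes to roughly the shape shown in the comments; the task index is filled in from the task-index environment variable when the container starts.

// Illustrative only; job name, user and domain are placeholders.
String tfConfig = TensorFlowCommons.getTFConfigEnv(
    "worker", 2, 1, "tf-job", "submarine", "example.com");
// Unescaped, the value has roughly this shape:
// {"cluster":{"master":["master-0.tf-job.submarine.example.com:8000"],
//   "worker":["worker-0.tf-job.submarine.example.com:8000"],
//   "ps":["ps-0.tf-job.submarine.example.com:8000"]},
//  "task":{ "type":"worker", "index":$<task index env>},
//  "environment":"cloud"}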
@@ -0,0 +1,203 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.yarn.service.api.records.Component;
+import org.apache.hadoop.yarn.service.api.records.KerberosPrincipal;
+import org.apache.hadoop.yarn.service.api.records.Service;
+import org.apache.hadoop.yarn.submarine.client.cli.param.Quicklink;
+import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
+import org.apache.hadoop.yarn.submarine.common.ClientContext;
+import org.apache.hadoop.yarn.submarine.common.api.TaskType;
+import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
+import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
+import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations;
+import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.ServiceSpec;
+import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.ServiceWrapper;
+import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.YarnServiceUtils;
+import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.LaunchCommandFactory;
+import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.component.TensorBoardComponent;
+import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.component.TensorFlowPsComponent;
+import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.component.TensorFlowWorkerComponent;
+import org.apache.hadoop.yarn.submarine.utils.KerberosPrincipalFactory;
+import org.apache.hadoop.yarn.submarine.utils.Localizer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.getDNSDomain;
+import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.getUserName;
+import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.component.TensorBoardComponent.TENSORBOARD_QUICKLINK_LABEL;
+import static org.apache.hadoop.yarn.submarine.utils.DockerUtilities.getDockerArtifact;
+import static org.apache.hadoop.yarn.submarine.utils.EnvironmentUtilities.handleServiceEnvs;
+
+/**
+ * This class contains all the logic to create an instance
+ * of a {@link Service} object for TensorFlow.
+ * Worker, PS and Tensorboard components are added to the Service
+ * based on the value of the received {@link RunJobParameters}.
+ */
+public class TensorFlowServiceSpec implements ServiceSpec {
+  private static final Logger LOG =
+      LoggerFactory.getLogger(TensorFlowServiceSpec.class);
+
+  private final RemoteDirectoryManager remoteDirectoryManager;
+
+  private final RunJobParameters parameters;
+  private final Configuration yarnConfig;
+  private final FileSystemOperations fsOperations;
+  private final LaunchCommandFactory launchCommandFactory;
+  private final Localizer localizer;
+
+  public TensorFlowServiceSpec(RunJobParameters parameters,
+      ClientContext clientContext, FileSystemOperations fsOperations,
+      LaunchCommandFactory launchCommandFactory, Localizer localizer) {
+    this.parameters = parameters;
+    this.remoteDirectoryManager = clientContext.getRemoteDirectoryManager();
+    this.yarnConfig = clientContext.getYarnConfig();
+    this.fsOperations = fsOperations;
+    this.launchCommandFactory = launchCommandFactory;
+    this.localizer = localizer;
+  }
+
+  @Override
+  public ServiceWrapper create() throws IOException {
+    ServiceWrapper serviceWrapper = createServiceSpecWrapper();
+
+    if (parameters.getNumWorkers() > 0) {
+      addWorkerComponents(serviceWrapper);
+    }
+
+    if (parameters.getNumPS() > 0) {
+      addPsComponent(serviceWrapper);
+    }
+
+    if (parameters.isTensorboardEnabled()) {
+      createTensorBoardComponent(serviceWrapper);
+    }
+
+    // After all components added, handle quicklinks
+    handleQuicklinks(serviceWrapper.getService());
+
+    return serviceWrapper;
+  }
+
+  private ServiceWrapper createServiceSpecWrapper() throws IOException {
+    Service serviceSpec = new Service();
+    serviceSpec.setName(parameters.getName());
+    serviceSpec.setVersion(String.valueOf(System.currentTimeMillis()));
+    serviceSpec.setArtifact(getDockerArtifact(parameters.getDockerImageName()));
+
+    KerberosPrincipal kerberosPrincipal = KerberosPrincipalFactory
+        .create(fsOperations, remoteDirectoryManager, parameters);
+    if (kerberosPrincipal != null) {
+      serviceSpec.setKerberosPrincipal(kerberosPrincipal);
+    }
+
+    handleServiceEnvs(serviceSpec, yarnConfig, parameters.getEnvars());
+    localizer.handleLocalizations(serviceSpec);
+    return new ServiceWrapper(serviceSpec);
+  }
+
+  private void createTensorBoardComponent(ServiceWrapper serviceWrapper)
+      throws IOException {
+    TensorBoardComponent tbComponent = new TensorBoardComponent(fsOperations,
+        remoteDirectoryManager, parameters, launchCommandFactory, yarnConfig);
+    serviceWrapper.addComponent(tbComponent);
+
+    addQuicklink(serviceWrapper.getService(), TENSORBOARD_QUICKLINK_LABEL,
+        tbComponent.getTensorboardLink());
+  }
+
+  private static void addQuicklink(Service serviceSpec, String label,
+      String link) {
+    Map<String, String> quicklinks = serviceSpec.getQuicklinks();
+    if (quicklinks == null) {
+      quicklinks = new HashMap<>();
+      serviceSpec.setQuicklinks(quicklinks);
+    }
+
+    if (SubmarineLogs.isVerbose()) {
+      LOG.info("Added quicklink, " + label + "=" + link);
+    }
+
+    quicklinks.put(label, link);
+  }
+
+  private void handleQuicklinks(Service serviceSpec)
+      throws IOException {
+    List<Quicklink> quicklinks = parameters.getQuicklinks();
+    if (quicklinks != null && !quicklinks.isEmpty()) {
+      for (Quicklink ql : quicklinks) {
+        // Make sure it is a valid instance name
+        String instanceName = ql.getComponentInstanceName();
+        boolean found = false;
+
+        for (Component comp : serviceSpec.getComponents()) {
+          for (int i = 0; i < comp.getNumberOfContainers(); i++) {
+            String possibleInstanceName = comp.getName() + "-" + i;
+            if (possibleInstanceName.equals(instanceName)) {
+              found = true;
+              break;
+            }
+          }
+        }
+
+        if (!found) {
+          throw new IOException(
+              "Couldn't find a component instance = " + instanceName
+                  + " while adding quicklink");
+        }
+
+        String link = ql.getProtocol()
+            + YarnServiceUtils.getDNSName(serviceSpec.getName(), instanceName,
+            getUserName(), getDNSDomain(yarnConfig), ql.getPort());
+        addQuicklink(serviceSpec, ql.getLabel(), link);
+      }
+    }
+  }
+
+  // Handle worker and primary_worker.
+  private void addWorkerComponents(ServiceWrapper serviceWrapper)
+      throws IOException {
+    addWorkerComponent(serviceWrapper, parameters, TaskType.PRIMARY_WORKER);
+
+    if (parameters.getNumWorkers() > 1) {
+      addWorkerComponent(serviceWrapper, parameters, TaskType.WORKER);
+    }
+  }
+
+  private void addWorkerComponent(ServiceWrapper serviceWrapper,
+      RunJobParameters parameters, TaskType taskType) throws IOException {
+    serviceWrapper.addComponent(
+        new TensorFlowWorkerComponent(fsOperations, remoteDirectoryManager,
+            parameters, taskType, launchCommandFactory, yarnConfig));
+  }
+
+  private void addPsComponent(ServiceWrapper serviceWrapper)
+      throws IOException {
+    serviceWrapper.addComponent(
+        new TensorFlowPsComponent(fsOperations, remoteDirectoryManager,
+            launchCommandFactory, parameters, yarnConfig));
+  }
+}
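
The spec is meant to be driven exactly as in createTensorFlowServiceSpec above; a condensed sketch (illustrative, collaborators assumed in scope):

// Illustrative only, mirroring YarnServiceJobSubmitter's wiring.
TensorFlowServiceSpec tfSpec = new TensorFlowServiceSpec(
    parameters, clientContext, fsOperations, launchCommandFactory, localizer);
ServiceWrapper wrapper = tfSpec.create();
Service service = wrapper.getService(); // later passed to actionLaunch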
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.yarn.service.api.records.Component;
+import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
+import org.apache.hadoop.yarn.submarine.common.api.TaskType;
+import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup;
+import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.AbstractLaunchCommand;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Objects;
+
+/**
+ * Launch command implementation for Tensorboard.
+ */
+public class TensorBoardLaunchCommand extends AbstractLaunchCommand {
+  private static final Logger LOG =
+      LoggerFactory.getLogger(TensorBoardLaunchCommand.class);
+  private final String checkpointPath;
+
+  public TensorBoardLaunchCommand(HadoopEnvironmentSetup hadoopEnvSetup,
+      TaskType taskType, Component component, RunJobParameters parameters)
+      throws IOException {
+    super(hadoopEnvSetup, taskType, component, parameters);
+    Objects.requireNonNull(parameters.getCheckpointPath(),
+        "CheckpointPath must not be null as it is part "
+            + "of the tensorboard command!");
+    if (StringUtils.isEmpty(parameters.getCheckpointPath())) {
+      throw new IllegalArgumentException("CheckpointPath must not be empty!");
+    }
+
+    this.checkpointPath = parameters.getCheckpointPath();
+  }
+
+  @Override
+  public String generateLaunchScript() throws IOException {
+    return getBuilder()
+        .withLaunchCommand(createLaunchCommand())
+        .build();
+  }
+
+  @Override
+  public String createLaunchCommand() {
+    String tbCommand = String.format("export LC_ALL=C && tensorboard " +
+        "--logdir=%s%n", checkpointPath);
+    LOG.info("Tensorboard command=" + tbCommand);
+    return tbCommand;
+  }
+}
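
Concretely, for a hypothetical checkpoint path hdfs://ns1/ckpt, createLaunchCommand() yields the following (the path is a placeholder):

// Illustrative only.
String cmd = String.format(
    "export LC_ALL=C && tensorboard --logdir=%s%n", "hdfs://ns1/ckpt");
// cmd == "export LC_ALL=C && tensorboard --logdir=hdfs://ns1/ckpt"
// followed by the platform line separator produced by %n.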
@ -0,0 +1,87 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *   http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.service.api.records.Component;
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.AbstractLaunchCommand;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.LaunchScriptBuilder;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;

/**
 * Launch command implementation for
 * TensorFlow PS and Worker Service components.
 */
public abstract class TensorFlowLaunchCommand extends AbstractLaunchCommand {
  private static final Logger LOG =
      LoggerFactory.getLogger(TensorFlowLaunchCommand.class);
  private final Configuration yarnConfig;
  private final boolean distributed;
  private final int numberOfWorkers;
  private final int numberOfPS;
  private final String name;
  private final TaskType taskType;

  TensorFlowLaunchCommand(HadoopEnvironmentSetup hadoopEnvSetup,
      TaskType taskType, Component component, RunJobParameters parameters,
      Configuration yarnConfig) throws IOException {
    super(hadoopEnvSetup, taskType, component, parameters);
    this.taskType = taskType;
    this.name = parameters.getName();
    this.distributed = parameters.isDistributed();
    this.numberOfWorkers = parameters.getNumWorkers();
    this.numberOfPS = parameters.getNumPS();
    this.yarnConfig = yarnConfig;
    logReceivedParameters();
  }

  private void logReceivedParameters() {
    if (this.numberOfWorkers <= 0) {
      LOG.warn("Received number of workers: {}", this.numberOfWorkers);
    }
    if (this.numberOfPS <= 0) {
      LOG.warn("Received number of PS: {}", this.numberOfPS);
    }
  }

  @Override
  public String generateLaunchScript() throws IOException {
    LaunchScriptBuilder builder = getBuilder();

    // When distributed training is required
    if (distributed) {
      String tfConfigEnvValue = TensorFlowCommons.getTFConfigEnv(
          taskType.getComponentName(), numberOfWorkers,
          numberOfPS, name,
          TensorFlowCommons.getUserName(),
          TensorFlowCommons.getDNSDomain(yarnConfig));
      String tfConfig = "export TF_CONFIG=\"" + tfConfigEnvValue + "\"\n";
      builder.append(tfConfig);
    }

    return builder
        .withLaunchCommand(createLaunchCommand())
        .build();
  }
}

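Note: when distributed is true, the generated script exports TF_CONFIG ahead of the actual launch command. The exact JSON is produced by TensorFlowCommons.getTFConfigEnv(); the line below is only an illustrative sketch of the usual TF_CONFIG shape, with an assumed job name, user and DNS domain:

    export TF_CONFIG="{\"cluster\":{\"master\":[\"master-0.myjob.alice.example.com:8000\"],\"worker\":[\"worker-0.myjob.alice.example.com:8000\"],\"ps\":[\"ps-0.myjob.alice.example.com:8000\"]},\"task\":{\"type\":\"worker\",\"index\":0},\"environment\":\"cloud\"}"
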
@ -0,0 +1,58 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *   http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.service.api.records.Component;
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;

/**
 * Launch command implementation for TensorFlow's PS component.
 */
public class TensorFlowPsLaunchCommand extends TensorFlowLaunchCommand {
  private static final Logger LOG =
      LoggerFactory.getLogger(TensorFlowPsLaunchCommand.class);
  private final String launchCommand;

  public TensorFlowPsLaunchCommand(HadoopEnvironmentSetup hadoopEnvSetup,
      TaskType taskType, Component component, RunJobParameters parameters,
      Configuration yarnConfig) throws IOException {
    super(hadoopEnvSetup, taskType, component, parameters, yarnConfig);
    this.launchCommand = parameters.getPSLaunchCmd();

    if (StringUtils.isEmpty(this.launchCommand)) {
      throw new IllegalArgumentException("LaunchCommand must not be null " +
          "or empty!");
    }
  }

  @Override
  public String createLaunchCommand() {
    if (SubmarineLogs.isVerbose()) {
      LOG.info("PS command =[" + launchCommand + "]");
    }
    return launchCommand + '\n';
  }
}

@ -0,0 +1,59 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *   http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.service.api.records.Component;
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;

/**
 * Launch command implementation for TensorFlow's Worker component.
 */
public class TensorFlowWorkerLaunchCommand extends TensorFlowLaunchCommand {
  private static final Logger LOG =
      LoggerFactory.getLogger(TensorFlowWorkerLaunchCommand.class);
  private final String launchCommand;

  public TensorFlowWorkerLaunchCommand(
      HadoopEnvironmentSetup hadoopEnvSetup, TaskType taskType,
      Component component, RunJobParameters parameters,
      Configuration yarnConfig) throws IOException {
    super(hadoopEnvSetup, taskType, component, parameters, yarnConfig);
    this.launchCommand = parameters.getWorkerLaunchCmd();

    if (StringUtils.isEmpty(this.launchCommand)) {
      throw new IllegalArgumentException("LaunchCommand must not be null " +
          "or empty!");
    }
  }

  @Override
  public String createLaunchCommand() {
    if (SubmarineLogs.isVerbose()) {
      LOG.info("Worker command =[" + launchCommand + "]");
    }
    return launchCommand + '\n';
  }
}

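A minimal sketch of how the empty-command guard above could be exercised from a test. It is not part of this commit; the Mockito mock and the setWorkerLaunchCmd setter are assumptions:

    // Hypothetical JUnit 4 test, not part of this commit.
    @Test(expected = IllegalArgumentException.class)
    public void testWorkerLaunchCommandMustNotBeEmpty() throws IOException {
      RunJobParameters parameters = new RunJobParameters();
      parameters.setWorkerLaunchCmd("");  // assumed setter; empty on purpose
      new TensorFlowWorkerLaunchCommand(mock(HadoopEnvironmentSetup.class),
          TaskType.WORKER, new Component(), parameters, new Configuration());
    }
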
@ -0,0 +1,19 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *   http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * This package contains classes to generate TensorFlow launch commands.
 */
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command;

@ -0,0 +1,96 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *   http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.component;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.service.api.records.Component;
import org.apache.hadoop.yarn.service.api.records.Component.RestartPolicyEnum;
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.AbstractComponent;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.YarnServiceUtils;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.LaunchCommandFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Objects;

import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.addCommonEnvironments;
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.getDNSDomain;
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.getUserName;
import static org.apache.hadoop.yarn.submarine.utils.DockerUtilities.getDockerArtifact;
import static org.apache.hadoop.yarn.submarine.utils.SubmarineResourceUtils.convertYarnResourceToServiceResource;

/**
 * Component implementation for TensorBoard.
 */
public class TensorBoardComponent extends AbstractComponent {
  private static final Logger LOG =
      LoggerFactory.getLogger(TensorBoardComponent.class);

  public static final String TENSORBOARD_QUICKLINK_LABEL = "Tensorboard";
  private static final int DEFAULT_PORT = 6006;

  // Computed field
  private String tensorboardLink;

  public TensorBoardComponent(FileSystemOperations fsOperations,
      RemoteDirectoryManager remoteDirectoryManager,
      RunJobParameters parameters,
      LaunchCommandFactory launchCommandFactory,
      Configuration yarnConfig) {
    super(fsOperations, remoteDirectoryManager, parameters,
        TaskType.TENSORBOARD, yarnConfig, launchCommandFactory);
  }

  @Override
  public Component createComponent() throws IOException {
    Objects.requireNonNull(parameters.getTensorboardResource(),
        "TensorBoard resource must not be null!");

    Component component = new Component();
    component.setName(taskType.getComponentName());
    component.setNumberOfContainers(1L);
    component.setRestartPolicy(RestartPolicyEnum.NEVER);
    component.setResource(convertYarnResourceToServiceResource(
        parameters.getTensorboardResource()));

    if (parameters.getTensorboardDockerImage() != null) {
      component.setArtifact(
          getDockerArtifact(parameters.getTensorboardDockerImage()));
    }

    addCommonEnvironments(component, taskType);
    generateLaunchCommand(component);

    tensorboardLink = "http://" + YarnServiceUtils.getDNSName(
        parameters.getName(),
        taskType.getComponentName() + "-" + 0, getUserName(),
        getDNSDomain(yarnConfig), DEFAULT_PORT);
    LOG.info("Link to tensorboard: " + tensorboardLink);

    return component;
  }

  public String getTensorboardLink() {
    return tensorboardLink;
  }
}

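For orientation, the computed link follows the YARN Service DNS naming scheme. For a job named "myjob" run by user "alice" under DNS domain "example.com" (all assumed values), the link would look roughly like:

    http://tensorboard-0.myjob.alice.example.com:6006
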
@ -0,0 +1,73 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *   http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.component;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.service.api.records.Component;
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.AbstractComponent;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.LaunchCommandFactory;

import java.io.IOException;
import java.util.Objects;

import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.addCommonEnvironments;
import static org.apache.hadoop.yarn.submarine.utils.DockerUtilities.getDockerArtifact;
import static org.apache.hadoop.yarn.submarine.utils.SubmarineResourceUtils.convertYarnResourceToServiceResource;

/**
 * Component implementation for TensorFlow's PS process.
 */
public class TensorFlowPsComponent extends AbstractComponent {
  public TensorFlowPsComponent(FileSystemOperations fsOperations,
      RemoteDirectoryManager remoteDirectoryManager,
      LaunchCommandFactory launchCommandFactory,
      RunJobParameters parameters,
      Configuration yarnConfig) {
    super(fsOperations, remoteDirectoryManager, parameters, TaskType.PS,
        yarnConfig, launchCommandFactory);
  }

  @Override
  public Component createComponent() throws IOException {
    Objects.requireNonNull(parameters.getPsResource(),
        "PS resource must not be null!");
    if (parameters.getNumPS() < 1) {
      throw new IllegalArgumentException("Number of PS should be at least 1!");
    }

    Component component = new Component();
    component.setName(taskType.getComponentName());
    component.setNumberOfContainers((long) parameters.getNumPS());
    component.setRestartPolicy(Component.RestartPolicyEnum.NEVER);
    component.setResource(
        convertYarnResourceToServiceResource(parameters.getPsResource()));

    // Override global docker image if needed.
    if (parameters.getPsDockerImage() != null) {
      component.setArtifact(
          getDockerArtifact(parameters.getPsDockerImage()));
    }
    addCommonEnvironments(component, taskType);
    generateLaunchCommand(component);

    return component;
  }
}

@ -0,0 +1,82 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *   http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.component;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.service.api.records.Component;
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.AbstractComponent;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.LaunchCommandFactory;

import java.io.IOException;
import java.util.Objects;

import static org.apache.hadoop.yarn.service.conf.YarnServiceConstants.CONTAINER_STATE_REPORT_AS_SERVICE_STATE;
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.addCommonEnvironments;
import static org.apache.hadoop.yarn.submarine.utils.DockerUtilities.getDockerArtifact;
import static org.apache.hadoop.yarn.submarine.utils.SubmarineResourceUtils.convertYarnResourceToServiceResource;

/**
 * Component implementation for TensorFlow's Worker process.
 */
public class TensorFlowWorkerComponent extends AbstractComponent {
  public TensorFlowWorkerComponent(FileSystemOperations fsOperations,
      RemoteDirectoryManager remoteDirectoryManager,
      RunJobParameters parameters, TaskType taskType,
      LaunchCommandFactory launchCommandFactory,
      Configuration yarnConfig) {
    super(fsOperations, remoteDirectoryManager, parameters, taskType,
        yarnConfig, launchCommandFactory);
  }

  @Override
  public Component createComponent() throws IOException {
    Objects.requireNonNull(parameters.getWorkerResource(),
        "Worker resource must not be null!");
    if (parameters.getNumWorkers() < 1) {
      throw new IllegalArgumentException(
          "Number of workers should be at least 1!");
    }

    Component component = new Component();
    component.setName(taskType.getComponentName());

    if (taskType.equals(TaskType.PRIMARY_WORKER)) {
      component.setNumberOfContainers(1L);
      component.getConfiguration().setProperty(
          CONTAINER_STATE_REPORT_AS_SERVICE_STATE, "true");
    } else {
      component.setNumberOfContainers(
          (long) parameters.getNumWorkers() - 1);
    }

    if (parameters.getWorkerDockerImage() != null) {
      component.setArtifact(
          getDockerArtifact(parameters.getWorkerDockerImage()));
    }

    component.setResource(convertYarnResourceToServiceResource(
        parameters.getWorkerResource()));
    component.setRestartPolicy(Component.RestartPolicyEnum.NEVER);

    addCommonEnvironments(component, taskType);
    generateLaunchCommand(component);

    return component;
  }
}

@ -0,0 +1,20 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *   http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * This package contains classes to generate
 * TensorFlow Native Service components.
 */
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.component;

@ -0,0 +1,20 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *   http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * This package contains classes to generate
 * TensorFlow-related Native Service runtime artifacts.
 */
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow;

@ -0,0 +1,57 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *   http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.utils;

import java.io.File;
import java.util.StringTokenizer;

/**
 * Utilities for classpath operations.
 */
public final class ClassPathUtilities {
  private ClassPathUtilities() {
    throw new UnsupportedOperationException("This class should not be " +
        "instantiated!");
  }

  public static File findFileOnClassPath(final String fileName) {
    final String classpath = System.getProperty("java.class.path");
    final String pathSeparator = System.getProperty("path.separator");
    final StringTokenizer tokenizer = new StringTokenizer(classpath,
        pathSeparator);

    while (tokenizer.hasMoreTokens()) {
      final String pathElement = tokenizer.nextToken();
      final File directoryOrJar = new File(pathElement);
      final File absoluteDirectoryOrJar = directoryOrJar.getAbsoluteFile();
      if (absoluteDirectoryOrJar.isFile()) {
        final File target =
            new File(absoluteDirectoryOrJar.getParent(), fileName);
        if (target.exists()) {
          return target;
        }
      } else {
        final File target = new File(directoryOrJar, fileName);
        if (target.exists()) {
          return target;
        }
      }
    }

    return null;
  }
}

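A quick usage sketch (not part of the commit); the file name is an assumption:

    // Looks for core-site.xml in every classpath directory, or next to
    // every classpath jar; returns null when nothing is found.
    File coreSite = ClassPathUtilities.findFileOnClassPath("core-site.xml");
    if (coreSite != null) {
      System.out.println("Found: " + coreSite.getAbsolutePath());
    }
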
@ -0,0 +1,33 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *   http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.utils;

import org.apache.hadoop.yarn.service.api.records.Artifact;

/**
 * Utilities for Docker-related operations.
 */
public final class DockerUtilities {
  private DockerUtilities() {
    throw new UnsupportedOperationException("This class should not be " +
        "instantiated!");
  }

  public static Artifact getDockerArtifact(String dockerImageName) {
    return new Artifact().type(Artifact.TypeEnum.DOCKER).id(dockerImageName);
  }
}

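An illustrative call site (the image name is an assumption):

    // Wraps a docker image name into a YARN Service DOCKER artifact.
    component.setArtifact(DockerUtilities.getDockerArtifact("tf-docker:1.13.1"));
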
@ -0,0 +1,120 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *   http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.utils;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.service.api.records.Service;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.List;
import java.util.Map;

import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION;

/**
 * Utilities for environment variable-related operations
 * for {@link Service} objects.
 */
public final class EnvironmentUtilities {
  private EnvironmentUtilities() {
    throw new UnsupportedOperationException("This class should not be " +
        "instantiated!");
  }

  private static final Logger LOG =
      LoggerFactory.getLogger(EnvironmentUtilities.class);

  static final String ENV_DOCKER_MOUNTS_FOR_CONTAINER_RUNTIME =
      "YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS";
  private static final String MOUNTS_DELIM = ",";
  private static final String ENV_SEPARATOR = "=";
  private static final String ETC_PASSWD_MOUNT_STRING =
      "/etc/passwd:/etc/passwd:ro";
  private static final String KERBEROS_CONF_MOUNT_STRING =
      "/etc/krb5.conf:/etc/krb5.conf:ro";
  private static final String ENV_VAR_DELIM = ":";

  /**
   * Extracts the value from a string representation of
   * an environment variable.
   * @param envVar The environment variable in 'key=value' format.
   * @return The value of the environment variable.
   */
  public static String getValueOfEnvironment(String envVar) {
    if (envVar == null || !envVar.contains(ENV_SEPARATOR)) {
      return "";
    } else {
      return envVar.substring(envVar.indexOf(ENV_SEPARATOR) + 1);
    }
  }

  public static void handleServiceEnvs(Service service,
      Configuration yarnConfig, List<String> envVars) {
    if (envVars != null) {
      for (String envVarPair : envVars) {
        String key, value;
        if (envVarPair.contains(ENV_SEPARATOR)) {
          int idx = envVarPair.indexOf(ENV_SEPARATOR);
          key = envVarPair.substring(0, idx);
          value = envVarPair.substring(idx + 1);
        } else {
          LOG.warn("Found environment variable with unusual format: '{}'",
              envVarPair);
          // No "=" found so use the whole key
          key = envVarPair;
          value = "";
        }
        appendToEnv(service, key, value, ENV_VAR_DELIM);
      }
    }
    appendOtherConfigs(service, yarnConfig);
  }

  /**
   * Appends mounts for other configs like /etc/passwd and /etc/krb5.conf.
   * @param service the service whose environment is modified
   * @param yarnConfig the YARN configuration
   */
  private static void appendOtherConfigs(Service service,
      Configuration yarnConfig) {
    appendToEnv(service, ENV_DOCKER_MOUNTS_FOR_CONTAINER_RUNTIME,
        ETC_PASSWD_MOUNT_STRING, MOUNTS_DELIM);

    String authentication = yarnConfig.get(HADOOP_SECURITY_AUTHENTICATION);
    if (authentication != null && authentication.equals("kerberos")) {
      appendToEnv(service, ENV_DOCKER_MOUNTS_FOR_CONTAINER_RUNTIME,
          KERBEROS_CONF_MOUNT_STRING, MOUNTS_DELIM);
    }
  }

  static void appendToEnv(Service service, String key, String value,
      String delim) {
    Map<String, String> env = service.getConfiguration().getEnv();
    if (!env.containsKey(key)) {
      env.put(key, value);
    } else {
      if (!value.isEmpty()) {
        String existingValue = env.get(key);
        if (!existingValue.endsWith(delim)) {
          env.put(key, existingValue + delim + value);
        } else {
          env.put(key, existingValue + value);
        }
      }
    }
  }
}

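To make the delimiter handling concrete, here is a sketch of how two appendToEnv calls accumulate under one key; the values are the mount constants defined above:

    // First call creates the entry; the second joins with the delimiter.
    appendToEnv(service, "YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS",
        "/etc/passwd:/etc/passwd:ro", ",");
    appendToEnv(service, "YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS",
        "/etc/krb5.conf:/etc/krb5.conf:ro", ",");
    // env value: "/etc/passwd:/etc/passwd:ro,/etc/krb5.conf:/etc/krb5.conf:ro"
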
@ -0,0 +1,95 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *   http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.utils;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.yarn.service.api.records.KerberosPrincipal;
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Objects;

/**
 * Simple factory that creates a {@link KerberosPrincipal}.
 */
public final class KerberosPrincipalFactory {
  private KerberosPrincipalFactory() {
    throw new UnsupportedOperationException("This class should not be " +
        "instantiated!");
  }

  private static final Logger LOG =
      LoggerFactory.getLogger(KerberosPrincipalFactory.class);

  public static KerberosPrincipal create(FileSystemOperations fsOperations,
      RemoteDirectoryManager remoteDirectoryManager,
      RunJobParameters parameters) throws IOException {
    Objects.requireNonNull(fsOperations,
        "FileSystemOperations must not be null!");
    Objects.requireNonNull(remoteDirectoryManager,
        "RemoteDirectoryManager must not be null!");
    Objects.requireNonNull(parameters, "Parameters must not be null!");

    if (StringUtils.isNotBlank(parameters.getKeytab()) && StringUtils
        .isNotBlank(parameters.getPrincipal())) {
      String keytab = parameters.getKeytab();
      String principal = parameters.getPrincipal();
      if (parameters.isDistributeKeytab()) {
        return handleDistributedKeytab(fsOperations, remoteDirectoryManager,
            parameters, keytab, principal);
      } else {
        return handleNormalKeytab(keytab, principal);
      }
    }
    LOG.debug("Principal and keytab were null or empty, " +
        "returning null KerberosPrincipal!");
    return null;
  }

  private static KerberosPrincipal handleDistributedKeytab(
      FileSystemOperations fsOperations,
      RemoteDirectoryManager remoteDirectoryManager,
      RunJobParameters parameters, String keytab, String principal)
      throws IOException {
    Path stagingDir = remoteDirectoryManager
        .getJobStagingArea(parameters.getName(), true);
    Path remoteKeytabPath =
        fsOperations.uploadToRemoteFile(stagingDir, keytab);
    // Only the owner has read access
    fsOperations.setPermission(remoteKeytabPath,
        FsPermission.createImmutable((short) Integer.parseInt("400", 8)));
    return new KerberosPrincipal()
        .keytab(remoteKeytabPath.toString())
        .principalName(principal);
  }

  private static KerberosPrincipal handleNormalKeytab(String keytab,
      String principal) {
    if (!keytab.startsWith("file")) {
      keytab = "file://" + keytab;
    }
    return new KerberosPrincipal()
        .keytab(keytab)
        .principalName(principal);
  }
}

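An illustrative call, assuming a local keytab at /etc/security/job.keytab, principal alice@EXAMPLE.COM, and keytab distribution turned off; the collaborator instances are assumed to exist in scope:

    KerberosPrincipal principal = KerberosPrincipalFactory.create(
        fsOperations, remoteDirectoryManager, parameters);
    // The local path is normalized to a file URI:
    // principal.getKeytab()        -> "file:///etc/security/job.keytab"
    // principal.getPrincipalName() -> "alice@EXAMPLE.COM"
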
@ -0,0 +1,170 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *   http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.utils;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.yarn.service.api.records.ConfigFile;
import org.apache.hadoop.yarn.service.api.records.Service;
import org.apache.hadoop.yarn.submarine.client.cli.param.Localization;
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.List;

import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations.needHdfs;
import static org.apache.hadoop.yarn.submarine.utils.EnvironmentUtilities.appendToEnv;

/**
 * This class holds all dependencies in order to localize dependencies
 * for containers.
 */
public class Localizer {
  private static final Logger LOG = LoggerFactory.getLogger(Localizer.class);

  private final FileSystemOperations fsOperations;
  private final RemoteDirectoryManager remoteDirectoryManager;
  private final RunJobParameters parameters;

  public Localizer(FileSystemOperations fsOperations,
      RemoteDirectoryManager remoteDirectoryManager,
      RunJobParameters parameters) {
    this.fsOperations = fsOperations;
    this.remoteDirectoryManager = remoteDirectoryManager;
    this.parameters = parameters;
  }

  /**
   * Localizes dependencies for all containers.
   * If remoteUri is a local directory, we'll zip it and upload it
   * to the HDFS staging directory.
   * If remoteUri is a remote directory, we'll download it, zip it
   * and upload it to HDFS.
   * If localFilePath is ".", we'll use remoteUri's file/dir name.
   */
  public void handleLocalizations(Service service)
      throws IOException {
    // Handle localizations
    Path stagingDir =
        remoteDirectoryManager.getJobStagingArea(
            parameters.getName(), true);
    List<Localization> localizations = parameters.getLocalizations();
    String remoteUri;
    String containerLocalPath;

    // Check to fail fast
    for (Localization loc : localizations) {
      remoteUri = loc.getRemoteUri();
      Path resourceToLocalize = new Path(remoteUri);
      // Check if remoteUri exists
      if (remoteDirectoryManager.isRemote(remoteUri)) {
        // Check if it exists remotely
        if (!remoteDirectoryManager.existsRemoteFile(resourceToLocalize)) {
          throw new FileNotFoundException(
              "File " + remoteUri + " doesn't exist.");
        }
      } else {
        // Check if it exists locally
        File localFile = new File(remoteUri);
        if (!localFile.exists()) {
          throw new FileNotFoundException(
              "File " + remoteUri + " doesn't exist.");
        }
      }
      // Check remote file size
      fsOperations.validFileSize(remoteUri);
    }
    // Start downloading remote files if needed and upload to HDFS
    for (Localization loc : localizations) {
      remoteUri = loc.getRemoteUri();
      containerLocalPath = loc.getLocalPath();
      String srcFileStr = remoteUri;
      ConfigFile.TypeEnum destFileType = ConfigFile.TypeEnum.STATIC;
      Path resourceToLocalize = new Path(remoteUri);
      boolean needUploadToHDFS = true;

      // Special handling of remoteUri directory
      boolean needDeleteTempFile = false;
      if (remoteDirectoryManager.isDir(remoteUri)) {
        destFileType = ConfigFile.TypeEnum.ARCHIVE;
        srcFileStr = fsOperations.downloadAndZip(
            remoteUri, getLastNameFromPath(srcFileStr), true);
      } else if (remoteDirectoryManager.isRemote(remoteUri)) {
        if (!needHdfs(remoteUri)) {
          // Non-HDFS remote uri, not a directory: no need to zip
          srcFileStr = fsOperations.downloadAndZip(
              remoteUri, getLastNameFromPath(srcFileStr), false);
          needDeleteTempFile = true;
        } else {
          // HDFS file, no need to upload
          needUploadToHDFS = false;
        }
      }

      // Upload file to HDFS
      if (needUploadToHDFS) {
        resourceToLocalize =
            fsOperations.uploadToRemoteFile(stagingDir, srcFileStr);
      }
      if (needDeleteTempFile) {
        fsOperations.deleteFiles(srcFileStr);
      }
      // Remove .zip from the zipped dir name
      if (destFileType == ConfigFile.TypeEnum.ARCHIVE
          && srcFileStr.endsWith(".zip")) {
        // Delete local zip file
        fsOperations.deleteFiles(srcFileStr);
        int suffixIndex = srcFileStr.lastIndexOf('_');
        srcFileStr = srcFileStr.substring(0, suffixIndex);
      }
      // If provided, use the name of the local uri
      if (!containerLocalPath.equals(".")
          && !containerLocalPath.equals("./")) {
        // Change the YARN localized file name to what'll be used
        // in the container
        srcFileStr = getLastNameFromPath(containerLocalPath);
      }
      String localizedName = getLastNameFromPath(srcFileStr);
      LOG.info("The file/dir to be localized is {}",
          resourceToLocalize.toString());
      LOG.info("Its localized file name will be {}", localizedName);
      service.getConfiguration().getFiles().add(new ConfigFile().srcFile(
          resourceToLocalize.toUri().toString()).destFile(localizedName)
          .type(destFileType));
      // Set mounts:
      // if the mount path is absolute, just use it;
      // if relative, there is no need to mount explicitly
      if (containerLocalPath.startsWith("/")) {
        String mountStr = getLastNameFromPath(srcFileStr) + ":"
            + containerLocalPath + ":" + loc.getMountPermission();
        LOG.info("Add bind-mount string {}", mountStr);
        appendToEnv(service,
            EnvironmentUtilities.ENV_DOCKER_MOUNTS_FOR_CONTAINER_RUNTIME,
            mountStr, ",");
      }
    }
  }

  private String getLastNameFromPath(String srcFileStr) {
    return new Path(srcFileStr).getName();
  }
}

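For orientation, hypothetical localization specs this logic would handle (paths and flag syntax are assumptions, not taken from this commit):

    --localization hdfs:///user/alice/data.txt:.
        (HDFS file: no upload needed, localized under its own name)
    --localization /home/alice/libs:/opt/libs:rw
        (local directory: zipped, uploaded to the staging dir, and
         bind-mounted because the target path is absolute)
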
@ -0,0 +1,51 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *   http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.utils;

import org.apache.hadoop.yarn.service.api.records.Resource;
import org.apache.hadoop.yarn.service.api.records.ResourceInformation;

import java.util.HashMap;
import java.util.Map;

/**
 * Resource utilities for Submarine.
 */
public final class SubmarineResourceUtils {
  private SubmarineResourceUtils() {
    throw new UnsupportedOperationException("This class should not be " +
        "instantiated!");
  }

  public static Resource convertYarnResourceToServiceResource(
      org.apache.hadoop.yarn.api.records.Resource yarnResource) {
    Resource serviceResource = new Resource();
    serviceResource.setCpus(yarnResource.getVirtualCores());
    serviceResource.setMemory(String.valueOf(yarnResource.getMemorySize()));

    Map<String, ResourceInformation> riMap = new HashMap<>();
    for (org.apache.hadoop.yarn.api.records.ResourceInformation ri :
        yarnResource.getAllResourcesListCopy()) {
      ResourceInformation serviceRi = new ResourceInformation();
      serviceRi.setValue(ri.getValue());
      serviceRi.setUnit(ri.getUnits());
      riMap.put(ri.getName(), serviceRi);
    }
    serviceResource.setResourceInformations(riMap);

    return serviceResource;
  }
}

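A minimal conversion sketch (values illustrative):

    org.apache.hadoop.yarn.api.records.Resource yarnResource =
        org.apache.hadoop.yarn.api.records.Resource.newInstance(2048, 2);
    Resource serviceResource =
        SubmarineResourceUtils.convertYarnResourceToServiceResource(yarnResource);
    // serviceResource.getCpus() == 2, serviceResource.getMemory() == "2048";
    // custom resource types (e.g. GPUs) are copied into the info map.
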
@ -0,0 +1,82 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *   http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.utils;

import com.google.common.annotations.VisibleForTesting;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

/**
 * Utilities for zipping directories and adding existing directories to zips.
 */
public final class ZipUtilities {
  private ZipUtilities() {
    throw new UnsupportedOperationException("This class should not be " +
        "instantiated!");
  }

  private static final Logger LOG =
      LoggerFactory.getLogger(ZipUtilities.class);

  @VisibleForTesting
  public static String zipDir(String srcDir, String dstFile)
      throws IOException {
    FileOutputStream fos = new FileOutputStream(dstFile);
    ZipOutputStream zos = new ZipOutputStream(fos);
    File srcFile = new File(srcDir);
    LOG.info("Compressing directory {}", srcDir);
    addDirToZip(zos, srcFile, srcFile);
    // close the ZipOutputStream
    zos.close();
    LOG.info("Compressed directory {} to file: {}", srcDir, dstFile);
    return dstFile;
  }

  private static void addDirToZip(ZipOutputStream zos, File srcFile, File base)
      throws IOException {
    File[] files = srcFile.listFiles();
    if (files == null) {
      return;
    }
    for (File file : files) {
      // If it's a directory, add it recursively
      if (file.isDirectory()) {
        addDirToZip(zos, file, base);
        continue;
      }
      byte[] buffer = new byte[1024];
      try (FileInputStream fis = new FileInputStream(file)) {
        String name = base.toURI().relativize(file.toURI()).getPath();
        LOG.info("Adding file {} to zip", name);
        zos.putNextEntry(new ZipEntry(name));
        int length;
        while ((length = fis.read(buffer)) > 0) {
          zos.write(buffer, 0, length);
        }
        zos.flush();
      } finally {
        zos.closeEntry();
      }
    }
  }
}

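Illustrative usage (paths assumed):

    // Recursively zips /tmp/model-dir into /tmp/model-dir.zip, with entry
    // names taken relative to the source directory.
    String zip = ZipUtilities.zipDir("/tmp/model-dir", "/tmp/model-dir.zip");
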
@ -0,0 +1,19 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *   http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * This package contains utility classes.
 */
package org.apache.hadoop.yarn.submarine.utils;

@ -0,0 +1,146 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine;

import com.google.common.collect.Lists;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.util.List;

import static org.junit.Assert.assertTrue;

/**
 * File utilities for tests.
 * Provides methods that create or delete files and directories,
 * either in a temp directory or in any specified directory.
 */
public class FileUtilitiesForTests {
  private static final Logger LOG =
      LoggerFactory.getLogger(FileUtilitiesForTests.class);
  private String tempDir;
  private List<File> cleanupFiles;

  public void setup() {
    cleanupFiles = Lists.newArrayList();
    tempDir = System.getProperty("java.io.tmpdir");
  }

  public void teardown() throws IOException {
    LOG.info("About to clean up files: " + cleanupFiles);
    List<File> dirs = Lists.newArrayList();
    for (File cleanupFile : cleanupFiles) {
      if (cleanupFile.isDirectory()) {
        dirs.add(cleanupFile);
      } else {
        deleteFile(cleanupFile);
      }
    }

    for (File dir : dirs) {
      deleteFile(dir);
    }
  }

  public File createFileInTempDir(String filename) throws IOException {
    File file = new File(tempDir, new Path(filename).getName());
    createFile(file);
    return file;
  }

  public File createDirInTempDir(String dirName) {
    File file = new File(tempDir, new Path(dirName).getName());
    createDirectory(file);
    return file;
  }

  public File createFileInDir(Path dir, String filename) throws IOException {
    File dirTmp = new File(dir.toUri().getPath());
    if (!dirTmp.exists()) {
      createDirectory(dirTmp);
    }
    File file =
        new File(dir.toUri().getPath() + "/" + new Path(filename).getName());
    createFile(file);
    return file;
  }

  public File createFileInDir(File dir, String filename) throws IOException {
    if (!dir.exists()) {
      createDirectory(dir);
    }
    File file = new File(dir, filename);
    createFile(file);
    return file;
  }

  public File createDirectory(Path parent, String dirname) {
    File dir =
        new File(parent.toUri().getPath() + "/" + new Path(dirname).getName());
    createDirectory(dir);
    return dir;
  }

  public File createDirectory(File parent, String dirname) {
    File dir =
        new File(parent.getPath() + "/" + new Path(dirname).getName());
    createDirectory(dir);
    return dir;
  }

  private void createDirectory(File dir) {
    boolean result = dir.mkdir();
    assertTrue("Failed to create directory " + dir.getAbsolutePath(), result);
    assertTrue("Directory does not exist: " + dir.getAbsolutePath(),
        dir.exists());
    this.cleanupFiles.add(dir);
  }

  private void createFile(File file) throws IOException {
    boolean result = file.createNewFile();
    assertTrue("Failed to create file " + file.getAbsolutePath(), result);
    assertTrue("File does not exist: " + file.getAbsolutePath(), file.exists());
    this.cleanupFiles.add(file);
  }

  private static void deleteFile(File file) throws IOException {
    if (file.isDirectory()) {
      LOG.info("Removing directory: " + file.getAbsolutePath());
      FileUtils.deleteDirectory(file);
    }

    if (file.exists()) {
      LOG.info("Removing file: " + file.getAbsolutePath());
      boolean result = file.delete();
      assertTrue("Deletion of file " + file.getAbsolutePath()
          + " was not successful!", result);
    }
  }

  public File getTempFileWithName(String filename) {
    return new File(tempDir + "/" + new Path(filename).getName());
  }

  public static File getFilename(Path parent, String filename) {
    return new File(
        parent.toUri().getPath() + "/" + new Path(filename).getName());
  }
}
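For orientation, this is how a test would typically wire the utility into the JUnit 4 lifecycle. A minimal sketch; the test class and file name below are hypothetical and not part of this commit:

import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.File;
import java.io.IOException;

import static org.junit.Assert.assertTrue;

public class ExampleFileTest {
  private final FileUtilitiesForTests fileUtils = new FileUtilitiesForTests();

  @Before
  public void before() {
    // Initializes the cleanup list and resolves java.io.tmpdir
    fileUtils.setup();
  }

  @After
  public void after() throws IOException {
    // Deletes every file/dir registered by the create* methods
    fileUtils.teardown();
  }

  @Test
  public void testCreatesTempFile() throws IOException {
    File script = fileUtils.createFileInTempDir("script1.py");
    assertTrue(script.exists());
  }
}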
@ -0,0 +1,139 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.client.cli.yarnservice;

import com.google.common.collect.Lists;

import java.util.List;

class ParamBuilderForTest {
  private final List<String> params = Lists.newArrayList();

  static ParamBuilderForTest create() {
    return new ParamBuilderForTest();
  }

  ParamBuilderForTest withJobName(String name) {
    params.add("--name");
    params.add(name);
    return this;
  }

  ParamBuilderForTest withDockerImage(String dockerImage) {
    params.add("--docker_image");
    params.add(dockerImage);
    return this;
  }

  ParamBuilderForTest withInputPath(String inputPath) {
    params.add("--input_path");
    params.add(inputPath);
    return this;
  }

  ParamBuilderForTest withCheckpointPath(String checkpointPath) {
    params.add("--checkpoint_path");
    params.add(checkpointPath);
    return this;
  }

  ParamBuilderForTest withNumberOfWorkers(int numWorkers) {
    params.add("--num_workers");
    params.add(String.valueOf(numWorkers));
    return this;
  }

  ParamBuilderForTest withNumberOfPs(int numPs) {
    params.add("--num_ps");
    params.add(String.valueOf(numPs));
    return this;
  }

  ParamBuilderForTest withWorkerLaunchCommand(String launchCommand) {
    params.add("--worker_launch_cmd");
    params.add(launchCommand);
    return this;
  }

  ParamBuilderForTest withPsLaunchCommand(String launchCommand) {
    params.add("--ps_launch_cmd");
    params.add(launchCommand);
    return this;
  }

  ParamBuilderForTest withWorkerResources(String workerResources) {
    params.add("--worker_resources");
    params.add(workerResources);
    return this;
  }

  ParamBuilderForTest withPsResources(String psResources) {
    params.add("--ps_resources");
    params.add(psResources);
    return this;
  }

  ParamBuilderForTest withWorkerDockerImage(String dockerImage) {
    params.add("--worker_docker_image");
    params.add(dockerImage);
    return this;
  }

  ParamBuilderForTest withPsDockerImage(String dockerImage) {
    params.add("--ps_docker_image");
    params.add(dockerImage);
    return this;
  }

  ParamBuilderForTest withVerbose() {
    params.add("--verbose");
    return this;
  }

  ParamBuilderForTest withTensorboard() {
    params.add("--tensorboard");
    return this;
  }

  ParamBuilderForTest withTensorboardResources(String resources) {
    params.add("--tensorboard_resources");
    params.add(resources);
    return this;
  }

  ParamBuilderForTest withTensorboardDockerImage(String dockerImage) {
    params.add("--tensorboard_docker_image");
    params.add(dockerImage);
    return this;
  }

  ParamBuilderForTest withQuickLink(String quickLink) {
    params.add("--quicklink");
    params.add(quickLink);
    return this;
  }

  ParamBuilderForTest withLocalization(String remoteUrl, String localUrl) {
    params.add("--localization");
    params.add(remoteUrl + ":" + localUrl);
    return this;
  }

  String[] build() {
    return params.toArray(new String[0]);
  }
}
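A short usage sketch of the builder (argument values are illustrative only): it assembles the same CLI argument array that would otherwise be written out by hand in each test.

String[] params = ParamBuilderForTest.create()
    .withJobName("my-job")
    .withDockerImage("tf-docker:1.1.0")
    .withNumberOfWorkers(2)
    .withWorkerLaunchCommand("python run-job.py")
    .withVerbose()
    .build();
// Equivalent to: {"--name", "my-job", "--docker_image", "tf-docker:1.1.0",
//                 "--num_workers", "2", "--worker_launch_cmd",
//                 "python run-job.py", "--verbose"}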
File diff suppressed because it is too large
@ -0,0 +1,79 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.client.cli.yarnservice;

import org.apache.hadoop.yarn.client.api.AppAdminClient;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.service.api.records.Service;
import org.apache.hadoop.yarn.submarine.FileUtilitiesForTests;
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
import org.apache.hadoop.yarn.submarine.runtimes.common.JobSubmitter;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.YarnServiceJobSubmitter;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.YarnServiceUtils;

import java.io.IOException;

import static org.apache.hadoop.yarn.service.exceptions.LauncherExitCodes.EXIT_SUCCESS;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

/**
 * Common operations shared by test classes that exercise run job-related
 * actions.
 */
public class TestYarnServiceRunJobCliCommons {
  static final String DEFAULT_JOB_NAME = "my-job";
  static final String DEFAULT_DOCKER_IMAGE = "tf-docker:1.1.0";
  static final String DEFAULT_INPUT_PATH = "s3://input";
  static final String DEFAULT_CHECKPOINT_PATH = "s3://output";
  static final String DEFAULT_WORKER_DOCKER_IMAGE = "worker.image";
  static final String DEFAULT_PS_DOCKER_IMAGE = "ps.image";
  static final String DEFAULT_WORKER_LAUNCH_CMD = "python run-job.py";
  static final String DEFAULT_PS_LAUNCH_CMD = "python run-ps.py";
  static final String DEFAULT_TENSORBOARD_RESOURCES = "memory=2G,vcores=2";
  static final String DEFAULT_WORKER_RESOURCES = "memory=2048M,vcores=2";
  static final String DEFAULT_PS_RESOURCES = "memory=4096M,vcores=4";
  static final String DEFAULT_TENSORBOARD_DOCKER_IMAGE = "tb_docker_image:001";

  private FileUtilitiesForTests fileUtils = new FileUtilitiesForTests();

  void setup() throws IOException, YarnException {
    SubmarineLogs.verboseOff();
    AppAdminClient serviceClient = mock(AppAdminClient.class);
    when(serviceClient.actionLaunch(any(String.class), any(String.class),
        any(Long.class), any(String.class))).thenReturn(EXIT_SUCCESS);
    when(serviceClient.getStatusString(any(String.class))).thenReturn(
        "{\"id\": \"application_1234_1\"}");
    YarnServiceUtils.setStubServiceClient(serviceClient);

    fileUtils.setup();
  }

  void teardown() throws IOException {
    fileUtils.teardown();
  }

  FileUtilitiesForTests getFileUtils() {
    return fileUtils;
  }

  Service getServiceSpecFromJobSubmitter(JobSubmitter jobSubmitter) {
    return ((YarnServiceJobSubmitter) jobSubmitter).getServiceWrapper()
        .getService();
  }
}
@ -0,0 +1,599 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.client.cli.yarnservice;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.service.api.records.ConfigFile;
import org.apache.hadoop.yarn.service.api.records.Service;
import org.apache.hadoop.yarn.submarine.client.cli.RunJobCli;
import org.apache.hadoop.yarn.submarine.common.MockClientContext;
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineConfiguration;
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.File;
import java.io.IOException;
import java.util.List;

import static org.apache.hadoop.yarn.submarine.client.cli.yarnservice.TestYarnServiceRunJobCliCommons.*;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.reset;
import static org.mockito.Mockito.spy;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;

/**
 * Class to test the YarnService localization feature with the run job
 * CLI action.
 */
public class TestYarnServiceRunJobCliLocalization {
  private static final String ZIP_EXTENSION = ".zip";
  private TestYarnServiceRunJobCliCommons testCommons =
      new TestYarnServiceRunJobCliCommons();
  private MockClientContext mockClientContext;
  private RemoteDirectoryManager spyRdm;

  @Before
  public void before() throws IOException, YarnException {
    testCommons.setup();
    mockClientContext = YarnServiceCliTestUtils.getMockClientContext();
    spyRdm = setupSpyRemoteDirManager();
  }

  @After
  public void cleanup() throws IOException {
    testCommons.teardown();
  }

  private ParamBuilderForTest createCommonParamsBuilder() {
    return ParamBuilderForTest.create()
        .withJobName(DEFAULT_JOB_NAME)
        .withDockerImage(DEFAULT_DOCKER_IMAGE)
        .withInputPath(DEFAULT_INPUT_PATH)
        .withCheckpointPath(DEFAULT_CHECKPOINT_PATH)
        .withNumberOfWorkers(3)
        .withWorkerDockerImage(DEFAULT_WORKER_DOCKER_IMAGE)
        .withWorkerLaunchCommand(DEFAULT_WORKER_LAUNCH_CMD)
        .withWorkerResources(DEFAULT_WORKER_RESOURCES)
        .withNumberOfPs(2)
        .withPsDockerImage(DEFAULT_PS_DOCKER_IMAGE)
        .withPsLaunchCommand(DEFAULT_PS_LAUNCH_CMD)
        .withPsResources(DEFAULT_PS_RESOURCES)
        .withVerbose();
  }

  private void assertFilesAreDeleted(File... files) {
    for (File file : files) {
      assertFalse("File should be deleted: " + file.getAbsolutePath(),
          file.exists());
    }
  }

  private RemoteDirectoryManager setupSpyRemoteDirManager() {
    RemoteDirectoryManager spyRdm =
        spy(mockClientContext.getRemoteDirectoryManager());
    mockClientContext.setRemoteDirectoryMgr(spyRdm);
    return spyRdm;
  }

  private Path getStagingDir() throws IOException {
    return mockClientContext.getRemoteDirectoryManager()
        .getJobStagingArea(DEFAULT_JOB_NAME, true);
  }

  private RunJobCli createRunJobCliWithoutVerboseAssertion() {
    return new RunJobCli(mockClientContext);
  }

  private RunJobCli createRunJobCli() {
    RunJobCli runJobCli = new RunJobCli(mockClientContext);
    assertFalse(SubmarineLogs.isVerbose());
    return runJobCli;
  }

  private String getFilePath(String localUrl, Path stagingDir) {
    return stagingDir.toUri().getPath()
        + "/" + new Path(localUrl).getName();
  }

  private String getFilePathWithSuffix(Path stagingDir, String localUrl,
      String suffix) {
    return stagingDir.toUri().getPath() + "/" + new Path(localUrl).getName()
        + suffix;
  }

  private void assertConfigFile(ConfigFile expected, ConfigFile actual) {
    assertEquals("ConfigFile does not equal the expected one!", expected,
        actual);
  }

  private void assertNumberOfLocalizations(List<ConfigFile> files,
      int expected) {
    assertEquals("Number of localizations is not the expected one!", expected,
        files.size());
  }

  private void verifyRdmCopyToRemoteLocalCalls(int expectedCalls)
      throws IOException {
    verify(spyRdm, times(expectedCalls)).copyRemoteToLocal(anyString(),
        anyString());
  }

  /**
   * Basic test.
   * On the one hand, create local temp files/dirs for the HDFS URIs in the
   * local staging dir.
   * On the other hand, use the MockRemoteDirectoryManager mock
   * implementation when checking the FileStatus or existence of HDFS
   * files/dirs.
   * --localization hdfs:///user/yarn/script1.py:.
   * --localization /temp/script2.py:./
   * --localization /temp/script2.py:/opt/script.py
   */
  @Test
  public void testRunJobWithBasicLocalization() throws Exception {
    String remoteUrl = "hdfs:///user/yarn/script1.py";
    String containerLocal1 = ".";
    String localUrl = "/temp/script2.py";
    String containerLocal2 = "./";
    String containerLocal3 = "/opt/script.py";
    // Create a local file; we need to put it under the local temp dir
    File localFile1 = testCommons.getFileUtils().createFileInTempDir(localUrl);

    // Create a remote file in the local staging dir to simulate HDFS
    Path stagingDir = getStagingDir();
    testCommons.getFileUtils().createFileInDir(stagingDir, remoteUrl);

    String[] params = createCommonParamsBuilder()
        .withLocalization(remoteUrl, containerLocal1)
        .withLocalization(localFile1.getAbsolutePath(), containerLocal2)
        .withLocalization(localFile1.getAbsolutePath(), containerLocal3)
        .build();
    RunJobCli runJobCli = createRunJobCli();
    runJobCli.run(params);
    Service serviceSpec = testCommons.getServiceSpecFromJobSubmitter(
        runJobCli.getJobSubmitter());
    assertNumberOfServiceComponents(serviceSpec, 3);

    // No remote dir or HDFS file exists.
    // Ensure a download never happened.
    verifyRdmCopyToRemoteLocalCalls(0);
    // Ensure the original local files are not deleted
    assertTrue(localFile1.exists());

    List<ConfigFile> files = serviceSpec.getConfiguration().getFiles();
    assertNumberOfLocalizations(files, 3);

    ConfigFile expectedConfigFile = new ConfigFile();
    expectedConfigFile.setType(ConfigFile.TypeEnum.STATIC);
    expectedConfigFile.setSrcFile(remoteUrl);
    expectedConfigFile.setDestFile(new Path(remoteUrl).getName());
    assertConfigFile(expectedConfigFile, files.get(0));

    expectedConfigFile = new ConfigFile();
    expectedConfigFile.setType(ConfigFile.TypeEnum.STATIC);
    expectedConfigFile.setSrcFile(getFilePath(localUrl, stagingDir));
    expectedConfigFile.setDestFile(new Path(localUrl).getName());
    assertConfigFile(expectedConfigFile, files.get(1));

    expectedConfigFile = new ConfigFile();
    expectedConfigFile.setType(ConfigFile.TypeEnum.STATIC);
    expectedConfigFile.setSrcFile(getFilePath(localUrl, stagingDir));
    expectedConfigFile.setDestFile(new Path(containerLocal3).getName());
    assertConfigFile(expectedConfigFile, files.get(2));

    // Ensure the env value is correct
    String env = serviceSpec.getConfiguration().getEnv()
        .get("YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS");
    String expectedMounts = new Path(containerLocal3).getName()
        + ":" + containerLocal3 + ":rw";
    assertTrue(env.contains(expectedMounts));
  }

  private void assertNumberOfServiceComponents(Service serviceSpec,
      int expected) {
    assertEquals(expected, serviceSpec.getComponents().size());
  }

  /**
   * Non-HDFS remote URI test.
   * --localization https://a/b/1.patch:.
   * --localization s3a://a/dir:/opt/mys3dir
   */
  @Test
  public void testRunJobWithNonHDFSRemoteLocalization() throws Exception {
    String remoteUri1 = "https://a/b/1.patch";
    String containerLocal1 = ".";
    String remoteUri2 = "s3a://a/s3dir";
    String containerLocal2 = "/opt/mys3dir";

    // Create remote files in the local staging dir to simulate HDFS
    Path stagingDir = getStagingDir();
    testCommons.getFileUtils().createFileInDir(stagingDir, remoteUri1);
    File remoteDir1 =
        testCommons.getFileUtils().createDirectory(stagingDir, remoteUri2);
    testCommons.getFileUtils().createFileInDir(remoteDir1, "afile");

    String suffix1 = "_" + remoteDir1.lastModified()
        + "-" + mockClientContext.getRemoteDirectoryManager()
            .getRemoteFileSize(remoteUri2);

    String[] params = createCommonParamsBuilder()
        .withLocalization(remoteUri1, containerLocal1)
        .withLocalization(remoteUri2, containerLocal2)
        .build();
    RunJobCli runJobCli = createRunJobCli();
    runJobCli.run(params);
    Service serviceSpec = testCommons.getServiceSpecFromJobSubmitter(
        runJobCli.getJobSubmitter());
    assertNumberOfServiceComponents(serviceSpec, 3);

    // Ensure the remote resources were downloaded twice
    verifyRdmCopyToRemoteLocalCalls(2);

    // Ensure the downloaded temp files are deleted
    assertFilesAreDeleted(
        testCommons.getFileUtils().getTempFileWithName(remoteUri1),
        testCommons.getFileUtils().getTempFileWithName(remoteUri2));

    // Ensure the zip file is deleted
    assertFilesAreDeleted(
        testCommons.getFileUtils()
            .getTempFileWithName(remoteUri2 + "_" + suffix1 + ZIP_EXTENSION));

    List<ConfigFile> files = serviceSpec.getConfiguration().getFiles();
    assertNumberOfLocalizations(files, 2);

    ConfigFile expectedConfigFile = new ConfigFile();
    expectedConfigFile.setType(ConfigFile.TypeEnum.STATIC);
    expectedConfigFile.setSrcFile(getFilePath(remoteUri1, stagingDir));
    expectedConfigFile.setDestFile(new Path(remoteUri1).getName());
    assertConfigFile(expectedConfigFile, files.get(0));

    expectedConfigFile = new ConfigFile();
    expectedConfigFile.setType(ConfigFile.TypeEnum.ARCHIVE);
    expectedConfigFile.setSrcFile(
        getFilePathWithSuffix(stagingDir, remoteUri2, suffix1 + ZIP_EXTENSION));
    expectedConfigFile.setDestFile(new Path(containerLocal2).getName());
    assertConfigFile(expectedConfigFile, files.get(1));

    // Ensure the env value is correct
    String env = serviceSpec.getConfiguration().getEnv()
        .get("YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS");
    String expectedMounts = new Path(remoteUri2).getName()
        + ":" + containerLocal2 + ":rw";
    assertTrue(env.contains(expectedMounts));
  }

  /**
   * Test HDFS dir localization.
   * --localization hdfs:///user/yarn/mydir:./mydir1
   * --localization hdfs:///user/yarn/mydir2:/opt/dir2:rw
   * --localization hdfs:///user/yarn/mydir:.
   * --localization hdfs:///user/yarn/mydir2:./
   */
  @Test
  public void testRunJobWithHdfsDirLocalization() throws Exception {
    String remoteUrl = "hdfs:///user/yarn/mydir";
    String containerPath = "./mydir1";
    String remoteUrl2 = "hdfs:///user/yarn/mydir2";
    String containerPath2 = "/opt/dir2";
    String containerPath3 = ".";
    String containerPath4 = "./";

    // Create remote dirs in the local staging dir to simulate HDFS
    Path stagingDir = getStagingDir();
    File remoteDir1 =
        testCommons.getFileUtils().createDirectory(stagingDir, remoteUrl);
    testCommons.getFileUtils().createFileInDir(remoteDir1, "1.py");
    testCommons.getFileUtils().createFileInDir(remoteDir1, "2.py");

    File remoteDir2 =
        testCommons.getFileUtils().createDirectory(stagingDir, remoteUrl2);
    testCommons.getFileUtils().createFileInDir(remoteDir2, "3.py");
    testCommons.getFileUtils().createFileInDir(remoteDir2, "4.py");

    String suffix1 = "_" + remoteDir1.lastModified()
        + "-" + mockClientContext.getRemoteDirectoryManager()
            .getRemoteFileSize(remoteUrl);
    String suffix2 = "_" + remoteDir2.lastModified()
        + "-" + mockClientContext.getRemoteDirectoryManager()
            .getRemoteFileSize(remoteUrl2);

    String[] params = createCommonParamsBuilder()
        .withLocalization(remoteUrl, containerPath)
        .withLocalization(remoteUrl2, containerPath2)
        .withLocalization(remoteUrl, containerPath3)
        .withLocalization(remoteUrl2, containerPath4)
        .build();
    RunJobCli runJobCli = createRunJobCli();
    runJobCli.run(params);
    Service serviceSpec = testCommons.getServiceSpecFromJobSubmitter(
        runJobCli.getJobSubmitter());
    assertNumberOfServiceComponents(serviceSpec, 3);

    // Ensure the remote dirs were downloaded four times
    verifyRdmCopyToRemoteLocalCalls(4);

    // Ensure the downloaded temp files are deleted
    assertFilesAreDeleted(
        testCommons.getFileUtils().getTempFileWithName(remoteUrl),
        testCommons.getFileUtils().getTempFileWithName(remoteUrl2));

    // Ensure the zip files are deleted
    assertFilesAreDeleted(
        testCommons.getFileUtils()
            .getTempFileWithName(remoteUrl + suffix1 + ZIP_EXTENSION),
        testCommons.getFileUtils()
            .getTempFileWithName(remoteUrl2 + suffix2 + ZIP_EXTENSION));

    // Ensure the files will be localized
    List<ConfigFile> files = serviceSpec.getConfiguration().getFiles();
    assertNumberOfLocalizations(files, 4);

    ConfigFile expectedConfigFile = new ConfigFile();
    // The HDFS dir should be downloaded and compressed; YARN uncompresses it
    expectedConfigFile.setType(ConfigFile.TypeEnum.ARCHIVE);
    expectedConfigFile.setSrcFile(
        getFilePathWithSuffix(stagingDir, remoteUrl, suffix1 + ZIP_EXTENSION));
    // Relative path in container, but not "." or "./": use its own name
    expectedConfigFile.setDestFile(new Path(containerPath).getName());
    assertConfigFile(expectedConfigFile, files.get(0));

    expectedConfigFile = new ConfigFile();
    expectedConfigFile.setType(ConfigFile.TypeEnum.ARCHIVE);
    expectedConfigFile.setSrcFile(
        getFilePathWithSuffix(stagingDir, remoteUrl2, suffix2 + ZIP_EXTENSION));
    expectedConfigFile.setDestFile(new Path(containerPath2).getName());
    assertConfigFile(expectedConfigFile, files.get(1));

    expectedConfigFile = new ConfigFile();
    expectedConfigFile.setType(ConfigFile.TypeEnum.ARCHIVE);
    expectedConfigFile.setSrcFile(
        getFilePathWithSuffix(stagingDir, remoteUrl, suffix1 + ZIP_EXTENSION));
    // Relative path in container is ".": use the remote path name
    expectedConfigFile.setDestFile(new Path(remoteUrl).getName());
    assertConfigFile(expectedConfigFile, files.get(2));

    expectedConfigFile = new ConfigFile();
    expectedConfigFile.setType(ConfigFile.TypeEnum.ARCHIVE);
    expectedConfigFile.setSrcFile(
        getFilePathWithSuffix(stagingDir, remoteUrl2, suffix2 + ZIP_EXTENSION));
    // Relative path in container is "./": use the remote path name
    expectedConfigFile.setDestFile(new Path(remoteUrl2).getName());
    assertConfigFile(expectedConfigFile, files.get(3));

    // Ensure the mounts env value is correct: one mount string is added
    String env = serviceSpec.getConfiguration().getEnv()
        .get("YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS");

    String expectedMounts =
        new Path(containerPath2).getName() + ":" + containerPath2 + ":rw";
    assertTrue(env.contains(expectedMounts));
  }

  /**
   * Test a file/dir to be localized whose size exceeds the limit.
   * Max 10MB in the configuration; the mock remote will
   * always return a file size of 100MB.
   * This configuration fails the job for the remote URIs,
   * but doesn't impact local dirs/files.
   *
   * --localization https://a/b/1.patch:.
   * --localization s3a://a/dir:/opt/mys3dir
   * --localization /temp/script2.py:./
   */
  @Test
  public void testRunJobRemoteUriExceedLocalizationSize() throws Exception {
    String remoteUri1 = "https://a/b/1.patch";
    String containerLocal1 = ".";
    String remoteUri2 = "s3a://a/s3dir";
    String containerLocal2 = "/opt/mys3dir";
    String localUri1 = "/temp/script2";
    String containerLocal3 = "./";

    SubmarineConfiguration submarineConf = new SubmarineConfiguration();

    // Max 10MB; the mock remote will always return a file size of 100MB.
    submarineConf.set(
        SubmarineConfiguration.LOCALIZATION_MAX_ALLOWED_FILE_SIZE_MB,
        "10");
    mockClientContext.setSubmarineConfig(submarineConf);

    assertFalse(SubmarineLogs.isVerbose());

    // Create remote files in the local staging dir to simulate HDFS
    Path stagingDir = getStagingDir();
    testCommons.getFileUtils().createFileInDir(stagingDir, remoteUri1);
    File remoteDir1 =
        testCommons.getFileUtils().createDirectory(stagingDir, remoteUri2);
    testCommons.getFileUtils().createFileInDir(remoteDir1, "afile");

    // Create a local file; we need to put it under the local temp dir
    File localFile1 = testCommons.getFileUtils().createFileInTempDir(localUri1);

    try {
      RunJobCli runJobCli = createRunJobCli();
      String[] params = createCommonParamsBuilder()
          .withLocalization(remoteUri1, containerLocal1)
          .build();
      runJobCli.run(params);
    } catch (IOException e) {
      // There shouldn't be an exception: it's within the file size limit
      fail();
    }
    // One download should have happened, since nothing failed fast
    verifyRdmCopyToRemoteLocalCalls(1);
    try {
      String[] params = createCommonParamsBuilder()
          .withLocalization(remoteUri1, containerLocal1)
          .withLocalization(remoteUri2, containerLocal2)
          .withLocalization(localFile1.getAbsolutePath(), containerLocal3)
          .build();

      reset(spyRdm);
      RunJobCli runJobCli = createRunJobCliWithoutVerboseAssertion();
      runJobCli.run(params);
    } catch (IOException e) {
      assertTrue(e.getMessage()
          .contains("104857600 exceeds configured max size:10485760"));
      // We shouldn't have downloaded anything because of the fail-fast check
      verifyRdmCopyToRemoteLocalCalls(0);
    }

    try {
      String[] params = createCommonParamsBuilder()
          .withLocalization(localFile1.getAbsolutePath(), containerLocal3)
          .build();
      RunJobCli runJobCli = createRunJobCliWithoutVerboseAssertion();
      runJobCli.run(params);
    } catch (IOException e) {
      assertTrue(e.getMessage()
          .contains("104857600 exceeds configured max size:10485760"));
      // We shouldn't have downloaded anything because of the fail-fast check
      verifyRdmCopyToRemoteLocalCalls(0);
    }
  }

  /**
   * Test a remote URI which doesn't exist.
   */
  @Test
  public void testRunJobWithNonExistRemoteUri() throws Exception {
    String remoteUri1 = "hdfs:///a/b/1.patch";
    String containerLocal1 = ".";
    String localUri1 = "/a/b/c";
    String containerLocal2 = "./";

    try {
      String[] params = createCommonParamsBuilder()
          .withLocalization(remoteUri1, containerLocal1)
          .build();
      RunJobCli runJobCli = createRunJobCli();
      runJobCli.run(params);
    } catch (IOException e) {
      assertTrue(e.getMessage().contains("doesn't exists"));
    }

    try {
      String[] params = createCommonParamsBuilder()
          .withLocalization(localUri1, containerLocal2)
          .build();
      RunJobCli runJobCli = createRunJobCliWithoutVerboseAssertion();
      runJobCli.run(params);
    } catch (IOException e) {
      assertTrue(e.getMessage().contains("doesn't exists"));
    }
  }

  /**
   * Test local dir localization.
   * --localization /user/yarn/mydir:./mydir1
   * --localization /user/yarn/mydir2:/opt/dir2:rw
   * --localization /user/yarn/mydir2:.
   */
  @Test
  public void testRunJobWithLocalDirLocalization() throws Exception {
    String localUrl = "/user/yarn/mydir";
    String containerPath = "./mydir1";
    String localUrl2 = "/user/yarn/mydir2";
    String containerPath2 = "/opt/dir2";
    String containerPath3 = ".";

    // Create local dirs with files
    File localDir1 = testCommons.getFileUtils().createDirInTempDir(localUrl);
    testCommons.getFileUtils().createFileInDir(localDir1, "1.py");
    testCommons.getFileUtils().createFileInDir(localDir1, "2.py");

    File localDir2 = testCommons.getFileUtils().createDirInTempDir(localUrl2);
    testCommons.getFileUtils().createFileInDir(localDir2, "3.py");
    testCommons.getFileUtils().createFileInDir(localDir2, "4.py");

    String suffix1 = "_" + localDir1.lastModified()
        + "-" + localDir1.length();
    String suffix2 = "_" + localDir2.lastModified()
        + "-" + localDir2.length();

    String[] params = createCommonParamsBuilder()
        .withLocalization(localDir1.getAbsolutePath(), containerPath)
        .withLocalization(localDir2.getAbsolutePath(), containerPath2)
        .withLocalization(localDir2.getAbsolutePath(), containerPath3)
        .build();
    RunJobCli runJobCli = createRunJobCli();
    runJobCli.run(params);

    Service serviceSpec = testCommons.getServiceSpecFromJobSubmitter(
        runJobCli.getJobSubmitter());
    assertNumberOfServiceComponents(serviceSpec, 3);

    // We shouldn't download anything
    verifyRdmCopyToRemoteLocalCalls(0);

    // Ensure the original local dirs are not deleted
    assertTrue(localDir1.exists());
    assertTrue(localDir2.exists());

    // Ensure the zip files are deleted
    assertFalse(
        testCommons.getFileUtils()
            .getTempFileWithName(localUrl + suffix1 + ZIP_EXTENSION)
            .exists());
    assertFalse(
        testCommons.getFileUtils()
            .getTempFileWithName(localUrl2 + suffix2 + ZIP_EXTENSION)
            .exists());

    // Ensure the dirs will be zipped and localized
    List<ConfigFile> files = serviceSpec.getConfiguration().getFiles();
    assertNumberOfLocalizations(files, 3);

    Path stagingDir = getStagingDir();
    ConfigFile expectedConfigFile = new ConfigFile();
    expectedConfigFile.setType(ConfigFile.TypeEnum.ARCHIVE);
    expectedConfigFile.setSrcFile(
        getFilePathWithSuffix(stagingDir, localUrl, suffix1 + ZIP_EXTENSION));
    expectedConfigFile.setDestFile(new Path(containerPath).getName());
    assertConfigFile(expectedConfigFile, files.get(0));

    expectedConfigFile = new ConfigFile();
    expectedConfigFile.setType(ConfigFile.TypeEnum.ARCHIVE);
    expectedConfigFile.setSrcFile(
        getFilePathWithSuffix(stagingDir, localUrl2, suffix2 + ZIP_EXTENSION));
    expectedConfigFile.setDestFile(new Path(containerPath2).getName());
    assertConfigFile(expectedConfigFile, files.get(1));

    expectedConfigFile = new ConfigFile();
    expectedConfigFile.setType(ConfigFile.TypeEnum.ARCHIVE);
    expectedConfigFile.setSrcFile(
        getFilePathWithSuffix(stagingDir, localUrl2, suffix2 + ZIP_EXTENSION));
    expectedConfigFile.setDestFile(new Path(localUrl2).getName());
    assertConfigFile(expectedConfigFile, files.get(2));

    // Ensure the mounts env value is correct
    String env = serviceSpec.getConfiguration().getEnv()
        .get("YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS");
    String expectedMounts = new Path(containerPath2).getName()
        + ":" + containerPath2 + ":rw";

    assertTrue(env.contains(expectedMounts));
  }
}
@ -0,0 +1,95 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;

import org.apache.hadoop.yarn.service.api.records.Component;
import org.apache.hadoop.yarn.service.api.records.Service;
import org.junit.Test;

import java.io.IOException;

import static org.junit.Assert.assertEquals;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;

/**
 * Class to test the {@link ServiceWrapper}.
 */
public class TestServiceWrapper {
  private AbstractComponent createMockAbstractComponent(Component mockComponent,
      String componentName, String localScriptFile) throws IOException {
    when(mockComponent.getName()).thenReturn(componentName);

    AbstractComponent mockAbstractComponent = mock(AbstractComponent.class);
    when(mockAbstractComponent.createComponent()).thenReturn(mockComponent);
    when(mockAbstractComponent.getLocalScriptFile())
        .thenReturn(localScriptFile);
    return mockAbstractComponent;
  }

  @Test
  public void testWithSingleComponent() throws IOException {
    Service mockService = mock(Service.class);
    ServiceWrapper serviceWrapper = new ServiceWrapper(mockService);

    Component mockComponent = mock(Component.class);
    AbstractComponent mockAbstractComponent =
        createMockAbstractComponent(mockComponent, "testComponent",
            "testLocalScriptFile");
    serviceWrapper.addComponent(mockAbstractComponent);

    verify(mockService).addComponent(eq(mockComponent));

    String launchCommand =
        serviceWrapper.getLocalLaunchCommandPathForComponent("testComponent");
    assertEquals("testLocalScriptFile", launchCommand);
  }

  @Test
  public void testWithMultipleComponent() throws IOException {
    Service mockService = mock(Service.class);
    ServiceWrapper serviceWrapper = new ServiceWrapper(mockService);

    Component mockComponent1 = mock(Component.class);
    AbstractComponent mockAbstractComponent1 =
        createMockAbstractComponent(mockComponent1, "testComponent1",
            "testLocalScriptFile1");

    Component mockComponent2 = mock(Component.class);
    AbstractComponent mockAbstractComponent2 =
        createMockAbstractComponent(mockComponent2, "testComponent2",
            "testLocalScriptFile2");

    serviceWrapper.addComponent(mockAbstractComponent1);
    serviceWrapper.addComponent(mockAbstractComponent2);

    verify(mockService).addComponent(eq(mockComponent1));
    verify(mockService).addComponent(eq(mockComponent2));

    String launchCommand1 =
        serviceWrapper.getLocalLaunchCommandPathForComponent("testComponent1");
    assertEquals("testLocalScriptFile1", launchCommand1);

    String launchCommand2 =
        serviceWrapper.getLocalLaunchCommandPathForComponent("testComponent2");
    assertEquals("testLocalScriptFile2", launchCommand2);
  }
}
@ -14,26 +14,30 @@
 package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;

+import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons;
 import org.codehaus.jettison.json.JSONException;
 import org.junit.Assert;
 import org.junit.Test;

+/**
+ * Class to test some functionality of {@link TensorFlowCommons}.
+ */
 public class TestTFConfigGenerator {
   @Test
   public void testSimpleDistributedTFConfigGenerator() throws JSONException {
-    String json = YarnServiceUtils.getTFConfigEnv("worker", 5, 3, "wtan",
+    String json = TensorFlowCommons.getTFConfigEnv("worker", 5, 3, "wtan",
         "tf-job-001", "example.com");
     String expected =
         "{\\\"cluster\\\":{\\\"master\\\":[\\\"master-0.wtan.tf-job-001.example.com:8000\\\"],\\\"worker\\\":[\\\"worker-0.wtan.tf-job-001.example.com:8000\\\",\\\"worker-1.wtan.tf-job-001.example.com:8000\\\",\\\"worker-2.wtan.tf-job-001.example.com:8000\\\",\\\"worker-3.wtan.tf-job-001.example.com:8000\\\"],\\\"ps\\\":[\\\"ps-0.wtan.tf-job-001.example.com:8000\\\",\\\"ps-1.wtan.tf-job-001.example.com:8000\\\",\\\"ps-2.wtan.tf-job-001.example.com:8000\\\"]},\\\"task\\\":{ \\\"type\\\":\\\"worker\\\", \\\"index\\\":$_TASK_INDEX},\\\"environment\\\":\\\"cloud\\\"}";
     Assert.assertEquals(expected, json);

-    json = YarnServiceUtils.getTFConfigEnv("ps", 5, 3, "wtan", "tf-job-001",
+    json = TensorFlowCommons.getTFConfigEnv("ps", 5, 3, "wtan", "tf-job-001",
         "example.com");
     expected =
         "{\\\"cluster\\\":{\\\"master\\\":[\\\"master-0.wtan.tf-job-001.example.com:8000\\\"],\\\"worker\\\":[\\\"worker-0.wtan.tf-job-001.example.com:8000\\\",\\\"worker-1.wtan.tf-job-001.example.com:8000\\\",\\\"worker-2.wtan.tf-job-001.example.com:8000\\\",\\\"worker-3.wtan.tf-job-001.example.com:8000\\\"],\\\"ps\\\":[\\\"ps-0.wtan.tf-job-001.example.com:8000\\\",\\\"ps-1.wtan.tf-job-001.example.com:8000\\\",\\\"ps-2.wtan.tf-job-001.example.com:8000\\\"]},\\\"task\\\":{ \\\"type\\\":\\\"ps\\\", \\\"index\\\":$_TASK_INDEX},\\\"environment\\\":\\\"cloud\\\"}";
     Assert.assertEquals(expected, json);

-    json = YarnServiceUtils.getTFConfigEnv("master", 2, 1, "wtan", "tf-job-001",
+    json = TensorFlowCommons.getTFConfigEnv("master", 2, 1, "wtan", "tf-job-001",
         "example.com");
     expected =
         "{\\\"cluster\\\":{\\\"master\\\":[\\\"master-0.wtan.tf-job-001.example.com:8000\\\"],\\\"worker\\\":[\\\"worker-0.wtan.tf-job-001.example.com:8000\\\"],\\\"ps\\\":[\\\"ps-0.wtan.tf-job-001.example.com:8000\\\"]},\\\"task\\\":{ \\\"type\\\":\\\"master\\\", \\\"index\\\":$_TASK_INDEX},\\\"environment\\\":\\\"cloud\\\"}";
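The expected strings above carry shell-level escaping, which makes the assertions hard to read. Stripped of the backslashes, the TF_CONFIG generated for the first worker case corresponds to JSON of this shape (reconstructed here purely for readability; $_TASK_INDEX is substituted at container launch time and is not valid JSON):

{
  "cluster": {
    "master": ["master-0.wtan.tf-job-001.example.com:8000"],
    "worker": ["worker-0.wtan.tf-job-001.example.com:8000",
               "worker-1.wtan.tf-job-001.example.com:8000",
               "worker-2.wtan.tf-job-001.example.com:8000",
               "worker-3.wtan.tf-job-001.example.com:8000"],
    "ps": ["ps-0.wtan.tf-job-001.example.com:8000",
           "ps-1.wtan.tf-job-001.example.com:8000",
           "ps-2.wtan.tf-job-001.example.com:8000"]
  },
  "task": { "type": "worker", "index": $_TASK_INDEX },
  "environment": "cloud"
}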
@ -0,0 +1,190 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command;
|
||||||
|
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.MockClientContext;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command.TensorBoardLaunchCommand;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command.TensorFlowPsLaunchCommand;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command.TensorFlowWorkerLaunchCommand;
|
||||||
|
import org.junit.Rule;
|
||||||
|
import org.junit.rules.ExpectedException;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import static junit.framework.TestCase.assertEquals;
|
||||||
|
import static org.junit.Assert.assertFalse;
|
||||||
|
import static org.junit.Assert.assertNotNull;
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
|
import static org.junit.Assert.fail;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class is an abstract base class for testing Tensorboard and TensorFlow
|
||||||
|
* launch commands.
|
||||||
|
*/
|
||||||
|
public abstract class AbstractLaunchCommandTestHelper {
|
||||||
|
private TaskType taskType;
|
||||||
|
private boolean useTaskTypeOverride;
|
||||||
|
|
||||||
|
@Rule
|
||||||
|
public ExpectedException expectedException = ExpectedException.none();
|
||||||
|
|
||||||
|
private void assertScriptContainsExportedEnvVar(List<String> fileContents,
|
||||||
|
String varName) {
|
||||||
|
String expected = String.format("export %s=", varName);
|
||||||
|
assertScriptContainsLine(fileContents, expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void assertScriptContainsExportedEnvVarWithValue(
|
||||||
|
List<String> fileContents, String varName, String value) {
|
||||||
|
String expected = String.format("export %s=%s", varName, value);
|
||||||
|
assertScriptContainsLine(fileContents, expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void assertScriptContainsLine(List<String> fileContents,
|
||||||
|
String expected) {
|
||||||
|
String message = String.format(
|
||||||
|
"File does not contain expected line '%s'!" + " File contents: %s",
|
||||||
|
expected, Arrays.toString(fileContents.toArray()));
|
||||||
|
assertTrue(message, fileContents.contains(expected));
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void assertScriptContainsLineWithRegex(
|
||||||
|
List<String> fileContents,
|
||||||
|
String regex) {
|
||||||
|
String message = String.format(
|
||||||
|
"File does not contain expected line '%s'!" + " File contents: %s",
|
||||||
|
regex, Arrays.toString(fileContents.toArray()));
|
||||||
|
|
||||||
|
for (String line : fileContents) {
|
||||||
|
if (line.matches(regex)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fail(message);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void assertScriptDoesNotContainLine(List<String> fileContents,
|
||||||
|
String expected) {
|
||||||
|
String message = String.format(
|
||||||
|
"File contains unexpected line '%s'!" + " File contents: %s",
|
||||||
|
expected, Arrays.toString(fileContents.toArray()));
|
||||||
|
assertFalse(message, fileContents.contains(expected));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private AbstractLaunchCommand createLaunchCommandByTaskType(TaskType taskType,
|
||||||
|
RunJobParameters params) throws IOException {
|
||||||
|
MockClientContext mockClientContext = new MockClientContext();
|
||||||
|
FileSystemOperations fsOperations =
|
||||||
|
new FileSystemOperations(mockClientContext);
|
||||||
|
HadoopEnvironmentSetup hadoopEnvSetup =
|
||||||
|
new HadoopEnvironmentSetup(mockClientContext, fsOperations);
|
||||||
|
Component component = new Component();
|
||||||
|
Configuration yarnConfig = new Configuration();
|
    return createLaunchCommandByTaskTypeInternal(taskType, params,
        hadoopEnvSetup, component, yarnConfig);
  }

  private AbstractLaunchCommand createLaunchCommandByTaskTypeInternal(
      TaskType taskType, RunJobParameters params,
      HadoopEnvironmentSetup hadoopEnvSetup, Component component,
      Configuration yarnConfig)
      throws IOException {
    if (taskType == TaskType.TENSORBOARD) {
      return new TensorBoardLaunchCommand(
          hadoopEnvSetup, getTaskType(taskType), component, params);
    } else if (taskType == TaskType.WORKER
        || taskType == TaskType.PRIMARY_WORKER) {
      return new TensorFlowWorkerLaunchCommand(
          hadoopEnvSetup, getTaskType(taskType), component, params,
          yarnConfig);
    } else if (taskType == TaskType.PS) {
      return new TensorFlowPsLaunchCommand(
          hadoopEnvSetup, getTaskType(taskType), component, params,
          yarnConfig);
    }
    throw new IllegalStateException("Unknown taskType!");
  }

  public void overrideTaskType(TaskType taskType) {
    this.taskType = taskType;
    this.useTaskTypeOverride = true;
  }

  private TaskType getTaskType(TaskType taskType) {
    if (useTaskTypeOverride) {
      return this.taskType;
    }
    return taskType;
  }

  public void testHdfsRelatedEnvironmentIsUndefined(TaskType taskType,
      RunJobParameters params) throws IOException {
    AbstractLaunchCommand launchCommand =
        createLaunchCommandByTaskType(taskType, params);

    expectedException.expect(IOException.class);
    expectedException
        .expectMessage("Failed to detect HDFS-related environments.");
    launchCommand.generateLaunchScript();
  }

  public List<String> testHdfsRelatedEnvironmentIsDefined(TaskType taskType,
      RunJobParameters params) throws IOException {
    AbstractLaunchCommand launchCommand =
        createLaunchCommandByTaskType(taskType, params);

    String result = launchCommand.generateLaunchScript();
    assertNotNull(result);
    File resultFile = new File(result);
    assertTrue(resultFile.exists());

    List<String> fileContents = Files.readAllLines(
        Paths.get(resultFile.toURI()),
        Charset.forName("UTF-8"));

    assertEquals("#!/bin/bash", fileContents.get(0));
    assertScriptContainsExportedEnvVar(fileContents, "HADOOP_HOME");
    assertScriptContainsExportedEnvVar(fileContents, "HADOOP_YARN_HOME");
    assertScriptContainsExportedEnvVarWithValue(fileContents,
        "HADOOP_HDFS_HOME", "testHdfsHome");
    assertScriptContainsExportedEnvVarWithValue(fileContents,
        "HADOOP_COMMON_HOME", "testHdfsHome");
    assertScriptContainsExportedEnvVarWithValue(fileContents,
        "HADOOP_CONF_DIR", "$WORK_DIR");
    assertScriptContainsExportedEnvVarWithValue(fileContents, "JAVA_HOME",
        "testJavaHome");
    assertScriptContainsExportedEnvVarWithValue(fileContents,
        "LD_LIBRARY_PATH", "$LD_LIBRARY_PATH:$JAVA_HOME/lib/amd64/server");
    assertScriptContainsExportedEnvVarWithValue(fileContents, "CLASSPATH",
        "`$HADOOP_HDFS_HOME/bin/hadoop classpath --glob`");

    return fileContents;
  }

}
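Reviewer note: the overrideTaskType() hook above exists so that a subclass test can force an otherwise unreachable task type, including null, into the launch-command constructors. A minimal, self-contained sketch of the same pattern, with hypothetical names rather than the patch's classes:

public class TaskTypeOverrideSketch {
  private String taskType;
  private boolean useTaskTypeOverride = false;

  // Arm the override: every later lookup returns this value instead.
  public void overrideTaskType(String taskType) {
    this.taskType = taskType;
    this.useTaskTypeOverride = true;
  }

  // Resolve the effective task type; the override wins once armed.
  private String getTaskType(String requested) {
    return useTaskTypeOverride ? this.taskType : requested;
  }

  public static void main(String[] args) {
    TaskTypeOverrideSketch sketch = new TaskTypeOverrideSketch();
    System.out.println(sketch.getTaskType("WORKER")); // prints WORKER
    sketch.overrideTaskType(null);
    System.out.println(sketch.getTaskType("WORKER")); // prints null
  }
}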
@@ -0,0 +1,97 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.service.api.records.Component;
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command.TensorBoardLaunchCommand;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command.TensorFlowPsLaunchCommand;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command.TensorFlowWorkerLaunchCommand;
import org.junit.Test;

import java.io.IOException;

import static org.junit.Assert.assertTrue;
import static org.mockito.Mockito.mock;

/**
 * This class is to test the {@link LaunchCommandFactory}.
 */
public class TestLaunchCommandFactory {

  private LaunchCommandFactory createLaunchCommandFactory(
      RunJobParameters parameters) {
    HadoopEnvironmentSetup hadoopEnvSetup = mock(HadoopEnvironmentSetup.class);
    Configuration configuration = mock(Configuration.class);
    return new LaunchCommandFactory(hadoopEnvSetup, parameters, configuration);
  }

  @Test
  public void createLaunchCommandWorkerAndPrimaryWorker() throws IOException {
    RunJobParameters parameters = new RunJobParameters();
    parameters.setWorkerLaunchCmd("testWorkerLaunchCommand");
    LaunchCommandFactory launchCommandFactory =
        createLaunchCommandFactory(parameters);
    Component mockComponent = mock(Component.class);

    AbstractLaunchCommand launchCommand =
        launchCommandFactory.createLaunchCommand(TaskType.PRIMARY_WORKER,
            mockComponent);
    assertTrue(launchCommand instanceof TensorFlowWorkerLaunchCommand);

    launchCommand =
        launchCommandFactory.createLaunchCommand(TaskType.WORKER,
            mockComponent);
    assertTrue(launchCommand instanceof TensorFlowWorkerLaunchCommand);
  }

  @Test
  public void createLaunchCommandPs() throws IOException {
    RunJobParameters parameters = new RunJobParameters();
    parameters.setPSLaunchCmd("testPSLaunchCommand");
    LaunchCommandFactory launchCommandFactory =
        createLaunchCommandFactory(parameters);
    Component mockComponent = mock(Component.class);

    AbstractLaunchCommand launchCommand =
        launchCommandFactory.createLaunchCommand(TaskType.PS,
            mockComponent);
    assertTrue(launchCommand instanceof TensorFlowPsLaunchCommand);
  }

  @Test
  public void createLaunchCommandTensorboard() throws IOException {
    RunJobParameters parameters = new RunJobParameters();
    parameters.setCheckpointPath("testCheckpointPath");
    LaunchCommandFactory launchCommandFactory =
        createLaunchCommandFactory(parameters);
    Component mockComponent = mock(Component.class);

    AbstractLaunchCommand launchCommand =
        launchCommandFactory.createLaunchCommand(TaskType.TENSORBOARD,
            mockComponent);
    assertTrue(launchCommand instanceof TensorBoardLaunchCommand);
  }

}
@@ -0,0 +1,104 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command;

import com.google.common.collect.ImmutableList;
import org.apache.hadoop.yarn.service.api.records.Component;
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
import org.apache.hadoop.yarn.submarine.common.MockClientContext;
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.AbstractLaunchCommandTestHelper;
import org.junit.Test;

import java.io.IOException;
import java.util.List;

import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup.DOCKER_HADOOP_HDFS_HOME;
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup.DOCKER_JAVA_HOME;

/**
 * This class is to test the {@link TensorBoardLaunchCommand}.
 */
public class TestTensorBoardLaunchCommand extends
    AbstractLaunchCommandTestHelper {

  @Test
  public void testHdfsRelatedEnvironmentIsUndefined() throws IOException {
    RunJobParameters params = new RunJobParameters();
    params.setInputPath("hdfs://bla");
    params.setName("testJobname");
    params.setCheckpointPath("something");

    testHdfsRelatedEnvironmentIsUndefined(TaskType.TENSORBOARD,
        params);
  }

  @Test
  public void testHdfsRelatedEnvironmentIsDefined() throws IOException {
    RunJobParameters params = new RunJobParameters();
    params.setName("testName");
    params.setCheckpointPath("testCheckpointPath");
    params.setInputPath("hdfs://bla");
    params.setEnvars(ImmutableList.of(
        DOCKER_HADOOP_HDFS_HOME + "=" + "testHdfsHome",
        DOCKER_JAVA_HOME + "=" + "testJavaHome"));

    List<String> fileContents =
        testHdfsRelatedEnvironmentIsDefined(TaskType.TENSORBOARD,
            params);
    assertScriptContainsExportedEnvVarWithValue(fileContents, "LC_ALL",
        "C && tensorboard --logdir=testCheckpointPath");
  }

  @Test
  public void testCheckpointPathUndefined() throws IOException {
    MockClientContext mockClientContext = new MockClientContext();
    FileSystemOperations fsOperations =
        new FileSystemOperations(mockClientContext);
    HadoopEnvironmentSetup hadoopEnvSetup =
        new HadoopEnvironmentSetup(mockClientContext, fsOperations);

    Component component = new Component();
    RunJobParameters params = new RunJobParameters();
    params.setCheckpointPath(null);

    expectedException.expect(NullPointerException.class);
    expectedException.expectMessage("CheckpointPath must not be null");
    new TensorBoardLaunchCommand(hadoopEnvSetup, TaskType.TENSORBOARD,
        component, params);
  }

  @Test
  public void testCheckpointPathEmptyString() throws IOException {
    MockClientContext mockClientContext = new MockClientContext();
    FileSystemOperations fsOperations =
        new FileSystemOperations(mockClientContext);
    HadoopEnvironmentSetup hadoopEnvSetup =
        new HadoopEnvironmentSetup(mockClientContext, fsOperations);

    Component component = new Component();
    RunJobParameters params = new RunJobParameters();
    params.setCheckpointPath("");

    expectedException.expect(IllegalArgumentException.class);
    expectedException.expectMessage("CheckpointPath must not be empty");
    new TensorBoardLaunchCommand(hadoopEnvSetup, TaskType.TENSORBOARD,
        component, params);
  }
}
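Reviewer note: the LC_ALL assertion above looks odd at first glance; it works because the generated launch script chains the TensorBoard start onto the locale export as one shell line. A sketch of the line being matched, reconstructed from the asserted value (the exact script layout is an assumption):

public class TensorBoardLineSketch {
  // Builds the single chained line the test matches: the "env var"
  // LC_ALL carries the value "C && tensorboard --logdir=<checkpoint>".
  static String tensorBoardLine(String checkpointPath) {
    return "export LC_ALL=C && tensorboard --logdir=" + checkpointPath;
  }

  public static void main(String[] args) {
    // prints: export LC_ALL=C && tensorboard --logdir=testCheckpointPath
    System.out.println(tensorBoardLine("testCheckpointPath"));
  }
}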
@@ -0,0 +1,251 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.command;

import com.google.common.collect.ImmutableList;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.service.api.records.Component;
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
import org.apache.hadoop.yarn.submarine.common.MockClientContext;
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.AbstractLaunchCommandTestHelper;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup.DOCKER_HADOOP_HDFS_HOME;
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.HadoopEnvironmentSetup.DOCKER_JAVA_HOME;

/**
 * This class is to test the implementors of {@link TensorFlowLaunchCommand}.
 */
@RunWith(Parameterized.class)
public class TestTensorFlowLaunchCommand
    extends AbstractLaunchCommandTestHelper {
  private TaskType taskType;

  @Parameterized.Parameters
  public static Collection<Object[]> data() {
    Collection<Object[]> params = new ArrayList<>();
    params.add(new Object[]{TaskType.WORKER});
    params.add(new Object[]{TaskType.PS});
    return params;
  }

  public TestTensorFlowLaunchCommand(TaskType taskType) {
    this.taskType = taskType;
  }

  private void assertScriptContainsLaunchCommand(List<String> fileContents,
      RunJobParameters params) {
    String launchCommand = null;
    if (taskType == TaskType.WORKER) {
      launchCommand = params.getWorkerLaunchCmd();
    } else if (taskType == TaskType.PS) {
      launchCommand = params.getPSLaunchCmd();
    }
    assertScriptContainsLine(fileContents, launchCommand);
  }

  private void setLaunchCommandToParams(RunJobParameters params) {
    if (taskType == TaskType.WORKER) {
      params.setWorkerLaunchCmd("testWorkerLaunchCommand");
    } else if (taskType == TaskType.PS) {
      params.setPSLaunchCmd("testPsLaunchCommand");
    }
  }

  private void setLaunchCommandToParams(RunJobParameters params,
      String value) {
    if (taskType == TaskType.WORKER) {
      params.setWorkerLaunchCmd(value);
    } else if (taskType == TaskType.PS) {
      params.setPSLaunchCmd(value);
    }
  }

  private void assertTypeInJson(List<String> fileContents) {
    String expectedType = null;
    if (taskType == TaskType.WORKER) {
      expectedType = "worker";
    } else if (taskType == TaskType.PS) {
      expectedType = "ps";
    }
    assertScriptContainsLineWithRegex(fileContents,
        String.format(".*type.*:.*%s.*", expectedType));
  }

  private TensorFlowLaunchCommand createTensorFlowLaunchCommandObject(
      HadoopEnvironmentSetup hadoopEnvSetup, Configuration yarnConfig,
      Component component, RunJobParameters params) throws IOException {
    if (taskType == TaskType.WORKER) {
      return new TensorFlowWorkerLaunchCommand(hadoopEnvSetup, taskType,
          component, params, yarnConfig);
    } else if (taskType == TaskType.PS) {
      return new TensorFlowPsLaunchCommand(hadoopEnvSetup, taskType,
          component, params, yarnConfig);
    }
    throw new IllegalStateException("Unknown taskType!");
  }

  @Test
  public void testHdfsRelatedEnvironmentIsUndefined() throws IOException {
    RunJobParameters params = new RunJobParameters();
    params.setInputPath("hdfs://bla");
    params.setName("testJobname");
    setLaunchCommandToParams(params);

    testHdfsRelatedEnvironmentIsUndefined(taskType, params);
  }

  @Test
  public void testHdfsRelatedEnvironmentIsDefined() throws IOException {
    RunJobParameters params = new RunJobParameters();
    params.setName("testName");
    params.setInputPath("hdfs://bla");
    params.setEnvars(ImmutableList.of(
        DOCKER_HADOOP_HDFS_HOME + "=" + "testHdfsHome",
        DOCKER_JAVA_HOME + "=" + "testJavaHome"));
    setLaunchCommandToParams(params);

    List<String> fileContents =
        testHdfsRelatedEnvironmentIsDefined(taskType, params);
    assertScriptContainsLaunchCommand(fileContents, params);
    assertScriptDoesNotContainLine(fileContents, "export TF_CONFIG=");
  }

  @Test
  public void testLaunchCommandIsNull() throws IOException {
    MockClientContext mockClientContext = new MockClientContext();
    FileSystemOperations fsOperations =
        new FileSystemOperations(mockClientContext);
    HadoopEnvironmentSetup hadoopEnvSetup =
        new HadoopEnvironmentSetup(mockClientContext, fsOperations);
    Configuration yarnConfig = new Configuration();

    Component component = new Component();
    RunJobParameters params = new RunJobParameters();
    params.setName("testName");
    setLaunchCommandToParams(params, null);

    expectedException.expect(IllegalArgumentException.class);
    expectedException.expectMessage("LaunchCommand must not be null or empty");
    TensorFlowLaunchCommand launchCommand =
        createTensorFlowLaunchCommandObject(hadoopEnvSetup, yarnConfig,
            component, params);
    launchCommand.generateLaunchScript();
  }

  @Test
  public void testLaunchCommandIsEmpty() throws IOException {
    MockClientContext mockClientContext = new MockClientContext();
    FileSystemOperations fsOperations =
        new FileSystemOperations(mockClientContext);
    HadoopEnvironmentSetup hadoopEnvSetup =
        new HadoopEnvironmentSetup(mockClientContext, fsOperations);
    Configuration yarnConfig = new Configuration();

    Component component = new Component();
    RunJobParameters params = new RunJobParameters();
    params.setName("testName");
    setLaunchCommandToParams(params, "");

    expectedException.expect(IllegalArgumentException.class);
    expectedException.expectMessage("LaunchCommand must not be null or empty");
    TensorFlowLaunchCommand launchCommand =
        createTensorFlowLaunchCommandObject(hadoopEnvSetup, yarnConfig,
            component, params);
    launchCommand.generateLaunchScript();
  }

  @Test
  public void testDistributedTrainingMissingTaskType() throws IOException {
    overrideTaskType(null);

    RunJobParameters params = new RunJobParameters();
    params.setDistributed(true);
    params.setName("testName");
    params.setInputPath("hdfs://bla");
    params.setEnvars(ImmutableList.of(
        DOCKER_HADOOP_HDFS_HOME + "=" + "testHdfsHome",
        DOCKER_JAVA_HOME + "=" + "testJavaHome"));
    setLaunchCommandToParams(params);

    expectedException.expect(NullPointerException.class);
    expectedException.expectMessage("TaskType must not be null");
    testHdfsRelatedEnvironmentIsDefined(taskType, params);
  }

  @Test
  public void testDistributedTrainingNumberOfWorkersAndPsIsZero()
      throws IOException {
    RunJobParameters params = new RunJobParameters();
    params.setDistributed(true);
    params.setNumWorkers(0);
    params.setNumPS(0);
    params.setName("testName");
    params.setInputPath("hdfs://bla");
    params.setEnvars(ImmutableList.of(
        DOCKER_HADOOP_HDFS_HOME + "=" + "testHdfsHome",
        DOCKER_JAVA_HOME + "=" + "testJavaHome"));
    setLaunchCommandToParams(params);

    List<String> fileContents =
        testHdfsRelatedEnvironmentIsDefined(taskType, params);

    assertScriptDoesNotContainLine(fileContents, "export TF_CONFIG=");
    assertScriptContainsLineWithRegex(fileContents, ".*worker.*:\\[\\].*");
    assertScriptContainsLineWithRegex(fileContents, ".*ps.*:\\[\\].*");
    assertTypeInJson(fileContents);
  }

  @Test
  public void testDistributedTrainingNumberOfWorkersAndPsIsNonZero()
      throws IOException {
    RunJobParameters params = new RunJobParameters();
    params.setDistributed(true);
    params.setNumWorkers(3);
    params.setNumPS(2);
    params.setName("testName");
    params.setInputPath("hdfs://bla");
    params.setEnvars(ImmutableList.of(
        DOCKER_HADOOP_HDFS_HOME + "=" + "testHdfsHome",
        DOCKER_JAVA_HOME + "=" + "testJavaHome"));
    setLaunchCommandToParams(params);

    List<String> fileContents =
        testHdfsRelatedEnvironmentIsDefined(taskType, params);

    // assert we have multiple PS and workers
    assertScriptDoesNotContainLine(fileContents, "export TF_CONFIG=");
    assertScriptContainsLineWithRegex(fileContents, ".*worker.*:\\[.*,.*\\].*");
    assertScriptContainsLineWithRegex(fileContents, ".*ps.*:\\[.*,.*\\].*");
    assertTypeInJson(fileContents);
  }

}
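Reviewer note: the regex assertions above (the worker/ps arrays and the type field) probe the TF_CONFIG cluster spec that TensorFlow's distributed runtime reads. A sketch of a plausible TF_CONFIG value, following TensorFlow's documented shape; the host names are made up for illustration:

public class TfConfigSketch {
  public static void main(String[] args) {
    // Shape follows TensorFlow's documented TF_CONFIG spec; the test's
    // regexes match the "worker"/"ps" arrays and the task "type" below.
    String tfConfig = "{"
        + "\"cluster\":{"
        + "\"worker\":[\"worker-0:8000\",\"worker-1:8000\"],"
        + "\"ps\":[\"ps-0:8000\"]},"
        + "\"task\":{\"type\":\"worker\",\"index\":0}}";
    System.out.println(tfConfig);
    // A zero-worker job renders "worker":[] instead, which is exactly
    // what the .*worker.*:\[\].* assertion accepts.
  }
}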
@@ -0,0 +1,90 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.component;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.service.api.ServiceApiConstants;
import org.apache.hadoop.yarn.service.api.records.Component;
import org.apache.hadoop.yarn.submarine.common.Envs;
import org.apache.hadoop.yarn.submarine.common.MockClientContext;
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.AbstractLaunchCommand;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.LaunchCommandFactory;

import java.io.IOException;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

/**
 * This class has some helper methods and fields
 * to make testing TensorFlow-related components easier.
 */
public class ComponentTestCommons {
  String userName;
  TaskType taskType;
  LaunchCommandFactory mockLaunchCommandFactory;
  FileSystemOperations fsOperations;
  MockClientContext mockClientContext;
  Configuration yarnConfig;
  Resource resource;

  ComponentTestCommons(TaskType taskType) {
    this.taskType = taskType;
  }

  public void setup() throws IOException {
    this.userName = System.getProperty("user.name");
    this.resource = Resource.newInstance(4000, 10);
    setupDependencies();
  }

  private void setupDependencies() throws IOException {
    fsOperations = mock(FileSystemOperations.class);
    mockClientContext = new MockClientContext();
    mockLaunchCommandFactory = mock(LaunchCommandFactory.class);

    AbstractLaunchCommand mockLaunchCommand = mock(AbstractLaunchCommand.class);
    when(mockLaunchCommand.generateLaunchScript()).thenReturn("mockScript");
    when(mockLaunchCommandFactory.createLaunchCommand(eq(taskType),
        any(Component.class))).thenReturn(mockLaunchCommand);

    yarnConfig = new Configuration();
  }

  void verifyCommonConfigEnvs(Component component) {
    assertNotNull(component.getConfiguration().getEnv());
    assertEquals(2, component.getConfiguration().getEnv().size());
    assertEquals(ServiceApiConstants.COMPONENT_ID,
        component.getConfiguration().getEnv().get(Envs.TASK_INDEX_ENV));
    assertEquals(taskType.name(),
        component.getConfiguration().getEnv().get(Envs.TASK_TYPE_ENV));
  }

  void verifyResources(Component component) {
    assertNotNull(component.getResource());
    assertEquals(10, (int) component.getResource().getCpus());
    assertEquals(4000,
        (int) Integer.valueOf(component.getResource().getMemory()));
  }
}
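Reviewer note: the stubbing in setupDependencies() above combines Mockito's eq()/any() argument matchers with the interaction verification used by the component tests below. A minimal, self-contained sketch of that style against a hypothetical collaborator (the Mockito calls themselves are the real API):

import static org.mockito.ArgumentMatchers.anyInt;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;

public class MatcherVerifySketch {
  // Hypothetical collaborator, standing in for LaunchCommandFactory.
  interface Factory {
    String create(int kind);
  }

  public static void main(String[] args) {
    Factory factory = mock(Factory.class);
    // anyInt() matches every argument, like any(Component.class) above.
    when(factory.create(anyInt())).thenReturn("mockScript");

    String script = factory.create(42);
    System.out.println(script); // prints mockScript

    // verify() asserts the interaction happened exactly once.
    verify(factory).create(42);
  }
}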
@@ -0,0 +1,125 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.component;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.yarn.service.api.records.Artifact;
import org.apache.hadoop.yarn.service.api.records.Component;
import org.apache.hadoop.yarn.service.api.records.Component.RestartPolicyEnum;
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;

import java.io.IOException;

import static org.junit.Assert.assertEquals;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.verify;

/**
 * This class is to test {@link TensorBoardComponent}.
 */
public class TestTensorBoardComponent {

  @Rule
  public ExpectedException expectedException = ExpectedException.none();
  private ComponentTestCommons testCommons =
      new ComponentTestCommons(TaskType.TENSORBOARD);

  @Before
  public void setUp() throws IOException {
    testCommons.setup();
  }

  private TensorBoardComponent createTensorBoardComponent(
      RunJobParameters parameters) {
    return new TensorBoardComponent(
        testCommons.fsOperations,
        testCommons.mockClientContext.getRemoteDirectoryManager(),
        parameters,
        testCommons.mockLaunchCommandFactory,
        testCommons.yarnConfig);
  }

  @Test
  public void testTensorBoardComponentWithNullResource() throws IOException {
    RunJobParameters parameters = new RunJobParameters();
    parameters.setTensorboardResource(null);

    TensorBoardComponent tensorBoardComponent =
        createTensorBoardComponent(parameters);

    expectedException.expect(NullPointerException.class);
    expectedException.expectMessage("TensorBoard resource must not be null");
    tensorBoardComponent.createComponent();
  }

  @Test
  public void testTensorBoardComponentWithNullJobName() throws IOException {
    RunJobParameters parameters = new RunJobParameters();
    parameters.setTensorboardResource(testCommons.resource);
    parameters.setName(null);

    TensorBoardComponent tensorBoardComponent =
        createTensorBoardComponent(parameters);

    expectedException.expect(NullPointerException.class);
    expectedException.expectMessage("Job name must not be null");
    tensorBoardComponent.createComponent();
  }

  @Test
  public void testTensorBoardComponent() throws IOException {
    testCommons.yarnConfig.set("hadoop.registry.dns.domain-name", "testDomain");

    RunJobParameters parameters = new RunJobParameters();
    parameters.setTensorboardResource(testCommons.resource);
    parameters.setName("testJobName");
    parameters.setTensorboardDockerImage("testTBDockerImage");

    TensorBoardComponent tensorBoardComponent =
        createTensorBoardComponent(parameters);

    Component component = tensorBoardComponent.createComponent();

    assertEquals(testCommons.taskType.getComponentName(), component.getName());
    testCommons.verifyCommonConfigEnvs(component);

    assertEquals(1L, (long) component.getNumberOfContainers());
    assertEquals(RestartPolicyEnum.NEVER, component.getRestartPolicy());
    testCommons.verifyResources(component);
    assertEquals(
        new Artifact().type(Artifact.TypeEnum.DOCKER).id("testTBDockerImage"),
        component.getArtifact());

    assertEquals(String.format(
        "http://tensorboard-0.testJobName.%s.testDomain:6006",
        testCommons.userName),
        tensorBoardComponent.getTensorboardLink());

    assertEquals("./run-TENSORBOARD.sh", component.getLaunchCommand());
    verify(testCommons.fsOperations)
        .uploadToRemoteFileAndLocalizeToContainerWorkDir(
            any(Path.class), eq("mockScript"), eq("run-TENSORBOARD.sh"),
            eq(component));
  }

}
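Reviewer note: the asserted quicklink encodes the YARN service registry DNS naming scheme: component instance, job name, submitting user and registry domain, on TensorBoard's default port 6006. A sketch of that composition as inferred from the assertion (the format string is an assumption, not the production code):

public class TensorBoardLinkSketch {
  // Mirrors the pattern asserted above:
  // http://tensorboard-0.<jobName>.<user>.<dnsDomain>:6006
  static String tensorBoardLink(String jobName, String user, String domain) {
    return String.format("http://tensorboard-0.%s.%s.%s:6006",
        jobName, user, domain);
  }

  public static void main(String[] args) {
    System.out.println(tensorBoardLink("testJobName",
        System.getProperty("user.name"), "testDomain"));
  }
}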
@@ -0,0 +1,166 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.component;

import static junit.framework.TestCase.assertTrue;
import static org.junit.Assert.assertEquals;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.verify;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.yarn.service.api.records.Artifact;
import org.apache.hadoop.yarn.service.api.records.Component;
import org.apache.hadoop.yarn.service.api.records.Component.RestartPolicyEnum;
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;

/**
 * This class is to test {@link TensorFlowPsComponent}.
 */
public class TestTensorFlowPsComponent {

  @Rule
  public ExpectedException expectedException = ExpectedException.none();
  private ComponentTestCommons testCommons =
      new ComponentTestCommons(TaskType.PS);

  @Before
  public void setUp() throws IOException {
    testCommons.setup();
  }

  private TensorFlowPsComponent createPsComponent(RunJobParameters parameters) {
    return new TensorFlowPsComponent(
        testCommons.fsOperations,
        testCommons.mockClientContext.getRemoteDirectoryManager(),
        testCommons.mockLaunchCommandFactory,
        parameters,
        testCommons.yarnConfig);
  }

  private void verifyCommons(Component component) throws IOException {
    assertEquals(testCommons.taskType.getComponentName(), component.getName());
    testCommons.verifyCommonConfigEnvs(component);

    assertTrue(component.getConfiguration().getProperties().isEmpty());

    assertEquals(RestartPolicyEnum.NEVER, component.getRestartPolicy());
    testCommons.verifyResources(component);
    assertEquals(
        new Artifact().type(Artifact.TypeEnum.DOCKER).id("testPSDockerImage"),
        component.getArtifact());

    String taskTypeUppercase = testCommons.taskType.name().toUpperCase();
    String expectedScriptName = String.format("run-%s.sh", taskTypeUppercase);
    assertEquals(String.format("./%s", expectedScriptName),
        component.getLaunchCommand());
    verify(testCommons.fsOperations)
        .uploadToRemoteFileAndLocalizeToContainerWorkDir(
            any(Path.class), eq("mockScript"), eq(expectedScriptName),
            eq(component));
  }

  @Test
  public void testPSComponentWithNullResource() throws IOException {
    RunJobParameters parameters = new RunJobParameters();
    parameters.setPsResource(null);

    TensorFlowPsComponent psComponent =
        createPsComponent(parameters);

    expectedException.expect(NullPointerException.class);
    expectedException.expectMessage("PS resource must not be null");
    psComponent.createComponent();
  }

  @Test
  public void testPSComponentWithNullJobName() throws IOException {
    RunJobParameters parameters = new RunJobParameters();
    parameters.setPsResource(testCommons.resource);
    parameters.setNumPS(1);
    parameters.setName(null);

    TensorFlowPsComponent psComponent =
        createPsComponent(parameters);

    expectedException.expect(NullPointerException.class);
    expectedException.expectMessage("Job name must not be null");
    psComponent.createComponent();
  }

  @Test
  public void testPSComponentZeroNumberOfPS() throws IOException {
    testCommons.yarnConfig.set("hadoop.registry.dns.domain-name", "testDomain");

    RunJobParameters parameters = new RunJobParameters();
    parameters.setPsResource(testCommons.resource);
    parameters.setName("testJobName");
    parameters.setPsDockerImage("testPSDockerImage");
    parameters.setNumPS(0);

    TensorFlowPsComponent psComponent =
        createPsComponent(parameters);

    expectedException.expect(IllegalArgumentException.class);
    expectedException.expectMessage("Number of PS should be at least 1!");
    psComponent.createComponent();
  }

  @Test
  public void testPSComponentNumPSIsOne() throws IOException {
    testCommons.yarnConfig.set("hadoop.registry.dns.domain-name", "testDomain");

    RunJobParameters parameters = new RunJobParameters();
    parameters.setPsResource(testCommons.resource);
    parameters.setName("testJobName");
    parameters.setNumPS(1);
    parameters.setPsDockerImage("testPSDockerImage");

    TensorFlowPsComponent psComponent =
        createPsComponent(parameters);

    Component component = psComponent.createComponent();

    assertEquals(1L, (long) component.getNumberOfContainers());
    verifyCommons(component);
  }

  @Test
  public void testPSComponentNumPSIsTwo() throws IOException {
    testCommons.yarnConfig.set("hadoop.registry.dns.domain-name", "testDomain");

    RunJobParameters parameters = new RunJobParameters();
    parameters.setPsResource(testCommons.resource);
    parameters.setName("testJobName");
    parameters.setNumPS(2);
    parameters.setPsDockerImage("testPSDockerImage");

    TensorFlowPsComponent psComponent =
        createPsComponent(parameters);

    Component component = psComponent.createComponent();

    assertEquals(2L, (long) component.getNumberOfContainers());
    verifyCommons(component);
  }

}
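Reviewer note: these tests lean on JUnit 4's ExpectedException rule, where expectMessage() performs a substring match on the thrown message. A self-contained example of the mechanics (real JUnit 4 API, hypothetical test content):

import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;

public class ExpectedExceptionSketch {
  @Rule
  public ExpectedException expectedException = ExpectedException.none();

  @Test
  public void failsWithMessage() {
    // expectMessage() is a substring match, so "at least 1" would also
    // satisfy the PS test above; the full message is used for clarity.
    expectedException.expect(IllegalArgumentException.class);
    expectedException.expectMessage("Number of PS should be at least 1!");
    throw new IllegalArgumentException("Number of PS should be at least 1!");
  }
}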
@@ -0,0 +1,215 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.component;

import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.yarn.service.api.records.Artifact;
import org.apache.hadoop.yarn.service.api.records.Component;
import org.apache.hadoop.yarn.service.api.records.Component.RestartPolicyEnum;
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;

import java.io.IOException;
import java.util.Map;

import static junit.framework.TestCase.assertTrue;
import static org.apache.hadoop.yarn.service.conf.YarnServiceConstants.CONTAINER_STATE_REPORT_AS_SERVICE_STATE;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.verify;

/**
 * This class is to test {@link TensorFlowWorkerComponent}.
 */
public class TestTensorFlowWorkerComponent {

  @Rule
  public ExpectedException expectedException = ExpectedException.none();
  private ComponentTestCommons testCommons =
      new ComponentTestCommons(TaskType.WORKER);

  @Before
  public void setUp() throws IOException {
    testCommons.setup();
  }

  private TensorFlowWorkerComponent createWorkerComponent(
      RunJobParameters parameters) {
    return new TensorFlowWorkerComponent(
        testCommons.fsOperations,
        testCommons.mockClientContext.getRemoteDirectoryManager(),
        parameters, testCommons.taskType,
        testCommons.mockLaunchCommandFactory,
        testCommons.yarnConfig);
  }

  private void verifyCommons(Component component) throws IOException {
    verifyCommonsInternal(component, ImmutableMap.of());
  }

  private void verifyCommons(Component component,
      Map<String, String> expectedProperties) throws IOException {
    verifyCommonsInternal(component, expectedProperties);
  }

  private void verifyCommonsInternal(Component component,
      Map<String, String> expectedProperties) throws IOException {
    assertEquals(testCommons.taskType.getComponentName(), component.getName());
    testCommons.verifyCommonConfigEnvs(component);

    Map<String, String> actualProperties =
        component.getConfiguration().getProperties();
    if (!expectedProperties.isEmpty()) {
      assertFalse(actualProperties.isEmpty());
      expectedProperties.forEach(
          (k, v) -> assertEquals(v, actualProperties.get(k)));
    } else {
      assertTrue(actualProperties.isEmpty());
    }

    assertEquals(RestartPolicyEnum.NEVER, component.getRestartPolicy());
    testCommons.verifyResources(component);
    assertEquals(
        new Artifact().type(Artifact.TypeEnum.DOCKER)
            .id("testWorkerDockerImage"),
        component.getArtifact());

    String taskTypeUppercase = testCommons.taskType.name().toUpperCase();
    String expectedScriptName = String.format("run-%s.sh", taskTypeUppercase);
    assertEquals(String.format("./%s", expectedScriptName),
        component.getLaunchCommand());
    verify(testCommons.fsOperations)
        .uploadToRemoteFileAndLocalizeToContainerWorkDir(
            any(Path.class), eq("mockScript"), eq(expectedScriptName),
            eq(component));
  }

  @Test
  public void testWorkerComponentWithNullResource() throws IOException {
    RunJobParameters parameters = new RunJobParameters();
    parameters.setWorkerResource(null);

    TensorFlowWorkerComponent workerComponent =
        createWorkerComponent(parameters);

    expectedException.expect(NullPointerException.class);
    expectedException.expectMessage("Worker resource must not be null");
    workerComponent.createComponent();
  }

  @Test
  public void testWorkerComponentWithNullJobName() throws IOException {
    RunJobParameters parameters = new RunJobParameters();
    parameters.setWorkerResource(testCommons.resource);
    parameters.setNumWorkers(1);
    parameters.setName(null);

    TensorFlowWorkerComponent workerComponent =
        createWorkerComponent(parameters);

    expectedException.expect(NullPointerException.class);
    expectedException.expectMessage("Job name must not be null");
    workerComponent.createComponent();
  }

  @Test
  public void testNormalWorkerComponentZeroNumberOfWorkers()
      throws IOException {
    testCommons.yarnConfig.set("hadoop.registry.dns.domain-name", "testDomain");

    RunJobParameters parameters = new RunJobParameters();
    parameters.setWorkerResource(testCommons.resource);
    parameters.setName("testJobName");
    parameters.setWorkerDockerImage("testWorkerDockerImage");
    parameters.setNumWorkers(0);

    TensorFlowWorkerComponent workerComponent =
        createWorkerComponent(parameters);

    expectedException.expect(IllegalArgumentException.class);
    expectedException.expectMessage("Number of workers should be at least 1!");
    workerComponent.createComponent();
  }

  @Test
  public void testNormalWorkerComponentNumWorkersIsOne() throws IOException {
    testCommons.yarnConfig.set("hadoop.registry.dns.domain-name", "testDomain");

    RunJobParameters parameters = new RunJobParameters();
    parameters.setWorkerResource(testCommons.resource);
    parameters.setName("testJobName");
    parameters.setNumWorkers(1);
    parameters.setWorkerDockerImage("testWorkerDockerImage");

    TensorFlowWorkerComponent workerComponent =
        createWorkerComponent(parameters);

    Component component = workerComponent.createComponent();

    assertEquals(0L, (long) component.getNumberOfContainers());
    verifyCommons(component);
  }

  @Test
  public void testNormalWorkerComponentNumWorkersIsTwo() throws IOException {
    testCommons.yarnConfig.set("hadoop.registry.dns.domain-name", "testDomain");

    RunJobParameters parameters = new RunJobParameters();
    parameters.setWorkerResource(testCommons.resource);
    parameters.setName("testJobName");
    parameters.setNumWorkers(2);
    parameters.setWorkerDockerImage("testWorkerDockerImage");

    TensorFlowWorkerComponent workerComponent =
        createWorkerComponent(parameters);

    Component component = workerComponent.createComponent();

    assertEquals(1L, (long) component.getNumberOfContainers());
    verifyCommons(component);
  }

  @Test
  public void testPrimaryWorkerComponentNumWorkersIsTwo() throws IOException {
    testCommons = new ComponentTestCommons(TaskType.PRIMARY_WORKER);
    testCommons.setup();
    testCommons.yarnConfig.set("hadoop.registry.dns.domain-name", "testDomain");

    RunJobParameters parameters = new RunJobParameters();
    parameters.setWorkerResource(testCommons.resource);
    parameters.setName("testJobName");
    parameters.setNumWorkers(2);
    parameters.setWorkerDockerImage("testWorkerDockerImage");

    TensorFlowWorkerComponent workerComponent =
        createWorkerComponent(parameters);

    Component component = workerComponent.createComponent();

    assertEquals(1L, (long) component.getNumberOfContainers());
    verifyCommons(component, ImmutableMap.of(
        CONTAINER_STATE_REPORT_AS_SERVICE_STATE, "true"));
  }

}
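Reviewer note: taken together, the worker expectations above (numWorkers=1 yields 0 containers, numWorkers=2 yields 1, a primary worker always yields 1) imply that one worker is carved out as the PRIMARY_WORKER component and the plain WORKER component receives the remainder. A sketch of that rule as inferred from the assertions, not copied from the production code:

public class WorkerContainerCountSketch {
  // Inferred split: primary worker gets exactly one container, the
  // ordinary worker component gets whatever is left over.
  static long workerContainers(boolean primary, int numWorkers) {
    return primary ? 1 : Math.max(0, numWorkers - 1);
  }

  public static void main(String[] args) {
    System.out.println(workerContainers(false, 1)); // 0
    System.out.println(workerContainers(false, 2)); // 1
    System.out.println(workerContainers(true, 2));  // 1
  }
}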
@@ -0,0 +1,91 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.utils;

import org.apache.hadoop.yarn.submarine.FileUtilitiesForTests;
import org.junit.After;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

import java.io.File;
import java.io.IOException;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;

/**
 * This class is to test {@link ClassPathUtilities}.
 */
public class TestClassPathUtilities {

  private static final String CLASSPATH_KEY = "java.class.path";
  private FileUtilitiesForTests fileUtils = new FileUtilitiesForTests();
  private static String originalClasspath;

  @BeforeClass
  public static void setUpClass() {
    originalClasspath = System.getProperty(CLASSPATH_KEY);
  }

  @Before
  public void setUp() {
    fileUtils.setup();
  }

  @After
  public void teardown() throws IOException {
    fileUtils.teardown();
    System.setProperty(CLASSPATH_KEY, originalClasspath);
  }

  private static void addFileToClasspath(File file) {
    String newClasspath = originalClasspath + ":" + file.getAbsolutePath();
    System.setProperty(CLASSPATH_KEY, newClasspath);
  }

  @Test
  public void findFileNotInClasspath() {
    File resultFile = ClassPathUtilities.findFileOnClassPath("bla");
    assertNull(resultFile);
  }

  @Test
  public void findFileOnClasspath() throws Exception {
    File testFile = fileUtils.createFileInTempDir("testFile");

    addFileToClasspath(testFile);
    File resultFile = ClassPathUtilities.findFileOnClassPath("testFile");

    assertNotNull(resultFile);
    assertEquals(testFile.getAbsolutePath(), resultFile.getAbsolutePath());
  }

  @Test
  public void findDirectoryOnClasspath() throws Exception {
    File testDir = fileUtils.createDirInTempDir("testDir");
    File testFile = fileUtils.createFileInDir(testDir, "testFile");

    addFileToClasspath(testDir);
    File resultFile = ClassPathUtilities.findFileOnClassPath("testFile");

    assertNotNull(resultFile);
    assertEquals(testFile.getAbsolutePath(), resultFile.getAbsolutePath());
  }

}
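Reviewer note: addFileToClasspath() above joins classpath entries with a hardcoded ":", which only works on POSIX platforms. If portability ever matters, java.io.File exposes the platform separator; a sketch of the portable variant:

import java.io.File;

public class ClasspathAppendSketch {
  public static void main(String[] args) {
    String original = System.getProperty("java.class.path");
    // File.pathSeparator is ":" on POSIX and ";" on Windows.
    String appended = original + File.pathSeparator + "/tmp/extra";
    System.setProperty("java.class.path", appended);
    System.out.println(System.getProperty("java.class.path"));
  }
}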
@ -0,0 +1,231 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.utils;
|
||||||
|
|
||||||
|
import com.google.common.collect.ImmutableList;
|
||||||
|
import com.google.common.collect.ImmutableMap;
|
||||||
|
import com.google.common.collect.Maps;
|
||||||
|
import org.apache.hadoop.yarn.service.api.records.Configuration;
|
||||||
|
import org.apache.hadoop.yarn.service.api.records.Service;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION;
|
||||||
|
import static org.apache.hadoop.yarn.submarine.utils.EnvironmentUtilities.ENV_DOCKER_MOUNTS_FOR_CONTAINER_RUNTIME;
|
||||||
|
import static org.junit.Assert.*;
|
||||||
|
import static org.mockito.Mockito.mock;
|
||||||
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class is to test {@link EnvironmentUtilities}.
|
||||||
|
*/
|
||||||
|
public class TestEnvironmentUtilities {
|
||||||
|
private Service createServiceWithEmptyEnvVars() {
|
||||||
|
return createServiceWithEnvVars(Maps.newHashMap());
|
||||||
|
}
|
||||||
|
|
||||||
|
private Service createServiceWithEnvVars(Map<String, String> envVars) {
|
||||||
|
Service service = mock(Service.class);
|
||||||
|
Configuration config = mock(Configuration.class);
|
||||||
|
when(config.getEnv()).thenReturn(envVars);
|
||||||
|
when(service.getConfiguration()).thenReturn(config);
|
||||||
|
|
||||||
|
return service;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void validateDefaultEnvVars(Map<String, String> resultEnvs) {
|
||||||
|
assertEquals("/etc/passwd:/etc/passwd:ro",
|
||||||
|
resultEnvs.get(ENV_DOCKER_MOUNTS_FOR_CONTAINER_RUNTIME));
|
||||||
|
}
|
||||||
|
|
||||||
|
private org.apache.hadoop.conf.Configuration
|
||||||
|
createYarnConfigWithSecurityValue(String value) {
|
||||||
|
org.apache.hadoop.conf.Configuration mockConfig =
|
||||||
|
mock(org.apache.hadoop.conf.Configuration.class);
|
||||||
|
when(mockConfig.get(HADOOP_SECURITY_AUTHENTICATION)).thenReturn(value);
|
||||||
|
return mockConfig;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetValueOfNullEnvVar() {
|
||||||
|
assertEquals("", EnvironmentUtilities.getValueOfEnvironment(null));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetValueOfEmptyEnvVar() {
|
||||||
|
assertEquals("", EnvironmentUtilities.getValueOfEnvironment(""));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetValueOfEnvVarJustAnEqualsSign() {
|
||||||
|
assertEquals("", EnvironmentUtilities.getValueOfEnvironment("="));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetValueOfEnvVarWithoutValue() {
|
||||||
|
assertEquals("", EnvironmentUtilities.getValueOfEnvironment("a="));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetValueOfEnvVarValidFormat() {
|
||||||
|
assertEquals("bbb", EnvironmentUtilities.getValueOfEnvironment("a=bbb"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testHandleServiceEnvWithNullMap() {
|
||||||
|
Service service = createServiceWithEmptyEnvVars();
|
||||||
|
org.apache.hadoop.conf.Configuration yarnConfig =
|
||||||
|
mock(org.apache.hadoop.conf.Configuration.class);
|
||||||
|
EnvironmentUtilities.handleServiceEnvs(service, yarnConfig, null);
|
||||||
|
|
||||||
|
Map<String, String> resultEnvs = service.getConfiguration().getEnv();
|
||||||
|
assertEquals(1, resultEnvs.size());
|
||||||
|
validateDefaultEnvVars(resultEnvs);
|
||||||
|
}
|
||||||
|
|
||||||
|
  @Test
  public void testHandleServiceEnvWithEmptyMap() {
    Service service = createServiceWithEmptyEnvVars();
    org.apache.hadoop.conf.Configuration yarnConfig =
        mock(org.apache.hadoop.conf.Configuration.class);
    // Pass an empty list (not null) so this case is distinct from
    // testHandleServiceEnvWithNullMap, as the test name implies.
    EnvironmentUtilities.handleServiceEnvs(service, yarnConfig,
        ImmutableList.of());

    Map<String, String> resultEnvs = service.getConfiguration().getEnv();
    assertEquals(1, resultEnvs.size());
    validateDefaultEnvVars(resultEnvs);
  }

  @Test
  public void testHandleServiceEnvWithYarnConfigSecurityValueNonKerberos() {
    Service service = createServiceWithEmptyEnvVars();
    org.apache.hadoop.conf.Configuration yarnConfig =
        createYarnConfigWithSecurityValue("nonkerberos");
    EnvironmentUtilities.handleServiceEnvs(service, yarnConfig, null);

    Map<String, String> resultEnvs = service.getConfiguration().getEnv();
    assertEquals(1, resultEnvs.size());
    validateDefaultEnvVars(resultEnvs);
  }

  @Test
  public void testHandleServiceEnvWithYarnConfigSecurityValueKerberos() {
    Service service = createServiceWithEmptyEnvVars();
    org.apache.hadoop.conf.Configuration yarnConfig =
        createYarnConfigWithSecurityValue("kerberos");
    EnvironmentUtilities.handleServiceEnvs(service, yarnConfig, null);

    Map<String, String> resultEnvs = service.getConfiguration().getEnv();
    assertEquals(1, resultEnvs.size());
    assertEquals("/etc/passwd:/etc/passwd:ro,/etc/krb5.conf:/etc/krb5.conf:ro",
        resultEnvs.get(ENV_DOCKER_MOUNTS_FOR_CONTAINER_RUNTIME));
  }

  @Test
  public void testHandleServiceEnvWithExistingEnvsAndValidNewEnvs() {
    Map<String, String> existingEnvs = Maps.newHashMap(
        ImmutableMap.<String, String>builder()
            .put("a", "1")
            .put("b", "2")
            .build());
    ImmutableList<String> newEnvs = ImmutableList.of("c=3", "d=4");

    Service service = createServiceWithEnvVars(existingEnvs);
    org.apache.hadoop.conf.Configuration yarnConfig =
        createYarnConfigWithSecurityValue("kerberos");
    EnvironmentUtilities.handleServiceEnvs(service, yarnConfig, newEnvs);

    Map<String, String> resultEnvs = service.getConfiguration().getEnv();
    assertEquals(5, resultEnvs.size());
    assertEquals("/etc/passwd:/etc/passwd:ro,/etc/krb5.conf:/etc/krb5.conf:ro",
        resultEnvs.get(ENV_DOCKER_MOUNTS_FOR_CONTAINER_RUNTIME));
    assertEquals("1", resultEnvs.get("a"));
    assertEquals("2", resultEnvs.get("b"));
    assertEquals("3", resultEnvs.get("c"));
    assertEquals("4", resultEnvs.get("d"));
  }

  @Test
  public void testHandleServiceEnvWithExistingEnvsAndNewEnvsWithoutEquals() {
    Map<String, String> existingEnvs = Maps.newHashMap(
        ImmutableMap.<String, String>builder()
            .put("a", "1")
            .put("b", "2")
            .build());
    ImmutableList<String> newEnvs = ImmutableList.of("c3", "d4");

    Service service = createServiceWithEnvVars(existingEnvs);
    org.apache.hadoop.conf.Configuration yarnConfig =
        createYarnConfigWithSecurityValue("kerberos");
    EnvironmentUtilities.handleServiceEnvs(service, yarnConfig, newEnvs);

    Map<String, String> resultEnvs = service.getConfiguration().getEnv();
    assertEquals(5, resultEnvs.size());
    assertEquals("/etc/passwd:/etc/passwd:ro,/etc/krb5.conf:/etc/krb5.conf:ro",
        resultEnvs.get(ENV_DOCKER_MOUNTS_FOR_CONTAINER_RUNTIME));
    assertEquals("1", resultEnvs.get("a"));
    assertEquals("2", resultEnvs.get("b"));
    assertEquals("", resultEnvs.get("c3"));
    assertEquals("", resultEnvs.get("d4"));
  }

  @Test
  public void testHandleServiceEnvWithExistingEnvVarKey() {
    Map<String, String> existingEnvs = Maps.newHashMap(
        ImmutableMap.<String, String>builder()
            .put("a", "1")
            .put("b", "2")
            .build());
    ImmutableList<String> newEnvs = ImmutableList.of("a=33", "c=44");

    Service service = createServiceWithEnvVars(existingEnvs);
    org.apache.hadoop.conf.Configuration yarnConfig =
        createYarnConfigWithSecurityValue("kerberos");
    EnvironmentUtilities.handleServiceEnvs(service, yarnConfig, newEnvs);

    Map<String, String> resultEnvs = service.getConfiguration().getEnv();
    assertEquals(4, resultEnvs.size());
    assertEquals("/etc/passwd:/etc/passwd:ro,/etc/krb5.conf:/etc/krb5.conf:ro",
        resultEnvs.get(ENV_DOCKER_MOUNTS_FOR_CONTAINER_RUNTIME));
    assertEquals("1:33", resultEnvs.get("a"));
    assertEquals("2", resultEnvs.get("b"));
    assertEquals("44", resultEnvs.get("c"));
  }

  @Test
  public void testHandleServiceEnvWithExistingEnvVarKeyMultipleTimes() {
    Map<String, String> existingEnvs = Maps.newHashMap(
        ImmutableMap.<String, String>builder()
            .put("a", "1")
            .put("b", "2")
            .build());
    ImmutableList<String> newEnvs = ImmutableList.of("a=33", "a=44");

    Service service = createServiceWithEnvVars(existingEnvs);
    org.apache.hadoop.conf.Configuration yarnConfig =
        createYarnConfigWithSecurityValue("kerberos");
    EnvironmentUtilities.handleServiceEnvs(service, yarnConfig, newEnvs);

    Map<String, String> resultEnvs = service.getConfiguration().getEnv();
    assertEquals(3, resultEnvs.size());
    assertEquals("/etc/passwd:/etc/passwd:ro,/etc/krb5.conf:/etc/krb5.conf:ro",
        resultEnvs.get(ENV_DOCKER_MOUNTS_FOR_CONTAINER_RUNTIME));
    assertEquals("1:33:44", resultEnvs.get("a"));
    assertEquals("2", resultEnvs.get("b"));
  }
}
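Taken together, the assertions above pin down the merge contract the utility has to honor: a read-only /etc/passwd mount is always present; /etc/krb5.conf is appended when hadoop.security.authentication is kerberos; user-supplied KEY=VALUE strings are merged into the service environment; entries without an equals sign become keys with empty values; and values for repeated keys are concatenated with ':'. The sketch below re-derives that contract from the tests alone; it is not the actual EnvironmentUtilities code, and the class, constant, and method names are illustrative stand-ins.

import java.util.List;
import java.util.Map;

/** Sketch of the env-merging rules asserted above; all names are stand-ins. */
public final class EnvMergeSketch {
  // Stand-in key; the real constant is ENV_DOCKER_MOUNTS_FOR_CONTAINER_RUNTIME.
  static final String MOUNTS_KEY = "DOCKER_MOUNTS";

  /** Returns the substring after the first '=', or "" if there is none. */
  static String valueOfEnvironment(String envVar) {
    if (envVar == null || envVar.indexOf('=') < 0) {
      return "";
    }
    return envVar.substring(envVar.indexOf('=') + 1);
  }

  /** Merges default mounts and user-supplied envs into the service env map. */
  static void handleEnvs(Map<String, String> serviceEnv, boolean kerberos,
      List<String> userEnvs) {
    String mounts = "/etc/passwd:/etc/passwd:ro";
    if (kerberos) {
      // Kerberized clusters also need the krb5 config inside the container.
      mounts += ",/etc/krb5.conf:/etc/krb5.conf:ro";
    }
    serviceEnv.put(MOUNTS_KEY, mounts);

    if (userEnvs == null) {
      return;
    }
    for (String env : userEnvs) {
      int idx = env.indexOf('=');
      String key = idx < 0 ? env : env.substring(0, idx);
      // A repeated key appends with ':' ("1" then "a=33" yields "1:33"),
      // which lets list-like variables accumulate instead of being clobbered.
      serviceEnv.merge(key, valueOfEnvironment(env),
          (oldVal, newVal) -> oldVal + ":" + newVal);
    }
  }
}

Appending instead of overwriting duplicate keys is exactly the property the last two tests exercise.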
@ -0,0 +1,156 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.utils;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.yarn.service.api.records.KerberosPrincipal;
import org.apache.hadoop.yarn.submarine.FileUtilitiesForTests;
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
import org.apache.hadoop.yarn.submarine.common.MockClientContext;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.FileSystemOperations;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.File;
import java.io.IOException;

import static org.junit.Assert.*;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

/**
 * This class is to test {@link KerberosPrincipalFactory}.
 */
public class TestKerberosPrincipalFactory {
  private FileUtilitiesForTests fileUtils = new FileUtilitiesForTests();

  @Before
  public void setUp() {
    fileUtils.setup();
  }

  @After
  public void teardown() throws IOException {
    fileUtils.teardown();
  }

  private File createKeytabFile(String keytabFileName) throws IOException {
    return fileUtils.createFileInTempDir(keytabFileName);
  }

  @Test
  public void testCreatePrincipalEmptyPrincipalAndKeytab() throws IOException {
    MockClientContext mockClientContext = new MockClientContext();

    RunJobParameters parameters = mock(RunJobParameters.class);
    when(parameters.getPrincipal()).thenReturn("");
    when(parameters.getKeytab()).thenReturn("");

    FileSystemOperations fsOperations =
        new FileSystemOperations(mockClientContext);
    KerberosPrincipal result = KerberosPrincipalFactory.create(fsOperations,
        mockClientContext.getRemoteDirectoryManager(), parameters);

    assertNull(result);
  }

  @Test
  public void testCreatePrincipalEmptyPrincipalString() throws IOException {
    MockClientContext mockClientContext = new MockClientContext();

    RunJobParameters parameters = mock(RunJobParameters.class);
    when(parameters.getPrincipal()).thenReturn("");
    when(parameters.getKeytab()).thenReturn("keytab");

    FileSystemOperations fsOperations =
        new FileSystemOperations(mockClientContext);
    KerberosPrincipal result = KerberosPrincipalFactory.create(fsOperations,
        mockClientContext.getRemoteDirectoryManager(), parameters);

    assertNull(result);
  }

  @Test
  public void testCreatePrincipalEmptyKeyTabString() throws IOException {
    MockClientContext mockClientContext = new MockClientContext();

    RunJobParameters parameters = mock(RunJobParameters.class);
    when(parameters.getPrincipal()).thenReturn("principal");
    when(parameters.getKeytab()).thenReturn("");

    FileSystemOperations fsOperations =
        new FileSystemOperations(mockClientContext);
    KerberosPrincipal result = KerberosPrincipalFactory.create(fsOperations,
        mockClientContext.getRemoteDirectoryManager(), parameters);

    assertNull(result);
  }

  @Test
  public void testCreatePrincipalNonEmptyPrincipalAndKeytab()
      throws IOException {
    MockClientContext mockClientContext = new MockClientContext();

    RunJobParameters parameters = mock(RunJobParameters.class);
    when(parameters.getPrincipal()).thenReturn("principal");
    when(parameters.getKeytab()).thenReturn("keytab");

    FileSystemOperations fsOperations =
        new FileSystemOperations(mockClientContext);
    KerberosPrincipal result = KerberosPrincipalFactory.create(fsOperations,
        mockClientContext.getRemoteDirectoryManager(), parameters);

    assertNotNull(result);
    assertEquals("file://keytab", result.getKeytab());
    assertEquals("principal", result.getPrincipalName());
  }

  @Test
  public void testCreatePrincipalDistributedKeytab() throws IOException {
    MockClientContext mockClientContext = new MockClientContext();
    String jobname = "testJobname";
    String keytab = "testKeytab";
    File keytabFile = createKeytabFile(keytab);

    RunJobParameters parameters = mock(RunJobParameters.class);
    when(parameters.getPrincipal()).thenReturn("principal");
    when(parameters.getKeytab()).thenReturn(keytabFile.getAbsolutePath());
    when(parameters.getName()).thenReturn(jobname);
    when(parameters.isDistributeKeytab()).thenReturn(true);

    FileSystemOperations fsOperations =
        new FileSystemOperations(mockClientContext);

    KerberosPrincipal result = KerberosPrincipalFactory.create(fsOperations,
        mockClientContext.getRemoteDirectoryManager(), parameters);

    Path stagingDir = mockClientContext.getRemoteDirectoryManager()
        .getJobStagingArea(parameters.getName(), true);
    String expectedKeytabFilePath =
        FileUtilitiesForTests.getFilename(stagingDir, keytab).getAbsolutePath();

    assertNotNull(result);
    assertEquals(expectedKeytabFilePath, result.getKeytab());
    assertEquals("principal", result.getPrincipalName());
  }
}
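The five cases above encode a three-way decision in the factory: create() returns null unless both principal and keytab are non-empty; a plain keytab string is referenced in place behind a file:// prefix; and when the keytab is distributed, it ends up in the job's staging area and the returned principal points at that copy. Below is a minimal sketch of that branching, inferred from the assertions only; the types are simplified stand-ins, not the real Submarine or YARN service classes.

/** Sketch of the branching the KerberosPrincipalFactory tests pin down. */
final class KerberosPrincipalSketch {

  /** Simplified stand-in for the YARN service KerberosPrincipal record. */
  static final class Principal {
    final String principalName;
    final String keytab;

    Principal(String principalName, String keytab) {
      this.principalName = principalName;
      this.keytab = keytab;
    }
  }

  /**
   * Returns null for missing credentials; otherwise points the service at
   * the local keytab or at a copy in the job staging area.
   */
  static Principal create(String principal, String keytab,
      boolean distributeKeytab, String stagingDir) {
    if (principal == null || principal.isEmpty()
        || keytab == null || keytab.isEmpty()) {
      return null;
    }
    if (distributeKeytab) {
      // Assumed: the real factory also uploads the keytab file here; this
      // sketch only computes the expected destination path.
      String fileName = keytab.substring(keytab.lastIndexOf('/') + 1);
      return new Principal(principal, stagingDir + "/" + fileName);
    }
    // A non-distributed keytab is referenced in place via a file:// URI.
    return new Principal(principal, "file://" + keytab);
  }
}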
@ -0,0 +1,72 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.utils;

import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.service.api.records.ResourceInformation;
import org.apache.hadoop.yarn.util.resource.CustomResourceTypesConfigurationProvider;
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
import org.junit.After;
import org.junit.Test;

import java.util.Map;

import static org.junit.Assert.*;

/**
 * This class is to test {@link SubmarineResourceUtils}.
 */
public class TestSubmarineResourceUtils {
  private static final String CUSTOM_RESOURCE_NAME = "a-custom-resource";

  private void initResourceTypes() {
    CustomResourceTypesConfigurationProvider.initResourceTypes(
        ImmutableMap.<String, String>builder()
            .put(CUSTOM_RESOURCE_NAME, "G")
            .build());
  }

  @After
  public void cleanup() {
    ResourceUtils.resetResourceTypes(new Configuration());
  }

  @Test
  public void testConvertResourceWithCustomResource() {
    initResourceTypes();
    Resource res = Resource.newInstance(4096, 12,
        ImmutableMap.of(CUSTOM_RESOURCE_NAME, 20L));

    org.apache.hadoop.yarn.service.api.records.Resource serviceResource =
        SubmarineResourceUtils.convertYarnResourceToServiceResource(res);

    assertEquals(12, serviceResource.getCpus().intValue());
    assertEquals(4096, Integer.parseInt(serviceResource.getMemory()));
    Map<String, ResourceInformation> additionalResources =
        serviceResource.getAdditional();

    // Additional resources also include vcores and memory
    assertEquals(3, additionalResources.size());
    ResourceInformation customResourceRI =
        additionalResources.get(CUSTOM_RESOURCE_NAME);
    assertEquals("G", customResourceRI.getUnit());
    assertEquals(20L, (long) customResourceRI.getValue());
  }
}
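The assertion block fixes the conversion contract: memory and vcores are copied onto the top-level fields of the service resource, and every resource type known to YARN, including memory and vcores themselves, also lands in the additional map with its unit and value, which is why three entries are expected for a single custom resource. A compact sketch of that mapping with simplified stand-in types (the real conversion works on the YARN Resource and service-records Resource classes):

import java.util.HashMap;
import java.util.Map;

/** Sketch of the resource conversion asserted above; types are stand-ins. */
final class ResourceConversionSketch {

  static final class ResourceInfo {
    final long value;
    final String unit;

    ResourceInfo(long value, String unit) {
      this.value = value;
      this.unit = unit;
    }
  }

  static final class ServiceResource {
    String memory;
    int cpus;
    final Map<String, ResourceInfo> additional = new HashMap<>();
  }

  static ServiceResource convert(long memoryMb, int vcores,
      Map<String, ResourceInfo> customResources) {
    ServiceResource result = new ServiceResource();
    result.memory = String.valueOf(memoryMb);
    result.cpus = vcores;
    // Every resource type is mirrored into the additional map, so memory
    // and vcores show up there too ("memory-mb"/"Mi" are assumed names).
    result.additional.put("memory-mb", new ResourceInfo(memoryMb, "Mi"));
    result.additional.put("vcores", new ResourceInfo(vcores, ""));
    result.additional.putAll(customResources);
    return result;
  }
}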