YARN-8561. [Submarine] Initial implementation: Training job submission and job history retrieval. Contributed by Wangda Tan.
This commit is contained in:
parent
a8dae0047c
commit
cadbc8b57f
|
@ -0,0 +1,53 @@
|
||||||
|
<!--
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License. See accompanying LICENSE file.
|
||||||
|
-->
|
||||||
|
|
||||||
|
# Overview
|
||||||
|
|
||||||
|
```
|
||||||
|
_ _
|
||||||
|
| | (_)
|
||||||
|
___ _ _ | |__ _ __ ___ __ _ _ __ _ _ __ ___
|
||||||
|
/ __|| | | || '_ \ | '_ ` _ \ / _` || '__|| || '_ \ / _ \
|
||||||
|
\__ \| |_| || |_) || | | | | || (_| || | | || | | || __/
|
||||||
|
|___/ \__,_||_.__/ |_| |_| |_| \__,_||_| |_||_| |_| \___|
|
||||||
|
|
||||||
|
?
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~|^"~~~~~~~~~~~~~~~~~~~~~~~~~o~~~~~~~~~~~
|
||||||
|
o | o __o
|
||||||
|
o | o |X__>
|
||||||
|
___o | __o
|
||||||
|
(X___>-- __|__ |X__> o
|
||||||
|
| \ __o
|
||||||
|
| \ |X__>
|
||||||
|
_______________________|_______\________________
|
||||||
|
< \____________ _
|
||||||
|
\ \ (_)
|
||||||
|
\ O O O >=)
|
||||||
|
\__________________________________________________________/ (_)
|
||||||
|
```
|
||||||
|
|
||||||
|
Submarine is a project which allows infra engineers / data scientists to run *unmodified* Tensorflow programs on YARN.
|
||||||
|
|
||||||
|
Goals of Submarine:
|
||||||
|
- It allows jobs to easily access data/models in HDFS and other storage systems.
- Can launch services to serve Tensorflow/MXNet models.
- Supports running distributed Tensorflow jobs with simple configs.
- Supports running user-specified Docker images.
- Supports specifying GPU and other resources.
- Supports launching TensorBoard for training jobs if specified by the user.
- Supports customized DNS names for roles (like tensorboard.$user.$domain:6006).
|
||||||
|
|
||||||
|
Please jump to [QuickStart](src/site/QuickStart.md) guide to quickly understand how to use this framework.
|
||||||
|
|
||||||
|
If you're a developer, please find [Developer](src/site/DeveloperGuide.md) guide for more details.
|
|
@ -0,0 +1,213 @@
|
||||||
|
<?xml version="1.0"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
         http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <parent>
    <artifactId>hadoop-yarn-applications</artifactId>
    <groupId>org.apache.hadoop</groupId>
    <version>3.2.0-SNAPSHOT</version>
  </parent>
  <modelVersion>4.0.0</modelVersion>
  <artifactId>hadoop-yarn-submarine</artifactId>
  <version>3.2.0-SNAPSHOT</version>
  <name>Yet Another Learning Platform</name>

  <properties>
    <!-- Needed for generating FindBugs warnings using parent pom -->
    <yarn.basedir>${project.parent.parent.basedir}</yarn.basedir>
  </properties>

  <dependencies>
    <!-- 'mvn dependency:analyze' fails to detect use of this dependency -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <scope>provided</scope>
    </dependency>

    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <scope>test</scope>
    </dependency>

    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
    </dependency>
    <dependency>
      <groupId>com.google.guava</groupId>
      <artifactId>guava</artifactId>
    </dependency>
    <dependency>
      <groupId>commons-logging</groupId>
      <artifactId>commons-logging</artifactId>
    </dependency>
    <dependency>
      <groupId>commons-cli</groupId>
      <artifactId>commons-cli</artifactId>
    </dependency>
    <dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-server-applicationhistoryservice</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-server-timelineservice</artifactId>
      <type>test-jar</type>
      <scope>test</scope>
    </dependency>
    <!-- 'mvn dependency:analyze' fails to detect use of this dependency -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-annotations</artifactId>
    </dependency>
    <!-- 'mvn dependency:analyze' fails to detect use of this dependency -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <type>test-jar</type>
      <scope>test</scope>
    </dependency>
    <!-- 'mvn dependency:analyze' fails to detect use of this dependency -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-api</artifactId>
    </dependency>
    <!-- 'mvn dependency:analyze' fails to detect use of this dependency -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-common</artifactId>
    </dependency>
    <!-- 'mvn dependency:analyze' fails to detect use of this dependency -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-client</artifactId>
    </dependency>
    <!-- 'mvn dependency:analyze' fails to detect use of this dependency -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-server-nodemanager</artifactId>
      <scope>test</scope>
    </dependency>
    <!-- 'mvn dependency:analyze' fails to detect use of this dependency -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-server-resourcemanager</artifactId>
      <scope>test</scope>
    </dependency>
    <!-- 'mvn dependency:analyze' fails to detect use of this dependency -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-server-tests</artifactId>
      <type>test-jar</type>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.mockito</groupId>
      <artifactId>mockito-all</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-server-timeline-pluginstorage</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-server-timeline-pluginstorage</artifactId>
      <type>test-jar</type>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-common</artifactId>
      <type>test-jar</type>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs-client</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <scope>test</scope>
      <type>test-jar</type>
    </dependency>
    <!-- Use ${project.version} rather than a hard-coded literal so these
         stay in sync with the module version on release. -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-fs2img</artifactId>
      <version>${project.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-services-api</artifactId>
      <version>${project.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-services-core</artifactId>
      <version>${project.version}</version>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <artifactId>maven-jar-plugin</artifactId>
        <executions>
          <execution>
            <goals>
              <goal>jar</goal>
            </goals>
            <!-- strictly speaking, the unit test is really a regression test. It
                 needs the main jar to be available to be able to run. -->
            <phase>test-compile</phase>
          </execution>
        </executions>
        <configuration>
          <archive>
            <manifest>
              <mainClass>org.apache.hadoop.yarn.submarine.client.cli.Cli</mainClass>
            </manifest>
          </archive>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-surefire-plugin</artifactId>
        <configuration>
          <environmentVariables>
            <JAVA_HOME>${java.home}</JAVA_HOME>
          </environmentVariables>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
|
|
@ -0,0 +1,47 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli;
|
||||||
|
|
||||||
|
import org.apache.commons.cli.ParseException;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.util.Tool;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.exception.SubmarineException;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
public abstract class AbstractCli implements Tool {
|
||||||
|
protected ClientContext clientContext;
|
||||||
|
|
||||||
|
public AbstractCli(ClientContext cliContext) {
|
||||||
|
this.clientContext = cliContext;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public abstract int run(String[] args)
|
||||||
|
throws ParseException, IOException, YarnException, InterruptedException,
|
||||||
|
SubmarineException;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setConf(Configuration conf) {
|
||||||
|
clientContext.setSubmarineConfig(conf);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Configuration getConf() {
|
||||||
|
return clientContext.getSubmarineConfig();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,106 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli;
|
||||||
|
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.fs.DefaultRemoteDirectoryManager;
|
||||||
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.RuntimeFactory;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
public class Cli {
|
||||||
|
private static final Logger LOG =
|
||||||
|
LoggerFactory.getLogger(Cli.class);
|
||||||
|
|
||||||
|
private static void printHelp() {
|
||||||
|
StringBuilder helpMsg = new StringBuilder();
|
||||||
|
helpMsg.append("\n\nUsage: <object> [<action>] [<args>]\n");
|
||||||
|
helpMsg.append(" Below are all objects / actions:\n");
|
||||||
|
helpMsg.append(" job \n");
|
||||||
|
helpMsg.append(" run : run a job, please see 'job run --help' for usage \n");
|
||||||
|
helpMsg.append(" show : get status of job, please see 'job show --help' for usage \n");
|
||||||
|
|
||||||
|
System.out.println(helpMsg.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
private static ClientContext getClientContext() {
|
||||||
|
Configuration conf = new YarnConfiguration();
|
||||||
|
ClientContext clientContext = new ClientContext();
|
||||||
|
clientContext.setConfiguration(conf);
|
||||||
|
clientContext.setRemoteDirectoryManager(
|
||||||
|
new DefaultRemoteDirectoryManager(clientContext));
|
||||||
|
RuntimeFactory runtimeFactory = RuntimeFactory.getRuntimeFactory(
|
||||||
|
clientContext);
|
||||||
|
clientContext.setRuntimeFactory(runtimeFactory);
|
||||||
|
return clientContext;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
System.out.println(" _ _ \n"
|
||||||
|
+ " | | (_) \n"
|
||||||
|
+ " ___ _ _ | |__ _ __ ___ __ _ _ __ _ _ __ ___ \n"
|
||||||
|
+ " / __|| | | || '_ \\ | '_ ` _ \\ / _` || '__|| || '_ \\ / _ \\\n"
|
||||||
|
+ " \\__ \\| |_| || |_) || | | | | || (_| || | | || | | || __/\n"
|
||||||
|
+ " |___/ \\__,_||_.__/ |_| |_| |_| \\__,_||_| |_||_| |_| \\___|\n"
|
||||||
|
+ " \n"
|
||||||
|
+ " ?\n"
|
||||||
|
+ " ~~~~~~~~~~~~~~~~~~~~~~~~~~~|^\"~~~~~~~~~~~~~~~~~~~~~~~~~o~~~~~~~~~~~\n"
|
||||||
|
+ " o | o __o\n"
|
||||||
|
+ " o | o |X__>\n"
|
||||||
|
+ " ___o | __o\n"
|
||||||
|
+ " (X___>-- __|__ |X__> o\n"
|
||||||
|
+ " | \\ __o\n"
|
||||||
|
+ " | \\ |X__>\n"
|
||||||
|
+ " _______________________|_______\\________________\n"
|
||||||
|
+ " < \\____________ _\n"
|
||||||
|
+ " \\ \\ (_)\n"
|
||||||
|
+ " \\ O O O >=)\n"
|
||||||
|
+ " \\__________________________________________________________/ (_)\n"
|
||||||
|
+ "\n");
|
||||||
|
|
||||||
|
if (CliUtils.argsForHelp(args)) {
|
||||||
|
printHelp();
|
||||||
|
System.exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (args.length < 2) {
|
||||||
|
LOG.error("Bad parameters specified.");
|
||||||
|
printHelp();
|
||||||
|
System.exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
String[] moduleArgs = Arrays.copyOfRange(args, 2, args.length);
|
||||||
|
ClientContext clientContext = getClientContext();
|
||||||
|
|
||||||
|
if (args[0].equals("job")) {
|
||||||
|
String subCmd = args[1];
|
||||||
|
if (subCmd.equals(CliConstants.RUN)) {
|
||||||
|
new RunJobCli(clientContext).run(moduleArgs);
|
||||||
|
} else if (subCmd.equals(CliConstants.SHOW)) {
|
||||||
|
new ShowJobCli(clientContext).run(moduleArgs);
|
||||||
|
} else {
|
||||||
|
printHelp();
|
||||||
|
throw new IllegalArgumentException("Unknown option for job");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
printHelp();
|
||||||
|
throw new IllegalArgumentException("Bad parameters <TODO>");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,48 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* NOTE: use lowercase + "_" for the option name
|
||||||
|
*/
|
||||||
|
/*
 * NOTE: use lowercase + "_" for the option name
 */
/** Names of CLI actions and option flags shared by all sub-commands. */
public class CliConstants {
  // Action names.
  public static final String RUN = "run";
  public static final String SERVE = "serve";
  public static final String LIST = "list";
  public static final String SHOW = "show";
  public static final String STOP = "stop";

  // Job identity and data locations.
  public static final String NAME = "name";
  public static final String INPUT_PATH = "input_path";
  public static final String CHECKPOINT_PATH = "checkpoint_path";
  public static final String SAVED_MODEL_PATH = "saved_model_path";

  // Task counts.
  public static final String N_WORKERS = "num_workers";
  public static final String N_SERVING_TASKS = "num_serving_tasks";
  public static final String N_PS = "num_ps";

  // Per-role resources.
  public static final String WORKER_RES = "worker_resources";
  public static final String SERVING_RES = "serving_resources";
  public static final String PS_RES = "ps_resources";

  // Docker images (global plus per-role overrides).
  public static final String DOCKER_IMAGE = "docker_image";
  public static final String PS_DOCKER_IMAGE = "ps_docker_image";
  public static final String WORKER_DOCKER_IMAGE = "worker_docker_image";

  // Per-role launch commands.
  public static final String WORKER_LAUNCH_CMD = "worker_launch_cmd";
  public static final String SERVING_LAUNCH_CMD = "serving_launch_cmd";
  public static final String PS_LAUNCH_CMD = "ps_launch_cmd";

  // Miscellaneous job options.
  public static final String QUEUE = "queue";
  public static final String TENSORBOARD = "tensorboard";
  public static final String ENV = "env";
  public static final String VERBOSE = "verbose";
  public static final String SERVING_FRAMEWORK = "serving_framework";
  public static final String WAIT_JOB_FINISH = "wait_job_finish";
}
|
|
@ -0,0 +1,174 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
* <p>
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* <p>
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli;
|
||||||
|
|
||||||
|
import org.apache.hadoop.yarn.api.records.Resource;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ResourceInformation;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ResourceTypeInfo;
|
||||||
|
import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException;
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
|
||||||
|
import org.apache.hadoop.yarn.util.UnitsConversionUtil;
|
||||||
|
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
public class CliUtils {
|
||||||
|
private final static String RES_PATTERN = "^[^=]+=\\d+\\s?\\w*$";
|
||||||
|
/**
|
||||||
|
* Replace patterns inside cli
|
||||||
|
*
|
||||||
|
* @return launch command after pattern replace
|
||||||
|
*/
|
||||||
|
public static String replacePatternsInLaunchCommand(String specifiedCli,
|
||||||
|
RunJobParameters jobRunParameters,
|
||||||
|
RemoteDirectoryManager directoryManager) throws IOException {
|
||||||
|
String jobDir = jobRunParameters.getCheckpointPath();
|
||||||
|
if (null == jobDir) {
|
||||||
|
jobDir = directoryManager.getJobCheckpointDir(jobRunParameters.getName(),
|
||||||
|
true).toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
String input = jobRunParameters.getInputPath();
|
||||||
|
String savedModelDir = jobRunParameters.getSavedModelPath();
|
||||||
|
if (null == savedModelDir) {
|
||||||
|
savedModelDir = jobDir;
|
||||||
|
}
|
||||||
|
|
||||||
|
Map<String, String> replacePattern = new HashMap<>();
|
||||||
|
if (jobDir != null) {
|
||||||
|
replacePattern.put("%" + CliConstants.CHECKPOINT_PATH + "%", jobDir);
|
||||||
|
}
|
||||||
|
if (input != null) {
|
||||||
|
replacePattern.put("%" + CliConstants.INPUT_PATH + "%", input);
|
||||||
|
}
|
||||||
|
if (savedModelDir != null) {
|
||||||
|
replacePattern.put("%" + CliConstants.SAVED_MODEL_PATH + "%",
|
||||||
|
savedModelDir);
|
||||||
|
}
|
||||||
|
|
||||||
|
String newCli = specifiedCli;
|
||||||
|
for (Map.Entry<String, String> replace : replacePattern.entrySet()) {
|
||||||
|
newCli = newCli.replace(replace.getKey(), replace.getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
return newCli;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO, this duplicated to Client of distributed shell, should cleanup
|
||||||
|
private static Map<String, Long> parseResourcesString(String resourcesStr) {
|
||||||
|
Map<String, Long> resources = new HashMap<>();
|
||||||
|
|
||||||
|
// Ignore the grouping "[]"
|
||||||
|
if (resourcesStr.startsWith("[")) {
|
||||||
|
resourcesStr = resourcesStr.substring(1);
|
||||||
|
}
|
||||||
|
if (resourcesStr.endsWith("]")) {
|
||||||
|
resourcesStr = resourcesStr.substring(0, resourcesStr.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
for (String resource : resourcesStr.trim().split(",")) {
|
||||||
|
resource = resource.trim();
|
||||||
|
if (!resource.matches(RES_PATTERN)) {
|
||||||
|
throw new IllegalArgumentException("\"" + resource + "\" is not a "
|
||||||
|
+ "valid resource type/amount pair. "
|
||||||
|
+ "Please provide key=amount pairs separated by commas.");
|
||||||
|
}
|
||||||
|
String[] splits = resource.split("=");
|
||||||
|
String key = splits[0], value = splits[1];
|
||||||
|
String units = ResourceUtils.getUnits(value);
|
||||||
|
|
||||||
|
String valueWithoutUnit = value.substring(0, value.length() - units.length()).trim();
|
||||||
|
Long resourceValue = Long.valueOf(valueWithoutUnit);
|
||||||
|
|
||||||
|
// Convert commandline unit to standard YARN unit.
|
||||||
|
if (units.equals("M") || units.equals("m")) {
|
||||||
|
units = "Mi";
|
||||||
|
} else if (units.equals("G") || units.equals("g")) {
|
||||||
|
units = "Gi";
|
||||||
|
} else if (units.isEmpty()) {
|
||||||
|
// do nothing;
|
||||||
|
} else{
|
||||||
|
throw new IllegalArgumentException("Acceptable units are M/G or empty");
|
||||||
|
}
|
||||||
|
|
||||||
|
// special handle memory-mb and memory
|
||||||
|
if (key.equals(ResourceInformation.MEMORY_URI)) {
|
||||||
|
if (!units.isEmpty()) {
|
||||||
|
resourceValue = UnitsConversionUtil.convert(units, "Mi",
|
||||||
|
resourceValue);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (key.equals("memory")) {
|
||||||
|
key = ResourceInformation.MEMORY_URI;
|
||||||
|
resourceValue = UnitsConversionUtil.convert(units, "Mi", resourceValue);
|
||||||
|
}
|
||||||
|
|
||||||
|
// special handle gpu
|
||||||
|
if (key.equals("gpu")) {
|
||||||
|
key = ResourceInformation.GPU_URI;
|
||||||
|
}
|
||||||
|
|
||||||
|
// special handle fpga
|
||||||
|
if (key.equals("fpga")) {
|
||||||
|
key = ResourceInformation.FPGA_URI;
|
||||||
|
}
|
||||||
|
|
||||||
|
resources.put(key, resourceValue);
|
||||||
|
}
|
||||||
|
return resources;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void validateResourceTypes(Iterable<String> resourceNames,
|
||||||
|
List<ResourceTypeInfo> resourceTypes) throws IOException, YarnException {
|
||||||
|
for (String resourceName : resourceNames) {
|
||||||
|
if (!resourceTypes.stream().anyMatch(
|
||||||
|
e -> e.getName().equals(resourceName))) {
|
||||||
|
throw new ResourceNotFoundException(
|
||||||
|
"Unknown resource: " + resourceName);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Resource createResourceFromString(String resourceStr,
|
||||||
|
List<ResourceTypeInfo> resourceTypes) throws IOException, YarnException {
|
||||||
|
Map<String, Long> typeToValue = parseResourcesString(resourceStr);
|
||||||
|
validateResourceTypes(typeToValue.keySet(), resourceTypes);
|
||||||
|
Resource resource = Resource.newInstance(0, 0);
|
||||||
|
for (Map.Entry<String, Long> entry : typeToValue.entrySet()) {
|
||||||
|
resource.setResourceValue(entry.getKey(), entry.getValue());
|
||||||
|
}
|
||||||
|
return resource;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Is it for help?
|
||||||
|
public static boolean argsForHelp(String[] args) {
|
||||||
|
if (args == null || args.length == 0)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
if (args.length == 1) {
|
||||||
|
if (args[0].equals("-h") || args[0].equals("--help")) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,204 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
* <p>
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* <p>
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli;
|
||||||
|
|
||||||
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
|
import org.apache.commons.cli.CommandLine;
|
||||||
|
import org.apache.commons.cli.GnuParser;
|
||||||
|
import org.apache.commons.cli.HelpFormatter;
|
||||||
|
import org.apache.commons.cli.Options;
|
||||||
|
import org.apache.commons.cli.ParseException;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.exception.SubmarineException;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.common.JobMonitor;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.common.JobSubmitter;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.common.StorageKeyConstants;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
public class RunJobCli extends AbstractCli {
|
||||||
|
private static final Logger LOG =
|
||||||
|
LoggerFactory.getLogger(RunJobCli.class);
|
||||||
|
|
||||||
|
private Options options;
|
||||||
|
private RunJobParameters parameters = new RunJobParameters();
|
||||||
|
|
||||||
|
private JobSubmitter jobSubmitter;
|
||||||
|
private JobMonitor jobMonitor;
|
||||||
|
|
||||||
|
public RunJobCli(ClientContext cliContext) {
|
||||||
|
this(cliContext, cliContext.getRuntimeFactory().getJobSubmitterInstance(),
|
||||||
|
cliContext.getRuntimeFactory().getJobMonitorInstance());
|
||||||
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
public RunJobCli(ClientContext cliContext, JobSubmitter jobSubmitter,
|
||||||
|
JobMonitor jobMonitor) {
|
||||||
|
super(cliContext);
|
||||||
|
options = generateOptions();
|
||||||
|
this.jobSubmitter = jobSubmitter;
|
||||||
|
this.jobMonitor = jobMonitor;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void printUsages() {
|
||||||
|
new HelpFormatter().printHelp("job run", options);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Options generateOptions() {
|
||||||
|
Options options = new Options();
|
||||||
|
options.addOption(CliConstants.NAME, true, "Name of the job");
|
||||||
|
options.addOption(CliConstants.INPUT_PATH, true,
|
||||||
|
"Input of the job, could be local or other FS directory");
|
||||||
|
options.addOption(CliConstants.CHECKPOINT_PATH, true,
|
||||||
|
"Training output directory of the job, "
|
||||||
|
+ "could be local or other FS directory. This typically includes "
|
||||||
|
+ "checkpoint files and exported model ");
|
||||||
|
options.addOption(CliConstants.SAVED_MODEL_PATH, true,
|
||||||
|
"Model exported path (savedmodel) of the job, which is needed when "
|
||||||
|
+ "exported model is not placed under ${checkpoint_path}"
|
||||||
|
+ "could be local or other FS directory. This will be used to serve.");
|
||||||
|
options.addOption(CliConstants.N_WORKERS, true,
|
||||||
|
"Numnber of worker tasks of the job, by default it's 1");
|
||||||
|
options.addOption(CliConstants.N_PS, true,
|
||||||
|
"Number of PS tasks of the job, by default it's 0");
|
||||||
|
options.addOption(CliConstants.WORKER_RES, true,
|
||||||
|
"Resource of each worker, for example "
|
||||||
|
+ "memory-mb=2048,vcores=2,yarn.io/gpu=2");
|
||||||
|
options.addOption(CliConstants.PS_RES, true,
|
||||||
|
"Resource of each PS, for example "
|
||||||
|
+ "memory-mb=2048,vcores=2,yarn.io/gpu=2");
|
||||||
|
options.addOption(CliConstants.DOCKER_IMAGE, true, "Docker image name/tag");
|
||||||
|
options.addOption(CliConstants.QUEUE, true,
|
||||||
|
"Name of queue to run the job, by default it uses default queue");
|
||||||
|
options.addOption(CliConstants.TENSORBOARD, true,
|
||||||
|
"Should we run TensorBoard" + " for this job? By default it's true");
|
||||||
|
options.addOption(CliConstants.WORKER_LAUNCH_CMD, true,
|
||||||
|
"Commandline of worker, arguments will be "
|
||||||
|
+ "directly used to launch the worker");
|
||||||
|
options.addOption(CliConstants.PS_LAUNCH_CMD, true,
|
||||||
|
"Commandline of worker, arguments will be "
|
||||||
|
+ "directly used to launch the PS");
|
||||||
|
options.addOption(CliConstants.ENV, true,
|
||||||
|
"Common environment variable of worker/ps");
|
||||||
|
options.addOption(CliConstants.VERBOSE, false,
|
||||||
|
"Print verbose log for troubleshooting");
|
||||||
|
options.addOption(CliConstants.WAIT_JOB_FINISH, false,
|
||||||
|
"Specified when user want to wait the job finish");
|
||||||
|
options.addOption(CliConstants.PS_DOCKER_IMAGE, true,
|
||||||
|
"Specify docker image for PS, when this is not specified, PS uses --"
|
||||||
|
+ CliConstants.DOCKER_IMAGE + " as default.");
|
||||||
|
options.addOption(CliConstants.WORKER_DOCKER_IMAGE, true,
|
||||||
|
"Specify docker image for WORKER, when this is not specified, WORKER "
|
||||||
|
+ "uses --" + CliConstants.DOCKER_IMAGE + " as default.");
|
||||||
|
options.addOption("h", "help", false, "Print help");
|
||||||
|
return options;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void replacePatternsInParameters() throws IOException {
|
||||||
|
if (parameters.getPSLaunchCmd() != null && !parameters.getPSLaunchCmd()
|
||||||
|
.isEmpty()) {
|
||||||
|
String afterReplace = CliUtils.replacePatternsInLaunchCommand(
|
||||||
|
parameters.getPSLaunchCmd(), parameters,
|
||||||
|
clientContext.getRemoteDirectoryManager());
|
||||||
|
parameters.setPSLaunchCmd(afterReplace);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (parameters.getWorkerLaunchCmd() != null && !parameters
|
||||||
|
.getWorkerLaunchCmd().isEmpty()) {
|
||||||
|
String afterReplace = CliUtils.replacePatternsInLaunchCommand(
|
||||||
|
parameters.getWorkerLaunchCmd(), parameters,
|
||||||
|
clientContext.getRemoteDirectoryManager());
|
||||||
|
parameters.setWorkerLaunchCmd(afterReplace);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void parseCommandLineAndGetRunJobParameters(String[] args)
|
||||||
|
throws ParseException, IOException, YarnException {
|
||||||
|
try {
|
||||||
|
// Do parsing
|
||||||
|
GnuParser parser = new GnuParser();
|
||||||
|
CommandLine cli = parser.parse(options, args);
|
||||||
|
parameters.updateParametersByParsedCommandline(cli, options, clientContext);
|
||||||
|
} catch (ParseException e) {
|
||||||
|
LOG.error("Exception in parse:", e.getMessage());
|
||||||
|
printUsages();
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
|
||||||
|
// replace patterns
|
||||||
|
replacePatternsInParameters();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void storeJobInformation(String jobName, ApplicationId applicationId,
|
||||||
|
String[] args) throws IOException {
|
||||||
|
Map<String, String> jobInfo = new HashMap<>();
|
||||||
|
jobInfo.put(StorageKeyConstants.JOB_NAME, jobName);
|
||||||
|
jobInfo.put(StorageKeyConstants.APPLICATION_ID, applicationId.toString());
|
||||||
|
|
||||||
|
if (parameters.getCheckpointPath() != null) {
|
||||||
|
jobInfo.put(StorageKeyConstants.CHECKPOINT_PATH,
|
||||||
|
parameters.getCheckpointPath());
|
||||||
|
}
|
||||||
|
if (parameters.getInputPath() != null) {
|
||||||
|
jobInfo.put(StorageKeyConstants.INPUT_PATH,
|
||||||
|
parameters.getInputPath());
|
||||||
|
}
|
||||||
|
if (parameters.getSavedModelPath() != null) {
|
||||||
|
jobInfo.put(StorageKeyConstants.SAVED_MODEL_PATH,
|
||||||
|
parameters.getSavedModelPath());
|
||||||
|
}
|
||||||
|
|
||||||
|
String joinedArgs = String.join(" ", args);
|
||||||
|
jobInfo.put(StorageKeyConstants.JOB_RUN_ARGS, joinedArgs);
|
||||||
|
clientContext.getRuntimeFactory().getSubmarineStorage().addNewJob(jobName,
|
||||||
|
jobInfo);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int run(String[] args)
|
||||||
|
throws ParseException, IOException, YarnException, InterruptedException,
|
||||||
|
SubmarineException {
|
||||||
|
if (CliUtils.argsForHelp(args)) {
|
||||||
|
printUsages();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
parseCommandLineAndGetRunJobParameters(args);
|
||||||
|
ApplicationId applicationId = this.jobSubmitter.submitJob(parameters);
|
||||||
|
storeJobInformation(parameters.getName(), applicationId, args);
|
||||||
|
if (parameters.isWaitJobFinish()) {
|
||||||
|
this.jobMonitor.waitTrainingFinal(parameters.getName());
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
public JobSubmitter getJobSubmitter() {
|
||||||
|
return jobSubmitter;
|
||||||
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
RunJobParameters getRunJobParameters() {
|
||||||
|
return parameters;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,125 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
* <p>
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* <p>
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli;
|
||||||
|
|
||||||
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
|
import org.apache.commons.cli.CommandLine;
|
||||||
|
import org.apache.commons.cli.GnuParser;
|
||||||
|
import org.apache.commons.cli.HelpFormatter;
|
||||||
|
import org.apache.commons.cli.Options;
|
||||||
|
import org.apache.commons.cli.ParseException;
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.ShowJobParameters;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.exception.SubmarineException;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.common.StorageKeyConstants;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.common.SubmarineStorage;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
public class ShowJobCli extends AbstractCli {
|
||||||
|
private static final Logger LOG = LoggerFactory.getLogger(ShowJobCli.class);
|
||||||
|
|
||||||
|
private Options options;
|
||||||
|
private ShowJobParameters parameters = new ShowJobParameters();
|
||||||
|
|
||||||
|
public ShowJobCli(ClientContext cliContext) {
|
||||||
|
super(cliContext);
|
||||||
|
options = generateOptions();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void printUsages() {
|
||||||
|
new HelpFormatter().printHelp("job show", options);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Options generateOptions() {
|
||||||
|
Options options = new Options();
|
||||||
|
options.addOption(CliConstants.NAME, true, "Name of the job");
|
||||||
|
options.addOption("h", "help", false, "Print help");
|
||||||
|
return options;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void parseCommandLineAndGetShowJobParameters(String[] args)
|
||||||
|
throws IOException, YarnException {
|
||||||
|
// Do parsing
|
||||||
|
GnuParser parser = new GnuParser();
|
||||||
|
CommandLine cli;
|
||||||
|
try {
|
||||||
|
cli = parser.parse(options, args);
|
||||||
|
parameters.updateParametersByParsedCommandline(cli, options,
|
||||||
|
clientContext);
|
||||||
|
} catch (ParseException e) {
|
||||||
|
printUsages();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void printIfNotNull(String keyForPrint, String keyInStorage,
|
||||||
|
Map<String, String> jobInfo) {
|
||||||
|
if (jobInfo.containsKey(keyInStorage)) {
|
||||||
|
System.out.println("\t" + keyForPrint + ": " + jobInfo.get(keyInStorage));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void printJobInfo(Map<String, String> jobInfo) {
|
||||||
|
System.out.println("Job Meta Info:");
|
||||||
|
printIfNotNull("Application Id", StorageKeyConstants.APPLICATION_ID,
|
||||||
|
jobInfo);
|
||||||
|
printIfNotNull("Input Path", StorageKeyConstants.INPUT_PATH, jobInfo);
|
||||||
|
printIfNotNull("Saved Model Path", StorageKeyConstants.SAVED_MODEL_PATH,
|
||||||
|
jobInfo);
|
||||||
|
printIfNotNull("Checkpoint Path", StorageKeyConstants.CHECKPOINT_PATH,
|
||||||
|
jobInfo);
|
||||||
|
printIfNotNull("Run Parameters", StorageKeyConstants.JOB_RUN_ARGS,
|
||||||
|
jobInfo);
|
||||||
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
protected void getAndPrintJobInfo() throws IOException {
|
||||||
|
SubmarineStorage storage =
|
||||||
|
clientContext.getRuntimeFactory().getSubmarineStorage();
|
||||||
|
|
||||||
|
Map<String, String> jobInfo = null;
|
||||||
|
try {
|
||||||
|
jobInfo = storage.getJobInfoByName(parameters.getName());
|
||||||
|
} catch (IOException e) {
|
||||||
|
LOG.error("Failed to retrieve job info", e);
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
|
||||||
|
printJobInfo(jobInfo);
|
||||||
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
public ShowJobParameters getParameters() {
|
||||||
|
return parameters;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int run(String[] args)
|
||||||
|
throws ParseException, IOException, YarnException, InterruptedException,
|
||||||
|
SubmarineException {
|
||||||
|
if (CliUtils.argsForHelp(args)) {
|
||||||
|
printUsages();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
parseCommandLineAndGetShowJobParameters(args);
|
||||||
|
getAndPrintJobInfo();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,56 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli.param;
|
||||||
|
|
||||||
|
import org.apache.commons.cli.CommandLine;
|
||||||
|
import org.apache.commons.cli.Options;
|
||||||
|
import org.apache.commons.cli.ParseException;
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.CliConstants;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Base class of all parameters.
|
||||||
|
*/
|
||||||
|
public abstract class BaseParameters {
|
||||||
|
private String name;
|
||||||
|
|
||||||
|
public void updateParametersByParsedCommandline(CommandLine parsedCommandLine,
|
||||||
|
Options options, ClientContext clientContext)
|
||||||
|
throws ParseException, IOException, YarnException {
|
||||||
|
String name = parsedCommandLine.getOptionValue(CliConstants.NAME);
|
||||||
|
if (name == null) {
|
||||||
|
throw new ParseException("--name is absent");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (parsedCommandLine.hasOption(CliConstants.VERBOSE)) {
|
||||||
|
SubmarineLogs.verboseOn();
|
||||||
|
}
|
||||||
|
|
||||||
|
this.setName(name);
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getName() {
|
||||||
|
return name;
|
||||||
|
}
|
||||||
|
|
||||||
|
public BaseParameters setName(String name) {
|
||||||
|
this.name = name;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,222 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli.param;
|
||||||
|
|
||||||
|
import org.apache.commons.cli.CommandLine;
|
||||||
|
import org.apache.commons.cli.Options;
|
||||||
|
import org.apache.commons.cli.ParseException;
|
||||||
|
import org.apache.hadoop.yarn.api.records.Resource;
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.CliConstants;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.CliUtils;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parameters used to run a job
|
||||||
|
*/
|
||||||
|
public class RunJobParameters extends RunParameters {
  // Input data directory (--input_path); may be null.
  private String input;
  // Training output / checkpoint directory (--checkpoint_path); may be null.
  private String checkpointPath;

  // Task counts; defaults applied during parsing: 1 worker, 0 PS.
  private int numWorkers;
  private int numPS;
  // Per-task resources; psResource stays null when no PS tasks are requested.
  private Resource workerResource;
  private Resource psResource;
  // Whether TensorBoard should run for this job; defaults to false.
  private boolean tensorboardEnabled;
  // Launch command lines for worker and PS containers; may be null.
  private String workerLaunchCmd;
  private String psLaunchCmd;

  // Per-role docker image overrides; null means fall back to --docker_image.
  private String psDockerImage = null;
  private String workerDockerImage = null;

  // True when --wait_job_finish was given.
  private boolean waitJobFinish = false;
  // True when the worker/PS counts imply distributed training (>=2 workers
  // and >0 PS); derived during parsing, not directly settable.
  private boolean distributed = false;

  /**
   * Parses all "job run"-specific options, validates the worker/PS
   * combination, resolves resource strings into Resource objects, then
   * delegates to the superclasses for the shared options.
   *
   * @throws ParseException on a missing/invalid option combination
   */
  @Override
  public void updateParametersByParsedCommandline(CommandLine parsedCommandLine,
      Options options, ClientContext clientContext)
      throws ParseException, IOException, YarnException {

    String input = parsedCommandLine.getOptionValue(CliConstants.INPUT_PATH);
    String jobDir = parsedCommandLine.getOptionValue(CliConstants.CHECKPOINT_PATH);
    // Default worker count is 1 when --num_workers is absent.
    int nWorkers = 1;
    if (parsedCommandLine.getOptionValue(CliConstants.N_WORKERS) != null) {
      nWorkers = Integer.parseInt(
          parsedCommandLine.getOptionValue(CliConstants.N_WORKERS));
    }

    // Default PS count is 0 when --num_ps is absent.
    int nPS = 0;
    if (parsedCommandLine.getOptionValue(CliConstants.N_PS) != null) {
      nPS = Integer.parseInt(
          parsedCommandLine.getOptionValue(CliConstants.N_PS));
    }

    // Check #workers and #ps.
    // When distributed training is required
    if (nWorkers >= 2 && nPS > 0) {
      distributed = true;
    } else if (nWorkers == 1 && nPS > 0) {
      // A single worker with PS tasks is treated as a user mistake.
      throw new ParseException("Only specified one worker but non-zero PS, "
          + "please double check.");
    }

    // --worker_resources is mandatory; parse it against the cluster's
    // registered resource types.
    String workerResourceStr = parsedCommandLine.getOptionValue(
        CliConstants.WORKER_RES);
    if (workerResourceStr == null) {
      throw new ParseException("--" + CliConstants.WORKER_RES + " is absent.");
    }
    Resource workerResource = CliUtils.createResourceFromString(
        workerResourceStr,
        clientContext.getOrCreateYarnClient().getResourceTypeInfo());

    // --ps_resources is only required (and only parsed) when PS tasks exist.
    Resource psResource = null;
    if (nPS > 0) {
      String psResourceStr = parsedCommandLine.getOptionValue(CliConstants.PS_RES);
      if (psResourceStr == null) {
        throw new ParseException("--" + CliConstants.PS_RES + " is absent.");
      }
      psResource = CliUtils.createResourceFromString(psResourceStr,
          clientContext.getOrCreateYarnClient().getResourceTypeInfo());
    }

    // TensorBoard is off unless --tensorboard parses to true.
    boolean tensorboard = false;
    if (parsedCommandLine.getOptionValue(CliConstants.TENSORBOARD) != null) {
      tensorboard = Boolean.parseBoolean(
          parsedCommandLine.getOptionValue(CliConstants.TENSORBOARD));
    }

    if (parsedCommandLine.hasOption(CliConstants.WAIT_JOB_FINISH)) {
      this.waitJobFinish = true;
    }

    // Per-role docker overrides; null is a valid value (use the common image).
    psDockerImage = parsedCommandLine.getOptionValue(
        CliConstants.PS_DOCKER_IMAGE);
    workerDockerImage = parsedCommandLine.getOptionValue(
        CliConstants.WORKER_DOCKER_IMAGE);

    String workerLaunchCmd = parsedCommandLine.getOptionValue(
        CliConstants.WORKER_LAUNCH_CMD);
    String psLaunchCommand = parsedCommandLine.getOptionValue(
        CliConstants.PS_LAUNCH_CMD);

    this.setInputPath(input).setCheckpointPath(jobDir).setNumPS(nPS).setNumWorkers(nWorkers)
        .setPSLaunchCmd(psLaunchCommand).setWorkerLaunchCmd(workerLaunchCmd)
        .setPsResource(psResource).setWorkerResource(workerResource)
        .setTensorboardEnabled(tensorboard);

    // Let RunParameters/BaseParameters consume the shared options
    // (saved model path, envs, queue, docker image, name, verbose).
    super.updateParametersByParsedCommandline(parsedCommandLine,
        options, clientContext);
  }

  public String getInputPath() {
    return input;
  }

  public RunJobParameters setInputPath(String input) {
    this.input = input;
    return this;
  }

  public String getCheckpointPath() {
    return checkpointPath;
  }

  public RunJobParameters setCheckpointPath(String checkpointPath) {
    this.checkpointPath = checkpointPath;
    return this;
  }

  public int getNumWorkers() {
    return numWorkers;
  }

  public RunJobParameters setNumWorkers(int numWorkers) {
    this.numWorkers = numWorkers;
    return this;
  }

  public int getNumPS() {
    return numPS;
  }

  public RunJobParameters setNumPS(int numPS) {
    this.numPS = numPS;
    return this;
  }

  public Resource getWorkerResource() {
    return workerResource;
  }

  public RunJobParameters setWorkerResource(Resource workerResource) {
    this.workerResource = workerResource;
    return this;
  }

  public Resource getPsResource() {
    return psResource;
  }

  public RunJobParameters setPsResource(Resource psResource) {
    this.psResource = psResource;
    return this;
  }

  public boolean isTensorboardEnabled() {
    return tensorboardEnabled;
  }

  public RunJobParameters setTensorboardEnabled(boolean tensorboardEnabled) {
    this.tensorboardEnabled = tensorboardEnabled;
    return this;
  }

  public String getWorkerLaunchCmd() {
    return workerLaunchCmd;
  }

  public RunJobParameters setWorkerLaunchCmd(String workerLaunchCmd) {
    this.workerLaunchCmd = workerLaunchCmd;
    return this;
  }

  public String getPSLaunchCmd() {
    return psLaunchCmd;
  }

  public RunJobParameters setPSLaunchCmd(String psLaunchCmd) {
    this.psLaunchCmd = psLaunchCmd;
    return this;
  }

  public boolean isWaitJobFinish() {
    return waitJobFinish;
  }


  public String getPsDockerImage() {
    return psDockerImage;
  }

  public String getWorkerDockerImage() {
    return workerDockerImage;
  }

  // Read-only: derived from worker/PS counts during parsing.
  public boolean isDistributed() {
    return distributed;
  }
}
|
|
@ -0,0 +1,103 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli.param;
|
||||||
|
|
||||||
|
import org.apache.commons.cli.CommandLine;
|
||||||
|
import org.apache.commons.cli.Options;
|
||||||
|
import org.apache.commons.cli.ParseException;
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.CliConstants;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parameters required to run anything on cluster. Such as run job / serve model
|
||||||
|
*/
|
||||||
|
public abstract class RunParameters extends BaseParameters {
|
||||||
|
private String savedModelPath;
|
||||||
|
private String dockerImageName;
|
||||||
|
private List<String> envars = new ArrayList<>();
|
||||||
|
private String queue;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void updateParametersByParsedCommandline(CommandLine parsedCommandLine,
|
||||||
|
Options options, ClientContext clientContext) throws ParseException,
|
||||||
|
IOException, YarnException {
|
||||||
|
String savedModelPath = parsedCommandLine.getOptionValue(
|
||||||
|
CliConstants.SAVED_MODEL_PATH);
|
||||||
|
this.setSavedModelPath(savedModelPath);
|
||||||
|
|
||||||
|
// Envars
|
||||||
|
List<String> envarsList = new ArrayList<>();
|
||||||
|
String[] envars = parsedCommandLine.getOptionValues(CliConstants.ENV);
|
||||||
|
if (envars != null) {
|
||||||
|
for (String envar : envars) {
|
||||||
|
envarsList.add(envar);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
this.setEnvars(envarsList);
|
||||||
|
|
||||||
|
String queue = parsedCommandLine.getOptionValue(
|
||||||
|
CliConstants.QUEUE);
|
||||||
|
this.setQueue(queue);
|
||||||
|
|
||||||
|
String dockerImage = parsedCommandLine.getOptionValue(
|
||||||
|
CliConstants.DOCKER_IMAGE);
|
||||||
|
this.setDockerImageName(dockerImage);
|
||||||
|
|
||||||
|
super.updateParametersByParsedCommandline(parsedCommandLine,
|
||||||
|
options, clientContext);
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getQueue() {
|
||||||
|
return queue;
|
||||||
|
}
|
||||||
|
|
||||||
|
public RunParameters setQueue(String queue) {
|
||||||
|
this.queue = queue;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getDockerImageName() {
|
||||||
|
return dockerImageName;
|
||||||
|
}
|
||||||
|
|
||||||
|
public RunParameters setDockerImageName(String dockerImageName) {
|
||||||
|
this.dockerImageName = dockerImageName;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<String> getEnvars() {
|
||||||
|
return envars;
|
||||||
|
}
|
||||||
|
|
||||||
|
public RunParameters setEnvars(List<String> envars) {
|
||||||
|
this.envars = envars;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getSavedModelPath() {
|
||||||
|
return savedModelPath;
|
||||||
|
}
|
||||||
|
|
||||||
|
public RunParameters setSavedModelPath(String savedModelPath) {
|
||||||
|
this.savedModelPath = savedModelPath;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,18 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli.param;
|
||||||
|
|
||||||
|
/**
 * Parameters for the "job show" command. Needs nothing beyond the job name
 * and verbose flag handled by {@link BaseParameters}, hence the empty body.
 */
public class ShowJobParameters extends BaseParameters {
}
|
|
@ -0,0 +1,77 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.common;
|
||||||
|
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.yarn.client.api.YarnClient;
|
||||||
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineConfiguration;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.RuntimeFactory;
|
||||||
|
|
||||||
|
/**
 * Shared context object passed through the submarine CLI: holds YARN and
 * submarine configuration, a lazily-created YarnClient, the remote directory
 * manager, and the runtime factory.
 */
public class ClientContext {
  // YARN configuration; replaceable via setConfiguration().
  private Configuration yarnConf = new YarnConfiguration();

  private RemoteDirectoryManager remoteDirectoryManager;
  // Lazily created by getOrCreateYarnClient(); guarded by that method's lock.
  private YarnClient yarnClient;
  private Configuration submarineConfig;
  private RuntimeFactory runtimeFactory;

  public ClientContext() {
    submarineConfig = new SubmarineConfiguration();
  }

  /**
   * Returns the shared YarnClient, creating/initializing/starting it on
   * first use. Synchronized so concurrent callers get a single instance.
   * NOTE(review): nothing here stops the client; presumably the process
   * exits after the CLI command — confirm lifecycle with callers.
   */
  public synchronized YarnClient getOrCreateYarnClient() {
    if (yarnClient == null) {
      yarnClient = YarnClient.createYarnClient();
      yarnClient.init(yarnConf);
      yarnClient.start();
    }
    return yarnClient;
  }

  public Configuration getYarnConfig() {
    return yarnConf;
  }

  // Replaces the YARN configuration. Has no effect on an already-created
  // yarnClient, so call this before getOrCreateYarnClient().
  public void setConfiguration(Configuration conf) {
    this.yarnConf = conf;
  }

  public RemoteDirectoryManager getRemoteDirectoryManager() {
    return remoteDirectoryManager;
  }

  public void setRemoteDirectoryManager(
      RemoteDirectoryManager remoteDirectoryManager) {
    this.remoteDirectoryManager = remoteDirectoryManager;
  }

  public Configuration getSubmarineConfig() {
    return submarineConfig;
  }

  public void setSubmarineConfig(Configuration submarineConfig) {
    this.submarineConfig = submarineConfig;
  }

  public RuntimeFactory getRuntimeFactory() {
    return runtimeFactory;
  }

  public void setRuntimeFactory(RuntimeFactory runtimeFactory) {
    this.runtimeFactory = runtimeFactory;
  }
}
|
|
@ -0,0 +1,27 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.common;
|
||||||
|
|
||||||
|
/**
 * Names of environment variables used by submarine when launching task
 * containers.
 */
public class Envs {
  // Identifies the TF task role of a container (e.g. worker/ps).
  public static final String TASK_TYPE_ENV = "_TASK_TYPE";
  // Identifies the task's index within its role.
  public static final String TASK_INDEX_ENV = "_TASK_INDEX";

  /*
   * HDFS/HADOOP-related configs
   */
  public static final String HADOOP_HDFS_HOME = "HADOOP_HDFS_HOME";
  public static final String JAVA_HOME = "JAVA_HOME";
  public static final String HADOOP_CONF_DIR = "HADOOP_CONF_DIR";
}
|
|
@ -0,0 +1,73 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.common.api;
|
||||||
|
|
||||||
|
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||||
|
import org.apache.hadoop.yarn.service.api.records.Container;
|
||||||
|
import org.apache.hadoop.yarn.service.api.records.ContainerState;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Status of component of training job
|
||||||
|
*/
|
||||||
|
/**
 * Per-component status of a training job: how many containers were asked
 * for, and how many are ready or running-but-not-yet-ready.
 */
public class JobComponentStatus {
  private String name;
  private long readyContainers;
  private long runningButUnreadyContainers;
  private long askedContainers;

  /**
   * @param compName component name
   * @param nReadyContainers number of containers in READY state
   * @param nRunningButUnreadyContainers number running but not yet ready
   * @param totalAskedContainers number of containers requested
   */
  public JobComponentStatus(String compName, long nReadyContainers,
      long nRunningButUnreadyContainers, long totalAskedContainers) {
    name = compName;
    readyContainers = nReadyContainers;
    runningButUnreadyContainers = nRunningButUnreadyContainers;
    askedContainers = totalAskedContainers;
  }

  public String getCompName() {
    return name;
  }

  public void setCompName(String compName) {
    name = compName;
  }

  public long getNumReadyContainers() {
    return readyContainers;
  }

  public void setNumReadyContainers(long numReadyContainers) {
    readyContainers = numReadyContainers;
  }

  public long getNumRunningButUnreadyContainers() {
    return runningButUnreadyContainers;
  }

  public void setNumRunningButUnreadyContainers(
      long numRunningButUnreadyContainers) {
    runningButUnreadyContainers = numRunningButUnreadyContainers;
  }

  public long getTotalAskedContainers() {
    return askedContainers;
  }

  public void setTotalAskedContainers(long totalAskedContainers) {
    askedContainers = totalAskedContainers;
  }
}
|
|
@ -0,0 +1,52 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.common.api;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* State of training job
|
||||||
|
*/
|
||||||
|
/**
 * Coarse-grained state of a training job.
 */
public enum JobState {
  /** Accepted by the scheduler and currently running. */
  RUNNING,

  /** Terminated on user request. */
  KILLED,

  /** Terminated with an error. */
  FAILED,

  /** Completed successfully. */
  SUCCEEDED,

  /** Suspended by the user. */
  PAUSED;

  /**
   * @param state state to test (null-safe: null is treated as non-final)
   * @return true when no further state transition is possible
   */
  public static boolean isFinal(JobState state) {
    boolean terminal =
        state == SUCCEEDED || state == FAILED || state == KILLED;
    return terminal;
  }
}
|
|
@ -0,0 +1,87 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.common.api;
|
||||||
|
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.time.Instant;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Status of training job.
|
||||||
|
*/
|
||||||
|
public class JobStatus {
|
||||||
|
|
||||||
|
protected String jobName;
|
||||||
|
protected JobState state;
|
||||||
|
protected String tensorboardLink = "N/A";
|
||||||
|
protected List<JobComponentStatus> componentStatus;
|
||||||
|
|
||||||
|
public void nicePrint(PrintStream out) {
|
||||||
|
out.println(
|
||||||
|
"Job Name=" + this.jobName + ", status=" + state.name() + " time="
|
||||||
|
+ Instant.now());
|
||||||
|
if (JobState.isFinal(this.state)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tensorboardLink.startsWith("http")) {
|
||||||
|
out.println(" Tensorboard link: " + tensorboardLink);
|
||||||
|
}
|
||||||
|
|
||||||
|
out.println(" Components:");
|
||||||
|
for (JobComponentStatus comp : componentStatus) {
|
||||||
|
out.println(" [" + comp.getCompName() + "] Ready=" + comp
|
||||||
|
.getNumReadyContainers() + " + Running-But-Non-Ready=" + comp
|
||||||
|
.getNumRunningButUnreadyContainers() + " | Asked=" + comp
|
||||||
|
.getTotalAskedContainers());
|
||||||
|
}
|
||||||
|
out.println("------------------");
|
||||||
|
}
|
||||||
|
|
||||||
|
public JobState getState() {
|
||||||
|
return state;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getTensorboardLink() {
|
||||||
|
return tensorboardLink;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<JobComponentStatus> getComponentStatus() {
|
||||||
|
return componentStatus;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getJobName() {
|
||||||
|
return jobName;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setJobName(String jobName) {
|
||||||
|
this.jobName = jobName;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setState(JobState state) {
|
||||||
|
this.state = state;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setTensorboardLink(String tensorboardLink) {
|
||||||
|
this.tensorboardLink = tensorboardLink;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setComponentStatus(List<JobComponentStatus> componentStatus) {
|
||||||
|
this.componentStatus = componentStatus;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,32 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.common.api;
|
||||||
|
|
||||||
|
/**
 * Type of a task in a training job; each type maps to the name of the
 * YARN service component that backs it.
 */
public enum TaskType {
  PRIMARY_WORKER("master"),
  WORKER("worker"),
  PS("ps"),
  TENSORBOARD("tensorboard");

  // final: enum constants are shared singletons and must stay immutable.
  private final String compName;

  TaskType(String compName) {
    this.compName = compName;
  }

  /** @return the YARN service component name for this task type. */
  public String getComponentName() {
    return compName;
  }
}
|
|
@ -0,0 +1,44 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.common.api.builder;
|
||||||
|
|
||||||
|
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||||
|
import org.apache.hadoop.yarn.service.api.records.Container;
|
||||||
|
import org.apache.hadoop.yarn.service.api.records.ContainerState;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.api.JobComponentStatus;
|
||||||
|
|
||||||
|
public class JobComponentStatusBuilder {
|
||||||
|
public static JobComponentStatus fromServiceComponent(Component component) {
|
||||||
|
long totalAskedContainers = component.getNumberOfContainers();
|
||||||
|
int numReadyContainers = 0;
|
||||||
|
int numRunningButUnreadyContainers = 0;
|
||||||
|
String compName = component.getName();
|
||||||
|
|
||||||
|
for (Container c : component.getContainers()) {
|
||||||
|
if (c.getState() == ContainerState.READY) {
|
||||||
|
numReadyContainers++;
|
||||||
|
} else if (c.getState() == ContainerState.RUNNING_BUT_UNREADY) {
|
||||||
|
numRunningButUnreadyContainers++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return new JobComponentStatus(compName, numReadyContainers,
|
||||||
|
numRunningButUnreadyContainers, totalAskedContainers);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,64 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.common.api.builder;
|
||||||
|
|
||||||
|
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||||
|
import org.apache.hadoop.yarn.service.api.records.Service;
|
||||||
|
import org.apache.hadoop.yarn.service.api.records.ServiceState;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.api.JobComponentStatus;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.api.JobState;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.api.JobStatus;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public class JobStatusBuilder {
|
||||||
|
public static JobStatus fromServiceSpec(Service serviceSpec) {
|
||||||
|
JobStatus status = new JobStatus();
|
||||||
|
status.setState(fromServiceState(serviceSpec.getState()));
|
||||||
|
|
||||||
|
// If it is a final state, return.
|
||||||
|
if (JobState.isFinal(status.getState())) {
|
||||||
|
return status;
|
||||||
|
}
|
||||||
|
|
||||||
|
List<JobComponentStatus> componentStatusList = new ArrayList<>();
|
||||||
|
|
||||||
|
for (Component component : serviceSpec.getComponents()) {
|
||||||
|
componentStatusList.add(
|
||||||
|
JobComponentStatusBuilder.fromServiceComponent(component));
|
||||||
|
}
|
||||||
|
status.setComponentStatus(componentStatusList);
|
||||||
|
|
||||||
|
// TODO, handle tensorboard differently.
|
||||||
|
// status.setTensorboardLink(getTensorboardLink(serviceSpec, clientContext));
|
||||||
|
|
||||||
|
status.setJobName(serviceSpec.getName());
|
||||||
|
|
||||||
|
return status;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static JobState fromServiceState(ServiceState serviceState) {
|
||||||
|
switch (serviceState) {
|
||||||
|
case STOPPED:
|
||||||
|
// TODO, once YARN-8488 gets committed, we need to update this.
|
||||||
|
return JobState.SUCCEEDED;
|
||||||
|
case FAILED:
|
||||||
|
return JobState.FAILED;
|
||||||
|
}
|
||||||
|
|
||||||
|
return JobState.RUNNING;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,51 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.common.conf;
|
||||||
|
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
|
||||||
|
/**
 * Submarine-specific configuration. Optionally layers the classpath
 * resource submarine.xml on top of a base Hadoop {@link Configuration}.
 */
public class SubmarineConfiguration extends Configuration {
  private static final String SUBMARINE_CONFIGURATION_FILE = "submarine.xml";

  /** Starts from an empty configuration and loads submarine.xml. */
  public SubmarineConfiguration() {
    this(new Configuration(false), true);
  }

  /** Copies the given configuration without loading submarine.xml. */
  public SubmarineConfiguration(Configuration configuration) {
    this(configuration, false);
  }

  /**
   * @param configuration base configuration to copy
   * @param loadLocalConfig whether to also load submarine.xml as a resource
   */
  public SubmarineConfiguration(Configuration configuration,
      boolean loadLocalConfig) {
    super(configuration);
    if (loadLocalConfig) {
      addResource(SUBMARINE_CONFIGURATION_FILE);
    }
  }

  /*
   * Runtime of submarine
   */

  private static final String PREFIX = "submarine.";

  // Fully-qualified class name of the RuntimeFactory implementation.
  public static final String RUNTIME_CLASS = PREFIX + "runtime.class";
  public static final String DEFAULT_RUNTIME_CLASS =
      "org.apache.hadoop.yarn.submarine.runtimes.yarnservice.YarnServiceRuntimeFactory";

  public void setSubmarineRuntimeClass(String runtimeClass) {
    set(RUNTIME_CLASS, runtimeClass);
  }
}
|
|
@ -0,0 +1,31 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.common.conf;
|
||||||
|
|
||||||
|
/**
 * Process-wide verbosity flag for submarine tooling.
 */
public class SubmarineLogs {
  // volatile: the flag may be flipped and read from different threads.
  private static volatile boolean verbose = false;

  /** @return whether verbose logging is currently enabled. */
  public static boolean isVerbose() {
    return verbose;
  }

  /** Enables verbose logging. */
  public static void verboseOn() {
    verbose = true;
  }

  /** Disables verbose logging. */
  public static void verboseOff() {
    verbose = false;
  }
}
|
|
@ -0,0 +1,21 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.common.exception;
|
||||||
|
|
||||||
|
/**
 * Checked exception for submarine client errors the caller can handle.
 */
public class SubmarineException extends Exception {
  // Serializable classes should pin their serialized form explicitly.
  private static final long serialVersionUID = 1L;

  public SubmarineException(String msg) {
    super(msg);
  }

  /**
   * Added for parity with SubmarineRuntimeException so root causes are
   * not lost when wrapping.
   */
  public SubmarineException(String msg, Throwable cause) {
    super(msg, cause);
  }
}
|
|
@ -0,0 +1,25 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
* <p>
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* <p>
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.common.exception;
|
||||||
|
|
||||||
|
/**
 * Unchecked exception for unrecoverable submarine errors.
 */
public class SubmarineRuntimeException extends RuntimeException {
  // Serializable classes should pin their serialized form explicitly.
  private static final long serialVersionUID = 1L;

  public SubmarineRuntimeException(String s) {
    super(s);
  }

  public SubmarineRuntimeException(String message, Throwable cause) {
    super(message, cause);
  }
}
|
|
@ -0,0 +1,84 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.common.fs;
|
||||||
|
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.CliConstants;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Manages remote directories for staging, log, etc.
|
||||||
|
* TODO, need to properly handle permission / name validation, etc.
|
||||||
|
*/
|
||||||
|
public class DefaultRemoteDirectoryManager implements RemoteDirectoryManager {
|
||||||
|
FileSystem fs;
|
||||||
|
|
||||||
|
public DefaultRemoteDirectoryManager(ClientContext context) {
|
||||||
|
try {
|
||||||
|
this.fs = FileSystem.get(context.getYarnConfig());
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Path getJobStagingArea(String jobName, boolean create) throws IOException {
|
||||||
|
Path staging = new Path(getJobRootFolder(jobName), "staging");
|
||||||
|
if (create) {
|
||||||
|
createFolderIfNotExist(staging);
|
||||||
|
}
|
||||||
|
return staging;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Path getJobCheckpointDir(String jobName, boolean create)
|
||||||
|
throws IOException {
|
||||||
|
Path jobDir = new Path(getJobStagingArea(jobName, create),
|
||||||
|
CliConstants.CHECKPOINT_PATH);
|
||||||
|
if (create) {
|
||||||
|
createFolderIfNotExist(jobDir);
|
||||||
|
}
|
||||||
|
return jobDir;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Path getModelDir(String modelName, boolean create) throws IOException {
|
||||||
|
Path modelDir = new Path(new Path("submarine", "models"), modelName);
|
||||||
|
if (create) {
|
||||||
|
createFolderIfNotExist(modelDir);
|
||||||
|
}
|
||||||
|
return modelDir;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public FileSystem getFileSystem() {
|
||||||
|
return fs;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Path getJobRootFolder(String jobName) throws IOException {
|
||||||
|
return new Path(new Path("submarine", "jobs"), jobName);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void createFolderIfNotExist(Path path) throws IOException {
|
||||||
|
if (!fs.exists(path)) {
|
||||||
|
if (!fs.mkdirs(path)) {
|
||||||
|
throw new IOException("Failed to create folder=" + path);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,30 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.common.fs;
|
||||||
|
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
 * Abstraction over the remote directory layout used for submarine jobs
 * and models.
 */
public interface RemoteDirectoryManager {
  /** @return staging area of the job; created first when create is true. */
  Path getJobStagingArea(String jobName, boolean create) throws IOException;

  /** @return checkpoint dir of the job; created first when create is true. */
  Path getJobCheckpointDir(String jobName, boolean create) throws IOException;

  /** @return dir of the named model; created first when create is true. */
  Path getModelDir(String modelName, boolean create) throws IOException;

  /** @return the file system backing the remote directories. */
  FileSystem getFileSystem() throws IOException;
}
|
|
@ -0,0 +1,106 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.runtimes;
|
||||||
|
|
||||||
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineConfiguration;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.exception.SubmarineRuntimeException;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.common.FSBasedSubmarineStorageImpl;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.common.JobMonitor;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.common.JobSubmitter;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.common.SubmarineStorage;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.YarnServiceJobMonitor;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.YarnServiceJobSubmitter;
|
||||||
|
|
||||||
|
import java.lang.reflect.InvocationTargetException;
|
||||||
|
|
||||||
|
/**
 * Pluggable factory for the submarine runtime: produces (and caches) the
 * job submitter, job monitor and storage implementations. The concrete
 * factory class is chosen by configuration and created via reflection.
 */
public abstract class RuntimeFactory {
  protected ClientContext clientContext;
  // Lazily created singletons, guarded by synchronized accessors below.
  private JobSubmitter jobSubmitter;
  private JobMonitor jobMonitor;
  private SubmarineStorage submarineStorage;

  public RuntimeFactory(ClientContext clientContext) {
    this.clientContext = clientContext;
  }

  /**
   * Instantiates the factory class named by
   * SubmarineConfiguration.RUNTIME_CLASS (falling back to the default
   * YARN-service runtime) using its (ClientContext) constructor.
   *
   * @throws SubmarineRuntimeException when the class cannot be loaded,
   *         is not a RuntimeFactory, or cannot be instantiated
   */
  public static RuntimeFactory getRuntimeFactory(
      ClientContext clientContext) {
    Configuration submarineConfiguration =
        clientContext.getSubmarineConfig();
    String runtimeClass = submarineConfiguration.get(
        SubmarineConfiguration.RUNTIME_CLASS,
        SubmarineConfiguration.DEFAULT_RUNTIME_CLASS);

    try {
      Class<?> runtimeClazz = Class.forName(runtimeClass);
      if (RuntimeFactory.class.isAssignableFrom(runtimeClazz)) {
        return (RuntimeFactory) runtimeClazz.getConstructor(ClientContext.class).newInstance(clientContext);
      } else {
        throw new SubmarineRuntimeException("Class: " + runtimeClass
            + " not instance of " + RuntimeFactory.class.getCanonicalName());
      }
    } catch (ClassNotFoundException | IllegalAccessException |
        InstantiationException | NoSuchMethodException |
        InvocationTargetException e) {
      throw new SubmarineRuntimeException(
          "Could not instantiate RuntimeFactory: " + runtimeClass, e);
    }
  }

  // Subclasses supply the concrete runtime pieces.
  protected abstract JobSubmitter internalCreateJobSubmitter();

  protected abstract JobMonitor internalCreateJobMonitor();

  protected abstract SubmarineStorage internalCreateSubmarineStorage();

  /** @return the cached job submitter, creating it on first use. */
  public synchronized JobSubmitter getJobSubmitterInstance() {
    if (jobSubmitter == null) {
      jobSubmitter = internalCreateJobSubmitter();
    }
    return jobSubmitter;
  }

  /** @return the cached job monitor, creating it on first use. */
  public synchronized JobMonitor getJobMonitorInstance() {
    if (jobMonitor == null) {
      jobMonitor = internalCreateJobMonitor();
    }
    return jobMonitor;
  }

  /** @return the cached storage, creating it on first use. */
  public synchronized SubmarineStorage getSubmarineStorage() {
    if (submarineStorage == null) {
      submarineStorage = internalCreateSubmarineStorage();
    }
    return submarineStorage;
  }

  @VisibleForTesting
  public synchronized void setJobSubmitterInstance(JobSubmitter jobSubmitter) {
    this.jobSubmitter = jobSubmitter;
  }

  @VisibleForTesting
  public synchronized void setJobMonitorInstance(JobMonitor jobMonitor) {
    this.jobMonitor = jobMonitor;
  }

  @VisibleForTesting
  public synchronized void setSubmarineStorage(SubmarineStorage storage) {
    this.submarineStorage = storage;
  }
}
|
|
@ -0,0 +1,106 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
* <p>
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* <p>
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.runtimes.common;
|
||||||
|
|
||||||
|
import org.apache.hadoop.fs.FSDataInputStream;
|
||||||
|
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.ObjectInput;
|
||||||
|
import java.io.ObjectInputStream;
|
||||||
|
import java.io.ObjectOutput;
|
||||||
|
import java.io.ObjectOutputStream;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A super naive FS-based storage.
|
||||||
|
*/
|
||||||
|
public class FSBasedSubmarineStorageImpl extends SubmarineStorage {
|
||||||
|
ClientContext clientContext;
|
||||||
|
RemoteDirectoryManager rdm;
|
||||||
|
|
||||||
|
/**
 * @param clientContext context supplying the remote directory manager
 *                      that locates job/model info files
 */
public FSBasedSubmarineStorageImpl(ClientContext clientContext) {
  this.clientContext = clientContext;
  rdm = clientContext.getRemoteDirectoryManager();
}
|
||||||
|
|
||||||
|
@Override
public void addNewJob(String jobName, Map<String, String> jobInfo)
    throws IOException {
  // Serializes the job info map into the job's info file on the remote FS.
  // The output stream is closed by serializeMap on the success path.
  Path jobInfoPath = getJobInfoPath(jobName, true);
  FSDataOutputStream fos = rdm.getFileSystem().create(jobInfoPath);
  serializeMap(fos, jobInfo);
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Map<String, String> getJobInfoByName(String jobName)
|
||||||
|
throws IOException {
|
||||||
|
Path jobInfoPath = getJobInfoPath(jobName, false);
|
||||||
|
FSDataInputStream fis = rdm.getFileSystem().open(jobInfoPath);
|
||||||
|
return deserializeMap(fis);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void addNewModel(String modelName, String version,
|
||||||
|
Map<String, String> modelInfo) throws IOException {
|
||||||
|
Path modelInfoPath = getModelInfoPath(modelName, version, true);
|
||||||
|
FSDataOutputStream fos = rdm.getFileSystem().create(modelInfoPath);
|
||||||
|
serializeMap(fos, modelInfo);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Map<String, String> getModelInfoByName(String modelName,
|
||||||
|
String version) throws IOException {
|
||||||
|
Path modelInfoPath = getModelInfoPath(modelName, version, false);
|
||||||
|
FSDataInputStream fis = rdm.getFileSystem().open(modelInfoPath);
|
||||||
|
return deserializeMap(fis);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Path getModelInfoPath(String modelName, String version, boolean create)
|
||||||
|
throws IOException {
|
||||||
|
Path modelDir = rdm.getModelDir(modelName, create);
|
||||||
|
Path modelInfo = new Path(modelDir, version + ".info");
|
||||||
|
return modelInfo;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void serializeMap(FSDataOutputStream fos, Map<String, String> map)
|
||||||
|
throws IOException {
|
||||||
|
ObjectOutput oo = new ObjectOutputStream(fos);
|
||||||
|
oo.writeObject(map);
|
||||||
|
oo.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map<String, String> deserializeMap(FSDataInputStream fis)
|
||||||
|
throws IOException {
|
||||||
|
ObjectInput oi = new ObjectInputStream(fis);
|
||||||
|
Map<String, String> newMap = null;
|
||||||
|
try {
|
||||||
|
newMap = (Map<String, String>) oi.readObject();
|
||||||
|
} catch (ClassNotFoundException e) {
|
||||||
|
throw new IOException(e);
|
||||||
|
}
|
||||||
|
return newMap;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Path getJobInfoPath(String jobName, boolean create) throws IOException {
|
||||||
|
Path path = rdm.getJobStagingArea(jobName, create);
|
||||||
|
Path fileName = new Path(path, "job.info");
|
||||||
|
return fileName;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,84 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.runtimes.common;
|
||||||
|
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.api.JobState;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.api.JobStatus;
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.exception.SubmarineException;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Monitor status of job(s)
|
||||||
|
*/
|
||||||
|
public abstract class JobMonitor {
|
||||||
|
private static final Logger LOG =
|
||||||
|
LoggerFactory.getLogger(JobMonitor.class);
|
||||||
|
protected ClientContext clientContext;
|
||||||
|
|
||||||
|
public JobMonitor(ClientContext clientContext) {
|
||||||
|
this.clientContext = clientContext;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns status of training job.
|
||||||
|
*
|
||||||
|
* @param jobName name of job
|
||||||
|
* @return job status
|
||||||
|
* @throws IOException anything else happens
|
||||||
|
* @throws YarnException anything related to YARN happens
|
||||||
|
*/
|
||||||
|
public abstract JobStatus getTrainingJobStatus(String jobName)
|
||||||
|
throws IOException, YarnException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Continue wait and print status if job goes to ready or final state.
|
||||||
|
* @param jobName
|
||||||
|
* @throws IOException
|
||||||
|
* @throws YarnException
|
||||||
|
* @throws SubmarineException
|
||||||
|
*/
|
||||||
|
public void waitTrainingFinal(String jobName)
|
||||||
|
throws IOException, YarnException, SubmarineException {
|
||||||
|
// Wait 5 sec between each fetch.
|
||||||
|
int waitIntervalSec = 5;
|
||||||
|
JobStatus js;
|
||||||
|
while (true) {
|
||||||
|
js = getTrainingJobStatus(jobName);
|
||||||
|
JobState jobState = js.getState();
|
||||||
|
js.nicePrint(System.err);
|
||||||
|
|
||||||
|
if (JobState.isFinal(jobState)) {
|
||||||
|
if (jobState.equals(JobState.FAILED)) {
|
||||||
|
throw new SubmarineException("Job failed");
|
||||||
|
} else if (jobState.equals(JobState.KILLED)) {
|
||||||
|
throw new SubmarineException("Job killed");
|
||||||
|
}
|
||||||
|
LOG.info("Job exited with state=" + jobState);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
Thread.sleep(waitIntervalSec * 1000);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
throw new IOException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,36 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.runtimes.common;
|
||||||
|
|
||||||
|
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
 * Submit job to cluster master.
 */
public interface JobSubmitter {
  /**
   * Submit job to cluster.
   * @param parameters run job parameters
   * @return applicationId when successfully submitted
   * @throws YarnException for issues while contacting YARN daemons
   * @throws IOException for other issues.
   */
  ApplicationId submitJob(RunJobParameters parameters)
      throws IOException, YarnException;
}
|
|
@ -0,0 +1,24 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
* <p>
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* <p>
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.runtimes.common;
|
||||||
|
|
||||||
|
/**
 * Well-known key names for the job/model info maps persisted through
 * {@code SubmarineStorage} implementations.
 */
public class StorageKeyConstants {
  public static final String JOB_NAME = "JOB_NAME";
  public static final String JOB_RUN_ARGS = "JOB_RUN_ARGS";
  public static final String APPLICATION_ID = "APPLICATION_ID";
  public static final String CHECKPOINT_PATH = "CHECKPOINT_PATH";
  public static final String INPUT_PATH = "INPUT_PATH";
  public static final String SAVED_MODEL_PATH = "SAVED_MODEL_PATH";
}
|
|
@ -0,0 +1,57 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
* <p>
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* <p>
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.runtimes.common;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
 * Persistent job/model, etc.
 */
public abstract class SubmarineStorage {
  /**
   * Add a new job by name.
   * @param jobName name of job.
   * @param jobInfo info of the job.
   * @throws IOException when the info cannot be persisted.
   */
  public abstract void addNewJob(String jobName, Map<String, String> jobInfo)
      throws IOException;

  /**
   * Get job info by job name.
   * @param jobName name of job
   * @return info of the job.
   * @throws IOException when the info cannot be read.
   */
  public abstract Map<String, String> getJobInfoByName(String jobName)
      throws IOException;

  /**
   * Add a new model.
   * @param modelName name of model
   * @param version version of the model, when null is specified, it will be
   *                "default"
   * @param modelInfo info of the model.
   * @throws IOException when the info cannot be persisted.
   */
  public abstract void addNewModel(String modelName, String version,
      Map<String, String> modelInfo) throws IOException;

  /**
   * Get model info by name and version.
   * @param modelName name of model.
   * @param version version of the model, when null is specified, it will be
   *                "default"
   * @return info of the model.
   * @throws IOException when the info cannot be read.
   */
  public abstract Map<String, String> getModelInfoByName(String modelName, String version)
      throws IOException;
}
|
|
@ -0,0 +1,46 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;
|
||||||
|
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
import org.apache.hadoop.yarn.service.api.records.Service;
|
||||||
|
import org.apache.hadoop.yarn.service.client.ServiceClient;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.api.JobStatus;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.api.builder.JobStatusBuilder;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.common.JobMonitor;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
public class YarnServiceJobMonitor extends JobMonitor {
|
||||||
|
private ServiceClient serviceClient = null;
|
||||||
|
|
||||||
|
public YarnServiceJobMonitor(ClientContext clientContext) {
|
||||||
|
super(clientContext);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public synchronized JobStatus getTrainingJobStatus(String jobName)
|
||||||
|
throws IOException, YarnException {
|
||||||
|
if (this.serviceClient == null) {
|
||||||
|
this.serviceClient = YarnServiceUtils.createServiceClient(
|
||||||
|
clientContext.getYarnConfig());
|
||||||
|
}
|
||||||
|
|
||||||
|
Service serviceSpec = this.serviceClient.getStatus(jobName);
|
||||||
|
JobStatus jobStatus = JobStatusBuilder.fromServiceSpec(serviceSpec);
|
||||||
|
return jobStatus;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,458 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;
|
||||||
|
|
||||||
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
|
import org.apache.hadoop.fs.FileStatus;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
import org.apache.hadoop.yarn.service.api.ServiceApiConstants;
|
||||||
|
import org.apache.hadoop.yarn.service.api.records.Artifact;
|
||||||
|
import org.apache.hadoop.yarn.service.api.records.Component;
|
||||||
|
import org.apache.hadoop.yarn.service.api.records.ConfigFile;
|
||||||
|
import org.apache.hadoop.yarn.service.api.records.Resource;
|
||||||
|
import org.apache.hadoop.yarn.service.api.records.ResourceInformation;
|
||||||
|
import org.apache.hadoop.yarn.service.api.records.Service;
|
||||||
|
import org.apache.hadoop.yarn.service.client.ServiceClient;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.Envs;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.common.JobSubmitter;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.FileWriter;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.StringTokenizer;
|
||||||
|
|
||||||
|
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Submit a job to cluster
|
||||||
|
*/
|
||||||
|
public class YarnServiceJobSubmitter implements JobSubmitter {
|
||||||
|
private static final Logger LOG =
|
||||||
|
LoggerFactory.getLogger(YarnServiceJobSubmitter.class);
|
||||||
|
ClientContext clientContext;
|
||||||
|
Service serviceSpec;
|
||||||
|
private Set<Path> uploadedFiles = new HashSet<>();
|
||||||
|
|
||||||
|
public YarnServiceJobSubmitter(ClientContext clientContext) {
|
||||||
|
this.clientContext = clientContext;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Resource getServiceResourceFromYarnResource(
|
||||||
|
org.apache.hadoop.yarn.api.records.Resource yarnResource) {
|
||||||
|
Resource serviceResource = new Resource();
|
||||||
|
serviceResource.setCpus(yarnResource.getVirtualCores());
|
||||||
|
serviceResource.setMemory(String.valueOf(yarnResource.getMemorySize()));
|
||||||
|
|
||||||
|
Map<String, ResourceInformation> riMap = new HashMap<>();
|
||||||
|
for (org.apache.hadoop.yarn.api.records.ResourceInformation ri : yarnResource
|
||||||
|
.getAllResourcesListCopy()) {
|
||||||
|
ResourceInformation serviceRi =
|
||||||
|
new ResourceInformation();
|
||||||
|
serviceRi.setValue(ri.getValue());
|
||||||
|
serviceRi.setUnit(ri.getUnits());
|
||||||
|
riMap.put(ri.getName(), serviceRi);
|
||||||
|
}
|
||||||
|
serviceResource.setResourceInformations(riMap);
|
||||||
|
|
||||||
|
return serviceResource;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getValueOfEnvionment(String envar) {
|
||||||
|
// extract value from "key=value" form
|
||||||
|
if (envar == null || !envar.contains("=")) {
|
||||||
|
return "";
|
||||||
|
} else {
|
||||||
|
return envar.substring(envar.indexOf("=") + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void addHdfsClassPathIfNeeded(RunJobParameters parameters,
|
||||||
|
FileWriter fw, Component comp) throws IOException {
|
||||||
|
// Find envs to use HDFS
|
||||||
|
String hdfsHome = null;
|
||||||
|
String javaHome = null;
|
||||||
|
|
||||||
|
boolean hadoopEnv = false;
|
||||||
|
|
||||||
|
for (String envar : parameters.getEnvars()) {
|
||||||
|
if (envar.startsWith("DOCKER_HADOOP_HDFS_HOME=")) {
|
||||||
|
hdfsHome = getValueOfEnvionment(envar);
|
||||||
|
hadoopEnv = true;
|
||||||
|
} else if (envar.startsWith("DOCKER_JAVA_HOME=")) {
|
||||||
|
javaHome = getValueOfEnvionment(envar);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean lackingEnvs = false;
|
||||||
|
|
||||||
|
if ((parameters.getInputPath() != null && parameters.getInputPath()
|
||||||
|
.contains("hdfs://")) || (parameters.getCheckpointPath() != null
|
||||||
|
&& parameters.getCheckpointPath().contains("hdfs://")) || (
|
||||||
|
parameters.getSavedModelPath() != null && parameters.getSavedModelPath()
|
||||||
|
.contains("hdfs://")) || hadoopEnv) {
|
||||||
|
// HDFS is asked either in input or output, set LD_LIBRARY_PATH
|
||||||
|
// and classpath
|
||||||
|
|
||||||
|
if (hdfsHome != null) {
|
||||||
|
// Unset HADOOP_HOME/HADOOP_YARN_HOME to make sure host machine's envs
|
||||||
|
// won't pollute docker's env.
|
||||||
|
fw.append("export HADOOP_HOME=\n");
|
||||||
|
fw.append("export HADOOP_YARN_HOME=\n");
|
||||||
|
fw.append("export HADOOP_HDFS_HOME=" + hdfsHome + "\n");
|
||||||
|
} else{
|
||||||
|
lackingEnvs = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// hadoop confs will be uploaded to HDFS and localized to container's
|
||||||
|
// local folder, so here set $HADOOP_CONF_DIR to $WORK_DIR.
|
||||||
|
fw.append("export HADOOP_CONF_DIR=$WORK_DIR\n");
|
||||||
|
if (javaHome != null) {
|
||||||
|
fw.append("export JAVA_HOME=" + javaHome + "\n");
|
||||||
|
fw.append("export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:"
|
||||||
|
+ "$JAVA_HOME/lib/amd64/server\n");
|
||||||
|
} else {
|
||||||
|
lackingEnvs = true;
|
||||||
|
}
|
||||||
|
fw.append("export CLASSPATH=`$HADOOP_HDFS_HOME/bin/hadoop classpath --glob`\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (lackingEnvs) {
|
||||||
|
LOG.error("When hdfs is being used to read/write models/data. Following"
|
||||||
|
+ "envs are required: 1) DOCKER_HADOOP_HDFS_HOME=<HDFS_HOME inside"
|
||||||
|
+ "docker container> 2) DOCKER_JAVA_HOME=<JAVA_HOME inside docker"
|
||||||
|
+ "container>. You can use --env to pass these envars.");
|
||||||
|
throw new IOException("Failed to detect HDFS-related environments.");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Trying to upload core-site.xml and hdfs-site.xml
|
||||||
|
Path stagingDir =
|
||||||
|
clientContext.getRemoteDirectoryManager().getJobStagingArea(
|
||||||
|
parameters.getName(), true);
|
||||||
|
File coreSite = findFileOnClassPath("core-site.xml");
|
||||||
|
File hdfsSite = findFileOnClassPath("hdfs-site.xml");
|
||||||
|
if (coreSite == null || hdfsSite == null) {
|
||||||
|
LOG.error("hdfs is being used, however we couldn't locate core-site.xml/"
|
||||||
|
+ "hdfs-site.xml from classpath, please double check you classpath"
|
||||||
|
+ "setting and make sure they're included.");
|
||||||
|
throw new IOException(
|
||||||
|
"Failed to locate core-site.xml / hdfs-site.xml from class path");
|
||||||
|
}
|
||||||
|
uploadToRemoteFileAndLocalizeToContainerWorkDir(stagingDir,
|
||||||
|
coreSite.getAbsolutePath(), "core-site.xml", comp);
|
||||||
|
uploadToRemoteFileAndLocalizeToContainerWorkDir(stagingDir,
|
||||||
|
hdfsSite.getAbsolutePath(), "hdfs-site.xml", comp);
|
||||||
|
|
||||||
|
// DEBUG
|
||||||
|
if (SubmarineLogs.isVerbose()) {
|
||||||
|
fw.append("echo $CLASSPATH\n");
|
||||||
|
fw.append("echo $JAVA_HOME\n");
|
||||||
|
fw.append("echo $LD_LIBRARY_PATH\n");
|
||||||
|
fw.append("echo $HADOOP_HDFS_HOME\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void addCommonEnvironments(Component component, TaskType taskType) {
|
||||||
|
Map<String, String> envs = component.getConfiguration().getEnv();
|
||||||
|
envs.put(Envs.TASK_INDEX_ENV, ServiceApiConstants.COMPONENT_ID);
|
||||||
|
envs.put(Envs.TASK_TYPE_ENV, taskType.name());
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Generate a command launch script on local disk, returns patch to the script
|
||||||
|
*/
|
||||||
|
private String generateCommandLaunchScript(RunJobParameters parameters,
|
||||||
|
TaskType taskType, Component comp) throws IOException {
|
||||||
|
File file = File.createTempFile(taskType.name() + "-launch-script", ".sh");
|
||||||
|
FileWriter fw = new FileWriter(file);
|
||||||
|
|
||||||
|
fw.append("#!/bin/bash\n");
|
||||||
|
|
||||||
|
addHdfsClassPathIfNeeded(parameters, fw, comp);
|
||||||
|
|
||||||
|
// For primary_worker
|
||||||
|
if (taskType == TaskType.PRIMARY_WORKER) {
|
||||||
|
// Do we need tensorboard?
|
||||||
|
if (parameters.isTensorboardEnabled()) {
|
||||||
|
int tensorboardPort = 6006;
|
||||||
|
// Run tensorboard at the background
|
||||||
|
fw.append(
|
||||||
|
"tensorboard --port " + tensorboardPort + " --logdir " + parameters
|
||||||
|
.getCheckpointPath() + " &\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// When distributed training is required
|
||||||
|
if (parameters.isDistributed()) {
|
||||||
|
// Generated TF_CONFIG
|
||||||
|
String tfConfigEnv = YarnServiceUtils.getTFConfigEnv(
|
||||||
|
taskType.getComponentName(), parameters.getNumWorkers(),
|
||||||
|
parameters.getNumPS(), parameters.getName(),
|
||||||
|
System.getProperty("user.name"),
|
||||||
|
clientContext.getYarnConfig().get("hadoop.registry.dns.domain-name"));
|
||||||
|
fw.append("export TF_CONFIG=\"" + tfConfigEnv + "\"\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Print launch command
|
||||||
|
if (taskType.equals(TaskType.WORKER) || taskType.equals(
|
||||||
|
TaskType.PRIMARY_WORKER)) {
|
||||||
|
fw.append(parameters.getWorkerLaunchCmd() + '\n');
|
||||||
|
|
||||||
|
if (SubmarineLogs.isVerbose()) {
|
||||||
|
LOG.info("Worker command =[" + parameters.getWorkerLaunchCmd() + "]");
|
||||||
|
}
|
||||||
|
} else if (taskType.equals(TaskType.PS)) {
|
||||||
|
fw.append(parameters.getPSLaunchCmd() + '\n');
|
||||||
|
|
||||||
|
if (SubmarineLogs.isVerbose()) {
|
||||||
|
LOG.info("PS command =[" + parameters.getPSLaunchCmd() + "]");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fw.close();
|
||||||
|
return file.getAbsolutePath();
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getScriptFileName(TaskType taskType) {
|
||||||
|
return "run-" + taskType.name() + ".sh";
|
||||||
|
}
|
||||||
|
|
||||||
|
private File findFileOnClassPath(final String fileName) {
|
||||||
|
final String classpath = System.getProperty("java.class.path");
|
||||||
|
final String pathSeparator = System.getProperty("path.separator");
|
||||||
|
final StringTokenizer tokenizer = new StringTokenizer(classpath,
|
||||||
|
pathSeparator);
|
||||||
|
|
||||||
|
while (tokenizer.hasMoreTokens()) {
|
||||||
|
final String pathElement = tokenizer.nextToken();
|
||||||
|
final File directoryOrJar = new File(pathElement);
|
||||||
|
final File absoluteDirectoryOrJar = directoryOrJar.getAbsoluteFile();
|
||||||
|
if (absoluteDirectoryOrJar.isFile()) {
|
||||||
|
final File target = new File(absoluteDirectoryOrJar.getParent(),
|
||||||
|
fileName);
|
||||||
|
if (target.exists()) {
|
||||||
|
return target;
|
||||||
|
}
|
||||||
|
} else{
|
||||||
|
final File target = new File(directoryOrJar, fileName);
|
||||||
|
if (target.exists()) {
|
||||||
|
return target;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void uploadToRemoteFileAndLocalizeToContainerWorkDir(Path stagingDir,
|
||||||
|
String fileToUpload, String destFilename, Component comp)
|
||||||
|
throws IOException {
|
||||||
|
FileSystem fs = FileSystem.get(clientContext.getYarnConfig());
|
||||||
|
|
||||||
|
// Upload to remote FS under staging area
|
||||||
|
File localFile = new File(fileToUpload);
|
||||||
|
if (!localFile.exists()) {
|
||||||
|
throw new FileNotFoundException(
|
||||||
|
"Trying to upload file=" + localFile.getAbsolutePath()
|
||||||
|
+ " to remote, but couldn't find local file.");
|
||||||
|
}
|
||||||
|
String filename = new File(fileToUpload).getName();
|
||||||
|
|
||||||
|
Path uploadedFilePath = new Path(stagingDir, filename);
|
||||||
|
if (!uploadedFiles.contains(uploadedFilePath)) {
|
||||||
|
if (SubmarineLogs.isVerbose()) {
|
||||||
|
LOG.info("Copying local file=" + fileToUpload + " to remote="
|
||||||
|
+ uploadedFilePath);
|
||||||
|
}
|
||||||
|
fs.copyFromLocalFile(new Path(fileToUpload), uploadedFilePath);
|
||||||
|
uploadedFiles.add(uploadedFilePath);
|
||||||
|
}
|
||||||
|
|
||||||
|
FileStatus fileStatus = fs.getFileStatus(uploadedFilePath);
|
||||||
|
LOG.info("Uploaded file path = " + fileStatus.getPath());
|
||||||
|
|
||||||
|
// Set it to component's files list
|
||||||
|
comp.getConfiguration().getFiles().add(new ConfigFile().srcFile(
|
||||||
|
fileStatus.getPath().toUri().toString()).destFile(destFilename)
|
||||||
|
.type(ConfigFile.TypeEnum.STATIC));
|
||||||
|
}
|
||||||
|
|
||||||
|
private void handleLaunchCommand(RunJobParameters parameters,
|
||||||
|
TaskType taskType, Component component) throws IOException {
|
||||||
|
// Get staging area directory
|
||||||
|
Path stagingDir =
|
||||||
|
clientContext.getRemoteDirectoryManager().getJobStagingArea(
|
||||||
|
parameters.getName(), true);
|
||||||
|
|
||||||
|
// Generate script file in the local disk
|
||||||
|
String localScriptFile = generateCommandLaunchScript(parameters, taskType,
|
||||||
|
component);
|
||||||
|
String destScriptFileName = getScriptFileName(taskType);
|
||||||
|
uploadToRemoteFileAndLocalizeToContainerWorkDir(stagingDir, localScriptFile,
|
||||||
|
destScriptFileName, component);
|
||||||
|
|
||||||
|
component.setLaunchCommand("./" + destScriptFileName);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void addWorkerComponent(Service service,
|
||||||
|
RunJobParameters parameters, TaskType taskType) throws IOException {
|
||||||
|
Component workerComponent = new Component();
|
||||||
|
addCommonEnvironments(workerComponent, taskType);
|
||||||
|
|
||||||
|
workerComponent.setName(taskType.getComponentName());
|
||||||
|
|
||||||
|
if (taskType.equals(TaskType.PRIMARY_WORKER)) {
|
||||||
|
workerComponent.setNumberOfContainers(1L);
|
||||||
|
} else{
|
||||||
|
workerComponent.setNumberOfContainers(
|
||||||
|
(long) parameters.getNumWorkers() - 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (parameters.getWorkerDockerImage() != null) {
|
||||||
|
workerComponent.setArtifact(
|
||||||
|
getDockerArtifact(parameters.getWorkerDockerImage()));
|
||||||
|
}
|
||||||
|
|
||||||
|
workerComponent.setResource(
|
||||||
|
getServiceResourceFromYarnResource(parameters.getWorkerResource()));
|
||||||
|
handleLaunchCommand(parameters, taskType, workerComponent);
|
||||||
|
workerComponent.setRestartPolicy(Component.RestartPolicyEnum.NEVER);
|
||||||
|
service.addComponent(workerComponent);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle worker and primary_worker.
|
||||||
|
private void addWorkerComponents(Service service, RunJobParameters parameters)
|
||||||
|
throws IOException {
|
||||||
|
addWorkerComponent(service, parameters, TaskType.PRIMARY_WORKER);
|
||||||
|
|
||||||
|
if (parameters.getNumWorkers() > 1) {
|
||||||
|
addWorkerComponent(service, parameters, TaskType.WORKER);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void appendToEnv(Service service, String key, String value,
|
||||||
|
String delim) {
|
||||||
|
Map<String, String> env = service.getConfiguration().getEnv();
|
||||||
|
if (!env.containsKey(key)) {
|
||||||
|
env.put(key, value);
|
||||||
|
} else {
|
||||||
|
if (!value.isEmpty()) {
|
||||||
|
String existingValue = env.get(key);
|
||||||
|
if (!existingValue.endsWith(delim)) {
|
||||||
|
env.put(key, existingValue + delim + value);
|
||||||
|
} else {
|
||||||
|
env.put(key, existingValue + value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void handleServiceEnvs(Service service, RunJobParameters parameters) {
|
||||||
|
if (parameters.getEnvars() != null) {
|
||||||
|
for (String envarPair : parameters.getEnvars()) {
|
||||||
|
String key, value;
|
||||||
|
if (envarPair.contains("=")) {
|
||||||
|
int idx = envarPair.indexOf('=');
|
||||||
|
key = envarPair.substring(0, idx);
|
||||||
|
value = envarPair.substring(idx + 1);
|
||||||
|
} else{
|
||||||
|
// No "=" found so use the whole key
|
||||||
|
key = envarPair;
|
||||||
|
value = "";
|
||||||
|
}
|
||||||
|
appendToEnv(service, key, value, ":");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Append other configs like /etc/passwd, /etc/krb5.conf
|
||||||
|
appendToEnv(service, "YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS",
|
||||||
|
"/etc/passwd:/etc/passwd:ro", ",");
|
||||||
|
|
||||||
|
String authenication = clientContext.getYarnConfig().get(
|
||||||
|
HADOOP_SECURITY_AUTHENTICATION);
|
||||||
|
if (authenication != null && authenication.equals("kerberos")) {
|
||||||
|
appendToEnv(service, "YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS",
|
||||||
|
"/etc/krb5.conf:/etc/krb5.conf:ro", ",");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private Artifact getDockerArtifact(String dockerImageName) {
|
||||||
|
return new Artifact().type(Artifact.TypeEnum.DOCKER).id(dockerImageName);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Service createServiceByParameters(RunJobParameters parameters)
|
||||||
|
throws IOException {
|
||||||
|
Service service = new Service();
|
||||||
|
service.setName(parameters.getName());
|
||||||
|
service.setVersion(String.valueOf(System.currentTimeMillis()));
|
||||||
|
service.setArtifact(getDockerArtifact(parameters.getDockerImageName()));
|
||||||
|
|
||||||
|
handleServiceEnvs(service, parameters);
|
||||||
|
|
||||||
|
addWorkerComponents(service, parameters);
|
||||||
|
|
||||||
|
if (parameters.getNumPS() > 0) {
|
||||||
|
Component psComponent = new Component();
|
||||||
|
psComponent.setName(TaskType.PS.getComponentName());
|
||||||
|
addCommonEnvironments(psComponent, TaskType.PS);
|
||||||
|
psComponent.setNumberOfContainers((long) parameters.getNumPS());
|
||||||
|
psComponent.setRestartPolicy(Component.RestartPolicyEnum.NEVER);
|
||||||
|
psComponent.setResource(
|
||||||
|
getServiceResourceFromYarnResource(parameters.getPsResource()));
|
||||||
|
|
||||||
|
// Override global docker image if needed.
|
||||||
|
if (parameters.getPsDockerImage() != null) {
|
||||||
|
psComponent.setArtifact(
|
||||||
|
getDockerArtifact(parameters.getPsDockerImage()));
|
||||||
|
}
|
||||||
|
handleLaunchCommand(parameters, TaskType.PS, psComponent);
|
||||||
|
service.addComponent(psComponent);
|
||||||
|
}
|
||||||
|
return service;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@inheritDoc}
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public ApplicationId submitJob(RunJobParameters parameters)
|
||||||
|
throws IOException, YarnException {
|
||||||
|
Service service = createServiceByParameters(parameters);
|
||||||
|
ServiceClient serviceClient = YarnServiceUtils.createServiceClient(
|
||||||
|
clientContext.getYarnConfig());
|
||||||
|
ApplicationId appid = serviceClient.actionCreate(service);
|
||||||
|
serviceClient.stop();
|
||||||
|
this.serviceSpec = service;
|
||||||
|
return appid;
|
||||||
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
public Service getServiceSpec() {
|
||||||
|
return serviceSpec;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,44 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;
|
||||||
|
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.ClientContext;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.RuntimeFactory;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.common.FSBasedSubmarineStorageImpl;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.common.JobMonitor;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.common.JobSubmitter;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.common.SubmarineStorage;
|
||||||
|
|
||||||
|
public class YarnServiceRuntimeFactory extends RuntimeFactory {
|
||||||
|
|
||||||
|
public YarnServiceRuntimeFactory(ClientContext clientContext) {
|
||||||
|
super(clientContext);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected JobSubmitter internalCreateJobSubmitter() {
|
||||||
|
return new YarnServiceJobSubmitter(super.clientContext);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected JobMonitor internalCreateJobMonitor() {
|
||||||
|
return new YarnServiceJobMonitor(super.clientContext);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected SubmarineStorage internalCreateSubmarineStorage() {
|
||||||
|
return new FSBasedSubmarineStorageImpl(super.clientContext);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,78 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;
|
||||||
|
|
||||||
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.yarn.service.client.ServiceClient;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.Envs;
|
||||||
|
|
||||||
|
public class YarnServiceUtils {
|
||||||
|
// This will be true only in UT.
|
||||||
|
private static ServiceClient stubServiceClient = null;
|
||||||
|
|
||||||
|
public static ServiceClient createServiceClient(
|
||||||
|
Configuration yarnConfiguration) {
|
||||||
|
if (stubServiceClient != null) {
|
||||||
|
return stubServiceClient;
|
||||||
|
}
|
||||||
|
|
||||||
|
ServiceClient serviceClient = new ServiceClient();
|
||||||
|
serviceClient.init(yarnConfiguration);
|
||||||
|
serviceClient.start();
|
||||||
|
return serviceClient;
|
||||||
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
public static void setStubServiceClient(ServiceClient stubServiceClient) {
|
||||||
|
YarnServiceUtils.stubServiceClient = stubServiceClient;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String getTFConfigEnv(String curCommponentName, int nWorkers,
|
||||||
|
int nPs, String serviceName, String userName, String domain) {
|
||||||
|
String commonEndpointSuffix =
|
||||||
|
"." + serviceName + "." + userName + "." + domain + ":8000";
|
||||||
|
|
||||||
|
String json = "{\\\"cluster\\\":{";
|
||||||
|
|
||||||
|
String master = getComponentArrayJson("master", 1, commonEndpointSuffix)
|
||||||
|
+ ",";
|
||||||
|
String worker = getComponentArrayJson("worker", nWorkers - 1,
|
||||||
|
commonEndpointSuffix) + ",";
|
||||||
|
String ps = getComponentArrayJson("ps", nPs, commonEndpointSuffix) + "},";
|
||||||
|
|
||||||
|
String task =
|
||||||
|
"\\\"task\\\":{" + " \\\"type\\\":\\\"" + curCommponentName + "\\\","
|
||||||
|
+ " \\\"index\\\":" + '$' + Envs.TASK_INDEX_ENV + "},";
|
||||||
|
String environment = "\\\"environment\\\":\\\"cloud\\\"}";
|
||||||
|
|
||||||
|
return json + master + worker + ps + task + environment;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String getComponentArrayJson(String componentName, int count,
|
||||||
|
String endpointSuffix) {
|
||||||
|
String component = "\\\"" + componentName + "\\\":";
|
||||||
|
String array = "[";
|
||||||
|
for (int i = 0; i < count; i++) {
|
||||||
|
array = array + "\\\"" + componentName + "-" + i
|
||||||
|
+ endpointSuffix + "\\\"";
|
||||||
|
if (i != count - 1) {
|
||||||
|
array = array + ",";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
array = array + "]";
|
||||||
|
return component + array;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,26 @@
|
||||||
|
<!---
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License. See accompanying LICENSE file.
|
||||||
|
-->
|
||||||
|
|
||||||
|
# Developper Guide
|
||||||
|
|
||||||
|
(Need add more details)
|
||||||
|
|
||||||
|
By default, submarine uses YARN service framework as runtime. If you want to add your own implementation. You can add a new `RuntimeFactory` implementation and configure following option to `submarine.xml` (which should be placed under same `$HADOOP_CONF_DIR`)
|
||||||
|
|
||||||
|
```
|
||||||
|
<property>
|
||||||
|
<name>submarine.runtime.class</name>
|
||||||
|
<value>... full qualified class name for your runtime factory ... </value>
|
||||||
|
</property>
|
||||||
|
```
|
|
@ -0,0 +1,134 @@
|
||||||
|
<!---
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License. See accompanying LICENSE file.
|
||||||
|
-->
|
||||||
|
|
||||||
|
# Quick Start Guide
|
||||||
|
|
||||||
|
## Prerequisite
|
||||||
|
|
||||||
|
Must:
|
||||||
|
- Apache Hadoop 3.1.0, YARN service enabled.
|
||||||
|
|
||||||
|
Optional:
|
||||||
|
- Enable YARN DNS. (When distributed training required.)
|
||||||
|
- Enable GPU on YARN support. (When GPU-based training required.)
|
||||||
|
|
||||||
|
## Run jobs
|
||||||
|
|
||||||
|
### Commandline options
|
||||||
|
|
||||||
|
```$xslt
|
||||||
|
usage: job run
|
||||||
|
-checkpoint_path <arg> Training output directory of the job, could
|
||||||
|
be local or other FS directory. This
|
||||||
|
typically includes checkpoint files and
|
||||||
|
exported model
|
||||||
|
-docker_image <arg> Docker image name/tag
|
||||||
|
-env <arg> Common environment variable of worker/ps
|
||||||
|
-input_path <arg> Input of the job, could be local or other FS
|
||||||
|
directory
|
||||||
|
-name <arg> Name of the job
|
||||||
|
-num_ps <arg> Number of PS tasks of the job, by default
|
||||||
|
it's 0
|
||||||
|
-num_workers <arg> Numnber of worker tasks of the job, by
|
||||||
|
default it's 1
|
||||||
|
-ps_docker_image <arg> Specify docker image for PS, when this is
|
||||||
|
not specified, PS uses --docker_image as
|
||||||
|
default.
|
||||||
|
-ps_launch_cmd <arg> Commandline of worker, arguments will be
|
||||||
|
directly used to launch the PS
|
||||||
|
-ps_resources <arg> Resource of each PS, for example
|
||||||
|
memory-mb=2048,vcores=2,yarn.io/gpu=2
|
||||||
|
-queue <arg> Name of queue to run the job, by default it
|
||||||
|
uses default queue
|
||||||
|
-saved_model_path <arg> Model exported path (savedmodel) of the job,
|
||||||
|
which is needed when exported model is not
|
||||||
|
placed under ${checkpoint_path}could be
|
||||||
|
local or other FS directory. This will be
|
||||||
|
used to serve.
|
||||||
|
-tensorboard <arg> Should we run TensorBoard for this job? By
|
||||||
|
default it's true
|
||||||
|
-verbose Print verbose log for troubleshooting
|
||||||
|
-wait_job_finish Specified when user want to wait the job
|
||||||
|
finish
|
||||||
|
-worker_docker_image <arg> Specify docker image for WORKER, when this
|
||||||
|
is not specified, WORKER uses --docker_image
|
||||||
|
as default.
|
||||||
|
-worker_launch_cmd <arg> Commandline of worker, arguments will be
|
||||||
|
directly used to launch the worker
|
||||||
|
-worker_resources <arg> Resource of each worker, for example
|
||||||
|
memory-mb=2048,vcores=2,yarn.io/gpu=2
|
||||||
|
```
|
||||||
|
|
||||||
|
### Launch Standalone Tensorflow Application:
|
||||||
|
|
||||||
|
#### Commandline
|
||||||
|
```
|
||||||
|
yarn jar path-to/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar job run \
|
||||||
|
--env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \
|
||||||
|
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 --name tf-job-001 \
|
||||||
|
--docker_image <your-docker-image> \
|
||||||
|
--input_path hdfs://default/dataset/cifar-10-data \
|
||||||
|
--checkpoint_path hdfs://default/tmp/cifar-10-jobdir \
|
||||||
|
--worker_resources memory=4G,vcores=2,gpu=2 \
|
||||||
|
--worker_launch_cmd "python ... (Your training application cmd)"
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Notes:
|
||||||
|
|
||||||
|
1) `DOCKER_JAVA_HOME` points to JAVA_HOME inside Docker image.
|
||||||
|
2) `DOCKER_HADOOP_HDFS_HOME` points to HADOOP_HDFS_HOME inside Docker image.
|
||||||
|
3) `--worker_resources` can include gpu when you need GPU to train your task.
|
||||||
|
|
||||||
|
### Launch Distributed Tensorflow Application:
|
||||||
|
|
||||||
|
#### Commandline
|
||||||
|
|
||||||
|
```
|
||||||
|
yarn jar hadoop-yarn-applications-submarine-<version>.jar job run \
|
||||||
|
--name tf-job-001 --docker_image <your docker image> \
|
||||||
|
--input_path hdfs://default/dataset/cifar-10-data \
|
||||||
|
--checkpoint_path hdfs://default/tmp/cifar-10-jobdir \
|
||||||
|
--env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \
|
||||||
|
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 \
|
||||||
|
--num_workers 2 \
|
||||||
|
--worker_resources memory=8G,vcores=2,gpu=1 --worker_launch_cmd "cmd for worker ..." \
|
||||||
|
--num_ps 2 \
|
||||||
|
--ps_resources memory=4G,vcores=2,gpu=0 --ps_launch_cmd "cmd for ps" \
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Notes:
|
||||||
|
|
||||||
|
1) Very similar to standalone TF application, but you need to specify #worker/#ps
|
||||||
|
2) Different resources can be specified for worker and PS.
|
||||||
|
3) `TF_CONFIG` environment will be auto generated and set before executing user's launch command.
|
||||||
|
|
||||||
|
## Run jobs
|
||||||
|
|
||||||
|
### Get Job Status
|
||||||
|
|
||||||
|
```
|
||||||
|
yarn jar hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar job show --name tf-job-001
|
||||||
|
```
|
||||||
|
|
||||||
|
Output looks like:
|
||||||
|
```
|
||||||
|
Job Meta Info:
|
||||||
|
Application Id: application_1532131617202_0005
|
||||||
|
Input Path: hdfs://default/dataset/cifar-10-data
|
||||||
|
Checkpoint Path: hdfs://default/tmp/cifar-10-jobdir
|
||||||
|
Run Parameters: --name tf-job-001 --docker_image wtan/tf-1.8.0-gpu:0.0.3
|
||||||
|
(... all your commandline before run the job)
|
||||||
|
```
|
||||||
|
|
||||||
|
After that, you can run ```tensorboard --logdir=<checkpoint-path>``` to view Tensorboard of the job.
|
|
@ -0,0 +1,229 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* <p>
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* <p>
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli;
|
||||||
|
|
||||||
|
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||||
|
import org.apache.hadoop.yarn.api.records.Resource;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ResourceInformation;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ResourceTypeInfo;
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.RunJobParameters;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.MockClientContext;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.RuntimeFactory;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.common.JobMonitor;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.common.JobSubmitter;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.common.SubmarineStorage;
|
||||||
|
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
|
||||||
|
import org.apache.hadoop.yarn.util.resource.Resources;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Before;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import static org.mockito.Matchers.any;
|
||||||
|
import static org.mockito.Mockito.mock;
|
||||||
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
|
public class TestRunJobCliParsing {
|
||||||
|
@Before
|
||||||
|
public void before() {
|
||||||
|
SubmarineLogs.verboseOff();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testPrintHelp() {
|
||||||
|
MockClientContext mockClientContext = new MockClientContext();
|
||||||
|
JobSubmitter mockJobSubmitter = mock(JobSubmitter.class);
|
||||||
|
JobMonitor mockJobMonitor = mock(JobMonitor.class);
|
||||||
|
RunJobCli runJobCli = new RunJobCli(mockClientContext, mockJobSubmitter,
|
||||||
|
mockJobMonitor);
|
||||||
|
runJobCli.printUsages();
|
||||||
|
}
|
||||||
|
|
||||||
|
private MockClientContext getMockClientContext()
|
||||||
|
throws IOException, YarnException {
|
||||||
|
MockClientContext mockClientContext = new MockClientContext();
|
||||||
|
JobSubmitter mockJobSubmitter = mock(JobSubmitter.class);
|
||||||
|
when(mockJobSubmitter.submitJob(any(RunJobParameters.class))).thenReturn(
|
||||||
|
ApplicationId.newInstance(1234L, 1));
|
||||||
|
JobMonitor mockJobMonitor = mock(JobMonitor.class);
|
||||||
|
SubmarineStorage storage = mock(SubmarineStorage.class);
|
||||||
|
RuntimeFactory rtFactory = mock(RuntimeFactory.class);
|
||||||
|
|
||||||
|
when(rtFactory.getJobSubmitterInstance()).thenReturn(mockJobSubmitter);
|
||||||
|
when(rtFactory.getJobMonitorInstance()).thenReturn(mockJobMonitor);
|
||||||
|
when(rtFactory.getSubmarineStorage()).thenReturn(storage);
|
||||||
|
|
||||||
|
mockClientContext.setRuntimeFactory(rtFactory);
|
||||||
|
return mockClientContext;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testBasicRunJobForDistributedTraining() throws Exception {
|
||||||
|
RunJobCli runJobCli = new RunJobCli(getMockClientContext());
|
||||||
|
|
||||||
|
Assert.assertFalse(SubmarineLogs.isVerbose());
|
||||||
|
|
||||||
|
runJobCli.run(
|
||||||
|
new String[] { "--name", "my-job", "--docker_image", "tf-docker:1.1.0",
|
||||||
|
"--input_path", "hdfs://input", "--checkpoint_path", "hdfs://output",
|
||||||
|
"--num_workers", "3", "--num_ps", "2", "--worker_launch_cmd",
|
||||||
|
"python run-job.py", "--worker_resources", "memory=2048M,vcores=2",
|
||||||
|
"--ps_resources", "memory=4G,vcores=4", "--tensorboard", "true",
|
||||||
|
"--ps_launch_cmd", "python run-ps.py", "--verbose" });
|
||||||
|
|
||||||
|
RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
|
||||||
|
|
||||||
|
Assert.assertEquals(jobRunParameters.getInputPath(), "hdfs://input");
|
||||||
|
Assert.assertEquals(jobRunParameters.getCheckpointPath(), "hdfs://output");
|
||||||
|
Assert.assertEquals(jobRunParameters.getNumPS(), 2);
|
||||||
|
Assert.assertEquals(jobRunParameters.getPSLaunchCmd(), "python run-ps.py");
|
||||||
|
Assert.assertEquals(Resources.createResource(4096, 4),
|
||||||
|
jobRunParameters.getPsResource());
|
||||||
|
Assert.assertEquals(jobRunParameters.getWorkerLaunchCmd(),
|
||||||
|
"python run-job.py");
|
||||||
|
Assert.assertEquals(Resources.createResource(2048, 2),
|
||||||
|
jobRunParameters.getWorkerResource());
|
||||||
|
Assert.assertEquals(jobRunParameters.getDockerImageName(),
|
||||||
|
"tf-docker:1.1.0");
|
||||||
|
Assert.assertTrue(SubmarineLogs.isVerbose());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testBasicRunJobForSingleNodeTraining() throws Exception {
|
||||||
|
RunJobCli runJobCli = new RunJobCli(getMockClientContext());
|
||||||
|
Assert.assertFalse(SubmarineLogs.isVerbose());
|
||||||
|
|
||||||
|
runJobCli.run(
|
||||||
|
new String[] { "--name", "my-job", "--docker_image", "tf-docker:1.1.0",
|
||||||
|
"--input_path", "hdfs://input", "--checkpoint_path", "hdfs://output",
|
||||||
|
"--num_workers", "1", "--worker_launch_cmd", "python run-job.py",
|
||||||
|
"--worker_resources", "memory=4g,vcores=2", "--tensorboard",
|
||||||
|
"true", "--verbose", "--wait_job_finish" });
|
||||||
|
|
||||||
|
RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
|
||||||
|
|
||||||
|
Assert.assertEquals(jobRunParameters.getInputPath(), "hdfs://input");
|
||||||
|
Assert.assertEquals(jobRunParameters.getCheckpointPath(), "hdfs://output");
|
||||||
|
Assert.assertEquals(jobRunParameters.getNumWorkers(), 1);
|
||||||
|
Assert.assertEquals(jobRunParameters.getWorkerLaunchCmd(),
|
||||||
|
"python run-job.py");
|
||||||
|
Assert.assertEquals(Resources.createResource(4096, 2),
|
||||||
|
jobRunParameters.getWorkerResource());
|
||||||
|
Assert.assertTrue(SubmarineLogs.isVerbose());
|
||||||
|
Assert.assertTrue(jobRunParameters.isWaitJobFinish());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testLaunchCommandPatternReplace() throws Exception {
|
||||||
|
RunJobCli runJobCli = new RunJobCli(getMockClientContext());
|
||||||
|
Assert.assertFalse(SubmarineLogs.isVerbose());
|
||||||
|
|
||||||
|
runJobCli.run(
|
||||||
|
new String[] { "--name", "my-job", "--docker_image", "tf-docker:1.1.0",
|
||||||
|
"--input_path", "hdfs://input", "--checkpoint_path", "hdfs://output",
|
||||||
|
"--num_workers", "3", "--num_ps", "2", "--worker_launch_cmd",
|
||||||
|
"python run-job.py --input=%input_path% --model_dir=%checkpoint_path% --export_dir=%saved_model_path%/savedmodel",
|
||||||
|
"--worker_resources", "memory=2048,vcores=2", "--ps_resources",
|
||||||
|
"memory=4096,vcores=4", "--tensorboard", "true", "--ps_launch_cmd",
|
||||||
|
"python run-ps.py --input=%input_path% --model_dir=%checkpoint_path%/model",
|
||||||
|
"--verbose" });
|
||||||
|
|
||||||
|
Assert.assertEquals(
|
||||||
|
"python run-job.py --input=hdfs://input --model_dir=hdfs://output "
|
||||||
|
+ "--export_dir=hdfs://output/savedmodel",
|
||||||
|
runJobCli.getRunJobParameters().getWorkerLaunchCmd());
|
||||||
|
Assert.assertEquals(
|
||||||
|
"python run-ps.py --input=hdfs://input --model_dir=hdfs://output/model",
|
||||||
|
runJobCli.getRunJobParameters().getPSLaunchCmd());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testResourceUnitParsing() throws Exception {
|
||||||
|
Resource res = CliUtils.createResourceFromString("memory=20g,vcores=3",
|
||||||
|
ResourceUtils.getResourcesTypeInfo());
|
||||||
|
Assert.assertEquals(Resources.createResource(20 * 1024, 3), res);
|
||||||
|
|
||||||
|
res = CliUtils.createResourceFromString("memory=20G,vcores=3",
|
||||||
|
ResourceUtils.getResourcesTypeInfo());
|
||||||
|
Assert.assertEquals(Resources.createResource(20 * 1024, 3), res);
|
||||||
|
|
||||||
|
res = CliUtils.createResourceFromString("memory=20M,vcores=3",
|
||||||
|
ResourceUtils.getResourcesTypeInfo());
|
||||||
|
Assert.assertEquals(Resources.createResource(20, 3), res);
|
||||||
|
|
||||||
|
res = CliUtils.createResourceFromString("memory=20m,vcores=3",
|
||||||
|
ResourceUtils.getResourcesTypeInfo());
|
||||||
|
Assert.assertEquals(Resources.createResource(20, 3), res);
|
||||||
|
|
||||||
|
res = CliUtils.createResourceFromString("memory-mb=20,vcores=3",
|
||||||
|
ResourceUtils.getResourcesTypeInfo());
|
||||||
|
Assert.assertEquals(Resources.createResource(20, 3), res);
|
||||||
|
|
||||||
|
res = CliUtils.createResourceFromString("memory-mb=20m,vcores=3",
|
||||||
|
ResourceUtils.getResourcesTypeInfo());
|
||||||
|
Assert.assertEquals(Resources.createResource(20, 3), res);
|
||||||
|
|
||||||
|
res = CliUtils.createResourceFromString("memory-mb=20G,vcores=3",
|
||||||
|
ResourceUtils.getResourcesTypeInfo());
|
||||||
|
Assert.assertEquals(Resources.createResource(20 * 1024, 3), res);
|
||||||
|
|
||||||
|
// W/o unit for memory means bits, and 20 bits will be rounded to 0
|
||||||
|
res = CliUtils.createResourceFromString("memory=20,vcores=3",
|
||||||
|
ResourceUtils.getResourcesTypeInfo());
|
||||||
|
Assert.assertEquals(Resources.createResource(0, 3), res);
|
||||||
|
|
||||||
|
// Test multiple resources
|
||||||
|
List<ResourceTypeInfo> resTypes = new ArrayList<>(
|
||||||
|
ResourceUtils.getResourcesTypeInfo());
|
||||||
|
resTypes.add(ResourceTypeInfo.newInstance(ResourceInformation.GPU_URI, ""));
|
||||||
|
ResourceUtils.reinitializeResources(resTypes);
|
||||||
|
res = CliUtils.createResourceFromString("memory=2G,vcores=3,gpu=0",
|
||||||
|
resTypes);
|
||||||
|
Assert.assertEquals(2 * 1024, res.getMemorySize());
|
||||||
|
Assert.assertEquals(0, res.getResourceValue(ResourceInformation.GPU_URI));
|
||||||
|
|
||||||
|
res = CliUtils.createResourceFromString("memory=2G,vcores=3,gpu=3",
|
||||||
|
resTypes);
|
||||||
|
Assert.assertEquals(2 * 1024, res.getMemorySize());
|
||||||
|
Assert.assertEquals(3, res.getResourceValue(ResourceInformation.GPU_URI));
|
||||||
|
|
||||||
|
res = CliUtils.createResourceFromString("memory=2G,vcores=3",
|
||||||
|
resTypes);
|
||||||
|
Assert.assertEquals(2 * 1024, res.getMemorySize());
|
||||||
|
Assert.assertEquals(0, res.getResourceValue(ResourceInformation.GPU_URI));
|
||||||
|
|
||||||
|
res = CliUtils.createResourceFromString("memory=2G,vcores=3,yarn.io/gpu=0",
|
||||||
|
resTypes);
|
||||||
|
Assert.assertEquals(2 * 1024, res.getMemorySize());
|
||||||
|
Assert.assertEquals(0, res.getResourceValue(ResourceInformation.GPU_URI));
|
||||||
|
|
||||||
|
res = CliUtils.createResourceFromString("memory=2G,vcores=3,yarn.io/gpu=3",
|
||||||
|
resTypes);
|
||||||
|
Assert.assertEquals(2 * 1024, res.getMemorySize());
|
||||||
|
Assert.assertEquals(3, res.getResourceValue(ResourceInformation.GPU_URI));
|
||||||
|
|
||||||
|
// TODO, add more negative tests.
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,104 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* <p>
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* <p>
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli;
|
||||||
|
|
||||||
|
import org.apache.commons.cli.ParseException;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||||
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
import org.apache.hadoop.yarn.submarine.client.cli.param.ShowJobParameters;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.MockClientContext;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.exception.SubmarineException;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.RuntimeFactory;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.common.MemorySubmarineStorage;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.common.StorageKeyConstants;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.common.SubmarineStorage;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Before;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import static org.mockito.Mockito.mock;
|
||||||
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
|
public class TestShowJobCliParsing {
|
||||||
|
@Before
|
||||||
|
public void before() {
|
||||||
|
SubmarineLogs.verboseOff();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testPrintHelp() {
|
||||||
|
MockClientContext mockClientContext = new MockClientContext();
|
||||||
|
ShowJobCli showJobCli = new ShowJobCli(mockClientContext);
|
||||||
|
showJobCli.printUsages();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testShowJob()
|
||||||
|
throws InterruptedException, SubmarineException, YarnException,
|
||||||
|
ParseException, IOException {
|
||||||
|
MockClientContext mockClientContext = new MockClientContext();
|
||||||
|
ShowJobCli showJobCli = new ShowJobCli(mockClientContext) {
|
||||||
|
@Override
|
||||||
|
protected void getAndPrintJobInfo() {
|
||||||
|
// do nothing
|
||||||
|
}
|
||||||
|
};
|
||||||
|
showJobCli.run(new String[] { "--name", "my-job" });
|
||||||
|
ShowJobParameters parameters = showJobCli.getParameters();
|
||||||
|
Assert.assertEquals(parameters.getName(), "my-job");
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map<String, String> getMockJobInfo(String jobName) {
|
||||||
|
Map<String, String> map = new HashMap<>();
|
||||||
|
map.put(StorageKeyConstants.APPLICATION_ID,
|
||||||
|
ApplicationId.newInstance(1234L, 1).toString());
|
||||||
|
map.put(StorageKeyConstants.JOB_RUN_ARGS, "job run 123456");
|
||||||
|
map.put(StorageKeyConstants.INPUT_PATH, "hdfs://" + jobName);
|
||||||
|
return map;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSimpleShowJob()
|
||||||
|
throws InterruptedException, SubmarineException, YarnException,
|
||||||
|
ParseException, IOException {
|
||||||
|
SubmarineStorage storage = new MemorySubmarineStorage();
|
||||||
|
MockClientContext mockClientContext = new MockClientContext();
|
||||||
|
RuntimeFactory runtimeFactory = mock(RuntimeFactory.class);
|
||||||
|
when(runtimeFactory.getSubmarineStorage()).thenReturn(storage);
|
||||||
|
mockClientContext.setRuntimeFactory(runtimeFactory);
|
||||||
|
|
||||||
|
ShowJobCli showJobCli = new ShowJobCli(mockClientContext);
|
||||||
|
|
||||||
|
try {
|
||||||
|
showJobCli.run(new String[] { "--name", "my-job" });
|
||||||
|
} catch (IOException e) {
|
||||||
|
// expected
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
storage.addNewJob("my-job", getMockJobInfo("my-job"));
|
||||||
|
showJobCli.run(new String[] { "--name", "my-job" });
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,167 @@
|
||||||
|
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 * <p>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.client.cli.yarnservice;

import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.service.api.records.Component;
import org.apache.hadoop.yarn.service.api.records.Service;
import org.apache.hadoop.yarn.service.client.ServiceClient;
import org.apache.hadoop.yarn.submarine.client.cli.RunJobCli;
import org.apache.hadoop.yarn.submarine.common.MockClientContext;
import org.apache.hadoop.yarn.submarine.common.api.TaskType;
import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
import org.apache.hadoop.yarn.submarine.runtimes.common.JobSubmitter;
import org.apache.hadoop.yarn.submarine.runtimes.common.StorageKeyConstants;
import org.apache.hadoop.yarn.submarine.runtimes.common.SubmarineStorage;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.YarnServiceJobSubmitter;
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.YarnServiceUtils;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

import java.io.IOException;
import java.util.Map;

import static org.mockito.Matchers.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

/**
 * Tests for the {@code run job} CLI against the YARN-service runtime.
 * A stubbed {@link ServiceClient} is installed before each test so job
 * submission returns a fixed application id without contacting a cluster.
 */
public class TestYarnServiceRunJobCli {
  @Before
  public void before() throws IOException, YarnException {
    SubmarineLogs.verboseOff();
    // Stub the service client: any actionCreate(Service) call returns a
    // canned application id instead of talking to YARN.
    ServiceClient serviceClient = mock(ServiceClient.class);
    when(serviceClient.actionCreate(any(Service.class))).thenReturn(
        ApplicationId.newInstance(1234L, 1));
    YarnServiceUtils.setStubServiceClient(serviceClient);
  }

  @Test
  public void testPrintHelp() {
    // Only verifies that usage printing does not throw.
    MockClientContext mockClientContext =
        YarnServiceCliTestUtils.getMockClientContext();
    RunJobCli runJobCli = new RunJobCli(mockClientContext);
    runJobCli.printUsages();
  }

  /** Extracts the generated YARN service spec from the job submitter. */
  private Service getServiceSpecFromJobSubmitter(JobSubmitter jobSubmitter) {
    return ((YarnServiceJobSubmitter) jobSubmitter).getServiceSpec();
  }

  @Test
  public void testBasicRunJobForDistributedTraining() throws Exception {
    MockClientContext mockClientContext =
        YarnServiceCliTestUtils.getMockClientContext();
    RunJobCli runJobCli = new RunJobCli(mockClientContext);
    Assert.assertFalse(SubmarineLogs.isVerbose());

    runJobCli.run(
        new String[] { "--name", "my-job", "--docker_image", "tf-docker:1.1.0",
            "--input_path", "s3://input", "--checkpoint_path",
            "s3://output", "--num_workers", "3", "--num_ps", "2",
            "--worker_launch_cmd", "python run-job.py", "--worker_resources",
            "memory=2048M,vcores=2", "--ps_resources", "memory=4096M,vcores=4",
            "--tensorboard", "true", "--ps_docker_image", "ps.image",
            "--worker_docker_image", "worker.image",
            "--ps_launch_cmd", "python run-ps.py", "--verbose" });
    Service serviceSpec = getServiceSpecFromJobSubmitter(
        runJobCli.getJobSubmitter());
    // Distributed training spec: primary worker + worker + ps components.
    Assert.assertEquals(3, serviceSpec.getComponents().size());
    // Use assertNotNull instead of assertTrue(x != null) for clearer
    // failure messages.
    Assert.assertNotNull(
        serviceSpec.getComponent(TaskType.WORKER.getComponentName()));
    Assert.assertNotNull(
        serviceSpec.getComponent(TaskType.PRIMARY_WORKER.getComponentName()));
    Assert.assertNotNull(
        serviceSpec.getComponent(TaskType.PS.getComponentName()));
    Component primaryWorkerComp = serviceSpec.getComponent(
        TaskType.PRIMARY_WORKER.getComponentName());
    Assert.assertEquals(2048, primaryWorkerComp.getResource().calcMemoryMB());
    Assert.assertEquals(2,
        primaryWorkerComp.getResource().getCpus().intValue());

    Component workerComp = serviceSpec.getComponent(
        TaskType.WORKER.getComponentName());
    Assert.assertEquals(2048, workerComp.getResource().calcMemoryMB());
    Assert.assertEquals(2, workerComp.getResource().getCpus().intValue());

    Component psComp = serviceSpec.getComponent(TaskType.PS.getComponentName());
    Assert.assertEquals(4096, psComp.getResource().calcMemoryMB());
    Assert.assertEquals(4, psComp.getResource().getCpus().intValue());

    // Role-specific docker images override the job-level image.
    Assert.assertEquals("worker.image", workerComp.getArtifact().getId());
    Assert.assertEquals("ps.image", psComp.getArtifact().getId());

    Assert.assertTrue(SubmarineLogs.isVerbose());

    // TODO, ADD TEST TO USE SERVICE CLIENT TO VALIDATE THE JSON SPEC
  }

  @Test
  public void testBasicRunJobForSingleNodeTraining() throws Exception {
    MockClientContext mockClientContext =
        YarnServiceCliTestUtils.getMockClientContext();
    RunJobCli runJobCli = new RunJobCli(mockClientContext);
    Assert.assertFalse(SubmarineLogs.isVerbose());

    runJobCli.run(
        new String[] { "--name", "my-job", "--docker_image", "tf-docker:1.1.0",
            "--input_path", "s3://input", "--checkpoint_path",
            "s3://output", "--num_workers", "1", "--worker_launch_cmd",
            "python run-job.py", "--worker_resources", "memory=2G,vcores=2",
            "--tensorboard", "true", "--verbose" });
    Service serviceSpec = getServiceSpecFromJobSubmitter(
        runJobCli.getJobSubmitter());
    // Single-node training collapses to just the primary worker component.
    Assert.assertEquals(1, serviceSpec.getComponents().size());
    Assert.assertNotNull(
        serviceSpec.getComponent(TaskType.PRIMARY_WORKER.getComponentName()));
    Component primaryWorkerComp = serviceSpec.getComponent(
        TaskType.PRIMARY_WORKER.getComponentName());
    Assert.assertEquals(2048, primaryWorkerComp.getResource().calcMemoryMB());
    Assert.assertEquals(2,
        primaryWorkerComp.getResource().getCpus().intValue());

    Assert.assertTrue(SubmarineLogs.isVerbose());

    // TODO, ADD TEST TO USE SERVICE CLIENT TO VALIDATE THE JSON SPEC
  }

  @Test
  public void testParameterStorageForTrainingJob() throws Exception {
    MockClientContext mockClientContext =
        YarnServiceCliTestUtils.getMockClientContext();
    RunJobCli runJobCli = new RunJobCli(mockClientContext);
    Assert.assertFalse(SubmarineLogs.isVerbose());

    runJobCli.run(
        new String[] { "--name", "my-job", "--docker_image", "tf-docker:1.1.0",
            "--input_path", "s3://input", "--checkpoint_path",
            "s3://output", "--num_workers", "1", "--worker_launch_cmd",
            "python run-job.py", "--worker_resources", "memory=2G,vcores=2",
            "--tensorboard", "true", "--verbose" });
    // Submitted job parameters must be persisted in the submarine storage.
    SubmarineStorage storage =
        mockClientContext.getRuntimeFactory().getSubmarineStorage();
    Map<String, String> jobInfo = storage.getJobInfoByName("my-job");
    Assert.assertTrue(jobInfo.size() > 0);
    Assert.assertEquals(jobInfo.get(StorageKeyConstants.INPUT_PATH),
        "s3://input");
  }
}
|
|
@ -0,0 +1,35 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
* <p>
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* <p>
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.client.cli.yarnservice;
|
||||||
|
|
||||||
|
import org.apache.hadoop.yarn.submarine.common.MockClientContext;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.RuntimeFactory;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.common.MemorySubmarineStorage;
|
||||||
|
import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.YarnServiceRuntimeFactory;
|
||||||
|
|
||||||
|
public class YarnServiceCliTestUtils {
|
||||||
|
public static MockClientContext getMockClientContext() {
|
||||||
|
MockClientContext mockClientContext = new MockClientContext();
|
||||||
|
RuntimeFactory runtimeFactory = new YarnServiceRuntimeFactory(
|
||||||
|
mockClientContext);
|
||||||
|
mockClientContext.setRuntimeFactory(runtimeFactory);
|
||||||
|
runtimeFactory.setSubmarineStorage(new MemorySubmarineStorage());
|
||||||
|
return mockClientContext;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,56 @@
|
||||||
|
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.common;

import org.apache.hadoop.yarn.submarine.common.fs.MockRemoteDirectoryManager;
import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
import org.apache.hadoop.yarn.client.api.YarnClient;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.service.client.ServiceClient;
import org.apache.hadoop.yarn.util.resource.ResourceUtils;

import java.io.IOException;

import static org.junit.Assert.fail;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

/**
 * Test double for {@code ClientContext}: backs the remote directory manager
 * with the local filesystem and hands out mocked YARN clients.
 */
public class MockClientContext extends ClientContext {
  // Single local-FS backed directory manager shared across calls.
  private MockRemoteDirectoryManager remoteDirectoryMgr =
      new MockRemoteDirectoryManager();

  @Override
  public RemoteDirectoryManager getRemoteDirectoryManager() {
    return remoteDirectoryMgr;
  }

  @Override
  public synchronized YarnClient getOrCreateYarnClient() {
    // NOTE: despite the "getOrCreate" name, a fresh mock is returned on
    // every call; the mock only needs to answer getResourceTypeInfo().
    YarnClient client = mock(YarnClient.class);
    try {
      when(client.getResourceTypeInfo()).thenReturn(
          ResourceUtils.getResourcesTypeInfo());
    } catch (YarnException | IOException e) {
      // Multi-catch replaces the two identical catch blocks of the
      // original; stubbing should never fail, so surface it as a
      // test failure.
      fail(e.getMessage());
    }
    return client;
  }
}
|
|
@ -0,0 +1,83 @@
|
||||||
|
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.common.fs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.File;
import java.io.IOException;

/**
 * Local-filesystem implementation of {@link RemoteDirectoryManager} for
 * tests; directories are created lazily under {@code target/}.
 */
public class MockRemoteDirectoryManager implements RemoteDirectoryManager {
  // Lazily-created parent directory for job staging areas.
  private File jobsParentDir = null;
  // Lazily-created parent directory for model directories.
  private File modelParentDir = null;

  /**
   * Creates {@code dir} (including parents) when missing.
   *
   * @throws IOException if the directory cannot be created.
   */
  private static void ensureDirExists(File dir) throws IOException {
    if (!dir.exists() && !dir.mkdirs()) {
      throw new IOException("Failed to mkdirs for " + dir.getAbsolutePath());
    }
  }

  @Override
  public Path getJobStagingArea(String jobName, boolean create)
      throws IOException {
    if (jobsParentDir == null && create) {
      jobsParentDir = new File(
          "target/_staging_area_" + System.currentTimeMillis());
      ensureDirExists(jobsParentDir);
    }
    if (jobsParentDir == null) {
      // Original code would NPE on jobsParentDir.getAbsolutePath() when
      // called with create=false before any staging area exists; fail
      // with the declared exception type instead.
      throw new IOException(
          "Staging area was never created, job=" + jobName);
    }

    File jobDir = new File(jobsParentDir.getAbsolutePath(), jobName);
    if (create) {
      ensureDirExists(jobDir);
    }
    return new Path(jobDir.getAbsolutePath());
  }

  @Override
  public Path getJobCheckpointDir(String jobName, boolean create)
      throws IOException {
    // Checkpoint dirs are not exercised by the current tests.
    return null;
  }

  @Override
  public Path getModelDir(String modelName, boolean create) throws IOException {
    if (modelParentDir == null && create) {
      modelParentDir = new File(
          "target/_models_" + System.currentTimeMillis());
      ensureDirExists(modelParentDir);
    }
    if (modelParentDir == null) {
      // Same NPE hazard as getJobStagingArea; see comment there.
      throw new IOException(
          "Model parent dir was never created, model=" + modelName);
    }

    File modelDir = new File(modelParentDir.getAbsolutePath(), modelName);
    if (create) {
      ensureDirExists(modelDir);
    }
    return new Path(modelDir.getAbsolutePath());
  }

  @Override
  public FileSystem getFileSystem() throws IOException {
    return FileSystem.getLocal(new Configuration());
  }
}
|
|
@ -0,0 +1,74 @@
|
||||||
|
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 * <p>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.submarine.runtimes.common;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * In-memory {@link SubmarineStorage}: job and model metadata are kept in
 * maps and lost when the JVM exits. Intended for tests.
 */
public class MemorySubmarineStorage extends SubmarineStorage {
  // jobName -> job parameters.
  private Map<String, Map<String, String>> jobsInfo = new HashMap<>();
  // modelName -> (version -> model parameters); version may be null.
  private Map<String, Map<String, Map<String, String>>> modelsInfo =
      new HashMap<>();

  @Override
  public synchronized void addNewJob(String jobName, Map<String, String> jobInfo)
      throws IOException {
    jobsInfo.put(jobName, jobInfo);
  }

  @Override
  public synchronized Map<String, String> getJobInfoByName(String jobName)
      throws IOException {
    Map<String, String> info = jobsInfo.get(jobName);
    if (info == null) {
      throw new IOException("Failed to find job=" + jobName);
    }
    return info;
  }

  @Override
  public synchronized void addNewModel(String modelName, String version,
      Map<String, String> modelInfo) throws IOException {
    // computeIfAbsent replaces the original containsKey/put/get sequence.
    modelsInfo.computeIfAbsent(modelName, k -> new HashMap<>())
        .put(version, modelInfo);
  }

  @Override
  public synchronized Map<String, String> getModelInfoByName(String modelName,
      String version) throws IOException {
    // Plain null checks instead of the original's
    // catch (NullPointerException) used as control flow.
    Map<String, Map<String, String>> versions = modelsInfo.get(modelName);
    Map<String, String> info = versions == null ? null : versions.get(version);

    if (info == null) {
      throw new IOException(
          "Failed to find, model=" + modelName + " version=" + version);
    }
    return info;
  }
}
|
|
@ -0,0 +1,73 @@
|
||||||
|
/**
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. See accompanying LICENSE file.
 */

package org.apache.hadoop.yarn.submarine.runtimes.common;

import org.apache.hadoop.yarn.submarine.common.ClientContext;
import org.apache.hadoop.yarn.submarine.common.fs.MockRemoteDirectoryManager;
import org.junit.Assert;
import org.junit.Test;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

/**
 * Round-trip tests for the filesystem-backed submarine storage: write job
 * and model metadata, re-open the storage, and read everything back.
 */
public class TestFSBasedSubmarineStorage {
  /** Builds a 4-entry map: prefix+1 -> "1" ... prefix+4 -> "4". */
  private Map<String, String> getMap(String prefix) {
    Map<String, String> map = new HashMap<>();
    for (int i = 1; i <= 4; i++) {
      map.put(prefix + i, String.valueOf(i));
    }
    return map;
  }

  /** Asserts that the two maps hold exactly the same entries. */
  private void compareMap(Map<String, String> expected,
      Map<String, String> actual) {
    Assert.assertEquals(expected.size(), actual.size());
    for (Map.Entry<String, String> entry : expected.entrySet()) {
      Assert.assertEquals(entry.getValue(), actual.get(entry.getKey()));
    }
  }

  @Test
  public void testStorageOps() throws IOException {
    MockRemoteDirectoryManager dirManager = new MockRemoteDirectoryManager();
    ClientContext clientContext = mock(ClientContext.class);
    when(clientContext.getRemoteDirectoryManager()).thenReturn(dirManager);

    FSBasedSubmarineStorageImpl storage = new FSBasedSubmarineStorageImpl(
        clientContext);
    storage.addNewJob("job1", getMap("job1"));
    storage.addNewJob("job2", getMap("job2"));
    storage.addNewJob("job3", getMap("job3"));
    storage.addNewJob("job4", new HashMap<>());
    storage.addNewModel("model1", "1.0", getMap("model1_1.0"));
    storage.addNewModel("model1", "2.0.0", getMap("model1_2.0.0"));
    storage.addNewModel("model2", null, getMap("model1_default"));
    storage.addNewModel("model2", "1.0", getMap("model2_1.0"));

    // create a new storage and read it back.
    storage = new FSBasedSubmarineStorageImpl(
        clientContext);
    compareMap(getMap("job1"), storage.getJobInfoByName("job1"));
    compareMap(getMap("job2"), storage.getJobInfoByName("job2"));
    compareMap(getMap("job3"), storage.getJobInfoByName("job3"));
    compareMap(new HashMap<>(), storage.getJobInfoByName("job4"));
    compareMap(getMap("model1_1.0"), storage.getModelInfoByName("model1", "1.0"));
    compareMap(getMap("model1_2.0.0"), storage.getModelInfoByName("model1", "2.0.0"));
    compareMap(getMap("model2_1.0"), storage.getModelInfoByName("model2", "1.0"));
  }
}
|
|
@ -0,0 +1,42 @@
|
||||||
|
/**
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
* <p>
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* <p>
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License. See accompanying LICENSE file.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.submarine.runtimes.yarnservice;
|
||||||
|
|
||||||
|
import org.codehaus.jettison.json.JSONException;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
public class TestTFConfigGenerator {
|
||||||
|
@Test
|
||||||
|
public void testSimpleDistributedTFConfigGenerator() throws JSONException {
|
||||||
|
String json = YarnServiceUtils.getTFConfigEnv("worker", 5, 3, "wtan",
|
||||||
|
"tf-job-001", "example.com");
|
||||||
|
String expected =
|
||||||
|
"{\\\"cluster\\\":{\\\"master\\\":[\\\"master-0.wtan.tf-job-001.example.com:8000\\\"],\\\"worker\\\":[\\\"worker-0.wtan.tf-job-001.example.com:8000\\\",\\\"worker-1.wtan.tf-job-001.example.com:8000\\\",\\\"worker-2.wtan.tf-job-001.example.com:8000\\\",\\\"worker-3.wtan.tf-job-001.example.com:8000\\\"],\\\"ps\\\":[\\\"ps-0.wtan.tf-job-001.example.com:8000\\\",\\\"ps-1.wtan.tf-job-001.example.com:8000\\\",\\\"ps-2.wtan.tf-job-001.example.com:8000\\\"]},\\\"task\\\":{ \\\"type\\\":\\\"worker\\\", \\\"index\\\":$_TASK_INDEX},\\\"environment\\\":\\\"cloud\\\"}";
|
||||||
|
Assert.assertEquals(expected, json);
|
||||||
|
|
||||||
|
json = YarnServiceUtils.getTFConfigEnv("ps", 5, 3, "wtan", "tf-job-001",
|
||||||
|
"example.com");
|
||||||
|
expected =
|
||||||
|
"{\\\"cluster\\\":{\\\"master\\\":[\\\"master-0.wtan.tf-job-001.example.com:8000\\\"],\\\"worker\\\":[\\\"worker-0.wtan.tf-job-001.example.com:8000\\\",\\\"worker-1.wtan.tf-job-001.example.com:8000\\\",\\\"worker-2.wtan.tf-job-001.example.com:8000\\\",\\\"worker-3.wtan.tf-job-001.example.com:8000\\\"],\\\"ps\\\":[\\\"ps-0.wtan.tf-job-001.example.com:8000\\\",\\\"ps-1.wtan.tf-job-001.example.com:8000\\\",\\\"ps-2.wtan.tf-job-001.example.com:8000\\\"]},\\\"task\\\":{ \\\"type\\\":\\\"ps\\\", \\\"index\\\":$_TASK_INDEX},\\\"environment\\\":\\\"cloud\\\"}";
|
||||||
|
Assert.assertEquals(expected, json);
|
||||||
|
|
||||||
|
json = YarnServiceUtils.getTFConfigEnv("master", 2, 1, "wtan", "tf-job-001",
|
||||||
|
"example.com");
|
||||||
|
expected =
|
||||||
|
"{\\\"cluster\\\":{\\\"master\\\":[\\\"master-0.wtan.tf-job-001.example.com:8000\\\"],\\\"worker\\\":[\\\"worker-0.wtan.tf-job-001.example.com:8000\\\"],\\\"ps\\\":[\\\"ps-0.wtan.tf-job-001.example.com:8000\\\"]},\\\"task\\\":{ \\\"type\\\":\\\"master\\\", \\\"index\\\":$_TASK_INDEX},\\\"environment\\\":\\\"cloud\\\"}";
|
||||||
|
Assert.assertEquals(expected, json);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,21 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
|
||||||
|
<!--
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License. See accompanying LICENSE file.
|
||||||
|
-->
|
||||||
|
|
||||||
|
<!-- Put site-specific property overrides in this file. -->
|
||||||
|
|
||||||
|
<configuration>
|
||||||
|
|
||||||
|
</configuration>
|
|
@ -0,0 +1,21 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
|
||||||
|
<!--
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License. See accompanying LICENSE file.
|
||||||
|
-->
|
||||||
|
|
||||||
|
<!-- Put site-specific property overrides in this file. -->
|
||||||
|
|
||||||
|
<configuration>
|
||||||
|
|
||||||
|
</configuration>
|
|
@ -37,6 +37,7 @@
|
||||||
<module>hadoop-yarn-applications-distributedshell</module>
|
<module>hadoop-yarn-applications-distributedshell</module>
|
||||||
<module>hadoop-yarn-applications-unmanaged-am-launcher</module>
|
<module>hadoop-yarn-applications-unmanaged-am-launcher</module>
|
||||||
<module>hadoop-yarn-services</module>
|
<module>hadoop-yarn-services</module>
|
||||||
|
<module>hadoop-yarn-submarine</module>
|
||||||
</modules>
|
</modules>
|
||||||
|
|
||||||
<profiles>
|
<profiles>
|
||||||
|
|
Loading…
Reference in New Issue