SUBMARINE-62. PS_LAUNCH_CMD CLI description is wrong in RunJobCli. Contributed by Adam Antal

This commit is contained in:
Szilard Nemeth 2019-07-15 11:17:16 +02:00
parent 91ce09e706
commit be784de2d4
1 changed files with 88 additions and 61 deletions

View File

@ -71,13 +71,22 @@ import java.util.Map;
public class RunJobCli extends AbstractCli {
private static final Logger LOG =
LoggerFactory.getLogger(RunJobCli.class);
private static final String TENSORFLOW = "TensorFlow";
private static final String PYTORCH = "PyTorch";
private static final String PS = "PS";
private static final String WORKER = "worker";
private static final String TENSORBOARD = "TensorBoard";
private static final String CAN_BE_USED_WITH_TF_PYTORCH =
"Can be used with TensorFlow or PyTorch frameworks.";
private static final String CAN_BE_USED_WITH_TF_ONLY =
"Can only be used with TensorFlow framework.";
String.format("Can be used with %s or %s frameworks.",
TENSORFLOW, PYTORCH);
private static final String TENSORFLOW_ONLY =
String.format("Can only be used with %s framework.", TENSORFLOW);
public static final String YAML_PARSE_FAILED = "Failed to parse " +
"YAML config";
private static final String LOCAL_OR_ANY_FS_DIRECTORY = "Could be a local " +
"directory or any other directory on the file system.";
private Options options;
private JobSubmitter jobSubmitter;
@ -112,50 +121,55 @@ public class RunJobCli extends AbstractCli {
Framework.getValues()));
options.addOption(CliConstants.NAME, true, "Name of the job");
options.addOption(CliConstants.INPUT_PATH, true,
"Input of the job, could be local or other FS directory");
"Input of the job. " + LOCAL_OR_ANY_FS_DIRECTORY);
options.addOption(CliConstants.CHECKPOINT_PATH, true,
"Training output directory of the job, "
+ "could be local or other FS directory. This typically includes "
+ "checkpoint files and exported model ");
"Training output directory of the job. " + LOCAL_OR_ANY_FS_DIRECTORY +
"This typically includes checkpoint files and exported model");
options.addOption(CliConstants.SAVED_MODEL_PATH, true,
"Model exported path (savedmodel) of the job, which is needed when "
+ "exported model is not placed under ${checkpoint_path}"
+ "could be local or other FS directory. " +
"This will be used to serve.");
"Model exported path (saved model) of the job, which is needed when " +
"exported model is not placed under ${checkpoint_path}. " +
LOCAL_OR_ANY_FS_DIRECTORY + "This will be used to serve");
options.addOption(CliConstants.DOCKER_IMAGE, true, "Docker image name/tag");
options.addOption(CliConstants.PS_DOCKER_IMAGE, true,
getDockerImageMessage(PS));
options.addOption(CliConstants.WORKER_DOCKER_IMAGE, true,
getDockerImageMessage(WORKER));
options.addOption(CliConstants.QUEUE, true,
"Name of queue to run the job, by default it uses default queue");
"Name of queue to run the job. By default, the default queue is used");
addWorkerOptions(options);
addPSOptions(options);
addTensorboardOptions(options);
options.addOption(CliConstants.ENV, true,
"Common environment variable of worker/ps");
"Common environment variable passed to worker / PS");
options.addOption(CliConstants.VERBOSE, false,
"Print verbose log for troubleshooting");
options.addOption(CliConstants.WAIT_JOB_FINISH, false,
"Specified when user want to wait the job finish");
options.addOption(CliConstants.QUICKLINK, true, "Specify quicklink so YARN"
+ "web UI shows link to given role instance and port. When "
+ "--tensorboard is specified, quicklink to tensorboard instance will "
+ "be added automatically. The format of quick link is: "
+ "Quick_link_label=http(or https)://role-name:port. For example, "
+ "if want to link to first worker's 7070 port, and text of quicklink "
+ "is Notebook_UI, user need to specify --quicklink "
+ "Notebook_UI=https://master-0:7070");
"Specified when user wants to wait for jobs to finish");
options.addOption(CliConstants.QUICKLINK, true, "Specify quicklink so YARN "
+ "web UI shows link to the given role instance and port. " +
"When --tensorboard is specified, quicklink to the " +
TENSORBOARD + " instance will be added automatically. " +
"The format of quick link is: "
+ "Quick_link_label=http(or https)://role-name:port. " +
"For example, if users want to link to the first worker's 7070 port, " +
"and text of quicklink is Notebook_UI, " +
"users need to specify --quicklink Notebook_UI=https://master-0:7070");
options.addOption(CliConstants.LOCALIZATION, true, "Specify"
+ " localization to make remote/local file/directory available to"
+ " all container(Docker)."
+ " Argument format is \"RemoteUri:LocalFilePath[:rw] \" (ro"
+ " permission is not supported yet)"
+ " The RemoteUri can be a file or directory in local or"
+ " HDFS or s3 or abfs or http .etc."
+ " Argument format is: \"RemoteUri:LocalFilePath[:rw] \" "
+ "(ro permission is not supported yet)."
+ " The RemoteUri can be a local file or directory on the filesystem."
+ " Alternatively, the following remote file systems / "
+ "transmit mechanisms can be used: "
+ " HDFS, S3 or abfs, HTTP, etc."
+ " The LocalFilePath can be absolute or relative."
+ " If it's a relative path, it'll be"
+ " If it is a relative path, it will be"
+ " under container's implied working directory"
+ " but sub directory is not supported yet."
+ " This option can be set mutiple times."
+ " but sub-directory is not supported yet."
+ " This option can be set multiple times."
+ " Examples are \n"
+ "-localization \"hdfs:///user/yarn/mydir2:/opt/data\"\n"
+ "-localization \"s3a:///a/b/myfile1:./\"\n"
@ -163,12 +177,12 @@ public class RunJobCli extends AbstractCli {
+ "-localization \"/user/yarn/mydir3:/opt/mydir3\"\n"
+ "-localization \"./mydir1:.\"\n");
options.addOption(CliConstants.KEYTAB, true, "Specify keytab used by the " +
"job under security environment");
"job under a secured environment");
options.addOption(CliConstants.PRINCIPAL, true, "Specify principal used " +
"by the job under security environment");
"by the job under a secured environment");
options.addOption(CliConstants.DISTRIBUTE_KEYTAB, false, "Distribute " +
"local keytab to cluster machines for service authentication. If not " +
"specified, pre-distributed keytab of which path specified by" +
"local keytab to cluster machines for service authentication. " +
"If not specified, pre-distributed keytab of which path specified by" +
" parameter" + CliConstants.KEYTAB + " on cluster machines will be " +
"used");
options.addOption("h", "help", false, "Print help");
@ -180,54 +194,67 @@ public class RunJobCli extends AbstractCli {
private void addWorkerOptions(Options options) {
options.addOption(CliConstants.N_WORKERS, true,
"Number of worker tasks of the job, by default it's 1." +
getNumberOfServiceMessage(WORKER, 1) +
CAN_BE_USED_WITH_TF_PYTORCH);
options.addOption(CliConstants.WORKER_DOCKER_IMAGE, true,
"Specify docker image for WORKER, when this is not specified, WORKER "
+ "uses --" + CliConstants.DOCKER_IMAGE + " as default." +
getDockerImageMessage(WORKER) +
CAN_BE_USED_WITH_TF_PYTORCH);
options.addOption(CliConstants.WORKER_LAUNCH_CMD, true,
"Commandline of worker, arguments will be "
+ "directly used to launch the worker" +
getLaunchCommandMessage(WORKER) +
CAN_BE_USED_WITH_TF_PYTORCH);
options.addOption(CliConstants.WORKER_RES, true,
"Resource of each worker, for example "
+ "memory-mb=2048,vcores=2,yarn.io/gpu=2" +
getServiceResourceMessage(WORKER) +
CAN_BE_USED_WITH_TF_PYTORCH);
}
private void addPSOptions(Options options) {
options.addOption(CliConstants.N_PS, true,
"Number of PS tasks of the job, by default it's 0. " +
CAN_BE_USED_WITH_TF_ONLY);
getNumberOfServiceMessage("PS", 0) +
TENSORFLOW_ONLY);
options.addOption(CliConstants.PS_DOCKER_IMAGE, true,
"Specify docker image for PS, when this is not specified, PS uses --"
+ CliConstants.DOCKER_IMAGE + " as default." +
CAN_BE_USED_WITH_TF_ONLY);
getDockerImageMessage(PS) +
TENSORFLOW_ONLY);
options.addOption(CliConstants.PS_LAUNCH_CMD, true,
"Commandline of worker, arguments will be "
+ "directly used to launch the PS" +
CAN_BE_USED_WITH_TF_ONLY);
getLaunchCommandMessage("PS") +
TENSORFLOW_ONLY);
options.addOption(CliConstants.PS_RES, true,
"Resource of each PS, for example "
+ "memory-mb=2048,vcores=2,yarn.io/gpu=2" +
CAN_BE_USED_WITH_TF_ONLY);
getServiceResourceMessage("PS") +
TENSORFLOW_ONLY);
}
private void addTensorboardOptions(Options options) {
options.addOption(CliConstants.TENSORBOARD, false,
"Should we run TensorBoard"
+ " for this job? By default it's disabled." +
CAN_BE_USED_WITH_TF_ONLY);
"Should we run TensorBoard for this job? " +
"By default, TensorBoard is disabled." +
TENSORFLOW_ONLY);
options.addOption(CliConstants.TENSORBOARD_RESOURCES, true,
"Specify resources of Tensorboard, by default it is "
"Specifies resources of Tensorboard. The default resource is: "
+ CliConstants.TENSORBOARD_DEFAULT_RESOURCES + "." +
CAN_BE_USED_WITH_TF_ONLY);
TENSORFLOW_ONLY);
options.addOption(CliConstants.TENSORBOARD_DOCKER_IMAGE, true,
"Specify Tensorboard docker image. when this is not "
+ "specified, Tensorboard " + "uses --" + CliConstants.DOCKER_IMAGE
+ " as default." +
CAN_BE_USED_WITH_TF_ONLY);
getDockerImageMessage(TENSORBOARD));
}
private String getLaunchCommandMessage(String service) {
return String.format("Launch command of the %s, arguments will be "
+ "directly used to launch the %s", service, service);
}
private String getServiceResourceMessage(String serviceType) {
return String.format("Resource of each %s process, for example: "
+ "memory-mb=2048,vcores=2,yarn.io/gpu=2", serviceType);
}
private String getNumberOfServiceMessage(String serviceType,
int defaultValue) {
return String.format("Number of %s processes for the job. " +
"The default value is %d.", serviceType, defaultValue);
}
private String getDockerImageMessage(String serviceType) {
return String.format("Specifies docker image for the %s process. " +
"When not specified, %s uses --%s as a default value.",
serviceType, serviceType, CliConstants.DOCKER_IMAGE);
}
private void parseCommandLineAndGetRunJobParameters(String[] args)