SUBMARINE-62. PS_LAUNCH_CMD CLI description is wrong in RunJobCli. Contributed by Adam Antal
This commit is contained in:
parent
91ce09e706
commit
be784de2d4
|
@ -71,13 +71,22 @@ import java.util.Map;
|
|||
public class RunJobCli extends AbstractCli {
|
||||
private static final Logger LOG =
|
||||
LoggerFactory.getLogger(RunJobCli.class);
|
||||
|
||||
private static final String TENSORFLOW = "TensorFlow";
|
||||
private static final String PYTORCH = "PyTorch";
|
||||
private static final String PS = "PS";
|
||||
private static final String WORKER = "worker";
|
||||
private static final String TENSORBOARD = "TensorBoard";
|
||||
|
||||
private static final String CAN_BE_USED_WITH_TF_PYTORCH =
|
||||
"Can be used with TensorFlow or PyTorch frameworks.";
|
||||
private static final String CAN_BE_USED_WITH_TF_ONLY =
|
||||
"Can only be used with TensorFlow framework.";
|
||||
String.format("Can be used with %s or %s frameworks.",
|
||||
TENSORFLOW, PYTORCH);
|
||||
private static final String TENSORFLOW_ONLY =
|
||||
String.format("Can only be used with %s framework.", TENSORFLOW);
|
||||
public static final String YAML_PARSE_FAILED = "Failed to parse " +
|
||||
"YAML config";
|
||||
|
||||
private static final String LOCAL_OR_ANY_FS_DIRECTORY = "Could be a local " +
|
||||
"directory or any other directory on the file system.";
|
||||
|
||||
private Options options;
|
||||
private JobSubmitter jobSubmitter;
|
||||
|
@ -112,50 +121,55 @@ public class RunJobCli extends AbstractCli {
|
|||
Framework.getValues()));
|
||||
options.addOption(CliConstants.NAME, true, "Name of the job");
|
||||
options.addOption(CliConstants.INPUT_PATH, true,
|
||||
"Input of the job, could be local or other FS directory");
|
||||
"Input of the job. " + LOCAL_OR_ANY_FS_DIRECTORY);
|
||||
options.addOption(CliConstants.CHECKPOINT_PATH, true,
|
||||
"Training output directory of the job, "
|
||||
+ "could be local or other FS directory. This typically includes "
|
||||
+ "checkpoint files and exported model ");
|
||||
"Training output directory of the job. " + LOCAL_OR_ANY_FS_DIRECTORY +
|
||||
"This typically includes checkpoint files and exported model");
|
||||
options.addOption(CliConstants.SAVED_MODEL_PATH, true,
|
||||
"Model exported path (savedmodel) of the job, which is needed when "
|
||||
+ "exported model is not placed under ${checkpoint_path}"
|
||||
+ "could be local or other FS directory. " +
|
||||
"This will be used to serve.");
|
||||
"Model exported path (saved model) of the job, which is needed when " +
|
||||
"exported model is not placed under ${checkpoint_path}. " +
|
||||
LOCAL_OR_ANY_FS_DIRECTORY + "This will be used to serve");
|
||||
options.addOption(CliConstants.DOCKER_IMAGE, true, "Docker image name/tag");
|
||||
options.addOption(CliConstants.PS_DOCKER_IMAGE, true,
|
||||
getDockerImageMessage(PS));
|
||||
options.addOption(CliConstants.WORKER_DOCKER_IMAGE, true,
|
||||
getDockerImageMessage(WORKER));
|
||||
options.addOption(CliConstants.QUEUE, true,
|
||||
"Name of queue to run the job, by default it uses default queue");
|
||||
"Name of queue to run the job. By default, the default queue is used");
|
||||
|
||||
addWorkerOptions(options);
|
||||
addPSOptions(options);
|
||||
addTensorboardOptions(options);
|
||||
|
||||
options.addOption(CliConstants.ENV, true,
|
||||
"Common environment variable of worker/ps");
|
||||
"Common environment variable passed to worker / PS");
|
||||
options.addOption(CliConstants.VERBOSE, false,
|
||||
"Print verbose log for troubleshooting");
|
||||
options.addOption(CliConstants.WAIT_JOB_FINISH, false,
|
||||
"Specified when user want to wait the job finish");
|
||||
options.addOption(CliConstants.QUICKLINK, true, "Specify quicklink so YARN"
|
||||
+ "web UI shows link to given role instance and port. When "
|
||||
+ "--tensorboard is specified, quicklink to tensorboard instance will "
|
||||
+ "be added automatically. The format of quick link is: "
|
||||
+ "Quick_link_label=http(or https)://role-name:port. For example, "
|
||||
+ "if want to link to first worker's 7070 port, and text of quicklink "
|
||||
+ "is Notebook_UI, user need to specify --quicklink "
|
||||
+ "Notebook_UI=https://master-0:7070");
|
||||
"Specified when user wants to wait for jobs to finish");
|
||||
options.addOption(CliConstants.QUICKLINK, true, "Specify quicklink so YARN "
|
||||
+ "web UI shows link to the given role instance and port. " +
|
||||
"When --tensorboard is specified, quicklink to the " +
|
||||
TENSORBOARD + " instance will be added automatically. " +
|
||||
"The format of quick link is: "
|
||||
+ "Quick_link_label=http(or https)://role-name:port. " +
|
||||
"For example, if users want to link to the first worker's 7070 port, " +
|
||||
"and text of quicklink is Notebook_UI, " +
|
||||
"users need to specify --quicklink Notebook_UI=https://master-0:7070");
|
||||
options.addOption(CliConstants.LOCALIZATION, true, "Specify"
|
||||
+ " localization to make remote/local file/directory available to"
|
||||
+ " all container(Docker)."
|
||||
+ " Argument format is \"RemoteUri:LocalFilePath[:rw] \" (ro"
|
||||
+ " permission is not supported yet)"
|
||||
+ " The RemoteUri can be a file or directory in local or"
|
||||
+ " HDFS or s3 or abfs or http .etc."
|
||||
+ " Argument format is: \"RemoteUri:LocalFilePath[:rw] \" "
|
||||
+ "(ro permission is not supported yet)."
|
||||
+ " The RemoteUri can be a local file or directory on the filesystem."
|
||||
+ " Alternatively, the following remote file systems / "
|
||||
+ "transmit mechanisms can be used: "
|
||||
+ " HDFS, S3 or abfs, HTTP, etc."
|
||||
+ " The LocalFilePath can be absolute or relative."
|
||||
+ " If it's a relative path, it'll be"
|
||||
+ " If it is a relative path, it will be"
|
||||
+ " under container's implied working directory"
|
||||
+ " but sub directory is not supported yet."
|
||||
+ " This option can be set mutiple times."
|
||||
+ " but sub-directory is not supported yet."
|
||||
+ " This option can be set multiple times."
|
||||
+ " Examples are \n"
|
||||
+ "-localization \"hdfs:///user/yarn/mydir2:/opt/data\"\n"
|
||||
+ "-localization \"s3a:///a/b/myfile1:./\"\n"
|
||||
|
@ -163,12 +177,12 @@ public class RunJobCli extends AbstractCli {
|
|||
+ "-localization \"/user/yarn/mydir3:/opt/mydir3\"\n"
|
||||
+ "-localization \"./mydir1:.\"\n");
|
||||
options.addOption(CliConstants.KEYTAB, true, "Specify keytab used by the " +
|
||||
"job under security environment");
|
||||
"job under a secured environment");
|
||||
options.addOption(CliConstants.PRINCIPAL, true, "Specify principal used " +
|
||||
"by the job under security environment");
|
||||
"by the job under a secured environment");
|
||||
options.addOption(CliConstants.DISTRIBUTE_KEYTAB, false, "Distribute " +
|
||||
"local keytab to cluster machines for service authentication. If not " +
|
||||
"specified, pre-distributed keytab of which path specified by" +
|
||||
"local keytab to cluster machines for service authentication. " +
|
||||
"If not specified, pre-distributed keytab of which path specified by" +
|
||||
" parameter" + CliConstants.KEYTAB + " on cluster machines will be " +
|
||||
"used");
|
||||
options.addOption("h", "help", false, "Print help");
|
||||
|
@ -180,54 +194,67 @@ public class RunJobCli extends AbstractCli {
|
|||
|
||||
private void addWorkerOptions(Options options) {
|
||||
options.addOption(CliConstants.N_WORKERS, true,
|
||||
"Number of worker tasks of the job, by default it's 1." +
|
||||
getNumberOfServiceMessage(WORKER, 1) +
|
||||
CAN_BE_USED_WITH_TF_PYTORCH);
|
||||
options.addOption(CliConstants.WORKER_DOCKER_IMAGE, true,
|
||||
"Specify docker image for WORKER, when this is not specified, WORKER "
|
||||
+ "uses --" + CliConstants.DOCKER_IMAGE + " as default." +
|
||||
getDockerImageMessage(WORKER) +
|
||||
CAN_BE_USED_WITH_TF_PYTORCH);
|
||||
options.addOption(CliConstants.WORKER_LAUNCH_CMD, true,
|
||||
"Commandline of worker, arguments will be "
|
||||
+ "directly used to launch the worker" +
|
||||
getLaunchCommandMessage(WORKER) +
|
||||
CAN_BE_USED_WITH_TF_PYTORCH);
|
||||
options.addOption(CliConstants.WORKER_RES, true,
|
||||
"Resource of each worker, for example "
|
||||
+ "memory-mb=2048,vcores=2,yarn.io/gpu=2" +
|
||||
getServiceResourceMessage(WORKER) +
|
||||
CAN_BE_USED_WITH_TF_PYTORCH);
|
||||
}
|
||||
|
||||
private void addPSOptions(Options options) {
|
||||
options.addOption(CliConstants.N_PS, true,
|
||||
"Number of PS tasks of the job, by default it's 0. " +
|
||||
CAN_BE_USED_WITH_TF_ONLY);
|
||||
getNumberOfServiceMessage("PS", 0) +
|
||||
TENSORFLOW_ONLY);
|
||||
options.addOption(CliConstants.PS_DOCKER_IMAGE, true,
|
||||
"Specify docker image for PS, when this is not specified, PS uses --"
|
||||
+ CliConstants.DOCKER_IMAGE + " as default." +
|
||||
CAN_BE_USED_WITH_TF_ONLY);
|
||||
getDockerImageMessage(PS) +
|
||||
TENSORFLOW_ONLY);
|
||||
options.addOption(CliConstants.PS_LAUNCH_CMD, true,
|
||||
"Commandline of worker, arguments will be "
|
||||
+ "directly used to launch the PS" +
|
||||
CAN_BE_USED_WITH_TF_ONLY);
|
||||
getLaunchCommandMessage("PS") +
|
||||
TENSORFLOW_ONLY);
|
||||
options.addOption(CliConstants.PS_RES, true,
|
||||
"Resource of each PS, for example "
|
||||
+ "memory-mb=2048,vcores=2,yarn.io/gpu=2" +
|
||||
CAN_BE_USED_WITH_TF_ONLY);
|
||||
getServiceResourceMessage("PS") +
|
||||
TENSORFLOW_ONLY);
|
||||
}
|
||||
|
||||
private void addTensorboardOptions(Options options) {
|
||||
options.addOption(CliConstants.TENSORBOARD, false,
|
||||
"Should we run TensorBoard"
|
||||
+ " for this job? By default it's disabled." +
|
||||
CAN_BE_USED_WITH_TF_ONLY);
|
||||
"Should we run TensorBoard for this job? " +
|
||||
"By default, TensorBoard is disabled." +
|
||||
TENSORFLOW_ONLY);
|
||||
options.addOption(CliConstants.TENSORBOARD_RESOURCES, true,
|
||||
"Specify resources of Tensorboard, by default it is "
|
||||
"Specifies resources of Tensorboard. The default resource is: "
|
||||
+ CliConstants.TENSORBOARD_DEFAULT_RESOURCES + "." +
|
||||
CAN_BE_USED_WITH_TF_ONLY);
|
||||
TENSORFLOW_ONLY);
|
||||
options.addOption(CliConstants.TENSORBOARD_DOCKER_IMAGE, true,
|
||||
"Specify Tensorboard docker image. when this is not "
|
||||
+ "specified, Tensorboard " + "uses --" + CliConstants.DOCKER_IMAGE
|
||||
+ " as default." +
|
||||
CAN_BE_USED_WITH_TF_ONLY);
|
||||
getDockerImageMessage(TENSORBOARD));
|
||||
}
|
||||
|
||||
private String getLaunchCommandMessage(String service) {
|
||||
return String.format("Launch command of the %s, arguments will be "
|
||||
+ "directly used to launch the %s", service, service);
|
||||
}
|
||||
|
||||
private String getServiceResourceMessage(String serviceType) {
|
||||
return String.format("Resource of each %s process, for example: "
|
||||
+ "memory-mb=2048,vcores=2,yarn.io/gpu=2", serviceType);
|
||||
}
|
||||
|
||||
private String getNumberOfServiceMessage(String serviceType,
|
||||
int defaultValue) {
|
||||
return String.format("Number of %s processes for the job. " +
|
||||
"The default value is %d.", serviceType, defaultValue);
|
||||
}
|
||||
|
||||
private String getDockerImageMessage(String serviceType) {
|
||||
return String.format("Specifies docker image for the %s process. " +
|
||||
"When not specified, %s uses --%s as a default value.",
|
||||
serviceType, serviceType, CliConstants.DOCKER_IMAGE);
|
||||
}
|
||||
|
||||
private void parseCommandLineAndGetRunJobParameters(String[] args)
|
||||
|
|
Loading…
Reference in New Issue