SUBMARINE-58. Submarine client needs to generate fat jar. Contributed by Zac Zhou.
parent 732133cb2a
commit 729ccb2cab
@@ -0,0 +1,183 @@
+<?xml version="1.0"?>
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
+                             http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <artifactId>hadoop-submarine</artifactId>
+    <groupId>org.apache.hadoop</groupId>
+    <version>0.2.0-SNAPSHOT</version>
+  </parent>
+  <artifactId>${project.artifactId}</artifactId>
+  <version>${project.version}</version>
+  <name>Hadoop Submarine All</name>
+
+  <properties>
+    <!-- Needed for generating FindBugs warnings using parent pom -->
+    <yarn.basedir>${project.parent.parent.basedir}</yarn.basedir>
+    <project.artifactId>hadoop-submarine-all</project.artifactId>
+    <project.version>0.2.0-SNAPSHOT</project.version>
+  </properties>
+
+  <dependencies>
+
+    <!-- Dependencies for Hadoop commons -->
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-hdfs</artifactId>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-common</artifactId>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-submarine-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+  </dependencies>
+
+  <profiles>
+
+    <profile>
+      <id>hadoop-3.2</id>
+      <dependencies>
+        <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-submarine-yarnservice-runtime</artifactId>
+          <version>${project.version}</version>
+        </dependency>
+        <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-submarine-tony-runtime</artifactId>
+          <version>${project.version}</version>
+        </dependency>
+        <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-hdfs-client</artifactId>
+          <version>${hadoop.version}</version>
+        </dependency>
+      </dependencies>
+    </profile>
+
+    <!-- Default profile -->
+    <profile>
+      <id>hadoop-3.1</id>
+      <activation>
+        <activeByDefault>true</activeByDefault>
+      </activation>
+      <dependencies>
+        <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-submarine-yarnservice-runtime</artifactId>
+          <version>${project.version}</version>
+        </dependency>
+        <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-submarine-tony-runtime</artifactId>
+          <version>${project.version}</version>
+        </dependency>
+        <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-hdfs-client</artifactId>
+          <version>${hadoop.version}</version>
+        </dependency>
+      </dependencies>
+    </profile>
+
+    <profile>
+      <id>hadoop-2.9</id>
+      <dependencies>
+        <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-submarine-tony-runtime</artifactId>
+          <version>${project.version}</version>
+        </dependency>
+        <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-hdfs-client</artifactId>
+          <version>${hadoop.version}</version>
+        </dependency>
+      </dependencies>
+    </profile>
+
+    <profile>
+      <id>hadoop-2.7</id>
+      <dependencies>
+        <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-submarine-tony-runtime</artifactId>
+          <version>${project.version}</version>
+        </dependency>
+      </dependencies>
+    </profile>
+  </profiles>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-shade-plugin</artifactId>
+        <version>3.2.1</version>
+        <executions>
+          <execution>
+            <phase>package</phase>
+            <goals>
+              <goal>shade</goal>
+            </goals>
+            <configuration>
+              <!--
+              <shadedArtifactAttached>true</shadedArtifactAttached>
+              <shadedClassifierName>with-all-dependencies</shadedClassifierName>
+              -->
+              <outputFile>target/${project.artifactId}-${project.version}-${project.activeProfiles[0].id}.jar</outputFile>
+              <artifactSet>
+                <excludes>
+                  <exclude>classworlds:classworlds</exclude>
+                  <exclude>junit:junit</exclude>
+                  <exclude>jmock:*</exclude>
+                  <exclude>*:xml-apis</exclude>
+                  <exclude>org.apache.maven:lib:tests</exclude>
+                </excludes>
+              </artifactSet>
+              <filters>
+                <filter>
+                  <artifact>*:*</artifact>
+                  <excludes>
+                    <exclude>META-INF/*.SF</exclude>
+                    <exclude>META-INF/*.DSA</exclude>
+                    <exclude>META-INF/*.RSA</exclude>
+                  </excludes>
+                </filter>
+              </filters>
+              <transformers>
+                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                  <mainClass>org.apache.hadoop.yarn.submarine.client.cli.Cli</mainClass>
+                </transformer>
+                <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
+              </transformers>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+
+</project>
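With this shade configuration, `mvn package` writes the fat jar to `target/${project.artifactId}-${project.version}-${project.activeProfiles[0].id}.jar`. Assuming the default `hadoop-3.1` profile and the `0.2.0-SNAPSHOT` version declared above, that resolves to `target/hadoop-submarine-all-0.2.0-SNAPSHOT-hadoop-3.1.jar`, which is the `hadoop-submarine-all-*.jar` that the QuickStart changes below refer to.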
@@ -67,6 +67,10 @@
       <groupId>org.yaml</groupId>
       <artifactId>snakeyaml</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-lang3</artifactId>
+    </dependency>
 
     <!-- Dependencies for Hadoop commons -->
 
@@ -45,7 +45,7 @@ import org.apache.hadoop.yarn.submarine.client.cli.runjob.RoleParameters;
 import org.apache.hadoop.yarn.submarine.common.ClientContext;
 import org.apache.hadoop.yarn.submarine.common.api.TensorFlowRole;
 import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
-import org.apache.hadoop.yarn.util.resource.ResourceUtils;
+import org.apache.hadoop.yarn.submarine.common.resource.ResourceUtils;
 import org.yaml.snakeyaml.introspector.Property;
 import org.yaml.snakeyaml.introspector.PropertyUtils;
 
@@ -271,8 +271,7 @@ public abstract class RunJobParameters extends RunParameters {
         throw new ParseException(
             "--" + CliConstants.WORKER_RES + " is absent.");
       }
-      return ResourceUtils.createResourceFromString(workerResourceStr,
-          clientContext.getOrCreateYarnClient().getResourceTypeInfo());
+      return ResourceUtils.createResourceFromString(workerResourceStr);
     }
     return null;
   }
@@ -27,7 +27,7 @@ import org.apache.hadoop.yarn.submarine.client.cli.param.ParametersHolder;
 import org.apache.hadoop.yarn.submarine.client.cli.runjob.RoleParameters;
 import org.apache.hadoop.yarn.submarine.common.ClientContext;
 import org.apache.hadoop.yarn.submarine.common.api.TensorFlowRole;
-import org.apache.hadoop.yarn.util.resource.ResourceUtils;
+import org.apache.hadoop.yarn.submarine.common.resource.ResourceUtils;
 
 import java.io.IOException;
 import java.util.List;
@@ -127,8 +127,7 @@ public class TensorFlowRunJobParameters extends RunJobParameters {
       if (psResourceStr == null) {
         throw new ParseException("--" + CliConstants.PS_RES + " is absent.");
       }
-      return ResourceUtils.createResourceFromString(psResourceStr,
-          clientContext.getOrCreateYarnClient().getResourceTypeInfo());
+      return ResourceUtils.createResourceFromString(psResourceStr);
     }
     return null;
   }
@@ -151,9 +150,8 @@ public class TensorFlowRunJobParameters extends RunJobParameters {
     if (tensorboardResourceStr == null || tensorboardResourceStr.isEmpty()) {
       tensorboardResourceStr = CliConstants.TENSORBOARD_DEFAULT_RESOURCES;
     }
-    Resource tensorboardResource =
-        ResourceUtils.createResourceFromString(tensorboardResourceStr,
-            clientContext.getOrCreateYarnClient().getResourceTypeInfo());
+    Resource tensorboardResource = ResourceUtils.createResourceFromString(
+        tensorboardResourceStr);
     String tensorboardDockerImage =
         parametersHolder.getOptionValue(CliConstants.TENSORBOARD_DOCKER_IMAGE);
     return new RoleParameters(TensorFlowRole.TENSORBOARD, 1, null,
@@ -0,0 +1,332 @@
+/**
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. See accompanying LICENSE file.
+ */
+
+package org.apache.hadoop.yarn.submarine.common.resource;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.submarine.common.exception.SubmarineRuntimeException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * This class implements some methods with almost the same logic as
+ * org.apache.hadoop.yarn.util.resource.ResourceUtils of hadoop 3.3.
+ * If the hadoop dependencies are upgraded to 3.3, this class can be refactored
+ * with org.apache.hadoop.yarn.util.resource.ResourceUtils.
+ */
+public final class ResourceUtils {
+
+  private final static String RES_PATTERN = "^[^=]+=\\d+\\s?\\w*$";
+  private final static String SET_RESOURCE_VALUE_METHOD = "setResourceValue";
+  private final static String SET_MEMORY_SIZE_METHOD = "setMemorySize";
+  private final static String DEPRECATED_SET_MEMORY_SIZE_METHOD =
+      "setMemory";
+  private final static String GET_MEMORY_SIZE_METHOD = "getMemorySize";
+  private final static String DEPRECATED_GET_MEMORY_SIZE_METHOD =
+      "getMemory";
+  private final static String GET_RESOURCE_VALUE_METHOD = "getResourceValue";
+  private final static String GET_RESOURCE_TYPE_METHOD =
+      "getResourcesTypeInfo";
+  private final static String REINITIALIZE_RESOURCES_METHOD =
+      "reinitializeResources";
+  public static final String MEMORY_URI = "memory-mb";
+  public static final String VCORES_URI = "vcores";
+  public static final String GPU_URI = "yarn.io/gpu";
+  public static final String FPGA_URI = "yarn.io/fpga";
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(ResourceUtils.class);
+
+  private ResourceUtils() {}
+
+  public static Resource createResourceFromString(String resourceStr) {
+    Map<String, Long> typeToValue = parseResourcesString(resourceStr);
+    Resource resource = Resource.newInstance(0, 0);
+    for (Map.Entry<String, Long> entry : typeToValue.entrySet()) {
+      if (entry.getKey().equals(VCORES_URI)) {
+        resource.setVirtualCores(entry.getValue().intValue());
+        continue;
+      } else if (entry.getKey().equals(MEMORY_URI)) {
+        setMemorySize(resource, entry.getValue());
+        continue;
+      }
+      setResource(resource, entry.getKey(), entry.getValue().intValue());
+    }
+    return resource;
+  }
+
+  private static Map<String, Long> parseResourcesString(String resourcesStr) {
+    Map<String, Long> resources = new HashMap<>();
+    String[] pairs = resourcesStr.trim().split(",");
+    for (String resource : pairs) {
+      resource = resource.trim();
+      if (!resource.matches(RES_PATTERN)) {
+        throw new IllegalArgumentException("\"" + resource + "\" is not a "
+            + "valid resource type/amount pair. "
+            + "Please provide key=amount pairs separated by commas.");
+      }
+      String[] splits = resource.split("=");
+      String key = splits[0], value = splits[1];
+      String units = getUnits(value);
+
+      String valueWithoutUnit = value.substring(0,
+          value.length() - units.length()).trim();
+      long resourceValue = Long.parseLong(valueWithoutUnit);
+
+      // Convert commandline unit to standard YARN unit.
+      if (units.equals("M") || units.equals("m")) {
+        units = "Mi";
+      } else if (units.equals("G") || units.equals("g")) {
+        units = "Gi";
+      } else if (!units.isEmpty()) {
+        throw new IllegalArgumentException("Acceptable units are M/G or empty");
+      }
+
+      // special handle memory-mb and memory
+      if (key.equals(MEMORY_URI)) {
+        if (!units.isEmpty()) {
+          resourceValue = UnitsConversionUtil.convert(units, "Mi",
+              resourceValue);
+        }
+      }
+
+      if (key.equals("memory")) {
+        key = MEMORY_URI;
+        resourceValue = UnitsConversionUtil.convert(units, "Mi",
+            resourceValue);
+      }
+
+      // special handle gpu
+      if (key.equals("gpu")) {
+        key = GPU_URI;
+      }
+
+      // special handle fpga
+      if (key.equals("fpga")) {
+        key = FPGA_URI;
+      }
+
+      resources.put(key, resourceValue);
+    }
+    return resources;
+  }
+
+  /**
+   * Hadoop 2.9.2 and lower don't support resources other than cpu and memory,
+   * so use reflection to set GPU or other resource types for compatibility
+   * with hadoop 2.9.2.
+   */
+  public static void setResource(Resource resource, String resourceName,
+      int resourceValue) {
+    try {
+      Method method = resource.getClass().getMethod(SET_RESOURCE_VALUE_METHOD,
+          String.class, long.class);
+      method.invoke(resource, resourceName, resourceValue);
+    } catch (NoSuchMethodException e) {
+      LOG.error("There is no '" + SET_RESOURCE_VALUE_METHOD + "' API in this" +
+          " version of YARN", e);
+      throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
+    } catch (IllegalAccessException | InvocationTargetException e) {
+      LOG.error("Failed to invoke '" + SET_RESOURCE_VALUE_METHOD +
+          "' method to set GPU resources", e);
+      throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
+    }
+  }
+
+  public static void setMemorySize(Resource resource, Long memorySize) {
+    boolean useWithIntParameter = false;
+    // For hadoop 2.9.2 and above
+    try {
+      Method method = resource.getClass().getMethod(SET_MEMORY_SIZE_METHOD,
+          long.class);
+      method.setAccessible(true);
+      method.invoke(resource, memorySize);
+    } catch (NoSuchMethodException nsme) {
+      LOG.info("There is no '" + SET_MEMORY_SIZE_METHOD + "(long)' API in" +
+          " this version of YARN");
+      useWithIntParameter = true;
+    } catch (IllegalAccessException | InvocationTargetException e) {
+      LOG.error("Failed to invoke '" + SET_MEMORY_SIZE_METHOD +
+          "' method", e);
+      throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
+    }
+    // For hadoop 2.7.3
+    if (useWithIntParameter) {
+      try {
+        LOG.info("Trying to use '" + DEPRECATED_SET_MEMORY_SIZE_METHOD +
+            "(int)' API for this version of YARN");
+        Method method = resource.getClass().getMethod(
+            DEPRECATED_SET_MEMORY_SIZE_METHOD, int.class);
+        method.invoke(resource, memorySize.intValue());
+      } catch (NoSuchMethodException e) {
+        LOG.error("There is no '" + DEPRECATED_SET_MEMORY_SIZE_METHOD +
+            "(int)' API in this version of YARN", e);
+        throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
+      } catch (IllegalAccessException | InvocationTargetException e) {
+        LOG.error("Failed to invoke '" + DEPRECATED_SET_MEMORY_SIZE_METHOD +
+            "' method", e);
+        throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
+      }
+    }
+  }
+
+  public static long getMemorySize(Resource resource) {
+    boolean useWithIntParameter = false;
+    long memory = 0;
+    // For hadoop 2.9.2 and above
+    try {
+      Method method = resource.getClass().getMethod(GET_MEMORY_SIZE_METHOD);
+      method.setAccessible(true);
+      memory = (long) method.invoke(resource);
+    } catch (NoSuchMethodException e) {
+      LOG.info("There is no '" + GET_MEMORY_SIZE_METHOD + "' API in" +
+          " this version of YARN");
+      useWithIntParameter = true;
+    } catch (IllegalAccessException | InvocationTargetException e) {
+      LOG.error("Failed to invoke '" + GET_MEMORY_SIZE_METHOD +
+          "' method", e);
+      throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
+    }
+    // For hadoop 2.7.3
+    if (useWithIntParameter) {
+      try {
+        LOG.info("Trying to use '" + DEPRECATED_GET_MEMORY_SIZE_METHOD +
+            "' API for this version of YARN");
+        Method method = resource.getClass().getMethod(
+            DEPRECATED_GET_MEMORY_SIZE_METHOD);
+        method.setAccessible(true);
+        memory = ((Integer) method.invoke(resource)).longValue();
+      } catch (NoSuchMethodException e) {
+        LOG.error("There is no '" + DEPRECATED_GET_MEMORY_SIZE_METHOD +
+            "' API in this version of YARN", e);
+        throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
+      } catch (IllegalAccessException | InvocationTargetException e) {
+        LOG.error("Failed to invoke '" + DEPRECATED_GET_MEMORY_SIZE_METHOD +
+            "' method", e);
+        throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
+      }
+    }
+    return memory;
+  }
+
+  /**
+   * Hadoop 2.9.2 and lower don't support resources other than cpu and memory,
+   * so use reflection to get GPU or other resource types for compatibility
+   * with hadoop 2.9.2.
+   */
+  public static long getResourceValue(Resource resource, String resourceName) {
+    long resourceValue = 0;
+    try {
+      Method method = resource.getClass().getMethod(GET_RESOURCE_VALUE_METHOD,
+          String.class);
+      Object value = method.invoke(resource, resourceName);
+      resourceValue = (long) value;
+    } catch (NoSuchMethodException e) {
+      LOG.info("There is no '" + GET_RESOURCE_VALUE_METHOD + "' API in this" +
+          " version of YARN");
+    } catch (InvocationTargetException e) {
+      if (e.getTargetException().getClass().getName().equals(
+          "org.apache.hadoop.yarn.exceptions.ResourceNotFoundException")) {
+        LOG.info("Not found resource " + resourceName);
+      } else {
+        LOG.info("Failed to invoke '" + GET_RESOURCE_VALUE_METHOD + "'" +
+            " method to get resource " + resourceName);
+        throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
+      }
+    } catch (IllegalAccessException | ClassCastException e) {
+      LOG.error("Failed to invoke '" + GET_RESOURCE_VALUE_METHOD +
+          "' method to get resource " + resourceName, e);
+      throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
+    }
+    return resourceValue;
+  }
+
+  /**
+   * Hadoop 2.9.2 and lower don't support resources other than cpu and memory,
+   * so use reflection to add GPU or other resource types for compatibility
+   * with hadoop 2.9.2.
+   */
+  public static void configureResourceType(String resourceName) {
+    Class resourceTypeInfo;
+    try {
+      resourceTypeInfo = Class.forName(
+          "org.apache.hadoop.yarn.api.records.ResourceTypeInfo");
+      Class resourceUtils = Class.forName(
+          "org.apache.hadoop.yarn.util.resource.ResourceUtils");
+      Method method = resourceUtils.getMethod(GET_RESOURCE_TYPE_METHOD);
+      Object resTypes = method.invoke(null);
+
+      Method resourceTypeInstance = resourceTypeInfo.getMethod("newInstance",
+          String.class, String.class);
+      Object resourceType = resourceTypeInstance.invoke(null, resourceName, "");
+      ((ArrayList) resTypes).add(resourceType);
+
+      Method reInitialMethod = resourceUtils.getMethod(
+          REINITIALIZE_RESOURCES_METHOD, List.class);
+      reInitialMethod.invoke(null, resTypes);
+
+    } catch (ClassNotFoundException e) {
+      LOG.info("There is no specified class API in this" +
+          " version of YARN");
+      LOG.info(e.getMessage());
+      throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
+    } catch (NoSuchMethodException nsme) {
+      LOG.info("There is no '" + GET_RESOURCE_TYPE_METHOD + "' API in this" +
+          " version of YARN");
+    } catch (IllegalAccessException | InvocationTargetException e) {
+      LOG.info("Failed to invoke 'configureResourceType' method ", e);
+      throw new SubmarineRuntimeException(e.getMessage(), e.getCause());
+    }
+  }
+
+  private static String getUnits(String resourceValue) {
+    return parseResourceValue(resourceValue)[0];
+  }
+
+  /**
+   * Extract unit and actual value from resource value.
+   * @param resourceValue Value of the resource
+   * @return Array containing unit and value. [0]=unit, [1]=value
+   * @throws IllegalArgumentException if units contain non alpha characters
+   */
+  private static String[] parseResourceValue(String resourceValue) {
+    String[] resource = new String[2];
+    int i = 0;
+    for (; i < resourceValue.length(); i++) {
+      if (Character.isAlphabetic(resourceValue.charAt(i))) {
+        break;
+      }
+    }
+    String units = resourceValue.substring(i);
+
+    if (StringUtils.isAlpha(units) || units.equals("")) {
+      resource[0] = units;
+      resource[1] = resourceValue.substring(0, i);
+      return resource;
+    } else {
+      throw new IllegalArgumentException("Units '" + units + "'"
+          + " contains non alphabet characters, which is not allowed.");
+    }
+  }
+
+}
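To make the reflection-based API above concrete, here is a minimal usage sketch. It is not part of the commit, the class name `ResourceStringDemo` is invented for illustration, and it assumes a hadoop 3.1+ classpath on which the GPU resource type can be registered:

```java
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.submarine.common.resource.ResourceUtils;

public class ResourceStringDemo {
  public static void main(String[] args) {
    // Register the GPU resource type first, as the new tests do; on a
    // hadoop 2.x classpath this throws SubmarineRuntimeException instead.
    ResourceUtils.configureResourceType(ResourceUtils.GPU_URI);

    // "memory" is normalized to "memory-mb": 4G -> Gi -> 4096 Mi.
    // "gpu" is mapped to "yarn.io/gpu" and set via reflection.
    Resource r = ResourceUtils.createResourceFromString(
        "memory=4G,vcores=2,gpu=2");

    System.out.println(ResourceUtils.getMemorySize(r));   // 4096
    System.out.println(r.getVirtualCores());              // 2
    System.out.println(
        ResourceUtils.getResourceValue(r, ResourceUtils.GPU_URI)); // 2
  }
}
```

On a hadoop 2.9.2 classpath, `configureResourceType` throws `SubmarineRuntimeException`, which is exactly why the new tests below wrap this call in a try/catch and skip the GPU case.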
@@ -0,0 +1,164 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.submarine.common.resource;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Almost the same logic as UnitsConversionUtil[YARN-4081]. If the dependencies
+ * are upgraded to hadoop 3.*, this class can be replaced.
+ */
+public final class UnitsConversionUtil {
+
+  private UnitsConversionUtil() {}
+
+  /**
+   * Helper class for encapsulating conversion values.
+   */
+  public static class Converter {
+    private long numerator;
+    private long denominator;
+
+    Converter(long n, long d) {
+      this.numerator = n;
+      this.denominator = d;
+    }
+  }
+
+  private static final String[] UNITS = {"p", "n", "u", "m", "", "k", "M", "G",
+      "T", "P", "Ki", "Mi", "Gi", "Ti", "Pi"};
+  private static final List<String> SORTED_UNITS = Arrays.asList(UNITS);
+  public static final Set<String> KNOWN_UNITS = createKnownUnitsSet();
+  private static final Converter PICO =
+      new Converter(1L, 1000L * 1000L * 1000L * 1000L);
+  private static final Converter NANO =
+      new Converter(1L, 1000L * 1000L * 1000L);
+  private static final Converter MICRO = new Converter(1L, 1000L * 1000L);
+  private static final Converter MILLI = new Converter(1L, 1000L);
+  private static final Converter BASE = new Converter(1L, 1L);
+  private static final Converter KILO = new Converter(1000L, 1L);
+  private static final Converter MEGA = new Converter(1000L * 1000L, 1L);
+  private static final Converter GIGA =
+      new Converter(1000L * 1000L * 1000L, 1L);
+  private static final Converter TERA =
+      new Converter(1000L * 1000L * 1000L * 1000L, 1L);
+  private static final Converter PETA =
+      new Converter(1000L * 1000L * 1000L * 1000L * 1000L, 1L);
+
+  private static final Converter KILO_BINARY = new Converter(1024L, 1L);
+  private static final Converter MEGA_BINARY = new Converter(1024L * 1024L, 1L);
+  private static final Converter GIGA_BINARY =
+      new Converter(1024L * 1024L * 1024L, 1L);
+  private static final Converter TERA_BINARY =
+      new Converter(1024L * 1024L * 1024L * 1024L, 1L);
+  private static final Converter PETA_BINARY =
+      new Converter(1024L * 1024L * 1024L * 1024L * 1024L, 1L);
+
+  private static Set<String> createKnownUnitsSet() {
+    Set<String> ret = new HashSet<>();
+    ret.addAll(Arrays.asList(UNITS));
+    return ret;
+  }
+
+  private static Converter getConverter(String unit) {
+    switch (unit) {
+    case "p":
+      return PICO;
+    case "n":
+      return NANO;
+    case "u":
+      return MICRO;
+    case "m":
+      return MILLI;
+    case "":
+      return BASE;
+    case "k":
+      return KILO;
+    case "M":
+      return MEGA;
+    case "G":
+      return GIGA;
+    case "T":
+      return TERA;
+    case "P":
+      return PETA;
+    case "Ki":
+      return KILO_BINARY;
+    case "Mi":
+      return MEGA_BINARY;
+    case "Gi":
+      return GIGA_BINARY;
+    case "Ti":
+      return TERA_BINARY;
+    case "Pi":
+      return PETA_BINARY;
+    default:
+      throw new IllegalArgumentException(
+          "Unknown unit '" + unit + "'. Known units are " + KNOWN_UNITS);
+    }
+  }
+
+  /**
+   * Converts a value from one unit to another. Supported units can be obtained
+   * by inspecting the KNOWN_UNITS set.
+   *
+   * @param fromUnit the unit of the from value
+   * @param toUnit the target unit
+   * @param fromValue the value you wish to convert
+   * @return the value in toUnit
+   */
+  public static long convert(String fromUnit, String toUnit, long fromValue) {
+    if (toUnit == null || fromUnit == null) {
+      throw new IllegalArgumentException("One or more arguments are null");
+    }
+
+    if (fromUnit.equals(toUnit)) {
+      return fromValue;
+    }
+    Converter fc = getConverter(fromUnit);
+    Converter tc = getConverter(toUnit);
+    long numerator = fc.numerator * tc.denominator;
+    long denominator = fc.denominator * tc.numerator;
+    long numeratorMultiplierLimit = Long.MAX_VALUE / numerator;
+    if (numerator < denominator) {
+      if (numeratorMultiplierLimit < fromValue) {
+        String overflowMsg =
+            "Converting " + fromValue + " from '" + fromUnit + "' to '" + toUnit
+                + "' will result in an overflow of Long";
+        throw new IllegalArgumentException(overflowMsg);
+      }
+      return (fromValue * numerator) / denominator;
+    }
+    if (numeratorMultiplierLimit > fromValue) {
+      return (numerator * fromValue) / denominator;
+    }
+    long tmp = numerator / denominator;
+    if ((Long.MAX_VALUE / tmp) < fromValue) {
+      String overflowMsg =
+          "Converting " + fromValue + " from '" + fromUnit + "' to '" + toUnit
+              + "' will result in an overflow of Long";
+      throw new IllegalArgumentException(overflowMsg);
+    }
+    return fromValue * tmp;
+  }
+
+}
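A quick sanity check of the converters defined above (again a hypothetical sketch, with `UnitsDemo` invented for illustration): bare letters are decimal multipliers while the `*i` suffixes are binary, so converting memory from `Gi` to `Mi` multiplies by 1024:

```java
import org.apache.hadoop.yarn.submarine.common.resource.UnitsConversionUtil;

public class UnitsDemo {
  public static void main(String[] args) {
    // Binary units: 4 Gi = 4 * 1024 Mi.
    System.out.println(UnitsConversionUtil.convert("Gi", "Mi", 4)); // 4096
    // Decimal units: 4 G = 4000 M.
    System.out.println(UnitsConversionUtil.convert("G", "M", 4));   // 4000
    // Identical units are returned unchanged.
    System.out.println(UnitsConversionUtil.convert("Mi", "Mi", 7)); // 7
  }
}
```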
@@ -0,0 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * This package contains resource utility classes.
+ */
+package org.apache.hadoop.yarn.submarine.common.resource;
@@ -164,6 +164,22 @@ See below screenshot:
 
 ![alt text](./images/tensorboard-service.png "Tensorboard service")
 
+If there is no hadoop client installed, we can also use the java command and the uber jar, hadoop-submarine-all-*.jar, to submit the job.
+
+```
+java -cp /path-to/hadoop-conf:/path-to/hadoop-submarine-all-*.jar \
+ org.apache.hadoop.yarn.submarine.client.cli.Cli job run \
+ --env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \
+ --env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 --name tf-job-001 \
+ --docker_image <your-docker-image> \
+ --input_path hdfs://default/dataset/cifar-10-data \
+ --checkpoint_path hdfs://default/tmp/cifar-10-jobdir \
+ --worker_resources memory=4G,vcores=2,gpu=2 \
+ --worker_launch_cmd "python ... (Your training application cmd)" \
+ --tensorboard # this will launch a companion tensorboard container for monitoring
+```
+
+
 ### Launch Distributed Tensorflow Application:
 
 #### Commandline
@@ -181,6 +197,20 @@ yarn jar hadoop-yarn-applications-submarine-<version>.jar job run \
   --num_ps 2 \
   --ps_resources memory=4G,vcores=2,gpu=0 --ps_launch_cmd "cmd for ps" \
 ```
+Or
+```
+java -cp /path-to/hadoop-conf:/path-to/hadoop-submarine-all-*.jar \
+ org.apache.hadoop.yarn.submarine.client.cli.Cli job run \
+ --name tf-job-001 --docker_image <your docker image> \
+ --input_path hdfs://default/dataset/cifar-10-data \
+ --checkpoint_path hdfs://default/tmp/cifar-10-jobdir \
+ --env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \
+ --env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 \
+ --num_workers 2 \
+ --worker_resources memory=8G,vcores=2,gpu=1 --worker_launch_cmd "cmd for worker ..." \
+ --num_ps 2 \
+ --ps_resources memory=4G,vcores=2,gpu=0 --ps_launch_cmd "cmd for ps" \
+```
 
 #### Notes:
 
@@ -197,7 +227,11 @@ yarn jar hadoop-yarn-applications-submarine-<version>.jar job run \
 ```
 yarn jar hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar job show --name tf-job-001
 ```
-
+Or
+```
+java -cp /path-to/hadoop-conf:/path-to/hadoop-submarine-all-*.jar \
+ org.apache.hadoop.yarn.submarine.client.cli.Cli job show --name tf-job-001
+```
 Output looks like:
 ```
 Job Meta Info:
@@ -222,6 +256,17 @@ yarn jar /tmp/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar \
   --env DOCKER_HADOOP_HDFS_HOME=/hadoop-current \
   --num_workers 0 --tensorboard
 ```
+Or
+```
+# Cleanup previous service if needed
+yarn app -destroy tensorboard-service; \
+java -cp /path-to/hadoop-conf:/path-to/hadoop-submarine-all-*.jar \
+ org.apache.hadoop.yarn.submarine.client.cli.Cli job run \
+ --name tensorboard-service --verbose --docker_image wtan/tf-1.8.0-cpu:0.0.3 \
+ --env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \
+ --env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 \
+ --num_workers 0 --tensorboard
+```
 
 You can view multiple job training history like from the `Tensorboard` link:
 
@@ -242,4 +287,18 @@ If you want to build the Submarine project by yourself, you can follow the steps
 
 - Run 'mvn install -DskipTests' from Hadoop source top level once.
 
 - Navigate to hadoop-submarine folder and run 'mvn clean package'.
+
+- By default, hadoop-submarine is built based on hadoop 3.1.2 dependencies,
+  and both yarn service runtime and tony runtime are built.
+  You can add the "-Phadoop-3.2" parameter to build against hadoop 3.2.0
+  dependencies instead.
+
+- Hadoop-submarine can support hadoop 2.9.2 and hadoop 2.7.4 as well.
+  You can add "-Phadoop-2.9" to build submarine based on hadoop 2.9.2.
+  For example:
+```
+mvn clean package -Phadoop-2.9
+```
+  As yarn service is based on hadoop 3.*, only tony runtime is built
+  in this case.
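For instance, building against hadoop 3.2.0 instead of the default should just be a matter of switching the profile; assuming the version declared in the new pom, `mvn clean package -Phadoop-3.2` would leave the fat jar at `hadoop-submarine-all/target/hadoop-submarine-all-0.2.0-SNAPSHOT-hadoop-3.2.jar`.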
@@ -16,15 +16,14 @@
 
 package org.apache.hadoop.yarn.submarine.client.cli.runjob;
 
-import org.apache.hadoop.yarn.api.records.ResourceInformation;
-import org.apache.hadoop.yarn.api.records.ResourceTypeInfo;
 import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.hadoop.yarn.submarine.client.cli.YamlConfigTestUtils;
 import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.RunJobParameters;
 import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.TensorFlowRunJobParameters;
 import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.YamlParseException;
 import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
-import org.apache.hadoop.yarn.util.resource.ResourceUtils;
+import org.apache.hadoop.yarn.submarine.common.exception.SubmarineRuntimeException;
+import org.apache.hadoop.yarn.submarine.common.resource.ResourceUtils;
 import org.junit.After;
 import org.junit.Assert;
 import org.junit.Before;

@@ -32,10 +31,10 @@ import org.junit.BeforeClass;
 import org.junit.Rule;
 import org.junit.Test;
 import org.junit.rules.ExpectedException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.File;
-import java.util.ArrayList;
-import java.util.List;
 
 import static org.apache.hadoop.yarn.submarine.client.cli.runjob.TestRunJobCliParsingCommon.getMockClientContext;
 import static org.junit.Assert.assertFalse;

@@ -49,6 +48,8 @@ public class TestRunJobCliParsingCommonYaml {
   private static final String DIR_NAME = "runjob-common-yaml";
   private static final String TF_DIR = "runjob-pytorch-yaml";
   private File yamlConfig;
+  private static Logger LOG = LoggerFactory.getLogger(
+      TestRunJobCliParsingCommonYaml.class);
 
   @Before
   public void before() {

@@ -62,10 +63,12 @@ public class TestRunJobCliParsingCommonYaml {
 
   @BeforeClass
   public static void configureResourceTypes() {
-    List<ResourceTypeInfo> resTypes = new ArrayList<>(
-        ResourceUtils.getResourcesTypeInfo());
-    resTypes.add(ResourceTypeInfo.newInstance(ResourceInformation.GPU_URI, ""));
-    ResourceUtils.reinitializeResources(resTypes);
+    try {
+      ResourceUtils.configureResourceType(ResourceUtils.GPU_URI);
+    } catch (SubmarineRuntimeException e) {
+      LOG.info("The hadoop dependency doesn't support gpu resource, " +
+          "so just skip this test case.");
+    }
   }
 
   @Rule
@@ -24,29 +24,28 @@ import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 
 import java.io.File;
-import java.util.ArrayList;
-import java.util.List;
 
-import org.apache.hadoop.yarn.api.records.ResourceInformation;
-import org.apache.hadoop.yarn.api.records.ResourceTypeInfo;
-import org.apache.hadoop.yarn.resourcetypes.ResourceTypesTestHelper;
+import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.hadoop.yarn.submarine.client.cli.YamlConfigTestUtils;
 import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.PyTorchRunJobParameters;
 import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.RunJobParameters;
 import org.apache.hadoop.yarn.submarine.client.cli.param.yaml.YamlParseException;
 import org.apache.hadoop.yarn.submarine.client.cli.runjob.RunJobCli;
 import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
-import org.apache.hadoop.yarn.util.resource.ResourceUtils;
+import org.apache.hadoop.yarn.submarine.common.exception.SubmarineRuntimeException;
+import org.apache.hadoop.yarn.submarine.common.resource.ResourceUtils;
+import org.apache.hadoop.yarn.util.resource.Resources;
 import org.junit.After;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Rule;
 import org.junit.Test;
 import org.junit.rules.ExpectedException;
 
 import com.google.common.collect.ImmutableList;
-import com.google.common.collect.ImmutableMap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Test class that verifies the correctness of PyTorch

@@ -56,6 +55,8 @@ public class TestRunJobCliParsingPyTorchYaml {
   private static final String OVERRIDDEN_PREFIX = "overridden_";
   private static final String DIR_NAME = "runjob-pytorch-yaml";
   private File yamlConfig;
+  private static Logger LOG = LoggerFactory.getLogger(
+      TestRunJobCliParsingPyTorchYaml.class);
 
   @Before
   public void before() {

@@ -67,14 +68,6 @@ public class TestRunJobCliParsingPyTorchYaml {
     YamlConfigTestUtils.deleteFile(yamlConfig);
   }
 
-  @BeforeClass
-  public static void configureResourceTypes() {
-    List<ResourceTypeInfo> resTypes = new ArrayList<>(
-        ResourceUtils.getResourcesTypeInfo());
-    resTypes.add(ResourceTypeInfo.newInstance(ResourceInformation.GPU_URI, ""));
-    ResourceUtils.reinitializeResources(resTypes);
-  }
-
   @Rule
   public ExpectedException exception = ExpectedException.none();
 

@@ -105,23 +98,38 @@ public class TestRunJobCliParsingPyTorchYaml {
     }
   }
 
-  private void verifyWorkerValues(RunJobParameters jobRunParameters,
-      String prefix) {
+  private PyTorchRunJobParameters verifyWorkerCommonValues(RunJobParameters
+      jobRunParameters, String prefix) {
     assertTrue(RunJobParameters.class + " must be an instance of " +
             PyTorchRunJobParameters.class,
         jobRunParameters instanceof PyTorchRunJobParameters);
-    PyTorchRunJobParameters tensorFlowParams =
+    PyTorchRunJobParameters pyTorchParams =
         (PyTorchRunJobParameters) jobRunParameters;
 
-    assertEquals(3, tensorFlowParams.getNumWorkers());
+    assertEquals(3, pyTorchParams.getNumWorkers());
     assertEquals(prefix + "testLaunchCmdWorker",
-        tensorFlowParams.getWorkerLaunchCmd());
+        pyTorchParams.getWorkerLaunchCmd());
     assertEquals(prefix + "testDockerImageWorker",
-        tensorFlowParams.getWorkerDockerImage());
-    assertEquals(ResourceTypesTestHelper.newResource(20480L, 32,
-        ImmutableMap.<String, String> builder()
-            .put(ResourceInformation.GPU_URI, "2").build()),
-        tensorFlowParams.getWorkerResource());
+        pyTorchParams.getWorkerDockerImage());
+    return pyTorchParams;
+  }
+
+  private void verifyWorkerValues(RunJobParameters jobRunParameters,
+      String prefix) {
+    PyTorchRunJobParameters pyTorchParams = verifyWorkerCommonValues
+        (jobRunParameters, prefix);
+    assertEquals(Resources.createResource(20480, 32),
+        pyTorchParams.getWorkerResource());
+  }
+
+  private void verifyWorkerValuesWithGpu(RunJobParameters jobRunParameters,
+      String prefix) {
+    PyTorchRunJobParameters pyTorchParams = verifyWorkerCommonValues
+        (jobRunParameters, prefix);
+    Resource workResource = Resources.createResource(20480, 32);
+    ResourceUtils.setResource(workResource, ResourceUtils.GPU_URI, 2);
+    assertEquals(workResource, pyTorchParams.getWorkerResource());
+  }
 
   private void verifySecurityValues(RunJobParameters jobRunParameters) {

@@ -146,6 +154,30 @@ public class TestRunJobCliParsingPyTorchYaml {
     verifySecurityValues(jobRunParameters);
   }
 
+  @Test
+  public void testValidGpuYamlParsing() throws Exception {
+    try {
+      ResourceUtils.configureResourceType(ResourceUtils.GPU_URI);
+    } catch (SubmarineRuntimeException e) {
+      LOG.info("The hadoop dependency doesn't support gpu resource, " +
+          "so just skip this test case.");
+      return;
+    }
+
+    RunJobCli runJobCli = new RunJobCli(getMockClientContext());
+    Assert.assertFalse(SubmarineLogs.isVerbose());
+
+    yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
+        DIR_NAME + "/valid-gpu-config.yaml");
+    runJobCli.run(
+        new String[] {"-f", yamlConfig.getAbsolutePath(), "--verbose"});
+
+    RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
+    verifyBasicConfigValues(jobRunParameters);
+    verifyWorkerValuesWithGpu(jobRunParameters, "");
+    verifySecurityValues(jobRunParameters);
+  }
+
   @Test
   public void testRoleOverrides() throws Exception {
     RunJobCli runJobCli = new RunJobCli(getMockClientContext());
@@ -18,26 +18,25 @@
 package org.apache.hadoop.yarn.submarine.client.cli.runjob.tensorflow;
 
 import com.google.common.collect.ImmutableList;
-import com.google.common.collect.ImmutableMap;
-import org.apache.hadoop.yarn.api.records.ResourceInformation;
-import org.apache.hadoop.yarn.api.records.ResourceTypeInfo;
-import org.apache.hadoop.yarn.resourcetypes.ResourceTypesTestHelper;
+import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.hadoop.yarn.submarine.client.cli.YamlConfigTestUtils;
 import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.RunJobParameters;
 import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.TensorFlowRunJobParameters;
 import org.apache.hadoop.yarn.submarine.client.cli.runjob.RunJobCli;
 import org.apache.hadoop.yarn.submarine.common.conf.SubmarineLogs;
-import org.apache.hadoop.yarn.util.resource.ResourceUtils;
+import org.apache.hadoop.yarn.submarine.common.exception.SubmarineRuntimeException;
+import org.apache.hadoop.yarn.submarine.common.resource.ResourceUtils;
+import org.apache.hadoop.yarn.util.resource.Resources;
 import org.junit.After;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Rule;
 import org.junit.Test;
 import org.junit.rules.ExpectedException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.File;
-import java.util.ArrayList;
-import java.util.List;
 
 import static org.apache.hadoop.yarn.submarine.client.cli.runjob.TestRunJobCliParsingCommon.getMockClientContext;

@@ -53,6 +52,8 @@ public class TestRunJobCliParsingTensorFlowYaml {
   private static final String OVERRIDDEN_PREFIX = "overridden_";
   private static final String DIR_NAME = "runjob-tensorflow-yaml";
   private File yamlConfig;
+  private static Logger LOG = LoggerFactory.getLogger(
+      TestRunJobCliParsingTensorFlowYaml.class);
 
   @Before
   public void before() {

@@ -64,14 +65,6 @@ public class TestRunJobCliParsingTensorFlowYaml {
     YamlConfigTestUtils.deleteFile(yamlConfig);
   }
 
-  @BeforeClass
-  public static void configureResourceTypes() {
-    List<ResourceTypeInfo> resTypes = new ArrayList<>(
-        ResourceUtils.getResourcesTypeInfo());
-    resTypes.add(ResourceTypeInfo.newInstance(ResourceInformation.GPU_URI, ""));
-    ResourceUtils.reinitializeResources(resTypes);
-  }
-
   @Rule
   public ExpectedException exception = ExpectedException.none();
 

@@ -114,14 +107,12 @@ public class TestRunJobCliParsingTensorFlowYaml {
     assertEquals(prefix + "testLaunchCmdPs", tensorFlowParams.getPSLaunchCmd());
     assertEquals(prefix + "testDockerImagePs",
         tensorFlowParams.getPsDockerImage());
-    assertEquals(ResourceTypesTestHelper.newResource(20500L, 34,
-        ImmutableMap.<String, String> builder()
-            .put(ResourceInformation.GPU_URI, "4").build()),
+    assertEquals(Resources.createResource(20500, 34),
         tensorFlowParams.getPsResource());
   }
 
-  private void verifyWorkerValues(RunJobParameters jobRunParameters,
-      String prefix) {
+  private TensorFlowRunJobParameters verifyWorkerCommonValues(
+      RunJobParameters jobRunParameters, String prefix) {
     assertTrue(RunJobParameters.class + " must be an instance of " +
         TensorFlowRunJobParameters.class,
         jobRunParameters instanceof TensorFlowRunJobParameters);

@@ -133,12 +124,26 @@ public class TestRunJobCliParsingTensorFlowYaml {
         tensorFlowParams.getWorkerLaunchCmd());
     assertEquals(prefix + "testDockerImageWorker",
         tensorFlowParams.getWorkerDockerImage());
-    assertEquals(ResourceTypesTestHelper.newResource(20480L, 32,
-        ImmutableMap.<String, String> builder()
-            .put(ResourceInformation.GPU_URI, "2").build()),
-        tensorFlowParams.getWorkerResource());
+    return tensorFlowParams;
+  }
+
+  private void verifyWorkerValues(RunJobParameters jobRunParameters,
+      String prefix) {
+    TensorFlowRunJobParameters tensorFlowParams = verifyWorkerCommonValues
+        (jobRunParameters, prefix);
+    assertEquals(Resources.createResource(20480, 32),
+        tensorFlowParams.getWorkerResource());
+  }
+
+  private void verifyWorkerValuesWithGpu(RunJobParameters jobRunParameters,
+      String prefix) {
+    TensorFlowRunJobParameters tensorFlowParams = verifyWorkerCommonValues
+        (jobRunParameters, prefix);
+    Resource workResource = Resources.createResource(20480, 32);
+    ResourceUtils.setResource(workResource, ResourceUtils.GPU_URI, 2);
+    assertEquals(workResource, tensorFlowParams.getWorkerResource());
+  }
 
   private void verifySecurityValues(RunJobParameters jobRunParameters) {
     assertEquals("keytabPath", jobRunParameters.getKeytab());
     assertEquals("testPrincipal", jobRunParameters.getPrincipal());

@@ -155,9 +160,7 @@ public class TestRunJobCliParsingTensorFlowYaml {
     assertTrue(tensorFlowParams.isTensorboardEnabled());
     assertEquals("tensorboardDockerImage",
        tensorFlowParams.getTensorboardDockerImage());
-    assertEquals(ResourceTypesTestHelper.newResource(21000L, 37,
-        ImmutableMap.<String, String> builder()
-            .put(ResourceInformation.GPU_URI, "3").build()),
+    assertEquals(Resources.createResource(21000, 37),
        tensorFlowParams.getTensorboardResource());
   }
 

@@ -179,6 +182,32 @@ public class TestRunJobCliParsingTensorFlowYaml {
     verifyTensorboardValues(jobRunParameters);
   }
 
+  @Test
+  public void testValidGpuYamlParsing() throws Exception {
+    try {
+      ResourceUtils.configureResourceType(ResourceUtils.GPU_URI);
+    } catch (SubmarineRuntimeException e) {
+      LOG.info("The hadoop dependency doesn't support gpu resource, " +
+          "so just skip this test case.");
+      return;
+    }
+
+    RunJobCli runJobCli = new RunJobCli(getMockClientContext());
+    Assert.assertFalse(SubmarineLogs.isVerbose());
+
+    yamlConfig = YamlConfigTestUtils.createTempFileWithContents(
+        DIR_NAME + "/valid-gpu-config.yaml");
+    runJobCli.run(
+        new String[] {"-f", yamlConfig.getAbsolutePath(), "--verbose"});
+
+    RunJobParameters jobRunParameters = runJobCli.getRunJobParameters();
+    verifyBasicConfigValues(jobRunParameters);
+    verifyPsValues(jobRunParameters, "");
+    verifyWorkerValuesWithGpu(jobRunParameters, "");
+    verifySecurityValues(jobRunParameters);
+    verifyTensorboardValues(jobRunParameters);
+  }
+
   @Test
   public void testRoleOverrides() throws Exception {
     RunJobCli runJobCli = new RunJobCli(getMockClientContext());

@@ -240,9 +269,7 @@ public class TestRunJobCliParsingTensorFlowYaml {
     assertTrue(tensorFlowParams.isTensorboardEnabled());
     assertNull("tensorboardDockerImage should be null!",
        tensorFlowParams.getTensorboardDockerImage());
-    assertEquals(ResourceTypesTestHelper.newResource(21000L, 37,
-        ImmutableMap.<String, String> builder()
-            .put(ResourceInformation.GPU_URI, "3").build()),
+    assertEquals(Resources.createResource(21000, 37),
        tensorFlowParams.getTensorboardResource());
   }
 
@@ -132,14 +132,14 @@ public class TestRunJobCliParsingTensorFlowYamlStandalone {
   private void assertWorkerValues(Role worker) {
     assertEquals("testLaunchCmdWorker", worker.getLaunchCmd());
     assertEquals("testDockerImageWorker", worker.getDockerImage());
-    assertEquals("memory=20480M,vcores=32,gpu=2", worker.getResources());
+    assertEquals("memory=20480M,vcores=32", worker.getResources());
     assertEquals(3, worker.getReplicas());
   }
 
   private void assertPsValues(Role ps) {
     assertEquals("testLaunchCmdPs", ps.getLaunchCmd());
     assertEquals("testDockerImagePs", ps.getDockerImage());
-    assertEquals("memory=20500M,vcores=34,gpu=4", ps.getResources());
+    assertEquals("memory=20500M,vcores=34", ps.getResources());
     assertEquals(4, ps.getReplicas());
   }
 

@@ -161,7 +161,7 @@ public class TestRunJobCliParsingTensorFlowYamlStandalone {
     TensorBoard tensorBoard = yamlConfigFile.getTensorBoard();
     assertNotNull("Tensorboard should not be null!", tensorBoard);
     assertEquals("tensorboardDockerImage", tensorBoard.getDockerImage());
-    assertEquals("memory=21000M,vcores=37,gpu=3", tensorBoard.getResources());
+    assertEquals("memory=21000M,vcores=37", tensorBoard.getResources());
   }
 
   @Before
@@ -20,15 +20,6 @@ package org.apache.hadoop.yarn.submarine.common;
 
 import org.apache.hadoop.yarn.submarine.common.fs.MockRemoteDirectoryManager;
 import org.apache.hadoop.yarn.submarine.common.fs.RemoteDirectoryManager;
-import org.apache.hadoop.yarn.client.api.YarnClient;
-import org.apache.hadoop.yarn.exceptions.YarnException;
-import org.apache.hadoop.yarn.util.resource.ResourceUtils;
-
-import java.io.IOException;
-
-import static org.junit.Assert.fail;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.when;
 
 public class MockClientContext extends ClientContext {
 

@@ -44,18 +35,4 @@ public class MockClientContext extends ClientContext {
       RemoteDirectoryManager remoteDirectoryMgr) {
     this.remoteDirectoryMgr = remoteDirectoryMgr;
   }
-
-  @Override
-  public synchronized YarnClient getOrCreateYarnClient() {
-    YarnClient client = mock(YarnClient.class);
-    try {
-      when(client.getResourceTypeInfo()).thenReturn(
-          ResourceUtils.getResourcesTypeInfo());
-    } catch (YarnException e) {
-      fail(e.getMessage());
-    } catch (IOException e) {
-      fail(e.getMessage());
-    }
-    return client;
-  }
 }
@ -43,12 +43,12 @@ scheduling:
|
|||
|
||||
roles:
|
||||
worker:
|
||||
resources: memory=20480M,vcores=32,gpu=2
|
||||
resources: memory=20480M,vcores=32
|
||||
replicas: 3
|
||||
launch_cmd: testLaunchCmdWorker
|
||||
docker_image: testDockerImageWorker
|
||||
ps:
|
||||
resources: memory=20500M,vcores=34,gpu=4
|
||||
resources: memory=20500M,vcores=34
|
||||
replicas: 4
|
||||
launch_cmd: testLaunchCmdPs
|
||||
docker_image: testDockerImagePs
|
||||
|
@ -59,5 +59,5 @@ security:
|
|||
distribute_keytab: true
|
||||
|
||||
tensorBoard:
|
||||
resources: memory=21000M,vcores=37,gpu=3
|
||||
resources: memory=21000M,vcores=37
|
||||
docker_image: tensorboardDockerImage
|
|
@ -23,11 +23,11 @@ scheduling:
|
|||
|
||||
roles:
|
||||
worker:
|
||||
resources: memory=20480M,vcores=32,gpu=2
|
||||
resources: memory=20480M,vcores=32
|
||||
replicas: 3
|
||||
launch_cmd: testLaunchCmdWorker
|
||||
ps:
|
||||
resources: memory=20500M,vcores=34,gpu=4
|
||||
resources: memory=20500M,vcores=34
|
||||
replicas: 4
|
||||
launch_cmd: testLaunchCmdPs
|
||||
|
||||
|
@ -37,5 +37,5 @@ security:
|
|||
distribute_keytab: true
|
||||
|
||||
tensorBoard:
|
||||
resources: memory=21000M,vcores=37,gpu=3
|
||||
resources: memory=21000M,vcores=37
|
||||
docker_image: tensorboardDockerImage
|
|
@ -42,12 +42,12 @@ scheduling:
|
|||
|
||||
roles:
|
||||
worker:
|
||||
resources: memory=20480M,vcores=32,gpu=2
|
||||
resources: memory=20480M,vcores=32
|
||||
replicas: 3
|
||||
launch_cmd: testLaunchCmdWorker
|
||||
docker_image: testDockerImageWorker
|
||||
ps:
|
||||
resources: memory=20500M,vcores=34,gpu=4
|
||||
resources: memory=20500M,vcores=34
|
||||
replicas: 4
|
||||
launch_cmd: testLaunchCmdPs
|
||||
docker_image: testDockerImagePs
|
||||
|
@ -58,5 +58,5 @@ security:
|
|||
distribute_keytab: true
|
||||
|
||||
tensorBoard:
|
||||
resources: memory=21000M,vcores=37,gpu=3
|
||||
resources: memory=21000M,vcores=37
|
||||
docker_image: tensorboardDockerImage
|
|
@ -40,10 +40,10 @@ configs:
|
|||
|
||||
roles:
|
||||
worker:
|
||||
resources: memory=20480M,vcores=32,gpu=2
|
||||
resources: memory=20480M,vcores=32
|
||||
replicas: 3
|
||||
launch_cmd: testLaunchCmdWorker
|
||||
ps:
|
||||
resources: memory=20500M,vcores=34,gpu=4
|
||||
resources: memory=20500M,vcores=34
|
||||
replicas: 4
|
||||
launch_cmd: testLaunchCmdPs
|
|
@ -43,11 +43,11 @@ scheduling:
|
|||
|
||||
roles:
|
||||
worker:
|
||||
resources: memory=20480M,vcores=32,gpu=2
|
||||
resources: memory=20480M,vcores=32
|
||||
replicas: 3
|
||||
launch_cmd: testLaunchCmdWorker
|
||||
ps:
|
||||
resources: memory=20500M,vcores=34,gpu=4
|
||||
resources: memory=20500M,vcores=34
|
||||
replicas: 4
|
||||
launch_cmd: testLaunchCmdPs
|
||||
|
||||
|
|
|
@ -42,11 +42,11 @@ scheduling:

roles:
  worker:
-    resources: memory=20480M,vcores=32,gpu=2
+    resources: memory=20480M,vcores=32
    replicas: 3
    launch_cmd: testLaunchCmdWorker
  ps:
-    resources: memory=20500M,vcores=34,gpu=4
+    resources: memory=20500M,vcores=34
    replicas: 4
    launch_cmd: testLaunchCmdPs

@ -56,5 +56,5 @@ security:
  distribute_keytab: true

tensorBoard:
-  resources: memory=21000M,vcores=37,gpu=3
+  resources: memory=21000M,vcores=37
  docker_image: tensorboardDockerImage
@ -42,11 +42,11 @@ scheduling:

roles:
  worker:
-    resources: memory=20480M,vcores=32,gpu=2
+    resources: memory=20480M,vcores=32
    replicas: 3
    launch_cmd: testLaunchCmdWorker
  ps:
-    resources: memory=20500M,vcores=34,gpu=4
+    resources: memory=20500M,vcores=34
    replicas: 4
    launch_cmd: testLaunchCmdPs

@ -56,5 +56,5 @@ security:
  distribute_keytab: true

tensorBoard:
-  resources: memory=21000M,vcores=37,gpu=3
+  resources: memory=21000M,vcores=37
  docker_image: tensorboardDockerImage
@ -40,7 +40,7 @@ scheduling:

roles:
  worker:
-    resources: memory=20480M,vcores=32,gpu=2
+    resources: memory=20480M,vcores=32
    replicas: 3
    launch_cmd: testLaunchCmdWorker
    docker_image: testDockerImageWorker
@ -43,7 +43,7 @@ scheduling:

roles:
  worker:
-    resources: memory=20480M,vcores=32,gpu=2
+    resources: memory=20480M,vcores=32
    replicas: 3
    launch_cmd: testLaunchCmdWorker
    docker_image: testDockerImageWorker
@ -43,7 +43,7 @@ scheduling:

roles:
  worker:
-    resources: memory=20480M,vcores=32,gpu=2
+    resources: memory=20480M,vcores=32
    replicas: 3
    launch_cmd: testLaunchCmdWorker
    docker_image: testDockerImageWorker
@ -43,7 +43,7 @@ scheduling:

roles:
  worker:
-    resources: memory=20480M,vcores=32,gpu=2
+    resources: memory=20480M,vcores=32
    replicas: 3
    launch_cmd: testLaunchCmdWorker
    docker_image: testDockerImageWorker
@ -43,7 +43,7 @@ scheduling:

roles:
  worker:
-    resources: memory=20480M,vcores=32,gpu=2
+    resources: memory=20480M,vcores=32
    replicas: 3
    launch_cmd: overridden_testLaunchCmdWorker
    docker_image: overridden_testDockerImageWorker
@ -43,7 +43,7 @@ scheduling:

roles:
  worker:
-    resources: memory=20480M,vcores=32,gpu=2
+    resources: memory=20480M,vcores=32
    replicas: 3
    launch_cmd: testLaunchCmdWorker
    docker_image: testDockerImageWorker
@ -0,0 +1,54 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

spec:
  name: testJobName
  job_type: testJobType
  framework: pytorch

configs:
  input_path: testInputPath
  checkpoint_path: testCheckpointPath
  saved_model_path: testSavedModelPath
  docker_image: testDockerImage
  wait_job_finish: true
  envs:
    env1: 'env1Value'
    env2: 'env2Value'
  localizations:
    - hdfs://remote-file1:/local-filename1:rw
    - nfs://remote-file2:/local-filename2:rw
  mounts:
    - /etc/passwd:/etc/passwd:rw
    - /etc/hosts:/etc/hosts:rw
  quicklinks:
    - Notebook_UI=https://master-0:7070
    - Notebook_UI2=https://master-0:7071

scheduling:
  queue: queue1

roles:
  worker:
    resources: memory=20480M,vcores=32,gpu=2
    replicas: 3
    launch_cmd: testLaunchCmdWorker
    docker_image: testDockerImageWorker

security:
  keytab: keytabPath
  principal: testPrincipal
  distribute_keytab: true
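This fixture mirrors the TensorFlow spec added later in this patch but sets framework: pytorch and defines only a worker role. As a rough illustration of reading such a file (assuming SnakeYAML on the classpath and a hypothetical file name; Submarine's own YAML parsing is not part of this diff):

  import java.io.InputStream;
  import java.nio.file.Files;
  import java.nio.file.Paths;
  import java.util.Map;
  import org.yaml.snakeyaml.Yaml;

  public final class SpecReader {
    public static void main(String[] args) throws Exception {
      // "valid-pytorch-config.yaml" is an illustrative placeholder name.
      try (InputStream in =
          Files.newInputStream(Paths.get("valid-pytorch-config.yaml"))) {
        Map<String, Object> root = new Yaml().load(in);
        @SuppressWarnings("unchecked")
        Map<String, Object> spec = (Map<String, Object>) root.get("spec");
        System.out.println("framework = " + spec.get("framework")); // pytorch
      }
    }
  }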
@ -40,12 +40,12 @@ scheduling:

roles:
  worker:
-    resources: memory=20480M,vcores=32,gpu=2
+    resources: memory=20480M,vcores=32
    replicas: 3
    launch_cmd: testLaunchCmdWorker
    docker_image: testDockerImageWorker
  ps:
-    resources: memory=20500M,vcores=34,gpu=4
+    resources: memory=20500M,vcores=34
    replicas: 4
    launch_cmd: testLaunchCmdPs
    docker_image: testDockerImagePs

@ -56,5 +56,5 @@ security:
  distribute_keytab: true

tensorBoard:
-  resources: memory=21000M,vcores=37,gpu=3
+  resources: memory=21000M,vcores=37
  docker_image: tensorboardDockerImage
@ -43,12 +43,12 @@ scheduling:

roles:
  worker:
-    resources: memory=20480M,vcores=32,gpu=2
+    resources: memory=20480M,vcores=32
    replicas: 3
    launch_cmd: testLaunchCmdWorker
    docker_image: testDockerImageWorker
  ps:
-    resources: memory=20500M,vcores=34,gpu=4
+    resources: memory=20500M,vcores=34
    replicas: 4
    launch_cmd: testLaunchCmdPs
    docker_image: testDockerImagePs

@ -58,5 +58,5 @@ security:
  distribute_keytab: true

tensorBoard:
-  resources: memory=21000M,vcores=37,gpu=3
+  resources: memory=21000M,vcores=37
  docker_image: tensorboardDockerImage
@ -43,12 +43,12 @@ scheduling:

roles:
  worker:
-    resources: memory=20480M,vcores=32,gpu=2
+    resources: memory=20480M,vcores=32
    replicas: 3
    launch_cmd: testLaunchCmdWorker
    docker_image: testDockerImageWorker
  ps:
-    resources: memory=20500M,vcores=34,gpu=4
+    resources: memory=20500M,vcores=34
    replicas: 4
    launch_cmd: testLaunchCmdPs
    docker_image: testDockerImagePs

@ -59,4 +59,4 @@ security:
  distribute_keytab: true

tensorBoard:
-  resources: memory=21000M,vcores=37,gpu=3
+  resources: memory=21000M,vcores=37
@ -43,7 +43,7 @@ scheduling:

roles:
  worker:
-    resources: memory=20480M,vcores=32,gpu=2
+    resources: memory=20480M,vcores=32
    replicas: 3
    launch_cmd: overridden_testLaunchCmdWorker
    docker_image: overridden_testDockerImageWorker

@ -58,7 +58,7 @@ roles:
      - /etc/hosts:/overridden_Worker

  ps:
-    resources: memory=20500M,vcores=34,gpu=4
+    resources: memory=20500M,vcores=34
    replicas: 4
    launch_cmd: overridden_testLaunchCmdPs
    docker_image: overridden_testDockerImagePs

@ -78,5 +78,5 @@ security:
  distribute_keytab: true

tensorBoard:
-  resources: memory=21000M,vcores=37,gpu=3
+  resources: memory=21000M,vcores=37
  docker_image: tensorboardDockerImage
@ -43,12 +43,12 @@ scheduling:

roles:
  worker:
-    resources: memory=20480M,vcores=32,gpu=2
+    resources: memory=20480M,vcores=32
    replicas: 3
    launch_cmd: testLaunchCmdWorker
    docker_image: testDockerImageWorker
  ps:
-    resources: memory=20500M,vcores=34,gpu=4
+    resources: memory=20500M,vcores=34
    replicas: 4
    launch_cmd: testLaunchCmdPs
    docker_image: testDockerImagePs

@ -59,5 +59,5 @@ security:
  distribute_keytab: true

tensorBoard:
-  resources: memory=21000M,vcores=37,gpu=3
+  resources: memory=21000M,vcores=37
  docker_image: tensorboardDockerImage
@ -0,0 +1,63 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

spec:
  name: testJobName
  job_type: testJobType
  framework: tensorflow

configs:
  input_path: testInputPath
  checkpoint_path: testCheckpointPath
  saved_model_path: testSavedModelPath
  docker_image: testDockerImage
  wait_job_finish: true
  envs:
    env1: 'env1Value'
    env2: 'env2Value'
  localizations:
    - hdfs://remote-file1:/local-filename1:rw
    - nfs://remote-file2:/local-filename2:rw
  mounts:
    - /etc/passwd:/etc/passwd:rw
    - /etc/hosts:/etc/hosts:rw
  quicklinks:
    - Notebook_UI=https://master-0:7070
    - Notebook_UI2=https://master-0:7071

scheduling:
  queue: queue1

roles:
  worker:
    resources: memory=20480M,vcores=32,gpu=2
    replicas: 3
    launch_cmd: testLaunchCmdWorker
    docker_image: testDockerImageWorker
  ps:
    resources: memory=20500M,vcores=34
    replicas: 4
    launch_cmd: testLaunchCmdPs
    docker_image: testDockerImagePs

security:
  keytab: keytabPath
  principal: testPrincipal
  distribute_keytab: true

tensorBoard:
  resources: memory=21000M,vcores=37
  docker_image: tensorboardDockerImage
@ -0,0 +1,131 @@
<?xml version="1.0"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
         http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <parent>
    <artifactId>hadoop-submarine</artifactId>
    <groupId>org.apache.hadoop</groupId>
    <version>0.2.0-SNAPSHOT</version>
  </parent>
  <artifactId>${project.artifactId}</artifactId>
  <version>${project.version}</version>
  <name>Hadoop Submarine Dist</name>
  <packaging>pom</packaging>

  <properties>
    <!-- Needed for generating FindBugs warnings using parent pom -->
    <yarn.basedir>${project.parent.parent.basedir}</yarn.basedir>
    <project.artifactId>hadoop-submarine-dist</project.artifactId>
    <project.version>0.2.0-SNAPSHOT</project.version>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-submarine-core</artifactId>
      <version>${project.version}</version>
    </dependency>
  </dependencies>

  <profiles>
    <profile>
      <id>hadoop-3.2</id>
      <dependencies>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-submarine-yarnservice-runtime</artifactId>
          <version>${project.version}</version>
        </dependency>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-submarine-tony-runtime</artifactId>
          <version>${project.version}</version>
        </dependency>
      </dependencies>
    </profile>

    <!-- Default profile-->
    <profile>
      <id>hadoop-3.1</id>
      <activation>
        <activeByDefault>true</activeByDefault>
      </activation>
      <dependencies>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-submarine-yarnservice-runtime</artifactId>
          <version>${project.version}</version>
        </dependency>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-submarine-tony-runtime</artifactId>
          <version>${project.version}</version>
        </dependency>
      </dependencies>
    </profile>

    <profile>
      <id>hadoop-2.9</id>
      <dependencies>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-submarine-tony-runtime</artifactId>
          <version>${project.version}</version>
        </dependency>
      </dependencies>
    </profile>

    <profile>
      <id>hadoop-2.7</id>
      <dependencies>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-submarine-tony-runtime</artifactId>
          <version>${project.version}</version>
        </dependency>
      </dependencies>
    </profile>
  </profiles>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-assembly-plugin</artifactId>
        <executions>
          <execution>
            <id>dist</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
            <configuration>
              <finalName>${project.artifactId}-${project.version}-${project.activeProfiles[0].id}</finalName>
              <appendAssemblyId>false</appendAssemblyId>
              <attach>false</attach>
              <descriptors>
                <descriptor>src/assembly/distribution.xml</descriptor>
              </descriptors>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>

</project>
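A note on the build wiring above: the maven-assembly-plugin execution binds the single goal to the package phase, so running mvn clean package on this module produces both a directory layout and a tar.gz (the formats declared in the descriptor below), named via finalName. Since ${project.activeProfiles[0].id} resolves to the id of the first active profile, a build such as mvn clean package -Phadoop-3.2 would, assuming the hadoop profile is the first one active, stamp the artifact as hadoop-submarine-dist-0.2.0-SNAPSHOT-hadoop-3.2.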
@ -0,0 +1,61 @@
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.0.0"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.0.0 http://maven.apache.org/xsd/assembly-2.0.0.xsd">
  <id>distribution</id>
  <formats>
    <format>dir</format>
    <format>tar.gz</format>
  </formats>

  <dependencySets>
    <dependencySet>
      <outputDirectory>/</outputDirectory>
      <includes>
        <include>org.apache.hadoop:hadoop-submarine-core</include>
      </includes>
    </dependencySet>
    <dependencySet>
      <outputDirectory>/</outputDirectory>
      <includes>
        <include>org.apache.hadoop:hadoop-submarine-tony-runtime</include>
      </includes>
    </dependencySet>
    <dependencySet>
      <outputDirectory>/</outputDirectory>
      <includes>
        <include>org.apache.hadoop:hadoop-submarine-yarnservice-runtime</include>
      </includes>
    </dependencySet>
  </dependencySets>

  <fileSets>
    <fileSet>
      <directory>../../</directory>
      <includes>
        <include>LICENSE*</include>
        <include>NOTICE*</include>
      </includes>
    </fileSet>
    <fileSet>
      <directory>../hadoop-submarine-all/target/</directory>
      <outputDirectory>/</outputDirectory>
      <includes>
        <include>hadoop-submarine-all-${project.version}-*.jar</include>
      </includes>
    </fileSet>
  </fileSets>
</assembly>
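Taken together with the pom above, this descriptor bundles the three Submarine jars (core and the two runtimes) at the root of the distribution, copies the top-level LICENSE and NOTICE files, and also picks up any hadoop-submarine-all-<version>-*.jar fat jar found in the sibling hadoop-submarine-all module's target/ directory; that is how the fat jar generated by this change ends up in the shipped tarball.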
@ -23,6 +23,7 @@
  <modelVersion>4.0.0</modelVersion>

  <artifactId>hadoop-submarine-tony-runtime</artifactId>
  <name>Hadoop Submarine Tony Runtime</name>
  <dependencies>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
@ -19,9 +19,8 @@ import com.linkedin.tony.util.Utils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.yarn.api.records.ResourceInformation;
-import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException;
import org.apache.hadoop.yarn.submarine.client.cli.param.runjob.TensorFlowRunJobParameters;
+import org.apache.hadoop.yarn.submarine.common.resource.ResourceUtils;

import java.util.ArrayList;
import java.util.Arrays;
@ -52,7 +51,7 @@ public final class TonyUtils {
      tonyConf.setLong(
          TonyConfigurationKeys.getResourceKey(Constants.PS_JOB_NAME,
              Constants.MEMORY),
-          parameters.getPsResource().getMemorySize());
+          ResourceUtils.getMemorySize(parameters.getPsResource()));
    }
    if (parameters.getWorkerResource() != null) {
      tonyConf.setInt(
@ -62,16 +61,12 @@ public final class TonyUtils {
      tonyConf.setLong(
          TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME,
              Constants.MEMORY),
-          parameters.getWorkerResource().getMemorySize());
-      try {
-        tonyConf.setLong(
-            TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME,
-                Constants.GPUS),
-            parameters.getWorkerResource()
-                .getResourceValue(ResourceInformation.GPU_URI));
-      } catch (ResourceNotFoundException rnfe) {
-        LOG.error("GPU resources not enabled.");
-      }
+          ResourceUtils.getMemorySize(parameters.getWorkerResource()));
+      tonyConf.setLong(
+          TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME,
+              Constants.GPUS),
+          ResourceUtils.getResourceValue(parameters.getWorkerResource(),
+              ResourceUtils.GPU_URI));
    }
    if (parameters.getQueue() != null) {
      tonyConf.set(
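The change above swaps direct calls to Resource#getMemorySize and Resource#getResourceValue(ResourceInformation.GPU_URI), which do not exist on the older Hadoop lines this project now targets, for Submarine's own ResourceUtils wrappers. A minimal sketch of the wrapper idea, assuming a reflection-based fallback (illustrative only; the actual ResourceUtils implementation is not shown in this diff):

  import java.lang.reflect.Method;
  import org.apache.hadoop.yarn.api.records.Resource;

  final class ResourceValueShim {
    // Hypothetical shim: reads a named resource (e.g. GPUs) when the running
    // Hadoop version supports typed resources, otherwise reports zero.
    static long getResourceValue(Resource resource, String resourceName) {
      try {
        Method m = Resource.class.getMethod("getResourceValue", String.class);
        return ((Number) m.invoke(resource, resourceName)).longValue();
      } catch (ReflectiveOperationException e) {
        return 0L; // typed resources (e.g. GPU) unavailable on this version
      }
    }
  }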
@ -22,7 +22,7 @@
    <groupId>org.apache.hadoop</groupId>
    <version>0.2.0-SNAPSHOT</version>
  </parent>
-  <artifactId>hadoop-submarine-score-yarnservice-runtime</artifactId>
+  <artifactId>hadoop-submarine-yarnservice-runtime</artifactId>
  <version>0.2.0-SNAPSHOT</version>
  <name>Hadoop Submarine YARN Service Runtime</name>
@ -108,12 +108,10 @@
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-services-api</artifactId>
-      <version>3.3.0-SNAPSHOT</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-services-core</artifactId>
-      <version>3.3.0-SNAPSHOT</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
@ -30,7 +30,6 @@ import org.apache.hadoop.yarn.submarine.runtimes.yarnservice.command.LaunchComma
import java.io.IOException;
import java.util.Objects;

-import static org.apache.hadoop.yarn.service.conf.YarnServiceConstants.CONTAINER_STATE_REPORT_AS_SERVICE_STATE;
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.addCommonEnvironments;
import static org.apache.hadoop.yarn.submarine.runtimes.yarnservice.tensorflow.TensorFlowCommons.getScriptFileName;
import static org.apache.hadoop.yarn.submarine.utils.DockerUtilities.getDockerArtifact;
@ -85,8 +84,11 @@ public abstract class AbstractComponent {
    if (role.equals(TensorFlowRole.PRIMARY_WORKER) ||
        role.equals(PyTorchRole.PRIMARY_WORKER)) {
      component.setNumberOfContainers(1L);
+      // If the dependencies are upgraded to hadoop 3.3.0,
+      // yarn.service.container-state-report-as-service-state can be replaced
+      // with CONTAINER_STATE_REPORT_AS_SERVICE_STATE
      component.getConfiguration().setProperty(
-          CONTAINER_STATE_REPORT_AS_SERVICE_STATE, "true");
+          "yarn.service.container-state-report-as-service-state", "true");
    } else {
      component.setNumberOfContainers(
          (long) parameters.getNumWorkers() - 1);
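Replacing the CONTAINER_STATE_REPORT_AS_SERVICE_STATE constant with the literal property name keeps this class compiling against the 3.2.0 dependency set, where YarnServiceConstants does not yet define the constant; the in-code comments mark the literal for cleanup once the build moves to Hadoop 3.3.0.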
@ -33,7 +33,6 @@ import java.io.IOException;
import java.util.Map;

import static junit.framework.TestCase.assertTrue;
-import static org.apache.hadoop.yarn.service.conf.YarnServiceConstants.CONTAINER_STATE_REPORT_AS_SERVICE_STATE;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.mockito.ArgumentMatchers.any;
@ -209,8 +208,11 @@ public class TestTensorFlowWorkerComponent {
    Component component = workerComponent.createComponent();

    assertEquals(1L, (long) component.getNumberOfContainers());
+    // If the dependencies are upgraded to hadoop 3.3.0,
+    // yarn.service.container-state-report-as-service-state can be replaced
+    // with CONTAINER_STATE_REPORT_AS_SERVICE_STATE
    verifyCommons(component, ImmutableMap.of(
-        CONTAINER_STATE_REPORT_AS_SERVICE_STATE, "true"));
+        "yarn.service.container-state-report-as-service-state", "true"));
  }

}
@ -19,12 +19,17 @@ package org.apache.hadoop.yarn.submarine.utils;
import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.LocalConfigurationProvider;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.service.api.records.ResourceInformation;
-import org.apache.hadoop.yarn.util.resource.CustomResourceTypesConfigurationProvider;
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
import org.junit.After;
import org.junit.Test;

+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
import java.util.Map;

import static org.junit.Assert.*;
@ -33,13 +38,48 @@ import static org.junit.Assert.*;
 * This class is to test {@link SubmarineResourceUtils}.
 */
public class TestSubmarineResourceUtils {
+  /**
+   * With the dependencies of hadoop 3.2.0, we need to create a
+   * CustomResourceTypesConfigurationProvider implementation of our own. If the
+   * dependencies are upgraded to hadoop 3.3.0, it can be replaced by
+   * org.apache.hadoop.yarn.util.resource.CustomResourceTypesConfigurationProvider.
+   */
+  private static class CustomResourceTypesConfigurationProvider
+      extends LocalConfigurationProvider {

+    @Override
+    public InputStream getConfigurationInputStream(Configuration bootstrapConf,
+        String name) throws YarnException, IOException {
+      if (YarnConfiguration.RESOURCE_TYPES_CONFIGURATION_FILE.equals(name)) {
+        return new ByteArrayInputStream(
+            ("<configuration>\n" +
+             "  <property>\n" +
+             "    <name>yarn.resource-types</name>\n" +
+             "    <value>" + CUSTOM_RESOURCE_NAME + "</value>\n" +
+             "  </property>\n" +
+             "  <property>\n" +
+             "    <name>yarn.resource-types.a-custom-resource.units</name>\n" +
+             "    <value>G</value>\n" +
+             "  </property>\n" +
+             "</configuration>\n").getBytes());
+      } else {
+        return super.getConfigurationInputStream(bootstrapConf, name);
+      }
+    }
+  }

  private static final String CUSTOM_RESOURCE_NAME = "a-custom-resource";

  private void initResourceTypes() {
-    CustomResourceTypesConfigurationProvider.initResourceTypes(
-        ImmutableMap.<String, String>builder()
-            .put(CUSTOM_RESOURCE_NAME, "G")
-            .build());
+    // If the dependencies are upgraded to hadoop 3.3.0, this can be replaced
+    // by org.apache.hadoop.yarn.util.resource.CustomResourceTypesConfigurationProvider
+    Configuration configuration = new Configuration();
+    configuration.set(YarnConfiguration.RM_CONFIGURATION_PROVIDER_CLASS,
+        CustomResourceTypesConfigurationProvider.class.getName());
+    ResourceUtils.resetResourceTypes(configuration);
  }

  @After
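The inner provider above feeds a synthetic yarn.resource-types definition (one custom resource, a-custom-resource, measured in units of G) into the configuration stream, and initResourceTypes() then points YARN's RM_CONFIGURATION_PROVIDER_CLASS at it before calling ResourceUtils.resetResourceTypes(configuration). As the javadoc notes, this stands in for the upstream CustomResourceTypesConfigurationProvider helper that only ships with Hadoop 3.3.0.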
@ -20,7 +20,7 @@
  <parent>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-project</artifactId>
-    <version>3.3.0-SNAPSHOT</version>
+    <version>3.2.0</version>
    <relativePath/>
  </parent>
  <artifactId>hadoop-submarine</artifactId>
@ -32,29 +32,126 @@
  <hadoop.common.build.dir>${basedir}/../hadoop-common-project/hadoop-common/target</hadoop.common.build.dir>
</properties>

+<!-- Do not add dependencies here, add them to the POM of the leaf module -->
+
<modules>
  <module>hadoop-submarine-core</module>
-  <module>hadoop-submarine-yarnservice-runtime</module>
-  <module>hadoop-submarine-tony-runtime</module>
+  <module>hadoop-submarine-all</module>
+  <module>hadoop-submarine-dist</module>
</modules>

-<profiles>
-  <profile>
-    <id>clover</id>
-    <activation>
-      <activeByDefault>false</activeByDefault>
-      <property>
-        <name>clover</name>
-      </property>
-    </activation>
-    <dependencyManagement>
-      <dependencies>
-        <dependency>
-          <groupId>com.cenqua.clover</groupId>
-          <artifactId>clover</artifactId>
-        </dependency>
-      </dependencies>
-    </dependencyManagement>
-  </profile>
-</profiles>
+<dependencyManagement>
+  <dependencies>
+    <dependency>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-core</artifactId>
+      <version>2.23.4</version>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-yarn-services-api</artifactId>
+      <version>${hadoop.version}</version>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-common</artifactId>
+      <version>${hadoop.version}</version>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-yarn-api</artifactId>
+      <version>${hadoop.version}</version>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-yarn-common</artifactId>
+      <version>${hadoop.version}</version>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-yarn-client</artifactId>
+      <version>${hadoop.version}</version>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-hdfs</artifactId>
+      <version>${hadoop.version}</version>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-lang3</artifactId>
+      <version>3.7</version>
+    </dependency>
+  </dependencies>
+</dependencyManagement>
+
+<profiles>
+  <profile>
+    <id>hadoop-3.2</id>
+    <properties>
+      <hadoop.version>3.2.0</hadoop.version>
+    </properties>
+    <modules>
+      <module>hadoop-submarine-yarnservice-runtime</module>
+      <module>hadoop-submarine-tony-runtime</module>
+    </modules>
+  </profile>
+
+  <!-- Default profile-->
+  <profile>
+    <id>hadoop-3.1</id>
+    <activation>
+      <activeByDefault>true</activeByDefault>
+    </activation>
+    <properties>
+      <hadoop.version>3.1.2</hadoop.version>
+    </properties>
+    <modules>
+      <module>hadoop-submarine-yarnservice-runtime</module>
+      <module>hadoop-submarine-tony-runtime</module>
+    </modules>
+  </profile>
+
+  <profile>
+    <id>hadoop-2.9</id>
+    <properties>
+      <hadoop.version>2.9.2</hadoop.version>
+    </properties>
+    <modules>
+      <module>hadoop-submarine-tony-runtime</module>
+    </modules>
+  </profile>
+
+  <profile>
+    <id>hadoop-2.7</id>
+    <properties>
+      <hadoop.version>2.7.3</hadoop.version>
+    </properties>
+    <modules>
+      <module>hadoop-submarine-tony-runtime</module>
+    </modules>
+  </profile>
+
+  <profile>
+    <id>clover</id>
+    <activation>
+      <activeByDefault>false</activeByDefault>
+      <property>
+        <name>clover</name>
+      </property>
+    </activation>
+    <dependencies>
+      <dependency>
+        <groupId>com.cenqua.clover</groupId>
+        <artifactId>clover</artifactId>
+      </dependency>
+    </dependencies>
+  </profile>
+</profiles>

</project>
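With the parent pinned to the released hadoop-project 3.2.0 and a hadoop.version property per profile, the set of runtime modules is now chosen at build time: for example, mvn clean package -Phadoop-2.9 builds only the TonY runtime against Hadoop 2.9.2, while the default hadoop-3.1 profile builds both the YARN-service and TonY runtimes against 3.1.2.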