YARN-9235. If linux container executor is not set for a GPU cluster GpuResourceHandlerImpl is not initialized and NPE is thrown. Contributed by Antal Balint Steinbach, Adam Antal

(cherry picked from commit c416284bb7)
This commit is contained in:
Szilard Nemeth 2019-07-12 16:51:58 +02:00
parent 9ed2c22d57
commit c61c969668
2 changed files with 76 additions and 0 deletions

View File

@ -18,6 +18,7 @@
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
@ -33,8 +34,14 @@ import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInforma
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo;
import java.util.List; import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class GpuResourcePlugin implements ResourcePlugin { public class GpuResourcePlugin implements ResourcePlugin {
private static final Logger LOG =
LoggerFactory.getLogger(GpuResourcePlugin.class);
private final GpuNodeResourceUpdateHandler resourceDiscoverHandler; private final GpuNodeResourceUpdateHandler resourceDiscoverHandler;
private final GpuDiscoverer gpuDiscoverer; private final GpuDiscoverer gpuDiscoverer;
private GpuResourceHandlerImpl gpuResourceHandler = null; private GpuResourceHandlerImpl gpuResourceHandler = null;
@ -84,6 +91,10 @@ public class GpuResourcePlugin implements ResourcePlugin {
public synchronized NMResourceInfo getNMResourceInfo() throws YarnException { public synchronized NMResourceInfo getNMResourceInfo() throws YarnException {
GpuDeviceInformation gpuDeviceInformation = GpuDeviceInformation gpuDeviceInformation =
gpuDiscoverer.getGpuDeviceInformation(); gpuDiscoverer.getGpuDeviceInformation();
//At this point the gpu plugin is already enabled
checkGpuResourceHandler();
GpuResourceAllocator gpuResourceAllocator = GpuResourceAllocator gpuResourceAllocator =
gpuResourceHandler.getGpuAllocator(); gpuResourceHandler.getGpuAllocator();
List<GpuDevice> totalGpus = gpuResourceAllocator.getAllowedGpusCopy(); List<GpuDevice> totalGpus = gpuResourceAllocator.getAllowedGpusCopy();
@ -94,6 +105,17 @@ public class GpuResourcePlugin implements ResourcePlugin {
assignedGpuDevices); assignedGpuDevices);
} }
/**
 * Guards callers that are about to dereference {@code gpuResourceHandler}.
 *
 * <p>Returns normally when the handler field is non-null. Otherwise the
 * problem is logged at WARN level and reported to the caller as a
 * {@link YarnException} carrying the same message, instead of letting a
 * later dereference fail with a {@code NullPointerException}.
 *
 * @throws YarnException if {@code gpuResourceHandler} is still {@code null}
 */
private void checkGpuResourceHandler() throws YarnException {
  if (gpuResourceHandler != null) {
    // Handler is available; nothing to verify.
    return;
  }

  final StringBuilder message = new StringBuilder();
  message.append(
      "Linux Container Executor is not configured for the NodeManager. ");
  message.append("To fully enable GPU feature on the node also set ");
  message.append(YarnConfiguration.NM_CONTAINER_EXECUTOR);
  message.append(" properly.");
  final String errorMsg = message.toString();

  LOG.warn(errorMsg);
  throw new YarnException(errorMsg);
}
@Override @Override
public String toString() { public String toString() {
return GpuResourcePlugin.class.getName(); return GpuResourcePlugin.class.getName();

View File

@ -0,0 +1,54 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
import static org.mockito.Mockito.mock;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.junit.Test;
/**
 * Unit tests for {@link GpuResourcePlugin#getNMResourceInfo()} covering the
 * cases where the resource handler has and has not been created beforehand.
 */
public class TestGpuResourcePlugin {

  /**
   * Builds a plugin whose discoverer and node resource update handler are
   * both Mockito mocks.
   *
   * @return a freshly constructed plugin under test
   */
  private static GpuResourcePlugin newPluginWithMocks() {
    GpuDiscoverer discoverer = mock(GpuDiscoverer.class);
    GpuNodeResourceUpdateHandler updateHandler =
        mock(GpuNodeResourceUpdateHandler.class);
    return new GpuResourcePlugin(updateHandler, discoverer);
  }

  /**
   * Without a prior {@code createResourceHandler} call, querying the NM
   * resource info is expected to fail with a {@link YarnException}.
   */
  @Test(expected = YarnException.class)
  public void testResourceHandlerNotInitialized() throws YarnException {
    GpuResourcePlugin plugin = newPluginWithMocks();

    plugin.getNMResourceInfo();
  }

  /**
   * After {@code createResourceHandler} has been invoked, querying the NM
   * resource info is expected to complete without throwing.
   */
  @Test
  public void testResourceHandlerIsInitialized() throws YarnException {
    GpuResourcePlugin plugin = newPluginWithMocks();
    plugin.createResourceHandler(null, null, null);

    // The test passes as long as this call does not throw.
    plugin.getNMResourceInfo();
  }
}