YARN-9011. Race condition during decommissioning. Contributed by Peter Bacsko

This commit is contained in:
Szilard Nemeth 2019-11-18 16:29:39 +01:00
parent 7f811722b7
commit 27642367ef
4 changed files with 140 additions and 12 deletions

View File

@ -52,6 +52,8 @@ public class HostsFileReader {
.class); .class);
private final AtomicReference<HostDetails> current; private final AtomicReference<HostDetails> current;
private final AtomicReference<HostDetails> lazyLoaded =
new AtomicReference<>();
public HostsFileReader(String inFile, public HostsFileReader(String inFile,
String exFile) throws IOException { String exFile) throws IOException {
@ -187,7 +189,18 @@ public class HostsFileReader {
public void refresh(String includesFile, String excludesFile) public void refresh(String includesFile, String excludesFile)
throws IOException { throws IOException {
LOG.info("Refreshing hosts (include/exclude) list"); refreshInternal(includesFile, excludesFile, false);
}
public void lazyRefresh(String includesFile, String excludesFile)
throws IOException {
refreshInternal(includesFile, excludesFile, true);
}
private void refreshInternal(String includesFile, String excludesFile,
boolean lazy) throws IOException {
LOG.info("Refreshing hosts (include/exclude) list (lazy refresh = {})",
lazy);
HostDetails oldDetails = current.get(); HostDetails oldDetails = current.get();
Set<String> newIncludes = oldDetails.includes; Set<String> newIncludes = oldDetails.includes;
Map<String, Integer> newExcludes = oldDetails.excludes; Map<String, Integer> newExcludes = oldDetails.excludes;
@ -203,7 +216,21 @@ public class HostsFileReader {
} }
HostDetails newDetails = new HostDetails(includesFile, newIncludes, HostDetails newDetails = new HostDetails(includesFile, newIncludes,
excludesFile, newExcludes); excludesFile, newExcludes);
current.set(newDetails);
if (lazy) {
lazyLoaded.set(newDetails);
} else {
current.set(newDetails);
}
}
public void finishRefresh() {
if (lazyLoaded.get() == null) {
throw new IllegalStateException(
"Cannot finish refresh - call lazyRefresh() first");
}
current.set(lazyLoaded.get());
lazyLoaded.set(null);
} }
@Private @Private
@ -279,6 +306,10 @@ public class HostsFileReader {
return current.get(); return current.get();
} }
public HostDetails getLazyLoadedHostDetails() {
return lazyLoaded.get();
}
public void setIncludesFile(String includesFile) { public void setIncludesFile(String includesFile) {
LOG.info("Setting the includes file to " + includesFile); LOG.info("Setting the includes file to " + includesFile);
HostDetails oldDetails = current.get(); HostDetails oldDetails = current.get();

View File

@ -19,6 +19,7 @@ package org.apache.hadoop.util;
import java.io.File; import java.io.File;
import java.io.FileWriter; import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.NoSuchFileException; import java.nio.file.NoSuchFileException;
import java.util.Map; import java.util.Map;
@ -347,4 +348,62 @@ public class TestHostsFileReader {
assertTrue(excludes.get("host5") == 1800); assertTrue(excludes.get("host5") == 1800);
assertTrue(excludes.get("host6") == 1800); assertTrue(excludes.get("host6") == 1800);
} }
}
@Test
public void testLazyRefresh() throws IOException {
FileWriter efw = new FileWriter(excludesFile);
FileWriter ifw = new FileWriter(includesFile);
efw.write("host1\n");
efw.write("host2\n");
efw.close();
ifw.write("host3\n");
ifw.write("host4\n");
ifw.close();
HostsFileReader hfp = new HostsFileReader(includesFile, excludesFile);
ifw = new FileWriter(includesFile);
ifw.close();
efw = new FileWriter(excludesFile, true);
efw.write("host3\n");
efw.write("host4\n");
efw.close();
hfp.lazyRefresh(includesFile, excludesFile);
HostDetails details = hfp.getHostDetails();
HostDetails lazyDetails = hfp.getLazyLoadedHostDetails();
assertEquals("Details: no. of excluded hosts", 2,
details.getExcludedHosts().size());
assertEquals("Details: no. of included hosts", 2,
details.getIncludedHosts().size());
assertEquals("LazyDetails: no. of excluded hosts", 4,
lazyDetails.getExcludedHosts().size());
assertEquals("LayDetails: no. of included hosts", 0,
lazyDetails.getIncludedHosts().size());
hfp.finishRefresh();
details = hfp.getHostDetails();
assertEquals("Details: no. of excluded hosts", 4,
details.getExcludedHosts().size());
assertEquals("Details: no. of included hosts", 0,
details.getIncludedHosts().size());
assertNull("Lazy host details should be null",
hfp.getLazyLoadedHostDetails());
}
@Test(expected = IllegalStateException.class)
public void testFinishRefreshWithoutLazyRefresh() throws IOException {
FileWriter efw = new FileWriter(excludesFile);
FileWriter ifw = new FileWriter(includesFile);
efw.close();
ifw.close();
HostsFileReader hfp = new HostsFileReader(includesFile, excludesFile);
hfp.finishRefresh();
}
}

View File

@ -84,10 +84,12 @@ public class NodesListManager extends CompositeService implements
private Resolver resolver; private Resolver resolver;
private Timer removalTimer; private Timer removalTimer;
private int nodeRemovalCheckInterval; private int nodeRemovalCheckInterval;
private Set<RMNode> gracefulDecommissionableNodes;
public NodesListManager(RMContext rmContext) { public NodesListManager(RMContext rmContext) {
super(NodesListManager.class.getName()); super(NodesListManager.class.getName());
this.rmContext = rmContext; this.rmContext = rmContext;
this.gracefulDecommissionableNodes = ConcurrentHashMap.newKeySet();
} }
@Override @Override
@ -115,7 +117,7 @@ public class NodesListManager extends CompositeService implements
this.hostsReader = this.hostsReader =
createHostsFileReader(this.includesFile, this.excludesFile); createHostsFileReader(this.includesFile, this.excludesFile);
setDecommissionedNMs(); setDecommissionedNMs();
printConfiguredHosts(); printConfiguredHosts(false);
} catch (YarnException ex) { } catch (YarnException ex) {
disableHostsFileReader(ex); disableHostsFileReader(ex);
} catch (IOException ioe) { } catch (IOException ioe) {
@ -187,7 +189,7 @@ public class NodesListManager extends CompositeService implements
removalTimer.cancel(); removalTimer.cancel();
} }
private void printConfiguredHosts() { private void printConfiguredHosts(boolean graceful) {
if (!LOG.isDebugEnabled()) { if (!LOG.isDebugEnabled()) {
return; return;
} }
@ -198,7 +200,12 @@ public class NodesListManager extends CompositeService implements
conf.get(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, conf.get(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH,
YarnConfiguration.DEFAULT_RM_NODES_EXCLUDE_FILE_PATH)); YarnConfiguration.DEFAULT_RM_NODES_EXCLUDE_FILE_PATH));
HostDetails hostDetails = hostsReader.getHostDetails(); HostDetails hostDetails;
if (graceful) {
hostDetails = hostsReader.getLazyLoadedHostDetails();
} else {
hostDetails = hostsReader.getHostDetails();
}
for (String include : hostDetails.getIncludedHosts()) { for (String include : hostDetails.getIncludedHosts()) {
LOG.debug("include: " + include); LOG.debug("include: " + include);
} }
@ -235,8 +242,15 @@ public class NodesListManager extends CompositeService implements
yarnConf.get(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, yarnConf.get(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH,
YarnConfiguration.DEFAULT_RM_NODES_EXCLUDE_FILE_PATH); YarnConfiguration.DEFAULT_RM_NODES_EXCLUDE_FILE_PATH);
LOG.info("refreshNodes excludesFile " + excludesFile); LOG.info("refreshNodes excludesFile " + excludesFile);
hostsReader.refresh(includesFile, excludesFile);
printConfiguredHosts(); if (graceful) {
// update hosts, but don't make it visible just yet
hostsReader.lazyRefresh(includesFile, excludesFile);
} else {
hostsReader.refresh(includesFile, excludesFile);
}
printConfiguredHosts(graceful);
LOG.info("hostsReader include:{" + LOG.info("hostsReader include:{" +
StringUtils.join(",", hostsReader.getHosts()) + StringUtils.join(",", hostsReader.getHosts()) +
@ -270,7 +284,14 @@ public class NodesListManager extends CompositeService implements
// Nodes need to be decommissioned (graceful or forceful); // Nodes need to be decommissioned (graceful or forceful);
List<RMNode> nodesToDecom = new ArrayList<RMNode>(); List<RMNode> nodesToDecom = new ArrayList<RMNode>();
HostDetails hostDetails = hostsReader.getHostDetails(); HostDetails hostDetails;
gracefulDecommissionableNodes.clear();
if (graceful) {
hostDetails = hostsReader.getLazyLoadedHostDetails();
} else {
hostDetails = hostsReader.getHostDetails();
}
Set<String> includes = hostDetails.getIncludedHosts(); Set<String> includes = hostDetails.getIncludedHosts();
Map<String, Integer> excludes = hostDetails.getExcludedMap(); Map<String, Integer> excludes = hostDetails.getExcludedMap();
@ -298,11 +319,13 @@ public class NodesListManager extends CompositeService implements
s != NodeState.DECOMMISSIONING) { s != NodeState.DECOMMISSIONING) {
LOG.info("Gracefully decommission " + nodeStr); LOG.info("Gracefully decommission " + nodeStr);
nodesToDecom.add(n); nodesToDecom.add(n);
gracefulDecommissionableNodes.add(n);
} else if (s == NodeState.DECOMMISSIONING && } else if (s == NodeState.DECOMMISSIONING &&
!Objects.equals(n.getDecommissioningTimeout(), !Objects.equals(n.getDecommissioningTimeout(),
timeoutToUse)) { timeoutToUse)) {
LOG.info("Update " + nodeStr + " timeout to be " + timeoutToUse); LOG.info("Update " + nodeStr + " timeout to be " + timeoutToUse);
nodesToDecom.add(n); nodesToDecom.add(n);
gracefulDecommissionableNodes.add(n);
} else { } else {
LOG.info("No action for " + nodeStr); LOG.info("No action for " + nodeStr);
} }
@ -315,6 +338,10 @@ public class NodesListManager extends CompositeService implements
} }
} }
if (graceful) {
hostsReader.finishRefresh();
}
for (RMNode n : nodesToRecom) { for (RMNode n : nodesToRecom) {
RMNodeEvent e = new RMNodeEvent( RMNodeEvent e = new RMNodeEvent(
n.getNodeID(), RMNodeEventType.RECOMMISSION); n.getNodeID(), RMNodeEventType.RECOMMISSION);
@ -466,6 +493,10 @@ public class NodesListManager extends CompositeService implements
hostDetails.getExcludedHosts()); hostDetails.getExcludedHosts());
} }
boolean isGracefullyDecommissionableNode(RMNode node) {
return gracefulDecommissionableNodes.contains(node);
}
private boolean isValidNode( private boolean isValidNode(
String hostName, Set<String> hostsList, Set<String> excludeList) { String hostName, Set<String> hostsList, Set<String> excludeList) {
String ip = resolver.resolve(hostName); String ip = resolver.resolve(hostName);

View File

@ -836,10 +836,17 @@ public class ResourceTrackerService extends AbstractService implements
*/ */
private boolean isNodeInDecommissioning(NodeId nodeId) { private boolean isNodeInDecommissioning(NodeId nodeId) {
RMNode rmNode = this.rmContext.getRMNodes().get(nodeId); RMNode rmNode = this.rmContext.getRMNodes().get(nodeId);
if (rmNode != null &&
rmNode.getState().equals(NodeState.DECOMMISSIONING)) { if (rmNode != null) {
return true; NodeState state = rmNode.getState();
if (state == NodeState.DECOMMISSIONING ||
(state == NodeState.RUNNING &&
this.nodesListManager.isGracefullyDecommissionableNode(rmNode))) {
return true;
}
} }
return false; return false;
} }