YARN-9011. Race condition during decommissioning. Contributed by Peter Bacsko
This commit is contained in:
parent
7f811722b7
commit
27642367ef
|
@ -52,6 +52,8 @@ public class HostsFileReader {
|
||||||
.class);
|
.class);
|
||||||
|
|
||||||
private final AtomicReference<HostDetails> current;
|
private final AtomicReference<HostDetails> current;
|
||||||
|
private final AtomicReference<HostDetails> lazyLoaded =
|
||||||
|
new AtomicReference<>();
|
||||||
|
|
||||||
public HostsFileReader(String inFile,
|
public HostsFileReader(String inFile,
|
||||||
String exFile) throws IOException {
|
String exFile) throws IOException {
|
||||||
|
@ -187,7 +189,18 @@ public class HostsFileReader {
|
||||||
|
|
||||||
public void refresh(String includesFile, String excludesFile)
|
public void refresh(String includesFile, String excludesFile)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
LOG.info("Refreshing hosts (include/exclude) list");
|
refreshInternal(includesFile, excludesFile, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void lazyRefresh(String includesFile, String excludesFile)
|
||||||
|
throws IOException {
|
||||||
|
refreshInternal(includesFile, excludesFile, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void refreshInternal(String includesFile, String excludesFile,
|
||||||
|
boolean lazy) throws IOException {
|
||||||
|
LOG.info("Refreshing hosts (include/exclude) list (lazy refresh = {})",
|
||||||
|
lazy);
|
||||||
HostDetails oldDetails = current.get();
|
HostDetails oldDetails = current.get();
|
||||||
Set<String> newIncludes = oldDetails.includes;
|
Set<String> newIncludes = oldDetails.includes;
|
||||||
Map<String, Integer> newExcludes = oldDetails.excludes;
|
Map<String, Integer> newExcludes = oldDetails.excludes;
|
||||||
|
@ -203,7 +216,21 @@ public class HostsFileReader {
|
||||||
}
|
}
|
||||||
HostDetails newDetails = new HostDetails(includesFile, newIncludes,
|
HostDetails newDetails = new HostDetails(includesFile, newIncludes,
|
||||||
excludesFile, newExcludes);
|
excludesFile, newExcludes);
|
||||||
current.set(newDetails);
|
|
||||||
|
if (lazy) {
|
||||||
|
lazyLoaded.set(newDetails);
|
||||||
|
} else {
|
||||||
|
current.set(newDetails);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void finishRefresh() {
|
||||||
|
if (lazyLoaded.get() == null) {
|
||||||
|
throw new IllegalStateException(
|
||||||
|
"Cannot finish refresh - call lazyRefresh() first");
|
||||||
|
}
|
||||||
|
current.set(lazyLoaded.get());
|
||||||
|
lazyLoaded.set(null);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Private
|
@Private
|
||||||
|
@ -279,6 +306,10 @@ public class HostsFileReader {
|
||||||
return current.get();
|
return current.get();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public HostDetails getLazyLoadedHostDetails() {
|
||||||
|
return lazyLoaded.get();
|
||||||
|
}
|
||||||
|
|
||||||
public void setIncludesFile(String includesFile) {
|
public void setIncludesFile(String includesFile) {
|
||||||
LOG.info("Setting the includes file to " + includesFile);
|
LOG.info("Setting the includes file to " + includesFile);
|
||||||
HostDetails oldDetails = current.get();
|
HostDetails oldDetails = current.get();
|
||||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.hadoop.util;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileWriter;
|
import java.io.FileWriter;
|
||||||
|
import java.io.IOException;
|
||||||
import java.nio.file.NoSuchFileException;
|
import java.nio.file.NoSuchFileException;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
@ -347,4 +348,62 @@ public class TestHostsFileReader {
|
||||||
assertTrue(excludes.get("host5") == 1800);
|
assertTrue(excludes.get("host5") == 1800);
|
||||||
assertTrue(excludes.get("host6") == 1800);
|
assertTrue(excludes.get("host6") == 1800);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
@Test
|
||||||
|
public void testLazyRefresh() throws IOException {
|
||||||
|
FileWriter efw = new FileWriter(excludesFile);
|
||||||
|
FileWriter ifw = new FileWriter(includesFile);
|
||||||
|
|
||||||
|
efw.write("host1\n");
|
||||||
|
efw.write("host2\n");
|
||||||
|
efw.close();
|
||||||
|
ifw.write("host3\n");
|
||||||
|
ifw.write("host4\n");
|
||||||
|
ifw.close();
|
||||||
|
|
||||||
|
HostsFileReader hfp = new HostsFileReader(includesFile, excludesFile);
|
||||||
|
|
||||||
|
ifw = new FileWriter(includesFile);
|
||||||
|
ifw.close();
|
||||||
|
|
||||||
|
efw = new FileWriter(excludesFile, true);
|
||||||
|
efw.write("host3\n");
|
||||||
|
efw.write("host4\n");
|
||||||
|
efw.close();
|
||||||
|
|
||||||
|
hfp.lazyRefresh(includesFile, excludesFile);
|
||||||
|
|
||||||
|
HostDetails details = hfp.getHostDetails();
|
||||||
|
HostDetails lazyDetails = hfp.getLazyLoadedHostDetails();
|
||||||
|
|
||||||
|
assertEquals("Details: no. of excluded hosts", 2,
|
||||||
|
details.getExcludedHosts().size());
|
||||||
|
assertEquals("Details: no. of included hosts", 2,
|
||||||
|
details.getIncludedHosts().size());
|
||||||
|
assertEquals("LazyDetails: no. of excluded hosts", 4,
|
||||||
|
lazyDetails.getExcludedHosts().size());
|
||||||
|
assertEquals("LayDetails: no. of included hosts", 0,
|
||||||
|
lazyDetails.getIncludedHosts().size());
|
||||||
|
|
||||||
|
hfp.finishRefresh();
|
||||||
|
|
||||||
|
details = hfp.getHostDetails();
|
||||||
|
assertEquals("Details: no. of excluded hosts", 4,
|
||||||
|
details.getExcludedHosts().size());
|
||||||
|
assertEquals("Details: no. of included hosts", 0,
|
||||||
|
details.getIncludedHosts().size());
|
||||||
|
assertNull("Lazy host details should be null",
|
||||||
|
hfp.getLazyLoadedHostDetails());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(expected = IllegalStateException.class)
|
||||||
|
public void testFinishRefreshWithoutLazyRefresh() throws IOException {
|
||||||
|
FileWriter efw = new FileWriter(excludesFile);
|
||||||
|
FileWriter ifw = new FileWriter(includesFile);
|
||||||
|
efw.close();
|
||||||
|
ifw.close();
|
||||||
|
|
||||||
|
HostsFileReader hfp = new HostsFileReader(includesFile, excludesFile);
|
||||||
|
hfp.finishRefresh();
|
||||||
|
}
|
||||||
|
}
|
|
@ -84,10 +84,12 @@ public class NodesListManager extends CompositeService implements
|
||||||
private Resolver resolver;
|
private Resolver resolver;
|
||||||
private Timer removalTimer;
|
private Timer removalTimer;
|
||||||
private int nodeRemovalCheckInterval;
|
private int nodeRemovalCheckInterval;
|
||||||
|
private Set<RMNode> gracefulDecommissionableNodes;
|
||||||
|
|
||||||
public NodesListManager(RMContext rmContext) {
|
public NodesListManager(RMContext rmContext) {
|
||||||
super(NodesListManager.class.getName());
|
super(NodesListManager.class.getName());
|
||||||
this.rmContext = rmContext;
|
this.rmContext = rmContext;
|
||||||
|
this.gracefulDecommissionableNodes = ConcurrentHashMap.newKeySet();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -115,7 +117,7 @@ public class NodesListManager extends CompositeService implements
|
||||||
this.hostsReader =
|
this.hostsReader =
|
||||||
createHostsFileReader(this.includesFile, this.excludesFile);
|
createHostsFileReader(this.includesFile, this.excludesFile);
|
||||||
setDecommissionedNMs();
|
setDecommissionedNMs();
|
||||||
printConfiguredHosts();
|
printConfiguredHosts(false);
|
||||||
} catch (YarnException ex) {
|
} catch (YarnException ex) {
|
||||||
disableHostsFileReader(ex);
|
disableHostsFileReader(ex);
|
||||||
} catch (IOException ioe) {
|
} catch (IOException ioe) {
|
||||||
|
@ -187,7 +189,7 @@ public class NodesListManager extends CompositeService implements
|
||||||
removalTimer.cancel();
|
removalTimer.cancel();
|
||||||
}
|
}
|
||||||
|
|
||||||
private void printConfiguredHosts() {
|
private void printConfiguredHosts(boolean graceful) {
|
||||||
if (!LOG.isDebugEnabled()) {
|
if (!LOG.isDebugEnabled()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -198,7 +200,12 @@ public class NodesListManager extends CompositeService implements
|
||||||
conf.get(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH,
|
conf.get(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH,
|
||||||
YarnConfiguration.DEFAULT_RM_NODES_EXCLUDE_FILE_PATH));
|
YarnConfiguration.DEFAULT_RM_NODES_EXCLUDE_FILE_PATH));
|
||||||
|
|
||||||
HostDetails hostDetails = hostsReader.getHostDetails();
|
HostDetails hostDetails;
|
||||||
|
if (graceful) {
|
||||||
|
hostDetails = hostsReader.getLazyLoadedHostDetails();
|
||||||
|
} else {
|
||||||
|
hostDetails = hostsReader.getHostDetails();
|
||||||
|
}
|
||||||
for (String include : hostDetails.getIncludedHosts()) {
|
for (String include : hostDetails.getIncludedHosts()) {
|
||||||
LOG.debug("include: " + include);
|
LOG.debug("include: " + include);
|
||||||
}
|
}
|
||||||
|
@ -235,8 +242,15 @@ public class NodesListManager extends CompositeService implements
|
||||||
yarnConf.get(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH,
|
yarnConf.get(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH,
|
||||||
YarnConfiguration.DEFAULT_RM_NODES_EXCLUDE_FILE_PATH);
|
YarnConfiguration.DEFAULT_RM_NODES_EXCLUDE_FILE_PATH);
|
||||||
LOG.info("refreshNodes excludesFile " + excludesFile);
|
LOG.info("refreshNodes excludesFile " + excludesFile);
|
||||||
hostsReader.refresh(includesFile, excludesFile);
|
|
||||||
printConfiguredHosts();
|
if (graceful) {
|
||||||
|
// update hosts, but don't make it visible just yet
|
||||||
|
hostsReader.lazyRefresh(includesFile, excludesFile);
|
||||||
|
} else {
|
||||||
|
hostsReader.refresh(includesFile, excludesFile);
|
||||||
|
}
|
||||||
|
|
||||||
|
printConfiguredHosts(graceful);
|
||||||
|
|
||||||
LOG.info("hostsReader include:{" +
|
LOG.info("hostsReader include:{" +
|
||||||
StringUtils.join(",", hostsReader.getHosts()) +
|
StringUtils.join(",", hostsReader.getHosts()) +
|
||||||
|
@ -270,7 +284,14 @@ public class NodesListManager extends CompositeService implements
|
||||||
// Nodes need to be decommissioned (graceful or forceful);
|
// Nodes need to be decommissioned (graceful or forceful);
|
||||||
List<RMNode> nodesToDecom = new ArrayList<RMNode>();
|
List<RMNode> nodesToDecom = new ArrayList<RMNode>();
|
||||||
|
|
||||||
HostDetails hostDetails = hostsReader.getHostDetails();
|
HostDetails hostDetails;
|
||||||
|
gracefulDecommissionableNodes.clear();
|
||||||
|
if (graceful) {
|
||||||
|
hostDetails = hostsReader.getLazyLoadedHostDetails();
|
||||||
|
} else {
|
||||||
|
hostDetails = hostsReader.getHostDetails();
|
||||||
|
}
|
||||||
|
|
||||||
Set<String> includes = hostDetails.getIncludedHosts();
|
Set<String> includes = hostDetails.getIncludedHosts();
|
||||||
Map<String, Integer> excludes = hostDetails.getExcludedMap();
|
Map<String, Integer> excludes = hostDetails.getExcludedMap();
|
||||||
|
|
||||||
|
@ -298,11 +319,13 @@ public class NodesListManager extends CompositeService implements
|
||||||
s != NodeState.DECOMMISSIONING) {
|
s != NodeState.DECOMMISSIONING) {
|
||||||
LOG.info("Gracefully decommission " + nodeStr);
|
LOG.info("Gracefully decommission " + nodeStr);
|
||||||
nodesToDecom.add(n);
|
nodesToDecom.add(n);
|
||||||
|
gracefulDecommissionableNodes.add(n);
|
||||||
} else if (s == NodeState.DECOMMISSIONING &&
|
} else if (s == NodeState.DECOMMISSIONING &&
|
||||||
!Objects.equals(n.getDecommissioningTimeout(),
|
!Objects.equals(n.getDecommissioningTimeout(),
|
||||||
timeoutToUse)) {
|
timeoutToUse)) {
|
||||||
LOG.info("Update " + nodeStr + " timeout to be " + timeoutToUse);
|
LOG.info("Update " + nodeStr + " timeout to be " + timeoutToUse);
|
||||||
nodesToDecom.add(n);
|
nodesToDecom.add(n);
|
||||||
|
gracefulDecommissionableNodes.add(n);
|
||||||
} else {
|
} else {
|
||||||
LOG.info("No action for " + nodeStr);
|
LOG.info("No action for " + nodeStr);
|
||||||
}
|
}
|
||||||
|
@ -315,6 +338,10 @@ public class NodesListManager extends CompositeService implements
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (graceful) {
|
||||||
|
hostsReader.finishRefresh();
|
||||||
|
}
|
||||||
|
|
||||||
for (RMNode n : nodesToRecom) {
|
for (RMNode n : nodesToRecom) {
|
||||||
RMNodeEvent e = new RMNodeEvent(
|
RMNodeEvent e = new RMNodeEvent(
|
||||||
n.getNodeID(), RMNodeEventType.RECOMMISSION);
|
n.getNodeID(), RMNodeEventType.RECOMMISSION);
|
||||||
|
@ -466,6 +493,10 @@ public class NodesListManager extends CompositeService implements
|
||||||
hostDetails.getExcludedHosts());
|
hostDetails.getExcludedHosts());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
boolean isGracefullyDecommissionableNode(RMNode node) {
|
||||||
|
return gracefulDecommissionableNodes.contains(node);
|
||||||
|
}
|
||||||
|
|
||||||
private boolean isValidNode(
|
private boolean isValidNode(
|
||||||
String hostName, Set<String> hostsList, Set<String> excludeList) {
|
String hostName, Set<String> hostsList, Set<String> excludeList) {
|
||||||
String ip = resolver.resolve(hostName);
|
String ip = resolver.resolve(hostName);
|
||||||
|
|
|
@ -836,10 +836,17 @@ public class ResourceTrackerService extends AbstractService implements
|
||||||
*/
|
*/
|
||||||
private boolean isNodeInDecommissioning(NodeId nodeId) {
|
private boolean isNodeInDecommissioning(NodeId nodeId) {
|
||||||
RMNode rmNode = this.rmContext.getRMNodes().get(nodeId);
|
RMNode rmNode = this.rmContext.getRMNodes().get(nodeId);
|
||||||
if (rmNode != null &&
|
|
||||||
rmNode.getState().equals(NodeState.DECOMMISSIONING)) {
|
if (rmNode != null) {
|
||||||
return true;
|
NodeState state = rmNode.getState();
|
||||||
|
|
||||||
|
if (state == NodeState.DECOMMISSIONING ||
|
||||||
|
(state == NodeState.RUNNING &&
|
||||||
|
this.nodesListManager.isGracefullyDecommissionableNode(rmNode))) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue