HDDS-571. Update SCM chill mode exit criteria to optionally wait for n datanodes. Contributed by Ajay Kumar.

This commit is contained in:
Ajay Kumar 2018-10-05 14:02:54 -07:00
parent 9bb2801e8c
commit cdf5d58364
4 changed files with 113 additions and 3 deletions

View File

@ -83,6 +83,9 @@ public final class HddsConfigKeys {
// Configuration key for the flag that switches SCM chill mode on or off.
public static final String HDDS_SCM_CHILLMODE_ENABLED =
"hdds.scm.chillmode.enabled";
// Chill mode is enabled unless the key above is set otherwise.
public static final boolean HDDS_SCM_CHILLMODE_ENABLED_DEFAULT = true;
// Configuration key for the minimum number of DataNodes that must be
// registered before SCM may leave chill mode.
public static final String HDDS_SCM_CHILLMODE_MIN_DATANODE =
"hdds.scm.chillmode.min.datanode";
// By default one registered DataNode is required.
public static final int HDDS_SCM_CHILLMODE_MIN_DATANODE_DEFAULT = 1;
// % of containers which should have at least one reported replica
// before SCM comes out of chill mode.

View File

@ -1164,6 +1164,15 @@
</description>
</property>
<property>
<name>hdds.scm.chillmode.min.datanode</name>
<value>1</value>
<tag>HDDS,SCM,OPERATION</tag>
<description>Minimum number of DataNodes that must be registered before SCM
can leave chill mode. A value of 0 disables this check.
</description>
</property>
<property>
<name>hdds.container.action.max.limit</name>
<value>20</value>

View File

@ -20,8 +20,10 @@ package org.apache.hadoop.hdds.scm.server;
import com.google.common.annotations.VisibleForTesting;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
@ -60,14 +62,16 @@ public class SCMChillModeManager implements
// Chill mode exit rules, keyed by rule name. The diamond operator replaces
// the raw HashMap constructor so the assignment is no longer unchecked.
private Map<String, ChillModeExitRule> exitRules = new HashMap<>(1);
// Configuration passed on to each exit rule when it is constructed.
private Configuration config;
// Keys under which the individual rules are stored in exitRules.
private static final String CONT_EXIT_RULE = "ContainerChillModeRule";
private static final String DN_EXIT_RULE = "DataNodeChillModeRule";
// Event queue supplied to the constructor.
private final EventQueue eventPublisher;
SCMChillModeManager(Configuration conf, List<ContainerInfo> allContainers,
EventQueue eventQueue) {
this.config = conf;
this.eventPublisher = eventQueue;
exitRules
.put(CONT_EXIT_RULE, new ContainerChillModeRule(config, allContainers));
exitRules.put(CONT_EXIT_RULE,
new ContainerChillModeRule(config, allContainers));
exitRules.put(DN_EXIT_RULE, new DataNodeChillModeRule(config));
if (!conf.getBoolean(HddsConfigKeys.HDDS_SCM_CHILLMODE_ENABLED,
HddsConfigKeys.HDDS_SCM_CHILLMODE_ENABLED_DEFAULT)) {
exitChillMode(eventQueue);
@ -120,6 +124,7 @@ public class SCMChillModeManager implements
EventPublisher publisher) {
if (getInChillMode()) {
exitRules.get(CONT_EXIT_RULE).process(nodeRegistrationContainerReport);
exitRules.get(DN_EXIT_RULE).process(nodeRegistrationContainerReport);
validateChillModeExitRules(publisher);
}
}
@ -187,6 +192,9 @@ public class SCMChillModeManager implements
/**
 * Reports the fraction of tracked containers that currently have the
 * minimum number of reported replicas.
 *
 * @return 1 when there are no containers to track, otherwise
 *         containerWithMinReplicas divided by maxContainer.
 */
@VisibleForTesting
public double getCurrentContainerThreshold() {
  // With nothing to wait for the threshold counts as fully met; this also
  // avoids dividing by zero.
  return maxContainer == 0
      ? 1
      : containerWithMinReplicas.doubleValue() / maxContainer;
}
@ -217,6 +225,57 @@ public class SCMChillModeManager implements
}
}
/**
 * Chill mode exit rule that is satisfied once enough distinct DataNodes
 * have registered with SCM.
 */
public class DataNodeChillModeRule implements
    ChillModeExitRule<NodeRegistrationContainerReport> {

  // Number of distinct DataNodes that must register for this rule to pass.
  private int requiredDns;
  // Distinct DataNodes counted so far; refreshed from registeredDnSet each
  // time a report is processed. cleanup() does not reset it.
  private int registeredDns = 0;
  // UUIDs of the DataNodes seen so far, so repeat reports count only once.
  private HashSet<UUID> registeredDnSet;

  /**
   * Reads the required DataNode count from configuration.
   *
   * @param conf configuration consulted for
   *             HddsConfigKeys.HDDS_SCM_CHILLMODE_MIN_DATANODE.
   */
  public DataNodeChillModeRule(Configuration conf) {
    int minDataNodes = conf.getInt(
        HddsConfigKeys.HDDS_SCM_CHILLMODE_MIN_DATANODE,
        HddsConfigKeys.HDDS_SCM_CHILLMODE_MIN_DATANODE_DEFAULT);
    this.requiredDns = minDataNodes;
    this.registeredDnSet = new HashSet<>(minDataNodes * 2);
  }

  /**
   * @return true when the registered count has reached the required count.
   */
  @Override
  public boolean validate() {
    return !(registeredDns < requiredDns);
  }

  /**
   * @return the number of distinct DataNodes counted so far.
   */
  @VisibleForTesting
  public double getRegisteredDataNodes() {
    return registeredDns;
  }

  /**
   * Records the DataNode behind a registration report.
   *
   * @param reportsProto registration report whose DataNode UUID is tracked.
   */
  @Override
  public void process(NodeRegistrationContainerReport reportsProto) {
    // A requirement of zero means no DataNode check is needed; outside
    // chill mode there is nothing left to track either.
    if (requiredDns != 0 && inChillMode.get()) {
      UUID dataNodeId = reportsProto.getDatanodeDetails().getUuid();
      registeredDnSet.add(dataNodeId);
      registeredDns = registeredDnSet.size();
      LOG.info("SCM in chill mode. {} DataNodes registered, {} required.",
          registeredDns, requiredDns);
    }
  }

  /**
   * Drops the set of tracked DataNode UUIDs.
   */
  @Override
  public void cleanup() {
    registeredDnSet.clear();
  }
}
@VisibleForTesting
public static Logger getLogger() {
return LOG;

View File

@ -45,7 +45,7 @@ public class TestSCMChillModeManager {
private List<ContainerInfo> containers;
@Rule
public Timeout timeout = new Timeout(1000 * 20);
public Timeout timeout = new Timeout(1000 * 35);
@BeforeClass
public static void setUp() {
@ -111,6 +111,45 @@ public class TestSCMChillModeManager {
assertFalse(scmChillModeManager.getInChillMode());
}
/**
 * Runs the DataNode exit rule scenario with an empty container list for
 * required DataNode counts of 0, 3 and 5.
 */
@Test
public void testChillModeDataNodeExitRule() throws Exception {
  containers = new ArrayList<>();
  for (int requiredDataNodes : new int[] {0, 3, 5}) {
    testChillModeDataNodes(requiredDataNodes);
  }
}
/**
 * Builds a chill mode manager that requires {@code numOfDns} registered
 * DataNodes, fires registration reports one at a time, and checks that SCM
 * stays in chill mode until the last required DataNode has registered.
 *
 * @param numOfDns value configured for
 *                 HddsConfigKeys.HDDS_SCM_CHILLMODE_MIN_DATANODE.
 * @throws Exception propagated from the wait helper.
 */
private void testChillModeDataNodes(int numOfDns) throws Exception {
  OzoneConfiguration conf = new OzoneConfiguration(config);
  conf.setInt(HddsConfigKeys.HDDS_SCM_CHILLMODE_MIN_DATANODE, numOfDns);
  scmChillModeManager = new SCMChillModeManager(conf, containers, queue);
  queue.addHandler(SCMEvents.NODE_REGISTRATION_CONT_REPORT,
      scmChillModeManager);

  // Assert SCM is in Chill mode.
  assertTrue(scmChillModeManager.getInChillMode());

  // Register all DataNodes except last one and assert SCM is in chill mode.
  for (int i = 0; i < numOfDns - 1; i++) {
    queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT,
        HddsTestUtils.createNodeRegistrationContainerReport(containers));
    assertTrue(scmChillModeManager.getInChillMode());
    assertTrue(scmChillModeManager.getCurrentContainerThreshold() == 1);
  }

  if (numOfDns == 0) {
    // No registration report is fired in this case, so the manager is
    // expected to still report chill mode.
    GenericTestUtils.waitFor(
        () -> scmChillModeManager.getInChillMode(), 10, 1000 * 10);
    return;
  }

  // Register last DataNode and check that SCM is out of Chill mode.
  queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT,
      HddsTestUtils.createNodeRegistrationContainerReport(containers));
  // Wait for chill mode to end. The previous predicate waited for
  // getInChillMode() to be true, which already held before the event was
  // handled, so the exit was never actually verified.
  GenericTestUtils.waitFor(
      () -> !scmChillModeManager.getInChillMode(), 10, 1000 * 10);
}
private void testContainerThreshold(List<ContainerInfo> dnContainers,
double expectedThreshold)
throws Exception {