HDFS-11919. Ozone: SCM: TestNodeManager takes too long to execute. Contributed by Yiqun Lin.

This commit is contained in:
Anu Engineer 2017-06-05 21:34:32 -07:00 committed by Owen O'Malley
parent d67542c115
commit f5d17b8f7c
1 changed file with 37 additions and 68 deletions

View File

@ -48,6 +48,7 @@ import java.util.List;
import java.util.UUID; import java.util.UUID;
import java.util.concurrent.TimeoutException; import java.util.concurrent.TimeoutException;
import static java.util.concurrent.TimeUnit.SECONDS;
import static org.apache.hadoop.ozone.protocol.proto import static org.apache.hadoop.ozone.protocol.proto
.StorageContainerDatanodeProtocolProtos.Type; .StorageContainerDatanodeProtocolProtos.Type;
import static org.apache.hadoop.ozone.scm.node.NodeManager.NODESTATE.DEAD; import static org.apache.hadoop.ozone.scm.node.NodeManager.NODESTATE.DEAD;
@ -99,6 +100,7 @@ public class TestNodeManager {
OzoneConfiguration conf = new OzoneConfiguration(); OzoneConfiguration conf = new OzoneConfiguration();
conf.set(OzoneConfigKeys.OZONE_CONTAINER_METADATA_DIRS, conf.set(OzoneConfigKeys.OZONE_CONTAINER_METADATA_DIRS,
testDir.getAbsolutePath()); testDir.getAbsolutePath());
conf.setLong(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL_MS, 100);
return conf; return conf;
} }
@ -367,28 +369,30 @@ public class TestNodeManager {
} }
/** /**
* Asserts that a single node moves from Healthy to stale node if it misses * Asserts that a single node moves from Healthy to stale node, then from
* the heartbeat. * stale node to dead node if it misses enough heartbeats.
* *
* @throws IOException * @throws IOException
* @throws InterruptedException * @throws InterruptedException
* @throws TimeoutException * @throws TimeoutException
*/ */
@Test @Test
public void testScmDetectStaleNode() throws IOException, public void testScmDetectStaleAndDeadNode() throws IOException,
InterruptedException, TimeoutException { InterruptedException, TimeoutException {
OzoneConfiguration conf = getConf();
final int interval = 100; final int interval = 100;
final int nodeCount = 10; final int nodeCount = 10;
OzoneConfiguration conf = getConf();
conf.setInt(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL_MS, interval); conf.setInt(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL_MS, interval);
conf.setInt(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1); conf.setTimeDuration(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1, SECONDS);
// This should be 5 times more than OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL_MS
// and 3 times more than OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS
conf.setInt(OZONE_SCM_STALENODE_INTERVAL_MS, 3 * 1000); conf.setInt(OZONE_SCM_STALENODE_INTERVAL_MS, 3 * 1000);
conf.setInt(OZONE_SCM_DEADNODE_INTERVAL_MS, 6 * 1000);
try (SCMNodeManager nodeManager = createNodeManager(conf)) { try (SCMNodeManager nodeManager = createNodeManager(conf)) {
List<DatanodeID> nodeList = createNodeSet(nodeManager, nodeCount, List<DatanodeID> nodeList = createNodeSet(nodeManager, nodeCount,
"staleNode"); "Node");
DatanodeID staleNode = SCMTestUtils.getDatanodeID(nodeManager); DatanodeID staleNode = SCMTestUtils.getDatanodeID(nodeManager);
// Heartbeat once // Heartbeat once
@ -401,13 +405,14 @@ public class TestNodeManager {
// Wait for 2 seconds .. and heartbeat good nodes again. // Wait for 2 seconds .. and heartbeat good nodes again.
Thread.sleep(2 * 1000); Thread.sleep(2 * 1000);
for (DatanodeID dn : nodeList) { for (DatanodeID dn : nodeList) {
nodeManager.sendHeartbeat(dn, null); nodeManager.sendHeartbeat(dn, null);
} }
// Wait for 2 more seconds, 3 seconds is the stale window for this test // Wait for 2 seconds, wait a total of 4 seconds to make sure that the
// node moves into stale state.
Thread.sleep(2 * 1000); Thread.sleep(2 * 1000);
List<DatanodeID> staleNodeList = nodeManager.getNodes(NodeManager List<DatanodeID> staleNodeList = nodeManager.getNodes(NodeManager
.NODESTATE.STALE); .NODESTATE.STALE);
assertEquals("Expected to find 1 stale node", assertEquals("Expected to find 1 stale node",
@ -416,51 +421,7 @@ public class TestNodeManager {
1, staleNodeList.size()); 1, staleNodeList.size());
assertEquals("Stale node is not the expected ID", staleNode assertEquals("Stale node is not the expected ID", staleNode
.getDatanodeUuid(), staleNodeList.get(0).getDatanodeUuid()); .getDatanodeUuid(), staleNodeList.get(0).getDatanodeUuid());
} Thread.sleep(1000);
}
/**
* Asserts that a single node moves from Healthy to dead node if it misses
* enough heartbeats.
*
* @throws IOException
* @throws InterruptedException
* @throws TimeoutException
*/
@Test
public void testScmDetectDeadNode() throws IOException,
InterruptedException, TimeoutException {
final int interval = 100;
final int nodeCount = 10;
OzoneConfiguration conf = getConf();
conf.setInt(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL_MS, interval);
conf.setInt(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1);
conf.setInt(OZONE_SCM_STALENODE_INTERVAL_MS, 3 * 1000);
conf.setInt(OZONE_SCM_DEADNODE_INTERVAL_MS, 6 * 1000);
try (SCMNodeManager nodeManager = createNodeManager(conf)) {
List<DatanodeID> nodeList = createNodeSet(nodeManager, nodeCount,
"Node");
DatanodeID deadNode = SCMTestUtils.getDatanodeID(nodeManager);
// Heartbeat once
nodeManager.sendHeartbeat(deadNode, null);
// Heartbeat all other nodes.
for (DatanodeID dn : nodeList) {
nodeManager.sendHeartbeat(dn, null);
}
// Wait for 2 seconds .. and heartbeat good nodes again.
Thread.sleep(2 * 1000);
for (DatanodeID dn : nodeList) {
nodeManager.sendHeartbeat(dn, null);
}
Thread.sleep(3 * 1000);
// heartbeat good nodes again. // heartbeat good nodes again.
for (DatanodeID dn : nodeList) { for (DatanodeID dn : nodeList) {
@ -471,13 +432,21 @@ public class TestNodeManager {
// 7 seconds to make sure that the node moves into dead state. // 7 seconds to make sure that the node moves into dead state.
Thread.sleep(2 * 1000); Thread.sleep(2 * 1000);
// By now the node that stopped heartbeating should have moved from STALE
// to DEAD, so no node may still be reported as stale.
staleNodeList = nodeManager.getNodes(NodeManager
    .NODESTATE.STALE);
// The failure messages state the expected count (0) so a failing run does
// not report a number that contradicts the assertion.
assertEquals("Expected to find 0 stale nodes",
    0, nodeManager.getNodeCount(STALE));
assertEquals("Expected to find 0 stale nodes",
    0, staleNodeList.size());
// Check for the dead node now. // Check for the dead node now.
List<DatanodeID> deadNodeList = nodeManager.getNodes(DEAD); List<DatanodeID> deadNodeList = nodeManager.getNodes(DEAD);
assertEquals("Expected to find 1 dead node", 1, assertEquals("Expected to find 1 dead node", 1,
nodeManager.getNodeCount(DEAD)); nodeManager.getNodeCount(DEAD));
assertEquals("Expected to find 1 dead node", assertEquals("Expected to find 1 dead node",
1, deadNodeList.size()); 1, deadNodeList.size());
assertEquals("Dead node is not the expected ID", deadNode assertEquals("Dead node is not the expected ID", staleNode
.getDatanodeUuid(), deadNodeList.get(0).getDatanodeUuid()); .getDatanodeUuid(), deadNodeList.get(0).getDatanodeUuid());
} }
} }
@ -556,7 +525,7 @@ public class TestNodeManager {
OzoneConfiguration conf = getConf(); OzoneConfiguration conf = getConf();
conf.setInt(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL_MS, 100); conf.setInt(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL_MS, 100);
conf.setInt(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1); conf.setTimeDuration(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1, SECONDS);
conf.setInt(OZONE_SCM_STALENODE_INTERVAL_MS, 3 * 1000); conf.setInt(OZONE_SCM_STALENODE_INTERVAL_MS, 3 * 1000);
conf.setInt(OZONE_SCM_DEADNODE_INTERVAL_MS, 6 * 1000); conf.setInt(OZONE_SCM_DEADNODE_INTERVAL_MS, 6 * 1000);
@ -733,7 +702,7 @@ public class TestNodeManager {
OzoneConfiguration conf = getConf(); OzoneConfiguration conf = getConf();
conf.setInt(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL_MS, 100); conf.setInt(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL_MS, 100);
conf.setInt(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1); conf.setTimeDuration(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1, SECONDS);
conf.setInt(OZONE_SCM_STALENODE_INTERVAL_MS, 3 * 1000); conf.setInt(OZONE_SCM_STALENODE_INTERVAL_MS, 3 * 1000);
conf.setInt(OZONE_SCM_DEADNODE_INTERVAL_MS, 6 * 1000); conf.setInt(OZONE_SCM_DEADNODE_INTERVAL_MS, 6 * 1000);
conf.setInt(OZONE_SCM_MAX_HB_COUNT_TO_PROCESS, 7000); conf.setInt(OZONE_SCM_MAX_HB_COUNT_TO_PROCESS, 7000);
@ -822,7 +791,7 @@ public class TestNodeManager {
final int staleCount = 3000; final int staleCount = 3000;
OzoneConfiguration conf = getConf(); OzoneConfiguration conf = getConf();
conf.setInt(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL_MS, 100); conf.setInt(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL_MS, 100);
conf.setInt(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1); conf.setTimeDuration(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1, SECONDS);
conf.setInt(OZONE_SCM_STALENODE_INTERVAL_MS, 3 * 1000); conf.setInt(OZONE_SCM_STALENODE_INTERVAL_MS, 3 * 1000);
conf.setInt(OZONE_SCM_DEADNODE_INTERVAL_MS, 6 * 1000); conf.setInt(OZONE_SCM_DEADNODE_INTERVAL_MS, 6 * 1000);
@ -873,16 +842,17 @@ public class TestNodeManager {
* lead to many nodes becoming stale or dead due to the fact that SCM is not * lead to many nodes becoming stale or dead due to the fact that SCM is not
* able to keep up with heartbeat processing. This test just verifies that SCM * able to keep up with heartbeat processing. This test just verifies that SCM
* will log that information. * will log that information.
* @throws TimeoutException
*/ */
@Test @Test
public void testScmLogsHeartbeatFlooding() throws IOException, public void testScmLogsHeartbeatFlooding() throws IOException,
InterruptedException { InterruptedException, TimeoutException {
final int healthyCount = 3000; final int healthyCount = 3000;
// Make the HB process thread run slower. // Make the HB process thread run slower.
OzoneConfiguration conf = getConf(); OzoneConfiguration conf = getConf();
conf.setInt(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL_MS, 500); conf.setInt(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL_MS, 500);
conf.setInt(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1); conf.setTimeDuration(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1, SECONDS);
conf.setInt(OZONE_SCM_MAX_HB_COUNT_TO_PROCESS, 500); conf.setInt(OZONE_SCM_MAX_HB_COUNT_TO_PROCESS, 500);
try (SCMNodeManager nodeManager = createNodeManager(conf)) { try (SCMNodeManager nodeManager = createNodeManager(conf)) {
@ -901,15 +871,14 @@ public class TestNodeManager {
thread1.setDaemon(true); thread1.setDaemon(true);
thread1.start(); thread1.start();
Thread.sleep(6 * 1000); GenericTestUtils.waitFor(() -> logCapturer.getOutput()
.contains("SCM is being "
+ "flooded by heartbeats. Not able to keep up"
+ " with the heartbeat counts."),
500, 20 * 1000);
thread1.interrupt(); thread1.interrupt();
logCapturer.stopCapturing(); logCapturer.stopCapturing();
assertThat(logCapturer.getOutput(), containsString("SCM is being " +
"flooded by heartbeats. Not able to keep up with the heartbeat " +
"counts."));
} }
} }
@ -1025,7 +994,7 @@ public class TestNodeManager {
final int interval = 100; final int interval = 100;
conf.setInt(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL_MS, interval); conf.setInt(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL_MS, interval);
conf.setInt(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1); conf.setTimeDuration(OZONE_SCM_HEARTBEAT_INTERVAL_SECONDS, 1, SECONDS);
conf.setInt(OZONE_SCM_STALENODE_INTERVAL_MS, 3 * 1000); conf.setInt(OZONE_SCM_STALENODE_INTERVAL_MS, 3 * 1000);
conf.setInt(OZONE_SCM_DEADNODE_INTERVAL_MS, 6 * 1000); conf.setInt(OZONE_SCM_DEADNODE_INTERVAL_MS, 6 * 1000);