HDDS-1758. Add replication and key deletion tests to MiniOzoneChaosCluster. Contributed by Mukul Kumar Singh. (#1049)

This commit is contained in:
Mukul Kumar Singh 2019-07-05 17:04:40 +05:30 committed by GitHub
parent 928edb2c47
commit c9c3429a16
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 58 additions and 9 deletions

View File

@ -22,6 +22,7 @@ import org.apache.commons.lang3.RandomUtils;
import org.apache.hadoop.conf.StorageUnit; import org.apache.hadoop.conf.StorageUnit;
import org.apache.hadoop.hdds.HddsConfigKeys; import org.apache.hadoop.hdds.HddsConfigKeys;
import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.scm.ScmConfigKeys; import org.apache.hadoop.hdds.scm.ScmConfigKeys;
import org.apache.hadoop.hdds.scm.server.StorageContainerManager; import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
@ -55,7 +56,8 @@ public class MiniOzoneChaosCluster extends MiniOzoneClusterImpl {
// Parameterized with a wildcard instead of the raw type so that uses of this
// handle do not produce unchecked/rawtype warnings.
// NOTE(review): presumably the handle of the periodic failure task - confirm
// against the code that assigns it.
private ScheduledFuture<?> scheduledFuture;
private enum FailureMode { private enum FailureMode {
NODES NODES_RESTART,
NODES_SHUTDOWN
} }
public MiniOzoneChaosCluster(OzoneConfiguration conf, public MiniOzoneChaosCluster(OzoneConfiguration conf,
@ -81,21 +83,55 @@ public class MiniOzoneChaosCluster extends MiniOzoneClusterImpl {
return RandomUtils.nextBoolean(); return RandomUtils.nextBoolean();
} }
// Coin flip used by shutdownNodes(): true means the selected node is shut
// down, false means it is (re)started.
private boolean shouldStop() {
  final boolean stop = RandomUtils.nextBoolean();
  return stop;
}
// Get the datanode index of the datanode to fail. // Get the datanode index of the datanode to fail.
private int getNodeToFail() { private int getNodeToFail() {
return RandomUtils.nextInt() % numDatanodes; return RandomUtils.nextInt() % numDatanodes;
} }
private void failNodes() { private void restartNodes() {
final int numNodesToFail = getNumberOfNodesToFail(); final int numNodesToFail = getNumberOfNodesToFail();
LOG.info("Will restart {} nodes to simulate failure", numNodesToFail); LOG.info("Will restart {} nodes to simulate failure", numNodesToFail);
for (int i = 0; i < numNodesToFail; i++) { for (int i = 0; i < numNodesToFail; i++) {
boolean failureMode = isFastRestart(); boolean failureMode = isFastRestart();
int failedNodeIndex = getNodeToFail(); int failedNodeIndex = getNodeToFail();
String failString = failureMode ? "Fast" : "Slow";
DatanodeDetails dn =
getHddsDatanodes().get(failedNodeIndex).getDatanodeDetails();
try { try {
LOG.info("Restarting DataNodeIndex {}", failedNodeIndex); LOG.info("{} Restarting DataNode: {}", failString, dn.getUuid());
restartHddsDatanode(failedNodeIndex, failureMode); restartHddsDatanode(failedNodeIndex, failureMode);
LOG.info("Completed restarting DataNodeIndex {}", failedNodeIndex); LOG.info("{} Completed restarting Datanode: {}", failString,
dn.getUuid());
} catch (Exception e) {
}
}
}
private void shutdownNodes() {
final int numNodesToFail = getNumberOfNodesToFail();
LOG.info("Will shutdown {} nodes to simulate failure", numNodesToFail);
for (int i = 0; i < numNodesToFail; i++) {
boolean shouldStop = shouldStop();
int failedNodeIndex = getNodeToFail();
String stopString = shouldStop ? "Stopping" : "Starting";
DatanodeDetails dn =
getHddsDatanodes().get(failedNodeIndex).getDatanodeDetails();
try {
LOG.info("{} DataNode {}", stopString, dn.getUuid());
if (shouldStop) {
shutdownHddsDatanode(failedNodeIndex);
} else {
restartHddsDatanode(failedNodeIndex, true);
}
LOG.info("Completed {} DataNode {}", stopString, dn.getUuid());
} catch (Exception e) { } catch (Exception e) {
} }
@ -111,8 +147,11 @@ public class MiniOzoneChaosCluster extends MiniOzoneClusterImpl {
private void fail() { private void fail() {
FailureMode mode = getFailureMode(); FailureMode mode = getFailureMode();
switch (mode) { switch (mode) {
case NODES: case NODES_RESTART:
failNodes(); restartNodes();
break;
case NODES_SHUTDOWN:
shutdownNodes();
break; break;
default: default:
@ -190,7 +229,9 @@ public class MiniOzoneChaosCluster extends MiniOzoneClusterImpl {
1, StorageUnit.MB); 1, StorageUnit.MB);
conf.setTimeDuration(ScmConfigKeys.HDDS_SCM_WATCHER_TIMEOUT, 1000, conf.setTimeDuration(ScmConfigKeys.HDDS_SCM_WATCHER_TIMEOUT, 1000,
TimeUnit.MILLISECONDS); TimeUnit.MILLISECONDS);
conf.setTimeDuration(ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL, 5, conf.setTimeDuration(ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL, 10,
TimeUnit.SECONDS);
conf.setTimeDuration(ScmConfigKeys.OZONE_SCM_DEADNODE_INTERVAL, 20,
TimeUnit.SECONDS); TimeUnit.SECONDS);
conf.setTimeDuration(HddsConfigKeys.HDDS_CONTAINER_REPORT_INTERVAL, 1, conf.setTimeDuration(HddsConfigKeys.HDDS_CONTAINER_REPORT_INTERVAL, 1,
TimeUnit.SECONDS); TimeUnit.SECONDS);
@ -204,6 +245,8 @@ public class MiniOzoneChaosCluster extends MiniOzoneClusterImpl {
conf.setTimeDuration(HddsConfigKeys.HDDS_HEARTBEAT_INTERVAL, 1, conf.setTimeDuration(HddsConfigKeys.HDDS_HEARTBEAT_INTERVAL, 1,
TimeUnit.SECONDS); TimeUnit.SECONDS);
conf.setInt(OzoneConfigKeys.OZONE_CONTAINER_CACHE_SIZE, 8); conf.setInt(OzoneConfigKeys.OZONE_CONTAINER_CACHE_SIZE, 8);
conf.setInt("hdds.scm.replication.thread.interval", 10 * 1000);
conf.setInt("hdds.scm.replication.event.timeout", 20 * 1000);
} }
@Override @Override

View File

@ -130,6 +130,11 @@ public class MiniOzoneLoadGenerator {
break; break;
} }
try {
bucket.deleteKey(keyName);
} catch (Exception e) {
LOG.error("LOADGEN: Unable to delete key:{}", keyName, e);
}
} }
// This will terminate other threads too. // This will terminate other threads too.
isWriteThreadRunning.set(false); isWriteThreadRunning.set(false);

View File

@ -62,7 +62,7 @@ public class TestMiniChaosOzoneCluster implements Runnable {
@Option(names = {"-i", "--failureInterval"}, @Option(names = {"-i", "--failureInterval"},
description = "time between failure events in seconds") description = "time between failure events in seconds")
private static int failureInterval = 5; // 5 second period between failures. private static int failureInterval = 300; // 300 second (5 minute) period between failures.
private static MiniOzoneChaosCluster cluster; private static MiniOzoneChaosCluster cluster;
private static MiniOzoneLoadGenerator loadGenerator; private static MiniOzoneLoadGenerator loadGenerator;

View File

@ -711,7 +711,8 @@ public class KeyManagerImpl implements KeyManager {
k.setPipeline(cp.getPipeline()); k.setPipeline(cp.getPipeline());
} }
} catch (IOException e) { } catch (IOException e) {
LOG.debug("Unable to update pipeline for container"); LOG.error("Unable to update pipeline for container:{}",
k.getContainerID());
} }
} }
}); });