tests: try harder to not kill the last replica in a shard

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1560557 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Robert Miller 2014-01-22 23:13:01 +00:00
parent d9e7f46ef9
commit df7c5e7772
1 changed files with 45 additions and 44 deletions

View File

@ -79,6 +79,8 @@ public class ChaosMonkey {
private boolean aggressivelyKillLeaders;
private Map<String,CloudJettyRunner> shardToLeaderJetty;
private volatile long startTime;
// Runners in this list are excluded when the replicas of a slice are counted
// to decide whether a kill is allowed: only runners NOT contained here count
// as still running.
// NOTE(review): presumably holds the jetties the monkey has already taken
// down - confirm against the code that adds to and removes from this list.
private List<CloudJettyRunner> deadPool = new ArrayList<CloudJettyRunner>();
private Thread monkeyThread;
@ -319,51 +321,9 @@ public class ChaosMonkey {
public CloudJettyRunner getRandomJetty(String slice, boolean aggressivelyKillLeaders) throws KeeperException, InterruptedException {
int numRunning = 0;
int numRecovering = 0;
int numActive = 0;
for (CloudJettyRunner cloudJetty : shardToJetty.get(slice)) {
boolean running = true;
// get latest cloud state
zkStateReader.updateClusterState(true);
Slice theShards = zkStateReader.getClusterState().getSlicesMap(collection)
.get(slice);
ZkNodeProps props = theShards.getReplicasMap().get(cloudJetty.coreNodeName);
if (props == null) {
throw new RuntimeException("shard name " + cloudJetty.coreNodeName + " not found in " + theShards.getReplicasMap().keySet());
}
String state = props.getStr(ZkStateReader.STATE_PROP);
String nodeName = props.getStr(ZkStateReader.NODE_NAME_PROP);
if (!cloudJetty.jetty.isRunning()
|| !state.equals(ZkStateReader.ACTIVE)
|| !zkStateReader.getClusterState().liveNodesContain(nodeName)) {
running = false;
}
if (cloudJetty.jetty.isRunning()
&& state.equals(ZkStateReader.RECOVERING)
&& zkStateReader.getClusterState().liveNodesContain(nodeName)) {
numRecovering++;
}
if (cloudJetty.jetty.isRunning()
&& state.equals(ZkStateReader.ACTIVE)
&& zkStateReader.getClusterState().liveNodesContain(nodeName)) {
numActive++;
}
if (running) {
numRunning++;
}
}
numActive = checkIfKillIsLegal(slice, numActive);
// TODO: stale state makes this a tough call
if (numActive < 2) {
@ -371,6 +331,21 @@ public class ChaosMonkey {
monkeyLog("only one active node in shard - monkey cannot kill :(");
return null;
}
// let's check the deadpool count
int numRunning = 0;
for (CloudJettyRunner cjetty : shardToJetty.get(slice)) {
if (!deadPool.contains(cjetty)) {
numRunning++;
}
}
if (numRunning < 2) {
// we cannot kill anyone
monkeyLog("only one active node in shard - monkey cannot kill :(");
return null;
}
Random random = LuceneTestCase.random();
int chance = random.nextInt(10);
CloudJettyRunner cjetty;
@ -439,6 +414,33 @@ public class ChaosMonkey {
return cjetty;
}
/**
 * Counts the replicas of {@code slice} that currently qualify as active and
 * adds that count to {@code numActive}.
 *
 * <p>A replica qualifies when its jetty reports that it is running, its state
 * property in the cluster state equals {@link ZkStateReader#ACTIVE}, and the
 * node it lives on is contained in the cluster state's live nodes.
 *
 * @param slice name of the slice whose runners are inspected
 * @param numActive starting value that the number of qualifying replicas is added to
 * @return {@code numActive} plus the number of qualifying replicas
 * @throws RuntimeException if a runner's core node name has no entry in the
 *         slice's replica map
 * @throws KeeperException declared for the cluster state refresh
 * @throws InterruptedException declared for the cluster state refresh
 */
private int checkIfKillIsLegal(String slice, int numActive)
    throws KeeperException, InterruptedException {
  int activeCount = numActive;

  for (CloudJettyRunner runner : shardToJetty.get(slice)) {
    // refresh the cloud state before looking at each replica
    zkStateReader.updateClusterState(true);

    Slice sliceState = zkStateReader.getClusterState().getSlicesMap(collection).get(slice);
    ZkNodeProps replicaProps = sliceState.getReplicasMap().get(runner.coreNodeName);
    if (replicaProps == null) {
      throw new RuntimeException("shard name " + runner.coreNodeName + " not found in " + sliceState.getReplicasMap().keySet());
    }

    String replicaState = replicaProps.getStr(ZkStateReader.STATE_PROP);
    String replicaNode = replicaProps.getStr(ZkStateReader.NODE_NAME_PROP);

    // The three checks run in this fixed order and stop at the first failure.
    if (!runner.jetty.isRunning()) {
      continue;
    }
    if (!replicaState.equals(ZkStateReader.ACTIVE)) {
      continue;
    }
    if (!zkStateReader.getClusterState().liveNodesContain(replicaNode)) {
      continue;
    }

    activeCount++;
  }

  return activeCount;
}
public SolrServer getRandomClient(String slice) throws KeeperException, InterruptedException {
// get latest cloud state
@ -473,7 +475,6 @@ public class ChaosMonkey {
stop = false;
monkeyThread = new Thread() {
private List<CloudJettyRunner> deadPool = new ArrayList<CloudJettyRunner>();
@Override
public void run() {