SOLR-13045: Harden TestSimPolicyCloud

This commit fixes a race condition in SimClusterStateProvider, fixing
several fails in TestSimPolicyCloud.
This commit is contained in:
Jason Gerlowski 2018-12-07 11:07:42 -05:00
parent af6e15e838
commit f89f109ec1
4 changed files with 75 additions and 74 deletions

View File

@ -838,7 +838,7 @@ public class SimCloudManager implements SolrCloudManager {
results.add("success", "");
break;
case ADDROLE:
nodeStateProvider.simAddNodeValue(req.getParams().get("node"), "nodeRole", req.getParams().get("role"));
nodeStateProvider.simSetNodeValue(req.getParams().get("node"), "nodeRole", req.getParams().get("role"));
break;
case CREATESHARD:
try {

View File

@ -691,79 +691,84 @@ public class SimClusterStateProvider implements ClusterStateProvider {
private void simRunLeaderElection(String collection, Slice s, boolean saveState) throws Exception {
AtomicBoolean stateChanged = new AtomicBoolean(Boolean.FALSE);
Replica leader = s.getLeader();
if (leader == null || !liveNodes.contains(leader.getNodeName())) {
log.trace("Running leader election for {} / {}", collection, s.getName());
if (s.getReplicas().isEmpty()) { // no replicas - punt
log.trace("-- no replicas in {} / {}", collection, s.getName());
return;
}
ActionThrottle lt = getThrottle(collection, s.getName());
synchronized (lt) {
// collect all active and live
List<ReplicaInfo> active = new ArrayList<>();
AtomicBoolean alreadyHasLeader = new AtomicBoolean(false);
s.getReplicas().forEach(r -> {
// find our ReplicaInfo for this replica
ReplicaInfo ri = getReplicaInfo(r);
if (ri == null) {
throw new IllegalStateException("-- could not find ReplicaInfo for replica " + r);
}
synchronized (ri) {
if (r.isActive(liveNodes.get())) {
if (ri.getVariables().get(ZkStateReader.LEADER_PROP) != null) {
log.trace("-- found existing leader {} / {}: {}, {}", collection, s.getName(), ri, r);
alreadyHasLeader.set(true);
return;
} else {
active.add(ri);
}
} else { // if it's on a node that is not live mark it down
log.trace("-- replica not active on live nodes: {}, {}", liveNodes.get(), r);
if (!liveNodes.contains(r.getNodeName())) {
ri.getVariables().put(ZkStateReader.STATE_PROP, Replica.State.DOWN.toString());
ri.getVariables().remove(ZkStateReader.LEADER_PROP);
stateChanged.set(true);
lock.lockInterruptibly();
try {
Replica leader = s.getLeader();
if (leader == null || !liveNodes.contains(leader.getNodeName())) {
log.trace("Running leader election for {} / {}", collection, s.getName());
if (s.getReplicas().isEmpty()) { // no replicas - punt
log.trace("-- no replicas in {} / {}", collection, s.getName());
return;
}
ActionThrottle lt = getThrottle(collection, s.getName());
synchronized (lt) {
// collect all active and live
List<ReplicaInfo> active = new ArrayList<>();
AtomicBoolean alreadyHasLeader = new AtomicBoolean(false);
s.getReplicas().forEach(r -> {
// find our ReplicaInfo for this replica
ReplicaInfo ri = getReplicaInfo(r);
if (ri == null) {
throw new IllegalStateException("-- could not find ReplicaInfo for replica " + r);
}
synchronized (ri) {
if (r.isActive(liveNodes.get())) {
if (ri.getVariables().get(ZkStateReader.LEADER_PROP) != null) {
log.trace("-- found existing leader {} / {}: {}, {}", collection, s.getName(), ri, r);
alreadyHasLeader.set(true);
return;
} else {
active.add(ri);
}
} else { // if it's on a node that is not live mark it down
log.trace("-- replica not active on live nodes: {}, {}", liveNodes.get(), r);
if (!liveNodes.contains(r.getNodeName())) {
ri.getVariables().put(ZkStateReader.STATE_PROP, Replica.State.DOWN.toString());
ri.getVariables().remove(ZkStateReader.LEADER_PROP);
stateChanged.set(true);
}
}
}
});
if (alreadyHasLeader.get()) {
log.trace("-- already has leader {} / {}: {}", collection, s.getName(), s);
return;
}
});
if (alreadyHasLeader.get()) {
log.trace("-- already has leader {} / {}: {}", collection, s.getName(), s);
return;
}
if (active.isEmpty()) {
log.warn("Can't find any active replicas for {} / {}: {}", collection, s.getName(), s);
log.debug("-- liveNodes: {}", liveNodes.get());
return;
}
// pick first active one
ReplicaInfo ri = null;
for (ReplicaInfo a : active) {
if (!a.getType().equals(Replica.Type.PULL)) {
ri = a;
break;
if (active.isEmpty()) {
log.warn("Can't find any active replicas for {} / {}: {}", collection, s.getName(), s);
log.debug("-- liveNodes: {}", liveNodes.get());
return;
}
// pick first active one
ReplicaInfo ri = null;
for (ReplicaInfo a : active) {
if (!a.getType().equals(Replica.Type.PULL)) {
ri = a;
break;
}
}
if (ri == null) {
log.warn("-- can't find any suitable replica type for {} / {}: {}", collection, s.getName(), s);
return;
}
// now mark the leader election throttle
lt.minimumWaitBetweenActions();
lt.markAttemptingAction();
synchronized (ri) {
ri.getVariables().put(ZkStateReader.LEADER_PROP, "true");
}
log.debug("-- elected new leader for {} / {} (currentVersion={}): {}", collection,
s.getName(), clusterStateVersion, ri);
stateChanged.set(true);
}
if (ri == null) {
log.warn("-- can't find any suitable replica type for {} / {}: {}", collection, s.getName(), s);
return;
}
// now mark the leader election throttle
lt.minimumWaitBetweenActions();
lt.markAttemptingAction();
synchronized (ri) {
ri.getVariables().put(ZkStateReader.LEADER_PROP, "true");
}
log.debug("-- elected new leader for {} / {} (currentVersion={}): {}", collection,
s.getName(), clusterStateVersion, ri);
stateChanged.set(true);
} else {
log.trace("-- already has leader for {} / {}", collection, s.getName());
}
} else {
log.trace("-- already has leader for {} / {}", collection, s.getName());
}
if (stateChanged.get() || saveState) {
collectionsStatesRef.set(null);
} finally {
if (stateChanged.get() || saveState) {
collectionsStatesRef.set(null);
}
lock.unlock();
}
}

View File

@ -23,7 +23,6 @@ import java.util.concurrent.TimeUnit;
import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
@ -52,7 +51,6 @@ import static org.apache.solr.cloud.autoscaling.AutoScalingHandlerTest.createAut
@TimeoutSuite(millis = 48 * 3600 * 1000)
@LogLevel("org.apache.solr.cloud.autoscaling=DEBUG;org.apache.solr.cloud.autoscaling.NodeLostTrigger=INFO;org.apache.client.solrj.cloud.autoscaling=DEBUG;org.apache.solr.cloud.autoscaling.ComputePlanAction=INFO;org.apache.solr.cloud.autoscaling.ExecutePlanAction=DEBUG;org.apache.solr.cloud.autoscaling.ScheduledTriggers=DEBUG")
//@LogLevel("org.apache.solr.cloud.autoscaling=DEBUG;org.apache.solr.cloud.autoscaling.NodeLostTrigger=INFO;org.apache.client.solrj.cloud.autoscaling=DEBUG;org.apache.solr.cloud.CloudTestUtils=TRACE")
@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // this test can fail to elect a leader, seems to be common among sim tests
public class TestSimExtremeIndexing extends SimSolrCloudTestCase {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

View File

@ -108,7 +108,6 @@ public class TestSimPolicyCloud extends SimSolrCloudTestCase {
}
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/SOLR-12028")
public void testCreateCollectionAddReplica() throws Exception {
SolrClient solrClient = cluster.simGetSolrClient();
String nodeId = cluster.getSimClusterStateProvider().simGetRandomNode();
@ -134,8 +133,7 @@ public class TestSimPolicyCloud extends SimSolrCloudTestCase {
getCollectionState(collectionName).forEachReplica((s, replica) -> assertEquals(nodeId, replica.getNodeName()));
}
@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028")
public void testCreateCollectionSplitShard() throws Exception {
SolrClient solrClient = cluster.simGetSolrClient();
String firstNode = cluster.getSimClusterStateProvider().simGetRandomNode();
@ -294,7 +292,7 @@ public class TestSimPolicyCloud extends SimSolrCloudTestCase {
assertEquals(3, coll.getSlice("s3").getReplicas().size());
coll.forEachReplica(verifyReplicas);
}
@BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 14-Oct-2018
public void testCreateCollectionAddShardUsingPolicy() throws Exception {
SolrClient solrClient = cluster.simGetSolrClient();
String nodeId = cluster.getSimClusterStateProvider().simGetRandomNode();