mirror of https://github.com/apache/lucene.git
SOLR-13045: Harden TestSimPolicyCloud
This commit fixes a race condition in SimClusterStateProvider, fixing several fails in TestSimPolicyCloud.
This commit is contained in:
parent
af6e15e838
commit
f89f109ec1
|
@ -838,7 +838,7 @@ public class SimCloudManager implements SolrCloudManager {
|
|||
results.add("success", "");
|
||||
break;
|
||||
case ADDROLE:
|
||||
nodeStateProvider.simAddNodeValue(req.getParams().get("node"), "nodeRole", req.getParams().get("role"));
|
||||
nodeStateProvider.simSetNodeValue(req.getParams().get("node"), "nodeRole", req.getParams().get("role"));
|
||||
break;
|
||||
case CREATESHARD:
|
||||
try {
|
||||
|
|
|
@ -691,79 +691,84 @@ public class SimClusterStateProvider implements ClusterStateProvider {
|
|||
|
||||
private void simRunLeaderElection(String collection, Slice s, boolean saveState) throws Exception {
|
||||
AtomicBoolean stateChanged = new AtomicBoolean(Boolean.FALSE);
|
||||
Replica leader = s.getLeader();
|
||||
if (leader == null || !liveNodes.contains(leader.getNodeName())) {
|
||||
log.trace("Running leader election for {} / {}", collection, s.getName());
|
||||
if (s.getReplicas().isEmpty()) { // no replicas - punt
|
||||
log.trace("-- no replicas in {} / {}", collection, s.getName());
|
||||
return;
|
||||
}
|
||||
ActionThrottle lt = getThrottle(collection, s.getName());
|
||||
synchronized (lt) {
|
||||
// collect all active and live
|
||||
List<ReplicaInfo> active = new ArrayList<>();
|
||||
AtomicBoolean alreadyHasLeader = new AtomicBoolean(false);
|
||||
s.getReplicas().forEach(r -> {
|
||||
// find our ReplicaInfo for this replica
|
||||
ReplicaInfo ri = getReplicaInfo(r);
|
||||
if (ri == null) {
|
||||
throw new IllegalStateException("-- could not find ReplicaInfo for replica " + r);
|
||||
}
|
||||
synchronized (ri) {
|
||||
if (r.isActive(liveNodes.get())) {
|
||||
if (ri.getVariables().get(ZkStateReader.LEADER_PROP) != null) {
|
||||
log.trace("-- found existing leader {} / {}: {}, {}", collection, s.getName(), ri, r);
|
||||
alreadyHasLeader.set(true);
|
||||
return;
|
||||
} else {
|
||||
active.add(ri);
|
||||
}
|
||||
} else { // if it's on a node that is not live mark it down
|
||||
log.trace("-- replica not active on live nodes: {}, {}", liveNodes.get(), r);
|
||||
if (!liveNodes.contains(r.getNodeName())) {
|
||||
ri.getVariables().put(ZkStateReader.STATE_PROP, Replica.State.DOWN.toString());
|
||||
ri.getVariables().remove(ZkStateReader.LEADER_PROP);
|
||||
stateChanged.set(true);
|
||||
lock.lockInterruptibly();
|
||||
try {
|
||||
Replica leader = s.getLeader();
|
||||
if (leader == null || !liveNodes.contains(leader.getNodeName())) {
|
||||
log.trace("Running leader election for {} / {}", collection, s.getName());
|
||||
if (s.getReplicas().isEmpty()) { // no replicas - punt
|
||||
log.trace("-- no replicas in {} / {}", collection, s.getName());
|
||||
return;
|
||||
}
|
||||
ActionThrottle lt = getThrottle(collection, s.getName());
|
||||
synchronized (lt) {
|
||||
// collect all active and live
|
||||
List<ReplicaInfo> active = new ArrayList<>();
|
||||
AtomicBoolean alreadyHasLeader = new AtomicBoolean(false);
|
||||
s.getReplicas().forEach(r -> {
|
||||
// find our ReplicaInfo for this replica
|
||||
ReplicaInfo ri = getReplicaInfo(r);
|
||||
if (ri == null) {
|
||||
throw new IllegalStateException("-- could not find ReplicaInfo for replica " + r);
|
||||
}
|
||||
synchronized (ri) {
|
||||
if (r.isActive(liveNodes.get())) {
|
||||
if (ri.getVariables().get(ZkStateReader.LEADER_PROP) != null) {
|
||||
log.trace("-- found existing leader {} / {}: {}, {}", collection, s.getName(), ri, r);
|
||||
alreadyHasLeader.set(true);
|
||||
return;
|
||||
} else {
|
||||
active.add(ri);
|
||||
}
|
||||
} else { // if it's on a node that is not live mark it down
|
||||
log.trace("-- replica not active on live nodes: {}, {}", liveNodes.get(), r);
|
||||
if (!liveNodes.contains(r.getNodeName())) {
|
||||
ri.getVariables().put(ZkStateReader.STATE_PROP, Replica.State.DOWN.toString());
|
||||
ri.getVariables().remove(ZkStateReader.LEADER_PROP);
|
||||
stateChanged.set(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
if (alreadyHasLeader.get()) {
|
||||
log.trace("-- already has leader {} / {}: {}", collection, s.getName(), s);
|
||||
return;
|
||||
}
|
||||
});
|
||||
if (alreadyHasLeader.get()) {
|
||||
log.trace("-- already has leader {} / {}: {}", collection, s.getName(), s);
|
||||
return;
|
||||
}
|
||||
if (active.isEmpty()) {
|
||||
log.warn("Can't find any active replicas for {} / {}: {}", collection, s.getName(), s);
|
||||
log.debug("-- liveNodes: {}", liveNodes.get());
|
||||
return;
|
||||
}
|
||||
// pick first active one
|
||||
ReplicaInfo ri = null;
|
||||
for (ReplicaInfo a : active) {
|
||||
if (!a.getType().equals(Replica.Type.PULL)) {
|
||||
ri = a;
|
||||
break;
|
||||
if (active.isEmpty()) {
|
||||
log.warn("Can't find any active replicas for {} / {}: {}", collection, s.getName(), s);
|
||||
log.debug("-- liveNodes: {}", liveNodes.get());
|
||||
return;
|
||||
}
|
||||
// pick first active one
|
||||
ReplicaInfo ri = null;
|
||||
for (ReplicaInfo a : active) {
|
||||
if (!a.getType().equals(Replica.Type.PULL)) {
|
||||
ri = a;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (ri == null) {
|
||||
log.warn("-- can't find any suitable replica type for {} / {}: {}", collection, s.getName(), s);
|
||||
return;
|
||||
}
|
||||
// now mark the leader election throttle
|
||||
lt.minimumWaitBetweenActions();
|
||||
lt.markAttemptingAction();
|
||||
synchronized (ri) {
|
||||
ri.getVariables().put(ZkStateReader.LEADER_PROP, "true");
|
||||
}
|
||||
log.debug("-- elected new leader for {} / {} (currentVersion={}): {}", collection,
|
||||
s.getName(), clusterStateVersion, ri);
|
||||
stateChanged.set(true);
|
||||
}
|
||||
if (ri == null) {
|
||||
log.warn("-- can't find any suitable replica type for {} / {}: {}", collection, s.getName(), s);
|
||||
return;
|
||||
}
|
||||
// now mark the leader election throttle
|
||||
lt.minimumWaitBetweenActions();
|
||||
lt.markAttemptingAction();
|
||||
synchronized (ri) {
|
||||
ri.getVariables().put(ZkStateReader.LEADER_PROP, "true");
|
||||
}
|
||||
log.debug("-- elected new leader for {} / {} (currentVersion={}): {}", collection,
|
||||
s.getName(), clusterStateVersion, ri);
|
||||
stateChanged.set(true);
|
||||
} else {
|
||||
log.trace("-- already has leader for {} / {}", collection, s.getName());
|
||||
}
|
||||
} else {
|
||||
log.trace("-- already has leader for {} / {}", collection, s.getName());
|
||||
}
|
||||
if (stateChanged.get() || saveState) {
|
||||
collectionsStatesRef.set(null);
|
||||
} finally {
|
||||
if (stateChanged.get() || saveState) {
|
||||
collectionsStatesRef.set(null);
|
||||
}
|
||||
lock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -23,7 +23,6 @@ import java.util.concurrent.TimeUnit;
|
|||
|
||||
import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
|
||||
import org.apache.solr.client.solrj.SolrClient;
|
||||
import org.apache.solr.client.solrj.SolrRequest;
|
||||
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
|
||||
|
@ -52,7 +51,6 @@ import static org.apache.solr.cloud.autoscaling.AutoScalingHandlerTest.createAut
|
|||
@TimeoutSuite(millis = 48 * 3600 * 1000)
|
||||
@LogLevel("org.apache.solr.cloud.autoscaling=DEBUG;org.apache.solr.cloud.autoscaling.NodeLostTrigger=INFO;org.apache.client.solrj.cloud.autoscaling=DEBUG;org.apache.solr.cloud.autoscaling.ComputePlanAction=INFO;org.apache.solr.cloud.autoscaling.ExecutePlanAction=DEBUG;org.apache.solr.cloud.autoscaling.ScheduledTriggers=DEBUG")
|
||||
//@LogLevel("org.apache.solr.cloud.autoscaling=DEBUG;org.apache.solr.cloud.autoscaling.NodeLostTrigger=INFO;org.apache.client.solrj.cloud.autoscaling=DEBUG;org.apache.solr.cloud.CloudTestUtils=TRACE")
|
||||
@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // this test can fail to elect a leader, seems to be common among sim tests
|
||||
public class TestSimExtremeIndexing extends SimSolrCloudTestCase {
|
||||
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
|
||||
|
||||
|
|
|
@ -108,7 +108,6 @@ public class TestSimPolicyCloud extends SimSolrCloudTestCase {
|
|||
|
||||
}
|
||||
|
||||
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/SOLR-12028")
|
||||
public void testCreateCollectionAddReplica() throws Exception {
|
||||
SolrClient solrClient = cluster.simGetSolrClient();
|
||||
String nodeId = cluster.getSimClusterStateProvider().simGetRandomNode();
|
||||
|
@ -134,8 +133,7 @@ public class TestSimPolicyCloud extends SimSolrCloudTestCase {
|
|||
|
||||
getCollectionState(collectionName).forEachReplica((s, replica) -> assertEquals(nodeId, replica.getNodeName()));
|
||||
}
|
||||
|
||||
@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028")
|
||||
|
||||
public void testCreateCollectionSplitShard() throws Exception {
|
||||
SolrClient solrClient = cluster.simGetSolrClient();
|
||||
String firstNode = cluster.getSimClusterStateProvider().simGetRandomNode();
|
||||
|
@ -294,7 +292,7 @@ public class TestSimPolicyCloud extends SimSolrCloudTestCase {
|
|||
assertEquals(3, coll.getSlice("s3").getReplicas().size());
|
||||
coll.forEachReplica(verifyReplicas);
|
||||
}
|
||||
@BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 14-Oct-2018
|
||||
|
||||
public void testCreateCollectionAddShardUsingPolicy() throws Exception {
|
||||
SolrClient solrClient = cluster.simGetSolrClient();
|
||||
String nodeId = cluster.getSimClusterStateProvider().simGetRandomNode();
|
||||
|
|
Loading…
Reference in New Issue