From f89f109ec1d1b9a19034d17345d41d3dd2c34852 Mon Sep 17 00:00:00 2001 From: Jason Gerlowski Date: Fri, 7 Dec 2018 11:07:42 -0500 Subject: [PATCH] SOLR-13045: Harden TestSimPolicyCloud This commit fixes a race condition in SimClusterStateProvider, fixing several fails in TestSimPolicyCloud. --- .../autoscaling/sim/SimCloudManager.java | 2 +- .../sim/SimClusterStateProvider.java | 139 +++++++++--------- .../sim/TestSimExtremeIndexing.java | 2 - .../autoscaling/sim/TestSimPolicyCloud.java | 6 +- 4 files changed, 75 insertions(+), 74 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java index dad946368bf..fd622c13f86 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java @@ -838,7 +838,7 @@ public class SimCloudManager implements SolrCloudManager { results.add("success", ""); break; case ADDROLE: - nodeStateProvider.simAddNodeValue(req.getParams().get("node"), "nodeRole", req.getParams().get("role")); + nodeStateProvider.simSetNodeValue(req.getParams().get("node"), "nodeRole", req.getParams().get("role")); break; case CREATESHARD: try { diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java index d81d92c75ed..6197c41ac0f 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java @@ -691,79 +691,84 @@ public class SimClusterStateProvider implements ClusterStateProvider { private void simRunLeaderElection(String collection, Slice s, boolean saveState) throws Exception { AtomicBoolean stateChanged = new AtomicBoolean(Boolean.FALSE); - Replica leader = s.getLeader(); - if (leader == null || !liveNodes.contains(leader.getNodeName())) { - log.trace("Running leader election for {} / {}", collection, s.getName()); - if (s.getReplicas().isEmpty()) { // no replicas - punt - log.trace("-- no replicas in {} / {}", collection, s.getName()); - return; - } - ActionThrottle lt = getThrottle(collection, s.getName()); - synchronized (lt) { - // collect all active and live - List active = new ArrayList<>(); - AtomicBoolean alreadyHasLeader = new AtomicBoolean(false); - s.getReplicas().forEach(r -> { - // find our ReplicaInfo for this replica - ReplicaInfo ri = getReplicaInfo(r); - if (ri == null) { - throw new IllegalStateException("-- could not find ReplicaInfo for replica " + r); - } - synchronized (ri) { - if (r.isActive(liveNodes.get())) { - if (ri.getVariables().get(ZkStateReader.LEADER_PROP) != null) { - log.trace("-- found existing leader {} / {}: {}, {}", collection, s.getName(), ri, r); - alreadyHasLeader.set(true); - return; - } else { - active.add(ri); - } - } else { // if it's on a node that is not live mark it down - log.trace("-- replica not active on live nodes: {}, {}", liveNodes.get(), r); - if (!liveNodes.contains(r.getNodeName())) { - ri.getVariables().put(ZkStateReader.STATE_PROP, Replica.State.DOWN.toString()); - ri.getVariables().remove(ZkStateReader.LEADER_PROP); - stateChanged.set(true); + lock.lockInterruptibly(); + try { + Replica leader = s.getLeader(); + if (leader == null || !liveNodes.contains(leader.getNodeName())) { + log.trace("Running leader election for {} / {}", collection, s.getName()); + if (s.getReplicas().isEmpty()) { // no replicas - punt + log.trace("-- no replicas in {} / {}", collection, s.getName()); + return; + } + ActionThrottle lt = getThrottle(collection, s.getName()); + synchronized (lt) { + // collect all active and live + List active = new ArrayList<>(); + AtomicBoolean alreadyHasLeader = new AtomicBoolean(false); + s.getReplicas().forEach(r -> { + // find our ReplicaInfo for this replica + ReplicaInfo ri = getReplicaInfo(r); + if (ri == null) { + throw new IllegalStateException("-- could not find ReplicaInfo for replica " + r); + } + synchronized (ri) { + if (r.isActive(liveNodes.get())) { + if (ri.getVariables().get(ZkStateReader.LEADER_PROP) != null) { + log.trace("-- found existing leader {} / {}: {}, {}", collection, s.getName(), ri, r); + alreadyHasLeader.set(true); + return; + } else { + active.add(ri); + } + } else { // if it's on a node that is not live mark it down + log.trace("-- replica not active on live nodes: {}, {}", liveNodes.get(), r); + if (!liveNodes.contains(r.getNodeName())) { + ri.getVariables().put(ZkStateReader.STATE_PROP, Replica.State.DOWN.toString()); + ri.getVariables().remove(ZkStateReader.LEADER_PROP); + stateChanged.set(true); + } } } + }); + if (alreadyHasLeader.get()) { + log.trace("-- already has leader {} / {}: {}", collection, s.getName(), s); + return; } - }); - if (alreadyHasLeader.get()) { - log.trace("-- already has leader {} / {}: {}", collection, s.getName(), s); - return; - } - if (active.isEmpty()) { - log.warn("Can't find any active replicas for {} / {}: {}", collection, s.getName(), s); - log.debug("-- liveNodes: {}", liveNodes.get()); - return; - } - // pick first active one - ReplicaInfo ri = null; - for (ReplicaInfo a : active) { - if (!a.getType().equals(Replica.Type.PULL)) { - ri = a; - break; + if (active.isEmpty()) { + log.warn("Can't find any active replicas for {} / {}: {}", collection, s.getName(), s); + log.debug("-- liveNodes: {}", liveNodes.get()); + return; } + // pick first active one + ReplicaInfo ri = null; + for (ReplicaInfo a : active) { + if (!a.getType().equals(Replica.Type.PULL)) { + ri = a; + break; + } + } + if (ri == null) { + log.warn("-- can't find any suitable replica type for {} / {}: {}", collection, s.getName(), s); + return; + } + // now mark the leader election throttle + lt.minimumWaitBetweenActions(); + lt.markAttemptingAction(); + synchronized (ri) { + ri.getVariables().put(ZkStateReader.LEADER_PROP, "true"); + } + log.debug("-- elected new leader for {} / {} (currentVersion={}): {}", collection, + s.getName(), clusterStateVersion, ri); + stateChanged.set(true); } - if (ri == null) { - log.warn("-- can't find any suitable replica type for {} / {}: {}", collection, s.getName(), s); - return; - } - // now mark the leader election throttle - lt.minimumWaitBetweenActions(); - lt.markAttemptingAction(); - synchronized (ri) { - ri.getVariables().put(ZkStateReader.LEADER_PROP, "true"); - } - log.debug("-- elected new leader for {} / {} (currentVersion={}): {}", collection, - s.getName(), clusterStateVersion, ri); - stateChanged.set(true); + } else { + log.trace("-- already has leader for {} / {}", collection, s.getName()); } - } else { - log.trace("-- already has leader for {} / {}", collection, s.getName()); - } - if (stateChanged.get() || saveState) { - collectionsStatesRef.set(null); + } finally { + if (stateChanged.get() || saveState) { + collectionsStatesRef.set(null); + } + lock.unlock(); } } diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestSimExtremeIndexing.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestSimExtremeIndexing.java index 6c5f9e6c128..15d676b523b 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestSimExtremeIndexing.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestSimExtremeIndexing.java @@ -23,7 +23,6 @@ import java.util.concurrent.TimeUnit; import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite; -import org.apache.lucene.util.LuceneTestCase.AwaitsFix; import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.SolrRequest; import org.apache.solr.client.solrj.request.CollectionAdminRequest; @@ -52,7 +51,6 @@ import static org.apache.solr.cloud.autoscaling.AutoScalingHandlerTest.createAut @TimeoutSuite(millis = 48 * 3600 * 1000) @LogLevel("org.apache.solr.cloud.autoscaling=DEBUG;org.apache.solr.cloud.autoscaling.NodeLostTrigger=INFO;org.apache.client.solrj.cloud.autoscaling=DEBUG;org.apache.solr.cloud.autoscaling.ComputePlanAction=INFO;org.apache.solr.cloud.autoscaling.ExecutePlanAction=DEBUG;org.apache.solr.cloud.autoscaling.ScheduledTriggers=DEBUG") //@LogLevel("org.apache.solr.cloud.autoscaling=DEBUG;org.apache.solr.cloud.autoscaling.NodeLostTrigger=INFO;org.apache.client.solrj.cloud.autoscaling=DEBUG;org.apache.solr.cloud.CloudTestUtils=TRACE") -@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // this test can fail to elect a leader, seems to be common among sim tests public class TestSimExtremeIndexing extends SimSolrCloudTestCase { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestSimPolicyCloud.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestSimPolicyCloud.java index e70cefbb6b0..eb251010db0 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestSimPolicyCloud.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestSimPolicyCloud.java @@ -108,7 +108,6 @@ public class TestSimPolicyCloud extends SimSolrCloudTestCase { } - @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/SOLR-12028") public void testCreateCollectionAddReplica() throws Exception { SolrClient solrClient = cluster.simGetSolrClient(); String nodeId = cluster.getSimClusterStateProvider().simGetRandomNode(); @@ -134,8 +133,7 @@ public class TestSimPolicyCloud extends SimSolrCloudTestCase { getCollectionState(collectionName).forEachReplica((s, replica) -> assertEquals(nodeId, replica.getNodeName())); } - - @AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") + public void testCreateCollectionSplitShard() throws Exception { SolrClient solrClient = cluster.simGetSolrClient(); String firstNode = cluster.getSimClusterStateProvider().simGetRandomNode(); @@ -294,7 +292,7 @@ public class TestSimPolicyCloud extends SimSolrCloudTestCase { assertEquals(3, coll.getSlice("s3").getReplicas().size()); coll.forEachReplica(verifyReplicas); } - @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 14-Oct-2018 + public void testCreateCollectionAddShardUsingPolicy() throws Exception { SolrClient solrClient = cluster.simGetSolrClient(); String nodeId = cluster.getSimClusterStateProvider().simGetRandomNode();