From e0820acc45c24cf0c7bd8dabe36d9a9c72b35483 Mon Sep 17 00:00:00 2001 From: Chris Hostetter Date: Wed, 22 Jan 2020 14:44:56 -0700 Subject: [PATCH] SOLR-14159: Eliminate some 'spin loops' in tests that may be contributing factors to odd test failures (cherry picked from commit 6b3e7feba19d2314d8c38205dbf1ab1fe2607096) --- .../solr/cloud/TestCloudConsistency.java | 49 ++++++++++++------- .../solr/cloud/TestTlogReplayVsRecovery.java | 24 +++++---- 2 files changed, 47 insertions(+), 26 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/cloud/TestCloudConsistency.java b/solr/core/src/test/org/apache/solr/cloud/TestCloudConsistency.java index c79ae545447..9d04f99c629 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestCloudConsistency.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestCloudConsistency.java @@ -25,6 +25,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import org.apache.solr.JSONTestUtil; import org.apache.solr.client.solrj.SolrServerException; @@ -37,8 +38,6 @@ import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.ZkCoreNodeProps; import org.apache.solr.common.util.NamedList; -import org.apache.solr.common.util.TimeSource; -import org.apache.solr.util.TimeOut; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -168,14 +167,22 @@ public class TestCloudConsistency extends SolrCloudTestCase { cluster.waitForNode(j1, 30); cluster.waitForNode(j2, 30); - - TimeOut timeOut = new TimeOut(10, TimeUnit.SECONDS, TimeSource.NANO_TIME); - while (!timeOut.hasTimedOut()) { - Replica newLeader = getCollectionState(collection).getSlice("shard1").getLeader(); - if (newLeader != null && !newLeader.getName().equals(leader.getName()) && newLeader.getState() == Replica.State.ACTIVE) { - fail("Out of sync replica became leader " + newLeader); - } - 
} + + // the meat of the test -- wait to see if a different replica becomes a leader + // the correct behavior is that this should time out, if it succeeds we have a problem... + expectThrows(TimeoutException.class, + "Did not time out waiting for new leader, out of sync replica became leader", + () -> { + cluster.getSolrClient().waitForState(collection, 10, TimeUnit.SECONDS, (state) -> { + Replica newLeader = state.getSlice("shard1").getLeader(); + if (newLeader != null && !newLeader.getName().equals(leader.getName()) && newLeader.getState() == Replica.State.ACTIVE) { + // this is the bad case, our "bad" state was found before timeout + log.error("WTF: New Leader={}", newLeader); + return true; + } + return false; // still no bad state, wait for timeout + }); + }); JettySolrRunner j0 = cluster.getJettySolrRunner(0); j0.start(); @@ -211,13 +218,21 @@ public class TestCloudConsistency extends SolrCloudTestCase { waitForState("Timeout waiting for leader goes DOWN", collection, (liveNodes, collectionState) -> collectionState.getReplica(leader.getName()).getState() == Replica.State.DOWN); - TimeOut timeOut = new TimeOut(10, TimeUnit.SECONDS, TimeSource.NANO_TIME); - while (!timeOut.hasTimedOut()) { - Replica newLeader = getCollectionState(collection).getLeader("shard1"); - if (newLeader != null && !newLeader.getName().equals(leader.getName()) && newLeader.getState() == Replica.State.ACTIVE) { - fail("Out of sync replica became leader " + newLeader); - } - } + // the meat of the test -- wait to see if a different replica becomes a leader + // the correct behavior is that this should time out, if it succeeds we have a problem... 
+ expectThrows(TimeoutException.class, + "Did not time out waiting for new leader, out of sync replica became leader", + () -> { + cluster.getSolrClient().waitForState(collection, 10, TimeUnit.SECONDS, (state) -> { + Replica newLeader = state.getSlice("shard1").getLeader(); + if (newLeader != null && !newLeader.getName().equals(leader.getName()) && newLeader.getState() == Replica.State.ACTIVE) { + // this is the bad case, our "bad" state was found before timeout + log.error("WTF: New Leader={}", newLeader); + return true; + } + return false; // still no bad state, wait for timeout + }); + }); proxies.get(cluster.getJettySolrRunner(0)).reopen(); cluster.getJettySolrRunner(0).start(); diff --git a/solr/core/src/test/org/apache/solr/cloud/TestTlogReplayVsRecovery.java b/solr/core/src/test/org/apache/solr/cloud/TestTlogReplayVsRecovery.java index 09d38854cdf..d1c4d226962 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestTlogReplayVsRecovery.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestTlogReplayVsRecovery.java @@ -25,6 +25,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import org.apache.lucene.util.LuceneTestCase.AwaitsFix; @@ -39,9 +40,7 @@ import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.client.solrj.response.RequestStatusState; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.cloud.Replica; -import org.apache.solr.common.util.TimeSource; import org.apache.solr.util.TestInjection; -import org.apache.solr.util.TimeOut; import org.junit.After; import org.junit.Before; import org.slf4j.Logger; @@ -172,13 +171,20 @@ public class TestTlogReplayVsRecovery extends SolrCloudTestCase { waitForState("Timeout waiting for leader goes DOWN", COLLECTION, (liveNodes, collectionState) -> collectionState.getReplica(leader.getName()).getState() == Replica.State.DOWN); - TimeOut timeOut = 
new TimeOut(10, TimeUnit.SECONDS, TimeSource.NANO_TIME); - while (!timeOut.hasTimedOut()) { - Replica newLeader = getCollectionState(COLLECTION).getLeader("shard1"); - if (newLeader != null && !newLeader.getName().equals(leader.getName()) && newLeader.getState() == Replica.State.ACTIVE) { - fail("Out of sync replica became leader " + newLeader); - } - } + // Sanity check that a new (out of sync) replica doesn't come up in our place... + expectThrows(TimeoutException.class, + "Did not time out waiting for new leader, out of sync replica became leader", + () -> { + cluster.getSolrClient().waitForState(COLLECTION, 10, TimeUnit.SECONDS, (state) -> { + Replica newLeader = state.getSlice("shard1").getLeader(); + if (newLeader != null && !newLeader.getName().equals(leader.getName()) && newLeader.getState() == Replica.State.ACTIVE) { + // this is the bad case, our "bad" state was found before timeout + log.error("WTF: New Leader={}", newLeader); + return true; + } + return false; // still no bad state, wait for timeout + }); + }); log.info("Enabling TestInjection.updateLogReplayRandomPause"); TestInjection.updateLogReplayRandomPause = "true:100";