SOLR-13486: Text improvements

* force a hard commit of all docs in TestCloudConsistency to work around bug in that test

 * add new AwaitsFix'ed TestTlogReplayVsRecovery that more explicitly demonstrates the bug via TestInjection.updateLogReplayRandomPause
This commit is contained in:
Chris Hostetter 2020-01-03 15:12:23 -07:00
parent 8fba8eba13
commit 0fac7c1a26
2 changed files with 252 additions and 3 deletions

View File

@ -93,14 +93,11 @@ public class TestCloudConsistency extends SolrCloudTestCase {
}
@Test
//commented 2-Aug-2018 @LuceneTestCase.BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 12-Jun-2018
@BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 6-Sep-2018
public void testOutOfSyncReplicasCannotBecomeLeader() throws Exception {
testOutOfSyncReplicasCannotBecomeLeader(false);
}
@Test
// commented out on: 24-Dec-2018 @LuceneTestCase.BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 2-Aug-2018
public void testOutOfSyncReplicasCannotBecomeLeaderAfterRestart() throws Exception {
testOutOfSyncReplicasCannotBecomeLeader(true);
}
@ -247,6 +244,7 @@ public class TestCloudConsistency extends SolrCloudTestCase {
private void addDoc(String collection, int docId, JettySolrRunner solrRunner) throws IOException, SolrServerException {
try (HttpSolrClient solrClient = new HttpSolrClient.Builder(solrRunner.getBaseUrl().toString()).build()) {
solrClient.add(collection, new SolrInputDocument("id", String.valueOf(docId), "fieldName_s", String.valueOf(docId)));
solrClient.commit(collection);
}
}

View File

@ -0,0 +1,251 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.cloud;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
import org.apache.solr.JSONTestUtil;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.cloud.SocketProxy;
import org.apache.solr.client.solrj.embedded.JettySolrRunner;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.RequestStatusState;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.util.TimeSource;
import org.apache.solr.update.DirectUpdateHandler2;
import org.apache.solr.util.TestInjection;
import org.apache.solr.util.TimeOut;
import org.junit.After;
import org.junit.Before;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-13486")
public class TestTlogReplayVsRecovery extends SolrCloudTestCase {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private static final String COLLECTION = "collecion_with_slow_tlog_recovery";
private JettySolrRunner NODE0;
private JettySolrRunner NODE1;
private Map<JettySolrRunner, SocketProxy> proxies;
private Map<URI, JettySolrRunner> jettys;
@Before
public void setupCluster() throws Exception {
// we want to ensure there is tlog replay on the leader after we restart it,
// so in addition to not committing the docs we add during network partition
// we also want to ensure that our leader doesn't do a "Commit on close"
DirectUpdateHandler2.commitOnClose = false;
System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory");
System.setProperty("solr.ulog.numRecordsToKeep", "1000");
System.setProperty("leaderVoteWait", "60000");
configureCluster(2)
.addConfig("conf", configset("cloud-minimal"))
.configure();
NODE0 = cluster.getJettySolrRunner(0);
NODE1 = cluster.getJettySolrRunner(1);
// Add proxies
proxies = new HashMap<>(cluster.getJettySolrRunners().size());
jettys = new HashMap<>();
for (JettySolrRunner jetty:cluster.getJettySolrRunners()) {
SocketProxy proxy = new SocketProxy();
jetty.setProxyPort(proxy.getListenPort());
cluster.stopJettySolrRunner(jetty);//TODO: Can we avoid this restart
cluster.startJettySolrRunner(jetty);
proxy.open(jetty.getBaseUrl().toURI());
log.info("Adding proxy for URL: " + jetty.getBaseUrl() + ". Proxy: " + proxy.getUrl());
proxies.put(jetty, proxy);
jettys.put(proxy.getUrl(), jetty);
}
}
@After
public void tearDownCluster() throws Exception {
TestInjection.reset();
DirectUpdateHandler2.commitOnClose = true;
if (null != proxies) {
for (SocketProxy proxy : proxies.values()) {
proxy.close();
}
proxies = null;
}
jettys = null;
System.clearProperty("solr.directoryFactory");
System.clearProperty("solr.ulog.numRecordsToKeep");
System.clearProperty("leaderVoteWait");
shutdownCluster();
}
public void testManyDocsInTlogReplayWhileReplicaIsTryingToRecover() throws Exception {
final int committedDocs = 3;
final int uncommittedDocs = 50;
log.info("Create Collection...");
assertEquals(RequestStatusState.COMPLETED,
CollectionAdminRequest.createCollection(COLLECTION, 1, 2)
.setCreateNodeSet("")
.processAndWait(cluster.getSolrClient(), DEFAULT_TIMEOUT));
assertEquals(RequestStatusState.COMPLETED,
CollectionAdminRequest.addReplicaToShard(COLLECTION, "shard1")
.setNode(NODE0.getNodeName())
.processAndWait(cluster.getSolrClient(), DEFAULT_TIMEOUT));
waitForState("Timeout waiting for shard leader", COLLECTION, clusterShape(1, 1));
assertEquals(RequestStatusState.COMPLETED,
CollectionAdminRequest.addReplicaToShard(COLLECTION, "shard1")
.setNode(NODE1.getNodeName())
.processAndWait(cluster.getSolrClient(), DEFAULT_TIMEOUT));
cluster.waitForActiveCollection(COLLECTION, 1, 2);
waitForState("Timeout waiting for 1x2 collection", COLLECTION, clusterShape(1, 2));
final Replica leader = getCollectionState(COLLECTION).getSlice("shard1").getLeader();
assertEquals("Sanity check failed", NODE0.getNodeName(), leader.getNodeName());
log.info("Add and commit a {} docs...", committedDocs);
addDocs(true, committedDocs, 1);
log.info("Partition nodes...");
proxies.get(NODE0).close();
proxies.get(NODE1).close();
log.info("Adding {} (uncommitted) docs during network partition....", uncommittedDocs);
addDocs(false, uncommittedDocs, committedDocs + 1);
log.info("Stopping leader node...");
assertEquals("Something broke our expected commitOnClose", false, DirectUpdateHandler2.commitOnClose);
NODE0.stop();
cluster.waitForJettyToStop(NODE0);
log.info("Un-Partition replica (NODE1)...");
proxies.get(NODE1).reopen();
waitForState("Timeout waiting for leader goes DOWN", COLLECTION, (liveNodes, collectionState)
-> collectionState.getReplica(leader.getName()).getState() == Replica.State.DOWN);
TimeOut timeOut = new TimeOut(10, TimeUnit.SECONDS, TimeSource.NANO_TIME);
while (!timeOut.hasTimedOut()) {
Replica newLeader = getCollectionState(COLLECTION).getLeader("shard1");
if (newLeader != null && !newLeader.getName().equals(leader.getName()) && newLeader.getState() == Replica.State.ACTIVE) {
fail("Out of sync replica became leader " + newLeader);
}
}
log.info("Enabling TestInjection.updateLogReplayRandomPause");
TestInjection.updateLogReplayRandomPause = "true:100";
log.info("Un-Partition & restart leader (NODE0)...");
proxies.get(NODE0).reopen();
NODE0.start();
log.info("Waiting for all nodes and active collection...");
cluster.waitForAllNodes(30);;
waitForState("Timeout waiting for leader", COLLECTION, (liveNodes, collectionState) -> {
Replica newLeader = collectionState.getLeader("shard1");
return newLeader != null && newLeader.getName().equals(leader.getName());
});
waitForState("Timeout waiting for active collection", COLLECTION, clusterShape(1, 2));
cluster.waitForActiveCollection(COLLECTION, 1, 2);
log.info("Check docs on both replicas...");
assertDocsExistInBothReplicas(1, uncommittedDocs + uncommittedDocs);
log.info("Test ok, delete collection...");
CollectionAdminRequest.deleteCollection(COLLECTION).process(cluster.getSolrClient());
}
/**
* Adds the specified number of docs directly to the leader,
* using increasing docIds begining with startId. Commits if and only if the boolean is true.
*/
private void addDocs(final boolean commit, final int numDocs, final int startId) throws SolrServerException, IOException {
List<SolrInputDocument> docs = new ArrayList<>(numDocs);
for (int i = 0; i < numDocs; i++) {
int id = startId + i;
docs.add(new SolrInputDocument("id", String.valueOf(id), "fieldName_s", String.valueOf(id)));
}
// For simplicity, we always add out docs directly to NODE0
// (where the leader should be) and bypass the proxy...
try (HttpSolrClient client = getHttpSolrClient(NODE0.getBaseUrl().toString())) {
assertEquals(0, client.add(COLLECTION, docs).getStatus());
if (commit) {
assertEquals(0, client.commit(COLLECTION).getStatus());
}
}
}
/**
* uses distrib=false RTG requests to verify that every doc between firstDocId and lastDocId
* (inclusive) can be found on both the leader and the replica
*/
private void assertDocsExistInBothReplicas(int firstDocId,
int lastDocId) throws Exception {
try (HttpSolrClient leaderSolr = getHttpSolrClient(NODE0.getBaseUrl().toString());
HttpSolrClient replicaSolr = getHttpSolrClient(NODE1.getBaseUrl().toString())) {
for (int d = firstDocId; d <= lastDocId; d++) {
String docId = String.valueOf(d);
assertDocExists("leader", leaderSolr, docId);
assertDocExists("replica", replicaSolr, docId);
}
}
}
/**
* uses distrib=false RTG requests to verify that the specified docId can be found using the
* specified solr client
*/
private void assertDocExists(final String clientName, final HttpSolrClient client, final String docId) throws Exception {
final QueryResponse rsp = (new QueryRequest(params("qt", "/get",
"id", docId,
"_trace", clientName,
"distrib", "false")))
.process(client, COLLECTION);
assertEquals(0, rsp.getStatus());
String match = JSONTestUtil.matchObj("/id", rsp.getResponse().get("doc"), docId);
assertTrue("Doc with id=" + docId + " not found in " + clientName
+ " due to: " + match + "; rsp="+rsp, match == null);
}
}