diff --git a/solr/core/src/test/org/apache/solr/cloud/TestCloudConsistency.java b/solr/core/src/test/org/apache/solr/cloud/TestCloudConsistency.java index f3224ff452b..c79ae545447 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestCloudConsistency.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestCloudConsistency.java @@ -93,14 +93,11 @@ public class TestCloudConsistency extends SolrCloudTestCase { } @Test - //commented 2-Aug-2018 @LuceneTestCase.BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 12-Jun-2018 - @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 6-Sep-2018 public void testOutOfSyncReplicasCannotBecomeLeader() throws Exception { testOutOfSyncReplicasCannotBecomeLeader(false); } @Test - // commented out on: 24-Dec-2018 @LuceneTestCase.BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 2-Aug-2018 public void testOutOfSyncReplicasCannotBecomeLeaderAfterRestart() throws Exception { testOutOfSyncReplicasCannotBecomeLeader(true); } @@ -247,6 +244,7 @@ public class TestCloudConsistency extends SolrCloudTestCase { private void addDoc(String collection, int docId, JettySolrRunner solrRunner) throws IOException, SolrServerException { try (HttpSolrClient solrClient = new HttpSolrClient.Builder(solrRunner.getBaseUrl().toString()).build()) { solrClient.add(collection, new SolrInputDocument("id", String.valueOf(docId), "fieldName_s", String.valueOf(docId))); + solrClient.commit(collection); } } diff --git a/solr/core/src/test/org/apache/solr/cloud/TestTlogReplayVsRecovery.java b/solr/core/src/test/org/apache/solr/cloud/TestTlogReplayVsRecovery.java new file mode 100644 index 00000000000..b266c1fedb4 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/cloud/TestTlogReplayVsRecovery.java @@ -0,0 +1,251 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.cloud; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.net.URI; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; + +import org.apache.lucene.util.LuceneTestCase.AwaitsFix; + +import org.apache.solr.JSONTestUtil; +import org.apache.solr.client.solrj.SolrServerException; +import org.apache.solr.client.solrj.cloud.SocketProxy; +import org.apache.solr.client.solrj.embedded.JettySolrRunner; +import org.apache.solr.client.solrj.impl.HttpSolrClient; +import org.apache.solr.client.solrj.request.CollectionAdminRequest; +import org.apache.solr.client.solrj.request.QueryRequest; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.client.solrj.response.RequestStatusState; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.cloud.Replica; +import org.apache.solr.common.util.TimeSource; +import org.apache.solr.update.DirectUpdateHandler2; +import org.apache.solr.util.TestInjection; +import org.apache.solr.util.TimeOut; +import org.junit.After; +import org.junit.Before; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + 
+@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-13486") +public class TestTlogReplayVsRecovery extends SolrCloudTestCase { + +  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + +  private static final String COLLECTION = "collection_with_slow_tlog_recovery"; + +  private JettySolrRunner NODE0; +  private JettySolrRunner NODE1; +  private Map<JettySolrRunner, SocketProxy> proxies; +  private Map<String, JettySolrRunner> jettys; + +  @Before +  public void setupCluster() throws Exception { +    // we want to ensure there is tlog replay on the leader after we restart it, +    // so in addition to not committing the docs we add during network partition +    // we also want to ensure that our leader doesn't do a "Commit on close" +    DirectUpdateHandler2.commitOnClose = false; + +    System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory"); +    System.setProperty("solr.ulog.numRecordsToKeep", "1000"); +    System.setProperty("leaderVoteWait", "60000"); + +    configureCluster(2) +        .addConfig("conf", configset("cloud-minimal")) +        .configure(); + +    NODE0 = cluster.getJettySolrRunner(0); +    NODE1 = cluster.getJettySolrRunner(1); + +    // Add proxies +    proxies = new HashMap<>(cluster.getJettySolrRunners().size()); +    jettys = new HashMap<>(); +    for (JettySolrRunner jetty:cluster.getJettySolrRunners()) { +      SocketProxy proxy = new SocketProxy(); +      jetty.setProxyPort(proxy.getListenPort()); +      cluster.stopJettySolrRunner(jetty);//TODO: Can we avoid this restart +      cluster.startJettySolrRunner(jetty); +      proxy.open(jetty.getBaseUrl().toURI()); +      log.info("Adding proxy for URL: " + jetty.getBaseUrl() + ". 
Proxy: " + proxy.getUrl()); + proxies.put(jetty, proxy); + jettys.put(proxy.getUrl(), jetty); + } + } + + @After + public void tearDownCluster() throws Exception { + TestInjection.reset(); + DirectUpdateHandler2.commitOnClose = true; + + if (null != proxies) { + for (SocketProxy proxy : proxies.values()) { + proxy.close(); + } + proxies = null; + } + jettys = null; + System.clearProperty("solr.directoryFactory"); + System.clearProperty("solr.ulog.numRecordsToKeep"); + System.clearProperty("leaderVoteWait"); + + shutdownCluster(); + } + + public void testManyDocsInTlogReplayWhileReplicaIsTryingToRecover() throws Exception { + final int committedDocs = 3; + final int uncommittedDocs = 50; + + log.info("Create Collection..."); + assertEquals(RequestStatusState.COMPLETED, + CollectionAdminRequest.createCollection(COLLECTION, 1, 2) + .setCreateNodeSet("") + .processAndWait(cluster.getSolrClient(), DEFAULT_TIMEOUT)); + assertEquals(RequestStatusState.COMPLETED, + CollectionAdminRequest.addReplicaToShard(COLLECTION, "shard1") + .setNode(NODE0.getNodeName()) + .processAndWait(cluster.getSolrClient(), DEFAULT_TIMEOUT)); + + waitForState("Timeout waiting for shard leader", COLLECTION, clusterShape(1, 1)); + + assertEquals(RequestStatusState.COMPLETED, + CollectionAdminRequest.addReplicaToShard(COLLECTION, "shard1") + .setNode(NODE1.getNodeName()) + .processAndWait(cluster.getSolrClient(), DEFAULT_TIMEOUT)); + + cluster.waitForActiveCollection(COLLECTION, 1, 2); + + waitForState("Timeout waiting for 1x2 collection", COLLECTION, clusterShape(1, 2)); + + final Replica leader = getCollectionState(COLLECTION).getSlice("shard1").getLeader(); + assertEquals("Sanity check failed", NODE0.getNodeName(), leader.getNodeName()); + + log.info("Add and commit a {} docs...", committedDocs); + addDocs(true, committedDocs, 1); + + log.info("Partition nodes..."); + proxies.get(NODE0).close(); + proxies.get(NODE1).close(); + + log.info("Adding {} (uncommitted) docs during network 
partition....", uncommittedDocs); +    addDocs(false, uncommittedDocs, committedDocs + 1); + +    log.info("Stopping leader node..."); +    assertEquals("Something broke our expected commitOnClose", false, DirectUpdateHandler2.commitOnClose); +    NODE0.stop(); +    cluster.waitForJettyToStop(NODE0); + +    log.info("Un-Partition replica (NODE1)..."); +    proxies.get(NODE1).reopen(); + +    waitForState("Timeout waiting for leader goes DOWN", COLLECTION, (liveNodes, collectionState) +        -> collectionState.getReplica(leader.getName()).getState() == Replica.State.DOWN); + +    TimeOut timeOut = new TimeOut(10, TimeUnit.SECONDS, TimeSource.NANO_TIME); +    while (!timeOut.hasTimedOut()) { +      Replica newLeader = getCollectionState(COLLECTION).getLeader("shard1"); +      if (newLeader != null && !newLeader.getName().equals(leader.getName()) && newLeader.getState() == Replica.State.ACTIVE) { +        fail("Out of sync replica became leader " + newLeader); +      } +    } + +    log.info("Enabling TestInjection.updateLogReplayRandomPause"); +    TestInjection.updateLogReplayRandomPause = "true:100"; + +    log.info("Un-Partition & restart leader (NODE0)..."); +    proxies.get(NODE0).reopen(); +    NODE0.start(); + +    log.info("Waiting for all nodes and active collection..."); + +    cluster.waitForAllNodes(30); +    waitForState("Timeout waiting for leader", COLLECTION, (liveNodes, collectionState) -> { +      Replica newLeader = collectionState.getLeader("shard1"); +      return newLeader != null && newLeader.getName().equals(leader.getName()); +    }); +    waitForState("Timeout waiting for active collection", COLLECTION, clusterShape(1, 2)); + +    cluster.waitForActiveCollection(COLLECTION, 1, 2); + +    log.info("Check docs on both replicas..."); +    assertDocsExistInBothReplicas(1, committedDocs + uncommittedDocs); + +    log.info("Test ok, delete collection..."); +    CollectionAdminRequest.deleteCollection(COLLECTION).process(cluster.getSolrClient()); +  } + +  /** +   * Adds the specified number of docs directly to the leader, +   * using increasing docIds beginning 
with startId. Commits if and only if the boolean is true. +   */ +  private void addDocs(final boolean commit, final int numDocs, final int startId) throws SolrServerException, IOException { + +    List<SolrInputDocument> docs = new ArrayList<>(numDocs); +    for (int i = 0; i < numDocs; i++) { +      int id = startId + i; +      docs.add(new SolrInputDocument("id", String.valueOf(id), "fieldName_s", String.valueOf(id))); +    } +    // For simplicity, we always add our docs directly to NODE0 +    // (where the leader should be) and bypass the proxy... +    try (HttpSolrClient client = getHttpSolrClient(NODE0.getBaseUrl().toString())) { +      assertEquals(0, client.add(COLLECTION, docs).getStatus()); +      if (commit) { +        assertEquals(0, client.commit(COLLECTION).getStatus()); +      } +    } +  } + +  /** +   * uses distrib=false RTG requests to verify that every doc between firstDocId and lastDocId +   * (inclusive) can be found on both the leader and the replica +   */ +  private void assertDocsExistInBothReplicas(int firstDocId, +                                             int lastDocId) throws Exception { +    try (HttpSolrClient leaderSolr = getHttpSolrClient(NODE0.getBaseUrl().toString()); +         HttpSolrClient replicaSolr = getHttpSolrClient(NODE1.getBaseUrl().toString())) { +      for (int d = firstDocId; d <= lastDocId; d++) { +        String docId = String.valueOf(d); +        assertDocExists("leader", leaderSolr, docId); +        assertDocExists("replica", replicaSolr, docId); +      } +    } +  } + +  /** +   * uses distrib=false RTG requests to verify that the specified docId can be found using the +   * specified solr client +   */ +  private void assertDocExists(final String clientName, final HttpSolrClient client, final String docId) throws Exception { +    final QueryResponse rsp = (new QueryRequest(params("qt", "/get", +                                                       "id", docId, +                                                       "_trace", clientName, +                                                       "distrib", "false"))) +        .process(client, COLLECTION); +    assertEquals(0, rsp.getStatus()); + +    String match = JSONTestUtil.matchObj("/id", rsp.getResponse().get("doc"), docId); +    assertTrue("Doc with id=" + docId + " not found in " + clientName +               + " due to: " 
+ match + "; rsp="+rsp, match == null); + } + +}