SOLR-10914: RecoveryStrategy's sendPrepRecoveryCmd can get stuck for 5 minutes if leader is unloaded

This commit is contained in:
Shalin Shekhar Mangar 2017-07-03 19:50:33 +05:30
parent b978f37e65
commit 157ff9a4e1
6 changed files with 117 additions and 32 deletions

View File

@ -501,6 +501,8 @@ when using one of Exact*StatsCache (Mikhail Khludnev)
* SOLR-10910: Clean up a few details left over from pluggable transient core and untangling * SOLR-10910: Clean up a few details left over from pluggable transient core and untangling
CoreDescriptor/CoreContainer references (Erick Erickson) CoreDescriptor/CoreContainer references (Erick Erickson)
* SOLR-10914: RecoveryStrategy's sendPrepRecoveryCmd can get stuck for 5 minutes if leader is unloaded. (shalin)
Optimizations Optimizations
---------------------- ----------------------
* SOLR-10634: JSON Facet API: When a field/terms facet will retrieve all buckets (i.e. limit:-1) * SOLR-10634: JSON Facet API: When a field/terms facet will retrieve all buckets (i.e. limit:-1)

View File

@ -19,7 +19,6 @@ package org.apache.solr.cloud;
import java.io.Closeable; import java.io.Closeable;
import java.io.IOException; import java.io.IOException;
import java.lang.invoke.MethodHandles; import java.lang.invoke.MethodHandles;
import java.net.SocketTimeoutException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
@ -811,29 +810,12 @@ public class RecoveryStrategy implements Runnable, Closeable {
prepCmd.setOnlyIfLeaderActive(true); prepCmd.setOnlyIfLeaderActive(true);
} }
final int maxTries = 30; int conflictWaitMs = zkController.getLeaderConflictResolveWait();
for (int numTries = 0; numTries < maxTries; numTries++) { // timeout after 5 seconds more than the max timeout (conflictWait + 3 seconds) on the server side
try { int readTimeout = conflictWaitMs + 8000;
sendPrepRecoveryCmd(leaderBaseUrl, prepCmd);
break;
} catch (ExecutionException e) {
if (e.getCause() instanceof SolrServerException) {
SolrServerException solrException = (SolrServerException) e.getCause();
if (solrException.getRootCause() instanceof SocketTimeoutException && numTries < maxTries) {
LOG.warn("Socket timeout on send prep recovery cmd, retrying.. ");
continue;
}
}
throw e;
}
}
}
final private void sendPrepRecoveryCmd(String leaderBaseUrl, WaitForState prepCmd)
throws SolrServerException, IOException, InterruptedException, ExecutionException {
try (HttpSolrClient client = new HttpSolrClient.Builder(leaderBaseUrl).build()) { try (HttpSolrClient client = new HttpSolrClient.Builder(leaderBaseUrl).build()) {
client.setConnectionTimeout(10000); client.setConnectionTimeout(10000);
client.setSoTimeout(10000); client.setSoTimeout(readTimeout);
HttpUriRequestResponse mrr = client.httpUriRequest(prepCmd); HttpUriRequestResponse mrr = client.httpUriRequest(prepCmd);
prevSendPreRecoveryHttpUriRequest = mrr.httpUriRequest; prevSendPreRecoveryHttpUriRequest = mrr.httpUriRequest;
@ -842,5 +824,4 @@ public class RecoveryStrategy implements Runnable, Closeable {
mrr.future.get(); mrr.future.get();
} }
} }
} }

View File

@ -329,7 +329,7 @@ public class TestInjection {
boolean enabled = pair.first(); boolean enabled = pair.first();
int chanceIn100 = pair.second(); int chanceIn100 = pair.second();
// Prevent for continuous pause forever // Prevent for continuous pause forever
if (enabled && rand.nextInt(100) >= (100 - chanceIn100) && countPrepRecoveryOpPauseForever.get() < 2) { if (enabled && rand.nextInt(100) >= (100 - chanceIn100) && countPrepRecoveryOpPauseForever.get() < 1) {
countPrepRecoveryOpPauseForever.incrementAndGet(); countPrepRecoveryOpPauseForever.incrementAndGet();
log.info("inject pause forever for prep recovery op"); log.info("inject pause forever for prep recovery op");
try { try {

View File

@ -43,6 +43,7 @@
<int name="zkClientTimeout">${solr.zkclienttimeout:30000}</int> <int name="zkClientTimeout">${solr.zkclienttimeout:30000}</int>
<bool name="genericCoreNodeNames">${genericCoreNodeNames:true}</bool> <bool name="genericCoreNodeNames">${genericCoreNodeNames:true}</bool>
<int name="leaderVoteWait">${leaderVoteWait:10000}</int> <int name="leaderVoteWait">${leaderVoteWait:10000}</int>
<int name="leaderConflictResolveWait">${leaderConflictResolveWait:180000}</int>
<int name="distribUpdateConnTimeout">${distribUpdateConnTimeout:45000}</int> <int name="distribUpdateConnTimeout">${distribUpdateConnTimeout:45000}</int>
<int name="distribUpdateSoTimeout">${distribUpdateSoTimeout:340000}</int> <int name="distribUpdateSoTimeout">${distribUpdateSoTimeout:340000}</int>
<int name="autoReplicaFailoverWaitAfterExpiration">${autoReplicaFailoverWaitAfterExpiration:10000}</int> <int name="autoReplicaFailoverWaitAfterExpiration">${autoReplicaFailoverWaitAfterExpiration:10000}</int>

View File

@ -43,8 +43,6 @@ import org.apache.solr.core.SolrCore;
import org.apache.solr.metrics.SolrMetricManager; import org.apache.solr.metrics.SolrMetricManager;
import org.apache.solr.update.DirectUpdateHandler2; import org.apache.solr.update.DirectUpdateHandler2;
import org.apache.solr.update.UpdateLog; import org.apache.solr.update.UpdateLog;
import org.apache.solr.util.TestInjection;
import org.junit.AfterClass;
import org.junit.Before; import org.junit.Before;
import org.junit.BeforeClass; import org.junit.BeforeClass;
import org.junit.Test; import org.junit.Test;
@ -56,7 +54,6 @@ public class TestCloudRecovery extends SolrCloudTestCase {
@BeforeClass @BeforeClass
public static void setupCluster() throws Exception { public static void setupCluster() throws Exception {
TestInjection.prepRecoveryOpPauseForever = "true:30";
System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory"); System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory");
System.setProperty("solr.ulog.numRecordsToKeep", "1000"); System.setProperty("solr.ulog.numRecordsToKeep", "1000");
@ -73,11 +70,6 @@ public class TestCloudRecovery extends SolrCloudTestCase {
false, true, 30); false, true, 30);
} }
@AfterClass
public static void afterClass() {
TestInjection.reset();
}
@Before @Before
public void resetCollection() throws IOException, SolrServerException { public void resetCollection() throws IOException, SolrServerException {
cluster.getSolrClient().deleteByQuery(COLLECTION, "*:*"); cluster.getSolrClient().deleteByQuery(COLLECTION, "*:*");

View File

@ -0,0 +1,109 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.cloud;
import org.apache.solr.client.solrj.embedded.JettySolrRunner;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.util.TestInjection;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
/**
 * Tests for the PREPRECOVERY CoreAdmin API.
 *
 * <p>Each test creates a collection, then adds a replica under conditions in which the
 * prep-recovery request sent to the leader may stall, and asserts that the collection still
 * reaches the expected shape before {@code waitForState} gives up.
 */
public class TestPrepRecovery extends SolrCloudTestCase {

  /**
   * Sets the system properties the tests rely on and configures the cluster with the
   * {@code cloud-minimal} configset and the test {@code solr.xml}.
   */
  @BeforeClass
  public static void setupCluster() throws Exception {
    System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory");
    System.setProperty("solr.ulog.numRecordsToKeep", "1000");
    // the default is 180s and our waitForState times out in 90s
    // so we lower this to 10s so that we can still test timeouts
    System.setProperty("leaderConflictResolveWait", "10000");

    configureCluster(2)
        .addConfig("config", TEST_PATH().resolve("configsets").resolve("cloud-minimal").resolve("conf"))
        .withSolrXml(TEST_PATH().resolve("solr.xml"))
        .configure();
  }

  /**
   * Removes the {@code leaderConflictResolveWait} override installed by {@link #setupCluster()}.
   *
   * <p>The {@code @AfterClass} annotation is required: without it JUnit never invokes this
   * method and the lowered timeout is not explicitly cleared once this class finishes.
   */
  @AfterClass
  public static void tearCluster() throws Exception {
    System.clearProperty("leaderConflictResolveWait");
  }

  /**
   * Deletes the shard leader and then adds a replica on a node that was already watching the
   * collection, checking that the new replica still becomes part of a live 1x3 collection.
   */
  @Test
  public void testLeaderUnloaded() throws Exception {
    CloudSolrClient solrClient = cluster.getSolrClient();

    String collectionName = "testLeaderUnloaded";
    CollectionAdminRequest.createCollection(collectionName, 1, 2)
        .process(solrClient);

    waitForState("Expected collection: testLeaderUnloaded to be live with 1 shard and 2 replicas",
        collectionName, clusterShape(1, 2));

    JettySolrRunner newNode = cluster.startJettySolrRunner();
    String newNodeName = newNode.getNodeName();

    // add a replica to the new node so that it starts watching the collection
    CollectionAdminRequest.addReplicaToShard(collectionName, "shard1")
        .setNode(newNodeName)
        .process(solrClient);

    // now delete the leader
    Replica leader = solrClient.getZkStateReader().getLeaderRetry(collectionName, "shard1");
    CollectionAdminRequest.deleteReplica(collectionName, "shard1", leader.getName())
        .process(solrClient);

    // add another replica to the new node. When it starts recovering, it will likely have stale state
    // and ask the erstwhile leader to PREPRECOVERY which will hang for about 30 seconds
    CollectionAdminRequest.addReplicaToShard(collectionName, "shard1")
        .setNode(newNodeName)
        .process(solrClient);

    // in the absence of the fixes made in SOLR-10914, this statement will time out after 90s
    waitForState("Expected collection: testLeaderUnloaded to be live with 1 shard and 3 replicas",
        collectionName, clusterShape(1, 3));
  }

  /**
   * Enables the "pause forever" prep-recovery test injection and then adds a replica, checking
   * that the collection still reaches a live 1x2 shape. The injection is always reset afterwards.
   */
  @Test
  public void testLeaderNotResponding() throws Exception {
    CloudSolrClient solrClient = cluster.getSolrClient();

    String collectionName = "testLeaderNotResponding";
    CollectionAdminRequest.createCollection(collectionName, 1, 1)
        .process(solrClient);

    waitForState("Expected collection: testLeaderNotResponding to be live with 1 shard and 1 replicas",
        collectionName, clusterShape(1, 1));

    // "true:100" = enabled, with a 100-in-100 chance of triggering the injected pause
    TestInjection.prepRecoveryOpPauseForever = "true:100";
    try {
      CollectionAdminRequest.addReplicaToShard(collectionName, "shard1")
          .process(solrClient);

      // in the absence of fixes made in SOLR-9716, prep recovery waits forever and the following statement
      // times out in 90 seconds
      waitForState("Expected collection: testLeaderNotResponding to be live with 1 shard and 2 replicas",
          collectionName, clusterShape(1, 2));
    } finally {
      // never leak the injection into other tests, even if the assertions above fail
      TestInjection.reset();
    }
  }
}