mirror of https://github.com/apache/lucene.git
SOLR-10914: RecoveryStrategy's sendPrepRecoveryCmd can get stuck for 5 minutes if leader is unloaded
This commit is contained in:
parent
b978f37e65
commit
157ff9a4e1
|
@ -501,6 +501,8 @@ when using one of Exact*StatsCache (Mikhail Khludnev)
|
||||||
* SOLR-10910: Clean up a few details left over from pluggable transient core and untangling
|
* SOLR-10910: Clean up a few details left over from pluggable transient core and untangling
|
||||||
CoreDescriptor/CoreContainer references (Erick Erickson)
|
CoreDescriptor/CoreContainer references (Erick Erickson)
|
||||||
|
|
||||||
|
* SOLR-10914: RecoveryStrategy's sendPrepRecoveryCmd can get stuck for 5 minutes if leader is unloaded. (shalin)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
----------------------
|
----------------------
|
||||||
* SOLR-10634: JSON Facet API: When a field/terms facet will retrieve all buckets (i.e. limit:-1)
|
* SOLR-10634: JSON Facet API: When a field/terms facet will retrieve all buckets (i.e. limit:-1)
|
||||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.solr.cloud;
|
||||||
import java.io.Closeable;
|
import java.io.Closeable;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.lang.invoke.MethodHandles;
|
import java.lang.invoke.MethodHandles;
|
||||||
import java.net.SocketTimeoutException;
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
|
@ -811,29 +810,12 @@ public class RecoveryStrategy implements Runnable, Closeable {
|
||||||
prepCmd.setOnlyIfLeaderActive(true);
|
prepCmd.setOnlyIfLeaderActive(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
final int maxTries = 30;
|
int conflictWaitMs = zkController.getLeaderConflictResolveWait();
|
||||||
for (int numTries = 0; numTries < maxTries; numTries++) {
|
// timeout after 5 seconds more than the max timeout (conflictWait + 3 seconds) on the server side
|
||||||
try {
|
int readTimeout = conflictWaitMs + 8000;
|
||||||
sendPrepRecoveryCmd(leaderBaseUrl, prepCmd);
|
|
||||||
break;
|
|
||||||
} catch (ExecutionException e) {
|
|
||||||
if (e.getCause() instanceof SolrServerException) {
|
|
||||||
SolrServerException solrException = (SolrServerException) e.getCause();
|
|
||||||
if (solrException.getRootCause() instanceof SocketTimeoutException && numTries < maxTries) {
|
|
||||||
LOG.warn("Socket timeout on send prep recovery cmd, retrying.. ");
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
throw e;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
final private void sendPrepRecoveryCmd(String leaderBaseUrl, WaitForState prepCmd)
|
|
||||||
throws SolrServerException, IOException, InterruptedException, ExecutionException {
|
|
||||||
try (HttpSolrClient client = new HttpSolrClient.Builder(leaderBaseUrl).build()) {
|
try (HttpSolrClient client = new HttpSolrClient.Builder(leaderBaseUrl).build()) {
|
||||||
client.setConnectionTimeout(10000);
|
client.setConnectionTimeout(10000);
|
||||||
client.setSoTimeout(10000);
|
client.setSoTimeout(readTimeout);
|
||||||
HttpUriRequestResponse mrr = client.httpUriRequest(prepCmd);
|
HttpUriRequestResponse mrr = client.httpUriRequest(prepCmd);
|
||||||
prevSendPreRecoveryHttpUriRequest = mrr.httpUriRequest;
|
prevSendPreRecoveryHttpUriRequest = mrr.httpUriRequest;
|
||||||
|
|
||||||
|
@ -842,5 +824,4 @@ public class RecoveryStrategy implements Runnable, Closeable {
|
||||||
mrr.future.get();
|
mrr.future.get();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -329,7 +329,7 @@ public class TestInjection {
|
||||||
boolean enabled = pair.first();
|
boolean enabled = pair.first();
|
||||||
int chanceIn100 = pair.second();
|
int chanceIn100 = pair.second();
|
||||||
// Prevent for continuous pause forever
|
// Prevent for continuous pause forever
|
||||||
if (enabled && rand.nextInt(100) >= (100 - chanceIn100) && countPrepRecoveryOpPauseForever.get() < 2) {
|
if (enabled && rand.nextInt(100) >= (100 - chanceIn100) && countPrepRecoveryOpPauseForever.get() < 1) {
|
||||||
countPrepRecoveryOpPauseForever.incrementAndGet();
|
countPrepRecoveryOpPauseForever.incrementAndGet();
|
||||||
log.info("inject pause forever for prep recovery op");
|
log.info("inject pause forever for prep recovery op");
|
||||||
try {
|
try {
|
||||||
|
|
|
@ -43,6 +43,7 @@
|
||||||
<int name="zkClientTimeout">${solr.zkclienttimeout:30000}</int>
|
<int name="zkClientTimeout">${solr.zkclienttimeout:30000}</int>
|
||||||
<bool name="genericCoreNodeNames">${genericCoreNodeNames:true}</bool>
|
<bool name="genericCoreNodeNames">${genericCoreNodeNames:true}</bool>
|
||||||
<int name="leaderVoteWait">${leaderVoteWait:10000}</int>
|
<int name="leaderVoteWait">${leaderVoteWait:10000}</int>
|
||||||
|
<int name="leaderConflictResolveWait">${leaderConflictResolveWait:180000}</int>
|
||||||
<int name="distribUpdateConnTimeout">${distribUpdateConnTimeout:45000}</int>
|
<int name="distribUpdateConnTimeout">${distribUpdateConnTimeout:45000}</int>
|
||||||
<int name="distribUpdateSoTimeout">${distribUpdateSoTimeout:340000}</int>
|
<int name="distribUpdateSoTimeout">${distribUpdateSoTimeout:340000}</int>
|
||||||
<int name="autoReplicaFailoverWaitAfterExpiration">${autoReplicaFailoverWaitAfterExpiration:10000}</int>
|
<int name="autoReplicaFailoverWaitAfterExpiration">${autoReplicaFailoverWaitAfterExpiration:10000}</int>
|
||||||
|
|
|
@ -43,8 +43,6 @@ import org.apache.solr.core.SolrCore;
|
||||||
import org.apache.solr.metrics.SolrMetricManager;
|
import org.apache.solr.metrics.SolrMetricManager;
|
||||||
import org.apache.solr.update.DirectUpdateHandler2;
|
import org.apache.solr.update.DirectUpdateHandler2;
|
||||||
import org.apache.solr.update.UpdateLog;
|
import org.apache.solr.update.UpdateLog;
|
||||||
import org.apache.solr.util.TestInjection;
|
|
||||||
import org.junit.AfterClass;
|
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.BeforeClass;
|
import org.junit.BeforeClass;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
@ -56,7 +54,6 @@ public class TestCloudRecovery extends SolrCloudTestCase {
|
||||||
|
|
||||||
@BeforeClass
|
@BeforeClass
|
||||||
public static void setupCluster() throws Exception {
|
public static void setupCluster() throws Exception {
|
||||||
TestInjection.prepRecoveryOpPauseForever = "true:30";
|
|
||||||
System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory");
|
System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory");
|
||||||
System.setProperty("solr.ulog.numRecordsToKeep", "1000");
|
System.setProperty("solr.ulog.numRecordsToKeep", "1000");
|
||||||
|
|
||||||
|
@ -73,11 +70,6 @@ public class TestCloudRecovery extends SolrCloudTestCase {
|
||||||
false, true, 30);
|
false, true, 30);
|
||||||
}
|
}
|
||||||
|
|
||||||
@AfterClass
|
|
||||||
public static void afterClass() {
|
|
||||||
TestInjection.reset();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
public void resetCollection() throws IOException, SolrServerException {
|
public void resetCollection() throws IOException, SolrServerException {
|
||||||
cluster.getSolrClient().deleteByQuery(COLLECTION, "*:*");
|
cluster.getSolrClient().deleteByQuery(COLLECTION, "*:*");
|
||||||
|
|
|
@ -0,0 +1,109 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.solr.cloud;
|
||||||
|
|
||||||
|
import org.apache.solr.client.solrj.embedded.JettySolrRunner;
|
||||||
|
import org.apache.solr.client.solrj.impl.CloudSolrClient;
|
||||||
|
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
|
||||||
|
import org.apache.solr.common.cloud.Replica;
|
||||||
|
import org.apache.solr.util.TestInjection;
|
||||||
|
import org.junit.BeforeClass;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests for PREPRECOVERY CoreAdmin API
|
||||||
|
*/
|
||||||
|
public class TestPrepRecovery extends SolrCloudTestCase {
|
||||||
|
|
||||||
|
@BeforeClass
|
||||||
|
public static void setupCluster() throws Exception {
|
||||||
|
System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory");
|
||||||
|
System.setProperty("solr.ulog.numRecordsToKeep", "1000");
|
||||||
|
// the default is 180s and our waitForState times out in 90s
|
||||||
|
// so we lower this to 10s so that we can still test timeouts
|
||||||
|
System.setProperty("leaderConflictResolveWait", "10000");
|
||||||
|
|
||||||
|
configureCluster(2)
|
||||||
|
.addConfig("config", TEST_PATH().resolve("configsets").resolve("cloud-minimal").resolve("conf"))
|
||||||
|
.withSolrXml(TEST_PATH().resolve("solr.xml"))
|
||||||
|
.configure();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void tearCluster() throws Exception {
|
||||||
|
System.clearProperty("leaderConflictResolveWait");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testLeaderUnloaded() throws Exception {
|
||||||
|
CloudSolrClient solrClient = cluster.getSolrClient();
|
||||||
|
|
||||||
|
String collectionName = "testLeaderUnloaded";
|
||||||
|
CollectionAdminRequest.createCollection(collectionName, 1, 2)
|
||||||
|
.process(solrClient);
|
||||||
|
|
||||||
|
waitForState("Expected collection: testLeaderUnloaded to be live with 1 shard and 2 replicas",
|
||||||
|
collectionName, clusterShape(1, 2));
|
||||||
|
|
||||||
|
JettySolrRunner newNode = cluster.startJettySolrRunner();
|
||||||
|
String newNodeName = newNode.getNodeName();
|
||||||
|
|
||||||
|
// add a replica to the new node so that it starts watching the collection
|
||||||
|
CollectionAdminRequest.addReplicaToShard(collectionName, "shard1")
|
||||||
|
.setNode(newNodeName)
|
||||||
|
.process(solrClient);
|
||||||
|
|
||||||
|
// now delete the leader
|
||||||
|
Replica leader = solrClient.getZkStateReader().getLeaderRetry(collectionName, "shard1");
|
||||||
|
CollectionAdminRequest.deleteReplica(collectionName, "shard1", leader.getName())
|
||||||
|
.process(solrClient);
|
||||||
|
|
||||||
|
// add another replica to the new node. When it starts recovering, it will likely have stale state
|
||||||
|
// and ask the erstwhile leader to PREPRECOVERY which will hang for about 30 seconds
|
||||||
|
CollectionAdminRequest.addReplicaToShard(collectionName, "shard1")
|
||||||
|
.setNode(newNodeName)
|
||||||
|
.process(solrClient);
|
||||||
|
|
||||||
|
// in the absence of the fixes made in SOLR-10914, this statement will timeout after 90s
|
||||||
|
waitForState("Expected collection: testLeaderUnloaded to be live with 1 shard and 3 replicas",
|
||||||
|
collectionName, clusterShape(1, 3));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testLeaderNotResponding() throws Exception {
|
||||||
|
CloudSolrClient solrClient = cluster.getSolrClient();
|
||||||
|
|
||||||
|
String collectionName = "testLeaderNotResponding";
|
||||||
|
CollectionAdminRequest.createCollection(collectionName, 1, 1)
|
||||||
|
.process(solrClient);
|
||||||
|
|
||||||
|
waitForState("Expected collection: testLeaderNotResponding to be live with 1 shard and 1 replicas",
|
||||||
|
collectionName, clusterShape(1, 1));
|
||||||
|
|
||||||
|
TestInjection.prepRecoveryOpPauseForever = "true:100";
|
||||||
|
try {
|
||||||
|
CollectionAdminRequest.addReplicaToShard(collectionName, "shard1")
|
||||||
|
.process(solrClient);
|
||||||
|
|
||||||
|
// in the absence of fixes made in SOLR-9716, prep recovery waits forever and the following statement
|
||||||
|
// times out in 90 seconds
|
||||||
|
waitForState("Expected collection: testLeaderNotResponding to be live with 1 shard and 2 replicas",
|
||||||
|
collectionName, clusterShape(1, 2));
|
||||||
|
} finally {
|
||||||
|
TestInjection.reset();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue