mirror of https://github.com/apache/lucene.git
SOLR-9716: RecoveryStrategy sends prep recovery command without setting read time out which can cause replica recovery to hang indefinitely on network partitions
This commit is contained in:
parent
358bdd490b
commit
1f1990d8be
|
@ -114,6 +114,9 @@ Bug Fixes
|
|||
* SOLR-9360: Solr script not properly checking SOLR_PID
|
||||
(Alessandro Benedetti via Erick Erickson)
|
||||
|
||||
* SOLR-9716: RecoveryStrategy sends prep recovery command without setting read time out which can cause
|
||||
replica recovery to hang indefinitely on network partitions. (Cao Manh Dat, shalin)
|
||||
|
||||
Other Changes
|
||||
----------------------
|
||||
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.solr.cloud;
|
|||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.lang.invoke.MethodHandles;
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
|
@ -572,24 +573,44 @@ public class RecoveryStrategy extends Thread implements Closeable {
|
|||
private void sendPrepRecoveryCmd(String leaderBaseUrl, String leaderCoreName, Slice slice)
|
||||
throws SolrServerException, IOException, InterruptedException, ExecutionException {
|
||||
|
||||
try (HttpSolrClient client = new HttpSolrClient.Builder(leaderBaseUrl).build()) {
|
||||
client.setConnectionTimeout(30000);
|
||||
WaitForState prepCmd = new WaitForState();
|
||||
prepCmd.setCoreName(leaderCoreName);
|
||||
prepCmd.setNodeName(zkController.getNodeName());
|
||||
prepCmd.setCoreNodeName(coreZkNodeName);
|
||||
prepCmd.setState(Replica.State.RECOVERING);
|
||||
prepCmd.setCheckLive(true);
|
||||
prepCmd.setOnlyIfLeader(true);
|
||||
final Slice.State state = slice.getState();
|
||||
if (state != Slice.State.CONSTRUCTION && state != Slice.State.RECOVERY && state != Slice.State.RECOVERY_FAILED) {
|
||||
prepCmd.setOnlyIfLeaderActive(true);
|
||||
WaitForState prepCmd = new WaitForState();
|
||||
prepCmd.setCoreName(leaderCoreName);
|
||||
prepCmd.setNodeName(zkController.getNodeName());
|
||||
prepCmd.setCoreNodeName(coreZkNodeName);
|
||||
prepCmd.setState(Replica.State.RECOVERING);
|
||||
prepCmd.setCheckLive(true);
|
||||
prepCmd.setOnlyIfLeader(true);
|
||||
final Slice.State state = slice.getState();
|
||||
if (state != Slice.State.CONSTRUCTION && state != Slice.State.RECOVERY && state != Slice.State.RECOVERY_FAILED) {
|
||||
prepCmd.setOnlyIfLeaderActive(true);
|
||||
}
|
||||
|
||||
final int maxTries = 30;
|
||||
for (int numTries = 0; numTries < maxTries; numTries++) {
|
||||
try {
|
||||
sendPrepRecoveryCmd(leaderBaseUrl, prepCmd);
|
||||
break;
|
||||
} catch (ExecutionException e) {
|
||||
SolrServerException solrException = (SolrServerException) e.getCause();
|
||||
if (solrException.getRootCause() instanceof SocketTimeoutException && numTries < maxTries) {
|
||||
LOG.warn("Socket timeout when send prep recovery cmd, retrying.. ");
|
||||
continue;
|
||||
}
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void sendPrepRecoveryCmd(String leaderBaseUrl, WaitForState prepCmd)
|
||||
throws SolrServerException, IOException, InterruptedException, ExecutionException {
|
||||
try (HttpSolrClient client = new HttpSolrClient.Builder(leaderBaseUrl).build()) {
|
||||
client.setConnectionTimeout(10000);
|
||||
client.setSoTimeout(10000);
|
||||
HttpUriRequestResponse mrr = client.httpUriRequest(prepCmd);
|
||||
prevSendPreRecoveryHttpUriRequest = mrr.httpUriRequest;
|
||||
|
||||
|
||||
LOG.info("Sending prep recovery command to [{}]; [{}]", leaderBaseUrl, prepCmd.toString());
|
||||
|
||||
|
||||
mrr.future.get();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -37,6 +37,7 @@ import org.apache.solr.request.LocalSolrQueryRequest;
|
|||
import org.apache.solr.search.SolrIndexSearcher;
|
||||
import org.apache.solr.update.CommitUpdateCommand;
|
||||
import org.apache.solr.util.RefCounted;
|
||||
import org.apache.solr.util.TestInjection;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -46,6 +47,8 @@ class PrepRecoveryOp implements CoreAdminHandler.CoreAdminOp {
|
|||
|
||||
@Override
|
||||
public void execute(CallInfo it) throws Exception {
|
||||
assert TestInjection.injectPrepRecoveryOpPauseForever();
|
||||
|
||||
final SolrParams params = it.req.getParams();
|
||||
|
||||
String cname = params.get(CoreAdminParams.CORE);
|
||||
|
|
|
@ -24,6 +24,7 @@ import java.util.Random;
|
|||
import java.util.Set;
|
||||
import java.util.Timer;
|
||||
import java.util.TimerTask;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
|
@ -110,6 +111,8 @@ public class TestInjection {
|
|||
|
||||
public static String updateRandomPause = null;
|
||||
|
||||
public static String prepRecoveryOpPauseForever = null;
|
||||
|
||||
public static String randomDelayInCoreCreation = null;
|
||||
|
||||
public static int randomDelayMaxInCoreCreationInSec = 10;
|
||||
|
@ -118,6 +121,8 @@ public class TestInjection {
|
|||
|
||||
private static Set<Timer> timers = Collections.synchronizedSet(new HashSet<Timer>());
|
||||
|
||||
private static AtomicInteger countPrepRecoveryOpPauseForever = new AtomicInteger(0);
|
||||
|
||||
public static void reset() {
|
||||
nonGracefullClose = null;
|
||||
failReplicaRequests = null;
|
||||
|
@ -127,6 +132,8 @@ public class TestInjection {
|
|||
updateRandomPause = null;
|
||||
randomDelayInCoreCreation = null;
|
||||
splitFailureBeforeReplicaCreation = null;
|
||||
prepRecoveryOpPauseForever = null;
|
||||
countPrepRecoveryOpPauseForever = new AtomicInteger(0);
|
||||
|
||||
for (Timer timer : timers) {
|
||||
timer.cancel();
|
||||
|
@ -289,6 +296,31 @@ public class TestInjection {
|
|||
return true;
|
||||
}
|
||||
|
||||
public static boolean injectPrepRecoveryOpPauseForever() {
|
||||
if (prepRecoveryOpPauseForever != null) {
|
||||
Random rand = random();
|
||||
if (null == rand) return true;
|
||||
|
||||
Pair<Boolean,Integer> pair = parseValue(prepRecoveryOpPauseForever);
|
||||
boolean enabled = pair.first();
|
||||
int chanceIn100 = pair.second();
|
||||
// Prevent for continuous pause forever
|
||||
if (enabled && rand.nextInt(100) >= (100 - chanceIn100) && countPrepRecoveryOpPauseForever.get() < 2) {
|
||||
countPrepRecoveryOpPauseForever.incrementAndGet();
|
||||
log.info("inject pause forever for prep recovery op");
|
||||
try {
|
||||
Thread.sleep(Integer.MAX_VALUE);
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
} else {
|
||||
countPrepRecoveryOpPauseForever.set(0);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public static boolean injectSplitFailureBeforeReplicaCreation() {
|
||||
if (splitFailureBeforeReplicaCreation != null) {
|
||||
Random rand = random();
|
||||
|
|
|
@ -37,6 +37,8 @@ import org.apache.solr.common.params.ModifiableSolrParams;
|
|||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.update.DirectUpdateHandler2;
|
||||
import org.apache.solr.update.UpdateLog;
|
||||
import org.apache.solr.util.TestInjection;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.Before;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
@ -47,6 +49,7 @@ public class TestCloudRecovery extends SolrCloudTestCase {
|
|||
|
||||
@BeforeClass
|
||||
public static void setupCluster() throws Exception {
|
||||
TestInjection.prepRecoveryOpPauseForever = "true:30";
|
||||
System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory");
|
||||
System.setProperty("solr.ulog.numRecordsToKeep", "1000");
|
||||
|
||||
|
@ -62,6 +65,11 @@ public class TestCloudRecovery extends SolrCloudTestCase {
|
|||
false, true, 30);
|
||||
}
|
||||
|
||||
@AfterClass
|
||||
public static void afterClass() {
|
||||
TestInjection.reset();
|
||||
}
|
||||
|
||||
@Before
|
||||
public void resetCollection() throws IOException, SolrServerException {
|
||||
cluster.getSolrClient().deleteByQuery(COLLECTION, "*:*");
|
||||
|
|
Loading…
Reference in New Issue