mirror of https://github.com/apache/lucene.git
SOLR-9716: RecoveryStrategy sends the prep recovery command without setting a read timeout, which can cause replica recovery to hang indefinitely on network partitions
This commit is contained in:
parent
358bdd490b
commit
1f1990d8be
|
@ -114,6 +114,9 @@ Bug Fixes
|
||||||
* SOLR-9360: Solr script not properly checking SOLR_PID
|
* SOLR-9360: Solr script not properly checking SOLR_PID
|
||||||
(Alessandro Benedetti via Erick Erickson)
|
(Alessandro Benedetti via Erick Erickson)
|
||||||
|
|
||||||
|
* SOLR-9716: RecoveryStrategy sends prep recovery command without setting read timeout, which can cause
|
||||||
|
replica recovery to hang indefinitely on network partitions. (Cao Manh Dat, shalin)
|
||||||
|
|
||||||
Other Changes
|
Other Changes
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.solr.cloud;
|
||||||
import java.io.Closeable;
|
import java.io.Closeable;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.lang.invoke.MethodHandles;
|
import java.lang.invoke.MethodHandles;
|
||||||
|
import java.net.SocketTimeoutException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
|
@ -572,19 +573,39 @@ public class RecoveryStrategy extends Thread implements Closeable {
|
||||||
private void sendPrepRecoveryCmd(String leaderBaseUrl, String leaderCoreName, Slice slice)
|
private void sendPrepRecoveryCmd(String leaderBaseUrl, String leaderCoreName, Slice slice)
|
||||||
throws SolrServerException, IOException, InterruptedException, ExecutionException {
|
throws SolrServerException, IOException, InterruptedException, ExecutionException {
|
||||||
|
|
||||||
try (HttpSolrClient client = new HttpSolrClient.Builder(leaderBaseUrl).build()) {
|
WaitForState prepCmd = new WaitForState();
|
||||||
client.setConnectionTimeout(30000);
|
prepCmd.setCoreName(leaderCoreName);
|
||||||
WaitForState prepCmd = new WaitForState();
|
prepCmd.setNodeName(zkController.getNodeName());
|
||||||
prepCmd.setCoreName(leaderCoreName);
|
prepCmd.setCoreNodeName(coreZkNodeName);
|
||||||
prepCmd.setNodeName(zkController.getNodeName());
|
prepCmd.setState(Replica.State.RECOVERING);
|
||||||
prepCmd.setCoreNodeName(coreZkNodeName);
|
prepCmd.setCheckLive(true);
|
||||||
prepCmd.setState(Replica.State.RECOVERING);
|
prepCmd.setOnlyIfLeader(true);
|
||||||
prepCmd.setCheckLive(true);
|
final Slice.State state = slice.getState();
|
||||||
prepCmd.setOnlyIfLeader(true);
|
if (state != Slice.State.CONSTRUCTION && state != Slice.State.RECOVERY && state != Slice.State.RECOVERY_FAILED) {
|
||||||
final Slice.State state = slice.getState();
|
prepCmd.setOnlyIfLeaderActive(true);
|
||||||
if (state != Slice.State.CONSTRUCTION && state != Slice.State.RECOVERY && state != Slice.State.RECOVERY_FAILED) {
|
}
|
||||||
prepCmd.setOnlyIfLeaderActive(true);
|
|
||||||
|
final int maxTries = 30;
|
||||||
|
for (int numTries = 0; numTries < maxTries; numTries++) {
|
||||||
|
try {
|
||||||
|
sendPrepRecoveryCmd(leaderBaseUrl, prepCmd);
|
||||||
|
break;
|
||||||
|
} catch (ExecutionException e) {
|
||||||
|
SolrServerException solrException = (SolrServerException) e.getCause();
|
||||||
|
if (solrException.getRootCause() instanceof SocketTimeoutException && numTries < maxTries) {
|
||||||
|
LOG.warn("Socket timeout when send prep recovery cmd, retrying.. ");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
throw e;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void sendPrepRecoveryCmd(String leaderBaseUrl, WaitForState prepCmd)
|
||||||
|
throws SolrServerException, IOException, InterruptedException, ExecutionException {
|
||||||
|
try (HttpSolrClient client = new HttpSolrClient.Builder(leaderBaseUrl).build()) {
|
||||||
|
client.setConnectionTimeout(10000);
|
||||||
|
client.setSoTimeout(10000);
|
||||||
HttpUriRequestResponse mrr = client.httpUriRequest(prepCmd);
|
HttpUriRequestResponse mrr = client.httpUriRequest(prepCmd);
|
||||||
prevSendPreRecoveryHttpUriRequest = mrr.httpUriRequest;
|
prevSendPreRecoveryHttpUriRequest = mrr.httpUriRequest;
|
||||||
|
|
||||||
|
|
|
@ -37,6 +37,7 @@ import org.apache.solr.request.LocalSolrQueryRequest;
|
||||||
import org.apache.solr.search.SolrIndexSearcher;
|
import org.apache.solr.search.SolrIndexSearcher;
|
||||||
import org.apache.solr.update.CommitUpdateCommand;
|
import org.apache.solr.update.CommitUpdateCommand;
|
||||||
import org.apache.solr.util.RefCounted;
|
import org.apache.solr.util.RefCounted;
|
||||||
|
import org.apache.solr.util.TestInjection;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
@ -46,6 +47,8 @@ class PrepRecoveryOp implements CoreAdminHandler.CoreAdminOp {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void execute(CallInfo it) throws Exception {
|
public void execute(CallInfo it) throws Exception {
|
||||||
|
assert TestInjection.injectPrepRecoveryOpPauseForever();
|
||||||
|
|
||||||
final SolrParams params = it.req.getParams();
|
final SolrParams params = it.req.getParams();
|
||||||
|
|
||||||
String cname = params.get(CoreAdminParams.CORE);
|
String cname = params.get(CoreAdminParams.CORE);
|
||||||
|
|
|
@ -24,6 +24,7 @@ import java.util.Random;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.Timer;
|
import java.util.Timer;
|
||||||
import java.util.TimerTask;
|
import java.util.TimerTask;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
@ -110,6 +111,8 @@ public class TestInjection {
|
||||||
|
|
||||||
public static String updateRandomPause = null;
|
public static String updateRandomPause = null;
|
||||||
|
|
||||||
|
public static String prepRecoveryOpPauseForever = null;
|
||||||
|
|
||||||
public static String randomDelayInCoreCreation = null;
|
public static String randomDelayInCoreCreation = null;
|
||||||
|
|
||||||
public static int randomDelayMaxInCoreCreationInSec = 10;
|
public static int randomDelayMaxInCoreCreationInSec = 10;
|
||||||
|
@ -118,6 +121,8 @@ public class TestInjection {
|
||||||
|
|
||||||
private static Set<Timer> timers = Collections.synchronizedSet(new HashSet<Timer>());
|
private static Set<Timer> timers = Collections.synchronizedSet(new HashSet<Timer>());
|
||||||
|
|
||||||
|
private static AtomicInteger countPrepRecoveryOpPauseForever = new AtomicInteger(0);
|
||||||
|
|
||||||
public static void reset() {
|
public static void reset() {
|
||||||
nonGracefullClose = null;
|
nonGracefullClose = null;
|
||||||
failReplicaRequests = null;
|
failReplicaRequests = null;
|
||||||
|
@ -127,6 +132,8 @@ public class TestInjection {
|
||||||
updateRandomPause = null;
|
updateRandomPause = null;
|
||||||
randomDelayInCoreCreation = null;
|
randomDelayInCoreCreation = null;
|
||||||
splitFailureBeforeReplicaCreation = null;
|
splitFailureBeforeReplicaCreation = null;
|
||||||
|
prepRecoveryOpPauseForever = null;
|
||||||
|
countPrepRecoveryOpPauseForever = new AtomicInteger(0);
|
||||||
|
|
||||||
for (Timer timer : timers) {
|
for (Timer timer : timers) {
|
||||||
timer.cancel();
|
timer.cancel();
|
||||||
|
@ -289,6 +296,31 @@ public class TestInjection {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static boolean injectPrepRecoveryOpPauseForever() {
|
||||||
|
if (prepRecoveryOpPauseForever != null) {
|
||||||
|
Random rand = random();
|
||||||
|
if (null == rand) return true;
|
||||||
|
|
||||||
|
Pair<Boolean,Integer> pair = parseValue(prepRecoveryOpPauseForever);
|
||||||
|
boolean enabled = pair.first();
|
||||||
|
int chanceIn100 = pair.second();
|
||||||
|
// Prevent for continuous pause forever
|
||||||
|
if (enabled && rand.nextInt(100) >= (100 - chanceIn100) && countPrepRecoveryOpPauseForever.get() < 2) {
|
||||||
|
countPrepRecoveryOpPauseForever.incrementAndGet();
|
||||||
|
log.info("inject pause forever for prep recovery op");
|
||||||
|
try {
|
||||||
|
Thread.sleep(Integer.MAX_VALUE);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
Thread.currentThread().interrupt();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
countPrepRecoveryOpPauseForever.set(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
public static boolean injectSplitFailureBeforeReplicaCreation() {
|
public static boolean injectSplitFailureBeforeReplicaCreation() {
|
||||||
if (splitFailureBeforeReplicaCreation != null) {
|
if (splitFailureBeforeReplicaCreation != null) {
|
||||||
Random rand = random();
|
Random rand = random();
|
||||||
|
|
|
@ -37,6 +37,8 @@ import org.apache.solr.common.params.ModifiableSolrParams;
|
||||||
import org.apache.solr.core.SolrCore;
|
import org.apache.solr.core.SolrCore;
|
||||||
import org.apache.solr.update.DirectUpdateHandler2;
|
import org.apache.solr.update.DirectUpdateHandler2;
|
||||||
import org.apache.solr.update.UpdateLog;
|
import org.apache.solr.update.UpdateLog;
|
||||||
|
import org.apache.solr.util.TestInjection;
|
||||||
|
import org.junit.AfterClass;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.BeforeClass;
|
import org.junit.BeforeClass;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
@ -47,6 +49,7 @@ public class TestCloudRecovery extends SolrCloudTestCase {
|
||||||
|
|
||||||
@BeforeClass
|
@BeforeClass
|
||||||
public static void setupCluster() throws Exception {
|
public static void setupCluster() throws Exception {
|
||||||
|
TestInjection.prepRecoveryOpPauseForever = "true:30";
|
||||||
System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory");
|
System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory");
|
||||||
System.setProperty("solr.ulog.numRecordsToKeep", "1000");
|
System.setProperty("solr.ulog.numRecordsToKeep", "1000");
|
||||||
|
|
||||||
|
@ -62,6 +65,11 @@ public class TestCloudRecovery extends SolrCloudTestCase {
|
||||||
false, true, 30);
|
false, true, 30);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@AfterClass
|
||||||
|
public static void afterClass() {
|
||||||
|
TestInjection.reset();
|
||||||
|
}
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
public void resetCollection() throws IOException, SolrServerException {
|
public void resetCollection() throws IOException, SolrServerException {
|
||||||
cluster.getSolrClient().deleteByQuery(COLLECTION, "*:*");
|
cluster.getSolrClient().deleteByQuery(COLLECTION, "*:*");
|
||||||
|
|
Loading…
Reference in New Issue