SOLR-9716: RecoveryStrategy sends prep recovery command without setting read time out which can cause replica recovery to hang indefinitely on network partitions

This commit is contained in:
Shalin Shekhar Mangar 2016-11-05 12:46:42 +05:30
parent 358bdd490b
commit 1f1990d8be
5 changed files with 81 additions and 14 deletions

View File

@ -114,6 +114,9 @@ Bug Fixes
* SOLR-9360: Solr script not properly checking SOLR_PID
(Alessandro Benedetti via Erick Erickson)
* SOLR-9716: RecoveryStrategy sends prep recovery command without setting read time out which can cause
replica recovery to hang indefinitely on network partitions. (Cao Manh Dat, shalin)
Other Changes
----------------------

View File

@ -19,6 +19,7 @@ package org.apache.solr.cloud;
import java.io.Closeable;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.SocketTimeoutException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@ -572,24 +573,44 @@ public class RecoveryStrategy extends Thread implements Closeable {
private void sendPrepRecoveryCmd(String leaderBaseUrl, String leaderCoreName, Slice slice)
throws SolrServerException, IOException, InterruptedException, ExecutionException {
try (HttpSolrClient client = new HttpSolrClient.Builder(leaderBaseUrl).build()) {
client.setConnectionTimeout(30000);
WaitForState prepCmd = new WaitForState();
prepCmd.setCoreName(leaderCoreName);
prepCmd.setNodeName(zkController.getNodeName());
prepCmd.setCoreNodeName(coreZkNodeName);
prepCmd.setState(Replica.State.RECOVERING);
prepCmd.setCheckLive(true);
prepCmd.setOnlyIfLeader(true);
final Slice.State state = slice.getState();
if (state != Slice.State.CONSTRUCTION && state != Slice.State.RECOVERY && state != Slice.State.RECOVERY_FAILED) {
prepCmd.setOnlyIfLeaderActive(true);
WaitForState prepCmd = new WaitForState();
prepCmd.setCoreName(leaderCoreName);
prepCmd.setNodeName(zkController.getNodeName());
prepCmd.setCoreNodeName(coreZkNodeName);
prepCmd.setState(Replica.State.RECOVERING);
prepCmd.setCheckLive(true);
prepCmd.setOnlyIfLeader(true);
final Slice.State state = slice.getState();
if (state != Slice.State.CONSTRUCTION && state != Slice.State.RECOVERY && state != Slice.State.RECOVERY_FAILED) {
prepCmd.setOnlyIfLeaderActive(true);
}
final int maxTries = 30;
for (int numTries = 0; numTries < maxTries; numTries++) {
try {
sendPrepRecoveryCmd(leaderBaseUrl, prepCmd);
break;
} catch (ExecutionException e) {
SolrServerException solrException = (SolrServerException) e.getCause();
if (solrException.getRootCause() instanceof SocketTimeoutException && numTries < maxTries) {
LOG.warn("Socket timeout when send prep recovery cmd, retrying.. ");
continue;
}
throw e;
}
}
}
private void sendPrepRecoveryCmd(String leaderBaseUrl, WaitForState prepCmd)
throws SolrServerException, IOException, InterruptedException, ExecutionException {
try (HttpSolrClient client = new HttpSolrClient.Builder(leaderBaseUrl).build()) {
client.setConnectionTimeout(10000);
client.setSoTimeout(10000);
HttpUriRequestResponse mrr = client.httpUriRequest(prepCmd);
prevSendPreRecoveryHttpUriRequest = mrr.httpUriRequest;
LOG.info("Sending prep recovery command to [{}]; [{}]", leaderBaseUrl, prepCmd.toString());
mrr.future.get();
}
}

View File

@ -37,6 +37,7 @@ import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.update.CommitUpdateCommand;
import org.apache.solr.util.RefCounted;
import org.apache.solr.util.TestInjection;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -46,6 +47,8 @@ class PrepRecoveryOp implements CoreAdminHandler.CoreAdminOp {
@Override
public void execute(CallInfo it) throws Exception {
assert TestInjection.injectPrepRecoveryOpPauseForever();
final SolrParams params = it.req.getParams();
String cname = params.get(CoreAdminParams.CORE);

View File

@ -24,6 +24,7 @@ import java.util.Random;
import java.util.Set;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@ -110,6 +111,8 @@ public class TestInjection {
public static String updateRandomPause = null;
public static String prepRecoveryOpPauseForever = null;
public static String randomDelayInCoreCreation = null;
public static int randomDelayMaxInCoreCreationInSec = 10;
@ -118,6 +121,8 @@ public class TestInjection {
private static Set<Timer> timers = Collections.synchronizedSet(new HashSet<Timer>());
private static AtomicInteger countPrepRecoveryOpPauseForever = new AtomicInteger(0);
public static void reset() {
nonGracefullClose = null;
failReplicaRequests = null;
@ -127,6 +132,8 @@ public class TestInjection {
updateRandomPause = null;
randomDelayInCoreCreation = null;
splitFailureBeforeReplicaCreation = null;
prepRecoveryOpPauseForever = null;
countPrepRecoveryOpPauseForever = new AtomicInteger(0);
for (Timer timer : timers) {
timer.cancel();
@ -289,6 +296,31 @@ public class TestInjection {
return true;
}
public static boolean injectPrepRecoveryOpPauseForever() {
if (prepRecoveryOpPauseForever != null) {
Random rand = random();
if (null == rand) return true;
Pair<Boolean,Integer> pair = parseValue(prepRecoveryOpPauseForever);
boolean enabled = pair.first();
int chanceIn100 = pair.second();
// Prevent for continuous pause forever
if (enabled && rand.nextInt(100) >= (100 - chanceIn100) && countPrepRecoveryOpPauseForever.get() < 2) {
countPrepRecoveryOpPauseForever.incrementAndGet();
log.info("inject pause forever for prep recovery op");
try {
Thread.sleep(Integer.MAX_VALUE);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
} else {
countPrepRecoveryOpPauseForever.set(0);
}
}
return true;
}
public static boolean injectSplitFailureBeforeReplicaCreation() {
if (splitFailureBeforeReplicaCreation != null) {
Random rand = random();

View File

@ -37,6 +37,8 @@ import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.core.SolrCore;
import org.apache.solr.update.DirectUpdateHandler2;
import org.apache.solr.update.UpdateLog;
import org.apache.solr.util.TestInjection;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
@ -47,6 +49,7 @@ public class TestCloudRecovery extends SolrCloudTestCase {
@BeforeClass
public static void setupCluster() throws Exception {
TestInjection.prepRecoveryOpPauseForever = "true:30";
System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory");
System.setProperty("solr.ulog.numRecordsToKeep", "1000");
@ -62,6 +65,11 @@ public class TestCloudRecovery extends SolrCloudTestCase {
false, true, 30);
}
@AfterClass
public static void afterClass() {
TestInjection.reset();
}
@Before
public void resetCollection() throws IOException, SolrServerException {
cluster.getSolrClient().deleteByQuery(COLLECTION, "*:*");