mirror of https://github.com/apache/lucene.git
SOLR-7033, SOLR-5961: RecoveryStrategy should not publish any state when closed / cancelled and there should always be a pause between recoveries even when recoveries are rapidly stopped and started as well as when a node attempts to become the leader for a shard.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1658236 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
908da87486
commit
f2bf0e2235
|
@ -619,6 +619,12 @@ Bug Fixes
|
||||||
* SOLR-6920: A replicated index can end up corrupted when small files end up with the same
|
* SOLR-6920: A replicated index can end up corrupted when small files end up with the same
|
||||||
file name and size. (Varun Thacker, Mark Miller)
|
file name and size. (Varun Thacker, Mark Miller)
|
||||||
|
|
||||||
|
* SOLR-7033, SOLR-5961: RecoveryStrategy should not publish any state when
|
||||||
|
closed / cancelled and there should always be a pause between recoveries
|
||||||
|
even when recoveries are rapidly stopped and started as well as when a
|
||||||
|
node attempts to become the leader for a shard.
|
||||||
|
(Mark Miller, Maxim Novikov)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,68 @@
|
||||||
|
package org.apache.solr.cloud;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
// this class may be accessed by multiple threads, but only one at a time
|
||||||
|
public class ActionThrottle {
|
||||||
|
private static Logger log = LoggerFactory.getLogger(ActionThrottle.class);
|
||||||
|
|
||||||
|
private volatile long lastActionStartedAt;
|
||||||
|
private volatile long minMsBetweenActions;
|
||||||
|
|
||||||
|
private final String name;
|
||||||
|
|
||||||
|
public ActionThrottle(String name, long minMsBetweenActions) {
|
||||||
|
this.name = name;
|
||||||
|
this.minMsBetweenActions = minMsBetweenActions;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void markAttemptingAction() {
|
||||||
|
lastActionStartedAt = System.nanoTime();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void minimumWaitBetweenActions() {
|
||||||
|
if (lastActionStartedAt == 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
long diff = System.nanoTime() - lastActionStartedAt;
|
||||||
|
int diffMs = (int) TimeUnit.MILLISECONDS.convert(diff, TimeUnit.NANOSECONDS);
|
||||||
|
long minNsBetweenActions = TimeUnit.NANOSECONDS.convert(minMsBetweenActions, TimeUnit.MILLISECONDS);
|
||||||
|
log.info("The last {} attempt started {}ms ago.", name, diffMs);
|
||||||
|
int sleep = 0;
|
||||||
|
|
||||||
|
if (diffMs > 0 && diff < minNsBetweenActions) {
|
||||||
|
sleep = (int) TimeUnit.MILLISECONDS.convert(minNsBetweenActions - diff, TimeUnit.NANOSECONDS);
|
||||||
|
} else if (diffMs == 0) {
|
||||||
|
sleep = (int) minMsBetweenActions;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sleep > 0) {
|
||||||
|
log.info("Throttling {} attempts - waiting for {}ms", name, sleep);
|
||||||
|
try {
|
||||||
|
Thread.sleep(sleep);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
Thread.currentThread().interrupt();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -200,6 +200,21 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
|
||||||
log.info("Running the leader process for shard " + shardId);
|
log.info("Running the leader process for shard " + shardId);
|
||||||
|
|
||||||
String coreName = leaderProps.getStr(ZkStateReader.CORE_NAME_PROP);
|
String coreName = leaderProps.getStr(ZkStateReader.CORE_NAME_PROP);
|
||||||
|
ActionThrottle lt;
|
||||||
|
try (SolrCore core = cc.getCore(coreName)) {
|
||||||
|
|
||||||
|
if (core == null) {
|
||||||
|
cancelElection();
|
||||||
|
throw new SolrException(ErrorCode.SERVER_ERROR,
|
||||||
|
"SolrCore not found:" + coreName + " in "
|
||||||
|
+ cc.getCoreNames());
|
||||||
|
}
|
||||||
|
|
||||||
|
lt = core.getUpdateHandler().getSolrCoreState().getLeaderThrottle();
|
||||||
|
}
|
||||||
|
|
||||||
|
lt.minimumWaitBetweenActions();
|
||||||
|
lt.markAttemptingAction();
|
||||||
|
|
||||||
// clear the leader in clusterstate
|
// clear the leader in clusterstate
|
||||||
ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, OverseerAction.LEADER.toLower(),
|
ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, OverseerAction.LEADER.toLower(),
|
||||||
|
@ -217,7 +232,7 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
|
||||||
if (core == null) {
|
if (core == null) {
|
||||||
cancelElection();
|
cancelElection();
|
||||||
throw new SolrException(ErrorCode.SERVER_ERROR,
|
throw new SolrException(ErrorCode.SERVER_ERROR,
|
||||||
"Fatal Error, SolrCore not found:" + coreName + " in "
|
"SolrCore not found:" + coreName + " in "
|
||||||
+ cc.getCoreNames());
|
+ cc.getCoreNames());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -68,8 +68,7 @@ import java.util.concurrent.Future;
|
||||||
|
|
||||||
public class RecoveryStrategy extends Thread implements ClosableThread {
|
public class RecoveryStrategy extends Thread implements ClosableThread {
|
||||||
private static final int MAX_RETRIES = 500;
|
private static final int MAX_RETRIES = 500;
|
||||||
private static final int INTERRUPTED = MAX_RETRIES + 1;
|
private static final int STARTING_RECOVERY_DELAY = 5000;
|
||||||
private static final int STARTING_RECOVERY_DELAY = 1000;
|
|
||||||
|
|
||||||
private static final String REPLICATION_HANDLER = "/replication";
|
private static final String REPLICATION_HANDLER = "/replication";
|
||||||
|
|
||||||
|
@ -93,6 +92,7 @@ public class RecoveryStrategy extends Thread implements ClosableThread {
|
||||||
private CoreContainer cc;
|
private CoreContainer cc;
|
||||||
private volatile HttpUriRequest prevSendPreRecoveryHttpUriRequest;
|
private volatile HttpUriRequest prevSendPreRecoveryHttpUriRequest;
|
||||||
|
|
||||||
|
// this should only be used from SolrCoreState
|
||||||
public RecoveryStrategy(CoreContainer cc, CoreDescriptor cd, RecoveryListener recoveryListener) {
|
public RecoveryStrategy(CoreContainer cc, CoreDescriptor cd, RecoveryListener recoveryListener) {
|
||||||
this.cc = cc;
|
this.cc = cc;
|
||||||
this.coreName = cd.getName();
|
this.coreName = cd.getName();
|
||||||
|
@ -159,7 +159,7 @@ public class RecoveryStrategy extends Thread implements ClosableThread {
|
||||||
ModifiableSolrParams solrParams = new ModifiableSolrParams();
|
ModifiableSolrParams solrParams = new ModifiableSolrParams();
|
||||||
solrParams.set(ReplicationHandler.MASTER_URL, leaderUrl);
|
solrParams.set(ReplicationHandler.MASTER_URL, leaderUrl);
|
||||||
|
|
||||||
if (isClosed()) retries = INTERRUPTED;
|
if (isClosed()) return; // we check closed on return
|
||||||
boolean success = replicationHandler.doFetch(solrParams, false);
|
boolean success = replicationHandler.doFetch(solrParams, false);
|
||||||
|
|
||||||
if (!success) {
|
if (!success) {
|
||||||
|
@ -233,12 +233,10 @@ public class RecoveryStrategy extends Thread implements ClosableThread {
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
Thread.currentThread().interrupt();
|
Thread.currentThread().interrupt();
|
||||||
SolrException.log(log, "", e);
|
SolrException.log(log, "", e);
|
||||||
throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "",
|
throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
|
||||||
e);
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.error("", e);
|
log.error("", e);
|
||||||
throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR,
|
throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
|
||||||
"", e);
|
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
SolrRequestInfo.clearRequestInfo();
|
SolrRequestInfo.clearRequestInfo();
|
||||||
|
@ -465,7 +463,7 @@ public class RecoveryStrategy extends Thread implements ClosableThread {
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
Thread.currentThread().interrupt();
|
Thread.currentThread().interrupt();
|
||||||
log.warn("Recovery was interrupted", e);
|
log.warn("Recovery was interrupted", e);
|
||||||
retries = INTERRUPTED;
|
close = true;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
SolrException.log(log, "Error while trying to recover", e);
|
SolrException.log(log, "Error while trying to recover", e);
|
||||||
} finally {
|
} finally {
|
||||||
|
@ -489,38 +487,22 @@ public class RecoveryStrategy extends Thread implements ClosableThread {
|
||||||
// Or do a fall off retry...
|
// Or do a fall off retry...
|
||||||
try {
|
try {
|
||||||
|
|
||||||
log.error("Recovery failed - trying again... (" + retries + ") core=" + coreName);
|
|
||||||
|
|
||||||
if (isClosed()) {
|
if (isClosed()) {
|
||||||
retries = INTERRUPTED;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
log.error("Recovery failed - trying again... (" + retries + ") core=" + coreName);
|
||||||
|
|
||||||
retries++;
|
retries++;
|
||||||
if (retries >= MAX_RETRIES) {
|
if (retries >= MAX_RETRIES) {
|
||||||
if (retries >= INTERRUPTED) {
|
SolrException.log(log, "Recovery failed - max retries exceeded (" + retries + "). core=" + coreName);
|
||||||
SolrException.log(log, "Recovery failed - interrupted. core="
|
try {
|
||||||
+ coreName);
|
recoveryFailed(core, zkController, baseUrl, coreZkNodeName, core.getCoreDescriptor());
|
||||||
try {
|
} catch (Exception e) {
|
||||||
recoveryFailed(core, zkController, baseUrl, coreZkNodeName,
|
SolrException.log(log, "Could not publish that recovery failed", e);
|
||||||
core.getCoreDescriptor());
|
|
||||||
} catch (Exception e) {
|
|
||||||
SolrException.log(log,
|
|
||||||
"Could not publish that recovery failed", e);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
SolrException.log(log,
|
|
||||||
"Recovery failed - max retries exceeded (" + retries + "). core=" + coreName);
|
|
||||||
try {
|
|
||||||
recoveryFailed(core, zkController, baseUrl, coreZkNodeName,
|
|
||||||
core.getCoreDescriptor());
|
|
||||||
} catch (Exception e) {
|
|
||||||
SolrException.log(log,
|
|
||||||
"Could not publish that recovery failed", e);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
SolrException.log(log, "core=" + coreName, e);
|
SolrException.log(log, "core=" + coreName, e);
|
||||||
}
|
}
|
||||||
|
@ -536,7 +518,7 @@ public class RecoveryStrategy extends Thread implements ClosableThread {
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
Thread.currentThread().interrupt();
|
Thread.currentThread().interrupt();
|
||||||
log.warn("Recovery was interrupted. core=" + coreName, e);
|
log.warn("Recovery was interrupted. core=" + coreName, e);
|
||||||
retries = INTERRUPTED;
|
close = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -23,6 +23,7 @@ import java.util.concurrent.locks.ReentrantLock;
|
||||||
|
|
||||||
import org.apache.lucene.index.IndexWriter;
|
import org.apache.lucene.index.IndexWriter;
|
||||||
import org.apache.solr.cloud.RecoveryStrategy;
|
import org.apache.solr.cloud.RecoveryStrategy;
|
||||||
|
import org.apache.solr.cloud.ActionThrottle;
|
||||||
import org.apache.solr.common.SolrException;
|
import org.apache.solr.common.SolrException;
|
||||||
import org.apache.solr.common.SolrException.ErrorCode;
|
import org.apache.solr.common.SolrException.ErrorCode;
|
||||||
import org.apache.solr.core.CoreContainer;
|
import org.apache.solr.core.CoreContainer;
|
||||||
|
@ -40,6 +41,10 @@ public final class DefaultSolrCoreState extends SolrCoreState implements Recover
|
||||||
|
|
||||||
private final Object recoveryLock = new Object();
|
private final Object recoveryLock = new Object();
|
||||||
|
|
||||||
|
private final ActionThrottle recoveryThrottle = new ActionThrottle("recovery", 10000);
|
||||||
|
|
||||||
|
private final ActionThrottle leaderThrottle = new ActionThrottle("leader", 5000);
|
||||||
|
|
||||||
// protects pauseWriter and writerFree
|
// protects pauseWriter and writerFree
|
||||||
private final Object writerPauseLock = new Object();
|
private final Object writerPauseLock = new Object();
|
||||||
|
|
||||||
|
@ -313,6 +318,9 @@ public final class DefaultSolrCoreState extends SolrCoreState implements Recover
|
||||||
// if true, we are recovering after startup and shouldn't have (or be receiving) additional updates (except for local tlog recovery)
|
// if true, we are recovering after startup and shouldn't have (or be receiving) additional updates (except for local tlog recovery)
|
||||||
boolean recoveringAfterStartup = recoveryStrat == null;
|
boolean recoveringAfterStartup = recoveryStrat == null;
|
||||||
|
|
||||||
|
recoveryThrottle.minimumWaitBetweenActions();
|
||||||
|
recoveryThrottle.markAttemptingAction();
|
||||||
|
|
||||||
recoveryStrat = new RecoveryStrategy(cc, cd, this);
|
recoveryStrat = new RecoveryStrategy(cc, cd, this);
|
||||||
recoveryStrat.setRecoveringAfterStartup(recoveringAfterStartup);
|
recoveryStrat.setRecoveringAfterStartup(recoveringAfterStartup);
|
||||||
recoveryStrat.start();
|
recoveryStrat.start();
|
||||||
|
@ -364,4 +372,10 @@ public final class DefaultSolrCoreState extends SolrCoreState implements Recover
|
||||||
return commitLock;
|
return commitLock;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ActionThrottle getLeaderThrottle() {
|
||||||
|
return leaderThrottle;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,6 +21,7 @@ import java.io.IOException;
|
||||||
import java.util.concurrent.locks.Lock;
|
import java.util.concurrent.locks.Lock;
|
||||||
|
|
||||||
import org.apache.lucene.index.IndexWriter;
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
import org.apache.solr.cloud.ActionThrottle;
|
||||||
import org.apache.solr.core.CoreContainer;
|
import org.apache.solr.core.CoreContainer;
|
||||||
import org.apache.solr.core.CoreDescriptor;
|
import org.apache.solr.core.CoreDescriptor;
|
||||||
import org.apache.solr.core.DirectoryFactory;
|
import org.apache.solr.core.DirectoryFactory;
|
||||||
|
@ -140,4 +141,9 @@ public abstract class SolrCoreState {
|
||||||
|
|
||||||
public abstract void close(IndexWriterCloser closer);
|
public abstract void close(IndexWriterCloser closer);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return throttle to limit how fast a core attempts to become leader
|
||||||
|
*/
|
||||||
|
public abstract ActionThrottle getLeaderThrottle();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,63 @@
|
||||||
|
package org.apache.solr.cloud;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
import org.apache.solr.SolrTestCaseJ4;
|
||||||
|
import org.junit.Test;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
public class ActionThrottleTest extends SolrTestCaseJ4 {
|
||||||
|
protected static Logger log = LoggerFactory.getLogger(ActionThrottleTest.class);
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testBasics() throws Exception {
|
||||||
|
|
||||||
|
ActionThrottle at = new ActionThrottle("test", 1000);
|
||||||
|
long start = System.nanoTime();
|
||||||
|
|
||||||
|
at.minimumWaitBetweenActions();
|
||||||
|
|
||||||
|
// should be no wait
|
||||||
|
assertTrue(TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS) < 1000);
|
||||||
|
at.markAttemptingAction();
|
||||||
|
|
||||||
|
if (random().nextBoolean()) Thread.sleep(100);
|
||||||
|
|
||||||
|
at.minimumWaitBetweenActions();
|
||||||
|
|
||||||
|
long elaspsedTime = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
|
||||||
|
|
||||||
|
assertTrue(elaspsedTime + "ms", elaspsedTime >= 995);
|
||||||
|
|
||||||
|
start = System.nanoTime();
|
||||||
|
|
||||||
|
at.markAttemptingAction();
|
||||||
|
at.minimumWaitBetweenActions();
|
||||||
|
|
||||||
|
Thread.sleep(random().nextInt(1000));
|
||||||
|
|
||||||
|
elaspsedTime = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
|
||||||
|
|
||||||
|
assertTrue(elaspsedTime + "ms", elaspsedTime >= 995);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue