mirror of https://github.com/apache/lucene.git
SOLR-14928: add exponential backoff wait time when Compare And Swap fails in distributed cluster state update due to concurrent update (#2438)
This commit is contained in:
parent
988a16fe95
commit
1fff174690
|
@ -18,6 +18,7 @@
|
||||||
package org.apache.solr.cloud;
|
package org.apache.solr.cloud;
|
||||||
|
|
||||||
import org.apache.solr.client.solrj.cloud.SolrCloudManager;
|
import org.apache.solr.client.solrj.cloud.SolrCloudManager;
|
||||||
|
import org.apache.solr.cloud.api.collections.CollectionHandlingUtils;
|
||||||
import org.apache.solr.cloud.overseer.*;
|
import org.apache.solr.cloud.overseer.*;
|
||||||
import org.apache.solr.common.SolrException;
|
import org.apache.solr.common.SolrException;
|
||||||
import org.apache.solr.common.cloud.*;
|
import org.apache.solr.common.cloud.*;
|
||||||
|
@ -446,9 +447,11 @@ public class DistributedClusterStateUpdater {
|
||||||
}
|
}
|
||||||
// We've tried to update an existing state.json and got a BadVersionException. We'll try again a few times.
|
// We've tried to update an existing state.json and got a BadVersionException. We'll try again a few times.
|
||||||
// When only two threads compete, no point in waiting: if we lost this time we'll get it next time right away.
|
// When only two threads compete, no point in waiting: if we lost this time we'll get it next time right away.
|
||||||
// But if more threads compete, then waiting a bit (random delay) can improve our chances. The delay should likely
|
// But if more threads compete, then waiting a bit (random delay) can improve our chances. The delay should in
|
||||||
// be proportional to the time between reading the cluster state and updating it. We can measure it in the loop above.
|
// theory grow as the number of concurrent threads attempting updates increases, but we don't know that number, so
|
||||||
|
// we use exponential backoff instead.
|
||||||
// With "per replica states" collections, concurrent attempts of even just two threads are expected to be extremely rare.
|
// With "per replica states" collections, concurrent attempts of even just two threads are expected to be extremely rare.
|
||||||
|
Thread.sleep(CollectionHandlingUtils.RANDOM.nextInt(attempt < 13 ? 1 << attempt : 1 << 13)); // max wait 2^13ms=8.192 sec
|
||||||
}
|
}
|
||||||
|
|
||||||
// We made quite a few attempts but failed repeatedly. This is pretty bad but we can't loop trying forever.
|
// We made quite a few attempts but failed repeatedly. This is pretty bad but we can't loop trying forever.
|
||||||
|
|
|
@ -111,7 +111,7 @@ public class CollectionHandlingUtils {
|
||||||
DocCollection.PER_REPLICA_STATE, null,
|
DocCollection.PER_REPLICA_STATE, null,
|
||||||
ZkStateReader.PULL_REPLICAS, "0"));
|
ZkStateReader.PULL_REPLICAS, "0"));
|
||||||
|
|
||||||
protected static final Random RANDOM;
|
public static final Random RANDOM;
|
||||||
static {
|
static {
|
||||||
// We try to make things reproducible in the context of our tests by initializing the random instance
|
// We try to make things reproducible in the context of our tests by initializing the random instance
|
||||||
// based on the current seed
|
// based on the current seed
|
||||||
|
|
Loading…
Reference in New Issue