SOLR-14928: add exponential backoff wait time when Compare And Swap fails in distributed cluster state update due to concurrent update (#2438)

This commit is contained in:
Ilan Ginzburg 2021-02-28 00:53:42 +01:00 committed by GitHub
parent 988a16fe95
commit 1fff174690
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 6 additions and 3 deletions

View File

@ -18,6 +18,7 @@
package org.apache.solr.cloud; package org.apache.solr.cloud;
import org.apache.solr.client.solrj.cloud.SolrCloudManager; import org.apache.solr.client.solrj.cloud.SolrCloudManager;
import org.apache.solr.cloud.api.collections.CollectionHandlingUtils;
import org.apache.solr.cloud.overseer.*; import org.apache.solr.cloud.overseer.*;
import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.*; import org.apache.solr.common.cloud.*;
@ -446,9 +447,11 @@ public class DistributedClusterStateUpdater {
} }
// We've tried to update an existing state.json and got a BadVersionException. We'll try again a few times. // We've tried to update an existing state.json and got a BadVersionException. We'll try again a few times.
// When only two threads compete, no point in waiting: if we lost this time we'll get it next time right away. // When only two threads compete, no point in waiting: if we lost this time we'll get it next time right away.
// But if more threads compete, then waiting a bit (random delay) can improve our chances. The delay should likely // But if more threads compete, then waiting a bit (random delay) can improve our chances. The delay should in
// be proportional to the time between reading the cluster state and updating it. We can measure it in the loop above. // theory grow as the number of concurrent threads attempting updates increases, but we don't know that number, so
// we use exponential backoff instead.
// With "per replica states" collections, concurrent attempts of even just two threads are expected to be extremely rare. // With "per replica states" collections, concurrent attempts of even just two threads are expected to be extremely rare.
Thread.sleep(CollectionHandlingUtils.RANDOM.nextInt(attempt < 13 ? 1 << attempt : 1 << 13)); // max wait 2^13ms=8.192 sec
} }
// We made quite a few attempts but failed repeatedly. This is pretty bad but we can't loop trying forever. // We made quite a few attempts but failed repeatedly. This is pretty bad but we can't loop trying forever.

View File

@ -111,7 +111,7 @@ public class CollectionHandlingUtils {
DocCollection.PER_REPLICA_STATE, null, DocCollection.PER_REPLICA_STATE, null,
ZkStateReader.PULL_REPLICAS, "0")); ZkStateReader.PULL_REPLICAS, "0"));
protected static final Random RANDOM; public static final Random RANDOM;
static { static {
// We try to make things reproducible in the context of our tests by initializing the random instance // We try to make things reproducible in the context of our tests by initializing the random instance
// based on the current seed // based on the current seed