From 1fff1746909361aaccab8b5c146026eba9fbeb88 Mon Sep 17 00:00:00 2001 From: Ilan Ginzburg Date: Sun, 28 Feb 2021 00:53:42 +0100 Subject: [PATCH] SOLR-14928: add exponential backoff wait time when Compare And Swap fails in distributed cluster state update due to concurrent update (#2438) --- .../apache/solr/cloud/DistributedClusterStateUpdater.java | 7 +++++-- .../cloud/api/collections/CollectionHandlingUtils.java | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/cloud/DistributedClusterStateUpdater.java b/solr/core/src/java/org/apache/solr/cloud/DistributedClusterStateUpdater.java index f57cc314afe..e48b7ced014 100644 --- a/solr/core/src/java/org/apache/solr/cloud/DistributedClusterStateUpdater.java +++ b/solr/core/src/java/org/apache/solr/cloud/DistributedClusterStateUpdater.java @@ -18,6 +18,7 @@ package org.apache.solr.cloud; import org.apache.solr.client.solrj.cloud.SolrCloudManager; +import org.apache.solr.cloud.api.collections.CollectionHandlingUtils; import org.apache.solr.cloud.overseer.*; import org.apache.solr.common.SolrException; import org.apache.solr.common.cloud.*; @@ -446,9 +447,11 @@ public class DistributedClusterStateUpdater { } // We've tried to update an existing state.json and got a BadVersionException. We'll try again a few times. // When only two threads compete, no point in waiting: if we lost this time we'll get it next time right away. - // But if more threads compete, then waiting a bit (random delay) can improve our chances. The delay should likely - // be proportional to the time between reading the cluster state and updating it. We can measure it in the loop above. + // But if more threads compete, then waiting a bit (random delay) can improve our chances. The delay should in + // theory grow as the number of concurrent threads attempting updates increases, but we don't know that number, so + // we do exponential backoff instead. 
// With "per replica states" collections, concurrent attempts of even just two threads are expected to be extremely rare. + Thread.sleep(CollectionHandlingUtils.RANDOM.nextInt(attempt < 13 ? 1 << attempt : 1 << 13)); // max wait 2^13ms=8.192 sec } // We made quite a few attempts but failed repeatedly. This is pretty bad but we can't loop trying forever. diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/CollectionHandlingUtils.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/CollectionHandlingUtils.java index 439d7c7d690..77075664c45 100644 --- a/solr/core/src/java/org/apache/solr/cloud/api/collections/CollectionHandlingUtils.java +++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/CollectionHandlingUtils.java @@ -111,7 +111,7 @@ public class CollectionHandlingUtils { DocCollection.PER_REPLICA_STATE, null, ZkStateReader.PULL_REPLICAS, "0")); - protected static final Random RANDOM; + public static final Random RANDOM; static { // We try to make things reproducible in the context of our tests by initializing the random instance // based on the current seed