SnapshotShardSizeInfo should prefer default value when provided (#63390) (#63394)

In #61906 we agreed on always providing the default value 
ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE 
when the SnasphotInfoService failed to retrieve the exact 
size for a given snapshot shard. The motivation was to 
allow the shard allocation to move forward in case of 
failures (so that the unassigned shard does not get stuck 
in an unassigned state for too long) while relying on the 
fallback values for shard sizes.

Sadly a bug in the 
SnapshotShardSizeInfo#getShardSize(ShardRouting, long) 
makes the default value to be ignored when the snapshot 
shard size retrieval previously failed, returning 
ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE 
instead of the provided default value. With DiskThresholdDecider 
also not relying on the provided default value this triggers 
some assertion like in #63376 which helped us to spot the bug.

Closes ##63376
This commit is contained in:
Tanguy Leroux 2020-10-07 13:53:05 +02:00 committed by GitHub
parent e4f37d96f4
commit eac99dd594
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 32 additions and 11 deletions

View File

@ -490,9 +490,7 @@ public class DiskThresholdDecider extends AllocationDecider {
return targetShardSize == 0 ? defaultValue : targetShardSize;
} else {
if (shard.unassigned() && shard.recoverySource().getType() == RecoverySource.Type.SNAPSHOT) {
final Long shardSize = snapshotShardSizeInfo.getShardSize(shard);
assert shardSize != null : "no shard size provided for " + shard;
return shardSize;
return snapshotShardSizeInfo.getShardSize(shard, defaultValue);
}
return clusterInfo.getShardSize(shard, defaultValue);
}

View File

@ -48,6 +48,9 @@ public class SnapshotShardSizeInfo {
public long getShardSize(ShardRouting shardRouting, long fallback) {
final Long shardSize = getShardSize(shardRouting);
return shardSize == null ? fallback : shardSize;
if (shardSize == null || shardSize == ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE) {
return fallback;
}
return shardSize;
}
}

View File

@ -64,6 +64,7 @@ import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Function;
import java.util.function.Predicate;
import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_CREATION_DATE;
import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_REPLICAS;
@ -170,9 +171,11 @@ public class InternalSnapshotsInfoServiceTests extends ESTestCase {
verify(rerouteService, times(numberOfShards)).reroute(anyString(), any(Priority.class), any());
assertThat(getShardSnapshotStatusCount.get(), equalTo(numberOfShards));
final SnapshotShardSizeInfo snapshotShardSizeInfo = snapshotsInfoService.snapshotShardSizes();
for (int i = 0; i < numberOfShards; i++) {
final ShardRouting shardRouting = clusterService.state().routingTable().index(indexName).shard(i).primaryShard();
assertThat(snapshotsInfoService.snapshotShardSizes().getShardSize(shardRouting), equalTo(expectedShardSizes[i]));
assertThat(snapshotShardSizeInfo.getShardSize(shardRouting), equalTo(expectedShardSizes[i]));
assertThat(snapshotShardSizeInfo.getShardSize(shardRouting, Long.MIN_VALUE), equalTo(expectedShardSizes[i]));
}
}
@ -182,18 +185,19 @@ public class InternalSnapshotsInfoServiceTests extends ESTestCase {
.put(INTERNAL_SNAPSHOT_INFO_MAX_CONCURRENT_FETCHES_SETTING.getKey(), randomIntBetween(1, 10))
.build(), clusterService, () -> repositoriesService, () -> rerouteService);
final Map<InternalSnapshotsInfoService.SnapshotShard, Boolean> results = new ConcurrentHashMap<>();
final Map<InternalSnapshotsInfoService.SnapshotShard, Long> results = new ConcurrentHashMap<>();
final Repository mockRepository = new FilterRepository(mock(Repository.class)) {
@Override
public IndexShardSnapshotStatus getShardSnapshotStatus(SnapshotId snapshotId, IndexId indexId, ShardId shardId) {
final InternalSnapshotsInfoService.SnapshotShard snapshotShard =
new InternalSnapshotsInfoService.SnapshotShard(new Snapshot("_repo", snapshotId), indexId, shardId);
if (randomBoolean()) {
results.put(snapshotShard, Boolean.FALSE);
results.put(snapshotShard, Long.MIN_VALUE);
throw new SnapshotException(snapshotShard.snapshot(), "simulated");
} else {
results.put(snapshotShard, Boolean.TRUE);
return IndexShardSnapshotStatus.newDone(0L, 0L, 0, 0, 0L, randomNonNegativeLong(), null);
final long shardSize = randomNonNegativeLong();
results.put(snapshotShard, shardSize);
return IndexShardSnapshotStatus.newDone(0L, 0L, 0, 0, 0L, shardSize, null);
}
}
};
@ -218,13 +222,29 @@ public class InternalSnapshotsInfoServiceTests extends ESTestCase {
addSnapshotRestoreIndicesThread.start();
addSnapshotRestoreIndicesThread.join();
final Predicate<Long> failedSnapshotShardSizeRetrieval = shardSize -> shardSize == Long.MIN_VALUE;
assertBusy(() -> {
assertThat(snapshotsInfoService.numberOfKnownSnapshotShardSizes(),
equalTo((int) results.values().stream().filter(result -> result.equals(Boolean.TRUE)).count()));
equalTo((int) results.values().stream().filter(size -> failedSnapshotShardSizeRetrieval.test(size) == false).count()));
assertThat(snapshotsInfoService.numberOfFailedSnapshotShardSizes(),
equalTo((int) results.values().stream().filter(result -> result.equals(Boolean.FALSE)).count()));
equalTo((int) results.values().stream().filter(failedSnapshotShardSizeRetrieval).count()));
assertThat(snapshotsInfoService.numberOfUnknownSnapshotShardSizes(), equalTo(0));
});
final SnapshotShardSizeInfo snapshotShardSizeInfo = snapshotsInfoService.snapshotShardSizes();
for (Map.Entry<InternalSnapshotsInfoService.SnapshotShard, Long> snapshotShard : results.entrySet()) {
final ShardId shardId = snapshotShard.getKey().shardId();
final ShardRouting shardRouting = clusterService.state().routingTable().index(shardId.getIndexName())
.shard(shardId.id()).primaryShard();
assertThat(shardRouting, notNullValue());
final boolean success = failedSnapshotShardSizeRetrieval.test(snapshotShard.getValue()) == false;
assertThat(snapshotShardSizeInfo.getShardSize(shardRouting),
success ? equalTo(results.get(snapshotShard.getKey())) : equalTo(ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE));
final long defaultValue = randomNonNegativeLong();
assertThat(snapshotShardSizeInfo.getShardSize(shardRouting, defaultValue),
success ? equalTo(results.get(snapshotShard.getKey())) : equalTo(defaultValue));
}
}
public void testNoLongerMaster() throws Exception {