Adjust Cleanup Order of Internal State in SnapshotsService (#66225) (#66244)

In the assertion mentioned in the new comment we first get the `endingSnapshots`
and then check that we don't have a listener that isn't referred to by it so we need
to remove from the listers map before removing from `endingSnapshots` to avoid rare, random
assertion tripping here with concurrent repository operations.
This commit is contained in:
Armin Braun 2020-12-14 12:24:39 +01:00 committed by GitHub
parent 416ea4fcdc
commit 7e1fc6dc67
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 13 additions and 5 deletions

View File

@ -1554,9 +1554,7 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
entry.version(),
state -> stateWithoutSnapshot(state, snapshot),
ActionListener.wrap(newRepoData -> {
endingSnapshots.remove(snapshot);
completeListenersIgnoringException(
snapshotCompletionListeners.remove(snapshot), Tuple.tuple(newRepoData, snapshotInfo));
completeListenersIgnoringException(endAndGetListenersToResolve(snapshot), Tuple.tuple(newRepoData, snapshotInfo));
logger.info("snapshot [{}] completed with state [{}]", snapshot, snapshotInfo.state());
runNextQueuedOperation(newRepoData, repository, true);
}, e -> handleFinalizationFailure(e, entry, repositoryData))),
@ -1567,6 +1565,17 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
}
}
/**
* Remove a snapshot from {@link #endingSnapshots} set and return its completion listeners that must be resolved.
*/
private List<ActionListener<Tuple<RepositoryData, SnapshotInfo>>> endAndGetListenersToResolve(Snapshot snapshot) {
// get listeners before removing from the ending snapshots set to not trip assertion in #assertConsistentWithClusterState that
// makes sure we don't have listeners for snapshots that aren't tracked in any internal state of this class
final List<ActionListener<Tuple<RepositoryData, SnapshotInfo>>> listenersToComplete = snapshotCompletionListeners.remove(snapshot);
endingSnapshots.remove(snapshot);
return listenersToComplete;
}
/**
* Handles failure to finalize a snapshot. If the exception indicates that this node was unable to publish a cluster state and stopped
* being the master node, then fail all snapshot create and delete listeners executing on this node by delegating to
@ -1814,8 +1823,7 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
}
private void failSnapshotCompletionListeners(Snapshot snapshot, Exception e) {
endingSnapshots.remove(snapshot);
failListenersIgnoringException(snapshotCompletionListeners.remove(snapshot), e);
failListenersIgnoringException(endAndGetListenersToResolve(snapshot), e);
assert repositoryOperations.assertNotQueued(snapshot);
}