Add global checkpoint assertion in index shard

We previously removed this assertion because it could be violated in
races. This commit adds the assertion back, sampling the recovery stage
before the local checkpoint so that it does not fail solely due to race
conditions.
This commit is contained in:
Jason Tedor 2017-05-08 13:37:28 -04:00
parent bf32b0c59d
commit 89b305c09e
1 changed files with 5 additions and 3 deletions

View File

@ -1521,6 +1521,8 @@ public class IndexShard extends AbstractIndexShardComponent implements IndicesCl
*/ */
public void updateGlobalCheckpointOnReplica(final long globalCheckpoint) { public void updateGlobalCheckpointOnReplica(final long globalCheckpoint) {
verifyReplicationTarget(); verifyReplicationTarget();
// we sample the recovery stage before sampling the local checkpoint or we are subject to a race condition in the below assertion
final RecoveryState.Stage stage = recoveryState().getStage();
final SequenceNumbersService seqNoService = getEngine().seqNoService(); final SequenceNumbersService seqNoService = getEngine().seqNoService();
final long localCheckpoint = seqNoService.getLocalCheckpoint(); final long localCheckpoint = seqNoService.getLocalCheckpoint();
if (globalCheckpoint > localCheckpoint) { if (globalCheckpoint > localCheckpoint) {
@ -1530,10 +1532,10 @@ public class IndexShard extends AbstractIndexShardComponent implements IndicesCl
* case that the global checkpoint update from the primary is ahead of the local checkpoint on this shard. In this case, we * case that the global checkpoint update from the primary is ahead of the local checkpoint on this shard. In this case, we
* ignore the global checkpoint update. This can happen if we are in the translog stage of recovery. Prior to this, the engine * ignore the global checkpoint update. This can happen if we are in the translog stage of recovery. Prior to this, the engine
* is not opened and this shard will not receive global checkpoint updates, and after this the shard will be contributing to * is not opened and this shard will not receive global checkpoint updates, and after this the shard will be contributing to
* calculations of the global checkpoint. However, we can not assert that we are in the translog stage of recovery here as * calculations of the global checkpoint.
* while the global checkpoint update may have emanated from the primary when we were in that state, we could subsequently move
* to recovery finalization, or even finished recovery before the update arrives here.
*/ */
assert stage == RecoveryState.Stage.TRANSLOG
: "expected recovery stage [" + RecoveryState.Stage.TRANSLOG + "] but was [" + stage + "]";
return; return;
} }
seqNoService.updateGlobalCheckpointOnReplica(globalCheckpoint); seqNoService.updateGlobalCheckpointOnReplica(globalCheckpoint);