Init global checkpoint after copy commit in peer recovery (#40823)
Today a new replica of a closed index does not have a safe commit invariant when its engine is opened because we won't initialize the global checkpoint on a recovering replica until the finalize step. With this change, we can achieve that property by creating a new translog with the global checkpoint from the primary at the end of phase 1.
This commit is contained in:
parent
79c7a57737
commit
e9999dfa1d
|
@ -103,15 +103,16 @@ public class ReadOnlyEngine extends Engine {
|
|||
this.translogStats = translogStats == null ? new TranslogStats(0, 0, 0, 0, 0) : translogStats;
|
||||
if (seqNoStats == null) {
|
||||
seqNoStats = buildSeqNoStats(lastCommittedSegmentInfos);
|
||||
// During a peer-recovery the global checkpoint is not known and up to date when the engine
|
||||
// is created, so we only check the max seq no / global checkpoint coherency when the global
|
||||
// Before 8.0 the global checkpoint is not known and up to date when the engine is created after
|
||||
// peer recovery, so we only check the max seq no / global checkpoint coherency when the global
|
||||
// checkpoint is different from the unassigned sequence number value.
|
||||
// In addition to that we only execute the check if the index the engine belongs to has been
|
||||
// created after the refactoring of the Close Index API and its TransportVerifyShardBeforeCloseAction
|
||||
// that guarantee that all operations have been flushed to Lucene.
|
||||
final long globalCheckpoint = engineConfig.getGlobalCheckpointSupplier().getAsLong();
|
||||
if (globalCheckpoint != SequenceNumbers.UNASSIGNED_SEQ_NO
|
||||
&& engineConfig.getIndexSettings().getIndexVersionCreated().onOrAfter(Version.V_6_7_0)) {
|
||||
final Version indexVersionCreated = engineConfig.getIndexSettings().getIndexVersionCreated();
|
||||
if (indexVersionCreated.onOrAfter(Version.V_7_1_0) ||
|
||||
(globalCheckpoint != SequenceNumbers.UNASSIGNED_SEQ_NO && indexVersionCreated.onOrAfter(Version.V_6_7_0))) {
|
||||
if (seqNoStats.getMaxSeqNo() != globalCheckpoint) {
|
||||
assertMaxSeqNoEqualsToGlobalCheckpoint(seqNoStats.getMaxSeqNo(), globalCheckpoint);
|
||||
throw new IllegalStateException("Maximum sequence number [" + seqNoStats.getMaxSeqNo()
|
||||
|
|
|
@ -115,8 +115,8 @@ public class PeerRecoveryTargetService implements IndexEventListener {
|
|||
FilesInfoRequestHandler());
|
||||
transportService.registerRequestHandler(Actions.FILE_CHUNK, RecoveryFileChunkRequest::new, ThreadPool.Names.GENERIC, new
|
||||
FileChunkTransportRequestHandler());
|
||||
transportService.registerRequestHandler(Actions.CLEAN_FILES, RecoveryCleanFilesRequest::new, ThreadPool.Names.GENERIC, new
|
||||
CleanFilesRequestHandler());
|
||||
transportService.registerRequestHandler(Actions.CLEAN_FILES, ThreadPool.Names.GENERIC,
|
||||
RecoveryCleanFilesRequest::new, new CleanFilesRequestHandler());
|
||||
transportService.registerRequestHandler(Actions.PREPARE_TRANSLOG, ThreadPool.Names.GENERIC,
|
||||
RecoveryPrepareForTranslogOperationsRequest::new, new PrepareForTranslogOperationsRequestHandler());
|
||||
transportService.registerRequestHandler(Actions.TRANSLOG_OPS, RecoveryTranslogOperationsRequest::new, ThreadPool.Names.GENERIC,
|
||||
|
@ -540,7 +540,7 @@ public class PeerRecoveryTargetService implements IndexEventListener {
|
|||
public void messageReceived(RecoveryCleanFilesRequest request, TransportChannel channel, Task task) throws Exception {
|
||||
try (RecoveryRef recoveryRef = onGoingRecoveries.getRecoverySafe(request.recoveryId(), request.shardId()
|
||||
)) {
|
||||
recoveryRef.target().cleanFiles(request.totalTranslogOps(), request.sourceMetaSnapshot());
|
||||
recoveryRef.target().cleanFiles(request.totalTranslogOps(), request.getGlobalCheckpoint(), request.sourceMetaSnapshot());
|
||||
channel.sendResponse(TransportResponse.Empty.INSTANCE);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,8 +19,10 @@
|
|||
|
||||
package org.elasticsearch.indices.recovery;
|
||||
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.common.io.stream.StreamInput;
|
||||
import org.elasticsearch.common.io.stream.StreamOutput;
|
||||
import org.elasticsearch.index.seqno.SequenceNumbers;
|
||||
import org.elasticsearch.index.shard.ShardId;
|
||||
import org.elasticsearch.index.store.Store;
|
||||
import org.elasticsearch.transport.TransportRequest;
|
||||
|
@ -29,20 +31,48 @@ import java.io.IOException;
|
|||
|
||||
public class RecoveryCleanFilesRequest extends TransportRequest {
|
||||
|
||||
private long recoveryId;
|
||||
private ShardId shardId;
|
||||
private final long recoveryId;
|
||||
private final ShardId shardId;
|
||||
private final Store.MetadataSnapshot snapshotFiles;
|
||||
private final int totalTranslogOps;
|
||||
private final long globalCheckpoint;
|
||||
|
||||
private Store.MetadataSnapshot snapshotFiles;
|
||||
private int totalTranslogOps = RecoveryState.Translog.UNKNOWN;
|
||||
|
||||
public RecoveryCleanFilesRequest() {
|
||||
}
|
||||
|
||||
RecoveryCleanFilesRequest(long recoveryId, ShardId shardId, Store.MetadataSnapshot snapshotFiles, int totalTranslogOps) {
|
||||
RecoveryCleanFilesRequest(long recoveryId, ShardId shardId, Store.MetadataSnapshot snapshotFiles,
|
||||
int totalTranslogOps, long globalCheckpoint) {
|
||||
this.recoveryId = recoveryId;
|
||||
this.shardId = shardId;
|
||||
this.snapshotFiles = snapshotFiles;
|
||||
this.totalTranslogOps = totalTranslogOps;
|
||||
this.globalCheckpoint = globalCheckpoint;
|
||||
}
|
||||
|
||||
RecoveryCleanFilesRequest(StreamInput in) throws IOException {
|
||||
super(in);
|
||||
recoveryId = in.readLong();
|
||||
shardId = ShardId.readShardId(in);
|
||||
snapshotFiles = new Store.MetadataSnapshot(in);
|
||||
totalTranslogOps = in.readVInt();
|
||||
if (in.getVersion().onOrAfter(Version.V_7_1_0)) {
|
||||
globalCheckpoint = in.readZLong();
|
||||
} else {
|
||||
globalCheckpoint = SequenceNumbers.UNASSIGNED_SEQ_NO;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeTo(StreamOutput out) throws IOException {
|
||||
super.writeTo(out);
|
||||
out.writeLong(recoveryId);
|
||||
shardId.writeTo(out);
|
||||
snapshotFiles.writeTo(out);
|
||||
out.writeVInt(totalTranslogOps);
|
||||
if (out.getVersion().onOrAfter(Version.V_7_1_0)) {
|
||||
out.writeZLong(globalCheckpoint);
|
||||
}
|
||||
}
|
||||
|
||||
public Store.MetadataSnapshot sourceMetaSnapshot() {
|
||||
return snapshotFiles;
|
||||
}
|
||||
|
||||
public long recoveryId() {
|
||||
|
@ -53,29 +83,11 @@ public class RecoveryCleanFilesRequest extends TransportRequest {
|
|||
return shardId;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void readFrom(StreamInput in) throws IOException {
|
||||
super.readFrom(in);
|
||||
recoveryId = in.readLong();
|
||||
shardId = ShardId.readShardId(in);
|
||||
snapshotFiles = new Store.MetadataSnapshot(in);
|
||||
totalTranslogOps = in.readVInt();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeTo(StreamOutput out) throws IOException {
|
||||
super.writeTo(out);
|
||||
out.writeLong(recoveryId);
|
||||
shardId.writeTo(out);
|
||||
snapshotFiles.writeTo(out);
|
||||
out.writeVInt(totalTranslogOps);
|
||||
}
|
||||
|
||||
public Store.MetadataSnapshot sourceMetaSnapshot() {
|
||||
return snapshotFiles;
|
||||
}
|
||||
|
||||
public int totalTranslogOps() {
|
||||
return totalTranslogOps;
|
||||
}
|
||||
|
||||
public long getGlobalCheckpoint() {
|
||||
return globalCheckpoint;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -177,7 +177,7 @@ public class RecoverySourceHandler {
|
|||
startingSeqNo = 0;
|
||||
try {
|
||||
final int estimateNumOps = shard.estimateNumberOfHistoryOperations("peer-recovery", startingSeqNo);
|
||||
sendFileResult = phase1(phase1Snapshot.getIndexCommit(), () -> estimateNumOps);
|
||||
sendFileResult = phase1(phase1Snapshot.getIndexCommit(), shard.getGlobalCheckpoint(), () -> estimateNumOps);
|
||||
} catch (final Exception e) {
|
||||
throw new RecoveryEngineException(shard.shardId(), 1, "phase1 failed", e);
|
||||
} finally {
|
||||
|
@ -332,7 +332,7 @@ public class RecoverySourceHandler {
|
|||
* segments that are missing. Only segments that have the same size and
|
||||
* checksum can be reused
|
||||
*/
|
||||
public SendFileResult phase1(final IndexCommit snapshot, final Supplier<Integer> translogOps) {
|
||||
public SendFileResult phase1(final IndexCommit snapshot, final long globalCheckpoint, final Supplier<Integer> translogOps) {
|
||||
cancellableThreads.checkForCancel();
|
||||
// Total size of segment files that are recovered
|
||||
long totalSize = 0;
|
||||
|
@ -422,7 +422,7 @@ public class RecoverySourceHandler {
|
|||
// are deleted
|
||||
try {
|
||||
cancellableThreads.executeIO(() ->
|
||||
recoveryTarget.cleanFiles(translogOps.get(), recoverySourceMetadata));
|
||||
recoveryTarget.cleanFiles(translogOps.get(), globalCheckpoint, recoverySourceMetadata));
|
||||
} catch (RemoteTransportException | IOException targetException) {
|
||||
final IOException corruptIndexException;
|
||||
// we realized that after the index was copied and we wanted to finalize the recovery
|
||||
|
|
|
@ -288,6 +288,9 @@ public class RecoveryTarget extends AbstractRefCounted implements RecoveryTarget
|
|||
ActionListener.completeWith(listener, () -> {
|
||||
state().getTranslog().totalOperations(totalTranslogOps);
|
||||
indexShard().openEngineAndSkipTranslogRecovery();
|
||||
assert indexShard.getGlobalCheckpoint() >= indexShard.seqNoStats().getMaxSeqNo() ||
|
||||
indexShard.indexSettings().getIndexVersionCreated().before(Version.V_7_1_0)
|
||||
: "global checkpoint is not initialized [" + indexShard.seqNoStats() + "]";
|
||||
return null;
|
||||
});
|
||||
}
|
||||
|
@ -382,7 +385,7 @@ public class RecoveryTarget extends AbstractRefCounted implements RecoveryTarget
|
|||
}
|
||||
|
||||
@Override
|
||||
public void cleanFiles(int totalTranslogOps, Store.MetadataSnapshot sourceMetaData) throws IOException {
|
||||
public void cleanFiles(int totalTranslogOps, long globalCheckpoint, Store.MetadataSnapshot sourceMetaData) throws IOException {
|
||||
state().getTranslog().totalOperations(totalTranslogOps);
|
||||
// first, we go and move files that were created with the recovery id suffix to
|
||||
// the actual names, its ok if we have a corrupted index here, since we have replicas
|
||||
|
@ -395,10 +398,11 @@ public class RecoveryTarget extends AbstractRefCounted implements RecoveryTarget
|
|||
if (indexShard.indexSettings().getIndexVersionCreated().before(Version.V_6_0_0_rc1)) {
|
||||
store.ensureIndexHasHistoryUUID();
|
||||
}
|
||||
// TODO: Assign the global checkpoint to the max_seqno of the safe commit if the index version >= 6.2
|
||||
assert globalCheckpoint >= Long.parseLong(sourceMetaData.getCommitUserData().get(SequenceNumbers.MAX_SEQ_NO))
|
||||
|| indexShard.indexSettings().getIndexVersionCreated().before(Version.V_7_1_0) :
|
||||
"invalid global checkpoint[" + globalCheckpoint + "] source_meta_data [" + sourceMetaData.getCommitUserData() + "]";
|
||||
final String translogUUID = Translog.createEmptyTranslog(
|
||||
indexShard.shardPath().resolveTranslog(), SequenceNumbers.UNASSIGNED_SEQ_NO, shardId,
|
||||
indexShard.getPendingPrimaryTerm());
|
||||
indexShard.shardPath().resolveTranslog(), globalCheckpoint, shardId, indexShard.getPendingPrimaryTerm());
|
||||
store.associateIndexWithNewTranslog(translogUUID);
|
||||
|
||||
if (indexShard.getRetentionLeases().leases().isEmpty()) {
|
||||
|
|
|
@ -88,10 +88,12 @@ public interface RecoveryTargetHandler {
|
|||
/**
|
||||
* After all source files has been sent over, this command is sent to the target so it can clean any local
|
||||
* files that are not part of the source store
|
||||
*
|
||||
* @param totalTranslogOps an update number of translog operations that will be replayed later on
|
||||
* @param sourceMetaData meta data of the source store
|
||||
* @param globalCheckpoint the global checkpoint on the primary
|
||||
* @param sourceMetaData meta data of the source store
|
||||
*/
|
||||
void cleanFiles(int totalTranslogOps, Store.MetadataSnapshot sourceMetaData) throws IOException;
|
||||
void cleanFiles(int totalTranslogOps, long globalCheckpoint, Store.MetadataSnapshot sourceMetaData) throws IOException;
|
||||
|
||||
/** writes a partial file chunk to the target store */
|
||||
void writeFileChunk(StoreFileMetaData fileMetaData, long position, BytesReference content,
|
||||
|
|
|
@ -139,9 +139,9 @@ public class RemoteRecoveryTargetHandler implements RecoveryTargetHandler {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void cleanFiles(int totalTranslogOps, Store.MetadataSnapshot sourceMetaData) throws IOException {
|
||||
public void cleanFiles(int totalTranslogOps, long globalCheckpoint, Store.MetadataSnapshot sourceMetaData) throws IOException {
|
||||
transportService.submitRequest(targetNode, PeerRecoveryTargetService.Actions.CLEAN_FILES,
|
||||
new RecoveryCleanFilesRequest(recoveryId, shardId, sourceMetaData, totalTranslogOps),
|
||||
new RecoveryCleanFilesRequest(recoveryId, shardId, sourceMetaData, totalTranslogOps, globalCheckpoint),
|
||||
TransportRequestOptions.builder().withTimeout(recoverySettings.internalActionTimeout()).build(),
|
||||
EmptyTransportResponseHandler.INSTANCE_SAME).txGet();
|
||||
}
|
||||
|
|
|
@ -121,8 +121,9 @@ public class IndexLevelReplicationTests extends ESIndexLevelReplicationTestCase
|
|||
Future<Void> future = shards.asyncRecoverReplica(replica,
|
||||
(indexShard, node) -> new RecoveryTarget(indexShard, node, recoveryListener) {
|
||||
@Override
|
||||
public void cleanFiles(int totalTranslogOps, Store.MetadataSnapshot sourceMetaData) throws IOException {
|
||||
super.cleanFiles(totalTranslogOps, sourceMetaData);
|
||||
public void cleanFiles(int totalTranslogOps, long globalCheckpoint,
|
||||
Store.MetadataSnapshot sourceMetaData) throws IOException {
|
||||
super.cleanFiles(totalTranslogOps, globalCheckpoint, sourceMetaData);
|
||||
latch.countDown();
|
||||
try {
|
||||
latch.await();
|
||||
|
|
|
@ -853,9 +853,9 @@ public class RecoveryDuringReplicationTests extends ESIndexLevelReplicationTestC
|
|||
}
|
||||
|
||||
@Override
|
||||
public void cleanFiles(int totalTranslogOps, Store.MetadataSnapshot sourceMetaData) throws IOException {
|
||||
public void cleanFiles(int totalTranslogOps, long globalCheckpoint, Store.MetadataSnapshot sourceMetaData) throws IOException {
|
||||
blockIfNeeded(RecoveryState.Stage.INDEX);
|
||||
super.cleanFiles(totalTranslogOps, sourceMetaData);
|
||||
super.cleanFiles(totalTranslogOps, globalCheckpoint, sourceMetaData);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -189,7 +189,7 @@ public class PeerRecoveryTargetServiceTests extends IndexShardTestCase {
|
|||
for (Thread sender : senders) {
|
||||
sender.join();
|
||||
}
|
||||
recoveryTarget.cleanFiles(0, sourceSnapshot);
|
||||
recoveryTarget.cleanFiles(0, Long.parseLong(sourceSnapshot.getCommitUserData().get(SequenceNumbers.MAX_SEQ_NO)), sourceSnapshot);
|
||||
recoveryTarget.decRef();
|
||||
Store.MetadataSnapshot targetSnapshot = targetShard.snapshotStoreMetadata();
|
||||
Store.RecoveryDiff diff = sourceSnapshot.recoveryDiff(targetSnapshot);
|
||||
|
|
|
@ -473,9 +473,9 @@ public class RecoverySourceHandlerTests extends ESTestCase {
|
|||
between(1, 8)) {
|
||||
|
||||
@Override
|
||||
public SendFileResult phase1(final IndexCommit snapshot, final Supplier<Integer> translogOps) {
|
||||
public SendFileResult phase1(final IndexCommit snapshot, final long globalCheckpoint, final Supplier<Integer> translogOps) {
|
||||
phase1Called.set(true);
|
||||
return super.phase1(snapshot, translogOps);
|
||||
return super.phase1(snapshot, globalCheckpoint, translogOps);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -715,7 +715,7 @@ public class RecoverySourceHandlerTests extends ESTestCase {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void cleanFiles(int totalTranslogOps, Store.MetadataSnapshot sourceMetaData) {
|
||||
public void cleanFiles(int totalTranslogOps, long globalCheckpoint, Store.MetadataSnapshot sourceMetaData) {
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -25,6 +25,7 @@ import org.apache.lucene.index.IndexCommit;
|
|||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.NoMergePolicy;
|
||||
import org.elasticsearch.action.ActionListener;
|
||||
import org.elasticsearch.action.admin.indices.flush.FlushRequest;
|
||||
import org.elasticsearch.action.index.IndexRequest;
|
||||
import org.elasticsearch.cluster.metadata.IndexMetaData;
|
||||
|
@ -42,9 +43,11 @@ import org.elasticsearch.index.replication.ESIndexLevelReplicationTestCase;
|
|||
import org.elasticsearch.index.replication.RecoveryDuringReplicationTests;
|
||||
import org.elasticsearch.index.seqno.SequenceNumbers;
|
||||
import org.elasticsearch.index.shard.IndexShard;
|
||||
import org.elasticsearch.index.store.Store;
|
||||
import org.elasticsearch.index.translog.SnapshotMatchers;
|
||||
import org.elasticsearch.index.translog.Translog;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
@ -323,7 +326,18 @@ public class RecoveryTests extends ESIndexLevelReplicationTestCase {
|
|||
}
|
||||
IndexShard replicaShard = newShard(primaryShard.shardId(), false);
|
||||
updateMappings(replicaShard, primaryShard.indexSettings().getIndexMetaData());
|
||||
recoverReplica(replicaShard, primaryShard, true);
|
||||
recoverReplica(replicaShard, primaryShard, (r, sourceNode) -> new RecoveryTarget(r, sourceNode, recoveryListener) {
|
||||
@Override
|
||||
public void prepareForTranslogOperations(boolean fileBasedRecovery, int totalTranslogOps, ActionListener<Void> listener) {
|
||||
super.prepareForTranslogOperations(fileBasedRecovery, totalTranslogOps, listener);
|
||||
assertThat(replicaShard.getGlobalCheckpoint(), equalTo(primaryShard.getGlobalCheckpoint()));
|
||||
}
|
||||
@Override
|
||||
public void cleanFiles(int totalTranslogOps, long globalCheckpoint, Store.MetadataSnapshot sourceMetaData) throws IOException {
|
||||
assertThat(globalCheckpoint, equalTo(primaryShard.getGlobalCheckpoint()));
|
||||
super.cleanFiles(totalTranslogOps, globalCheckpoint, sourceMetaData);
|
||||
}
|
||||
}, true, true);
|
||||
List<IndexCommit> commits = DirectoryReader.listCommits(replicaShard.store().directory());
|
||||
long maxSeqNo = Long.parseLong(commits.get(0).getUserData().get(SequenceNumbers.MAX_SEQ_NO));
|
||||
assertThat(maxSeqNo, lessThanOrEqualTo(globalCheckpoint));
|
||||
|
|
|
@ -75,8 +75,8 @@ public class AsyncRecoveryTarget implements RecoveryTargetHandler {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void cleanFiles(int totalTranslogOps, Store.MetadataSnapshot sourceMetaData) throws IOException {
|
||||
target.cleanFiles(totalTranslogOps, sourceMetaData);
|
||||
public void cleanFiles(int totalTranslogOps, long globalCheckpoint, Store.MetadataSnapshot sourceMetaData) throws IOException {
|
||||
target.cleanFiles(totalTranslogOps, globalCheckpoint, sourceMetaData);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
Loading…
Reference in New Issue