Init global checkpoint after copy commit in peer recovery (#40823)

Today a new replica of a closed index does not have a safe commit
invariant when its engine is opened because we won't initialize the
global checkpoint on a recovering replica until the finalize step. With
this change, we can achieve that property by creating a new translog
with the global checkpoint from the primary at the end of phase 1.
This commit is contained in:
Nhat Nguyen 2019-04-11 22:18:31 -04:00 committed by GitHub
parent 79c7a57737
commit e9999dfa1d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 94 additions and 60 deletions

View File

@ -103,15 +103,16 @@ public class ReadOnlyEngine extends Engine {
this.translogStats = translogStats == null ? new TranslogStats(0, 0, 0, 0, 0) : translogStats; this.translogStats = translogStats == null ? new TranslogStats(0, 0, 0, 0, 0) : translogStats;
if (seqNoStats == null) { if (seqNoStats == null) {
seqNoStats = buildSeqNoStats(lastCommittedSegmentInfos); seqNoStats = buildSeqNoStats(lastCommittedSegmentInfos);
// During a peer-recovery the global checkpoint is not known and up to date when the engine // Before 8.0 the global checkpoint is not known and up to date when the engine is created after
// is created, so we only check the max seq no / global checkpoint coherency when the global // peer recovery, so we only check the max seq no / global checkpoint coherency when the global
// checkpoint is different from the unassigned sequence number value. // checkpoint is different from the unassigned sequence number value.
// In addition to that we only execute the check if the index the engine belongs to has been // In addition to that we only execute the check if the index the engine belongs to has been
// created after the refactoring of the Close Index API and its TransportVerifyShardBeforeCloseAction // created after the refactoring of the Close Index API and its TransportVerifyShardBeforeCloseAction
// that guarantee that all operations have been flushed to Lucene. // that guarantee that all operations have been flushed to Lucene.
final long globalCheckpoint = engineConfig.getGlobalCheckpointSupplier().getAsLong(); final long globalCheckpoint = engineConfig.getGlobalCheckpointSupplier().getAsLong();
if (globalCheckpoint != SequenceNumbers.UNASSIGNED_SEQ_NO final Version indexVersionCreated = engineConfig.getIndexSettings().getIndexVersionCreated();
&& engineConfig.getIndexSettings().getIndexVersionCreated().onOrAfter(Version.V_6_7_0)) { if (indexVersionCreated.onOrAfter(Version.V_7_1_0) ||
(globalCheckpoint != SequenceNumbers.UNASSIGNED_SEQ_NO && indexVersionCreated.onOrAfter(Version.V_6_7_0))) {
if (seqNoStats.getMaxSeqNo() != globalCheckpoint) { if (seqNoStats.getMaxSeqNo() != globalCheckpoint) {
assertMaxSeqNoEqualsToGlobalCheckpoint(seqNoStats.getMaxSeqNo(), globalCheckpoint); assertMaxSeqNoEqualsToGlobalCheckpoint(seqNoStats.getMaxSeqNo(), globalCheckpoint);
throw new IllegalStateException("Maximum sequence number [" + seqNoStats.getMaxSeqNo() throw new IllegalStateException("Maximum sequence number [" + seqNoStats.getMaxSeqNo()

View File

@ -115,8 +115,8 @@ public class PeerRecoveryTargetService implements IndexEventListener {
FilesInfoRequestHandler()); FilesInfoRequestHandler());
transportService.registerRequestHandler(Actions.FILE_CHUNK, RecoveryFileChunkRequest::new, ThreadPool.Names.GENERIC, new transportService.registerRequestHandler(Actions.FILE_CHUNK, RecoveryFileChunkRequest::new, ThreadPool.Names.GENERIC, new
FileChunkTransportRequestHandler()); FileChunkTransportRequestHandler());
transportService.registerRequestHandler(Actions.CLEAN_FILES, RecoveryCleanFilesRequest::new, ThreadPool.Names.GENERIC, new transportService.registerRequestHandler(Actions.CLEAN_FILES, ThreadPool.Names.GENERIC,
CleanFilesRequestHandler()); RecoveryCleanFilesRequest::new, new CleanFilesRequestHandler());
transportService.registerRequestHandler(Actions.PREPARE_TRANSLOG, ThreadPool.Names.GENERIC, transportService.registerRequestHandler(Actions.PREPARE_TRANSLOG, ThreadPool.Names.GENERIC,
RecoveryPrepareForTranslogOperationsRequest::new, new PrepareForTranslogOperationsRequestHandler()); RecoveryPrepareForTranslogOperationsRequest::new, new PrepareForTranslogOperationsRequestHandler());
transportService.registerRequestHandler(Actions.TRANSLOG_OPS, RecoveryTranslogOperationsRequest::new, ThreadPool.Names.GENERIC, transportService.registerRequestHandler(Actions.TRANSLOG_OPS, RecoveryTranslogOperationsRequest::new, ThreadPool.Names.GENERIC,
@ -540,7 +540,7 @@ public class PeerRecoveryTargetService implements IndexEventListener {
public void messageReceived(RecoveryCleanFilesRequest request, TransportChannel channel, Task task) throws Exception { public void messageReceived(RecoveryCleanFilesRequest request, TransportChannel channel, Task task) throws Exception {
try (RecoveryRef recoveryRef = onGoingRecoveries.getRecoverySafe(request.recoveryId(), request.shardId() try (RecoveryRef recoveryRef = onGoingRecoveries.getRecoverySafe(request.recoveryId(), request.shardId()
)) { )) {
recoveryRef.target().cleanFiles(request.totalTranslogOps(), request.sourceMetaSnapshot()); recoveryRef.target().cleanFiles(request.totalTranslogOps(), request.getGlobalCheckpoint(), request.sourceMetaSnapshot());
channel.sendResponse(TransportResponse.Empty.INSTANCE); channel.sendResponse(TransportResponse.Empty.INSTANCE);
} }
} }

View File

@ -19,8 +19,10 @@
package org.elasticsearch.indices.recovery; package org.elasticsearch.indices.recovery;
import org.elasticsearch.Version;
import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.index.seqno.SequenceNumbers;
import org.elasticsearch.index.shard.ShardId; import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.index.store.Store; import org.elasticsearch.index.store.Store;
import org.elasticsearch.transport.TransportRequest; import org.elasticsearch.transport.TransportRequest;
@ -29,20 +31,48 @@ import java.io.IOException;
public class RecoveryCleanFilesRequest extends TransportRequest { public class RecoveryCleanFilesRequest extends TransportRequest {
private long recoveryId; private final long recoveryId;
private ShardId shardId; private final ShardId shardId;
private final Store.MetadataSnapshot snapshotFiles;
private final int totalTranslogOps;
private final long globalCheckpoint;
private Store.MetadataSnapshot snapshotFiles; RecoveryCleanFilesRequest(long recoveryId, ShardId shardId, Store.MetadataSnapshot snapshotFiles,
private int totalTranslogOps = RecoveryState.Translog.UNKNOWN; int totalTranslogOps, long globalCheckpoint) {
public RecoveryCleanFilesRequest() {
}
RecoveryCleanFilesRequest(long recoveryId, ShardId shardId, Store.MetadataSnapshot snapshotFiles, int totalTranslogOps) {
this.recoveryId = recoveryId; this.recoveryId = recoveryId;
this.shardId = shardId; this.shardId = shardId;
this.snapshotFiles = snapshotFiles; this.snapshotFiles = snapshotFiles;
this.totalTranslogOps = totalTranslogOps; this.totalTranslogOps = totalTranslogOps;
this.globalCheckpoint = globalCheckpoint;
}
RecoveryCleanFilesRequest(StreamInput in) throws IOException {
super(in);
recoveryId = in.readLong();
shardId = ShardId.readShardId(in);
snapshotFiles = new Store.MetadataSnapshot(in);
totalTranslogOps = in.readVInt();
if (in.getVersion().onOrAfter(Version.V_7_1_0)) {
globalCheckpoint = in.readZLong();
} else {
globalCheckpoint = SequenceNumbers.UNASSIGNED_SEQ_NO;
}
}
@Override
public void writeTo(StreamOutput out) throws IOException {
super.writeTo(out);
out.writeLong(recoveryId);
shardId.writeTo(out);
snapshotFiles.writeTo(out);
out.writeVInt(totalTranslogOps);
if (out.getVersion().onOrAfter(Version.V_7_1_0)) {
out.writeZLong(globalCheckpoint);
}
}
public Store.MetadataSnapshot sourceMetaSnapshot() {
return snapshotFiles;
} }
public long recoveryId() { public long recoveryId() {
@ -53,29 +83,11 @@ public class RecoveryCleanFilesRequest extends TransportRequest {
return shardId; return shardId;
} }
@Override
public void readFrom(StreamInput in) throws IOException {
super.readFrom(in);
recoveryId = in.readLong();
shardId = ShardId.readShardId(in);
snapshotFiles = new Store.MetadataSnapshot(in);
totalTranslogOps = in.readVInt();
}
@Override
public void writeTo(StreamOutput out) throws IOException {
super.writeTo(out);
out.writeLong(recoveryId);
shardId.writeTo(out);
snapshotFiles.writeTo(out);
out.writeVInt(totalTranslogOps);
}
public Store.MetadataSnapshot sourceMetaSnapshot() {
return snapshotFiles;
}
public int totalTranslogOps() { public int totalTranslogOps() {
return totalTranslogOps; return totalTranslogOps;
} }
public long getGlobalCheckpoint() {
return globalCheckpoint;
}
} }

View File

@ -177,7 +177,7 @@ public class RecoverySourceHandler {
startingSeqNo = 0; startingSeqNo = 0;
try { try {
final int estimateNumOps = shard.estimateNumberOfHistoryOperations("peer-recovery", startingSeqNo); final int estimateNumOps = shard.estimateNumberOfHistoryOperations("peer-recovery", startingSeqNo);
sendFileResult = phase1(phase1Snapshot.getIndexCommit(), () -> estimateNumOps); sendFileResult = phase1(phase1Snapshot.getIndexCommit(), shard.getGlobalCheckpoint(), () -> estimateNumOps);
} catch (final Exception e) { } catch (final Exception e) {
throw new RecoveryEngineException(shard.shardId(), 1, "phase1 failed", e); throw new RecoveryEngineException(shard.shardId(), 1, "phase1 failed", e);
} finally { } finally {
@ -332,7 +332,7 @@ public class RecoverySourceHandler {
* segments that are missing. Only segments that have the same size and * segments that are missing. Only segments that have the same size and
* checksum can be reused * checksum can be reused
*/ */
public SendFileResult phase1(final IndexCommit snapshot, final Supplier<Integer> translogOps) { public SendFileResult phase1(final IndexCommit snapshot, final long globalCheckpoint, final Supplier<Integer> translogOps) {
cancellableThreads.checkForCancel(); cancellableThreads.checkForCancel();
// Total size of segment files that are recovered // Total size of segment files that are recovered
long totalSize = 0; long totalSize = 0;
@ -422,7 +422,7 @@ public class RecoverySourceHandler {
// are deleted // are deleted
try { try {
cancellableThreads.executeIO(() -> cancellableThreads.executeIO(() ->
recoveryTarget.cleanFiles(translogOps.get(), recoverySourceMetadata)); recoveryTarget.cleanFiles(translogOps.get(), globalCheckpoint, recoverySourceMetadata));
} catch (RemoteTransportException | IOException targetException) { } catch (RemoteTransportException | IOException targetException) {
final IOException corruptIndexException; final IOException corruptIndexException;
// we realized that after the index was copied and we wanted to finalize the recovery // we realized that after the index was copied and we wanted to finalize the recovery

View File

@ -288,6 +288,9 @@ public class RecoveryTarget extends AbstractRefCounted implements RecoveryTarget
ActionListener.completeWith(listener, () -> { ActionListener.completeWith(listener, () -> {
state().getTranslog().totalOperations(totalTranslogOps); state().getTranslog().totalOperations(totalTranslogOps);
indexShard().openEngineAndSkipTranslogRecovery(); indexShard().openEngineAndSkipTranslogRecovery();
assert indexShard.getGlobalCheckpoint() >= indexShard.seqNoStats().getMaxSeqNo() ||
indexShard.indexSettings().getIndexVersionCreated().before(Version.V_7_1_0)
: "global checkpoint is not initialized [" + indexShard.seqNoStats() + "]";
return null; return null;
}); });
} }
@ -382,7 +385,7 @@ public class RecoveryTarget extends AbstractRefCounted implements RecoveryTarget
} }
@Override @Override
public void cleanFiles(int totalTranslogOps, Store.MetadataSnapshot sourceMetaData) throws IOException { public void cleanFiles(int totalTranslogOps, long globalCheckpoint, Store.MetadataSnapshot sourceMetaData) throws IOException {
state().getTranslog().totalOperations(totalTranslogOps); state().getTranslog().totalOperations(totalTranslogOps);
// first, we go and move files that were created with the recovery id suffix to // first, we go and move files that were created with the recovery id suffix to
// the actual names, its ok if we have a corrupted index here, since we have replicas // the actual names, its ok if we have a corrupted index here, since we have replicas
@ -395,10 +398,11 @@ public class RecoveryTarget extends AbstractRefCounted implements RecoveryTarget
if (indexShard.indexSettings().getIndexVersionCreated().before(Version.V_6_0_0_rc1)) { if (indexShard.indexSettings().getIndexVersionCreated().before(Version.V_6_0_0_rc1)) {
store.ensureIndexHasHistoryUUID(); store.ensureIndexHasHistoryUUID();
} }
// TODO: Assign the global checkpoint to the max_seqno of the safe commit if the index version >= 6.2 assert globalCheckpoint >= Long.parseLong(sourceMetaData.getCommitUserData().get(SequenceNumbers.MAX_SEQ_NO))
|| indexShard.indexSettings().getIndexVersionCreated().before(Version.V_7_1_0) :
"invalid global checkpoint[" + globalCheckpoint + "] source_meta_data [" + sourceMetaData.getCommitUserData() + "]";
final String translogUUID = Translog.createEmptyTranslog( final String translogUUID = Translog.createEmptyTranslog(
indexShard.shardPath().resolveTranslog(), SequenceNumbers.UNASSIGNED_SEQ_NO, shardId, indexShard.shardPath().resolveTranslog(), globalCheckpoint, shardId, indexShard.getPendingPrimaryTerm());
indexShard.getPendingPrimaryTerm());
store.associateIndexWithNewTranslog(translogUUID); store.associateIndexWithNewTranslog(translogUUID);
if (indexShard.getRetentionLeases().leases().isEmpty()) { if (indexShard.getRetentionLeases().leases().isEmpty()) {

View File

@ -88,10 +88,12 @@ public interface RecoveryTargetHandler {
/** /**
* After all source files has been sent over, this command is sent to the target so it can clean any local * After all source files has been sent over, this command is sent to the target so it can clean any local
* files that are not part of the source store * files that are not part of the source store
*
* @param totalTranslogOps an update number of translog operations that will be replayed later on * @param totalTranslogOps an update number of translog operations that will be replayed later on
* @param globalCheckpoint the global checkpoint on the primary
* @param sourceMetaData meta data of the source store * @param sourceMetaData meta data of the source store
*/ */
void cleanFiles(int totalTranslogOps, Store.MetadataSnapshot sourceMetaData) throws IOException; void cleanFiles(int totalTranslogOps, long globalCheckpoint, Store.MetadataSnapshot sourceMetaData) throws IOException;
/** writes a partial file chunk to the target store */ /** writes a partial file chunk to the target store */
void writeFileChunk(StoreFileMetaData fileMetaData, long position, BytesReference content, void writeFileChunk(StoreFileMetaData fileMetaData, long position, BytesReference content,

View File

@ -139,9 +139,9 @@ public class RemoteRecoveryTargetHandler implements RecoveryTargetHandler {
} }
@Override @Override
public void cleanFiles(int totalTranslogOps, Store.MetadataSnapshot sourceMetaData) throws IOException { public void cleanFiles(int totalTranslogOps, long globalCheckpoint, Store.MetadataSnapshot sourceMetaData) throws IOException {
transportService.submitRequest(targetNode, PeerRecoveryTargetService.Actions.CLEAN_FILES, transportService.submitRequest(targetNode, PeerRecoveryTargetService.Actions.CLEAN_FILES,
new RecoveryCleanFilesRequest(recoveryId, shardId, sourceMetaData, totalTranslogOps), new RecoveryCleanFilesRequest(recoveryId, shardId, sourceMetaData, totalTranslogOps, globalCheckpoint),
TransportRequestOptions.builder().withTimeout(recoverySettings.internalActionTimeout()).build(), TransportRequestOptions.builder().withTimeout(recoverySettings.internalActionTimeout()).build(),
EmptyTransportResponseHandler.INSTANCE_SAME).txGet(); EmptyTransportResponseHandler.INSTANCE_SAME).txGet();
} }

View File

@ -121,8 +121,9 @@ public class IndexLevelReplicationTests extends ESIndexLevelReplicationTestCase
Future<Void> future = shards.asyncRecoverReplica(replica, Future<Void> future = shards.asyncRecoverReplica(replica,
(indexShard, node) -> new RecoveryTarget(indexShard, node, recoveryListener) { (indexShard, node) -> new RecoveryTarget(indexShard, node, recoveryListener) {
@Override @Override
public void cleanFiles(int totalTranslogOps, Store.MetadataSnapshot sourceMetaData) throws IOException { public void cleanFiles(int totalTranslogOps, long globalCheckpoint,
super.cleanFiles(totalTranslogOps, sourceMetaData); Store.MetadataSnapshot sourceMetaData) throws IOException {
super.cleanFiles(totalTranslogOps, globalCheckpoint, sourceMetaData);
latch.countDown(); latch.countDown();
try { try {
latch.await(); latch.await();

View File

@ -853,9 +853,9 @@ public class RecoveryDuringReplicationTests extends ESIndexLevelReplicationTestC
} }
@Override @Override
public void cleanFiles(int totalTranslogOps, Store.MetadataSnapshot sourceMetaData) throws IOException { public void cleanFiles(int totalTranslogOps, long globalCheckpoint, Store.MetadataSnapshot sourceMetaData) throws IOException {
blockIfNeeded(RecoveryState.Stage.INDEX); blockIfNeeded(RecoveryState.Stage.INDEX);
super.cleanFiles(totalTranslogOps, sourceMetaData); super.cleanFiles(totalTranslogOps, globalCheckpoint, sourceMetaData);
} }
@Override @Override

View File

@ -189,7 +189,7 @@ public class PeerRecoveryTargetServiceTests extends IndexShardTestCase {
for (Thread sender : senders) { for (Thread sender : senders) {
sender.join(); sender.join();
} }
recoveryTarget.cleanFiles(0, sourceSnapshot); recoveryTarget.cleanFiles(0, Long.parseLong(sourceSnapshot.getCommitUserData().get(SequenceNumbers.MAX_SEQ_NO)), sourceSnapshot);
recoveryTarget.decRef(); recoveryTarget.decRef();
Store.MetadataSnapshot targetSnapshot = targetShard.snapshotStoreMetadata(); Store.MetadataSnapshot targetSnapshot = targetShard.snapshotStoreMetadata();
Store.RecoveryDiff diff = sourceSnapshot.recoveryDiff(targetSnapshot); Store.RecoveryDiff diff = sourceSnapshot.recoveryDiff(targetSnapshot);

View File

@ -473,9 +473,9 @@ public class RecoverySourceHandlerTests extends ESTestCase {
between(1, 8)) { between(1, 8)) {
@Override @Override
public SendFileResult phase1(final IndexCommit snapshot, final Supplier<Integer> translogOps) { public SendFileResult phase1(final IndexCommit snapshot, final long globalCheckpoint, final Supplier<Integer> translogOps) {
phase1Called.set(true); phase1Called.set(true);
return super.phase1(snapshot, translogOps); return super.phase1(snapshot, globalCheckpoint, translogOps);
} }
@Override @Override
@ -715,7 +715,7 @@ public class RecoverySourceHandlerTests extends ESTestCase {
} }
@Override @Override
public void cleanFiles(int totalTranslogOps, Store.MetadataSnapshot sourceMetaData) { public void cleanFiles(int totalTranslogOps, long globalCheckpoint, Store.MetadataSnapshot sourceMetaData) {
} }
@Override @Override

View File

@ -25,6 +25,7 @@ import org.apache.lucene.index.IndexCommit;
import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.NoMergePolicy;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.admin.indices.flush.FlushRequest; import org.elasticsearch.action.admin.indices.flush.FlushRequest;
import org.elasticsearch.action.index.IndexRequest; import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.cluster.metadata.IndexMetaData;
@ -42,9 +43,11 @@ import org.elasticsearch.index.replication.ESIndexLevelReplicationTestCase;
import org.elasticsearch.index.replication.RecoveryDuringReplicationTests; import org.elasticsearch.index.replication.RecoveryDuringReplicationTests;
import org.elasticsearch.index.seqno.SequenceNumbers; import org.elasticsearch.index.seqno.SequenceNumbers;
import org.elasticsearch.index.shard.IndexShard; import org.elasticsearch.index.shard.IndexShard;
import org.elasticsearch.index.store.Store;
import org.elasticsearch.index.translog.SnapshotMatchers; import org.elasticsearch.index.translog.SnapshotMatchers;
import org.elasticsearch.index.translog.Translog; import org.elasticsearch.index.translog.Translog;
import java.io.IOException;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -323,7 +326,18 @@ public class RecoveryTests extends ESIndexLevelReplicationTestCase {
} }
IndexShard replicaShard = newShard(primaryShard.shardId(), false); IndexShard replicaShard = newShard(primaryShard.shardId(), false);
updateMappings(replicaShard, primaryShard.indexSettings().getIndexMetaData()); updateMappings(replicaShard, primaryShard.indexSettings().getIndexMetaData());
recoverReplica(replicaShard, primaryShard, true); recoverReplica(replicaShard, primaryShard, (r, sourceNode) -> new RecoveryTarget(r, sourceNode, recoveryListener) {
@Override
public void prepareForTranslogOperations(boolean fileBasedRecovery, int totalTranslogOps, ActionListener<Void> listener) {
super.prepareForTranslogOperations(fileBasedRecovery, totalTranslogOps, listener);
assertThat(replicaShard.getGlobalCheckpoint(), equalTo(primaryShard.getGlobalCheckpoint()));
}
@Override
public void cleanFiles(int totalTranslogOps, long globalCheckpoint, Store.MetadataSnapshot sourceMetaData) throws IOException {
assertThat(globalCheckpoint, equalTo(primaryShard.getGlobalCheckpoint()));
super.cleanFiles(totalTranslogOps, globalCheckpoint, sourceMetaData);
}
}, true, true);
List<IndexCommit> commits = DirectoryReader.listCommits(replicaShard.store().directory()); List<IndexCommit> commits = DirectoryReader.listCommits(replicaShard.store().directory());
long maxSeqNo = Long.parseLong(commits.get(0).getUserData().get(SequenceNumbers.MAX_SEQ_NO)); long maxSeqNo = Long.parseLong(commits.get(0).getUserData().get(SequenceNumbers.MAX_SEQ_NO));
assertThat(maxSeqNo, lessThanOrEqualTo(globalCheckpoint)); assertThat(maxSeqNo, lessThanOrEqualTo(globalCheckpoint));

View File

@ -75,8 +75,8 @@ public class AsyncRecoveryTarget implements RecoveryTargetHandler {
} }
@Override @Override
public void cleanFiles(int totalTranslogOps, Store.MetadataSnapshot sourceMetaData) throws IOException { public void cleanFiles(int totalTranslogOps, long globalCheckpoint, Store.MetadataSnapshot sourceMetaData) throws IOException {
target.cleanFiles(totalTranslogOps, sourceMetaData); target.cleanFiles(totalTranslogOps, globalCheckpoint, sourceMetaData);
} }
@Override @Override