Do not Block Snapshot Thread Pool Fully During Restore or Snapshot (#57360) (#57511)

Allow for a fairer distribution of snapshot and restore operations
to enable parallel snapshots and improve behaviour for parallel snapshot + restore.

Closes #55803
This commit is contained in:
Armin Braun 2020-06-02 11:45:55 +02:00 committed by GitHub
parent b4a2cd810a
commit 9bc9d01b84
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 41 additions and 22 deletions

View File

@ -1835,23 +1835,30 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
final int workers = Math.min(threadPool.info(ThreadPool.Names.SNAPSHOT).getMax(), indexIncrementalFileCount);
final ActionListener<Void> filesListener = fileQueueListener(filesToSnapshot, workers, allFilesUploadedListener);
for (int i = 0; i < workers; ++i) {
executor.execute(ActionRunnable.run(filesListener, () -> {
BlobStoreIndexShardSnapshot.FileInfo snapshotFileInfo = filesToSnapshot.poll(0L, TimeUnit.MILLISECONDS);
if (snapshotFileInfo != null) {
try (Releasable ignored = incrementStoreRef(store, snapshotStatus, shardId)) {
do {
snapshotFile(snapshotFileInfo, indexId, shardId, snapshotId, snapshotStatus, store);
snapshotFileInfo = filesToSnapshot.poll(0L, TimeUnit.MILLISECONDS);
} while (snapshotFileInfo != null);
}
}
}));
executeOneFileSnapshot(store, snapshotId, indexId, snapshotStatus, filesToSnapshot, executor, filesListener);
}
} catch (Exception e) {
listener.onFailure(e);
}
}
private void executeOneFileSnapshot(Store store, SnapshotId snapshotId, IndexId indexId, IndexShardSnapshotStatus snapshotStatus,
BlockingQueue<BlobStoreIndexShardSnapshot.FileInfo> filesToSnapshot, Executor executor,
ActionListener<Void> listener) throws InterruptedException {
final ShardId shardId = store.shardId();
final BlobStoreIndexShardSnapshot.FileInfo snapshotFileInfo = filesToSnapshot.poll(0L, TimeUnit.MILLISECONDS);
if (snapshotFileInfo == null) {
listener.onResponse(null);
} else {
executor.execute(ActionRunnable.wrap(listener, l -> {
try (Releasable ignored = incrementStoreRef(store, snapshotStatus, shardId)) {
snapshotFile(snapshotFileInfo, indexId, shardId, snapshotId, snapshotStatus, store);
executeOneFileSnapshot(store, snapshotId, indexId, snapshotStatus, filesToSnapshot, executor, l);
}
}));
}
}
private static Releasable incrementStoreRef(Store store, IndexShardSnapshotStatus snapshotStatus, ShardId shardId) {
if (store.tryIncRef() == false) {
if (snapshotStatus.isAborted()) {
@ -1901,21 +1908,33 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
fileQueueListener(files, workers, ActionListener.map(listener, v -> null));
// restore the files from the snapshot to the Lucene store
for (int i = 0; i < workers; ++i) {
executor.execute(ActionRunnable.run(allFilesListener, () -> {
store.incRef();
try {
BlobStoreIndexShardSnapshot.FileInfo fileToRecover;
while ((fileToRecover = files.poll(0L, TimeUnit.MILLISECONDS)) != null) {
restoreFile(fileToRecover, store);
}
} finally {
store.decRef();
}
}));
try {
executeOneFileRestore(files, allFilesListener);
} catch (Exception e) {
allFilesListener.onFailure(e);
}
}
}
}
private void executeOneFileRestore(BlockingQueue<BlobStoreIndexShardSnapshot.FileInfo> files,
ActionListener<Void> allFilesListener) throws InterruptedException {
final BlobStoreIndexShardSnapshot.FileInfo fileToRecover = files.poll(0L, TimeUnit.MILLISECONDS);
if (fileToRecover == null) {
allFilesListener.onResponse(null);
} else {
executor.execute(ActionRunnable.wrap(allFilesListener, filesListener -> {
store.incRef();
try {
restoreFile(fileToRecover, store);
} finally {
store.decRef();
}
executeOneFileRestore(files, filesListener);
}));
}
}
private void restoreFile(BlobStoreIndexShardSnapshot.FileInfo fileInfo, Store store) throws IOException {
boolean success = false;
try (IndexOutput indexOutput =