Fsync directory after cleanup (#28604)

After copying over the Lucene segments during peer recovery, we call cleanupAndVerify which removes all other files in the directory and which then calls getMetadata to check if the resulting files are a proper index. There are two issues with this:

- the directory is not fsynced after the deletions, so that the call to getMetadata, which lists files in the directory, can get a stale view, possibly seeing a deleted corruption marker (which leads to the exception seen in #28435)
- failing to delete a corruption marker should result in a hard failure, as the shard is otherwise unusable.
This commit is contained in:
Yannick Welsch 2018-02-09 17:06:36 +01:00 committed by GitHub
parent 231fd3c9be
commit 5735e088f9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 5 additions and 4 deletions

View File

@ -661,17 +661,17 @@ public class Store extends AbstractIndexShardComponent implements Closeable, Ref
public void cleanupAndVerify(String reason, MetadataSnapshot sourceMetaData) throws IOException { public void cleanupAndVerify(String reason, MetadataSnapshot sourceMetaData) throws IOException {
metadataLock.writeLock().lock(); metadataLock.writeLock().lock();
try (Lock writeLock = directory.obtainLock(IndexWriter.WRITE_LOCK_NAME)) { try (Lock writeLock = directory.obtainLock(IndexWriter.WRITE_LOCK_NAME)) {
final StoreDirectory dir = directory; for (String existingFile : directory.listAll()) {
for (String existingFile : dir.listAll()) {
if (Store.isAutogenerated(existingFile) || sourceMetaData.contains(existingFile)) { if (Store.isAutogenerated(existingFile) || sourceMetaData.contains(existingFile)) {
continue; // don't delete snapshot file, or the checksums file (note, this is extra protection since the Store won't delete checksum) continue; // don't delete snapshot file, or the checksums file (note, this is extra protection since the Store won't delete checksum)
} }
try { try {
dir.deleteFile(reason, existingFile); directory.deleteFile(reason, existingFile);
// FNF should not happen since we hold a write lock? // FNF should not happen since we hold a write lock?
} catch (IOException ex) { } catch (IOException ex) {
if (existingFile.startsWith(IndexFileNames.SEGMENTS) if (existingFile.startsWith(IndexFileNames.SEGMENTS)
|| existingFile.equals(IndexFileNames.OLD_SEGMENTS_GEN)) { || existingFile.equals(IndexFileNames.OLD_SEGMENTS_GEN)
|| existingFile.startsWith(CORRUPTED)) {
// TODO do we need to also fail this if we can't delete the pending commit file? // TODO do we need to also fail this if we can't delete the pending commit file?
// if one of those files can't be deleted we better fail the cleanup otherwise we might leave an old commit point around? // if one of those files can't be deleted we better fail the cleanup otherwise we might leave an old commit point around?
throw new IllegalStateException("Can't delete " + existingFile + " - cleanup failed", ex); throw new IllegalStateException("Can't delete " + existingFile + " - cleanup failed", ex);
@ -680,6 +680,7 @@ public class Store extends AbstractIndexShardComponent implements Closeable, Ref
// ignore, we don't really care, will get deleted later on // ignore, we don't really care, will get deleted later on
} }
} }
directory.syncMetaData();
final Store.MetadataSnapshot metadataOrEmpty = getMetadata(null); final Store.MetadataSnapshot metadataOrEmpty = getMetadata(null);
verifyAfterCleanup(sourceMetaData, metadataOrEmpty); verifyAfterCleanup(sourceMetaData, metadataOrEmpty);
} finally { } finally {