[Store] Only fail recovery if files are inconsistent

the recovery diff can return files in the `different` category
since it's conservative when it can't tell whether the files are the same.
Yet, the cleanup code only needs to ensure both ends of the recovery
are consistent. If we have a very old segments_N file, no checksum is present,
but the delete files might be such that the segments file passes
the consistency check while the .del file doesn't, since it's in fact the same —
but this check was missing in the last commit.
This commit is contained in:
Simon Willnauer 2014-12-04 14:00:21 +01:00
parent 164853fd0b
commit 8b5bc2643e
2 changed files with 77 additions and 21 deletions

View File

@ -560,32 +560,49 @@ public class Store extends AbstractIndexShardComponent implements Closeable, Ref
} }
} }
final Store.MetadataSnapshot metadataOrEmpty = getMetadata(); final Store.MetadataSnapshot metadataOrEmpty = getMetadata();
final Store.RecoveryDiff recoveryDiff = metadataOrEmpty.recoveryDiff(sourceMetaData); verifyAfterCleanup(sourceMetaData, metadataOrEmpty);
if (recoveryDiff.identical.size() != recoveryDiff.size()) {
if (recoveryDiff.missing.isEmpty()) {
for (StoreFileMetaData meta : recoveryDiff.different) {
StoreFileMetaData local = metadataOrEmpty.get(meta.name());
StoreFileMetaData remote = sourceMetaData.get(meta.name());
// if we have different files then they must have no checksums, otherwise something went wrong during recovery.
// we hit that problem when an empty index has only a segments_1 file: then we can't tell if it's a Lucene 4.8 file
// and therefore it has no checksum. That isn't much of a problem since we simply copy it over anyway, but those files come out as
// different in the diff. That's why we have to double-check here again whether the rest of it matches.
boolean consistent = (local.checksum() == null && remote.checksum() == null && local.hash().equals(remote.hash()) && local.length() == remote.length());
if (consistent == false) {
throw new ElasticsearchIllegalStateException("local version: " + local + " is different from remote version after recovery: " + remote, null);
}
}
} else {
logger.debug("Files are missing on the recovery target: {} ", recoveryDiff);
throw new ElasticsearchIllegalStateException("Files are missing on the recovery target: [different="
+ recoveryDiff.different + ", missing=" + recoveryDiff.missing +']', null);
}
}
} finally { } finally {
metadataLock.writeLock().unlock(); metadataLock.writeLock().unlock();
} }
} }
// pkg private for testing
final void verifyAfterCleanup(MetadataSnapshot sourceMetaData, MetadataSnapshot targetMetaData) {
    final RecoveryDiff recoveryDiff = targetMetaData.recoveryDiff(sourceMetaData);
    if (recoveryDiff.identical.size() == recoveryDiff.size()) {
        return; // source and target are fully identical -- nothing left to verify
    }
    if (recoveryDiff.missing.isEmpty() == false) {
        logger.debug("Files are missing on the recovery target: {} ", recoveryDiff);
        throw new ElasticsearchIllegalStateException("Files are missing on the recovery target: [different="
                + recoveryDiff.different + ", missing=" + recoveryDiff.missing +']', null);
    }
    for (StoreFileMetaData meta : recoveryDiff.different) {
        final StoreFileMetaData local = targetMetaData.get(meta.name());
        final StoreFileMetaData remote = sourceMetaData.get(meta.name());
        // Files may land in `different` merely because the diff is conservative: an empty index with only
        // a segments_1 file can't be told apart from a pre-Lucene-4.8 file, so no checksum is available.
        // That's harmless -- we copy such files over anyway -- but we must re-check here that the rest of
        // the metadata agrees on both ends.
        final boolean same = local.isSame(remote);
        // without checksums, consistency means hash and length match between source and target
        final boolean hashAndLengthEqual = local.checksum() == null
                && remote.checksum() == null
                && local.hash().equals(remote.hash())
                && local.length() == remote.length();
        if (hashAndLengthEqual == false && same == false) {
            logger.debug("Files are different on the recovery target: {} ", recoveryDiff);
            throw new ElasticsearchIllegalStateException("local version: " + local + " is different from remote version after recovery: " + remote, null);
        }
    }
}
/** /**
* This exists so {@link org.elasticsearch.index.codec.postingsformat.BloomFilterPostingsFormat} can load its boolean setting; can we find a more straightforward way? * This exists so {@link org.elasticsearch.index.codec.postingsformat.BloomFilterPostingsFormat} can load its boolean setting; can we find a more straightforward way?
*/ */

View File

@ -752,6 +752,22 @@ public class StoreTest extends ElasticsearchLuceneTestCase {
return random.nextBoolean() ? new LeastUsedDistributor(service) : new RandomWeightedDistributor(service); return random.nextBoolean() ? new LeastUsedDistributor(service) : new RandomWeightedDistributor(service);
} }
/**
 * Legacy indices without Lucene CRC32 never wrote or calculated checksums for segments_N files,
 * but did for other files.
 */
@Test
public void testRecoveryDiffWithLegacyCommit() {
    final Map<String, StoreFileMetaData> legacyMetaData = new HashMap<>();
    legacyMetaData.put("segments_1", new StoreFileMetaData("segments_1", 50, null, null, new BytesRef(new byte[] {1})));
    legacyMetaData.put("_0_1.del", new StoreFileMetaData("_0_1.del", 42, "foobarbaz", null, new BytesRef()));
    final Store.MetadataSnapshot sourceSnapshot = new Store.MetadataSnapshot(legacyMetaData);
    final Store.MetadataSnapshot targetSnapshot = new Store.MetadataSnapshot(legacyMetaData);
    // the commit point has no checksum, so the conservative diff puts both files into `different`
    final Store.RecoveryDiff diff = sourceSnapshot.recoveryDiff(targetSnapshot);
    assertEquals(diff.toString(), diff.different.size(), 2);
}
@Test @Test
public void testRecoveryDiff() throws IOException, InterruptedException { public void testRecoveryDiff() throws IOException, InterruptedException {
@ -1005,4 +1021,27 @@ public class StoreTest extends ElasticsearchLuceneTestCase {
store.deleteContent(); store.deleteContent();
IOUtils.close(store); IOUtils.close(store);
} }
@Test
public void testCleanUpWithLegacyChecksums() throws IOException {
    // snapshot mimicking a legacy commit: segments_1 has no checksum, the .del file has a non-CRC one
    final Map<String, StoreFileMetaData> legacyMetaData = new HashMap<>();
    legacyMetaData.put("segments_1", new StoreFileMetaData("segments_1", 50, null, null, new BytesRef(new byte[] {1})));
    legacyMetaData.put("_0_1.del", new StoreFileMetaData("_0_1.del", 42, "foobarbaz", null, new BytesRef()));
    final Store.MetadataSnapshot snapshot = new Store.MetadataSnapshot(legacyMetaData);

    final ShardId shardId = new ShardId(new Index("index"), 1);
    final DirectoryService directoryService = new LuceneManagedDirectoryService(random());
    final Store store = new Store(shardId, ImmutableSettings.EMPTY, directoryService, randomDistributor(directoryService), new DummyShardLock(shardId));
    // materialize each file referenced by the snapshot with arbitrary content
    for (String fileName : legacyMetaData.keySet()) {
        try (IndexOutput output = store.directory().createOutput(fileName, IOContext.DEFAULT)) {
            final BytesRef content = new BytesRef(TestUtil.randomRealisticUnicodeString(random(), 10, 1024));
            output.writeBytes(content.bytes, content.offset, content.length);
            CodecUtil.writeFooter(output);
        }
    }
    // must not throw: both ends are the same snapshot, even though checksums are missing
    store.verifyAfterCleanup(snapshot, snapshot);
    store.deleteContent();
    IOUtils.close(store);
}
} }