LUCENE-8290: Keep soft deletes in sync with on-disk DocValues

Today we pass the doc values update on to the PendingDeletes as soon
as it is applied. This can cause issues with a retention merge policy
that will see a deleted document but not its value on disk.
This change moves the PendingDeletes callback back to flush time
in order to be consistent with what is actually updated on disk.

This change also makes sure that doc values updates held in the
reader pool are written to disk on flush, and adds extra best-effort
checks to drop fully deleted segments on flush, commit and getReader.
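
For context, here is a minimal, self-contained sketch (not part of this
commit) of the kind of setup the message describes, using the soft-deletes
APIs on this branch (IndexWriterConfig#setSoftDeletesField,
SoftDeletesRetentionMergePolicy, IndexWriter#softUpdateDocument):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class SoftDeleteRetentionSketch {
  public static void main(String[] args) throws Exception {
    try (Directory dir = new RAMDirectory()) {
      IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer())
          .setSoftDeletesField("soft_delete")
          // the retention policy reads the soft_delete doc values of the
          // segments it merges to decide which soft-deleted docs to keep
          .setMergePolicy(new SoftDeletesRetentionMergePolicy("soft_delete",
              MatchAllDocsQuery::new, new TieredMergePolicy()));
      try (IndexWriter writer = new IndexWriter(dir, config)) {
        Document doc = new Document();
        doc.add(new StringField("id", "1", Field.Store.YES));
        writer.addDocument(doc);
        // a soft delete is a doc values update, not a hard delete; before
        // this change the PendingDeletes callback fired when the update was
        // applied in memory, so a merge could observe the delete while the
        // corresponding value was not yet on disk
        writer.softUpdateDocument(new Term("id", "1"), doc,
            new NumericDocValuesField("soft_delete", 1));
        writer.commit(); // flush happens here; the callback now fires at this point
      }
    }
  }
}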
Simon Willnauer 2018-05-02 16:17:15 +02:00
parent 8b9c2a3185
commit 591fc6627a
6 changed files with 51 additions and 43 deletions

lucene/core/src/java/org/apache/lucene/index/IndexWriter.java

@@ -510,16 +510,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
// TODO: we could instead just clone SIS and pull/incref readers in sync'd block, and then do this w/o IW's lock?
// Must do this sync'd on IW to prevent a merge from completing at the last second and failing to write its DV updates:
if (readerPool.writeAllDocValuesUpdates()) {
checkpoint();
}
if (writeAllDeletes) {
// Must move the deletes to disk:
if (readerPool.commit(segmentInfos)) {
checkpointNoSIS();
}
}
writeReaderPool(writeAllDeletes);
// Prevent segmentInfos from changing while opening the
// reader; in theory we could instead do similar retry logic,
@@ -3132,11 +3123,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
applyAllDeletesAndUpdates();
synchronized(this) {
if (readerPool.commit(segmentInfos)) {
checkpointNoSIS();
}
writeReaderPool(true);
if (changeCount.get() != lastCommitChangeCount) {
// There are changes to commit, so we will write a new segments_N in startCommit.
// The act of committing is itself an NRT-visible change (an NRT reader that was
@@ -3217,6 +3204,39 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
}
}
}
/**
* Ensures that all changes in the reader-pool are written to disk.
* @param writeDeletes if <code>true</code>, deletes should be written to disk too.
*/
private final void writeReaderPool(boolean writeDeletes) throws IOException {
assert Thread.holdsLock(this);
if (writeDeletes) {
if (readerPool.commit(segmentInfos)) {
checkpointNoSIS();
}
} else { // only write the docValues
if (readerPool.writeAllDocValuesUpdates()) {
checkpoint();
}
}
// now do some best effort to check if a segment is fully deleted
List<SegmentCommitInfo> toDrop = new ArrayList<>(); // don't modify segmentInfos in-place
for (SegmentCommitInfo info : segmentInfos) {
ReadersAndUpdates readersAndUpdates = readerPool.get(info, false);
if (readersAndUpdates != null) {
if (isFullyDeleted(readersAndUpdates)) {
toDrop.add(info);
}
}
}
for (SegmentCommitInfo info : toDrop) {
dropDeletedSegment(info);
}
if (toDrop.isEmpty() == false) {
checkpoint();
}
}
/**
* Sets the iterator to provide the commit user data map at commit time. Calling this method
@@ -3503,6 +3523,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
anyChanges |= maybeMerge.getAndSet(false);
synchronized(this) {
writeReaderPool(applyAllDeletes);
doAfterFlush();
success = true;
return anyChanges;
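
As a rough illustration of the best-effort drop of fully deleted segments
(again a sketch, not code from this commit; same imports as the sketch
above): once every document of a segment is soft-deleted and no retention
policy keeps it, a reader obtained from the writer should no longer see
that segment.

IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer())
    .setSoftDeletesField("soft_delete"); // no retention merge policy
try (Directory dir = new RAMDirectory();
     IndexWriter writer = new IndexWriter(dir, config)) {
  Document doc = new Document();
  doc.add(new StringField("id", "1", Field.Store.YES));
  writer.addDocument(doc);
  writer.commit(); // the first segment is now on disk
  // soft-delete the only document of that segment; the replacement
  // document goes into a new segment
  writer.softUpdateDocument(new Term("id", "1"), doc,
      new NumericDocValuesField("soft_delete", 1));
  try (DirectoryReader reader = DirectoryReader.open(writer)) {
    // getReader wrote the reader pool via writeReaderPool; the fully
    // deleted first segment should have been dropped, leaving one leaf
    System.out.println("leaves: " + reader.leaves().size());
  }
}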

lucene/core/src/java/org/apache/lucene/index/PendingDeletes.java

@@ -235,18 +235,11 @@ class PendingDeletes {
}
/**
* Called before the given DocValuesFieldUpdates are written to disk
* @param info the field to apply
*/
void onDocValuesUpdate(FieldInfo info) {
}
/**
* Called for every field update for the given field
* @param field the field that's updated
* Called for every field update for the given field at flush time
* @param info the field info of the field that's updated
* @param iterator the values to apply
*/
void onDocValuesUpdate(String field, DocValuesFieldUpdates.Iterator iterator) throws IOException {
void onDocValuesUpdate(FieldInfo info, DocValuesFieldUpdates.Iterator iterator) throws IOException {
}
int numDeletesToMerge(MergePolicy policy, IOSupplier<CodecReader> readerIOSupplier) throws IOException {

lucene/core/src/java/org/apache/lucene/index/PendingSoftDeletes.java

@@ -120,14 +120,9 @@ final class PendingSoftDeletes extends PendingDeletes {
}
@Override
void onDocValuesUpdate(String field, DocValuesFieldUpdates.Iterator iterator) throws IOException {
if (this.field.equals(field)) {
void onDocValuesUpdate(FieldInfo info, DocValuesFieldUpdates.Iterator iterator) throws IOException {
if (this.field.equals(info.name)) {
pendingDeleteCount += applySoftDeletes(iterator, getMutableBits());
}
}
@Override
void onDocValuesUpdate(FieldInfo info) {
if (field.equals(info.name)) {
assert dvGeneration < info.getDocValuesGen() : "we have seen this generation update already: " + dvGeneration + " vs. " + info.getDocValuesGen();
assert dvGeneration != -2 : "docValues generation is still uninitialized";
dvGeneration = info.getDocValuesGen();
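
applySoftDeletes, called above, is not shown in this hunk; conceptually it
walks the update iterator and flips live-docs bits. A simplified sketch of
the idea (not the actual implementation):

import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.FixedBitSet;

// every doc touched by the soft-deletes update that is still live gets
// cleared from the live-docs bits; returns the number of new deletes
static int applySoftDeletesSketch(DocIdSetIterator iterator, FixedBitSet liveDocs)
    throws IOException {
  int newDeletes = 0;
  int docID;
  while ((docID = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
    if (liveDocs.get(docID)) { // not deleted yet -> soft-delete it now
      liveDocs.clear(docID);
      newDeletes++;
    }
  }
  return newDeletes;
}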

lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java

@@ -161,7 +161,6 @@ final class ReadersAndUpdates {
}
fieldUpdates.add(update);
}
pendingDeletes.onDocValuesUpdate(update.field, update.iterator());
}
public synchronized long getNumDVUpdates() {
@@ -334,7 +333,6 @@ final class ReadersAndUpdates {
final TrackingDirectoryWrapper trackingDir = new TrackingDirectoryWrapper(dir);
final SegmentWriteState state = new SegmentWriteState(null, trackingDir, info.info, fieldInfos, null, updatesContext, segmentSuffix);
try (final DocValuesConsumer fieldsConsumer = dvFormat.fieldsConsumer(state)) {
pendingDeletes.onDocValuesUpdate(fieldInfo);
Function<FieldInfo, DocValuesFieldUpdates.Iterator> updateSupplier = (info) -> {
if (info != fieldInfo) {
throw new IllegalArgumentException("expected field info for field: " + fieldInfo.name + " but got: " + info.name);
@@ -345,6 +343,7 @@
}
return DocValuesFieldUpdates.mergedIterator(subs);
};
pendingDeletes.onDocValuesUpdate(fieldInfo, updateSupplier.apply(fieldInfo));
if (type == DocValuesType.BINARY) {
fieldsConsumer.addBinaryField(fieldInfo, new EmptyDocValuesProducer() {
@Override
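
One detail worth noting: a DocValuesFieldUpdates.Iterator is forward-only
and can be consumed just once, which is why the code above goes through
updateSupplier for the pendingDeletes callback and again inside the
producer handed to the DocValuesConsumer. A schematic sketch of that
pattern with plain Java types (hypothetical names, not Lucene API):

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.function.Function;

public class SingleUseIteratorSketch {
  public static void main(String[] args) {
    // like DocValuesFieldUpdates.Iterator, a plain Iterator is forward-only;
    // every consumer must ask the supplier for its own fresh instance
    List<Integer> docsWithUpdates = Arrays.asList(1, 3, 7);
    Function<List<Integer>, Iterator<Integer>> updateSupplier = List::iterator;

    Iterator<Integer> forPendingDeletes = updateSupplier.apply(docsWithUpdates);
    Iterator<Integer> forDocValuesWriter = updateSupplier.apply(docsWithUpdates);

    forPendingDeletes.forEachRemaining(doc -> System.out.println("mark doc " + doc));
    forDocValuesWriter.forEachRemaining(doc -> System.out.println("write doc " + doc));
  }
}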

lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java

@@ -116,9 +116,8 @@ public class TestPendingSoftDeletes extends TestPendingDeletes {
List<Integer> docsDeleted = Arrays.asList(1, 3, 7, 8, DocIdSetIterator.NO_MORE_DOCS);
List<DocValuesFieldUpdates> updates = Arrays.asList(singleUpdate(docsDeleted, 10));
for (DocValuesFieldUpdates update : updates) {
deletes.onDocValuesUpdate(update.field, update.iterator());
deletes.onDocValuesUpdate(fieldInfo, update.iterator());
}
deletes.onDocValuesUpdate(fieldInfo);
assertEquals(4, deletes.numPendingDeletes());
assertTrue(deletes.getLiveDocs().get(0));
assertFalse(deletes.getLiveDocs().get(1));
@@ -135,9 +134,8 @@ public class TestPendingSoftDeletes extends TestPendingDeletes {
updates = Arrays.asList(singleUpdate(docsDeleted, 10));
fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 1, Collections.emptyMap(), 0, 0);
for (DocValuesFieldUpdates update : updates) {
deletes.onDocValuesUpdate(update.field, update.iterator());
deletes.onDocValuesUpdate(fieldInfo, update.iterator());
}
deletes.onDocValuesUpdate(fieldInfo);
assertEquals(5, deletes.numPendingDeletes());
assertTrue(deletes.getLiveDocs().get(0));
assertFalse(deletes.getLiveDocs().get(1));
@@ -180,9 +178,8 @@ public class TestPendingSoftDeletes extends TestPendingDeletes {
List<Integer> docsDeleted = Arrays.asList(1, DocIdSetIterator.NO_MORE_DOCS);
List<DocValuesFieldUpdates> updates = Arrays.asList(singleUpdate(docsDeleted, 3));
for (DocValuesFieldUpdates update : updates) {
deletes.onDocValuesUpdate(update.field, update.iterator());
deletes.onDocValuesUpdate(fieldInfo, update.iterator());
}
deletes.onDocValuesUpdate(fieldInfo);
assertEquals(1, deletes.numPendingDeletes());
assertTrue(deletes.getLiveDocs().get(0));
assertFalse(deletes.getLiveDocs().get(1));

lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesRetentionMergePolicy.java

@@ -289,6 +289,9 @@ public class TestSoftDeletesRetentionMergePolicy extends LuceneTestCase {
writer.softUpdateDocument(new Term("id", id), doc,
new NumericDocValuesField("soft_delete", 1));
}
if (rarely()) {
writer.flush();
}
ids.add(id);
}
} catch (IOException | InterruptedException e) {
@@ -382,13 +385,13 @@ public class TestSoftDeletesRetentionMergePolicy extends LuceneTestCase {
Document tombstone = new Document();
tombstone.add(new NumericDocValuesField("soft_delete", 1));
writer.softUpdateDocument(new Term("id", "1"), tombstone, new NumericDocValuesField("soft_delete", 1));
// Internally, forceMergeDeletes will call flush to flush pending updates
writer.forceMergeDeletes(true); // Internally, forceMergeDeletes will call flush to flush pending updates
// Thus, we will have two segments - both having soft-deleted documents.
// We expect any MP to merge these segments into one segment
// when calling forceMergeDeletes.
writer.forceMergeDeletes(true);
assertEquals(1, writer.maxDoc());
assertEquals(1, writer.segmentInfos.asList().size());
assertEquals(1, writer.numDocs());
assertEquals(1, writer.maxDoc());
writer.close();
dir.close();
}