From c70cceaee56cecf35875cd2b5c8d5700f2b3cedb Mon Sep 17 00:00:00 2001
From: Simon Willnauer
Date: Mon, 16 Apr 2018 16:16:43 +0200
Subject: [PATCH] LUCENE-8253: Account for soft-deletes before they are flushed to disk

Inside the IndexWriter, buffers are only written to disk when it's needed or
"worth it", which doesn't guarantee that soft deletes are accounted for in
time. This is not necessarily a problem, since soft-deleted documents are
eventually collected and the segments that contain them are eventually merged,
but it makes tests tricky and leaves soft deletes behaving differently from
hard deletes. This change cuts over to accounting in place, just like hard
deletes. As a result, delete numbers for soft deletes are accurate at any
given point in time, once the reader is loaded or a pending soft delete
occurs.

This change also fixes an issue where updates to any DV field were allowed
even if the field was unknown. Now this only works if the field is equal to
the soft-deletes field. This behavior was never released.
---
 .../org/apache/lucene/index/IndexWriter.java  | 28 +++----
 .../apache/lucene/index/PendingDeletes.java   | 21 ++++--
 .../lucene/index/PendingSoftDeletes.java      | 75 +++++++++++++++----
 .../lucene/index/ReadersAndUpdates.java       | 10 +--
 .../SoftDeletesRetentionMergePolicy.java      | 22 +++---
 .../lucene/index/TestPendingSoftDeletes.java  | 19 +++--
 ...TestSoftDeletesDirectoryReaderWrapper.java | 32 ++++++++
 .../lucene/search/TestLRUQueryCache.java      | 46 ++++++++++++
 .../apache/lucene/util/LuceneTestCase.java    |  6 +-
 9 files changed, 206 insertions(+), 53 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
index e973b91f47e..e8f3e13765b 100644
--- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
@@ -1594,7 +1594,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
     if (softDeletes == null || softDeletes.length == 0) {
       throw new IllegalArgumentException("at least one soft delete must be present");
     }
-    return updateDocuments(DocumentsWriterDeleteQueue.newNode(buildDocValuesUpdate(term, softDeletes, false)), docs);
+    return updateDocuments(DocumentsWriterDeleteQueue.newNode(buildDocValuesUpdate(term, softDeletes)), docs);
   }
 
   /** Expert: attempts to delete by document ID, as long as
@@ -1831,7 +1831,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
     if (softDeletes == null || softDeletes.length == 0) {
       throw new IllegalArgumentException("at least one soft delete must be present");
     }
-    return updateDocument(DocumentsWriterDeleteQueue.newNode(buildDocValuesUpdate(term, softDeletes, false)), doc);
+    return updateDocument(DocumentsWriterDeleteQueue.newNode(buildDocValuesUpdate(term, softDeletes)), doc);
   }
 
@@ -1940,7 +1940,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
    */
   public long updateDocValues(Term term, Field... updates) throws IOException {
     ensureOpen();
-    DocValuesUpdate[] dvUpdates = buildDocValuesUpdate(term, updates, true);
+    DocValuesUpdate[] dvUpdates = buildDocValuesUpdate(term, updates);
     try {
       long seqNo = docWriter.updateDocValues(dvUpdates);
       if (seqNo < 0) {
@@ -1954,7 +1954,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
     }
   }
 
-  private DocValuesUpdate[] buildDocValuesUpdate(Term term, Field[] updates, boolean enforceFieldExistence) {
+  private DocValuesUpdate[] buildDocValuesUpdate(Term term, Field[] updates) {
     DocValuesUpdate[] dvUpdates = new DocValuesUpdate[updates.length];
     for (int i = 0; i < updates.length; i++) {
       final Field f = updates[i];
@@ -1965,7 +1965,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
       if (dvType == DocValuesType.NONE) {
         throw new IllegalArgumentException("can only update NUMERIC or BINARY fields! field=" + f.name());
       }
-      if (enforceFieldExistence && !globalFieldNumberMap.contains(f.name(), dvType)) {
+      if (!globalFieldNumberMap.contains(f.name(), dvType) && f.name().equals(config.softDeletesField) == false) {
         throw new IllegalArgumentException("can only update existing docvalues fields! field=" + f.name() + ", type=" + dvType);
       }
       if (config.getIndexSortFields().contains(f.name())) {
@@ -5230,15 +5230,15 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
    */
   public final int numDeletesToMerge(SegmentCommitInfo info) throws IOException {
     MergePolicy mergePolicy = config.getMergePolicy();
-    final ReadersAndUpdates rld = readerPool.get(info, false);
-    int numDeletesToMerge;
-    if (rld != null) {
-      numDeletesToMerge = rld.numDeletesToMerge(mergePolicy);
-    } else {
-      numDeletesToMerge = mergePolicy.numDeletesToMerge(info, 0, null);
+    final ReadersAndUpdates rld = readerPool.get(info, true);
+    try {
+      int numDeletesToMerge = rld.numDeletesToMerge(mergePolicy);
+      assert numDeletesToMerge <= info.info.maxDoc() :
+        "numDeletesToMerge: " + numDeletesToMerge + " > maxDoc: " + info.info.maxDoc();
+      return numDeletesToMerge;
+    } finally {
+      readerPool.release(rld);
     }
-    assert numDeletesToMerge <= info.info.maxDoc() :
-      "numDeletesToMerge: " + numDeletesToMerge + " > maxDoc: " + info.info.maxDoc();
-    return numDeletesToMerge;
+  }
   }
 
diff --git a/lucene/core/src/java/org/apache/lucene/index/PendingDeletes.java b/lucene/core/src/java/org/apache/lucene/index/PendingDeletes.java
index 52d06e8fd29..c0aed38ba73 100644
--- a/lucene/core/src/java/org/apache/lucene/index/PendingDeletes.java
+++ b/lucene/core/src/java/org/apache/lucene/index/PendingDeletes.java
@@ -18,7 +18,6 @@ package org.apache.lucene.index;
 
 import java.io.IOException;
-import java.util.List;
 
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.LiveDocsFormat;
@@ -26,6 +25,7 @@ import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.TrackingDirectoryWrapper;
 import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.IOSupplier;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.MutableBits;
 
@@ -135,7 +135,7 @@ class PendingDeletes {
   /**
    * Called once a new reader is opened for this segment ie. when deletes or updates are applied.
*/ - void onNewReader(SegmentReader reader, SegmentCommitInfo info) throws IOException { + void onNewReader(CodecReader reader, SegmentCommitInfo info) throws IOException { if (liveDocsInitialized == false) { if (reader.hasDeletions()) { // we only initialize this once either in the ctor or here @@ -235,10 +235,21 @@ class PendingDeletes { } /** - * Called before the given DocValuesFieldUpdates are applied + * Called before the given DocValuesFieldUpdates are written to disk * @param info the field to apply - * @param fieldUpdates the field updates */ - void onDocValuesUpdate(FieldInfo info, List fieldUpdates) throws IOException { + void onDocValuesUpdate(FieldInfo info) { + } + + /** + * Called for every field update for the given field + * @param field the field that's updated + * @param iterator the values to apply + */ + void onDocValuesUpdate(String field, DocValuesFieldUpdates.Iterator iterator) throws IOException { + } + + int numDeletesToMerge(MergePolicy policy, IOSupplier readerIOSupplier) throws IOException { + return policy.numDeletesToMerge(info, numPendingDeletes(), readerIOSupplier); } } diff --git a/lucene/core/src/java/org/apache/lucene/index/PendingSoftDeletes.java b/lucene/core/src/java/org/apache/lucene/index/PendingSoftDeletes.java index 4c3db482ab5..3dca782d663 100644 --- a/lucene/core/src/java/org/apache/lucene/index/PendingSoftDeletes.java +++ b/lucene/core/src/java/org/apache/lucene/index/PendingSoftDeletes.java @@ -17,11 +17,16 @@ package org.apache.lucene.index; +import java.io.Closeable; import java.io.IOException; -import java.util.List; + +import org.apache.lucene.codecs.FieldInfosFormat; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.DocValuesFieldExistsQuery; import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.util.IOSupplier; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.MutableBits; final class PendingSoftDeletes extends PendingDeletes { @@ -64,14 +69,12 @@ final class PendingSoftDeletes extends PendingDeletes { } @Override - void onNewReader(SegmentReader reader, SegmentCommitInfo info) throws IOException { + void onNewReader(CodecReader reader, SegmentCommitInfo info) throws IOException { super.onNewReader(reader, info); hardDeletes.onNewReader(reader, info); if (dvGeneration < info.getDocValuesGen()) { // only re-calculate this if we haven't seen this generation final DocIdSetIterator iterator = DocValuesFieldExistsQuery.getDocValuesDocIdSetIterator(field, reader); - if (iterator == null) { // nothing is deleted we don't have a soft deletes field in this segment - this.pendingDeleteCount = 0; - } else { + if (iterator != null) { // nothing is deleted we don't have a soft deletes field in this segment assert info.info.maxDoc() > 0 : "maxDoc is 0"; pendingDeleteCount += applySoftDeletes(iterator, getMutableBits()); } @@ -117,15 +120,8 @@ final class PendingSoftDeletes extends PendingDeletes { } @Override - void onDocValuesUpdate(FieldInfo info, List updatesToApply) throws IOException { - if (field.equals(info.name)) { - assert dvGeneration < info.getDocValuesGen() : "we have seen this generation update already: " + dvGeneration + " vs. 
" + info.getDocValuesGen(); - assert dvGeneration != -2 : "docValues generation is still uninitialized"; - DocValuesFieldUpdates.Iterator[] subs = new DocValuesFieldUpdates.Iterator[updatesToApply.size()]; - for(int i=0; i readerSupplier) throws IOException { - int numDeletesToMerge = super.numDeletesToMerge(info, pendingDeleteCount, readerSupplier); + final int numDeletesToMerge = super.numDeletesToMerge(info, pendingDeleteCount, readerSupplier); if (numDeletesToMerge != 0) { final CodecReader reader = readerSupplier.get(); if (reader.getLiveDocs() != null) { - Scorer scorer = getScorer(field, retentionQuerySupplier.get(), wrapLiveDocs(reader, null, reader.maxDoc())); + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + builder.add(new DocValuesFieldExistsQuery(field), BooleanClause.Occur.FILTER); + builder.add(retentionQuerySupplier.get(), BooleanClause.Occur.FILTER); + Scorer scorer = getScorer(builder.build(), wrapLiveDocs(reader, null, reader.maxDoc())); if (scorer != null) { DocIdSetIterator iterator = scorer.iterator(); Bits liveDocs = reader.getLiveDocs(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java b/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java index 255ff9ec49e..eac438826d1 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java @@ -45,7 +45,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { public void testDeleteSoft() throws IOException { Directory dir = newDirectory(); - IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig()); // no soft delete field here + IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig().setSoftDeletesField("_soft_deletes")); Document doc = new Document(); doc.add(new StringField("id", "1", Field.Store.YES)); writer.softUpdateDocument(new Term("id", "1"), doc, @@ -114,7 +114,10 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 0, Collections.emptyMap(), 0, 0); List docsDeleted = Arrays.asList(1, 3, 7, 8, DocIdSetIterator.NO_MORE_DOCS); List updates = Arrays.asList(singleUpdate(docsDeleted, 10)); - deletes.onDocValuesUpdate(fieldInfo, updates); + for (DocValuesFieldUpdates update : updates) { + deletes.onDocValuesUpdate(update.field, update.iterator()); + } + deletes.onDocValuesUpdate(fieldInfo); assertEquals(4, deletes.numPendingDeletes()); assertTrue(deletes.getLiveDocs().get(0)); assertFalse(deletes.getLiveDocs().get(1)); @@ -130,7 +133,10 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { docsDeleted = Arrays.asList(1, 2, DocIdSetIterator.NO_MORE_DOCS); updates = Arrays.asList(singleUpdate(docsDeleted, 10)); fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 1, Collections.emptyMap(), 0, 0); - deletes.onDocValuesUpdate(fieldInfo, updates); + for (DocValuesFieldUpdates update : updates) { + deletes.onDocValuesUpdate(update.field, update.iterator()); + } + deletes.onDocValuesUpdate(fieldInfo); assertEquals(5, deletes.numPendingDeletes()); assertTrue(deletes.getLiveDocs().get(0)); assertFalse(deletes.getLiveDocs().get(1)); @@ -146,7 +152,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { public void testUpdateAppliedOnlyOnce() throws IOException { Directory dir = newDirectory(); - IndexWriter writer = 
-    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig()); // no soft delete field hier
+    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig().setSoftDeletesField("_soft_deletes"));
     Document doc = new Document();
     doc.add(new StringField("id", "1", Field.Store.YES));
     writer.softUpdateDocument(new Term("id", "1"), doc,
@@ -169,7 +175,10 @@ public class TestPendingSoftDeletes extends TestPendingDeletes {
     FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0);
     List<Integer> docsDeleted = Arrays.asList(1, DocIdSetIterator.NO_MORE_DOCS);
     List<DocValuesFieldUpdates> updates = Arrays.asList(singleUpdate(docsDeleted, 3));
-    deletes.onDocValuesUpdate(fieldInfo, updates);
+    for (DocValuesFieldUpdates update : updates) {
+      deletes.onDocValuesUpdate(update.field, update.iterator());
+    }
+    deletes.onDocValuesUpdate(fieldInfo);
     assertEquals(1, deletes.numPendingDeletes());
     assertTrue(deletes.getLiveDocs().get(0));
     assertFalse(deletes.getLiveDocs().get(1));
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesDirectoryReaderWrapper.java b/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesDirectoryReaderWrapper.java
index dea7bc977be..30a11b69fb7 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesDirectoryReaderWrapper.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesDirectoryReaderWrapper.java
@@ -27,6 +27,7 @@ import org.apache.lucene.document.Field;
 import org.apache.lucene.document.NumericDocValuesField;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MatchNoDocsQuery;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.IOUtils;
@@ -196,4 +197,35 @@ public class TestSoftDeletesDirectoryReaderWrapper extends LuceneTestCase {
     assertEquals(1, leafCalled.get());
     IOUtils.close(reader, writer, dir);
   }
+
+  public void testForceMergeDeletes() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig config = newIndexWriterConfig().setSoftDeletesField("soft_delete");
+    config.setMergePolicy(newMergePolicy(random(), false)); // no mock MP it might not select segments for force merge
+    if (random().nextBoolean()) {
+      config.setMergePolicy(new SoftDeletesRetentionMergePolicy("soft_delete",
+          () -> new MatchNoDocsQuery(), config.getMergePolicy()));
+    }
+    IndexWriter writer = new IndexWriter(dir, config);
+    // The first segment includes d1 and d2
+    for (int i = 0; i < 2; i++) {
+      Document d = new Document();
+      d.add(new StringField("id", Integer.toString(i), Field.Store.YES));
+      writer.addDocument(d);
+    }
+    writer.flush();
+    // The second segment includes only the tombstone
+    Document tombstone = new Document();
+    tombstone.add(new NumericDocValuesField("soft_delete", 1));
+    writer.softUpdateDocument(new Term("id", "1"), tombstone, new NumericDocValuesField("soft_delete", 1));
+    // Internally, forceMergeDeletes will call flush to flush pending updates
+    // Thus, we will have two segments - both having soft-deleted documents.
+    // We expect any MP to merge these segments into one segment
+    // when calling forceMergeDeletes.
+    writer.forceMergeDeletes(true);
+    assertEquals(1, writer.maxDoc());
+    assertEquals(1, writer.segmentInfos.asList().size());
+    writer.close();
+    dir.close();
+  }
 }
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestLRUQueryCache.java b/lucene/core/src/test/org/apache/lucene/search/TestLRUQueryCache.java
index f4240e195bf..d0eaa4363be 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestLRUQueryCache.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestLRUQueryCache.java
@@ -1541,7 +1541,53 @@ public class TestLRUQueryCache extends LuceneTestCase {
     reader.close();
     w.close();
     dir.close();
+  }
+
+  public void testQueryCacheSoftUpdate() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig().setSoftDeletesField("soft_delete");
+    IndexWriter w = new IndexWriter(dir, iwc);
+    LRUQueryCache queryCache = new LRUQueryCache(10, 1000 * 1000, ctx -> true);
+    IndexSearcher.setDefaultQueryCache(queryCache);
+    IndexSearcher.setDefaultQueryCachingPolicy(QueryCachingPolicy.ALWAYS_CACHE);
+
+    SearcherManager sm = new SearcherManager(w, new SearcherFactory());
+
+    Document doc = new Document();
+    doc.add(new StringField("id", "1", org.apache.lucene.document.Field.Store.YES));
+    w.addDocument(doc);
+
+    doc = new Document();
+    doc.add(new StringField("id", "2", org.apache.lucene.document.Field.Store.YES));
+    w.addDocument(doc);
+
+    sm.maybeRefreshBlocking();
+
+    IndexSearcher searcher = sm.acquire();
+    Query query = new BooleanQuery.Builder().add(new TermQuery(new Term("id", "1")), BooleanClause.Occur.FILTER).build();
+    assertEquals(1, searcher.count(query));
+    assertEquals(1, queryCache.getCacheSize());
+    assertEquals(0, queryCache.getEvictionCount());
+
+    boolean softDelete = true;
+    if (softDelete) {
+      Document tombstone = new Document();
+      tombstone.add(new NumericDocValuesField("soft_delete", 1));
+      w.softUpdateDocument(new Term("id", "1"), tombstone, new NumericDocValuesField("soft_delete", 1));
+      w.softUpdateDocument(new Term("id", "2"), tombstone, new NumericDocValuesField("soft_delete", 1));
+    } else {
+      w.deleteDocuments(new Term("id", "1"));
+      w.deleteDocuments(new Term("id", "2"));
+    }
+    sm.maybeRefreshBlocking();
+    // All docs in the first segment are deleted - we should drop it with the default merge policy.
+    sm.release(searcher);
+    assertEquals(0, queryCache.getCacheSize());
+    assertEquals(1, queryCache.getEvictionCount());
+    sm.close();
+    w.close();
+    dir.close();
   }
 
   public void testBulkScorerLocking() throws Exception {
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java
index 1087e209f74..d6c0e7ed6a7 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java
@@ -1049,7 +1049,11 @@ public abstract class LuceneTestCase extends Assert {
   }
 
   public static MergePolicy newMergePolicy(Random r) {
-    if (rarely(r)) {
+    return newMergePolicy(r, true);
+  }
+
+  public static MergePolicy newMergePolicy(Random r, boolean includeMockMP) {
+    if (includeMockMP && rarely(r)) {
       return new MockRandomMergePolicy(r);
     } else if (r.nextBoolean()) {
       return newTieredMergePolicy(r);
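
For readers who want to try the behavior this patch describes, the sketch below is a minimal, self-contained usage example and is not part of the patch. It assumes Lucene 7.4 or later (StandardAnalyzer may additionally require the analyzers-common module on 7.x); the class name, the field name "soft_delete", and the temporary-directory setup are illustrative only. A tombstone document soft-deletes a previous document via softUpdateDocument, and SoftDeletesDirectoryReaderWrapper hides every document that has a value in the configured soft-deletes field; with this change the pending soft delete shows up in the delete accounting right away, like a hard delete, rather than only once the updates happen to be flushed.

import java.nio.file.Files;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.SoftDeletesDirectoryReaderWrapper;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class SoftDeletesSketch {
  public static void main(String[] args) throws Exception {
    try (Directory dir = FSDirectory.open(Files.createTempDirectory("soft-deletes-sketch"))) {
      IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer())
          .setSoftDeletesField("soft_delete"); // field that marks a document as soft-deleted
      try (IndexWriter writer = new IndexWriter(dir, config)) {
        // Index two documents, "0" and "1".
        for (int i = 0; i < 2; i++) {
          Document doc = new Document();
          doc.add(new StringField("id", Integer.toString(i), Field.Store.YES));
          writer.addDocument(doc);
        }

        // Soft-delete document "1": the old copy receives a value in "soft_delete" via the
        // doc-values update, and the tombstone itself carries the field as well.
        Document tombstone = new Document();
        tombstone.add(new NumericDocValuesField("soft_delete", 1));
        writer.softUpdateDocument(new Term("id", "1"), tombstone,
            new NumericDocValuesField("soft_delete", 1));

        // The wrapper treats every document with a value in "soft_delete" as deleted,
        // so only document "0" stays visible to this NRT reader.
        try (DirectoryReader reader = new SoftDeletesDirectoryReaderWrapper(
            DirectoryReader.open(writer), "soft_delete")) {
          System.out.println("maxDoc=" + reader.maxDoc() + " numDocs=" + reader.numDocs());
        }
      }
    }
  }
}

Under those assumptions this should print maxDoc=3 numDocs=1: the old copy of document "1" and the tombstone both carry the soft-deletes field, so only document "0" remains visible through the wrapper.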