From 35fa0b4f55f95ca0c8d8b21c77e78e478fba8e74 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Tue, 24 Jul 2018 13:41:11 +0200 Subject: [PATCH] LUCENE-8425: Expose hard live docs on SegmentReader level Today, if soft deletes are used, we expose a union of hard and soft deletes via LeafReader#getLiveDocs. Yet, if a user wants to take advantage of also searching soft-deleted documents, the only option today is to search all documents, even though some of them are hard deleted. The recommendation is not to mix the two, but in exceptional cases, i.e. when a document hits a non-aborting exception during indexing, the document is marked as hard deleted, which is the correct action. Having access to the hard live docs on the segment reader level allows these documents to be filtered out. --- .../org/apache/lucene/index/ReaderPool.java | 2 +- .../lucene/index/ReadersAndUpdates.java | 5 +- .../apache/lucene/index/SegmentReader.java | 49 ++++++------ .../lucene/index/StandardDirectoryReader.java | 16 ++-- .../apache/lucene/index/TestIndexWriter.java | 75 +++++++++++++++++++ .../nrt/SegmentInfosSearcherManager.java | 2 +- 6 files changed, 118 insertions(+), 31 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java b/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java index 980f4a19924..b792be26873 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java +++ b/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java @@ -89,7 +89,7 @@ final class ReaderPool implements Closeable { LeafReaderContext leaf = leaves.get(i); SegmentReader segReader = (SegmentReader) leaf.reader(); SegmentReader newReader = new SegmentReader(segmentInfos.info(i), segReader, segReader.getLiveDocs(), - segReader.numDocs()); + segReader.getHardLiveDocs(), segReader.numDocs(), true); readerMap.put(newReader.getOriginalSegmentInfo(), new ReadersAndUpdates(segmentInfos.getIndexCreatedVersionMajor(), newReader, 
newPendingDeletes(newReader, newReader.getOriginalSegmentInfo()))); } diff --git a/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java b/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java index 3453447ecce..b09338fc722 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java +++ b/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java @@ -214,7 +214,7 @@ final class ReadersAndUpdates { // force new liveDocs Bits liveDocs = pendingDeletes.getLiveDocs(); if (liveDocs != null) { - return new SegmentReader(info, reader, liveDocs, pendingDeletes.numDocs()); + return new SegmentReader(info, reader, liveDocs, pendingDeletes.getHardLiveDocs(), pendingDeletes.numDocs(), true); } else { // liveDocs == null and reader != null. That can only be if there are no deletes assert reader.getLiveDocs() == null; @@ -645,7 +645,8 @@ final class ReadersAndUpdates { private SegmentReader createNewReaderWithLatestLiveDocs(SegmentReader reader) throws IOException { assert reader != null; assert Thread.holdsLock(this) : Thread.currentThread().getName(); - SegmentReader newReader = new SegmentReader(info, reader, pendingDeletes.getLiveDocs(), pendingDeletes.numDocs()); + SegmentReader newReader = new SegmentReader(info, reader, pendingDeletes.getLiveDocs(), + pendingDeletes.getHardLiveDocs(), pendingDeletes.numDocs(), true); boolean success2 = false; try { pendingDeletes.onNewReader(newReader, info); diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java b/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java index 9373718dcce..b368b964b7a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java @@ -51,6 +51,7 @@ public final class SegmentReader extends CodecReader { private final SegmentCommitInfo originalSi; private final LeafMetaData metaData; private final Bits liveDocs; + private final Bits 
hardLiveDocs; // Normally set to si.maxDoc - si.delDocCount, unless we // were created as an NRT reader from IW, in which case IW @@ -65,7 +66,7 @@ public final class SegmentReader extends CodecReader { final DocValuesProducer docValuesProducer; final FieldInfos fieldInfos; - + /** * Constructs a new SegmentReader with a new core. * @throws CorruptIndexException if the index is corrupt @@ -87,16 +88,16 @@ public final class SegmentReader extends CodecReader { try { if (si.hasDeletions()) { // NOTE: the bitvector is stored using the regular directory, not cfs - liveDocs = codec.liveDocsFormat().readLiveDocs(directory(), si, IOContext.READONCE); + hardLiveDocs = liveDocs = codec.liveDocsFormat().readLiveDocs(directory(), si, IOContext.READONCE); } else { assert si.getDelCount() == 0; - liveDocs = null; + hardLiveDocs = liveDocs = null; } numDocs = si.info.maxDoc() - si.getDelCount(); fieldInfos = initFieldInfos(); docValuesProducer = initDocValuesProducer(); - + assert assertLiveDocs(isNRT, hardLiveDocs, liveDocs); success = true; } finally { // With lock-less commits, it's entirely possible (and @@ -110,27 +111,10 @@ public final class SegmentReader extends CodecReader { } } - /** Create new SegmentReader sharing core from a previous - * SegmentReader and loading new live docs from a new - * deletes file. Used by openIfChanged. */ - SegmentReader(SegmentCommitInfo si, SegmentReader sr) throws IOException { - this(si, sr, - si.hasDeletions() ? si.info.getCodec().liveDocsFormat().readLiveDocs(si.info.dir, si, IOContext.READONCE) : null, - si.info.maxDoc() - si.getDelCount(), false); - } - - /** Create new SegmentReader sharing core from a previous - * SegmentReader and using the provided in-memory - * liveDocs. 
Used by IndexWriter to provide a new NRT - * reader */ - SegmentReader(SegmentCommitInfo si, SegmentReader sr, Bits liveDocs, int numDocs) throws IOException { - this(si, sr, liveDocs, numDocs, true); - } - /** Create new SegmentReader sharing core from a previous * SegmentReader and using the provided liveDocs, and recording * whether those liveDocs were carried in ram (isNRT=true). */ - SegmentReader(SegmentCommitInfo si, SegmentReader sr, Bits liveDocs, int numDocs, boolean isNRT) throws IOException { + SegmentReader(SegmentCommitInfo si, SegmentReader sr, Bits liveDocs, Bits hardLiveDocs, int numDocs, boolean isNRT) throws IOException { if (numDocs > si.info.maxDoc()) { throw new IllegalArgumentException("numDocs=" + numDocs + " but maxDoc=" + si.info.maxDoc()); } @@ -141,6 +125,8 @@ public final class SegmentReader extends CodecReader { this.originalSi = si; this.metaData = sr.getMetaData(); this.liveDocs = liveDocs; + this.hardLiveDocs = hardLiveDocs; + assert assertLiveDocs(isNRT, hardLiveDocs, liveDocs); this.isNRT = isNRT; this.numDocs = numDocs; this.core = sr.core; @@ -159,6 +145,15 @@ public final class SegmentReader extends CodecReader { } } + private static boolean assertLiveDocs(boolean isNRT, Bits hardLiveDocs, Bits liveDocs) { + if (isNRT) { + assert hardLiveDocs == null || liveDocs != null : " liveDocs must be non null if hardLiveDocs are non null"; + } else { + assert hardLiveDocs == liveDocs : "non-nrt case must have identical liveDocs"; + } + return true; + } + /** * init most recent DocValues for the current commit */ @@ -361,4 +356,14 @@ public final class SegmentReader extends CodecReader { SegmentCommitInfo getOriginalSegmentInfo() { return originalSi; } + + /** + * Returns the live docs that are not hard-deleted. This is an expert API to be used with + * soft-deletes to filter out documents that are hard deleted, for instance due to aborted documents, or to distinguish + * soft and hard deleted documents, i.e. a rolled back tombstone. 
+ * @lucene.experimental + */ + public Bits getHardLiveDocs() { + return hardLiveDocs; + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/StandardDirectoryReader.java b/lucene/core/src/java/org/apache/lucene/index/StandardDirectoryReader.java index 3b1b72fc7d1..5b2b049125f 100644 --- a/lucene/core/src/java/org/apache/lucene/index/StandardDirectoryReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/StandardDirectoryReader.java @@ -32,6 +32,7 @@ import java.util.concurrent.CopyOnWriteArraySet; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; /** Default implementation of {@link DirectoryReader}. */ @@ -174,16 +175,17 @@ public final class StandardDirectoryReader extends DirectoryReader { try { SegmentReader newReader; if (oldReader == null || commitInfo.info.getUseCompoundFile() != oldReader.getSegmentInfo().info.getUseCompoundFile()) { - // this is a new reader; in case we hit an exception we can decRef it safely newReader = new SegmentReader(commitInfo, infos.getIndexCreatedVersionMajor(), IOContext.READ); newReaders[i] = newReader; } else { if (oldReader.isNRT) { // We must load liveDocs/DV updates from disk: - newReaders[i] = new SegmentReader(commitInfo, oldReader); + Bits liveDocs = commitInfo.hasDeletions() ? 
commitInfo.info.getCodec().liveDocsFormat() + .readLiveDocs(commitInfo.info.dir, commitInfo, IOContext.READONCE) : null; + newReaders[i] = new SegmentReader(commitInfo, oldReader, liveDocs, liveDocs, + commitInfo.info.maxDoc() - commitInfo.getDelCount(), false); } else { - if (oldReader.getSegmentInfo().getDelGen() == commitInfo.getDelGen() && oldReader.getSegmentInfo().getFieldInfosGen() == commitInfo.getFieldInfosGen()) { // No change; this reader will be shared between @@ -197,10 +199,14 @@ public final class StandardDirectoryReader extends DirectoryReader { if (oldReader.getSegmentInfo().getDelGen() == commitInfo.getDelGen()) { // only DV updates - newReaders[i] = new SegmentReader(commitInfo, oldReader, oldReader.getLiveDocs(), oldReader.numDocs(), false); // this is not an NRT reader! + newReaders[i] = new SegmentReader(commitInfo, oldReader, oldReader.getLiveDocs(), + oldReader.getHardLiveDocs(), oldReader.numDocs(), false); // this is not an NRT reader! } else { // both DV and liveDocs have changed - newReaders[i] = new SegmentReader(commitInfo, oldReader); + Bits liveDocs = commitInfo.hasDeletions() ? 
commitInfo.info.getCodec().liveDocsFormat() + .readLiveDocs(commitInfo.info.dir, commitInfo, IOContext.READONCE) : null; + newReaders[i] = new SegmentReader(commitInfo, oldReader, liveDocs, liveDocs, + commitInfo.info.maxDoc() - commitInfo.getDelCount(), false); } } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index 85e6979fe10..4861929d2cb 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -3144,6 +3144,10 @@ public class TestIndexWriter extends LuceneTestCase { numSoftDeleted += info.getSoftDelCount(); } assertEquals(writer.maxDoc() - writer.numDocs(), numSoftDeleted); + for (LeafReaderContext context : reader.leaves()) { + LeafReader leaf = context.reader(); + assertNull(((SegmentReader) leaf).getHardLiveDocs()); + } writer.close(); reader.close(); dir.close(); @@ -3263,6 +3267,12 @@ public class TestIndexWriter extends LuceneTestCase { assertEquals(1, topDocs.totalHits); } } + if (mixDeletes == false) { + for (LeafReaderContext context : reader.leaves()) { + LeafReader leaf = context.reader(); + assertNull(((SegmentReader) leaf).getHardLiveDocs()); + } + } mergeAwaySoftDeletes.set(true); writer.addDocument(new Document()); // add a dummy doc to trigger a segment here writer.flush(); @@ -3524,4 +3534,69 @@ public class TestIndexWriter extends LuceneTestCase { w.close(); d.close(); } + + public void testSoftAndHardLiveDocs() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig indexWriterConfig = newIndexWriterConfig(); + String softDeletesField = "soft_delete"; + indexWriterConfig.setSoftDeletesField(softDeletesField); + IndexWriter writer = new IndexWriter(dir, indexWriterConfig); + Set uniqueDocs = new HashSet<>(); + for (int i = 0; i < 100; i++) { + int docId = random().nextInt(5); + uniqueDocs.add(docId); + Document doc = new 
Document(); + doc.add(new StringField("id", String.valueOf(docId), Field.Store.YES)); + if (docId % 2 == 0) { + writer.updateDocument(new Term("id", String.valueOf(docId)), doc); + } else { + writer.softUpdateDocument(new Term("id", String.valueOf(docId)), doc, + new NumericDocValuesField(softDeletesField, 0)); + } + if (random().nextBoolean()) { + assertHardLiveDocs(writer, uniqueDocs); + } + } + + if (random().nextBoolean()) { + writer.commit(); + } + assertHardLiveDocs(writer, uniqueDocs); + + + IOUtils.close(writer, dir); + } + + private void assertHardLiveDocs(IndexWriter writer, Set uniqueDocs) throws IOException { + try (DirectoryReader reader = DirectoryReader.open(writer)) { + assertEquals(uniqueDocs.size(), reader.numDocs()); + List leaves = reader.leaves(); + for (LeafReaderContext ctx : leaves) { + LeafReader leaf = ctx.reader(); + assertTrue(leaf instanceof SegmentReader); + SegmentReader sr = (SegmentReader) leaf; + if (sr.getHardLiveDocs() != null) { + Terms id = sr.terms("id"); + TermsEnum iterator = id.iterator(); + Bits hardLiveDocs = sr.getHardLiveDocs(); + Bits liveDocs = sr.getLiveDocs(); + for (Integer dId : uniqueDocs) { + boolean mustBeHardDeleted = dId % 2 == 0; + if (iterator.seekExact(new BytesRef(dId.toString()))) { + PostingsEnum postings = iterator.postings(null); + while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + if (liveDocs.get(postings.docID())) { + assertTrue(hardLiveDocs.get(postings.docID())); + } else if (mustBeHardDeleted) { + assertFalse(hardLiveDocs.get(postings.docID())); + } else { + assertTrue(hardLiveDocs.get(postings.docID())); + } + } + } + } + } + } + } + } } diff --git a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/SegmentInfosSearcherManager.java b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/SegmentInfosSearcherManager.java index a04464aa9da..d18ee1029d9 100644 --- a/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/SegmentInfosSearcherManager.java +++ 
b/lucene/replicator/src/java/org/apache/lucene/replicator/nrt/SegmentInfosSearcherManager.java @@ -107,7 +107,7 @@ class SegmentInfosSearcherManager extends ReferenceManager { DirectoryReader r = StandardDirectoryReader.open(dir, currentInfos, subs); addReaderClosedListener(r); node.message("refreshed to version=" + currentInfos.getVersion() + " r=" + r); - return SearcherManager.getSearcher(searcherFactory, r, (DirectoryReader) old.getIndexReader()); + return SearcherManager.getSearcher(searcherFactory, r, old.getIndexReader()); } private void addReaderClosedListener(IndexReader r) {