LUCENE-8425: Expose hard live docs on SegmentReader level

Today, if soft deletes are used, we expose the union of hard and soft deletes
via LeafReader#getLiveDocs. Yet if a user wants to also search soft-deleted
documents, the only option today is to search all documents, even though some
of them are hard deleted. The recommendation is not to mix the two, but in
exceptional cases, i.e. when a document hits a non-aborting exception during
indexing, the document is marked as hard deleted, which is the correct action.
Exposing the hard live docs at the SegmentReader level makes it possible to
filter out such documents.
Simon Willnauer 2018-07-24 13:41:11 +02:00
parent a254e7d7bc
commit 35fa0b4f55
6 changed files with 118 additions and 31 deletions
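
As a usage sketch of the new API (assumptions: a reader obtained from
DirectoryReader.open(IndexWriter), so that getLiveDocs reflects hard and soft
deletes combined while getHardLiveDocs reflects hard deletes only; the helper
name printSoftOnlyDeletes is made up for illustration):

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.util.Bits;

static void printSoftOnlyDeletes(DirectoryReader reader) {
  for (LeafReaderContext ctx : reader.leaves()) {
    SegmentReader sr = (SegmentReader) ctx.reader(); // assumes unwrapped segment readers
    Bits liveDocs = sr.getLiveDocs();         // union of hard and soft deletes
    Bits hardLiveDocs = sr.getHardLiveDocs(); // hard deletes only
    if (liveDocs == null) {
      continue; // segment has no deletes at all
    }
    for (int docId = 0; docId < sr.maxDoc(); docId++) {
      boolean deleted = liveDocs.get(docId) == false;
      boolean hardLive = hardLiveDocs == null || hardLiveDocs.get(docId);
      if (deleted && hardLive) {
        // soft-deleted but not hard-deleted: still relevant when
        // deliberately searching tombstones, unlike hard-deleted docs
        System.out.println("soft-only delete: segment doc " + docId);
      }
    }
  }
}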

ReaderPool.java

@@ -89,7 +89,7 @@ final class ReaderPool implements Closeable {
       LeafReaderContext leaf = leaves.get(i);
       SegmentReader segReader = (SegmentReader) leaf.reader();
       SegmentReader newReader = new SegmentReader(segmentInfos.info(i), segReader, segReader.getLiveDocs(),
-          segReader.numDocs());
+          segReader.getHardLiveDocs(), segReader.numDocs(), true);
       readerMap.put(newReader.getOriginalSegmentInfo(), new ReadersAndUpdates(segmentInfos.getIndexCreatedVersionMajor(),
           newReader, newPendingDeletes(newReader, newReader.getOriginalSegmentInfo())));
     }

ReadersAndUpdates.java

@@ -214,7 +214,7 @@ final class ReadersAndUpdates {
       // force new liveDocs
       Bits liveDocs = pendingDeletes.getLiveDocs();
       if (liveDocs != null) {
-        return new SegmentReader(info, reader, liveDocs, pendingDeletes.numDocs());
+        return new SegmentReader(info, reader, liveDocs, pendingDeletes.getHardLiveDocs(), pendingDeletes.numDocs(), true);
       } else {
         // liveDocs == null and reader != null. That can only be if there are no deletes
         assert reader.getLiveDocs() == null;
@@ -645,7 +645,8 @@ final class ReadersAndUpdates {
   private SegmentReader createNewReaderWithLatestLiveDocs(SegmentReader reader) throws IOException {
     assert reader != null;
     assert Thread.holdsLock(this) : Thread.currentThread().getName();
-    SegmentReader newReader = new SegmentReader(info, reader, pendingDeletes.getLiveDocs(), pendingDeletes.numDocs());
+    SegmentReader newReader = new SegmentReader(info, reader, pendingDeletes.getLiveDocs(),
+        pendingDeletes.getHardLiveDocs(), pendingDeletes.numDocs(), true);
     boolean success2 = false;
     try {
       pendingDeletes.onNewReader(newReader, info);

SegmentReader.java

@@ -51,6 +51,7 @@ public final class SegmentReader extends CodecReader {
   private final SegmentCommitInfo originalSi;
   private final LeafMetaData metaData;
   private final Bits liveDocs;
+  private final Bits hardLiveDocs;

   // Normally set to si.maxDoc - si.delDocCount, unless we
   // were created as an NRT reader from IW, in which case IW
@@ -87,16 +88,16 @@ public final class SegmentReader extends CodecReader {
     try {
       if (si.hasDeletions()) {
         // NOTE: the bitvector is stored using the regular directory, not cfs
-        liveDocs = codec.liveDocsFormat().readLiveDocs(directory(), si, IOContext.READONCE);
+        hardLiveDocs = liveDocs = codec.liveDocsFormat().readLiveDocs(directory(), si, IOContext.READONCE);
       } else {
         assert si.getDelCount() == 0;
-        liveDocs = null;
+        hardLiveDocs = liveDocs = null;
       }
       numDocs = si.info.maxDoc() - si.getDelCount();
       fieldInfos = initFieldInfos();
       docValuesProducer = initDocValuesProducer();
+      assert assertLiveDocs(isNRT, hardLiveDocs, liveDocs);
       success = true;
     } finally {
       // With lock-less commits, it's entirely possible (and
@@ -110,27 +111,10 @@ public final class SegmentReader extends CodecReader {
     }
   }

-  /** Create new SegmentReader sharing core from a previous
-   * SegmentReader and loading new live docs from a new
-   * deletes file. Used by openIfChanged. */
-  SegmentReader(SegmentCommitInfo si, SegmentReader sr) throws IOException {
-    this(si, sr,
-        si.hasDeletions() ? si.info.getCodec().liveDocsFormat().readLiveDocs(si.info.dir, si, IOContext.READONCE) : null,
-        si.info.maxDoc() - si.getDelCount(), false);
-  }
-
-  /** Create new SegmentReader sharing core from a previous
-   * SegmentReader and using the provided in-memory
-   * liveDocs. Used by IndexWriter to provide a new NRT
-   * reader */
-  SegmentReader(SegmentCommitInfo si, SegmentReader sr, Bits liveDocs, int numDocs) throws IOException {
-    this(si, sr, liveDocs, numDocs, true);
-  }
-
   /** Create new SegmentReader sharing core from a previous
    * SegmentReader and using the provided liveDocs, and recording
    * whether those liveDocs were carried in ram (isNRT=true). */
-  SegmentReader(SegmentCommitInfo si, SegmentReader sr, Bits liveDocs, int numDocs, boolean isNRT) throws IOException {
+  SegmentReader(SegmentCommitInfo si, SegmentReader sr, Bits liveDocs, Bits hardLiveDocs, int numDocs, boolean isNRT) throws IOException {
     if (numDocs > si.info.maxDoc()) {
       throw new IllegalArgumentException("numDocs=" + numDocs + " but maxDoc=" + si.info.maxDoc());
     }
@@ -141,6 +125,8 @@ public final class SegmentReader extends CodecReader {
     this.originalSi = si;
     this.metaData = sr.getMetaData();
     this.liveDocs = liveDocs;
+    this.hardLiveDocs = hardLiveDocs;
+    assert assertLiveDocs(isNRT, hardLiveDocs, liveDocs);
     this.isNRT = isNRT;
     this.numDocs = numDocs;
     this.core = sr.core;
@@ -159,6 +145,15 @@ public final class SegmentReader extends CodecReader {
     }
   }

+  private static boolean assertLiveDocs(boolean isNRT, Bits hardLiveDocs, Bits liveDocs) {
+    if (isNRT) {
+      assert hardLiveDocs == null || liveDocs != null : "liveDocs must be non-null if hardLiveDocs are non-null";
+    } else {
+      assert hardLiveDocs == liveDocs : "non-NRT case must have identical liveDocs";
+    }
+    return true;
+  }
+
   /**
    * init most recent DocValues for the current commit
    */
@@ -361,4 +356,14 @@ public final class SegmentReader extends CodecReader {
   SegmentCommitInfo getOriginalSegmentInfo() {
     return originalSi;
   }
+
+  /**
+   * Returns the live docs that are not hard-deleted. This is an expert API to be used with
+   * soft-deletes to filter out documents that are hard-deleted, for instance due to aborted
+   * documents, or to distinguish soft- and hard-deleted documents, i.e. a rolled-back tombstone.
+   * @lucene.experimental
+   */
+  public Bits getHardLiveDocs() {
+    return hardLiveDocs;
+  }
 }
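
The assertLiveDocs invariant above pins down the two construction paths: an
NRT reader receives in-memory combined live docs from IndexWriter alongside
the hard-only bits, so liveDocs must be present whenever hardLiveDocs are,
while a reader for a commit point loads a single bitset from disk and uses it
for both accessors. A minimal sketch of what this means for callers, assuming
a Directory dir, an IndexWriter writer configured with a soft-deletes field,
and at least one prior commit:

try (DirectoryReader nrt = DirectoryReader.open(writer)) { // isNRT == true
  for (LeafReaderContext ctx : nrt.leaves()) {
    SegmentReader sr = (SegmentReader) ctx.reader();
    // NRT path: hard live docs may differ from the combined live docs,
    // but can only exist together with them
    assert sr.getHardLiveDocs() == null || sr.getLiveDocs() != null;
  }
}
try (DirectoryReader commit = DirectoryReader.open(dir)) { // isNRT == false
  for (LeafReaderContext ctx : commit.leaves()) {
    SegmentReader sr = (SegmentReader) ctx.reader();
    // commit path: both accessors return the same on-disk bitset
    assert sr.getLiveDocs() == sr.getHardLiveDocs();
  }
}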

StandardDirectoryReader.java

@@ -32,6 +32,7 @@ import java.util.concurrent.CopyOnWriteArraySet;
 import org.apache.lucene.store.AlreadyClosedException;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
+import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.IOUtils;

 /** Default implementation of {@link DirectoryReader}. */
@@ -174,16 +175,17 @@ public final class StandardDirectoryReader extends DirectoryReader {
       try {
         SegmentReader newReader;
         if (oldReader == null || commitInfo.info.getUseCompoundFile() != oldReader.getSegmentInfo().info.getUseCompoundFile()) {
           // this is a new reader; in case we hit an exception we can decRef it safely
           newReader = new SegmentReader(commitInfo, infos.getIndexCreatedVersionMajor(), IOContext.READ);
           newReaders[i] = newReader;
         } else {
           if (oldReader.isNRT) {
             // We must load liveDocs/DV updates from disk:
-            newReaders[i] = new SegmentReader(commitInfo, oldReader);
+            Bits liveDocs = commitInfo.hasDeletions() ? commitInfo.info.getCodec().liveDocsFormat()
+                .readLiveDocs(commitInfo.info.dir, commitInfo, IOContext.READONCE) : null;
+            newReaders[i] = new SegmentReader(commitInfo, oldReader, liveDocs, liveDocs,
+                commitInfo.info.maxDoc() - commitInfo.getDelCount(), false);
           } else {
             if (oldReader.getSegmentInfo().getDelGen() == commitInfo.getDelGen()
                 && oldReader.getSegmentInfo().getFieldInfosGen() == commitInfo.getFieldInfosGen()) {
               // No change; this reader will be shared between
@@ -197,10 +199,14 @@ public final class StandardDirectoryReader extends DirectoryReader {
             if (oldReader.getSegmentInfo().getDelGen() == commitInfo.getDelGen()) {
               // only DV updates
-              newReaders[i] = new SegmentReader(commitInfo, oldReader, oldReader.getLiveDocs(), oldReader.numDocs(), false); // this is not an NRT reader!
+              newReaders[i] = new SegmentReader(commitInfo, oldReader, oldReader.getLiveDocs(),
+                  oldReader.getHardLiveDocs(), oldReader.numDocs(), false); // this is not an NRT reader!
             } else {
               // both DV and liveDocs have changed
-              newReaders[i] = new SegmentReader(commitInfo, oldReader);
+              Bits liveDocs = commitInfo.hasDeletions() ? commitInfo.info.getCodec().liveDocsFormat()
+                  .readLiveDocs(commitInfo.info.dir, commitInfo, IOContext.READONCE) : null;
+              newReaders[i] = new SegmentReader(commitInfo, oldReader, liveDocs, liveDocs,
+                  commitInfo.info.maxDoc() - commitInfo.getDelCount(), false);
             }
           }
         }

TestIndexWriter.java

@@ -3144,6 +3144,10 @@ public class TestIndexWriter extends LuceneTestCase {
       numSoftDeleted += info.getSoftDelCount();
     }
     assertEquals(writer.maxDoc() - writer.numDocs(), numSoftDeleted);
+    for (LeafReaderContext context : reader.leaves()) {
+      LeafReader leaf = context.reader();
+      assertNull(((SegmentReader) leaf).getHardLiveDocs());
+    }
     writer.close();
     reader.close();
     dir.close();
@@ -3263,6 +3267,12 @@ public class TestIndexWriter extends LuceneTestCase {
         assertEquals(1, topDocs.totalHits);
       }
     }
+    if (mixDeletes == false) {
+      for (LeafReaderContext context : reader.leaves()) {
+        LeafReader leaf = context.reader();
+        assertNull(((SegmentReader) leaf).getHardLiveDocs());
+      }
+    }
     mergeAwaySoftDeletes.set(true);
     writer.addDocument(new Document()); // add a dummy doc to trigger a segment here
     writer.flush();
@@ -3524,4 +3534,69 @@ public class TestIndexWriter extends LuceneTestCase {
     w.close();
     d.close();
   }
+
+  public void testSoftAndHardLiveDocs() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriterConfig indexWriterConfig = newIndexWriterConfig();
+    String softDeletesField = "soft_delete";
+    indexWriterConfig.setSoftDeletesField(softDeletesField);
+    IndexWriter writer = new IndexWriter(dir, indexWriterConfig);
+    Set<Integer> uniqueDocs = new HashSet<>();
+    for (int i = 0; i < 100; i++) {
+      int docId = random().nextInt(5);
+      uniqueDocs.add(docId);
+      Document doc = new Document();
+      doc.add(new StringField("id", String.valueOf(docId), Field.Store.YES));
+      if (docId % 2 == 0) {
+        writer.updateDocument(new Term("id", String.valueOf(docId)), doc);
+      } else {
+        writer.softUpdateDocument(new Term("id", String.valueOf(docId)), doc,
+            new NumericDocValuesField(softDeletesField, 0));
+      }
+      if (random().nextBoolean()) {
+        assertHardLiveDocs(writer, uniqueDocs);
+      }
+    }
+
+    if (random().nextBoolean()) {
+      writer.commit();
+    }
+    assertHardLiveDocs(writer, uniqueDocs);
+
+    IOUtils.close(writer, dir);
+  }
+
+  private void assertHardLiveDocs(IndexWriter writer, Set<Integer> uniqueDocs) throws IOException {
+    try (DirectoryReader reader = DirectoryReader.open(writer)) {
+      assertEquals(uniqueDocs.size(), reader.numDocs());
+      List<LeafReaderContext> leaves = reader.leaves();
+      for (LeafReaderContext ctx : leaves) {
+        LeafReader leaf = ctx.reader();
+        assertTrue(leaf instanceof SegmentReader);
+        SegmentReader sr = (SegmentReader) leaf;
+        if (sr.getHardLiveDocs() != null) {
+          Terms id = sr.terms("id");
+          TermsEnum iterator = id.iterator();
+          Bits hardLiveDocs = sr.getHardLiveDocs();
+          Bits liveDocs = sr.getLiveDocs();
+          for (Integer dId : uniqueDocs) {
+            boolean mustBeHardDeleted = dId % 2 == 0;
+            if (iterator.seekExact(new BytesRef(dId.toString()))) {
+              PostingsEnum postings = iterator.postings(null);
+              while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+                if (liveDocs.get(postings.docID())) {
+                  assertTrue(hardLiveDocs.get(postings.docID()));
+                } else if (mustBeHardDeleted) {
+                  assertFalse(hardLiveDocs.get(postings.docID()));
+                } else {
+                  assertTrue(hardLiveDocs.get(postings.docID()));
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
 }
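
The per-document expectations in assertHardLiveDocs follow from how the two
update flavors delete the previous version of a document. Condensed, using the
same writer and field names as the test above:

Document doc0 = new Document();
doc0.add(new StringField("id", "0", Field.Store.YES));
// even ids use a hard update: the previous "id:0" document is hard deleted
// and therefore dead in both getLiveDocs() and getHardLiveDocs()
writer.updateDocument(new Term("id", "0"), doc0);

Document doc1 = new Document();
doc1.add(new StringField("id", "1", Field.Store.YES));
// odd ids use a soft update: the previous "id:1" document is only marked
// deleted via the "soft_delete" doc values field; on an NRT reader it is
// dead in getLiveDocs() but still live in getHardLiveDocs()
writer.softUpdateDocument(new Term("id", "1"), doc1,
    new NumericDocValuesField("soft_delete", 0));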

SegmentInfosSearcherManager.java

@@ -107,7 +107,7 @@ class SegmentInfosSearcherManager extends ReferenceManager<IndexSearcher> {
     DirectoryReader r = StandardDirectoryReader.open(dir, currentInfos, subs);
     addReaderClosedListener(r);
     node.message("refreshed to version=" + currentInfos.getVersion() + " r=" + r);
-    return SearcherManager.getSearcher(searcherFactory, r, (DirectoryReader) old.getIndexReader());
+    return SearcherManager.getSearcher(searcherFactory, r, old.getIndexReader());
   }

   private void addReaderClosedListener(IndexReader r) {