mirror of https://github.com/apache/lucene.git
LUCENE-8425: Expose hard live docs on SegmentReader level
Today, if soft deletes are used, we expose the union of hard and soft deletes via LeafReader#getLiveDocs. Yet, if a user wants to also search soft-deleted documents, the only option is to search all documents, even though some of them are hard deleted. The recommendation is not to mix the two, but in exceptional cases, i.e. when a document hits a non-aborting exception during indexing, the document is marked as hard deleted, which is the correct action. Exposing the hard live docs at the SegmentReader level makes it possible to filter out such documents.
This commit is contained in: parent a254e7d7bc, commit 35fa0b4f55
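For context, a minimal usage sketch (an editor's illustration, not part of this commit): it assumes a DirectoryReader over an index written with IndexWriterConfig#setSoftDeletesField, and counts every document that is live or only soft-deleted while skipping hard-deleted ones. The class and method names are invented for the example.

import java.io.IOException;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.util.Bits;

final class HardLiveDocsExample {
  /** Counts documents that are live or only soft-deleted, skipping hard-deleted ones. */
  static int countNonHardDeleted(DirectoryReader reader) throws IOException {
    int count = 0;
    for (LeafReaderContext ctx : reader.leaves()) {
      // Leaves of a StandardDirectoryReader are SegmentReaders.
      SegmentReader segmentReader = (SegmentReader) ctx.reader();
      // getLiveDocs() is the union of hard and soft deletes; getHardLiveDocs()
      // excludes soft deletes only where the distinction is known.
      Bits hardLiveDocs = segmentReader.getHardLiveDocs();
      for (int docId = 0; docId < segmentReader.maxDoc(); docId++) {
        if (hardLiveDocs == null || hardLiveDocs.get(docId)) {
          count++; // live, or merely soft-deleted
        }
      }
    }
    return count;
  }
}

Note that the patch keeps getHardLiveDocs() identical to getLiveDocs() for readers loaded from disk (see assertLiveDocs in the SegmentReader hunks below), so the extra information is only observable where IndexWriter supplied in-memory live docs.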
@@ -89,7 +89,7 @@ final class ReaderPool implements Closeable {
         LeafReaderContext leaf = leaves.get(i);
         SegmentReader segReader = (SegmentReader) leaf.reader();
         SegmentReader newReader = new SegmentReader(segmentInfos.info(i), segReader, segReader.getLiveDocs(),
-            segReader.numDocs());
+            segReader.getHardLiveDocs(), segReader.numDocs(), true);
         readerMap.put(newReader.getOriginalSegmentInfo(), new ReadersAndUpdates(segmentInfos.getIndexCreatedVersionMajor(),
             newReader, newPendingDeletes(newReader, newReader.getOriginalSegmentInfo())));
       }
@@ -214,7 +214,7 @@ final class ReadersAndUpdates {
       // force new liveDocs
       Bits liveDocs = pendingDeletes.getLiveDocs();
       if (liveDocs != null) {
-        return new SegmentReader(info, reader, liveDocs, pendingDeletes.numDocs());
+        return new SegmentReader(info, reader, liveDocs, pendingDeletes.getHardLiveDocs(), pendingDeletes.numDocs(), true);
       } else {
         // liveDocs == null and reader != null. That can only be if there are no deletes
         assert reader.getLiveDocs() == null;
@@ -645,7 +645,8 @@ final class ReadersAndUpdates {
   private SegmentReader createNewReaderWithLatestLiveDocs(SegmentReader reader) throws IOException {
     assert reader != null;
     assert Thread.holdsLock(this) : Thread.currentThread().getName();
-    SegmentReader newReader = new SegmentReader(info, reader, pendingDeletes.getLiveDocs(), pendingDeletes.numDocs());
+    SegmentReader newReader = new SegmentReader(info, reader, pendingDeletes.getLiveDocs(),
+        pendingDeletes.getHardLiveDocs(), pendingDeletes.numDocs(), true);
     boolean success2 = false;
     try {
       pendingDeletes.onNewReader(newReader, info);
@@ -51,6 +51,7 @@ public final class SegmentReader extends CodecReader {
   private final SegmentCommitInfo originalSi;
   private final LeafMetaData metaData;
   private final Bits liveDocs;
+  private final Bits hardLiveDocs;

   // Normally set to si.maxDoc - si.delDocCount, unless we
   // were created as an NRT reader from IW, in which case IW
@@ -65,7 +66,7 @@ public final class SegmentReader extends CodecReader {

   final DocValuesProducer docValuesProducer;
   final FieldInfos fieldInfos;
-
+
   /**
    * Constructs a new SegmentReader with a new core.
    * @throws CorruptIndexException if the index is corrupt
@@ -87,16 +88,16 @@ public final class SegmentReader extends CodecReader {
     try {
       if (si.hasDeletions()) {
         // NOTE: the bitvector is stored using the regular directory, not cfs
-        liveDocs = codec.liveDocsFormat().readLiveDocs(directory(), si, IOContext.READONCE);
+        hardLiveDocs = liveDocs = codec.liveDocsFormat().readLiveDocs(directory(), si, IOContext.READONCE);
       } else {
         assert si.getDelCount() == 0;
-        liveDocs = null;
+        hardLiveDocs = liveDocs = null;
       }
       numDocs = si.info.maxDoc() - si.getDelCount();

       fieldInfos = initFieldInfos();
       docValuesProducer = initDocValuesProducer();
-
+      assert assertLiveDocs(isNRT, hardLiveDocs, liveDocs);
       success = true;
     } finally {
       // With lock-less commits, it's entirely possible (and
@@ -110,27 +111,10 @@ public final class SegmentReader extends CodecReader {
     }
   }

-  /** Create new SegmentReader sharing core from a previous
-   *  SegmentReader and loading new live docs from a new
-   *  deletes file. Used by openIfChanged. */
-  SegmentReader(SegmentCommitInfo si, SegmentReader sr) throws IOException {
-    this(si, sr,
-        si.hasDeletions() ? si.info.getCodec().liveDocsFormat().readLiveDocs(si.info.dir, si, IOContext.READONCE) : null,
-        si.info.maxDoc() - si.getDelCount(), false);
-  }
-
-  /** Create new SegmentReader sharing core from a previous
-   *  SegmentReader and using the provided in-memory
-   *  liveDocs. Used by IndexWriter to provide a new NRT
-   *  reader */
-  SegmentReader(SegmentCommitInfo si, SegmentReader sr, Bits liveDocs, int numDocs) throws IOException {
-    this(si, sr, liveDocs, numDocs, true);
-  }
-
   /** Create new SegmentReader sharing core from a previous
    *  SegmentReader and using the provided liveDocs, and recording
    *  whether those liveDocs were carried in ram (isNRT=true). */
-  SegmentReader(SegmentCommitInfo si, SegmentReader sr, Bits liveDocs, int numDocs, boolean isNRT) throws IOException {
+  SegmentReader(SegmentCommitInfo si, SegmentReader sr, Bits liveDocs, Bits hardLiveDocs, int numDocs, boolean isNRT) throws IOException {
     if (numDocs > si.info.maxDoc()) {
       throw new IllegalArgumentException("numDocs=" + numDocs + " but maxDoc=" + si.info.maxDoc());
     }
@@ -141,6 +125,8 @@ public final class SegmentReader extends CodecReader {
     this.originalSi = si;
     this.metaData = sr.getMetaData();
     this.liveDocs = liveDocs;
+    this.hardLiveDocs = hardLiveDocs;
+    assert assertLiveDocs(isNRT, hardLiveDocs, liveDocs);
     this.isNRT = isNRT;
     this.numDocs = numDocs;
     this.core = sr.core;
@@ -159,6 +145,15 @@ public final class SegmentReader extends CodecReader {
     }
   }

+  private static boolean assertLiveDocs(boolean isNRT, Bits hardLiveDocs, Bits liveDocs) {
+    if (isNRT) {
+      assert hardLiveDocs == null || liveDocs != null : " liveDocs must be non null if hardLiveDocs are non null";
+    } else {
+      assert hardLiveDocs == liveDocs : "non-nrt case must have identical liveDocs";
+    }
+    return true;
+  }
+
   /**
    * init most recent DocValues for the current commit
    */
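The invariant behind the new assertion, restated as a plain predicate (an editor's sketch; the class name is invented and the logic mirrors assertLiveDocs in the hunk above):

import org.apache.lucene.util.Bits;

final class LiveDocsInvariant {
  static boolean holds(boolean isNRT, Bits hardLiveDocs, Bits liveDocs) {
    if (isNRT) {
      // An NRT reader may carry RAM-only soft deletes, so hardLiveDocs may be
      // null while liveDocs is not; hard deletes without any deletes at all
      // would be inconsistent.
      return hardLiveDocs == null || liveDocs != null;
    }
    // A reader loaded from disk cannot tell hard from soft deletes, so both
    // references must point at the very same bitset.
    return hardLiveDocs == liveDocs;
  }
}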
@@ -361,4 +356,14 @@ public final class SegmentReader extends CodecReader {
   SegmentCommitInfo getOriginalSegmentInfo() {
     return originalSi;
   }
+
+  /**
+   * Returns the live docs that are not hard-deleted. This is an expert API to be used with
+   * soft-deletes to filter out documents that are hard deleted, for instance due to aborted documents,
+   * or to distinguish soft and hard deleted documents, i.e. a rolled back tombstone.
+   * @lucene.experimental
+   */
+  public Bits getHardLiveDocs() {
+    return hardLiveDocs;
+  }
 }
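To make the javadoc above concrete, a hedged sketch of how the two bitsets separate a document's three possible states (an editor's illustration; class and method names are invented, and the classification mirrors the assertions in the new testSoftAndHardLiveDocs test further down):

import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.util.Bits;

final class DeletionState {
  /** Most useful on NRT readers, where the two bitsets can differ. */
  static String classify(SegmentReader sr, int docId) {
    Bits liveDocs = sr.getLiveDocs();         // union of hard and soft deletes
    Bits hardLiveDocs = sr.getHardLiveDocs(); // hard deletes only
    if (liveDocs == null || liveDocs.get(docId)) {
      return "live";
    } else if (hardLiveDocs == null || hardLiveDocs.get(docId)) {
      return "soft-deleted"; // e.g. a rolled back tombstone, still readable
    } else {
      return "hard-deleted"; // e.g. an aborted document, to be filtered out
    }
  }
}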
@@ -32,6 +32,7 @@ import java.util.concurrent.CopyOnWriteArraySet;
 import org.apache.lucene.store.AlreadyClosedException;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
+import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.IOUtils;

 /** Default implementation of {@link DirectoryReader}. */
@@ -174,16 +175,17 @@ public final class StandardDirectoryReader extends DirectoryReader {
       try {
         SegmentReader newReader;
         if (oldReader == null || commitInfo.info.getUseCompoundFile() != oldReader.getSegmentInfo().info.getUseCompoundFile()) {

          // this is a new reader; in case we hit an exception we can decRef it safely
          newReader = new SegmentReader(commitInfo, infos.getIndexCreatedVersionMajor(), IOContext.READ);
          newReaders[i] = newReader;
        } else {
          if (oldReader.isNRT) {
            // We must load liveDocs/DV updates from disk:
-            newReaders[i] = new SegmentReader(commitInfo, oldReader);
+            Bits liveDocs = commitInfo.hasDeletions() ? commitInfo.info.getCodec().liveDocsFormat()
+                .readLiveDocs(commitInfo.info.dir, commitInfo, IOContext.READONCE) : null;
+            newReaders[i] = new SegmentReader(commitInfo, oldReader, liveDocs, liveDocs,
+                commitInfo.info.maxDoc() - commitInfo.getDelCount(), false);
          } else {

            if (oldReader.getSegmentInfo().getDelGen() == commitInfo.getDelGen()
                && oldReader.getSegmentInfo().getFieldInfosGen() == commitInfo.getFieldInfosGen()) {
              // No change; this reader will be shared between
@@ -197,10 +199,14 @@ public final class StandardDirectoryReader extends DirectoryReader {

            if (oldReader.getSegmentInfo().getDelGen() == commitInfo.getDelGen()) {
              // only DV updates
-              newReaders[i] = new SegmentReader(commitInfo, oldReader, oldReader.getLiveDocs(), oldReader.numDocs(), false); // this is not an NRT reader!
+              newReaders[i] = new SegmentReader(commitInfo, oldReader, oldReader.getLiveDocs(),
+                  oldReader.getHardLiveDocs(), oldReader.numDocs(), false); // this is not an NRT reader!
            } else {
              // both DV and liveDocs have changed
-              newReaders[i] = new SegmentReader(commitInfo, oldReader);
+              Bits liveDocs = commitInfo.hasDeletions() ? commitInfo.info.getCodec().liveDocsFormat()
+                  .readLiveDocs(commitInfo.info.dir, commitInfo, IOContext.READONCE) : null;
+              newReaders[i] = new SegmentReader(commitInfo, oldReader, liveDocs, liveDocs,
+                  commitInfo.info.maxDoc() - commitInfo.getDelCount(), false);
            }
          }
        }
@@ -3144,6 +3144,10 @@ public class TestIndexWriter extends LuceneTestCase {
      numSoftDeleted += info.getSoftDelCount();
    }
    assertEquals(writer.maxDoc() - writer.numDocs(), numSoftDeleted);
+    for (LeafReaderContext context : reader.leaves()) {
+      LeafReader leaf = context.reader();
+      assertNull(((SegmentReader) leaf).getHardLiveDocs());
+    }
    writer.close();
    reader.close();
    dir.close();
@@ -3263,6 +3267,12 @@ public class TestIndexWriter extends LuceneTestCase {
          assertEquals(1, topDocs.totalHits);
        }
      }
+      if (mixDeletes == false) {
+        for (LeafReaderContext context : reader.leaves()) {
+          LeafReader leaf = context.reader();
+          assertNull(((SegmentReader) leaf).getHardLiveDocs());
+        }
+      }
      mergeAwaySoftDeletes.set(true);
      writer.addDocument(new Document()); // add a dummy doc to trigger a segment here
      writer.flush();
@@ -3524,4 +3534,69 @@ public class TestIndexWriter extends LuceneTestCase {
    w.close();
    d.close();
  }
+
+  public void testSoftAndHardLiveDocs() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriterConfig indexWriterConfig = newIndexWriterConfig();
+    String softDeletesField = "soft_delete";
+    indexWriterConfig.setSoftDeletesField(softDeletesField);
+    IndexWriter writer = new IndexWriter(dir, indexWriterConfig);
+    Set<Integer> uniqueDocs = new HashSet<>();
+    for (int i = 0; i < 100; i++) {
+      int docId = random().nextInt(5);
+      uniqueDocs.add(docId);
+      Document doc = new Document();
+      doc.add(new StringField("id", String.valueOf(docId), Field.Store.YES));
+      if (docId % 2 == 0) {
+        writer.updateDocument(new Term("id", String.valueOf(docId)), doc);
+      } else {
+        writer.softUpdateDocument(new Term("id", String.valueOf(docId)), doc,
+            new NumericDocValuesField(softDeletesField, 0));
+      }
+      if (random().nextBoolean()) {
+        assertHardLiveDocs(writer, uniqueDocs);
+      }
+    }
+
+    if (random().nextBoolean()) {
+      writer.commit();
+    }
+    assertHardLiveDocs(writer, uniqueDocs);
+
+    IOUtils.close(writer, dir);
+  }
+
+  private void assertHardLiveDocs(IndexWriter writer, Set<Integer> uniqueDocs) throws IOException {
+    try (DirectoryReader reader = DirectoryReader.open(writer)) {
+      assertEquals(uniqueDocs.size(), reader.numDocs());
+      List<LeafReaderContext> leaves = reader.leaves();
+      for (LeafReaderContext ctx : leaves) {
+        LeafReader leaf = ctx.reader();
+        assertTrue(leaf instanceof SegmentReader);
+        SegmentReader sr = (SegmentReader) leaf;
+        if (sr.getHardLiveDocs() != null) {
+          Terms id = sr.terms("id");
+          TermsEnum iterator = id.iterator();
+          Bits hardLiveDocs = sr.getHardLiveDocs();
+          Bits liveDocs = sr.getLiveDocs();
+          for (Integer dId : uniqueDocs) {
+            boolean mustBeHardDeleted = dId % 2 == 0;
+            if (iterator.seekExact(new BytesRef(dId.toString()))) {
+              PostingsEnum postings = iterator.postings(null);
+              while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+                if (liveDocs.get(postings.docID())) {
+                  assertTrue(hardLiveDocs.get(postings.docID()));
+                } else if (mustBeHardDeleted) {
+                  assertFalse(hardLiveDocs.get(postings.docID()));
+                } else {
+                  assertTrue(hardLiveDocs.get(postings.docID()));
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
 }
@@ -107,7 +107,7 @@ class SegmentInfosSearcherManager extends ReferenceManager<IndexSearcher> {
    DirectoryReader r = StandardDirectoryReader.open(dir, currentInfos, subs);
    addReaderClosedListener(r);
    node.message("refreshed to version=" + currentInfos.getVersion() + " r=" + r);
-    return SearcherManager.getSearcher(searcherFactory, r, (DirectoryReader) old.getIndexReader());
+    return SearcherManager.getSearcher(searcherFactory, r, old.getIndexReader());
  }

  private void addReaderClosedListener(IndexReader r) {