Ignore live docs when loading field data, the ID cache and filter caches.

Taking deleted documents into account when loading field data is dangerous
because a field data instance might be built for a given generation of a
segment and then served from the cache to an older generation of the same
segment, which has fewer deleted documents. Documents that are live in the
older generation would then appear to have no values, which could, for
example, lead to under-estimated facet counts. The same issue applies to the
ID cache and filter caches.

Close #3224
Adrien Grand 2013-06-26 23:10:48 +02:00
parent 477489ac82
commit 40cd549c37
14 changed files with 120 additions and 16 deletions
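The change follows one pattern throughout: per-segment structures (field data, ID cache entries, cached filters) are built over all documents, deleted ones included, so that one cached instance is valid for every generation of the segment, and deletions are only consulted at search time against the reader actually being searched. A minimal sketch of that pattern, using the Lucene 4.x APIs that appear in the diff below (the class and method names here are hypothetical, not code from this commit):

import java.io.IOException;

import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;

// Hypothetical helper, for illustration only.
class LiveDocsAtSearchTimeSketch {

    // Load time: pass null live docs on purpose so deleted documents are included
    // and the result can be shared by every generation of the segment.
    static FixedBitSet docsWithValue(AtomicReader reader, String field) throws IOException {
        final FixedBitSet docsWithValue = new FixedBitSet(reader.maxDoc());
        Terms terms = reader.terms(field);
        if (terms == null) {
            return docsWithValue;
        }
        TermsEnum termsEnum = terms.iterator(null);
        DocsEnum docsEnum = null;
        while (termsEnum.next() != null) {
            docsEnum = termsEnum.docs(null, docsEnum, DocsEnum.FLAG_NONE);
            for (int docId = docsEnum.nextDoc(); docId != DocsEnum.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
                docsWithValue.set(docId);
            }
        }
        return docsWithValue;
    }

    // Search time: deletions come from the reader being searched, so an older
    // generation (which has fewer deletes) never sees missing values.
    static int countLiveDocsWithValue(AtomicReader reader, FixedBitSet docsWithValue) {
        final Bits liveDocs = reader.getLiveDocs(); // null when the segment has no deletions
        int count = 0;
        for (int docId = 0; docId < reader.maxDoc(); docId++) {
            if ((liveDocs == null || liveDocs.get(docId)) && docsWithValue.get(docId)) {
                count++;
            }
        }
        return count;
    }
}

Two readers that share the same segment core can then share the loaded structure; only the cheap live-docs check at search time differs between generations.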


@@ -167,8 +167,9 @@ public class WeightedFilterCache extends AbstractIndexComponent implements Filte
}
}
// we can't pass down acceptedDocs provided, because we are caching the result, and acceptedDocs
// might be specific to a query AST, we do pass down the live docs to make sure we optimize the execution
cacheValue = DocIdSets.toCacheable(context.reader(), filter.getDocIdSet(context, context.reader().getLiveDocs()));
// might be specific to a query. We don't pass the live docs either because a cache built for a specific
// generation of a segment might be reused by an older generation which has fewer deleted documents
cacheValue = DocIdSets.toCacheable(context.reader(), filter.getDocIdSet(context, null));
// we might put the same one concurrently, that's fine, it will be replaced and the removal
// will be called
ShardId shardId = ShardUtils.extractShardId(context.reader());
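Because the cached DocIdSet is now generation-agnostic, deletions have to be re-applied wherever the cached set is consumed, and that consumer side is not part of this hunk. A hedged sketch of the usual Lucene 4.x way to do it (class and method names are hypothetical):

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.search.BitsFilteredDocIdSet;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.util.Bits;

// Hypothetical consumer-side helper, for illustration only.
class CachedFilterSketch {

    // Intersect a generation-agnostic cached set with the deletions (or any
    // narrower acceptDocs) of the reader that is actually being searched.
    static DocIdSet applyLiveDocs(DocIdSet cached, AtomicReaderContext context, Bits acceptDocs) {
        Bits bits = acceptDocs != null ? acceptDocs : context.reader().getLiveDocs();
        if (cached == null || bits == null) {
            return cached; // no deletions and no restriction: nothing to filter out
        }
        return BitsFilteredDocIdSet.wrap(cached, bits);
    }
}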


@@ -180,7 +180,7 @@ public class SimpleIdCache extends AbstractIndexComponent implements IdCache, Se
}
HashedBytesArray idAsBytes = checkIfCanReuse(builders, typeAndId[1]);
docsEnum = termsEnum.docs(reader.getLiveDocs(), docsEnum, 0);
docsEnum = termsEnum.docs(null, docsEnum, 0);
for (int docId = docsEnum.nextDoc(); docId != DocsEnum.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
typeBuilder.idToDoc.put(idAsBytes, docId);
typeBuilder.docToId[docId] = idAsBytes;
@@ -216,7 +216,7 @@ public class SimpleIdCache extends AbstractIndexComponent implements IdCache, Se
HashedBytesArray idAsBytes = checkIfCanReuse(builders, typeAndId[1]);
boolean added = false; // optimize for when all the docs are deleted for this id
docsEnum = termsEnum.docs(reader.getLiveDocs(), docsEnum, 0);
docsEnum = termsEnum.docs(null, docsEnum, 0);
for (int docId = docsEnum.nextDoc(); docId != DocsEnum.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
if (!added) {
typeBuilder.parentIdsValues.add(idAsBytes);


@@ -334,7 +334,7 @@ public final class OrdinalsBuilder implements Closeable {
* only full-precision terms.
* </p>
*/
public BytesRefIterator buildFromTerms(final TermsEnum termsEnum, final Bits liveDocs) throws IOException {
public BytesRefIterator buildFromTerms(final TermsEnum termsEnum) throws IOException {
return new BytesRefIterator() {
private DocsEnum docsEnum = null;
@@ -342,7 +342,7 @@ public final class OrdinalsBuilder implements Closeable {
public BytesRef next() throws IOException {
BytesRef ref;
if ((ref = termsEnum.next()) != null) {
docsEnum = termsEnum.docs(liveDocs, docsEnum, DocsEnum.FLAG_NONE);
docsEnum = termsEnum.docs(null, docsEnum, DocsEnum.FLAG_NONE);
nextOrdinal();
int docId;
while((docId = docsEnum.nextDoc()) != DocsEnum.NO_MORE_DOCS) {


@@ -98,7 +98,7 @@ public class DoubleArrayIndexFieldData extends AbstractIndexFieldData<DoubleArra
final float acceptableOverheadRatio = fieldDataType.getSettings().getAsFloat("acceptable_overhead_ratio", PackedInts.DEFAULT);
OrdinalsBuilder builder = new OrdinalsBuilder(terms, reader.maxDoc(), acceptableOverheadRatio);
try {
final BytesRefIterator iter = builder.buildFromTerms(getNumericType().wrapTermsEnum(terms.iterator(null)), reader.getLiveDocs());
final BytesRefIterator iter = builder.buildFromTerms(getNumericType().wrapTermsEnum(terms.iterator(null)));
BytesRef term;
while ((term = iter.next()) != null) {
values.add(NumericUtils.sortableLongToDouble(NumericUtils.prefixCodedToLong(term)));


@@ -78,7 +78,7 @@ public class FSTBytesIndexFieldData extends AbstractBytesIndexFieldData<FSTBytes
final int termOrd = builder.nextOrdinal();
assert termOrd > 0;
fstBuilder.add(Util.toIntsRef(term, scratch), (long)termOrd);
docsEnum = termsEnum.docs(reader.getLiveDocs(), docsEnum, DocsEnum.FLAG_NONE);
docsEnum = termsEnum.docs(null, docsEnum, DocsEnum.FLAG_NONE);
for (int docId = docsEnum.nextDoc(); docId != DocsEnum.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
builder.addDoc(docId);
}


@@ -98,7 +98,7 @@ public class FloatArrayIndexFieldData extends AbstractIndexFieldData<FloatArrayA
final float acceptableOverheadRatio = fieldDataType.getSettings().getAsFloat("acceptable_overhead_ratio", PackedInts.DEFAULT);
OrdinalsBuilder builder = new OrdinalsBuilder(terms, reader.maxDoc(), acceptableOverheadRatio);
try {
BytesRefIterator iter = builder.buildFromTerms(getNumericType().wrapTermsEnum(terms.iterator(null)), reader.getLiveDocs());
BytesRefIterator iter = builder.buildFromTerms(getNumericType().wrapTermsEnum(terms.iterator(null)));
BytesRef term;
while ((term = iter.next()) != null) {
values.add(NumericUtils.sortableIntToFloat(NumericUtils.prefixCodedToInt(term)));


@@ -91,7 +91,7 @@ public class GeoPointDoubleArrayIndexFieldData extends AbstractIndexFieldData<Ge
OrdinalsBuilder builder = new OrdinalsBuilder(terms, reader.maxDoc(), acceptableOverheadRatio);
final CharsRef spare = new CharsRef();
try {
BytesRefIterator iter = builder.buildFromTerms(terms.iterator(null), reader.getLiveDocs());
BytesRefIterator iter = builder.buildFromTerms(terms.iterator(null));
BytesRef term;
while ((term = iter.next()) != null) {
UnicodeUtil.UTF8toUTF16(term, spare);


@@ -113,7 +113,7 @@ public class PackedArrayIndexFieldData extends AbstractIndexFieldData<AtomicNume
final float acceptableOverheadRatio = fieldDataType.getSettings().getAsFloat("acceptable_overhead_ratio", PackedInts.DEFAULT);
OrdinalsBuilder builder = new OrdinalsBuilder(terms, reader.maxDoc(), acceptableOverheadRatio);
try {
BytesRefIterator iter = builder.buildFromTerms(getNumericType().wrapTermsEnum(terms.iterator(null)), reader.getLiveDocs());
BytesRefIterator iter = builder.buildFromTerms(getNumericType().wrapTermsEnum(terms.iterator(null)));
BytesRef term;
assert !getNumericType().isFloatingPoint();
final boolean indexedAsLong = getNumericType().requiredBits() > 32;


@@ -96,7 +96,7 @@ public class PagedBytesIndexFieldData extends AbstractBytesIndexFieldData<PagedB
final int termOrd = builder.nextOrdinal();
assert termOrd == termOrdToBytesOffset.size();
termOrdToBytesOffset.add(bytes.copyUsingLengthPrefix(term));
docsEnum = termsEnum.docs(reader.getLiveDocs(), docsEnum, DocsEnum.FLAG_NONE);
docsEnum = termsEnum.docs(null, docsEnum, DocsEnum.FLAG_NONE);
for (int docId = docsEnum.nextDoc(); docId != DocsEnum.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
builder.addDoc(docId);
}


@@ -24,10 +24,7 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.*;
import org.apache.lucene.store.RAMDirectory;
import org.elasticsearch.ElasticSearchException;
import org.elasticsearch.common.Strings;
@@ -75,6 +72,33 @@ import static org.hamcrest.Matchers.nullValue;
*/
public class SimpleIdCacheTests {
@Test
public void testDeletedDocuments() throws Exception {
SimpleIdCache idCache = createSimpleIdCache(Tuple.tuple("child", "parent"));
IndexWriter writer = createIndexWriter();
// Begins with parent, ends with child docs
final Document parent = doc("parent", "1");
writer.addDocument(parent);
writer.addDocument(childDoc("child", "1", "parent", "1"));
writer.addDocument(childDoc("child", "2", "parent", "1"));
writer.addDocument(childDoc("child", "3", "parent", "1"));
writer.commit();
final String parentUid = parent.get("_uid");
assert parentUid != null;
writer.deleteDocuments(new Term("_uid", parentUid));
writer.close();
DirectoryReader topLevelReader = DirectoryReader.open(writer.getDirectory());
List<AtomicReaderContext> leaves = topLevelReader.getContext().leaves();
idCache.refresh(leaves);
assertThat(leaves.size(), equalTo(1));
IdReaderCache readerCache = idCache.reader(leaves.get(0).reader());
IdReaderTypeCache typeCache = readerCache.type("parent");
assertThat(typeCache.idByDoc(0).toUtf8(), equalTo("1"));
}
@Test
public void testRefresh() throws Exception {
SimpleIdCache idCache = createSimpleIdCache(Tuple.tuple("child", "parent"));


@@ -23,6 +23,7 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoubleField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.Term;
import org.elasticsearch.index.fielddata.FieldDataType;
/**
@@ -50,6 +51,21 @@ public class DoubleFieldDataTests extends NumericFieldDataTests {
return "4.0";
}
protected void add2SingleValuedDocumentsAndDeleteOneOfThem() throws Exception {
Document d = new Document();
d.add(new StringField("_id", "1", Field.Store.NO));
d.add(new DoubleField("value", 2.0d, Field.Store.NO));
writer.addDocument(d);
d = new Document();
d.add(new StringField("_id", "2", Field.Store.NO));
d.add(new DoubleField("value", 4.0d, Field.Store.NO));
writer.addDocument(d);
writer.commit();
writer.deleteDocuments(new Term("_id", "1"));
}
@Override
protected void fillSingleValueAllSet() throws Exception {


@@ -23,6 +23,7 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FloatField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.Term;
import org.elasticsearch.index.fielddata.FieldDataType;
/**
@@ -50,6 +51,21 @@ public class FloatFieldDataTests extends NumericFieldDataTests {
return "4.0";
}
protected void add2SingleValuedDocumentsAndDeleteOneOfThem() throws Exception {
Document d = new Document();
d.add(new StringField("_id", "1", Field.Store.NO));
d.add(new FloatField("value", 2.0f, Field.Store.NO));
writer.addDocument(d);
d = new Document();
d.add(new StringField("_id", "2", Field.Store.NO));
d.add(new FloatField("value", 4.0f, Field.Store.NO));
writer.addDocument(d);
writer.commit();
writer.deleteDocuments(new Term("_id", "1"));
}
@Override
protected void fillSingleValueAllSet() throws Exception {


@@ -28,6 +28,7 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.Term;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.index.fielddata.*;
import org.elasticsearch.index.fielddata.plain.PackedArrayAtomicFieldData;
@@ -54,6 +55,22 @@ public class LongFieldDataTests extends NumericFieldDataTests {
return new FieldDataType("long", ImmutableSettings.builder());
}
protected void add2SingleValuedDocumentsAndDeleteOneOfThem() throws Exception {
Document d = new Document();
d.add(new StringField("_id", "1", Field.Store.NO));
d.add(new LongField("value", 2, Field.Store.NO));
writer.addDocument(d);
d = new Document();
d.add(new StringField("_id", "2", Field.Store.NO));
d.add(new LongField("value", 4, Field.Store.NO));
writer.addDocument(d);
writer.commit();
writer.deleteDocuments(new Term("_id", "1"));
}
@Test
public void testOptimizeTypeLong() throws Exception {
Document d = new Document();


@@ -24,6 +24,7 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.lucene.HashedBytesRef;
@@ -81,6 +82,35 @@ public abstract class StringFieldDataTests extends AbstractFieldDataTests {
writer.addDocument(d);
}
protected void add2SingleValuedDocumentsAndDeleteOneOfThem() throws Exception {
Document d = new Document();
d.add(new StringField("_id", "1", Field.Store.NO));
d.add(new StringField("value", "2", Field.Store.NO));
writer.addDocument(d);
d = new Document();
d.add(new StringField("_id", "2", Field.Store.NO));
d.add(new StringField("value", "4", Field.Store.NO));
writer.addDocument(d);
writer.commit();
writer.deleteDocuments(new Term("_id", "1"));
}
@Test
public void testDeletedDocs() throws Exception {
add2SingleValuedDocumentsAndDeleteOneOfThem();
IndexFieldData indexFieldData = getForField("value");
AtomicReaderContext readerContext = refreshReader();
AtomicFieldData fieldData = indexFieldData.load(readerContext);
BytesValues values = fieldData.getBytesValues();
assertThat(fieldData.getNumDocs(), equalTo(2));
for (int i = 0; i < fieldData.getNumDocs(); ++i) {
assertThat(values.hasValue(i), equalTo(true));
}
}
@Test
public void testSingleValueAllSet() throws Exception {
fillSingleValueAllSet();