From ef61b547b175f9fbb63dd9fca78f7984f4f72be2 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Mon, 10 Dec 2018 15:31:43 +0100 Subject: [PATCH] LUCENE-8599: Use sparse bitset to store docs in SingleValueDocValuesFieldUpdates Using a sparse bitset in SingleValueDocValuesFieldUdpates allows storing which documents have an update much more efficient and prevents the need to sort the docs array altogether that showed to be a significant bottleneck in LUCENE-8598. Using the spares bitset yields another 10x performance improvement in applying updates versus the changes proposed in LUCENE-8598. --- lucene/CHANGES.txt | 3 + .../lucene/index/DocValuesFieldUpdates.java | 73 ++++++++++++++++--- .../index/TestDocValuesFieldUpdates.java | 59 ++++++++++++++- 3 files changed, 124 insertions(+), 11 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 4eaca5f6ba6..22e3e61d15d 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -271,6 +271,9 @@ Optimizations * LUCENE-8598: Moved to the default accepted overhead ratio for packet ints in DocValuesFieldUpdats yields an up-to 4x performance improvement when applying doc values updates. (Simon Willnauer, Adrien Grand) +* LUCENE-8599: Use sparse bitset to store docs in SingleValueDocValuesFieldUpdates. + (Simon Willnauer, Adrien Grand) + Other * LUCENE-8573: BKDWriter now uses FutureArrays#mismatch to compute shared prefixes. diff --git a/lucene/core/src/java/org/apache/lucene/index/DocValuesFieldUpdates.java b/lucene/core/src/java/org/apache/lucene/index/DocValuesFieldUpdates.java index 9ab3b7cc105..c514ad0e22e 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocValuesFieldUpdates.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocValuesFieldUpdates.java @@ -18,10 +18,13 @@ package org.apache.lucene.index; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.BitSet; +import org.apache.lucene.util.BitSetIterator; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.InPlaceMergeSorter; import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.SparseFixedBitSet; import org.apache.lucene.util.packed.PackedInts; import org.apache.lucene.util.packed.PagedMutable; @@ -282,7 +285,6 @@ abstract class DocValuesFieldUpdates implements Accountable { throw new IllegalStateException("already finished"); } finished = true; - // shrink wrap if (size < docs.size()) { resize(size); @@ -302,9 +304,9 @@ abstract class DocValuesFieldUpdates implements Accountable { } }.sort(0, size); } - + /** Returns true if this instance contains any updates. */ - synchronized final boolean any() { + synchronized boolean any() { return size > 0; } @@ -316,7 +318,7 @@ abstract class DocValuesFieldUpdates implements Accountable { * Adds an update that resets the documents value. * @param doc the doc to update */ - final synchronized void reset(int doc) { + synchronized void reset(int doc) { addInternal(doc, HAS_NO_VALUE_MASK); } final synchronized int add(int doc) { @@ -431,21 +433,42 @@ abstract class DocValuesFieldUpdates implements Accountable { } static abstract class SingleValueDocValuesFieldUpdates extends DocValuesFieldUpdates { - + private final BitSet bitSet; + private BitSet hasNoValue; + private boolean hasAtLeastOneValue; protected SingleValueDocValuesFieldUpdates(int maxDoc, long delGen, String field, DocValuesType type) { super(maxDoc, delGen, field, type); + this.bitSet = new SparseFixedBitSet(maxDoc); } @Override void add(int doc, long value) { assert longValue() == value; - super.add(doc); + bitSet.set(doc); + this.hasAtLeastOneValue = true; + if (hasNoValue != null) { + hasNoValue.clear(doc); + } } @Override void add(int doc, BytesRef value) { assert binaryValue().equals(value); - super.add(doc); + bitSet.set(doc); + this.hasAtLeastOneValue = true; + if (hasNoValue != null) { + hasNoValue.clear(doc); + } + } + + @Override + synchronized void reset(int doc) { + bitSet.set(doc); + this.hasAtLeastOneValue = true; + if (hasNoValue == null) { + hasNoValue = new SparseFixedBitSet(maxDoc); + } + hasNoValue.set(doc); } @Override @@ -457,12 +480,29 @@ abstract class DocValuesFieldUpdates implements Accountable { protected abstract long longValue(); + @Override + synchronized boolean any() { + return super.any() || hasAtLeastOneValue; + } + + @Override + public long ramBytesUsed() { + return super.ramBytesUsed() + bitSet.ramBytesUsed(); + } + @Override Iterator iterator() { - return new DocValuesFieldUpdates.AbstractIterator(size, docs, delGen) { + BitSetIterator iterator = new BitSetIterator(bitSet, maxDoc); + return new DocValuesFieldUpdates.Iterator() { + @Override - protected void set(long idx) { - // nothing to do; + public int docID() { + return iterator.docID(); + } + + @Override + public int nextDoc() { + return iterator.nextDoc(); } @Override @@ -474,6 +514,19 @@ abstract class DocValuesFieldUpdates implements Accountable { BytesRef binaryValue() { return SingleValueDocValuesFieldUpdates.this.binaryValue(); } + + @Override + long delGen() { + return delGen; + } + + @Override + boolean hasValue() { + if (hasNoValue != null) { + return hasNoValue.get(docID()) == false; + } + return true; + } }; } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDocValuesFieldUpdates.java b/lucene/core/src/test/org/apache/lucene/index/TestDocValuesFieldUpdates.java index 6c60df3f3be..a3a6e5aff56 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDocValuesFieldUpdates.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDocValuesFieldUpdates.java @@ -21,6 +21,7 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; +import org.apache.lucene.index.NumericDocValuesFieldUpdates.SingleValueNumericDocValuesFieldUpdates; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.LuceneTestCase; @@ -129,7 +130,63 @@ public class TestDocValuesFieldUpdates extends LuceneTestCase { } idx++; } + } - + public void testSharedValueUpdates() { + int delGen = random().nextInt(); + int maxDoc = 1 + random().nextInt(1000); + long value = random().nextLong(); + SingleValueNumericDocValuesFieldUpdates update = new SingleValueNumericDocValuesFieldUpdates(delGen, "foo", maxDoc, value); + assertEquals(value, update.longValue()); + Boolean[] values = new Boolean[maxDoc]; + boolean any = false; + boolean noReset = random().nextBoolean(); // sometimes don't reset + for (int i = 0; i < maxDoc; i++) { + if (random().nextBoolean()) { + values[i] = Boolean.TRUE; + any = true; + update.add(i, value); + } else if (random().nextBoolean() && noReset == false) { + values[i] = null; + any = true; + update.reset(i); + } else { + values[i] = Boolean.FALSE; + } + } + if (noReset == false) { + for (int i = 0; i < values.length; i++) { + if (rarely()) { + if (values[i] == null) { + values[i] = Boolean.TRUE; + update.add(i, value); + } else if (values[i]) { + values[i] = null; + update.reset(i); + } + } + } + } + update.finish(); + DocValuesFieldUpdates.Iterator iterator = update.iterator(); + assertEquals(any, update.any()); + assertEquals(delGen, iterator.delGen()); + int index = 0; + while (iterator.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + int doc = iterator.docID(); + if (index < iterator.docID()) { + for (;index < doc; index++) { + assertFalse(values[index]); + } + } + if (index == doc) { + if (values[index++] == null) { + assertFalse(iterator.hasValue()); + } else { + assertTrue(iterator.hasValue()); + assertEquals(value, iterator.longValue()); + } + } + } } }