LUCENE-8599: Use sparse bitset to store docs in SingleValueDocValuesFieldUpdates

Using a sparse bitset in SingleValueDocValuesFieldUdpates allows storing
which documents have an update much more efficient and prevents the need
to sort the docs array altogether that showed to be a significant bottleneck
in LUCENE-8598. Using the spares bitset yields another 10x performance improvement
in applying updates versus the changes proposed in LUCENE-8598.
This commit is contained in:
Simon Willnauer 2018-12-10 15:31:43 +01:00
parent fdd9b55df0
commit ef61b547b1
3 changed files with 124 additions and 11 deletions

View File

@ -271,6 +271,9 @@ Optimizations
* LUCENE-8598: Moved to the default accepted overhead ratio for packet ints in DocValuesFieldUpdats * LUCENE-8598: Moved to the default accepted overhead ratio for packet ints in DocValuesFieldUpdats
yields an up-to 4x performance improvement when applying doc values updates. (Simon Willnauer, Adrien Grand) yields an up-to 4x performance improvement when applying doc values updates. (Simon Willnauer, Adrien Grand)
* LUCENE-8599: Use sparse bitset to store docs in SingleValueDocValuesFieldUpdates.
(Simon Willnauer, Adrien Grand)
Other Other
* LUCENE-8573: BKDWriter now uses FutureArrays#mismatch to compute shared prefixes. * LUCENE-8573: BKDWriter now uses FutureArrays#mismatch to compute shared prefixes.

View File

@ -18,10 +18,13 @@ package org.apache.lucene.index;
import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Accountable; import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.InPlaceMergeSorter; import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.SparseFixedBitSet;
import org.apache.lucene.util.packed.PackedInts; import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PagedMutable; import org.apache.lucene.util.packed.PagedMutable;
@ -282,7 +285,6 @@ abstract class DocValuesFieldUpdates implements Accountable {
throw new IllegalStateException("already finished"); throw new IllegalStateException("already finished");
} }
finished = true; finished = true;
// shrink wrap // shrink wrap
if (size < docs.size()) { if (size < docs.size()) {
resize(size); resize(size);
@ -304,7 +306,7 @@ abstract class DocValuesFieldUpdates implements Accountable {
} }
/** Returns true if this instance contains any updates. */ /** Returns true if this instance contains any updates. */
synchronized final boolean any() { synchronized boolean any() {
return size > 0; return size > 0;
} }
@ -316,7 +318,7 @@ abstract class DocValuesFieldUpdates implements Accountable {
* Adds an update that resets the documents value. * Adds an update that resets the documents value.
* @param doc the doc to update * @param doc the doc to update
*/ */
final synchronized void reset(int doc) { synchronized void reset(int doc) {
addInternal(doc, HAS_NO_VALUE_MASK); addInternal(doc, HAS_NO_VALUE_MASK);
} }
final synchronized int add(int doc) { final synchronized int add(int doc) {
@ -431,21 +433,42 @@ abstract class DocValuesFieldUpdates implements Accountable {
} }
static abstract class SingleValueDocValuesFieldUpdates extends DocValuesFieldUpdates { static abstract class SingleValueDocValuesFieldUpdates extends DocValuesFieldUpdates {
private final BitSet bitSet;
private BitSet hasNoValue;
private boolean hasAtLeastOneValue;
protected SingleValueDocValuesFieldUpdates(int maxDoc, long delGen, String field, DocValuesType type) { protected SingleValueDocValuesFieldUpdates(int maxDoc, long delGen, String field, DocValuesType type) {
super(maxDoc, delGen, field, type); super(maxDoc, delGen, field, type);
this.bitSet = new SparseFixedBitSet(maxDoc);
} }
@Override @Override
void add(int doc, long value) { void add(int doc, long value) {
assert longValue() == value; assert longValue() == value;
super.add(doc); bitSet.set(doc);
this.hasAtLeastOneValue = true;
if (hasNoValue != null) {
hasNoValue.clear(doc);
}
} }
@Override @Override
void add(int doc, BytesRef value) { void add(int doc, BytesRef value) {
assert binaryValue().equals(value); assert binaryValue().equals(value);
super.add(doc); bitSet.set(doc);
this.hasAtLeastOneValue = true;
if (hasNoValue != null) {
hasNoValue.clear(doc);
}
}
@Override
synchronized void reset(int doc) {
bitSet.set(doc);
this.hasAtLeastOneValue = true;
if (hasNoValue == null) {
hasNoValue = new SparseFixedBitSet(maxDoc);
}
hasNoValue.set(doc);
} }
@Override @Override
@ -458,11 +481,28 @@ abstract class DocValuesFieldUpdates implements Accountable {
protected abstract long longValue(); protected abstract long longValue();
@Override @Override
Iterator iterator() { synchronized boolean any() {
return new DocValuesFieldUpdates.AbstractIterator(size, docs, delGen) { return super.any() || hasAtLeastOneValue;
}
@Override @Override
protected void set(long idx) { public long ramBytesUsed() {
// nothing to do; return super.ramBytesUsed() + bitSet.ramBytesUsed();
}
@Override
Iterator iterator() {
BitSetIterator iterator = new BitSetIterator(bitSet, maxDoc);
return new DocValuesFieldUpdates.Iterator() {
@Override
public int docID() {
return iterator.docID();
}
@Override
public int nextDoc() {
return iterator.nextDoc();
} }
@Override @Override
@ -474,6 +514,19 @@ abstract class DocValuesFieldUpdates implements Accountable {
BytesRef binaryValue() { BytesRef binaryValue() {
return SingleValueDocValuesFieldUpdates.this.binaryValue(); return SingleValueDocValuesFieldUpdates.this.binaryValue();
} }
@Override
long delGen() {
return delGen;
}
@Override
boolean hasValue() {
if (hasNoValue != null) {
return hasNoValue.get(docID()) == false;
}
return true;
}
}; };
} }
} }

View File

@ -21,6 +21,7 @@ import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
import org.apache.lucene.index.NumericDocValuesFieldUpdates.SingleValueNumericDocValuesFieldUpdates;
import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
@ -129,7 +130,63 @@ public class TestDocValuesFieldUpdates extends LuceneTestCase {
} }
idx++; idx++;
} }
}
public void testSharedValueUpdates() {
int delGen = random().nextInt();
int maxDoc = 1 + random().nextInt(1000);
long value = random().nextLong();
SingleValueNumericDocValuesFieldUpdates update = new SingleValueNumericDocValuesFieldUpdates(delGen, "foo", maxDoc, value);
assertEquals(value, update.longValue());
Boolean[] values = new Boolean[maxDoc];
boolean any = false;
boolean noReset = random().nextBoolean(); // sometimes don't reset
for (int i = 0; i < maxDoc; i++) {
if (random().nextBoolean()) {
values[i] = Boolean.TRUE;
any = true;
update.add(i, value);
} else if (random().nextBoolean() && noReset == false) {
values[i] = null;
any = true;
update.reset(i);
} else {
values[i] = Boolean.FALSE;
}
}
if (noReset == false) {
for (int i = 0; i < values.length; i++) {
if (rarely()) {
if (values[i] == null) {
values[i] = Boolean.TRUE;
update.add(i, value);
} else if (values[i]) {
values[i] = null;
update.reset(i);
}
}
}
}
update.finish();
DocValuesFieldUpdates.Iterator iterator = update.iterator();
assertEquals(any, update.any());
assertEquals(delGen, iterator.delGen());
int index = 0;
while (iterator.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
int doc = iterator.docID();
if (index < iterator.docID()) {
for (;index < doc; index++) {
assertFalse(values[index]);
}
}
if (index == doc) {
if (values[index++] == null) {
assertFalse(iterator.hasValue());
} else {
assertTrue(iterator.hasValue());
assertEquals(value, iterator.longValue());
}
}
}
} }
} }