mirror of https://github.com/apache/lucene.git
LUCENE-10311: Make FixedBitSet#approximateCardinality faster (and actually approximate). (#710)
This computes a pop count on a sample of the longs that back the bitset. Quick benchmarks suggest that this runs 5x-10x faster than `FixedBitSet#cardinality` depending on the length of the bitset.
This commit is contained in:
parent
9ed526b70e
commit
ca73ed1c28
|
@ -253,6 +253,10 @@ Changes in runtime behavior
|
||||||
* LUCENE-10291: Lucene now only writes files for terms and postings if at least
|
* LUCENE-10291: Lucene now only writes files for terms and postings if at least
|
||||||
one field is indexed with postings. (Yannick Welsch)
|
one field is indexed with postings. (Yannick Welsch)
|
||||||
|
|
||||||
|
* LUCENE-10311: FixedBitSet#approximateCardinality now trades accuracy for
|
||||||
|
speed instead of delegating to FixedBitSet#cardinality.
|
||||||
|
(Robert Muir, Adrien Grand)
|
||||||
|
|
||||||
Bug Fixes
|
Bug Fixes
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
|
|
|
@ -68,9 +68,7 @@ public abstract class BitSet implements Bits, Accountable {
|
||||||
* for speed if they have the ability to estimate the cardinality of the set without iterating
|
* for speed if they have the ability to estimate the cardinality of the set without iterating
|
||||||
* over all the data. The default implementation returns {@link #cardinality()}.
|
* over all the data. Implementations with no cheaper estimate may return {@link #cardinality()}.
|
||||||
*/
|
*/
|
||||||
public int approximateCardinality() {
|
public abstract int approximateCardinality();
|
||||||
return cardinality();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the index of the last set bit before or on the index specified. -1 is returned if there
|
* Returns the index of the last set bit before or on the index specified. -1 is returned if there
|
||||||
|
|
|
@ -176,6 +176,30 @@ public final class FixedBitSet extends BitSet {
|
||||||
return (int) BitUtil.pop_array(bits, 0, numWords);
|
return (int) BitUtil.pop_array(bits, 0, numWords);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int approximateCardinality() {
|
||||||
|
// Naive sampling: compute the number of bits that are set on the first 16 longs every 1024
|
||||||
|
// longs and scale the result by 1024/16.
|
||||||
|
// This computes the pop count on ranges instead of single longs in order to take advantage of
|
||||||
|
// vectorization.
|
||||||
|
|
||||||
|
final int rangeLength = 16;
|
||||||
|
final int interval = 1024;
|
||||||
|
|
||||||
|
if (numWords < interval) {
|
||||||
|
return cardinality();
|
||||||
|
}
|
||||||
|
|
||||||
|
long popCount = 0;
|
||||||
|
int maxWord;
|
||||||
|
for (maxWord = 0; maxWord + interval < numWords; maxWord += interval) {
|
||||||
|
popCount += BitUtil.pop_array(bits, maxWord, rangeLength);
|
||||||
|
}
|
||||||
|
|
||||||
|
popCount *= (interval / rangeLength) * numWords / maxWord;
|
||||||
|
return (int) popCount;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean get(int index) {
|
public boolean get(int index) {
|
||||||
assert index >= 0 && index < numBits : "index=" + index + ", numBits=" + numBits;
|
assert index >= 0 && index < numBits : "index=" + index + ", numBits=" + numBits;
|
||||||
|
|
|
@ -36,6 +36,19 @@ public class TestFixedBitSet extends BaseBitSetTestCase<FixedBitSet> {
|
||||||
return set;
|
return set;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testApproximateCardinality() {
|
||||||
|
// The approximate cardinality works in such a way that it should be pretty accurate on a bitset
|
||||||
|
// whose bits are uniformly distributed.
|
||||||
|
final FixedBitSet set = new FixedBitSet(TestUtil.nextInt(random(), 100_000, 200_000));
|
||||||
|
final int first = random().nextInt(10);
|
||||||
|
final int interval = TestUtil.nextInt(random(), 10, 20);
|
||||||
|
for (int i = first; i < set.length(); i += interval) {
|
||||||
|
set.set(i);
|
||||||
|
}
|
||||||
|
final int cardinality = set.cardinality();
|
||||||
|
assertEquals(cardinality, set.approximateCardinality(), cardinality / 20); // 5% error at most
|
||||||
|
}
|
||||||
|
|
||||||
void doGet(java.util.BitSet a, FixedBitSet b) {
|
void doGet(java.util.BitSet a, FixedBitSet b) {
|
||||||
assertEquals(a.cardinality(), b.cardinality());
|
assertEquals(a.cardinality(), b.cardinality());
|
||||||
int max = b.length();
|
int max = b.length();
|
||||||
|
|
|
@ -291,6 +291,11 @@ public abstract class BaseBitSetTestCase<T extends BitSet> extends LuceneTestCas
|
||||||
return bitSet.cardinality();
|
return bitSet.cardinality();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int approximateCardinality() {
|
||||||
|
return bitSet.cardinality();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int prevSetBit(int index) {
|
public int prevSetBit(int index) {
|
||||||
return bitSet.previousSetBit(index);
|
return bitSet.previousSetBit(index);
|
||||||
|
|
Loading…
Reference in New Issue