LUCENE-10311: Make FixedBitSet#approximateCardinality faster (and actually approximate). (#710)

This computes a pop count on a sample of the longs that back the bitset.

Quick benchmarks suggest that this runs 5x-10x faster than
`FixedBitSet#cardinality` depending on the length of the bitset.
This commit is contained in:
Adrien Grand 2022-03-03 08:48:44 +01:00 committed by GitHub
parent 9ed526b70e
commit ca73ed1c28
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 47 additions and 3 deletions

View File

@ -253,6 +253,10 @@ Changes in runtime behavior
* LUCENE-10291: Lucene now only writes files for terms and postings if at least
one field is indexed with postings. (Yannick Welsch)
* LUCENE-10311: FixedBitSet#approximateCardinality now trades accuracy for
speed instead of delegating to FixedBitSet#cardinality.
(Robert Muir, Adrien Grand)
Bug Fixes
---------------------

View File

@ -68,9 +68,7 @@ public abstract class BitSet implements Bits, Accountable {
* for speed if they have the ability to estimate the cardinality of the set without iterating
* over all the data. Implementations may simply return {@link #cardinality()}.
*/
public int approximateCardinality() {
return cardinality();
}
public abstract int approximateCardinality();
/**
* Returns the index of the last set bit before or on the index specified. -1 is returned if there

View File

@ -176,6 +176,30 @@ public final class FixedBitSet extends BitSet {
return (int) BitUtil.pop_array(bits, 0, numWords);
}
/**
 * Returns an estimate of the number of set bits, obtained by sampling the backing words rather
 * than counting all of them.
 *
 * <p>Bitsets backed by at most 1024 words are counted exactly through {@link #cardinality()}.
 * Larger bitsets are estimated from the pop count of the first 16 words of every full 1024-word
 * interval, scaled up to the total number of words.
 *
 * @return the estimated number of set bits, or the exact count for small bitsets
 */
@Override
public int approximateCardinality() {
  // Naive sampling: compute the number of bits that are set on the first 16 longs every 1024
  // longs and scale the result by 1024/16.
  // This computes the pop count on ranges instead of single longs in order to take advantage of
  // vectorization.
  final int rangeLength = 16;
  final int interval = 1024;

  // Use <= rather than <: with exactly `interval` words the sampling loop below would not run a
  // single iteration, leaving maxWord at 0 and making the scaling step divide by zero.
  if (numWords <= interval) {
    return cardinality();
  }

  long popCount = 0;
  int maxWord;
  for (maxWord = 0; maxWord + interval < numWords; maxWord += interval) {
    popCount += BitUtil.pop_array(bits, maxWord, rangeLength);
  }

  // Scale in long arithmetic and multiply before dividing. Computing the factor
  // (interval / rangeLength) * numWords / maxWord on its own as an int truncates it (e.g. 93.75
  // becomes 93) and the intermediate product can overflow int for very large bitsets.
  final long estimate = popCount * (interval / rangeLength) * numWords / maxWord;

  // Guard the narrowing cast so an estimate beyond the int range saturates instead of wrapping.
  return (int) Math.min(estimate, Integer.MAX_VALUE);
}
@Override
public boolean get(int index) {
assert index >= 0 && index < numBits : "index=" + index + ", numBits=" + numBits;

View File

@ -36,6 +36,19 @@ public class TestFixedBitSet extends BaseBitSetTestCase<FixedBitSet> {
return set;
}
public void testApproximateCardinality() {
  // Sampling-based estimation is expected to land close to the exact count when the set bits
  // are spread evenly over the bitset, so set one bit every `step` positions.
  final int numBits = TestUtil.nextInt(random(), 100_000, 200_000);
  final FixedBitSet bitSet = new FixedBitSet(numBits);
  final int offset = random().nextInt(10);
  final int step = TestUtil.nextInt(random(), 10, 20);

  int index = offset;
  while (index < bitSet.length()) {
    bitSet.set(index);
    index += step;
  }

  final int exact = bitSet.cardinality();
  final int tolerance = exact / 20; // 5% error at most
  assertEquals(exact, bitSet.approximateCardinality(), tolerance);
}
void doGet(java.util.BitSet a, FixedBitSet b) {
assertEquals(a.cardinality(), b.cardinality());
int max = b.length();

View File

@ -291,6 +291,11 @@ public abstract class BaseBitSetTestCase<T extends BitSet> extends LuceneTestCas
return bitSet.cardinality();
}
@Override
public int approximateCardinality() {
  // Reports the exact count of the wrapped bitSet as the approximation.
  final int exactCount = bitSet.cardinality();
  return exactCount;
}
@Override
public int prevSetBit(int index) {
return bitSet.previousSetBit(index);