mirror of https://github.com/apache/lucene.git
LUCENE-10311: Make FixedBitSet#approximateCardinality faster (and actually approximate). (#710)
This computes a pop count on a sample of the longs that back the bitset. Quick benchmarks suggest that this runs 5x-10x faster than `FixedBitSet#cardinality` depending on the length of the bitset.
This commit is contained in:
parent
9ed526b70e
commit
ca73ed1c28
|
@ -253,6 +253,10 @@ Changes in runtime behavior
|
|||
* LUCENE-10291: Lucene now only writes files for terms and postings if at least
|
||||
one field is indexed with postings. (Yannick Welsch)
|
||||
|
||||
* LUCENE-10311: FixedBitSet#approximateCardinality now trades accuracy for
|
||||
speed instead of delegating to FixedBitSet#cardinality.
|
||||
(Robert Muir, Adrien Grand)
|
||||
|
||||
Bug Fixes
|
||||
---------------------
|
||||
|
||||
|
|
|
@ -68,9 +68,7 @@ public abstract class BitSet implements Bits, Accountable {
|
|||
* for speed if they have the ability to estimate the cardinality of the set without iterating
|
||||
* over all the data. Implementations may fall back to {@link #cardinality()}.
|
||||
*/
|
||||
public int approximateCardinality() {
|
||||
return cardinality();
|
||||
}
|
||||
public abstract int approximateCardinality();
|
||||
|
||||
/**
|
||||
* Returns the index of the last set bit before or on the index specified. -1 is returned if there
|
||||
|
|
|
@ -176,6 +176,30 @@ public final class FixedBitSet extends BitSet {
|
|||
return (int) BitUtil.pop_array(bits, 0, numWords);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int approximateCardinality() {
|
||||
// Naive sampling: compute the number of bits that are set on the first 16 longs every 1024
|
||||
// longs and scale the result by 1024/16.
|
||||
// This computes the pop count on ranges instead of single longs in order to take advantage of
|
||||
// vectorization.
|
||||
|
||||
final int rangeLength = 16;
|
||||
final int interval = 1024;
|
||||
|
||||
if (numWords < interval) {
|
||||
return cardinality();
|
||||
}
|
||||
|
||||
long popCount = 0;
|
||||
int maxWord;
|
||||
for (maxWord = 0; maxWord + interval < numWords; maxWord += interval) {
|
||||
popCount += BitUtil.pop_array(bits, maxWord, rangeLength);
|
||||
}
|
||||
|
||||
popCount *= (interval / rangeLength) * numWords / maxWord;
|
||||
return (int) popCount;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean get(int index) {
|
||||
assert index >= 0 && index < numBits : "index=" + index + ", numBits=" + numBits;
|
||||
|
|
|
@ -36,6 +36,19 @@ public class TestFixedBitSet extends BaseBitSetTestCase<FixedBitSet> {
|
|||
return set;
|
||||
}
|
||||
|
||||
public void testApproximateCardinality() {
|
||||
// The approximate cardinality works in such a way that it should be pretty accurate on a bitset
|
||||
// whose bits are uniformly distributed.
|
||||
final FixedBitSet set = new FixedBitSet(TestUtil.nextInt(random(), 100_000, 200_000));
|
||||
final int first = random().nextInt(10);
|
||||
final int interval = TestUtil.nextInt(random(), 10, 20);
|
||||
for (int i = first; i < set.length(); i += interval) {
|
||||
set.set(i);
|
||||
}
|
||||
final int cardinality = set.cardinality();
|
||||
assertEquals(cardinality, set.approximateCardinality(), cardinality / 20); // 5% error at most
|
||||
}
|
||||
|
||||
void doGet(java.util.BitSet a, FixedBitSet b) {
|
||||
assertEquals(a.cardinality(), b.cardinality());
|
||||
int max = b.length();
|
||||
|
|
|
@ -291,6 +291,11 @@ public abstract class BaseBitSetTestCase<T extends BitSet> extends LuceneTestCas
|
|||
return bitSet.cardinality();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int approximateCardinality() {
|
||||
return bitSet.cardinality();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int prevSetBit(int index) {
|
||||
return bitSet.previousSetBit(index);
|
||||
|
|
Loading…
Reference in New Issue