From ca73ed1c2842b10c338f1d27ec54cead69ac090e Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Thu, 3 Mar 2022 08:48:44 +0100 Subject: [PATCH] LUCENE-10311: Make FixedBitSet#approximateCardinality faster (and actually approximate). (#710) This computes a pop count on a sample of the longs that back the bitset. Quick benchmarks suggest that this runs 5x-10x faster than `FixedBitSet#cardinality` depending on the length of the bitset. --- lucene/CHANGES.txt | 4 ++++ .../java/org/apache/lucene/util/BitSet.java | 4 +--- .../org/apache/lucene/util/FixedBitSet.java | 24 +++++++++++++++++++ .../apache/lucene/util/TestFixedBitSet.java | 13 ++++++++++ .../lucene/tests/util/BaseBitSetTestCase.java | 5 ++++ 5 files changed, 47 insertions(+), 3 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index a75c2539e5d..875c2258cdf 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -253,6 +253,10 @@ Changes in runtime behavior * LUCENE-10291: Lucene now only writes files for terms and postings if at least one field is indexed with postings. (Yannick Welsch) +* LUCENE-10311: FixedBitSet#approximateCardinality now trades accuracy for + speed instead of delegating to FixedBitSet#cardinality. + (Robert Muir, Adrien Grand) + Bug Fixes --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/util/BitSet.java b/lucene/core/src/java/org/apache/lucene/util/BitSet.java index 60e6ea01bcf..f8b8ba65a59 100644 --- a/lucene/core/src/java/org/apache/lucene/util/BitSet.java +++ b/lucene/core/src/java/org/apache/lucene/util/BitSet.java @@ -68,9 +68,7 @@ public abstract class BitSet implements Bits, Accountable { * for speed if they have the ability to estimate the cardinality of the set without iterating * over all the data. The default implementation returns {@link #cardinality()}. 
*/
-  public int approximateCardinality() {
-    return cardinality();
-  }
+  public abstract int approximateCardinality();
 
   /**
    * Returns the index of the last set bit before or on the index specified. -1 is returned if there
diff --git a/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java b/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java
index 71eb31b47f3..5b5b5fc1348 100644
--- a/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java
+++ b/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java
@@ -176,6 +176,42 @@ public final class FixedBitSet extends BitSet {
     return (int) BitUtil.pop_array(bits, 0, numWords);
   }
 
+  /**
+   * Returns an estimate of the number of set bits, computed by sampling the backing words.
+   *
+   * <p>Sets backed by at most 1024 longs are counted exactly. Larger sets are sampled, so the
+   * estimate is only reliable when set bits are spread roughly uniformly.
+   *
+   * @return the estimated number of set bits, capped at the number of bits in this set
+   */
+  @Override
+  public int approximateCardinality() {
+    // Naive sampling: compute the number of bits that are set on the first 16 longs every 1024
+    // longs and scale the result by 1024/16.
+    // This computes the pop count on ranges instead of single longs in order to take advantage of
+    // vectorization.
+
+    final int rangeLength = 16;
+    final int interval = 1024;
+
+    // The sampling loop only runs when numWords > interval. With numWords == interval it would
+    // leave maxWord at 0 and the scaling below would divide by zero, so count exactly instead.
+    if (numWords <= interval) {
+      return cardinality();
+    }
+
+    long popCount = 0;
+    int maxWord;
+    for (maxWord = 0; maxWord + interval < numWords; maxWord += interval) {
+      popCount += BitUtil.pop_array(bits, maxWord, rangeLength);
+    }
+
+    // Scale in long arithmetic and multiply before dividing: an int scale factor is truncated and
+    // (interval / rangeLength) * numWords overflows int once numWords reaches 2^25.
+    popCount = popCount * (interval / rangeLength) * numWords / maxWord;
+    return (int) Math.min(popCount, numBits);
+  }
+
   @Override
   public boolean get(int index) {
     assert index >= 0 && index < numBits : "index=" + index + ", numBits=" + numBits;
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java b/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java
index 558b0590159..86315960da3 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java
@@ -36,6 +36,29 @@ public class TestFixedBitSet extends BaseBitSetTestCase {
     return set;
   }
 
+  public void testApproximateCardinality() {
+    // The approximate cardinality works in such a way that it should be pretty accurate on a bitset
+    // whose bits are uniformly distributed.
+    final FixedBitSet set = new FixedBitSet(TestUtil.nextInt(random(), 100_000, 200_000));
+    final int first = random().nextInt(10);
+    final int interval = TestUtil.nextInt(random(), 10, 20);
+    for (int i = first; i < set.length(); i += interval) {
+      set.set(i);
+    }
+    final int cardinality = set.cardinality();
+    assertEquals(cardinality, set.approximateCardinality(), cardinality / 20); // 5% error at most
+  }
+
+  public void testApproximateCardinalityOnIntervalBoundary() {
+    // A set backed by exactly 1024 longs is exactly one sampling interval long, so the exact
+    // cardinality must be returned instead of failing with a division by zero.
+    final FixedBitSet set = new FixedBitSet(1024 * Long.SIZE);
+    for (int i = 0; i < set.length(); i += 3) {
+      set.set(i);
+    }
+    assertEquals(set.cardinality(), set.approximateCardinality());
+  }
+
   void doGet(java.util.BitSet a, FixedBitSet b) {
     assertEquals(a.cardinality(), b.cardinality());
     int max = b.length();
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/BaseBitSetTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/BaseBitSetTestCase.java
index fa001cfe411..1bb5e500a44 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/BaseBitSetTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/BaseBitSetTestCase.java
@@ -291,6 +291,11 @@ public abstract class BaseBitSetTestCase extends LuceneTestCas
       return bitSet.cardinality();
     }
 
+    @Override
+    public int approximateCardinality() {
+      return bitSet.cardinality();
+    }
+
     @Override
     public int prevSetBit(int index) {
       return bitSet.previousSetBit(index);