LUCENE-10040: Add test for vector search with skewed deletions (#527)

This exercises a challenging case where the documents to skip all happen to
be closest to the query vector. In many cases, HNSW appears to be robust to this
case and maintains good recall.
This commit is contained in:
Julie Tibshirani 2021-12-08 11:24:12 -08:00 committed by GitHub
parent b9287c8ce0
commit 5d39bca87a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 35 additions and 0 deletions

View File

@@ -196,6 +196,41 @@ public class TestHnswGraph extends LuceneTestCase {
assertTrue("sum(result docs)=" + sum, sum < 75);
}
/**
 * Searches an HNSW graph built over 1000 circular vectors while the accept filter rejects
 * ordinals 0..499 and accepts only ordinals 500..999.
 *
 * <p>The test verifies three things: exactly 10 results come back, none of them is a rejected
 * ordinal, and the sum of the returned ordinals stays below 5100. The best possible sum is
 * 500 + 501 + ... + 509 = 5045, so the bound leaves a small margin for approximate search.
 *
 * @throws IOException declared by the signature; presumably raised by graph building or
 *     searching (TODO confirm against the builder/search signatures)
 */
public void testSearchWithSkewedAcceptOrds() throws IOException {
  int nDoc = 1000;
  // First ordinal that the filter accepts; everything below it is treated as skipped/deleted.
  int firstAcceptedOrd = 500;

  CircularVectorValues vectors = new CircularVectorValues(nDoc);
  HnswGraphBuilder builder =
      new HnswGraphBuilder(
          vectors, VectorSimilarityFunction.DOT_PRODUCT, 16, 100, random().nextInt());
  HnswGraph hnsw = builder.build(vectors);

  // Skip the lower half of the ordinals. Per the recall note below, lower ordinals are the
  // ones closest to the query vector, so the filter removes exactly the best candidates.
  FixedBitSet acceptOrds = new FixedBitSet(nDoc);
  for (int i = firstAcceptedOrd; i < nDoc; i++) {
    acceptOrds.set(i);
  }

  NeighborQueue nn =
      HnswGraph.search(
          new float[] {1, 0},
          10,
          10,
          vectors.randomAccess(),
          VectorSimilarityFunction.DOT_PRODUCT,
          hnsw,
          acceptOrds,
          new SplittableRandom(random().nextLong()));
  int[] nodes = nn.nodes();

  // assertEquals (rather than assertTrue on ==) so a failure reports the actual result count.
  assertEquals("Number of found results is not equal to [10].", 10, nodes.length);

  int sum = 0;
  for (int node : nodes) {
    assertTrue("the results include a deleted document: " + node, acceptOrds.get(node));
    sum += node;
  }

  // We still expect to get reasonable recall. The lowest non-skipped docIds
  // are closest to the query vector: sum(500,509) = 5045
  assertTrue("sum(result docs)=" + sum, sum < 5100);
}
public void testBoundsCheckerMax() {
BoundsChecker max = BoundsChecker.create(false);
float f = random().nextFloat() - 0.5f;