LUCENE-10040: Add test for vector search with skewed deletions (#527)

This exercises a challenging case where the documents to skip all happen to
be closest to the query vector. In many cases, HNSW appears to be robust to this
case and maintains good recall.
This commit is contained in:
Julie Tibshirani 2021-12-08 11:24:12 -08:00
parent c74642d9a7
commit 394472d4b8
1 changed files with 35 additions and 0 deletions

View File

@@ -196,6 +196,41 @@ public class TestHnswGraph extends LuceneTestCase {
assertTrue("sum(result docs)=" + sum, sum < 75);
}
/**
 * Searches an HNSW graph with a skewed accept filter: ordinals 0-499 are rejected and only
 * ordinals 500-999 are accepted. The rejected half is meant to be the set of documents closest
 * to the query vector, so the search has to work past them to reach acceptable hits.
 *
 * <p>The test verifies that exactly ten results come back, that none of them is a rejected
 * ordinal, and that the sum of the returned ordinals stays close to the ideal value.
 *
 * @throws IOException declared by the test; presumably raised by graph building or searching
 */
public void testSearchWithSkewedAcceptOrds() throws IOException {
  int nDoc = 1000;
  CircularVectorValues vectors = new CircularVectorValues(nDoc);
  // Build the graph straight from a freshly configured builder; the seed comes from the test RNG.
  HnswGraph hnsw =
      new HnswGraphBuilder(
              vectors, VectorSimilarityFunction.DOT_PRODUCT, 16, 100, random().nextInt())
          .build(vectors);

  // Accept only the upper half of the ordinals. The lower half, which is the part
  // closest to the query vector, is left unset and must therefore be skipped.
  FixedBitSet acceptOrds = new FixedBitSet(nDoc);
  int ord = 500;
  while (ord < nDoc) {
    acceptOrds.set(ord);
    ord++;
  }

  float[] query = {1, 0};
  NeighborQueue nn =
      HnswGraph.search(
          query,
          10,
          10,
          vectors.randomAccess(),
          VectorSimilarityFunction.DOT_PRODUCT,
          hnsw,
          acceptOrds,
          new SplittableRandom(random().nextLong()));

  int[] nodes = nn.nodes();
  assertTrue("Number of found results is not equal to [10].", nodes.length == 10);

  // Every hit must pass the filter; accumulate the ordinals for the recall check below.
  int sum = 0;
  for (int i = 0; i < nodes.length; i++) {
    int node = nodes[i];
    assertTrue("the results include a deleted document: " + node, acceptOrds.get(node));
    sum += node;
  }

  // Recall should remain reasonable. Among accepted documents the lowest ordinals are the
  // nearest to the query, so a perfect result is 500 + 501 + ... + 509 = 5045; a small
  // amount of slack above that is tolerated.
  assertTrue("sum(result docs)=" + sum, sum < 5100);
}
public void testBoundsCheckerMax() {
BoundsChecker max = BoundsChecker.create(false);
float f = random().nextFloat() - 0.5f;