From d992964493bf15b25361aadf93d44fa6a8ff1f65 Mon Sep 17 00:00:00 2001
From: Julie Tibshirani <julietibs@apache.org>
Date: Wed, 19 Oct 2022 09:59:59 -0700
Subject: [PATCH] Add monster test that indexes 1M vectors

---
 .../lucene/document/TestManyKnnVectors.java   | 135 ++++++++++++++++++
 1 file changed, 135 insertions(+)
 create mode 100644 lucene/core/src/test/org/apache/lucene/document/TestManyKnnVectors.java

diff --git a/lucene/core/src/test/org/apache/lucene/document/TestManyKnnVectors.java b/lucene/core/src/test/org/apache/lucene/document/TestManyKnnVectors.java
new file mode 100644
index 00000000000..5b3c2905ca3
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/document/TestManyKnnVectors.java
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.document;
+
+import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.index.VectorValues;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.tests.util.LuceneTestCase.Monster;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.FloatBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
+
+
+/**
+ * Tests a large dataset of kNN vectors to check for issues that only show up when
+ * segments are very large, like overflow. The dataset is based on the StackOverflow
+ * track from Elasticsearch's rally benchmarks: https://github.com/elastic/rally-tracks/tree/master/so_vector.
+ *
+ * Steps to run the test
+ *   1. Download the dataset: wget https://rally-tracks.elastic.co/so_vector/documents.bin
+ *   2. Move the dataset to the resources folder: mv documents.bin lucene/core/src/resources/
+ *   3. Start the test:
+ *     ./gradlew test --tests TestManyKnnVectors.testLargeSegment -Dtests.monster=true -Dtests.verbose=true \
+ *       -Dorg.gradle.jvmargs="-Xms2g -Xmx2g" --max-workers=1
+ */
+@TimeoutSuite(millis = 10_800_000) // 3 hour timeout
+@Monster("takes ~2 hours and needs 2GB heap")
+public class TestManyKnnVectors extends LuceneTestCase {
+  public void testLargeSegment() throws Exception {
+    IndexWriterConfig iwc = newIndexWriterConfig();
+    if (random().nextBoolean()) {
+      iwc.setIndexSort(new Sort(new SortField("sortkey", SortField.Type.INT)));
+    }
+    String fieldName = "field";
+    VectorSimilarityFunction similarityFunction = VectorSimilarityFunction.DOT_PRODUCT;
+
+    URL documentsPath = getClass().getClassLoader().getResource("documents.bin");
+    assertNotNull(documentsPath);
+
+    try (FileChannel input = FileChannel.open(Paths.get(documentsPath.toURI()));
+         Directory dir = FSDirectory.open(createTempDir("ManyKnnVectors"));
+         IndexWriter iw = new IndexWriter(dir, iwc)) {
+
+      // This data is enough to trigger the overflow bug in issue #11858,
+      // since 1_000_000 * 768 * 4 > Integer.MAX_VALUE
+      int numVectors = 1_000_000;
+      int dims = 768;
+
+      VectorReader vectorReader = new VectorReader(input, dims);
+      for (int i = 0; i < numVectors; i++) {
+        float[] vector = vectorReader.next();
+        Document doc = new Document();
+        doc.add(new KnnVectorField(fieldName, vector, similarityFunction));
+        doc.add(new NumericDocValuesField("sortkey", random().nextInt(100)));
+        iw.addDocument(doc);
+        if (VERBOSE && i % 10_000 == 0) {
+          System.out.println("Indexed " + i + " vectors out of " + numVectors);
+        }
+      }
+      iw.forceMerge(1);
+
+      try (IndexReader reader = DirectoryReader.open(iw)) {
+        assertEquals(1, reader.leaves().size());
+        LeafReaderContext ctx = reader.leaves().get(0);
+
+        VectorValues vectorValues = ctx.reader().getVectorValues(fieldName);
+        assertNotNull(vectorValues);
+        assertEquals(numVectors, vectorValues.size());
+
+        int numVectorsRead = 0;
+        while (vectorValues.nextDoc() != NO_MORE_DOCS) {
+          float[] v = vectorValues.vectorValue();
+          assertEquals(dims, v.length);
+          numVectorsRead++;
+        }
+        assertEquals(numVectors, numVectorsRead);
+      }
+    }
+  }
+
+  private static class VectorReader {
+    private final FileChannel input;
+    private final float[] vector;
+    private final ByteBuffer buffer;
+
+    public VectorReader(FileChannel input, int dims) {
+      this.input = input;
+      this.vector = new float[dims];
+
+      byte[] bytes = new byte[dims * Float.BYTES];
+      this.buffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN);
+    }
+
+    public float[] next() throws IOException {
+      input.read(buffer);
+      buffer.position(0);
+      FloatBuffer floatBuffer = buffer.asFloatBuffer();
+      floatBuffer.get(vector);
+      return vector;
+    }
+  }
+}