From d992964493bf15b25361aadf93d44fa6a8ff1f65 Mon Sep 17 00:00:00 2001
From: Julie Tibshirani
Date: Wed, 19 Oct 2022 09:59:59 -0700
Subject: [PATCH] Add monster test that indexes 1M vectors

---
 .../lucene/document/TestManyKnnVectors.java   | 167 ++++++++++++++++++
 1 file changed, 167 insertions(+)
 create mode 100644 lucene/core/src/test/org/apache/lucene/document/TestManyKnnVectors.java

diff --git a/lucene/core/src/test/org/apache/lucene/document/TestManyKnnVectors.java b/lucene/core/src/test/org/apache/lucene/document/TestManyKnnVectors.java
new file mode 100644
index 00000000000..5b3c2905ca3
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/document/TestManyKnnVectors.java
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.document;
+
+import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.index.VectorValues;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.tests.util.LuceneTestCase.Monster;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.FloatBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
+
+/**
+ * Tests a large dataset of kNN vectors to check for issues that only show up when
+ * segments are very large, like overflow. The dataset is based on the StackOverflow
+ * track from Elasticsearch's rally benchmarks: https://github.com/elastic/rally-tracks/tree/master/so_vector.
+ *
+ * Steps to run the test
+ *   1. Download the dataset: wget https://rally-tracks.elastic.co/so_vector/documents.bin
+ *   2. Move the dataset to the resources folder: mv documents.bin lucene/core/src/resources/
+ *   3. Start the test:
+ *      ./gradlew test --tests TestManyKnnVectors.testLargeSegment -Dtests.monster=true -Dtests.verbose=true \
+ *      -Dorg.gradle.jvmargs="-Xms2g -Xmx2g" --max-workers=1
+ */
+@TimeoutSuite(millis = 10_800_000) // 3 hour timeout
+@Monster("takes ~2 hours and needs 2GB heap")
+public class TestManyKnnVectors extends LuceneTestCase {
+  /**
+   * Indexes one million 768-dimensional vectors read from {@code documents.bin}, force-merges them
+   * into a single segment, and checks that every vector can be iterated from that segment.
+   */
+  public void testLargeSegment() throws Exception {
+    IndexWriterConfig iwc = newIndexWriterConfig();
+    if (random().nextBoolean()) {
+      iwc.setIndexSort(new Sort(new SortField("sortkey", SortField.Type.INT)));
+    }
+    String fieldName = "field";
+    VectorSimilarityFunction similarityFunction = VectorSimilarityFunction.DOT_PRODUCT;
+
+    URL documentsPath = getClass().getClassLoader().getResource("documents.bin");
+    assertNotNull(
+        "documents.bin was not found on the test classpath; see the class javadoc", documentsPath);
+
+    try (FileChannel input = FileChannel.open(Paths.get(documentsPath.toURI()));
+        Directory dir = FSDirectory.open(createTempDir("ManyKnnVectors"));
+        IndexWriter iw = new IndexWriter(dir, iwc)) {
+
+      // This data is enough to trigger the overflow bug in issue #11858,
+      // since 1_000_000 * 768 * 4 > Integer.MAX_VALUE
+      int numVectors = 1_000_000;
+      int dims = 768;
+
+      VectorReader vectorReader = new VectorReader(input, dims);
+      for (int i = 0; i < numVectors; i++) {
+        // NOTE(review): next() returns the same array on every call, so this relies on
+        // addDocument not keeping a reference to it - confirm against the vector writer.
+        float[] vector = vectorReader.next();
+        Document doc = new Document();
+        doc.add(new KnnVectorField(fieldName, vector, similarityFunction));
+        doc.add(new NumericDocValuesField("sortkey", random().nextInt(100)));
+        iw.addDocument(doc);
+        if (VERBOSE && i % 10_000 == 0) {
+          System.out.println("Indexed " + i + " vectors out of " + numVectors);
+        }
+      }
+      iw.forceMerge(1);
+
+      try (IndexReader reader = DirectoryReader.open(iw)) {
+        // forceMerge(1) should have left exactly one segment holding all the vectors.
+        assertEquals(1, reader.leaves().size());
+        LeafReaderContext ctx = reader.leaves().get(0);
+
+        VectorValues vectorValues = ctx.reader().getVectorValues(fieldName);
+        assertNotNull(vectorValues);
+        assertEquals(numVectors, vectorValues.size());
+
+        int numVectorsRead = 0;
+        while (vectorValues.nextDoc() != NO_MORE_DOCS) {
+          float[] v = vectorValues.vectorValue();
+          assertEquals(dims, v.length);
+          numVectorsRead++;
+        }
+        assertEquals(numVectors, numVectorsRead);
+      }
+    }
+  }
+
+  /**
+   * Reads fixed-dimension float vectors, stored back to back as little-endian 32-bit floats, from
+   * a {@link FileChannel}.
+   */
+  private static class VectorReader {
+    private final FileChannel input;
+    private final float[] vector;
+    private final ByteBuffer buffer;
+
+    public VectorReader(FileChannel input, int dims) {
+      this.input = input;
+      this.vector = new float[dims];
+
+      byte[] bytes = new byte[dims * Float.BYTES];
+      this.buffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN);
+    }
+
+    /**
+     * Reads the next vector from the channel.
+     *
+     * @return the vector just read; the same array instance is returned, and overwritten, on every
+     *     call
+     * @throws EOFException if the channel ends before a complete vector has been read
+     * @throws IOException if reading from the channel fails
+     */
+    public float[] next() throws IOException {
+      // A single FileChannel#read call may fill only part of the buffer and returns -1 at end of
+      // file, so keep reading until a whole vector is available instead of decoding stale bytes.
+      buffer.clear();
+      while (buffer.hasRemaining()) {
+        if (input.read(buffer) < 0) {
+          throw new EOFException(
+              "Input ended after "
+                  + buffer.position()
+                  + " of "
+                  + buffer.capacity()
+                  + " bytes of the next vector");
+        }
+      }
+      buffer.flip();
+      FloatBuffer floatBuffer = buffer.asFloatBuffer();
+      floatBuffer.get(vector);
+      return vector;
+    }
+  }
+}