Fix 9.12.0 backcompat break (Lucene 9.12.0 cannot read 9.11.x indices written with quantized HNSW, `Lucene99HnswScalarQuantizedVectorsFormat`) (#13874)

* carefully regenerate the int8_hnsw bwc indices so that they do in fact use Lucene99ScalarQuantizedVectorsFormat ... when running TestInt8HnswBackwardsCompatibility it now fails (as expected) on 9.11.0 and 9.11.1 bwc indices, but not on 9.10.0

* rename int8 -> int7 bwc tests since we are actually testing 7 bit quantization

* actually fix the bwc bug: only allow compress=true when bits is 7 or 8 in HNSW scalar quantization

* tidy

* Revert "rename int8 -> int7 bwc tests since we are actually testing 7 bit quantization"

This reverts commit eeb3f8a668.

* Reapply "rename int8 -> int7 bwc tests since we are actually testing 7 bit quantization"

This reverts commit 3487c4210b.

* #13880: add test to verify the int7 quantized indices are in fact using quantized vectors not float32

* bump 9.12.x version to 9.12.1 and add bwc indices for 9.12.0

* remove duplicate 9.12.0 Version constant

* revert changes to index.9.12.0-cfs.zip, index.9.12.0-nocfs.zip, sorted.9.12.0.zip

* remove unused bwc index

Closes #13867
Closes #13880
This commit is contained in:
Michael McCandless 2024-10-09 16:06:09 -06:00 committed by Mike McCandless
parent e6bb5e2c54
commit eadc07cc6a
12 changed files with 48 additions and 19 deletions

View File

@ -106,8 +106,8 @@ public abstract class BackwardsCompatibilityTestBase extends LuceneTestCase {
* This is a base constructor for parameterized BWC tests. The constructor arguments are provided
* by {@link com.carrotsearch.randomizedtesting.RandomizedRunner} during test execution. A {@link
* com.carrotsearch.randomizedtesting.annotations.ParametersFactory} specified in a subclass
* provides a list lists of arguments for the tests and RandomizedRunner will execute the test for
* each of the argument list.
* provides a list of arguments for the tests and RandomizedRunner will execute the test for each
* of the argument list.
*
* @param version the version this test should run for
* @param indexPattern an index pattern used to open an index; see {@link

View File

@ -39,7 +39,7 @@ public class TestGenerateBwcIndices extends LuceneTestCase {
// To generate backcompat indexes with the current default codec, run the following gradle
// command:
// gradlew test -Ptests.bwcdir=/path/to/store/indexes -Ptests.codec=default
// -Ptests.useSecurityManager=false --tests TestGenerateBwcIndices
// -Ptests.useSecurityManager=false --tests TestGenerateBwcIndices --max-workers=1
//
// Also add testmethod with one of the index creation methods below, for example:
// -Ptestmethod=testCreateCFS
@ -82,14 +82,14 @@ public class TestGenerateBwcIndices extends LuceneTestCase {
sortedTest.createBWCIndex();
}
public void testCreateInt8HNSWIndices() throws IOException {
TestInt8HnswBackwardsCompatibility int8HnswBackwardsCompatibility =
new TestInt8HnswBackwardsCompatibility(
public void testCreateInt7HNSWIndices() throws IOException {
TestInt7HnswBackwardsCompatibility int7HnswBackwardsCompatibility =
new TestInt7HnswBackwardsCompatibility(
Version.LATEST,
createPattern(
TestInt8HnswBackwardsCompatibility.INDEX_NAME,
TestInt8HnswBackwardsCompatibility.SUFFIX));
int8HnswBackwardsCompatibility.createBWCIndex();
TestInt7HnswBackwardsCompatibility.INDEX_NAME,
TestInt7HnswBackwardsCompatibility.SUFFIX));
int7HnswBackwardsCompatibility.createBWCIndex();
}
private boolean isInitialMajorVersionRelease() {

View File

@ -23,17 +23,22 @@ import java.io.IOException;
import org.apache.lucene.backward_codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader;
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.KnnFloatVectorField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.search.IndexSearcher;
@ -41,23 +46,23 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.analysis.MockAnalyzer;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.quantization.QuantizedByteVectorValues;
public class TestInt8HnswBackwardsCompatibility extends BackwardsCompatibilityTestBase {
public class TestInt7HnswBackwardsCompatibility extends BackwardsCompatibilityTestBase {
static final String INDEX_NAME = "int8_hnsw";
static final String INDEX_NAME = "int7_hnsw";
static final String SUFFIX = "";
private static final Version FIRST_INT8_HNSW_VERSION = Version.LUCENE_9_10_0;
private static final Version FIRST_INT7_HNSW_VERSION = Version.LUCENE_9_10_0;
private static final String KNN_VECTOR_FIELD = "knn_field";
private static final int DOC_COUNT = 30;
private static final FieldType KNN_VECTOR_FIELD_TYPE =
KnnFloatVectorField.createFieldType(3, VectorSimilarityFunction.COSINE);
private static final float[] KNN_VECTOR = {0.2f, -0.1f, 0.1f};
public TestInt8HnswBackwardsCompatibility(Version version, String pattern) {
public TestInt7HnswBackwardsCompatibility(Version version, String pattern) {
super(version, pattern);
}
/** Provides all sorted versions to the test-framework */
@ParametersFactory(argumentFormatting = "Lucene-Version:%1$s; Pattern: %2$s")
public static Iterable<Object[]> testVersionsFactory() throws IllegalAccessException {
return allVersion(INDEX_NAME, SUFFIX);
@ -76,7 +81,7 @@ public class TestInt8HnswBackwardsCompatibility extends BackwardsCompatibilityTe
@Override
protected boolean supportsVersion(Version version) {
return version.onOrAfter(FIRST_INT8_HNSW_VERSION);
return version.onOrAfter(FIRST_INT7_HNSW_VERSION);
}
@Override
@ -84,7 +89,7 @@ public class TestInt8HnswBackwardsCompatibility extends BackwardsCompatibilityTe
// We don't use the default codec
}
public void testInt8HnswIndexAndSearch() throws Exception {
public void testInt7HnswIndexAndSearch() throws Exception {
IndexWriterConfig indexWriterConfig =
newIndexWriterConfig(new MockAnalyzer(random()))
.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
@ -108,7 +113,6 @@ public class TestInt8HnswBackwardsCompatibility extends BackwardsCompatibilityTe
assertKNNSearch(searcher, KNN_VECTOR, 10, 10, "0");
}
}
// This will confirm the docs are really sorted
TestUtil.checkIndex(directory);
}
@ -117,7 +121,7 @@ public class TestInt8HnswBackwardsCompatibility extends BackwardsCompatibilityTe
IndexWriterConfig conf =
new IndexWriterConfig(new MockAnalyzer(random()))
.setMaxBufferedDocs(10)
.setCodec(TestUtil.getDefaultCodec())
.setCodec(getCodec())
.setMergePolicy(NoMergePolicy.INSTANCE);
try (IndexWriter writer = new IndexWriter(dir, conf)) {
for (int i = 0; i < DOC_COUNT; i++) {
@ -147,4 +151,29 @@ public class TestInt8HnswBackwardsCompatibility extends BackwardsCompatibilityTe
assertKNNSearch(searcher, KNN_VECTOR, 10, 10, "0");
}
}
// #13880: make sure the BWC index really contains quantized HNSW not float32.
// For every leaf of the index, the per-field vectors reader registered for KNN_VECTOR_FIELD must
// be a Lucene99HnswVectorsReader that returns non-null quantized vector values for that field.
public void testIndexIsReallyQuantized() throws Exception {
  try (DirectoryReader reader = DirectoryReader.open(directory)) {
    for (LeafReaderContext leafContext : reader.leaves()) {
      KnnVectorsReader knnVectorsReader = ((CodecReader) leafContext.reader()).getVectorReader();
      assertTrue(
          "expected PerFieldKnnVectorsFormat.FieldsReader but got: " + knnVectorsReader,
          knnVectorsReader instanceof PerFieldKnnVectorsFormat.FieldsReader);

      // Unwrap the per-field dispatcher to reach the reader that serves the KNN field.
      KnnVectorsReader forField =
          ((PerFieldKnnVectorsFormat.FieldsReader) knnVectorsReader)
              .getFieldReader(KNN_VECTOR_FIELD);
      // Report the actual reader on failure, like the other assertions in this test do.
      assertTrue(
          "expected Lucene99HnswVectorsReader but got: " + forField,
          forField instanceof Lucene99HnswVectorsReader);

      QuantizedByteVectorValues quantized =
          ((Lucene99HnswVectorsReader) forField).getQuantizedVectorValues(KNN_VECTOR_FIELD);
      assertNotNull(
          "KnnVectorsReader should have quantized interface for field " + KNN_VECTOR_FIELD,
          quantized);
    }
  }
}
}

View File

@ -135,7 +135,7 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
}
final long quantizedVectorBytes;
if (fieldEntry.compress) {
if (fieldEntry.bits <= 4 && fieldEntry.compress) {
// two dimensions -> one byte
quantizedVectorBytes = ((dimension + 1) >> 1) + Float.BYTES;
} else {