Refactor Lucene95 to allow off heap vector reader reuse (#12629)

While going through: https://github.com/apache/lucene/pull/12582

I noticed that for a while now, our offheap vector readers haven't changed at all. We just keep copying them around for no reason.

To make adding a new vector codec simpler, this refactors the lucene95 codec so that its off-heap vector storage format (readers/writers) can be reused by other codecs.

Additionally, it handles reading and writing the appropriate fields for sparse vectors from/to the provided index inputs/outputs.

This should reduce the churn in new codecs significantly.
This commit is contained in:
Benjamin Trent 2023-10-10 11:53:54 -07:00 committed by GitHub
parent 04f38dd288
commit 05d26ac44d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 293 additions and 126 deletions

View File

@ -248,7 +248,13 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
+ " expected: "
+ VectorEncoding.FLOAT32);
}
return OffHeapFloatVectorValues.load(fieldEntry, vectorData);
return OffHeapFloatVectorValues.load(
fieldEntry.ordToDocVectorValues,
fieldEntry.vectorEncoding,
fieldEntry.dimension,
fieldEntry.vectorDataOffset,
fieldEntry.vectorDataLength,
vectorData);
}
@Override
@ -263,7 +269,13 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
+ " expected: "
+ VectorEncoding.FLOAT32);
}
return OffHeapByteVectorValues.load(fieldEntry, vectorData);
return OffHeapByteVectorValues.load(
fieldEntry.ordToDocVectorValues,
fieldEntry.vectorEncoding,
fieldEntry.dimension,
fieldEntry.vectorDataOffset,
fieldEntry.vectorDataLength,
vectorData);
}
@Override
@ -277,7 +289,14 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
return;
}
OffHeapFloatVectorValues vectorValues = OffHeapFloatVectorValues.load(fieldEntry, vectorData);
OffHeapFloatVectorValues vectorValues =
OffHeapFloatVectorValues.load(
fieldEntry.ordToDocVectorValues,
fieldEntry.vectorEncoding,
fieldEntry.dimension,
fieldEntry.vectorDataOffset,
fieldEntry.vectorDataLength,
vectorData);
RandomVectorScorer scorer =
RandomVectorScorer.createFloats(vectorValues, fieldEntry.similarityFunction, target);
HnswGraphSearcher.search(
@ -298,7 +317,14 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
return;
}
OffHeapByteVectorValues vectorValues = OffHeapByteVectorValues.load(fieldEntry, vectorData);
OffHeapByteVectorValues vectorValues =
OffHeapByteVectorValues.load(
fieldEntry.ordToDocVectorValues,
fieldEntry.vectorEncoding,
fieldEntry.dimension,
fieldEntry.vectorDataOffset,
fieldEntry.vectorDataLength,
vectorData);
RandomVectorScorer scorer =
RandomVectorScorer.createBytes(vectorValues, fieldEntry.similarityFunction, target);
HnswGraphSearcher.search(
@ -352,22 +378,9 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
final int offsetsBlockShift;
final long offsetsLength;
// the following four variables used to read docIds encoded by IndexDISI
// special values of docsWithFieldOffset are -1 and -2
// -1 : dense
// -2 : empty
// other: sparse
final long docsWithFieldOffset;
final long docsWithFieldLength;
final short jumpTableEntryCount;
final byte denseRankPower;
// the following four variables used to read ordToDoc encoded by DirectMonotonicWriter
// note that only spare case needs to store ordToDoc
final long addressesOffset;
final int blockShift;
final DirectMonotonicReader.Meta meta;
final long addressesLength;
// Contains the configuration for reading sparse vectors and translating vector ordinals to
// docId
OrdToDocDISIReaderConfiguration ordToDocVectorValues;
FieldEntry(
IndexInput input,
@ -383,24 +396,7 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
dimension = input.readVInt();
size = input.readInt();
docsWithFieldOffset = input.readLong();
docsWithFieldLength = input.readLong();
jumpTableEntryCount = input.readShort();
denseRankPower = input.readByte();
// dense or empty
if (docsWithFieldOffset == -1 || docsWithFieldOffset == -2) {
addressesOffset = 0;
blockShift = 0;
meta = null;
addressesLength = 0;
} else {
// sparse
addressesOffset = input.readLong();
blockShift = input.readVInt();
meta = DirectMonotonicReader.loadMeta(input, size, blockShift);
addressesLength = input.readLong();
}
ordToDocVectorValues = OrdToDocDISIReaderConfiguration.fromStoredMeta(input, size);
// read nodes by level
M = input.readVInt();
@ -441,7 +437,7 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
public long ramBytesUsed() {
return SHALLOW_SIZE
+ Arrays.stream(nodesByLevel).mapToLong(nodes -> RamUsageEstimator.sizeOf(nodes)).sum()
+ RamUsageEstimator.sizeOf(meta)
+ RamUsageEstimator.sizeOf(ordToDocVectorValues)
+ RamUsageEstimator.sizeOf(offsetsMeta);
}
}

View File

@ -34,7 +34,6 @@ import org.apache.lucene.codecs.HnswGraphProvider;
import org.apache.lucene.codecs.KnnFieldVectorsWriter;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.KnnVectorsWriter;
import org.apache.lucene.codecs.lucene90.IndexedDISI;
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
import org.apache.lucene.index.*;
import org.apache.lucene.index.Sorter;
@ -727,43 +726,8 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
// write docIDs
int count = docsWithField.cardinality();
meta.writeInt(count);
if (count == 0) {
meta.writeLong(-2); // docsWithFieldOffset
meta.writeLong(0L); // docsWithFieldLength
meta.writeShort((short) -1); // jumpTableEntryCount
meta.writeByte((byte) -1); // denseRankPower
} else if (count == maxDoc) {
meta.writeLong(-1); // docsWithFieldOffset
meta.writeLong(0L); // docsWithFieldLength
meta.writeShort((short) -1); // jumpTableEntryCount
meta.writeByte((byte) -1); // denseRankPower
} else {
long offset = vectorData.getFilePointer();
meta.writeLong(offset); // docsWithFieldOffset
final short jumpTableEntryCount =
IndexedDISI.writeBitSet(
docsWithField.iterator(), vectorData, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
meta.writeLong(vectorData.getFilePointer() - offset); // docsWithFieldLength
meta.writeShort(jumpTableEntryCount);
meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
// write ordToDoc mapping
long start = vectorData.getFilePointer();
meta.writeLong(start);
meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
// dense case and empty case do not need to store ordToMap mapping
final DirectMonotonicWriter ordToDocWriter =
DirectMonotonicWriter.getInstance(meta, vectorData, count, DIRECT_MONOTONIC_BLOCK_SHIFT);
DocIdSetIterator iterator = docsWithField.iterator();
for (int doc = iterator.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = iterator.nextDoc()) {
ordToDocWriter.add(doc);
}
ordToDocWriter.finish();
meta.writeLong(vectorData.getFilePointer() - start);
}
OrdToDocDISIReaderConfiguration.writeStoredMeta(
DIRECT_MONOTONIC_BLOCK_SHIFT, meta, vectorData, count, maxDoc, docsWithField);
meta.writeVInt(M);
// write graph nodes on each level
if (graph == null) {

View File

@ -29,7 +29,7 @@ import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
import org.apache.lucene.util.packed.DirectMonotonicReader;
/** Read the vector values from the index input. This supports both iterated and random access. */
abstract class OffHeapByteVectorValues extends ByteVectorValues
public abstract class OffHeapByteVectorValues extends ByteVectorValues
implements RandomAccessVectorValues<byte[]> {
protected final int dimension;
@ -73,19 +73,24 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues
slice.readBytes(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize);
}
static OffHeapByteVectorValues load(
Lucene95HnswVectorsReader.FieldEntry fieldEntry, IndexInput vectorData) throws IOException {
if (fieldEntry.docsWithFieldOffset == -2 || fieldEntry.vectorEncoding != VectorEncoding.BYTE) {
return new EmptyOffHeapVectorValues(fieldEntry.dimension);
public static OffHeapByteVectorValues load(
OrdToDocDISIReaderConfiguration configuration,
VectorEncoding vectorEncoding,
int dimension,
long vectorDataOffset,
long vectorDataLength,
IndexInput vectorData)
throws IOException {
if (configuration.docsWithFieldOffset == -2 || vectorEncoding != VectorEncoding.BYTE) {
return new EmptyOffHeapVectorValues(dimension);
}
IndexInput bytesSlice =
vectorData.slice("vector-data", fieldEntry.vectorDataOffset, fieldEntry.vectorDataLength);
int byteSize = fieldEntry.dimension;
if (fieldEntry.docsWithFieldOffset == -1) {
return new DenseOffHeapVectorValues(
fieldEntry.dimension, fieldEntry.size, bytesSlice, byteSize);
IndexInput bytesSlice = vectorData.slice("vector-data", vectorDataOffset, vectorDataLength);
int byteSize = dimension;
if (configuration.docsWithFieldOffset == -1) {
return new DenseOffHeapVectorValues(dimension, configuration.size, bytesSlice, byteSize);
} else {
return new SparseOffHeapVectorValues(fieldEntry, vectorData, bytesSlice, byteSize);
return new SparseOffHeapVectorValues(
configuration, vectorData, bytesSlice, dimension, byteSize);
}
}
@ -139,29 +144,30 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues
private final IndexedDISI disi;
// dataIn was used to init a new IndexedDIS for #randomAccess()
private final IndexInput dataIn;
private final Lucene95HnswVectorsReader.FieldEntry fieldEntry;
private final OrdToDocDISIReaderConfiguration configuration;
public SparseOffHeapVectorValues(
Lucene95HnswVectorsReader.FieldEntry fieldEntry,
OrdToDocDISIReaderConfiguration configuration,
IndexInput dataIn,
IndexInput slice,
int dimension,
int byteSize)
throws IOException {
super(fieldEntry.dimension, fieldEntry.size, slice, byteSize);
this.fieldEntry = fieldEntry;
super(dimension, configuration.size, slice, byteSize);
this.configuration = configuration;
final RandomAccessInput addressesData =
dataIn.randomAccessSlice(fieldEntry.addressesOffset, fieldEntry.addressesLength);
dataIn.randomAccessSlice(configuration.addressesOffset, configuration.addressesLength);
this.dataIn = dataIn;
this.ordToDoc = DirectMonotonicReader.getInstance(fieldEntry.meta, addressesData);
this.ordToDoc = DirectMonotonicReader.getInstance(configuration.meta, addressesData);
this.disi =
new IndexedDISI(
dataIn,
fieldEntry.docsWithFieldOffset,
fieldEntry.docsWithFieldLength,
fieldEntry.jumpTableEntryCount,
fieldEntry.denseRankPower,
fieldEntry.size);
configuration.docsWithFieldOffset,
configuration.docsWithFieldLength,
configuration.jumpTableEntryCount,
configuration.denseRankPower,
configuration.size);
}
@Override
@ -187,7 +193,8 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues
@Override
public RandomAccessVectorValues<byte[]> copy() throws IOException {
return new SparseOffHeapVectorValues(fieldEntry, dataIn, slice.clone(), byteSize);
return new SparseOffHeapVectorValues(
configuration, dataIn, slice.clone(), dimension, byteSize);
}
@Override

View File

@ -28,7 +28,7 @@ import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
import org.apache.lucene.util.packed.DirectMonotonicReader;
/** Read the vector values from the index input. This supports both iterated and random access. */
abstract class OffHeapFloatVectorValues extends FloatVectorValues
public abstract class OffHeapFloatVectorValues extends FloatVectorValues
implements RandomAccessVectorValues<float[]> {
protected final int dimension;
@ -67,20 +67,24 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues
return value;
}
static OffHeapFloatVectorValues load(
Lucene95HnswVectorsReader.FieldEntry fieldEntry, IndexInput vectorData) throws IOException {
if (fieldEntry.docsWithFieldOffset == -2
|| fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
return new EmptyOffHeapVectorValues(fieldEntry.dimension);
public static OffHeapFloatVectorValues load(
OrdToDocDISIReaderConfiguration configuration,
VectorEncoding vectorEncoding,
int dimension,
long vectorDataOffset,
long vectorDataLength,
IndexInput vectorData)
throws IOException {
if (configuration.docsWithFieldOffset == -2 || vectorEncoding != VectorEncoding.FLOAT32) {
return new EmptyOffHeapVectorValues(dimension);
}
IndexInput bytesSlice =
vectorData.slice("vector-data", fieldEntry.vectorDataOffset, fieldEntry.vectorDataLength);
int byteSize = fieldEntry.dimension * Float.BYTES;
if (fieldEntry.docsWithFieldOffset == -1) {
return new DenseOffHeapVectorValues(
fieldEntry.dimension, fieldEntry.size, bytesSlice, byteSize);
IndexInput bytesSlice = vectorData.slice("vector-data", vectorDataOffset, vectorDataLength);
int byteSize = dimension * Float.BYTES;
if (configuration.docsWithFieldOffset == -1) {
return new DenseOffHeapVectorValues(dimension, configuration.size, bytesSlice, byteSize);
} else {
return new SparseOffHeapVectorValues(fieldEntry, vectorData, bytesSlice, byteSize);
return new SparseOffHeapVectorValues(
configuration, vectorData, bytesSlice, dimension, byteSize);
}
}
@ -134,29 +138,30 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues
private final IndexedDISI disi;
// dataIn was used to init a new IndexedDIS for #randomAccess()
private final IndexInput dataIn;
private final Lucene95HnswVectorsReader.FieldEntry fieldEntry;
private final OrdToDocDISIReaderConfiguration configuration;
public SparseOffHeapVectorValues(
Lucene95HnswVectorsReader.FieldEntry fieldEntry,
OrdToDocDISIReaderConfiguration configuration,
IndexInput dataIn,
IndexInput slice,
int dimension,
int byteSize)
throws IOException {
super(fieldEntry.dimension, fieldEntry.size, slice, byteSize);
this.fieldEntry = fieldEntry;
super(dimension, configuration.size, slice, byteSize);
this.configuration = configuration;
final RandomAccessInput addressesData =
dataIn.randomAccessSlice(fieldEntry.addressesOffset, fieldEntry.addressesLength);
dataIn.randomAccessSlice(configuration.addressesOffset, configuration.addressesLength);
this.dataIn = dataIn;
this.ordToDoc = DirectMonotonicReader.getInstance(fieldEntry.meta, addressesData);
this.ordToDoc = DirectMonotonicReader.getInstance(configuration.meta, addressesData);
this.disi =
new IndexedDISI(
dataIn,
fieldEntry.docsWithFieldOffset,
fieldEntry.docsWithFieldLength,
fieldEntry.jumpTableEntryCount,
fieldEntry.denseRankPower,
fieldEntry.size);
configuration.docsWithFieldOffset,
configuration.docsWithFieldLength,
configuration.jumpTableEntryCount,
configuration.denseRankPower,
configuration.size);
}
@Override
@ -182,7 +187,8 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues
@Override
public RandomAccessVectorValues<float[]> copy() throws IOException {
return new SparseOffHeapVectorValues(fieldEntry, dataIn, slice.clone(), byteSize);
return new SparseOffHeapVectorValues(
configuration, dataIn, slice.clone(), dimension, byteSize);
}
@Override

View File

@ -0,0 +1,194 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene95;
import java.io.IOException;
import org.apache.lucene.codecs.lucene90.IndexedDISI;
import org.apache.lucene.index.DocsWithFieldSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.DirectMonotonicReader;
import org.apache.lucene.util.packed.DirectMonotonicWriter;
/**
 * Configuration for {@link DirectMonotonicReader} and {@link IndexedDISI} for reading sparse
 * vectors. The format in the static writing methods adheres to the Lucene95HnswVectorsFormat.
 */
public class OrdToDocDISIReaderConfiguration implements Accountable {

  private static final long SHALLOW_SIZE =
      RamUsageEstimator.shallowSizeOfInstance(OrdToDocDISIReaderConfiguration.class);

  /** Value stored as docsWithFieldOffset when no document has a vector. */
  private static final long EMPTY_OFFSET_MARKER = -2L;

  /** Value stored as docsWithFieldOffset when every document has a vector. */
  private static final long DENSE_OFFSET_MARKER = -1L;

  /**
   * Writes out the docsWithField and ordToDoc mapping to the outputMeta and vectorData
   * respectively. This is in adherence to the Lucene95HnswVectorsFormat.
   *
   * <p>Within outputMeta the format is as follows:
   *
   * <ul>
   *   <li><b>[int64]</b> docsWithFieldOffset: -2 if empty (no vector values), -1 if dense (all
   *       documents have a value for the field), otherwise (sparse, some documents are missing
   *       values) the file pointer in vectorData at which the docIds start.
   *   <li><b>[int64]</b> docsWithFieldLength: 0 if empty or dense, otherwise the number of bytes
   *       of vectorData occupied by the docIds.
   *   <li><b>[int16]</b> jumpTableEntryCount: -1 if empty or dense, otherwise the value returned
   *       by {@link IndexedDISI#writeBitSet(DocIdSetIterator, IndexOutput, byte)}.
   *   <li><b>[int8]</b> denseRankPower: -1 if empty or dense, otherwise {@link
   *       IndexedDISI#DEFAULT_DENSE_RANK_POWER}.
   *   <li>Only in the sparse case: <b>[int64]</b> file pointer in vectorData at which the ordToDoc
   *       data starts, <b>[vint]</b> the block shift, the metadata emitted by {@link
   *       org.apache.lucene.util.packed.DirectMonotonicWriter}, and <b>[int64]</b> the number of
   *       bytes of vectorData occupied by the ordToDoc data.
   * </ul>
   *
   * <p>Within the vectorData the format is as follows:
   *
   * <ul>
   *   <li>DocIds encoded by {@link IndexedDISI#writeBitSet(DocIdSetIterator, IndexOutput, byte)},
   *       note that only in sparse case
   *   <li>OrdToDoc was encoded by {@link org.apache.lucene.util.packed.DirectMonotonicWriter}, note
   *       that only in sparse case
   * </ul>
   *
   * @param directMonotonicBlockShift the block shift handed to {@link DirectMonotonicWriter} and
   *     recorded in outputMeta (sparse case only)
   * @param outputMeta the outputMeta
   * @param vectorData the vectorData
   * @param count the count of docs with vectors
   * @param maxDoc the maxDoc for the index
   * @param docsWithField the docs containing a vector field
   * @throws IOException thrown when writing data fails to either output
   */
  public static void writeStoredMeta(
      int directMonotonicBlockShift,
      IndexOutput outputMeta,
      IndexOutput vectorData,
      int count,
      int maxDoc,
      DocsWithFieldSet docsWithField)
      throws IOException {
    if (count == 0) {
      writeNonSparseMeta(outputMeta, EMPTY_OFFSET_MARKER);
      return;
    }
    if (count == maxDoc) {
      writeNonSparseMeta(outputMeta, DENSE_OFFSET_MARKER);
      return;
    }

    // Sparse: the docIds go to vectorData, their location and decoding parameters to outputMeta.
    final long docsWithFieldStart = vectorData.getFilePointer();
    outputMeta.writeLong(docsWithFieldStart); // docsWithFieldOffset
    final short jumpTableEntryCount =
        IndexedDISI.writeBitSet(
            docsWithField.iterator(), vectorData, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
    outputMeta.writeLong(vectorData.getFilePointer() - docsWithFieldStart); // docsWithFieldLength
    outputMeta.writeShort(jumpTableEntryCount);
    outputMeta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);

    // write ordToDoc mapping; the dense and empty cases returned above and never store it
    final long addressesStart = vectorData.getFilePointer();
    outputMeta.writeLong(addressesStart);
    outputMeta.writeVInt(directMonotonicBlockShift);
    final DirectMonotonicWriter ordToDocWriter =
        DirectMonotonicWriter.getInstance(
            outputMeta, vectorData, count, directMonotonicBlockShift);
    final DocIdSetIterator iterator = docsWithField.iterator();
    for (int doc = iterator.nextDoc();
        doc != DocIdSetIterator.NO_MORE_DOCS;
        doc = iterator.nextDoc()) {
      ordToDocWriter.add(doc);
    }
    ordToDocWriter.finish();
    outputMeta.writeLong(vectorData.getFilePointer() - addressesStart);
  }

  /**
   * Writes the four fixed metadata fields for a field that is either empty or dense. Only the
   * offset marker differs between the two cases.
   */
  private static void writeNonSparseMeta(IndexOutput outputMeta, long offsetMarker)
      throws IOException {
    outputMeta.writeLong(offsetMarker); // docsWithFieldOffset
    outputMeta.writeLong(0L); // docsWithFieldLength
    outputMeta.writeShort((short) -1); // jumpTableEntryCount
    outputMeta.writeByte((byte) -1); // denseRankPower
  }

  /**
   * Reads in the necessary fields stored in the outputMeta to configure {@link
   * DirectMonotonicReader} and {@link IndexedDISI}.
   *
   * @param inputMeta the inputMeta, previously written to via {@link #writeStoredMeta(int,
   *     IndexOutput, IndexOutput, int, int, DocsWithFieldSet)}
   * @param size The number of vectors
   * @return the configuration required to read sparse vectors
   * @throws IOException thrown when reading data fails
   */
  public static OrdToDocDISIReaderConfiguration fromStoredMeta(IndexInput inputMeta, int size)
      throws IOException {
    final long docsWithFieldOffset = inputMeta.readLong();
    final long docsWithFieldLength = inputMeta.readLong();
    final short jumpTableEntryCount = inputMeta.readShort();
    final byte denseRankPower = inputMeta.readByte();

    // Defaults for the empty (-2) and dense (-1) cases, which store no ordToDoc mapping.
    long addressesOffset = 0;
    long addressesLength = 0;
    DirectMonotonicReader.Meta meta = null;

    // A non-negative offset is a real file pointer, i.e. the sparse case.
    if (docsWithFieldOffset > -1) {
      addressesOffset = inputMeta.readLong();
      final int blockShift = inputMeta.readVInt();
      meta = DirectMonotonicReader.loadMeta(inputMeta, size, blockShift);
      addressesLength = inputMeta.readLong();
    }

    return new OrdToDocDISIReaderConfiguration(
        size,
        jumpTableEntryCount,
        addressesOffset,
        addressesLength,
        docsWithFieldOffset,
        docsWithFieldLength,
        denseRankPower,
        meta);
  }

  /** The number of vectors, as handed to {@link #fromStoredMeta(IndexInput, int)}. */
  final int size;

  // the following four variables used to read docIds encoded by IndexedDISI
  // special values of docsWithFieldOffset are -1 and -2
  // -1 : dense
  // -2 : empty
  // other: sparse
  final short jumpTableEntryCount;
  final long docsWithFieldOffset;
  final long docsWithFieldLength;
  final byte denseRankPower;

  // the following three variables used to read ordToDoc encoded by DirectMonotonicWriter
  // note that only the sparse case needs to store ordToDoc; meta is null otherwise
  final long addressesOffset;
  final long addressesLength;
  final DirectMonotonicReader.Meta meta;

  OrdToDocDISIReaderConfiguration(
      int size,
      short jumpTableEntryCount,
      long addressesOffset,
      long addressesLength,
      long docsWithFieldOffset,
      long docsWithFieldLength,
      byte denseRankPower,
      DirectMonotonicReader.Meta meta) {
    this.size = size;
    this.jumpTableEntryCount = jumpTableEntryCount;
    this.addressesOffset = addressesOffset;
    this.addressesLength = addressesLength;
    this.docsWithFieldOffset = docsWithFieldOffset;
    this.docsWithFieldLength = docsWithFieldLength;
    this.denseRankPower = denseRankPower;
    this.meta = meta;
  }

  /**
   * Returns the shallow size of this instance plus the size reported for {@code meta} when it is
   * present.
   */
  @Override
  public long ramBytesUsed() {
    // meta is null for empty and dense fields (see fromStoredMeta); guard explicitly instead of
    // depending on how RamUsageEstimator treats a null argument.
    final long metaBytes = meta == null ? 0L : RamUsageEstimator.sizeOf(meta);
    return SHALLOW_SIZE + metaBytes;
  }
}