mirror of https://github.com/apache/lucene.git
Refactor Lucene95 to allow off heap vector reader reuse (#12629)
While going through https://github.com/apache/lucene/pull/12582, I noticed that our off-heap vector readers haven't changed at all for a while now; we just keep copying them around for no reason. To make adding a new vector codec simpler, this refactors the lucene95 codec so that its off-heap vector storage format (readers/writers) can be reused. Additionally, it handles reading and writing the appropriate fields for sparse vectors from/to the provided index inputs/outputs. This should significantly reduce the churn in new codecs.
This commit is contained in:
parent
04f38dd288
commit
05d26ac44d
|
@ -248,7 +248,13 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
|
|||
+ " expected: "
|
||||
+ VectorEncoding.FLOAT32);
|
||||
}
|
||||
return OffHeapFloatVectorValues.load(fieldEntry, vectorData);
|
||||
return OffHeapFloatVectorValues.load(
|
||||
fieldEntry.ordToDocVectorValues,
|
||||
fieldEntry.vectorEncoding,
|
||||
fieldEntry.dimension,
|
||||
fieldEntry.vectorDataOffset,
|
||||
fieldEntry.vectorDataLength,
|
||||
vectorData);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -263,7 +269,13 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
|
|||
+ " expected: "
|
||||
+ VectorEncoding.FLOAT32);
|
||||
}
|
||||
return OffHeapByteVectorValues.load(fieldEntry, vectorData);
|
||||
return OffHeapByteVectorValues.load(
|
||||
fieldEntry.ordToDocVectorValues,
|
||||
fieldEntry.vectorEncoding,
|
||||
fieldEntry.dimension,
|
||||
fieldEntry.vectorDataOffset,
|
||||
fieldEntry.vectorDataLength,
|
||||
vectorData);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -277,7 +289,14 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
|
|||
return;
|
||||
}
|
||||
|
||||
OffHeapFloatVectorValues vectorValues = OffHeapFloatVectorValues.load(fieldEntry, vectorData);
|
||||
OffHeapFloatVectorValues vectorValues =
|
||||
OffHeapFloatVectorValues.load(
|
||||
fieldEntry.ordToDocVectorValues,
|
||||
fieldEntry.vectorEncoding,
|
||||
fieldEntry.dimension,
|
||||
fieldEntry.vectorDataOffset,
|
||||
fieldEntry.vectorDataLength,
|
||||
vectorData);
|
||||
RandomVectorScorer scorer =
|
||||
RandomVectorScorer.createFloats(vectorValues, fieldEntry.similarityFunction, target);
|
||||
HnswGraphSearcher.search(
|
||||
|
@ -298,7 +317,14 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
|
|||
return;
|
||||
}
|
||||
|
||||
OffHeapByteVectorValues vectorValues = OffHeapByteVectorValues.load(fieldEntry, vectorData);
|
||||
OffHeapByteVectorValues vectorValues =
|
||||
OffHeapByteVectorValues.load(
|
||||
fieldEntry.ordToDocVectorValues,
|
||||
fieldEntry.vectorEncoding,
|
||||
fieldEntry.dimension,
|
||||
fieldEntry.vectorDataOffset,
|
||||
fieldEntry.vectorDataLength,
|
||||
vectorData);
|
||||
RandomVectorScorer scorer =
|
||||
RandomVectorScorer.createBytes(vectorValues, fieldEntry.similarityFunction, target);
|
||||
HnswGraphSearcher.search(
|
||||
|
@ -352,22 +378,9 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
|
|||
final int offsetsBlockShift;
|
||||
final long offsetsLength;
|
||||
|
||||
// the following four variables used to read docIds encoded by IndexDISI
|
||||
// special values of docsWithFieldOffset are -1 and -2
|
||||
// -1 : dense
|
||||
// -2 : empty
|
||||
// other: sparse
|
||||
final long docsWithFieldOffset;
|
||||
final long docsWithFieldLength;
|
||||
final short jumpTableEntryCount;
|
||||
final byte denseRankPower;
|
||||
|
||||
// the following four variables used to read ordToDoc encoded by DirectMonotonicWriter
|
||||
// note that only spare case needs to store ordToDoc
|
||||
final long addressesOffset;
|
||||
final int blockShift;
|
||||
final DirectMonotonicReader.Meta meta;
|
||||
final long addressesLength;
|
||||
// Contains the configuration for reading sparse vectors and translating vector ordinals to
|
||||
// docId
|
||||
OrdToDocDISIReaderConfiguration ordToDocVectorValues;
|
||||
|
||||
FieldEntry(
|
||||
IndexInput input,
|
||||
|
@ -383,24 +396,7 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
|
|||
dimension = input.readVInt();
|
||||
size = input.readInt();
|
||||
|
||||
docsWithFieldOffset = input.readLong();
|
||||
docsWithFieldLength = input.readLong();
|
||||
jumpTableEntryCount = input.readShort();
|
||||
denseRankPower = input.readByte();
|
||||
|
||||
// dense or empty
|
||||
if (docsWithFieldOffset == -1 || docsWithFieldOffset == -2) {
|
||||
addressesOffset = 0;
|
||||
blockShift = 0;
|
||||
meta = null;
|
||||
addressesLength = 0;
|
||||
} else {
|
||||
// sparse
|
||||
addressesOffset = input.readLong();
|
||||
blockShift = input.readVInt();
|
||||
meta = DirectMonotonicReader.loadMeta(input, size, blockShift);
|
||||
addressesLength = input.readLong();
|
||||
}
|
||||
ordToDocVectorValues = OrdToDocDISIReaderConfiguration.fromStoredMeta(input, size);
|
||||
|
||||
// read nodes by level
|
||||
M = input.readVInt();
|
||||
|
@ -441,7 +437,7 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
|
|||
public long ramBytesUsed() {
|
||||
return SHALLOW_SIZE
|
||||
+ Arrays.stream(nodesByLevel).mapToLong(nodes -> RamUsageEstimator.sizeOf(nodes)).sum()
|
||||
+ RamUsageEstimator.sizeOf(meta)
|
||||
+ RamUsageEstimator.sizeOf(ordToDocVectorValues)
|
||||
+ RamUsageEstimator.sizeOf(offsetsMeta);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -34,7 +34,6 @@ import org.apache.lucene.codecs.HnswGraphProvider;
|
|||
import org.apache.lucene.codecs.KnnFieldVectorsWriter;
|
||||
import org.apache.lucene.codecs.KnnVectorsReader;
|
||||
import org.apache.lucene.codecs.KnnVectorsWriter;
|
||||
import org.apache.lucene.codecs.lucene90.IndexedDISI;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.index.Sorter;
|
||||
|
@ -727,43 +726,8 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
|
|||
// write docIDs
|
||||
int count = docsWithField.cardinality();
|
||||
meta.writeInt(count);
|
||||
if (count == 0) {
|
||||
meta.writeLong(-2); // docsWithFieldOffset
|
||||
meta.writeLong(0L); // docsWithFieldLength
|
||||
meta.writeShort((short) -1); // jumpTableEntryCount
|
||||
meta.writeByte((byte) -1); // denseRankPower
|
||||
} else if (count == maxDoc) {
|
||||
meta.writeLong(-1); // docsWithFieldOffset
|
||||
meta.writeLong(0L); // docsWithFieldLength
|
||||
meta.writeShort((short) -1); // jumpTableEntryCount
|
||||
meta.writeByte((byte) -1); // denseRankPower
|
||||
} else {
|
||||
long offset = vectorData.getFilePointer();
|
||||
meta.writeLong(offset); // docsWithFieldOffset
|
||||
final short jumpTableEntryCount =
|
||||
IndexedDISI.writeBitSet(
|
||||
docsWithField.iterator(), vectorData, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
|
||||
meta.writeLong(vectorData.getFilePointer() - offset); // docsWithFieldLength
|
||||
meta.writeShort(jumpTableEntryCount);
|
||||
meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
|
||||
|
||||
// write ordToDoc mapping
|
||||
long start = vectorData.getFilePointer();
|
||||
meta.writeLong(start);
|
||||
meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
|
||||
// dense case and empty case do not need to store ordToMap mapping
|
||||
final DirectMonotonicWriter ordToDocWriter =
|
||||
DirectMonotonicWriter.getInstance(meta, vectorData, count, DIRECT_MONOTONIC_BLOCK_SHIFT);
|
||||
DocIdSetIterator iterator = docsWithField.iterator();
|
||||
for (int doc = iterator.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = iterator.nextDoc()) {
|
||||
ordToDocWriter.add(doc);
|
||||
}
|
||||
ordToDocWriter.finish();
|
||||
meta.writeLong(vectorData.getFilePointer() - start);
|
||||
}
|
||||
|
||||
OrdToDocDISIReaderConfiguration.writeStoredMeta(
|
||||
DIRECT_MONOTONIC_BLOCK_SHIFT, meta, vectorData, count, maxDoc, docsWithField);
|
||||
meta.writeVInt(M);
|
||||
// write graph nodes on each level
|
||||
if (graph == null) {
|
||||
|
|
|
@ -29,7 +29,7 @@ import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
|
|||
import org.apache.lucene.util.packed.DirectMonotonicReader;
|
||||
|
||||
/** Read the vector values from the index input. This supports both iterated and random access. */
|
||||
abstract class OffHeapByteVectorValues extends ByteVectorValues
|
||||
public abstract class OffHeapByteVectorValues extends ByteVectorValues
|
||||
implements RandomAccessVectorValues<byte[]> {
|
||||
|
||||
protected final int dimension;
|
||||
|
@ -73,19 +73,24 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues
|
|||
slice.readBytes(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize);
|
||||
}
|
||||
|
||||
static OffHeapByteVectorValues load(
|
||||
Lucene95HnswVectorsReader.FieldEntry fieldEntry, IndexInput vectorData) throws IOException {
|
||||
if (fieldEntry.docsWithFieldOffset == -2 || fieldEntry.vectorEncoding != VectorEncoding.BYTE) {
|
||||
return new EmptyOffHeapVectorValues(fieldEntry.dimension);
|
||||
public static OffHeapByteVectorValues load(
|
||||
OrdToDocDISIReaderConfiguration configuration,
|
||||
VectorEncoding vectorEncoding,
|
||||
int dimension,
|
||||
long vectorDataOffset,
|
||||
long vectorDataLength,
|
||||
IndexInput vectorData)
|
||||
throws IOException {
|
||||
if (configuration.docsWithFieldOffset == -2 || vectorEncoding != VectorEncoding.BYTE) {
|
||||
return new EmptyOffHeapVectorValues(dimension);
|
||||
}
|
||||
IndexInput bytesSlice =
|
||||
vectorData.slice("vector-data", fieldEntry.vectorDataOffset, fieldEntry.vectorDataLength);
|
||||
int byteSize = fieldEntry.dimension;
|
||||
if (fieldEntry.docsWithFieldOffset == -1) {
|
||||
return new DenseOffHeapVectorValues(
|
||||
fieldEntry.dimension, fieldEntry.size, bytesSlice, byteSize);
|
||||
IndexInput bytesSlice = vectorData.slice("vector-data", vectorDataOffset, vectorDataLength);
|
||||
int byteSize = dimension;
|
||||
if (configuration.docsWithFieldOffset == -1) {
|
||||
return new DenseOffHeapVectorValues(dimension, configuration.size, bytesSlice, byteSize);
|
||||
} else {
|
||||
return new SparseOffHeapVectorValues(fieldEntry, vectorData, bytesSlice, byteSize);
|
||||
return new SparseOffHeapVectorValues(
|
||||
configuration, vectorData, bytesSlice, dimension, byteSize);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -139,29 +144,30 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues
|
|||
private final IndexedDISI disi;
|
||||
// dataIn was used to init a new IndexedDISI for #randomAccess()
|
||||
private final IndexInput dataIn;
|
||||
private final Lucene95HnswVectorsReader.FieldEntry fieldEntry;
|
||||
private final OrdToDocDISIReaderConfiguration configuration;
|
||||
|
||||
public SparseOffHeapVectorValues(
|
||||
Lucene95HnswVectorsReader.FieldEntry fieldEntry,
|
||||
OrdToDocDISIReaderConfiguration configuration,
|
||||
IndexInput dataIn,
|
||||
IndexInput slice,
|
||||
int dimension,
|
||||
int byteSize)
|
||||
throws IOException {
|
||||
|
||||
super(fieldEntry.dimension, fieldEntry.size, slice, byteSize);
|
||||
this.fieldEntry = fieldEntry;
|
||||
super(dimension, configuration.size, slice, byteSize);
|
||||
this.configuration = configuration;
|
||||
final RandomAccessInput addressesData =
|
||||
dataIn.randomAccessSlice(fieldEntry.addressesOffset, fieldEntry.addressesLength);
|
||||
dataIn.randomAccessSlice(configuration.addressesOffset, configuration.addressesLength);
|
||||
this.dataIn = dataIn;
|
||||
this.ordToDoc = DirectMonotonicReader.getInstance(fieldEntry.meta, addressesData);
|
||||
this.ordToDoc = DirectMonotonicReader.getInstance(configuration.meta, addressesData);
|
||||
this.disi =
|
||||
new IndexedDISI(
|
||||
dataIn,
|
||||
fieldEntry.docsWithFieldOffset,
|
||||
fieldEntry.docsWithFieldLength,
|
||||
fieldEntry.jumpTableEntryCount,
|
||||
fieldEntry.denseRankPower,
|
||||
fieldEntry.size);
|
||||
configuration.docsWithFieldOffset,
|
||||
configuration.docsWithFieldLength,
|
||||
configuration.jumpTableEntryCount,
|
||||
configuration.denseRankPower,
|
||||
configuration.size);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -187,7 +193,8 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues
|
|||
|
||||
@Override
|
||||
public RandomAccessVectorValues<byte[]> copy() throws IOException {
|
||||
return new SparseOffHeapVectorValues(fieldEntry, dataIn, slice.clone(), byteSize);
|
||||
return new SparseOffHeapVectorValues(
|
||||
configuration, dataIn, slice.clone(), dimension, byteSize);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -28,7 +28,7 @@ import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
|
|||
import org.apache.lucene.util.packed.DirectMonotonicReader;
|
||||
|
||||
/** Read the vector values from the index input. This supports both iterated and random access. */
|
||||
abstract class OffHeapFloatVectorValues extends FloatVectorValues
|
||||
public abstract class OffHeapFloatVectorValues extends FloatVectorValues
|
||||
implements RandomAccessVectorValues<float[]> {
|
||||
|
||||
protected final int dimension;
|
||||
|
@ -67,20 +67,24 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues
|
|||
return value;
|
||||
}
|
||||
|
||||
static OffHeapFloatVectorValues load(
|
||||
Lucene95HnswVectorsReader.FieldEntry fieldEntry, IndexInput vectorData) throws IOException {
|
||||
if (fieldEntry.docsWithFieldOffset == -2
|
||||
|| fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
|
||||
return new EmptyOffHeapVectorValues(fieldEntry.dimension);
|
||||
public static OffHeapFloatVectorValues load(
|
||||
OrdToDocDISIReaderConfiguration configuration,
|
||||
VectorEncoding vectorEncoding,
|
||||
int dimension,
|
||||
long vectorDataOffset,
|
||||
long vectorDataLength,
|
||||
IndexInput vectorData)
|
||||
throws IOException {
|
||||
if (configuration.docsWithFieldOffset == -2 || vectorEncoding != VectorEncoding.FLOAT32) {
|
||||
return new EmptyOffHeapVectorValues(dimension);
|
||||
}
|
||||
IndexInput bytesSlice =
|
||||
vectorData.slice("vector-data", fieldEntry.vectorDataOffset, fieldEntry.vectorDataLength);
|
||||
int byteSize = fieldEntry.dimension * Float.BYTES;
|
||||
if (fieldEntry.docsWithFieldOffset == -1) {
|
||||
return new DenseOffHeapVectorValues(
|
||||
fieldEntry.dimension, fieldEntry.size, bytesSlice, byteSize);
|
||||
IndexInput bytesSlice = vectorData.slice("vector-data", vectorDataOffset, vectorDataLength);
|
||||
int byteSize = dimension * Float.BYTES;
|
||||
if (configuration.docsWithFieldOffset == -1) {
|
||||
return new DenseOffHeapVectorValues(dimension, configuration.size, bytesSlice, byteSize);
|
||||
} else {
|
||||
return new SparseOffHeapVectorValues(fieldEntry, vectorData, bytesSlice, byteSize);
|
||||
return new SparseOffHeapVectorValues(
|
||||
configuration, vectorData, bytesSlice, dimension, byteSize);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -134,29 +138,30 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues
|
|||
private final IndexedDISI disi;
|
||||
// dataIn was used to init a new IndexedDISI for #randomAccess()
|
||||
private final IndexInput dataIn;
|
||||
private final Lucene95HnswVectorsReader.FieldEntry fieldEntry;
|
||||
private final OrdToDocDISIReaderConfiguration configuration;
|
||||
|
||||
public SparseOffHeapVectorValues(
|
||||
Lucene95HnswVectorsReader.FieldEntry fieldEntry,
|
||||
OrdToDocDISIReaderConfiguration configuration,
|
||||
IndexInput dataIn,
|
||||
IndexInput slice,
|
||||
int dimension,
|
||||
int byteSize)
|
||||
throws IOException {
|
||||
|
||||
super(fieldEntry.dimension, fieldEntry.size, slice, byteSize);
|
||||
this.fieldEntry = fieldEntry;
|
||||
super(dimension, configuration.size, slice, byteSize);
|
||||
this.configuration = configuration;
|
||||
final RandomAccessInput addressesData =
|
||||
dataIn.randomAccessSlice(fieldEntry.addressesOffset, fieldEntry.addressesLength);
|
||||
dataIn.randomAccessSlice(configuration.addressesOffset, configuration.addressesLength);
|
||||
this.dataIn = dataIn;
|
||||
this.ordToDoc = DirectMonotonicReader.getInstance(fieldEntry.meta, addressesData);
|
||||
this.ordToDoc = DirectMonotonicReader.getInstance(configuration.meta, addressesData);
|
||||
this.disi =
|
||||
new IndexedDISI(
|
||||
dataIn,
|
||||
fieldEntry.docsWithFieldOffset,
|
||||
fieldEntry.docsWithFieldLength,
|
||||
fieldEntry.jumpTableEntryCount,
|
||||
fieldEntry.denseRankPower,
|
||||
fieldEntry.size);
|
||||
configuration.docsWithFieldOffset,
|
||||
configuration.docsWithFieldLength,
|
||||
configuration.jumpTableEntryCount,
|
||||
configuration.denseRankPower,
|
||||
configuration.size);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -182,7 +187,8 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues
|
|||
|
||||
@Override
|
||||
public RandomAccessVectorValues<float[]> copy() throws IOException {
|
||||
return new SparseOffHeapVectorValues(fieldEntry, dataIn, slice.clone(), byteSize);
|
||||
return new SparseOffHeapVectorValues(
|
||||
configuration, dataIn, slice.clone(), dimension, byteSize);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -0,0 +1,194 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.codecs.lucene95;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.codecs.lucene90.IndexedDISI;
|
||||
import org.apache.lucene.index.DocsWithFieldSet;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.Accountable;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.apache.lucene.util.packed.DirectMonotonicReader;
|
||||
import org.apache.lucene.util.packed.DirectMonotonicWriter;
|
||||
|
||||
/**
|
||||
* Configuration for {@link DirectMonotonicReader} and {@link IndexedDISI} for reading sparse
|
||||
* vectors. The format used by the static writing methods adheres to the Lucene95HnswVectorsFormat.
|
||||
*/
|
||||
public class OrdToDocDISIReaderConfiguration implements Accountable {
|
||||
|
||||
private static final long SHALLOW_SIZE =
|
||||
RamUsageEstimator.shallowSizeOfInstance(OrdToDocDISIReaderConfiguration.class);
|
||||
|
||||
/**
|
||||
* Writes out the docsWithField and ordToDoc mapping to the outputMeta and vectorData
|
||||
* respectively. This is in adherence to the Lucene95HnswVectorsFormat.
|
||||
*
|
||||
* <p>Within outputMeta the format is as follows:
|
||||
*
|
||||
* <ul>
|
||||
* <li><b>[int8]</b> if equals to -2, empty - no vector values. If equals to -1, dense – all
|
||||
* documents have values for a field. If equals to 0, sparse – some documents missing
|
||||
* values.
|
||||
* <li>DocIds were encoded by {@link IndexedDISI#writeBitSet(DocIdSetIterator, IndexOutput,
|
||||
* byte)}
|
||||
* <li>OrdToDoc was encoded by {@link org.apache.lucene.util.packed.DirectMonotonicWriter}, note
|
||||
* that only in sparse case
|
||||
* </ul>
|
||||
*
|
||||
* <p>Within the vectorData the format is as follows:
|
||||
*
|
||||
* <ul>
|
||||
* <li>DocIds encoded by {@link IndexedDISI#writeBitSet(DocIdSetIterator, IndexOutput, byte)},
|
||||
* note that only in sparse case
|
||||
* <li>OrdToDoc was encoded by {@link org.apache.lucene.util.packed.DirectMonotonicWriter}, note
|
||||
* that only in sparse case
|
||||
* </ul>
|
||||
*
|
||||
* @param outputMeta the outputMeta
|
||||
* @param vectorData the vectorData
|
||||
* @param count the count of docs with vectors
|
||||
* @param maxDoc the maxDoc for the index
|
||||
* @param docsWithField the docs containing a vector field
|
||||
* @throws IOException thrown when writing data fails to either output
|
||||
*/
|
||||
public static void writeStoredMeta(
|
||||
int directMonotonicBlockShift,
|
||||
IndexOutput outputMeta,
|
||||
IndexOutput vectorData,
|
||||
int count,
|
||||
int maxDoc,
|
||||
DocsWithFieldSet docsWithField)
|
||||
throws IOException {
|
||||
if (count == 0) {
|
||||
outputMeta.writeLong(-2); // docsWithFieldOffset
|
||||
outputMeta.writeLong(0L); // docsWithFieldLength
|
||||
outputMeta.writeShort((short) -1); // jumpTableEntryCount
|
||||
outputMeta.writeByte((byte) -1); // denseRankPower
|
||||
} else if (count == maxDoc) {
|
||||
outputMeta.writeLong(-1); // docsWithFieldOffset
|
||||
outputMeta.writeLong(0L); // docsWithFieldLength
|
||||
outputMeta.writeShort((short) -1); // jumpTableEntryCount
|
||||
outputMeta.writeByte((byte) -1); // denseRankPower
|
||||
} else {
|
||||
long offset = vectorData.getFilePointer();
|
||||
outputMeta.writeLong(offset); // docsWithFieldOffset
|
||||
final short jumpTableEntryCount =
|
||||
IndexedDISI.writeBitSet(
|
||||
docsWithField.iterator(), vectorData, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
|
||||
outputMeta.writeLong(vectorData.getFilePointer() - offset); // docsWithFieldLength
|
||||
outputMeta.writeShort(jumpTableEntryCount);
|
||||
outputMeta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
|
||||
|
||||
// write ordToDoc mapping
|
||||
long start = vectorData.getFilePointer();
|
||||
outputMeta.writeLong(start);
|
||||
outputMeta.writeVInt(directMonotonicBlockShift);
|
||||
// dense case and empty case do not need to store ordToDoc mapping
|
||||
final DirectMonotonicWriter ordToDocWriter =
|
||||
DirectMonotonicWriter.getInstance(
|
||||
outputMeta, vectorData, count, directMonotonicBlockShift);
|
||||
DocIdSetIterator iterator = docsWithField.iterator();
|
||||
for (int doc = iterator.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = iterator.nextDoc()) {
|
||||
ordToDocWriter.add(doc);
|
||||
}
|
||||
ordToDocWriter.finish();
|
||||
outputMeta.writeLong(vectorData.getFilePointer() - start);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads in the necessary fields stored in the outputMeta to configure {@link
|
||||
* DirectMonotonicReader} and {@link IndexedDISI}.
|
||||
*
|
||||
* @param inputMeta the inputMeta, previously written to via {@link #writeStoredMeta(int,
|
||||
* IndexOutput, IndexOutput, int, int, DocsWithFieldSet)}
|
||||
* @param size The number of vectors
|
||||
* @return the configuration required to read sparse vectors
|
||||
* @throws IOException thrown when reading data fails
|
||||
*/
|
||||
public static OrdToDocDISIReaderConfiguration fromStoredMeta(IndexInput inputMeta, int size)
|
||||
throws IOException {
|
||||
long docsWithFieldOffset = inputMeta.readLong();
|
||||
long docsWithFieldLength = inputMeta.readLong();
|
||||
short jumpTableEntryCount = inputMeta.readShort();
|
||||
byte denseRankPower = inputMeta.readByte();
|
||||
long addressesOffset = 0;
|
||||
int blockShift = 0;
|
||||
DirectMonotonicReader.Meta meta = null;
|
||||
long addressesLength = 0;
|
||||
if (docsWithFieldOffset > -1) {
|
||||
addressesOffset = inputMeta.readLong();
|
||||
blockShift = inputMeta.readVInt();
|
||||
meta = DirectMonotonicReader.loadMeta(inputMeta, size, blockShift);
|
||||
addressesLength = inputMeta.readLong();
|
||||
}
|
||||
return new OrdToDocDISIReaderConfiguration(
|
||||
size,
|
||||
jumpTableEntryCount,
|
||||
addressesOffset,
|
||||
addressesLength,
|
||||
docsWithFieldOffset,
|
||||
docsWithFieldLength,
|
||||
denseRankPower,
|
||||
meta);
|
||||
}
|
||||
|
||||
final int size;
|
||||
// the following four variables are used to read docIds encoded by IndexedDISI
|
||||
// special values of docsWithFieldOffset are -1 and -2
|
||||
// -1 : dense
|
||||
// -2 : empty
|
||||
// other: sparse
|
||||
final short jumpTableEntryCount;
|
||||
final long docsWithFieldOffset, docsWithFieldLength;
|
||||
final byte denseRankPower;
|
||||
|
||||
// the following four variables used to read ordToDoc encoded by DirectMonotonicWriter
|
||||
// note that only the sparse case needs to store ordToDoc
|
||||
final long addressesOffset, addressesLength;
|
||||
final DirectMonotonicReader.Meta meta;
|
||||
|
||||
OrdToDocDISIReaderConfiguration(
|
||||
int size,
|
||||
short jumpTableEntryCount,
|
||||
long addressesOffset,
|
||||
long addressesLength,
|
||||
long docsWithFieldOffset,
|
||||
long docsWithFieldLength,
|
||||
byte denseRankPower,
|
||||
DirectMonotonicReader.Meta meta) {
|
||||
this.size = size;
|
||||
this.jumpTableEntryCount = jumpTableEntryCount;
|
||||
this.addressesOffset = addressesOffset;
|
||||
this.addressesLength = addressesLength;
|
||||
this.docsWithFieldOffset = docsWithFieldOffset;
|
||||
this.docsWithFieldLength = docsWithFieldLength;
|
||||
this.denseRankPower = denseRankPower;
|
||||
this.meta = meta;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long ramBytesUsed() {
|
||||
return SHALLOW_SIZE + RamUsageEstimator.sizeOf(meta);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue