mirror of https://github.com/apache/lucene.git
LUCENE-10375: Write vectors to file in flush (#617)
A previous commit changed HNSW merge to first write the combined segment vectors to a file and then build the graph from that file. This commit applies the same strategy to flush, which lets flush and merge share the same logic.
This commit is contained in:
parent
08d6633d94
commit
7ece8145bc
|
@ -183,3 +183,5 @@ apply from: file('gradle/hacks/turbocharge-jvm-opts.gradle')
|
||||||
apply from: file('gradle/hacks/dummy-outputs.gradle')
|
apply from: file('gradle/hacks/dummy-outputs.gradle')
|
||||||
|
|
||||||
apply from: file('gradle/pylucene/pylucene.gradle')
|
apply from: file('gradle/pylucene/pylucene.gradle')
|
||||||
|
sourceCompatibility = JavaVersion.VERSION_16
|
||||||
|
targetCompatibility = JavaVersion.VERSION_16
|
||||||
|
|
|
@ -117,7 +117,7 @@ public abstract class KnnVectorsWriter implements Closeable {
|
||||||
}
|
}
|
||||||
|
|
||||||
/** View over multiple VectorValues supporting iterator-style access via DocIdMerger. */
|
/** View over multiple VectorValues supporting iterator-style access via DocIdMerger. */
|
||||||
public static class MergedVectorValues extends VectorValues {
|
private static class MergedVectorValues extends VectorValues {
|
||||||
private final List<VectorValuesSub> subs;
|
private final List<VectorValuesSub> subs;
|
||||||
private final DocIDMerger<VectorValuesSub> docIdMerger;
|
private final DocIDMerger<VectorValuesSub> docIdMerger;
|
||||||
private final int cost;
|
private final int cost;
|
||||||
|
@ -127,7 +127,7 @@ public abstract class KnnVectorsWriter implements Closeable {
|
||||||
private VectorValuesSub current;
|
private VectorValuesSub current;
|
||||||
|
|
||||||
/** Returns a merged view over all the segment's {@link VectorValues}. */
|
/** Returns a merged view over all the segment's {@link VectorValues}. */
|
||||||
public static MergedVectorValues mergeVectorValues(FieldInfo fieldInfo, MergeState mergeState)
|
static MergedVectorValues mergeVectorValues(FieldInfo fieldInfo, MergeState mergeState)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
assert fieldInfo != null && fieldInfo.hasVectorValues();
|
assert fieldInfo != null && fieldInfo.hasVectorValues();
|
||||||
|
|
||||||
|
|
|
@ -354,7 +354,7 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Read the vector values from the index input. This supports both iterated and random access. */
|
/** Read the vector values from the index input. This supports both iterated and random access. */
|
||||||
public static class OffHeapVectorValues extends VectorValues
|
static class OffHeapVectorValues extends VectorValues
|
||||||
implements RandomAccessVectorValues, RandomAccessVectorValuesProducer {
|
implements RandomAccessVectorValues, RandomAccessVectorValuesProducer {
|
||||||
|
|
||||||
final int dimension;
|
final int dimension;
|
||||||
|
|
|
@ -26,7 +26,6 @@ import org.apache.lucene.codecs.KnnVectorsReader;
|
||||||
import org.apache.lucene.codecs.KnnVectorsWriter;
|
import org.apache.lucene.codecs.KnnVectorsWriter;
|
||||||
import org.apache.lucene.index.FieldInfo;
|
import org.apache.lucene.index.FieldInfo;
|
||||||
import org.apache.lucene.index.IndexFileNames;
|
import org.apache.lucene.index.IndexFileNames;
|
||||||
import org.apache.lucene.index.MergeState;
|
|
||||||
import org.apache.lucene.index.RandomAccessVectorValuesProducer;
|
import org.apache.lucene.index.RandomAccessVectorValuesProducer;
|
||||||
import org.apache.lucene.index.SegmentWriteState;
|
import org.apache.lucene.index.SegmentWriteState;
|
||||||
import org.apache.lucene.index.VectorSimilarityFunction;
|
import org.apache.lucene.index.VectorSimilarityFunction;
|
||||||
|
@ -114,79 +113,16 @@ public final class Lucene90HnswVectorsWriter extends KnnVectorsWriter {
|
||||||
public void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader)
|
public void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
|
long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
|
||||||
|
|
||||||
VectorValues vectors = knnVectorsReader.getVectorValues(fieldInfo.name);
|
VectorValues vectors = knnVectorsReader.getVectorValues(fieldInfo.name);
|
||||||
// TODO - use a better data structure; a bitset? DocsWithFieldSet is p.p. in o.a.l.index
|
|
||||||
int[] docIds = writeVectorData(vectorData, vectors);
|
|
||||||
assert vectors.size() == docIds.length;
|
|
||||||
|
|
||||||
long[] offsets = new long[docIds.length];
|
|
||||||
long vectorIndexOffset = vectorIndex.getFilePointer();
|
|
||||||
if (vectors instanceof RandomAccessVectorValuesProducer) {
|
|
||||||
writeGraph(
|
|
||||||
vectorIndex,
|
|
||||||
(RandomAccessVectorValuesProducer) vectors,
|
|
||||||
fieldInfo.getVectorSimilarityFunction(),
|
|
||||||
vectorIndexOffset,
|
|
||||||
offsets,
|
|
||||||
maxConn,
|
|
||||||
beamWidth);
|
|
||||||
} else {
|
|
||||||
throw new IllegalArgumentException(
|
|
||||||
"Indexing an HNSW graph requires a random access vector values, got " + vectors);
|
|
||||||
}
|
|
||||||
|
|
||||||
long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
|
|
||||||
long vectorIndexLength = vectorIndex.getFilePointer() - vectorIndexOffset;
|
|
||||||
writeMeta(
|
|
||||||
fieldInfo,
|
|
||||||
vectorDataOffset,
|
|
||||||
vectorDataLength,
|
|
||||||
vectorIndexOffset,
|
|
||||||
vectorIndexLength,
|
|
||||||
docIds);
|
|
||||||
writeGraphOffsets(meta, offsets);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void merge(MergeState mergeState) throws IOException {
|
|
||||||
for (int i = 0; i < mergeState.fieldInfos.length; i++) {
|
|
||||||
KnnVectorsReader reader = mergeState.knnVectorsReaders[i];
|
|
||||||
assert reader != null || mergeState.fieldInfos[i].hasVectorValues() == false;
|
|
||||||
if (reader != null) {
|
|
||||||
reader.checkIntegrity();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (FieldInfo fieldInfo : mergeState.mergeFieldInfos) {
|
|
||||||
if (fieldInfo.hasVectorValues()) {
|
|
||||||
if (mergeState.infoStream.isEnabled("VV")) {
|
|
||||||
mergeState.infoStream.message("VV", "merging " + mergeState.segmentInfo);
|
|
||||||
}
|
|
||||||
mergeField(fieldInfo, mergeState);
|
|
||||||
if (mergeState.infoStream.isEnabled("VV")) {
|
|
||||||
mergeState.infoStream.message("VV", "merge done " + mergeState.segmentInfo);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
finish();
|
|
||||||
}
|
|
||||||
|
|
||||||
private void mergeField(FieldInfo fieldInfo, MergeState mergeState) throws IOException {
|
|
||||||
if (mergeState.infoStream.isEnabled("VV")) {
|
|
||||||
mergeState.infoStream.message("VV", "merging " + mergeState.segmentInfo);
|
|
||||||
}
|
|
||||||
|
|
||||||
long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
|
|
||||||
|
|
||||||
VectorValues vectors = MergedVectorValues.mergeVectorValues(fieldInfo, mergeState);
|
|
||||||
IndexOutput tempVectorData =
|
IndexOutput tempVectorData =
|
||||||
segmentWriteState.directory.createTempOutput(
|
segmentWriteState.directory.createTempOutput(
|
||||||
vectorData.getName(), "temp", segmentWriteState.context);
|
vectorData.getName(), "temp", segmentWriteState.context);
|
||||||
IndexInput vectorDataInput = null;
|
IndexInput vectorDataInput = null;
|
||||||
boolean success = false;
|
boolean success = false;
|
||||||
try {
|
try {
|
||||||
// write the merged vector data to a temporary file
|
// write the vector data to a temporary file
|
||||||
|
// TODO - use a better data structure; a bitset? DocsWithFieldSet is p.p. in o.a.l.index
|
||||||
int[] docIds = writeVectorData(tempVectorData, vectors);
|
int[] docIds = writeVectorData(tempVectorData, vectors);
|
||||||
CodecUtil.writeFooter(tempVectorData);
|
CodecUtil.writeFooter(tempVectorData);
|
||||||
IOUtils.close(tempVectorData);
|
IOUtils.close(tempVectorData);
|
||||||
|
@ -235,10 +171,6 @@ public final class Lucene90HnswVectorsWriter extends KnnVectorsWriter {
|
||||||
segmentWriteState.directory, tempVectorData.getName());
|
segmentWriteState.directory, tempVectorData.getName());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mergeState.infoStream.isEnabled("VV")) {
|
|
||||||
mergeState.infoStream.message("VV", "merge done " + mergeState.segmentInfo);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -37,8 +37,6 @@ import org.apache.lucene.index.IndexWriter;
|
||||||
import org.apache.lucene.index.IndexWriterConfig;
|
import org.apache.lucene.index.IndexWriterConfig;
|
||||||
import org.apache.lucene.index.LeafReader;
|
import org.apache.lucene.index.LeafReader;
|
||||||
import org.apache.lucene.index.LeafReaderContext;
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
import org.apache.lucene.index.RandomAccessVectorValues;
|
|
||||||
import org.apache.lucene.index.RandomAccessVectorValuesProducer;
|
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.index.VectorSimilarityFunction;
|
import org.apache.lucene.index.VectorSimilarityFunction;
|
||||||
import org.apache.lucene.index.VectorValues;
|
import org.apache.lucene.index.VectorValues;
|
||||||
|
@ -693,12 +691,6 @@ public abstract class BaseKnnVectorsFormatTestCase extends BaseIndexFileFormatTe
|
||||||
assertEquals("4", leaf.document(vectorValues.nextDoc()).get("id"));
|
assertEquals("4", leaf.document(vectorValues.nextDoc()).get("id"));
|
||||||
assertEquals(0, vectorValues.vectorValue()[0], 0);
|
assertEquals(0, vectorValues.vectorValue()[0], 0);
|
||||||
assertEquals(NO_MORE_DOCS, vectorValues.nextDoc());
|
assertEquals(NO_MORE_DOCS, vectorValues.nextDoc());
|
||||||
|
|
||||||
RandomAccessVectorValues ra =
|
|
||||||
((RandomAccessVectorValuesProducer) vectorValues).randomAccess();
|
|
||||||
assertEquals(-1f, ra.vectorValue(0)[0], 0);
|
|
||||||
assertEquals(1f, ra.vectorValue(1)[0], 0);
|
|
||||||
assertEquals(0f, ra.vectorValue(2)[0], 0);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue