LUCENE-10375: Write vectors to file in flush (#617)

In a previous commit, we updated HNSW merge to first write the combined segment
vectors to a file, then use that file to build the graph. This commit applies
the same strategy to flush, which lets us use the same logic for flush and
merge.
This commit is contained in:
Julie Tibshirani 2022-01-23 16:19:23 -08:00 committed by GitHub
parent 08d6633d94
commit 7ece8145bc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 7 additions and 81 deletions

View File

@ -183,3 +183,5 @@ apply from: file('gradle/hacks/turbocharge-jvm-opts.gradle')
apply from: file('gradle/hacks/dummy-outputs.gradle')
apply from: file('gradle/pylucene/pylucene.gradle')
sourceCompatibility = JavaVersion.VERSION_16
targetCompatibility = JavaVersion.VERSION_16

View File

@ -117,7 +117,7 @@ public abstract class KnnVectorsWriter implements Closeable {
}
/** View over multiple VectorValues supporting iterator-style access via DocIdMerger. */
public static class MergedVectorValues extends VectorValues {
private static class MergedVectorValues extends VectorValues {
private final List<VectorValuesSub> subs;
private final DocIDMerger<VectorValuesSub> docIdMerger;
private final int cost;
@ -127,7 +127,7 @@ public abstract class KnnVectorsWriter implements Closeable {
private VectorValuesSub current;
/** Returns a merged view over all the segment's {@link VectorValues}. */
public static MergedVectorValues mergeVectorValues(FieldInfo fieldInfo, MergeState mergeState)
static MergedVectorValues mergeVectorValues(FieldInfo fieldInfo, MergeState mergeState)
throws IOException {
assert fieldInfo != null && fieldInfo.hasVectorValues();

View File

@ -354,7 +354,7 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
}
/** Read the vector values from the index input. This supports both iterated and random access. */
public static class OffHeapVectorValues extends VectorValues
static class OffHeapVectorValues extends VectorValues
implements RandomAccessVectorValues, RandomAccessVectorValuesProducer {
final int dimension;

View File

@ -26,7 +26,6 @@ import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.KnnVectorsWriter;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.RandomAccessVectorValuesProducer;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.VectorSimilarityFunction;
@ -114,79 +113,16 @@ public final class Lucene90HnswVectorsWriter extends KnnVectorsWriter {
/**
 * Writes one field's vectors and its HNSW graph, then records where both were written.
 *
 * <p>In order: the vector data is written to {@code vectorData}, the graph is written to {@code
 * vectorIndex}, and the offsets and lengths of both regions plus the doc ids are passed to {@code
 * writeMeta}; finally the per-node graph offsets are passed to {@code writeGraphOffsets}.
 *
 * @param fieldInfo field being written; supplies the field name and the similarity function
 * @param knnVectorsReader source of the field's {@link VectorValues}
 * @throws IOException propagated from the reader or from the outputs
 * @throws IllegalArgumentException if the vector values are not a {@link
 *     RandomAccessVectorValuesProducer}
 */
public void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader)
throws IOException {
// Start offset of this field's vector data (presumably after padding the output to a
// Float.BYTES boundary - TODO confirm against alignFilePointer).
long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
VectorValues vectors = knnVectorsReader.getVectorValues(fieldInfo.name);
// TODO - use a better data structure; a bitset? DocsWithFieldSet is p.p. (package-private)
// in o.a.l.index (org.apache.lucene.index)
int[] docIds = writeVectorData(vectorData, vectors);
assert vectors.size() == docIds.length;
// One slot per vector; handed to writeGraph and afterwards to writeGraphOffsets, so it is
// presumably filled in by writeGraph (TODO confirm).
long[] offsets = new long[docIds.length];
long vectorIndexOffset = vectorIndex.getFilePointer();
// The graph writer needs random access to the vectors, so any other kind of VectorValues is
// rejected below.
if (vectors instanceof RandomAccessVectorValuesProducer) {
writeGraph(
vectorIndex,
(RandomAccessVectorValuesProducer) vectors,
fieldInfo.getVectorSimilarityFunction(),
vectorIndexOffset,
offsets,
maxConn,
beamWidth);
} else {
// NOTE(review): writeVectorData has already run by this point, so this rejection happens
// after the vector data was written.
throw new IllegalArgumentException(
"Indexing an HNSW graph requires a random access vector values, got " + vectors);
}
// Region lengths are how far each output advanced since its recorded start offset.
long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
long vectorIndexLength = vectorIndex.getFilePointer() - vectorIndexOffset;
writeMeta(
fieldInfo,
vectorDataOffset,
vectorDataLength,
vectorIndexOffset,
vectorIndexLength,
docIds);
writeGraphOffsets(meta, offsets);
}
/**
 * Merges the vector fields described by {@code mergeState} into this writer.
 *
 * <p>First calls {@code checkIntegrity()} on every non-null incoming reader, then calls {@code
 * mergeField} for each merged field that has vector values, and finally calls {@code finish()}.
 *
 * @param mergeState merge inputs: per-segment readers and field infos, the merged field infos,
 *     and the info stream used for the "VV" messages
 * @throws IOException propagated from the integrity checks, {@code mergeField} or {@code
 *     finish()}
 */
@Override
public void merge(MergeState mergeState) throws IOException {
// Pass 1: check every incoming reader before any field is merged.
for (int i = 0; i < mergeState.fieldInfos.length; i++) {
KnnVectorsReader reader = mergeState.knnVectorsReaders[i];
// A null reader is only expected for a segment that has no vector fields.
assert reader != null || mergeState.fieldInfos[i].hasVectorValues() == false;
if (reader != null) {
reader.checkIntegrity();
}
}
// Pass 2: merge each vector field, bracketed by "VV" info-stream messages.
for (FieldInfo fieldInfo : mergeState.mergeFieldInfos) {
if (fieldInfo.hasVectorValues()) {
// NOTE(review): mergeField, as visible in this file, starts by emitting this same
// "merging" message, so it may be logged twice per field - confirm whether intended.
if (mergeState.infoStream.isEnabled("VV")) {
mergeState.infoStream.message("VV", "merging " + mergeState.segmentInfo);
}
mergeField(fieldInfo, mergeState);
if (mergeState.infoStream.isEnabled("VV")) {
mergeState.infoStream.message("VV", "merge done " + mergeState.segmentInfo);
}
}
}
finish();
}
private void mergeField(FieldInfo fieldInfo, MergeState mergeState) throws IOException {
if (mergeState.infoStream.isEnabled("VV")) {
mergeState.infoStream.message("VV", "merging " + mergeState.segmentInfo);
}
long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
VectorValues vectors = MergedVectorValues.mergeVectorValues(fieldInfo, mergeState);
IndexOutput tempVectorData =
segmentWriteState.directory.createTempOutput(
vectorData.getName(), "temp", segmentWriteState.context);
IndexInput vectorDataInput = null;
boolean success = false;
try {
// write the merged vector data to a temporary file
// write the vector data to a temporary file
// TODO - use a better data structure; a bitset? DocsWithFieldSet is p.p. in o.a.l.index
int[] docIds = writeVectorData(tempVectorData, vectors);
CodecUtil.writeFooter(tempVectorData);
IOUtils.close(tempVectorData);
@ -235,10 +171,6 @@ public final class Lucene90HnswVectorsWriter extends KnnVectorsWriter {
segmentWriteState.directory, tempVectorData.getName());
}
}
if (mergeState.infoStream.isEnabled("VV")) {
mergeState.infoStream.message("VV", "merge done " + mergeState.segmentInfo);
}
}
/**

View File

@ -37,8 +37,6 @@ import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.RandomAccessVectorValues;
import org.apache.lucene.index.RandomAccessVectorValuesProducer;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.index.VectorValues;
@ -693,12 +691,6 @@ public abstract class BaseKnnVectorsFormatTestCase extends BaseIndexFileFormatTe
assertEquals("4", leaf.document(vectorValues.nextDoc()).get("id"));
assertEquals(0, vectorValues.vectorValue()[0], 0);
assertEquals(NO_MORE_DOCS, vectorValues.nextDoc());
RandomAccessVectorValues ra =
((RandomAccessVectorValuesProducer) vectorValues).randomAccess();
assertEquals(-1f, ra.vectorValue(0)[0], 0);
assertEquals(1f, ra.vectorValue(1)[0], 0);
assertEquals(0f, ra.vectorValue(2)[0], 0);
}
}
}