mirror of https://github.com/apache/lucene.git
LUCENE-10054 Make HnswGraph hierarchical (#608)
Currently HNSW has only a single layer; this patch makes the HNSW graph multi-layered. This PR builds on the following PRs: #250, #267, #287, #315, #536, #416.

Main changes:
- Multiple layers are introduced into HnswGraph and HnswGraphBuilder.
- A new Lucene91HnswVectorsFormat, with a new Lucene91HnswVectorsReader and Lucene91HnswVectorsWriter, is introduced to encode the graph layers' information.
- Lucene90Codec, Lucene90HnswVectorsFormat, and the reading logic of Lucene90HnswVectorsReader and Lucene90HnswGraph are moved to backward_codecs to support reading and searching graphs built by pre-9.1 versions. Lucene90HnswVectorsWriter is deleted.
- For backwards-compatibility tests, the previous Lucene90 graph reading and writing logic is copied into the test files Lucene90RWHnswVectorsFormat, Lucene90HnswVectorsWriter, Lucene90HnswGraphBuilder, and Lucene90HnswRWGraph.

TODO: tests for KNN search over graphs built by pre-9.1 versions; tests for merging pre-9.1 and current-version indices.
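For orientation, here is a minimal sketch (not part of the commit) of indexing and searching a vector field through the new default codec. The class name, field name, and sample vectors are illustrative; the KnnVectorField/KnnVectorQuery calls are the standard 9.x API surface.

import org.apache.lucene.codecs.lucene91.Lucene91Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.KnnVectorField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.KnnVectorQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class HnswQuickStart {
  public static void main(String[] args) throws Exception {
    try (Directory dir = new ByteBuffersDirectory()) {
      // Lucene91Codec delegates vector fields to Lucene91HnswVectorsFormat,
      // which now persists the multi-layered graph.
      IndexWriterConfig config = new IndexWriterConfig().setCodec(new Lucene91Codec());
      try (IndexWriter writer = new IndexWriter(dir, config)) {
        Document doc = new Document();
        doc.add(new KnnVectorField(
            "vector", new float[] {0.1f, 0.2f}, VectorSimilarityFunction.EUCLIDEAN));
        writer.addDocument(doc);
      }
      try (DirectoryReader reader = DirectoryReader.open(dir)) {
        // Approximate nearest-neighbor search walks the hierarchical graph.
        TopDocs hits = new IndexSearcher(reader)
            .search(new KnnVectorQuery("vector", new float[] {0.1f, 0.2f}, 10), 10);
        System.out.println("hits: " + hits.scoreDocs.length);
      }
    }
  }
}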
parent 1a4f838fe2
commit b0d6fe68d1
@@ -29,6 +29,7 @@ module org.apache.lucene.backward_codecs {
   exports org.apache.lucene.backward_codecs.lucene84;
   exports org.apache.lucene.backward_codecs.lucene86;
   exports org.apache.lucene.backward_codecs.lucene87;
+  exports org.apache.lucene.backward_codecs.lucene90;
   exports org.apache.lucene.backward_codecs.packed;
   exports org.apache.lucene.backward_codecs.store;
@@ -37,9 +38,12 @@ module org.apache.lucene.backward_codecs {
   provides org.apache.lucene.codecs.PostingsFormat with
       org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat,
       org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat;
+  provides org.apache.lucene.codecs.KnnVectorsFormat with
+      org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat;
   provides org.apache.lucene.codecs.Codec with
       org.apache.lucene.backward_codecs.lucene80.Lucene80Codec,
       org.apache.lucene.backward_codecs.lucene84.Lucene84Codec,
       org.apache.lucene.backward_codecs.lucene86.Lucene86Codec,
-      org.apache.lucene.backward_codecs.lucene87.Lucene87Codec;
+      org.apache.lucene.backward_codecs.lucene87.Lucene87Codec,
+      org.apache.lucene.backward_codecs.lucene90.Lucene90Codec;
 }
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.codecs.lucene90;
+package org.apache.lucene.backward_codecs.lucene90;
 
 import java.util.Objects;
 import org.apache.lucene.codecs.Codec;
@@ -30,6 +30,16 @@ import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.SegmentInfoFormat;
 import org.apache.lucene.codecs.StoredFieldsFormat;
 import org.apache.lucene.codecs.TermVectorsFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90FieldInfosFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
 import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
 import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
 import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
@@ -0,0 +1,216 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.backward_codecs.lucene90;
+
+import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.SplittableRandom;
+import org.apache.lucene.index.KnnGraphValues;
+import org.apache.lucene.index.RandomAccessVectorValues;
+import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.SparseFixedBitSet;
+import org.apache.lucene.util.hnsw.BoundsChecker;
+import org.apache.lucene.util.hnsw.NeighborArray;
+import org.apache.lucene.util.hnsw.NeighborQueue;
+
+/**
+ * Navigable Small-world graph. Provides efficient approximate nearest neighbor search for high
+ * dimensional vectors. See <a href="https://doi.org/10.1016/j.is.2013.10.006">Approximate nearest
+ * neighbor algorithm based on navigable small world graphs [2014]</a> and <a
+ * href="https://arxiv.org/abs/1603.09320">this paper [2018]</a> for details.
+ *
+ * <p>The nomenclature is a bit different here from what's used in those papers:
+ *
+ * <h2>Hyperparameters</h2>
+ *
+ * <ul>
+ *   <li><code>numSeed</code> is the equivalent of <code>m</code> in the 2014 paper; it controls the
+ *       number of random entry points to sample.
+ *   <li><code>beamWidth</code> in {@link Lucene90HnswGraphBuilder} has the same meaning as <code>
+ *       efConst</code> in the 2018 paper. It is the number of nearest neighbor candidates to track
+ *       while searching the graph for each newly inserted node.
+ *   <li><code>maxConn</code> has the same meaning as <code>M</code> in the later paper; it controls
+ *       how many of the <code>efConst</code> neighbors are connected to the new node.
+ * </ul>
+ *
+ * <p>Note: The graph may be searched by multiple threads concurrently, but updates are not
+ * thread-safe. Also note: there is no notion of deletions. Document searching built on top of this
+ * must do its own deletion-filtering.
+ *
+ * <p>Graph building logic is preserved here only for tests.
+ */
+public final class Lucene90HnswGraph extends KnnGraphValues {
+
+  private final int maxConn;
+
+  // Each entry lists the top maxConn neighbors of a node. The nodes correspond to vectors added
+  // to HnswBuilder, and the node values are the ordinals of those vectors.
+  private final List<NeighborArray> graph;
+
+  // KnnGraphValues iterator members
+  private int upto;
+  private NeighborArray cur;
+
+  Lucene90HnswGraph(int maxConn) {
+    graph = new ArrayList<>();
+    // Typically with diversity criteria we see nodes not fully occupied; average fanout seems to
+    // be about 1/2 maxConn. There is some indexing time penalty for under-allocating, but it saves RAM.
+    graph.add(new NeighborArray(Math.max(32, maxConn / 4)));
+    this.maxConn = maxConn;
+  }
+
+  /**
+   * Searches for the nearest neighbors of a query vector.
+   *
+   * @param query search query vector
+   * @param topK the number of nodes to be returned
+   * @param numSeed the size of the queue maintained while searching, and controls the number of
+   *     random entry points to sample
+   * @param vectors vector values
+   * @param similarityFunction the similarity function used to compare vectors
+   * @param graphValues the graph values. May represent the entire graph, or a level in a
+   *     hierarchical graph.
+   * @param acceptOrds {@link Bits} that represents the allowed document ordinals to match, or
+   *     {@code null} if they are all allowed to match.
+   * @param random a source of randomness, used for generating entry points to the graph
+   * @return a priority queue holding the closest neighbors found
+   */
+  public static NeighborQueue search(
+      float[] query,
+      int topK,
+      int numSeed,
+      RandomAccessVectorValues vectors,
+      VectorSimilarityFunction similarityFunction,
+      KnnGraphValues graphValues,
+      Bits acceptOrds,
+      SplittableRandom random)
+      throws IOException {
+    int size = graphValues.size();
+
+    // MIN heap, holding the top results
+    NeighborQueue results = new NeighborQueue(numSeed, similarityFunction.reversed);
+    // MAX heap, from which to pull the candidate nodes
+    NeighborQueue candidates = new NeighborQueue(numSeed, !similarityFunction.reversed);
+
+    // set of ordinals that have been visited by search on this layer, used to avoid backtracking
+    SparseFixedBitSet visited = new SparseFixedBitSet(size);
+    // get initial candidates at random
+    int boundedNumSeed = Math.min(numSeed, 2 * size);
+    for (int i = 0; i < boundedNumSeed; i++) {
+      int entryPoint = random.nextInt(size);
+      if (visited.getAndSet(entryPoint) == false) {
+        // explore the topK starting points of some random numSeed probes
+        float score = similarityFunction.compare(query, vectors.vectorValue(entryPoint));
+        candidates.add(entryPoint, score);
+        if (acceptOrds == null || acceptOrds.get(entryPoint)) {
+          results.add(entryPoint, score);
+        }
+      }
+    }
+
+    // Set the bound to the worst current result, and below reject any newly-generated candidates
+    // failing to exceed this bound
+    BoundsChecker bound = BoundsChecker.create(similarityFunction.reversed);
+    bound.set(results.topScore());
+    while (candidates.size() > 0) {
+      // get the best candidate (closest or best scoring)
+      float topCandidateScore = candidates.topScore();
+      if (results.size() >= topK) {
+        if (bound.check(topCandidateScore)) {
+          break;
+        }
+      }
+      int topCandidateNode = candidates.pop();
+      graphValues.seek(0, topCandidateNode);
+      int friendOrd;
+      while ((friendOrd = graphValues.nextNeighbor()) != NO_MORE_DOCS) {
+        assert friendOrd < size : "friendOrd=" + friendOrd + "; size=" + size;
+        if (visited.getAndSet(friendOrd)) {
+          continue;
+        }
+
+        float score = similarityFunction.compare(query, vectors.vectorValue(friendOrd));
+        if (results.size() < numSeed || bound.check(score) == false) {
+          candidates.add(friendOrd, score);
+          if (acceptOrds == null || acceptOrds.get(friendOrd)) {
+            results.insertWithOverflow(friendOrd, score);
+            bound.set(results.topScore());
+          }
+        }
+      }
+    }
+    while (results.size() > topK) {
+      results.pop();
+    }
+    results.setVisitedCount(visited.approximateCardinality());
+    return results;
+  }
+
+  /**
+   * Returns the {@link NeighborArray} of nodes connected to the given node.
+   *
+   * @param node the node whose neighbors are returned
+   */
+  public NeighborArray getNeighbors(int node) {
+    return graph.get(node);
+  }
+
+  @Override
+  public int size() {
+    return graph.size();
+  }
+
+  int addNode() {
+    graph.add(new NeighborArray(maxConn + 1));
+    return graph.size() - 1;
+  }
+
+  @Override
+  public void seek(int level, int targetNode) {
+    cur = getNeighbors(targetNode);
+    upto = -1;
+  }
+
+  @Override
+  public int nextNeighbor() {
+    if (++upto < cur.size()) {
+      return cur.node()[upto];
+    }
+    return NO_MORE_DOCS;
+  }
+
+  @Override
+  public int numLevels() {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public int entryNode() {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public NodesIterator getNodesOnLevel(int level) {
+    throw new UnsupportedOperationException();
+  }
+}
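The static search method preserved above can still be driven directly, which is how the back-compat tests exercise pre-9.1 graphs. Below is a rough sketch of such a call; the vectors and graph arguments are assumed to be supplied by a Lucene90HnswVectorsReader or a test fixture, and the helper name is hypothetical.

import java.io.IOException;
import java.util.SplittableRandom;
import org.apache.lucene.backward_codecs.lucene90.Lucene90HnswGraph;
import org.apache.lucene.index.KnnGraphValues;
import org.apache.lucene.index.RandomAccessVectorValues;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.util.hnsw.NeighborQueue;

class FlatGraphSearchSketch {
  // Hypothetical helper: returns the ~10 approximate nearest neighbors of query.
  static NeighborQueue top10(
      float[] query, RandomAccessVectorValues vectors, KnnGraphValues graph) throws IOException {
    return Lucene90HnswGraph.search(
        query,
        10,   // topK: number of results to return
        100,  // numSeed: search queue size and number of random entry points
        vectors,
        VectorSimilarityFunction.EUCLIDEAN,
        graph,
        null, // acceptOrds == null: every ordinal may match
        new SplittableRandom(42)); // fixed seed gives reproducible entry points
  }
}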
@@ -0,0 +1,276 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.backward_codecs.lucene90;
+
+import java.io.IOException;
+import java.util.Locale;
+import java.util.Objects;
+import java.util.SplittableRandom;
+import org.apache.lucene.index.RandomAccessVectorValues;
+import org.apache.lucene.index.RandomAccessVectorValuesProducer;
+import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.util.InfoStream;
+import org.apache.lucene.util.hnsw.BoundsChecker;
+import org.apache.lucene.util.hnsw.NeighborArray;
+import org.apache.lucene.util.hnsw.NeighborQueue;
+
+/**
+ * Builder for HNSW graph. See {@link Lucene90HnswGraph} for a gloss on the algorithm and the
+ * meaning of the hyperparameters.
+ *
+ * <p>This class is preserved here only for tests.
+ */
+public final class Lucene90HnswGraphBuilder {
+
+  /** Default random seed for level generation */
+  private static final long DEFAULT_RAND_SEED = System.currentTimeMillis();
+  /** A name for the HNSW component for the info-stream */
+  public static final String HNSW_COMPONENT = "HNSW";
+
+  /** Random seed for level generation; public to expose for testing */
+  public static long randSeed = DEFAULT_RAND_SEED;
+
+  private final int maxConn;
+  private final int beamWidth;
+  private final NeighborArray scratch;
+
+  private final VectorSimilarityFunction similarityFunction;
+  private final RandomAccessVectorValues vectorValues;
+  private final SplittableRandom random;
+  private final BoundsChecker bound;
+  final Lucene90HnswGraph hnsw;
+
+  private InfoStream infoStream = InfoStream.getDefault();
+
+  // we need two sources of vectors in order to perform diversity check comparisons without
+  // colliding
+  private RandomAccessVectorValues buildVectors;
+
+  /**
+   * Reads all the vectors from a VectorValues, builds a graph connecting them by their dense
+   * ordinals, using the given hyperparameter settings, and returns the resulting graph.
+   *
+   * @param vectors the vectors whose relations are represented by the graph - must provide a
+   *     different view over those vectors than the one used to add via addGraphNode.
+   * @param similarityFunction the similarity function used to compare vectors
+   * @param maxConn the number of connections to make when adding a new graph node; roughly speaking
+   *     the graph fanout.
+   * @param beamWidth the size of the beam search to use when finding nearest neighbors.
+   * @param seed the seed for a random number generator used during graph construction. Provide this
+   *     to ensure repeatable construction.
+   */
+  public Lucene90HnswGraphBuilder(
+      RandomAccessVectorValuesProducer vectors,
+      VectorSimilarityFunction similarityFunction,
+      int maxConn,
+      int beamWidth,
+      long seed) {
+    vectorValues = vectors.randomAccess();
+    buildVectors = vectors.randomAccess();
+    this.similarityFunction = Objects.requireNonNull(similarityFunction);
+    if (maxConn <= 0) {
+      throw new IllegalArgumentException("maxConn must be positive");
+    }
+    if (beamWidth <= 0) {
+      throw new IllegalArgumentException("beamWidth must be positive");
+    }
+    this.maxConn = maxConn;
+    this.beamWidth = beamWidth;
+    this.hnsw = new Lucene90HnswGraph(maxConn);
+    bound = BoundsChecker.create(similarityFunction.reversed);
+    random = new SplittableRandom(seed);
+    scratch = new NeighborArray(Math.max(beamWidth, maxConn + 1));
+  }
+
+  /**
+   * Reads all the vectors from two copies of a random access VectorValues. Providing two copies
+   * enables efficient retrieval without extra data copying, while avoiding collision of the
+   * returned values.
+   *
+   * @param vectors the vectors for which to build a nearest neighbors graph. Must be an independent
+   *     accessor for the vectors
+   */
+  public Lucene90HnswGraph build(RandomAccessVectorValues vectors) throws IOException {
+    if (vectors == vectorValues) {
+      throw new IllegalArgumentException(
+          "Vectors to build must be independent of the source of vectors provided to HnswGraphBuilder()");
+    }
+    if (infoStream.isEnabled(HNSW_COMPONENT)) {
+      infoStream.message(HNSW_COMPONENT, "build graph from " + vectors.size() + " vectors");
+    }
+    long start = System.nanoTime(), t = start;
+    // start at node 1! node 0 is added implicitly, in the constructor
+    for (int node = 1; node < vectors.size(); node++) {
+      addGraphNode(vectors.vectorValue(node));
+      if (node % 10000 == 0) {
+        if (infoStream.isEnabled(HNSW_COMPONENT)) {
+          long now = System.nanoTime();
+          infoStream.message(
+              HNSW_COMPONENT,
+              String.format(
+                  Locale.ROOT,
+                  "built %d in %d/%d ms",
+                  node,
+                  ((now - t) / 1_000_000),
+                  ((now - start) / 1_000_000)));
+          t = now;
+        }
+      }
+    }
+    return hnsw;
+  }
+
+  /** Set info-stream to output debugging information */
+  public void setInfoStream(InfoStream infoStream) {
+    this.infoStream = infoStream;
+  }
+
+  /** Inserts a doc with a vector value into the graph */
+  void addGraphNode(float[] value) throws IOException {
+    // We pass 'null' for acceptOrds because there are no deletions while building the graph
+    NeighborQueue candidates =
+        Lucene90HnswGraph.search(
+            value, beamWidth, beamWidth, vectorValues, similarityFunction, hnsw, null, random);
+
+    int node = hnsw.addNode();
+
+    /* connect neighbors to the new node, using a diversity heuristic that chooses successive
+     * nearest neighbors that are closer to the new node than they are to the previously-selected
+     * neighbors
+     */
+    addDiverseNeighbors(node, candidates);
+  }
+
+  /* TODO: we are not maintaining nodes in strict score order; the forward links
+   * are added in sorted order, but the reverse implicit ones are not. Diversity heuristic should
+   * work better if we keep the neighbor arrays sorted. Possibly we should switch back to a heap?
+   * But first we should just see if sorting makes a significant difference.
+   */
+  private void addDiverseNeighbors(int node, NeighborQueue candidates) throws IOException {
+    /* For each of the beamWidth nearest candidates (going from best to worst), select it only if it
+     * is closer to target than it is to any of the already-selected neighbors (ie selected in this
+     * method, since the node is new and has no prior neighbors).
+     */
+    NeighborArray neighbors = hnsw.getNeighbors(node);
+    assert neighbors.size() == 0; // new node
+    popToScratch(candidates);
+    selectDiverse(neighbors, scratch);
+
+    // Link the selected nodes to the new node, and the new node to the selected nodes (again
+    // applying diversity heuristic)
+    int size = neighbors.size();
+    for (int i = 0; i < size; i++) {
+      int nbr = neighbors.node()[i];
+      NeighborArray nbrNbr = hnsw.getNeighbors(nbr);
+      nbrNbr.add(node, neighbors.score()[i]);
+      if (nbrNbr.size() > maxConn) {
+        diversityUpdate(nbrNbr);
+      }
+    }
+  }
+
+  private void selectDiverse(NeighborArray neighbors, NeighborArray candidates) throws IOException {
+    // Select the best maxConn neighbors of the new node, applying the diversity heuristic
+    for (int i = candidates.size() - 1; neighbors.size() < maxConn && i >= 0; i--) {
+      // compare each neighbor (in distance order) against the closer neighbors selected so far,
+      // only adding it if it is closer to the target than to any of the other selected neighbors
+      int cNode = candidates.node()[i];
+      float cScore = candidates.score()[i];
+      assert cNode < hnsw.size();
+      if (diversityCheck(vectorValues.vectorValue(cNode), cScore, neighbors, buildVectors)) {
+        neighbors.add(cNode, cScore);
+      }
+    }
+  }
+
+  private void popToScratch(NeighborQueue candidates) {
+    scratch.clear();
+    int candidateCount = candidates.size();
+    // extract all the Neighbors from the queue into an array; these will now be
+    // sorted from worst to best
+    for (int i = 0; i < candidateCount; i++) {
+      float score = candidates.topScore();
+      scratch.add(candidates.pop(), score);
+    }
+  }
+
+  /**
+   * @param candidate the vector of a new candidate neighbor of a node n
+   * @param score the score of the new candidate and node n, to be compared with scores of the
+   *     candidate and n's neighbors
+   * @param neighbors the neighbors selected so far
+   * @param vectorValues source of values used for making comparisons between candidate and existing
+   *     neighbors
+   * @return whether the candidate is diverse given the existing neighbors
+   */
+  private boolean diversityCheck(
+      float[] candidate,
+      float score,
+      NeighborArray neighbors,
+      RandomAccessVectorValues vectorValues)
+      throws IOException {
+    bound.set(score);
+    for (int i = 0; i < neighbors.size(); i++) {
+      float diversityCheck =
+          similarityFunction.compare(candidate, vectorValues.vectorValue(neighbors.node()[i]));
+      if (bound.check(diversityCheck) == false) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  private void diversityUpdate(NeighborArray neighbors) throws IOException {
+    assert neighbors.size() == maxConn + 1;
+    int replacePoint = findNonDiverse(neighbors);
+    if (replacePoint == -1) {
+      // none found; check score against worst existing neighbor
+      bound.set(neighbors.score()[0]);
+      if (bound.check(neighbors.score()[maxConn])) {
+        // drop the new neighbor; it is not competitive and there were no diversity failures
+        neighbors.removeLast();
+        return;
+      } else {
+        replacePoint = 0;
+      }
+    }
+    neighbors.node()[replacePoint] = neighbors.node()[maxConn];
+    neighbors.score()[replacePoint] = neighbors.score()[maxConn];
+    neighbors.removeLast();
+  }
+
+  // scan neighbors looking for diversity violations
+  private int findNonDiverse(NeighborArray neighbors) throws IOException {
+    for (int i = neighbors.size() - 1; i >= 0; i--) {
+      // check each neighbor against its better-scoring neighbors. If it fails diversity check with
+      // them, drop it
+      int nbrNode = neighbors.node()[i];
+      bound.set(neighbors.score()[i]);
+      float[] nbrVector = vectorValues.vectorValue(nbrNode);
+      for (int j = maxConn; j > i; j--) {
+        float diversityCheck =
+            similarityFunction.compare(nbrVector, buildVectors.vectorValue(neighbors.node()[j]));
+        if (bound.check(diversityCheck) == false) {
+          // node j is too similar to node i given its score relative to the base node
+          // replace it with the new node, which is at [maxConn]
+          return i;
+        }
+      }
+    }
+    return -1;
+  }
+}
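For contrast with this flat, single-layer builder: the hierarchical builder introduced by this change must also assign each inserted node a top layer. The sketch below shows the level draw described in the 2018 HNSW paper, level = floor(-ln(u) * mL) with u uniform in (0, 1] and mL commonly set to 1 / ln(maxConn); it illustrates the technique only and is not the exact Lucene 9.1 code.

import java.util.SplittableRandom;

class LevelAssignmentSketch {
  // Draw a top layer for a new node (per the 2018 paper, not the exact Lucene code).
  // Layer 0 holds every node; each higher layer keeps an exponentially shrinking
  // subset, which is what makes the graph hierarchical.
  static int assignLevel(SplittableRandom random, double mL) {
    double u = 1.0 - random.nextDouble(); // in (0, 1], so Math.log(u) is finite
    return (int) (-Math.log(u) * mL);     // int cast floors the non-negative value
  }
}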
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.lucene.codecs.lucene90;
+package org.apache.lucene.backward_codecs.lucene90;
 
 import java.io.IOException;
 import org.apache.lucene.codecs.KnnVectorsFormat;
@@ -65,7 +65,7 @@ import org.apache.lucene.util.hnsw.HnswGraph;
  *
  * @lucene.experimental
  */
-public final class Lucene90HnswVectorsFormat extends KnnVectorsFormat {
+public class Lucene90HnswVectorsFormat extends KnnVectorsFormat {
 
   static final String META_CODEC_NAME = "Lucene90HnswVectorsFormatMeta";
   static final String VECTOR_DATA_CODEC_NAME = "Lucene90HnswVectorsFormatData";
@@ -77,26 +77,33 @@ public final class Lucene90HnswVectorsFormat extends KnnVectorsFormat {
   static final int VERSION_START = 0;
   static final int VERSION_CURRENT = VERSION_START;
 
+  /** Default number of maximum connections per node */
   public static final int DEFAULT_MAX_CONN = 16;
+  /**
+   * Default size of the queue maintained while searching, and the number of random entry points
+   * to sample during graph construction.
+   */
   public static final int DEFAULT_BEAM_WIDTH = 100;
 
   /**
    * Controls how many of the nearest neighbor candidates are connected to the new node. Defaults
    * to {@link Lucene90HnswVectorsFormat#DEFAULT_MAX_CONN}. See {@link HnswGraph} for more details.
    */
-  private final int maxConn;
+  final int maxConn;
 
   /**
    * The number of candidate neighbors to track while searching the graph for each newly inserted
    * node. Defaults to {@link Lucene90HnswVectorsFormat#DEFAULT_BEAM_WIDTH}. See {@link
    * HnswGraph} for details.
    */
-  private final int beamWidth;
+  final int beamWidth;
 
+  /** A constructor for the vectors format with default parameters */
   public Lucene90HnswVectorsFormat() {
     this(DEFAULT_MAX_CONN, DEFAULT_BEAM_WIDTH);
   }
 
+  /** A constructor for the vectors format */
   public Lucene90HnswVectorsFormat(int maxConn, int beamWidth) {
     super("Lucene90HnswVectorsFormat");
     this.maxConn = maxConn;
@@ -105,7 +112,7 @@ public final class Lucene90HnswVectorsFormat extends KnnVectorsFormat {
 
   @Override
   public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException {
-    return new Lucene90HnswVectorsWriter(state, maxConn, beamWidth);
+    throw new UnsupportedOperationException("Old codecs may only be used for reading");
   }
 
   @Override
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.lucene.codecs.lucene90;
+package org.apache.lucene.backward_codecs.lucene90;
 
 import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
 
@@ -47,7 +47,6 @@ import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.RamUsageEstimator;
-import org.apache.lucene.util.hnsw.HnswGraph;
 import org.apache.lucene.util.hnsw.NeighborQueue;
 
 /**
@@ -244,7 +243,7 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
     // use a seed that is fixed for the index so we get reproducible results for the same query
     final SplittableRandom random = new SplittableRandom(checksumSeed);
     NeighborQueue results =
-        HnswGraph.search(
+        Lucene90HnswGraph.search(
             target,
             k,
             k,
@@ -291,6 +290,7 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
     };
   }
 
+  /** Get knn graph values; used for testing */
  public KnnGraphValues getGraphValues(String field) throws IOException {
     FieldInfo info = fieldInfos.fieldInfo(field);
     if (info == null) {
@@ -480,7 +480,7 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
     }
 
     @Override
-    public void seek(int targetOrd) throws IOException {
+    public void seek(int level, int targetOrd) throws IOException {
       // unsafe; no bounds checking
       dataIn.seek(entry.ordOffsets[targetOrd]);
       arcCount = dataIn.readInt();
@@ -502,5 +502,20 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
       arc += dataIn.readVInt();
       return arc;
     }
+
+    @Override
+    public int numLevels() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public int entryNode() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public NodesIterator getNodesOnLevel(int level) {
+      throw new UnsupportedOperationException();
+    }
   }
 }
@@ -0,0 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** Lucene 9.0 file format. */
+package org.apache.lucene.backward_codecs.lucene90;
@@ -17,3 +17,4 @@ org.apache.lucene.backward_codecs.lucene80.Lucene80Codec
 org.apache.lucene.backward_codecs.lucene84.Lucene84Codec
 org.apache.lucene.backward_codecs.lucene86.Lucene86Codec
 org.apache.lucene.backward_codecs.lucene87.Lucene87Codec
+org.apache.lucene.backward_codecs.lucene90.Lucene90Codec
@@ -0,0 +1,16 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements. See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License. You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat
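This service file is what lets SPI resolve the old format by name when a pre-9.1 segment is opened. A quick, test-style sketch of the lookup (the class name is illustrative):

import org.apache.lucene.codecs.KnnVectorsFormat;

class SpiLookupSketch {
  public static void main(String[] args) {
    // NamedSPILoader finds the class registered in META-INF/services above,
    // so 9.0-format segments can still locate their reader.
    KnnVectorsFormat format = KnnVectorsFormat.forName("Lucene90HnswVectorsFormat");
    System.out.println(format);
  }
}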
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.lucene.codecs.lucene90;
+package org.apache.lucene.backward_codecs.lucene90;
 
 import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
 
@@ -35,8 +35,6 @@ import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.hnsw.HnswGraph;
-import org.apache.lucene.util.hnsw.HnswGraphBuilder;
 import org.apache.lucene.util.hnsw.NeighborArray;
 
 /**
@@ -235,11 +233,15 @@ public final class Lucene90HnswVectorsWriter extends KnnVectorsWriter {
       int maxConn,
       int beamWidth)
       throws IOException {
-    HnswGraphBuilder hnswGraphBuilder =
-        new HnswGraphBuilder(
-            vectorValues, similarityFunction, maxConn, beamWidth, HnswGraphBuilder.randSeed);
+    Lucene90HnswGraphBuilder hnswGraphBuilder =
+        new Lucene90HnswGraphBuilder(
+            vectorValues,
+            similarityFunction,
+            maxConn,
+            beamWidth,
+            Lucene90HnswGraphBuilder.randSeed);
     hnswGraphBuilder.setInfoStream(segmentWriteState.infoStream);
-    HnswGraph graph = hnswGraphBuilder.build(vectorValues.randomAccess());
+    Lucene90HnswGraph graph = hnswGraphBuilder.build(vectorValues.randomAccess());
 
     for (int ord = 0; ord < offsets.length; ord++) {
       // write graph
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.backward_codecs.lucene90;
+
+import java.io.IOException;
+import org.apache.lucene.codecs.KnnVectorsWriter;
+import org.apache.lucene.index.SegmentWriteState;
+
+public class Lucene90RWHnswVectorsFormat extends Lucene90HnswVectorsFormat {
+
+  public Lucene90RWHnswVectorsFormat(int maxConn, int beamWidth) {
+    super(maxConn, beamWidth);
+  }
+
+  @Override
+  public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException {
+    return new Lucene90HnswVectorsWriter(state, maxConn, beamWidth);
+  }
+
+  @Override
+  public String toString() {
+    return "Lucene90RWHnswVectorsFormat(name = Lucene90RWHnswVectorsFormat, maxConn = "
+        + maxConn
+        + ", beamWidth="
+        + beamWidth
+        + ")";
+  }
+}
@@ -14,11 +14,11 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.codecs.lucene90;
+package org.apache.lucene.backward_codecs.lucene90;
 
 import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween;
-import static org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat.DEFAULT_BEAM_WIDTH;
-import static org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat.DEFAULT_MAX_CONN;
+import static org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat.DEFAULT_BEAM_WIDTH;
+import static org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat.DEFAULT_MAX_CONN;
 
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.KnnVectorsFormat;
@@ -38,11 +38,11 @@ public class TestLucene90HnswVectorsFormat extends BaseKnnVectorsFormatTestCase
         new Lucene90Codec() {
           @Override
           public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
-            return new Lucene90HnswVectorsFormat(maxConn, beamWidth);
+            return new Lucene90RWHnswVectorsFormat(maxConn, beamWidth);
           }
         };
     String expectedString =
-        "Lucene90HnswVectorsFormat(name = Lucene90HnswVectorsFormat, maxConn = "
+        "Lucene90RWHnswVectorsFormat(name = Lucene90RWHnswVectorsFormat, maxConn = "
            + maxConn
            + ", beamWidth="
            + beamWidth
@@ -27,7 +27,7 @@ import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.PostingsFormat;
-import org.apache.lucene.codecs.lucene90.Lucene90Codec;
+import org.apache.lucene.codecs.lucene91.Lucene91Codec;
 import org.apache.lucene.index.ConcurrentMergeScheduler;
 import org.apache.lucene.index.IndexCommit;
 import org.apache.lucene.index.IndexDeletionPolicy;
@@ -152,7 +152,7 @@ public class CreateIndexTask extends PerfTask {
     try {
       final PostingsFormat postingsFormatChosen = PostingsFormat.forName(postingsFormat);
       iwConf.setCodec(
-          new Lucene90Codec() {
+          new Lucene91Codec() {
            @Override
            public PostingsFormat getPostingsFormatForField(String field) {
              return postingsFormatChosen;
@@ -27,6 +27,7 @@ module org.apache.lucene.core {
   exports org.apache.lucene.analysis.tokenattributes;
   exports org.apache.lucene.codecs;
   exports org.apache.lucene.codecs.compressing;
+  exports org.apache.lucene.codecs.lucene91;
   exports org.apache.lucene.codecs.lucene90;
   exports org.apache.lucene.codecs.lucene90.blocktree;
   exports org.apache.lucene.codecs.lucene90.compressing;
@@ -59,11 +60,11 @@ module org.apache.lucene.core {
   provides org.apache.lucene.analysis.TokenizerFactory with
       org.apache.lucene.analysis.standard.StandardTokenizerFactory;
   provides org.apache.lucene.codecs.Codec with
-      org.apache.lucene.codecs.lucene90.Lucene90Codec;
+      org.apache.lucene.codecs.lucene91.Lucene91Codec;
   provides org.apache.lucene.codecs.DocValuesFormat with
       org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
   provides org.apache.lucene.codecs.KnnVectorsFormat with
-      org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat;
+      org.apache.lucene.codecs.lucene91.Lucene91HnswVectorsFormat;
   provides org.apache.lucene.codecs.PostingsFormat with
       org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat;
   provides org.apache.lucene.index.SortFieldProvider with
@@ -55,7 +55,7 @@ public abstract class Codec implements NamedSPILoader.NamedSPI {
       return LOADER;
     }
 
-    static Codec defaultCodec = LOADER.lookup("Lucene90");
+    static Codec defaultCodec = LOADER.lookup("Lucene91");
   }
 
   private final String name;
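A small illustrative check of the effect of this change: the loader default is now "Lucene91", and IndexWriterConfig picks it up unless a codec is set explicitly.

import org.apache.lucene.codecs.Codec;

class DefaultCodecCheck {
  public static void main(String[] args) {
    System.out.println(Codec.getDefault().getName()); // expected: Lucene91
  }
}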
@@ -85,7 +85,7 @@ public abstract class KnnVectorsFormat implements NamedSPILoader.NamedSPI {
     @Override
     public KnnVectorsWriter fieldsWriter(SegmentWriteState state) {
       throw new UnsupportedOperationException(
-          "Attempt to write EMPTY VectorValues: maybe you forgot to use codec=Lucene90");
+          "Attempt to write EMPTY VectorValues: maybe you forgot to use codec=Lucene91");
     }
 
     @Override
@ -15,405 +15,5 @@
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**
|
/** Lucene 9.0 file format. */
|
||||||
* Lucene 9.0 file format.
|
|
||||||
*
|
|
||||||
* <h2>Apache Lucene - Index File Formats</h2>
|
|
||||||
*
|
|
||||||
* <div>
|
|
||||||
*
|
|
||||||
* <ul>
|
|
||||||
* <li><a href="#Introduction">Introduction</a>
|
|
||||||
* <li><a href="#Definitions">Definitions</a>
|
|
||||||
* <ul>
|
|
||||||
* <li><a href="#Inverted_Indexing">Inverted Indexing</a>
|
|
||||||
* <li><a href="#Types_of_Fields">Types of Fields</a>
|
|
||||||
* <li><a href="#Segments">Segments</a>
|
|
||||||
* <li><a href="#Document_Numbers">Document Numbers</a>
|
|
||||||
* </ul>
|
|
||||||
* <li><a href="#Overview">Index Structure Overview</a>
|
|
||||||
* <li><a href="#File_Naming">File Naming</a>
|
|
||||||
* <li><a href="#file-names">Summary of File Extensions</a>
|
|
||||||
* <ul>
|
|
||||||
* <li><a href="#Lock_File">Lock File</a>
|
|
||||||
* <li><a href="#History">History</a>
|
|
||||||
* <li><a href="#Limitations">Limitations</a>
|
|
||||||
* </ul>
|
|
||||||
* </ul>
|
|
||||||
*
|
|
||||||
* </div> <a id="Introduction"></a>
|
|
||||||
*
|
|
||||||
* <h3>Introduction</h3>
|
|
||||||
*
|
|
||||||
* <div>
|
|
||||||
*
|
|
||||||
* <p>This document defines the index file formats used in this version of Lucene. If you are using
|
|
||||||
* a different version of Lucene, please consult the copy of <code>docs/</code> that was distributed
|
|
||||||
* with the version you are using.
|
|
||||||
*
|
|
||||||
* <p>This document attempts to provide a high-level definition of the Apache Lucene file formats.
|
|
||||||
* </div> <a id="Definitions"></a>
|
|
||||||
*
|
|
||||||
* <h3>Definitions</h3>
|
|
||||||
*
|
|
||||||
* <div>
|
|
||||||
*
|
|
||||||
* <p>The fundamental concepts in Lucene are index, document, field and term.
|
|
||||||
*
|
|
||||||
* <p>An index contains a sequence of documents.
|
|
||||||
*
|
|
||||||
* <ul>
|
|
||||||
* <li>A document is a sequence of fields.
|
|
||||||
* <li>A field is a named sequence of terms.
|
|
||||||
* <li>A term is a sequence of bytes.
|
|
||||||
* </ul>
|
|
||||||
*
|
|
||||||
* <p>The same sequence of bytes in two different fields is considered a different term. Thus terms
|
|
||||||
* are represented as a pair: the string naming the field, and the bytes within the field. <a
|
|
||||||
* id="Inverted_Indexing"></a>
|
|
||||||
*
|
|
||||||
* <h4>Inverted Indexing</h4>
|
|
||||||
*
|
|
||||||
* <p>Lucene's index stores terms and statistics about those terms in order to make term-based
|
|
||||||
* search more efficient. Lucene's terms index falls into the family of indexes known as an
|
|
||||||
* <i>inverted index.</i> This is because it can list, for a term, the documents that contain it.
|
|
||||||
* This is the inverse of the natural relationship, in which documents list terms. <a
|
|
||||||
* id="Types_of_Fields"></a>
|
|
||||||
*
|
|
||||||
* <h4>Types of Fields</h4>
|
|
||||||
*
|
|
||||||
* <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored in the index
|
|
||||||
* literally, in a non-inverted manner. Fields that are inverted are called <i>indexed</i>. A field
|
|
||||||
* may be both stored and indexed.
|
|
||||||
*
|
|
||||||
* <p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the text of a field
|
|
||||||
* may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is
|
|
||||||
* useful for certain identifier fields to be indexed literally.
|
|
||||||
*
|
|
||||||
* <p>See the {@link org.apache.lucene.document.Field Field} java docs for more information on
|
|
||||||
* Fields. <a id="Segments"></a>
|
|
||||||
*
|
|
||||||
* <h4>Segments</h4>
|
|
||||||
*
|
|
||||||
* <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>. Each segment is a
|
|
||||||
* fully independent index, which could be searched separately. Indexes evolve by:
|
|
||||||
*
|
|
||||||
* <ol>
|
|
||||||
* <li>Creating new segments for newly added documents.
|
|
||||||
* <li>Merging existing segments.
|
|
||||||
* </ol>
|
|
||||||
*
|
|
||||||
* <p>Searches may involve multiple segments and/or multiple indexes, each index potentially
|
|
||||||
* composed of a set of segments. <a id="Document_Numbers"></a>
|
|
||||||
*
|
|
||||||
* <h4>Document Numbers</h4>
|
|
||||||
*
|
|
||||||
* <p>Internally, Lucene refers to documents by an integer <i>document number</i>. The first
|
|
||||||
* document added to an index is numbered zero, and each subsequent document added gets a number one
|
|
||||||
* greater than the previous.
|
|
||||||
*
|
|
||||||
* <p>Note that a document's number may change, so caution should be taken when storing these
|
|
||||||
* numbers outside of Lucene. In particular, numbers may change in the following situations:
|
|
||||||
*
|
|
||||||
* <ul>
|
|
||||||
* <li>
|
|
||||||
* <p>The numbers stored in each segment are unique only within the segment, and must be
|
|
||||||
* converted before they can be used in a larger context. The standard technique is to
|
|
||||||
* allocate each segment a range of values, based on the range of numbers used in that
|
|
||||||
* segment. To convert a document number from a segment to an external value, the segment's
|
|
||||||
* <i>base</i> document number is added. To convert an external value back to a
|
|
||||||
* segment-specific value, the segment is identified by the range that the external value is
|
|
||||||
* in, and the segment's base value is subtracted. For example two five document segments
|
|
||||||
* might be combined, so that the first segment has a base value of zero, and the second of
|
|
||||||
* five. Document three from the second segment would have an external value of eight.
|
|
||||||
* <li>
|
|
||||||
* <p>When documents are deleted, gaps are created in the numbering. These are eventually
|
|
||||||
* removed as the index evolves through merging. Deleted documents are dropped when segments
|
|
||||||
* are merged. A freshly-merged segment thus has no gaps in its numbering.
|
|
||||||
* </ul>
 *
 * </div> <a id="Overview"></a>
 *
 * <h3>Index Structure Overview</h3>
 *
 * <div>
 *
 * <p>Each segment index maintains the following:
 *
 * <ul>
 *   <li>{@link org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat Segment info}. This
 *       contains metadata about a segment, such as the number of documents, what files it uses, and
 *       information about how the segment is sorted.
 *   <li>{@link org.apache.lucene.codecs.lucene90.Lucene90FieldInfosFormat Field names}. This
 *       contains metadata about the set of named fields used in the index.
 *   <li>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}.
 *       This contains, for each document, a list of attribute-value pairs, where the attributes are
 *       field names. These are used to store auxiliary information about the document, such as its
 *       title, url, or an identifier to access a database. The set of stored fields is what is
 *       returned for each hit when searching. This is keyed by document number.
 *   <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term dictionary}. A
 *       dictionary containing all of the terms used in all of the indexed fields of all of the
 *       documents. The dictionary also contains the number of documents which contain the term, and
 *       pointers to the term's frequency and proximity data.
 *   <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Frequency data}. For
 *       each term in the dictionary, the numbers of all the documents that contain that term, and
 *       the frequency of the term in that document, unless frequencies are omitted ({@link
 *       org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS}).
 *   <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Proximity data}. For
 *       each term in the dictionary, the positions that the term occurs in each document. Note that
 *       this will not exist if all fields in all documents omit position data.
 *   <li>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For
 *       each field in each document, a value is stored that is multiplied into the score for hits
 *       on that field.
 *   <li>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each
 *       field in each document, the term vector (sometimes called document vector) may be stored. A
 *       term vector consists of term text and term frequency. To add Term Vectors to your index see
 *       the {@link org.apache.lucene.document.Field Field} constructors.
 *   <li>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like
 *       stored values, these are also keyed by document number, but are generally intended to be
 *       loaded into main memory for fast access. Whereas stored values are generally intended for
 *       summary results from searches, per-document values are useful for things like scoring
 *       factors.
 *   <li>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An
 *       optional file indicating which documents are live.
 *   <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair
 *       of files, recording dimensionally indexed fields, to enable fast numeric range filtering
 *       and large numeric values like BigInteger and BigDecimal (1D) and geographic shape
 *       intersection (2D, 3D).
 *   <li>{@link org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat Vector values}. The
 *       vector format stores numeric vectors in a format optimized for random access and
 *       computation, supporting high-dimensional nearest-neighbor search.
 * </ul>
 *
 * <p>Details on each of these are provided in their linked pages. </div> <a id="File_Naming"></a>
 *
 * <h3>File Naming</h3>
 *
 * <div>
 *
 * <p>All files belonging to a segment have the same name with varying extensions. The extensions
 * correspond to the different file formats described below. When using the Compound File format
 * (default for small segments) these files (except for the Segment info file, the Lock file, and
 * Deleted documents file) are collapsed into a single .cfs file (see below for details).
 *
 * <p>Typically, all segments in an index are stored in a single directory, although this is not
 * required.
 *
 * <p>File names are never re-used. That is, when any file is saved to the Directory it is given a
 * never before used filename. This is achieved using a simple generations approach. For example,
 * the first segments file is segments_1, then segments_2, etc. The generation is a sequential long
 * integer represented in alpha-numeric (base 36) form.
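 *
 * <p>For example, since the generation is written in base 36, the 36th commit of an index produces
 * a segments file named <code>segments_10</code>.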
 * </div> <a id="file-names"></a>
 *
 * <h3>Summary of File Extensions</h3>
 *
 * <div>
 *
 * <p>The following table summarizes the names and extensions of the files in Lucene:
 *
 * <table class="padding4" style="border-spacing: 1px; border-collapse: separate">
 * <caption>lucene filenames by extension</caption>
 * <tr>
 * <th>Name</th>
 * <th>Extension</th>
 * <th>Brief Description</th>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
 * <td>segments_N</td>
 * <td>Stores information about a commit point</td>
 * </tr>
 * <tr>
 * <td><a href="#Lock_File">Lock File</a></td>
 * <td>write.lock</td>
 * <td>The Write lock prevents multiple IndexWriters from writing to the same
 * file.</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat Segment Info}</td>
 * <td>.si</td>
 * <td>Stores metadata about a segment</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File}</td>
 * <td>.cfs, .cfe</td>
 * <td>An optional "virtual" file consisting of all the other index files for
 * systems that frequently run out of file handles.</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90FieldInfosFormat Fields}</td>
 * <td>.fnm</td>
 * <td>Stores information about the fields</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index}</td>
 * <td>.fdx</td>
 * <td>Contains pointers to field data</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data}</td>
 * <td>.fdt</td>
 * <td>The stored fields for documents</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Dictionary}</td>
 * <td>.tim</td>
 * <td>The term dictionary, stores term info</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Index}</td>
 * <td>.tip</td>
 * <td>The index into the Term Dictionary</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Frequencies}</td>
 * <td>.doc</td>
 * <td>Contains the list of docs which contain each term along with frequency</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Positions}</td>
 * <td>.pos</td>
 * <td>Stores position information about where a term occurs in the index</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Payloads}</td>
 * <td>.pay</td>
 * <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms}</td>
 * <td>.nvd, .nvm</td>
 * <td>Encodes length and boost factors for docs and fields</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values}</td>
 * <td>.dvd, .dvm</td>
 * <td>Encodes additional scoring factors or other per-document information.</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}</td>
 * <td>.tvx</td>
 * <td>Stores offset into the document data file</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}</td>
 * <td>.tvd</td>
 * <td>Contains term vector data.</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents}</td>
 * <td>.liv</td>
 * <td>Info about what documents are live</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}</td>
 * <td>.dii, .dim</td>
 * <td>Holds indexed points</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat Vector values}</td>
 * <td>.vec, .vem</td>
 * <td>Holds indexed vectors; <code>.vec</code> files contain the raw vector data, and
 * <code>.vem</code> the vector metadata</td>
 * </tr>
 * </table>
 *
 * </div> <a id="Lock_File"></a>
 *
 * <h3>Lock File</h3>
 *
 * The write lock, which is stored in the index directory by default, is named "write.lock". If the
 * lock directory is different from the index directory then the write lock will be named
 * "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index
 * directory. When this file is present, a writer is currently modifying the index (adding or
 * removing documents). This lock file ensures that only one writer is modifying the index at a
 * time. <a id="History"></a>
 *
 * <h3>History</h3>
 *
 * <p>Compatibility notes are provided in this document, describing how file formats have changed
 * from prior versions:
 *
 * <ul>
 *   <li>In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit
 *       lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching
 *       or adding/deleting of docs. When the new segments file is saved (committed), it will be
 *       written in the new file format (meaning no specific "upgrade" process is needed). But note
 *       that once a commit has occurred, pre-2.1 Lucene will not be able to read the index.
 *   <li>In version 2.3, the file format was changed to allow segments to share a single set of doc
 *       store (vectors &amp; stored fields) files. This allows for faster indexing in certain
 *       cases. The change is fully backwards compatible (in the same way as the lock-less commits
 *       change in 2.1).
 *   <li>In version 2.4, Strings are now written as true UTF-8 byte sequences, not Java's modified
 *       UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">LUCENE-510</a> for
 *       details.
 *   <li>In version 2.9, an optional opaque Map&lt;String,String&gt; CommitUserData may be passed to
 *       IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N
 *       file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">LUCENE-1382</a> for
 *       details. Also, diagnostics were added to each segment written recording details about why
 *       it was written (due to flush, merge; which OS/JRE was used; etc.). See issue <a
 *       href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.
 *   <li>In version 3.0, compressed fields are no longer written to the index (they can still be
 *       read, but on merge the new segment will write them, uncompressed). See issue <a
 *       href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a> for details.
 *   <li>In version 3.1, segments record the code version that created them. See <a
 *       href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
 *       Additionally segments track explicitly whether or not they have term vectors. See <a
 *       href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a> for details.
 *   <li>In version 3.2, numeric fields are written natively to the stored fields file; previously
 *       they were stored in text format only.
 *   <li>In version 3.4, fields can omit position data while still indexing term frequencies.
 *   <li>In version 4.0, the format of the inverted index became extensible via the {@link
 *       org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues})
 *       was introduced. Normalization factors need no longer be a single byte, they can be any
 *       {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be
 *       unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into
 *       the postings lists. Payloads can be stored in the term vectors.
 *   <li>In version 4.1, the format of the postings list changed to use either FOR compression or
 *       variable-byte encoding, depending upon the frequency of the term. Terms appearing only once
 *       were changed to inline directly into the term dictionary. Stored fields are compressed by
 *       default.
 *   <li>In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued
 *       type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields.
 *   <li>In version 4.5, DocValues were extended to explicitly represent missing values.
 *   <li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
 *       allow updating NumericDocValues fields.
 *   <li>In version 4.8, checksum footers were added to the end of each index file for improved data
 *       integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32
 *       checksum of the file.
 *   <li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is
 *       suitable for faceting/sorting/analytics.
 *   <li>In version 5.4, DocValues have been improved to store more information on disk: addresses
 *       for binary fields and ord indexes for multi-valued fields.
 *   <li>In version 6.0, Points were added, for multi-dimensional range/distance search.
 *   <li>In version 6.2, a new Segment info format was introduced that reads/writes the index sort,
 *       to support index sorting.
 *   <li>In version 7.0, DocValues have been improved to better support sparse doc values thanks to
 *       an iterator API.
 *   <li>In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term
 *       freq, normalization factor) pairs that may trigger the maximum score of the block. This
 *       information is recorded alongside skip data in order to be able to skip blocks of doc ids
 *       if they may not produce high enough scores. Additionally doc values and norms have been
 *       extended with jump-tables to make access O(1) instead of O(n), where n is the number of
 *       elements to skip when advancing in the data.
 *   <li>In version 8.4, postings, positions, offsets and payload lengths have moved to a more
 *       performant encoding that is vectorized.
 *   <li>In version 8.6, index sort serialization is delegated to the sorts themselves, to allow
 *       user-defined sorts to be used.
 *   <li>In version 8.7, stored fields compression became adaptive to better handle documents with
 *       smaller stored fields.
 *   <li>In version 9.0, vector-valued fields were added.
 * </ul>
 *
 * <a id="Limitations"></a>
 *
 * <h3>Limitations</h3>
 *
 * <div>
 *
 * <p>Lucene uses a Java <code>int</code> to refer to document numbers, and the index file format
 * uses an <code>Int32</code> on-disk to store document numbers. This is a limitation of both the
 * index file format and the current implementation. Eventually these should be replaced with
 * either <code>UInt64</code> values, or better yet, {@link
 * org.apache.lucene.store.DataOutput#writeVInt VInt} values which have no limit. </div>
 */
package org.apache.lucene.codecs.lucene90;

@@ -0,0 +1,217 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene91;

import java.util.Objects;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat;
import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
import org.apache.lucene.codecs.lucene90.Lucene90FieldInfosFormat;
import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat;
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;

/**
 * Implements the Lucene 9.1 index format
 *
 * <p>If you want to reuse functionality of this codec in another codec, extend {@link FilterCodec}.
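 *
 * <p>The per-field format methods below can also be overridden directly; as a sketch (the maxConn
 * and beamWidth values here are purely illustrative, not recommendations):
 *
 * <pre class="prettyprint">
 * Codec codec =
 *     new Lucene91Codec() {
 *       &#64;Override
 *       public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
 *         // write every vector field of new segments with a denser, wider-beam graph
 *         return new Lucene91HnswVectorsFormat(32, 200);
 *       }
 *     };
 * </pre>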
 *
 * @see org.apache.lucene.codecs.lucene91 package documentation for file format details.
 * @lucene.experimental
 */
public class Lucene91Codec extends Codec {

  /** Configuration option for the codec. */
  public enum Mode {
    /** Trade compression ratio for retrieval speed. */
    BEST_SPEED(Lucene90StoredFieldsFormat.Mode.BEST_SPEED),
    /** Trade retrieval speed for compression ratio. */
    BEST_COMPRESSION(Lucene90StoredFieldsFormat.Mode.BEST_COMPRESSION);

    private final Lucene90StoredFieldsFormat.Mode storedMode;

    private Mode(Lucene90StoredFieldsFormat.Mode storedMode) {
      this.storedMode = Objects.requireNonNull(storedMode);
    }
  }

  private final TermVectorsFormat vectorsFormat = new Lucene90TermVectorsFormat();
  private final FieldInfosFormat fieldInfosFormat = new Lucene90FieldInfosFormat();
  private final SegmentInfoFormat segmentInfosFormat = new Lucene90SegmentInfoFormat();
  private final LiveDocsFormat liveDocsFormat = new Lucene90LiveDocsFormat();
  private final CompoundFormat compoundFormat = new Lucene90CompoundFormat();
  private final NormsFormat normsFormat = new Lucene90NormsFormat();

  private final PostingsFormat defaultPostingsFormat;
  private final PostingsFormat postingsFormat =
      new PerFieldPostingsFormat() {
        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
          return Lucene91Codec.this.getPostingsFormatForField(field);
        }
      };

  private final DocValuesFormat defaultDVFormat;
  private final DocValuesFormat docValuesFormat =
      new PerFieldDocValuesFormat() {
        @Override
        public DocValuesFormat getDocValuesFormatForField(String field) {
          return Lucene91Codec.this.getDocValuesFormatForField(field);
        }
      };

  private final KnnVectorsFormat defaultKnnVectorsFormat;
  private final KnnVectorsFormat knnVectorsFormat =
      new PerFieldKnnVectorsFormat() {
        @Override
        public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
          return Lucene91Codec.this.getKnnVectorsFormatForField(field);
        }
      };

  private final StoredFieldsFormat storedFieldsFormat;

  /** Instantiates a new codec. */
  public Lucene91Codec() {
    this(Mode.BEST_SPEED);
  }

  /**
   * Instantiates a new codec, specifying the stored fields compression mode to use.
   *
   * @param mode stored fields compression mode to use for newly flushed/merged segments.
   */
  public Lucene91Codec(Mode mode) {
    super("Lucene91");
    this.storedFieldsFormat =
        new Lucene90StoredFieldsFormat(Objects.requireNonNull(mode).storedMode);
    this.defaultPostingsFormat = new Lucene90PostingsFormat();
    this.defaultDVFormat = new Lucene90DocValuesFormat();
    this.defaultKnnVectorsFormat = new Lucene91HnswVectorsFormat();
  }

  @Override
  public final StoredFieldsFormat storedFieldsFormat() {
    return storedFieldsFormat;
  }

  @Override
  public final TermVectorsFormat termVectorsFormat() {
    return vectorsFormat;
  }

  @Override
  public final PostingsFormat postingsFormat() {
    return postingsFormat;
  }

  @Override
  public final FieldInfosFormat fieldInfosFormat() {
    return fieldInfosFormat;
  }

  @Override
  public final SegmentInfoFormat segmentInfoFormat() {
    return segmentInfosFormat;
  }

  @Override
  public final LiveDocsFormat liveDocsFormat() {
    return liveDocsFormat;
  }

  @Override
  public final CompoundFormat compoundFormat() {
    return compoundFormat;
  }

  @Override
  public final PointsFormat pointsFormat() {
    return new Lucene90PointsFormat();
  }

  @Override
  public final KnnVectorsFormat knnVectorsFormat() {
    return knnVectorsFormat;
  }

  /**
   * Returns the postings format that should be used for writing new segments of <code>field</code>.
   *
   * <p>The default implementation always returns "Lucene90".
   *
   * <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility:
   * future versions of Lucene are only guaranteed to be able to read the default implementation.
   */
  public PostingsFormat getPostingsFormatForField(String field) {
    return defaultPostingsFormat;
  }

  /**
   * Returns the docvalues format that should be used for writing new segments of
   * <code>field</code>.
   *
   * <p>The default implementation always returns "Lucene90".
   *
   * <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility:
   * future versions of Lucene are only guaranteed to be able to read the default implementation.
   */
  public DocValuesFormat getDocValuesFormatForField(String field) {
    return defaultDVFormat;
  }

  /**
   * Returns the vectors format that should be used for writing new segments of <code>field</code>.
   *
   * <p>The default implementation always returns "Lucene91".
   *
   * <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility:
   * future versions of Lucene are only guaranteed to be able to read the default implementation.
   */
  public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
    return defaultKnnVectorsFormat;
  }

  @Override
  public final DocValuesFormat docValuesFormat() {
    return docValuesFormat;
  }

  @Override
  public final NormsFormat normsFormat() {
    return normsFormat;
  }
}

@@ -0,0 +1,143 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.codecs.lucene91;

import java.io.IOException;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.KnnVectorsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.hnsw.HnswGraph;

/**
 * Lucene 9.1 vector format, which encodes numeric vector values and an optional associated graph
 * connecting the documents having values. The graph is used to power HNSW search. The format
 * consists of three files:
 *
 * <h2>.vec (vector data) file</h2>
 *
 * <p>This file stores all the floating-point vector data ordered by field, document ordinal, and
 * vector dimension. The floats are stored in little-endian byte order.
 *
 * <h2>.vex (vector index)</h2>
 *
 * <p>Stores graphs connecting the documents for each field, organized as a list of each node's
 * neighbours as follows:
 *
 * <ul>
 *   <li>For each level:
 *       <ul>
 *         <li>For each node:
 *             <ul>
 *               <li><b>[int32]</b> the number of neighbor nodes
 *               <li><b>array[int32]</b> the neighbor ordinals
 *               <li><b>array[int32]</b> padding of empty integers if the number of neighbors is
 *                   less than the maximum number of connections (maxConn). The padding is equal to
 *                   ((maxConn - number of neighbours) * 4) bytes.
 *             </ul>
 *       </ul>
 * </ul>
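 *
 * <p>Note that this layout gives every node a fixed-size slot of (1 + maxConn) * 4 bytes on its
 * level, so a node's neighbour list can be located with plain offset arithmetic instead of a scan
 * (see the reader's seek implementation).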
 *
 * <h2>.vem (vector metadata) file</h2>
 *
 * <p>For each field:
 *
 * <ul>
 *   <li><b>[int32]</b> field number
 *   <li><b>[int32]</b> vector similarity function ordinal
 *   <li><b>[vlong]</b> offset to this field's vectors in the .vec file
 *   <li><b>[vlong]</b> length of this field's vectors, in bytes
 *   <li><b>[vlong]</b> offset to this field's index in the .vex file
 *   <li><b>[vlong]</b> length of this field's index data, in bytes
 *   <li><b>[int]</b> dimension of this field's vectors
 *   <li><b>[int]</b> the number of documents having values for this field
 *   <li><b>array[vint]</b> the docids of documents having vectors, in order
 *   <li><b>[int]</b> the maximum number of connections (neighbours) that each node can have
 *   <li><b>[int]</b> number of levels in the graph
 *   <li>Graph nodes by level. For each level:
 *       <ul>
 *         <li><b>[int]</b> the number of nodes on this level
 *         <li><b>array[vint]</b> for levels greater than 0, the list of nodes on this level,
 *             stored as level 0 node ordinals
 *       </ul>
 * </ul>
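 *
 * <p>The search entry point is not stored explicitly: the reader derives it as the first node of
 * the highest level, or ordinal 0 when the graph has a single level.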
 *
 * @lucene.experimental
 */
public final class Lucene91HnswVectorsFormat extends KnnVectorsFormat {

  static final String META_CODEC_NAME = "Lucene91HnswVectorsFormatMeta";
  static final String VECTOR_DATA_CODEC_NAME = "Lucene91HnswVectorsFormatData";
  static final String VECTOR_INDEX_CODEC_NAME = "Lucene91HnswVectorsFormatIndex";
  static final String META_EXTENSION = "vem";
  static final String VECTOR_DATA_EXTENSION = "vec";
  static final String VECTOR_INDEX_EXTENSION = "vex";

  static final int VERSION_START = 0;
  static final int VERSION_CURRENT = VERSION_START;

  /** Default number of maximum connections per node */
  public static final int DEFAULT_MAX_CONN = 16;
  /** Default size of the queue maintained while searching during graph construction. */
  public static final int DEFAULT_BEAM_WIDTH = 100;

  /**
   * Controls how many of the nearest neighbor candidates are connected to the new node. Defaults to
   * {@link Lucene91HnswVectorsFormat#DEFAULT_MAX_CONN}. See {@link HnswGraph} for more details.
   */
  private final int maxConn;

  /**
   * The number of candidate neighbors to track while searching the graph for each newly inserted
   * node. Defaults to {@link Lucene91HnswVectorsFormat#DEFAULT_BEAM_WIDTH}. See {@link
   * HnswGraph} for details.
   */
  private final int beamWidth;

  public Lucene91HnswVectorsFormat() {
    this(DEFAULT_MAX_CONN, DEFAULT_BEAM_WIDTH);
  }

  public Lucene91HnswVectorsFormat(int maxConn, int beamWidth) {
    super("Lucene91HnswVectorsFormat");
    this.maxConn = maxConn;
    this.beamWidth = beamWidth;
  }

  @Override
  public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException {
    return new Lucene91HnswVectorsWriter(state, maxConn, beamWidth);
  }

  @Override
  public KnnVectorsReader fieldsReader(SegmentReadState state) throws IOException {
    return new Lucene91HnswVectorsReader(state);
  }

  @Override
  public String toString() {
    return "Lucene91HnswVectorsFormat(name = Lucene91HnswVectorsFormat, maxConn = "
        + maxConn
        + ", beamWidth="
        + beamWidth
        + ")";
  }
}

@@ -0,0 +1,554 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.codecs.lucene91;

import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.KnnGraphValues;
import org.apache.lucene.index.RandomAccessVectorValues;
import org.apache.lucene.index.RandomAccessVectorValuesProducer;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.hnsw.HnswGraph;
import org.apache.lucene.util.hnsw.NeighborQueue;

/**
 * Reads vectors from the index segments along with index data structures supporting KNN search.
 *
 * @lucene.experimental
 */
public final class Lucene91HnswVectorsReader extends KnnVectorsReader {

  private final FieldInfos fieldInfos;
  private final Map<String, FieldEntry> fields = new HashMap<>();
  private final IndexInput vectorData;
  private final IndexInput vectorIndex;

  Lucene91HnswVectorsReader(SegmentReadState state) throws IOException {
    this.fieldInfos = state.fieldInfos;
    int versionMeta = readMetadata(state);
    boolean success = false;
    try {
      vectorData =
          openDataInput(
              state,
              versionMeta,
              Lucene91HnswVectorsFormat.VECTOR_DATA_EXTENSION,
              Lucene91HnswVectorsFormat.VECTOR_DATA_CODEC_NAME);
      vectorIndex =
          openDataInput(
              state,
              versionMeta,
              Lucene91HnswVectorsFormat.VECTOR_INDEX_EXTENSION,
              Lucene91HnswVectorsFormat.VECTOR_INDEX_CODEC_NAME);
      success = true;
    } finally {
      if (success == false) {
        IOUtils.closeWhileHandlingException(this);
      }
    }
  }

  private int readMetadata(SegmentReadState state) throws IOException {
    String metaFileName =
        IndexFileNames.segmentFileName(
            state.segmentInfo.name, state.segmentSuffix, Lucene91HnswVectorsFormat.META_EXTENSION);
    int versionMeta = -1;
    try (ChecksumIndexInput meta = state.directory.openChecksumInput(metaFileName, state.context)) {
      Throwable priorE = null;
      try {
        versionMeta =
            CodecUtil.checkIndexHeader(
                meta,
                Lucene91HnswVectorsFormat.META_CODEC_NAME,
                Lucene91HnswVectorsFormat.VERSION_START,
                Lucene91HnswVectorsFormat.VERSION_CURRENT,
                state.segmentInfo.getId(),
                state.segmentSuffix);
        readFields(meta, state.fieldInfos);
      } catch (Throwable exception) {
        priorE = exception;
      } finally {
        CodecUtil.checkFooter(meta, priorE);
      }
    }
    return versionMeta;
  }

  private static IndexInput openDataInput(
      SegmentReadState state, int versionMeta, String fileExtension, String codecName)
      throws IOException {
    String fileName =
        IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, fileExtension);
    IndexInput in = state.directory.openInput(fileName, state.context);
    int versionVectorData =
        CodecUtil.checkIndexHeader(
            in,
            codecName,
            Lucene91HnswVectorsFormat.VERSION_START,
            Lucene91HnswVectorsFormat.VERSION_CURRENT,
            state.segmentInfo.getId(),
            state.segmentSuffix);
    if (versionMeta != versionVectorData) {
      throw new CorruptIndexException(
          "Format versions mismatch: meta="
              + versionMeta
              + ", "
              + codecName
              + "="
              + versionVectorData,
          in);
    }
    CodecUtil.retrieveChecksum(in);
    return in;
  }

  private void readFields(ChecksumIndexInput meta, FieldInfos infos) throws IOException {
    for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) {
      FieldInfo info = infos.fieldInfo(fieldNumber);
      if (info == null) {
        throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta);
      }
      FieldEntry fieldEntry = readField(meta);
      validateFieldEntry(info, fieldEntry);
      fields.put(info.name, fieldEntry);
    }
  }

  private void validateFieldEntry(FieldInfo info, FieldEntry fieldEntry) {
    int dimension = info.getVectorDimension();
    if (dimension != fieldEntry.dimension) {
      throw new IllegalStateException(
          "Inconsistent vector dimension for field=\""
              + info.name
              + "\"; "
              + dimension
              + " != "
              + fieldEntry.dimension);
    }

    long numBytes = (long) fieldEntry.size() * dimension * Float.BYTES;
    if (numBytes != fieldEntry.vectorDataLength) {
      throw new IllegalStateException(
          "Vector data length "
              + fieldEntry.vectorDataLength
              + " not matching size="
              + fieldEntry.size()
              + " * dim="
              + dimension
              + " * 4 = "
              + numBytes);
    }
  }

  private VectorSimilarityFunction readSimilarityFunction(DataInput input) throws IOException {
    int similarityFunctionId = input.readInt();
    if (similarityFunctionId < 0
        || similarityFunctionId >= VectorSimilarityFunction.values().length) {
      throw new CorruptIndexException(
          "Invalid similarity function id: " + similarityFunctionId, input);
    }
    return VectorSimilarityFunction.values()[similarityFunctionId];
  }

  private FieldEntry readField(DataInput input) throws IOException {
    VectorSimilarityFunction similarityFunction = readSimilarityFunction(input);
    return new FieldEntry(input, similarityFunction);
  }

  @Override
  public long ramBytesUsed() {
    long totalBytes = RamUsageEstimator.shallowSizeOfInstance(Lucene91HnswVectorsFormat.class);
    totalBytes +=
        RamUsageEstimator.sizeOfMap(
            fields, RamUsageEstimator.shallowSizeOfInstance(FieldEntry.class));
    for (FieldEntry entry : fields.values()) {
      totalBytes += RamUsageEstimator.sizeOf(entry.ordToDoc);
    }
    return totalBytes;
  }

  @Override
  public void checkIntegrity() throws IOException {
    CodecUtil.checksumEntireFile(vectorData);
    CodecUtil.checksumEntireFile(vectorIndex);
  }

  @Override
  public VectorValues getVectorValues(String field) throws IOException {
    FieldEntry fieldEntry = fields.get(field);
    return getOffHeapVectorValues(fieldEntry);
  }

  @Override
  public TopDocs search(String field, float[] target, int k, Bits acceptDocs) throws IOException {
    FieldEntry fieldEntry = fields.get(field);

    if (fieldEntry.size() == 0) {
      return new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[0]);
    }

    // bound k by total number of vectors to prevent oversizing data structures
    k = Math.min(k, fieldEntry.size());

    OffHeapVectorValues vectorValues = getOffHeapVectorValues(fieldEntry);
    // use a seed that is fixed for the index so we get reproducible results for the same query
    NeighborQueue results =
        HnswGraph.search(
            target,
            k,
            vectorValues,
            fieldEntry.similarityFunction,
            getGraphValues(fieldEntry),
            getAcceptOrds(acceptDocs, fieldEntry));
    int i = 0;
    ScoreDoc[] scoreDocs = new ScoreDoc[Math.min(results.size(), k)];
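    // the queue pops in order of increasing score, so fill the result array from the back
    // to return hits in descending score order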
    while (results.size() > 0) {
      int node = results.topNode();
      float score = fieldEntry.similarityFunction.convertToScore(results.topScore());
      results.pop();
      scoreDocs[scoreDocs.length - ++i] = new ScoreDoc(fieldEntry.ordToDoc[node], score);
    }
    // always return GREATER_THAN_OR_EQUAL_TO; we could only assert EQUAL_TO when there are fewer
    // than topK vectors in the index
    return new TopDocs(
        new TotalHits(results.visitedCount(), TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO),
        scoreDocs);
  }

  private OffHeapVectorValues getOffHeapVectorValues(FieldEntry fieldEntry) throws IOException {
    IndexInput bytesSlice =
        vectorData.slice("vector-data", fieldEntry.vectorDataOffset, fieldEntry.vectorDataLength);
    return new OffHeapVectorValues(fieldEntry.dimension, fieldEntry.ordToDoc, bytesSlice);
  }

  private Bits getAcceptOrds(Bits acceptDocs, FieldEntry fieldEntry) {
    if (acceptDocs == null) {
      return null;
    }
    return new Bits() {
      @Override
      public boolean get(int index) {
        return acceptDocs.get(fieldEntry.ordToDoc[index]);
      }

      @Override
      public int length() {
        return fieldEntry.ordToDoc.length;
      }
    };
  }

  /** Get knn graph values; used for testing */
  public KnnGraphValues getGraphValues(String field) throws IOException {
    FieldInfo info = fieldInfos.fieldInfo(field);
    if (info == null) {
      throw new IllegalArgumentException("No such field '" + field + "'");
    }
    FieldEntry entry = fields.get(field);
    if (entry != null && entry.vectorIndexLength > 0) {
      return getGraphValues(entry);
    } else {
      return KnnGraphValues.EMPTY;
    }
  }

  private KnnGraphValues getGraphValues(FieldEntry entry) throws IOException {
    IndexInput bytesSlice =
        vectorIndex.slice("graph-data", entry.vectorIndexOffset, entry.vectorIndexLength);
    return new IndexedKnnGraphReader(entry, bytesSlice);
  }

  @Override
  public void close() throws IOException {
    IOUtils.close(vectorData, vectorIndex);
  }

  private static class FieldEntry {

    final VectorSimilarityFunction similarityFunction;
    final long vectorDataOffset;
    final long vectorDataLength;
    final long vectorIndexOffset;
    final long vectorIndexLength;
    final int maxConn;
    final int numLevels;
    final int dimension;
    final int[] ordToDoc;
    final int[][] nodesByLevel;
    // for each level the start offsets in vectorIndex file from where to read neighbours
    final long[] graphOffsetsByLevel;

    FieldEntry(DataInput input, VectorSimilarityFunction similarityFunction) throws IOException {
      this.similarityFunction = similarityFunction;
      vectorDataOffset = input.readVLong();
      vectorDataLength = input.readVLong();
      vectorIndexOffset = input.readVLong();
      vectorIndexLength = input.readVLong();
      dimension = input.readInt();
      int size = input.readInt();
      ordToDoc = new int[size];
      for (int i = 0; i < size; i++) {
        int doc = input.readVInt();
        ordToDoc[i] = doc;
      }

      // read nodes by level
      maxConn = input.readInt();
      numLevels = input.readInt();
      nodesByLevel = new int[numLevels][];
      for (int level = 0; level < numLevels; level++) {
        int numNodesOnLevel = input.readInt();
        if (level == 0) {
          // we don't store nodes for level 0, as this level contains all nodes
          assert numNodesOnLevel == size;
          nodesByLevel[0] = null;
        } else {
          nodesByLevel[level] = new int[numNodesOnLevel];
          for (int i = 0; i < numNodesOnLevel; i++) {
            nodesByLevel[level][i] = input.readVInt();
          }
        }
      }

      // calculate for each level the start offsets in vectorIndex file from where to read
      // neighbours
      graphOffsetsByLevel = new long[numLevels];
      for (int level = 0; level < numLevels; level++) {
        if (level == 0) {
          graphOffsetsByLevel[level] = 0;
        } else {
          int numNodesOnPrevLevel = level == 1 ? size : nodesByLevel[level - 1].length;
          graphOffsetsByLevel[level] =
              graphOffsetsByLevel[level - 1] + (1 + maxConn) * Integer.BYTES * numNodesOnPrevLevel;
        }
      }
    }
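
    // Worked example (illustrative numbers, not from the format spec): with maxConn = 16 each
    // node's slot is (1 + 16) * 4 = 68 bytes; if level 0 holds 100 nodes and level 1 holds 10,
    // level 1 data starts at offset 100 * 68 = 6800 and level 2 would start at 6800 + 10 * 68.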

    int size() {
      return ordToDoc.length;
    }
  }

  /** Read the vector values from the index input. This supports both iterated and random access. */
  static class OffHeapVectorValues extends VectorValues
      implements RandomAccessVectorValues, RandomAccessVectorValuesProducer {

    final int dimension;
    final int[] ordToDoc;
    final IndexInput dataIn;

    final BytesRef binaryValue;
    final ByteBuffer byteBuffer;
    final int byteSize;
    final float[] value;

    int ord = -1;
    int doc = -1;

    OffHeapVectorValues(int dimension, int[] ordToDoc, IndexInput dataIn) {
      this.dimension = dimension;
      this.ordToDoc = ordToDoc;
      this.dataIn = dataIn;
      byteSize = Float.BYTES * dimension;
      byteBuffer = ByteBuffer.allocate(byteSize);
      value = new float[dimension];
      binaryValue = new BytesRef(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize);
    }

    @Override
    public int dimension() {
      return dimension;
    }

    @Override
    public int size() {
      return ordToDoc.length;
    }

    @Override
    public float[] vectorValue() throws IOException {
      dataIn.seek((long) ord * byteSize);
      dataIn.readFloats(value, 0, value.length);
      return value;
    }

    @Override
    public BytesRef binaryValue() throws IOException {
      dataIn.seek((long) ord * byteSize);
      dataIn.readBytes(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize, false);
      return binaryValue;
    }

    @Override
    public int docID() {
      return doc;
    }

    @Override
    public int nextDoc() {
      if (++ord >= size()) {
        doc = NO_MORE_DOCS;
      } else {
        doc = ordToDoc[ord];
      }
      return doc;
    }

    @Override
    public int advance(int target) {
      assert docID() < target;
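      // binary search the ord-to-doc map for the first ord whose doc is >= target; a negative
      // result encodes the insertion point as -(insertionPoint) - 1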
      ord = Arrays.binarySearch(ordToDoc, ord + 1, ordToDoc.length, target);
      if (ord < 0) {
        ord = -(ord + 1);
      }
      assert ord <= ordToDoc.length;
      if (ord == ordToDoc.length) {
        doc = NO_MORE_DOCS;
      } else {
        doc = ordToDoc[ord];
      }
      return doc;
    }

    @Override
    public long cost() {
      return ordToDoc.length;
    }

    @Override
    public RandomAccessVectorValues randomAccess() {
      return new OffHeapVectorValues(dimension, ordToDoc, dataIn.clone());
    }

    @Override
    public float[] vectorValue(int targetOrd) throws IOException {
      dataIn.seek((long) targetOrd * byteSize);
      dataIn.readFloats(value, 0, value.length);
      return value;
    }

    @Override
    public BytesRef binaryValue(int targetOrd) throws IOException {
      readValue(targetOrd);
      return binaryValue;
    }

    private void readValue(int targetOrd) throws IOException {
      dataIn.seek((long) targetOrd * byteSize);
      dataIn.readBytes(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize);
    }
  }

  /** Read the nearest-neighbors graph from the index input */
  private static final class IndexedKnnGraphReader extends KnnGraphValues {

    final IndexInput dataIn;
    final int[][] nodesByLevel;
    final long[] graphOffsetsByLevel;
    final int numLevels;
    final int entryNode;
    final int size;
    final long bytesForConns;

    int arcCount;
    int arcUpTo;
    int arc;

    IndexedKnnGraphReader(FieldEntry entry, IndexInput dataIn) {
      this.dataIn = dataIn;
      this.nodesByLevel = entry.nodesByLevel;
      this.numLevels = entry.numLevels;
      this.entryNode = numLevels > 1 ? nodesByLevel[numLevels - 1][0] : 0;
      this.size = entry.size();
      this.graphOffsetsByLevel = entry.graphOffsetsByLevel;
      this.bytesForConns = ((long) entry.maxConn + 1) * Integer.BYTES;
    }

    @Override
    public void seek(int level, int targetOrd) throws IOException {
      int targetIndex =
          level == 0
              ? targetOrd
              : Arrays.binarySearch(nodesByLevel[level], 0, nodesByLevel[level].length, targetOrd);
      assert targetIndex >= 0;
      long graphDataOffset = graphOffsetsByLevel[level] + targetIndex * bytesForConns;
      // unsafe; no bounds checking
      dataIn.seek(graphDataOffset);
      arcCount = dataIn.readInt();
      arc = -1;
      arcUpTo = 0;
    }

    @Override
    public int size() {
      return size;
    }

    @Override
    public int nextNeighbor() throws IOException {
      if (arcUpTo >= arcCount) {
        return NO_MORE_DOCS;
      }
      ++arcUpTo;
      arc = dataIn.readInt();
      return arc;
    }

    @Override
    public int numLevels() throws IOException {
      return numLevels;
    }

    @Override
    public int entryNode() throws IOException {
      return entryNode;
    }

    @Override
    public NodesIterator getNodesOnLevel(int level) {
      if (level == 0) {
        return new NodesIterator(size());
      } else {
        return new NodesIterator(nodesByLevel[level], nodesByLevel[level].length);
      }
    }
  }
}

@@ -0,0 +1,295 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.codecs.lucene91;

import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;

import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.KnnVectorsWriter;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.KnnGraphValues.NodesIterator;
import org.apache.lucene.index.RandomAccessVectorValuesProducer;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.hnsw.HnswGraph;
import org.apache.lucene.util.hnsw.HnswGraphBuilder;
import org.apache.lucene.util.hnsw.NeighborArray;

/**
 * Writes vector values and knn graphs to index segments.
 *
 * @lucene.experimental
 */
public final class Lucene91HnswVectorsWriter extends KnnVectorsWriter {

  private final SegmentWriteState segmentWriteState;
  private final IndexOutput meta, vectorData, vectorIndex;

  private final int maxConn;
  private final int beamWidth;
  private boolean finished;

  Lucene91HnswVectorsWriter(SegmentWriteState state, int maxConn, int beamWidth)
      throws IOException {
    this.maxConn = maxConn;
    this.beamWidth = beamWidth;

    assert state.fieldInfos.hasVectorValues();
    segmentWriteState = state;

    String metaFileName =
        IndexFileNames.segmentFileName(
            state.segmentInfo.name, state.segmentSuffix, Lucene91HnswVectorsFormat.META_EXTENSION);

    String vectorDataFileName =
        IndexFileNames.segmentFileName(
            state.segmentInfo.name,
            state.segmentSuffix,
            Lucene91HnswVectorsFormat.VECTOR_DATA_EXTENSION);

    String indexDataFileName =
        IndexFileNames.segmentFileName(
            state.segmentInfo.name,
            state.segmentSuffix,
            Lucene91HnswVectorsFormat.VECTOR_INDEX_EXTENSION);

    boolean success = false;
    try {
      meta = state.directory.createOutput(metaFileName, state.context);
      vectorData = state.directory.createOutput(vectorDataFileName, state.context);
      vectorIndex = state.directory.createOutput(indexDataFileName, state.context);

      CodecUtil.writeIndexHeader(
          meta,
          Lucene91HnswVectorsFormat.META_CODEC_NAME,
          Lucene91HnswVectorsFormat.VERSION_CURRENT,
          state.segmentInfo.getId(),
          state.segmentSuffix);
      CodecUtil.writeIndexHeader(
          vectorData,
          Lucene91HnswVectorsFormat.VECTOR_DATA_CODEC_NAME,
          Lucene91HnswVectorsFormat.VERSION_CURRENT,
          state.segmentInfo.getId(),
          state.segmentSuffix);
      CodecUtil.writeIndexHeader(
          vectorIndex,
          Lucene91HnswVectorsFormat.VECTOR_INDEX_CODEC_NAME,
          Lucene91HnswVectorsFormat.VERSION_CURRENT,
          state.segmentInfo.getId(),
          state.segmentSuffix);
      success = true;
    } finally {
      if (success == false) {
        IOUtils.closeWhileHandlingException(this);
      }
    }
  }
|
||||||
|
|
||||||
|
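The maxConn and beamWidth knobs come in through the format's constructor. A hedged sketch of how an application might route a vector field to this writer with non-default parameters — this assumes the 9.1 codec exposes a per-field KnnVectorsFormat hook as its sibling codecs do, which is not shown in this hunk:

```java
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.lucene91.Lucene91Codec;
import org.apache.lucene.codecs.lucene91.Lucene91HnswVectorsFormat;
import org.apache.lucene.index.IndexWriterConfig;

final class HnswConfigSketch {
  /** Route every vector field to the 9.1 HNSW format with custom maxConn/beamWidth. */
  static IndexWriterConfig withHnswParams(IndexWriterConfig config, int maxConn, int beamWidth) {
    config.setCodec(
        new Lucene91Codec() {
          @Override
          public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
            // These values flow into Lucene91HnswVectorsWriter's constructor above.
            return new Lucene91HnswVectorsFormat(maxConn, beamWidth);
          }
        });
    return config;
  }
}
```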
  @Override
  public void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader)
      throws IOException {
    long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
    VectorValues vectors = knnVectorsReader.getVectorValues(fieldInfo.name);

    IndexOutput tempVectorData =
        segmentWriteState.directory.createTempOutput(
            vectorData.getName(), "temp", segmentWriteState.context);
    IndexInput vectorDataInput = null;
    boolean success = false;
    try {
      // write the vector data to a temporary file
      // TODO - use a better data structure; a bitset? DocsWithFieldSet is package-private
      // in org.apache.lucene.index
      int[] docIds = writeVectorData(tempVectorData, vectors);
      CodecUtil.writeFooter(tempVectorData);
      IOUtils.close(tempVectorData);

      // copy the temporary file vectors to the actual data file
      vectorDataInput =
          segmentWriteState.directory.openInput(
              tempVectorData.getName(), segmentWriteState.context);
      vectorData.copyBytes(vectorDataInput, vectorDataInput.length() - CodecUtil.footerLength());
      CodecUtil.retrieveChecksum(vectorDataInput);
      long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;

      long vectorIndexOffset = vectorIndex.getFilePointer();
      // build the graph using the temporary vector data
      Lucene91HnswVectorsReader.OffHeapVectorValues offHeapVectors =
          new Lucene91HnswVectorsReader.OffHeapVectorValues(
              vectors.dimension(), docIds, vectorDataInput);
      HnswGraph graph =
          offHeapVectors.size() == 0
              ? null
              : writeGraph(offHeapVectors, fieldInfo.getVectorSimilarityFunction());
      long vectorIndexLength = vectorIndex.getFilePointer() - vectorIndexOffset;
      writeMeta(
          fieldInfo,
          vectorDataOffset,
          vectorDataLength,
          vectorIndexOffset,
          vectorIndexLength,
          docIds,
          graph);
      success = true;
    } finally {
      IOUtils.close(vectorDataInput);
      if (success) {
        segmentWriteState.directory.deleteFile(tempVectorData.getName());
      } else {
        IOUtils.closeWhileHandlingException(tempVectorData);
        IOUtils.deleteFilesIgnoringExceptions(
            segmentWriteState.directory, tempVectorData.getName());
      }
    }
  }
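The temp-file dance in writeField exists because the incoming vectors arrive as a forward-only iterator, while the graph builder needs random access to them. A minimal, self-contained sketch of that spill-then-reopen pattern (the class and method names here are mine, not the patch's):

```java
import java.io.IOException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;

final class SpillToTemp {
  /** Spill forward-only data to a temp file, then reopen it for random access. */
  static IndexInput spillAndReopen(Directory dir, byte[] data) throws IOException {
    IndexOutput tmp = dir.createTempOutput("vectors", "temp", IOContext.DEFAULT);
    String name = tmp.getName();
    tmp.writeBytes(data, data.length); // stands in for writing all vectors sequentially
    tmp.close();
    return dir.openInput(name, IOContext.DEFAULT); // random-access view of what was written
  }
}
```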
  /**
   * Writes the vector values to the output and returns a mapping from dense ordinals to document
   * IDs. The length of the returned array matches the total number of documents with a vector
   * (which excludes deleted documents), so it may be less than {@link VectorValues#size()}.
   */
  private static int[] writeVectorData(IndexOutput output, VectorValues vectors)
      throws IOException {
    int[] docIds = new int[vectors.size()];
    int count = 0;
    for (int docV = vectors.nextDoc(); docV != NO_MORE_DOCS; docV = vectors.nextDoc(), count++) {
      // write vector
      BytesRef binaryValue = vectors.binaryValue();
      assert binaryValue.length == vectors.dimension() * Float.BYTES;
      output.writeBytes(binaryValue.bytes, binaryValue.offset, binaryValue.length);
      docIds[count] = docV;
    }

    if (docIds.length > count) {
      return ArrayUtil.copyOfSubArray(docIds, 0, count);
    }
    return docIds;
  }
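Because writeVectorData emits fixed-width vectors densely, ordered by ordinal, the i-th vector can later be addressed with pure arithmetic, and the returned docIds array translates a dense ordinal back to its document. A sketch of that addressing, under illustrative names:

```java
/** Sketch: addressing the i-th vector in the raw vector data (names are illustrative). */
final class VectorSlices {
  /** Vectors are written back to back, dimension * Float.BYTES bytes each. */
  static long vectorOffset(long vectorDataOffset, int ordinal, int dimension) {
    return vectorDataOffset + (long) ordinal * dimension * Float.BYTES;
  }
}
```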
  private void writeMeta(
      FieldInfo field,
      long vectorDataOffset,
      long vectorDataLength,
      long vectorIndexOffset,
      long vectorIndexLength,
      int[] docIds,
      HnswGraph graph)
      throws IOException {
    meta.writeInt(field.number);
    meta.writeInt(field.getVectorSimilarityFunction().ordinal());
    meta.writeVLong(vectorDataOffset);
    meta.writeVLong(vectorDataLength);
    meta.writeVLong(vectorIndexOffset);
    meta.writeVLong(vectorIndexLength);
    meta.writeInt(field.getVectorDimension());
    meta.writeInt(docIds.length);
    for (int docId : docIds) {
      // TODO: delta-encode, or write as bitset
      meta.writeVInt(docId);
    }

    meta.writeInt(maxConn);

    // write graph nodes on each level
    if (graph == null) {
      meta.writeInt(0);
    } else {
      meta.writeInt(graph.numLevels());
      for (int level = 0; level < graph.numLevels(); level++) {
        NodesIterator nodesOnLevel = graph.getNodesOnLevel(level);
        meta.writeInt(nodesOnLevel.size()); // number of nodes on a level
        if (level > 0) {
          while (nodesOnLevel.hasNext()) {
            int node = nodesOnLevel.nextInt();
            meta.writeVInt(node); // list of nodes on a level
          }
        }
      }
    }
  }
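To make the on-disk layout concrete, here is a hedged reader-side sketch that consumes one field's entry in exactly the order writeMeta produces it. The real parsing lives in Lucene91HnswVectorsReader; this just restates the layout, and the helper class is mine:

```java
import java.io.IOException;
import org.apache.lucene.store.IndexInput;

final class MetaSketch {
  /** Parse one field's meta entry, mirroring writeMeta's field order exactly. */
  static void readOneFieldEntry(IndexInput meta) throws IOException {
    int fieldNumber = meta.readInt();     // -1 marks the end-of-fields sentinel (see finish())
    int similarity = meta.readInt();      // VectorSimilarityFunction ordinal
    long vectorDataOffset = meta.readVLong();
    long vectorDataLength = meta.readVLong();
    long vectorIndexOffset = meta.readVLong();
    long vectorIndexLength = meta.readVLong();
    int dimension = meta.readInt();
    int size = meta.readInt();            // number of documents with a vector
    int[] docIds = new int[size];
    for (int i = 0; i < size; i++) {
      docIds[i] = meta.readVInt();        // dense ordinal -> docId mapping
    }
    int maxConn = meta.readInt();
    int numLevels = meta.readInt();       // 0 when no graph was written
    for (int level = 0; level < numLevels; level++) {
      int nodesOnLevel = meta.readInt();
      if (level > 0) {                    // level 0 holds all nodes, so its ids are implicit
        for (int i = 0; i < nodesOnLevel; i++) {
          meta.readVInt();                // node id, expressed as a level 0 ordinal
        }
      }
    }
  }
}
```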
  private HnswGraph writeGraph(
      RandomAccessVectorValuesProducer vectorValues, VectorSimilarityFunction similarityFunction)
      throws IOException {

    // build graph
    HnswGraphBuilder hnswGraphBuilder =
        new HnswGraphBuilder(
            vectorValues, similarityFunction, maxConn, beamWidth, HnswGraphBuilder.randSeed);
    hnswGraphBuilder.setInfoStream(segmentWriteState.infoStream);
    HnswGraph graph = hnswGraphBuilder.build(vectorValues.randomAccess());

    // write vectors' neighbours on each level into the vectorIndex file
    int countOnLevel0 = graph.size();
    for (int level = 0; level < graph.numLevels(); level++) {
      NodesIterator nodesOnLevel = graph.getNodesOnLevel(level);
      while (nodesOnLevel.hasNext()) {
        int node = nodesOnLevel.nextInt();
        NeighborArray neighbors = graph.getNeighbors(level, node);
        int size = neighbors.size();
        vectorIndex.writeInt(size);
        // Destructively modify; it's ok, we are discarding it after this
        int[] nnodes = neighbors.node();
        Arrays.sort(nnodes, 0, size);
        for (int i = 0; i < size; i++) {
          int nnode = nnodes[i];
          assert nnode < countOnLevel0 : "node too large: " + nnode + ">=" + countOnLevel0;
          vectorIndex.writeInt(nnode);
        }
        // if number of connections < maxConn, add bogus values up to maxConn to have predictable
        // offsets
        for (int i = size; i < maxConn; i++) {
          vectorIndex.writeInt(0);
        }
      }
    }
    return graph;
  }

  @Override
  public void finish() throws IOException {
    if (finished) {
      throw new IllegalStateException("already finished");
    }
    finished = true;

    if (meta != null) {
      // write end of fields marker
      meta.writeInt(-1);
      CodecUtil.writeFooter(meta);
    }
    if (vectorData != null) {
      CodecUtil.writeFooter(vectorData);
      CodecUtil.writeFooter(vectorIndex);
    }
  }

  @Override
  public void close() throws IOException {
    IOUtils.close(meta, vectorData, vectorIndex);
  }
}
@ -0,0 +1,420 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Lucene 9.1 file format.
 *
 * <h2>Apache Lucene - Index File Formats</h2>
 *
 * <div>
 *
 * <ul>
 *   <li><a href="#Introduction">Introduction</a>
 *   <li><a href="#Definitions">Definitions</a>
 *       <ul>
 *         <li><a href="#Inverted_Indexing">Inverted Indexing</a>
 *         <li><a href="#Types_of_Fields">Types of Fields</a>
 *         <li><a href="#Segments">Segments</a>
 *         <li><a href="#Document_Numbers">Document Numbers</a>
 *       </ul>
 *   <li><a href="#Overview">Index Structure Overview</a>
 *   <li><a href="#File_Naming">File Naming</a>
 *   <li><a href="#file-names">Summary of File Extensions</a>
 *       <ul>
 *         <li><a href="#Lock_File">Lock File</a>
 *         <li><a href="#History">History</a>
 *         <li><a href="#Limitations">Limitations</a>
 *       </ul>
 * </ul>
 *
 * </div> <a id="Introduction"></a>
 *
 * <h3>Introduction</h3>
 *
 * <div>
 *
 * <p>This document defines the index file formats used in this version of Lucene. If you are using
 * a different version of Lucene, please consult the copy of <code>docs/</code> that was distributed
 * with the version you are using.
 *
 * <p>This document attempts to provide a high-level definition of the Apache Lucene file formats.
 * </div> <a id="Definitions"></a>
 *
 * <h3>Definitions</h3>
 *
 * <div>
 *
 * <p>The fundamental concepts in Lucene are index, document, field and term.
 *
 * <p>An index contains a sequence of documents.
 *
 * <ul>
 *   <li>A document is a sequence of fields.
 *   <li>A field is a named sequence of terms.
 *   <li>A term is a sequence of bytes.
 * </ul>
 *
 * <p>The same sequence of bytes in two different fields is considered a different term. Thus terms
 * are represented as a pair: the string naming the field, and the bytes within the field. <a
 * id="Inverted_Indexing"></a>
 *
 * <h4>Inverted Indexing</h4>
 *
 * <p>Lucene's index stores terms and statistics about those terms in order to make term-based
 * search more efficient. Lucene's terms index falls into the family of indexes known as an
 * <i>inverted index.</i> This is because it can list, for a term, the documents that contain it.
 * This is the inverse of the natural relationship, in which documents list terms. <a
 * id="Types_of_Fields"></a>
 *
 * <h4>Types of Fields</h4>
 *
 * <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored in the index
 * literally, in a non-inverted manner. Fields that are inverted are called <i>indexed</i>. A field
 * may be both stored and indexed.
 *
 * <p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the text of a field
 * may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is
 * useful for certain identifier fields to be indexed literally.
 *
 * <p>See the {@link org.apache.lucene.document.Field Field} java docs for more information on
 * Fields. <a id="Segments"></a>
 *
 * <h4>Segments</h4>
 *
 * <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>. Each segment is a
 * fully independent index, which could be searched separately. Indexes evolve by:
 *
 * <ol>
 *   <li>Creating new segments for newly added documents.
 *   <li>Merging existing segments.
 * </ol>
 *
 * <p>Searches may involve multiple segments and/or multiple indexes, each index potentially
 * composed of a set of segments. <a id="Document_Numbers"></a>
 *
 * <h4>Document Numbers</h4>
 *
 * <p>Internally, Lucene refers to documents by an integer <i>document number</i>. The first
 * document added to an index is numbered zero, and each subsequent document added gets a number one
 * greater than the previous.
 *
 * <p>Note that a document's number may change, so caution should be taken when storing these
 * numbers outside of Lucene. In particular, numbers may change in the following situations:
 *
 * <ul>
 *   <li>
 *       <p>The numbers stored in each segment are unique only within the segment, and must be
 *       converted before they can be used in a larger context. The standard technique is to
 *       allocate each segment a range of values, based on the range of numbers used in that
 *       segment. To convert a document number from a segment to an external value, the segment's
 *       <i>base</i> document number is added. To convert an external value back to a
 *       segment-specific value, the segment is identified by the range that the external value is
 *       in, and the segment's base value is subtracted. For example, two five-document segments
 *       might be combined, so that the first segment has a base value of zero, and the second of
 *       five. Document three from the second segment would have an external value of eight.
 *   <li>
 *       <p>When documents are deleted, gaps are created in the numbering. These are eventually
 *       removed as the index evolves through merging. Deleted documents are dropped when segments
 *       are merged. A freshly-merged segment thus has no gaps in its numbering.
 * </ul>
 *
 * </div> <a id="Overview"></a>
 *
 * <h3>Index Structure Overview</h3>
 *
 * <div>
 *
 * <p>Each segment index maintains the following:
 *
 * <ul>
 *   <li>{@link org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat Segment info}. This
 *       contains metadata about a segment, such as the number of documents, what files it uses, and
 *       information about how the segment is sorted
 *   <li>{@link org.apache.lucene.codecs.lucene90.Lucene90FieldInfosFormat Field names}. This
 *       contains metadata about the set of named fields used in the index.
 *   <li>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}.
 *       This contains, for each document, a list of attribute-value pairs, where the attributes are
 *       field names. These are used to store auxiliary information about the document, such as its
 *       title, url, or an identifier to access a database. The set of stored fields are what is
 *       returned for each hit when searching. This is keyed by document number.
 *   <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term dictionary}. A
 *       dictionary containing all of the terms used in all of the indexed fields of all of the
 *       documents. The dictionary also contains the number of documents which contain the term, and
 *       pointers to the term's frequency and proximity data.
 *   <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Frequency data}. For
 *       each term in the dictionary, the numbers of all the documents that contain that term, and
 *       the frequency of the term in that document, unless frequencies are omitted ({@link
 *       org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
 *   <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Proximity data}. For
 *       each term in the dictionary, the positions that the term occurs in each document. Note that
 *       this will not exist if all fields in all documents omit position data.
 *   <li>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For
 *       each field in each document, a value is stored that is multiplied into the score for hits
 *       on that field.
 *   <li>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each
 *       field in each document, the term vector (sometimes called document vector) may be stored. A
 *       term vector consists of term text and term frequency. To add Term Vectors to your index see
 *       the {@link org.apache.lucene.document.Field Field} constructors
 *   <li>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like
 *       stored values, these are also keyed by document number, but are generally intended to be
 *       loaded into main memory for fast access. Whereas stored values are generally intended for
 *       summary results from searches, per-document values are useful for things like scoring
 *       factors.
 *   <li>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An
 *       optional file indicating which documents are live.
 *   <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair
 *       of files, recording dimensionally indexed fields, to enable fast numeric range filtering
 *       and large numeric values like BigInteger and BigDecimal (1D) and geographic shape
 *       intersection (2D, 3D).
 *   <li>{@link org.apache.lucene.codecs.lucene91.Lucene91HnswVectorsFormat Vector values}. The
 *       vector format stores numeric vectors in a format optimized for random access and
 *       computation, supporting high-dimensional nearest-neighbor search.
 * </ul>
 *
 * <p>Details on each of these are provided in their linked pages. </div> <a id="File_Naming"></a>
 *
 * <h3>File Naming</h3>
 *
 * <div>
 *
 * <p>All files belonging to a segment have the same name with varying extensions. The extensions
 * correspond to the different file formats described below. When using the Compound File format
 * (default for small segments) these files (except for the Segment info file, the Lock file, and
 * Deleted documents file) are collapsed into a single .cfs file (see below for details)
 *
 * <p>Typically, all segments in an index are stored in a single directory, although this is not
 * required.
 *
 * <p>File names are never re-used. That is, when any file is saved to the Directory it is given a
 * never-before-used filename. This is achieved using a simple generations approach. For example,
 * the first segments file is segments_1, then segments_2, etc. The generation is a sequential long
 * integer represented in alpha-numeric (base 36) form. </div> <a id="file-names"></a>
 *
 * <h3>Summary of File Extensions</h3>
 *
 * <div>
 *
 * <p>The following table summarizes the names and extensions of the files in Lucene:
 *
 * <table class="padding4" style="border-spacing: 1px; border-collapse: separate">
 * <caption>lucene filenames by extension</caption>
 * <tr>
 * <th>Name</th>
 * <th>Extension</th>
 * <th>Brief Description</th>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
 * <td>segments_N</td>
 * <td>Stores information about a commit point</td>
 * </tr>
 * <tr>
 * <td><a href="#Lock_File">Lock File</a></td>
 * <td>write.lock</td>
 * <td>The Write lock prevents multiple IndexWriters from writing to the same
 * file.</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat Segment Info}</td>
 * <td>.si</td>
 * <td>Stores metadata about a segment</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File}</td>
 * <td>.cfs, .cfe</td>
 * <td>An optional "virtual" file consisting of all the other index files for
 * systems that frequently run out of file handles.</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90FieldInfosFormat Fields}</td>
 * <td>.fnm</td>
 * <td>Stores information about the fields</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index}</td>
 * <td>.fdx</td>
 * <td>Contains pointers to field data</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data}</td>
 * <td>.fdt</td>
 * <td>The stored fields for documents</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Dictionary}</td>
 * <td>.tim</td>
 * <td>The term dictionary, stores term info</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Index}</td>
 * <td>.tip</td>
 * <td>The index into the Term Dictionary</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Frequencies}</td>
 * <td>.doc</td>
 * <td>Contains the list of docs which contain each term along with frequency</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Positions}</td>
 * <td>.pos</td>
 * <td>Stores position information about where a term occurs in the index</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Payloads}</td>
 * <td>.pay</td>
 * <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms}</td>
 * <td>.nvd, .nvm</td>
 * <td>Encodes length and boost factors for docs and fields</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values}</td>
 * <td>.dvd, .dvm</td>
 * <td>Encodes additional scoring factors or other per-document information.</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}</td>
 * <td>.tvx</td>
 * <td>Stores offset into the document data file</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}</td>
 * <td>.tvd</td>
 * <td>Contains term vector data.</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents}</td>
 * <td>.liv</td>
 * <td>Info about what documents are live</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}</td>
 * <td>.dii, .dim</td>
 * <td>Holds indexed points</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene91.Lucene91HnswVectorsFormat Vector values}</td>
 * <td>.vec, .vem</td>
 * <td>Holds indexed vectors; <code>.vec</code> files contain the raw vector data, and
 * <code>.vem</code> the vector metadata</td>
 * </tr>
 * </table>
 *
 * </div> <a id="Lock_File"></a>
 *
 * <h3>Lock File</h3>
 *
 * The write lock, which is stored in the index directory by default, is named "write.lock". If the
 * lock directory is different from the index directory then the write lock will be named
 * "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index
 * directory. When this file is present, a writer is currently modifying the index (adding or
 * removing documents). This lock file ensures that only one writer is modifying the index at a
 * time. <a id="History"></a>
 *
 * <h3>History</h3>
 *
 * <p>Compatibility notes are provided in this document, describing how file formats have changed
 * from prior versions:
 *
 * <ul>
 *   <li>In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit
 *       lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching
 *       or adding/deleting of docs. When the new segments file is saved (committed), it will be
 *       written in the new file format (meaning no specific "upgrade" process is needed). But note
 *       that once a commit has occurred, pre-2.1 Lucene will not be able to read the index.
 *   <li>In version 2.3, the file format was changed to allow segments to share a single set of doc
 *       store (vectors & stored fields) files. This allows for faster indexing in certain
 *       cases. The change is fully backwards compatible (in the same way as the lock-less commits
 *       change in 2.1).
 *   <li>In version 2.4, Strings are now written as true UTF-8 byte sequences, not Java's modified
 *       UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">LUCENE-510</a> for
 *       details.
 *   <li>In version 2.9, an optional opaque Map<String,String> CommitUserData may be passed to
 *       IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N
 *       file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">LUCENE-1382</a> for
 *       details. Also, diagnostics were added to each segment written recording details about why
 *       it was written (due to flush, merge; which OS/JRE was used; etc.). See issue <a
 *       href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.
 *   <li>In version 3.0, compressed fields are no longer written to the index (they can still be
 *       read, but on merge the new segment will write them, uncompressed). See issue <a
 *       href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a> for details.
 *   <li>In version 3.1, segments record the code version that created them. See <a
 *       href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
 *       Additionally segments track explicitly whether or not they have term vectors. See <a
 *       href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a> for details.
 *   <li>In version 3.2, numeric fields are written natively to the stored fields file; previously
 *       they were stored in text format only.
 *   <li>In version 3.4, fields can omit position data while still indexing term frequencies.
 *   <li>In version 4.0, the format of the inverted index became extensible via the {@link
 *       org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues})
 *       was introduced. Normalization factors need no longer be a single byte, they can be any
 *       {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be
 *       unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into
 *       the postings lists. Payloads can be stored in the term vectors.
 *   <li>In version 4.1, the format of the postings list changed to use either of FOR compression or
 *       variable-byte encoding, depending upon the frequency of the term. Terms appearing only once
 *       were changed to inline directly into the term dictionary. Stored fields are compressed by
 *       default.
 *   <li>In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued
 *       type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields.
 *   <li>In version 4.5, DocValues were extended to explicitly represent missing values.
 *   <li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
 *       allow updating NumericDocValues fields.
 *   <li>In version 4.8, checksum footers were added to the end of each index file for improved data
 *       integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32
 *       checksum of the file.
 *   <li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is
 *       suitable for faceting/sorting/analytics.
 *   <li>In version 5.4, DocValues have been improved to store more information on disk: addresses
 *       for binary fields and ord indexes for multi-valued fields.
 *   <li>In version 6.0, Points were added, for multi-dimensional range/distance search.
 *   <li>In version 6.2, a new segment info format that reads/writes the index sort was added, to
 *       support index sorting.
 *   <li>In version 7.0, DocValues have been improved to better support sparse doc values thanks to
 *       an iterator API.
 *   <li>In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term
 *       freq, normalization factor) pairs that may trigger the maximum score of the block. This
 *       information is recorded alongside skip data in order to be able to skip blocks of doc ids
 *       if they may not produce high enough scores. Additionally doc values and norms have been
 *       extended with jump-tables to make access O(1) instead of O(n), where n is the number of
 *       elements to skip when advancing in the data.
 *   <li>In version 8.4, postings, positions, offsets and payload lengths have moved to a more
 *       performant encoding that is vectorized.
 *   <li>In version 8.6, index sort serialization is delegated to the sorts themselves, to allow
 *       user-defined sorts to be used.
 *   <li>In version 8.7, stored fields compression became adaptive to better handle documents with
 *       smaller stored fields.
 *   <li>In version 9.0, vector-valued fields were added.
 *   <li>In version 9.1, vector-valued fields were modified to add a graph hierarchy.
 * </ul>
 *
 * <a id="Limitations"></a>
 *
 * <h3>Limitations</h3>
 *
 * <div>
 *
 * <p>Lucene uses a Java <code>int</code> to refer to document numbers, and the index file format
 * uses an <code>Int32</code> on-disk to store document numbers. This is a limitation of both the
 * index file format and the current implementation. Eventually these should be replaced with either
 * <code>UInt64</code> values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
 * VInt} values which have no limit. </div>
 */
package org.apache.lucene.codecs.lucene91;
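The Document Numbers section above already works one conversion by hand; here is the same arithmetic as a tiny runnable sketch, with illustrative names (Lucene's real mapping lives in its reader machinery, not in a helper like this):

```java
/** Sketch of the segment-base document numbering described above. */
final class DocNumbers {
  /** Convert a segment-local doc number to an index-wide ("external") value. */
  static int toGlobal(int segmentBase, int segmentLocalDocId) {
    return segmentBase + segmentLocalDocId;
  }

  public static void main(String[] args) {
    // Two five-document segments: bases 0 and 5.
    // Document three of the second segment -> external value eight.
    System.out.println(toGlobal(5, 3)); // prints 8
  }
}
```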
@ -20,6 +20,8 @@ package org.apache.lucene.index;

import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;

import java.io.IOException;
import java.util.NoSuchElementException;
import java.util.PrimitiveIterator;

/**
 * Access to per-document neighbor lists in a (hierarchical) knn search graph.
@ -32,25 +34,40 @@ public abstract class KnnGraphValues {

  protected KnnGraphValues() {}

  /**
   * Move the pointer to exactly the given {@code level}'s {@code target}. After this method
   * returns, call {@link #nextNeighbor()} to return successive (ordered) connected node ordinals.
   *
   * @param level level of the graph
   * @param target ordinal of a node in the graph, must be ≥ 0 and < {@link
   *     VectorValues#size()}.
   */
  public abstract void seek(int level, int target) throws IOException;

  /** Returns the number of nodes in the graph */
  public abstract int size();

  /**
   * Iterates over the neighbor list. It is illegal to call this method after it returns
   * NO_MORE_DOCS without calling {@link #seek(int, int)}, which resets the iterator.
   *
   * @return a node ordinal in the graph, or NO_MORE_DOCS if the iteration is complete.
   */
  public abstract int nextNeighbor() throws IOException;

  /** Returns the number of levels of the graph */
  public abstract int numLevels() throws IOException;

  /** Returns the graph's entry point on the top level */
  public abstract int entryNode() throws IOException;

  /**
   * Get all nodes on a given level, expressed as level 0 node ordinals.
   *
   * @param level level for which to get all nodes
   * @return an iterator over nodes where {@code nextInt} returns the next node on the level
   */
  public abstract NodesIterator getNodesOnLevel(int level) throws IOException;

  /** Empty graph value */
  public static KnnGraphValues EMPTY =
      new KnnGraphValues() {

@ -61,11 +78,74 @@ public abstract class KnnGraphValues {
        }

        @Override
        public void seek(int level, int target) {}

        @Override
        public int size() {
          return 0;
        }

        @Override
        public int numLevels() {
          return 0;
        }

        @Override
        public int entryNode() {
          return 0;
        }

        @Override
        public NodesIterator getNodesOnLevel(int level) {
          return NodesIterator.EMPTY;
        }
      };

  /**
   * Iterator over the graph nodes on a certain level; the iterator also provides the size – the
   * total number of nodes to be iterated over.
   */
  public static final class NodesIterator implements PrimitiveIterator.OfInt {
    static NodesIterator EMPTY = new NodesIterator(0);

    private final int[] nodes;
    private final int size;
    int cur = 0;

    /** Constructor for iterator based on the nodes array up to the size */
    public NodesIterator(int[] nodes, int size) {
      assert nodes != null;
      assert size <= nodes.length;
      this.nodes = nodes;
      this.size = size;
    }

    /** Constructor for iterator based on the size */
    public NodesIterator(int size) {
      this.nodes = null;
      this.size = size;
    }

    @Override
    public int nextInt() {
      if (hasNext() == false) {
        throw new NoSuchElementException();
      }
      if (nodes == null) {
        return cur++;
      } else {
        return nodes[cur++];
      }
    }

    @Override
    public boolean hasNext() {
      return cur < size;
    }

    /** The number of elements in this iterator */
    public int size() {
      return size;
    }
  }
}
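The contract above implies a simple traversal idiom: position the iterator with the new two-argument seek, then drain nextNeighbor until NO_MORE_DOCS. getNodesOnLevel supplies the nodes to visit — the dense NodesIterator form on level 0, the array-backed form above it. A minimal sketch (the helper class and method names are mine):

```java
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;

import java.io.IOException;
import org.apache.lucene.index.KnnGraphValues;
import org.apache.lucene.index.KnnGraphValues.NodesIterator;

final class GraphWalk {
  /** Print every edge on one level of the graph. */
  static void dumpLevel(KnnGraphValues graph, int level) throws IOException {
    NodesIterator nodes = graph.getNodesOnLevel(level); // on level 0, iterates 0..size-1 densely
    while (nodes.hasNext()) {
      int node = nodes.nextInt(); // always a level 0 ordinal
      graph.seek(level, node);
      int neighbor;
      while ((neighbor = graph.nextNeighbor()) != NO_MORE_DOCS) {
        System.out.println(level + ": " + node + " -> " + neighbor);
      }
    }
  }
}
```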
@ -17,22 +17,26 @@

package org.apache.lucene.util.hnsw;

/**
 * A helper class for an hnsw graph that serves as a comparator of the currently set bound value
 * with a new value.
 */
public abstract class BoundsChecker {

  float bound;

  /** Update the bound if sample is better */
  public abstract void update(float sample);

  /** Update the bound unconditionally */
  public void set(float sample) {
    bound = sample;
  }

  /** @return whether the sample exceeds (is worse than) the bound */
  public abstract boolean check(float sample);

  public static BoundsChecker create(boolean reversed) {
    if (reversed) {
      return new Min();
    } else {
@ -40,39 +44,47 @@ abstract class BoundsChecker {
    }
  }

  /**
   * A helper class for an hnsw graph that serves as a comparator of the currently set maximum value
   * with a new value.
   */
  public static class Max extends BoundsChecker {
    Max() {
      bound = Float.NEGATIVE_INFINITY;
    }

    @Override
    public void update(float sample) {
      if (sample > bound) {
        bound = sample;
      }
    }

    @Override
    public boolean check(float sample) {
      return sample < bound;
    }
  }

  /**
   * A helper class for an hnsw graph that serves as a comparator of the currently set minimum value
   * with a new value.
   */
  public static class Min extends BoundsChecker {

    Min() {
      bound = Float.POSITIVE_INFINITY;
    }

    @Override
    public void update(float sample) {
      if (sample < bound) {
        bound = sample;
      }
    }

    @Override
    public boolean check(float sample) {
      return sample > bound;
    }
  }
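With the class now public, callers outside the package can use it the way searchLevel does below: set the bound to the worst score currently held, then reject candidates for which check() is true. A small sketch of that semantics, assuming the non-reversed (maximum-tracking) case:

```java
import org.apache.lucene.util.hnsw.BoundsChecker;

final class BoundsCheckerDemo {
  public static void main(String[] args) {
    // reversed == false tracks a maximum; check() is true for samples below the bound.
    BoundsChecker bound = BoundsChecker.create(false);
    bound.set(0.5f);                       // worst score currently in the result queue
    System.out.println(bound.check(0.3f)); // true  -> 0.3 fails to beat the bound, prune it
    System.out.println(bound.check(0.7f)); // false -> 0.7 is competitive
    bound.update(0.7f);                    // raises the bound to 0.7
  }
}
```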
@ -21,32 +21,31 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.SplittableRandom;
|
|
||||||
import org.apache.lucene.index.KnnGraphValues;
|
import org.apache.lucene.index.KnnGraphValues;
|
||||||
import org.apache.lucene.index.RandomAccessVectorValues;
|
import org.apache.lucene.index.RandomAccessVectorValues;
|
||||||
import org.apache.lucene.index.VectorSimilarityFunction;
|
import org.apache.lucene.index.VectorSimilarityFunction;
|
||||||
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.Bits;
|
import org.apache.lucene.util.Bits;
|
||||||
import org.apache.lucene.util.SparseFixedBitSet;
|
import org.apache.lucene.util.SparseFixedBitSet;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Navigable Small-world graph. Provides efficient approximate nearest neighbor search for high
|
* Hierarchical Navigable Small World graph. Provides efficient approximate nearest neighbor search
|
||||||
* dimensional vectors. See <a href="https://doi.org/10.1016/j.is.2013.10.006">Approximate nearest
|
* for high dimensional vectors. See <a href="https://arxiv.org/abs/1603.09320">Efficient and robust
|
||||||
* neighbor algorithm based on navigable small world graphs [2014]</a> and <a
|
* approximate nearest neighbor search using Hierarchical Navigable Small World graphs [2018]</a>
|
||||||
* href="https://arxiv.org/abs/1603.09320">this paper [2018]</a> for details.
|
* paper for details.
|
||||||
*
|
*
|
||||||
* <p>The nomenclature is a bit different here from what's used in those papers:
|
* <p>The nomenclature is a bit different here from what's used in the paper:
|
||||||
*
|
*
|
||||||
* <h2>Hyperparameters</h2>
|
* <h2>Hyperparameters</h2>
|
||||||
*
|
*
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li><code>numSeed</code> is the equivalent of <code>m</code> in the 2012 paper; it controls the
|
|
||||||
* number of random entry points to sample.
|
|
||||||
* <li><code>beamWidth</code> in {@link HnswGraphBuilder} has the same meaning as <code>efConst
|
* <li><code>beamWidth</code> in {@link HnswGraphBuilder} has the same meaning as <code>efConst
|
||||||
* </code> in the 2016 paper. It is the number of nearest neighbor candidates to track while
|
* </code> in the paper. It is the number of nearest neighbor candidates to track while
|
||||||
* searching the graph for each newly inserted node.
|
* searching the graph for each newly inserted node.
|
||||||
* <li><code>maxConn</code> has the same meaning as <code>M</code> in the later paper; it controls
|
* <li><code>maxConn</code> has the same meaning as <code>M</code> in the paper; it controls how
|
||||||
* how many of the <code>efConst</code> neighbors are connected to the new node
|
* many of the <code>efConst</code> neighbors are connected to the new node
|
||||||
* </ul>
|
* </ul>
|
||||||
*
|
*
|
||||||
* <p>Note: The graph may be searched by multiple threads concurrently, but updates are not
|
* <p>Note: The graph may be searched by multiple threads concurrently, but updates are not
|
||||||
|
@ -56,75 +55,120 @@ import org.apache.lucene.util.SparseFixedBitSet;
|
||||||
public final class HnswGraph extends KnnGraphValues {
|
public final class HnswGraph extends KnnGraphValues {
|
||||||
|
|
||||||
private final int maxConn;
|
private final int maxConn;
|
||||||
|
private int numLevels; // the current number of levels in the graph
|
||||||
|
private int entryNode; // the current graph entry node on the top level
|
||||||
|
|
||||||
// Each entry lists the top maxConn neighbors of a node. The nodes correspond to vectors added to
|
// Nodes by level expressed as the level 0's nodes' ordinals.
|
||||||
// HnswBuilder, and the
|
// As level 0 contains all nodes, nodesByLevel.get(0) is null.
|
||||||
// node values are the ordinals of those vectors.
|
private final List<int[]> nodesByLevel;
|
||||||
private final List<NeighborArray> graph;
|
|
||||||
|
// graph is a list of graph levels.
|
||||||
|
// Each level is represented as List<NeighborArray> – nodes' connections on this level.
|
||||||
|
// Each entry in the list has the top maxConn neighbors of a node. The nodes correspond to vectors
|
||||||
|
// added to HnswBuilder, and the node values are the ordinals of those vectors.
|
||||||
|
// Thus, on all levels, neighbors expressed as the level 0's nodes' ordinals.
|
||||||
|
private final List<List<NeighborArray>> graph;
|
||||||
|
|
||||||
// KnnGraphValues iterator members
|
// KnnGraphValues iterator members
|
||||||
private int upto;
|
private int upto;
|
||||||
private NeighborArray cur;
|
private NeighborArray cur;
|
||||||
|
|
||||||
HnswGraph(int maxConn) {
|
HnswGraph(int maxConn, int levelOfFirstNode) {
|
||||||
graph = new ArrayList<>();
|
|
||||||
// Typically with diversity criteria we see nodes not fully occupied; average fanout seems to be
|
|
||||||
// about 1/2 maxConn. There is some indexing time penalty for under-allocating, but saves RAM
|
|
||||||
graph.add(new NeighborArray(Math.max(32, maxConn / 4)));
|
|
||||||
this.maxConn = maxConn;
|
this.maxConn = maxConn;
|
||||||
|
this.numLevels = levelOfFirstNode + 1;
|
||||||
|
this.graph = new ArrayList<>(numLevels);
|
||||||
|
this.entryNode = 0;
|
||||||
|
for (int i = 0; i < numLevels; i++) {
|
||||||
|
graph.add(new ArrayList<>());
|
||||||
|
// Typically with diversity criteria we see nodes not fully occupied;
|
||||||
|
// average fanout seems to be about 1/2 maxConn.
|
||||||
|
// There is some indexing time penalty for under-allocating, but saves RAM
|
||||||
|
graph.get(i).add(new NeighborArray(Math.max(32, maxConn / 4)));
|
||||||
|
}
|
||||||
|
|
||||||
|
this.nodesByLevel = new ArrayList<>(numLevels);
|
||||||
|
nodesByLevel.add(null); // we don't need this for 0th level, as it contains all nodes
|
||||||
|
for (int l = 1; l < numLevels; l++) {
|
||||||
|
nodesByLevel.add(new int[] {0});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Searches for the nearest neighbors of a query vector.
|
* Searches HNSW graph for the nearest neighbors of a query vector.
|
||||||
*
|
*
|
||||||
* @param query search query vector
|
* @param query search query vector
|
||||||
* @param topK the number of nodes to be returned
|
* @param topK the number of nodes to be returned
|
||||||
* @param numSeed the size of the queue maintained while searching, and controls the number of
|
|
||||||
* random entry points to sample
|
|
||||||
* @param vectors vector values
|
* @param vectors vector values
|
||||||
* @param graphValues the graph values. May represent the entire graph, or a level in a
|
* @param graphValues the graph values. May represent the entire graph, or a level in a
|
||||||
* hierarchical graph.
|
* hierarchical graph.
|
||||||
* @param acceptOrds {@link Bits} that represents the allowed document ordinals to match, or
|
* @param acceptOrds {@link Bits} that represents the allowed document ordinals to match, or
|
||||||
* {@code null} if they are all allowed to match.
|
* {@code null} if they are all allowed to match.
|
||||||
* @param random a source of randomness, used for generating entry points to the graph
|
|
||||||
* @return a priority queue holding the closest neighbors found
|
* @return a priority queue holding the closest neighbors found
|
||||||
*/
|
*/
|
||||||
public static NeighborQueue search(
|
public static NeighborQueue search(
|
||||||
float[] query,
|
float[] query,
|
||||||
int topK,
|
int topK,
|
||||||
int numSeed,
|
|
||||||
RandomAccessVectorValues vectors,
|
RandomAccessVectorValues vectors,
|
||||||
VectorSimilarityFunction similarityFunction,
|
VectorSimilarityFunction similarityFunction,
|
||||||
KnnGraphValues graphValues,
|
KnnGraphValues graphValues,
|
||||||
Bits acceptOrds,
|
Bits acceptOrds)
|
||||||
SplittableRandom random)
|
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
|
NeighborQueue results;
|
||||||
|
int[] eps = new int[] {graphValues.entryNode()};
|
||||||
|
for (int level = graphValues.numLevels() - 1; level >= 1; level--) {
|
||||||
|
results = searchLevel(query, 1, level, eps, vectors, similarityFunction, graphValues, null);
|
||||||
|
eps[0] = results.pop();
|
||||||
|
}
|
||||||
|
results =
|
||||||
|
searchLevel(query, topK, 0, eps, vectors, similarityFunction, graphValues, acceptOrds);
|
||||||
|
return results;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Searches for the nearest neighbors of a query vector in a given level
|
||||||
|
*
|
||||||
|
* @param query search query vector
|
||||||
|
* @param topK the number of nearest to query results to return
|
||||||
|
* @param level level to search
|
||||||
|
* @param eps the entry points for search at this level expressed as level 0th ordinals
|
||||||
|
* @param vectors vector values
|
||||||
|
* @param similarityFunction similarity function
|
||||||
|
* @param graphValues the graph values
|
||||||
|
* @param acceptOrds {@link Bits} that represents the allowed document ordinals to match, or
|
||||||
|
* {@code null} if they are all allowed to match.
|
||||||
|
* @return a priority queue holding the closest neighbors found
|
||||||
|
*/
|
||||||
|
static NeighborQueue searchLevel(
|
||||||
|
float[] query,
|
||||||
|
int topK,
|
||||||
|
int level,
|
||||||
|
final int[] eps,
|
||||||
|
RandomAccessVectorValues vectors,
|
||||||
|
VectorSimilarityFunction similarityFunction,
|
||||||
|
KnnGraphValues graphValues,
|
||||||
|
Bits acceptOrds)
|
||||||
|
throws IOException {
|
||||||
|
|
||||||
int size = graphValues.size();
|
int size = graphValues.size();
|
||||||
|
|
||||||
// MIN heap, holding the top results
|
// MIN heap, holding the top results
|
||||||
NeighborQueue results = new NeighborQueue(numSeed, similarityFunction.reversed);
|
NeighborQueue results = new NeighborQueue(topK, similarityFunction.reversed);
|
||||||
// MAX heap, from which to pull the candidate nodes
|
// MAX heap, from which to pull the candidate nodes
|
||||||
NeighborQueue candidates = new NeighborQueue(numSeed, !similarityFunction.reversed);
|
NeighborQueue candidates = new NeighborQueue(topK, !similarityFunction.reversed);
|
||||||
|
|
||||||
// set of ordinals that have been visited by search on this layer, used to avoid backtracking
|
// set of ordinals that have been visited by search on this layer, used to avoid backtracking
|
||||||
SparseFixedBitSet visited = new SparseFixedBitSet(size);
|
SparseFixedBitSet visited = new SparseFixedBitSet(size);
|
||||||
// get initial candidates at random
|
for (int ep : eps) {
|
||||||
int boundedNumSeed = Math.min(numSeed, 2 * size);
|
if (visited.getAndSet(ep) == false) {
|
||||||
for (int i = 0; i < boundedNumSeed; i++) {
|
float score = similarityFunction.compare(query, vectors.vectorValue(ep));
|
||||||
int entryPoint = random.nextInt(size);
|
candidates.add(ep, score);
|
||||||
if (visited.getAndSet(entryPoint) == false) {
|
if (acceptOrds == null || acceptOrds.get(ep)) {
|
||||||
// explore the topK starting points of some random numSeed probes
|
results.add(ep, score);
|
||||||
float score = similarityFunction.compare(query, vectors.vectorValue(entryPoint));
|
|
||||||
candidates.add(entryPoint, score);
|
|
||||||
if (acceptOrds == null || acceptOrds.get(entryPoint)) {
|
|
||||||
results.add(entryPoint, score);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set the bound to the worst current result and below reject any newly-generated candidates
|
// Set the bound to the worst current result and below reject any newly-generated candidates
|
||||||
// failing
|
// failing to exceed this bound
|
||||||
// to exceed this bound
|
|
||||||
BoundsChecker bound = BoundsChecker.create(similarityFunction.reversed);
|
BoundsChecker bound = BoundsChecker.create(similarityFunction.reversed);
|
||||||
bound.set(results.topScore());
|
bound.set(results.topScore());
|
||||||
while (candidates.size() > 0) {
|
while (candidates.size() > 0) {
|
||||||
|
@@ -136,7 +180,7 @@ public final class HnswGraph extends KnnGraphValues {
         }
       }
       int topCandidateNode = candidates.pop();
-      graphValues.seek(topCandidateNode);
+      graphValues.seek(level, topCandidateNode);
       int friendOrd;
       while ((friendOrd = graphValues.nextNeighbor()) != NO_MORE_DOCS) {
         assert friendOrd < size : "friendOrd=" + friendOrd + "; size=" + size;
@@ -145,7 +189,7 @@ public final class HnswGraph extends KnnGraphValues {
         }

         float score = similarityFunction.compare(query, vectors.vectorValue(friendOrd));
-        if (results.size() < numSeed || bound.check(score) == false) {
+        if (results.size() < topK || bound.check(score) == false) {
           candidates.add(friendOrd, score);
           if (acceptOrds == null || acceptOrds.get(friendOrd)) {
             results.insertWithOverflow(friendOrd, score);
@@ -164,25 +208,60 @@ public final class HnswGraph extends KnnGraphValues {
   /**
    * Returns the {@link NeighborQueue} connected to the given node.
    *
-   * @param node the node whose neighbors are returned
+   * @param level level of the graph
+   * @param node the node whose neighbors are returned, represented as an ordinal on the level 0.
    */
-  public NeighborArray getNeighbors(int node) {
-    return graph.get(node);
+  public NeighborArray getNeighbors(int level, int node) {
+    if (level == 0) {
+      return graph.get(level).get(node);
+    }
+    int nodeIndex = Arrays.binarySearch(nodesByLevel.get(level), 0, graph.get(level).size(), node);
+    assert nodeIndex >= 0;
+    return graph.get(level).get(nodeIndex);
   }

   @Override
   public int size() {
-    return graph.size();
+    return graph.get(0).size(); // all nodes are located on the 0th level
   }

-  int addNode() {
-    graph.add(new NeighborArray(maxConn + 1));
-    return graph.size() - 1;
+  /**
+   * Add node on the given level
+   *
+   * @param level level to add a node on
+   * @param node the node to add, represented as an ordinal on the level 0.
+   */
+  public void addNode(int level, int node) {
+    if (level > 0) {
+      // if the new node introduces a new level, add more levels to the graph,
+      // and make this node the graph's new entry point
+      if (level >= numLevels) {
+        for (int i = numLevels; i <= level; i++) {
+          graph.add(new ArrayList<>());
+          nodesByLevel.add(new int[] {node});
+        }
+        numLevels = level + 1;
+        entryNode = node;
+      } else {
+        // Add this node id to this level's nodes
+        int[] nodes = nodesByLevel.get(level);
+        int idx = graph.get(level).size();
+        if (idx < nodes.length) {
+          nodes[idx] = node;
+        } else {
+          nodes = ArrayUtil.grow(nodes);
+          nodes[idx] = node;
+          nodesByLevel.set(level, nodes);
+        }
+      }
+    }
+
+    graph.get(level).add(new NeighborArray(maxConn + 1));
   }

   @Override
-  public void seek(int targetNode) {
-    cur = getNeighbors(targetNode);
+  public void seek(int level, int targetNode) {
+    cur = getNeighbors(level, targetNode);
     upto = -1;
   }

@@ -193,4 +272,34 @@ public final class HnswGraph extends KnnGraphValues {
     }
     return NO_MORE_DOCS;
   }
+
+  /**
+   * Returns the current number of levels in the graph
+   *
+   * @return the current number of levels in the graph
+   */
+  @Override
+  public int numLevels() {
+    return numLevels;
+  }
+
+  /**
+   * Returns the graph's current entry node on the top level shown as ordinals of the nodes on 0th
+   * level
+   *
+   * @return the graph's current entry node on the top level
+   */
+  @Override
+  public int entryNode() {
+    return entryNode;
+  }
+
+  @Override
+  public NodesIterator getNodesOnLevel(int level) {
+    if (level == 0) {
+      return new NodesIterator(size());
+    } else {
+      return new NodesIterator(nodesByLevel.get(level), graph.get(level).size());
+    }
+  }
 }
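
Taken together, the changes above turn the old single-layer, random-entry search into a per-level primitive: searchLevel now starts from the given entry points eps and walks neighbors on one specific level. A full multi-layer query composes it the same way addGraphNode does in the builder below: a greedy topK=1 descent through the upper levels to find a good entry point, then a beamWidth-wide search on level 0. A minimal sketch of that composition, using only the API introduced in this patch (the helper name searchHierarchical is illustrative, not part of the commit):

  // Illustrative sketch: compose HnswGraph.searchLevel across levels. Starts at
  // the graph's entry node on the top level and narrows down to a wide beam
  // search on level 0, where every node lives.
  static NeighborQueue searchHierarchical(
      float[] query,
      int topK,
      RandomAccessVectorValues vectors,
      VectorSimilarityFunction similarityFunction,
      KnnGraphValues graph)
      throws IOException {
    int[] eps = new int[] {graph.entryNode()};
    // greedy descent: on each upper level keep only the single best candidate
    for (int level = graph.numLevels() - 1; level >= 1; level--) {
      NeighborQueue best =
          HnswGraph.searchLevel(query, 1, level, eps, vectors, similarityFunction, graph, null);
      eps = new int[] {best.pop()};
    }
    // full-width search on the bottom level; an acceptOrds filter could be passed here
    return HnswGraph.searchLevel(query, topK, 0, eps, vectors, similarityFunction, graph, null);
  }
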
@@ -17,6 +17,8 @@
  */
 package org.apache.lucene.util.hnsw;

+import static java.lang.Math.log;
+
 import java.io.IOException;
 import java.util.Locale;
 import java.util.Objects;
@@ -32,15 +34,17 @@ import org.apache.lucene.util.InfoStream;
  */
 public final class HnswGraphBuilder {

-  // default random seed for level generation
+  /** Default random seed for level generation * */
   private static final long DEFAULT_RAND_SEED = System.currentTimeMillis();
+  /** A name for the HNSW component for the info-stream * */
   public static final String HNSW_COMPONENT = "HNSW";

-  // expose for testing.
+  /** Random seed for level generation; public to expose for testing * */
   public static long randSeed = DEFAULT_RAND_SEED;

   private final int maxConn;
   private final int beamWidth;
+  private final double ml;
   private final NeighborArray scratch;

   private final VectorSimilarityFunction similarityFunction;
@@ -84,9 +88,12 @@ public final class HnswGraphBuilder {
     }
     this.maxConn = maxConn;
     this.beamWidth = beamWidth;
-    this.hnsw = new HnswGraph(maxConn);
+    // normalization factor for level generation; currently not configurable
+    this.ml = 1 / Math.log(1.0 * maxConn);
+    this.random = new SplittableRandom(seed);
+    int levelOfFirstNode = getRandomGraphLevel(ml, random);
+    this.hnsw = new HnswGraph(maxConn, levelOfFirstNode);
     bound = BoundsChecker.create(similarityFunction.reversed);
-    random = new SplittableRandom(seed);
     scratch = new NeighborArray(Math.max(beamWidth, maxConn + 1));
   }

@@ -109,9 +116,48 @@ public final class HnswGraphBuilder {
     long start = System.nanoTime(), t = start;
     // start at node 1! node 0 is added implicitly, in the constructor
     for (int node = 1; node < vectors.size(); node++) {
-      addGraphNode(vectors.vectorValue(node));
-      if (node % 10000 == 0) {
-        if (infoStream.isEnabled(HNSW_COMPONENT)) {
+      addGraphNode(node, vectors.vectorValue(node));
+      if ((node % 10000 == 0) && infoStream.isEnabled(HNSW_COMPONENT)) {
+        t = printGraphBuildStatus(node, start, t);
+      }
+    }
+    return hnsw;
+  }
+
+  /** Set info-stream to output debugging information * */
+  public void setInfoStream(InfoStream infoStream) {
+    this.infoStream = infoStream;
+  }
+
+  /** Inserts a doc with vector value to the graph */
+  void addGraphNode(int node, float[] value) throws IOException {
+    NeighborQueue candidates;
+    final int nodeLevel = getRandomGraphLevel(ml, random);
+    int curMaxLevel = hnsw.numLevels() - 1;
+    int[] eps = new int[] {hnsw.entryNode()};
+
+    // if a node introduces new levels to the graph, add this new node on new levels
+    for (int level = nodeLevel; level > curMaxLevel; level--) {
+      hnsw.addNode(level, node);
+    }
+    // for levels > nodeLevel search with topk = 1
+    for (int level = curMaxLevel; level > nodeLevel; level--) {
+      candidates =
+          HnswGraph.searchLevel(value, 1, level, eps, vectorValues, similarityFunction, hnsw, null);
+      eps = new int[] {candidates.pop()};
+    }
+    // for levels <= nodeLevel search with topk = beamWidth, and add connections
+    for (int level = Math.min(nodeLevel, curMaxLevel); level >= 0; level--) {
+      candidates =
+          HnswGraph.searchLevel(
+              value, beamWidth, level, eps, vectorValues, similarityFunction, hnsw, null);
+      eps = candidates.nodes();
+      hnsw.addNode(level, node);
+      addDiverseNeighbors(level, node, candidates);
+    }
+  }
+
+  private long printGraphBuildStatus(int node, long start, long t) {
     long now = System.nanoTime();
     infoStream.message(
         HNSW_COMPONENT,
@@ -121,31 +167,7 @@ public final class HnswGraphBuilder {
             node,
             ((now - t) / 1_000_000),
             ((now - start) / 1_000_000)));
-    t = now;
-        }
-      }
-    }
-    return hnsw;
-  }
-
-  public void setInfoStream(InfoStream infoStream) {
-    this.infoStream = infoStream;
-  }
-
-  /** Inserts a doc with vector value to the graph */
-  void addGraphNode(float[] value) throws IOException {
-    // We pass 'null' for acceptOrds because there are no deletions while building the graph
-    NeighborQueue candidates =
-        HnswGraph.search(
-            value, beamWidth, beamWidth, vectorValues, similarityFunction, hnsw, null, random);
-
-    int node = hnsw.addNode();
-
-    /* connect neighbors to the new node, using a diversity heuristic that chooses successive
-     * nearest neighbors that are closer to the new node than they are to the previously-selected
-     * neighbors
-     */
-    addDiverseNeighbors(node, candidates);
+    return now;
   }

   /* TODO: we are not maintaining nodes in strict score order; the forward links
@@ -153,12 +175,13 @@ public final class HnswGraphBuilder {
    * work better if we keep the neighbor arrays sorted. Possibly we should switch back to a heap?
    * But first we should just see if sorting makes a significant difference.
    */
-  private void addDiverseNeighbors(int node, NeighborQueue candidates) throws IOException {
+  private void addDiverseNeighbors(int level, int node, NeighborQueue candidates)
+      throws IOException {
     /* For each of the beamWidth nearest candidates (going from best to worst), select it only if it
      * is closer to target than it is to any of the already-selected neighbors (ie selected in this method,
      * since the node is new and has no prior neighbors).
      */
-    NeighborArray neighbors = hnsw.getNeighbors(node);
+    NeighborArray neighbors = hnsw.getNeighbors(level, node);
     assert neighbors.size() == 0; // new node
     popToScratch(candidates);
     selectDiverse(neighbors, scratch);
@@ -168,7 +191,7 @@ public final class HnswGraphBuilder {
     int size = neighbors.size();
     for (int i = 0; i < size; i++) {
       int nbr = neighbors.node[i];
-      NeighborArray nbrNbr = hnsw.getNeighbors(nbr);
+      NeighborArray nbrNbr = hnsw.getNeighbors(level, nbr);
       nbrNbr.add(node, neighbors.score[i]);
       if (nbrNbr.size() > maxConn) {
         diversityUpdate(nbrNbr);
@@ -266,4 +289,12 @@ public final class HnswGraphBuilder {
     }
     return -1;
   }
+
+  private static int getRandomGraphLevel(double ml, SplittableRandom random) {
+    double randDouble;
+    do {
+      randDouble = random.nextDouble(); // avoid 0 value, as log(0) is undefined
+    } while (randDouble == 0.0);
+    return ((int) (-log(randDouble) * ml));
+  }
 }
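
getRandomGraphLevel draws a node's top level from an exponential distribution: with U uniform in (0, 1], level = floor(-ln(U) * ml), and with ml = 1 / ln(maxConn) the probability that a node reaches level l or higher is maxConn^-l. Each layer is therefore expected to be roughly maxConn times sparser than the one below it, which is what keeps the upper layers cheap to descend. A standalone check of that distribution (illustrative only; not part of the patch):

  // Illustrative sketch: sample the same level formula the builder uses and
  // verify that counts drop by roughly a factor of maxConn per level.
  public static void main(String[] args) {
    java.util.SplittableRandom random = new java.util.SplittableRandom(42);
    int maxConn = 16;
    double ml = 1 / Math.log(1.0 * maxConn); // same normalization as above
    int[] histogram = new int[8];
    for (int i = 0; i < 1_000_000; i++) {
      double u;
      do {
        u = random.nextDouble(); // avoid 0, as log(0) is undefined
      } while (u == 0.0);
      int level = (int) (-Math.log(u) * ml);
      histogram[Math.min(level, histogram.length - 1)]++;
    }
    for (int level = 0; level < histogram.length; level++) {
      System.out.printf("nodes whose top level is %d: %d%n", level, histogram[level]);
    }
  }
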
@@ -32,7 +32,7 @@ public class NeighborArray {
   float[] score;
   int[] node;

-  NeighborArray(int maxSize) {
+  public NeighborArray(int maxSize) {
     node = new int[maxSize];
     score = new float[maxSize];
   }
@@ -60,11 +60,15 @@ public class NeighborArray {
     return node;
   }

+  public float[] score() {
+    return score;
+  }
+
   public void clear() {
     size = 0;
   }

-  void removeLast() {
+  public void removeLast() {
     size--;
   }
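
NeighborArray widens its constructor and removeLast() to public and gains a score() accessor, presumably so the new lucene91 codec classes outside org.apache.lucene.util.hnsw can read a node's neighbour list when writing the graph. Illustrative use, assuming the pre-existing add(int, float) and node() members visible in the builder and in the context lines above:

  NeighborArray neighbors = new NeighborArray(4);
  neighbors.add(7, 0.75f);
  neighbors.add(3, 0.25f);
  int[] nodes = neighbors.node();     // neighbour node ordinals
  float[] scores = neighbors.score(); // the accessor added above
  System.out.println(nodes[0] + " scored " + scores[0]);
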
@@ -54,7 +54,7 @@ public class NeighborQueue {
   // Used to track the number of neighbors visited during a single graph traversal
   private int visitedCount;

-  NeighborQueue(int initialSize, boolean reversed) {
+  public NeighborQueue(int initialSize, boolean reversed) {
     this.heap = new LongHeap(initialSize);
     this.order = reversed ? Order.REVERSED : Order.NATURAL;
   }
@@ -119,7 +119,7 @@ public class NeighborQueue {
     return visitedCount;
   }

-  void setVisitedCount(int visitedCount) {
+  public void setVisitedCount(int visitedCount) {
     this.visitedCount = visitedCount;
   }
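
NeighborQueue gets the same visibility treatment. Its reversed flag picks the heap order that searchLevel pairs up: results is a MIN heap whose topScore() is the worst score still kept (the bound a new candidate must beat), while candidates is a MAX heap whose pop() yields the best unexplored node. A small illustrative snippet, assuming NATURAL order keeps the smallest score on top (as the MIN-heap comment in searchLevel indicates) and a similarity where higher scores are better:

  NeighborQueue results = new NeighborQueue(3, false); // MIN heap
  results.add(0, 0.9f);
  results.add(1, 0.1f);
  results.add(2, 0.5f);
  System.out.println(results.topScore()); // 0.1, the score new candidates must beat

  NeighborQueue candidates = new NeighborQueue(3, true); // MAX heap
  candidates.add(0, 0.9f);
  candidates.add(1, 0.1f);
  System.out.println(candidates.pop()); // 0, the node with the best (highest) score
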
@@ -13,4 +13,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-org.apache.lucene.codecs.lucene90.Lucene90Codec
+org.apache.lucene.codecs.lucene91.Lucene91Codec

@@ -13,4 +13,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat
+org.apache.lucene.codecs.lucene91.Lucene91HnswVectorsFormat
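
These two META-INF/services entries are what the Java SPI machinery reads when resolving codec and format names, so updating them is what makes the new classes discoverable by name lookups such as the following (assuming the classes register under the names "Lucene91" and "Lucene91HnswVectorsFormat"):

  Codec codec = Codec.forName("Lucene91");
  KnnVectorsFormat format = KnnVectorsFormat.forName("Lucene91HnswVectorsFormat");
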
@@ -18,7 +18,8 @@ package org.apache.lucene.codecs.lucene90;

 import com.carrotsearch.randomizedtesting.generators.RandomPicks;
 import org.apache.lucene.codecs.Codec;
-import org.apache.lucene.codecs.lucene90.Lucene90Codec.Mode;
+import org.apache.lucene.codecs.lucene91.Lucene91Codec;
+import org.apache.lucene.codecs.lucene91.Lucene91Codec.Mode;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.StoredField;
 import org.apache.lucene.index.DirectoryReader;
@@ -30,7 +31,7 @@ import org.apache.lucene.tests.index.BaseStoredFieldsFormatTestCase;
 public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase {
   @Override
   protected Codec getCodec() {
-    return new Lucene90Codec(Mode.BEST_COMPRESSION);
+    return new Lucene91Codec(Mode.BEST_COMPRESSION);
   }

   /**
@@ -40,7 +41,7 @@ public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFie
     Directory dir = newDirectory();
     for (int i = 0; i < 10; i++) {
       IndexWriterConfig iwc = newIndexWriterConfig();
-      iwc.setCodec(new Lucene90Codec(RandomPicks.randomFrom(random(), Mode.values())));
+      iwc.setCodec(new Lucene91Codec(RandomPicks.randomFrom(random(), Mode.values())));
       IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig());
       Document doc = new Document();
       doc.add(new StoredField("field1", "value1"));
@@ -69,7 +70,7 @@ public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFie
     expectThrows(
         NullPointerException.class,
         () -> {
-          new Lucene90Codec(null);
+          new Lucene91Codec(null);
         });

     expectThrows(
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene91;
+
+import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween;
+import static org.apache.lucene.codecs.lucene91.Lucene91HnswVectorsFormat.DEFAULT_BEAM_WIDTH;
+import static org.apache.lucene.codecs.lucene91.Lucene91HnswVectorsFormat.DEFAULT_MAX_CONN;
+
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.KnnVectorsFormat;
+import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase;
+import org.apache.lucene.tests.util.TestUtil;
+
+public class TestLucene91HnswVectorsFormat extends BaseKnnVectorsFormatTestCase {
+  @Override
+  protected Codec getCodec() {
+    return TestUtil.getDefaultCodec();
+  }
+
+  public void testToString() {
+    int maxConn = randomIntBetween(DEFAULT_MAX_CONN - 10, DEFAULT_MAX_CONN + 10);
+    int beamWidth = randomIntBetween(DEFAULT_BEAM_WIDTH - 50, DEFAULT_BEAM_WIDTH + 50);
+    Codec customCodec =
+        new Lucene91Codec() {
+          @Override
+          public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
+            return new Lucene91HnswVectorsFormat(maxConn, beamWidth);
+          }
+        };
+    String expectedString =
+        "Lucene91HnswVectorsFormat(name = Lucene91HnswVectorsFormat, maxConn = "
+            + maxConn
+            + ", beamWidth="
+            + beamWidth
+            + ")";
+    assertEquals(
+        expectedString,
+        ((Lucene91Codec) customCodec).getKnnVectorsFormatForField("bogus_field").toString());
+  }
+}
@@ -16,6 +16,8 @@
  */
 package org.apache.lucene.index;

+import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween;
 import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
 import static org.apache.lucene.util.hnsw.HnswGraphBuilder.randSeed;

@@ -26,11 +27,12 @@ import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Set;
+import java.util.concurrent.CountDownLatch;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.KnnVectorsFormat;
-import org.apache.lucene.codecs.lucene90.Lucene90Codec;
-import org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat;
-import org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsReader;
+import org.apache.lucene.codecs.lucene91.Lucene91Codec;
+import org.apache.lucene.codecs.lucene91.Lucene91HnswVectorsFormat;
+import org.apache.lucene.codecs.lucene91.Lucene91HnswVectorsReader;
 import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -38,13 +40,19 @@ import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.KnnVectorField;
 import org.apache.lucene.document.SortedDocValuesField;
 import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.KnnGraphValues.NodesIterator;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.KnnVectorQuery;
 import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.SearcherFactory;
+import org.apache.lucene.search.SearcherManager;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.tests.util.LuceneTestCase;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.VectorUtil;
 import org.apache.lucene.util.hnsw.HnswGraphBuilder;
 import org.junit.After;
@@ -55,7 +63,7 @@ public class TestKnnGraph extends LuceneTestCase {

   private static final String KNN_GRAPH_FIELD = "vector";

-  private static int maxConn = Lucene90HnswVectorsFormat.DEFAULT_MAX_CONN;
+  private static int maxConn = Lucene91HnswVectorsFormat.DEFAULT_MAX_CONN;

   private Codec codec;
   private VectorSimilarityFunction similarityFunction;
@@ -68,11 +76,11 @@ public class TestKnnGraph extends LuceneTestCase {
     }

     codec =
-        new Lucene90Codec() {
+        new Lucene91Codec() {
           @Override
           public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
-            return new Lucene90HnswVectorsFormat(
-                maxConn, Lucene90HnswVectorsFormat.DEFAULT_BEAM_WIDTH);
+            return new Lucene91HnswVectorsFormat(
+                maxConn, Lucene91HnswVectorsFormat.DEFAULT_BEAM_WIDTH);
           }
         };

@@ -82,7 +90,7 @@ public class TestKnnGraph extends LuceneTestCase {

   @After
   public void cleanup() {
-    maxConn = Lucene90HnswVectorsFormat.DEFAULT_MAX_CONN;
+    maxConn = Lucene91HnswVectorsFormat.DEFAULT_MAX_CONN;
   }

   /** Basic test of creating documents in a graph */
@@ -153,21 +161,63 @@ public class TestKnnGraph extends LuceneTestCase {
     int dimension = atLeast(10);
     float[][] values = randomVectors(numDoc, dimension);
     int mergePoint = random().nextInt(numDoc);
-    int[][] mergedGraph = getIndexedGraph(values, mergePoint, seed);
-    int[][] singleSegmentGraph = getIndexedGraph(values, -1, seed);
+    int[][][] mergedGraph = getIndexedGraph(values, mergePoint, seed);
+    int[][][] singleSegmentGraph = getIndexedGraph(values, -1, seed);
     assertGraphEquals(singleSegmentGraph, mergedGraph);
   }

-  private void assertGraphEquals(int[][] expected, int[][] actual) {
-    assertEquals("graph sizes differ", expected.length, actual.length);
-    for (int i = 0; i < expected.length; i++) {
-      assertArrayEquals("difference at ord=" + i, expected[i], actual[i]);
+  /** Test writing and reading of multiple vector fields * */
+  public void testMultipleVectorFields() throws Exception {
+    int numVectorFields = randomIntBetween(2, 5);
+    int numDoc = atLeast(100);
+    int[] dims = new int[numVectorFields];
+    float[][][] values = new float[numVectorFields][][];
+    for (int field = 0; field < numVectorFields; field++) {
+      dims[field] = atLeast(3);
+      values[field] = randomVectors(numDoc, dims[field]);
+    }
+
+    try (Directory dir = newDirectory();
+        IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null).setCodec(codec))) {
+      for (int docID = 0; docID < numDoc; docID++) {
+        Document doc = new Document();
+        for (int field = 0; field < numVectorFields; field++) {
+          float[] vector = values[field][docID];
+          if (vector != null) {
+            FieldType fieldType = KnnVectorField.createFieldType(vector.length, similarityFunction);
+            doc.add(new KnnVectorField(KNN_GRAPH_FIELD + field, vector, fieldType));
+          }
+        }
+        String idString = Integer.toString(docID);
+        doc.add(new StringField("id", idString, Field.Store.YES));
+        iw.addDocument(doc);
+      }
+      for (int field = 0; field < numVectorFields; field++) {
+        assertConsistentGraph(iw, values[field], KNN_GRAPH_FIELD + field);
+      }
     }
   }

-  private int[][] getIndexedGraph(float[][] values, int mergePoint, long seed) throws IOException {
+  private void assertGraphEquals(int[][][] expected, int[][][] actual) {
+    assertEquals("graph sizes differ", expected.length, actual.length);
+    for (int level = 0; level < expected.length; level++) {
+      for (int node = 0; node < expected[level].length; node++) {
+        assertArrayEquals("difference at ord=" + node, expected[level][node], actual[level][node]);
+      }
+    }
+  }
+
+  /**
+   * Return a naive representation of an HNSW graph as a 3 dimensional array: 1st dim represents a
+   * graph layer. Each layer contains an array of arrays – a list of nodes and for each node a list
+   * of the node's neighbours. 2nd dim represents a node on a layer, and contains the node's
+   * neighbourhood, or {@code null} if a node is not present on this layer. 3rd dim represents
+   * neighbours of a node.
+   */
+  private int[][][] getIndexedGraph(float[][] values, int mergePoint, long seed)
+      throws IOException {
     HnswGraphBuilder.randSeed = seed;
-    int[][] graph;
+    int[][][] graph;
     try (Directory dir = newDirectory()) {
       IndexWriterConfig iwc = newIndexWriterConfig();
       iwc.setMergePolicy(new LogDocMergePolicy()); // for predictable segment ordering when merging
@@ -186,8 +236,8 @@ public class TestKnnGraph extends LuceneTestCase {
       PerFieldKnnVectorsFormat.FieldsReader perFieldReader =
           (PerFieldKnnVectorsFormat.FieldsReader)
              ((CodecReader) getOnlyLeafReader(reader)).getVectorReader();
-      Lucene90HnswVectorsReader vectorReader =
-          (Lucene90HnswVectorsReader) perFieldReader.getFieldReader(KNN_GRAPH_FIELD);
+      Lucene91HnswVectorsReader vectorReader =
+          (Lucene91HnswVectorsReader) perFieldReader.getFieldReader(KNN_GRAPH_FIELD);
       graph = copyGraph(vectorReader.getGraphValues(KNN_GRAPH_FIELD));
     }
   }
@@ -208,18 +258,23 @@ public class TestKnnGraph extends LuceneTestCase {
     return values;
   }

-  int[][] copyGraph(KnnGraphValues values) throws IOException {
-    int size = values.size();
-    int[][] graph = new int[size][];
+  int[][][] copyGraph(KnnGraphValues graphValues) throws IOException {
+    int[][][] graph = new int[graphValues.numLevels()][][];
+    int size = graphValues.size();
     int[] scratch = new int[maxConn];
-    for (int node = 0; node < size; node++) {
-      int n, count = 0;
-      values.seek(node);
-      while ((n = values.nextNeighbor()) != NO_MORE_DOCS) {
-        scratch[count++] = n;
-        // graph[node][i++] = n;
-      }
-      graph[node] = ArrayUtil.copyOfSubArray(scratch, 0, count);
+    for (int level = 0; level < graphValues.numLevels(); level++) {
+      NodesIterator nodesItr = graphValues.getNodesOnLevel(level);
+      graph[level] = new int[size][];
+      while (nodesItr.hasNext()) {
+        int node = nodesItr.nextInt();
+        graphValues.seek(level, node);
+        int n, count = 0;
+        while ((n = graphValues.nextNeighbor()) != NO_MORE_DOCS) {
+          scratch[count++] = n;
+        }
+        graph[level][node] = ArrayUtil.copyOfSubArray(scratch, 0, count);
+      }
     }
     return graph;
   }
@@ -232,31 +287,7 @@ public class TestKnnGraph extends LuceneTestCase {
     config.setCodec(codec); // test is not compatible with simpletext
     try (Directory dir = newDirectory();
         IndexWriter iw = new IndexWriter(dir, config)) {
-      // Add a document for every cartesian point in an NxN square so we can
-      // easily know which are the nearest neighbors to every point. Insert by iterating
-      // using a prime number that is not a divisor of N*N so that we will hit each point once,
-      // and chosen so that points will be inserted in a deterministic
-      // but somewhat distributed pattern
-      int n = 5, stepSize = 17;
-      float[][] values = new float[n * n][];
-      int index = 0;
-      for (int i = 0; i < values.length; i++) {
-        // System.out.printf("%d: (%d, %d)\n", i, index % n, index / n);
-        int x = index % n, y = index / n;
-        values[i] = new float[] {x, y};
-        index = (index + stepSize) % (n * n);
-        add(iw, i, values[i]);
-        if (i == 13) {
-          // create 2 segments
-          iw.commit();
-        }
-      }
-      boolean forceMerge = random().nextBoolean();
-      // System.out.println("");
-      if (forceMerge) {
-        iw.forceMerge(1);
-      }
-      assertConsistentGraph(iw, values);
+      indexData(iw);
       try (DirectoryReader dr = DirectoryReader.open(iw)) {
         // results are ordered by score (descending) and docid (ascending);
         // This is the insertion order:
@@ -279,6 +310,77 @@ public class TestKnnGraph extends LuceneTestCase {
     }
   }

+  private void indexData(IndexWriter iw) throws IOException {
+    // Add a document for every cartesian point in an NxN square so we can
+    // easily know which are the nearest neighbors to every point. Insert by iterating
+    // using a prime number that is not a divisor of N*N so that we will hit each point once,
+    // and chosen so that points will be inserted in a deterministic
+    // but somewhat distributed pattern
+    int n = 5, stepSize = 17;
+    float[][] values = new float[n * n][];
+    int index = 0;
+    for (int i = 0; i < values.length; i++) {
+      // System.out.printf("%d: (%d, %d)\n", i, index % n, index / n);
+      int x = index % n, y = index / n;
+      values[i] = new float[] {x, y};
+      index = (index + stepSize) % (n * n);
+      add(iw, i, values[i]);
+      if (i == 13) {
+        // create 2 segments
+        iw.commit();
+      }
+    }
+    boolean forceMerge = random().nextBoolean();
+    if (forceMerge) {
+      iw.forceMerge(1);
+    }
+    assertConsistentGraph(iw, values);
+  }
+
+  public void testMultiThreadedSearch() throws Exception {
+    similarityFunction = VectorSimilarityFunction.EUCLIDEAN;
+    IndexWriterConfig config = newIndexWriterConfig();
+    config.setCodec(codec);
+    Directory dir = newDirectory();
+    IndexWriter iw = new IndexWriter(dir, config);
+    indexData(iw);
+
+    final SearcherManager manager = new SearcherManager(iw, new SearcherFactory());
+    Thread[] threads = new Thread[randomIntBetween(2, 5)];
+    final CountDownLatch latch = new CountDownLatch(1);
+    for (int i = 0; i < threads.length; i++) {
+      threads[i] =
+          new Thread(
+              () -> {
+                try {
+                  latch.await();
+                  IndexSearcher searcher = manager.acquire();
+                  try {
+                    KnnVectorQuery query = new KnnVectorQuery("vector", new float[] {0f, 0.1f}, 5);
+                    TopDocs results = searcher.search(query, 5);
+                    for (ScoreDoc doc : results.scoreDocs) {
+                      // map docId to insertion id
+                      doc.doc =
+                          Integer.parseInt(searcher.getIndexReader().document(doc.doc).get("id"));
+                    }
+                    assertResults(new int[] {0, 15, 3, 18, 5}, results);
+                  } finally {
+                    manager.release(searcher);
+                  }
+                } catch (Exception e) {
+                  throw new RuntimeException(e);
+                }
+              });
+      threads[i].start();
+    }
+
+    latch.countDown();
+    for (Thread t : threads) {
+      t.join();
+    }
+    IOUtils.close(manager, iw, dir);
+  }
+
   private void assertGraphSearch(int[] expected, float[] vector, IndexReader reader)
       throws IOException {
     TopDocs results = doKnnSearch(reader, vector, 5);
@@ -310,39 +412,40 @@ public class TestKnnGraph extends LuceneTestCase {
     }
   }

+  private void assertConsistentGraph(IndexWriter iw, float[][] values) throws IOException {
+    assertConsistentGraph(iw, values, KNN_GRAPH_FIELD);
+  }
+
   // For each leaf, verify that its graph nodes are 1-1 with vectors, that the vectors are the
-  // expected values,
-  // and that the graph is fully connected and symmetric.
+  // expected values, and that the graph is fully connected and symmetric.
   // NOTE: when we impose max-fanout on the graph it wil no longer be symmetric, but should still
   // be fully connected. Is there any other invariant we can test? Well, we can check that max
-  // fanout
-  // is respected. We can test *desirable* properties of the graph like small-world (the graph
-  // diameter
-  // should be tightly bounded).
-  private void assertConsistentGraph(IndexWriter iw, float[][] values) throws IOException {
-    int totalGraphDocs = 0;
+  // fanout is respected. We can test *desirable* properties of the graph like small-world
+  // (the graph diameter should be tightly bounded).
+  private void assertConsistentGraph(IndexWriter iw, float[][] values, String vectorField)
+      throws IOException {
+    int numDocsWithVectors = 0;
     try (DirectoryReader dr = DirectoryReader.open(iw)) {
       for (LeafReaderContext ctx : dr.leaves()) {
         LeafReader reader = ctx.reader();
-        VectorValues vectorValues = reader.getVectorValues(KNN_GRAPH_FIELD);
+        VectorValues vectorValues = reader.getVectorValues(vectorField);
         PerFieldKnnVectorsFormat.FieldsReader perFieldReader =
             (PerFieldKnnVectorsFormat.FieldsReader) ((CodecReader) reader).getVectorReader();
         if (perFieldReader == null) {
           continue;
         }
-        Lucene90HnswVectorsReader vectorReader =
-            (Lucene90HnswVectorsReader) perFieldReader.getFieldReader(KNN_GRAPH_FIELD);
-        KnnGraphValues graphValues = vectorReader.getGraphValues(KNN_GRAPH_FIELD);
-        assertEquals((vectorValues == null), (graphValues == null));
+        Lucene91HnswVectorsReader vectorReader =
+            (Lucene91HnswVectorsReader) perFieldReader.getFieldReader(vectorField);
+        KnnGraphValues graphValues = vectorReader.getGraphValues(vectorField);
         if (vectorValues == null) {
+          assert graphValues == null;
           continue;
         }
-        int[][] graph = new int[reader.maxDoc()][];
-        boolean foundOrphan = false;
-        int graphSize = 0;
+
+        // assert vector values:
+        // stored vector values are the same as original
         for (int i = 0; i < reader.maxDoc(); i++) {
           int nextDocWithVectors = vectorValues.advance(i);
-          // System.out.println("advanced to " + nextDocWithVectors);
           while (i < nextDocWithVectors && i < reader.maxDoc()) {
             int id = Integer.parseInt(reader.document(i).get("id"));
             assertNull("document " + id + " has no vector, but was expected to", values[id]);
@@ -352,7 +455,6 @@ public class TestKnnGraph extends LuceneTestCase {
             break;
           }
           int id = Integer.parseInt(reader.document(i).get("id"));
-          graphValues.seek(graphSize);
           // documents with KnnGraphValues have the expected vectors
           float[] scratch = vectorValues.vectorValue();
           assertArrayEquals(
@@ -360,54 +462,71 @@ public class TestKnnGraph extends LuceneTestCase {
               values[id],
               scratch,
               0f);
-          // We collect neighbors for analysis below
-          List<Integer> friends = new ArrayList<>();
-          int arc;
-          while ((arc = graphValues.nextNeighbor()) != NO_MORE_DOCS) {
-            friends.add(arc);
-          }
-          if (friends.size() == 0) {
-            // System.out.printf("knngraph @%d is singleton (advance returns %d)\n", i,
-            // nextWithNeighbors);
-            foundOrphan = true;
-          } else {
-            // NOTE: these friends are dense ordinals, not docIds.
-            int[] friendCopy = new int[friends.size()];
-            for (int j = 0; j < friends.size(); j++) {
-              friendCopy[j] = friends.get(j);
-            }
-            graph[graphSize] = friendCopy;
-            // System.out.printf("knngraph @%d => %s\n", i, Arrays.toString(graph[i]));
-          }
-          graphSize++;
+          numDocsWithVectors++;
         }
         assertEquals(NO_MORE_DOCS, vectorValues.nextDoc());
-        if (foundOrphan) {
-          assertEquals("graph is not fully connected", 1, graphSize);
-        } else {
-          assertTrue(
-              "Graph has " + graphSize + " nodes, but one of them has no neighbors", graphSize > 1);
-        }
-        if (maxConn > graphSize) {
-          // assert that the graph in each leaf is connected
-          assertConnected(graph);
-        } else {
-          // assert that max-connections was respected
-          assertMaxConn(graph, maxConn);
-        }
-        totalGraphDocs += graphSize;
+
+        // assert graph values:
+        // For each level of the graph assert that:
+        // 1. There are no orphan nodes without any friends
+        // 2. If orphans are found, than the level must contain only 0 or a single node
+        // 3. If the number of nodes on the level doesn't exceed maxConn, assert that the graph is
+        //    fully connected, i.e. any node is reachable from any other node.
+        // 4. If the number of nodes on the level exceeds maxConn, assert that maxConn is respected.
+        for (int level = 0; level < graphValues.numLevels(); level++) {
+          int[][] graphOnLevel = new int[graphValues.size()][];
+          int countOnLevel = 0;
+          boolean foundOrphan = false;
+          NodesIterator nodesItr = graphValues.getNodesOnLevel(level);
+          while (nodesItr.hasNext()) {
+            int node = nodesItr.nextInt();
+            graphValues.seek(level, node);
+            int arc;
+            List<Integer> friends = new ArrayList<>();
+            while ((arc = graphValues.nextNeighbor()) != NO_MORE_DOCS) {
+              friends.add(arc);
+            }
+            if (friends.size() == 0) {
+              foundOrphan = true;
+            } else {
+              int[] friendsCopy = new int[friends.size()];
+              Arrays.setAll(friendsCopy, friends::get);
+              graphOnLevel[node] = friendsCopy;
+            }
+            countOnLevel++;
+          }
+          assertEquals(nodesItr.size(), countOnLevel);
+          assertFalse("No nodes on level [" + level + "]", countOnLevel == 0);
+          if (countOnLevel == 1) {
+            assertTrue(
+                "Graph with 1 node has unexpected neighbors on level [" + level + "]", foundOrphan);
+          } else {
+            assertFalse(
+                "Graph has orphan nodes with no friends on level [" + level + "]", foundOrphan);
+            if (maxConn > countOnLevel) {
+              // assert that the graph is fully connected,
+              // i.e. any node can be reached from any other node
+              assertConnected(graphOnLevel);
+            } else {
+              // assert that max-connections was respected
+              assertMaxConn(graphOnLevel, maxConn);
+            }
+          }
+        }
       }
     }
-    int expectedCount = 0;
-    for (float[] friends : values) {
-      if (friends != null) {
-        ++expectedCount;
-      }
-    }
-    assertEquals(expectedCount, totalGraphDocs);
+
+    int expectedNumDocsWithVectors = 0;
+    for (float[] value : values) {
+      if (value != null) {
+        ++expectedNumDocsWithVectors;
+      }
+    }
+    assertEquals(expectedNumDocsWithVectors, numDocsWithVectors);
   }

-  private void assertMaxConn(int[][] graph, int maxConn) {
+  public static void assertMaxConn(int[][] graph, int maxConn) {
     for (int[] ints : graph) {
       if (ints != null) {
         assert (ints.length <= maxConn);
@@ -418,37 +537,36 @@ public class TestKnnGraph extends LuceneTestCase {
       }
     }
   }

-  private void assertConnected(int[][] graph) {
-    // every node in the graph is reachable from every other node
+  /** Assert that every node is reachable from some other node */
+  private static void assertConnected(int[][] graph) {
+    List<Integer> nodes = new ArrayList<>();
     Set<Integer> visited = new HashSet<>();
     List<Integer> queue = new LinkedList<>();
-    int count = 0;
-    for (int[] entry : graph) {
-      if (entry != null) {
-        if (queue.isEmpty()) {
-          queue.add(entry[0]); // start from any node
-        }
-        ++count;
+    for (int i = 0; i < graph.length; i++) {
+      if (graph[i] != null) {
+        nodes.add(i);
       }
     }
+
+    // start from any node
+    int startIdx = random().nextInt(nodes.size());
+    queue.add(nodes.get(startIdx));
     while (queue.isEmpty() == false) {
       int i = queue.remove(0);
       assertNotNull("expected neighbors of " + i, graph[i]);
       visited.add(i);
       for (int j : graph[i]) {
         if (visited.contains(j) == false) {
           queue.add(j);
         }
       }
     }
-    for (int i = 0; i < count; i++) {
-      assertTrue("Attempted to walk entire graph but never visited " + i, visited.contains(i));
+    // assert that every node is reachable from some other node as it was visited
+    for (int node : nodes) {
+      assertTrue(
+          "Attempted to walk entire graph but never visited node [" + node + "]",
+          visited.contains(node));
     }
-    // we visited each node exactly once
-    assertEquals(
-        "Attempted to walk entire graph but only visited " + visited.size(), count, visited.size());
   }

   private void add(IndexWriter iw, int id, float[] vector) throws IOException {
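
The tests read graphs back through the new level-aware KnnGraphValues API: numLevels(), getNodesOnLevel(level), and seek(level, node). For reference, a minimal standalone traversal in the same style, useful for eyeballing a graph's shape (illustrative only; printGraphShape is not part of this patch):

  // Illustrative sketch: walk every level of a KnnGraphValues and report its shape.
  static void printGraphShape(KnnGraphValues graph) throws IOException {
    for (int level = 0; level < graph.numLevels(); level++) {
      KnnGraphValues.NodesIterator nodes = graph.getNodesOnLevel(level);
      int nodeCount = 0;
      long edgeCount = 0;
      while (nodes.hasNext()) {
        int node = nodes.nextInt();
        graph.seek(level, node);
        while (graph.nextNeighbor() != DocIdSetIterator.NO_MORE_DOCS) {
          edgeCount++;
        }
        nodeCount++;
      }
      // upper levels should shrink roughly geometrically; level 0 holds every node
      System.out.printf("level %d: %d nodes, %d edges%n", level, nodeCount, edgeCount);
    }
  }
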
@@ -37,9 +37,9 @@ import java.util.Locale;
 import java.util.Set;
 import org.apache.lucene.codecs.KnnVectorsFormat;
 import org.apache.lucene.codecs.KnnVectorsReader;
-import org.apache.lucene.codecs.lucene90.Lucene90Codec;
-import org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat;
-import org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsReader;
+import org.apache.lucene.codecs.lucene91.Lucene91Codec;
+import org.apache.lucene.codecs.lucene91.Lucene91HnswVectorsFormat;
+import org.apache.lucene.codecs.lucene91.Lucene91HnswVectorsReader;
 import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.FieldType;
@@ -253,7 +253,7 @@ public class KnnGraphTester {
             ((PerFieldKnnVectorsFormat.FieldsReader) ((CodecReader) leafReader).getVectorReader())
                 .getFieldReader(KNN_FIELD);
         KnnGraphValues knnValues =
-            ((Lucene90HnswVectorsReader) vectorsReader).getGraphValues(KNN_FIELD);
+            ((Lucene91HnswVectorsReader) vectorsReader).getGraphValues(KNN_FIELD);
         System.out.printf("Leaf %d has %d documents\n", context.ord, leafReader.maxDoc());
         printGraphFanout(knnValues, leafReader.maxDoc());
       }
@@ -267,7 +267,7 @@ public class KnnGraphTester {
         new HnswGraphBuilder(vectors, similarityFunction, maxConn, beamWidth, 0);
     // start at node 1
     for (int i = 1; i < numDocs; i++) {
-      builder.addGraphNode(values.vectorValue(i));
+      builder.addGraphNode(i, values.vectorValue(i));
       System.out.println("\nITERATION " + i);
       dumpGraph(builder.hnsw);
     }
@@ -276,7 +276,7 @@ public class KnnGraphTester {

   private void dumpGraph(HnswGraph hnsw) {
     for (int i = 0; i < hnsw.size(); i++) {
-      NeighborArray neighbors = hnsw.getNeighbors(i);
+      NeighborArray neighbors = hnsw.getNeighbors(0, i);
       System.out.printf(Locale.ROOT, "%5d", i);
       NeighborArray sorted = new NeighborArray(neighbors.size());
       for (int j = 0; j < neighbors.size(); j++) {
@@ -308,7 +308,7 @@ public class KnnGraphTester {
     int count = 0;
     int[] leafHist = new int[numDocs];
     for (int node = 0; node < numDocs; node++) {
-      knnValues.seek(node);
+      knnValues.seek(0, node);
       int n = 0;
       while (knnValues.nextNeighbor() != NO_MORE_DOCS) {
         ++n;
@@ -580,10 +580,10 @@ public class KnnGraphTester {
   private int createIndex(Path docsPath, Path indexPath) throws IOException {
     IndexWriterConfig iwc = new IndexWriterConfig().setOpenMode(IndexWriterConfig.OpenMode.CREATE);
     iwc.setCodec(
-        new Lucene90Codec() {
+        new Lucene91Codec() {
           @Override
           public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
-            return new Lucene90HnswVectorsFormat(maxConn, beamWidth);
+            return new Lucene91HnswVectorsFormat(maxConn, beamWidth);
           }
         });
     // iwc.setMergePolicy(NoMergePolicy.INSTANCE);
@@ -24,11 +24,10 @@ import java.util.Arrays;
 import java.util.HashSet;
 import java.util.Random;
 import java.util.Set;
-import java.util.SplittableRandom;
 import org.apache.lucene.codecs.KnnVectorsFormat;
-import org.apache.lucene.codecs.lucene90.Lucene90Codec;
-import org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat;
-import org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsReader;
+import org.apache.lucene.codecs.lucene91.Lucene91Codec;
+import org.apache.lucene.codecs.lucene91.Lucene91HnswVectorsFormat;
+import org.apache.lucene.codecs.lucene91.Lucene91HnswVectorsReader;
 import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.KnnVectorField;
@@ -39,6 +38,7 @@ import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.KnnGraphValues;
+import org.apache.lucene.index.KnnGraphValues.NodesIterator;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.RandomAccessVectorValues;
 import org.apache.lucene.index.RandomAccessVectorValuesProducer;
@@ -81,10 +81,10 @@ public class TestHnswGraph extends LuceneTestCase {
     IndexWriterConfig iwc =
         new IndexWriterConfig()
             .setCodec(
-                new Lucene90Codec() {
+                new Lucene91Codec() {
                   @Override
                   public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
-                    return new Lucene90HnswVectorsFormat(maxConn, beamWidth);
+                    return new Lucene91HnswVectorsFormat(maxConn, beamWidth);
                   }
                 });
     try (IndexWriter iw = new IndexWriter(dir, iwc)) {
@@ -111,38 +111,64 @@ public class TestHnswGraph extends LuceneTestCase {
           assertEquals(indexedDoc, ctx.reader().numDocs());
           assertVectorsEqual(v3, values);
           KnnGraphValues graphValues =
-              ((Lucene90HnswVectorsReader)
+              ((Lucene91HnswVectorsReader)
                       ((PerFieldKnnVectorsFormat.FieldsReader)
                               ((CodecReader) ctx.reader()).getVectorReader())
                           .getFieldReader("field"))
                   .getGraphValues("field");
-          assertGraphEqual(hnsw, graphValues, nVec);
+          assertGraphEqual(hnsw, graphValues);
         }
       }
     }
   }
 
+  private void assertGraphEqual(KnnGraphValues g, KnnGraphValues h) throws IOException {
+    assertEquals("the number of levels in the graphs are different!", g.numLevels(), h.numLevels());
+    assertEquals("the number of nodes in the graphs are different!", g.size(), h.size());
+
+    // assert equal nodes on each level
+    for (int level = 0; level < g.numLevels(); level++) {
+      NodesIterator nodesOnLevel = g.getNodesOnLevel(level);
+      NodesIterator nodesOnLevel2 = h.getNodesOnLevel(level);
+      while (nodesOnLevel.hasNext() && nodesOnLevel2.hasNext()) {
+        int node = nodesOnLevel.nextInt();
+        int node2 = nodesOnLevel2.nextInt();
+        assertEquals("nodes in the graphs are different", node, node2);
+      }
+    }
+
+    // assert equal nodes' neighbours on each level
+    for (int level = 0; level < g.numLevels(); level++) {
+      NodesIterator nodesOnLevel = g.getNodesOnLevel(level);
+      while (nodesOnLevel.hasNext()) {
+        int node = nodesOnLevel.nextInt();
+        g.seek(level, node);
+        h.seek(level, node);
+        assertEquals("arcs differ for node " + node, getNeighborNodes(g), getNeighborNodes(h));
+      }
+    }
+  }
+
   // Make sure we actually approximately find the closest k elements. Mostly this is about
   // ensuring that we have all the distance functions, comparators, priority queues and so on
   // oriented in the right directions
   public void testAknnDiverse() throws IOException {
+    int maxConn = 10;
     int nDoc = 100;
     CircularVectorValues vectors = new CircularVectorValues(nDoc);
     HnswGraphBuilder builder =
         new HnswGraphBuilder(
-            vectors, VectorSimilarityFunction.DOT_PRODUCT, 16, 100, random().nextInt());
+            vectors, VectorSimilarityFunction.DOT_PRODUCT, maxConn, 100, random().nextInt());
     HnswGraph hnsw = builder.build(vectors);
     // run some searches
     NeighborQueue nn =
         HnswGraph.search(
             new float[] {1, 0},
             10,
-            10,
             vectors.randomAccess(),
             VectorSimilarityFunction.DOT_PRODUCT,
             hnsw,
-            null,
-            new SplittableRandom(random().nextLong()));
+            null);
 
     int[] nodes = nn.nodes();
     assertTrue("Number of found results is not equal to [10].", nodes.length == 10);
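
The new assertGraphEqual above makes two passes per level: one comparing node sets via getNodesOnLevel, one comparing arcs via seek(level, node). A small helper sketch built from the same NodesIterator API, with an illustrative method name:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.KnnGraphValues;
import org.apache.lucene.index.KnnGraphValues.NodesIterator;

// Sketch: materialize the node ordinals present on one level of a graph.
static List<Integer> nodesOnLevel(KnnGraphValues graph, int level) throws IOException {
  List<Integer> nodes = new ArrayList<>();
  NodesIterator it = graph.getNodesOnLevel(level);
  while (it.hasNext()) {
    nodes.add(it.nextInt());
  }
  return nodes;
}
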
@@ -155,7 +181,7 @@ public class TestHnswGraph extends LuceneTestCase {
     assertTrue("sum(result docs)=" + sum, sum < 75);
 
     for (int i = 0; i < nDoc; i++) {
-      NeighborArray neighbors = hnsw.getNeighbors(i);
+      NeighborArray neighbors = hnsw.getNeighbors(0, i);
       int[] nnodes = neighbors.node;
       for (int j = 0; j < neighbors.size(); j++) {
         // all neighbors should be valid node ids.
@@ -166,24 +192,22 @@ public class TestHnswGraph extends LuceneTestCase {
 
   public void testSearchWithAcceptOrds() throws IOException {
     int nDoc = 100;
+    int maxConn = 16;
     CircularVectorValues vectors = new CircularVectorValues(nDoc);
     HnswGraphBuilder builder =
         new HnswGraphBuilder(
-            vectors, VectorSimilarityFunction.DOT_PRODUCT, 16, 100, random().nextInt());
+            vectors, VectorSimilarityFunction.DOT_PRODUCT, maxConn, 100, random().nextInt());
     HnswGraph hnsw = builder.build(vectors);
 
     // the first 10 docs must not be deleted to ensure the expected recall
     Bits acceptOrds = createRandomAcceptOrds(10, vectors.size);
     NeighborQueue nn =
         HnswGraph.search(
             new float[] {1, 0},
             10,
-            10,
             vectors.randomAccess(),
             VectorSimilarityFunction.DOT_PRODUCT,
             hnsw,
-            acceptOrds,
-            new SplittableRandom(random().nextLong()));
+            acceptOrds);
     int[] nodes = nn.nodes();
     assertTrue("Number of found results is not equal to [10].", nodes.length == 10);
     int sum = 0;
@@ -213,12 +237,10 @@ public class TestHnswGraph extends LuceneTestCase {
         HnswGraph.search(
             new float[] {1, 0},
             10,
-            10,
             vectors.randomAccess(),
             VectorSimilarityFunction.EUCLIDEAN,
             hnsw,
-            acceptOrds,
-            new SplittableRandom(random().nextLong()));
+            acceptOrds);
     int[] nodes = nn.nodes();
     assertTrue("Number of found results is not equal to [10].", nodes.length == 10);
     int sum = 0;
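
Both acceptOrds hunks exercise the same filtering contract: search only returns ordinals for which the Bits instance is true, and passing null accepts everything. An illustrative filter sketch; the even-ordinal rule is made up for the example, and FixedBitSet is used here only as a convenient Bits implementation:

import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;

// Sketch: a Bits filter accepting only even ordinals, usable as acceptOrds.
static Bits evenOrdsOnly(int size) {
  FixedBitSet bits = new FixedBitSet(size);
  for (int ord = 0; ord < size; ord += 2) {
    bits.set(ord);
  }
  return bits;
}
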
@@ -295,46 +317,46 @@ public class TestHnswGraph extends LuceneTestCase {
             vectors, VectorSimilarityFunction.DOT_PRODUCT, 2, 10, random().nextInt());
     // node 0 is added by the builder constructor
     // builder.addGraphNode(vectors.vectorValue(0));
-    builder.addGraphNode(vectors.vectorValue(1));
-    builder.addGraphNode(vectors.vectorValue(2));
+    builder.addGraphNode(1, vectors.vectorValue(1));
+    builder.addGraphNode(2, vectors.vectorValue(2));
     // now every node has tried to attach every other node as a neighbor, but
     // some were excluded based on diversity check.
-    assertNeighbors(builder.hnsw, 0, 1, 2);
-    assertNeighbors(builder.hnsw, 1, 0);
-    assertNeighbors(builder.hnsw, 2, 0);
+    assertLevel0Neighbors(builder.hnsw, 0, 1, 2);
+    assertLevel0Neighbors(builder.hnsw, 1, 0);
+    assertLevel0Neighbors(builder.hnsw, 2, 0);
 
-    builder.addGraphNode(vectors.vectorValue(3));
-    assertNeighbors(builder.hnsw, 0, 1, 2);
+    builder.addGraphNode(3, vectors.vectorValue(3));
+    assertLevel0Neighbors(builder.hnsw, 0, 1, 2);
     // we added 3 here
-    assertNeighbors(builder.hnsw, 1, 0, 3);
-    assertNeighbors(builder.hnsw, 2, 0);
-    assertNeighbors(builder.hnsw, 3, 1);
+    assertLevel0Neighbors(builder.hnsw, 1, 0, 3);
+    assertLevel0Neighbors(builder.hnsw, 2, 0);
+    assertLevel0Neighbors(builder.hnsw, 3, 1);
 
     // supplant an existing neighbor
-    builder.addGraphNode(vectors.vectorValue(4));
+    builder.addGraphNode(4, vectors.vectorValue(4));
     // 4 is the same distance from 0 that 2 is; we leave the existing node in place
-    assertNeighbors(builder.hnsw, 0, 1, 2);
+    assertLevel0Neighbors(builder.hnsw, 0, 1, 2);
     // 4 is closer to 1 than either existing neighbor (0, 3). 3 fails diversity check with 4, so
     // replace it
-    assertNeighbors(builder.hnsw, 1, 0, 4);
-    assertNeighbors(builder.hnsw, 2, 0);
+    assertLevel0Neighbors(builder.hnsw, 1, 0, 4);
+    assertLevel0Neighbors(builder.hnsw, 2, 0);
     // 1 survives the diversity check
-    assertNeighbors(builder.hnsw, 3, 1, 4);
-    assertNeighbors(builder.hnsw, 4, 1, 3);
+    assertLevel0Neighbors(builder.hnsw, 3, 1, 4);
+    assertLevel0Neighbors(builder.hnsw, 4, 1, 3);
 
-    builder.addGraphNode(vectors.vectorValue(5));
-    assertNeighbors(builder.hnsw, 0, 1, 2);
-    assertNeighbors(builder.hnsw, 1, 0, 5);
-    assertNeighbors(builder.hnsw, 2, 0);
+    builder.addGraphNode(5, vectors.vectorValue(5));
+    assertLevel0Neighbors(builder.hnsw, 0, 1, 2);
+    assertLevel0Neighbors(builder.hnsw, 1, 0, 5);
+    assertLevel0Neighbors(builder.hnsw, 2, 0);
     // even though 5 is closer, 3 is not a neighbor of 5, so no update to *its* neighbors occurs
-    assertNeighbors(builder.hnsw, 3, 1, 4);
-    assertNeighbors(builder.hnsw, 4, 3, 5);
-    assertNeighbors(builder.hnsw, 5, 1, 4);
+    assertLevel0Neighbors(builder.hnsw, 3, 1, 4);
+    assertLevel0Neighbors(builder.hnsw, 4, 3, 5);
+    assertLevel0Neighbors(builder.hnsw, 5, 1, 4);
   }
 
-  private void assertNeighbors(HnswGraph graph, int node, int... expected) {
+  private void assertLevel0Neighbors(HnswGraph graph, int node, int... expected) {
     Arrays.sort(expected);
-    NeighborArray nn = graph.getNeighbors(node);
+    NeighborArray nn = graph.getNeighbors(0, node);
     int[] actual = ArrayUtil.copyOfSubArray(nn.node, 0, nn.size());
     Arrays.sort(actual);
     assertArrayEquals(
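
The hunk above is the builder-side view of the same API shift: nodes are now inserted with an explicit ordinal via addGraphNode(node, vector), while node 0 is still added by the builder constructor. A hedged sketch of incremental insertion; CircularVectorValues is the test's own helper type, and the loop bound is illustrative:

import java.io.IOException;
import org.apache.lucene.util.hnsw.HnswGraphBuilder;

// Sketch: insert nodes 1..nDoc-1 one at a time, mirroring the test above.
// Assumes `vectors` exposes vectorValue(int) like the test's CircularVectorValues.
static void addRemainingNodes(HnswGraphBuilder builder, CircularVectorValues vectors, int nDoc)
    throws IOException {
  for (int node = 1; node < nDoc; node++) {
    builder.addGraphNode(node, vectors.vectorValue(node));
  }
}
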
@@ -346,13 +368,14 @@ public class TestHnswGraph extends LuceneTestCase {
   public void testRandom() throws IOException {
     int size = atLeast(100);
     int dim = atLeast(10);
-    int topK = 5;
+    int maxConn = 10;
     RandomVectorValues vectors = new RandomVectorValues(size, dim, random());
     VectorSimilarityFunction similarityFunction =
         VectorSimilarityFunction.values()[
             random().nextInt(VectorSimilarityFunction.values().length - 1) + 1];
+    int topK = 5;
     HnswGraphBuilder builder =
-        new HnswGraphBuilder(vectors, similarityFunction, 10, 30, random().nextLong());
+        new HnswGraphBuilder(vectors, similarityFunction, maxConn, 30, random().nextLong());
     HnswGraph hnsw = builder.build(vectors);
     Bits acceptOrds = random().nextBoolean() ? null : createRandomAcceptOrds(0, size);
 
@@ -360,15 +383,7 @@ public class TestHnswGraph extends LuceneTestCase {
     for (int i = 0; i < 100; i++) {
       float[] query = randomVector(random(), dim);
       NeighborQueue actual =
-          HnswGraph.search(
-              query,
-              topK,
-              100,
-              vectors,
-              similarityFunction,
-              hnsw,
-              acceptOrds,
-              new SplittableRandom(random().nextLong()));
+          HnswGraph.search(query, topK, vectors, similarityFunction, hnsw, acceptOrds);
       NeighborQueue expected = new NeighborQueue(topK, similarityFunction.reversed);
       for (int j = 0; j < size; j++) {
         if (vectors.vectorValue(j) != null && (acceptOrds == null || acceptOrds.get(j))) {
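
The collapsed call above is the entire new search entry point: the extra fan-out argument and the SplittableRandom seed are gone, and the acceptOrds filter comes last. A hedged wrapper sketch; the util.hnsw package locations are assumed, and vectors must be the same values the graph was built from:

import java.io.IOException;
import org.apache.lucene.index.RandomAccessVectorValues;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.hnsw.HnswGraph;
import org.apache.lucene.util.hnsw.NeighborQueue;

// Sketch: top-k graph search with an optional liveness filter.
static int[] searchTopK(
    float[] query,
    int topK,
    RandomAccessVectorValues vectors,
    VectorSimilarityFunction similarity,
    HnswGraph graph,
    Bits acceptOrds) // null accepts all ordinals
    throws IOException {
  NeighborQueue results = HnswGraph.search(query, topK, vectors, similarity, graph, acceptOrds);
  return results.nodes();
}
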
@@ -383,7 +398,7 @@ public class TestHnswGraph extends LuceneTestCase {
     }
     double overlap = totalMatches / (double) (100 * topK);
     System.out.println("overlap=" + overlap + " totalMatches=" + totalMatches);
-    assertTrue("overlap=" + overlap, overlap > 0.9);
+    assertTrue("overlap=" + overlap, overlap > 0.8);
   }
 
   private int computeOverlap(int[] a, int[] b) {
@@ -487,14 +502,6 @@ public class TestHnswGraph extends LuceneTestCase {
     return value;
   }
 
-  private void assertGraphEqual(KnnGraphValues g, KnnGraphValues h, int size) throws IOException {
-    for (int node = 0; node < size; node++) {
-      g.seek(node);
-      h.seek(node);
-      assertEquals("arcs differ for node " + node, getNeighborNodes(g), getNeighborNodes(h));
-    }
-  }
-
   private Set<Integer> getNeighborNodes(KnnGraphValues g) throws IOException {
     Set<Integer> neighbors = new HashSet<>();
     for (int n = g.nextNeighbor(); n != NO_MORE_DOCS; n = g.nextNeighbor()) {
@@ -40,7 +40,7 @@ import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.PostingsFormat;
-import org.apache.lucene.codecs.lucene90.Lucene90Codec;
+import org.apache.lucene.codecs.lucene91.Lucene91Codec;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.IntPoint;
@@ -961,7 +961,7 @@ public class TestSuggestField extends LuceneTestCase {
     IndexWriterConfig iwc = newIndexWriterConfig(random(), analyzer);
     iwc.setMergePolicy(newLogMergePolicy());
     Codec filterCodec =
-        new Lucene90Codec() {
+        new Lucene91Codec() {
           CompletionPostingsFormat.FSTLoadMode fstLoadMode =
               RandomPicks.randomFrom(random(), CompletionPostingsFormat.FSTLoadMode.values());
           PostingsFormat postingsFormat = new Completion90PostingsFormat(fstLoadMode);
@@ -38,7 +38,7 @@ import java.util.TimeZone;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.PostingsFormat;
-import org.apache.lucene.codecs.lucene90.Lucene90Codec;
+import org.apache.lucene.codecs.lucene91.Lucene91Codec;
 import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.tests.codecs.asserting.AssertingCodec;
@@ -193,9 +193,9 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule {
     } else if ("Compressing".equals(TEST_CODEC)
         || ("random".equals(TEST_CODEC) && randomVal == 6 && !shouldAvoidCodec("Compressing"))) {
       codec = CompressingCodec.randomInstance(random);
-    } else if ("Lucene90".equals(TEST_CODEC)
-        || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene90"))) {
-      codec = new Lucene90Codec(RandomPicks.randomFrom(random, Lucene90Codec.Mode.values()));
+    } else if ("Lucene91".equals(TEST_CODEC)
+        || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene91"))) {
+      codec = new Lucene91Codec(RandomPicks.randomFrom(random, Lucene91Codec.Mode.values()));
     } else if (!"random".equals(TEST_CODEC)) {
       codec = Codec.forName(TEST_CODEC);
     } else if ("random".equals(TEST_POSTINGSFORMAT)) {
@@ -53,10 +53,10 @@ import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.KnnVectorsFormat;
 import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat;
-import org.apache.lucene.codecs.lucene90.Lucene90Codec;
 import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
-import org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat;
 import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat;
+import org.apache.lucene.codecs.lucene91.Lucene91Codec;
+import org.apache.lucene.codecs.lucene91.Lucene91HnswVectorsFormat;
 import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
 import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
 import org.apache.lucene.document.BinaryDocValuesField;
@@ -1236,7 +1236,7 @@ public final class TestUtil {
    * different than {@link Codec#getDefault()} because that is randomized.
    */
   public static Codec getDefaultCodec() {
-    return new Lucene90Codec();
+    return new Lucene91Codec();
   }
 
   /**
@@ -1322,7 +1322,7 @@ public final class TestUtil {
    * Lucene.
    */
   public static KnnVectorsFormat getDefaultKnnVectorsFormat() {
-    return new Lucene90HnswVectorsFormat();
+    return new Lucene91HnswVectorsFormat();
   }
 
   public static boolean anyFilesExceptWriteLock(Directory dir) throws IOException {