From bd360f9b3e9987357012c91a6c5aa6fca636bf23 Mon Sep 17 00:00:00 2001 From: Mayya Sharipova Date: Fri, 22 Jul 2022 10:04:10 -0400 Subject: [PATCH] Create Lucene94 Codec and move Lucene92 to backwards_codecs (#1041) --- .../backward-codecs/src/java/module-info.java | 7 +- .../lucene92/Lucene92Codec.java | 6 +- .../lucene92/Lucene92HnswVectorsFormat.java | 36 +- .../lucene92/Lucene92HnswVectorsReader.java | 2 +- .../lucene92/OffHeapVectorValues.java | 2 +- .../lucene92/package-info.java | 422 ++++++++++++++++ .../services/org.apache.lucene.codecs.Codec | 1 + .../org.apache.lucene.codecs.KnnVectorsFormat | 1 + .../lucene92/Lucene92HnswVectorsWriter.java | 4 +- .../lucene92/Lucene92RWCodec.java | 46 ++ .../lucene92/Lucene92RWHnswVectorsFormat.java | 52 ++ .../TestLucene92HnswVectorsFormat.java | 43 ++ .../byTask/tasks/CreateIndexTask.java | 4 +- lucene/core/src/java/module-info.java | 10 +- .../java/org/apache/lucene/codecs/Codec.java | 2 +- .../lucene/codecs/KnnVectorsFormat.java | 3 +- .../lucene/codecs/lucene94/Lucene94Codec.java | 217 ++++++++ .../lucene94/Lucene94HnswVectorsFormat.java | 167 +++++++ .../lucene94/Lucene94HnswVectorsReader.java | 463 ++++++++++++++++++ .../lucene94/Lucene94HnswVectorsWriter.java | 330 +++++++++++++ .../codecs/lucene94/OffHeapVectorValues.java | 321 ++++++++++++ .../{lucene92 => lucene94}/package-info.java | 8 +- .../services/org.apache.lucene.codecs.Codec | 2 +- .../org.apache.lucene.codecs.KnnVectorsFormat | 2 +- ...ne90StoredFieldsFormatHighCompression.java | 10 +- .../TestLucene94HnswVectorsFormat.java | 42 ++ .../org/apache/lucene/index/TestKnnGraph.java | 22 +- .../lucene/util/hnsw/KnnGraphTester.java | 12 +- .../lucene/util/hnsw/TestHnswGraph.java | 12 +- .../suggest/document/TestSuggestField.java | 4 +- .../util/TestRuleSetupAndRestoreClassEnv.java | 8 +- .../apache/lucene/tests/util/TestUtil.java | 8 +- 32 files changed, 2188 insertions(+), 81 deletions(-) rename lucene/{core/src/java/org/apache/lucene/codecs => 
backward-codecs/src/java/org/apache/lucene/backward_codecs}/lucene92/Lucene92Codec.java (97%) rename lucene/{core/src/java/org/apache/lucene/codecs => backward-codecs/src/java/org/apache/lucene/backward_codecs}/lucene92/Lucene92HnswVectorsFormat.java (90%) rename lucene/{core/src/java/org/apache/lucene/codecs => backward-codecs/src/java/org/apache/lucene/backward_codecs}/lucene92/Lucene92HnswVectorsReader.java (99%) rename lucene/{core/src/java/org/apache/lucene/codecs => backward-codecs/src/java/org/apache/lucene/backward_codecs}/lucene92/OffHeapVectorValues.java (99%) create mode 100644 lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/package-info.java rename lucene/{core/src/java/org/apache/lucene/codecs => backward-codecs/src/test/org/apache/lucene/backward_codecs}/lucene92/Lucene92HnswVectorsWriter.java (98%) create mode 100644 lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92RWCodec.java create mode 100644 lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92RWHnswVectorsFormat.java create mode 100644 lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/TestLucene92HnswVectorsFormat.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94Codec.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94HnswVectorsFormat.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94HnswVectorsReader.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94HnswVectorsWriter.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene94/OffHeapVectorValues.java rename lucene/core/src/java/org/apache/lucene/codecs/{lucene92 => lucene94}/package-info.java (99%) create mode 100644 lucene/core/src/test/org/apache/lucene/codecs/lucene94/TestLucene94HnswVectorsFormat.java diff --git 
a/lucene/backward-codecs/src/java/module-info.java b/lucene/backward-codecs/src/java/module-info.java index c3ae5ef4159..3a4fc7765a6 100644 --- a/lucene/backward-codecs/src/java/module-info.java +++ b/lucene/backward-codecs/src/java/module-info.java @@ -31,6 +31,7 @@ module org.apache.lucene.backward_codecs { exports org.apache.lucene.backward_codecs.lucene87; exports org.apache.lucene.backward_codecs.lucene90; exports org.apache.lucene.backward_codecs.lucene91; + exports org.apache.lucene.backward_codecs.lucene92; exports org.apache.lucene.backward_codecs.packed; exports org.apache.lucene.backward_codecs.store; @@ -41,12 +42,14 @@ module org.apache.lucene.backward_codecs { org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat; provides org.apache.lucene.codecs.KnnVectorsFormat with org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat, - org.apache.lucene.backward_codecs.lucene91.Lucene91HnswVectorsFormat; + org.apache.lucene.backward_codecs.lucene91.Lucene91HnswVectorsFormat, + org.apache.lucene.backward_codecs.lucene92.Lucene92HnswVectorsFormat; provides org.apache.lucene.codecs.Codec with org.apache.lucene.backward_codecs.lucene80.Lucene80Codec, org.apache.lucene.backward_codecs.lucene84.Lucene84Codec, org.apache.lucene.backward_codecs.lucene86.Lucene86Codec, org.apache.lucene.backward_codecs.lucene87.Lucene87Codec, org.apache.lucene.backward_codecs.lucene90.Lucene90Codec, - org.apache.lucene.backward_codecs.lucene91.Lucene91Codec; + org.apache.lucene.backward_codecs.lucene91.Lucene91Codec, + org.apache.lucene.backward_codecs.lucene92.Lucene92Codec; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene92/Lucene92Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/Lucene92Codec.java similarity index 97% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene92/Lucene92Codec.java rename to 
lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/Lucene92Codec.java index 5264295a902..f8245d8b3ea 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene92/Lucene92Codec.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/Lucene92Codec.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene92; +package org.apache.lucene.backward_codecs.lucene92; import java.util.Objects; import org.apache.lucene.codecs.Codec; @@ -49,7 +49,7 @@ import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; * *

If you want to reuse functionality of this codec in another codec, extend {@link FilterCodec}. * - * @see org.apache.lucene.codecs.lucene92 package documentation for file format details. + * @see org.apache.lucene.backward_codecs.lucene92 package documentation for file format details. * @lucene.experimental */ public class Lucene92Codec extends Codec { @@ -164,7 +164,7 @@ public class Lucene92Codec extends Codec { } @Override - public final KnnVectorsFormat knnVectorsFormat() { + public KnnVectorsFormat knnVectorsFormat() { return knnVectorsFormat; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene92/Lucene92HnswVectorsFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsFormat.java similarity index 90% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene92/Lucene92HnswVectorsFormat.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsFormat.java index 3b28b706890..a3efe347568 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene92/Lucene92HnswVectorsFormat.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsFormat.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.lucene.codecs.lucene92; +package org.apache.lucene.backward_codecs.lucene92; import java.io.IOException; import org.apache.lucene.codecs.KnnVectorsFormat; @@ -95,7 +95,15 @@ import org.apache.lucene.util.hnsw.HnswGraph; * * @lucene.experimental */ -public final class Lucene92HnswVectorsFormat extends KnnVectorsFormat { +public class Lucene92HnswVectorsFormat extends KnnVectorsFormat { + + /** Default number of maximum connections per node */ + public static final int DEFAULT_MAX_CONN = 16; + + /** + * Default number of the size of the queue maintained while searching during a graph construction. 
+ */ + public static final int DEFAULT_BEAM_WIDTH = 100; static final String META_CODEC_NAME = "lucene92HnswVectorsFormatMeta"; static final String VECTOR_DATA_CODEC_NAME = "lucene92HnswVectorsFormatData"; @@ -107,32 +115,24 @@ public final class Lucene92HnswVectorsFormat extends KnnVectorsFormat { static final int VERSION_START = 0; static final int VERSION_CURRENT = VERSION_START; - /** Default number of maximum connections per node */ - public static final int DEFAULT_MAX_CONN = 16; - /** - * Default number of the size of the queue maintained while searching during a graph construction. - */ - public static final int DEFAULT_BEAM_WIDTH = 100; - - static final int DIRECT_MONOTONIC_BLOCK_SHIFT = 16; - /** * Controls how many of the nearest neighbor candidates are connected to the new node. Defaults to - * {@link Lucene92HnswVectorsFormat#DEFAULT_MAX_CONN}. See {@link HnswGraph} for more details. + * {@link #DEFAULT_MAX_CONN}. See {@link HnswGraph} for more details. */ - private final int maxConn; + final int maxConn; /** * The number of candidate neighbors to track while searching the graph for each newly inserted - * node. Defaults to to {@link Lucene92HnswVectorsFormat#DEFAULT_BEAM_WIDTH}. See {@link - * HnswGraph} for details. + * node. Defaults to {@link #DEFAULT_BEAM_WIDTH}. See {@link HnswGraph} for details. 
*/ - private final int beamWidth; + final int beamWidth; + /** A constructor for vectors format with default parameters */ public Lucene92HnswVectorsFormat() { this(DEFAULT_MAX_CONN, DEFAULT_BEAM_WIDTH); } + /** Constructs a format for reading old indexes */ public Lucene92HnswVectorsFormat(int maxConn, int beamWidth) { super("lucene92HnswVectorsFormat"); this.maxConn = maxConn; @@ -141,7 +141,7 @@ public final class Lucene92HnswVectorsFormat extends KnnVectorsFormat { @Override public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException { - return new Lucene92HnswVectorsWriter(state, maxConn, beamWidth); + throw new UnsupportedOperationException("Old codecs may only be used for reading"); } @Override @@ -151,7 +151,7 @@ public final class Lucene92HnswVectorsFormat extends KnnVectorsFormat { @Override public String toString() { - return "lucene92HnswVectorsFormat(name = lucene92HnswVectorsFormat, maxConn = " + return "Lucene92HnswVectorsFormat(name = Lucene92HnswVectorsFormat, maxConn = " + maxConn + ", beamWidth=" + beamWidth diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene92/Lucene92HnswVectorsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsReader.java similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene92/Lucene92HnswVectorsReader.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsReader.java index 9628f82d6cb..1b6534f9876 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene92/Lucene92HnswVectorsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsReader.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.lucene.codecs.lucene92; +package org.apache.lucene.backward_codecs.lucene92; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene92/OffHeapVectorValues.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapVectorValues.java similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene92/OffHeapVectorValues.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapVectorValues.java index 199df605fe0..694903c2147 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene92/OffHeapVectorValues.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapVectorValues.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.lucene.codecs.lucene92; +package org.apache.lucene.backward_codecs.lucene92; import java.io.IOException; import java.nio.ByteBuffer; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/package-info.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/package-info.java new file mode 100644 index 00000000000..cb06d56fd71 --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/package-info.java @@ -0,0 +1,422 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Lucene 9.2 file format. + * + *

Apache Lucene - Index File Formats

+ * + *
+ * + * + * + *
+ * + *

Introduction

+ * + *
+ * + *

This document defines the index file formats used in this version of Lucene. If you are using + * a different version of Lucene, please consult the copy of docs/ that was distributed + * with the version you are using. + * + *

This document attempts to provide a high-level definition of the Apache Lucene file formats. + *

+ * + *

Definitions

+ * + *
+ * + *

The fundamental concepts in Lucene are index, document, field and term. + * + *

An index contains a sequence of documents. + * + *

+ * + *

The same sequence of bytes in two different fields is considered a different term. Thus terms + * are represented as a pair: the string naming the field, and the bytes within the field. + * + *

Inverted Indexing

+ * + *

Lucene's index stores terms and statistics about those terms in order to make term-based + * search more efficient. Lucene's terms index falls into the family of indexes known as an + * inverted index. This is because it can list, for a term, the documents that contain it. + * This is the inverse of the natural relationship, in which documents list terms. + * + *

Types of Fields

+ * + *

In Lucene, fields may be stored, in which case their text is stored in the index + * literally, in a non-inverted manner. Fields that are inverted are called indexed. A field + * may be both stored and indexed. + * + *

The text of a field may be tokenized into terms to be indexed, or the text of a field + * may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is + * useful for certain identifier fields to be indexed literally. + * + *

See the {@link org.apache.lucene.document.Field Field} java docs for more information on + * Fields. + * + *

Segments

+ * + *

Lucene indexes may be composed of multiple sub-indexes, or segments. Each segment is a + * fully independent index, which could be searched separately. Indexes evolve by: + * + *

    + *
  1. Creating new segments for newly added documents. + *
  2. Merging existing segments. + *
+ * + *

Searches may involve multiple segments and/or multiple indexes, each index potentially + * composed of a set of segments. + * + *

Document Numbers

+ * + *

Internally, Lucene refers to documents by an integer document number. The first + * document added to an index is numbered zero, and each subsequent document added gets a number one + * greater than the previous. + * + *

Note that a document's number may change, so caution should be taken when storing these + * numbers outside of Lucene. In particular, numbers may change in the following situations: + * + *

+ * + *
+ * + *

Index Structure Overview

+ * + *
+ * + *

Each segment index maintains the following: + * + *

+ * + *

Details on each of these are provided in their linked pages.

+ * + *

File Naming

+ * + *
+ * + *

All files belonging to a segment have the same name with varying extensions. The extensions + * correspond to the different file formats described below. When using the Compound File format + * (default for small segments) these files (except for the Segment info file, the Lock file, and + * Deleted documents file) are collapsed into a single .cfs file (see below for details) + * + *

Typically, all segments in an index are stored in a single directory, although this is not + * required. + * + *

File names are never re-used. That is, when any file is saved to the Directory it is given a + * never before used filename. This is achieved using a simple generations approach. For example, + * the first segments file is segments_1, then segments_2, etc. The generation is a sequential long + * integer represented in alpha-numeric (base 36) form.

+ * + *

Summary of File Extensions

+ * + *
+ * + *

The following table summarizes the names and extensions of the files in Lucene: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
lucene filenames by extension
NameExtensionBrief Description
{@link org.apache.lucene.index.SegmentInfos Segments File}segments_NStores information about a commit point
Lock Filewrite.lockThe Write lock prevents multiple IndexWriters from writing to the same + * file.
{@link org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat Segment Info}.siStores metadata about a segment
{@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File}.cfs, .cfeAn optional "virtual" file consisting of all the other index files for + * systems that frequently run out of file handles.
{@link org.apache.lucene.codecs.lucene90.Lucene90FieldInfosFormat Fields}.fnmStores information about the fields
{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index}.fdxContains pointers to field data
{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data}.fdtThe stored fields for documents
{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Dictionary}.timThe term dictionary, stores term info
{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Index}.tipThe index into the Term Dictionary
{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Frequencies}.docContains the list of docs which contain each term along with frequency
{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Positions}.posStores position information about where a term occurs in the index
{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Payloads}.payStores additional per-position metadata information such as character offsets and user payloads
{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms}.nvd, .nvmEncodes length and boost factors for docs and fields
{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values}.dvd, .dvmEncodes additional scoring factors or other per-document information.
{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}.tvxStores offset into the document data file
{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}.tvdContains term vector data.
{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents}.livInfo about what documents are live
{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}.dii, .dimHolds indexed points
{@link org.apache.lucene.backward_codecs.lucene92.Lucene92HnswVectorsFormat Vector values}.vec, .vemHolds indexed vectors; .vec files contain the raw vector data, and + * .vem the vector metadata
+ * + *

+ * + *

Lock File

+ * + * The write lock, which is stored in the index directory by default, is named "write.lock". If the + * lock directory is different from the index directory then the write lock will be named + * "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index + * directory. When this file is present, a writer is currently modifying the index (adding or + * removing documents). This lock file ensures that only one writer is modifying the index at a + * time. + * + *

History

+ * + *

Compatibility notes are provided in this document, describing how file formats have changed + * from prior versions: + * + *

+ * + * + * + *

Limitations

+ * + *
+ * + *

Lucene uses a Java int to refer to document numbers, and the index file format + * uses an Int32 on-disk to store document numbers. This is a limitation of both the + * index file format and the current implementation. Eventually these should be replaced with either + * UInt64 values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt + * VInt} values which have no limit.

+ */ +package org.apache.lucene.backward_codecs.lucene92; diff --git a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index 74957355697..d9a3ef60880 100644 --- a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -19,3 +19,4 @@ org.apache.lucene.backward_codecs.lucene86.Lucene86Codec org.apache.lucene.backward_codecs.lucene87.Lucene87Codec org.apache.lucene.backward_codecs.lucene90.Lucene90Codec org.apache.lucene.backward_codecs.lucene91.Lucene91Codec +org.apache.lucene.backward_codecs.lucene92.Lucene92Codec diff --git a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat index 550912cbf86..037584b0318 100644 --- a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat +++ b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat @@ -15,3 +15,4 @@ org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat org.apache.lucene.backward_codecs.lucene91.Lucene91HnswVectorsFormat +org.apache.lucene.backward_codecs.lucene92.Lucene92HnswVectorsFormat diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene92/Lucene92HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsWriter.java similarity index 98% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene92/Lucene92HnswVectorsWriter.java rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsWriter.java index e63ab2ce277..98e397084a8 100644 --- 
a/lucene/core/src/java/org/apache/lucene/codecs/lucene92/Lucene92HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsWriter.java @@ -15,9 +15,9 @@ * limitations under the License. */ -package org.apache.lucene.codecs.lucene92; +package org.apache.lucene.backward_codecs.lucene92; -import static org.apache.lucene.codecs.lucene92.Lucene92HnswVectorsFormat.DIRECT_MONOTONIC_BLOCK_SHIFT; +import static org.apache.lucene.backward_codecs.lucene92.Lucene92RWHnswVectorsFormat.DIRECT_MONOTONIC_BLOCK_SHIFT; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import java.io.IOException; diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92RWCodec.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92RWCodec.java new file mode 100644 index 00000000000..65ca5257d63 --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92RWCodec.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.backward_codecs.lucene92; + +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; + +/** Implements the Lucene 9.2 index format for backwards compat testing */ +public class Lucene92RWCodec extends Lucene92Codec { + + private final KnnVectorsFormat defaultKnnVectorsFormat; + private final KnnVectorsFormat knnVectorsFormat = + new PerFieldKnnVectorsFormat() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return defaultKnnVectorsFormat; + } + }; + + /** Instantiates a new codec. */ + public Lucene92RWCodec() { + defaultKnnVectorsFormat = + new Lucene92RWHnswVectorsFormat( + Lucene92HnswVectorsFormat.DEFAULT_MAX_CONN, + Lucene92HnswVectorsFormat.DEFAULT_BEAM_WIDTH); + } + + @Override + public final KnnVectorsFormat knnVectorsFormat() { + return knnVectorsFormat; + } +} diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92RWHnswVectorsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92RWHnswVectorsFormat.java new file mode 100644 index 00000000000..3b72fa3fe9a --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92RWHnswVectorsFormat.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.backward_codecs.lucene92; + +import java.io.IOException; +import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.codecs.KnnVectorsWriter; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; + +public final class Lucene92RWHnswVectorsFormat extends Lucene92HnswVectorsFormat { + + static final int DIRECT_MONOTONIC_BLOCK_SHIFT = 16; + + public Lucene92RWHnswVectorsFormat(int maxConn, int beamWidth) { + super(maxConn, beamWidth); + } + + @Override + public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException { + return new Lucene92HnswVectorsWriter(state, maxConn, beamWidth); + } + + @Override + public KnnVectorsReader fieldsReader(SegmentReadState state) throws IOException { + return new Lucene92HnswVectorsReader(state); + } + + @Override + public String toString() { + return "Lucene92RWHnswVectorsFormat(name = Lucene92RWHnswVectorsFormat, maxConn = " + + maxConn + + ", beamWidth=" + + beamWidth + + ")"; + } +} diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/TestLucene92HnswVectorsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/TestLucene92HnswVectorsFormat.java new file mode 100644 index 00000000000..3984bb36619 --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/TestLucene92HnswVectorsFormat.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.backward_codecs.lucene92; + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; + +public class TestLucene92HnswVectorsFormat extends BaseKnnVectorsFormatTestCase { + @Override + protected Codec getCodec() { + return new Lucene92RWCodec(); + } + + public void testToString() { + Codec customCodec = + new Lucene92RWCodec() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return new Lucene92RWHnswVectorsFormat(10, 20); + } + }; + String expectedString = + "Lucene92RWHnswVectorsFormat(name = Lucene92RWHnswVectorsFormat, maxConn = 10, beamWidth=20)"; + assertEquals( + expectedString, + ((Lucene92Codec) customCodec).getKnnVectorsFormatForField("bogus_field").toString()); + } +} diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java index e21fabe9e5f..243ade6fe04 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java +++ 
b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java @@ -27,7 +27,7 @@ import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene92.Lucene92Codec; +import org.apache.lucene.codecs.lucene94.Lucene94Codec; import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.IndexCommit; import org.apache.lucene.index.IndexDeletionPolicy; @@ -152,7 +152,7 @@ public class CreateIndexTask extends PerfTask { try { final PostingsFormat postingsFormatChosen = PostingsFormat.forName(postingsFormat); iwConf.setCodec( - new Lucene92Codec() { + new Lucene94Codec() { @Override public PostingsFormat getPostingsFormatForField(String field) { return postingsFormatChosen; diff --git a/lucene/core/src/java/module-info.java b/lucene/core/src/java/module-info.java index cbb5c3a3147..fd1a4935250 100644 --- a/lucene/core/src/java/module-info.java +++ b/lucene/core/src/java/module-info.java @@ -15,8 +15,8 @@ * limitations under the License. */ -import org.apache.lucene.codecs.lucene92.Lucene92Codec; -import org.apache.lucene.codecs.lucene92.Lucene92HnswVectorsFormat; +import org.apache.lucene.codecs.lucene94.Lucene94Codec; +import org.apache.lucene.codecs.lucene94.Lucene94HnswVectorsFormat; /** Lucene Core. */ @SuppressWarnings("module") // the test framework is compiled after the core... 
@@ -31,7 +31,7 @@ module org.apache.lucene.core { exports org.apache.lucene.codecs; exports org.apache.lucene.codecs.compressing; exports org.apache.lucene.codecs.lucene90; - exports org.apache.lucene.codecs.lucene92; + exports org.apache.lucene.codecs.lucene94; exports org.apache.lucene.codecs.lucene90.blocktree; exports org.apache.lucene.codecs.lucene90.compressing; exports org.apache.lucene.codecs.perfield; @@ -63,11 +63,11 @@ module org.apache.lucene.core { provides org.apache.lucene.analysis.TokenizerFactory with org.apache.lucene.analysis.standard.StandardTokenizerFactory; provides org.apache.lucene.codecs.Codec with - Lucene92Codec; + Lucene94Codec; provides org.apache.lucene.codecs.DocValuesFormat with org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; provides org.apache.lucene.codecs.KnnVectorsFormat with - Lucene92HnswVectorsFormat; + Lucene94HnswVectorsFormat; provides org.apache.lucene.codecs.PostingsFormat with org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat; provides org.apache.lucene.index.SortFieldProvider with diff --git a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java index bb75a5bca40..1192a45c388 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java @@ -55,7 +55,7 @@ public abstract class Codec implements NamedSPILoader.NamedSPI { return LOADER; } - static Codec defaultCodec = LOADER.lookup("Lucene92"); + static Codec defaultCodec = LOADER.lookup("Lucene94"); } private final String name; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsFormat.java index 3e1903874c5..945b213b034 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsFormat.java @@ -84,8 +84,7 @@ public abstract class KnnVectorsFormat 
implements NamedSPILoader.NamedSPI { new KnnVectorsFormat("EMPTY") { @Override public KnnVectorsWriter fieldsWriter(SegmentWriteState state) { - throw new UnsupportedOperationException( - "Attempt to write EMPTY VectorValues: maybe you forgot to use codec=Lucene92"); + throw new UnsupportedOperationException("Attempt to write EMPTY VectorValues"); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94Codec.java new file mode 100644 index 00000000000..5a4e308be48 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94Codec.java @@ -0,0 +1,217 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.lucene.codecs.lucene94;
+
+import java.util.Objects;
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.CompoundFormat;
+import org.apache.lucene.codecs.DocValuesFormat;
+import org.apache.lucene.codecs.FieldInfosFormat;
+import org.apache.lucene.codecs.FilterCodec;
+import org.apache.lucene.codecs.KnnVectorsFormat;
+import org.apache.lucene.codecs.LiveDocsFormat;
+import org.apache.lucene.codecs.NormsFormat;
+import org.apache.lucene.codecs.PointsFormat;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.SegmentInfoFormat;
+import org.apache.lucene.codecs.StoredFieldsFormat;
+import org.apache.lucene.codecs.TermVectorsFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90FieldInfosFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
+import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
+import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
+import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
+
+/**
+ * Implements the Lucene 9.4 index format.
+ *
+ * <p>If you want to reuse functionality of this codec in another codec, extend {@link FilterCodec}.
+ *
+ * <p>Postings, doc values and KNN vectors are resolved per field through the overridable {@code
+ * get*FormatForField} methods; every other format accessor is {@code final}.
+ *
+ * @see org.apache.lucene.codecs.lucene94 package documentation for file format details.
+ * @lucene.experimental
+ */
+public class Lucene94Codec extends Codec {
+
+  /** Configuration option for the codec. */
+  public enum Mode {
+    /** Trade compression ratio for retrieval speed. */
+    BEST_SPEED(Lucene90StoredFieldsFormat.Mode.BEST_SPEED),
+    /** Trade retrieval speed for compression ratio. */
+    BEST_COMPRESSION(Lucene90StoredFieldsFormat.Mode.BEST_COMPRESSION);
+
+    private final Lucene90StoredFieldsFormat.Mode storedMode; // stored-fields mode this option maps to
+
+    private Mode(Lucene90StoredFieldsFormat.Mode storedMode) {
+      this.storedMode = Objects.requireNonNull(storedMode);
+    }
+  }
+
+  private final TermVectorsFormat vectorsFormat = new Lucene90TermVectorsFormat();
+  private final FieldInfosFormat fieldInfosFormat = new Lucene90FieldInfosFormat();
+  private final SegmentInfoFormat segmentInfosFormat = new Lucene90SegmentInfoFormat();
+  private final LiveDocsFormat liveDocsFormat = new Lucene90LiveDocsFormat();
+  private final CompoundFormat compoundFormat = new Lucene90CompoundFormat();
+  private final NormsFormat normsFormat = new Lucene90NormsFormat();
+
+  private final PostingsFormat defaultPostingsFormat; // assigned in the constructor
+  private final PostingsFormat postingsFormat =
+      new PerFieldPostingsFormat() {
+        @Override
+        public PostingsFormat getPostingsFormatForField(String field) {
+          return Lucene94Codec.this.getPostingsFormatForField(field); // delegate to the codec's hook
+        }
+      };
+
+  private final DocValuesFormat defaultDVFormat; // assigned in the constructor
+  private final DocValuesFormat docValuesFormat =
+      new PerFieldDocValuesFormat() {
+        @Override
+        public DocValuesFormat getDocValuesFormatForField(String field) {
+          return Lucene94Codec.this.getDocValuesFormatForField(field); // delegate to the codec's hook
+        }
+      };
+
+  private final KnnVectorsFormat defaultKnnVectorsFormat; // assigned in the constructor
+  private final KnnVectorsFormat knnVectorsFormat =
+      new PerFieldKnnVectorsFormat() {
+        @Override
+        public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
+          return Lucene94Codec.this.getKnnVectorsFormatForField(field); // delegate to the codec's hook
+        }
+      };
+
+  private final StoredFieldsFormat storedFieldsFormat; // built from the Mode passed to the constructor
+
+  /** Instantiates a new codec using {@link Mode#BEST_SPEED}. */
+  public Lucene94Codec() {
+    this(Mode.BEST_SPEED);
+  }
+
+  /**
+   * Instantiates a new codec, specifying the stored fields compression mode to use.
+   *
+   * @param mode stored fields compression mode to use for newly flushed/merged segments.
+   * @throws NullPointerException if {@code mode} is {@code null}
+   */
+  public Lucene94Codec(Mode mode) {
+    super("Lucene94");
+    this.storedFieldsFormat =
+        new Lucene90StoredFieldsFormat(Objects.requireNonNull(mode).storedMode);
+    this.defaultPostingsFormat = new Lucene90PostingsFormat();
+    this.defaultDVFormat = new Lucene90DocValuesFormat();
+    this.defaultKnnVectorsFormat = new Lucene94HnswVectorsFormat();
+  }
+
+  @Override
+  public final StoredFieldsFormat storedFieldsFormat() {
+    return storedFieldsFormat;
+  }
+
+  @Override
+  public final TermVectorsFormat termVectorsFormat() {
+    return vectorsFormat;
+  }
+
+  @Override
+  public final PostingsFormat postingsFormat() {
+    return postingsFormat; // the per-field wrapper, not the default format
+  }
+
+  @Override
+  public final FieldInfosFormat fieldInfosFormat() {
+    return fieldInfosFormat;
+  }
+
+  @Override
+  public final SegmentInfoFormat segmentInfoFormat() {
+    return segmentInfosFormat;
+  }
+
+  @Override
+  public final LiveDocsFormat liveDocsFormat() {
+    return liveDocsFormat;
+  }
+
+  @Override
+  public final CompoundFormat compoundFormat() {
+    return compoundFormat;
+  }
+
+  @Override
+  public final PointsFormat pointsFormat() {
+    return new Lucene90PointsFormat(); // unlike the other accessors, a new instance per call
+  }
+
+  @Override
+  public final KnnVectorsFormat knnVectorsFormat() {
+    return knnVectorsFormat; // the per-field wrapper, not the default format
+  }
+
+  /**
+   * Returns the postings format that should be used for writing new segments of {@code field}.
+   *
+   * <p>The default implementation always returns "Lucene90".
+   *
+   * <p>WARNING: if you subclass, you are responsible for index backwards compatibility:
+   * future versions of Lucene are only guaranteed to be able to read the default implementation.
+   */
+  public PostingsFormat getPostingsFormatForField(String field) {
+    return defaultPostingsFormat;
+  }
+
+  /**
+   * Returns the docvalues format that should be used for writing new segments of {@code field}.
+   *
+   * <p>The default implementation always returns "Lucene90".
+   *
+   * <p>WARNING: if you subclass, you are responsible for index backwards compatibility:
+   * future versions of Lucene are only guaranteed to be able to read the default implementation.
+   */
+  public DocValuesFormat getDocValuesFormatForField(String field) {
+    return defaultDVFormat;
+  }
+
+  /**
+   * Returns the vectors format that should be used for writing new segments of {@code field}.
+   *
+   * <p>The default implementation always returns a {@link Lucene94HnswVectorsFormat} built with
+   * its default parameters.
+   *
+   * <p>WARNING: if you subclass, you are responsible for index backwards compatibility:
+   * future versions of Lucene are only guaranteed to be able to read the default implementation.
+   */
+  public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
+    return defaultKnnVectorsFormat;
+  }
+
+  @Override
+  public final DocValuesFormat docValuesFormat() {
+    return docValuesFormat; // the per-field wrapper, not the default format
+  }
+
+  @Override
+  public final NormsFormat normsFormat() {
+    return normsFormat;
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94HnswVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94HnswVectorsFormat.java
new file mode 100644
index 00000000000..c6210e407e7
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94HnswVectorsFormat.java
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.lucene.codecs.lucene94; + +import java.io.IOException; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.codecs.KnnVectorsWriter; +import org.apache.lucene.codecs.lucene90.IndexedDISI; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.hnsw.HnswGraph; + +/** + * Lucene 9.4 vector format, which encodes numeric vector values and an optional associated graph + * connecting the documents having values. The graph is used to power HNSW search. The format + * consists of three files: + * + *

.vec (vector data) file

+ * + *

For each field: + * + *

+ * + *

.vex (vector index)

+ * + *

Stores graphs connecting the documents for each field organized as a list of nodes' neighbours + * as following: + * + *

+ * + *

.vem (vector metadata) file

+ * + *

For each field: + * + *

+ *
+ * @lucene.experimental
+ */
+public final class Lucene94HnswVectorsFormat extends KnnVectorsFormat {
+
+  static final String META_CODEC_NAME = "lucene94HnswVectorsFormatMeta"; // header name of the .vem file
+  static final String VECTOR_DATA_CODEC_NAME = "lucene94HnswVectorsFormatData"; // header name of the .vec file
+  static final String VECTOR_INDEX_CODEC_NAME = "lucene94HnswVectorsFormatIndex"; // header name of the .vex file
+  static final String META_EXTENSION = "vem";
+  static final String VECTOR_DATA_EXTENSION = "vec";
+  static final String VECTOR_INDEX_EXTENSION = "vex";
+
+  static final int VERSION_START = 0;
+  static final int VERSION_CURRENT = VERSION_START; // only one format version exists so far
+
+  /** Default number of maximum connections per node */
+  public static final int DEFAULT_MAX_CONN = 16;
+  /**
+   * Default size of the queue maintained while searching during graph construction.
+   */
+  public static final int DEFAULT_BEAM_WIDTH = 100;
+
+  static final int DIRECT_MONOTONIC_BLOCK_SHIFT = 16; // NOTE(review): presumably the block shift for the ordToDoc DirectMonotonicWriter - confirm in the writer
+
+  /**
+   * Controls how many of the nearest neighbor candidates are connected to the new node. Defaults to
+   * {@link Lucene94HnswVectorsFormat#DEFAULT_MAX_CONN}. See {@link HnswGraph} for more details.
+   */
+  private final int maxConn;
+
+  /**
+   * The number of candidate neighbors to track while searching the graph for each newly inserted
+   * node. Defaults to {@link Lucene94HnswVectorsFormat#DEFAULT_BEAM_WIDTH}. See {@link
+   * HnswGraph} for details.
+   */
+  private final int beamWidth;
+
+  /**
+   * Constructs a format using default graph construction parameters ({@link #DEFAULT_MAX_CONN}
+   * and {@link #DEFAULT_BEAM_WIDTH}).
+   */
+  public Lucene94HnswVectorsFormat() {
+    this(DEFAULT_MAX_CONN, DEFAULT_BEAM_WIDTH);
+  }
+
+  /**
+   * Constructs a format using the given graph construction parameters. The values are stored as
+   * given; this constructor performs no range validation.
+   *
+   * @param maxConn the maximum number of connections to a node in the HNSW graph
+   * @param beamWidth the size of the queue maintained during graph construction.
+   */
+  public Lucene94HnswVectorsFormat(int maxConn, int beamWidth) {
+    super("Lucene94HnswVectorsFormat");
+    this.maxConn = maxConn;
+    this.beamWidth = beamWidth;
+  }
+
+  @Override
+  public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException {
+    return new Lucene94HnswVectorsWriter(state, maxConn, beamWidth); // graph parameters only matter at write time
+  }
+
+  @Override
+  public KnnVectorsReader fieldsReader(SegmentReadState state) throws IOException {
+    return new Lucene94HnswVectorsReader(state); // the reader takes no graph parameters from this format
+  }
+
+  @Override
+  public String toString() {
+    return "Lucene94HnswVectorsFormat(name=Lucene94HnswVectorsFormat, maxConn="
+        + maxConn
+        + ", beamWidth="
+        + beamWidth
+        + ")";
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94HnswVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94HnswVectorsReader.java
new file mode 100644
index 00000000000..c28f1411fd4
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94HnswVectorsReader.java
@@ -0,0 +1,463 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.lucene.codecs.lucene94; + +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; + +import java.io.IOException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.index.VectorValues; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TotalHits; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.hnsw.HnswGraph; +import org.apache.lucene.util.hnsw.HnswGraphSearcher; +import org.apache.lucene.util.hnsw.NeighborQueue; +import org.apache.lucene.util.packed.DirectMonotonicReader; + +/** + * Reads vectors from the index segments along with index data structures supporting KNN search. 
+ * + * @lucene.experimental + */ +public final class Lucene94HnswVectorsReader extends KnnVectorsReader { + + private final FieldInfos fieldInfos; + private final Map fields = new HashMap<>(); + private final IndexInput vectorData; + private final IndexInput vectorIndex; + + Lucene94HnswVectorsReader(SegmentReadState state) throws IOException { + this.fieldInfos = state.fieldInfos; + int versionMeta = readMetadata(state); + boolean success = false; + try { + vectorData = + openDataInput( + state, + versionMeta, + Lucene94HnswVectorsFormat.VECTOR_DATA_EXTENSION, + Lucene94HnswVectorsFormat.VECTOR_DATA_CODEC_NAME); + vectorIndex = + openDataInput( + state, + versionMeta, + Lucene94HnswVectorsFormat.VECTOR_INDEX_EXTENSION, + Lucene94HnswVectorsFormat.VECTOR_INDEX_CODEC_NAME); + success = true; + } finally { + if (success == false) { + IOUtils.closeWhileHandlingException(this); + } + } + } + + private int readMetadata(SegmentReadState state) throws IOException { + String metaFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, Lucene94HnswVectorsFormat.META_EXTENSION); + int versionMeta = -1; + try (ChecksumIndexInput meta = state.directory.openChecksumInput(metaFileName, state.context)) { + Throwable priorE = null; + try { + versionMeta = + CodecUtil.checkIndexHeader( + meta, + Lucene94HnswVectorsFormat.META_CODEC_NAME, + Lucene94HnswVectorsFormat.VERSION_START, + Lucene94HnswVectorsFormat.VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + readFields(meta, state.fieldInfos); + } catch (Throwable exception) { + priorE = exception; + } finally { + CodecUtil.checkFooter(meta, priorE); + } + } + return versionMeta; + } + + private static IndexInput openDataInput( + SegmentReadState state, int versionMeta, String fileExtension, String codecName) + throws IOException { + String fileName = + IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, fileExtension); + IndexInput in = 
state.directory.openInput(fileName, state.context); + boolean success = false; + try { + int versionVectorData = + CodecUtil.checkIndexHeader( + in, + codecName, + Lucene94HnswVectorsFormat.VERSION_START, + Lucene94HnswVectorsFormat.VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + if (versionMeta != versionVectorData) { + throw new CorruptIndexException( + "Format versions mismatch: meta=" + + versionMeta + + ", " + + codecName + + "=" + + versionVectorData, + in); + } + CodecUtil.retrieveChecksum(in); + success = true; + return in; + } finally { + if (success == false) { + IOUtils.closeWhileHandlingException(in); + } + } + } + + private void readFields(ChecksumIndexInput meta, FieldInfos infos) throws IOException { + for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) { + FieldInfo info = infos.fieldInfo(fieldNumber); + if (info == null) { + throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta); + } + FieldEntry fieldEntry = readField(meta); + validateFieldEntry(info, fieldEntry); + fields.put(info.name, fieldEntry); + } + } + + private void validateFieldEntry(FieldInfo info, FieldEntry fieldEntry) { + int dimension = info.getVectorDimension(); + if (dimension != fieldEntry.dimension) { + throw new IllegalStateException( + "Inconsistent vector dimension for field=\"" + + info.name + + "\"; " + + dimension + + " != " + + fieldEntry.dimension); + } + + long numBytes = (long) fieldEntry.size() * dimension * Float.BYTES; + if (numBytes != fieldEntry.vectorDataLength) { + throw new IllegalStateException( + "Vector data length " + + fieldEntry.vectorDataLength + + " not matching size=" + + fieldEntry.size() + + " * dim=" + + dimension + + " * 4 = " + + numBytes); + } + } + + private VectorSimilarityFunction readSimilarityFunction(DataInput input) throws IOException { + int similarityFunctionId = input.readInt(); + if (similarityFunctionId < 0 + || similarityFunctionId >= 
VectorSimilarityFunction.values().length) { + throw new CorruptIndexException( + "Invalid similarity function id: " + similarityFunctionId, input); + } + return VectorSimilarityFunction.values()[similarityFunctionId]; + } + + private FieldEntry readField(IndexInput input) throws IOException { + VectorSimilarityFunction similarityFunction = readSimilarityFunction(input); + return new FieldEntry(input, similarityFunction); + } + + @Override + public long ramBytesUsed() { + long totalBytes = RamUsageEstimator.shallowSizeOfInstance(Lucene94HnswVectorsFormat.class); + totalBytes += + RamUsageEstimator.sizeOfMap( + fields, RamUsageEstimator.shallowSizeOfInstance(FieldEntry.class)); + return totalBytes; + } + + @Override + public void checkIntegrity() throws IOException { + CodecUtil.checksumEntireFile(vectorData); + CodecUtil.checksumEntireFile(vectorIndex); + } + + @Override + public VectorValues getVectorValues(String field) throws IOException { + FieldEntry fieldEntry = fields.get(field); + return OffHeapVectorValues.load(fieldEntry, vectorData); + } + + @Override + public TopDocs search(String field, float[] target, int k, Bits acceptDocs, int visitedLimit) + throws IOException { + FieldEntry fieldEntry = fields.get(field); + + if (fieldEntry.size() == 0) { + return new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[0]); + } + + // bound k by total number of vectors to prevent oversizing data structures + k = Math.min(k, fieldEntry.size()); + OffHeapVectorValues vectorValues = OffHeapVectorValues.load(fieldEntry, vectorData); + + NeighborQueue results = + HnswGraphSearcher.search( + target, + k, + vectorValues, + fieldEntry.similarityFunction, + getGraph(fieldEntry), + vectorValues.getAcceptOrds(acceptDocs), + visitedLimit); + + int i = 0; + ScoreDoc[] scoreDocs = new ScoreDoc[Math.min(results.size(), k)]; + while (results.size() > 0) { + int node = results.topNode(); + float score = results.topScore(); + results.pop(); + 
scoreDocs[scoreDocs.length - ++i] = new ScoreDoc(vectorValues.ordToDoc(node), score); + } + + TotalHits.Relation relation = + results.incomplete() + ? TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO + : TotalHits.Relation.EQUAL_TO; + return new TopDocs(new TotalHits(results.visitedCount(), relation), scoreDocs); + } + + /** Get knn graph values; used for testing */ + public HnswGraph getGraph(String field) throws IOException { + FieldInfo info = fieldInfos.fieldInfo(field); + if (info == null) { + throw new IllegalArgumentException("No such field '" + field + "'"); + } + FieldEntry entry = fields.get(field); + if (entry != null && entry.vectorIndexLength > 0) { + return getGraph(entry); + } else { + return HnswGraph.EMPTY; + } + } + + private HnswGraph getGraph(FieldEntry entry) throws IOException { + IndexInput bytesSlice = + vectorIndex.slice("graph-data", entry.vectorIndexOffset, entry.vectorIndexLength); + return new OffHeapHnswGraph(entry, bytesSlice); + } + + @Override + public void close() throws IOException { + IOUtils.close(vectorData, vectorIndex); + } + + static class FieldEntry { + + final VectorSimilarityFunction similarityFunction; + final long vectorDataOffset; + final long vectorDataLength; + final long vectorIndexOffset; + final long vectorIndexLength; + final int M; + final int numLevels; + final int dimension; + final int size; + final int[][] nodesByLevel; + // for each level the start offsets in vectorIndex file from where to read neighbours + final long[] graphOffsetsByLevel; + + // the following four variables used to read docIds encoded by IndexDISI + // special values of docsWithFieldOffset are -1 and -2 + // -1 : dense + // -2 : empty + // other: sparse + final long docsWithFieldOffset; + final long docsWithFieldLength; + final short jumpTableEntryCount; + final byte denseRankPower; + + // the following four variables used to read ordToDoc encoded by DirectMonotonicWriter + // note that only spare case needs to store ordToDoc + final long 
addressesOffset; + final int blockShift; + final DirectMonotonicReader.Meta meta; + final long addressesLength; + + FieldEntry(IndexInput input, VectorSimilarityFunction similarityFunction) throws IOException { + this.similarityFunction = similarityFunction; + vectorDataOffset = input.readVLong(); + vectorDataLength = input.readVLong(); + vectorIndexOffset = input.readVLong(); + vectorIndexLength = input.readVLong(); + dimension = input.readInt(); + size = input.readInt(); + + docsWithFieldOffset = input.readLong(); + docsWithFieldLength = input.readLong(); + jumpTableEntryCount = input.readShort(); + denseRankPower = input.readByte(); + + // dense or empty + if (docsWithFieldOffset == -1 || docsWithFieldOffset == -2) { + addressesOffset = 0; + blockShift = 0; + meta = null; + addressesLength = 0; + } else { + // sparse + addressesOffset = input.readLong(); + blockShift = input.readVInt(); + meta = DirectMonotonicReader.loadMeta(input, size, blockShift); + addressesLength = input.readLong(); + } + + // read nodes by level + M = input.readInt(); + numLevels = input.readInt(); + nodesByLevel = new int[numLevels][]; + for (int level = 0; level < numLevels; level++) { + int numNodesOnLevel = input.readInt(); + if (level == 0) { + // we don't store nodes for level 0th, as this level contains all nodes + assert numNodesOnLevel == size; + nodesByLevel[0] = null; + } else { + nodesByLevel[level] = new int[numNodesOnLevel]; + for (int i = 0; i < numNodesOnLevel; i++) { + nodesByLevel[level][i] = input.readInt(); + } + } + } + + // calculate for each level the start offsets in vectorIndex file from where to read + // neighbours + graphOffsetsByLevel = new long[numLevels]; + for (int level = 0; level < numLevels; level++) { + if (level == 0) { + graphOffsetsByLevel[level] = 0; + } else if (level == 1) { + int numNodesOnLevel0 = size; + graphOffsetsByLevel[level] = (1 + (M * 2)) * Integer.BYTES * numNodesOnLevel0; + } else { + int numNodesOnPrevLevel = nodesByLevel[level - 
1].length; + graphOffsetsByLevel[level] = + graphOffsetsByLevel[level - 1] + (1 + M) * Integer.BYTES * numNodesOnPrevLevel; + } + } + } + + int size() { + return size; + } + } + + /** Read the nearest-neighbors graph from the index input */ + private static final class OffHeapHnswGraph extends HnswGraph { + + final IndexInput dataIn; + final int[][] nodesByLevel; + final long[] graphOffsetsByLevel; + final int numLevels; + final int entryNode; + final int size; + final long bytesForConns; + final long bytesForConns0; + + int arcCount; + int arcUpTo; + int arc; + + OffHeapHnswGraph(FieldEntry entry, IndexInput dataIn) { + this.dataIn = dataIn; + this.nodesByLevel = entry.nodesByLevel; + this.numLevels = entry.numLevels; + this.entryNode = numLevels > 1 ? nodesByLevel[numLevels - 1][0] : 0; + this.size = entry.size(); + this.graphOffsetsByLevel = entry.graphOffsetsByLevel; + this.bytesForConns = ((long) entry.M + 1) * Integer.BYTES; + this.bytesForConns0 = ((long) (entry.M * 2) + 1) * Integer.BYTES; + } + + @Override + public void seek(int level, int targetOrd) throws IOException { + int targetIndex = + level == 0 + ? targetOrd + : Arrays.binarySearch(nodesByLevel[level], 0, nodesByLevel[level].length, targetOrd); + assert targetIndex >= 0; + long graphDataOffset = + graphOffsetsByLevel[level] + targetIndex * (level == 0 ? 
bytesForConns0 : bytesForConns); + // unsafe; no bounds checking + dataIn.seek(graphDataOffset); + arcCount = dataIn.readInt(); + arc = -1; + arcUpTo = 0; + } + + @Override + public int size() { + return size; + } + + @Override + public int nextNeighbor() throws IOException { + if (arcUpTo >= arcCount) { + return NO_MORE_DOCS; + } + ++arcUpTo; + arc = dataIn.readInt(); + return arc; + } + + @Override + public int numLevels() throws IOException { + return numLevels; + } + + @Override + public int entryNode() throws IOException { + return entryNode; + } + + @Override + public NodesIterator getNodesOnLevel(int level) { + if (level == 0) { + return new NodesIterator(size()); + } else { + return new NodesIterator(nodesByLevel[level], nodesByLevel[level].length); + } + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94HnswVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94HnswVectorsWriter.java new file mode 100644 index 00000000000..3affa58cd0d --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94HnswVectorsWriter.java @@ -0,0 +1,330 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.codecs.lucene94; + +import static org.apache.lucene.codecs.lucene94.Lucene94HnswVectorsFormat.DIRECT_MONOTONIC_BLOCK_SHIFT; +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; + +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.codecs.KnnVectorsWriter; +import org.apache.lucene.codecs.lucene90.IndexedDISI; +import org.apache.lucene.index.DocsWithFieldSet; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.RandomAccessVectorValuesProducer; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.index.VectorValues; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.hnsw.HnswGraph.NodesIterator; +import org.apache.lucene.util.hnsw.HnswGraphBuilder; +import org.apache.lucene.util.hnsw.NeighborArray; +import org.apache.lucene.util.hnsw.OnHeapHnswGraph; +import org.apache.lucene.util.packed.DirectMonotonicWriter; + +/** + * Writes vector values and knn graphs to index segments. 
+ * + * @lucene.experimental + */ +public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter { + + private final SegmentWriteState segmentWriteState; + private final IndexOutput meta, vectorData, vectorIndex; + private final int maxDoc; + + private final int M; + private final int beamWidth; + private boolean finished; + + Lucene94HnswVectorsWriter(SegmentWriteState state, int M, int beamWidth) throws IOException { + this.M = M; + this.beamWidth = beamWidth; + + assert state.fieldInfos.hasVectorValues(); + segmentWriteState = state; + + String metaFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, Lucene94HnswVectorsFormat.META_EXTENSION); + + String vectorDataFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, + state.segmentSuffix, + Lucene94HnswVectorsFormat.VECTOR_DATA_EXTENSION); + + String indexDataFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, + state.segmentSuffix, + Lucene94HnswVectorsFormat.VECTOR_INDEX_EXTENSION); + + boolean success = false; + try { + meta = state.directory.createOutput(metaFileName, state.context); + vectorData = state.directory.createOutput(vectorDataFileName, state.context); + vectorIndex = state.directory.createOutput(indexDataFileName, state.context); + + CodecUtil.writeIndexHeader( + meta, + Lucene94HnswVectorsFormat.META_CODEC_NAME, + Lucene94HnswVectorsFormat.VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + CodecUtil.writeIndexHeader( + vectorData, + Lucene94HnswVectorsFormat.VECTOR_DATA_CODEC_NAME, + Lucene94HnswVectorsFormat.VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + CodecUtil.writeIndexHeader( + vectorIndex, + Lucene94HnswVectorsFormat.VECTOR_INDEX_CODEC_NAME, + Lucene94HnswVectorsFormat.VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + maxDoc = state.segmentInfo.maxDoc(); + success = true; + } finally { + if (success == false) { + 
IOUtils.closeWhileHandlingException(this); + } + } + } + + @Override + public void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader) + throws IOException { + long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES); + VectorValues vectors = knnVectorsReader.getVectorValues(fieldInfo.name); + + IndexOutput tempVectorData = + segmentWriteState.directory.createTempOutput( + vectorData.getName(), "temp", segmentWriteState.context); + IndexInput vectorDataInput = null; + boolean success = false; + try { + // write the vector data to a temporary file + DocsWithFieldSet docsWithField = writeVectorData(tempVectorData, vectors); + CodecUtil.writeFooter(tempVectorData); + IOUtils.close(tempVectorData); + + // copy the temporary file vectors to the actual data file + vectorDataInput = + segmentWriteState.directory.openInput( + tempVectorData.getName(), segmentWriteState.context); + vectorData.copyBytes(vectorDataInput, vectorDataInput.length() - CodecUtil.footerLength()); + CodecUtil.retrieveChecksum(vectorDataInput); + long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset; + + long vectorIndexOffset = vectorIndex.getFilePointer(); + // build the graph using the temporary vector data + // we use OffHeapVectorValues.DenseOffHeapVectorValues because the graph construction + // doesn't need to know docIds + // TODO: separate random access vector values from DocIdSetIterator? + OffHeapVectorValues offHeapVectors = + new OffHeapVectorValues.DenseOffHeapVectorValues( + vectors.dimension(), docsWithField.cardinality(), vectorDataInput); + OnHeapHnswGraph graph = + offHeapVectors.size() == 0 + ? 
null + : writeGraph(offHeapVectors, fieldInfo.getVectorSimilarityFunction()); + long vectorIndexLength = vectorIndex.getFilePointer() - vectorIndexOffset; + writeMeta( + fieldInfo, + vectorDataOffset, + vectorDataLength, + vectorIndexOffset, + vectorIndexLength, + docsWithField, + graph); + success = true; + } finally { + IOUtils.close(vectorDataInput); + if (success) { + segmentWriteState.directory.deleteFile(tempVectorData.getName()); + } else { + IOUtils.closeWhileHandlingException(tempVectorData); + IOUtils.deleteFilesIgnoringExceptions( + segmentWriteState.directory, tempVectorData.getName()); + } + } + } + + /** + * Writes the vector values to the output and returns a set of documents that contains vectors. + */ + private static DocsWithFieldSet writeVectorData(IndexOutput output, VectorValues vectors) + throws IOException { + DocsWithFieldSet docsWithField = new DocsWithFieldSet(); + for (int docV = vectors.nextDoc(); docV != NO_MORE_DOCS; docV = vectors.nextDoc()) { + // write vector + BytesRef binaryValue = vectors.binaryValue(); + assert binaryValue.length == vectors.dimension() * Float.BYTES; + output.writeBytes(binaryValue.bytes, binaryValue.offset, binaryValue.length); + docsWithField.add(docV); + } + return docsWithField; + } + + private void writeMeta( + FieldInfo field, + long vectorDataOffset, + long vectorDataLength, + long vectorIndexOffset, + long vectorIndexLength, + DocsWithFieldSet docsWithField, + OnHeapHnswGraph graph) + throws IOException { + meta.writeInt(field.number); + meta.writeInt(field.getVectorSimilarityFunction().ordinal()); + meta.writeVLong(vectorDataOffset); + meta.writeVLong(vectorDataLength); + meta.writeVLong(vectorIndexOffset); + meta.writeVLong(vectorIndexLength); + meta.writeInt(field.getVectorDimension()); + + // write docIDs + int count = docsWithField.cardinality(); + meta.writeInt(count); + if (count == 0) { + meta.writeLong(-2); // docsWithFieldOffset + meta.writeLong(0L); // docsWithFieldLength + 
meta.writeShort((short) -1); // jumpTableEntryCount + meta.writeByte((byte) -1); // denseRankPower + } else if (count == maxDoc) { + meta.writeLong(-1); // docsWithFieldOffset + meta.writeLong(0L); // docsWithFieldLength + meta.writeShort((short) -1); // jumpTableEntryCount + meta.writeByte((byte) -1); // denseRankPower + } else { + long offset = vectorData.getFilePointer(); + meta.writeLong(offset); // docsWithFieldOffset + final short jumpTableEntryCount = + IndexedDISI.writeBitSet( + docsWithField.iterator(), vectorData, IndexedDISI.DEFAULT_DENSE_RANK_POWER); + meta.writeLong(vectorData.getFilePointer() - offset); // docsWithFieldLength + meta.writeShort(jumpTableEntryCount); + meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER); + + // write ordToDoc mapping + long start = vectorData.getFilePointer(); + meta.writeLong(start); + meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT); + // dense case and empty case do not need to store ordToDoc mapping + final DirectMonotonicWriter ordToDocWriter = + DirectMonotonicWriter.getInstance(meta, vectorData, count, DIRECT_MONOTONIC_BLOCK_SHIFT); + DocIdSetIterator iterator = docsWithField.iterator(); + for (int doc = iterator.nextDoc(); + doc != DocIdSetIterator.NO_MORE_DOCS; + doc = iterator.nextDoc()) { + ordToDocWriter.add(doc); + } + ordToDocWriter.finish(); + meta.writeLong(vectorData.getFilePointer() - start); + } + + meta.writeInt(M); + // write graph nodes on each level + if (graph == null) { + meta.writeInt(0); + } else { + meta.writeInt(graph.numLevels()); + for (int level = 0; level < graph.numLevels(); level++) { + NodesIterator nodesOnLevel = graph.getNodesOnLevel(level); + meta.writeInt(nodesOnLevel.size()); // number of nodes on a level + if (level > 0) { + while (nodesOnLevel.hasNext()) { + int node = nodesOnLevel.nextInt(); + meta.writeInt(node); // list of nodes on a level + } + } + } + } + } + + private OnHeapHnswGraph writeGraph( + RandomAccessVectorValuesProducer vectorValues, VectorSimilarityFunction 
similarityFunction) + throws IOException { + + // build graph + HnswGraphBuilder hnswGraphBuilder = + new HnswGraphBuilder( + vectorValues, similarityFunction, M, beamWidth, HnswGraphBuilder.randSeed); + hnswGraphBuilder.setInfoStream(segmentWriteState.infoStream); + OnHeapHnswGraph graph = hnswGraphBuilder.build(vectorValues.randomAccess()); + + // write vectors' neighbours on each level into the vectorIndex file + int countOnLevel0 = graph.size(); + for (int level = 0; level < graph.numLevels(); level++) { + int maxConnOnLevel = level == 0 ? (M * 2) : M; + NodesIterator nodesOnLevel = graph.getNodesOnLevel(level); + while (nodesOnLevel.hasNext()) { + int node = nodesOnLevel.nextInt(); + NeighborArray neighbors = graph.getNeighbors(level, node); + int size = neighbors.size(); + vectorIndex.writeInt(size); + // Destructively modify; it's ok we are discarding it after this + int[] nnodes = neighbors.node(); + Arrays.sort(nnodes, 0, size); + for (int i = 0; i < size; i++) { + int nnode = nnodes[i]; + assert nnode < countOnLevel0 : "node too large: " + nnode + ">=" + countOnLevel0; + vectorIndex.writeInt(nnode); + } + // if number of connections < maxConn, add bogus values up to maxConn to have predictable + // offsets + for (int i = size; i < maxConnOnLevel; i++) { + vectorIndex.writeInt(0); + } + } + } + return graph; + } + + @Override + public void finish() throws IOException { + if (finished) { + throw new IllegalStateException("already finished"); + } + finished = true; + + if (meta != null) { + // write end of fields marker + meta.writeInt(-1); + CodecUtil.writeFooter(meta); + } + if (vectorData != null) { + CodecUtil.writeFooter(vectorData); + CodecUtil.writeFooter(vectorIndex); + } + } + + @Override + public void close() throws IOException { + IOUtils.close(meta, vectorData, vectorIndex); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/OffHeapVectorValues.java 
b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/OffHeapVectorValues.java new file mode 100644 index 00000000000..61bad110f33 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/OffHeapVectorValues.java @@ -0,0 +1,321 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.codecs.lucene94; + +import java.io.IOException; +import java.nio.ByteBuffer; +import org.apache.lucene.codecs.lucene90.IndexedDISI; +import org.apache.lucene.index.RandomAccessVectorValues; +import org.apache.lucene.index.RandomAccessVectorValuesProducer; +import org.apache.lucene.index.VectorValues; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.packed.DirectMonotonicReader; + +/** Read the vector values from the index input. This supports both iterated and random access. 
*/ +abstract class OffHeapVectorValues extends VectorValues + implements RandomAccessVectorValues, RandomAccessVectorValuesProducer { + + protected final int dimension; + protected final int size; + protected final IndexInput slice; + protected final BytesRef binaryValue; + protected final ByteBuffer byteBuffer; + protected final int byteSize; + protected final float[] value; + + OffHeapVectorValues(int dimension, int size, IndexInput slice) { + this.dimension = dimension; + this.size = size; + this.slice = slice; + byteSize = Float.BYTES * dimension; + byteBuffer = ByteBuffer.allocate(byteSize); + value = new float[dimension]; + binaryValue = new BytesRef(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize); + } + + @Override + public int dimension() { + return dimension; + } + + @Override + public int size() { + return size; + } + + @Override + public long cost() { + return size; + } + + @Override + public float[] vectorValue(int targetOrd) throws IOException { + slice.seek((long) targetOrd * byteSize); + slice.readFloats(value, 0, value.length); + return value; + } + + @Override + public BytesRef binaryValue(int targetOrd) throws IOException { + readValue(targetOrd); + return binaryValue; + } + + private void readValue(int targetOrd) throws IOException { + slice.seek((long) targetOrd * byteSize); + slice.readBytes(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize); + } + + public abstract int ordToDoc(int ord); + + static OffHeapVectorValues load( + Lucene94HnswVectorsReader.FieldEntry fieldEntry, IndexInput vectorData) throws IOException { + if (fieldEntry.docsWithFieldOffset == -2) { + return new EmptyOffHeapVectorValues(fieldEntry.dimension); + } + IndexInput bytesSlice = + vectorData.slice("vector-data", fieldEntry.vectorDataOffset, fieldEntry.vectorDataLength); + if (fieldEntry.docsWithFieldOffset == -1) { + return new DenseOffHeapVectorValues(fieldEntry.dimension, fieldEntry.size, bytesSlice); + } else { + return new 
SparseOffHeapVectorValues(fieldEntry, vectorData, bytesSlice); + } + } + + abstract Bits getAcceptOrds(Bits acceptDocs); + + static class DenseOffHeapVectorValues extends OffHeapVectorValues { + + private int doc = -1; + + public DenseOffHeapVectorValues(int dimension, int size, IndexInput slice) { + super(dimension, size, slice); + } + + @Override + public float[] vectorValue() throws IOException { + slice.seek((long) doc * byteSize); + slice.readFloats(value, 0, value.length); + return value; + } + + @Override + public BytesRef binaryValue() throws IOException { + slice.seek((long) doc * byteSize); + slice.readBytes(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize, false); + return binaryValue; + } + + @Override + public int docID() { + return doc; + } + + @Override + public int nextDoc() throws IOException { + return advance(doc + 1); + } + + @Override + public int advance(int target) throws IOException { + assert docID() < target; + if (target >= size) { + return doc = NO_MORE_DOCS; + } + return doc = target; + } + + @Override + public RandomAccessVectorValues randomAccess() throws IOException { + return new DenseOffHeapVectorValues(dimension, size, slice.clone()); + } + + @Override + public int ordToDoc(int ord) { + return ord; + } + + @Override + Bits getAcceptOrds(Bits acceptDocs) { + return acceptDocs; + } + } + + private static class SparseOffHeapVectorValues extends OffHeapVectorValues { + private final DirectMonotonicReader ordToDoc; + private final IndexedDISI disi; + // dataIn is used to init a new IndexedDISI for #randomAccess() + private final IndexInput dataIn; + private final Lucene94HnswVectorsReader.FieldEntry fieldEntry; + + public SparseOffHeapVectorValues( + Lucene94HnswVectorsReader.FieldEntry fieldEntry, IndexInput dataIn, IndexInput slice) + throws IOException { + + super(fieldEntry.dimension, fieldEntry.size, slice); + this.fieldEntry = fieldEntry; + final RandomAccessInput addressesData = + 
dataIn.randomAccessSlice(fieldEntry.addressesOffset, fieldEntry.addressesLength); + this.dataIn = dataIn; + this.ordToDoc = DirectMonotonicReader.getInstance(fieldEntry.meta, addressesData); + this.disi = + new IndexedDISI( + dataIn, + fieldEntry.docsWithFieldOffset, + fieldEntry.docsWithFieldLength, + fieldEntry.jumpTableEntryCount, + fieldEntry.denseRankPower, + fieldEntry.size); + } + + @Override + public float[] vectorValue() throws IOException { + slice.seek((long) (disi.index()) * byteSize); + slice.readFloats(value, 0, value.length); + return value; + } + + @Override + public BytesRef binaryValue() throws IOException { + slice.seek((long) (disi.index()) * byteSize); + slice.readBytes(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize, false); + return binaryValue; + } + + @Override + public int docID() { + return disi.docID(); + } + + @Override + public int nextDoc() throws IOException { + return disi.nextDoc(); + } + + @Override + public int advance(int target) throws IOException { + assert docID() < target; + return disi.advance(target); + } + + @Override + public RandomAccessVectorValues randomAccess() throws IOException { + return new SparseOffHeapVectorValues(fieldEntry, dataIn, slice.clone()); + } + + @Override + public int ordToDoc(int ord) { + return (int) ordToDoc.get(ord); + } + + @Override + Bits getAcceptOrds(Bits acceptDocs) { + if (acceptDocs == null) { + return null; + } + return new Bits() { + @Override + public boolean get(int index) { + return acceptDocs.get(ordToDoc(index)); + } + + @Override + public int length() { + return size; + } + }; + } + } + + private static class EmptyOffHeapVectorValues extends OffHeapVectorValues { + + public EmptyOffHeapVectorValues(int dimension) { + super(dimension, 0, null); + } + + private int doc = -1; + + @Override + public int dimension() { + return super.dimension(); + } + + @Override + public int size() { + return 0; + } + + @Override + public float[] vectorValue() throws IOException { + throw new 
UnsupportedOperationException(); + } + + @Override + public BytesRef binaryValue() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int docID() { + return doc; + } + + @Override + public int nextDoc() throws IOException { + return advance(doc + 1); + } + + @Override + public int advance(int target) throws IOException { + return doc = NO_MORE_DOCS; + } + + @Override + public long cost() { + return 0; + } + + @Override + public RandomAccessVectorValues randomAccess() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public float[] vectorValue(int targetOrd) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public BytesRef binaryValue(int targetOrd) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int ordToDoc(int ord) { + throw new UnsupportedOperationException(); + } + + @Override + Bits getAcceptOrds(Bits acceptDocs) { + return null; + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene92/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/package-info.java similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene92/package-info.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene94/package-info.java index f87fd0f2995..3e635ec40e4 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene92/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/package-info.java @@ -16,7 +16,7 @@ */ /** - * Lucene 9.2 file format. + * Lucene 9.4 file format. * *

Apache Lucene - Index File Formats

* @@ -180,7 +180,7 @@ * of files, recording dimensionally indexed fields, to enable fast numeric range filtering * and large numeric values like BigInteger and BigDecimal (1D) and geographic shape * intersection (2D, 3D). - *
  • {@link org.apache.lucene.codecs.lucene92.Lucene92HnswVectorsFormat Vector values}. The + *
  • {@link org.apache.lucene.codecs.lucene94.Lucene94HnswVectorsFormat Vector values}. The * vector format stores numeric vectors in a format optimized for random access and * computation, supporting high-dimensional nearest-neighbor search. * @@ -310,7 +310,7 @@ * Holds indexed points * * - * {@link org.apache.lucene.codecs.lucene92.Lucene92HnswVectorsFormat Vector values} + * {@link org.apache.lucene.codecs.lucene94.Lucene94HnswVectorsFormat Vector values} * .vec, .vem * Holds indexed vectors; .vec files contain the raw vector data, and * .vem the vector metadata @@ -419,4 +419,4 @@ * UInt64 values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt * VInt} values which have no limit. */ -package org.apache.lucene.codecs.lucene92; +package org.apache.lucene.codecs.lucene94; diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index cd3ccaa8e17..11b662ce3a6 100644 --- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -org.apache.lucene.codecs.lucene92.Lucene92Codec +org.apache.lucene.codecs.lucene94.Lucene94Codec diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat index 35bd0c0824a..5327e21d5fe 100644 --- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat +++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-org.apache.lucene.codecs.lucene92.Lucene92HnswVectorsFormat +org.apache.lucene.codecs.lucene94.Lucene94HnswVectorsFormat diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java index a8a521054af..23f4c3d4343 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java @@ -18,8 +18,8 @@ package org.apache.lucene.codecs.lucene90; import com.carrotsearch.randomizedtesting.generators.RandomPicks; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.lucene92.Lucene92Codec; -import org.apache.lucene.codecs.lucene92.Lucene92Codec.Mode; +import org.apache.lucene.codecs.lucene94.Lucene94Codec; +import org.apache.lucene.codecs.lucene94.Lucene94Codec.Mode; import org.apache.lucene.document.Document; import org.apache.lucene.document.StoredField; import org.apache.lucene.index.DirectoryReader; @@ -31,7 +31,7 @@ import org.apache.lucene.tests.index.BaseStoredFieldsFormatTestCase; public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase { @Override protected Codec getCodec() { - return new Lucene92Codec(Mode.BEST_COMPRESSION); + return new Lucene94Codec(Mode.BEST_COMPRESSION); } /** @@ -41,7 +41,7 @@ public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFie Directory dir = newDirectory(); for (int i = 0; i < 10; i++) { IndexWriterConfig iwc = newIndexWriterConfig(); - iwc.setCodec(new Lucene92Codec(RandomPicks.randomFrom(random(), Mode.values()))); + iwc.setCodec(new Lucene94Codec(RandomPicks.randomFrom(random(), Mode.values()))); IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig()); Document doc = new Document(); doc.add(new StoredField("field1", 
"value1")); @@ -70,7 +70,7 @@ public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFie expectThrows( NullPointerException.class, () -> { - new Lucene92Codec(null); + new Lucene94Codec(null); }); expectThrows( diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene94/TestLucene94HnswVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene94/TestLucene94HnswVectorsFormat.java new file mode 100644 index 00000000000..3590d3c168d --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene94/TestLucene94HnswVectorsFormat.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene94; + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; +import org.apache.lucene.tests.util.TestUtil; + +public class TestLucene94HnswVectorsFormat extends BaseKnnVectorsFormatTestCase { + @Override + protected Codec getCodec() { + return TestUtil.getDefaultCodec(); + } + + public void testToString() { + Lucene94Codec customCodec = + new Lucene94Codec() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return new Lucene94HnswVectorsFormat(10, 20); + } + }; + String expectedString = + "Lucene94HnswVectorsFormat(name=Lucene94HnswVectorsFormat, maxConn=10, beamWidth=20)"; + assertEquals(expectedString, customCodec.getKnnVectorsFormatForField("bogus_field").toString()); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java b/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java index 3a54c407070..235cd9fe69d 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java @@ -30,9 +30,9 @@ import java.util.Set; import java.util.concurrent.CountDownLatch; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.KnnVectorsFormat; -import org.apache.lucene.codecs.lucene92.Lucene92Codec; -import org.apache.lucene.codecs.lucene92.Lucene92HnswVectorsFormat; -import org.apache.lucene.codecs.lucene92.Lucene92HnswVectorsReader; +import org.apache.lucene.codecs.lucene94.Lucene94Codec; +import org.apache.lucene.codecs.lucene94.Lucene94HnswVectorsFormat; +import org.apache.lucene.codecs.lucene94.Lucene94HnswVectorsReader; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -64,7 +64,7 @@ public class TestKnnGraph extends LuceneTestCase { private static final 
String KNN_GRAPH_FIELD = "vector"; - private static int M = Lucene92HnswVectorsFormat.DEFAULT_MAX_CONN; + private static int M = Lucene94HnswVectorsFormat.DEFAULT_MAX_CONN; private Codec codec; private VectorSimilarityFunction similarityFunction; @@ -77,10 +77,10 @@ public class TestKnnGraph extends LuceneTestCase { } codec = - new Lucene92Codec() { + new Lucene94Codec() { @Override public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new Lucene92HnswVectorsFormat(M, Lucene92HnswVectorsFormat.DEFAULT_BEAM_WIDTH); + return new Lucene94HnswVectorsFormat(M, Lucene94HnswVectorsFormat.DEFAULT_BEAM_WIDTH); } }; @@ -90,7 +90,7 @@ public class TestKnnGraph extends LuceneTestCase { @After public void cleanup() { - M = Lucene92HnswVectorsFormat.DEFAULT_MAX_CONN; + M = Lucene94HnswVectorsFormat.DEFAULT_MAX_CONN; } /** Basic test of creating documents in a graph */ @@ -237,8 +237,8 @@ public class TestKnnGraph extends LuceneTestCase { PerFieldKnnVectorsFormat.FieldsReader perFieldReader = (PerFieldKnnVectorsFormat.FieldsReader) ((CodecReader) getOnlyLeafReader(reader)).getVectorReader(); - Lucene92HnswVectorsReader vectorReader = - (Lucene92HnswVectorsReader) perFieldReader.getFieldReader(KNN_GRAPH_FIELD); + Lucene94HnswVectorsReader vectorReader = + (Lucene94HnswVectorsReader) perFieldReader.getFieldReader(KNN_GRAPH_FIELD); graph = copyGraph(vectorReader.getGraph(KNN_GRAPH_FIELD)); } } @@ -436,8 +436,8 @@ public class TestKnnGraph extends LuceneTestCase { if (perFieldReader == null) { continue; } - Lucene92HnswVectorsReader vectorReader = - (Lucene92HnswVectorsReader) perFieldReader.getFieldReader(vectorField); + Lucene94HnswVectorsReader vectorReader = + (Lucene94HnswVectorsReader) perFieldReader.getFieldReader(vectorField); if (vectorReader == null) { continue; } diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/KnnGraphTester.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/KnnGraphTester.java index 8ec2140cd35..1e9fdbbf126 100644 
--- a/lucene/core/src/test/org/apache/lucene/util/hnsw/KnnGraphTester.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/KnnGraphTester.java @@ -39,9 +39,9 @@ import java.util.Objects; import java.util.Set; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; -import org.apache.lucene.codecs.lucene92.Lucene92Codec; -import org.apache.lucene.codecs.lucene92.Lucene92HnswVectorsFormat; -import org.apache.lucene.codecs.lucene92.Lucene92HnswVectorsReader; +import org.apache.lucene.codecs.lucene94.Lucene94Codec; +import org.apache.lucene.codecs.lucene94.Lucene94HnswVectorsFormat; +import org.apache.lucene.codecs.lucene94.Lucene94HnswVectorsReader; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldType; @@ -248,7 +248,7 @@ public class KnnGraphTester { KnnVectorsReader vectorsReader = ((PerFieldKnnVectorsFormat.FieldsReader) ((CodecReader) leafReader).getVectorReader()) .getFieldReader(KNN_FIELD); - HnswGraph knnValues = ((Lucene92HnswVectorsReader) vectorsReader).getGraph(KNN_FIELD); + HnswGraph knnValues = ((Lucene94HnswVectorsReader) vectorsReader).getGraph(KNN_FIELD); System.out.printf("Leaf %d has %d documents\n", context.ord, leafReader.maxDoc()); printGraphFanout(knnValues, leafReader.maxDoc()); } @@ -579,10 +579,10 @@ public class KnnGraphTester { private int createIndex(Path docsPath, Path indexPath) throws IOException { IndexWriterConfig iwc = new IndexWriterConfig().setOpenMode(IndexWriterConfig.OpenMode.CREATE); iwc.setCodec( - new Lucene92Codec() { + new Lucene94Codec() { @Override public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new Lucene92HnswVectorsFormat(maxConn, beamWidth); + return new Lucene94HnswVectorsFormat(maxConn, beamWidth); } }); // iwc.setMergePolicy(NoMergePolicy.INSTANCE); diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswGraph.java 
b/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswGraph.java index ed94d7dd82a..2f3d8c07f09 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswGraph.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswGraph.java @@ -25,9 +25,9 @@ import java.util.HashSet; import java.util.Random; import java.util.Set; import org.apache.lucene.codecs.KnnVectorsFormat; -import org.apache.lucene.codecs.lucene92.Lucene92Codec; -import org.apache.lucene.codecs.lucene92.Lucene92HnswVectorsFormat; -import org.apache.lucene.codecs.lucene92.Lucene92HnswVectorsReader; +import org.apache.lucene.codecs.lucene94.Lucene94Codec; +import org.apache.lucene.codecs.lucene94.Lucene94HnswVectorsFormat; +import org.apache.lucene.codecs.lucene94.Lucene94HnswVectorsReader; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.KnnVectorField; @@ -81,10 +81,10 @@ public class TestHnswGraph extends LuceneTestCase { IndexWriterConfig iwc = new IndexWriterConfig() .setCodec( - new Lucene92Codec() { + new Lucene94Codec() { @Override public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new Lucene92HnswVectorsFormat(M, beamWidth); + return new Lucene94HnswVectorsFormat(M, beamWidth); } }); try (IndexWriter iw = new IndexWriter(dir, iwc)) { @@ -111,7 +111,7 @@ public class TestHnswGraph extends LuceneTestCase { assertEquals(indexedDoc, ctx.reader().numDocs()); assertVectorsEqual(v3, values); HnswGraph graphValues = - ((Lucene92HnswVectorsReader) + ((Lucene94HnswVectorsReader) ((PerFieldKnnVectorsFormat.FieldsReader) ((CodecReader) ctx.reader()).getVectorReader()) .getFieldReader("field")) diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java index b57c87d8c0a..77d10d8de0f 100644 --- 
a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java @@ -40,7 +40,7 @@ import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene92.Lucene92Codec; +import org.apache.lucene.codecs.lucene94.Lucene94Codec; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.IntPoint; @@ -959,7 +959,7 @@ public class TestSuggestField extends LuceneTestCase { IndexWriterConfig iwc = newIndexWriterConfig(random(), analyzer); iwc.setMergePolicy(newLogMergePolicy()); Codec filterCodec = - new Lucene92Codec() { + new Lucene94Codec() { CompletionPostingsFormat.FSTLoadMode fstLoadMode = RandomPicks.randomFrom(random(), CompletionPostingsFormat.FSTLoadMode.values()); PostingsFormat postingsFormat = new Completion90PostingsFormat(fstLoadMode); diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestRuleSetupAndRestoreClassEnv.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestRuleSetupAndRestoreClassEnv.java index 8392640558a..a041d1951e1 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestRuleSetupAndRestoreClassEnv.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestRuleSetupAndRestoreClassEnv.java @@ -38,7 +38,7 @@ import java.util.TimeZone; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene92.Lucene92Codec; +import org.apache.lucene.codecs.lucene94.Lucene94Codec; import org.apache.lucene.codecs.simpletext.SimpleTextCodec; import org.apache.lucene.search.similarities.Similarity; import 
org.apache.lucene.tests.codecs.asserting.AssertingCodec; @@ -193,9 +193,9 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule { } else if ("Compressing".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 6 && !shouldAvoidCodec("Compressing"))) { codec = CompressingCodec.randomInstance(random); - } else if ("Lucene92".equals(TEST_CODEC) - || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene92"))) { - codec = new Lucene92Codec(RandomPicks.randomFrom(random, Lucene92Codec.Mode.values())); + } else if ("Lucene94".equals(TEST_CODEC) + || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene94"))) { + codec = new Lucene94Codec(RandomPicks.randomFrom(random, Lucene94Codec.Mode.values())); } else if (!"random".equals(TEST_CODEC)) { codec = Codec.forName(TEST_CODEC); } else if ("random".equals(TEST_POSTINGSFORMAT)) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java index e6722972ac2..a6a7d9c1e4e 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java @@ -55,8 +55,8 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat; import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat; -import org.apache.lucene.codecs.lucene92.Lucene92Codec; -import org.apache.lucene.codecs.lucene92.Lucene92HnswVectorsFormat; +import org.apache.lucene.codecs.lucene94.Lucene94Codec; +import org.apache.lucene.codecs.lucene94.Lucene94HnswVectorsFormat; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; import org.apache.lucene.document.BinaryDocValuesField; @@ -1236,7 
+1236,7 @@ public final class TestUtil { * different than {@link Codec#getDefault()} because that is randomized. */ public static Codec getDefaultCodec() { - return new Lucene92Codec(); + return new Lucene94Codec(); } /** @@ -1322,7 +1322,7 @@ public final class TestUtil { * Lucene. */ public static KnnVectorsFormat getDefaultKnnVectorsFormat() { - return new Lucene92HnswVectorsFormat(); + return new Lucene94HnswVectorsFormat(); } public static boolean anyFilesExceptWriteLock(Directory dir) throws IOException {