diff --git a/lucene/backward-codecs/src/java/module-info.java b/lucene/backward-codecs/src/java/module-info.java
index fbc2cdba98e..4607c21eb7e 100644
--- a/lucene/backward-codecs/src/java/module-info.java
+++ b/lucene/backward-codecs/src/java/module-info.java
@@ -36,6 +36,7 @@ module org.apache.lucene.backward_codecs {
exports org.apache.lucene.backward_codecs.lucene94;
exports org.apache.lucene.backward_codecs.lucene95;
exports org.apache.lucene.backward_codecs.lucene99;
+ exports org.apache.lucene.backward_codecs.lucene912;
exports org.apache.lucene.backward_codecs.packed;
exports org.apache.lucene.backward_codecs.store;
@@ -62,5 +63,6 @@ module org.apache.lucene.backward_codecs {
org.apache.lucene.backward_codecs.lucene92.Lucene92Codec,
org.apache.lucene.backward_codecs.lucene94.Lucene94Codec,
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec,
- org.apache.lucene.backward_codecs.lucene99.Lucene99Codec;
+ org.apache.lucene.backward_codecs.lucene99.Lucene99Codec,
+ org.apache.lucene.backward_codecs.lucene912.Lucene912Codec;
}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/Lucene912Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/Lucene912Codec.java
similarity index 98%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene912/Lucene912Codec.java
rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/Lucene912Codec.java
index cb4ef755a6b..d7b89d31081 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/Lucene912Codec.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/Lucene912Codec.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.codecs.lucene912;
+package org.apache.lucene.backward_codecs.lucene912;
import java.util.Objects;
import org.apache.lucene.codecs.Codec;
@@ -37,6 +37,7 @@ import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
+import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat;
diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/package-info.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/package-info.java
new file mode 100644
index 00000000000..aac717a3e6c
--- /dev/null
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/package-info.java
@@ -0,0 +1,433 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Lucene 9.12 file format.
+ *
+ *
Apache Lucene - Index File Formats
+ *
+ *
+ *
+ * Introduction
+ *
+ *
+ *
+ * <p>This document defines the index file formats used in this version of Lucene. If you are using
+ * a different version of Lucene, please consult the copy of <code>docs/</code> that was distributed
+ * with the version you are using.
+ *
+ *
This document attempts to provide a high-level definition of the Apache Lucene file formats.
+ *
+ *
+ * Definitions
+ *
+ *
+ *
+ *
The fundamental concepts in Lucene are index, document, field and term.
+ *
+ *
An index contains a sequence of documents.
+ *
+ *
+ * - A document is a sequence of fields.
+ *
- A field is a named sequence of terms.
+ *
- A term is a sequence of bytes.
+ *
+ *
+ *
The same sequence of bytes in two different fields is considered a different term. Thus terms
+ * are represented as a pair: the string naming the field, and the bytes within the field.
+ *
+ *
Inverted Indexing
+ *
+ *
Lucene's index stores terms and statistics about those terms in order to make term-based
+ * search more efficient. Lucene's terms index falls into the family of indexes known as an
+ * inverted index. This is because it can list, for a term, the documents that contain it.
+ * This is the inverse of the natural relationship, in which documents list terms.
+ *
+ *
Types of Fields
+ *
+ *
In Lucene, fields may be stored, in which case their text is stored in the index
+ * literally, in a non-inverted manner. Fields that are inverted are called indexed. A field
+ * may be both stored and indexed.
+ *
+ *
The text of a field may be tokenized into terms to be indexed, or the text of a field
+ * may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is
+ * useful for certain identifier fields to be indexed literally.
+ *
+ *
See the {@link org.apache.lucene.document.Field Field} java docs for more information on
+ * Fields.
+ *
+ *
Segments
+ *
+ *
Lucene indexes may be composed of multiple sub-indexes, or segments. Each segment is a
+ * fully independent index, which could be searched separately. Indexes evolve by:
+ *
+ *
+ * - Creating new segments for newly added documents.
+ *
- Merging existing segments.
+ *
+ *
+ *
Searches may involve multiple segments and/or multiple indexes, each index potentially
+ * composed of a set of segments.
+ *
+ *
Document Numbers
+ *
+ *
Internally, Lucene refers to documents by an integer document number. The first
+ * document added to an index is numbered zero, and each subsequent document added gets a number one
+ * greater than the previous.
+ *
+ *
Note that a document's number may change, so caution should be taken when storing these
+ * numbers outside of Lucene. In particular, numbers may change in the following situations:
+ *
+ *
+ * -
+ *
The numbers stored in each segment are unique only within the segment, and must be
+ * converted before they can be used in a larger context. The standard technique is to
+ * allocate each segment a range of values, based on the range of numbers used in that
+ * segment. To convert a document number from a segment to an external value, the segment's
+ * base document number is added. To convert an external value back to a
+ * segment-specific value, the segment is identified by the range that the external value is
+ * in, and the segment's base value is subtracted. For example two five document segments
+ * might be combined, so that the first segment has a base value of zero, and the second of
+ * five. Document three from the second segment would have an external value of eight.
+ *
-
+ *
When documents are deleted, gaps are created in the numbering. These are eventually
+ * removed as the index evolves through merging. Deleted documents are dropped when segments
+ * are merged. A freshly-merged segment thus has no gaps in its numbering.
+ *
+ *
+ *
+ *
+ * Index Structure Overview
+ *
+ *
+ *
+ *
Each segment index maintains the following:
+ *
+ *
+ * - {@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment info}. This
+ * contains metadata about a segment, such as the number of documents, what files it uses, and
+ * information about how the segment is sorted
+ *
- {@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This
+ * contains metadata about the set of named fields used in the index.
+ *
- {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}.
+ * This contains, for each document, a list of attribute-value pairs, where the attributes are
+ * field names. These are used to store auxiliary information about the document, such as its
+ * title, url, or an identifier to access a database. The set of stored fields are what is
+ * returned for each hit when searching. This is keyed by document number.
+ *
- {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term dictionary}. A
+ * dictionary containing all of the terms used in all of the indexed fields of all of the
+ * documents. The dictionary also contains the number of documents which contain the term, and
+ * pointers to the term's frequency and proximity data.
+ *
- {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Frequency data}. For
+ * each term in the dictionary, the numbers of all the documents that contain that term, and
+ * the frequency of the term in that document, unless frequencies are omitted ({@link
+ * org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
+ *
- {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Proximity data}. For
+ * each term in the dictionary, the positions that the term occurs in each document. Note that
+ * this will not exist if all fields in all documents omit position data.
+ *
- {@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For
+ * each field in each document, a value is stored that is multiplied into the score for hits
+ * on that field.
+ *
- {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each
+ * field in each document, the term vector (sometimes called document vector) may be stored. A
+ * term vector consists of term text and term frequency. To add Term Vectors to your index see
+ * the {@link org.apache.lucene.document.Field Field} constructors
+ *
- {@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like
+ * stored values, these are also keyed by document number, but are generally intended to be
+ * loaded into main memory for fast access. Whereas stored values are generally intended for
+ * summary results from searches, per-document values are useful for things like scoring
+ * factors.
+ *
- {@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An
+ * optional file indicating which documents are live.
+ *
- {@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair
+ * of files, recording dimensionally indexed fields, to enable fast numeric range filtering
+ * and large numeric values like BigInteger and BigDecimal (1D) and geographic shape
+ * intersection (2D, 3D).
+ *
- {@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}. The
+ * vector format stores numeric vectors in a format optimized for random access and
+ * computation, supporting high-dimensional nearest-neighbor search.
+ *
+ *
+ *
Details on each of these are provided in their linked pages.
+ *
+ * File Naming
+ *
+ *
+ *
+ *
All files belonging to a segment have the same name with varying extensions. The extensions
+ * correspond to the different file formats described below. When using the Compound File format
+ * (default for small segments) these files (except for the Segment info file, the Lock file, and
+ * Deleted documents file) are collapsed into a single .cfs file (see below for details)
+ *
+ *
Typically, all segments in an index are stored in a single directory, although this is not
+ * required.
+ *
+ *
File names are never re-used. That is, when any file is saved to the Directory it is given a
+ * never before used filename. This is achieved using a simple generations approach. For example,
+ * the first segments file is segments_1, then segments_2, etc. The generation is a sequential long
+ * integer represented in alpha-numeric (base 36) form.
+ *
+ * Summary of File Extensions
+ *
+ *
+ *
+ *
The following table summarizes the names and extensions of the files in Lucene:
+ *
+ *
+ * lucene filenames by extension
+ *
+ * Name |
+ * Extension |
+ * Brief Description |
+ *
+ *
+ * {@link org.apache.lucene.index.SegmentInfos Segments File} |
+ * segments_N |
+ * Stores information about a commit point |
+ *
+ *
+ * Lock File |
+ * write.lock |
+ * The Write lock prevents multiple IndexWriters from writing to the same
+ * file. |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment Info} |
+ * .si |
+ * Stores metadata about a segment |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File} |
+ * .cfs, .cfe |
+ * An optional "virtual" file consisting of all the other index files for
+ * systems that frequently run out of file handles. |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields} |
+ * .fnm |
+ * Stores information about the fields |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index} |
+ * .fdx |
+ * Contains pointers to field data |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data} |
+ * .fdt |
+ * The stored fields for documents |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Dictionary} |
+ * .tim |
+ * The term dictionary, stores term info |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Index} |
+ * .tip |
+ * The index into the Term Dictionary |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Frequencies} |
+ * .doc |
+ * Contains the list of docs which contain each term along with frequency |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Positions} |
+ * .pos |
+ * Stores position information about where a term occurs in the index |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Payloads} |
+ * .pay |
+ * Stores additional per-position metadata information such as character offsets and user payloads |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms} |
+ * .nvd, .nvm |
+ * Encodes length and boost factors for docs and fields |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values} |
+ * .dvd, .dvm |
+ * Encodes additional scoring factors or other per-document information. |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index} |
+ * .tvx |
+ * Stores offset into the document data file |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data} |
+ * .tvd |
+ * Contains term vector data. |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents} |
+ * .liv |
+ * Info about what documents are live |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values} |
+ * .kdd, .kdi, .kdm |
+ * Holds indexed points |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values} |
+ * .vec, .vem, .veq, .vex |
+ * Holds indexed vectors; .vec files contain the raw vector data,
+ * .vem the vector metadata, .veq the quantized vector data, and .vex the
+ * hnsw graph data. |
+ *
+ *
+ *
+ *
+ *
+ * Lock File
+ *
+ * The write lock, which is stored in the index directory by default, is named "write.lock". If the
+ * lock directory is different from the index directory then the write lock will be named
+ * "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index
+ * directory. When this file is present, a writer is currently modifying the index (adding or
+ * removing documents). This lock file ensures that only one writer is modifying the index at a
+ * time.
+ *
+ * History
+ *
+ * Compatibility notes are provided in this document, describing how file formats have changed
+ * from prior versions:
+ *
+ *
+ * - In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit
+ * lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching
+ * or adding/deleting of docs. When the new segments file is saved (committed), it will be
+ * written in the new file format (meaning no specific "upgrade" process is needed). But note
+ * that once a commit has occurred, pre-2.1 Lucene will not be able to read the index.
+ *
- In version 2.3, the file format was changed to allow segments to share a single set of doc
+ * store (vectors & stored fields) files. This allows for faster indexing in certain
+ * cases. The change is fully backwards compatible (in the same way as the lock-less commits
+ * change in 2.1).
+ *
- In version 2.4, Strings are now written as true UTF-8 byte sequence, not Java's modified
+ * UTF-8. See LUCENE-510 for
+ * details.
+ *
- In version 2.9, an optional opaque Map<String,String> CommitUserData may be passed to
+ * IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N
+ * file. See LUCENE-1382 for
+ * details. Also, diagnostics were added to each segment written recording details about why
+ * it was written (due to flush, merge; which OS/JRE was used; etc.). See issue LUCENE-1654 for details.
+ *
- In version 3.0, compressed fields are no longer written to the index (they can still be
+ * read, but on merge the new segment will write them, uncompressed). See issue LUCENE-1960 for details.
+ *
- In version 3.1, segments record the code version that created them. See LUCENE-2720 for details.
+ * Additionally segments track explicitly whether or not they have term vectors. See LUCENE-2811 for details.
+ *
- In version 3.2, numeric fields are written natively to the stored fields file; previously
+ * they were stored in text format only.
+ *
- In version 3.4, fields can omit position data while still indexing term frequencies.
+ *
- In version 4.0, the format of the inverted index became extensible via the {@link
+ * org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues})
+ * was introduced. Normalization factors need no longer be a single byte, they can be any
+ * {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be
+ * unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into
+ * the postings lists. Payloads can be stored in the term vectors.
+ *
- In version 4.1, the format of the postings list changed to use either of FOR compression or
+ * variable-byte encoding, depending upon the frequency of the term. Terms appearing only once
+ * were changed to inline directly into the term dictionary. Stored fields are compressed by
+ * default.
+ *
- In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued
+ * type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields.
+ *
- In version 4.5, DocValues were extended to explicitly represent missing values.
+ *
- In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
+ * allow updating NumericDocValues fields.
+ *
- In version 4.8, checksum footers were added to the end of each index file for improved data
+ * integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32
+ * checksum of the file.
+ *
- In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is
+ * suitable for faceting/sorting/analytics.
+ *
- In version 5.4, DocValues have been improved to store more information on disk: addresses
+ * for binary fields and ord indexes for multi-valued fields.
+ *
- In version 6.0, Points were added, for multi-dimensional range/distance search.
+ *
- In version 6.2, new Segment info format that reads/writes the index sort, to support index
+ * sorting.
+ *
- In version 7.0, DocValues have been improved to better support sparse doc values thanks to
+ * an iterator API.
+ *
- In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term
+ * freq, normalization factor) pairs that may trigger the maximum score of the block. This
+ * information is recorded alongside skip data in order to be able to skip blocks of doc ids
+ * if they may not produce high enough scores. Additionally doc values and norms have been
+ * extended with jump-tables to make access O(1) instead of O(n), where n is the number of
+ * elements to skip when advancing in the data.
+ *
- In version 8.4, postings, positions, offsets and payload lengths have moved to a more
+ * performant encoding that is vectorized.
+ *
- In version 8.6, index sort serialization is delegated to the sorts themselves, to allow
+ * user-defined sorts to be used
+ *
- In version 8.6, points fields split the index tree and leaf data into separate files, to
+ * allow for different access patterns to the different data structures
+ *
- In version 8.7, stored fields compression became adaptive to better handle documents with
+ * smaller stored fields.
+ *
- In version 9.0, vector-valued fields were added.
+ *
- In version 9.1, vector-valued fields were modified to add a graph hierarchy.
+ *
- In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by
+ * IndexDISI. ordToDoc mappings were added to .vem.
+ *
- In version 9.5, HNSW graph connections were changed to be delta-encoded with vints.
+ * Additionally, metadata file size improvements were made by delta-encoding nodes by graph
+ * layer and not writing the node ids for the zeroth layer.
+ *
- In version 9.9, vector scalar quantization support was added, allowing the HNSW vector
+ * format to utilize int8 quantized vectors for float32 vector search.
+ *
- In version 9.12, skip data was refactored to have only two levels: every 128 docs and every
+ * 4,096 docs, and to be inlined in postings lists. This resulted in a speedup for queries that
+ * need skipping, especially conjunctions.
+ *
+ *
+ *
+ *
+ * Limitations
+ *
+ *
+ *
+ * <p>Lucene uses a Java <code>int</code> to refer to document numbers, and the index file format
+ * uses an <code>Int32</code> on-disk to store document numbers. This is a limitation of both the
+ * index file format and the current implementation. Eventually these should be replaced with either
+ * <code>UInt64</code> values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
+ * VInt} values which have no limit.
+ */
+package org.apache.lucene.backward_codecs.lucene912;
diff --git a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
index df14387fc68..a4638b5fcc7 100644
--- a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
+++ b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
@@ -23,3 +23,4 @@ org.apache.lucene.backward_codecs.lucene92.Lucene92Codec
org.apache.lucene.backward_codecs.lucene94.Lucene94Codec
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec
org.apache.lucene.backward_codecs.lucene99.Lucene99Codec
+org.apache.lucene.backward_codecs.lucene912.Lucene912Codec
diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/bitvectors/TestHnswBitVectorsFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/bitvectors/TestHnswBitVectorsFormat.java
index ab20ee67c8c..a0ea5833e2e 100644
--- a/lucene/codecs/src/test/org/apache/lucene/codecs/bitvectors/TestHnswBitVectorsFormat.java
+++ b/lucene/codecs/src/test/org/apache/lucene/codecs/bitvectors/TestHnswBitVectorsFormat.java
@@ -22,7 +22,7 @@ import java.io.IOException;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.KnnVectorsFormat;
-import org.apache.lucene.codecs.lucene912.Lucene912Codec;
+import org.apache.lucene.codecs.lucene100.Lucene100Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.KnnByteVectorField;
@@ -42,7 +42,7 @@ import org.apache.lucene.tests.index.BaseIndexFileFormatTestCase;
public class TestHnswBitVectorsFormat extends BaseIndexFileFormatTestCase {
@Override
protected Codec getCodec() {
- return new Lucene912Codec() {
+ return new Lucene100Codec() {
@Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
return new HnswBitVectorsFormat();
diff --git a/lucene/core/src/java/module-info.java b/lucene/core/src/java/module-info.java
index a0f0bad01eb..6fd1767aa34 100644
--- a/lucene/core/src/java/module-info.java
+++ b/lucene/core/src/java/module-info.java
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-import org.apache.lucene.codecs.lucene912.Lucene912Codec;
+import org.apache.lucene.codecs.lucene100.Lucene100Codec;
/** Lucene Core. */
@SuppressWarnings("module") // the test framework is compiled after the core...
@@ -34,6 +34,7 @@ module org.apache.lucene.core {
exports org.apache.lucene.codecs.lucene95;
exports org.apache.lucene.codecs.lucene99;
exports org.apache.lucene.codecs.lucene912;
+ exports org.apache.lucene.codecs.lucene100;
exports org.apache.lucene.codecs.perfield;
exports org.apache.lucene.codecs;
exports org.apache.lucene.document;
@@ -72,7 +73,7 @@ module org.apache.lucene.core {
provides org.apache.lucene.analysis.TokenizerFactory with
org.apache.lucene.analysis.standard.StandardTokenizerFactory;
provides org.apache.lucene.codecs.Codec with
- Lucene912Codec;
+ Lucene100Codec;
provides org.apache.lucene.codecs.DocValuesFormat with
org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
provides org.apache.lucene.codecs.KnnVectorsFormat with
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java
index e5a5dac8ff5..ff5a5bb21c0 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java
@@ -55,7 +55,7 @@ public abstract class Codec implements NamedSPILoader.NamedSPI {
return LOADER;
}
- static Codec defaultCodec = LOADER.lookup("Lucene912");
+ static Codec defaultCodec = LOADER.lookup("Lucene100");
}
private final String name;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene100/Lucene100Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene100/Lucene100Codec.java
new file mode 100644
index 00000000000..97dc23bc07b
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene100/Lucene100Codec.java
@@ -0,0 +1,218 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene100;
+
+import java.util.Objects;
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.CompoundFormat;
+import org.apache.lucene.codecs.DocValuesFormat;
+import org.apache.lucene.codecs.FieldInfosFormat;
+import org.apache.lucene.codecs.FilterCodec;
+import org.apache.lucene.codecs.KnnVectorsFormat;
+import org.apache.lucene.codecs.LiveDocsFormat;
+import org.apache.lucene.codecs.NormsFormat;
+import org.apache.lucene.codecs.PointsFormat;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.SegmentInfoFormat;
+import org.apache.lucene.codecs.StoredFieldsFormat;
+import org.apache.lucene.codecs.TermVectorsFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
+import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
+import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat;
+import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
+import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat;
+import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
+import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
+import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
+
+/**
+ * Implements the Lucene 10.0 index format.
+ *
+ * <p>If you want to reuse functionality of this codec in another codec, extend {@link FilterCodec}.
+ *
+ * <p>The postings, doc values and KNN vectors formats exposed by this codec are per-field wrappers
+ * that delegate to {@link #getPostingsFormatForField(String)}, {@link
+ * #getDocValuesFormatForField(String)} and {@link #getKnnVectorsFormatForField(String)}; subclasses
+ * may override those methods to pick a different format for individual fields. All other format
+ * accessors are {@code final}.
+ *
+ * @see org.apache.lucene.codecs.lucene100 package documentation for file format details.
+ * @lucene.experimental
+ */
+public class Lucene100Codec extends Codec {
+
+ /**
+ * Configuration option for the codec. Each constant carries the {@link
+ * Lucene90StoredFieldsFormat.Mode} that is handed to the stored fields format.
+ */
+ public enum Mode {
+ /** Trade compression ratio for retrieval speed. */
+ BEST_SPEED(Lucene90StoredFieldsFormat.Mode.BEST_SPEED),
+ /** Trade retrieval speed for compression ratio. */
+ BEST_COMPRESSION(Lucene90StoredFieldsFormat.Mode.BEST_COMPRESSION);
+
+ private final Lucene90StoredFieldsFormat.Mode storedMode;
+
+ private Mode(Lucene90StoredFieldsFormat.Mode storedMode) {
+ this.storedMode = Objects.requireNonNull(storedMode);
+ }
+ }
+
+ private final TermVectorsFormat vectorsFormat = new Lucene90TermVectorsFormat();
+ private final FieldInfosFormat fieldInfosFormat = new Lucene94FieldInfosFormat();
+ private final SegmentInfoFormat segmentInfosFormat = new Lucene99SegmentInfoFormat();
+ private final LiveDocsFormat liveDocsFormat = new Lucene90LiveDocsFormat();
+ private final CompoundFormat compoundFormat = new Lucene90CompoundFormat();
+ private final NormsFormat normsFormat = new Lucene90NormsFormat();
+
+ private final PostingsFormat defaultPostingsFormat;
+ private final PostingsFormat postingsFormat =
+ new PerFieldPostingsFormat() {
+ @Override
+ public PostingsFormat getPostingsFormatForField(String field) {
+ return Lucene100Codec.this.getPostingsFormatForField(field);
+ }
+ };
+
+ private final DocValuesFormat defaultDVFormat;
+ private final DocValuesFormat docValuesFormat =
+ new PerFieldDocValuesFormat() {
+ @Override
+ public DocValuesFormat getDocValuesFormatForField(String field) {
+ return Lucene100Codec.this.getDocValuesFormatForField(field);
+ }
+ };
+
+ private final KnnVectorsFormat defaultKnnVectorsFormat;
+ private final KnnVectorsFormat knnVectorsFormat =
+ new PerFieldKnnVectorsFormat() {
+ @Override
+ public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
+ return Lucene100Codec.this.getKnnVectorsFormatForField(field);
+ }
+ };
+
+ private final StoredFieldsFormat storedFieldsFormat;
+
+ /** Instantiates a new codec that uses {@link Mode#BEST_SPEED} for stored fields. */
+ public Lucene100Codec() {
+ this(Mode.BEST_SPEED);
+ }
+
+ /**
+ * Instantiates a new codec, specifying the stored fields compression mode to use.
+ *
+ * <p>The codec is registered under the name "Lucene100". Its default per-field formats are {@link
+ * Lucene912PostingsFormat} for postings, {@link Lucene90DocValuesFormat} for doc values and {@link
+ * Lucene99HnswVectorsFormat} for KNN vectors.
+ *
+ * @param mode stored fields compression mode to use for newly flushed/merged segments.
+ * @throws NullPointerException if {@code mode} is {@code null}
+ */
+ public Lucene100Codec(Mode mode) {
+ super("Lucene100");
+ this.storedFieldsFormat =
+ new Lucene90StoredFieldsFormat(Objects.requireNonNull(mode).storedMode);
+ this.defaultPostingsFormat = new Lucene912PostingsFormat();
+ this.defaultDVFormat = new Lucene90DocValuesFormat();
+ this.defaultKnnVectorsFormat = new Lucene99HnswVectorsFormat();
+ }
+
+ @Override
+ public final StoredFieldsFormat storedFieldsFormat() {
+ return storedFieldsFormat;
+ }
+
+ @Override
+ public final TermVectorsFormat termVectorsFormat() {
+ return vectorsFormat;
+ }
+
+ @Override
+ public final PostingsFormat postingsFormat() {
+ return postingsFormat;
+ }
+
+ @Override
+ public final FieldInfosFormat fieldInfosFormat() {
+ return fieldInfosFormat;
+ }
+
+ @Override
+ public final SegmentInfoFormat segmentInfoFormat() {
+ return segmentInfosFormat;
+ }
+
+ @Override
+ public final LiveDocsFormat liveDocsFormat() {
+ return liveDocsFormat;
+ }
+
+ @Override
+ public final CompoundFormat compoundFormat() {
+ return compoundFormat;
+ }
+
+ @Override
+ public final PointsFormat pointsFormat() {
+ return new Lucene90PointsFormat();
+ }
+
+ @Override
+ public final KnnVectorsFormat knnVectorsFormat() {
+ return knnVectorsFormat;
+ }
+
+ /**
+ * Returns the postings format that should be used for writing new segments of {@code field}.
+ *
+ * <p>The default implementation always returns "Lucene912".
+ *
+ * <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility:
+ * future versions of Lucene are only guaranteed to be able to read the default implementation.
+ *
+ * @param field the field to select a format for (unused by the default implementation)
+ */
+ public PostingsFormat getPostingsFormatForField(String field) {
+ return defaultPostingsFormat;
+ }
+
+ /**
+ * Returns the docvalues format that should be used for writing new segments of {@code field}.
+ *
+ * <p>The default implementation always returns "Lucene90".
+ *
+ * <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility:
+ * future versions of Lucene are only guaranteed to be able to read the default implementation.
+ *
+ * @param field the field to select a format for (unused by the default implementation)
+ */
+ public DocValuesFormat getDocValuesFormatForField(String field) {
+ return defaultDVFormat;
+ }
+
+ /**
+ * Returns the vectors format that should be used for writing new segments of {@code field}.
+ *
+ * <p>The default implementation always returns "Lucene99HnswVectorsFormat".
+ *
+ * <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility:
+ * future versions of Lucene are only guaranteed to be able to read the default implementation.
+ *
+ * @param field the field to select a format for (unused by the default implementation)
+ */
+ public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
+ return defaultKnnVectorsFormat;
+ }
+
+ @Override
+ public final DocValuesFormat docValuesFormat() {
+ return docValuesFormat;
+ }
+
+ @Override
+ public final NormsFormat normsFormat() {
+ return normsFormat;
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene100/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene100/package-info.java
new file mode 100644
index 00000000000..64189bfa9d1
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene100/package-info.java
@@ -0,0 +1,433 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Lucene 10.0 file format.
+ *
+ *
Apache Lucene - Index File Formats
+ *
+ *
+ *
+ * Introduction
+ *
+ *
+ *
+ *
+ * <p>This document defines the index file formats used in this version of Lucene. If you are using
+ * a different version of Lucene, please consult the copy of {@code docs/} that was distributed
+ * with the version you are using.
+ *
+ *
This document attempts to provide a high-level definition of the Apache Lucene file formats.
+ *
+ *
+ * Definitions
+ *
+ *
+ *
+ *
The fundamental concepts in Lucene are index, document, field and term.
+ *
+ *
An index contains a sequence of documents.
+ *
+ *
+ * - A document is a sequence of fields.
+ *
- A field is a named sequence of terms.
+ *
- A term is a sequence of bytes.
+ *
+ *
+ *
The same sequence of bytes in two different fields is considered a different term. Thus terms
+ * are represented as a pair: the string naming the field, and the bytes within the field.
+ *
+ *
Inverted Indexing
+ *
+ *
Lucene's index stores terms and statistics about those terms in order to make term-based
+ * search more efficient. Lucene's terms index falls into the family of indexes known as an
+ * inverted index. This is because it can list, for a term, the documents that contain it.
+ * This is the inverse of the natural relationship, in which documents list terms.
+ *
+ *
Types of Fields
+ *
+ *
In Lucene, fields may be stored, in which case their text is stored in the index
+ * literally, in a non-inverted manner. Fields that are inverted are called indexed. A field
+ * may be both stored and indexed.
+ *
+ *
The text of a field may be tokenized into terms to be indexed, or the text of a field
+ * may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is
+ * useful for certain identifier fields to be indexed literally.
+ *
+ *
See the {@link org.apache.lucene.document.Field Field} java docs for more information on
+ * Fields.
+ *
+ *
Segments
+ *
+ *
Lucene indexes may be composed of multiple sub-indexes, or segments. Each segment is a
+ * fully independent index, which could be searched separately. Indexes evolve by:
+ *
+ *
+ * - Creating new segments for newly added documents.
+ *
- Merging existing segments.
+ *
+ *
+ *
Searches may involve multiple segments and/or multiple indexes, each index potentially
+ * composed of a set of segments.
+ *
+ *
Document Numbers
+ *
+ *
Internally, Lucene refers to documents by an integer document number. The first
+ * document added to an index is numbered zero, and each subsequent document added gets a number one
+ * greater than the previous.
+ *
+ *
Note that a document's number may change, so caution should be taken when storing these
+ * numbers outside of Lucene. In particular, numbers may change in the following situations:
+ *
+ *
+ * -
+ *
The numbers stored in each segment are unique only within the segment, and must be
+ * converted before they can be used in a larger context. The standard technique is to
+ * allocate each segment a range of values, based on the range of numbers used in that
+ * segment. To convert a document number from a segment to an external value, the segment's
+ * base document number is added. To convert an external value back to a
+ * segment-specific value, the segment is identified by the range that the external value is
+ * in, and the segment's base value is subtracted. For example two five document segments
+ * might be combined, so that the first segment has a base value of zero, and the second of
+ * five. Document three from the second segment would have an external value of eight.
+ *
-
+ *
When documents are deleted, gaps are created in the numbering. These are eventually
+ * removed as the index evolves through merging. Deleted documents are dropped when segments
+ * are merged. A freshly-merged segment thus has no gaps in its numbering.
+ *
+ *
+ *
+ *
+ * Index Structure Overview
+ *
+ *
+ *
+ *
Each segment index maintains the following:
+ *
+ *
+ * - {@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment info}. This
+ * contains metadata about a segment, such as the number of documents, what files it uses, and
+ * information about how the segment is sorted
+ *
- {@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This
+ * contains metadata about the set of named fields used in the index.
+ *
- {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}.
+ * This contains, for each document, a list of attribute-value pairs, where the attributes are
+ * field names. These are used to store auxiliary information about the document, such as its
+ * title, url, or an identifier to access a database. The set of stored fields are what is
+ * returned for each hit when searching. This is keyed by document number.
+ *
- {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term dictionary}. A
+ * dictionary containing all of the terms used in all of the indexed fields of all of the
+ * documents. The dictionary also contains the number of documents which contain the term, and
+ * pointers to the term's frequency and proximity data.
+ *
- {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Frequency data}. For
+ * each term in the dictionary, the numbers of all the documents that contain that term, and
+ * the frequency of the term in that document, unless frequencies are omitted ({@link
+ * org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
+ *
- {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Proximity data}. For
+ * each term in the dictionary, the positions that the term occurs in each document. Note that
+ * this will not exist if all fields in all documents omit position data.
+ *
- {@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For
+ * each field in each document, a value is stored that is multiplied into the score for hits
+ * on that field.
+ *
- {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each
+ * field in each document, the term vector (sometimes called document vector) may be stored. A
+ * term vector consists of term text and term frequency. To add Term Vectors to your index see
+ * the {@link org.apache.lucene.document.Field Field} constructors
+ *
- {@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like
+ * stored values, these are also keyed by document number, but are generally intended to be
+ * loaded into main memory for fast access. Whereas stored values are generally intended for
+ * summary results from searches, per-document values are useful for things like scoring
+ * factors.
+ *
- {@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An
+ * optional file indicating which documents are live.
+ *
- {@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair
+ * of files, recording dimensionally indexed fields, to enable fast numeric range filtering
+ * and large numeric values like BigInteger and BigDecimal (1D) and geographic shape
+ * intersection (2D, 3D).
+ *
- {@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}. The
+ * vector format stores numeric vectors in a format optimized for random access and
+ * computation, supporting high-dimensional nearest-neighbor search.
+ *
+ *
+ *
Details on each of these are provided in their linked pages.
+ *
+ * File Naming
+ *
+ *
+ *
+ *
All files belonging to a segment have the same name with varying extensions. The extensions
+ * correspond to the different file formats described below. When using the Compound File format
+ * (default for small segments) these files (except for the Segment info file, the Lock file, and
+ * Deleted documents file) are collapsed into a single .cfs file (see below for details)
+ *
+ *
Typically, all segments in an index are stored in a single directory, although this is not
+ * required.
+ *
+ *
File names are never re-used. That is, when any file is saved to the Directory it is given a
+ * never before used filename. This is achieved using a simple generations approach. For example,
+ * the first segments file is segments_1, then segments_2, etc. The generation is a sequential long
+ * integer represented in alpha-numeric (base 36) form.
+ *
+ * Summary of File Extensions
+ *
+ *
+ *
+ *
The following table summarizes the names and extensions of the files in Lucene:
+ *
+ *
+ * lucene filenames by extension
+ *
+ * Name |
+ * Extension |
+ * Brief Description |
+ *
+ *
+ * {@link org.apache.lucene.index.SegmentInfos Segments File} |
+ * segments_N |
+ * Stores information about a commit point |
+ *
+ *
+ * Lock File |
+ * write.lock |
+ * The Write lock prevents multiple IndexWriters from writing to the same
+ * file. |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment Info} |
+ * .si |
+ * Stores metadata about a segment |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File} |
+ * .cfs, .cfe |
+ * An optional "virtual" file consisting of all the other index files for
+ * systems that frequently run out of file handles. |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields} |
+ * .fnm |
+ * Stores information about the fields |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index} |
+ * .fdx |
+ * Contains pointers to field data |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data} |
+ * .fdt |
+ * The stored fields for documents |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Dictionary} |
+ * .tim |
+ * The term dictionary, stores term info |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Index} |
+ * .tip |
+ * The index into the Term Dictionary |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Frequencies} |
+ * .doc |
+ * Contains the list of docs which contain each term along with frequency |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Positions} |
+ * .pos |
+ * Stores position information about where a term occurs in the index |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Payloads} |
+ * .pay |
+ * Stores additional per-position metadata information such as character offsets and user payloads |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms} |
+ * .nvd, .nvm |
+ * Encodes length and boost factors for docs and fields |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values} |
+ * .dvd, .dvm |
+ * Encodes additional scoring factors or other per-document information. |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index} |
+ * .tvx |
+ * Stores offset into the document data file |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data} |
+ * .tvd |
+ * Contains term vector data. |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents} |
+ * .liv |
+ * Info about what documents are live |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values} |
+ * .kdd, .kdi, .kdm |
+ * Holds indexed points |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values} |
+ * .vec, .vem, .veq, .vex |
+ * Holds indexed vectors; .vec files contain the raw vector data,
+ * .vem the vector metadata, .veq the quantized vector data, and .vex the
+ * hnsw graph data. |
+ *
+ *
+ *
+ *
+ *
+ * Lock File
+ *
+ * The write lock, which is stored in the index directory by default, is named "write.lock". If the
+ * lock directory is different from the index directory then the write lock will be named
+ * "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index
+ * directory. When this file is present, a writer is currently modifying the index (adding or
+ * removing documents). This lock file ensures that only one writer is modifying the index at a
+ * time.
+ *
+ * History
+ *
+ * Compatibility notes are provided in this document, describing how file formats have changed
+ * from prior versions:
+ *
+ *
+ * - In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit
+ * lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching
+ * or adding/deleting of docs. When the new segments file is saved (committed), it will be
+ * written in the new file format (meaning no specific "upgrade" process is needed). But note
+ * that once a commit has occurred, pre-2.1 Lucene will not be able to read the index.
+ *
- In version 2.3, the file format was changed to allow segments to share a single set of doc
+ * store (vectors & stored fields) files. This allows for faster indexing in certain
+ * cases. The change is fully backwards compatible (in the same way as the lock-less commits
+ * change in 2.1).
+ *
- In version 2.4, Strings are now written as a true UTF-8 byte sequence, not Java's modified
+ * UTF-8. See LUCENE-510 for
+ * details.
+ *
- In version 2.9, an optional opaque Map<String,String> CommitUserData may be passed to
+ * IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N
+ * file. See LUCENE-1382 for
+ * details. Also, diagnostics were added to each segment written recording details about why
+ * it was written (due to flush, merge; which OS/JRE was used; etc.). See issue LUCENE-1654 for details.
+ *
- In version 3.0, compressed fields are no longer written to the index (they can still be
+ * read, but on merge the new segment will write them, uncompressed). See issue LUCENE-1960 for details.
+ *
- In version 3.1, segments record the code version that created them. See LUCENE-2720 for details.
+ * Additionally segments track explicitly whether or not they have term vectors. See LUCENE-2811 for details.
+ *
- In version 3.2, numeric fields are written natively to the stored fields file; previously
+ * they were stored in text format only.
+ *
- In version 3.4, fields can omit position data while still indexing term frequencies.
+ *
- In version 4.0, the format of the inverted index became extensible via the {@link
+ * org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues})
+ * was introduced. Normalization factors need no longer be a single byte, they can be any
+ * {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be
+ * unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into
+ * the postings lists. Payloads can be stored in the term vectors.
+ *
- In version 4.1, the format of the postings list changed to use either of FOR compression or
+ * variable-byte encoding, depending upon the frequency of the term. Terms appearing only once
+ * were changed to inline directly into the term dictionary. Stored fields are compressed by
+ * default.
+ *
- In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued
+ * type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields.
+ *
- In version 4.5, DocValues were extended to explicitly represent missing values.
+ *
- In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
+ * allow updating NumericDocValues fields.
+ *
- In version 4.8, checksum footers were added to the end of each index file for improved data
+ * integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32
+ * checksum of the file.
+ *
- In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is
+ * suitable for faceting/sorting/analytics.
+ *
- In version 5.4, DocValues have been improved to store more information on disk: addresses
+ * for binary fields and ord indexes for multi-valued fields.
+ *
- In version 6.0, Points were added, for multi-dimensional range/distance search.
+ *
- In version 6.2, new Segment info format that reads/writes the index sort, to support index
+ * sorting.
+ *
- In version 7.0, DocValues have been improved to better support sparse doc values thanks to
+ * an iterator API.
+ *
- In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term
+ * freq, normalization factor) pairs that may trigger the maximum score of the block. This
+ * information is recorded alongside skip data in order to be able to skip blocks of doc ids
+ * if they may not produce high enough scores. Additionally, doc values and norms have been
+ * extended with jump-tables to make access O(1) instead of O(n), where n is the number of
+ * elements to skip when advancing in the data.
+ *
- In version 8.4, postings, positions, offsets and payload lengths have moved to a more
+ * performant encoding that is vectorized.
+ *
- In version 8.6, index sort serialization is delegated to the sorts themselves, to allow
+ * user-defined sorts to be used
+ *
- In version 8.6, points fields split the index tree and leaf data into separate files, to
+ * allow for different access patterns to the different data structures
+ *
- In version 8.7, stored fields compression became adaptive to better handle documents with
+ * smaller stored fields.
+ *
- In version 9.0, vector-valued fields were added.
+ *
- In version 9.1, vector-valued fields were modified to add a graph hierarchy.
+ *
- In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by
+ * IndexDISI. ordToDoc mappings were added to .vem.
+ *
- In version 9.5, HNSW graph connections were changed to be delta-encoded with vints.
+ * Additionally, metadata file size improvements were made by delta-encoding nodes by graph
+ * layer and not writing the node ids for the zeroth layer.
+ *
- In version 9.9, vector scalar quantization support was added, allowing the HNSW vector
+ * format to utilize int8 quantized vectors for float32 vector search.
+ *
- In version 9.12, skip data was refactored to have only two levels: every 128 docs and every
+ * 4,096 docs, and to be inlined in postings lists. This resulted in a speedup for queries that
+ * need skipping, especially conjunctions.
+ *
+ *
+ *
+ *
+ * Limitations
+ *
+ *
+ *
+ *
+ * <p>Lucene uses a Java {@code int} to refer to document numbers, and the index file format
+ * uses an {@code Int32} on-disk to store document numbers. This is a limitation of both the
+ * index file format and the current implementation. Eventually these should be replaced with either
+ * {@code UInt64} values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
+ * VInt} values which have no limit.
+ */
+package org.apache.lucene.codecs.lucene100;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90StoredFieldsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90StoredFieldsFormat.java
index ce0310d6396..9e367a3d9d8 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90StoredFieldsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90StoredFieldsFormat.java
@@ -49,9 +49,9 @@ import org.apache.lucene.util.packed.DirectMonotonicWriter;
*
*
* // the default: for high performance
- * indexWriterConfig.setCodec(new Lucene912Codec(Mode.BEST_SPEED));
+ * indexWriterConfig.setCodec(new Lucene100Codec(Mode.BEST_SPEED));
- * // instead for higher performance (but slower):
+ * // instead for higher compression (but slower):
- * // indexWriterConfig.setCodec(new Lucene912Codec(Mode.BEST_COMPRESSION));
+ * // indexWriterConfig.setCodec(new Lucene100Codec(Mode.BEST_COMPRESSION));
*
*
* File formats
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene912/package-info.java
index 8dcab480d27..b9ddb1227b1 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene912/package-info.java
@@ -15,419 +15,5 @@
* limitations under the License.
*/
-/**
- * Lucene 9.12 file format.
- *
- *
Apache Lucene - Index File Formats
- *
- *
- *
- * Introduction
- *
- *
- *
- *
This document defines the index file formats used in this version of Lucene. If you are using
- * a different version of Lucene, please consult the copy of docs/
that was distributed
- * with the version you are using.
- *
- *
This document attempts to provide a high-level definition of the Apache Lucene file formats.
- *
- *
- * Definitions
- *
- *
- *
- *
The fundamental concepts in Lucene are index, document, field and term.
- *
- *
An index contains a sequence of documents.
- *
- *
- * - A document is a sequence of fields.
- *
- A field is a named sequence of terms.
- *
- A term is a sequence of bytes.
- *
- *
- *
The same sequence of bytes in two different fields is considered a different term. Thus terms
- * are represented as a pair: the string naming the field, and the bytes within the field.
- *
- *
Inverted Indexing
- *
- *
Lucene's index stores terms and statistics about those terms in order to make term-based
- * search more efficient. Lucene's terms index falls into the family of indexes known as an
- * inverted index. This is because it can list, for a term, the documents that contain it.
- * This is the inverse of the natural relationship, in which documents list terms.
- *
- *
Types of Fields
- *
- *
In Lucene, fields may be stored, in which case their text is stored in the index
- * literally, in a non-inverted manner. Fields that are inverted are called indexed. A field
- * may be both stored and indexed.
- *
- *
The text of a field may be tokenized into terms to be indexed, or the text of a field
- * may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is
- * useful for certain identifier fields to be indexed literally.
- *
- *
See the {@link org.apache.lucene.document.Field Field} java docs for more information on
- * Fields.
- *
- *
Segments
- *
- *
Lucene indexes may be composed of multiple sub-indexes, or segments. Each segment is a
- * fully independent index, which could be searched separately. Indexes evolve by:
- *
- *
- * - Creating new segments for newly added documents.
- *
- Merging existing segments.
- *
- *
- *
Searches may involve multiple segments and/or multiple indexes, each index potentially
- * composed of a set of segments.
- *
- *
Document Numbers
- *
- *
Internally, Lucene refers to documents by an integer document number. The first
- * document added to an index is numbered zero, and each subsequent document added gets a number one
- * greater than the previous.
- *
- *
Note that a document's number may change, so caution should be taken when storing these
- * numbers outside of Lucene. In particular, numbers may change in the following situations:
- *
- *
- * -
- *
The numbers stored in each segment are unique only within the segment, and must be
- * converted before they can be used in a larger context. The standard technique is to
- * allocate each segment a range of values, based on the range of numbers used in that
- * segment. To convert a document number from a segment to an external value, the segment's
- * base document number is added. To convert an external value back to a
- * segment-specific value, the segment is identified by the range that the external value is
- * in, and the segment's base value is subtracted. For example two five document segments
- * might be combined, so that the first segment has a base value of zero, and the second of
- * five. Document three from the second segment would have an external value of eight.
- *
-
- *
When documents are deleted, gaps are created in the numbering. These are eventually
- * removed as the index evolves through merging. Deleted documents are dropped when segments
- * are merged. A freshly-merged segment thus has no gaps in its numbering.
- *
- *
- *
- *
- * Index Structure Overview
- *
- *
- *
- *
Each segment index maintains the following:
- *
- *
- * - {@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment info}. This
- * contains metadata about a segment, such as the number of documents, what files it uses, and
- * information about how the segment is sorted
- *
- {@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This
- * contains metadata about the set of named fields used in the index.
- *
- {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}.
- * This contains, for each document, a list of attribute-value pairs, where the attributes are
- * field names. These are used to store auxiliary information about the document, such as its
- * title, url, or an identifier to access a database. The set of stored fields are what is
- * returned for each hit when searching. This is keyed by document number.
- *
- {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term dictionary}. A
- * dictionary containing all of the terms used in all of the indexed fields of all of the
- * documents. The dictionary also contains the number of documents which contain the term, and
- * pointers to the term's frequency and proximity data.
- *
- {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Frequency data}. For
- * each term in the dictionary, the numbers of all the documents that contain that term, and
- * the frequency of the term in that document, unless frequencies are omitted ({@link
- * org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
- *
- {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Proximity data}. For
- * each term in the dictionary, the positions that the term occurs in each document. Note that
- * this will not exist if all fields in all documents omit position data.
- *
- {@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For
- * each field in each document, a value is stored that is multiplied into the score for hits
- * on that field.
- *
- {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each
- * field in each document, the term vector (sometimes called document vector) may be stored. A
- * term vector consists of term text and term frequency. To add Term Vectors to your index see
- * the {@link org.apache.lucene.document.Field Field} constructors
- *
- {@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like
- * stored values, these are also keyed by document number, but are generally intended to be
- * loaded into main memory for fast access. Whereas stored values are generally intended for
- * summary results from searches, per-document values are useful for things like scoring
- * factors.
- *
- {@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An
- * optional file indicating which documents are live.
- *
- {@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair
- * of files, recording dimensionally indexed fields, to enable fast numeric range filtering
- * and large numeric values like BigInteger and BigDecimal (1D) and geographic shape
- * intersection (2D, 3D).
- *
- {@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}. The
- * vector format stores numeric vectors in a format optimized for random access and
- * computation, supporting high-dimensional nearest-neighbor search.
- *
- *
- *
Details on each of these are provided in their linked pages.
- *
- * File Naming
- *
- *
- *
- *
All files belonging to a segment have the same name with varying extensions. The extensions
- * correspond to the different file formats described below. When using the Compound File format
- * (default for small segments) these files (except for the Segment info file, the Lock file, and
- * Deleted documents file) are collapsed into a single .cfs file (see below for details)
- *
- *
Typically, all segments in an index are stored in a single directory, although this is not
- * required.
- *
- *
File names are never re-used. That is, when any file is saved to the Directory it is given a
- * never before used filename. This is achieved using a simple generations approach. For example,
- * the first segments file is segments_1, then segments_2, etc. The generation is a sequential long
- * integer represented in alpha-numeric (base 36) form.
- *
- * Summary of File Extensions
- *
- *
- *
- *
The following table summarizes the names and extensions of the files in Lucene:
- *
- *
- * lucene filenames by extension
- *
- * Name |
- * Extension |
- * Brief Description |
- *
- *
- * {@link org.apache.lucene.index.SegmentInfos Segments File} |
- * segments_N |
- * Stores information about a commit point |
- *
- *
- * Lock File |
- * write.lock |
- * The Write lock prevents multiple IndexWriters from writing to the same
- * file. |
- *
- *
- * {@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment Info} |
- * .si |
- * Stores metadata about a segment |
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File} |
- * .cfs, .cfe |
- * An optional "virtual" file consisting of all the other index files for
- * systems that frequently run out of file handles. |
- *
- *
- * {@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields} |
- * .fnm |
- * Stores information about the fields |
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index} |
- * .fdx |
- * Contains pointers to field data |
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data} |
- * .fdt |
- * The stored fields for documents |
- *
- *
- * {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Dictionary} |
- * .tim |
- * The term dictionary, stores term info |
- *
- *
- * {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Index} |
- * .tip |
- * The index into the Term Dictionary |
- *
- *
- * {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Frequencies} |
- * .doc |
- * Contains the list of docs which contain each term along with frequency |
- *
- *
- * {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Positions} |
- * .pos |
- * Stores position information about where a term occurs in the index |
- *
- *
- * {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Payloads} |
- * .pay |
- * Stores additional per-position metadata information such as character offsets and user payloads |
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms} |
- * .nvd, .nvm |
- * Encodes length and boost factors for docs and fields |
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values} |
- * .dvd, .dvm |
- * Encodes additional scoring factors or other per-document information. |
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index} |
- * .tvx |
- * Stores offset into the document data file |
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data} |
- * .tvd |
- * Contains term vector data. |
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents} |
- * .liv |
- * Info about what documents are live |
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values} |
- * .kdd, .kdi, .kdm |
- * Holds indexed points |
- *
- *
- * {@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values} |
- * .vec, .vem, .veq, .vex |
- * Holds indexed vectors; .vec files contain the raw vector data,
- * .vem the vector metadata, .veq the quantized vector data, and .vex the
- * hnsw graph data. |
- *
- *
- *
- *
- *
- * Lock File
- *
- * The write lock, which is stored in the index directory by default, is named "write.lock". If the
- * lock directory is different from the index directory then the write lock will be named
- * "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index
- * directory. When this file is present, a writer is currently modifying the index (adding or
- * removing documents). This lock file ensures that only one writer is modifying the index at a
- * time.
- *
- * History
- *
- * Compatibility notes are provided in this document, describing how file formats have changed
- * from prior versions:
- *
- *
- * - In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit
- * lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching
- * or adding/deleting of docs. When the new segments file is saved (committed), it will be
- * written in the new file format (meaning no specific "upgrade" process is needed). But note
- * that once a commit has occurred, pre-2.1 Lucene will not be able to read the index.
- *
- In version 2.3, the file format was changed to allow segments to share a single set of doc
- * store (vectors & stored fields) files. This allows for faster indexing in certain
- * cases. The change is fully backwards compatible (in the same way as the lock-less commits
- * change in 2.1).
- *
- In version 2.4, Strings are now written as true UTF-8 byte sequence, not Java's modified
- * UTF-8. See LUCENE-510 for
- * details.
- *
- In version 2.9, an optional opaque Map<String,String> CommitUserData may be passed to
- * IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N
- * file. See LUCENE-1382 for
- * details. Also, diagnostics were added to each segment written recording details about why
- * it was written (due to flush, merge; which OS/JRE was used; etc.). See issue LUCENE-1654 for details.
- *
- In version 3.0, compressed fields are no longer written to the index (they can still be
- * read, but on merge the new segment will write them, uncompressed). See issue LUCENE-1960 for details.
- *
- In version 3.1, segments record the code version that created them. See LUCENE-2720 for details.
- * Additionally segments track explicitly whether or not they have term vectors. See LUCENE-2811 for details.
- *
- In version 3.2, numeric fields are written as natively to stored fields file, previously
- * they were stored in text format only.
- *
- In version 3.4, fields can omit position data while still indexing term frequencies.
- *
- In version 4.0, the format of the inverted index became extensible via the {@link
- * org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues})
- * was introduced. Normalization factors need no longer be a single byte, they can be any
- * {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be
- * unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into
- * the postings lists. Payloads can be stored in the term vectors.
- *
- In version 4.1, the format of the postings list changed to use either of FOR compression or
- * variable-byte encoding, depending upon the frequency of the term. Terms appearing only once
- * were changed to inline directly into the term dictionary. Stored fields are compressed by
- * default.
- *
- In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued
- * type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields.
- *
- In version 4.5, DocValues were extended to explicitly represent missing values.
- *
- In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
- * allow updating NumericDocValues fields.
- *
- In version 4.8, checksum footers were added to the end of each index file for improved data
- * integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32
- * checksum of the file.
- *
- In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is
- * suitable for faceting/sorting/analytics.
- *
- In version 5.4, DocValues have been improved to store more information on disk: addresses
- * for binary fields and ord indexes for multi-valued fields.
- *
- In version 6.0, Points were added, for multi-dimensional range/distance search.
- *
- In version 6.2, new Segment info format that reads/writes the index sort, to support index
- * sorting.
- *
- In version 7.0, DocValues have been improved to better support sparse doc values thanks to
- * an iterator API.
- *
- In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term
- * freq, normalization factor) pairs that may trigger the maximum score of the block. This
- * information is recorded alongside skip data in order to be able to skip blocks of doc ids
- * if they may not produce high enough scores. Additionally doc values and norms have been
- * extended with jump-tables to make access O(1) instead of O(n), where n is the number of
- * elements to skip when advancing in the data.
- *
- In version 8.4, postings, positions, offsets and payload lengths have moved to a more
- * performant encoding that is vectorized.
- *
- In version 8.6, index sort serialization is delegated to the sorts themselves, to allow
- * user-defined sorts to be used
- *
- In version 8.6, points fields split the index tree and leaf data into separate files, to
- * allow for different access patterns to the different data structures
- *
- In version 8.7, stored fields compression became adaptive to better handle documents with
- * smaller stored fields.
- *
- In version 9.0, vector-valued fields were added.
- *
- In version 9.1, vector-valued fields were modified to add a graph hierarchy.
- *
- In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by
- * IndexDISI. ordToDoc mappings were added to .vem.
- *
- In version 9.5, HNSW graph connections were changed to be delta-encoded with vints.
- * Additionally, metadata file size improvements were made by delta-encoding nodes by graph
- * layer and not writing the node ids for the zeroth layer.
- *
- In version 9.9, Vector scalar quantization support was added, allowing the HNSW vector
- * format to utilize int8 quantized vectors for float32 vector search.
- *
- In version 9.12, skip data was refactored to have only two levels: every 128 docs and every
- * 4,096 docs, and to be inlined in postings lists. This resulted in a speedup for queries that
- * need skipping, especially conjunctions.
- *
- *
- *
- *
- * Limitations
- *
- *
- *
- *
- * Lucene uses a Java int to refer to document numbers, and the index file format
- * uses an Int32 on-disk to store document numbers. This is a limitation of both the
- * index file format and the current implementation. Eventually these should be replaced with either
- * UInt64 values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
- * VInt} values which have no limit.
- */
+/** Lucene 9.12 file format. */
package org.apache.lucene.codecs.lucene912;
diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
index 8b672496601..bd950aeaebd 100644
--- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
+++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
@@ -13,4 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-org.apache.lucene.codecs.lucene912.Lucene912Codec
+org.apache.lucene.codecs.lucene100.Lucene100Codec
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java
index c72bcfeea46..fe6c82e73bb 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java
@@ -18,7 +18,7 @@ package org.apache.lucene.codecs.lucene90;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import org.apache.lucene.codecs.Codec;
-import org.apache.lucene.codecs.lucene912.Lucene912Codec;
+import org.apache.lucene.codecs.lucene100.Lucene100Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.DirectoryReader;
@@ -31,7 +31,7 @@ import org.apache.lucene.tests.index.BaseStoredFieldsFormatTestCase;
public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase {
@Override
protected Codec getCodec() {
- return new Lucene912Codec(Lucene912Codec.Mode.BEST_COMPRESSION);
+ return new Lucene100Codec(Lucene100Codec.Mode.BEST_COMPRESSION);
}
/**
@@ -42,7 +42,7 @@ public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFie
for (int i = 0; i < 10; i++) {
IndexWriterConfig iwc = newIndexWriterConfig();
iwc.setCodec(
- new Lucene912Codec(RandomPicks.randomFrom(random(), Lucene912Codec.Mode.values())));
+ new Lucene100Codec(RandomPicks.randomFrom(random(), Lucene100Codec.Mode.values())));
IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig());
Document doc = new Document();
doc.add(new StoredField("field1", "value1"));
@@ -72,7 +72,7 @@ public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFie
expectThrows(
NullPointerException.class,
() -> {
- new Lucene912Codec(null);
+ new Lucene100Codec(null);
});
expectThrows(
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java
index c3225326f4c..ed70b2df002 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java
@@ -28,7 +28,7 @@ import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.KnnVectorsReader;
-import org.apache.lucene.codecs.lucene912.Lucene912Codec;
+import org.apache.lucene.codecs.lucene100.Lucene100Codec;
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.KnnFloatVectorField;
@@ -74,7 +74,7 @@ public class TestLucene99HnswQuantizedVectorsFormat extends BaseKnnVectorsFormat
@Override
protected Codec getCodec() {
- return new Lucene912Codec() {
+ return new Lucene100Codec() {
@Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
return format;
@@ -106,7 +106,7 @@ public class TestLucene99HnswQuantizedVectorsFormat extends BaseKnnVectorsFormat
dir,
newIndexWriterConfig()
.setCodec(
- new Lucene912Codec() {
+ new Lucene100Codec() {
@Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
return getKnnFormat(4);
@@ -126,7 +126,7 @@ public class TestLucene99HnswQuantizedVectorsFormat extends BaseKnnVectorsFormat
dir,
newIndexWriterConfig()
.setCodec(
- new Lucene912Codec() {
+ new Lucene100Codec() {
@Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
return getKnnFormat(7);
@@ -163,7 +163,7 @@ public class TestLucene99HnswQuantizedVectorsFormat extends BaseKnnVectorsFormat
dir,
newIndexWriterConfig()
.setCodec(
- new Lucene912Codec() {
+ new Lucene100Codec() {
@Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
return new Lucene99HnswVectorsFormat();
@@ -183,7 +183,7 @@ public class TestLucene99HnswQuantizedVectorsFormat extends BaseKnnVectorsFormat
dir,
newIndexWriterConfig()
.setCodec(
- new Lucene912Codec() {
+ new Lucene100Codec() {
@Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
return getKnnFormat(7);
@@ -216,7 +216,7 @@ public class TestLucene99HnswQuantizedVectorsFormat extends BaseKnnVectorsFormat
dir,
newIndexWriterConfig()
.setCodec(
- new Lucene912Codec() {
+ new Lucene100Codec() {
@Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
return new Lucene99HnswScalarQuantizedVectorsFormat(
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorScorer.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorScorer.java
index da1020dc36b..3b758de6ce6 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorScorer.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorScorer.java
@@ -27,7 +27,7 @@ import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer;
-import org.apache.lucene.codecs.lucene912.Lucene912Codec;
+import org.apache.lucene.codecs.lucene100.Lucene100Codec;
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@@ -52,7 +52,7 @@ import org.apache.lucene.util.quantization.ScalarQuantizer;
public class TestLucene99ScalarQuantizedVectorScorer extends LuceneTestCase {
private static Codec getCodec(int bits, boolean compress) {
- return new Lucene912Codec() {
+ return new Lucene100Codec() {
@Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
return new Lucene99HnswScalarQuantizedVectorsFormat(
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java
index d6b42c69708..ccba0975d73 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java
@@ -28,7 +28,7 @@ import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.KnnVectorsReader;
-import org.apache.lucene.codecs.lucene912.Lucene912Codec;
+import org.apache.lucene.codecs.lucene100.Lucene100Codec;
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.KnnFloatVectorField;
@@ -70,7 +70,7 @@ public class TestLucene99ScalarQuantizedVectorsFormat extends BaseKnnVectorsForm
@Override
protected Codec getCodec() {
- return new Lucene912Codec() {
+ return new Lucene100Codec() {
@Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
return format;
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestRuleSetupAndRestoreClassEnv.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestRuleSetupAndRestoreClassEnv.java
index 378444e394a..44f28b817ad 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestRuleSetupAndRestoreClassEnv.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestRuleSetupAndRestoreClassEnv.java
@@ -38,7 +38,7 @@ import java.util.TimeZone;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.PostingsFormat;
-import org.apache.lucene.codecs.lucene912.Lucene912Codec;
+import org.apache.lucene.codecs.lucene100.Lucene100Codec;
import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.tests.codecs.asserting.AssertingCodec;
@@ -188,9 +188,9 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule {
} else if ("Compressing".equals(TEST_CODEC)
|| ("random".equals(TEST_CODEC) && randomVal == 6 && !shouldAvoidCodec("Compressing"))) {
codec = CompressingCodec.randomInstance(random);
- } else if ("Lucene912".equals(TEST_CODEC)
- || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene912"))) {
- codec = new Lucene912Codec(RandomPicks.randomFrom(random, Lucene912Codec.Mode.values()));
+ } else if ("Lucene100".equals(TEST_CODEC)
+ || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene100"))) {
+ codec = new Lucene100Codec(RandomPicks.randomFrom(random, Lucene100Codec.Mode.values()));
} else if (!"random".equals(TEST_CODEC)) {
codec = Codec.forName(TEST_CODEC);
} else if ("random".equals(TEST_POSTINGSFORMAT)) {
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java
index 0569bf9ae98..95f06ea5570 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java
@@ -55,8 +55,8 @@ import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat;
+import org.apache.lucene.codecs.lucene100.Lucene100Codec;
import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
-import org.apache.lucene.codecs.lucene912.Lucene912Codec;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
@@ -1315,7 +1315,7 @@ public final class TestUtil {
* different from {@link Codec#getDefault()} because that is randomized.
*/
public static Codec getDefaultCodec() {
- return new Lucene912Codec();
+ return new Lucene100Codec();
}
/**