From 6677109ee608c38bb19503dc6927a18ab3e13b40 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Mon, 23 Oct 2023 09:46:12 +0200 Subject: [PATCH] Record if block API has been used in SegmentInfo (#12685) If the add/updateDocuments(List<>) API is used, Lucene guarantees that all documents are indexed in the same segment with consecutive document IDs. This enables features like nested documents etc. This change records the usage of this API in SegmentInfo and preserves this property across merges. Relates to #12665 --- lucene/CHANGES.txt | 4 + .../backward-codecs/src/java/module-info.java | 4 +- .../lucene70/Lucene70SegmentInfoFormat.java | 1 + .../lucene86/Lucene86SegmentInfoFormat.java | 1 + .../lucene90/Lucene90Codec.java | 3 +- .../lucene90/Lucene90SegmentInfoFormat.java | 179 ++++++++ .../lucene91/Lucene91Codec.java | 4 +- .../lucene91/package-info.java | 8 +- .../lucene92/Lucene92Codec.java | 4 +- .../lucene92/package-info.java | 8 +- .../lucene94/Lucene94Codec.java | 4 +- .../lucene94/package-info.java | 8 +- .../lucene95/Lucene95Codec.java | 5 +- .../lucene95/package-info.java | 425 ++++++++++++++++++ .../services/org.apache.lucene.codecs.Codec | 1 + .../lucene90/Lucene90RWCodec.java | 8 + .../Lucene90RWSegmentInfoFormat.java} | 22 +- .../TestLucene90SegmentInfoFormat.java | 7 +- .../lucene91/Lucene91RWCodec.java | 8 + .../lucene92/Lucene92RWCodec.java | 8 + .../lucene94/Lucene94RWCodec.java | 9 + .../byTask/tasks/CreateIndexTask.java | 6 +- .../SimpleTextSegmentInfoFormat.java | 10 + lucene/core/src/java/module-info.java | 5 +- .../java/org/apache/lucene/codecs/Codec.java | 2 +- .../lucene/codecs/lucene95/package-info.java | 408 +---------------- .../lucene/codecs/lucene99/Lucene99Codec.java | 198 ++++++++ .../lucene99/Lucene99SegmentInfoFormat.java | 236 ++++++++++ .../lucene/codecs/lucene99/package-info.java | 425 ++++++++++++++++++ .../index/DocumentsWriterPerThread.java | 6 +- .../org/apache/lucene/index/IndexWriter.java | 15 +- 
.../org/apache/lucene/index/LeafMetaData.java | 21 +- .../lucene/index/ParallelLeafReader.java | 5 +- .../org/apache/lucene/index/SegmentInfo.java | 18 + .../apache/lucene/index/SegmentReader.java | 6 +- .../lucene/index/SortingCodecReader.java | 6 +- .../services/org.apache.lucene.codecs.Codec | 2 +- ...ne90StoredFieldsFormatHighCompression.java | 9 +- .../TestLucene95HnswVectorsFormat.java | 9 +- .../TestLucene99SegmentInfoFormat.java | 36 ++ .../apache/lucene/index/TestAddIndexes.java | 71 +++ .../org/apache/lucene/index/TestCodecs.java | 2 + .../test/org/apache/lucene/index/TestDoc.java | 1 + .../apache/lucene/index/TestIndexWriter.java | 70 ++- .../org/apache/lucene/index/TestKnnGraph.java | 36 +- .../apache/lucene/index/TestMergePolicy.java | 1 + .../TestOneMergeWrappingMergePolicy.java | 1 + .../lucene/index/TestPendingDeletes.java | 3 + .../lucene/index/TestPendingSoftDeletes.java | 1 + .../apache/lucene/index/TestSegmentInfos.java | 12 + .../lucene/index/TestSegmentMerger.java | 1 + .../index/TestSegmentToThreadMapping.java | 2 +- .../lucene/util/hnsw/HnswGraphTestCase.java | 50 ++- .../highlight/TermVectorLeafReader.java | 2 +- .../lucene/index/memory/MemoryIndex.java | 2 +- .../lucene/misc/index/IndexSplitter.java | 1 + .../suggest/document/TestSuggestField.java | 21 +- .../codecs/vector/ConfigurableMCodec.java | 6 +- .../index/BaseCompoundFormatTestCase.java | 1 + .../index/BaseFieldInfoFormatTestCase.java | 1 + .../index/BaseIndexFileFormatTestCase.java | 1 + .../index/BaseLiveDocsFormatTestCase.java | 1 + .../tests/index/BaseMergePolicyTestCase.java | 2 + .../index/BaseSegmentInfoFormatTestCase.java | 12 + .../tests/index/RandomPostingsTester.java | 1 + .../lucene/tests/search/QueryUtils.java | 2 +- .../util/TestRuleSetupAndRestoreClassEnv.java | 8 +- .../apache/lucene/tests/util/TestUtil.java | 4 +- 68 files changed, 1942 insertions(+), 518 deletions(-) create mode 100644 
lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90SegmentInfoFormat.java rename lucene/{core/src/java/org/apache/lucene/codecs => backward-codecs/src/java/org/apache/lucene/backward_codecs}/lucene95/Lucene95Codec.java (97%) create mode 100644 lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene95/package-info.java rename lucene/{core/src/java/org/apache/lucene/codecs/lucene90/Lucene90SegmentInfoFormat.java => backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90RWSegmentInfoFormat.java} (91%) rename lucene/{core/src/test/org/apache/lucene/codecs => backward-codecs/src/test/org/apache/lucene/backward_codecs}/lucene90/TestLucene90SegmentInfoFormat.java (86%) create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99Codec.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99SegmentInfoFormat.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene99/package-info.java create mode 100644 lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99SegmentInfoFormat.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index c3b39e0e378..2fe915e17a2 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -154,9 +154,13 @@ API Changes New Features --------------------- + * GITHUB#12548: Added similarityToQueryVector API to compute vector similarity scores with DoubleValuesSource. (Shubham Chaudhary) +* GITHUB#12685: Lucene now records if documents have been indexed as blocks in SegmentInfo. This is recorded on a per + segment basis and maintained across merges. The property is exposed via LeafMetaData. 
(Simon Willnauer) + Improvements --------------------- * GITHUB#12523: TaskExecutor waits for all tasks to complete before returning when Exceptions diff --git a/lucene/backward-codecs/src/java/module-info.java b/lucene/backward-codecs/src/java/module-info.java index 205b1facaa8..a732ff7a2d8 100644 --- a/lucene/backward-codecs/src/java/module-info.java +++ b/lucene/backward-codecs/src/java/module-info.java @@ -34,6 +34,7 @@ module org.apache.lucene.backward_codecs { exports org.apache.lucene.backward_codecs.lucene91; exports org.apache.lucene.backward_codecs.lucene92; exports org.apache.lucene.backward_codecs.lucene94; + exports org.apache.lucene.backward_codecs.lucene95; exports org.apache.lucene.backward_codecs.packed; exports org.apache.lucene.backward_codecs.store; @@ -55,5 +56,6 @@ module org.apache.lucene.backward_codecs { org.apache.lucene.backward_codecs.lucene90.Lucene90Codec, org.apache.lucene.backward_codecs.lucene91.Lucene91Codec, org.apache.lucene.backward_codecs.lucene92.Lucene92Codec, - org.apache.lucene.backward_codecs.lucene94.Lucene94Codec; + org.apache.lucene.backward_codecs.lucene94.Lucene94Codec, + org.apache.lucene.backward_codecs.lucene95.Lucene95Codec; } diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene70/Lucene70SegmentInfoFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene70/Lucene70SegmentInfoFormat.java index d89350e9df4..7f9fac4deed 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene70/Lucene70SegmentInfoFormat.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene70/Lucene70SegmentInfoFormat.java @@ -307,6 +307,7 @@ public class Lucene70SegmentInfoFormat extends SegmentInfoFormat { segment, docCount, isCompoundFile, + false, null, diagnostics, segmentID, diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene86/Lucene86SegmentInfoFormat.java 
b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene86/Lucene86SegmentInfoFormat.java index 35879f62dc4..b7b660ff4f5 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene86/Lucene86SegmentInfoFormat.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene86/Lucene86SegmentInfoFormat.java @@ -164,6 +164,7 @@ public class Lucene86SegmentInfoFormat extends SegmentInfoFormat { segment, docCount, isCompoundFile, + false, null, diagnostics, segmentID, diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90Codec.java index 6fc32bd8223..b9465985063 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90Codec.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90Codec.java @@ -36,7 +36,6 @@ import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat; import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat; import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat; import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat; -import org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat; import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat; import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; @@ -143,7 +142,7 @@ public class Lucene90Codec extends Codec { } @Override - public final SegmentInfoFormat segmentInfoFormat() { + public SegmentInfoFormat segmentInfoFormat() { return segmentInfosFormat; } diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90SegmentInfoFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90SegmentInfoFormat.java new file mode 100644 index 
00000000000..bfb1e8ab1ed --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90SegmentInfoFormat.java @@ -0,0 +1,179 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.backward_codecs.lucene90; + +import java.io.IOException; +import java.util.Map; +import java.util.Set; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.SegmentInfoFormat; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.SegmentInfos; +import org.apache.lucene.index.SortFieldProvider; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.util.Version; + +/** + * Lucene 9.0 Segment info format. + * + *

Files: + * + *

+ * + * Data types: + * + * + * + * Field Descriptions: + * + * + * + * @see SegmentInfos + * @lucene.experimental + */ +public class Lucene90SegmentInfoFormat extends SegmentInfoFormat { + + /** File extension used to store {@link SegmentInfo}. */ + public static final String SI_EXTENSION = "si"; + + static final String CODEC_NAME = "Lucene90SegmentInfo"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + /** Sole constructor. */ + public Lucene90SegmentInfoFormat() {} + + @Override + public SegmentInfo read(Directory dir, String segment, byte[] segmentID, IOContext context) + throws IOException { + final String fileName = IndexFileNames.segmentFileName(segment, "", SI_EXTENSION); + try (ChecksumIndexInput input = dir.openChecksumInput(fileName)) { + Throwable priorE = null; + SegmentInfo si = null; + try { + CodecUtil.checkIndexHeader( + input, CODEC_NAME, VERSION_START, VERSION_CURRENT, segmentID, ""); + si = parseSegmentInfo(dir, input, segment, segmentID); + } catch (Throwable exception) { + priorE = exception; + } finally { + CodecUtil.checkFooter(input, priorE); + } + return si; + } + } + + private SegmentInfo parseSegmentInfo( + Directory dir, DataInput input, String segment, byte[] segmentID) throws IOException { + final Version version = Version.fromBits(input.readInt(), input.readInt(), input.readInt()); + byte hasMinVersion = input.readByte(); + final Version minVersion; + switch (hasMinVersion) { + case 0: + minVersion = null; + break; + case 1: + minVersion = Version.fromBits(input.readInt(), input.readInt(), input.readInt()); + break; + default: + throw new CorruptIndexException("Illegal boolean value " + hasMinVersion, input); + } + + final int docCount = input.readInt(); + if (docCount < 0) { + throw new CorruptIndexException("invalid docCount: " + docCount, input); + } + final boolean isCompoundFile = input.readByte() == SegmentInfo.YES; + + final Map diagnostics = input.readMapOfStrings(); + final Set 
files = input.readSetOfStrings(); + final Map attributes = input.readMapOfStrings(); + + int numSortFields = input.readVInt(); + Sort indexSort; + if (numSortFields > 0) { + SortField[] sortFields = new SortField[numSortFields]; + for (int i = 0; i < numSortFields; i++) { + String name = input.readString(); + sortFields[i] = SortFieldProvider.forName(name).readSortField(input); + } + indexSort = new Sort(sortFields); + } else if (numSortFields < 0) { + throw new CorruptIndexException("invalid index sort field count: " + numSortFields, input); + } else { + indexSort = null; + } + + SegmentInfo si = + new SegmentInfo( + dir, + version, + minVersion, + segment, + docCount, + isCompoundFile, + false, + null, + diagnostics, + segmentID, + attributes, + indexSort); + si.setFiles(files); + return si; + } + + @Override + public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOException { + throw new UnsupportedOperationException("Old formats can't be used for writing"); + } +} diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91Codec.java index 39c133b02ef..9393d0fa5a1 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91Codec.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91Codec.java @@ -18,6 +18,7 @@ package org.apache.lucene.backward_codecs.lucene91; import java.util.Objects; import org.apache.lucene.backward_codecs.lucene90.Lucene90FieldInfosFormat; +import org.apache.lucene.backward_codecs.lucene90.Lucene90SegmentInfoFormat; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.CompoundFormat; import org.apache.lucene.codecs.DocValuesFormat; @@ -37,7 +38,6 @@ import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat; import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat; import 
org.apache.lucene.codecs.lucene90.Lucene90PointsFormat; import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat; -import org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat; import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat; import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; @@ -141,7 +141,7 @@ public class Lucene91Codec extends Codec { } @Override - public final SegmentInfoFormat segmentInfoFormat() { + public SegmentInfoFormat segmentInfoFormat() { return segmentInfosFormat; } diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/package-info.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/package-info.java index 31e00f6f39d..21bc835b61e 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/package-info.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/package-info.java @@ -141,9 +141,9 @@ *

Each segment index maintains the following: * *

    - *
  • {@link org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat Segment info}. This - * contains metadata about a segment, such as the number of documents, what files it uses, and - * information about how the segment is sorted + *
  • {@link org.apache.lucene.backward_codecs.lucene90.Lucene90SegmentInfoFormat Segment info}. + * This contains metadata about a segment, such as the number of documents, what files it + * uses, and information about how the segment is sorted *
  • {@link org.apache.lucene.backward_codecs.lucene90.Lucene90FieldInfosFormat Field names}. * This contains metadata about the set of named fields used in the index. *
  • {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}. @@ -229,7 +229,7 @@ * file. * * - * {@link org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat Segment Info} + * {@link org.apache.lucene.backward_codecs.lucene90.Lucene90SegmentInfoFormat Segment Info} * .si * Stores metadata about a segment * diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/Lucene92Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/Lucene92Codec.java index 0e9b676651c..28ba9923481 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/Lucene92Codec.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/Lucene92Codec.java @@ -18,6 +18,7 @@ package org.apache.lucene.backward_codecs.lucene92; import java.util.Objects; import org.apache.lucene.backward_codecs.lucene90.Lucene90FieldInfosFormat; +import org.apache.lucene.backward_codecs.lucene90.Lucene90SegmentInfoFormat; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.CompoundFormat; import org.apache.lucene.codecs.DocValuesFormat; @@ -37,7 +38,6 @@ import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat; import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat; import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat; import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat; -import org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat; import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat; import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; @@ -144,7 +144,7 @@ public class Lucene92Codec extends Codec { } @Override - public final SegmentInfoFormat segmentInfoFormat() { + public SegmentInfoFormat segmentInfoFormat() { return segmentInfosFormat; } diff --git 
a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/package-info.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/package-info.java index 4c4766752f1..2e85de218e0 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/package-info.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/package-info.java @@ -141,9 +141,9 @@ *

    Each segment index maintains the following: * *

      - *
    • {@link org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat Segment info}. This - * contains metadata about a segment, such as the number of documents, what files it uses, and - * information about how the segment is sorted + *
    • {@link org.apache.lucene.backward_codecs.lucene90.Lucene90SegmentInfoFormat Segment info}. + * This contains metadata about a segment, such as the number of documents, what files it + * uses, and information about how the segment is sorted *
    • {@link org.apache.lucene.backward_codecs.lucene90.Lucene90FieldInfosFormat Field names}. * This contains metadata about the set of named fields used in the index. *
    • {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}. @@ -229,7 +229,7 @@ * file. * * - * {@link org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat Segment Info} + * {@link org.apache.lucene.backward_codecs.lucene90.Lucene90SegmentInfoFormat Segment Info} * .si * Stores metadata about a segment * diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/Lucene94Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/Lucene94Codec.java index c967839ecbf..324ee042967 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/Lucene94Codec.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/Lucene94Codec.java @@ -17,6 +17,7 @@ package org.apache.lucene.backward_codecs.lucene94; import java.util.Objects; +import org.apache.lucene.backward_codecs.lucene90.Lucene90SegmentInfoFormat; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.CompoundFormat; import org.apache.lucene.codecs.DocValuesFormat; @@ -36,7 +37,6 @@ import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat; import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat; import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat; import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat; -import org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat; import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat; import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat; import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat; @@ -144,7 +144,7 @@ public class Lucene94Codec extends Codec { } @Override - public final SegmentInfoFormat segmentInfoFormat() { + public SegmentInfoFormat segmentInfoFormat() { return segmentInfosFormat; } diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/package-info.java 
b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/package-info.java index db32e6bce67..ae2bffe0bbb 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/package-info.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/package-info.java @@ -141,9 +141,9 @@ *

      Each segment index maintains the following: * *

        - *
      • {@link org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat Segment info}. This - * contains metadata about a segment, such as the number of documents, what files it uses, and - * information about how the segment is sorted + *
      • {@link org.apache.lucene.backward_codecs.lucene90.Lucene90SegmentInfoFormat Segment info}. + * This contains metadata about a segment, such as the number of documents, what files it + * uses, and information about how the segment is sorted *
      • {@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This * contains metadata about the set of named fields used in the index. *
      • {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}. @@ -229,7 +229,7 @@ * file. * * - * {@link org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat Segment Info} + * {@link org.apache.lucene.backward_codecs.lucene90.Lucene90SegmentInfoFormat Segment Info} * .si * Stores metadata about a segment * diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/Lucene95Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene95/Lucene95Codec.java similarity index 97% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene95/Lucene95Codec.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene95/Lucene95Codec.java index fd495b6026f..cff4ee2afe0 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/Lucene95Codec.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene95/Lucene95Codec.java @@ -14,9 +14,10 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.lucene.codecs.lucene95; +package org.apache.lucene.backward_codecs.lucene95; import java.util.Objects; +import org.apache.lucene.backward_codecs.lucene90.Lucene90SegmentInfoFormat; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.CompoundFormat; import org.apache.lucene.codecs.DocValuesFormat; @@ -36,10 +37,10 @@ import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat; import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat; import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat; import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat; -import org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat; import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat; import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat; import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat; +import org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene95/package-info.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene95/package-info.java new file mode 100644 index 00000000000..3336f9160de --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene95/package-info.java @@ -0,0 +1,425 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Lucene 9.5 file format. + * + *

        Apache Lucene - Index File Formats

        + * + * + * + *

        Introduction

        + * + *
        + * + *

        This document defines the index file formats used in this version of Lucene. If you are using + * a different version of Lucene, please consult the copy of docs/ that was distributed + * with the version you are using. + * + *

        This document attempts to provide a high-level definition of the Apache Lucene file formats. + *

        + * + *

        Definitions

        + * + *
        + * + *

        The fundamental concepts in Lucene are index, document, field and term. + * + *

        An index contains a sequence of documents. + * + *

          + *
        • A document is a sequence of fields. + *
        • A field is a named sequence of terms. + *
        • A term is a sequence of bytes. + *
        + * + *

        The same sequence of bytes in two different fields is considered a different term. Thus terms + * are represented as a pair: the string naming the field, and the bytes within the field. + * + *

        Inverted Indexing

        + * + *

        Lucene's index stores terms and statistics about those terms in order to make term-based + * search more efficient. Lucene's terms index falls into the family of indexes known as an + * inverted index. This is because it can list, for a term, the documents that contain it. + * This is the inverse of the natural relationship, in which documents list terms. + * + *

        Types of Fields

        + * + *

        In Lucene, fields may be stored, in which case their text is stored in the index + * literally, in a non-inverted manner. Fields that are inverted are called indexed. A field + * may be both stored and indexed. + * + *

        The text of a field may be tokenized into terms to be indexed, or the text of a field + * may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is + * useful for certain identifier fields to be indexed literally. + * + *

        See the {@link org.apache.lucene.document.Field Field} java docs for more information on + * Fields. + * + *

        Segments

        + * + *

        Lucene indexes may be composed of multiple sub-indexes, or segments. Each segment is a + * fully independent index, which could be searched separately. Indexes evolve by: + * + *

          + *
        1. Creating new segments for newly added documents. + *
        2. Merging existing segments. + *
        + * + *

        Searches may involve multiple segments and/or multiple indexes, each index potentially + * composed of a set of segments. + * + *

        Document Numbers

        + * + *

        Internally, Lucene refers to documents by an integer document number. The first + * document added to an index is numbered zero, and each subsequent document added gets a number one + * greater than the previous. + * + *

        Note that a document's number may change, so caution should be taken when storing these + * numbers outside of Lucene. In particular, numbers may change in the following situations: + * + *

          + *
        • + *

          The numbers stored in each segment are unique only within the segment, and must be + * converted before they can be used in a larger context. The standard technique is to + * allocate each segment a range of values, based on the range of numbers used in that + * segment. To convert a document number from a segment to an external value, the segment's + * base document number is added. To convert an external value back to a + * segment-specific value, the segment is identified by the range that the external value is + * in, and the segment's base value is subtracted. For example two five document segments + * might be combined, so that the first segment has a base value of zero, and the second of + * five. Document three from the second segment would have an external value of eight. + *

        • + *

          When documents are deleted, gaps are created in the numbering. These are eventually + * removed as the index evolves through merging. Deleted documents are dropped when segments + * are merged. A freshly-merged segment thus has no gaps in its numbering. + *

        + * + *
        + * + *

        Index Structure Overview

        + * + *
        + * + *

        Each segment index maintains the following: + * + *

          + *
        • {@link org.apache.lucene.backward_codecs.lucene90.Lucene90SegmentInfoFormat Segment info}. + * This contains metadata about a segment, such as the number of documents, what files it + * uses, and information about how the segment is sorted + *
        • {@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This + * contains metadata about the set of named fields used in the index. + *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}. + * This contains, for each document, a list of attribute-value pairs, where the attributes are + * field names. These are used to store auxiliary information about the document, such as its + * title, URL, or an identifier to access a database. The set of stored fields is what is + * returned for each hit when searching. This is keyed by document number. + *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term dictionary}. A + * dictionary containing all of the terms used in all of the indexed fields of all of the + * documents. The dictionary also contains the number of documents which contain the term, and + * pointers to the term's frequency and proximity data. + *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Frequency data}. For + * each term in the dictionary, the numbers of all the documents that contain that term, and + * the frequency of the term in that document, unless frequencies are omitted ({@link + * org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS}) + *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Proximity data}. For + * each term in the dictionary, the positions that the term occurs in each document. Note that + * this will not exist if all fields in all documents omit position data. + *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For + * each field in each document, a value is stored that is multiplied into the score for hits + * on that field. + *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each + * field in each document, the term vector (sometimes called document vector) may be stored. A + * term vector consists of term text and term frequency. To add Term Vectors to your index see + * the {@link org.apache.lucene.document.Field Field} constructors + *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like + * stored values, these are also keyed by document number, but are generally intended to be + * loaded into main memory for fast access. Whereas stored values are generally intended for + * summary results from searches, per-document values are useful for things like scoring + * factors. + *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An + * optional file indicating which documents are live. + *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair + * of files, recording dimensionally indexed fields, to enable fast numeric range filtering + * and large numeric values like BigInteger and BigDecimal (1D) and geographic shape + * intersection (2D, 3D). + *
        • {@link org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat Vector values}. The + * vector format stores numeric vectors in a format optimized for random access and + * computation, supporting high-dimensional nearest-neighbor search. + *
        + * + *

        Details on each of these are provided in their linked pages.

        + * + *

        File Naming

        + * + *
        + * + *

        All files belonging to a segment have the same name with varying extensions. The extensions + * correspond to the different file formats described below. When using the Compound File format + * (default for small segments) these files (except for the Segment info file, the Lock file, and + * Deleted documents file) are collapsed into a single .cfs file (see below for details) + * + *

        Typically, all segments in an index are stored in a single directory, although this is not + * required. + * + *

        File names are never re-used. That is, when any file is saved to the Directory it is given a + * never before used filename. This is achieved using a simple generations approach. For example, + * the first segments file is segments_1, then segments_2, etc. The generation is a sequential long + * integer represented in alpha-numeric (base 36) form.

        + * + *

        Summary of File Extensions

        + * + *
        + * + *

        The following table summarizes the names and extensions of the files in Lucene: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
        lucene filenames by extension
        NameExtensionBrief Description
        {@link org.apache.lucene.index.SegmentInfos Segments File}segments_NStores information about a commit point
        Lock Filewrite.lockThe Write lock prevents multiple IndexWriters from writing to the same + * file.
        {@link org.apache.lucene.backward_codecs.lucene90.Lucene90SegmentInfoFormat Segment Info}.siStores metadata about a segment
        {@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File}.cfs, .cfeAn optional "virtual" file consisting of all the other index files for + * systems that frequently run out of file handles.
        {@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields}.fnmStores information about the fields
        {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index}.fdxContains pointers to field data
        {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data}.fdtThe stored fields for documents
        {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Dictionary}.timThe term dictionary, stores term info
        {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Index}.tipThe index into the Term Dictionary
        {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Frequencies}.docContains the list of docs which contain each term along with frequency
        {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Positions}.posStores position information about where a term occurs in the index
        {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Payloads}.payStores additional per-position metadata information such as character offsets and user payloads
        {@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms}.nvd, .nvmEncodes length and boost factors for docs and fields
        {@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values}.dvd, .dvmEncodes additional scoring factors or other per-document information.
        {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}.tvxStores offset into the document data file
        {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}.tvdContains term vector data.
        {@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents}.livInfo about what documents are live
        {@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}.dii, .dimHolds indexed points
        {@link org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat Vector values}.vec, .vemHolds indexed vectors; .vec files contain the raw vector data, and + * .vem the vector metadata
        + * + *

        + * + *

        Lock File

        + * + * The write lock, which is stored in the index directory by default, is named "write.lock". If the + * lock directory is different from the index directory then the write lock will be named + * "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index + * directory. When this file is present, a writer is currently modifying the index (adding or + * removing documents). This lock file ensures that only one writer is modifying the index at a + * time. + * + *

        History

        + * + *

        Compatibility notes are provided in this document, describing how file formats have changed + * from prior versions: + * + *

          + *
        • In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit + * lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching + * or adding/deleting of docs. When the new segments file is saved (committed), it will be + * written in the new file format (meaning no specific "upgrade" process is needed). But note + * that once a commit has occurred, pre-2.1 Lucene will not be able to read the index. + *
        • In version 2.3, the file format was changed to allow segments to share a single set of doc + * store (vectors & stored fields) files. This allows for faster indexing in certain + * cases. The change is fully backwards compatible (in the same way as the lock-less commits + * change in 2.1). + *
        • In version 2.4, Strings are now written as true UTF-8 byte sequence, not Java's modified + * UTF-8. See LUCENE-510 for + * details. + *
        • In version 2.9, an optional opaque Map<String,String> CommitUserData may be passed to + * IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N + * file. See LUCENE-1382 for + * details. Also, diagnostics were added to each segment written recording details about why + * it was written (due to flush, merge; which OS/JRE was used; etc.). See issue LUCENE-1654 for details. + *
        • In version 3.0, compressed fields are no longer written to the index (they can still be + * read, but on merge the new segment will write them, uncompressed). See issue LUCENE-1960 for details. + *
        • In version 3.1, segments record the code version that created them. See LUCENE-2720 for details. + * Additionally segments track explicitly whether or not they have term vectors. See LUCENE-2811 for details. + *
        • In version 3.2, numeric fields are written natively to the stored fields file; previously + * they were stored in text format only. + *
        • In version 3.4, fields can omit position data while still indexing term frequencies. + *
        • In version 4.0, the format of the inverted index became extensible via the {@link + * org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues}) + * was introduced. Normalization factors need no longer be a single byte, they can be any + * {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be + * unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into + * the postings lists. Payloads can be stored in the term vectors. + *
        • In version 4.1, the format of the postings list changed to use either of FOR compression or + * variable-byte encoding, depending upon the frequency of the term. Terms appearing only once + * were changed to inline directly into the term dictionary. Stored fields are compressed by + * default. + *
        • In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued + * type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields. + *
        • In version 4.5, DocValues were extended to explicitly represent missing values. + *
        • In version 4.6, FieldInfos were extended to support per-field DocValues generation, to + * allow updating NumericDocValues fields. + *
        • In version 4.8, checksum footers were added to the end of each index file for improved data + * integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32 + * checksum of the file. + *
        • In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is + * suitable for faceting/sorting/analytics. + *
        • In version 5.4, DocValues have been improved to store more information on disk: addresses + * for binary fields and ord indexes for multi-valued fields. + *
        • In version 6.0, Points were added, for multi-dimensional range/distance search. + *
        • In version 6.2, new Segment info format that reads/writes the index sort, to support index + * sorting. + *
        • In version 7.0, DocValues have been improved to better support sparse doc values thanks to + * an iterator API. + *
        • In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term + * freq, normalization factor) pairs that may trigger the maximum score of the block. This + * information is recorded alongside skip data in order to be able to skip blocks of doc ids + * if they may not produce high enough scores. Additionally doc values and norms have been + * extended with jump-tables to make access O(1) instead of O(n), where n is the number of + * elements to skip when advancing in the data. + *
        • In version 8.4, postings, positions, offsets and payload lengths have moved to a more + * performant encoding that is vectorized. + *
        • In version 8.6, index sort serialization is delegated to the sorts themselves, to allow + * user-defined sorts to be used. + *
        • In version 8.7, stored fields compression became adaptive to better handle documents with + * smaller stored fields. + *
        • In version 9.0, vector-valued fields were added. + *
        • In version 9.1, vector-valued fields were modified to add a graph hierarchy. + *
        • In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by + * IndexDISI. ordToDoc mappings were added to .vem. + *
        • In version 9.5, HNSW graph connections were changed to be delta-encoded with vints. + * Additionally, metadata file size improvements were made by delta-encoding nodes by graph + * layer and not writing the node ids for the zeroth layer. + *
        + * + * + * + *

        Limitations

        + * + *
        + * + *

        Lucene uses a Java int to refer to document numbers, and the index file format + * uses an Int32 on-disk to store document numbers. This is a limitation of both the + * index file format and the current implementation. Eventually these should be replaced with either + * UInt64 values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt + * VInt} values which have no limit.

        + */ +package org.apache.lucene.backward_codecs.lucene95; diff --git a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index 92d161c885d..36cb0bbbd24 100644 --- a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -21,3 +21,4 @@ org.apache.lucene.backward_codecs.lucene90.Lucene90Codec org.apache.lucene.backward_codecs.lucene91.Lucene91Codec org.apache.lucene.backward_codecs.lucene92.Lucene92Codec org.apache.lucene.backward_codecs.lucene94.Lucene94Codec +org.apache.lucene.backward_codecs.lucene95.Lucene95Codec diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90RWCodec.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90RWCodec.java index b7c30361d2e..7c032f7f58b 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90RWCodec.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90RWCodec.java @@ -17,10 +17,13 @@ package org.apache.lucene.backward_codecs.lucene90; import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.SegmentInfoFormat; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; public class Lucene90RWCodec extends Lucene90Codec { + private final SegmentInfoFormat segmentInfosFormat = new Lucene90RWSegmentInfoFormat(); + private final KnnVectorsFormat defaultKnnVectorsFormat; private final KnnVectorsFormat knnVectorsFormat = new PerFieldKnnVectorsFormat() { @@ -41,4 +44,9 @@ public class Lucene90RWCodec extends Lucene90Codec { public KnnVectorsFormat knnVectorsFormat() { return knnVectorsFormat; } + + @Override + public SegmentInfoFormat segmentInfoFormat() { + return segmentInfosFormat; + } } diff 
--git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90SegmentInfoFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90RWSegmentInfoFormat.java similarity index 91% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90SegmentInfoFormat.java rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90RWSegmentInfoFormat.java index 2286f631a35..1414d0874f9 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90SegmentInfoFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90RWSegmentInfoFormat.java @@ -15,28 +15,17 @@ * limitations under the License. */ -package org.apache.lucene.codecs.lucene90; +package org.apache.lucene.backward_codecs.lucene90; import java.io.IOException; import java.util.Map; import java.util.Set; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.SegmentInfoFormat; -import org.apache.lucene.index.CorruptIndexException; -import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.index.IndexSorter; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.SegmentInfo; -import org.apache.lucene.index.SegmentInfos; -import org.apache.lucene.index.SortFieldProvider; +import org.apache.lucene.index.*; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; -import org.apache.lucene.store.ChecksumIndexInput; -import org.apache.lucene.store.DataInput; -import org.apache.lucene.store.DataOutput; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.IOContext; -import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.*; import org.apache.lucene.util.Version; /** @@ -83,7 +72,7 @@ import org.apache.lucene.util.Version; * @see SegmentInfos * @lucene.experimental */ -public class Lucene90SegmentInfoFormat extends SegmentInfoFormat { +public class 
Lucene90RWSegmentInfoFormat extends SegmentInfoFormat { /** File extension used to store {@link SegmentInfo}. */ public static final String SI_EXTENSION = "si"; @@ -93,7 +82,7 @@ public class Lucene90SegmentInfoFormat extends SegmentInfoFormat { static final int VERSION_CURRENT = VERSION_START; /** Sole constructor. */ - public Lucene90SegmentInfoFormat() {} + public Lucene90RWSegmentInfoFormat() {} @Override public SegmentInfo read(Directory dir, String segment, byte[] segmentID, IOContext context) @@ -164,6 +153,7 @@ public class Lucene90SegmentInfoFormat extends SegmentInfoFormat { segment, docCount, isCompoundFile, + false, null, diagnostics, segmentID, diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90SegmentInfoFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90SegmentInfoFormat.java similarity index 86% rename from lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90SegmentInfoFormat.java rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90SegmentInfoFormat.java index baf266dc6ac..53a8a01a440 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90SegmentInfoFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90SegmentInfoFormat.java @@ -15,22 +15,21 @@ * limitations under the License. 
*/ -package org.apache.lucene.codecs.lucene90; +package org.apache.lucene.backward_codecs.lucene90; import org.apache.lucene.codecs.Codec; import org.apache.lucene.tests.index.BaseSegmentInfoFormatTestCase; -import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.Version; public class TestLucene90SegmentInfoFormat extends BaseSegmentInfoFormatTestCase { @Override protected Version[] getVersions() { - return new Version[] {Version.LATEST}; + return new Version[] {Version.LUCENE_9_0_0}; } @Override protected Codec getCodec() { - return TestUtil.getDefaultCodec(); + return new Lucene90RWCodec(); } } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91RWCodec.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91RWCodec.java index 573f682754c..f9bac90b906 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91RWCodec.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91RWCodec.java @@ -16,7 +16,9 @@ */ package org.apache.lucene.backward_codecs.lucene91; +import org.apache.lucene.backward_codecs.lucene90.Lucene90RWSegmentInfoFormat; import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.SegmentInfoFormat; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; public class Lucene91RWCodec extends Lucene91Codec { @@ -29,6 +31,7 @@ public class Lucene91RWCodec extends Lucene91Codec { return defaultKnnVectorsFormat; } }; + private final SegmentInfoFormat segmentInfosFormat = new Lucene90RWSegmentInfoFormat(); public Lucene91RWCodec() { this.defaultKnnVectorsFormat = @@ -41,4 +44,9 @@ public class Lucene91RWCodec extends Lucene91Codec { public KnnVectorsFormat knnVectorsFormat() { return knnVectorsFormat; } + + @Override + public SegmentInfoFormat segmentInfoFormat() { + return segmentInfosFormat; + } } diff --git 
a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92RWCodec.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92RWCodec.java index 65ca5257d63..6008fa5df39 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92RWCodec.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92RWCodec.java @@ -16,7 +16,9 @@ */ package org.apache.lucene.backward_codecs.lucene92; +import org.apache.lucene.backward_codecs.lucene90.Lucene90RWSegmentInfoFormat; import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.SegmentInfoFormat; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; /** Implements the Lucene 9.2 index format for backwards compat testing */ @@ -30,6 +32,7 @@ public class Lucene92RWCodec extends Lucene92Codec { return defaultKnnVectorsFormat; } }; + private final SegmentInfoFormat segmentInfosFormat = new Lucene90RWSegmentInfoFormat(); /** Instantiates a new codec. 
*/ public Lucene92RWCodec() { @@ -43,4 +46,9 @@ public class Lucene92RWCodec extends Lucene92Codec { public final KnnVectorsFormat knnVectorsFormat() { return knnVectorsFormat; } + + @Override + public SegmentInfoFormat segmentInfoFormat() { + return segmentInfosFormat; + } } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94RWCodec.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94RWCodec.java index d43cfb8442d..88edd2431f8 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94RWCodec.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94RWCodec.java @@ -16,7 +16,9 @@ */ package org.apache.lucene.backward_codecs.lucene94; +import org.apache.lucene.backward_codecs.lucene90.Lucene90RWSegmentInfoFormat; import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.SegmentInfoFormat; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; /** Implements the Lucene 9.4 index format for backwards compat testing */ @@ -31,6 +33,8 @@ public class Lucene94RWCodec extends Lucene94Codec { } }; + private final SegmentInfoFormat segmentInfosFormat = new Lucene90RWSegmentInfoFormat(); + /** Instantiates a new codec. 
*/ public Lucene94RWCodec() { defaultKnnVectorsFormat = @@ -43,4 +47,9 @@ public class Lucene94RWCodec extends Lucene94Codec { public final KnnVectorsFormat knnVectorsFormat() { return knnVectorsFormat; } + + @Override + public SegmentInfoFormat segmentInfoFormat() { + return segmentInfosFormat; + } } diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java index 9a14710fbd2..defa00856d5 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java @@ -26,8 +26,8 @@ import java.nio.file.Paths; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene95.Lucene95Codec; import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.IndexCommit; import org.apache.lucene.index.IndexDeletionPolicy; @@ -152,9 +152,9 @@ public class CreateIndexTask extends PerfTask { try { final PostingsFormat postingsFormatChosen = PostingsFormat.forName(postingsFormat); iwConf.setCodec( - new Lucene95Codec() { + new FilterCodec(Codec.getDefault().getName(), Codec.getDefault()) { @Override - public PostingsFormat getPostingsFormatForField(String field) { + public PostingsFormat postingsFormat() { return postingsFormatChosen; } }); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java index cc705319035..accdb184df8 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java +++ 
b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java @@ -55,6 +55,7 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { static final BytesRef SI_MIN_VERSION = new BytesRef(" min version "); static final BytesRef SI_DOCCOUNT = new BytesRef(" number of documents "); static final BytesRef SI_USECOMPOUND = new BytesRef(" uses compound file "); + static final BytesRef SI_HAS_BLOCKS = new BytesRef(" has blocks "); static final BytesRef SI_NUM_DIAG = new BytesRef(" diagnostics "); static final BytesRef SI_DIAG_KEY = new BytesRef(" key "); static final BytesRef SI_DIAG_VALUE = new BytesRef(" value "); @@ -113,6 +114,10 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { final boolean isCompoundFile = Boolean.parseBoolean(readString(SI_USECOMPOUND.length, scratch)); + SimpleTextUtil.readLine(input, scratch); + assert StringHelper.startsWith(scratch.get(), SI_HAS_BLOCKS); + final boolean hasBlocks = Boolean.parseBoolean(readString(SI_HAS_BLOCKS.length, scratch)); + SimpleTextUtil.readLine(input, scratch); assert StringHelper.startsWith(scratch.get(), SI_NUM_DIAG); int numDiag = Integer.parseInt(readString(SI_NUM_DIAG.length, scratch)); @@ -204,6 +209,7 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { segmentName, docCount, isCompoundFile, + hasBlocks, null, diagnostics, id, @@ -249,6 +255,10 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat { SimpleTextUtil.write(output, Boolean.toString(si.getUseCompoundFile()), scratch); SimpleTextUtil.writeNewline(output); + SimpleTextUtil.write(output, SI_HAS_BLOCKS); + SimpleTextUtil.write(output, Boolean.toString(si.getHasBlocks()), scratch); + SimpleTextUtil.writeNewline(output); + Map diagnostics = si.getDiagnostics(); int numDiagnostics = diagnostics == null ? 
0 : diagnostics.size(); SimpleTextUtil.write(output, SI_NUM_DIAG); diff --git a/lucene/core/src/java/module-info.java b/lucene/core/src/java/module-info.java index c728be82099..5d7168fd1ae 100644 --- a/lucene/core/src/java/module-info.java +++ b/lucene/core/src/java/module-info.java @@ -15,8 +15,8 @@ * limitations under the License. */ -import org.apache.lucene.codecs.lucene95.Lucene95Codec; import org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99Codec; /** Lucene Core. */ @SuppressWarnings("module") // the test framework is compiled after the core... @@ -33,6 +33,7 @@ module org.apache.lucene.core { exports org.apache.lucene.codecs.lucene90; exports org.apache.lucene.codecs.lucene94; exports org.apache.lucene.codecs.lucene95; + exports org.apache.lucene.codecs.lucene99; exports org.apache.lucene.codecs.lucene90.blocktree; exports org.apache.lucene.codecs.lucene90.compressing; exports org.apache.lucene.codecs.perfield; @@ -65,7 +66,7 @@ module org.apache.lucene.core { provides org.apache.lucene.analysis.TokenizerFactory with org.apache.lucene.analysis.standard.StandardTokenizerFactory; provides org.apache.lucene.codecs.Codec with - Lucene95Codec; + Lucene99Codec; provides org.apache.lucene.codecs.DocValuesFormat with org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; provides org.apache.lucene.codecs.KnnVectorsFormat with diff --git a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java index 0f512fccaa2..0b654a134a6 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java @@ -55,7 +55,7 @@ public abstract class Codec implements NamedSPILoader.NamedSPI { return LOADER; } - static Codec defaultCodec = LOADER.lookup("Lucene95"); + static Codec defaultCodec = LOADER.lookup("Lucene99"); } private final String name; diff --git 
a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/package-info.java index 370a6edc103..4864c1ac10f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/package-info.java @@ -15,411 +15,5 @@ * limitations under the License. */ -/** - * Lucene 9.5 file format. - * - *

        Apache Lucene - Index File Formats

        - * - * - * - *

        Introduction

        - * - *
        - * - *

        This document defines the index file formats used in this version of Lucene. If you are using - * a different version of Lucene, please consult the copy of docs/ that was distributed - * with the version you are using. - * - *

        This document attempts to provide a high-level definition of the Apache Lucene file formats. - *

        - * - *

        Definitions

        - * - *
        - * - *

        The fundamental concepts in Lucene are index, document, field and term. - * - *

        An index contains a sequence of documents. - * - *

          - *
        • A document is a sequence of fields. - *
        • A field is a named sequence of terms. - *
        • A term is a sequence of bytes. - *
        - * - *

        The same sequence of bytes in two different fields is considered a different term. Thus terms - * are represented as a pair: the string naming the field, and the bytes within the field. - * - *

        Inverted Indexing

        - * - *

        Lucene's index stores terms and statistics about those terms in order to make term-based - * search more efficient. Lucene's terms index falls into the family of indexes known as an - * inverted index. This is because it can list, for a term, the documents that contain it. - * This is the inverse of the natural relationship, in which documents list terms. - * - *

        Types of Fields

        - * - *

        In Lucene, fields may be stored, in which case their text is stored in the index - * literally, in a non-inverted manner. Fields that are inverted are called indexed. A field - * may be both stored and indexed. - * - *

        The text of a field may be tokenized into terms to be indexed, or the text of a field - * may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is - * useful for certain identifier fields to be indexed literally. - * - *

        See the {@link org.apache.lucene.document.Field Field} java docs for more information on - * Fields. - * - *

        Segments

        - * - *

        Lucene indexes may be composed of multiple sub-indexes, or segments. Each segment is a - * fully independent index, which could be searched separately. Indexes evolve by: - * - *

          - *
        1. Creating new segments for newly added documents. - *
        2. Merging existing segments. - *
        - * - *

        Searches may involve multiple segments and/or multiple indexes, each index potentially - * composed of a set of segments. - * - *

        Document Numbers

        - * - *

        Internally, Lucene refers to documents by an integer document number. The first - * document added to an index is numbered zero, and each subsequent document added gets a number one - * greater than the previous. - * - *

        Note that a document's number may change, so caution should be taken when storing these - * numbers outside of Lucene. In particular, numbers may change in the following situations: - * - *

          - *
        • - *

          The numbers stored in each segment are unique only within the segment, and must be - * converted before they can be used in a larger context. The standard technique is to - * allocate each segment a range of values, based on the range of numbers used in that - * segment. To convert a document number from a segment to an external value, the segment's - * base document number is added. To convert an external value back to a - * segment-specific value, the segment is identified by the range that the external value is - * in, and the segment's base value is subtracted. For example two five document segments - * might be combined, so that the first segment has a base value of zero, and the second of - * five. Document three from the second segment would have an external value of eight. - *

        • - *

          When documents are deleted, gaps are created in the numbering. These are eventually - * removed as the index evolves through merging. Deleted documents are dropped when segments - * are merged. A freshly-merged segment thus has no gaps in its numbering. - *

        - * - *
        - * - *

        Index Structure Overview

        - * - *
        - * - *

        Each segment index maintains the following: - * - *

          - *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat Segment info}. This - * contains metadata about a segment, such as the number of documents, what files it uses, and - * information about how the segment is sorted - *
        • {@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This - * contains metadata about the set of named fields used in the index. - *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}. - * This contains, for each document, a list of attribute-value pairs, where the attributes are - * field names. These are used to store auxiliary information about the document, such as its - * title, url, or an identifier to access a database. The set of stored fields are what is - * returned for each hit when searching. This is keyed by document number. - *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term dictionary}. A - * dictionary containing all of the terms used in all of the indexed fields of all of the - * documents. The dictionary also contains the number of documents which contain the term, and - * pointers to the term's frequency and proximity data. - *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Frequency data}. For - * each term in the dictionary, the numbers of all the documents that contain that term, and - * the frequency of the term in that document, unless frequencies are omitted ({@link - * org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS}) - *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Proximity data}. For - * each term in the dictionary, the positions that the term occurs in each document. Note that - * this will not exist if all fields in all documents omit position data. - *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For - * each field in each document, a value is stored that is multiplied into the score for hits - * on that field. - *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each - * field in each document, the term vector (sometimes called document vector) may be stored. A - * term vector consists of term text and term frequency. To add Term Vectors to your index see - * the {@link org.apache.lucene.document.Field Field} constructors - *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like - * stored values, these are also keyed by document number, but are generally intended to be - * loaded into main memory for fast access. Whereas stored values are generally intended for - * summary results from searches, per-document values are useful for things like scoring - * factors. - *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An - * optional file indicating which documents are live. - *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair - * of files, recording dimensionally indexed fields, to enable fast numeric range filtering - * and large numeric values like BigInteger and BigDecimal (1D) and geographic shape - * intersection (2D, 3D). - *
        • {@link org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat Vector values}. The - * vector format stores numeric vectors in a format optimized for random access and - * computation, supporting high-dimensional nearest-neighbor search. - *
        - * - *

        Details on each of these are provided in their linked pages.

        - * - *

        File Naming

        - * - *
        - * - *

        All files belonging to a segment have the same name with varying extensions. The extensions - * correspond to the different file formats described below. When using the Compound File format - * (default for small segments) these files (except for the Segment info file, the Lock file, and - * Deleted documents file) are collapsed into a single .cfs file (see below for details) - * - *

        Typically, all segments in an index are stored in a single directory, although this is not - * required. - * - *

        File names are never re-used. That is, when any file is saved to the Directory it is given a - * never before used filename. This is achieved using a simple generations approach. For example, - * the first segments file is segments_1, then segments_2, etc. The generation is a sequential long - * integer represented in alpha-numeric (base 36) form.

        - * - *

        Summary of File Extensions

        - * - *
        - * - *

        The following table summarizes the names and extensions of the files in Lucene: - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - *
        lucene filenames by extension
        NameExtensionBrief Description
        {@link org.apache.lucene.index.SegmentInfos Segments File}segments_NStores information about a commit point
        Lock Filewrite.lockThe Write lock prevents multiple IndexWriters from writing to the same - * file.
        {@link org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat Segment Info}.siStores metadata about a segment
        {@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File}.cfs, .cfeAn optional "virtual" file consisting of all the other index files for - * systems that frequently run out of file handles.
        {@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields}.fnmStores information about the fields
        {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index}.fdxContains pointers to field data
        {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data}.fdtThe stored fields for documents
        {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Dictionary}.timThe term dictionary, stores term info
        {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Index}.tipThe index into the Term Dictionary
        {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Frequencies}.docContains the list of docs which contain each term along with frequency
        {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Positions}.posStores position information about where a term occurs in the index
        {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Payloads}.payStores additional per-position metadata information such as character offsets and user payloads
        {@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms}.nvd, .nvmEncodes length and boost factors for docs and fields
        {@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values}.dvd, .dvmEncodes additional scoring factors or other per-document information.
        {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}.tvxStores offset into the document data file
        {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}.tvdContains term vector data.
        {@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents}.livInfo about what documents are live
        {@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}.dii, .dimHolds indexed points
        {@link org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat Vector values}.vec, .vemHolds indexed vectors; .vec files contain the raw vector data, and - * .vem the vector metadata
        - * - *

        - * - *

        Lock File

        - * - * The write lock, which is stored in the index directory by default, is named "write.lock". If the - * lock directory is different from the index directory then the write lock will be named - * "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index - * directory. When this file is present, a writer is currently modifying the index (adding or - * removing documents). This lock file ensures that only one writer is modifying the index at a - * time. - * - *

        History

        - * - *

        Compatibility notes are provided in this document, describing how file formats have changed - * from prior versions: - * - *

          - *
        • In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit - * lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching - * or adding/deleting of docs. When the new segments file is saved (committed), it will be - * written in the new file format (meaning no specific "upgrade" process is needed). But note - * that once a commit has occurred, pre-2.1 Lucene will not be able to read the index. - *
        • In version 2.3, the file format was changed to allow segments to share a single set of doc - * store (vectors & stored fields) files. This allows for faster indexing in certain - * cases. The change is fully backwards compatible (in the same way as the lock-less commits - * change in 2.1). - *
        • In version 2.4, Strings are now written as true UTF-8 byte sequence, not Java's modified - * UTF-8. See LUCENE-510 for - * details. - *
        • In version 2.9, an optional opaque Map<String,String> CommitUserData may be passed to - * IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N - * file. See LUCENE-1382 for - * details. Also, diagnostics were added to each segment written recording details about why - * it was written (due to flush, merge; which OS/JRE was used; etc.). See issue LUCENE-1654 for details. - *
        • In version 3.0, compressed fields are no longer written to the index (they can still be - * read, but on merge the new segment will write them, uncompressed). See issue LUCENE-1960 for details. - *
        • In version 3.1, segments records the code version that created them. See LUCENE-2720 for details. - * Additionally segments track explicitly whether or not they have term vectors. See LUCENE-2811 for details. - *
        • In version 3.2, numeric fields are written as natively to stored fields file, previously - * they were stored in text format only. - *
        • In version 3.4, fields can omit position data while still indexing term frequencies. - *
        • In version 4.0, the format of the inverted index became extensible via the {@link - * org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues}) - * was introduced. Normalization factors need no longer be a single byte, they can be any - * {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be - * unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into - * the postings lists. Payloads can be stored in the term vectors. - *
        • In version 4.1, the format of the postings list changed to use either of FOR compression or - * variable-byte encoding, depending upon the frequency of the term. Terms appearing only once - * were changed to inline directly into the term dictionary. Stored fields are compressed by - * default. - *
        • In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued - * type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields. - *
        • In version 4.5, DocValues were extended to explicitly represent missing values. - *
        • In version 4.6, FieldInfos were extended to support per-field DocValues generation, to - * allow updating NumericDocValues fields. - *
        • In version 4.8, checksum footers were added to the end of each index file for improved data - * integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32 - * checksum of the file. - *
        • In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is - * suitable for faceting/sorting/analytics. - *
        • In version 5.4, DocValues have been improved to store more information on disk: addresses - * for binary fields and ord indexes for multi-valued fields. - *
        • In version 6.0, Points were added, for multi-dimensional range/distance search. - *
        • In version 6.2, new Segment info format that reads/writes the index sort, to support index - * sorting. - *
        • In version 7.0, DocValues have been improved to better support sparse doc values thanks to - * an iterator API. - *
        • In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term - * freq, normalization factor) pairs that may trigger the maximum score of the block. This - * information is recorded alongside skip data in order to be able to skip blocks of doc ids - * if they may not produce high enough scores. Additionally doc values and norms has been - * extended with jump-tables to make access O(1) instead of O(n), where n is the number of - * elements to skip when advancing in the data. - *
        • In version 8.4, postings, positions, offsets and payload lengths have move to a more - * performant encoding that is vectorized. - *
        • In version 8.6, index sort serialization is delegated to the sorts themselves, to allow - * user-defined sorts to be used - *
        • In version 8.7, stored fields compression became adaptive to better handle documents with - * smaller stored fields. - *
        • In version 9.0, vector-valued fields were added. - *
        • In version 9.1, vector-valued fields were modified to add a graph hierarchy. - *
        • In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by - * IndexDISI. ordToDoc mappings was added to .vem. - *
        • In version 9.5, HNSW graph connections were changed to be delta-encoded with vints. - * Additionally, metadata file size improvements were made by delta-encoding nodes by graph - * layer and not writing the node ids for the zeroth layer. - *
        - * - * - * - *

        Limitations

        - * - *
        - * - *

        Lucene uses a Java int to refer to document numbers, and the index file format - * uses an Int32 on-disk to store document numbers. This is a limitation of both the - * index file format and the current implementation. Eventually these should be replaced with either - * UInt64 values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt - * VInt} values which have no limit.

        - */ +/** Lucene 9.5 file format. */ package org.apache.lucene.codecs.lucene95; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99Codec.java new file mode 100644 index 00000000000..c55401a3e8b --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99Codec.java @@ -0,0 +1,198 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene99; + +import java.util.Objects; +import org.apache.lucene.codecs.*; +import org.apache.lucene.codecs.lucene90.*; +import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat; +import org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat; +import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; +import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; +import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; + +/** + * Implements the Lucene 9.9 index format + * + *

        If you want to reuse functionality of this codec in another codec, extend {@link FilterCodec}. + * + * @see org.apache.lucene.codecs.lucene99 package documentation for file format details. + * @lucene.experimental + */ +public class Lucene99Codec extends Codec { + + /** Configuration option for the codec. */ + public enum Mode { + /** Trade compression ratio for retrieval speed. */ + BEST_SPEED(Lucene90StoredFieldsFormat.Mode.BEST_SPEED), + /** Trade retrieval speed for compression ratio. */ + BEST_COMPRESSION(Lucene90StoredFieldsFormat.Mode.BEST_COMPRESSION); + + private final Lucene90StoredFieldsFormat.Mode storedMode; + + private Mode(Lucene90StoredFieldsFormat.Mode storedMode) { + this.storedMode = Objects.requireNonNull(storedMode); + } + } + + private final TermVectorsFormat vectorsFormat = new Lucene90TermVectorsFormat(); + private final FieldInfosFormat fieldInfosFormat = new Lucene94FieldInfosFormat(); + private final SegmentInfoFormat segmentInfosFormat = new Lucene99SegmentInfoFormat(); + private final LiveDocsFormat liveDocsFormat = new Lucene90LiveDocsFormat(); + private final CompoundFormat compoundFormat = new Lucene90CompoundFormat(); + private final NormsFormat normsFormat = new Lucene90NormsFormat(); + + private final PostingsFormat defaultPostingsFormat; + private final PostingsFormat postingsFormat = + new PerFieldPostingsFormat() { + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return Lucene99Codec.this.getPostingsFormatForField(field); + } + }; + + private final DocValuesFormat defaultDVFormat; + private final DocValuesFormat docValuesFormat = + new PerFieldDocValuesFormat() { + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return Lucene99Codec.this.getDocValuesFormatForField(field); + } + }; + + private final KnnVectorsFormat defaultKnnVectorsFormat; + private final KnnVectorsFormat knnVectorsFormat = + new PerFieldKnnVectorsFormat() { + @Override + public 
KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return Lucene99Codec.this.getKnnVectorsFormatForField(field); + } + }; + + private final StoredFieldsFormat storedFieldsFormat; + + /** Instantiates a new codec. */ + public Lucene99Codec() { + this(Mode.BEST_SPEED); + } + + /** + * Instantiates a new codec, specifying the stored fields compression mode to use. + * + * @param mode stored fields compression mode to use for newly flushed/merged segments. + */ + public Lucene99Codec(Mode mode) { + super("Lucene99"); + this.storedFieldsFormat = + new Lucene90StoredFieldsFormat(Objects.requireNonNull(mode).storedMode); + this.defaultPostingsFormat = new Lucene90PostingsFormat(); + this.defaultDVFormat = new Lucene90DocValuesFormat(); + this.defaultKnnVectorsFormat = new Lucene95HnswVectorsFormat(); + } + + @Override + public final StoredFieldsFormat storedFieldsFormat() { + return storedFieldsFormat; + } + + @Override + public final TermVectorsFormat termVectorsFormat() { + return vectorsFormat; + } + + @Override + public final PostingsFormat postingsFormat() { + return postingsFormat; + } + + @Override + public final FieldInfosFormat fieldInfosFormat() { + return fieldInfosFormat; + } + + @Override + public final SegmentInfoFormat segmentInfoFormat() { + return segmentInfosFormat; + } + + @Override + public final LiveDocsFormat liveDocsFormat() { + return liveDocsFormat; + } + + @Override + public final CompoundFormat compoundFormat() { + return compoundFormat; + } + + @Override + public final PointsFormat pointsFormat() { + return new Lucene90PointsFormat(); + } + + @Override + public final KnnVectorsFormat knnVectorsFormat() { + return knnVectorsFormat; + } + + /** + * Returns the postings format that should be used for writing new segments of field. + * + *

        The default implementation always returns "Lucene90". + * + *

        WARNING: if you subclass, you are responsible for index backwards compatibility: + * future versions of Lucene are only guaranteed to be able to read the default implementation. + */ + public PostingsFormat getPostingsFormatForField(String field) { + return defaultPostingsFormat; + } + + /** + * Returns the docvalues format that should be used for writing new segments of field + * . + * + *

        The default implementation always returns "Lucene90". + * + *

        WARNING: if you subclass, you are responsible for index backwards compatibility: + * future versions of Lucene are only guaranteed to be able to read the default implementation. + */ + public DocValuesFormat getDocValuesFormatForField(String field) { + return defaultDVFormat; + } + + /** + * Returns the vectors format that should be used for writing new segments of field + * + *

        The default implementation always returns "Lucene95". + * + *

        WARNING: if you subclass, you are responsible for index backwards compatibility: + * future version of Lucene are only guaranteed to be able to read the default implementation. + */ + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return defaultKnnVectorsFormat; + } + + @Override + public final DocValuesFormat docValuesFormat() { + return docValuesFormat; + } + + @Override + public final NormsFormat normsFormat() { + return normsFormat; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99SegmentInfoFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99SegmentInfoFormat.java new file mode 100644 index 00000000000..9341c312299 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99SegmentInfoFormat.java @@ -0,0 +1,236 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.codecs.lucene99; + +import java.io.IOException; +import java.util.Map; +import java.util.Set; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.SegmentInfoFormat; +import org.apache.lucene.index.*; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.store.*; +import org.apache.lucene.util.Version; + +/** + * Lucene 9.9 Segment info format. + * + *

        Files: + * + *

          + *
        • .si: Header, SegVersion, SegMinVersion, SegSize, IsCompoundFile, HasBlocks, + * Diagnostics, Files, Attributes, IndexSort, Footer + *
        + * + * Data types: + * + *
          + *
        • Header --> {@link CodecUtil#writeIndexHeader IndexHeader} + *
        • SegSize --> {@link DataOutput#writeInt Int32} + *
        • SegVersion --> {@link DataOutput#writeInt Int32} major, minor, bugfix + *
        • SegMinVersion --> {@link DataOutput#writeByte Int8} (0 or 1); if 1, then as SegVersion + *
        • Files --> {@link DataOutput#writeSetOfStrings Set<String>} + *
        • Diagnostics,Attributes --> {@link DataOutput#writeMapOfStrings Map<String,String>} + *
        • IsCompoundFile --> {@link DataOutput#writeByte Int8} + *
        • HasBlocks --> {@link DataOutput#writeByte Int8} + *
        • IndexSort --> {@link DataOutput#writeVInt VInt} count, followed by {@code count} + * SortField + *
        • SortField --> {@link DataOutput#writeString String} sort class, followed by a per-sort + * bytestream (see {@link SortFieldProvider#readSortField(DataInput)}) + *
        • Footer --> {@link CodecUtil#writeFooter CodecFooter} + *
        + * + * Field Descriptions: + * + *
          + *
        • SegVersion is the code version that created the segment. + *
        • SegMinVersion is the minimum code version that contributed documents to the segment. + *
        • SegSize is the number of documents contained in the segment index. + *
        • IsCompoundFile records whether the segment is written as a compound file or not. If this is + * -1, the segment is not a compound file. If it is 1, the segment is a compound file. + *
        • HasBlocks records whether the segment contains documents written as a block and guarantees + * consecutive document ids for all documents in the block. + *
        • The Diagnostics Map is privately written by {@link IndexWriter}, as a debugging aid, for + * each segment it creates. It includes metadata like the current Lucene version, OS, Java + * version, why the segment was created (merge, flush, addIndexes), etc. + *
        • Files is a list of files referred to by this segment. + *
        + * + * @see SegmentInfos + * @lucene.experimental + */ +public class Lucene99SegmentInfoFormat extends SegmentInfoFormat { + + /** File extension used to store {@link SegmentInfo}. */ + public static final String SI_EXTENSION = "si"; + + static final String CODEC_NAME = "Lucene90SegmentInfo"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + /** Sole constructor. */ + public Lucene99SegmentInfoFormat() {} + + @Override + public SegmentInfo read(Directory dir, String segment, byte[] segmentID, IOContext context) + throws IOException { + final String fileName = IndexFileNames.segmentFileName(segment, "", SI_EXTENSION); + try (ChecksumIndexInput input = dir.openChecksumInput(fileName)) { + Throwable priorE = null; + SegmentInfo si = null; + try { + CodecUtil.checkIndexHeader( + input, CODEC_NAME, VERSION_START, VERSION_CURRENT, segmentID, ""); + si = parseSegmentInfo(dir, input, segment, segmentID); + } catch (Throwable exception) { + priorE = exception; + } finally { + CodecUtil.checkFooter(input, priorE); + } + return si; + } + } + + private SegmentInfo parseSegmentInfo( + Directory dir, DataInput input, String segment, byte[] segmentID) throws IOException { + final Version version = Version.fromBits(input.readInt(), input.readInt(), input.readInt()); + byte hasMinVersion = input.readByte(); + final Version minVersion; + switch (hasMinVersion) { + case 0: + minVersion = null; + break; + case 1: + minVersion = Version.fromBits(input.readInt(), input.readInt(), input.readInt()); + break; + default: + throw new CorruptIndexException("Illegal boolean value " + hasMinVersion, input); + } + + final int docCount = input.readInt(); + if (docCount < 0) { + throw new CorruptIndexException("invalid docCount: " + docCount, input); + } + final boolean isCompoundFile = input.readByte() == SegmentInfo.YES; + final boolean hasBlocks = input.readByte() == SegmentInfo.YES; + + final Map diagnostics = input.readMapOfStrings(); + 
final Set files = input.readSetOfStrings(); + final Map attributes = input.readMapOfStrings(); + + int numSortFields = input.readVInt(); + Sort indexSort; + if (numSortFields > 0) { + SortField[] sortFields = new SortField[numSortFields]; + for (int i = 0; i < numSortFields; i++) { + String name = input.readString(); + sortFields[i] = SortFieldProvider.forName(name).readSortField(input); + } + indexSort = new Sort(sortFields); + } else if (numSortFields < 0) { + throw new CorruptIndexException("invalid index sort field count: " + numSortFields, input); + } else { + indexSort = null; + } + + SegmentInfo si = + new SegmentInfo( + dir, + version, + minVersion, + segment, + docCount, + isCompoundFile, + hasBlocks, + null, + diagnostics, + segmentID, + attributes, + indexSort); + si.setFiles(files); + return si; + } + + @Override + public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOException { + final String fileName = IndexFileNames.segmentFileName(si.name, "", SI_EXTENSION); + + try (IndexOutput output = dir.createOutput(fileName, ioContext)) { + // Only add the file once we've successfully created it, else IFD assert can trip: + si.addFile(fileName); + CodecUtil.writeIndexHeader(output, CODEC_NAME, VERSION_CURRENT, si.getId(), ""); + + writeSegmentInfo(output, si); + + CodecUtil.writeFooter(output); + } + } + + private void writeSegmentInfo(DataOutput output, SegmentInfo si) throws IOException { + Version version = si.getVersion(); + if (version.major < 7) { + throw new IllegalArgumentException( + "invalid major version: should be >= 7 but got: " + version.major + " segment=" + si); + } + // Write the Lucene version that created this segment, since 3.1 + output.writeInt(version.major); + output.writeInt(version.minor); + output.writeInt(version.bugfix); + + // Write the min Lucene version that contributed docs to the segment, since 7.0 + if (si.getMinVersion() != null) { + output.writeByte((byte) 1); + Version minVersion = 
si.getMinVersion(); + output.writeInt(minVersion.major); + output.writeInt(minVersion.minor); + output.writeInt(minVersion.bugfix); + } else { + output.writeByte((byte) 0); + } + + assert version.prerelease == 0; + output.writeInt(si.maxDoc()); + + output.writeByte((byte) (si.getUseCompoundFile() ? SegmentInfo.YES : SegmentInfo.NO)); + output.writeByte((byte) (si.getHasBlocks() ? SegmentInfo.YES : SegmentInfo.NO)); + output.writeMapOfStrings(si.getDiagnostics()); + Set files = si.files(); + for (String file : files) { + if (!IndexFileNames.parseSegmentName(file).equals(si.name)) { + throw new IllegalArgumentException( + "invalid files: expected segment=" + si.name + ", got=" + files); + } + } + output.writeSetOfStrings(files); + output.writeMapOfStrings(si.getAttributes()); + + Sort indexSort = si.getIndexSort(); + int numSortFields = indexSort == null ? 0 : indexSort.getSort().length; + output.writeVInt(numSortFields); + for (int i = 0; i < numSortFields; ++i) { + SortField sortField = indexSort.getSort()[i]; + IndexSorter sorter = sortField.getIndexSorter(); + if (sorter == null) { + throw new IllegalArgumentException("cannot serialize SortField " + sortField); + } + output.writeString(sorter.getProviderName()); + SortFieldProvider.write(sortField, output); + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/package-info.java new file mode 100644 index 00000000000..700b6ed75e6 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/package-info.java @@ -0,0 +1,425 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Lucene 9.9 file format. + * + *

        Apache Lucene - Index File Formats

        + * + * + * + *

        Introduction

        + * + *
        + * + *

        This document defines the index file formats used in this version of Lucene. If you are using + * a different version of Lucene, please consult the copy of docs/ that was distributed + * with the version you are using. + * + *

        This document attempts to provide a high-level definition of the Apache Lucene file formats. + *

        + * + *

        Definitions

        + * + *
        + * + *

        The fundamental concepts in Lucene are index, document, field and term. + * + *

        An index contains a sequence of documents. + * + *

          + *
        • A document is a sequence of fields. + *
        • A field is a named sequence of terms. + *
        • A term is a sequence of bytes. + *
        + * + *

        The same sequence of bytes in two different fields is considered a different term. Thus terms + * are represented as a pair: the string naming the field, and the bytes within the field. + * + *

        Inverted Indexing

        + * + *

        Lucene's index stores terms and statistics about those terms in order to make term-based + * search more efficient. Lucene's terms index falls into the family of indexes known as an + * inverted index. This is because it can list, for a term, the documents that contain it. + * This is the inverse of the natural relationship, in which documents list terms. + * + *

        Types of Fields

        + * + *

        In Lucene, fields may be stored, in which case their text is stored in the index + * literally, in a non-inverted manner. Fields that are inverted are called indexed. A field + * may be both stored and indexed. + * + *

        The text of a field may be tokenized into terms to be indexed, or the text of a field + * may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is + * useful for certain identifier fields to be indexed literally. + * + *

        See the {@link org.apache.lucene.document.Field Field} java docs for more information on + * Fields. + * + *

        Segments

        + * + *

        Lucene indexes may be composed of multiple sub-indexes, or segments. Each segment is a + * fully independent index, which could be searched separately. Indexes evolve by: + * + *

          + *
        1. Creating new segments for newly added documents. + *
        2. Merging existing segments. + *
        + * + *

        Searches may involve multiple segments and/or multiple indexes, each index potentially + * composed of a set of segments. + * + *

        Document Numbers

        + * + *

        Internally, Lucene refers to documents by an integer document number. The first + * document added to an index is numbered zero, and each subsequent document added gets a number one + * greater than the previous. + * + *

        Note that a document's number may change, so caution should be taken when storing these + * numbers outside of Lucene. In particular, numbers may change in the following situations: + * + *

          + *
        • + *

          The numbers stored in each segment are unique only within the segment, and must be + * converted before they can be used in a larger context. The standard technique is to + * allocate each segment a range of values, based on the range of numbers used in that + * segment. To convert a document number from a segment to an external value, the segment's + * base document number is added. To convert an external value back to a + * segment-specific value, the segment is identified by the range that the external value is + * in, and the segment's base value is subtracted. For example, two five-document segments + * might be combined, so that the first segment has a base value of zero, and the second of + * five. Document three from the second segment would have an external value of eight. + *

        • + *

          When documents are deleted, gaps are created in the numbering. These are eventually + * removed as the index evolves through merging. Deleted documents are dropped when segments + * are merged. A freshly-merged segment thus has no gaps in its numbering. + *

        + * + *
        + * + *

        Index Structure Overview

        + * + *
        + * + *

        Each segment index maintains the following: + * + *

          + *
        • {@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment info}. This + * contains metadata about a segment, such as the number of documents, what files it uses, and + * information about how the segment is sorted. + *
        • {@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This + * contains metadata about the set of named fields used in the index. + *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}. + * This contains, for each document, a list of attribute-value pairs, where the attributes are + * field names. These are used to store auxiliary information about the document, such as its + * title, url, or an identifier to access a database. The set of stored fields are what is + * returned for each hit when searching. This is keyed by document number. + *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term dictionary}. A + * dictionary containing all of the terms used in all of the indexed fields of all of the + * documents. The dictionary also contains the number of documents which contain the term, and + * pointers to the term's frequency and proximity data. + *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Frequency data}. For + * each term in the dictionary, the numbers of all the documents that contain that term, and + * the frequency of the term in that document, unless frequencies are omitted ({@link + * org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS}) + *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Proximity data}. For + * each term in the dictionary, the positions that the term occurs in each document. Note that + * this will not exist if all fields in all documents omit position data. + *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For + * each field in each document, a value is stored that is multiplied into the score for hits + * on that field. + *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each + * field in each document, the term vector (sometimes called document vector) may be stored. A + * term vector consists of term text and term frequency. To add Term Vectors to your index see + * the {@link org.apache.lucene.document.Field Field} constructors + *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like + * stored values, these are also keyed by document number, but are generally intended to be + * loaded into main memory for fast access. Whereas stored values are generally intended for + * summary results from searches, per-document values are useful for things like scoring + * factors. + *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An + * optional file indicating which documents are live. + *
        • {@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair + * of files, recording dimensionally indexed fields, to enable fast numeric range filtering + * and large numeric values like BigInteger and BigDecimal (1D) and geographic shape + * intersection (2D, 3D). + *
        • {@link org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat Vector values}. The + * vector format stores numeric vectors in a format optimized for random access and + * computation, supporting high-dimensional nearest-neighbor search. + *
        + * + *

        Details on each of these are provided in their linked pages.

        + * + *

        File Naming

        + * + *
        + * + *

        All files belonging to a segment have the same name with varying extensions. The extensions + * correspond to the different file formats described below. When using the Compound File format + * (default for small segments) these files (except for the Segment info file, the Lock file, and + * Deleted documents file) are collapsed into a single .cfs file (see below for details). + * + *

        Typically, all segments in an index are stored in a single directory, although this is not + * required. + * + *

        File names are never re-used. That is, when any file is saved to the Directory it is given a + * never before used filename. This is achieved using a simple generations approach. For example, + * the first segments file is segments_1, then segments_2, etc. The generation is a sequential long + * integer represented in alpha-numeric (base 36) form.

        + * + *

        Summary of File Extensions

        + * + *
        + * + *

        The following table summarizes the names and extensions of the files in Lucene: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
        lucene filenames by extension
        NameExtensionBrief Description
        {@link org.apache.lucene.index.SegmentInfos Segments File}segments_NStores information about a commit point
        Lock Filewrite.lockThe Write lock prevents multiple IndexWriters from writing to the same + * file.
        {@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment Info}.siStores metadata about a segment
        {@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File}.cfs, .cfeAn optional "virtual" file consisting of all the other index files for + * systems that frequently run out of file handles.
        {@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields}.fnmStores information about the fields
        {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index}.fdxContains pointers to field data
        {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data}.fdtThe stored fields for documents
        {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Dictionary}.timThe term dictionary, stores term info
        {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Index}.tipThe index into the Term Dictionary
        {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Frequencies}.docContains the list of docs which contain each term along with frequency
        {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Positions}.posStores position information about where a term occurs in the index
        {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Payloads}.payStores additional per-position metadata information such as character offsets and user payloads
        {@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms}.nvd, .nvmEncodes length and boost factors for docs and fields
        {@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values}.dvd, .dvmEncodes additional scoring factors or other per-document information.
        {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}.tvxStores offset into the document data file
        {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}.tvdContains term vector data.
        {@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents}.livInfo about what documents are live
        {@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}.dii, .dimHolds indexed points
        {@link org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat Vector values}.vec, .vemHolds indexed vectors; .vec files contain the raw vector data, and + * .vem the vector metadata
        + * + *

        + * + *

        Lock File

        + * + * The write lock, which is stored in the index directory by default, is named "write.lock". If the + * lock directory is different from the index directory then the write lock will be named + * "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index + * directory. When this file is present, a writer is currently modifying the index (adding or + * removing documents). This lock file ensures that only one writer is modifying the index at a + * time. + * + *

        History

        + * + *

        Compatibility notes are provided in this document, describing how file formats have changed + * from prior versions: + * + *

          + *
        • In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit + * lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching + * or adding/deleting of docs. When the new segments file is saved (committed), it will be + * written in the new file format (meaning no specific "upgrade" process is needed). But note + * that once a commit has occurred, pre-2.1 Lucene will not be able to read the index. + *
        • In version 2.3, the file format was changed to allow segments to share a single set of doc + * store (vectors & stored fields) files. This allows for faster indexing in certain + * cases. The change is fully backwards compatible (in the same way as the lock-less commits + * change in 2.1). + *
        • In version 2.4, Strings are now written as true UTF-8 byte sequences, not Java's modified + * UTF-8. See LUCENE-510 for + * details. + *
        • In version 2.9, an optional opaque Map<String,String> CommitUserData may be passed to + * IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N + * file. See LUCENE-1382 for + * details. Also, diagnostics were added to each segment written recording details about why + * it was written (due to flush, merge; which OS/JRE was used; etc.). See issue LUCENE-1654 for details. + *
        • In version 3.0, compressed fields are no longer written to the index (they can still be + * read, but on merge the new segment will write them, uncompressed). See issue LUCENE-1960 for details. + *
        • In version 3.1, segments record the code version that created them. See LUCENE-2720 for details. + * Additionally segments track explicitly whether or not they have term vectors. See LUCENE-2811 for details. + *
        • In version 3.2, numeric fields are written natively to the stored fields file; previously + * they were stored in text format only. + *
        • In version 3.4, fields can omit position data while still indexing term frequencies. + *
        • In version 4.0, the format of the inverted index became extensible via the {@link + * org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues}) + * was introduced. Normalization factors need no longer be a single byte, they can be any + * {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be + * unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into + * the postings lists. Payloads can be stored in the term vectors. + *
        • In version 4.1, the format of the postings list changed to use either of FOR compression or + * variable-byte encoding, depending upon the frequency of the term. Terms appearing only once + * were changed to inline directly into the term dictionary. Stored fields are compressed by + * default. + *
        • In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued + * type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields. + *
        • In version 4.5, DocValues were extended to explicitly represent missing values. + *
        • In version 4.6, FieldInfos were extended to support per-field DocValues generation, to + * allow updating NumericDocValues fields. + *
        • In version 4.8, checksum footers were added to the end of each index file for improved data + * integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32 + * checksum of the file. + *
        • In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is + * suitable for faceting/sorting/analytics. + *
        • In version 5.4, DocValues have been improved to store more information on disk: addresses + * for binary fields and ord indexes for multi-valued fields. + *
        • In version 6.0, Points were added, for multi-dimensional range/distance search. + *
        • In version 6.2, a new Segment info format was introduced that reads/writes the index sort, to support index + * sorting. + *
        • In version 7.0, DocValues have been improved to better support sparse doc values thanks to + * an iterator API. + *
        • In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term + * freq, normalization factor) pairs that may trigger the maximum score of the block. This + * information is recorded alongside skip data in order to be able to skip blocks of doc ids + * if they may not produce high enough scores. Additionally doc values and norms have been + * extended with jump-tables to make access O(1) instead of O(n), where n is the number of + * elements to skip when advancing in the data. + *
        • In version 8.4, postings, positions, offsets and payload lengths have moved to a more + * performant encoding that is vectorized. + *
        • In version 8.6, index sort serialization is delegated to the sorts themselves, to allow + * user-defined sorts to be used. + *
        • In version 8.7, stored fields compression became adaptive to better handle documents with + * smaller stored fields. + *
        • In version 9.0, vector-valued fields were added. + *
        • In version 9.1, vector-valued fields were modified to add a graph hierarchy. + *
        • In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by + * IndexDISI. ordToDoc mappings were added to .vem. + *
        • In version 9.5, HNSW graph connections were changed to be delta-encoded with vints. + * Additionally, metadata file size improvements were made by delta-encoding nodes by graph + * layer and not writing the node ids for the zeroth layer. + *
        + * + * + * + *

        Limitations

        + * + *
        + * + *

        Lucene uses a Java int to refer to document numbers, and the index file format + * uses an Int32 on-disk to store document numbers. This is a limitation of both the + * index file format and the current implementation. Eventually these should be replaced with either + * UInt64 values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt + * VInt} values which have no limit.

        + */ +package org.apache.lucene.codecs.lucene99; diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java index 347422e4ebe..57ada3a5602 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java @@ -164,6 +164,7 @@ final class DocumentsWriterPerThread implements Accountable { segmentName, -1, false, + false, codec, Collections.emptyMap(), StringHelper.randomId(), @@ -245,6 +246,9 @@ final class DocumentsWriterPerThread implements Accountable { } } allDocsIndexed = true; + if (numDocsInRAM - docsInRamBefore > 1) { + segmentInfo.setHasBlocks(); + } return finishDocuments(deleteNode, docsInRamBefore); } finally { if (!allDocsIndexed && !aborted) { @@ -636,7 +640,7 @@ final class DocumentsWriterPerThread implements Accountable { return "DocumentsWriterPerThread [pendingDeletes=" + pendingUpdates + ", segment=" - + (segmentInfo != null ? 
segmentInfo.name : "null") + + segmentInfo.name + ", aborted=" + aborted + ", numDocsInRAM=" diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index ac667bd73ff..aef273348fd 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -3368,9 +3368,13 @@ public class IndexWriter String mergedName = newSegmentName(); Directory mergeDirectory = mergeScheduler.wrapForMerge(merge, directory); int numSoftDeleted = 0; + boolean hasBlocks = false; for (MergePolicy.MergeReader reader : merge.getMergeReader()) { CodecReader leaf = reader.codecReader; numDocs += leaf.numDocs(); + for (LeafReaderContext context : reader.codecReader.leaves()) { + hasBlocks |= context.reader().getMetaData().hasBlocks(); + } if (softDeletesEnabled) { Bits liveDocs = reader.hardLiveDocs; numSoftDeleted += @@ -3398,6 +3402,7 @@ public class IndexWriter mergedName, -1, false, + hasBlocks, codec, Collections.emptyMap(), StringHelper.randomId(), @@ -3479,6 +3484,7 @@ public class IndexWriter segName, info.info.maxDoc(), info.info.getUseCompoundFile(), + info.info.getHasBlocks(), info.info.getCodec(), info.info.getDiagnostics(), info.info.getId(), @@ -4926,7 +4932,13 @@ public class IndexWriter if (readerPool.writeDocValuesUpdatesForMerge(merge.segments)) { checkpoint(); } - + boolean hasBlocks = false; + for (SegmentCommitInfo info : merge.segments) { + if (info.info.getHasBlocks()) { + hasBlocks = true; + break; + } + } // Bind a new segment name here so even with // ConcurrentMergePolicy we keep deterministic segment // names. 
@@ -4940,6 +4952,7 @@ public class IndexWriter mergeSegmentName, -1, false, + hasBlocks, config.getCodec(), Collections.emptyMap(), StringHelper.randomId(), diff --git a/lucene/core/src/java/org/apache/lucene/index/LeafMetaData.java b/lucene/core/src/java/org/apache/lucene/index/LeafMetaData.java index c53f8901e1f..77a9f2a84b6 100644 --- a/lucene/core/src/java/org/apache/lucene/index/LeafMetaData.java +++ b/lucene/core/src/java/org/apache/lucene/index/LeafMetaData.java @@ -16,6 +16,8 @@ */ package org.apache.lucene.index; +import org.apache.lucene.document.Field; +import org.apache.lucene.search.Query; import org.apache.lucene.search.Sort; import org.apache.lucene.util.Version; @@ -29,9 +31,10 @@ public final class LeafMetaData { private final int createdVersionMajor; private final Version minVersion; private final Sort sort; + private final boolean hasBlocks; /** Expert: Sole constructor. Public for use by custom {@link LeafReader} impls. */ - public LeafMetaData(int createdVersionMajor, Version minVersion, Sort sort) { + public LeafMetaData(int createdVersionMajor, Version minVersion, Sort sort, boolean hasBlocks) { this.createdVersionMajor = createdVersionMajor; if (createdVersionMajor > Version.LATEST.major) { throw new IllegalArgumentException( @@ -46,6 +49,7 @@ public final class LeafMetaData { } this.minVersion = minVersion; this.sort = sort; + this.hasBlocks = hasBlocks; } /** @@ -72,4 +76,19 @@ public final class LeafMetaData { public Sort getSort() { return sort; } + + /** + * Returns true iff this index contains blocks created with {@link + * IndexWriter#addDocument(Iterable)} or it's corresponding update methods with at least 2 or more + * documents per call. 
Note: This property was not recorded before {@link Version#LUCENE_9_9_0} + * this method will return false for all leaves written before {@link Version#LUCENE_9_9_0} + * + * @see IndexWriter#updateDocuments(Term, Iterable) + * @see IndexWriter#updateDocuments(Query, Iterable) + * @see IndexWriter#softUpdateDocuments(Term, Iterable, Field...) + * @see IndexWriter#addDocuments(Iterable) + */ + public boolean hasBlocks() { + return hasBlocks; + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java index df3bc759c47..80273cdebae 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java @@ -169,9 +169,10 @@ public class ParallelLeafReader extends LeafReader { } Version minVersion = Version.LATEST; + boolean hasBlocks = false; for (final LeafReader reader : this.parallelReaders) { Version leafVersion = reader.getMetaData().getMinVersion(); - + hasBlocks |= reader.getMetaData().hasBlocks(); if (leafVersion == null) { minVersion = null; break; @@ -181,7 +182,7 @@ public class ParallelLeafReader extends LeafReader { } fieldInfos = builder.finish(); - this.metaData = new LeafMetaData(createdVersionMajor, minVersion, indexSort); + this.metaData = new LeafMetaData(createdVersionMajor, minVersion, indexSort, hasBlocks); // do this finally so any Exceptions occurred before don't affect refcounts: for (LeafReader reader : completeReaderSet) { diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentInfo.java b/lucene/core/src/java/org/apache/lucene/index/SegmentInfo.java index de87d73fa10..b2819702315 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentInfo.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentInfo.java @@ -81,6 +81,8 @@ public final class SegmentInfo { // into this segment Version minVersion; + private boolean hasBlocks; + void 
setDiagnostics(Map diagnostics) { this.diagnostics = Map.copyOf(Objects.requireNonNull(diagnostics)); } @@ -117,6 +119,7 @@ public final class SegmentInfo { String name, int maxDoc, boolean isCompoundFile, + boolean hasBlocks, Codec codec, Map diagnostics, byte[] id, @@ -129,6 +132,7 @@ public final class SegmentInfo { this.name = Objects.requireNonNull(name); this.maxDoc = maxDoc; this.isCompoundFile = isCompoundFile; + this.hasBlocks = hasBlocks; this.codec = codec; this.diagnostics = Map.copyOf(Objects.requireNonNull(diagnostics)); this.id = id; @@ -153,6 +157,20 @@ public final class SegmentInfo { return isCompoundFile; } + /** + * Returns true if this segment contains documents written as blocks. + * + * @see LeafMetaData#hasBlocks() + */ + public boolean getHasBlocks() { + return hasBlocks; + } + + /** Sets the hasBlocks property to true. This setting is viral and can't be unset. */ + void setHasBlocks() { + hasBlocks = true; + } + /** Can only be called once. */ public void setCodec(Codec codec) { assert this.codec == null; diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java b/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java index 7a255c456de..071f3b1dfaa 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java @@ -83,7 +83,11 @@ public final class SegmentReader extends CodecReader { this.si = si.clone(); this.originalSi = si; this.metaData = - new LeafMetaData(createdVersionMajor, si.info.getMinVersion(), si.info.getIndexSort()); + new LeafMetaData( + createdVersionMajor, + si.info.getMinVersion(), + si.info.getIndexSort(), + si.info.getHasBlocks()); // We pull liveDocs/DV updates from disk: this.isNRT = false; diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java index 741031dc403..6e3057585ed 100644 --- 
a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java @@ -336,7 +336,11 @@ public final class SortingCodecReader extends FilterCodecReader { public static CodecReader wrap(CodecReader reader, Sorter.DocMap docMap, Sort sort) { LeafMetaData metaData = reader.getMetaData(); LeafMetaData newMetaData = - new LeafMetaData(metaData.getCreatedVersionMajor(), metaData.getMinVersion(), sort); + new LeafMetaData( + metaData.getCreatedVersionMajor(), + metaData.getMinVersion(), + sort, + metaData.hasBlocks()); if (docMap == null) { // the reader is already sorted return new FilterCodecReader(reader) { diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index 946e496ab94..816097b3a66 100644 --- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-org.apache.lucene.codecs.lucene95.Lucene95Codec +org.apache.lucene.codecs.lucene99.Lucene99Codec diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java index 8b42d3026ba..8ca7fb8a41d 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java @@ -18,8 +18,7 @@ package org.apache.lucene.codecs.lucene90; import com.carrotsearch.randomizedtesting.generators.RandomPicks; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.lucene95.Lucene95Codec; -import org.apache.lucene.codecs.lucene95.Lucene95Codec.Mode; +import org.apache.lucene.codecs.lucene99.Lucene99Codec; import org.apache.lucene.document.Document; import org.apache.lucene.document.StoredField; import org.apache.lucene.index.DirectoryReader; @@ -32,7 +31,7 @@ import org.apache.lucene.tests.index.BaseStoredFieldsFormatTestCase; public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase { @Override protected Codec getCodec() { - return new Lucene95Codec(Mode.BEST_COMPRESSION); + return new Lucene99Codec(Lucene99Codec.Mode.BEST_COMPRESSION); } /** @@ -43,7 +42,7 @@ public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFie for (int i = 0; i < 10; i++) { IndexWriterConfig iwc = newIndexWriterConfig(); iwc.setCodec( - new Lucene95Codec(RandomPicks.randomFrom(random(), Lucene95Codec.Mode.values()))); + new Lucene99Codec(RandomPicks.randomFrom(random(), Lucene99Codec.Mode.values()))); IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig()); Document doc = new Document(); doc.add(new StoredField("field1", "value1")); @@ -73,7 +72,7 @@ public class TestLucene90StoredFieldsFormatHighCompression 
extends BaseStoredFie expectThrows( NullPointerException.class, () -> { - new Lucene95Codec(null); + new Lucene99Codec(null); }); expectThrows( diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene95/TestLucene95HnswVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene95/TestLucene95HnswVectorsFormat.java index 1dd7c0de553..2cb61e3ecf3 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene95/TestLucene95HnswVectorsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene95/TestLucene95HnswVectorsFormat.java @@ -17,6 +17,7 @@ package org.apache.lucene.codecs.lucene95; import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; import org.apache.lucene.tests.util.TestUtil; @@ -28,16 +29,16 @@ public class TestLucene95HnswVectorsFormat extends BaseKnnVectorsFormatTestCase } public void testToString() { - Lucene95Codec customCodec = - new Lucene95Codec() { + FilterCodec customCodec = + new FilterCodec("foo", Codec.getDefault()) { @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + public KnnVectorsFormat knnVectorsFormat() { return new Lucene95HnswVectorsFormat(10, 20); } }; String expectedString = "Lucene95HnswVectorsFormat(name=Lucene95HnswVectorsFormat, maxConn=10, beamWidth=20)"; - assertEquals(expectedString, customCodec.getKnnVectorsFormatForField("bogus_field").toString()); + assertEquals(expectedString, customCodec.knnVectorsFormat().toString()); } public void testLimits() { diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99SegmentInfoFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99SegmentInfoFormat.java new file mode 100644 index 00000000000..ebb2a0013e2 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99SegmentInfoFormat.java @@ -0,0 
+1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.codecs.lucene99;
+
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.tests.index.BaseSegmentInfoFormatTestCase;
+import org.apache.lucene.tests.util.TestUtil;
+import org.apache.lucene.util.Version;
+
+public class TestLucene99SegmentInfoFormat extends BaseSegmentInfoFormatTestCase {
+
+  @Override
+  protected Version[] getVersions() {
+    return new Version[] {Version.LATEST};
+  }
+
+  @Override
+  protected Codec getCodec() {
+    return TestUtil.getDefaultCodec();
+  }
+}
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java
index 4ada596067a..666ec1e44e9 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java
@@ -1815,4 +1815,75 @@ public class TestAddIndexes extends LuceneTestCase {
     assertEquals(wrappedReader.numDocs(), writer.getDocStats().maxDoc);
     IOUtils.close(reader, writer, dir3, dir2, dir1);
   }
+
+  public void testAddIndicesWithBlocks() throws IOException {
+    boolean[] addHasBlocksPerm = {true, true, false, false};
+    boolean[] baseHasBlocksPerm = {true, false, true, false};
+    for (int perm = 0; perm < addHasBlocksPerm.length; perm++) {
+      boolean addHasBlocks = addHasBlocksPerm[perm];
+      boolean baseHasBlocks = baseHasBlocksPerm[perm];
+      try (Directory dir = newDirectory()) {
+        try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
+          int numBlocks = random().nextInt(1, 10);
+          for (int i = 0; i < numBlocks; i++) {
+            int numDocs = baseHasBlocks ? random().nextInt(2, 10) : 1; // one doc is no block
+            List<Document> docs = new ArrayList<>();
+            for (int j = 0; j < numDocs; j++) {
+              Document doc = new Document();
+              int value = random().nextInt(5);
+              doc.add(new StringField("value", "" + value, Field.Store.YES));
+              docs.add(doc);
+            }
+            writer.addDocuments(docs);
+          }
+          writer.commit();
+        }
+
+        try (Directory addDir = newDirectory()) {
+          int numBlocks = random().nextInt(1, 10);
+          try (RandomIndexWriter writer = new RandomIndexWriter(random(), addDir)) {
+            for (int i = 0; i < numBlocks; i++) {
+              int numDocs = addHasBlocks ? random().nextInt(2, 10) : 1; // one doc is no block
+              List<Document> docs = new ArrayList<>();
+              for (int j = 0; j < numDocs; j++) {
+                Document doc = new Document();
+                int value = random().nextInt(5);
+                doc.add(new StringField("value", "" + value, Field.Store.YES));
+                docs.add(doc);
+              }
+              writer.addDocuments(docs);
+            }
+            writer.commit();
+          }
+
+          try (IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig())) {
+            if (random().nextBoolean()) {
+              writer.addIndexes(addDir);
+            } else {
+              try (DirectoryReader reader = DirectoryReader.open(addDir)) {
+                CodecReader[] readers = new CodecReader[(reader.leaves().size())];
+                for (int i = 0; i < readers.length; i++) {
+                  readers[i] = (CodecReader) reader.leaves().get(i).reader();
+                }
+                writer.addIndexes(readers);
+              }
+            }
+            writer.forceMerge(1, true);
+          }
+
+          try (DirectoryReader reader = DirectoryReader.open(dir)) {
+            SegmentReader codecReader = (SegmentReader) reader.leaves().get(0).reader();
+            assertEquals(1, reader.leaves().size());
+            if (addHasBlocks || baseHasBlocks) {
+              assertTrue(
+                  "addHasBlocks: " + addHasBlocks + " baseHasBlocks: " + baseHasBlocks,
+                  codecReader.getSegmentInfo().info.getHasBlocks());
+            } else {
+              assertFalse(codecReader.getSegmentInfo().info.getHasBlocks());
+            }
+          }
+        }
+      }
+    }
+  }
 }
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java b/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java
index 421ca5b3d90..40b00975805 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java
@@ -244,6 +244,7 @@ public class TestCodecs extends LuceneTestCase {
             SEGMENT,
             10000,
             false,
+            false,
             codec,
             Collections.emptyMap(),
             StringHelper.randomId(),
@@ -322,6 +323,7 @@ public class TestCodecs extends LuceneTestCase {
             SEGMENT,
             10000,
             false,
+            false,
             codec,
             Collections.emptyMap(),
             StringHelper.randomId(),
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDoc.java b/lucene/core/src/test/org/apache/lucene/index/TestDoc.java
index be130c2e98a..9a8eda06c88 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestDoc.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestDoc.java
@@ -223,6 +223,7 @@ public class TestDoc extends LuceneTestCase {
             merged,
             -1,
             false,
+            false,
             codec,
             Collections.emptyMap(),
             StringHelper.randomId(),
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
index e226322696b..4b8ce1f70e2 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
@@ -47,6 +47,7 @@ import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.concurrent.atomic.AtomicReference;
 import java.util.function.Predicate;
+import java.util.function.Supplier;
 import java.util.stream.Collectors;
 import org.apache.lucene.analysis.Analyzer;
 import
org.apache.lucene.analysis.TokenStream;
@@ -1733,14 +1734,79 @@ public class TestIndexWriter extends LuceneTestCase {
     d.close();
   }
 
-  public void testOnlyUpdateDocuments() throws Exception {
+  public void testHasBlocksMergeFullyDelSegments() throws IOException {
+    Supplier<Document> documentSupplier =
+        () -> {
+          Document doc = new Document();
+          doc.add(new StringField("foo", "bar", Field.Store.NO));
+          return doc;
+        };
+    try (Directory dir = newDirectory()) {
+      try (IndexWriter writer =
+          new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())))) {
+        final List<Document> docs = new ArrayList<>();
+        docs.add(documentSupplier.get());
+        docs.add(documentSupplier.get());
+        writer.updateDocuments(new Term("foo", "bar"), docs);
+        writer.commit();
+        if (random().nextBoolean()) {
+          writer.updateDocuments(new Term("foo", "bar"), docs);
+          writer.commit(); // second segment
+        }
+        writer.updateDocument(new Term("foo", "bar"), documentSupplier.get());
+        if (random().nextBoolean()) {
+          writer.forceMergeDeletes(true);
+        } else {
+          writer.forceMerge(1, true);
+        }
+        writer.commit();
+        try (DirectoryReader reader = DirectoryReader.open(dir)) {
+          assertEquals(1, reader.leaves().size());
+          assertFalse(
+              "hasBlocks should be cleared",
+              reader.leaves().get(0).reader().getMetaData().hasBlocks());
+        }
+      }
+    }
+  }
+
+  public void testCarryOverHasBlocks() throws Exception {
     Directory dir = newDirectory();
     IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
 
     final List<Document> docs = new ArrayList<>();
     docs.add(new Document());
     w.updateDocuments(new Term("foo", "bar"), docs);
-    w.close();
+    w.commit();
+    try (DirectoryReader reader = DirectoryReader.open(dir)) {
+      SegmentCommitInfo segmentInfo =
+          ((SegmentReader) reader.leaves().get(0).reader()).getSegmentInfo();
+      assertFalse(segmentInfo.info.getHasBlocks()); // single doc: not recorded as a block
+    }
+
+    docs.add(new Document()); // now we have 2 docs
+    w.updateDocuments(new Term("foo", "bar"), docs);
+    w.commit();
+    try (DirectoryReader reader = DirectoryReader.open(dir)) {
+      assertEquals(2, reader.leaves().size());
+      SegmentCommitInfo segmentInfo =
+          ((SegmentReader) reader.leaves().get(0).reader()).getSegmentInfo();
+      assertFalse(
+          "codec: " + segmentInfo.info.getCodec().toString(), segmentInfo.info.getHasBlocks());
+      segmentInfo = ((SegmentReader) reader.leaves().get(1).reader()).getSegmentInfo();
+      assertTrue(
+          "codec: " + segmentInfo.info.getCodec().toString(), segmentInfo.info.getHasBlocks());
+    }
+    w.forceMerge(1, true);
+    w.commit();
+    try (DirectoryReader reader = DirectoryReader.open(dir)) {
+      assertEquals(1, reader.leaves().size());
+      SegmentCommitInfo segmentInfo =
+          ((SegmentReader) reader.leaves().get(0).reader()).getSegmentInfo();
+      assertTrue(
+          "codec: " + segmentInfo.info.getCodec().toString(), segmentInfo.info.getHasBlocks());
+    }
+    w.close(); // close the writer before closing its directory
     dir.close();
   }
 
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java b/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java
index 372384df557..ae7e25c109a 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java
@@ -29,8 +29,8 @@ import java.util.List;
 import java.util.Set;
 import java.util.concurrent.CountDownLatch;
 import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.FilterCodec;
 import org.apache.lucene.codecs.KnnVectorsFormat;
-import org.apache.lucene.codecs.lucene95.Lucene95Codec;
 import org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat;
 import org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsReader;
 import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
@@ -48,6 +48,7 @@ import org.apache.lucene.search.SearcherManager;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.tests.util.TestUtil;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils; @@ -78,10 +79,15 @@ public class TestKnnGraph extends LuceneTestCase { } codec = - new Lucene95Codec() { + new FilterCodec(TestUtil.getDefaultCodec().getName(), TestUtil.getDefaultCodec()) { @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new Lucene95HnswVectorsFormat(M, HnswGraphBuilder.DEFAULT_BEAM_WIDTH); + public KnnVectorsFormat knnVectorsFormat() { + return new PerFieldKnnVectorsFormat() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return new Lucene95HnswVectorsFormat(M, HnswGraphBuilder.DEFAULT_BEAM_WIDTH); + } + }; } }; @@ -90,10 +96,15 @@ public class TestKnnGraph extends LuceneTestCase { vectorEncoding = randomVectorEncoding(); codec = - new Lucene95Codec() { + new FilterCodec(TestUtil.getDefaultCodec().getName(), TestUtil.getDefaultCodec()) { @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new Lucene95HnswVectorsFormat(M, HnswGraphBuilder.DEFAULT_BEAM_WIDTH); + public KnnVectorsFormat knnVectorsFormat() { + return new PerFieldKnnVectorsFormat() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return new Lucene95HnswVectorsFormat(M, HnswGraphBuilder.DEFAULT_BEAM_WIDTH); + } + }; } }; @@ -101,10 +112,15 @@ public class TestKnnGraph extends LuceneTestCase { float32Codec = codec; } else { float32Codec = - new Lucene95Codec() { + new FilterCodec(TestUtil.getDefaultCodec().getName(), TestUtil.getDefaultCodec()) { @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new Lucene95HnswVectorsFormat(M, HnswGraphBuilder.DEFAULT_BEAM_WIDTH); + public KnnVectorsFormat knnVectorsFormat() { + return new PerFieldKnnVectorsFormat() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return new Lucene95HnswVectorsFormat(M, HnswGraphBuilder.DEFAULT_BEAM_WIDTH); + } + }; } }; } diff --git 
a/lucene/core/src/test/org/apache/lucene/index/TestMergePolicy.java b/lucene/core/src/test/org/apache/lucene/index/TestMergePolicy.java index 4f455fdd5db..a05dbd64203 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestMergePolicy.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestMergePolicy.java @@ -149,6 +149,7 @@ public class TestMergePolicy extends LuceneTestCase { TestUtil.randomSimpleString(random()), // name random().nextInt(1000), // maxDoc random().nextBoolean(), // isCompoundFile + false, null, // codec Collections.emptyMap(), // diagnostics TestUtil.randomSimpleString( // id diff --git a/lucene/core/src/test/org/apache/lucene/index/TestOneMergeWrappingMergePolicy.java b/lucene/core/src/test/org/apache/lucene/index/TestOneMergeWrappingMergePolicy.java index 94398c51a72..9256002d123 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestOneMergeWrappingMergePolicy.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestOneMergeWrappingMergePolicy.java @@ -131,6 +131,7 @@ public class TestOneMergeWrappingMergePolicy extends LuceneTestCase { TestUtil.randomSimpleString(random()), // name random().nextInt(), // maxDoc random().nextBoolean(), // isCompoundFile + false, null, // codec Collections.emptyMap(), // diagnostics TestUtil.randomSimpleString( // id diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPendingDeletes.java b/lucene/core/src/test/org/apache/lucene/index/TestPendingDeletes.java index 5cba67078f7..555b32ecdea 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestPendingDeletes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestPendingDeletes.java @@ -46,6 +46,7 @@ public class TestPendingDeletes extends LuceneTestCase { "test", 10, false, + false, Codec.getDefault(), Collections.emptyMap(), StringHelper.randomId(), @@ -92,6 +93,7 @@ public class TestPendingDeletes extends LuceneTestCase { "test", 6, false, + false, Codec.getDefault(), Collections.emptyMap(), StringHelper.randomId(), @@ 
-162,6 +164,7 @@ public class TestPendingDeletes extends LuceneTestCase { "test", 3, false, + false, Codec.getDefault(), Collections.emptyMap(), StringHelper.randomId(), diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java b/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java index 0749203dd2f..8554dd3f40a 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java @@ -161,6 +161,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { "test", 10, false, + false, Codec.getDefault(), Collections.emptyMap(), StringHelper.randomId(), diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java b/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java index 87369a770a4..a4521649b89 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java @@ -76,6 +76,7 @@ public class TestSegmentInfos extends LuceneTestCase { "_0", 1, false, + false, Codec.getDefault(), Collections.emptyMap(), id, @@ -110,6 +111,7 @@ public class TestSegmentInfos extends LuceneTestCase { "_0", 1, false, + false, Codec.getDefault(), Collections.emptyMap(), id, @@ -129,6 +131,7 @@ public class TestSegmentInfos extends LuceneTestCase { "_1", 1, false, + false, Codec.getDefault(), Collections.emptyMap(), id, @@ -173,6 +176,7 @@ public class TestSegmentInfos extends LuceneTestCase { "TEST", 10000, false, + false, codec, Collections.emptyMap(), StringHelper.randomId(), @@ -191,6 +195,7 @@ public class TestSegmentInfos extends LuceneTestCase { "TEST", 10000, false, + false, codec, diagnostics, StringHelper.randomId(), @@ -216,6 +221,7 @@ public class TestSegmentInfos extends LuceneTestCase { "TEST", 10000, false, + false, codec, Collections.emptyMap(), StringHelper.randomId(), @@ -241,6 +247,7 @@ public class 
TestSegmentInfos extends LuceneTestCase { "TEST", 10000, false, + false, codec, diagnostics, StringHelper.randomId(), @@ -275,6 +282,7 @@ public class TestSegmentInfos extends LuceneTestCase { "_0", 1, false, + false, Codec.getDefault(), Collections.emptyMap(), StringHelper.randomId(), @@ -321,6 +329,7 @@ public class TestSegmentInfos extends LuceneTestCase { "_0", 1, false, + false, Codec.getDefault(), Collections.emptyMap(), id, @@ -340,6 +349,7 @@ public class TestSegmentInfos extends LuceneTestCase { "_1", 1, false, + false, Codec.getDefault(), Collections.emptyMap(), id, @@ -411,6 +421,7 @@ public class TestSegmentInfos extends LuceneTestCase { "TEST", 10000, false, + false, codec, diagnostics, StringHelper.randomId(), @@ -428,6 +439,7 @@ public class TestSegmentInfos extends LuceneTestCase { "TEST", 10000, false, + false, codec, diagnostics, StringHelper.randomId(), diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java b/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java index 72f65c67c5c..da17f332e77 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSegmentMerger.java @@ -91,6 +91,7 @@ public class TestSegmentMerger extends LuceneTestCase { mergedSegment, -1, false, + false, codec, Collections.emptyMap(), StringHelper.randomId(), diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSegmentToThreadMapping.java b/lucene/core/src/test/org/apache/lucene/index/TestSegmentToThreadMapping.java index cdb01cbebfb..f6aee2f3f27 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSegmentToThreadMapping.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSegmentToThreadMapping.java @@ -140,7 +140,7 @@ public class TestSegmentToThreadMapping extends LuceneTestCase { @Override public LeafMetaData getMetaData() { - return new LeafMetaData(Version.LATEST.major, Version.LATEST, null); + return new 
LeafMetaData(Version.LATEST.major, Version.LATEST, null, false); } @Override diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java index a18f0d2a2fe..284ca1dc13d 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java @@ -38,8 +38,8 @@ import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.stream.Collectors; +import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.KnnVectorsFormat; -import org.apache.lucene.codecs.lucene95.Lucene95Codec; import org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat; import org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsReader; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; @@ -71,6 +71,7 @@ import org.apache.lucene.search.SortField; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BitSet; import org.apache.lucene.util.Bits; @@ -156,10 +157,17 @@ abstract class HnswGraphTestCase extends LuceneTestCase { IndexWriterConfig iwc = new IndexWriterConfig() .setCodec( - new Lucene95Codec() { + new FilterCodec( + TestUtil.getDefaultCodec().getName(), TestUtil.getDefaultCodec()) { + @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new Lucene95HnswVectorsFormat(M, beamWidth); + public KnnVectorsFormat knnVectorsFormat() { + return new PerFieldKnnVectorsFormat() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return new Lucene95HnswVectorsFormat(M, beamWidth); + } + }; } }) // set a random merge policy @@ -222,10 +230,16 @@ 
abstract class HnswGraphTestCase extends LuceneTestCase { IndexWriterConfig iwc = new IndexWriterConfig() .setCodec( - new Lucene95Codec() { + new FilterCodec( + TestUtil.getDefaultCodec().getName(), TestUtil.getDefaultCodec()) { @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new Lucene95HnswVectorsFormat(M, beamWidth); + public KnnVectorsFormat knnVectorsFormat() { + return new PerFieldKnnVectorsFormat() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return new Lucene95HnswVectorsFormat(M, beamWidth); + } + }; } }); try (IndexWriter iw = new IndexWriter(dir, iwc)) { @@ -278,19 +292,29 @@ abstract class HnswGraphTestCase extends LuceneTestCase { IndexWriterConfig iwc = new IndexWriterConfig() .setCodec( - new Lucene95Codec() { + new FilterCodec(TestUtil.getDefaultCodec().getName(), TestUtil.getDefaultCodec()) { @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new Lucene95HnswVectorsFormat(M, beamWidth); + public KnnVectorsFormat knnVectorsFormat() { + return new PerFieldKnnVectorsFormat() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return new Lucene95HnswVectorsFormat(M, beamWidth); + } + }; } }); IndexWriterConfig iwc2 = new IndexWriterConfig() .setCodec( - new Lucene95Codec() { + new FilterCodec(TestUtil.getDefaultCodec().getName(), TestUtil.getDefaultCodec()) { @Override - public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new Lucene95HnswVectorsFormat(M, beamWidth); + public KnnVectorsFormat knnVectorsFormat() { + return new PerFieldKnnVectorsFormat() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return new Lucene95HnswVectorsFormat(M, beamWidth); + } + }; } }) .setIndexSort(new Sort(new SortField("sortkey", SortField.Type.LONG))); diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java 
b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java index ac26e35b393..aeb68267732 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java @@ -215,7 +215,7 @@ public class TermVectorLeafReader extends LeafReader { @Override public LeafMetaData getMetaData() { - return new LeafMetaData(Version.LATEST.major, null, null); + return new LeafMetaData(Version.LATEST.major, null, null, false); } @Override diff --git a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 060da63d8b3..c66770fda5b 100644 --- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -2122,7 +2122,7 @@ public class MemoryIndex { @Override public LeafMetaData getMetaData() { - return new LeafMetaData(Version.LATEST.major, Version.LATEST, null); + return new LeafMetaData(Version.LATEST.major, Version.LATEST, null, false); } @Override diff --git a/lucene/misc/src/java/org/apache/lucene/misc/index/IndexSplitter.java b/lucene/misc/src/java/org/apache/lucene/misc/index/IndexSplitter.java index 20e8a8a0c46..3a9c4def640 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/index/IndexSplitter.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/index/IndexSplitter.java @@ -132,6 +132,7 @@ public class IndexSplitter { info.name, info.maxDoc(), info.getUseCompoundFile(), + info.getHasBlocks(), info.getCodec(), info.getDiagnostics(), info.getId(), diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java index 9cf270b72e8..682363f29cb 100644 --- 
a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java @@ -39,8 +39,9 @@ import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene95.Lucene95Codec; +import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.IntPoint; @@ -961,17 +962,23 @@ public class TestSuggestField extends LuceneTestCase { IndexWriterConfig iwc = newIndexWriterConfig(random(), analyzer); iwc.setMergePolicy(newLogMergePolicy()); Codec filterCodec = - new Lucene95Codec() { + new FilterCodec(TestUtil.getDefaultCodec().getName(), TestUtil.getDefaultCodec()) { CompletionPostingsFormat.FSTLoadMode fstLoadMode = RandomPicks.randomFrom(random(), CompletionPostingsFormat.FSTLoadMode.values()); PostingsFormat postingsFormat = new Completion90PostingsFormat(fstLoadMode); @Override - public PostingsFormat getPostingsFormatForField(String field) { - if (suggestFields.contains(field)) { - return postingsFormat; - } - return super.getPostingsFormatForField(field); + public PostingsFormat postingsFormat() { + return new PerFieldPostingsFormat() { + @Override + public PostingsFormat getPostingsFormatForField(String field) { + if (suggestFields.contains(field)) { + return postingsFormat; + } + return ((PerFieldPostingsFormat) delegate.postingsFormat()) + .getPostingsFormatForField(field); + } + }; } }; iwc.setCodec(filterCodec); diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/vector/ConfigurableMCodec.java 
b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/vector/ConfigurableMCodec.java index 5e88b866c46..f738149e635 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/vector/ConfigurableMCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/vector/ConfigurableMCodec.java @@ -18,8 +18,8 @@ package org.apache.lucene.tests.codecs.vector; import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.KnnVectorsFormat; -import org.apache.lucene.codecs.lucene95.Lucene95Codec; import org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat; +import org.apache.lucene.tests.util.TestUtil; /** * This codec allows customization of the number of connections made for an hnsw index. Increasing @@ -31,12 +31,12 @@ public class ConfigurableMCodec extends FilterCodec { private final KnnVectorsFormat knnVectorsFormat; public ConfigurableMCodec() { - super("ConfigurableMCodec", new Lucene95Codec()); + super("ConfigurableMCodec", TestUtil.getDefaultCodec()); knnVectorsFormat = new Lucene95HnswVectorsFormat(128, 100); } public ConfigurableMCodec(int maxConn) { - super("ConfigurableMCodec", new Lucene95Codec()); + super("ConfigurableMCodec", TestUtil.getDefaultCodec()); knnVectorsFormat = new Lucene95HnswVectorsFormat(maxConn, 100); } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseCompoundFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseCompoundFormatTestCase.java index be78a15db48..8d22046f296 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseCompoundFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseCompoundFormatTestCase.java @@ -672,6 +672,7 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest name, 10000, false, + false, Codec.getDefault(), Collections.emptyMap(), StringHelper.randomId(), diff --git 
a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseFieldInfoFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseFieldInfoFormatTestCase.java index b3bd9656171..886fef08604 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseFieldInfoFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseFieldInfoFormatTestCase.java @@ -401,6 +401,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes name, 10000, false, + false, Codec.getDefault(), Collections.emptyMap(), StringHelper.randomId(), diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java index b1550bb727a..968fd8ed61e 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java @@ -341,6 +341,7 @@ abstract class BaseIndexFileFormatTestCase extends LuceneTestCase { "_0", 1, false, + false, codec, Collections.emptyMap(), StringHelper.randomId(), diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseLiveDocsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseLiveDocsFormatTestCase.java index ec114f4f57f..64d3549c574 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseLiveDocsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseLiveDocsFormatTestCase.java @@ -131,6 +131,7 @@ public abstract class BaseLiveDocsFormatTestCase extends LuceneTestCase { "foo", maxDoc, random().nextBoolean(), + false, codec, Collections.emptyMap(), StringHelper.randomId(), diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseMergePolicyTestCase.java 
b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseMergePolicyTestCase.java index 4ed4c8dfeaa..6dee2ab7abc 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseMergePolicyTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseMergePolicyTestCase.java @@ -154,6 +154,7 @@ public abstract class BaseMergePolicyTestCase extends LuceneTestCase { TestUtil.randomSimpleString(random()), // name random().nextInt(Integer.MAX_VALUE), // maxDoc random().nextBoolean(), // isCompoundFile + false, null, // codec Collections.emptyMap(), // diagnostics TestUtil.randomSimpleString( // id @@ -236,6 +237,7 @@ public abstract class BaseMergePolicyTestCase extends LuceneTestCase { name, maxDoc, false, + false, TestUtil.getDefaultCodec(), Collections.emptyMap(), id, diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseSegmentInfoFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseSegmentInfoFormatTestCase.java index 665353d40e5..674edb89db8 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseSegmentInfoFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseSegmentInfoFormatTestCase.java @@ -67,6 +67,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT "_123", 1, false, + false, codec, Collections.emptyMap(), id, @@ -92,6 +93,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT "_123", 1, false, + false, codec, Collections.emptyMap(), id, @@ -135,6 +137,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT "_123", 1, false, + false, codec, diagnostics, id, @@ -171,6 +174,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT "_123", 1, false, + false, codec, Collections.emptyMap(), id, @@ -204,6 +208,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends 
BaseIndexFileFormatT "_123", 1, false, + false, codec, Collections.emptyMap(), id, @@ -231,6 +236,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT "_123", 1, false, + false, codec, Collections.emptyMap(), id, @@ -368,6 +374,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT "_123", 1, false, + false, codec, Collections.emptyMap(), id, @@ -408,6 +415,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT "_123", 1, false, + false, codec, Collections.emptyMap(), id, @@ -453,6 +461,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT "_123", 1, false, + false, codec, Collections.emptyMap(), id, @@ -498,6 +507,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT "_123", 1, false, + false, codec, Collections.emptyMap(), id, @@ -544,6 +554,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT "_123", 1, false, + false, codec, Collections.emptyMap(), id, @@ -614,6 +625,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT name, docCount, isCompoundFile, + false, codec, diagnostics, id, diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomPostingsTester.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomPostingsTester.java index 5eb48d31cb5..924bc4a130c 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomPostingsTester.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomPostingsTester.java @@ -696,6 +696,7 @@ public class RandomPostingsTester { "_0", maxDoc, false, + false, codec, Collections.emptyMap(), StringHelper.randomId(), diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/search/QueryUtils.java b/lucene/test-framework/src/java/org/apache/lucene/tests/search/QueryUtils.java index 5a454681f54..0df805b01e4 100644 --- 
a/lucene/test-framework/src/java/org/apache/lucene/tests/search/QueryUtils.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/search/QueryUtils.java @@ -293,7 +293,7 @@ public class QueryUtils { @Override public LeafMetaData getMetaData() { - return new LeafMetaData(Version.LATEST.major, Version.LATEST, null); + return new LeafMetaData(Version.LATEST.major, Version.LATEST, null, false); } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestRuleSetupAndRestoreClassEnv.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestRuleSetupAndRestoreClassEnv.java index 49881465ec2..4b0719d72e3 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestRuleSetupAndRestoreClassEnv.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestRuleSetupAndRestoreClassEnv.java @@ -38,7 +38,7 @@ import java.util.TimeZone; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene95.Lucene95Codec; +import org.apache.lucene.codecs.lucene99.Lucene99Codec; import org.apache.lucene.codecs.simpletext.SimpleTextCodec; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.tests.codecs.asserting.AssertingCodec; @@ -195,9 +195,9 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule { } else if ("Compressing".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 6 && !shouldAvoidCodec("Compressing"))) { codec = CompressingCodec.randomInstance(random); - } else if ("Lucene95".equals(TEST_CODEC) - || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene95"))) { - codec = new Lucene95Codec(RandomPicks.randomFrom(random, Lucene95Codec.Mode.values())); + } else if ("Lucene99".equals(TEST_CODEC) + || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene99"))) { + codec = new 
Lucene99Codec(RandomPicks.randomFrom(random, Lucene99Codec.Mode.values())); } else if (!"random".equals(TEST_CODEC)) { codec = Codec.forName(TEST_CODEC); } else if ("random".equals(TEST_POSTINGSFORMAT)) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java index dc08e37f095..9c0cdb7b4c2 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java @@ -55,8 +55,8 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat; import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat; -import org.apache.lucene.codecs.lucene95.Lucene95Codec; import org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99Codec; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; import org.apache.lucene.document.BinaryDocValuesField; @@ -1236,7 +1236,7 @@ public final class TestUtil { * different than {@link Codec#getDefault()} because that is randomized. */ public static Codec getDefaultCodec() { - return new Lucene95Codec(); + return new Lucene99Codec(); } /**