mirror of https://github.com/apache/lucene.git
Record if block API has been used in SegmentInfo (#12685)
If the add/updateDocuments(List<>) API is used, Lucene guarantees that all documents are indexed in the same segment with consecutive document IDs. This enables features such as nested documents. This change records the usage of this API in SegmentInfo and preserves this property across merges. Relates to #12665
This commit is contained in:
parent
bbf197fdc2
commit
6677109ee6
|
@ -154,9 +154,13 @@ API Changes
|
|||
|
||||
New Features
|
||||
---------------------
|
||||
|
||||
* GITHUB#12548: Added similarityToQueryVector API to compute vector similarity scores
|
||||
with DoubleValuesSource. (Shubham Chaudhary)
|
||||
|
||||
* GITHUB#12685: Lucene now records if documents have been indexed as blocks in SegmentInfo. This is recorded on a per
|
||||
segment basis and maintained across merges. The property is exposed via LeafReaderMetadata. (Simon Willnauer)
|
||||
|
||||
Improvements
|
||||
---------------------
|
||||
* GITHUB#12523: TaskExecutor waits for all tasks to complete before returning when Exceptions
|
||||
|
|
|
@ -34,6 +34,7 @@ module org.apache.lucene.backward_codecs {
|
|||
exports org.apache.lucene.backward_codecs.lucene91;
|
||||
exports org.apache.lucene.backward_codecs.lucene92;
|
||||
exports org.apache.lucene.backward_codecs.lucene94;
|
||||
exports org.apache.lucene.backward_codecs.lucene95;
|
||||
exports org.apache.lucene.backward_codecs.packed;
|
||||
exports org.apache.lucene.backward_codecs.store;
|
||||
|
||||
|
@ -55,5 +56,6 @@ module org.apache.lucene.backward_codecs {
|
|||
org.apache.lucene.backward_codecs.lucene90.Lucene90Codec,
|
||||
org.apache.lucene.backward_codecs.lucene91.Lucene91Codec,
|
||||
org.apache.lucene.backward_codecs.lucene92.Lucene92Codec,
|
||||
org.apache.lucene.backward_codecs.lucene94.Lucene94Codec;
|
||||
org.apache.lucene.backward_codecs.lucene94.Lucene94Codec,
|
||||
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec;
|
||||
}
|
||||
|
|
|
@ -307,6 +307,7 @@ public class Lucene70SegmentInfoFormat extends SegmentInfoFormat {
|
|||
segment,
|
||||
docCount,
|
||||
isCompoundFile,
|
||||
false,
|
||||
null,
|
||||
diagnostics,
|
||||
segmentID,
|
||||
|
|
|
@ -164,6 +164,7 @@ public class Lucene86SegmentInfoFormat extends SegmentInfoFormat {
|
|||
segment,
|
||||
docCount,
|
||||
isCompoundFile,
|
||||
false,
|
||||
null,
|
||||
diagnostics,
|
||||
segmentID,
|
||||
|
|
|
@ -36,7 +36,6 @@ import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat;
|
|||
import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
|
||||
|
@ -143,7 +142,7 @@ public class Lucene90Codec extends Codec {
|
|||
}
|
||||
|
||||
@Override
|
||||
public final SegmentInfoFormat segmentInfoFormat() {
|
||||
public SegmentInfoFormat segmentInfoFormat() {
|
||||
return segmentInfosFormat;
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,179 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.backward_codecs.lucene90;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.SegmentInfoFormat;
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.SegmentInfo;
|
||||
import org.apache.lucene.index.SegmentInfos;
|
||||
import org.apache.lucene.index.SortFieldProvider;
|
||||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.search.SortField;
|
||||
import org.apache.lucene.store.ChecksumIndexInput;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Lucene 9.0 Segment info format.
|
||||
*
|
||||
* <p>Files:
|
||||
*
|
||||
* <ul>
|
||||
* <li><code>.si</code>: Header, SegVersion, SegSize, IsCompoundFile, Diagnostics, Files,
|
||||
* Attributes, IndexSort, Footer
|
||||
* </ul>
|
||||
*
|
||||
* Data types:
|
||||
*
|
||||
* <ul>
|
||||
* <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
|
||||
* <li>SegSize --> {@link DataOutput#writeInt Int32}
|
||||
* <li>SegVersion --> {@link DataOutput#writeString String}
|
||||
* <li>SegMinVersion --> {@link DataOutput#writeString String}
|
||||
* <li>Files --> {@link DataOutput#writeSetOfStrings Set<String>}
|
||||
* <li>Diagnostics,Attributes --> {@link DataOutput#writeMapOfStrings Map<String,String>}
|
||||
* <li>IsCompoundFile --> {@link DataOutput#writeByte Int8}
|
||||
* <li>IndexSort --> {@link DataOutput#writeVInt Int32} count, followed by {@code count}
|
||||
* SortField
|
||||
* <li>SortField --> {@link DataOutput#writeString String} sort class, followed by a per-sort
|
||||
* bytestream (see {@link SortFieldProvider#readSortField(DataInput)})
|
||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}
|
||||
* </ul>
|
||||
*
|
||||
* Field Descriptions:
|
||||
*
|
||||
* <ul>
|
||||
* <li>SegVersion is the code version that created the segment.
|
||||
* <li>SegMinVersion is the minimum code version that contributed documents to the segment.
|
||||
* <li>SegSize is the number of documents contained in the segment index.
|
||||
* <li>IsCompoundFile records whether the segment is written as a compound file or not. If this is
|
||||
* -1, the segment is not a compound file. If it is 1, the segment is a compound file.
|
||||
* <li>The Diagnostics Map is privately written by {@link IndexWriter}, as a debugging aid, for
|
||||
* each segment it creates. It includes metadata like the current Lucene version, OS, Java
|
||||
* version, why the segment was created (merge, flush, addIndexes), etc.
|
||||
* <li>Files is a list of files referred to by this segment.
|
||||
* </ul>
|
||||
*
|
||||
* @see SegmentInfos
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class Lucene90SegmentInfoFormat extends SegmentInfoFormat {
|
||||
|
||||
/** File extension used to store {@link SegmentInfo}. */
|
||||
public static final String SI_EXTENSION = "si";
|
||||
|
||||
static final String CODEC_NAME = "Lucene90SegmentInfo";
|
||||
static final int VERSION_START = 0;
|
||||
static final int VERSION_CURRENT = VERSION_START;
|
||||
|
||||
/** Sole constructor. */
|
||||
public Lucene90SegmentInfoFormat() {}
|
||||
|
||||
@Override
|
||||
public SegmentInfo read(Directory dir, String segment, byte[] segmentID, IOContext context)
|
||||
throws IOException {
|
||||
final String fileName = IndexFileNames.segmentFileName(segment, "", SI_EXTENSION);
|
||||
try (ChecksumIndexInput input = dir.openChecksumInput(fileName)) {
|
||||
Throwable priorE = null;
|
||||
SegmentInfo si = null;
|
||||
try {
|
||||
CodecUtil.checkIndexHeader(
|
||||
input, CODEC_NAME, VERSION_START, VERSION_CURRENT, segmentID, "");
|
||||
si = parseSegmentInfo(dir, input, segment, segmentID);
|
||||
} catch (Throwable exception) {
|
||||
priorE = exception;
|
||||
} finally {
|
||||
CodecUtil.checkFooter(input, priorE);
|
||||
}
|
||||
return si;
|
||||
}
|
||||
}
|
||||
|
||||
private SegmentInfo parseSegmentInfo(
|
||||
Directory dir, DataInput input, String segment, byte[] segmentID) throws IOException {
|
||||
final Version version = Version.fromBits(input.readInt(), input.readInt(), input.readInt());
|
||||
byte hasMinVersion = input.readByte();
|
||||
final Version minVersion;
|
||||
switch (hasMinVersion) {
|
||||
case 0:
|
||||
minVersion = null;
|
||||
break;
|
||||
case 1:
|
||||
minVersion = Version.fromBits(input.readInt(), input.readInt(), input.readInt());
|
||||
break;
|
||||
default:
|
||||
throw new CorruptIndexException("Illegal boolean value " + hasMinVersion, input);
|
||||
}
|
||||
|
||||
final int docCount = input.readInt();
|
||||
if (docCount < 0) {
|
||||
throw new CorruptIndexException("invalid docCount: " + docCount, input);
|
||||
}
|
||||
final boolean isCompoundFile = input.readByte() == SegmentInfo.YES;
|
||||
|
||||
final Map<String, String> diagnostics = input.readMapOfStrings();
|
||||
final Set<String> files = input.readSetOfStrings();
|
||||
final Map<String, String> attributes = input.readMapOfStrings();
|
||||
|
||||
int numSortFields = input.readVInt();
|
||||
Sort indexSort;
|
||||
if (numSortFields > 0) {
|
||||
SortField[] sortFields = new SortField[numSortFields];
|
||||
for (int i = 0; i < numSortFields; i++) {
|
||||
String name = input.readString();
|
||||
sortFields[i] = SortFieldProvider.forName(name).readSortField(input);
|
||||
}
|
||||
indexSort = new Sort(sortFields);
|
||||
} else if (numSortFields < 0) {
|
||||
throw new CorruptIndexException("invalid index sort field count: " + numSortFields, input);
|
||||
} else {
|
||||
indexSort = null;
|
||||
}
|
||||
|
||||
SegmentInfo si =
|
||||
new SegmentInfo(
|
||||
dir,
|
||||
version,
|
||||
minVersion,
|
||||
segment,
|
||||
docCount,
|
||||
isCompoundFile,
|
||||
false,
|
||||
null,
|
||||
diagnostics,
|
||||
segmentID,
|
||||
attributes,
|
||||
indexSort);
|
||||
si.setFiles(files);
|
||||
return si;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOException {
|
||||
throw new UnsupportedOperationException("Old formats can't be used for writing");
|
||||
}
|
||||
}
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.backward_codecs.lucene91;
|
|||
|
||||
import java.util.Objects;
|
||||
import org.apache.lucene.backward_codecs.lucene90.Lucene90FieldInfosFormat;
|
||||
import org.apache.lucene.backward_codecs.lucene90.Lucene90SegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.CompoundFormat;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
|
@ -37,7 +38,6 @@ import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat;
|
|||
import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
|
||||
|
@ -141,7 +141,7 @@ public class Lucene91Codec extends Codec {
|
|||
}
|
||||
|
||||
@Override
|
||||
public final SegmentInfoFormat segmentInfoFormat() {
|
||||
public SegmentInfoFormat segmentInfoFormat() {
|
||||
return segmentInfosFormat;
|
||||
}
|
||||
|
||||
|
|
|
@ -141,9 +141,9 @@
|
|||
* <p>Each segment index maintains the following:
|
||||
*
|
||||
* <ul>
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat Segment info}. This
|
||||
* contains metadata about a segment, such as the number of documents, what files it uses, and
|
||||
* information about how the segment is sorted
|
||||
* <li>{@link org.apache.lucene.backward_codecs.lucene90.Lucene90SegmentInfoFormat Segment info}.
|
||||
* This contains metadata about a segment, such as the number of documents, what files it
|
||||
* uses, and information about how the segment is sorted
|
||||
* <li>{@link org.apache.lucene.backward_codecs.lucene90.Lucene90FieldInfosFormat Field names}.
|
||||
* This contains metadata about the set of named fields used in the index.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}.
|
||||
|
@ -229,7 +229,7 @@
|
|||
* file.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat Segment Info}</td>
|
||||
* <td>{@link org.apache.lucene.backward_codecs.lucene90.Lucene90SegmentInfoFormat Segment Info}</td>
|
||||
* <td>.si</td>
|
||||
* <td>Stores metadata about a segment</td>
|
||||
* </tr>
|
||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.backward_codecs.lucene92;
|
|||
|
||||
import java.util.Objects;
|
||||
import org.apache.lucene.backward_codecs.lucene90.Lucene90FieldInfosFormat;
|
||||
import org.apache.lucene.backward_codecs.lucene90.Lucene90SegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.CompoundFormat;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
|
@ -37,7 +38,6 @@ import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat;
|
|||
import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
|
||||
|
@ -144,7 +144,7 @@ public class Lucene92Codec extends Codec {
|
|||
}
|
||||
|
||||
@Override
|
||||
public final SegmentInfoFormat segmentInfoFormat() {
|
||||
public SegmentInfoFormat segmentInfoFormat() {
|
||||
return segmentInfosFormat;
|
||||
}
|
||||
|
||||
|
|
|
@ -141,9 +141,9 @@
|
|||
* <p>Each segment index maintains the following:
|
||||
*
|
||||
* <ul>
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat Segment info}. This
|
||||
* contains metadata about a segment, such as the number of documents, what files it uses, and
|
||||
* information about how the segment is sorted
|
||||
* <li>{@link org.apache.lucene.backward_codecs.lucene90.Lucene90SegmentInfoFormat Segment info}.
|
||||
* This contains metadata about a segment, such as the number of documents, what files it
|
||||
* uses, and information about how the segment is sorted
|
||||
* <li>{@link org.apache.lucene.backward_codecs.lucene90.Lucene90FieldInfosFormat Field names}.
|
||||
* This contains metadata about the set of named fields used in the index.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}.
|
||||
|
@ -229,7 +229,7 @@
|
|||
* file.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat Segment Info}</td>
|
||||
* <td>{@link org.apache.lucene.backward_codecs.lucene90.Lucene90SegmentInfoFormat Segment Info}</td>
|
||||
* <td>.si</td>
|
||||
* <td>Stores metadata about a segment</td>
|
||||
* </tr>
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
package org.apache.lucene.backward_codecs.lucene94;
|
||||
|
||||
import java.util.Objects;
|
||||
import org.apache.lucene.backward_codecs.lucene90.Lucene90SegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.CompoundFormat;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
|
@ -36,7 +37,6 @@ import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat;
|
|||
import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat;
|
||||
|
@ -144,7 +144,7 @@ public class Lucene94Codec extends Codec {
|
|||
}
|
||||
|
||||
@Override
|
||||
public final SegmentInfoFormat segmentInfoFormat() {
|
||||
public SegmentInfoFormat segmentInfoFormat() {
|
||||
return segmentInfosFormat;
|
||||
}
|
||||
|
||||
|
|
|
@ -141,9 +141,9 @@
|
|||
* <p>Each segment index maintains the following:
|
||||
*
|
||||
* <ul>
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat Segment info}. This
|
||||
* contains metadata about a segment, such as the number of documents, what files it uses, and
|
||||
* information about how the segment is sorted
|
||||
* <li>{@link org.apache.lucene.backward_codecs.lucene90.Lucene90SegmentInfoFormat Segment info}.
|
||||
* This contains metadata about a segment, such as the number of documents, what files it
|
||||
* uses, and information about how the segment is sorted
|
||||
* <li>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This
|
||||
* contains metadata about the set of named fields used in the index.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}.
|
||||
|
@ -229,7 +229,7 @@
|
|||
* file.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat Segment Info}</td>
|
||||
* <td>{@link org.apache.lucene.backward_codecs.lucene90.Lucene90SegmentInfoFormat Segment Info}</td>
|
||||
* <td>.si</td>
|
||||
* <td>Stores metadata about a segment</td>
|
||||
* </tr>
|
||||
|
|
|
@ -14,9 +14,10 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene95;
|
||||
package org.apache.lucene.backward_codecs.lucene95;
|
||||
|
||||
import java.util.Objects;
|
||||
import org.apache.lucene.backward_codecs.lucene90.Lucene90SegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.CompoundFormat;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
|
@ -36,10 +37,10 @@ import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat;
|
|||
import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat;
|
||||
import org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
|
|
@ -0,0 +1,425 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Lucene 9.5 file format.
|
||||
*
|
||||
* <h2>Apache Lucene - Index File Formats</h2>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <ul>
|
||||
* <li><a href="#Introduction">Introduction</a>
|
||||
* <li><a href="#Definitions">Definitions</a>
|
||||
* <ul>
|
||||
* <li><a href="#Inverted_Indexing">Inverted Indexing</a>
|
||||
* <li><a href="#Types_of_Fields">Types of Fields</a>
|
||||
* <li><a href="#Segments">Segments</a>
|
||||
* <li><a href="#Document_Numbers">Document Numbers</a>
|
||||
* </ul>
|
||||
* <li><a href="#Overview">Index Structure Overview</a>
|
||||
* <li><a href="#File_Naming">File Naming</a>
|
||||
* <li><a href="#file-names">Summary of File Extensions</a>
|
||||
* <ul>
|
||||
* <li><a href="#Lock_File">Lock File</a>
|
||||
* <li><a href="#History">History</a>
|
||||
* <li><a href="#Limitations">Limitations</a>
|
||||
* </ul>
|
||||
* </ul>
|
||||
*
|
||||
* </div> <a id="Introduction"></a>
|
||||
*
|
||||
* <h3>Introduction</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>This document defines the index file formats used in this version of Lucene. If you are using
|
||||
* a different version of Lucene, please consult the copy of <code>docs/</code> that was distributed
|
||||
* with the version you are using.
|
||||
*
|
||||
* <p>This document attempts to provide a high-level definition of the Apache Lucene file formats.
|
||||
* </div> <a id="Definitions"></a>
|
||||
*
|
||||
* <h3>Definitions</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>The fundamental concepts in Lucene are index, document, field and term.
|
||||
*
|
||||
* <p>An index contains a sequence of documents.
|
||||
*
|
||||
* <ul>
|
||||
* <li>A document is a sequence of fields.
|
||||
* <li>A field is a named sequence of terms.
|
||||
* <li>A term is a sequence of bytes.
|
||||
* </ul>
|
||||
*
|
||||
* <p>The same sequence of bytes in two different fields is considered a different term. Thus terms
|
||||
* are represented as a pair: the string naming the field, and the bytes within the field. <a
|
||||
* id="Inverted_Indexing"></a>
|
||||
*
|
||||
* <h4>Inverted Indexing</h4>
|
||||
*
|
||||
* <p>Lucene's index stores terms and statistics about those terms in order to make term-based
|
||||
* search more efficient. Lucene's terms index falls into the family of indexes known as an
|
||||
* <i>inverted index.</i> This is because it can list, for a term, the documents that contain it.
|
||||
* This is the inverse of the natural relationship, in which documents list terms. <a
|
||||
* id="Types_of_Fields"></a>
|
||||
*
|
||||
* <h4>Types of Fields</h4>
|
||||
*
|
||||
* <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored in the index
|
||||
* literally, in a non-inverted manner. Fields that are inverted are called <i>indexed</i>. A field
|
||||
* may be both stored and indexed.
|
||||
*
|
||||
* <p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the text of a field
|
||||
* may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is
|
||||
* useful for certain identifier fields to be indexed literally.
|
||||
*
|
||||
* <p>See the {@link org.apache.lucene.document.Field Field} java docs for more information on
|
||||
* Fields. <a id="Segments"></a>
|
||||
*
|
||||
* <h4>Segments</h4>
|
||||
*
|
||||
* <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>. Each segment is a
|
||||
* fully independent index, which could be searched separately. Indexes evolve by:
|
||||
*
|
||||
* <ol>
|
||||
* <li>Creating new segments for newly added documents.
|
||||
* <li>Merging existing segments.
|
||||
* </ol>
|
||||
*
|
||||
* <p>Searches may involve multiple segments and/or multiple indexes, each index potentially
|
||||
* composed of a set of segments. <a id="Document_Numbers"></a>
|
||||
*
|
||||
* <h4>Document Numbers</h4>
|
||||
*
|
||||
* <p>Internally, Lucene refers to documents by an integer <i>document number</i>. The first
|
||||
* document added to an index is numbered zero, and each subsequent document added gets a number one
|
||||
* greater than the previous.
|
||||
*
|
||||
* <p>Note that a document's number may change, so caution should be taken when storing these
|
||||
* numbers outside of Lucene. In particular, numbers may change in the following situations:
|
||||
*
|
||||
* <ul>
|
||||
* <li>
|
||||
* <p>The numbers stored in each segment are unique only within the segment, and must be
|
||||
* converted before they can be used in a larger context. The standard technique is to
|
||||
* allocate each segment a range of values, based on the range of numbers used in that
|
||||
* segment. To convert a document number from a segment to an external value, the segment's
|
||||
* <i>base</i> document number is added. To convert an external value back to a
|
||||
* segment-specific value, the segment is identified by the range that the external value is
|
||||
* in, and the segment's base value is subtracted. For example, two five-document segments
|
||||
* might be combined, so that the first segment has a base value of zero, and the second of
|
||||
* five. Document three from the second segment would have an external value of eight.
|
||||
* <li>
|
||||
* <p>When documents are deleted, gaps are created in the numbering. These are eventually
|
||||
* removed as the index evolves through merging. Deleted documents are dropped when segments
|
||||
* are merged. A freshly-merged segment thus has no gaps in its numbering.
|
||||
* </ul>
|
||||
*
|
||||
* </div> <a id="Overview"></a>
|
||||
*
|
||||
* <h3>Index Structure Overview</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>Each segment index maintains the following:
|
||||
*
|
||||
* <ul>
|
||||
* <li>{@link org.apache.lucene.backward_codecs.lucene90.Lucene90SegmentInfoFormat Segment info}. This
|
||||
* contains metadata about a segment, such as the number of documents, what files it uses, and
|
||||
* information about how the segment is sorted
|
||||
* <li>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This
|
||||
* contains metadata about the set of named fields used in the index.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}.
|
||||
* This contains, for each document, a list of attribute-value pairs, where the attributes are
|
||||
* field names. These are used to store auxiliary information about the document, such as its
|
||||
* title, url, or an identifier to access a database. The set of stored fields is what is
|
||||
* returned for each hit when searching. This is keyed by document number.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term dictionary}. A
|
||||
* dictionary containing all of the terms used in all of the indexed fields of all of the
|
||||
* documents. The dictionary also contains the number of documents which contain the term, and
|
||||
* pointers to the term's frequency and proximity data.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Frequency data}. For
|
||||
* each term in the dictionary, the numbers of all the documents that contain that term, and
|
||||
* the frequency of the term in that document, unless frequencies are omitted ({@link
|
||||
* org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Proximity data}. For
|
||||
* each term in the dictionary, the positions that the term occurs in each document. Note that
|
||||
* this will not exist if all fields in all documents omit position data.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For
|
||||
* each field in each document, a value is stored that is multiplied into the score for hits
|
||||
* on that field.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each
|
||||
* field in each document, the term vector (sometimes called document vector) may be stored. A
|
||||
* term vector consists of term text and term frequency. To add Term Vectors to your index see
|
||||
* the {@link org.apache.lucene.document.Field Field} constructors
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like
|
||||
* stored values, these are also keyed by document number, but are generally intended to be
|
||||
* loaded into main memory for fast access. Whereas stored values are generally intended for
|
||||
* summary results from searches, per-document values are useful for things like scoring
|
||||
* factors.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An
|
||||
* optional file indicating which documents are live.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair
|
||||
* of files, recording dimensionally indexed fields, to enable fast numeric range filtering
|
||||
* and large numeric values like BigInteger and BigDecimal (1D) and geographic shape
|
||||
* intersection (2D, 3D).
|
||||
* <li>{@link org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat Vector values}. The
|
||||
* vector format stores numeric vectors in a format optimized for random access and
|
||||
* computation, supporting high-dimensional nearest-neighbor search.
|
||||
* </ul>
|
||||
*
|
||||
* <p>Details on each of these are provided in their linked pages. </div> <a id="File_Naming"></a>
|
||||
*
|
||||
* <h3>File Naming</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>All files belonging to a segment have the same name with varying extensions. The extensions
|
||||
* correspond to the different file formats described below. When using the Compound File format
|
||||
* (default for small segments) these files (except for the Segment info file, the Lock file, and
|
||||
* Deleted documents file) are collapsed into a single .cfs file (see below for details)
|
||||
*
|
||||
* <p>Typically, all segments in an index are stored in a single directory, although this is not
|
||||
* required.
|
||||
*
|
||||
* <p>File names are never re-used. That is, when any file is saved to the Directory it is given a
|
||||
* never before used filename. This is achieved using a simple generations approach. For example,
|
||||
* the first segments file is segments_1, then segments_2, etc. The generation is a sequential long
|
||||
* integer represented in alpha-numeric (base 36) form. </div> <a id="file-names"></a>
|
||||
*
|
||||
* <h3>Summary of File Extensions</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>The following table summarizes the names and extensions of the files in Lucene:
|
||||
*
|
||||
* <table class="padding4" style="border-spacing: 1px; border-collapse: separate">
|
||||
* <caption>lucene filenames by extension</caption>
|
||||
* <tr>
|
||||
* <th>Name</th>
|
||||
* <th>Extension</th>
|
||||
* <th>Brief Description</th>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
|
||||
* <td>segments_N</td>
|
||||
* <td>Stores information about a commit point</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td><a href="#Lock_File">Lock File</a></td>
|
||||
* <td>write.lock</td>
|
||||
* <td>The Write lock prevents multiple IndexWriters from writing to the same
|
||||
* file.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.backward_codecs.lucene90.Lucene90SegmentInfoFormat Segment Info}</td>
|
||||
* <td>.si</td>
|
||||
* <td>Stores metadata about a segment</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File}</td>
|
||||
* <td>.cfs, .cfe</td>
|
||||
* <td>An optional "virtual" file consisting of all the other index files for
|
||||
* systems that frequently run out of file handles.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields}</td>
|
||||
* <td>.fnm</td>
|
||||
* <td>Stores information about the fields</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index}</td>
|
||||
* <td>.fdx</td>
|
||||
* <td>Contains pointers to field data</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data}</td>
|
||||
* <td>.fdt</td>
|
||||
* <td>The stored fields for documents</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Dictionary}</td>
|
||||
* <td>.tim</td>
|
||||
* <td>The term dictionary, stores term info</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Index}</td>
|
||||
* <td>.tip</td>
|
||||
* <td>The index into the Term Dictionary</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Frequencies}</td>
|
||||
* <td>.doc</td>
|
||||
* <td>Contains the list of docs which contain each term along with frequency</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Positions}</td>
|
||||
* <td>.pos</td>
|
||||
* <td>Stores position information about where a term occurs in the index</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Payloads}</td>
|
||||
* <td>.pay</td>
|
||||
* <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms}</td>
|
||||
* <td>.nvd, .nvm</td>
|
||||
* <td>Encodes length and boost factors for docs and fields</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values}</td>
|
||||
* <td>.dvd, .dvm</td>
|
||||
* <td>Encodes additional scoring factors or other per-document information.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}</td>
|
||||
* <td>.tvx</td>
|
||||
* <td>Stores offset into the document data file</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}</td>
|
||||
* <td>.tvd</td>
|
||||
* <td>Contains term vector data.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents}</td>
|
||||
* <td>.liv</td>
|
||||
* <td>Info about what documents are live</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}</td>
|
||||
* <td>.dii, .dim</td>
|
||||
* <td>Holds indexed points</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat Vector values}</td>
|
||||
* <td>.vec, .vem</td>
|
||||
* <td>Holds indexed vectors; <code>.vec</code> files contain the raw vector data, and
|
||||
* <code>.vem</code> the vector metadata</td>
|
||||
* </tr>
|
||||
* </table>
|
||||
*
|
||||
* </div> <a id="Lock_File"></a>
|
||||
*
|
||||
* <h3>Lock File</h3>
|
||||
*
|
||||
* The write lock, which is stored in the index directory by default, is named "write.lock". If the
|
||||
* lock directory is different from the index directory then the write lock will be named
|
||||
* "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index
|
||||
* directory. When this file is present, a writer is currently modifying the index (adding or
|
||||
* removing documents). This lock file ensures that only one writer is modifying the index at a
|
||||
* time. <a id="History"></a>
|
||||
*
|
||||
* <h3>History</h3>
|
||||
*
|
||||
* <p>Compatibility notes are provided in this document, describing how file formats have changed
|
||||
* from prior versions:
|
||||
*
|
||||
* <ul>
|
||||
* <li>In version 2.1, the file format was changed to allow lock-less commits (i.e., no more commit
|
||||
* lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching
|
||||
* or adding/deleting of docs. When the new segments file is saved (committed), it will be
|
||||
* written in the new file format (meaning no specific "upgrade" process is needed). But note
|
||||
* that once a commit has occurred, pre-2.1 Lucene will not be able to read the index.
|
||||
* <li>In version 2.3, the file format was changed to allow segments to share a single set of doc
|
||||
* store (vectors &amp; stored fields) files. This allows for faster indexing in certain
|
||||
* cases. The change is fully backwards compatible (in the same way as the lock-less commits
|
||||
* change in 2.1).
|
||||
* <li>In version 2.4, Strings are now written as true UTF-8 byte sequence, not Java's modified
|
||||
* UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">LUCENE-510</a> for
|
||||
* details.
|
||||
* <li>In version 2.9, an optional opaque Map&lt;String,String&gt; CommitUserData may be passed to
|
||||
* IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N
|
||||
* file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">LUCENE-1382</a> for
|
||||
* details. Also, diagnostics were added to each segment written recording details about why
|
||||
* it was written (due to flush, merge; which OS/JRE was used; etc.). See issue <a
|
||||
* href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.
|
||||
* <li>In version 3.0, compressed fields are no longer written to the index (they can still be
|
||||
* read, but on merge the new segment will write them, uncompressed). See issue <a
|
||||
* href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a> for details.
|
||||
* <li>In version 3.1, segments record the code version that created them. See <a
|
||||
* href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
|
||||
* Additionally segments track explicitly whether or not they have term vectors. See <a
|
||||
* href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a> for details.
|
||||
* <li>In version 3.2, numeric fields are written natively to the stored fields file; previously
|
||||
* they were stored in text format only.
|
||||
* <li>In version 3.4, fields can omit position data while still indexing term frequencies.
|
||||
* <li>In version 4.0, the format of the inverted index became extensible via the {@link
|
||||
* org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues})
|
||||
* was introduced. Normalization factors need no longer be a single byte, they can be any
|
||||
* {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be
|
||||
* unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into
|
||||
* the postings lists. Payloads can be stored in the term vectors.
|
||||
* <li>In version 4.1, the format of the postings list changed to use either of FOR compression or
|
||||
* variable-byte encoding, depending upon the frequency of the term. Terms appearing only once
|
||||
* were changed to inline directly into the term dictionary. Stored fields are compressed by
|
||||
* default.
|
||||
* <li>In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued
|
||||
* type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields.
|
||||
* <li>In version 4.5, DocValues were extended to explicitly represent missing values.
|
||||
* <li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
|
||||
* allow updating NumericDocValues fields.
|
||||
* <li>In version 4.8, checksum footers were added to the end of each index file for improved data
|
||||
* integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32
|
||||
* checksum of the file.
|
||||
* <li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is
|
||||
* suitable for faceting/sorting/analytics.
|
||||
* <li>In version 5.4, DocValues have been improved to store more information on disk: addresses
|
||||
* for binary fields and ord indexes for multi-valued fields.
|
||||
* <li>In version 6.0, Points were added, for multi-dimensional range/distance search.
|
||||
* <li>In version 6.2, new Segment info format that reads/writes the index sort, to support index
|
||||
* sorting.
|
||||
* <li>In version 7.0, DocValues have been improved to better support sparse doc values thanks to
|
||||
* an iterator API.
|
||||
* <li>In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term
|
||||
* freq, normalization factor) pairs that may trigger the maximum score of the block. This
|
||||
* information is recorded alongside skip data in order to be able to skip blocks of doc ids
|
||||
* if they may not produce high enough scores. Additionally doc values and norms have been
|
||||
* extended with jump-tables to make access O(1) instead of O(n), where n is the number of
|
||||
* elements to skip when advancing in the data.
|
||||
* <li>In version 8.4, postings, positions, offsets and payload lengths have moved to a more
|
||||
* performant encoding that is vectorized.
|
||||
* <li>In version 8.6, index sort serialization is delegated to the sorts themselves, to allow
|
||||
* user-defined sorts to be used.
|
||||
* <li>In version 8.7, stored fields compression became adaptive to better handle documents with
|
||||
* smaller stored fields.
|
||||
* <li>In version 9.0, vector-valued fields were added.
|
||||
* <li>In version 9.1, vector-valued fields were modified to add a graph hierarchy.
|
||||
* <li>In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by
|
||||
* IndexDISI. ordToDoc mappings were added to .vem.
|
||||
* <li>In version 9.5, HNSW graph connections were changed to be delta-encoded with vints.
|
||||
* Additionally, metadata file size improvements were made by delta-encoding nodes by graph
|
||||
* layer and not writing the node ids for the zeroth layer.
|
||||
* </ul>
|
||||
*
|
||||
* <a id="Limitations"></a>
|
||||
*
|
||||
* <h3>Limitations</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>Lucene uses a Java <code>int</code> to refer to document numbers, and the index file format
|
||||
* uses an <code>Int32</code> on-disk to store document numbers. This is a limitation of both the
|
||||
* index file format and the current implementation. Eventually these should be replaced with either
|
||||
* <code>UInt64</code> values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
|
||||
* VInt} values which have no limit. </div>
|
||||
*/
|
||||
package org.apache.lucene.backward_codecs.lucene95;
|
|
@ -21,3 +21,4 @@ org.apache.lucene.backward_codecs.lucene90.Lucene90Codec
|
|||
org.apache.lucene.backward_codecs.lucene91.Lucene91Codec
|
||||
org.apache.lucene.backward_codecs.lucene92.Lucene92Codec
|
||||
org.apache.lucene.backward_codecs.lucene94.Lucene94Codec
|
||||
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec
|
||||
|
|
|
@ -17,10 +17,13 @@
|
|||
package org.apache.lucene.backward_codecs.lucene90;
|
||||
|
||||
import org.apache.lucene.codecs.KnnVectorsFormat;
|
||||
import org.apache.lucene.codecs.SegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
|
||||
|
||||
public class Lucene90RWCodec extends Lucene90Codec {
|
||||
|
||||
private final SegmentInfoFormat segmentInfosFormat = new Lucene90RWSegmentInfoFormat();
|
||||
|
||||
private final KnnVectorsFormat defaultKnnVectorsFormat;
|
||||
private final KnnVectorsFormat knnVectorsFormat =
|
||||
new PerFieldKnnVectorsFormat() {
|
||||
|
@ -41,4 +44,9 @@ public class Lucene90RWCodec extends Lucene90Codec {
|
|||
public KnnVectorsFormat knnVectorsFormat() {
|
||||
return knnVectorsFormat;
|
||||
}
|
||||
|
||||
/**
 * Returns this codec's segment info format: the {@code Lucene90RWSegmentInfoFormat} instance held
 * in the {@code segmentInfosFormat} field.
 */
@Override
|
||||
public SegmentInfoFormat segmentInfoFormat() {
|
||||
return segmentInfosFormat;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -15,28 +15,17 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.codecs.lucene90;
|
||||
package org.apache.lucene.backward_codecs.lucene90;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.SegmentInfoFormat;
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.IndexSorter;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.SegmentInfo;
|
||||
import org.apache.lucene.index.SegmentInfos;
|
||||
import org.apache.lucene.index.SortFieldProvider;
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.search.SortField;
|
||||
import org.apache.lucene.store.ChecksumIndexInput;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.store.*;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
|
@ -83,7 +72,7 @@ import org.apache.lucene.util.Version;
|
|||
* @see SegmentInfos
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class Lucene90SegmentInfoFormat extends SegmentInfoFormat {
|
||||
public class Lucene90RWSegmentInfoFormat extends SegmentInfoFormat {
|
||||
|
||||
/** File extension used to store {@link SegmentInfo}. */
|
||||
public static final String SI_EXTENSION = "si";
|
||||
|
@ -93,7 +82,7 @@ public class Lucene90SegmentInfoFormat extends SegmentInfoFormat {
|
|||
static final int VERSION_CURRENT = VERSION_START;
|
||||
|
||||
/** Sole constructor. */
|
||||
public Lucene90SegmentInfoFormat() {}
|
||||
public Lucene90RWSegmentInfoFormat() {}
|
||||
|
||||
@Override
|
||||
public SegmentInfo read(Directory dir, String segment, byte[] segmentID, IOContext context)
|
||||
|
@ -164,6 +153,7 @@ public class Lucene90SegmentInfoFormat extends SegmentInfoFormat {
|
|||
segment,
|
||||
docCount,
|
||||
isCompoundFile,
|
||||
false,
|
||||
null,
|
||||
diagnostics,
|
||||
segmentID,
|
|
@ -15,22 +15,21 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.codecs.lucene90;
|
||||
package org.apache.lucene.backward_codecs.lucene90;
|
||||
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.tests.index.BaseSegmentInfoFormatTestCase;
|
||||
import org.apache.lucene.tests.util.TestUtil;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
public class TestLucene90SegmentInfoFormat extends BaseSegmentInfoFormatTestCase {
|
||||
|
||||
@Override
|
||||
protected Version[] getVersions() {
|
||||
return new Version[] {Version.LATEST};
|
||||
return new Version[] {Version.LUCENE_9_0_0};
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Codec getCodec() {
|
||||
return TestUtil.getDefaultCodec();
|
||||
return new Lucene90RWCodec();
|
||||
}
|
||||
}
|
|
@ -16,7 +16,9 @@
|
|||
*/
|
||||
package org.apache.lucene.backward_codecs.lucene91;
|
||||
|
||||
import org.apache.lucene.backward_codecs.lucene90.Lucene90RWSegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.KnnVectorsFormat;
|
||||
import org.apache.lucene.codecs.SegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
|
||||
|
||||
public class Lucene91RWCodec extends Lucene91Codec {
|
||||
|
@ -29,6 +31,7 @@ public class Lucene91RWCodec extends Lucene91Codec {
|
|||
return defaultKnnVectorsFormat;
|
||||
}
|
||||
};
|
||||
private final SegmentInfoFormat segmentInfosFormat = new Lucene90RWSegmentInfoFormat();
|
||||
|
||||
public Lucene91RWCodec() {
|
||||
this.defaultKnnVectorsFormat =
|
||||
|
@ -41,4 +44,9 @@ public class Lucene91RWCodec extends Lucene91Codec {
|
|||
public KnnVectorsFormat knnVectorsFormat() {
|
||||
return knnVectorsFormat;
|
||||
}
|
||||
|
||||
/**
 * Returns this codec's segment info format: the {@code Lucene90RWSegmentInfoFormat} instance held
 * in the {@code segmentInfosFormat} field.
 */
@Override
|
||||
public SegmentInfoFormat segmentInfoFormat() {
|
||||
return segmentInfosFormat;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,7 +16,9 @@
|
|||
*/
|
||||
package org.apache.lucene.backward_codecs.lucene92;
|
||||
|
||||
import org.apache.lucene.backward_codecs.lucene90.Lucene90RWSegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.KnnVectorsFormat;
|
||||
import org.apache.lucene.codecs.SegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
|
||||
|
||||
/** Implements the Lucene 9.2 index format for backwards compat testing */
|
||||
|
@ -30,6 +32,7 @@ public class Lucene92RWCodec extends Lucene92Codec {
|
|||
return defaultKnnVectorsFormat;
|
||||
}
|
||||
};
|
||||
private final SegmentInfoFormat segmentInfosFormat = new Lucene90RWSegmentInfoFormat();
|
||||
|
||||
/** Instantiates a new codec. */
|
||||
public Lucene92RWCodec() {
|
||||
|
@ -43,4 +46,9 @@ public class Lucene92RWCodec extends Lucene92Codec {
|
|||
public final KnnVectorsFormat knnVectorsFormat() {
|
||||
return knnVectorsFormat;
|
||||
}
|
||||
|
||||
/**
 * Returns this codec's segment info format: the {@code Lucene90RWSegmentInfoFormat} instance held
 * in the {@code segmentInfosFormat} field.
 */
@Override
|
||||
public SegmentInfoFormat segmentInfoFormat() {
|
||||
return segmentInfosFormat;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,7 +16,9 @@
|
|||
*/
|
||||
package org.apache.lucene.backward_codecs.lucene94;
|
||||
|
||||
import org.apache.lucene.backward_codecs.lucene90.Lucene90RWSegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.KnnVectorsFormat;
|
||||
import org.apache.lucene.codecs.SegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
|
||||
|
||||
/** Implements the Lucene 9.4 index format for backwards compat testing */
|
||||
|
@ -31,6 +33,8 @@ public class Lucene94RWCodec extends Lucene94Codec {
|
|||
}
|
||||
};
|
||||
|
||||
private final SegmentInfoFormat segmentInfosFormat = new Lucene90RWSegmentInfoFormat();
|
||||
|
||||
/** Instantiates a new codec. */
|
||||
public Lucene94RWCodec() {
|
||||
defaultKnnVectorsFormat =
|
||||
|
@ -43,4 +47,9 @@ public class Lucene94RWCodec extends Lucene94Codec {
|
|||
public final KnnVectorsFormat knnVectorsFormat() {
|
||||
return knnVectorsFormat;
|
||||
}
|
||||
|
||||
/**
 * Returns this codec's segment info format: the {@code Lucene90RWSegmentInfoFormat} instance held
 * in the {@code segmentInfosFormat} field.
 */
@Override
|
||||
public SegmentInfoFormat segmentInfoFormat() {
|
||||
return segmentInfosFormat;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -26,8 +26,8 @@ import java.nio.file.Paths;
|
|||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.FilterCodec;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene95.Lucene95Codec;
|
||||
import org.apache.lucene.index.ConcurrentMergeScheduler;
|
||||
import org.apache.lucene.index.IndexCommit;
|
||||
import org.apache.lucene.index.IndexDeletionPolicy;
|
||||
|
@ -152,9 +152,9 @@ public class CreateIndexTask extends PerfTask {
|
|||
try {
|
||||
final PostingsFormat postingsFormatChosen = PostingsFormat.forName(postingsFormat);
|
||||
iwConf.setCodec(
|
||||
new Lucene95Codec() {
|
||||
new FilterCodec(Codec.getDefault().getName(), Codec.getDefault()) {
|
||||
@Override
|
||||
public PostingsFormat getPostingsFormatForField(String field) {
|
||||
public PostingsFormat postingsFormat() {
|
||||
return postingsFormatChosen;
|
||||
}
|
||||
});
|
||||
|
|
|
@ -55,6 +55,7 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat {
|
|||
static final BytesRef SI_MIN_VERSION = new BytesRef(" min version ");
|
||||
static final BytesRef SI_DOCCOUNT = new BytesRef(" number of documents ");
|
||||
static final BytesRef SI_USECOMPOUND = new BytesRef(" uses compound file ");
|
||||
static final BytesRef SI_HAS_BLOCKS = new BytesRef(" has blocks ");
|
||||
static final BytesRef SI_NUM_DIAG = new BytesRef(" diagnostics ");
|
||||
static final BytesRef SI_DIAG_KEY = new BytesRef(" key ");
|
||||
static final BytesRef SI_DIAG_VALUE = new BytesRef(" value ");
|
||||
|
@ -113,6 +114,10 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat {
|
|||
final boolean isCompoundFile =
|
||||
Boolean.parseBoolean(readString(SI_USECOMPOUND.length, scratch));
|
||||
|
||||
SimpleTextUtil.readLine(input, scratch);
|
||||
assert StringHelper.startsWith(scratch.get(), SI_HAS_BLOCKS);
|
||||
final boolean hasBlocks = Boolean.parseBoolean(readString(SI_HAS_BLOCKS.length, scratch));
|
||||
|
||||
SimpleTextUtil.readLine(input, scratch);
|
||||
assert StringHelper.startsWith(scratch.get(), SI_NUM_DIAG);
|
||||
int numDiag = Integer.parseInt(readString(SI_NUM_DIAG.length, scratch));
|
||||
|
@ -204,6 +209,7 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat {
|
|||
segmentName,
|
||||
docCount,
|
||||
isCompoundFile,
|
||||
hasBlocks,
|
||||
null,
|
||||
diagnostics,
|
||||
id,
|
||||
|
@ -249,6 +255,10 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat {
|
|||
SimpleTextUtil.write(output, Boolean.toString(si.getUseCompoundFile()), scratch);
|
||||
SimpleTextUtil.writeNewline(output);
|
||||
|
||||
SimpleTextUtil.write(output, SI_HAS_BLOCKS);
|
||||
SimpleTextUtil.write(output, Boolean.toString(si.getHasBlocks()), scratch);
|
||||
SimpleTextUtil.writeNewline(output);
|
||||
|
||||
Map<String, String> diagnostics = si.getDiagnostics();
|
||||
int numDiagnostics = diagnostics == null ? 0 : diagnostics.size();
|
||||
SimpleTextUtil.write(output, SI_NUM_DIAG);
|
||||
|
|
|
@ -15,8 +15,8 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.codecs.lucene95.Lucene95Codec;
|
||||
import org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
|
||||
|
||||
/** Lucene Core. */
|
||||
@SuppressWarnings("module") // the test framework is compiled after the core...
|
||||
|
@ -33,6 +33,7 @@ module org.apache.lucene.core {
|
|||
exports org.apache.lucene.codecs.lucene90;
|
||||
exports org.apache.lucene.codecs.lucene94;
|
||||
exports org.apache.lucene.codecs.lucene95;
|
||||
exports org.apache.lucene.codecs.lucene99;
|
||||
exports org.apache.lucene.codecs.lucene90.blocktree;
|
||||
exports org.apache.lucene.codecs.lucene90.compressing;
|
||||
exports org.apache.lucene.codecs.perfield;
|
||||
|
@ -65,7 +66,7 @@ module org.apache.lucene.core {
|
|||
provides org.apache.lucene.analysis.TokenizerFactory with
|
||||
org.apache.lucene.analysis.standard.StandardTokenizerFactory;
|
||||
provides org.apache.lucene.codecs.Codec with
|
||||
Lucene95Codec;
|
||||
Lucene99Codec;
|
||||
provides org.apache.lucene.codecs.DocValuesFormat with
|
||||
org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
|
||||
provides org.apache.lucene.codecs.KnnVectorsFormat with
|
||||
|
|
|
@ -55,7 +55,7 @@ public abstract class Codec implements NamedSPILoader.NamedSPI {
|
|||
return LOADER;
|
||||
}
|
||||
|
||||
static Codec defaultCodec = LOADER.lookup("Lucene95");
|
||||
static Codec defaultCodec = LOADER.lookup("Lucene99");
|
||||
}
|
||||
|
||||
private final String name;
|
||||
|
|
|
@ -15,411 +15,5 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Lucene 9.5 file format.
|
||||
*
|
||||
* <h2>Apache Lucene - Index File Formats</h2>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <ul>
|
||||
* <li><a href="#Introduction">Introduction</a>
|
||||
* <li><a href="#Definitions">Definitions</a>
|
||||
* <ul>
|
||||
* <li><a href="#Inverted_Indexing">Inverted Indexing</a>
|
||||
* <li><a href="#Types_of_Fields">Types of Fields</a>
|
||||
* <li><a href="#Segments">Segments</a>
|
||||
* <li><a href="#Document_Numbers">Document Numbers</a>
|
||||
* </ul>
|
||||
* <li><a href="#Overview">Index Structure Overview</a>
|
||||
* <li><a href="#File_Naming">File Naming</a>
|
||||
* <li><a href="#file-names">Summary of File Extensions</a>
|
||||
* <ul>
|
||||
* <li><a href="#Lock_File">Lock File</a>
|
||||
* <li><a href="#History">History</a>
|
||||
* <li><a href="#Limitations">Limitations</a>
|
||||
* </ul>
|
||||
* </ul>
|
||||
*
|
||||
* </div> <a id="Introduction"></a>
|
||||
*
|
||||
* <h3>Introduction</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>This document defines the index file formats used in this version of Lucene. If you are using
|
||||
* a different version of Lucene, please consult the copy of <code>docs/</code> that was distributed
|
||||
* with the version you are using.
|
||||
*
|
||||
* <p>This document attempts to provide a high-level definition of the Apache Lucene file formats.
|
||||
* </div> <a id="Definitions"></a>
|
||||
*
|
||||
* <h3>Definitions</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>The fundamental concepts in Lucene are index, document, field and term.
|
||||
*
|
||||
* <p>An index contains a sequence of documents.
|
||||
*
|
||||
* <ul>
|
||||
* <li>A document is a sequence of fields.
|
||||
* <li>A field is a named sequence of terms.
|
||||
* <li>A term is a sequence of bytes.
|
||||
* </ul>
|
||||
*
|
||||
* <p>The same sequence of bytes in two different fields is considered a different term. Thus terms
|
||||
* are represented as a pair: the string naming the field, and the bytes within the field. <a
|
||||
* id="Inverted_Indexing"></a>
|
||||
*
|
||||
* <h4>Inverted Indexing</h4>
|
||||
*
|
||||
* <p>Lucene's index stores terms and statistics about those terms in order to make term-based
|
||||
* search more efficient. Lucene's terms index falls into the family of indexes known as an
|
||||
* <i>inverted index.</i> This is because it can list, for a term, the documents that contain it.
|
||||
* This is the inverse of the natural relationship, in which documents list terms. <a
|
||||
* id="Types_of_Fields"></a>
|
||||
*
|
||||
* <h4>Types of Fields</h4>
|
||||
*
|
||||
* <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored in the index
|
||||
* literally, in a non-inverted manner. Fields that are inverted are called <i>indexed</i>. A field
|
||||
* may be both stored and indexed.
|
||||
*
|
||||
* <p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the text of a field
|
||||
* may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is
|
||||
* useful for certain identifier fields to be indexed literally.
|
||||
*
|
||||
* <p>See the {@link org.apache.lucene.document.Field Field} java docs for more information on
|
||||
* Fields. <a id="Segments"></a>
|
||||
*
|
||||
* <h4>Segments</h4>
|
||||
*
|
||||
* <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>. Each segment is a
|
||||
* fully independent index, which could be searched separately. Indexes evolve by:
|
||||
*
|
||||
* <ol>
|
||||
* <li>Creating new segments for newly added documents.
|
||||
* <li>Merging existing segments.
|
||||
* </ol>
|
||||
*
|
||||
* <p>Searches may involve multiple segments and/or multiple indexes, each index potentially
|
||||
* composed of a set of segments. <a id="Document_Numbers"></a>
|
||||
*
|
||||
* <h4>Document Numbers</h4>
|
||||
*
|
||||
* <p>Internally, Lucene refers to documents by an integer <i>document number</i>. The first
|
||||
* document added to an index is numbered zero, and each subsequent document added gets a number one
|
||||
* greater than the previous.
|
||||
*
|
||||
* <p>Note that a document's number may change, so caution should be taken when storing these
|
||||
* numbers outside of Lucene. In particular, numbers may change in the following situations:
|
||||
*
|
||||
* <ul>
|
||||
* <li>
|
||||
* <p>The numbers stored in each segment are unique only within the segment, and must be
|
||||
* converted before they can be used in a larger context. The standard technique is to
|
||||
* allocate each segment a range of values, based on the range of numbers used in that
|
||||
* segment. To convert a document number from a segment to an external value, the segment's
|
||||
* <i>base</i> document number is added. To convert an external value back to a
|
||||
* segment-specific value, the segment is identified by the range that the external value is
|
||||
* in, and the segment's base value is subtracted. For example two five document segments
|
||||
* might be combined, so that the first segment has a base value of zero, and the second of
|
||||
* five. Document three from the second segment would have an external value of eight.
|
||||
* <li>
|
||||
* <p>When documents are deleted, gaps are created in the numbering. These are eventually
|
||||
* removed as the index evolves through merging. Deleted documents are dropped when segments
|
||||
* are merged. A freshly-merged segment thus has no gaps in its numbering.
|
||||
* </ul>
|
||||
*
|
||||
* </div> <a id="Overview"></a>
|
||||
*
|
||||
* <h3>Index Structure Overview</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>Each segment index maintains the following:
|
||||
*
|
||||
* <ul>
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat Segment info}. This
|
||||
* contains metadata about a segment, such as the number of documents, what files it uses, and
|
||||
* information about how the segment is sorted
|
||||
* <li>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This
|
||||
* contains metadata about the set of named fields used in the index.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}.
|
||||
* This contains, for each document, a list of attribute-value pairs, where the attributes are
|
||||
* field names. These are used to store auxiliary information about the document, such as its
|
||||
* title, url, or an identifier to access a database. The set of stored fields is what is
|
||||
* returned for each hit when searching. This is keyed by document number.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term dictionary}. A
|
||||
* dictionary containing all of the terms used in all of the indexed fields of all of the
|
||||
* documents. The dictionary also contains the number of documents which contain the term, and
|
||||
* pointers to the term's frequency and proximity data.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Frequency data}. For
|
||||
* each term in the dictionary, the numbers of all the documents that contain that term, and
|
||||
* the frequency of the term in that document, unless frequencies are omitted ({@link
|
||||
* org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Proximity data}. For
|
||||
* each term in the dictionary, the positions that the term occurs in each document. Note that
|
||||
* this will not exist if all fields in all documents omit position data.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For
|
||||
* each field in each document, a value is stored that is multiplied into the score for hits
|
||||
* on that field.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each
|
||||
* field in each document, the term vector (sometimes called document vector) may be stored. A
|
||||
* term vector consists of term text and term frequency. To add Term Vectors to your index see
|
||||
* the {@link org.apache.lucene.document.Field Field} constructors
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like
|
||||
* stored values, these are also keyed by document number, but are generally intended to be
|
||||
* loaded into main memory for fast access. Whereas stored values are generally intended for
|
||||
* summary results from searches, per-document values are useful for things like scoring
|
||||
* factors.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An
|
||||
* optional file indicating which documents are live.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair
|
||||
* of files, recording dimensionally indexed fields, to enable fast numeric range filtering
|
||||
* and large numeric values like BigInteger and BigDecimal (1D) and geographic shape
|
||||
* intersection (2D, 3D).
|
||||
* <li>{@link org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat Vector values}. The
|
||||
* vector format stores numeric vectors in a format optimized for random access and
|
||||
* computation, supporting high-dimensional nearest-neighbor search.
|
||||
* </ul>
|
||||
*
|
||||
* <p>Details on each of these are provided in their linked pages. </div> <a id="File_Naming"></a>
|
||||
*
|
||||
* <h3>File Naming</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>All files belonging to a segment have the same name with varying extensions. The extensions
|
||||
* correspond to the different file formats described below. When using the Compound File format
|
||||
* (default for small segments) these files (except for the Segment info file, the Lock file, and
|
||||
* Deleted documents file) are collapsed into a single .cfs file (see below for details)
|
||||
*
|
||||
* <p>Typically, all segments in an index are stored in a single directory, although this is not
|
||||
* required.
|
||||
*
|
||||
* <p>File names are never re-used. That is, when any file is saved to the Directory it is given a
|
||||
* never before used filename. This is achieved using a simple generations approach. For example,
|
||||
* the first segments file is segments_1, then segments_2, etc. The generation is a sequential long
|
||||
* integer represented in alpha-numeric (base 36) form. </div> <a id="file-names"></a>
|
||||
*
|
||||
* <h3>Summary of File Extensions</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>The following table summarizes the names and extensions of the files in Lucene:
|
||||
*
|
||||
* <table class="padding4" style="border-spacing: 1px; border-collapse: separate">
|
||||
* <caption>lucene filenames by extension</caption>
|
||||
* <tr>
|
||||
* <th>Name</th>
|
||||
* <th>Extension</th>
|
||||
* <th>Brief Description</th>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
|
||||
* <td>segments_N</td>
|
||||
* <td>Stores information about a commit point</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td><a href="#Lock_File">Lock File</a></td>
|
||||
* <td>write.lock</td>
|
||||
* <td>The Write lock prevents multiple IndexWriters from writing to the same
|
||||
* file.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat Segment Info}</td>
|
||||
* <td>.si</td>
|
||||
* <td>Stores metadata about a segment</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File}</td>
|
||||
* <td>.cfs, .cfe</td>
|
||||
* <td>An optional "virtual" file consisting of all the other index files for
|
||||
* systems that frequently run out of file handles.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields}</td>
|
||||
* <td>.fnm</td>
|
||||
* <td>Stores information about the fields</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index}</td>
|
||||
* <td>.fdx</td>
|
||||
* <td>Contains pointers to field data</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data}</td>
|
||||
* <td>.fdt</td>
|
||||
* <td>The stored fields for documents</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Dictionary}</td>
|
||||
* <td>.tim</td>
|
||||
* <td>The term dictionary, stores term info</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Index}</td>
|
||||
* <td>.tip</td>
|
||||
* <td>The index into the Term Dictionary</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Frequencies}</td>
|
||||
* <td>.doc</td>
|
||||
* <td>Contains the list of docs which contain each term along with frequency</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Positions}</td>
|
||||
* <td>.pos</td>
|
||||
* <td>Stores position information about where a term occurs in the index</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Payloads}</td>
|
||||
* <td>.pay</td>
|
||||
* <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms}</td>
|
||||
* <td>.nvd, .nvm</td>
|
||||
* <td>Encodes length and boost factors for docs and fields</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values}</td>
|
||||
* <td>.dvd, .dvm</td>
|
||||
* <td>Encodes additional scoring factors or other per-document information.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}</td>
|
||||
* <td>.tvx</td>
|
||||
* <td>Stores offset into the document data file</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}</td>
|
||||
* <td>.tvd</td>
|
||||
* <td>Contains term vector data.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents}</td>
|
||||
* <td>.liv</td>
|
||||
* <td>Info about what documents are live</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}</td>
|
||||
* <td>.dii, .dim</td>
|
||||
* <td>Holds indexed points</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat Vector values}</td>
|
||||
* <td>.vec, .vem</td>
|
||||
* <td>Holds indexed vectors; <code>.vec</code> files contain the raw vector data, and
|
||||
* <code>.vem</code> the vector metadata</td>
|
||||
* </tr>
|
||||
* </table>
|
||||
*
|
||||
* </div> <a id="Lock_File"></a>
|
||||
*
|
||||
* <h3>Lock File</h3>
|
||||
*
|
||||
* The write lock, which is stored in the index directory by default, is named "write.lock". If the
|
||||
* lock directory is different from the index directory then the write lock will be named
|
||||
* "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index
|
||||
* directory. When this file is present, a writer is currently modifying the index (adding or
|
||||
* removing documents). This lock file ensures that only one writer is modifying the index at a
|
||||
* time. <a id="History"></a>
|
||||
*
|
||||
* <h3>History</h3>
|
||||
*
|
||||
* <p>Compatibility notes are provided in this document, describing how file formats have changed
|
||||
* from prior versions:
|
||||
*
|
||||
* <ul>
|
||||
* <li>In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit
|
||||
* lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching
|
||||
* or adding/deleting of docs. When the new segments file is saved (committed), it will be
|
||||
* written in the new file format (meaning no specific "upgrade" process is needed). But note
|
||||
* that once a commit has occurred, pre-2.1 Lucene will not be able to read the index.
|
||||
* <li>In version 2.3, the file format was changed to allow segments to share a single set of doc
|
||||
* store (vectors &amp; stored fields) files. This allows for faster indexing in certain
|
||||
* cases. The change is fully backwards compatible (in the same way as the lock-less commits
|
||||
* change in 2.1).
|
||||
* <li>In version 2.4, Strings are now written as true UTF-8 byte sequence, not Java's modified
|
||||
* UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">LUCENE-510</a> for
|
||||
* details.
|
||||
* <li>In version 2.9, an optional opaque Map&lt;String,String&gt; CommitUserData may be passed to
|
||||
* IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N
|
||||
* file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">LUCENE-1382</a> for
|
||||
* details. Also, diagnostics were added to each segment written recording details about why
|
||||
* it was written (due to flush, merge; which OS/JRE was used; etc.). See issue <a
|
||||
* href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.
|
||||
* <li>In version 3.0, compressed fields are no longer written to the index (they can still be
|
||||
* read, but on merge the new segment will write them, uncompressed). See issue <a
|
||||
* href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a> for details.
|
||||
* <li>In version 3.1, segments record the code version that created them. See <a
|
||||
* href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
|
||||
* Additionally segments track explicitly whether or not they have term vectors. See <a
|
||||
* href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a> for details.
|
||||
* <li>In version 3.2, numeric fields are written natively to the stored fields file; previously
|
||||
* they were stored in text format only.
|
||||
* <li>In version 3.4, fields can omit position data while still indexing term frequencies.
|
||||
* <li>In version 4.0, the format of the inverted index became extensible via the {@link
|
||||
* org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues})
|
||||
* was introduced. Normalization factors need no longer be a single byte, they can be any
|
||||
* {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be
|
||||
* unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into
|
||||
* the postings lists. Payloads can be stored in the term vectors.
|
||||
* <li>In version 4.1, the format of the postings list changed to use either of FOR compression or
|
||||
* variable-byte encoding, depending upon the frequency of the term. Terms appearing only once
|
||||
* were changed to inline directly into the term dictionary. Stored fields are compressed by
|
||||
* default.
|
||||
* <li>In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued
|
||||
* type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields.
|
||||
* <li>In version 4.5, DocValues were extended to explicitly represent missing values.
|
||||
* <li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
|
||||
* allow updating NumericDocValues fields.
|
||||
* <li>In version 4.8, checksum footers were added to the end of each index file for improved data
|
||||
* integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32
|
||||
* checksum of the file.
|
||||
* <li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is
|
||||
* suitable for faceting/sorting/analytics.
|
||||
* <li>In version 5.4, DocValues have been improved to store more information on disk: addresses
|
||||
* for binary fields and ord indexes for multi-valued fields.
|
||||
* <li>In version 6.0, Points were added, for multi-dimensional range/distance search.
|
||||
* <li>In version 6.2, new Segment info format that reads/writes the index sort, to support index
|
||||
* sorting.
|
||||
* <li>In version 7.0, DocValues have been improved to better support sparse doc values thanks to
|
||||
* an iterator API.
|
||||
* <li>In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term
|
||||
* freq, normalization factor) pairs that may trigger the maximum score of the block. This
|
||||
* information is recorded alongside skip data in order to be able to skip blocks of doc ids
|
||||
* if they may not produce high enough scores. Additionally doc values and norms have been
|
||||
* extended with jump-tables to make access O(1) instead of O(n), where n is the number of
|
||||
* elements to skip when advancing in the data.
|
||||
* <li>In version 8.4, postings, positions, offsets and payload lengths have moved to a more
|
||||
* performant encoding that is vectorized.
|
||||
* <li>In version 8.6, index sort serialization is delegated to the sorts themselves, to allow
|
||||
* user-defined sorts to be used
|
||||
* <li>In version 8.7, stored fields compression became adaptive to better handle documents with
|
||||
* smaller stored fields.
|
||||
* <li>In version 9.0, vector-valued fields were added.
|
||||
* <li>In version 9.1, vector-valued fields were modified to add a graph hierarchy.
|
||||
* <li>In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by
|
||||
* IndexDISI. ordToDoc mappings were added to .vem.
|
||||
* <li>In version 9.5, HNSW graph connections were changed to be delta-encoded with vints.
|
||||
* Additionally, metadata file size improvements were made by delta-encoding nodes by graph
|
||||
* layer and not writing the node ids for the zeroth layer.
|
||||
* </ul>
|
||||
*
|
||||
* <a id="Limitations"></a>
|
||||
*
|
||||
* <h3>Limitations</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>Lucene uses a Java <code>int</code> to refer to document numbers, and the index file format
|
||||
* uses an <code>Int32</code> on-disk to store document numbers. This is a limitation of both the
|
||||
* index file format and the current implementation. Eventually these should be replaced with either
|
||||
* <code>UInt64</code> values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
|
||||
* VInt} values which have no limit. </div>
|
||||
*/
|
||||
/** Lucene 9.5 file format. */
|
||||
package org.apache.lucene.codecs.lucene95;
|
||||
|
|
|
@ -0,0 +1,198 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
|
||||
import java.util.Objects;
|
||||
import org.apache.lucene.codecs.*;
|
||||
import org.apache.lucene.codecs.lucene90.*;
|
||||
import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat;
|
||||
import org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
|
||||
|
||||
/**
|
||||
* Implements the Lucene 9.9 index format
|
||||
*
|
||||
* <p>If you want to reuse functionality of this codec in another codec, extend {@link FilterCodec}.
|
||||
*
|
||||
* @see org.apache.lucene.codecs.lucene99 package documentation for file format details.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class Lucene99Codec extends Codec {
|
||||
|
||||
/** Configuration option for the codec. */
|
||||
public enum Mode {
|
||||
/** Trade compression ratio for retrieval speed. */
|
||||
BEST_SPEED(Lucene90StoredFieldsFormat.Mode.BEST_SPEED),
|
||||
/** Trade retrieval speed for compression ratio. */
|
||||
BEST_COMPRESSION(Lucene90StoredFieldsFormat.Mode.BEST_COMPRESSION);
|
||||
|
||||
private final Lucene90StoredFieldsFormat.Mode storedMode;
|
||||
|
||||
private Mode(Lucene90StoredFieldsFormat.Mode storedMode) {
|
||||
this.storedMode = Objects.requireNonNull(storedMode);
|
||||
}
|
||||
}
|
||||
|
||||
private final TermVectorsFormat vectorsFormat = new Lucene90TermVectorsFormat();
|
||||
private final FieldInfosFormat fieldInfosFormat = new Lucene94FieldInfosFormat();
|
||||
private final SegmentInfoFormat segmentInfosFormat = new Lucene99SegmentInfoFormat();
|
||||
private final LiveDocsFormat liveDocsFormat = new Lucene90LiveDocsFormat();
|
||||
private final CompoundFormat compoundFormat = new Lucene90CompoundFormat();
|
||||
private final NormsFormat normsFormat = new Lucene90NormsFormat();
|
||||
|
||||
private final PostingsFormat defaultPostingsFormat;
|
||||
private final PostingsFormat postingsFormat =
|
||||
new PerFieldPostingsFormat() {
|
||||
@Override
|
||||
public PostingsFormat getPostingsFormatForField(String field) {
|
||||
return Lucene99Codec.this.getPostingsFormatForField(field);
|
||||
}
|
||||
};
|
||||
|
||||
private final DocValuesFormat defaultDVFormat;
|
||||
private final DocValuesFormat docValuesFormat =
|
||||
new PerFieldDocValuesFormat() {
|
||||
@Override
|
||||
public DocValuesFormat getDocValuesFormatForField(String field) {
|
||||
return Lucene99Codec.this.getDocValuesFormatForField(field);
|
||||
}
|
||||
};
|
||||
|
||||
private final KnnVectorsFormat defaultKnnVectorsFormat;
|
||||
private final KnnVectorsFormat knnVectorsFormat =
|
||||
new PerFieldKnnVectorsFormat() {
|
||||
@Override
|
||||
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
|
||||
return Lucene99Codec.this.getKnnVectorsFormatForField(field);
|
||||
}
|
||||
};
|
||||
|
||||
private final StoredFieldsFormat storedFieldsFormat;
|
||||
|
||||
/** Instantiates a new codec. */
|
||||
public Lucene99Codec() {
|
||||
this(Mode.BEST_SPEED);
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new codec, specifying the stored fields compression mode to use.
|
||||
*
|
||||
* @param mode stored fields compression mode to use for newly flushed/merged segments.
|
||||
*/
|
||||
public Lucene99Codec(Mode mode) {
|
||||
super("Lucene99");
|
||||
this.storedFieldsFormat =
|
||||
new Lucene90StoredFieldsFormat(Objects.requireNonNull(mode).storedMode);
|
||||
this.defaultPostingsFormat = new Lucene90PostingsFormat();
|
||||
this.defaultDVFormat = new Lucene90DocValuesFormat();
|
||||
this.defaultKnnVectorsFormat = new Lucene95HnswVectorsFormat();
|
||||
}
|
||||
|
||||
@Override
|
||||
public final StoredFieldsFormat storedFieldsFormat() {
|
||||
return storedFieldsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final TermVectorsFormat termVectorsFormat() {
|
||||
return vectorsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final PostingsFormat postingsFormat() {
|
||||
return postingsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final FieldInfosFormat fieldInfosFormat() {
|
||||
return fieldInfosFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final SegmentInfoFormat segmentInfoFormat() {
|
||||
return segmentInfosFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final LiveDocsFormat liveDocsFormat() {
|
||||
return liveDocsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final CompoundFormat compoundFormat() {
|
||||
return compoundFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final PointsFormat pointsFormat() {
|
||||
return new Lucene90PointsFormat();
|
||||
}
|
||||
|
||||
@Override
|
||||
public final KnnVectorsFormat knnVectorsFormat() {
|
||||
return knnVectorsFormat;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the postings format that should be used for writing new segments of <code>field</code>.
|
||||
*
|
||||
* <p>The default implementation always returns "Lucene90".
|
||||
*
|
||||
* <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility:
|
||||
* future version of Lucene are only guaranteed to be able to read the default implementation,
|
||||
*/
|
||||
public PostingsFormat getPostingsFormatForField(String field) {
|
||||
return defaultPostingsFormat;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the docvalues format that should be used for writing new segments of <code>field</code>
|
||||
* .
|
||||
*
|
||||
* <p>The default implementation always returns "Lucene90".
|
||||
*
|
||||
* <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility:
|
||||
* future version of Lucene are only guaranteed to be able to read the default implementation.
|
||||
*/
|
||||
public DocValuesFormat getDocValuesFormatForField(String field) {
|
||||
return defaultDVFormat;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the vectors format that should be used for writing new segments of <code>field</code>
|
||||
*
|
||||
* <p>The default implementation always returns "Lucene95".
|
||||
*
|
||||
* <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility:
|
||||
* future version of Lucene are only guaranteed to be able to read the default implementation.
|
||||
*/
|
||||
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
|
||||
return defaultKnnVectorsFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final DocValuesFormat docValuesFormat() {
|
||||
return docValuesFormat;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final NormsFormat normsFormat() {
|
||||
return normsFormat;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,236 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.SegmentInfoFormat;
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.search.SortField;
|
||||
import org.apache.lucene.store.*;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Lucene 9.9 Segment info format.
|
||||
*
|
||||
* <p>Files:
|
||||
*
|
||||
* <ul>
|
||||
* <li><code>.si</code>: Header, SegVersion, SegSize, IsCompoundFile, Diagnostics, Files,
|
||||
* Attributes, IndexSort, Footer
|
||||
* </ul>
|
||||
*
|
||||
* Data types:
|
||||
*
|
||||
* <ul>
|
||||
* <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
|
||||
* <li>SegSize --> {@link DataOutput#writeInt Int32}
|
||||
* <li>SegVersion --> {@link DataOutput#writeString String}
|
||||
* <li>SegMinVersion --> {@link DataOutput#writeString String}
|
||||
* <li>Files --> {@link DataOutput#writeSetOfStrings Set<String>}
|
||||
* <li>Diagnostics,Attributes --> {@link DataOutput#writeMapOfStrings Map<String,String>}
|
||||
* <li>IsCompoundFile --> {@link DataOutput#writeByte Int8}
|
||||
* <li>HasBlocks --> {@link DataOutput#writeByte Int8}
|
||||
* <li>IndexSort --> {@link DataOutput#writeVInt Int32} count, followed by {@code count}
|
||||
* SortField
|
||||
* <li>SortField --> {@link DataOutput#writeString String} sort class, followed by a per-sort
|
||||
* bytestream (see {@link SortFieldProvider#readSortField(DataInput)})
|
||||
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}
|
||||
* </ul>
|
||||
*
|
||||
* Field Descriptions:
|
||||
*
|
||||
* <ul>
|
||||
* <li>SegVersion is the code version that created the segment.
|
||||
* <li>SegMinVersion is the minimum code version that contributed documents to the segment.
|
||||
* <li>SegSize is the number of documents contained in the segment index.
|
||||
* <li>IsCompoundFile records whether the segment is written as a compound file or not. If this is
|
||||
* -1, the segment is not a compound file. If it is 1, the segment is a compound file.
|
||||
* <li>HasBlocks records whether the segment contains documents written as a block and guarantees
|
||||
* consecutive document ids for all documents in the block
|
||||
* <li>The Diagnostics Map is privately written by {@link IndexWriter}, as a debugging aid, for
|
||||
* each segment it creates. It includes metadata like the current Lucene version, OS, Java
|
||||
* version, why the segment was created (merge, flush, addIndexes), etc.
|
||||
* <li>Files is a list of files referred to by this segment.
|
||||
* </ul>
|
||||
*
|
||||
* @see SegmentInfos
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class Lucene99SegmentInfoFormat extends SegmentInfoFormat {
|
||||
|
||||
/** File extension used to store {@link SegmentInfo}. */
|
||||
public static final String SI_EXTENSION = "si";
|
||||
|
||||
static final String CODEC_NAME = "Lucene90SegmentInfo";
|
||||
static final int VERSION_START = 0;
|
||||
static final int VERSION_CURRENT = VERSION_START;
|
||||
|
||||
/** Sole constructor. */
|
||||
public Lucene99SegmentInfoFormat() {}
|
||||
|
||||
@Override
|
||||
public SegmentInfo read(Directory dir, String segment, byte[] segmentID, IOContext context)
|
||||
throws IOException {
|
||||
final String fileName = IndexFileNames.segmentFileName(segment, "", SI_EXTENSION);
|
||||
try (ChecksumIndexInput input = dir.openChecksumInput(fileName)) {
|
||||
Throwable priorE = null;
|
||||
SegmentInfo si = null;
|
||||
try {
|
||||
CodecUtil.checkIndexHeader(
|
||||
input, CODEC_NAME, VERSION_START, VERSION_CURRENT, segmentID, "");
|
||||
si = parseSegmentInfo(dir, input, segment, segmentID);
|
||||
} catch (Throwable exception) {
|
||||
priorE = exception;
|
||||
} finally {
|
||||
CodecUtil.checkFooter(input, priorE);
|
||||
}
|
||||
return si;
|
||||
}
|
||||
}
|
||||
|
||||
private SegmentInfo parseSegmentInfo(
|
||||
Directory dir, DataInput input, String segment, byte[] segmentID) throws IOException {
|
||||
final Version version = Version.fromBits(input.readInt(), input.readInt(), input.readInt());
|
||||
byte hasMinVersion = input.readByte();
|
||||
final Version minVersion;
|
||||
switch (hasMinVersion) {
|
||||
case 0:
|
||||
minVersion = null;
|
||||
break;
|
||||
case 1:
|
||||
minVersion = Version.fromBits(input.readInt(), input.readInt(), input.readInt());
|
||||
break;
|
||||
default:
|
||||
throw new CorruptIndexException("Illegal boolean value " + hasMinVersion, input);
|
||||
}
|
||||
|
||||
final int docCount = input.readInt();
|
||||
if (docCount < 0) {
|
||||
throw new CorruptIndexException("invalid docCount: " + docCount, input);
|
||||
}
|
||||
final boolean isCompoundFile = input.readByte() == SegmentInfo.YES;
|
||||
final boolean hasBlocks = input.readByte() == SegmentInfo.YES;
|
||||
|
||||
final Map<String, String> diagnostics = input.readMapOfStrings();
|
||||
final Set<String> files = input.readSetOfStrings();
|
||||
final Map<String, String> attributes = input.readMapOfStrings();
|
||||
|
||||
int numSortFields = input.readVInt();
|
||||
Sort indexSort;
|
||||
if (numSortFields > 0) {
|
||||
SortField[] sortFields = new SortField[numSortFields];
|
||||
for (int i = 0; i < numSortFields; i++) {
|
||||
String name = input.readString();
|
||||
sortFields[i] = SortFieldProvider.forName(name).readSortField(input);
|
||||
}
|
||||
indexSort = new Sort(sortFields);
|
||||
} else if (numSortFields < 0) {
|
||||
throw new CorruptIndexException("invalid index sort field count: " + numSortFields, input);
|
||||
} else {
|
||||
indexSort = null;
|
||||
}
|
||||
|
||||
SegmentInfo si =
|
||||
new SegmentInfo(
|
||||
dir,
|
||||
version,
|
||||
minVersion,
|
||||
segment,
|
||||
docCount,
|
||||
isCompoundFile,
|
||||
hasBlocks,
|
||||
null,
|
||||
diagnostics,
|
||||
segmentID,
|
||||
attributes,
|
||||
indexSort);
|
||||
si.setFiles(files);
|
||||
return si;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOException {
|
||||
final String fileName = IndexFileNames.segmentFileName(si.name, "", SI_EXTENSION);
|
||||
|
||||
try (IndexOutput output = dir.createOutput(fileName, ioContext)) {
|
||||
// Only add the file once we've successfully created it, else IFD assert can trip:
|
||||
si.addFile(fileName);
|
||||
CodecUtil.writeIndexHeader(output, CODEC_NAME, VERSION_CURRENT, si.getId(), "");
|
||||
|
||||
writeSegmentInfo(output, si);
|
||||
|
||||
CodecUtil.writeFooter(output);
|
||||
}
|
||||
}
|
||||
|
||||
private void writeSegmentInfo(DataOutput output, SegmentInfo si) throws IOException {
|
||||
Version version = si.getVersion();
|
||||
if (version.major < 7) {
|
||||
throw new IllegalArgumentException(
|
||||
"invalid major version: should be >= 7 but got: " + version.major + " segment=" + si);
|
||||
}
|
||||
// Write the Lucene version that created this segment, since 3.1
|
||||
output.writeInt(version.major);
|
||||
output.writeInt(version.minor);
|
||||
output.writeInt(version.bugfix);
|
||||
|
||||
// Write the min Lucene version that contributed docs to the segment, since 7.0
|
||||
if (si.getMinVersion() != null) {
|
||||
output.writeByte((byte) 1);
|
||||
Version minVersion = si.getMinVersion();
|
||||
output.writeInt(minVersion.major);
|
||||
output.writeInt(minVersion.minor);
|
||||
output.writeInt(minVersion.bugfix);
|
||||
} else {
|
||||
output.writeByte((byte) 0);
|
||||
}
|
||||
|
||||
assert version.prerelease == 0;
|
||||
output.writeInt(si.maxDoc());
|
||||
|
||||
output.writeByte((byte) (si.getUseCompoundFile() ? SegmentInfo.YES : SegmentInfo.NO));
|
||||
output.writeByte((byte) (si.getHasBlocks() ? SegmentInfo.YES : SegmentInfo.NO));
|
||||
output.writeMapOfStrings(si.getDiagnostics());
|
||||
Set<String> files = si.files();
|
||||
for (String file : files) {
|
||||
if (!IndexFileNames.parseSegmentName(file).equals(si.name)) {
|
||||
throw new IllegalArgumentException(
|
||||
"invalid files: expected segment=" + si.name + ", got=" + files);
|
||||
}
|
||||
}
|
||||
output.writeSetOfStrings(files);
|
||||
output.writeMapOfStrings(si.getAttributes());
|
||||
|
||||
Sort indexSort = si.getIndexSort();
|
||||
int numSortFields = indexSort == null ? 0 : indexSort.getSort().length;
|
||||
output.writeVInt(numSortFields);
|
||||
for (int i = 0; i < numSortFields; ++i) {
|
||||
SortField sortField = indexSort.getSort()[i];
|
||||
IndexSorter sorter = sortField.getIndexSorter();
|
||||
if (sorter == null) {
|
||||
throw new IllegalArgumentException("cannot serialize SortField " + sortField);
|
||||
}
|
||||
output.writeString(sorter.getProviderName());
|
||||
SortFieldProvider.write(sortField, output);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,425 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Lucene 9.9 file format.
|
||||
*
|
||||
* <h2>Apache Lucene - Index File Formats</h2>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <ul>
|
||||
* <li><a href="#Introduction">Introduction</a>
|
||||
* <li><a href="#Definitions">Definitions</a>
|
||||
* <ul>
|
||||
* <li><a href="#Inverted_Indexing">Inverted Indexing</a>
|
||||
* <li><a href="#Types_of_Fields">Types of Fields</a>
|
||||
* <li><a href="#Segments">Segments</a>
|
||||
* <li><a href="#Document_Numbers">Document Numbers</a>
|
||||
* </ul>
|
||||
* <li><a href="#Overview">Index Structure Overview</a>
|
||||
* <li><a href="#File_Naming">File Naming</a>
|
||||
* <li><a href="#file-names">Summary of File Extensions</a>
|
||||
* <ul>
|
||||
* <li><a href="#Lock_File">Lock File</a>
|
||||
* <li><a href="#History">History</a>
|
||||
* <li><a href="#Limitations">Limitations</a>
|
||||
* </ul>
|
||||
* </ul>
|
||||
*
|
||||
* </div> <a id="Introduction"></a>
|
||||
*
|
||||
* <h3>Introduction</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>This document defines the index file formats used in this version of Lucene. If you are using
|
||||
* a different version of Lucene, please consult the copy of <code>docs/</code> that was distributed
|
||||
* with the version you are using.
|
||||
*
|
||||
* <p>This document attempts to provide a high-level definition of the Apache Lucene file formats.
|
||||
* </div> <a id="Definitions"></a>
|
||||
*
|
||||
* <h3>Definitions</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>The fundamental concepts in Lucene are index, document, field and term.
|
||||
*
|
||||
* <p>An index contains a sequence of documents.
|
||||
*
|
||||
* <ul>
|
||||
* <li>A document is a sequence of fields.
|
||||
* <li>A field is a named sequence of terms.
|
||||
* <li>A term is a sequence of bytes.
|
||||
* </ul>
|
||||
*
|
||||
* <p>The same sequence of bytes in two different fields is considered a different term. Thus terms
|
||||
* are represented as a pair: the string naming the field, and the bytes within the field. <a
|
||||
* id="Inverted_Indexing"></a>
|
||||
*
|
||||
* <h4>Inverted Indexing</h4>
|
||||
*
|
||||
* <p>Lucene's index stores terms and statistics about those terms in order to make term-based
|
||||
* search more efficient. Lucene's terms index falls into the family of indexes known as an
|
||||
* <i>inverted index.</i> This is because it can list, for a term, the documents that contain it.
|
||||
* This is the inverse of the natural relationship, in which documents list terms. <a
|
||||
* id="Types_of_Fields"></a>
|
||||
*
|
||||
* <h4>Types of Fields</h4>
|
||||
*
|
||||
* <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored in the index
|
||||
* literally, in a non-inverted manner. Fields that are inverted are called <i>indexed</i>. A field
|
||||
* may be both stored and indexed.
|
||||
*
|
||||
* <p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the text of a field
|
||||
* may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is
|
||||
* useful for certain identifier fields to be indexed literally.
|
||||
*
|
||||
* <p>See the {@link org.apache.lucene.document.Field Field} java docs for more information on
|
||||
* Fields. <a id="Segments"></a>
|
||||
*
|
||||
* <h4>Segments</h4>
|
||||
*
|
||||
* <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>. Each segment is a
|
||||
* fully independent index, which could be searched separately. Indexes evolve by:
|
||||
*
|
||||
* <ol>
|
||||
* <li>Creating new segments for newly added documents.
|
||||
* <li>Merging existing segments.
|
||||
* </ol>
|
||||
*
|
||||
* <p>Searches may involve multiple segments and/or multiple indexes, each index potentially
|
||||
* composed of a set of segments. <a id="Document_Numbers"></a>
|
||||
*
|
||||
* <h4>Document Numbers</h4>
|
||||
*
|
||||
* <p>Internally, Lucene refers to documents by an integer <i>document number</i>. The first
|
||||
* document added to an index is numbered zero, and each subsequent document added gets a number one
|
||||
* greater than the previous.
|
||||
*
|
||||
* <p>Note that a document's number may change, so caution should be taken when storing these
|
||||
* numbers outside of Lucene. In particular, numbers may change in the following situations:
|
||||
*
|
||||
* <ul>
|
||||
* <li>
|
||||
* <p>The numbers stored in each segment are unique only within the segment, and must be
|
||||
* converted before they can be used in a larger context. The standard technique is to
|
||||
* allocate each segment a range of values, based on the range of numbers used in that
|
||||
* segment. To convert a document number from a segment to an external value, the segment's
|
||||
* <i>base</i> document number is added. To convert an external value back to a
|
||||
* segment-specific value, the segment is identified by the range that the external value is
|
||||
* in, and the segment's base value is subtracted. For example, two five-document segments
|
||||
* might be combined, so that the first segment has a base value of zero, and the second of
|
||||
* five. Document three from the second segment would have an external value of eight.
|
||||
* <li>
|
||||
* <p>When documents are deleted, gaps are created in the numbering. These are eventually
|
||||
* removed as the index evolves through merging. Deleted documents are dropped when segments
|
||||
* are merged. A freshly-merged segment thus has no gaps in its numbering.
|
||||
* </ul>
|
||||
*
|
||||
* </div> <a id="Overview"></a>
|
||||
*
|
||||
* <h3>Index Structure Overview</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>Each segment index maintains the following:
|
||||
*
|
||||
* <ul>
|
||||
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment info}. This
|
||||
* contains metadata about a segment, such as the number of documents, what files it uses, and
|
||||
* information about how the segment is sorted
|
||||
* <li>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This
|
||||
* contains metadata about the set of named fields used in the index.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}.
|
||||
* This contains, for each document, a list of attribute-value pairs, where the attributes are
|
||||
* field names. These are used to store auxiliary information about the document, such as its
|
||||
* title, url, or an identifier to access a database. The set of stored fields is what is
|
||||
* returned for each hit when searching. This is keyed by document number.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term dictionary}. A
|
||||
* dictionary containing all of the terms used in all of the indexed fields of all of the
|
||||
* documents. The dictionary also contains the number of documents which contain the term, and
|
||||
* pointers to the term's frequency and proximity data.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Frequency data}. For
|
||||
* each term in the dictionary, the numbers of all the documents that contain that term, and
|
||||
* the frequency of the term in that document, unless frequencies are omitted ({@link
|
||||
* org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Proximity data}. For
|
||||
* each term in the dictionary, the positions that the term occurs in each document. Note that
|
||||
* this will not exist if all fields in all documents omit position data.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For
|
||||
* each field in each document, a value is stored that is multiplied into the score for hits
|
||||
* on that field.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each
|
||||
* field in each document, the term vector (sometimes called document vector) may be stored. A
|
||||
* term vector consists of term text and term frequency. To add Term Vectors to your index see
|
||||
* the {@link org.apache.lucene.document.Field Field} constructors
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like
|
||||
* stored values, these are also keyed by document number, but are generally intended to be
|
||||
* loaded into main memory for fast access. Whereas stored values are generally intended for
|
||||
* summary results from searches, per-document values are useful for things like scoring
|
||||
* factors.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An
|
||||
* optional file indicating which documents are live.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair
|
||||
* of files, recording dimensionally indexed fields, to enable fast numeric range filtering
|
||||
* and large numeric values like BigInteger and BigDecimal (1D) and geographic shape
|
||||
* intersection (2D, 3D).
|
||||
* <li>{@link org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat Vector values}. The
|
||||
* vector format stores numeric vectors in a format optimized for random access and
|
||||
* computation, supporting high-dimensional nearest-neighbor search.
|
||||
* </ul>
|
||||
*
|
||||
* <p>Details on each of these are provided in their linked pages. </div> <a id="File_Naming"></a>
|
||||
*
|
||||
* <h3>File Naming</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>All files belonging to a segment have the same name with varying extensions. The extensions
|
||||
* correspond to the different file formats described below. When using the Compound File format
|
||||
* (default for small segments) these files (except for the Segment info file, the Lock file, and
|
||||
* Deleted documents file) are collapsed into a single .cfs file (see below for details).
|
||||
*
|
||||
* <p>Typically, all segments in an index are stored in a single directory, although this is not
|
||||
* required.
|
||||
*
|
||||
* <p>File names are never re-used. That is, when any file is saved to the Directory it is given a
|
||||
* never before used filename. This is achieved using a simple generations approach. For example,
|
||||
* the first segments file is segments_1, then segments_2, etc. The generation is a sequential long
|
||||
* integer represented in alpha-numeric (base 36) form. </div> <a id="file-names"></a>
|
||||
*
|
||||
* <h3>Summary of File Extensions</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>The following table summarizes the names and extensions of the files in Lucene:
|
||||
*
|
||||
* <table class="padding4" style="border-spacing: 1px; border-collapse: separate">
|
||||
* <caption>lucene filenames by extension</caption>
|
||||
* <tr>
|
||||
* <th>Name</th>
|
||||
* <th>Extension</th>
|
||||
* <th>Brief Description</th>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
|
||||
* <td>segments_N</td>
|
||||
* <td>Stores information about a commit point</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td><a href="#Lock_File">Lock File</a></td>
|
||||
* <td>write.lock</td>
|
||||
* <td>The Write lock prevents multiple IndexWriters from writing to the same
|
||||
* file.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment Info}</td>
|
||||
* <td>.si</td>
|
||||
* <td>Stores metadata about a segment</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File}</td>
|
||||
* <td>.cfs, .cfe</td>
|
||||
* <td>An optional "virtual" file consisting of all the other index files for
|
||||
* systems that frequently run out of file handles.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields}</td>
|
||||
* <td>.fnm</td>
|
||||
* <td>Stores information about the fields</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index}</td>
|
||||
* <td>.fdx</td>
|
||||
* <td>Contains pointers to field data</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data}</td>
|
||||
* <td>.fdt</td>
|
||||
* <td>The stored fields for documents</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Dictionary}</td>
|
||||
* <td>.tim</td>
|
||||
* <td>The term dictionary, stores term info</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Index}</td>
|
||||
* <td>.tip</td>
|
||||
* <td>The index into the Term Dictionary</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Frequencies}</td>
|
||||
* <td>.doc</td>
|
||||
* <td>Contains the list of docs which contain each term along with frequency</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Positions}</td>
|
||||
* <td>.pos</td>
|
||||
* <td>Stores position information about where a term occurs in the index</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Payloads}</td>
|
||||
* <td>.pay</td>
|
||||
* <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms}</td>
|
||||
* <td>.nvd, .nvm</td>
|
||||
* <td>Encodes length and boost factors for docs and fields</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values}</td>
|
||||
* <td>.dvd, .dvm</td>
|
||||
* <td>Encodes additional scoring factors or other per-document information.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}</td>
|
||||
* <td>.tvx</td>
|
||||
* <td>Stores offset into the document data file</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}</td>
|
||||
* <td>.tvd</td>
|
||||
* <td>Contains term vector data.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents}</td>
|
||||
* <td>.liv</td>
|
||||
* <td>Info about what documents are live</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}</td>
|
||||
* <td>.dii, .dim</td>
|
||||
* <td>Holds indexed points</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat Vector values}</td>
|
||||
* <td>.vec, .vem</td>
|
||||
* <td>Holds indexed vectors; <code>.vec</code> files contain the raw vector data, and
|
||||
* <code>.vem</code> the vector metadata</td>
|
||||
* </tr>
|
||||
* </table>
|
||||
*
|
||||
* </div> <a id="Lock_File"></a>
|
||||
*
|
||||
* <h3>Lock File</h3>
|
||||
*
|
||||
* The write lock, which is stored in the index directory by default, is named "write.lock". If the
|
||||
* lock directory is different from the index directory then the write lock will be named
|
||||
* "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index
|
||||
* directory. When this file is present, a writer is currently modifying the index (adding or
|
||||
* removing documents). This lock file ensures that only one writer is modifying the index at a
|
||||
* time. <a id="History"></a>
|
||||
*
|
||||
* <h3>History</h3>
|
||||
*
|
||||
* <p>Compatibility notes are provided in this document, describing how file formats have changed
|
||||
* from prior versions:
|
||||
*
|
||||
* <ul>
|
||||
* <li>In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit
|
||||
* lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching
|
||||
* or adding/deleting of docs. When the new segments file is saved (committed), it will be
|
||||
* written in the new file format (meaning no specific "upgrade" process is needed). But note
|
||||
* that once a commit has occurred, pre-2.1 Lucene will not be able to read the index.
|
||||
* <li>In version 2.3, the file format was changed to allow segments to share a single set of doc
|
||||
* store (vectors &amp; stored fields) files. This allows for faster indexing in certain
|
||||
* cases. The change is fully backwards compatible (in the same way as the lock-less commits
|
||||
* change in 2.1).
|
||||
* <li>In version 2.4, Strings are now written as true UTF-8 byte sequence, not Java's modified
|
||||
* UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">LUCENE-510</a> for
|
||||
* details.
|
||||
* <li>In version 2.9, an optional opaque Map&lt;String,String&gt; CommitUserData may be passed to
|
||||
* IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N
|
||||
* file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">LUCENE-1382</a> for
|
||||
* details. Also, diagnostics were added to each segment written recording details about why
|
||||
* it was written (due to flush, merge; which OS/JRE was used; etc.). See issue <a
|
||||
* href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.
|
||||
* <li>In version 3.0, compressed fields are no longer written to the index (they can still be
|
||||
* read, but on merge the new segment will write them, uncompressed). See issue <a
|
||||
* href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a> for details.
|
||||
* <li>In version 3.1, segments record the code version that created them. See <a
|
||||
* href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
|
||||
* Additionally segments track explicitly whether or not they have term vectors. See <a
|
||||
* href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a> for details.
|
||||
* <li>In version 3.2, numeric fields are written natively to the stored fields file; previously
|
||||
* they were stored in text format only.
|
||||
* <li>In version 3.4, fields can omit position data while still indexing term frequencies.
|
||||
* <li>In version 4.0, the format of the inverted index became extensible via the {@link
|
||||
* org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues})
|
||||
* was introduced. Normalization factors need no longer be a single byte, they can be any
|
||||
* {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be
|
||||
* unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into
|
||||
* the postings lists. Payloads can be stored in the term vectors.
|
||||
* <li>In version 4.1, the format of the postings list changed to use either of FOR compression or
|
||||
* variable-byte encoding, depending upon the frequency of the term. Terms appearing only once
|
||||
* were changed to inline directly into the term dictionary. Stored fields are compressed by
|
||||
* default.
|
||||
* <li>In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued
|
||||
* type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields.
|
||||
* <li>In version 4.5, DocValues were extended to explicitly represent missing values.
|
||||
* <li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
|
||||
* allow updating NumericDocValues fields.
|
||||
* <li>In version 4.8, checksum footers were added to the end of each index file for improved data
|
||||
* integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32
|
||||
* checksum of the file.
|
||||
* <li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is
|
||||
* suitable for faceting/sorting/analytics.
|
||||
* <li>In version 5.4, DocValues have been improved to store more information on disk: addresses
|
||||
* for binary fields and ord indexes for multi-valued fields.
|
||||
* <li>In version 6.0, Points were added, for multi-dimensional range/distance search.
|
||||
* <li>In version 6.2, new Segment info format that reads/writes the index sort, to support index
|
||||
* sorting.
|
||||
* <li>In version 7.0, DocValues have been improved to better support sparse doc values thanks to
|
||||
* an iterator API.
|
||||
* <li>In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term
|
||||
* freq, normalization factor) pairs that may trigger the maximum score of the block. This
|
||||
* information is recorded alongside skip data in order to be able to skip blocks of doc ids
|
||||
* if they may not produce high enough scores. Additionally, doc values and norms have been
|
||||
* extended with jump-tables to make access O(1) instead of O(n), where n is the number of
|
||||
* elements to skip when advancing in the data.
|
||||
* <li>In version 8.4, postings, positions, offsets and payload lengths have moved to a more
|
||||
* performant encoding that is vectorized.
|
||||
* <li>In version 8.6, index sort serialization is delegated to the sorts themselves, to allow
|
||||
* user-defined sorts to be used.
|
||||
* <li>In version 8.7, stored fields compression became adaptive to better handle documents with
|
||||
* smaller stored fields.
|
||||
* <li>In version 9.0, vector-valued fields were added.
|
||||
* <li>In version 9.1, vector-valued fields were modified to add a graph hierarchy.
|
||||
* <li>In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by
|
||||
* IndexDISI. ordToDoc mappings were added to .vem.
|
||||
* <li>In version 9.5, HNSW graph connections were changed to be delta-encoded with vints.
|
||||
* Additionally, metadata file size improvements were made by delta-encoding nodes by graph
|
||||
* layer and not writing the node ids for the zeroth layer.
|
||||
* </ul>
|
||||
*
|
||||
* <a id="Limitations"></a>
|
||||
*
|
||||
* <h3>Limitations</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>Lucene uses a Java <code>int</code> to refer to document numbers, and the index file format
|
||||
* uses an <code>Int32</code> on-disk to store document numbers. This is a limitation of both the
|
||||
* index file format and the current implementation. Eventually these should be replaced with either
|
||||
* <code>UInt64</code> values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
|
||||
* VInt} values which have no limit. </div>
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
|
@ -164,6 +164,7 @@ final class DocumentsWriterPerThread implements Accountable {
|
|||
segmentName,
|
||||
-1,
|
||||
false,
|
||||
false,
|
||||
codec,
|
||||
Collections.emptyMap(),
|
||||
StringHelper.randomId(),
|
||||
|
@ -245,6 +246,9 @@ final class DocumentsWriterPerThread implements Accountable {
|
|||
}
|
||||
}
|
||||
allDocsIndexed = true;
|
||||
if (numDocsInRAM - docsInRamBefore > 1) {
|
||||
segmentInfo.setHasBlocks();
|
||||
}
|
||||
return finishDocuments(deleteNode, docsInRamBefore);
|
||||
} finally {
|
||||
if (!allDocsIndexed && !aborted) {
|
||||
|
@ -636,7 +640,7 @@ final class DocumentsWriterPerThread implements Accountable {
|
|||
return "DocumentsWriterPerThread [pendingDeletes="
|
||||
+ pendingUpdates
|
||||
+ ", segment="
|
||||
+ (segmentInfo != null ? segmentInfo.name : "null")
|
||||
+ segmentInfo.name
|
||||
+ ", aborted="
|
||||
+ aborted
|
||||
+ ", numDocsInRAM="
|
||||
|
|
|
@ -3368,9 +3368,13 @@ public class IndexWriter
|
|||
String mergedName = newSegmentName();
|
||||
Directory mergeDirectory = mergeScheduler.wrapForMerge(merge, directory);
|
||||
int numSoftDeleted = 0;
|
||||
boolean hasBlocks = false;
|
||||
for (MergePolicy.MergeReader reader : merge.getMergeReader()) {
|
||||
CodecReader leaf = reader.codecReader;
|
||||
numDocs += leaf.numDocs();
|
||||
for (LeafReaderContext context : reader.codecReader.leaves()) {
|
||||
hasBlocks |= context.reader().getMetaData().hasBlocks();
|
||||
}
|
||||
if (softDeletesEnabled) {
|
||||
Bits liveDocs = reader.hardLiveDocs;
|
||||
numSoftDeleted +=
|
||||
|
@ -3398,6 +3402,7 @@ public class IndexWriter
|
|||
mergedName,
|
||||
-1,
|
||||
false,
|
||||
hasBlocks,
|
||||
codec,
|
||||
Collections.emptyMap(),
|
||||
StringHelper.randomId(),
|
||||
|
@ -3479,6 +3484,7 @@ public class IndexWriter
|
|||
segName,
|
||||
info.info.maxDoc(),
|
||||
info.info.getUseCompoundFile(),
|
||||
info.info.getHasBlocks(),
|
||||
info.info.getCodec(),
|
||||
info.info.getDiagnostics(),
|
||||
info.info.getId(),
|
||||
|
@ -4926,7 +4932,13 @@ public class IndexWriter
|
|||
if (readerPool.writeDocValuesUpdatesForMerge(merge.segments)) {
|
||||
checkpoint();
|
||||
}
|
||||
|
||||
boolean hasBlocks = false;
|
||||
for (SegmentCommitInfo info : merge.segments) {
|
||||
if (info.info.getHasBlocks()) {
|
||||
hasBlocks = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Bind a new segment name here so even with
|
||||
// ConcurrentMergeScheduler we keep deterministic segment
|
||||
// names.
|
||||
|
@ -4940,6 +4952,7 @@ public class IndexWriter
|
|||
mergeSegmentName,
|
||||
-1,
|
||||
false,
|
||||
hasBlocks,
|
||||
config.getCodec(),
|
||||
Collections.emptyMap(),
|
||||
StringHelper.randomId(),
|
||||
|
|
|
@ -16,6 +16,8 @@
|
|||
*/
|
||||
package org.apache.lucene.index;
|
||||
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
|
@ -29,9 +31,10 @@ public final class LeafMetaData {
|
|||
private final int createdVersionMajor;
|
||||
private final Version minVersion;
|
||||
private final Sort sort;
|
||||
private final boolean hasBlocks;
|
||||
|
||||
/** Expert: Sole constructor. Public for use by custom {@link LeafReader} impls. */
|
||||
public LeafMetaData(int createdVersionMajor, Version minVersion, Sort sort) {
|
||||
public LeafMetaData(int createdVersionMajor, Version minVersion, Sort sort, boolean hasBlocks) {
|
||||
this.createdVersionMajor = createdVersionMajor;
|
||||
if (createdVersionMajor > Version.LATEST.major) {
|
||||
throw new IllegalArgumentException(
|
||||
|
@ -46,6 +49,7 @@ public final class LeafMetaData {
|
|||
}
|
||||
this.minVersion = minVersion;
|
||||
this.sort = sort;
|
||||
this.hasBlocks = hasBlocks;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -72,4 +76,19 @@ public final class LeafMetaData {
|
|||
public Sort getSort() {
|
||||
return sort;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns <code>true</code> iff this index contains blocks created with {@link
|
||||
* IndexWriter#addDocuments(Iterable)} or its corresponding update methods with at least 2
|
||||
* documents per call. Note: this property was not recorded before {@link Version#LUCENE_9_9_0};
|
||||
* this method will return false for all leaves written before {@link Version#LUCENE_9_9_0}.
|
||||
*
|
||||
* @see IndexWriter#updateDocuments(Term, Iterable)
|
||||
* @see IndexWriter#updateDocuments(Query, Iterable)
|
||||
* @see IndexWriter#softUpdateDocuments(Term, Iterable, Field...)
|
||||
* @see IndexWriter#addDocuments(Iterable)
|
||||
*/
|
||||
public boolean hasBlocks() {
|
||||
return hasBlocks;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -169,9 +169,10 @@ public class ParallelLeafReader extends LeafReader {
|
|||
}
|
||||
|
||||
Version minVersion = Version.LATEST;
|
||||
boolean hasBlocks = false;
|
||||
for (final LeafReader reader : this.parallelReaders) {
|
||||
Version leafVersion = reader.getMetaData().getMinVersion();
|
||||
|
||||
hasBlocks |= reader.getMetaData().hasBlocks();
|
||||
if (leafVersion == null) {
|
||||
minVersion = null;
|
||||
break;
|
||||
|
@ -181,7 +182,7 @@ public class ParallelLeafReader extends LeafReader {
|
|||
}
|
||||
|
||||
fieldInfos = builder.finish();
|
||||
this.metaData = new LeafMetaData(createdVersionMajor, minVersion, indexSort);
|
||||
this.metaData = new LeafMetaData(createdVersionMajor, minVersion, indexSort, hasBlocks);
|
||||
|
||||
// do this last so that any exceptions thrown earlier don't affect refcounts:
|
||||
for (LeafReader reader : completeReaderSet) {
|
||||
|
|
|
@ -81,6 +81,8 @@ public final class SegmentInfo {
|
|||
// into this segment
|
||||
Version minVersion;
|
||||
|
||||
private boolean hasBlocks;
|
||||
|
||||
void setDiagnostics(Map<String, String> diagnostics) {
|
||||
this.diagnostics = Map.copyOf(Objects.requireNonNull(diagnostics));
|
||||
}
|
||||
|
@ -117,6 +119,7 @@ public final class SegmentInfo {
|
|||
String name,
|
||||
int maxDoc,
|
||||
boolean isCompoundFile,
|
||||
boolean hasBlocks,
|
||||
Codec codec,
|
||||
Map<String, String> diagnostics,
|
||||
byte[] id,
|
||||
|
@ -129,6 +132,7 @@ public final class SegmentInfo {
|
|||
this.name = Objects.requireNonNull(name);
|
||||
this.maxDoc = maxDoc;
|
||||
this.isCompoundFile = isCompoundFile;
|
||||
this.hasBlocks = hasBlocks;
|
||||
this.codec = codec;
|
||||
this.diagnostics = Map.copyOf(Objects.requireNonNull(diagnostics));
|
||||
this.id = id;
|
||||
|
@ -153,6 +157,20 @@ public final class SegmentInfo {
|
|||
return isCompoundFile;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if this segment contains documents written as blocks.
|
||||
*
|
||||
* @see LeafMetaData#hasBlocks()
|
||||
*/
|
||||
public boolean getHasBlocks() {
|
||||
return hasBlocks;
|
||||
}
|
||||
|
||||
/** Sets the hasBlocks property to true. This setting is viral and can't be unset. */
|
||||
void setHasBlocks() {
|
||||
hasBlocks = true;
|
||||
}
|
||||
|
||||
/** Can only be called once. */
|
||||
public void setCodec(Codec codec) {
|
||||
assert this.codec == null;
|
||||
|
|
|
@ -83,7 +83,11 @@ public final class SegmentReader extends CodecReader {
|
|||
this.si = si.clone();
|
||||
this.originalSi = si;
|
||||
this.metaData =
|
||||
new LeafMetaData(createdVersionMajor, si.info.getMinVersion(), si.info.getIndexSort());
|
||||
new LeafMetaData(
|
||||
createdVersionMajor,
|
||||
si.info.getMinVersion(),
|
||||
si.info.getIndexSort(),
|
||||
si.info.getHasBlocks());
|
||||
|
||||
// We pull liveDocs/DV updates from disk:
|
||||
this.isNRT = false;
|
||||
|
|
|
@ -336,7 +336,11 @@ public final class SortingCodecReader extends FilterCodecReader {
|
|||
public static CodecReader wrap(CodecReader reader, Sorter.DocMap docMap, Sort sort) {
|
||||
LeafMetaData metaData = reader.getMetaData();
|
||||
LeafMetaData newMetaData =
|
||||
new LeafMetaData(metaData.getCreatedVersionMajor(), metaData.getMinVersion(), sort);
|
||||
new LeafMetaData(
|
||||
metaData.getCreatedVersionMajor(),
|
||||
metaData.getMinVersion(),
|
||||
sort,
|
||||
metaData.hasBlocks());
|
||||
if (docMap == null) {
|
||||
// the reader is already sorted
|
||||
return new FilterCodecReader(reader) {
|
||||
|
|
|
@ -13,4 +13,4 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
org.apache.lucene.codecs.lucene95.Lucene95Codec
|
||||
org.apache.lucene.codecs.lucene99.Lucene99Codec
|
||||
|
|
|
@ -18,8 +18,7 @@ package org.apache.lucene.codecs.lucene90;
|
|||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.lucene95.Lucene95Codec;
|
||||
import org.apache.lucene.codecs.lucene95.Lucene95Codec.Mode;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.StoredField;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
|
@ -32,7 +31,7 @@ import org.apache.lucene.tests.index.BaseStoredFieldsFormatTestCase;
|
|||
public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase {
|
||||
@Override
|
||||
protected Codec getCodec() {
|
||||
return new Lucene95Codec(Mode.BEST_COMPRESSION);
|
||||
return new Lucene99Codec(Lucene99Codec.Mode.BEST_COMPRESSION);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -43,7 +42,7 @@ public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFie
|
|||
for (int i = 0; i < 10; i++) {
|
||||
IndexWriterConfig iwc = newIndexWriterConfig();
|
||||
iwc.setCodec(
|
||||
new Lucene95Codec(RandomPicks.randomFrom(random(), Lucene95Codec.Mode.values())));
|
||||
new Lucene99Codec(RandomPicks.randomFrom(random(), Lucene99Codec.Mode.values())));
|
||||
IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig());
|
||||
Document doc = new Document();
|
||||
doc.add(new StoredField("field1", "value1"));
|
||||
|
@ -73,7 +72,7 @@ public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFie
|
|||
expectThrows(
|
||||
NullPointerException.class,
|
||||
() -> {
|
||||
new Lucene95Codec(null);
|
||||
new Lucene99Codec(null);
|
||||
});
|
||||
|
||||
expectThrows(
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
package org.apache.lucene.codecs.lucene95;
|
||||
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.FilterCodec;
|
||||
import org.apache.lucene.codecs.KnnVectorsFormat;
|
||||
import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase;
|
||||
import org.apache.lucene.tests.util.TestUtil;
|
||||
|
@ -28,16 +29,16 @@ public class TestLucene95HnswVectorsFormat extends BaseKnnVectorsFormatTestCase
|
|||
}
|
||||
|
||||
public void testToString() {
|
||||
Lucene95Codec customCodec =
|
||||
new Lucene95Codec() {
|
||||
FilterCodec customCodec =
|
||||
new FilterCodec("foo", Codec.getDefault()) {
|
||||
@Override
|
||||
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
|
||||
public KnnVectorsFormat knnVectorsFormat() {
|
||||
return new Lucene95HnswVectorsFormat(10, 20);
|
||||
}
|
||||
};
|
||||
String expectedString =
|
||||
"Lucene95HnswVectorsFormat(name=Lucene95HnswVectorsFormat, maxConn=10, beamWidth=20)";
|
||||
assertEquals(expectedString, customCodec.getKnnVectorsFormatForField("bogus_field").toString());
|
||||
assertEquals(expectedString, customCodec.knnVectorsFormat().toString());
|
||||
}
|
||||
|
||||
public void testLimits() {
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.tests.index.BaseSegmentInfoFormatTestCase;
|
||||
import org.apache.lucene.tests.util.TestUtil;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
public class TestLucene99SegmentInfoFormat extends BaseSegmentInfoFormatTestCase {
|
||||
|
||||
@Override
|
||||
protected Version[] getVersions() {
|
||||
return new Version[] {Version.LATEST};
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Codec getCodec() {
|
||||
return TestUtil.getDefaultCodec();
|
||||
}
|
||||
}
|
|
@ -1815,4 +1815,75 @@ public class TestAddIndexes extends LuceneTestCase {
|
|||
assertEquals(wrappedReader.numDocs(), writer.getDocStats().maxDoc);
|
||||
IOUtils.close(reader, writer, dir3, dir2, dir1);
|
||||
}
|
||||
|
||||
public void testAddIndicesWithBlocks() throws IOException {
|
||||
boolean[] addHasBlocksPerm = {true, true, false, false};
|
||||
boolean[] baseHasBlocksPerm = {true, false, true, false};
|
||||
for (int perm = 0; perm < addHasBlocksPerm.length; perm++) {
|
||||
boolean addHasBlocks = addHasBlocksPerm[perm];
|
||||
boolean baseHasBlocks = baseHasBlocksPerm[perm];
|
||||
try (Directory dir = newDirectory()) {
|
||||
try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
|
||||
int numBlocks = random().nextInt(1, 10);
|
||||
for (int i = 0; i < numBlocks; i++) {
|
||||
int numDocs = baseHasBlocks ? random().nextInt(2, 10) : 1;
|
||||
List<Document> docs = new ArrayList<>();
|
||||
for (int j = 0; j < numDocs; j++) {
|
||||
Document doc = new Document();
|
||||
int value = random().nextInt(5);
|
||||
doc.add(new StringField("value", "" + value, Field.Store.YES));
|
||||
docs.add(doc);
|
||||
}
|
||||
writer.addDocuments(docs);
|
||||
}
|
||||
writer.commit();
|
||||
}
|
||||
|
||||
try (Directory addDir = newDirectory()) {
|
||||
int numBlocks = random().nextInt(1, 10);
|
||||
try (RandomIndexWriter writer = new RandomIndexWriter(random(), addDir)) {
|
||||
for (int i = 0; i < numBlocks; i++) {
|
||||
int numDocs = addHasBlocks ? random().nextInt(2, 10) : 1;
|
||||
List<Document> docs = new ArrayList<>();
|
||||
for (int j = 0; j < numDocs; j++) {
|
||||
Document doc = new Document();
|
||||
int value = random().nextInt(5);
|
||||
doc.add(new StringField("value", "" + value, Field.Store.YES));
|
||||
docs.add(doc);
|
||||
}
|
||||
writer.addDocuments(docs);
|
||||
}
|
||||
writer.commit();
|
||||
}
|
||||
|
||||
try (IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig())) {
|
||||
if (random().nextBoolean()) {
|
||||
writer.addIndexes(addDir);
|
||||
} else {
|
||||
try (DirectoryReader reader = DirectoryReader.open(addDir)) {
|
||||
CodecReader[] readers = new CodecReader[(reader.leaves().size())];
|
||||
for (int i = 0; i < readers.length; i++) {
|
||||
readers[i] = (CodecReader) reader.leaves().get(i).reader();
|
||||
}
|
||||
writer.addIndexes(readers);
|
||||
}
|
||||
}
|
||||
writer.forceMerge(1, true);
|
||||
}
|
||||
|
||||
try (DirectoryReader reader = DirectoryReader.open(dir)) {
|
||||
SegmentReader codecReader = (SegmentReader) reader.leaves().get(0).reader();
|
||||
assertEquals(1, reader.leaves().size());
|
||||
if (addHasBlocks || baseHasBlocks) {
|
||||
assertTrue(
|
||||
"addHasBlocks: " + addHasBlocks + " baseHasBlocks: " + baseHasBlocks,
|
||||
codecReader.getSegmentInfo().info.getHasBlocks());
|
||||
} else {
|
||||
assertFalse(codecReader.getSegmentInfo().info.getHasBlocks());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -244,6 +244,7 @@ public class TestCodecs extends LuceneTestCase {
|
|||
SEGMENT,
|
||||
10000,
|
||||
false,
|
||||
false,
|
||||
codec,
|
||||
Collections.emptyMap(),
|
||||
StringHelper.randomId(),
|
||||
|
@ -322,6 +323,7 @@ public class TestCodecs extends LuceneTestCase {
|
|||
SEGMENT,
|
||||
10000,
|
||||
false,
|
||||
false,
|
||||
codec,
|
||||
Collections.emptyMap(),
|
||||
StringHelper.randomId(),
|
||||
|
|
|
@ -223,6 +223,7 @@ public class TestDoc extends LuceneTestCase {
|
|||
merged,
|
||||
-1,
|
||||
false,
|
||||
false,
|
||||
codec,
|
||||
Collections.emptyMap(),
|
||||
StringHelper.randomId(),
|
||||
|
|
|
@ -47,6 +47,7 @@ import java.util.concurrent.atomic.AtomicInteger;
|
|||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
@ -1733,14 +1734,79 @@ public class TestIndexWriter extends LuceneTestCase {
|
|||
d.close();
|
||||
}
|
||||
|
||||
public void testOnlyUpdateDocuments() throws Exception {
|
||||
public void testHasBlocksMergeFullyDelSegments() throws IOException {
|
||||
Supplier<Document> documentSupplier =
|
||||
() -> {
|
||||
Document doc = new Document();
|
||||
doc.add(new StringField("foo", "bar", Field.Store.NO));
|
||||
return doc;
|
||||
};
|
||||
try (Directory dir = newDirectory()) {
|
||||
try (IndexWriter writer =
|
||||
new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())))) {
|
||||
final List<Document> docs = new ArrayList<>();
|
||||
docs.add(documentSupplier.get());
|
||||
docs.add(documentSupplier.get());
|
||||
writer.updateDocuments(new Term("foo", "bar"), docs);
|
||||
writer.commit();
|
||||
if (random().nextBoolean()) {
|
||||
writer.updateDocuments(new Term("foo", "bar"), docs);
|
||||
writer.commit(); // second segment
|
||||
}
|
||||
writer.updateDocument(new Term("foo", "bar"), documentSupplier.get());
|
||||
if (random().nextBoolean()) {
|
||||
writer.forceMergeDeletes(true);
|
||||
} else {
|
||||
writer.forceMerge(1, true);
|
||||
}
|
||||
writer.commit();
|
||||
try (DirectoryReader reader = DirectoryReader.open(dir)) {
|
||||
assertEquals(1, reader.leaves().size());
|
||||
assertFalse(
|
||||
"hasBlocks should be cleared",
|
||||
reader.leaves().get(0).reader().getMetaData().hasBlocks());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void testCarryOverHasBlocks() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));
|
||||
|
||||
final List<Document> docs = new ArrayList<>();
|
||||
docs.add(new Document());
|
||||
w.updateDocuments(new Term("foo", "bar"), docs);
|
||||
w.close();
|
||||
w.commit();
|
||||
try (DirectoryReader reader = DirectoryReader.open(dir)) {
|
||||
SegmentCommitInfo segmentInfo =
|
||||
((SegmentReader) reader.leaves().get(0).reader()).getSegmentInfo();
|
||||
assertFalse(segmentInfo.info.getHasBlocks());
|
||||
}
|
||||
|
||||
docs.add(new Document()); // now we have 2 docs
|
||||
w.updateDocuments(new Term("foo", "bar"), docs);
|
||||
w.commit();
|
||||
try (DirectoryReader reader = DirectoryReader.open(dir)) {
|
||||
assertEquals(2, reader.leaves().size());
|
||||
SegmentCommitInfo segmentInfo =
|
||||
((SegmentReader) reader.leaves().get(0).reader()).getSegmentInfo();
|
||||
assertFalse(
|
||||
"codec: " + segmentInfo.info.getCodec().toString(), segmentInfo.info.getHasBlocks());
|
||||
segmentInfo = ((SegmentReader) reader.leaves().get(1).reader()).getSegmentInfo();
|
||||
assertTrue(
|
||||
"codec: " + segmentInfo.info.getCodec().toString(), segmentInfo.info.getHasBlocks());
|
||||
}
|
||||
w.forceMerge(1, true);
|
||||
w.commit();
|
||||
try (DirectoryReader reader = DirectoryReader.open(dir)) {
|
||||
assertEquals(1, reader.leaves().size());
|
||||
SegmentCommitInfo segmentInfo =
|
||||
((SegmentReader) reader.leaves().get(0).reader()).getSegmentInfo();
|
||||
assertTrue(
|
||||
"codec: " + segmentInfo.info.getCodec().toString(), segmentInfo.info.getHasBlocks());
|
||||
}
|
||||
w.commit();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
|
|
|
@ -29,8 +29,8 @@ import java.util.List;
|
|||
import java.util.Set;
|
||||
import java.util.concurrent.CountDownLatch;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.FilterCodec;
|
||||
import org.apache.lucene.codecs.KnnVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene95.Lucene95Codec;
|
||||
import org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsReader;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
|
||||
|
@ -48,6 +48,7 @@ import org.apache.lucene.search.SearcherManager;
|
|||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||
import org.apache.lucene.tests.util.TestUtil;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
@ -78,10 +79,15 @@ public class TestKnnGraph extends LuceneTestCase {
|
|||
}
|
||||
|
||||
codec =
|
||||
new Lucene95Codec() {
|
||||
new FilterCodec(TestUtil.getDefaultCodec().getName(), TestUtil.getDefaultCodec()) {
|
||||
@Override
|
||||
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
|
||||
return new Lucene95HnswVectorsFormat(M, HnswGraphBuilder.DEFAULT_BEAM_WIDTH);
|
||||
public KnnVectorsFormat knnVectorsFormat() {
|
||||
return new PerFieldKnnVectorsFormat() {
|
||||
@Override
|
||||
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
|
||||
return new Lucene95HnswVectorsFormat(M, HnswGraphBuilder.DEFAULT_BEAM_WIDTH);
|
||||
}
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -90,10 +96,15 @@ public class TestKnnGraph extends LuceneTestCase {
|
|||
vectorEncoding = randomVectorEncoding();
|
||||
|
||||
codec =
|
||||
new Lucene95Codec() {
|
||||
new FilterCodec(TestUtil.getDefaultCodec().getName(), TestUtil.getDefaultCodec()) {
|
||||
@Override
|
||||
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
|
||||
return new Lucene95HnswVectorsFormat(M, HnswGraphBuilder.DEFAULT_BEAM_WIDTH);
|
||||
public KnnVectorsFormat knnVectorsFormat() {
|
||||
return new PerFieldKnnVectorsFormat() {
|
||||
@Override
|
||||
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
|
||||
return new Lucene95HnswVectorsFormat(M, HnswGraphBuilder.DEFAULT_BEAM_WIDTH);
|
||||
}
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -101,10 +112,15 @@ public class TestKnnGraph extends LuceneTestCase {
|
|||
float32Codec = codec;
|
||||
} else {
|
||||
float32Codec =
|
||||
new Lucene95Codec() {
|
||||
new FilterCodec(TestUtil.getDefaultCodec().getName(), TestUtil.getDefaultCodec()) {
|
||||
@Override
|
||||
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
|
||||
return new Lucene95HnswVectorsFormat(M, HnswGraphBuilder.DEFAULT_BEAM_WIDTH);
|
||||
public KnnVectorsFormat knnVectorsFormat() {
|
||||
return new PerFieldKnnVectorsFormat() {
|
||||
@Override
|
||||
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
|
||||
return new Lucene95HnswVectorsFormat(M, HnswGraphBuilder.DEFAULT_BEAM_WIDTH);
|
||||
}
|
||||
};
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
|
@ -149,6 +149,7 @@ public class TestMergePolicy extends LuceneTestCase {
|
|||
TestUtil.randomSimpleString(random()), // name
|
||||
random().nextInt(1000), // maxDoc
|
||||
random().nextBoolean(), // isCompoundFile
|
||||
false,
|
||||
null, // codec
|
||||
Collections.emptyMap(), // diagnostics
|
||||
TestUtil.randomSimpleString( // id
|
||||
|
|
|
@ -131,6 +131,7 @@ public class TestOneMergeWrappingMergePolicy extends LuceneTestCase {
|
|||
TestUtil.randomSimpleString(random()), // name
|
||||
random().nextInt(), // maxDoc
|
||||
random().nextBoolean(), // isCompoundFile
|
||||
false,
|
||||
null, // codec
|
||||
Collections.emptyMap(), // diagnostics
|
||||
TestUtil.randomSimpleString( // id
|
||||
|
|
|
@ -46,6 +46,7 @@ public class TestPendingDeletes extends LuceneTestCase {
|
|||
"test",
|
||||
10,
|
||||
false,
|
||||
false,
|
||||
Codec.getDefault(),
|
||||
Collections.emptyMap(),
|
||||
StringHelper.randomId(),
|
||||
|
@ -92,6 +93,7 @@ public class TestPendingDeletes extends LuceneTestCase {
|
|||
"test",
|
||||
6,
|
||||
false,
|
||||
false,
|
||||
Codec.getDefault(),
|
||||
Collections.emptyMap(),
|
||||
StringHelper.randomId(),
|
||||
|
@ -162,6 +164,7 @@ public class TestPendingDeletes extends LuceneTestCase {
|
|||
"test",
|
||||
3,
|
||||
false,
|
||||
false,
|
||||
Codec.getDefault(),
|
||||
Collections.emptyMap(),
|
||||
StringHelper.randomId(),
|
||||
|
|
|
@ -161,6 +161,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes {
|
|||
"test",
|
||||
10,
|
||||
false,
|
||||
false,
|
||||
Codec.getDefault(),
|
||||
Collections.emptyMap(),
|
||||
StringHelper.randomId(),
|
||||
|
|
|
@ -76,6 +76,7 @@ public class TestSegmentInfos extends LuceneTestCase {
|
|||
"_0",
|
||||
1,
|
||||
false,
|
||||
false,
|
||||
Codec.getDefault(),
|
||||
Collections.<String, String>emptyMap(),
|
||||
id,
|
||||
|
@ -110,6 +111,7 @@ public class TestSegmentInfos extends LuceneTestCase {
|
|||
"_0",
|
||||
1,
|
||||
false,
|
||||
false,
|
||||
Codec.getDefault(),
|
||||
Collections.<String, String>emptyMap(),
|
||||
id,
|
||||
|
@ -129,6 +131,7 @@ public class TestSegmentInfos extends LuceneTestCase {
|
|||
"_1",
|
||||
1,
|
||||
false,
|
||||
false,
|
||||
Codec.getDefault(),
|
||||
Collections.<String, String>emptyMap(),
|
||||
id,
|
||||
|
@ -173,6 +176,7 @@ public class TestSegmentInfos extends LuceneTestCase {
|
|||
"TEST",
|
||||
10000,
|
||||
false,
|
||||
false,
|
||||
codec,
|
||||
Collections.emptyMap(),
|
||||
StringHelper.randomId(),
|
||||
|
@ -191,6 +195,7 @@ public class TestSegmentInfos extends LuceneTestCase {
|
|||
"TEST",
|
||||
10000,
|
||||
false,
|
||||
false,
|
||||
codec,
|
||||
diagnostics,
|
||||
StringHelper.randomId(),
|
||||
|
@ -216,6 +221,7 @@ public class TestSegmentInfos extends LuceneTestCase {
|
|||
"TEST",
|
||||
10000,
|
||||
false,
|
||||
false,
|
||||
codec,
|
||||
Collections.emptyMap(),
|
||||
StringHelper.randomId(),
|
||||
|
@ -241,6 +247,7 @@ public class TestSegmentInfos extends LuceneTestCase {
|
|||
"TEST",
|
||||
10000,
|
||||
false,
|
||||
false,
|
||||
codec,
|
||||
diagnostics,
|
||||
StringHelper.randomId(),
|
||||
|
@ -275,6 +282,7 @@ public class TestSegmentInfos extends LuceneTestCase {
|
|||
"_0",
|
||||
1,
|
||||
false,
|
||||
false,
|
||||
Codec.getDefault(),
|
||||
Collections.<String, String>emptyMap(),
|
||||
StringHelper.randomId(),
|
||||
|
@ -321,6 +329,7 @@ public class TestSegmentInfos extends LuceneTestCase {
|
|||
"_0",
|
||||
1,
|
||||
false,
|
||||
false,
|
||||
Codec.getDefault(),
|
||||
Collections.<String, String>emptyMap(),
|
||||
id,
|
||||
|
@ -340,6 +349,7 @@ public class TestSegmentInfos extends LuceneTestCase {
|
|||
"_1",
|
||||
1,
|
||||
false,
|
||||
false,
|
||||
Codec.getDefault(),
|
||||
Collections.<String, String>emptyMap(),
|
||||
id,
|
||||
|
@ -411,6 +421,7 @@ public class TestSegmentInfos extends LuceneTestCase {
|
|||
"TEST",
|
||||
10000,
|
||||
false,
|
||||
false,
|
||||
codec,
|
||||
diagnostics,
|
||||
StringHelper.randomId(),
|
||||
|
@ -428,6 +439,7 @@ public class TestSegmentInfos extends LuceneTestCase {
|
|||
"TEST",
|
||||
10000,
|
||||
false,
|
||||
false,
|
||||
codec,
|
||||
diagnostics,
|
||||
StringHelper.randomId(),
|
||||
|
|
|
@ -91,6 +91,7 @@ public class TestSegmentMerger extends LuceneTestCase {
|
|||
mergedSegment,
|
||||
-1,
|
||||
false,
|
||||
false,
|
||||
codec,
|
||||
Collections.emptyMap(),
|
||||
StringHelper.randomId(),
|
||||
|
|
|
@ -140,7 +140,7 @@ public class TestSegmentToThreadMapping extends LuceneTestCase {
|
|||
|
||||
@Override
|
||||
public LeafMetaData getMetaData() {
|
||||
return new LeafMetaData(Version.LATEST.major, Version.LATEST, null);
|
||||
return new LeafMetaData(Version.LATEST.major, Version.LATEST, null, false);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -38,8 +38,8 @@ import java.util.concurrent.Future;
|
|||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.lucene.codecs.FilterCodec;
|
||||
import org.apache.lucene.codecs.KnnVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene95.Lucene95Codec;
|
||||
import org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsReader;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
|
||||
|
@ -71,6 +71,7 @@ import org.apache.lucene.search.SortField;
|
|||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||
import org.apache.lucene.tests.util.TestUtil;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BitSet;
|
||||
import org.apache.lucene.util.Bits;
|
||||
|
@ -156,10 +157,17 @@ abstract class HnswGraphTestCase<T> extends LuceneTestCase {
|
|||
IndexWriterConfig iwc =
|
||||
new IndexWriterConfig()
|
||||
.setCodec(
|
||||
new Lucene95Codec() {
|
||||
new FilterCodec(
|
||||
TestUtil.getDefaultCodec().getName(), TestUtil.getDefaultCodec()) {
|
||||
|
||||
@Override
|
||||
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
|
||||
return new Lucene95HnswVectorsFormat(M, beamWidth);
|
||||
public KnnVectorsFormat knnVectorsFormat() {
|
||||
return new PerFieldKnnVectorsFormat() {
|
||||
@Override
|
||||
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
|
||||
return new Lucene95HnswVectorsFormat(M, beamWidth);
|
||||
}
|
||||
};
|
||||
}
|
||||
})
|
||||
// set a random merge policy
|
||||
|
@ -222,10 +230,16 @@ abstract class HnswGraphTestCase<T> extends LuceneTestCase {
|
|||
IndexWriterConfig iwc =
|
||||
new IndexWriterConfig()
|
||||
.setCodec(
|
||||
new Lucene95Codec() {
|
||||
new FilterCodec(
|
||||
TestUtil.getDefaultCodec().getName(), TestUtil.getDefaultCodec()) {
|
||||
@Override
|
||||
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
|
||||
return new Lucene95HnswVectorsFormat(M, beamWidth);
|
||||
public KnnVectorsFormat knnVectorsFormat() {
|
||||
return new PerFieldKnnVectorsFormat() {
|
||||
@Override
|
||||
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
|
||||
return new Lucene95HnswVectorsFormat(M, beamWidth);
|
||||
}
|
||||
};
|
||||
}
|
||||
});
|
||||
try (IndexWriter iw = new IndexWriter(dir, iwc)) {
|
||||
|
@ -278,19 +292,29 @@ abstract class HnswGraphTestCase<T> extends LuceneTestCase {
|
|||
IndexWriterConfig iwc =
|
||||
new IndexWriterConfig()
|
||||
.setCodec(
|
||||
new Lucene95Codec() {
|
||||
new FilterCodec(TestUtil.getDefaultCodec().getName(), TestUtil.getDefaultCodec()) {
|
||||
@Override
|
||||
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
|
||||
return new Lucene95HnswVectorsFormat(M, beamWidth);
|
||||
public KnnVectorsFormat knnVectorsFormat() {
|
||||
return new PerFieldKnnVectorsFormat() {
|
||||
@Override
|
||||
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
|
||||
return new Lucene95HnswVectorsFormat(M, beamWidth);
|
||||
}
|
||||
};
|
||||
}
|
||||
});
|
||||
IndexWriterConfig iwc2 =
|
||||
new IndexWriterConfig()
|
||||
.setCodec(
|
||||
new Lucene95Codec() {
|
||||
new FilterCodec(TestUtil.getDefaultCodec().getName(), TestUtil.getDefaultCodec()) {
|
||||
@Override
|
||||
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
|
||||
return new Lucene95HnswVectorsFormat(M, beamWidth);
|
||||
public KnnVectorsFormat knnVectorsFormat() {
|
||||
return new PerFieldKnnVectorsFormat() {
|
||||
@Override
|
||||
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
|
||||
return new Lucene95HnswVectorsFormat(M, beamWidth);
|
||||
}
|
||||
};
|
||||
}
|
||||
})
|
||||
.setIndexSort(new Sort(new SortField("sortkey", SortField.Type.LONG)));
|
||||
|
|
|
@ -215,7 +215,7 @@ public class TermVectorLeafReader extends LeafReader {
|
|||
|
||||
@Override
|
||||
public LeafMetaData getMetaData() {
|
||||
return new LeafMetaData(Version.LATEST.major, null, null);
|
||||
return new LeafMetaData(Version.LATEST.major, null, null, false);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -2122,7 +2122,7 @@ public class MemoryIndex {
|
|||
|
||||
@Override
|
||||
public LeafMetaData getMetaData() {
|
||||
return new LeafMetaData(Version.LATEST.major, Version.LATEST, null);
|
||||
return new LeafMetaData(Version.LATEST.major, Version.LATEST, null, false);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -132,6 +132,7 @@ public class IndexSplitter {
|
|||
info.name,
|
||||
info.maxDoc(),
|
||||
info.getUseCompoundFile(),
|
||||
info.getHasBlocks(),
|
||||
info.getCodec(),
|
||||
info.getDiagnostics(),
|
||||
info.getId(),
|
||||
|
|
|
@ -39,8 +39,9 @@ import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter;
|
|||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.FilterCodec;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene95.Lucene95Codec;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.IntPoint;
|
||||
|
@ -961,17 +962,23 @@ public class TestSuggestField extends LuceneTestCase {
|
|||
IndexWriterConfig iwc = newIndexWriterConfig(random(), analyzer);
|
||||
iwc.setMergePolicy(newLogMergePolicy());
|
||||
Codec filterCodec =
|
||||
new Lucene95Codec() {
|
||||
new FilterCodec(TestUtil.getDefaultCodec().getName(), TestUtil.getDefaultCodec()) {
|
||||
CompletionPostingsFormat.FSTLoadMode fstLoadMode =
|
||||
RandomPicks.randomFrom(random(), CompletionPostingsFormat.FSTLoadMode.values());
|
||||
PostingsFormat postingsFormat = new Completion90PostingsFormat(fstLoadMode);
|
||||
|
||||
@Override
|
||||
public PostingsFormat getPostingsFormatForField(String field) {
|
||||
if (suggestFields.contains(field)) {
|
||||
return postingsFormat;
|
||||
}
|
||||
return super.getPostingsFormatForField(field);
|
||||
public PostingsFormat postingsFormat() {
|
||||
return new PerFieldPostingsFormat() {
|
||||
@Override
|
||||
public PostingsFormat getPostingsFormatForField(String field) {
|
||||
if (suggestFields.contains(field)) {
|
||||
return postingsFormat;
|
||||
}
|
||||
return ((PerFieldPostingsFormat) delegate.postingsFormat())
|
||||
.getPostingsFormatForField(field);
|
||||
}
|
||||
};
|
||||
}
|
||||
};
|
||||
iwc.setCodec(filterCodec);
|
||||
|
|
|
@ -18,8 +18,8 @@ package org.apache.lucene.tests.codecs.vector;
|
|||
|
||||
import org.apache.lucene.codecs.FilterCodec;
|
||||
import org.apache.lucene.codecs.KnnVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene95.Lucene95Codec;
|
||||
import org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat;
|
||||
import org.apache.lucene.tests.util.TestUtil;
|
||||
|
||||
/**
|
||||
* This codec allows customization of the number of connections made for an hnsw index. Increasing
|
||||
|
@ -31,12 +31,12 @@ public class ConfigurableMCodec extends FilterCodec {
|
|||
private final KnnVectorsFormat knnVectorsFormat;
|
||||
|
||||
/**
 * Creates the codec on top of the default test codec with an HNSW vectors format built from the
 * fixed arguments 128 and 100.
 */
public ConfigurableMCodec() {
  super("ConfigurableMCodec", TestUtil.getDefaultCodec());
  knnVectorsFormat = new Lucene95HnswVectorsFormat(128, 100);
}
|
||||
|
||||
/**
 * Creates the codec on top of the default test codec with an HNSW vectors format built from
 * {@code maxConn} and the fixed second argument 100.
 *
 * @param maxConn first argument passed to {@code Lucene95HnswVectorsFormat}
 */
public ConfigurableMCodec(int maxConn) {
  super("ConfigurableMCodec", TestUtil.getDefaultCodec());
  knnVectorsFormat = new Lucene95HnswVectorsFormat(maxConn, 100);
}
|
||||
|
||||
|
|
|
@ -672,6 +672,7 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest
|
|||
name,
|
||||
10000,
|
||||
false,
|
||||
false,
|
||||
Codec.getDefault(),
|
||||
Collections.emptyMap(),
|
||||
StringHelper.randomId(),
|
||||
|
|
|
@ -401,6 +401,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes
|
|||
name,
|
||||
10000,
|
||||
false,
|
||||
false,
|
||||
Codec.getDefault(),
|
||||
Collections.emptyMap(),
|
||||
StringHelper.randomId(),
|
||||
|
|
|
@ -341,6 +341,7 @@ abstract class BaseIndexFileFormatTestCase extends LuceneTestCase {
|
|||
"_0",
|
||||
1,
|
||||
false,
|
||||
false,
|
||||
codec,
|
||||
Collections.emptyMap(),
|
||||
StringHelper.randomId(),
|
||||
|
|
|
@ -131,6 +131,7 @@ public abstract class BaseLiveDocsFormatTestCase extends LuceneTestCase {
|
|||
"foo",
|
||||
maxDoc,
|
||||
random().nextBoolean(),
|
||||
false,
|
||||
codec,
|
||||
Collections.emptyMap(),
|
||||
StringHelper.randomId(),
|
||||
|
|
|
@ -154,6 +154,7 @@ public abstract class BaseMergePolicyTestCase extends LuceneTestCase {
|
|||
TestUtil.randomSimpleString(random()), // name
|
||||
random().nextInt(Integer.MAX_VALUE), // maxDoc
|
||||
random().nextBoolean(), // isCompoundFile
|
||||
false,
|
||||
null, // codec
|
||||
Collections.emptyMap(), // diagnostics
|
||||
TestUtil.randomSimpleString( // id
|
||||
|
@ -236,6 +237,7 @@ public abstract class BaseMergePolicyTestCase extends LuceneTestCase {
|
|||
name,
|
||||
maxDoc,
|
||||
false,
|
||||
false,
|
||||
TestUtil.getDefaultCodec(),
|
||||
Collections.emptyMap(),
|
||||
id,
|
||||
|
|
|
@ -67,6 +67,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT
|
|||
"_123",
|
||||
1,
|
||||
false,
|
||||
false,
|
||||
codec,
|
||||
Collections.emptyMap(),
|
||||
id,
|
||||
|
@ -92,6 +93,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT
|
|||
"_123",
|
||||
1,
|
||||
false,
|
||||
false,
|
||||
codec,
|
||||
Collections.emptyMap(),
|
||||
id,
|
||||
|
@ -135,6 +137,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT
|
|||
"_123",
|
||||
1,
|
||||
false,
|
||||
false,
|
||||
codec,
|
||||
diagnostics,
|
||||
id,
|
||||
|
@ -171,6 +174,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT
|
|||
"_123",
|
||||
1,
|
||||
false,
|
||||
false,
|
||||
codec,
|
||||
Collections.emptyMap(),
|
||||
id,
|
||||
|
@ -204,6 +208,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT
|
|||
"_123",
|
||||
1,
|
||||
false,
|
||||
false,
|
||||
codec,
|
||||
Collections.<String, String>emptyMap(),
|
||||
id,
|
||||
|
@ -231,6 +236,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT
|
|||
"_123",
|
||||
1,
|
||||
false,
|
||||
false,
|
||||
codec,
|
||||
Collections.<String, String>emptyMap(),
|
||||
id,
|
||||
|
@ -368,6 +374,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT
|
|||
"_123",
|
||||
1,
|
||||
false,
|
||||
false,
|
||||
codec,
|
||||
Collections.<String, String>emptyMap(),
|
||||
id,
|
||||
|
@ -408,6 +415,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT
|
|||
"_123",
|
||||
1,
|
||||
false,
|
||||
false,
|
||||
codec,
|
||||
Collections.<String, String>emptyMap(),
|
||||
id,
|
||||
|
@ -453,6 +461,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT
|
|||
"_123",
|
||||
1,
|
||||
false,
|
||||
false,
|
||||
codec,
|
||||
Collections.<String, String>emptyMap(),
|
||||
id,
|
||||
|
@ -498,6 +507,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT
|
|||
"_123",
|
||||
1,
|
||||
false,
|
||||
false,
|
||||
codec,
|
||||
Collections.<String, String>emptyMap(),
|
||||
id,
|
||||
|
@ -544,6 +554,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT
|
|||
"_123",
|
||||
1,
|
||||
false,
|
||||
false,
|
||||
codec,
|
||||
Collections.<String, String>emptyMap(),
|
||||
id,
|
||||
|
@ -614,6 +625,7 @@ public abstract class BaseSegmentInfoFormatTestCase extends BaseIndexFileFormatT
|
|||
name,
|
||||
docCount,
|
||||
isCompoundFile,
|
||||
false,
|
||||
codec,
|
||||
diagnostics,
|
||||
id,
|
||||
|
|
|
@ -696,6 +696,7 @@ public class RandomPostingsTester {
|
|||
"_0",
|
||||
maxDoc,
|
||||
false,
|
||||
false,
|
||||
codec,
|
||||
Collections.emptyMap(),
|
||||
StringHelper.randomId(),
|
||||
|
|
|
@ -293,7 +293,7 @@ public class QueryUtils {
|
|||
|
||||
@Override
|
||||
public LeafMetaData getMetaData() {
|
||||
return new LeafMetaData(Version.LATEST.major, Version.LATEST, null);
|
||||
return new LeafMetaData(Version.LATEST.major, Version.LATEST, null, false);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -38,7 +38,7 @@ import java.util.TimeZone;
|
|||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene95.Lucene95Codec;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
|
||||
import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.tests.codecs.asserting.AssertingCodec;
|
||||
|
@ -195,9 +195,9 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule {
|
|||
} else if ("Compressing".equals(TEST_CODEC)
|
||||
|| ("random".equals(TEST_CODEC) && randomVal == 6 && !shouldAvoidCodec("Compressing"))) {
|
||||
codec = CompressingCodec.randomInstance(random);
|
||||
} else if ("Lucene95".equals(TEST_CODEC)
|
||||
|| ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene95"))) {
|
||||
codec = new Lucene95Codec(RandomPicks.randomFrom(random, Lucene95Codec.Mode.values()));
|
||||
} else if ("Lucene99".equals(TEST_CODEC)
|
||||
|| ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene99"))) {
|
||||
codec = new Lucene99Codec(RandomPicks.randomFrom(random, Lucene99Codec.Mode.values()));
|
||||
} else if (!"random".equals(TEST_CODEC)) {
|
||||
codec = Codec.forName(TEST_CODEC);
|
||||
} else if ("random".equals(TEST_POSTINGSFORMAT)) {
|
||||
|
|
|
@ -55,8 +55,8 @@ import org.apache.lucene.codecs.PostingsFormat;
|
|||
import org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene95.Lucene95Codec;
|
||||
import org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
|
||||
import org.apache.lucene.document.BinaryDocValuesField;
|
||||
|
@ -1236,7 +1236,7 @@ public final class TestUtil {
|
|||
* different than {@link Codec#getDefault()} because that is randomized.
|
||||
*/
|
||||
public static Codec getDefaultCodec() {
|
||||
return new Lucene95Codec();
|
||||
return new Lucene99Codec();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
Loading…
Reference in New Issue