Merge branch 'main' into optimize_prefix_query

This commit is contained in:
zhouhui 2024-11-18 09:49:06 +08:00
commit 9a97fbe5ca
325 changed files with 6514 additions and 4546 deletions

View File

@ -10,7 +10,7 @@ on:
push:
branches:
- 'main'
- 'branch_9x'
- 'branch_10x'
env:
GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }}

View File

@ -6,7 +6,7 @@ on:
pull_request:
branches:
- 'main'
- 'branch_9x'
- 'branch_10x'
paths:
- '.github/workflows/run-checks-gradle-upgrade.yml'
- 'gradle/wrapper/**'
@ -14,7 +14,7 @@ on:
push:
branches:
- 'main'
- 'branch_9x'
- 'branch_10x'
paths:
- '.github/workflows/run-checks-gradle-upgrade.yml'
- 'gradle/wrapper/**'

View File

@ -6,7 +6,7 @@ on:
pull_request:
branches:
- 'main'
- 'branch_9x'
- 'branch_10x'
paths:
- '.github/workflows/run-checks-mod-analysis-common.yml'
- 'lucene/analysis/common/**'
@ -14,7 +14,7 @@ on:
push:
branches:
- 'main'
- 'branch_9x'
- 'branch_10x'
paths:
- '.github/workflows/run-checks-mod-analysis-common.yml'
- 'lucene/analysis/common/**'

View File

@ -6,12 +6,12 @@ on:
pull_request:
branches:
- 'main'
- 'branch_9x'
- 'branch_10x'
push:
branches:
- 'main'
- 'branch_9x'
- 'branch_10x'
env:
GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }}

View File

@ -1,5 +1,5 @@
Apache Lucene
Copyright 2001-2022 The Apache Software Foundation
Copyright 2001-2024 The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).

View File

@ -41,7 +41,7 @@ apply from: file('gradle/globals.gradle')
// Calculate project version:
version = {
// Release manager: update base version here after release:
String baseVersion = '10.0.0'
String baseVersion = '11.0.0'
// On a release explicitly set release version in one go:
// -Dversion.release=x.y.z

View File

@ -51,7 +51,7 @@ cd lucene
git clone git@github.com:apache/lucene.git main
cd main
# For each branch that you want a separate directory created for, add a worktree
git worktree add ../9x branch_9x
git worktree add ../10x branch_10x
----
=== Using the Worktrees

View File

@ -67,13 +67,27 @@
</maintainer>
<!-- NOTE: please insert releases in numeric order, NOT chronologically. -->
<release>
<Version>
<name>lucene-10.0.0</name>
<created>2024-10-14</created>
<revision>10.0.0</revision>
</Version>
</release>
<release>
<Version>
<name>lucene-9.12.0</name>
<created>2024-09-28</created>
<revision>9.12.0</revision>
</Version>
</release>
<release>
<Version>
<name>lucene-9.11.1</name>
<created>2024-06-27</created>
<revision>9.11.1</revision>
</Version>
</release>
<release>
<Version>
<name>lucene-9.11.0</name>
@ -186,6 +200,13 @@
<revision>9.0.0</revision>
</Version>
</release>
<release>
<Version>
<name>lucene-8.11.4</name>
<created>2024-09-24</created>
<revision>8.11.4</revision>
</Version>
</release>
<release>
<Version>
<name>lucene-8.11.3</name>

View File

@ -40,7 +40,7 @@ def create_and_add_index(source, indextype, index_version, current_version, temp
'cfs': 'index',
'nocfs': 'index',
'sorted': 'sorted',
'int8_hnsw': 'int8_hnsw',
'int7_hnsw': 'int7_hnsw',
'moreterms': 'moreterms',
'dvupdates': 'dvupdates',
'emptyIndex': 'empty'
@ -61,7 +61,7 @@ def create_and_add_index(source, indextype, index_version, current_version, temp
'cfs': 'testCreateCFS',
'nocfs': 'testCreateNoCFS',
'sorted': 'testCreateSortedIndex',
'int8_hnsw': 'testCreateInt8HNSWIndices',
'int7_hnsw': 'testCreateInt7HNSWIndices',
'moreterms': 'testCreateMoreTermsIndex',
'dvupdates': 'testCreateIndexWithDocValuesUpdates',
'emptyIndex': 'testCreateEmptyIndex'
@ -206,7 +206,7 @@ def main():
current_version = scriptutil.Version.parse(scriptutil.find_current_version())
create_and_add_index(source, 'cfs', c.version, current_version, c.temp_dir)
create_and_add_index(source, 'nocfs', c.version, current_version, c.temp_dir)
create_and_add_index(source, 'int8_hnsw', c.version, current_version, c.temp_dir)
create_and_add_index(source, 'int7_hnsw', c.version, current_version, c.temp_dir)
should_make_sorted = current_version.is_back_compat_with(c.version) \
and (c.version.major > 6 or (c.version.major == 6 and c.version.minor >= 2))
if should_make_sorted:

View File

@ -112,8 +112,10 @@ def prepare(root, version, pause_before_sign, gpg_key_id, gpg_password, gpg_home
checkDOAPfiles(version)
if not dev_mode:
print(' ./gradlew --stacktrace --no-daemon clean check')
run('./gradlew --stacktrace --no-daemon clean check')
print(' ./gradlew --stacktrace --no-daemon clean')
run('./gradlew --stacktrace --no-daemon clean')
print(' ./gradlew --stacktrace --no-daemon check')
run('./gradlew --stacktrace --no-daemon check')
else:
print(' skipping precommit check due to dev-mode')

View File

@ -239,7 +239,7 @@ def maybe_remove_rc_from_svn():
logfile="svn_rm.log",
tee=True,
vars={
'dist_folder': """lucene-{{ release_version }}-RC{{ rc_number }}-rev{{ build_rc.git_rev | default("<git_rev>", True) }}""",
'dist_folder': """lucene-{{ release_version }}-RC{{ rc_number }}-rev-{{ build_rc.git_rev | default("<git_rev>", True) }}""",
'dist_url': "{{ dist_url_base }}/{{ dist_folder }}"
}
)],

View File

@ -19,6 +19,7 @@
allprojects {
tasks.withType(AbstractArchiveTask).configureEach { task ->
duplicatesStrategy = DuplicatesStrategy.FAIL
preserveFileTimestamps = false
reproducibleFileOrder = true
dirPermissions {
it.unix(0755)

View File

@ -3,6 +3,78 @@ Lucene Change Log
For more information on past and future Lucene versions, please see:
http://s.apache.org/luceneversions
======================= Lucene 11.0.0 =======================
API Changes
---------------------
(No changes)
New Features
---------------------
(No changes)
Improvements
---------------------
(No changes)
Optimizations
---------------------
(No changes)
Bug Fixes
---------------------
(No changes)
Other
---------------------
(No changes)
======================= Lucene 10.1.0 =======================
API Changes
---------------------
* GITHUB#13859: Allow open-ended ranges in Intervals range queries. (Mayya Sharipova)
New Features
---------------------
(No changes)
Improvements
---------------------
(No changes)
Optimizations
---------------------
* GITHUB#13828: Reduce long[] array allocation for bitset in readBitSetIterator. (Zhang Chao)
* GITHUB#13800: MaxScoreBulkScorer now recomputes scorer partitions when the
minimum competitive score allows for a more favorable partitioning. (Adrien Grand)
Bug Fixes
---------------------
* GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended
when they were not sorted by startOffset. (Seunghan Jung)
* GITHUB#13884: Remove broken .toArray from Long/CharObjectHashMap entirely. (Pan Guixin)
* GITHUB#12686: Added support for highlighting IndexOrDocValuesQuery. (Prudhvi Godithi)
Build
---------------------
* Upgrade forbiddenapis to version 3.8. (Uwe Schindler)
Other
---------------------
(No changes)
======================== Lucene 10.0.1 =======================
Bug Fixes
---------------------
======================= Lucene 10.0.0 =======================
API Changes
@ -48,9 +120,9 @@ API Changes
* GITHUB#12296: Make IndexReader and IndexReaderContext classes explicitly sealed.
They have already been runtime-checked to only be implemented by the specific classes
so this is effectively a non-breaking change.
so this is effectively a non-breaking change. (Petr Portnov)
* GITHUB#12276: Rename DaciukMihovAutomatonBuilder to StringsToAutomaton
* GITHUB#12276: Rename DaciukMihovAutomatonBuilder to StringsToAutomaton. (Michael McCandless)
* GITHUB#12321: Reduced visibility of StringsToAutomaton. Please use Automata#makeStringUnion instead. (Greg Miller)
@ -120,8 +192,17 @@ API Changes
* GITHUB#13328: Convert many basic Lucene classes to record classes, including CollectionStatistics, TermStatistics and LeafMetadata. (Shubham Chaudhary)
* GITHUB#13780: Remove `IndexSearcher#search(List<LeafReaderContext>, Weight, Collector)` in favour of the newly
introduced `IndexSearcher#search(LeafReaderContextPartition[], Weight, Collector)`
* GITHUB#13780: Remove IndexSearcher#search(List<LeafReaderContext>, Weight, Collector) in favour of the newly
introduced IndexSearcher#search(LeafReaderContextPartition[], Weight, Collector). (Luca Cavanna)
* GITHUB#13779: First-class random access API for KnnVectorValues
unifies Byte/FloatVectorValues incorporating RandomAccess* API and introduces
DocIndexIterator for iterative access in place of direct inheritance from DISI. (Michael Sokolov)
* GITHUB#13845: Add missing with-discountOverlaps Similarity constructor variants. (Pierre Salagnac, Christine Poerschke, Robert Muir)
* GITHUB#13820, GITHUB#13825, GITHUB#13830: Corrects DataInput.readGroupVInts to be public and not-final, removes the protected
DataInput.readGroupVInt method. (Zhang Chao, Robert Muir, Uwe Schindler, Dawid Weiss)
New Features
---------------------
@ -209,7 +290,7 @@ Bug Fixes
* LUCENE-10599: LogMergePolicy is more likely to keep merging segments until
they reach the maximum merge size. (Adrien Grand)
* GITHUB#12220: Hunspell: disallow hidden title-case entries from compound middle/end
* GITHUB#12220: Hunspell: disallow hidden title-case entries from compound middle/end. (Peter Gromov)
* GITHUB#12878: Fix the declared Exceptions of Expression#evaluate() to match those
of DoubleValues#doubleValue(). (Uwe Schindler)
@ -292,9 +373,17 @@ Build
======================== Lucene 9.12.0 =======================
Security Fixes
---------------------
* Deserialization of Untrusted Data vulnerability in Apache Lucene Replicator - CVE-2024-45772
(Summ3r from Vidar-Team, Robert Muir, Paul Irwin)
API Changes
---------------------
* GITHUB#13806: Add TermInSetQuery#getBytesRefIterator to be able to iterate over query terms. (Christoph Büscher)
* GITHUB#13469: Expose FlatVectorsFormat as a first-class format; can be configured using a custom Codec. (Michael Sokolov)
* GITHUB#13612: Hunspell: add Suggester#proceedPastRep to avoid losing relevant suggestions. (Peter Gromov)
@ -311,6 +400,9 @@ API Changes
* GITHUB#13568, GITHUB#13750: Add DrillSideways#search method that supports any CollectorManagers for drill-sideways dimensions
or drill-down. (Egor Potemkin)
* GITHUB#13757: For similarities, provide default computeNorm implementation and remove remaining discountOverlaps setters.
(Christine Poerschke, Adrien Grand, Robert Muir)
New Features
---------------------
@ -418,8 +510,6 @@ Optimizations
* GITHUB#13742: Reorder checks in LRUQueryCache#count (Shubham Chaudhary)
* GITHUB#13686: Replace Map<String,Object> with IntObjectHashMap for DV producer (Pan Guixin)
* GITHUB#13697: Add a bulk scorer to ToParentBlockJoinQuery, which delegates to the bulk scorer of the child query.
This should speed up query evaluation when the child query has a specialized bulk scorer, such as disjunctive queries.
(Mike Pellegrini)
@ -470,6 +560,8 @@ Bug Fixes
`IndexWriter.forceMerge` or
`IndexWriter.addIndexes(CodecReader...)`, or reindexing entirely.
* GITHUB#13799: Disable intra-merge parallelism for all structures but kNN vectors. (Ben Trent)
Build
---------------------
@ -482,6 +574,8 @@ Other
* GITHUB#13720: Add float comparison based on unit of least precision and use it to stop test failures caused by float
summation not being associative in IEEE 754. (Alex Herbert, Stefan Vodita)
* Remove code triggering forbidden-apis regarding Java serialization. (Uwe Schindler, Robert Muir)
======================== Lucene 9.11.1 =======================
Bug Fixes

View File

@ -19,6 +19,12 @@
## Migration from Lucene 9.x to Lucene 10.0
### Changes to DataInput.readGroupVInt and readGroupVInts methods
As part of GITHUB#13820, GITHUB#13825, and GITHUB#13830, DataInput.readGroupVInts is now public
and non-final, allowing subclasses to override it. These changes also remove the protected
DataInput.readGroupVInt method: subclasses should delegate to the public method or reimplement it entirely.
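
For illustration, a minimal sketch of an overriding subclass; the `CountingDataInput` wrapper below is hypothetical, not a Lucene class:

```java
import java.io.IOException;
import org.apache.lucene.store.DataInput;

// Hypothetical wrapper that counts values decoded through the group-varint path.
final class CountingDataInput extends DataInput {
  private final DataInput in;
  long groupVIntValues;

  CountingDataInput(DataInput in) {
    this.in = in;
  }

  @Override
  public void readGroupVInts(long[] dst, int limit) throws IOException {
    groupVIntValues += limit;
    in.readGroupVInts(dst, limit); // delegate; the protected readGroupVInt hook is gone
  }

  @Override
  public byte readByte() throws IOException {
    return in.readByte();
  }

  @Override
  public void readBytes(byte[] b, int offset, int len) throws IOException {
    in.readBytes(b, offset, len);
  }

  @Override
  public void skipBytes(long numBytes) throws IOException {
    in.skipBytes(numBytes);
  }
}
```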
### OpenNLP dependency upgrade
[Apache OpenNLP](https://opennlp.apache.org) 2.x opens the door to accessing various models via the ONNX runtime. To migrate you will need to update any deprecated OpenNLP methods that you may be using.
@ -888,3 +894,7 @@ additional vectors into the same field with either 4 or 7 bit
quantization (or no quantization), and ensure all older (9.x written)
segments are rewritten either via `IndexWriter.forceMerge` or
`IndexWriter.addIndexes(CodecReader...)`, or reindexing entirely.
### Vector values APIs switched to primarily random-access
`{Byte/Float}VectorValues` no longer inherit from `DocIdSetIterator`. Rather they extend a common class, `KnnVectorValues`, that provides a random access API (previously provided by `RandomAccessVectorValues`, now removed), and an `iterator()` method for retrieving `DocIndexIterator`: an iterator which is a DISI that also provides an `index()` method. Therefore, any iteration over vector values must now be performed using the values' `iterator()`. Random access works as before, but does not require casting to `RandomAccessVectorValues`.
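
A hedged sketch of the new iteration pattern (the helper class and method names below are illustrative only):

```java
import java.io.IOException;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.KnnVectorValues;
import org.apache.lucene.search.DocIdSetIterator;

final class VectorIterationExample {
  // Walk every document that has a vector, fetching each vector by its ordinal.
  static float sumFirstComponents(FloatVectorValues values) throws IOException {
    float sum = 0;
    KnnVectorValues.DocIndexIterator it = values.iterator();
    for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
      float[] vector = values.vectorValue(it.index()); // random access by ordinal
      sum += vector[0];
    }
    return sum;
  }
}
```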

View File

@ -18,10 +18,10 @@
package org.apache.lucene.analysis.synonym.word2vec;
import java.io.IOException;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.TermAndVector;
import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
/**
* Word2VecModel is a class representing the parsed Word2Vec model containing the vectors for each
@ -29,7 +29,7 @@ import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
*
* @lucene.experimental
*/
public class Word2VecModel implements RandomAccessVectorValues.Floats {
public class Word2VecModel extends FloatVectorValues {
private final int dictionarySize;
private final int vectorDimension;

View File

@ -49,7 +49,7 @@ import org.apache.lucene.util.Version;
public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
@SuppressWarnings("deprecation")
private static final Version LUCENE_9_0_0 = Version.LUCENE_9_0_0;
private static final Version LUCENE_10_0_0 = Version.LUCENE_10_0_0;
// Test some examples (TODO: we only check behavior, we may need something like
// TestRandomChains...)
@ -111,7 +111,7 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
public void testVersionAwareFilter() throws Exception {
CustomAnalyzer a =
CustomAnalyzer.builder()
.withDefaultMatchVersion(LUCENE_9_0_0)
.withDefaultMatchVersion(LUCENE_10_0_0)
.withTokenizer(StandardTokenizerFactory.class)
.addTokenFilter(DummyVersionAwareTokenFilterFactory.class)
.build();
@ -128,7 +128,7 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
public void testFactoryHtmlStripClassicFolding() throws Exception {
CustomAnalyzer a =
CustomAnalyzer.builder()
.withDefaultMatchVersion(LUCENE_9_0_0)
.withDefaultMatchVersion(LUCENE_10_0_0)
.addCharFilter(HTMLStripCharFilterFactory.class)
.withTokenizer(ClassicTokenizerFactory.class)
.addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "true")
@ -164,7 +164,7 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
public void testHtmlStripClassicFolding() throws Exception {
CustomAnalyzer a =
CustomAnalyzer.builder()
.withDefaultMatchVersion(LUCENE_9_0_0)
.withDefaultMatchVersion(LUCENE_10_0_0)
.addCharFilter("htmlstrip")
.withTokenizer("classic")
.addTokenFilter("asciifolding", "preserveOriginal", "true")
@ -513,7 +513,7 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
@Override
public TokenStream create(TokenStream input) {
if (luceneMatchVersion.equals(LUCENE_9_0_0)) {
if (luceneMatchVersion.equals(LUCENE_10_0_0)) {
return input;
}
return new LowerCaseFilter(input);

View File

@ -36,6 +36,7 @@ module org.apache.lucene.backward_codecs {
exports org.apache.lucene.backward_codecs.lucene94;
exports org.apache.lucene.backward_codecs.lucene95;
exports org.apache.lucene.backward_codecs.lucene99;
exports org.apache.lucene.backward_codecs.lucene912;
exports org.apache.lucene.backward_codecs.packed;
exports org.apache.lucene.backward_codecs.store;
@ -62,5 +63,6 @@ module org.apache.lucene.backward_codecs {
org.apache.lucene.backward_codecs.lucene92.Lucene92Codec,
org.apache.lucene.backward_codecs.lucene94.Lucene94Codec,
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec,
org.apache.lucene.backward_codecs.lucene99.Lucene99Codec;
org.apache.lucene.backward_codecs.lucene99.Lucene99Codec,
org.apache.lucene.backward_codecs.lucene912.Lucene912Codec;
}

View File

@ -77,9 +77,8 @@ public final class Lucene50CompoundFormat extends CompoundFormat {
public Lucene50CompoundFormat() {}
@Override
public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si, IOContext context)
throws IOException {
return new Lucene50CompoundReader(dir, si, context);
public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si) throws IOException {
return new Lucene50CompoundReader(dir, si);
}
@Override

View File

@ -31,6 +31,7 @@ import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.ReadAdvice;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.IOUtils;
@ -57,8 +58,7 @@ final class Lucene50CompoundReader extends CompoundDirectory {
/** Create a new CompoundFileDirectory. */
// TODO: we should just pre-strip "entries" and append segment name up-front like simpletext?
// this need not be a "general purpose" directory anymore (it only writes index files)
public Lucene50CompoundReader(Directory directory, SegmentInfo si, IOContext context)
throws IOException {
public Lucene50CompoundReader(Directory directory, SegmentInfo si) throws IOException {
this.directory = directory;
this.segmentName = si.name;
String dataFileName =
@ -74,7 +74,7 @@ final class Lucene50CompoundReader extends CompoundDirectory {
}
expectedLength += CodecUtil.footerLength();
handle = directory.openInput(dataFileName, context);
handle = directory.openInput(dataFileName, IOContext.DEFAULT.withReadAdvice(ReadAdvice.NORMAL));
// DirectoryUtil.openInput(directory, dataFileName, context);
try {
CodecUtil.checkIndexHeader(
@ -170,7 +170,7 @@ final class Lucene50CompoundReader extends CompoundDirectory {
+ entries.keySet()
+ ")");
}
return handle.slice(name, entry.offset, entry.length);
return handle.slice(name, entry.offset, entry.length, context.readAdvice());
}
/** Returns an array of strings, one for each file in the directory. */

View File

@ -17,6 +17,8 @@
package org.apache.lucene.backward_codecs.lucene80;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.backward_codecs.packed.LegacyDirectMonotonicReader;
import org.apache.lucene.backward_codecs.packed.LegacyDirectReader;
import org.apache.lucene.backward_codecs.store.EndiannessReverserUtil;
@ -39,7 +41,6 @@ import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.internal.hppc.IntObjectHashMap;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput;
@ -52,11 +53,11 @@ import org.apache.lucene.util.compress.LZ4;
/** reader for {@link Lucene80DocValuesFormat} */
final class Lucene80DocValuesProducer extends DocValuesProducer {
private final IntObjectHashMap<NumericEntry> numerics = new IntObjectHashMap<>();
private final IntObjectHashMap<BinaryEntry> binaries = new IntObjectHashMap<>();
private final IntObjectHashMap<SortedEntry> sorted = new IntObjectHashMap<>();
private final IntObjectHashMap<SortedSetEntry> sortedSets = new IntObjectHashMap<>();
private final IntObjectHashMap<SortedNumericEntry> sortedNumerics = new IntObjectHashMap<>();
private final Map<String, NumericEntry> numerics = new HashMap<>();
private final Map<String, BinaryEntry> binaries = new HashMap<>();
private final Map<String, SortedEntry> sorted = new HashMap<>();
private final Map<String, SortedSetEntry> sortedSets = new HashMap<>();
private final Map<String, SortedNumericEntry> sortedNumerics = new HashMap<>();
private final IndexInput data;
private final int maxDoc;
private int version = -1;
@ -138,7 +139,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
}
byte type = meta.readByte();
if (type == Lucene80DocValuesFormat.NUMERIC) {
numerics.put(info.number, readNumeric(meta));
numerics.put(info.name, readNumeric(meta));
} else if (type == Lucene80DocValuesFormat.BINARY) {
final boolean compressed;
if (version >= Lucene80DocValuesFormat.VERSION_CONFIGURABLE_COMPRESSION) {
@ -157,13 +158,13 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
} else {
compressed = version >= Lucene80DocValuesFormat.VERSION_BIN_COMPRESSED;
}
binaries.put(info.number, readBinary(meta, compressed));
binaries.put(info.name, readBinary(meta, compressed));
} else if (type == Lucene80DocValuesFormat.SORTED) {
sorted.put(info.number, readSorted(meta));
sorted.put(info.name, readSorted(meta));
} else if (type == Lucene80DocValuesFormat.SORTED_SET) {
sortedSets.put(info.number, readSortedSet(meta));
sortedSets.put(info.name, readSortedSet(meta));
} else if (type == Lucene80DocValuesFormat.SORTED_NUMERIC) {
sortedNumerics.put(info.number, readSortedNumeric(meta));
sortedNumerics.put(info.name, readSortedNumeric(meta));
} else {
throw new CorruptIndexException("invalid type: " + type, meta);
}
@ -425,7 +426,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
@Override
public NumericDocValues getNumeric(FieldInfo field) throws IOException {
NumericEntry entry = numerics.get(field.number);
NumericEntry entry = numerics.get(field.name);
return getNumeric(entry);
}
@ -914,7 +915,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
@Override
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
BinaryEntry entry = binaries.get(field.number);
BinaryEntry entry = binaries.get(field.name);
if (entry.compressed) {
return getCompressedBinary(entry);
} else {
@ -972,7 +973,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
@Override
public SortedDocValues getSorted(FieldInfo field) throws IOException {
SortedEntry entry = sorted.get(field.number);
SortedEntry entry = sorted.get(field.name);
return getSorted(entry);
}
@ -1406,7 +1407,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
@Override
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
SortedNumericEntry entry = sortedNumerics.get(field.number);
SortedNumericEntry entry = sortedNumerics.get(field.name);
if (entry.numValues == entry.numDocsWithField) {
return DocValues.singleton(getNumeric(entry));
}
@ -1542,7 +1543,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
@Override
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
SortedSetEntry entry = sortedSets.get(field.number);
SortedSetEntry entry = sortedSets.get(field.name);
if (entry.singleValueEntry != null) {
return DocValues.singleton(getSorted(entry.singleValueEntry));
}

View File

@ -22,10 +22,10 @@ import java.util.Locale;
import java.util.Objects;
import java.util.SplittableRandom;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.hnsw.NeighborQueue;
import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
/**
* Builder for HNSW graph. See {@link Lucene90OnHeapHnswGraph} for a gloss on the algorithm and the
@ -49,7 +49,7 @@ public final class Lucene90HnswGraphBuilder {
private final Lucene90NeighborArray scratch;
private final VectorSimilarityFunction similarityFunction;
private final RandomAccessVectorValues.Floats vectorValues;
private final FloatVectorValues vectorValues;
private final SplittableRandom random;
private final Lucene90BoundsChecker bound;
final Lucene90OnHeapHnswGraph hnsw;
@ -58,7 +58,7 @@ public final class Lucene90HnswGraphBuilder {
// we need two sources of vectors in order to perform diversity check comparisons without
// colliding
private final RandomAccessVectorValues.Floats buildVectors;
private final FloatVectorValues buildVectors;
/**
* Reads all the vectors from vector values, builds a graph connecting them by their dense
@ -73,7 +73,7 @@ public final class Lucene90HnswGraphBuilder {
* to ensure repeatable construction.
*/
public Lucene90HnswGraphBuilder(
RandomAccessVectorValues.Floats vectors,
FloatVectorValues vectors,
VectorSimilarityFunction similarityFunction,
int maxConn,
int beamWidth,
@ -97,14 +97,14 @@ public final class Lucene90HnswGraphBuilder {
}
/**
* Reads all the vectors from two copies of a {@link RandomAccessVectorValues}. Providing two
* copies enables efficient retrieval without extra data copying, while avoiding collision of the
* Reads all the vectors from two copies of a {@link FloatVectorValues}. Providing two copies
* enables efficient retrieval without extra data copying, while avoiding collision of the
* returned values.
*
* @param vectors the vectors for which to build a nearest neighbors graph. Must be an independent
* accessor for the vectors
*/
public Lucene90OnHeapHnswGraph build(RandomAccessVectorValues.Floats vectors) throws IOException {
public Lucene90OnHeapHnswGraph build(FloatVectorValues vectors) throws IOException {
if (vectors == vectorValues) {
throw new IllegalArgumentException(
"Vectors to build must be independent of the source of vectors provided to HnswGraphBuilder()");
@ -230,7 +230,7 @@ public final class Lucene90HnswGraphBuilder {
float[] candidate,
float score,
Lucene90NeighborArray neighbors,
RandomAccessVectorValues.Floats vectorValues)
FloatVectorValues vectorValues)
throws IOException {
bound.set(score);
for (int i = 0; i < neighbors.size(); i++) {

View File

@ -20,7 +20,6 @@ package org.apache.lucene.backward_codecs.lucene90;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.SplittableRandom;
@ -34,7 +33,6 @@ import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.KnnCollector;
import org.apache.lucene.search.VectorScorer;
import org.apache.lucene.store.ChecksumIndexInput;
@ -44,7 +42,6 @@ import org.apache.lucene.util.Bits;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.hnsw.HnswGraph;
import org.apache.lucene.util.hnsw.NeighborQueue;
import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
/**
* Reads vectors from the index segments along with index data structures supporting KNN search.
@ -263,7 +260,7 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
int node = results.topNode();
float minSimilarity = results.topScore();
results.pop();
knnCollector.collect(node, minSimilarity);
knnCollector.collect(vectorValues.ordToDoc(node), minSimilarity);
}
}
@ -355,8 +352,7 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
}
/** Read the vector values from the index input. This supports both iterated and random access. */
static class OffHeapFloatVectorValues extends FloatVectorValues
implements RandomAccessVectorValues.Floats {
static class OffHeapFloatVectorValues extends FloatVectorValues {
final int dimension;
final int[] ordToDoc;
@ -367,9 +363,6 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
final float[] value;
final VectorSimilarityFunction similarityFunction;
int ord = -1;
int doc = -1;
OffHeapFloatVectorValues(
int dimension,
int[] ordToDoc,
@ -394,42 +387,6 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
return ordToDoc.length;
}
@Override
public float[] vectorValue() throws IOException {
return vectorValue(ord);
}
@Override
public int docID() {
return doc;
}
@Override
public int nextDoc() {
if (++ord >= size()) {
doc = NO_MORE_DOCS;
} else {
doc = ordToDoc[ord];
}
return doc;
}
@Override
public int advance(int target) {
assert docID() < target;
ord = Arrays.binarySearch(ordToDoc, ord + 1, ordToDoc.length, target);
if (ord < 0) {
ord = -(ord + 1);
}
assert ord <= ordToDoc.length;
if (ord == ordToDoc.length) {
doc = NO_MORE_DOCS;
} else {
doc = ordToDoc[ord];
}
return doc;
}
@Override
public OffHeapFloatVectorValues copy() {
return new OffHeapFloatVectorValues(dimension, ordToDoc, similarityFunction, dataIn.clone());
@ -446,21 +403,32 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
return value;
}
@Override
public int ordToDoc(int ord) {
return ordToDoc[ord];
}
@Override
public DocIndexIterator iterator() {
return createSparseIterator();
}
@Override
public VectorScorer scorer(float[] target) {
if (size() == 0) {
return null;
}
OffHeapFloatVectorValues values = this.copy();
DocIndexIterator iterator = values.iterator();
return new VectorScorer() {
@Override
public float score() throws IOException {
return values.similarityFunction.compare(values.vectorValue(), target);
return values.similarityFunction.compare(values.vectorValue(iterator.index()), target);
}
@Override
public DocIdSetIterator iterator() {
return values;
public DocIndexIterator iterator() {
return iterator;
}
};
}

View File

@ -23,12 +23,12 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.SplittableRandom;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.SparseFixedBitSet;
import org.apache.lucene.util.hnsw.HnswGraph;
import org.apache.lucene.util.hnsw.NeighborQueue;
import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
/**
* An {@link HnswGraph} where all nodes and connections are held in memory. This class is used to
@ -74,7 +74,7 @@ public final class Lucene90OnHeapHnswGraph extends HnswGraph {
float[] query,
int topK,
int numSeed,
RandomAccessVectorValues.Floats vectors,
FloatVectorValues vectors,
VectorSimilarityFunction similarityFunction,
HnswGraph graphValues,
Bits acceptOrds,

View File

@ -46,7 +46,6 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.hnsw.HnswGraph;
import org.apache.lucene.util.hnsw.HnswGraphSearcher;
import org.apache.lucene.util.hnsw.OrdinalTranslatedKnnCollector;
import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
import org.apache.lucene.util.hnsw.RandomVectorScorer;
/**
@ -398,8 +397,7 @@ public final class Lucene91HnswVectorsReader extends KnnVectorsReader {
}
/** Read the vector values from the index input. This supports both iterated and random access. */
static class OffHeapFloatVectorValues extends FloatVectorValues
implements RandomAccessVectorValues.Floats {
static class OffHeapFloatVectorValues extends FloatVectorValues {
private final int dimension;
private final int size;
@ -410,9 +408,6 @@ public final class Lucene91HnswVectorsReader extends KnnVectorsReader {
private final float[] value;
private final VectorSimilarityFunction similarityFunction;
private int ord = -1;
private int doc = -1;
OffHeapFloatVectorValues(
int dimension,
int size,
@ -439,49 +434,6 @@ public final class Lucene91HnswVectorsReader extends KnnVectorsReader {
return size;
}
@Override
public float[] vectorValue() throws IOException {
dataIn.seek((long) ord * byteSize);
dataIn.readFloats(value, 0, value.length);
return value;
}
@Override
public int docID() {
return doc;
}
@Override
public int nextDoc() {
if (++ord >= size) {
doc = NO_MORE_DOCS;
} else {
doc = ordToDocOperator.applyAsInt(ord);
}
return doc;
}
@Override
public int advance(int target) {
assert docID() < target;
if (ordToDoc == null) {
ord = target;
} else {
ord = Arrays.binarySearch(ordToDoc, ord + 1, ordToDoc.length, target);
if (ord < 0) {
ord = -(ord + 1);
}
}
if (ord < size) {
doc = ordToDocOperator.applyAsInt(ord);
} else {
doc = NO_MORE_DOCS;
}
return doc;
}
@Override
public OffHeapFloatVectorValues copy() {
return new OffHeapFloatVectorValues(
@ -495,21 +447,32 @@ public final class Lucene91HnswVectorsReader extends KnnVectorsReader {
return value;
}
@Override
public int ordToDoc(int ord) {
return ordToDocOperator.applyAsInt(ord);
}
@Override
public DocIndexIterator iterator() {
return createSparseIterator();
}
@Override
public VectorScorer scorer(float[] target) {
if (size == 0) {
return null;
}
OffHeapFloatVectorValues values = this.copy();
DocIndexIterator iterator = values.iterator();
return new VectorScorer() {
@Override
public float score() throws IOException {
return values.similarityFunction.compare(values.vectorValue(), target);
return values.similarityFunction.compare(values.vectorValue(iterator.index()), target);
}
@Override
public DocIdSetIterator iterator() {
return values;
return iterator;
}
};
}

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene912;
package org.apache.lucene.backward_codecs.lucene912;
import java.util.Objects;
import org.apache.lucene.codecs.Codec;
@ -37,6 +37,7 @@ import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat;

View File

@ -0,0 +1,433 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Lucene 9.12 file format.
*
* <h2>Apache Lucene - Index File Formats</h2>
*
* <div>
*
* <ul>
* <li><a href="#Introduction">Introduction</a>
* <li><a href="#Definitions">Definitions</a>
* <ul>
* <li><a href="#Inverted_Indexing">Inverted Indexing</a>
* <li><a href="#Types_of_Fields">Types of Fields</a>
* <li><a href="#Segments">Segments</a>
* <li><a href="#Document_Numbers">Document Numbers</a>
* </ul>
* <li><a href="#Overview">Index Structure Overview</a>
* <li><a href="#File_Naming">File Naming</a>
* <li><a href="#file-names">Summary of File Extensions</a>
* <ul>
* <li><a href="#Lock_File">Lock File</a>
* <li><a href="#History">History</a>
* <li><a href="#Limitations">Limitations</a>
* </ul>
* </ul>
*
* </div> <a id="Introduction"></a>
*
* <h3>Introduction</h3>
*
* <div>
*
* <p>This document defines the index file formats used in this version of Lucene. If you are using
* a different version of Lucene, please consult the copy of <code>docs/</code> that was distributed
* with the version you are using.
*
* <p>This document attempts to provide a high-level definition of the Apache Lucene file formats.
* </div> <a id="Definitions"></a>
*
* <h3>Definitions</h3>
*
* <div>
*
* <p>The fundamental concepts in Lucene are index, document, field and term.
*
* <p>An index contains a sequence of documents.
*
* <ul>
* <li>A document is a sequence of fields.
* <li>A field is a named sequence of terms.
* <li>A term is a sequence of bytes.
* </ul>
*
* <p>The same sequence of bytes in two different fields is considered a different term. Thus terms
* are represented as a pair: the string naming the field, and the bytes within the field. <a
* id="Inverted_Indexing"></a>
*
* <h4>Inverted Indexing</h4>
*
* <p>Lucene's index stores terms and statistics about those terms in order to make term-based
* search more efficient. Lucene's terms index falls into the family of indexes known as an
* <i>inverted index.</i> This is because it can list, for a term, the documents that contain it.
* This is the inverse of the natural relationship, in which documents list terms. <a
* id="Types_of_Fields"></a>
*
* <h4>Types of Fields</h4>
*
* <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored in the index
* literally, in a non-inverted manner. Fields that are inverted are called <i>indexed</i>. A field
* may be both stored and indexed.
*
* <p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the text of a field
* may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is
* useful for certain identifier fields to be indexed literally.
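*
* <p>For instance, a hedged sketch using the core document API:
*
* <pre>{@code
* Document doc = new Document();
* doc.add(new TextField("body", "full text that will be tokenized", Field.Store.NO));
* doc.add(new StringField("id", "doc-42", Field.Store.YES)); // indexed literally as one term
* }</pre>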
*
* <p>See the {@link org.apache.lucene.document.Field Field} java docs for more information on
* Fields. <a id="Segments"></a>
*
* <h4>Segments</h4>
*
* <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>. Each segment is a
* fully independent index, which could be searched separately. Indexes evolve by:
*
* <ol>
* <li>Creating new segments for newly added documents.
* <li>Merging existing segments.
* </ol>
*
* <p>Searches may involve multiple segments and/or multiple indexes, each index potentially
* composed of a set of segments. <a id="Document_Numbers"></a>
*
* <h4>Document Numbers</h4>
*
* <p>Internally, Lucene refers to documents by an integer <i>document number</i>. The first
* document added to an index is numbered zero, and each subsequent document added gets a number one
* greater than the previous.
*
* <p>Note that a document's number may change, so caution should be taken when storing these
* numbers outside of Lucene. In particular, numbers may change in the following situations:
*
* <ul>
* <li>
* <p>The numbers stored in each segment are unique only within the segment, and must be
* converted before they can be used in a larger context. The standard technique is to
* allocate each segment a range of values, based on the range of numbers used in that
* segment. To convert a document number from a segment to an external value, the segment's
* <i>base</i> document number is added. To convert an external value back to a
* segment-specific value, the segment is identified by the range that the external value is
* in, and the segment's base value is subtracted. For example two five document segments
* might be combined, so that the first segment has a base value of zero, and the second of
* five. Document three from the second segment would have an external value of eight.
* <li>
* <p>When documents are deleted, gaps are created in the numbering. These are eventually
* removed as the index evolves through merging. Deleted documents are dropped when segments
* are merged. A freshly-merged segment thus has no gaps in its numbering.
* </ul>
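*
* <p>A hedged sketch of that conversion, where {@code docBase} is the segment's base document
* number described above:
*
* <pre>{@code
* int globalDoc = docBase + localDoc;   // e.g. base 5 + local 3 == global 8
* int segmentDoc = globalDoc - docBase; // the inverse mapping
* }</pre>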
*
* </div> <a id="Overview"></a>
*
* <h3>Index Structure Overview</h3>
*
* <div>
*
* <p>Each segment index maintains the following:
*
* <ul>
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment info}. This
* contains metadata about a segment, such as the number of documents, what files it uses, and
* information about how the segment is sorted
* <li>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This
* contains metadata about the set of named fields used in the index.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}.
* This contains, for each document, a list of attribute-value pairs, where the attributes are
* field names. These are used to store auxiliary information about the document, such as its
* title, url, or an identifier to access a database. The set of stored fields are what is
* returned for each hit when searching. This is keyed by document number.
* <li>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term dictionary}. A
* dictionary containing all of the terms used in all of the indexed fields of all of the
* documents. The dictionary also contains the number of documents which contain the term, and
* pointers to the term's frequency and proximity data.
* <li>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Frequency data}. For
* each term in the dictionary, the numbers of all the documents that contain that term, and
* the frequency of the term in that document, unless frequencies are omitted ({@link
* org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
* <li>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Proximity data}. For
* each term in the dictionary, the positions that the term occurs in each document. Note that
* this will not exist if all fields in all documents omit position data.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For
* each field in each document, a value is stored that is multiplied into the score for hits
* on that field.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each
* field in each document, the term vector (sometimes called document vector) may be stored. A
* term vector consists of term text and term frequency. To add Term Vectors to your index see
* the {@link org.apache.lucene.document.Field Field} constructors
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like
* stored values, these are also keyed by document number, but are generally intended to be
* loaded into main memory for fast access. Whereas stored values are generally intended for
* summary results from searches, per-document values are useful for things like scoring
* factors.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An
* optional file indicating which documents are live.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair
* of files, recording dimensionally indexed fields, to enable fast numeric range filtering
* and large numeric values like BigInteger and BigDecimal (1D) and geographic shape
* intersection (2D, 3D).
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}. The
* vector format stores numeric vectors in a format optimized for random access and
* computation, supporting high-dimensional nearest-neighbor search.
* </ul>
*
* <p>Details on each of these are provided in their linked pages. </div> <a id="File_Naming"></a>
*
* <h3>File Naming</h3>
*
* <div>
*
* <p>All files belonging to a segment have the same name with varying extensions. The extensions
* correspond to the different file formats described below. When using the Compound File format
* (default for small segments) these files (except for the Segment info file, the Lock file, and
* Deleted documents file) are collapsed into a single .cfs file (see below for details)
*
* <p>Typically, all segments in an index are stored in a single directory, although this is not
* required.
*
* <p>File names are never re-used. That is, when any file is saved to the Directory it is given a
* never before used filename. This is achieved using a simple generations approach. For example,
* the first segments file is segments_1, then segments_2, etc. The generation is a sequential long
* integer represented in alpha-numeric (base 36) form. </div> <a id="file-names"></a>
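*
* <p>A hedged sketch of that base-36 generation encoding in plain Java:
*
* <pre>{@code
* long generation = 42;
* String name = "segments_" + Long.toString(generation, Character.MAX_RADIX); // "segments_16"
* }</pre>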
*
* <h3>Summary of File Extensions</h3>
*
* <div>
*
* <p>The following table summarizes the names and extensions of the files in Lucene:
*
* <table class="padding4" style="border-spacing: 1px; border-collapse: separate">
* <caption>lucene filenames by extension</caption>
* <tr>
* <th>Name</th>
* <th>Extension</th>
* <th>Brief Description</th>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
* <td>segments_N</td>
* <td>Stores information about a commit point</td>
* </tr>
* <tr>
* <td><a href="#Lock_File">Lock File</a></td>
* <td>write.lock</td>
* <td>The Write lock prevents multiple IndexWriters from writing to the same
* file.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment Info}</td>
* <td>.si</td>
* <td>Stores metadata about a segment</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File}</td>
* <td>.cfs, .cfe</td>
* <td>An optional "virtual" file consisting of all the other index files for
* systems that frequently run out of file handles.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields}</td>
* <td>.fnm</td>
* <td>Stores information about the fields</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index}</td>
* <td>.fdx</td>
* <td>Contains pointers to field data</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data}</td>
* <td>.fdt</td>
* <td>The stored fields for documents</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Dictionary}</td>
* <td>.tim</td>
* <td>The term dictionary, stores term info</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Index}</td>
* <td>.tip</td>
* <td>The index into the Term Dictionary</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Frequencies}</td>
* <td>.doc</td>
* <td>Contains the list of docs which contain each term along with frequency</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Positions}</td>
* <td>.pos</td>
* <td>Stores position information about where a term occurs in the index</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Payloads}</td>
* <td>.pay</td>
* <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms}</td>
* <td>.nvd, .nvm</td>
* <td>Encodes length and boost factors for docs and fields</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values}</td>
* <td>.dvd, .dvm</td>
* <td>Encodes additional scoring factors or other per-document information.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}</td>
* <td>.tvx</td>
* <td>Stores offset into the document data file</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}</td>
* <td>.tvd</td>
* <td>Contains term vector data.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents}</td>
* <td>.liv</td>
* <td>Info about what documents are live</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}</td>
* <td>.kdd, .kdi, .kdm</td>
* <td>Holds indexed points</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}</td>
* <td>.vec, .vem, .veq, .vex</td>
* <td>Holds indexed vectors; <code>.vec</code> files contain the raw vector data,
* <code>.vem</code> the vector metadata, <code>.veq</code> the quantized vector data, and <code>.vex</code> the
* hnsw graph data.</td>
* </tr>
* </table>
*
* </div> <a id="Lock_File"></a>
*
* <h3>Lock File</h3>
*
* The write lock, which is stored in the index directory by default, is named "write.lock". If the
* lock directory is different from the index directory then the write lock will be named
* "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index
* directory. When this file is present, a writer is currently modifying the index (adding or
* removing documents). This lock file ensures that only one writer is modifying the index at a
* time. <a id="History"></a>
*
* <h3>History</h3>
*
* <p>Compatibility notes are provided in this document, describing how file formats have changed
* from prior versions:
*
* <ul>
* <li>In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit
* lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching
* or adding/deleting of docs. When the new segments file is saved (committed), it will be
* written in the new file format (meaning no specific "upgrade" process is needed). But note
* that once a commit has occurred, pre-2.1 Lucene will not be able to read the index.
* <li>In version 2.3, the file format was changed to allow segments to share a single set of doc
* store (vectors &amp; stored fields) files. This allows for faster indexing in certain
* cases. The change is fully backwards compatible (in the same way as the lock-less commits
* change in 2.1).
* <li>In version 2.4, Strings are now written as true UTF-8 byte sequence, not Java's modified
* UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">LUCENE-510</a> for
* details.
* <li>In version 2.9, an optional opaque Map&lt;String,String&gt; CommitUserData may be passed to
* IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N
* file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">LUCENE-1382</a> for
* details. Also, diagnostics were added to each segment written recording details about why
* it was written (due to flush, merge; which OS/JRE was used; etc.). See issue <a
* href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.
* <li>In version 3.0, compressed fields are no longer written to the index (they can still be
* read, but on merge the new segment will write them, uncompressed). See issue <a
* href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a> for details.
* <li>In version 3.1, segments records the code version that created them. See <a
* href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
* Additionally segments track explicitly whether or not they have term vectors. See <a
* href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a> for details.
* <li>In version 3.2, numeric fields are written natively to the stored fields file; previously
* they were stored in text format only.
* <li>In version 3.4, fields can omit position data while still indexing term frequencies.
* <li>In version 4.0, the format of the inverted index became extensible via the {@link
* org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues})
* was introduced. Normalization factors need no longer be a single byte, they can be any
* {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be
* unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into
* the postings lists. Payloads can be stored in the term vectors.
* <li>In version 4.1, the format of the postings list changed to use either of FOR compression or
* variable-byte encoding, depending upon the frequency of the term. Terms appearing only once
* were changed to inline directly into the term dictionary. Stored fields are compressed by
* default.
* <li>In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued
* type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields.
* <li>In version 4.5, DocValues were extended to explicitly represent missing values.
* <li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
* allow updating NumericDocValues fields.
* <li>In version 4.8, checksum footers were added to the end of each index file for improved data
* integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32
* checksum of the file.
* <li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is
* suitable for faceting/sorting/analytics.
* <li>In version 5.4, DocValues have been improved to store more information on disk: addresses
* for binary fields and ord indexes for multi-valued fields.
* <li>In version 6.0, Points were added, for multi-dimensional range/distance search.
* <li>In version 6.2, new Segment info format that reads/writes the index sort, to support index
* sorting.
* <li>In version 7.0, DocValues have been improved to better support sparse doc values thanks to
* an iterator API.
* <li>In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term
* freq, normalization factor) pairs that may trigger the maximum score of the block. This
* information is recorded alongside skip data in order to be able to skip blocks of doc ids
* if they may not produce high enough scores. Additionally, doc values and norms have been
* extended with jump-tables to make access O(1) instead of O(n), where n is the number of
* elements to skip when advancing in the data.
* <li>In version 8.4, postings, positions, offsets and payload lengths have moved to a more
* performant encoding that is vectorized.
* <li>In version 8.6, index sort serialization is delegated to the sorts themselves, to allow
* user-defined sorts to be used.
* <li>In version 8.6, points fields split the index tree and leaf data into separate files, to
* allow for different access patterns to the different data structures.
* <li>In version 8.7, stored fields compression became adaptive to better handle documents with
* smaller stored fields.
* <li>In version 9.0, vector-valued fields were added.
* <li>In version 9.1, vector-valued fields were modified to add a graph hierarchy.
* <li>In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by
* IndexDISI. ordToDoc mappings were added to .vem.
* <li>In version 9.5, HNSW graph connections were changed to be delta-encoded with vints.
* Additionally, metadata file size improvements were made by delta-encoding nodes by graph
* layer and not writing the node ids for the zeroth layer.
* <li>In version 9.9, vector scalar quantization support was added, allowing the HNSW vector
* format to utilize int8 quantized vectors for float32 vector search.
* <li>In version 9.12, skip data was refactored to have only two levels: every 128 docs and every
* 4,096 docs, and to be inlined in postings lists. This resulted in a speedup for queries that
* need skipping, especially conjunctions.
* </ul>
*
* <a id="Limitations"></a>
*
* <h3>Limitations</h3>
*
* <div>
*
* <p>Lucene uses a Java <code>int</code> to refer to document numbers, and the index file format
* uses an <code>Int32</code> on-disk to store document numbers. This is a limitation of both the
* index file format and the current implementation. Eventually these should be replaced with either
* <code>UInt64</code> values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
* VInt} values which have no limit. </div>
*/
package org.apache.lucene.backward_codecs.lucene912;

View File

@ -26,12 +26,10 @@ import org.apache.lucene.search.VectorScorer;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.RandomAccessInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
import org.apache.lucene.util.packed.DirectMonotonicReader;
/** Read the vector values from the index input. This supports both iterated and random access. */
abstract class OffHeapFloatVectorValues extends FloatVectorValues
implements RandomAccessVectorValues.Floats {
abstract class OffHeapFloatVectorValues extends FloatVectorValues {
protected final int dimension;
protected final int size;
@ -95,8 +93,6 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues
static class DenseOffHeapVectorValues extends OffHeapFloatVectorValues {
private int doc = -1;
public DenseOffHeapVectorValues(
int dimension,
int size,
@ -105,35 +101,16 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues
super(dimension, size, vectorSimilarityFunction, slice);
}
@Override
public float[] vectorValue() throws IOException {
return vectorValue(doc);
}
@Override
public int docID() {
return doc;
}
@Override
public int nextDoc() throws IOException {
return advance(doc + 1);
}
@Override
public int advance(int target) throws IOException {
assert docID() < target;
if (target >= size) {
return doc = NO_MORE_DOCS;
}
return doc = target;
}
@Override
public DenseOffHeapVectorValues copy() throws IOException {
return new DenseOffHeapVectorValues(dimension, size, vectorSimilarityFunction, slice.clone());
}
@Override
public DocIndexIterator iterator() {
return createDenseIterator();
}
@Override
public Bits getAcceptOrds(Bits acceptDocs) {
return acceptDocs;
@ -142,15 +119,17 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues
@Override
public VectorScorer scorer(float[] query) throws IOException {
DenseOffHeapVectorValues values = this.copy();
DocIndexIterator iterator = values.iterator();
return new VectorScorer() {
@Override
public float score() throws IOException {
return values.vectorSimilarityFunction.compare(values.vectorValue(), query);
return values.vectorSimilarityFunction.compare(
values.vectorValue(iterator.index()), query);
}
@Override
public DocIdSetIterator iterator() {
return values;
return iterator;
}
};
}
@ -186,33 +165,17 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues
fieldEntry.size());
}
@Override
public float[] vectorValue() throws IOException {
return vectorValue(disi.index());
}
@Override
public int docID() {
return disi.docID();
}
@Override
public int nextDoc() throws IOException {
return disi.nextDoc();
}
@Override
public int advance(int target) throws IOException {
assert docID() < target;
return disi.advance(target);
}
@Override
public SparseOffHeapVectorValues copy() throws IOException {
return new SparseOffHeapVectorValues(
fieldEntry, dataIn, vectorSimilarityFunction, slice.clone());
}
@Override
public DocIndexIterator iterator() {
return IndexedDISI.asDocIndexIterator(disi);
}
@Override
public int ordToDoc(int ord) {
return (int) ordToDoc.get(ord);
@ -239,15 +202,17 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues
@Override
public VectorScorer scorer(float[] query) throws IOException {
SparseOffHeapVectorValues values = this.copy();
DocIndexIterator iterator = values.iterator();
return new VectorScorer() {
@Override
public float score() throws IOException {
return values.vectorSimilarityFunction.compare(values.vectorValue(), query);
return values.vectorSimilarityFunction.compare(
values.vectorValue(iterator.index()), query);
}
@Override
public DocIdSetIterator iterator() {
return values;
return iterator;
}
};
}
@ -259,8 +224,6 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues
super(dimension, 0, VectorSimilarityFunction.COSINE, null);
}
private int doc = -1;
@Override
public int dimension() {
return super.dimension();
@ -271,26 +234,6 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues
return 0;
}
@Override
public float[] vectorValue() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int docID() {
return doc;
}
@Override
public int nextDoc() throws IOException {
return advance(doc + 1);
}
@Override
public int advance(int target) throws IOException {
return doc = NO_MORE_DOCS;
}
@Override
public OffHeapFloatVectorValues copy() throws IOException {
throw new UnsupportedOperationException();
@ -306,6 +249,11 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues
throw new UnsupportedOperationException();
}
@Override
public DocIndexIterator iterator() {
return createDenseIterator();
}
@Override
public Bits getAcceptOrds(Bits acceptDocs) {
return null;
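
The hunks above replace the old per-values cursor (docID/nextDoc/advance on the vector values themselves) with a detached DocIndexIterator plus ordinal-based vectorValue(ord). A minimal consuming sketch, mirroring the loops that appear later in this diff; leafReader and the field name are assumptions:

  // Sketch: iterating float vectors via the new DocIndexIterator pattern.
  FloatVectorValues values = leafReader.getFloatVectorValues("knn_field"); // hypothetical reader/field
  if (values != null) {
    KnnVectorValues.DocIndexIterator it = values.iterator();
    for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
      float[] vector = values.vectorValue(it.index()); // ordinal-based access, not doc id
      // ... consume (doc, vector) ...
    }
  }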

View File

@ -28,12 +28,10 @@ import org.apache.lucene.search.VectorScorer;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.RandomAccessInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
import org.apache.lucene.util.packed.DirectMonotonicReader;
/** Read the vector values from the index input. This supports both iterated and random access. */
abstract class OffHeapByteVectorValues extends ByteVectorValues
implements RandomAccessVectorValues.Bytes {
abstract class OffHeapByteVectorValues extends ByteVectorValues {
protected final int dimension;
protected final int size;
@ -108,8 +106,6 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues
static class DenseOffHeapVectorValues extends OffHeapByteVectorValues {
private int doc = -1;
public DenseOffHeapVectorValues(
int dimension,
int size,
@ -119,36 +115,17 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues
super(dimension, size, slice, vectorSimilarityFunction, byteSize);
}
@Override
public byte[] vectorValue() throws IOException {
return vectorValue(doc);
}
@Override
public int docID() {
return doc;
}
@Override
public int nextDoc() throws IOException {
return advance(doc + 1);
}
@Override
public int advance(int target) throws IOException {
assert docID() < target;
if (target >= size) {
return doc = NO_MORE_DOCS;
}
return doc = target;
}
@Override
public DenseOffHeapVectorValues copy() throws IOException {
return new DenseOffHeapVectorValues(
dimension, size, slice.clone(), vectorSimilarityFunction, byteSize);
}
@Override
public DocIndexIterator iterator() {
return createDenseIterator();
}
@Override
public Bits getAcceptOrds(Bits acceptDocs) {
return acceptDocs;
@ -157,15 +134,16 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues
@Override
public VectorScorer scorer(byte[] query) throws IOException {
DenseOffHeapVectorValues copy = this.copy();
DocIndexIterator iterator = copy.iterator();
return new VectorScorer() {
@Override
public float score() throws IOException {
return vectorSimilarityFunction.compare(copy.vectorValue(), query);
return vectorSimilarityFunction.compare(copy.vectorValue(iterator.index()), query);
}
@Override
public DocIdSetIterator iterator() {
return copy;
return iterator;
}
};
}
@ -202,27 +180,6 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues
fieldEntry.size());
}
@Override
public byte[] vectorValue() throws IOException {
return vectorValue(disi.index());
}
@Override
public int docID() {
return disi.docID();
}
@Override
public int nextDoc() throws IOException {
return disi.nextDoc();
}
@Override
public int advance(int target) throws IOException {
assert docID() < target;
return disi.advance(target);
}
@Override
public SparseOffHeapVectorValues copy() throws IOException {
return new SparseOffHeapVectorValues(
@ -234,6 +191,11 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues
return (int) ordToDoc.get(ord);
}
@Override
public DocIndexIterator iterator() {
return fromDISI(disi);
}
@Override
public Bits getAcceptOrds(Bits acceptDocs) {
if (acceptDocs == null) {
@ -255,15 +217,16 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues
@Override
public VectorScorer scorer(byte[] query) throws IOException {
SparseOffHeapVectorValues copy = this.copy();
IndexedDISI disi = copy.disi;
return new VectorScorer() {
@Override
public float score() throws IOException {
return vectorSimilarityFunction.compare(copy.vectorValue(), query);
return vectorSimilarityFunction.compare(copy.vectorValue(disi.index()), query);
}
@Override
public DocIdSetIterator iterator() {
return copy;
return disi;
}
};
}
@ -275,8 +238,6 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues
super(dimension, 0, null, VectorSimilarityFunction.COSINE, 0);
}
private int doc = -1;
@Override
public int dimension() {
return super.dimension();
@ -287,26 +248,6 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues
return 0;
}
@Override
public byte[] vectorValue() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int docID() {
return doc;
}
@Override
public int nextDoc() throws IOException {
return advance(doc + 1);
}
@Override
public int advance(int target) throws IOException {
return doc = NO_MORE_DOCS;
}
@Override
public OffHeapByteVectorValues copy() throws IOException {
throw new UnsupportedOperationException();
@ -322,6 +263,11 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues
throw new UnsupportedOperationException();
}
@Override
public DocIndexIterator iterator() {
return createDenseIterator();
}
@Override
public Bits getAcceptOrds(Bits acceptDocs) {
return null;
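
The same refactor applies to the byte variant above. For completeness, a hedged sketch of how a caller consumes the VectorScorer wired up in these hunks: the scorer's iterator drives matching and score() evaluates the current document (values and query are assumptions):

  // Sketch: consuming a VectorScorer as constructed in the hunks above.
  VectorScorer scorer = values.scorer(query); // query matches the values' encoding (byte[] here)
  if (scorer != null) { // assumption: scorer may be null when the field has no vectors
    DocIdSetIterator disi = scorer.iterator();
    for (int doc = disi.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi.nextDoc()) {
      float score = scorer.score(); // similarity of the current doc's vector to the query
    }
  }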

View File

@ -26,12 +26,10 @@ import org.apache.lucene.search.VectorScorer;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.RandomAccessInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
import org.apache.lucene.util.packed.DirectMonotonicReader;
/** Read the vector values from the index input. This supports both iterated and random access. */
abstract class OffHeapFloatVectorValues extends FloatVectorValues
implements RandomAccessVectorValues.Floats {
abstract class OffHeapFloatVectorValues extends FloatVectorValues {
protected final int dimension;
protected final int size;
@ -104,8 +102,6 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues
static class DenseOffHeapVectorValues extends OffHeapFloatVectorValues {
private int doc = -1;
public DenseOffHeapVectorValues(
int dimension,
int size,
@ -115,36 +111,17 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues
super(dimension, size, slice, vectorSimilarityFunction, byteSize);
}
@Override
public float[] vectorValue() throws IOException {
return vectorValue(doc);
}
@Override
public int docID() {
return doc;
}
@Override
public int nextDoc() throws IOException {
return advance(doc + 1);
}
@Override
public int advance(int target) throws IOException {
assert docID() < target;
if (target >= size) {
return doc = NO_MORE_DOCS;
}
return doc = target;
}
@Override
public DenseOffHeapVectorValues copy() throws IOException {
return new DenseOffHeapVectorValues(
dimension, size, slice.clone(), vectorSimilarityFunction, byteSize);
}
@Override
public DocIndexIterator iterator() {
return createDenseIterator();
}
@Override
public Bits getAcceptOrds(Bits acceptDocs) {
return acceptDocs;
@ -153,15 +130,18 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues
@Override
public VectorScorer scorer(float[] query) throws IOException {
DenseOffHeapVectorValues values = this.copy();
DocIndexIterator iterator = values.iterator();
return new VectorScorer() {
@Override
public float score() throws IOException {
return values.vectorSimilarityFunction.compare(values.vectorValue(), query);
return values.vectorSimilarityFunction.compare(
values.vectorValue(iterator.index()), query);
}
@Override
public DocIdSetIterator iterator() {
return values;
return iterator;
}
};
}
@ -198,33 +178,17 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues
fieldEntry.size());
}
@Override
public float[] vectorValue() throws IOException {
return vectorValue(disi.index());
}
@Override
public int docID() {
return disi.docID();
}
@Override
public int nextDoc() throws IOException {
return disi.nextDoc();
}
@Override
public int advance(int target) throws IOException {
assert docID() < target;
return disi.advance(target);
}
@Override
public SparseOffHeapVectorValues copy() throws IOException {
return new SparseOffHeapVectorValues(
fieldEntry, dataIn, slice.clone(), vectorSimilarityFunction, byteSize);
}
@Override
public DocIndexIterator iterator() {
return IndexedDISI.asDocIndexIterator(disi);
}
@Override
public int ordToDoc(int ord) {
return (int) ordToDoc.get(ord);
@ -251,15 +215,17 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues
@Override
public VectorScorer scorer(float[] query) throws IOException {
SparseOffHeapVectorValues values = this.copy();
DocIndexIterator iterator = values.iterator();
return new VectorScorer() {
@Override
public float score() throws IOException {
return values.vectorSimilarityFunction.compare(values.vectorValue(), query);
return values.vectorSimilarityFunction.compare(
values.vectorValue(iterator.index()), query);
}
@Override
public DocIdSetIterator iterator() {
return values;
return iterator;
}
};
}
@ -271,8 +237,6 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues
super(dimension, 0, null, VectorSimilarityFunction.COSINE, 0);
}
private int doc = -1;
@Override
public int dimension() {
return super.dimension();
@ -283,26 +247,6 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues
return 0;
}
@Override
public float[] vectorValue() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int docID() {
return doc;
}
@Override
public int nextDoc() throws IOException {
return advance(doc + 1);
}
@Override
public int advance(int target) throws IOException {
return doc = NO_MORE_DOCS;
}
@Override
public OffHeapFloatVectorValues copy() throws IOException {
throw new UnsupportedOperationException();
@ -318,6 +262,11 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues
throw new UnsupportedOperationException();
}
@Override
public DocIndexIterator iterator() {
return createDenseIterator();
}
@Override
public Bits getAcceptOrds(Bits acceptDocs) {
return null;

View File

@ -19,6 +19,7 @@ package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.GroupVIntUtil;
/** Utility class to encode/decode postings block. */
final class PostingsUtil {
@ -35,7 +36,7 @@ final class PostingsUtil {
boolean indexHasFreq,
boolean decodeFreq)
throws IOException {
docIn.readGroupVInts(docBuffer, num);
GroupVIntUtil.readGroupVInts(docIn, docBuffer, num);
if (indexHasFreq && decodeFreq) {
for (int i = 0; i < num; ++i) {
freqBuffer[i] = docBuffer[i] & 0x01;
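
The change above routes group-varint decoding through the static GroupVIntUtil helper instead of a DataInput method. As background, a sketch of the group-varint idea, illustrative only and not a statement about Lucene's exact on-disk layout: one flag byte packs four 2-bit lengths, followed by the four values in that many bytes each.

  // Illustrative group-varint decode of four values (layout is an assumption).
  static void readGroup(java.io.DataInput in, long[] dst, int offset) throws java.io.IOException {
    int flags = in.readByte() & 0xFF; // four 2-bit (numBytes - 1) fields
    for (int i = 0; i < 4; i++) {
      int numBytes = ((flags >>> (i * 2)) & 0x3) + 1; // 1..4 bytes per value
      long v = 0;
      for (int b = 0; b < numBytes; b++) {
        v |= (in.readByte() & 0xFFL) << (8 * b); // little-endian byte order
      }
      dst[offset + i] = v;
    }
  }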

View File

@ -23,3 +23,4 @@ org.apache.lucene.backward_codecs.lucene92.Lucene92Codec
org.apache.lucene.backward_codecs.lucene94.Lucene94Codec
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec
org.apache.lucene.backward_codecs.lucene99.Lucene99Codec
org.apache.lucene.backward_codecs.lucene912.Lucene912Codec

View File

@ -81,9 +81,8 @@ public final class Lucene50RWCompoundFormat extends CompoundFormat {
public Lucene50RWCompoundFormat() {}
@Override
public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si, IOContext context)
throws IOException {
return new Lucene50CompoundReader(dir, si, context);
public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si) throws IOException {
return new Lucene50CompoundReader(dir, si);
}
@Override

View File

@ -218,7 +218,7 @@ public class Lucene60PointsWriter extends PointsWriter {
FieldInfos readerFieldInfos = mergeState.fieldInfos[i];
FieldInfo readerFieldInfo = readerFieldInfos.fieldInfo(fieldInfo.name);
if (readerFieldInfo != null && readerFieldInfo.getPointDimensionCount() > 0) {
PointValues bkdReader = reader60.readers.get(readerFieldInfo.number);
PointValues bkdReader = reader60.getValues(readerFieldInfo.name);
if (bkdReader != null) {
bkdReaders.add(bkdReader);
docMaps.add(mergeState.docMaps[i]);

View File

@ -249,7 +249,7 @@ public class Lucene86PointsWriter extends PointsWriter {
// we confirmed this up above
assert reader instanceof Lucene86PointsReader;
Lucene86PointsReader reader60 = (Lucene86PointsReader) reader;
Lucene86PointsReader reader86 = (Lucene86PointsReader) reader;
// NOTE: we cannot just use the merged fieldInfo.number (instead of resolving to
// this
@ -259,7 +259,7 @@ public class Lucene86PointsWriter extends PointsWriter {
FieldInfos readerFieldInfos = mergeState.fieldInfos[i];
FieldInfo readerFieldInfo = readerFieldInfos.fieldInfo(fieldInfo.name);
if (readerFieldInfo != null && readerFieldInfo.getPointDimensionCount() > 0) {
PointValues aPointValues = reader60.readers.get(readerFieldInfo.number);
PointValues aPointValues = reader86.getValues(readerFieldInfo.name);
if (aPointValues != null) {
pointValues.add(aPointValues);
docMaps.add(mergeState.docMaps[i]);

View File

@ -29,13 +29,13 @@ import org.apache.lucene.index.ByteVectorValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.KnnVectorValues;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
/**
* Writes vector values and knn graphs to index segments.
@ -188,12 +188,13 @@ public final class Lucene90HnswVectorsWriter extends BufferingKnnVectorsWriter {
int count = 0;
ByteBuffer binaryVector =
ByteBuffer.allocate(vectors.dimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN);
for (int docV = vectors.nextDoc(); docV != NO_MORE_DOCS; docV = vectors.nextDoc(), count++) {
KnnVectorValues.DocIndexIterator iter = vectors.iterator();
for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) {
// write vector
float[] vectorValue = vectors.vectorValue();
float[] vectorValue = vectors.vectorValue(iter.index());
binaryVector.asFloatBuffer().put(vectorValue);
output.writeBytes(binaryVector.array(), binaryVector.limit());
docIds[count] = docV;
docIds[count++] = docV;
}
if (docIds.length > count) {
@ -234,7 +235,7 @@ public final class Lucene90HnswVectorsWriter extends BufferingKnnVectorsWriter {
private void writeGraph(
IndexOutput graphData,
RandomAccessVectorValues.Floats vectorValues,
FloatVectorValues vectorValues,
VectorSimilarityFunction similarityFunction,
long graphDataOffset,
long[] offsets,

View File

@ -12,7 +12,7 @@
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
 * limitations under the License.
*/
package org.apache.lucene.backward_codecs.lucene90;
@ -83,4 +83,9 @@ public class TestLucene90HnswVectorsFormat extends BaseKnnVectorsFormatTestCase
public void testMergingWithDifferentByteKnnFields() {
// unimplemented
}
@Override
public void testMismatchedFields() throws Exception {
// requires byte support
}
}

View File

@ -25,7 +25,7 @@ public class TestLucene90SegmentInfoFormat extends BaseSegmentInfoFormatTestCase
@Override
protected Version[] getVersions() {
return new Version[] {Version.LUCENE_9_0_0};
return new Version[] {Version.fromBits(9, 0, 0)};
}
@Override

View File

@ -25,6 +25,7 @@ import java.util.Objects;
import java.util.SplittableRandom;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.InfoStream;
@ -32,7 +33,6 @@ import org.apache.lucene.util.hnsw.HnswGraph;
import org.apache.lucene.util.hnsw.HnswGraphBuilder;
import org.apache.lucene.util.hnsw.HnswGraphSearcher;
import org.apache.lucene.util.hnsw.NeighborQueue;
import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
import org.apache.lucene.util.hnsw.RandomVectorScorer;
/**
@ -57,7 +57,7 @@ public final class Lucene91HnswGraphBuilder {
private final DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer();
private final VectorSimilarityFunction similarityFunction;
private final RandomAccessVectorValues.Floats vectorValues;
private final FloatVectorValues vectorValues;
private final SplittableRandom random;
private final Lucene91BoundsChecker bound;
private final HnswGraphSearcher graphSearcher;
@ -68,7 +68,7 @@ public final class Lucene91HnswGraphBuilder {
// we need two sources of vectors in order to perform diversity check comparisons without
// colliding
private RandomAccessVectorValues.Floats buildVectors;
private FloatVectorValues buildVectors;
/**
* Reads all the vectors from vector values, builds a graph connecting them by their dense
@ -83,7 +83,7 @@ public final class Lucene91HnswGraphBuilder {
* to ensure repeatable construction.
*/
public Lucene91HnswGraphBuilder(
RandomAccessVectorValues.Floats vectors,
FloatVectorValues vectors,
VectorSimilarityFunction similarityFunction,
int maxConn,
int beamWidth,
@ -113,14 +113,14 @@ public final class Lucene91HnswGraphBuilder {
}
/**
* Reads all the vectors from two copies of a {@link RandomAccessVectorValues}. Providing two
* copies enables efficient retrieval without extra data copying, while avoiding collision of the
* Reads all the vectors from two copies of a {@link FloatVectorValues}. Providing two copies
* enables efficient retrieval without extra data copying, while avoiding collision of the
* returned values.
*
* @param vectors the vectors for which to build a nearest neighbors graph. Must be an independet
* @param vectors the vectors for which to build a nearest neighbors graph. Must be an independent
* accessor for the vectors
*/
public Lucene91OnHeapHnswGraph build(RandomAccessVectorValues.Floats vectors) throws IOException {
public Lucene91OnHeapHnswGraph build(FloatVectorValues vectors) throws IOException {
if (vectors == vectorValues) {
throw new IllegalArgumentException(
"Vectors to build must be independent of the source of vectors provided to HnswGraphBuilder()");
@ -236,7 +236,7 @@ public final class Lucene91HnswGraphBuilder {
// extract all the Neighbors from the queue into an array; these will now be
// sorted from worst to best
for (int i = 0; i < candidateCount; i++) {
float similarity = candidates.minCompetitiveSimilarity();
float similarity = candidates.minimumScore();
scratch.add(candidates.popNode(), similarity);
}
}
@ -254,7 +254,7 @@ public final class Lucene91HnswGraphBuilder {
float[] candidate,
float score,
Lucene91NeighborArray neighbors,
RandomAccessVectorValues.Floats vectorValues)
FloatVectorValues vectorValues)
throws IOException {
bound.set(score);
for (int i = 0; i < neighbors.size(); i++) {

View File

@ -17,8 +17,6 @@
package org.apache.lucene.backward_codecs.lucene91;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
@ -30,6 +28,7 @@ import org.apache.lucene.index.DocsWithFieldSet;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.KnnVectorValues;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.search.DocIdSetIterator;
@ -37,7 +36,6 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.hnsw.HnswGraph;
import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
/**
* Writes vector values and knn graphs to index segments.
@ -183,9 +181,10 @@ public final class Lucene91HnswVectorsWriter extends BufferingKnnVectorsWriter {
DocsWithFieldSet docsWithField = new DocsWithFieldSet();
ByteBuffer binaryVector =
ByteBuffer.allocate(vectors.dimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN);
for (int docV = vectors.nextDoc(); docV != NO_MORE_DOCS; docV = vectors.nextDoc()) {
KnnVectorValues.DocIndexIterator iter = vectors.iterator();
for (int docV = iter.nextDoc(); docV != DocIdSetIterator.NO_MORE_DOCS; docV = iter.nextDoc()) {
// write vector
float[] vectorValue = vectors.vectorValue();
float[] vectorValue = vectors.vectorValue(iter.index());
binaryVector.asFloatBuffer().put(vectorValue);
output.writeBytes(binaryVector.array(), binaryVector.limit());
docsWithField.add(docV);
@ -243,7 +242,7 @@ public final class Lucene91HnswVectorsWriter extends BufferingKnnVectorsWriter {
}
private Lucene91OnHeapHnswGraph writeGraph(
RandomAccessVectorValues.Floats vectorValues, VectorSimilarityFunction similarityFunction)
FloatVectorValues vectorValues, VectorSimilarityFunction similarityFunction)
throws IOException {
// build graph

View File

@ -82,4 +82,9 @@ public class TestLucene91HnswVectorsFormat extends BaseKnnVectorsFormatTestCase
public void testMergingWithDifferentByteKnnFields() {
// unimplemented
}
@Override
public void testMismatchedFields() throws Exception {
// requires byte support
}
}

View File

@ -18,7 +18,6 @@
package org.apache.lucene.backward_codecs.lucene92;
import static org.apache.lucene.backward_codecs.lucene92.Lucene92RWHnswVectorsFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import java.io.IOException;
import java.nio.ByteBuffer;
@ -33,6 +32,7 @@ import org.apache.lucene.index.DocsWithFieldSet;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.KnnVectorValues;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.search.DocIdSetIterator;
@ -43,7 +43,6 @@ import org.apache.lucene.util.hnsw.HnswGraph;
import org.apache.lucene.util.hnsw.HnswGraphBuilder;
import org.apache.lucene.util.hnsw.NeighborArray;
import org.apache.lucene.util.hnsw.OnHeapHnswGraph;
import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier;
import org.apache.lucene.util.packed.DirectMonotonicWriter;
@ -190,9 +189,12 @@ public final class Lucene92HnswVectorsWriter extends BufferingKnnVectorsWriter {
DocsWithFieldSet docsWithField = new DocsWithFieldSet();
ByteBuffer binaryVector =
ByteBuffer.allocate(vectors.dimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN);
for (int docV = vectors.nextDoc(); docV != NO_MORE_DOCS; docV = vectors.nextDoc()) {
KnnVectorValues.DocIndexIterator iterator = vectors.iterator();
for (int docV = iterator.nextDoc();
docV != DocIdSetIterator.NO_MORE_DOCS;
docV = iterator.nextDoc()) {
// write vector
float[] vectorValue = vectors.vectorValue();
float[] vectorValue = vectors.vectorValue(iterator.index());
binaryVector.asFloatBuffer().put(vectorValue);
output.writeBytes(binaryVector.array(), binaryVector.limit());
docsWithField.add(docV);
@ -277,7 +279,7 @@ public final class Lucene92HnswVectorsWriter extends BufferingKnnVectorsWriter {
}
private OnHeapHnswGraph writeGraph(
RandomAccessVectorValues.Floats vectorValues, VectorSimilarityFunction similarityFunction)
FloatVectorValues vectorValues, VectorSimilarityFunction similarityFunction)
throws IOException {
DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer();
// build graph

View File

@ -72,4 +72,9 @@ public class TestLucene92HnswVectorsFormat extends BaseKnnVectorsFormatTestCase
public void testMergingWithDifferentByteKnnFields() {
// unimplemented
}
@Override
public void testMismatchedFields() throws Exception {
// requires byte support
}
}

View File

@ -36,6 +36,7 @@ import org.apache.lucene.index.DocsWithFieldSet;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.KnnVectorValues;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Sorter;
@ -52,7 +53,6 @@ import org.apache.lucene.util.hnsw.HnswGraph.NodesIterator;
import org.apache.lucene.util.hnsw.HnswGraphBuilder;
import org.apache.lucene.util.hnsw.NeighborArray;
import org.apache.lucene.util.hnsw.OnHeapHnswGraph;
import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier;
import org.apache.lucene.util.packed.DirectMonotonicWriter;
@ -216,9 +216,7 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
final int[] docIdOffsets = new int[sortMap.size()];
int offset = 1; // 0 means no vector for this (field, document)
DocIdSetIterator iterator = fieldData.docsWithField.iterator();
for (int docID = iterator.nextDoc();
docID != DocIdSetIterator.NO_MORE_DOCS;
docID = iterator.nextDoc()) {
for (int docID = iterator.nextDoc(); docID != NO_MORE_DOCS; docID = iterator.nextDoc()) {
int newDocID = sortMap.oldToNew(docID);
docIdOffsets[newDocID] = offset++;
}
@ -556,9 +554,7 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
final DirectMonotonicWriter ordToDocWriter =
DirectMonotonicWriter.getInstance(meta, vectorData, count, DIRECT_MONOTONIC_BLOCK_SHIFT);
DocIdSetIterator iterator = docsWithField.iterator();
for (int doc = iterator.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = iterator.nextDoc()) {
for (int doc = iterator.nextDoc(); doc != NO_MORE_DOCS; doc = iterator.nextDoc()) {
ordToDocWriter.add(doc);
}
ordToDocWriter.finish();
@ -590,11 +586,10 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
private static DocsWithFieldSet writeByteVectorData(
IndexOutput output, ByteVectorValues byteVectorValues) throws IOException {
DocsWithFieldSet docsWithField = new DocsWithFieldSet();
for (int docV = byteVectorValues.nextDoc();
docV != NO_MORE_DOCS;
docV = byteVectorValues.nextDoc()) {
KnnVectorValues.DocIndexIterator iter = byteVectorValues.iterator();
for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) {
// write vector
byte[] binaryValue = byteVectorValues.vectorValue();
byte[] binaryValue = byteVectorValues.vectorValue(iter.index());
assert binaryValue.length == byteVectorValues.dimension() * VectorEncoding.BYTE.byteSize;
output.writeBytes(binaryValue, binaryValue.length);
docsWithField.add(docV);
@ -608,14 +603,13 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
private static DocsWithFieldSet writeVectorData(
IndexOutput output, FloatVectorValues floatVectorValues) throws IOException {
DocsWithFieldSet docsWithField = new DocsWithFieldSet();
KnnVectorValues.DocIndexIterator iter = floatVectorValues.iterator();
ByteBuffer binaryVector =
ByteBuffer.allocate(floatVectorValues.dimension() * VectorEncoding.FLOAT32.byteSize)
.order(ByteOrder.LITTLE_ENDIAN);
for (int docV = floatVectorValues.nextDoc();
docV != NO_MORE_DOCS;
docV = floatVectorValues.nextDoc()) {
for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) {
// write vector
float[] vectorValue = floatVectorValues.vectorValue();
float[] vectorValue = floatVectorValues.vectorValue(iter.index());
binaryVector.asFloatBuffer().put(vectorValue);
output.writeBytes(binaryVector.array(), binaryVector.limit());
docsWithField.add(docV);
@ -672,11 +666,11 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
case BYTE ->
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
ByteVectorValues.fromBytes((List<byte[]>) vectors, dim));
case FLOAT32 ->
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
FloatVectorValues.fromFloats((List<float[]>) vectors, dim));
};
hnswGraphBuilder =
HnswGraphBuilder.create(scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed);
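
The only change in the block above is the scorer-supplier construction (RandomAccessVectorValues.fromBytes/fromFloats became ByteVectorValues.fromBytes and FloatVectorValues.fromFloats). Pulling the pieces together, a hedged sketch of the build path using calls visible in this diff; the build(maxOrd) signature is an assumption:

  // Sketch: building an on-heap HNSW graph from buffered float vectors.
  static OnHeapHnswGraph buildGraph(List<float[]> vectors, int dim, int M, int beamWidth)
      throws IOException {
    RandomVectorScorerSupplier supplier =
        new DefaultFlatVectorScorer()
            .getRandomVectorScorerSupplier(
                VectorSimilarityFunction.COSINE, FloatVectorValues.fromFloats(vectors, dim));
    HnswGraphBuilder builder =
        HnswGraphBuilder.create(supplier, M, beamWidth, HnswGraphBuilder.randSeed);
    return builder.build(vectors.size()); // assumed: build(maxOrd) returns the completed graph
  }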

View File

@ -39,6 +39,7 @@ import org.apache.lucene.index.DocsWithFieldSet;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.KnnVectorValues;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Sorter;
@ -56,7 +57,6 @@ import org.apache.lucene.util.hnsw.HnswGraphBuilder;
import org.apache.lucene.util.hnsw.IncrementalHnswGraphMerger;
import org.apache.lucene.util.hnsw.NeighborArray;
import org.apache.lucene.util.hnsw.OnHeapHnswGraph;
import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier;
import org.apache.lucene.util.packed.DirectMonotonicWriter;
@ -221,9 +221,7 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
final int[] docIdOffsets = new int[sortMap.size()];
int offset = 1; // 0 means no vector for this (field, document)
DocIdSetIterator iterator = fieldData.docsWithField.iterator();
for (int docID = iterator.nextDoc();
docID != DocIdSetIterator.NO_MORE_DOCS;
docID = iterator.nextDoc()) {
for (int docID = iterator.nextDoc(); docID != NO_MORE_DOCS; docID = iterator.nextDoc()) {
int newDocID = sortMap.oldToNew(docID);
docIdOffsets[newDocID] = offset++;
}
@ -482,18 +480,18 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
mergeState.knnVectorsReaders[i], mergeState.docMaps[i], mergeState.liveDocs[i]);
}
}
DocIdSetIterator mergedVectorIterator = null;
KnnVectorValues mergedVectorValues = null;
switch (fieldInfo.getVectorEncoding()) {
case BYTE ->
mergedVectorIterator =
mergedVectorValues =
KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState);
case FLOAT32 ->
mergedVectorIterator =
mergedVectorValues =
KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState);
}
graph =
merger.merge(
mergedVectorIterator, segmentWriteState.infoStream, docsWithField.cardinality());
mergedVectorValues, segmentWriteState.infoStream, docsWithField.cardinality());
vectorIndexNodeOffsets = writeGraph(graph);
}
long vectorIndexLength = vectorIndex.getFilePointer() - vectorIndexOffset;
@ -636,14 +634,13 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
private static DocsWithFieldSet writeByteVectorData(
IndexOutput output, ByteVectorValues byteVectorValues) throws IOException {
DocsWithFieldSet docsWithField = new DocsWithFieldSet();
for (int docV = byteVectorValues.nextDoc();
docV != NO_MORE_DOCS;
docV = byteVectorValues.nextDoc()) {
KnnVectorValues.DocIndexIterator iter = byteVectorValues.iterator();
for (int docId = iter.nextDoc(); docId != NO_MORE_DOCS; docId = iter.nextDoc()) {
// write vector
byte[] binaryValue = byteVectorValues.vectorValue();
byte[] binaryValue = byteVectorValues.vectorValue(iter.index());
assert binaryValue.length == byteVectorValues.dimension() * VectorEncoding.BYTE.byteSize;
output.writeBytes(binaryValue, binaryValue.length);
docsWithField.add(docV);
docsWithField.add(docId);
}
return docsWithField;
}
@ -657,11 +654,10 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
ByteBuffer buffer =
ByteBuffer.allocate(floatVectorValues.dimension() * VectorEncoding.FLOAT32.byteSize)
.order(ByteOrder.LITTLE_ENDIAN);
for (int docV = floatVectorValues.nextDoc();
docV != NO_MORE_DOCS;
docV = floatVectorValues.nextDoc()) {
KnnVectorValues.DocIndexIterator iter = floatVectorValues.iterator();
for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) {
// write vector
float[] value = floatVectorValues.vectorValue();
float[] value = floatVectorValues.vectorValue(iter.index());
buffer.asFloatBuffer().put(value);
output.writeBytes(buffer.array(), buffer.limit());
docsWithField.add(docV);
@ -718,11 +714,11 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
case BYTE ->
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
ByteVectorValues.fromBytes((List<byte[]>) vectors, dim));
case FLOAT32 ->
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
FloatVectorValues.fromFloats((List<float[]>) vectors, dim));
};
hnswGraphBuilder =
HnswGraphBuilder.create(scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed);

View File

@ -106,8 +106,8 @@ public abstract class BackwardsCompatibilityTestBase extends LuceneTestCase {
* This is a base constructor for parameterized BWC tests. The constructor arguments are provided
* by {@link com.carrotsearch.randomizedtesting.RandomizedRunner} during test execution. A {@link
* com.carrotsearch.randomizedtesting.annotations.ParametersFactory} specified in a subclass
* provides a list lists of arguments for the tests and RandomizedRunner will execute the test for
* each of the argument list.
* provides a list of arguments for the tests and RandomizedRunner will execute the test for each
 * of the argument lists.
*
* @param version the version this test should run for
* @param indexPattern an index pattern in order to open an index of see {@link
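
The corrected javadoc describes the parameter plumbing; as a concrete picture, here is a condensed sketch of such a factory, modeled on the ones later in this diff (INDEX_NAME and SUFFIX are the subclass's constants):

  // Sketch: a @ParametersFactory feeding (version, pattern) argument lists to the BWC test.
  @ParametersFactory(argumentFormatting = "Lucene-Version:%1$s; Pattern: %2$s")
  public static Iterable<Object[]> testVersionsFactory() {
    List<Object[]> params = new ArrayList<>();
    params.add(new Object[] {Version.fromBits(10, 0, 0), createPattern(INDEX_NAME, SUFFIX)});
    return params;
  }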

View File

@ -198,7 +198,7 @@ public class TestAncientIndicesCompatibility extends LuceneTestCase {
checker.setInfoStream(new PrintStream(bos, false, UTF_8));
checker.setLevel(CheckIndex.Level.MIN_LEVEL_FOR_INTEGRITY_CHECKS);
CheckIndex.Status indexStatus = checker.checkIndex();
if (version.startsWith("8.")) {
if (version.startsWith("8.") || version.startsWith("9.")) {
assertTrue(indexStatus.clean);
} else {
assertFalse(indexStatus.clean);
@ -219,10 +219,11 @@ public class TestAncientIndicesCompatibility extends LuceneTestCase {
// #12895: test on a carefully crafted 9.8.0 index (from a small contiguous subset
// of wikibigall unique terms) that shows the read-time exception of
// IntersectTermsEnum (used by WildcardQuery)
@AwaitsFix(bugUrl = "https://github.com/apache/lucene/issues/13847")
public void testWildcardQueryExceptions990() throws IOException {
Path path = createTempDir("12895");
String name = "index.12895.9.8.0.zip";
String name = "unsupported.12895.9.8.0.zip";
InputStream resource = TestAncientIndicesCompatibility.class.getResourceAsStream(name);
assertNotNull("missing zip file to reproduce #12895", resource);
TestUtil.unzip(resource, path);

View File

@ -17,7 +17,6 @@
package org.apache.lucene.backward_index;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import static org.apache.lucene.util.Version.LUCENE_9_0_0;
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
import java.io.IOException;
@ -52,6 +51,7 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.KnnVectorValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.index.MultiBits;
@ -95,7 +95,7 @@ public class TestBasicBackwardsCompatibility extends BackwardsCompatibilityTestB
private static final int DOCS_COUNT = 35;
private static final int DELETED_ID = 7;
private static final int KNN_VECTOR_MIN_SUPPORTED_VERSION = LUCENE_9_0_0.major;
private static final int KNN_VECTOR_MIN_SUPPORTED_VERSION = Version.fromBits(9, 0, 0).major;
private static final String KNN_VECTOR_FIELD = "knn_field";
private static final FieldType KNN_VECTOR_FIELD_TYPE =
KnnFloatVectorField.createFieldType(3, VectorSimilarityFunction.COSINE);
@ -477,10 +477,14 @@ public class TestBasicBackwardsCompatibility extends BackwardsCompatibilityTestB
FloatVectorValues values = ctx.reader().getFloatVectorValues(KNN_VECTOR_FIELD);
if (values != null) {
assertEquals(KNN_VECTOR_FIELD_TYPE.vectorDimension(), values.dimension());
for (int doc = values.nextDoc(); doc != NO_MORE_DOCS; doc = values.nextDoc()) {
KnnVectorValues.DocIndexIterator it = values.iterator();
for (int doc = it.nextDoc(); doc != NO_MORE_DOCS; doc = it.nextDoc()) {
float[] expectedVector = {KNN_VECTOR[0], KNN_VECTOR[1], KNN_VECTOR[2] + 0.1f * cnt};
assertArrayEquals(
"vectors do not match for doc=" + cnt, expectedVector, values.vectorValue(), 0);
"vectors do not match for doc=" + cnt,
expectedVector,
values.vectorValue(it.index()),
0);
cnt++;
}
}
@ -828,7 +832,7 @@ public class TestBasicBackwardsCompatibility extends BackwardsCompatibilityTestB
expectThrows(IllegalArgumentException.class, () -> TestUtil.addIndexesSlowly(w, reader));
assertEquals(
e.getMessage(),
"Cannot merge a segment that has been created with major version 9 into this index which has been created by major version 10");
"Cannot merge a segment that has been created with major version 10 into this index which has been created by major version 11");
w.close();
targetDir2.close();

View File

@ -58,7 +58,7 @@ public class TestDVUpdateBackwardsCompatibility extends BackwardsCompatibilityTe
public static Iterable<Object[]> testVersionsFactory() {
List<Object[]> params = new ArrayList<>();
// TODO - WHY ONLY on the first major version?
params.add(new Object[] {Version.LUCENE_9_0_0, createPattern(INDEX_NAME, SUFFIX)});
params.add(new Object[] {Version.LUCENE_10_0_0, createPattern(INDEX_NAME, SUFFIX)});
return params;
}

View File

@ -53,14 +53,14 @@ public class TestEmptyIndexBackwardsCompatibility extends BackwardsCompatibility
public static Iterable<Object[]> testVersionsFactory() {
List<Object[]> params = new ArrayList<>();
// TODO - WHY ONLY on the first major version?
params.add(new Object[] {Version.LUCENE_9_0_0, createPattern(INDEX_NAME, SUFFIX)});
params.add(new Object[] {Version.LUCENE_10_0_0, createPattern(INDEX_NAME, SUFFIX)});
return params;
}
public void testUpgradeEmptyOldIndex() throws Exception {
try (Directory dir = newDirectory(directory)) {
TestIndexUpgradeBackwardsCompatibility.newIndexUpgrader(dir).upgrade();
TestIndexUpgradeBackwardsCompatibility.checkAllSegmentsUpgraded(dir, 9);
TestIndexUpgradeBackwardsCompatibility.checkAllSegmentsUpgraded(dir, 10);
}
}
}

View File

@ -39,7 +39,7 @@ public class TestGenerateBwcIndices extends LuceneTestCase {
// To generate backcompat indexes with the current default codec, run the following gradle
// command:
// gradlew test -Ptests.bwcdir=/path/to/store/indexes -Ptests.codec=default
// -Ptests.useSecurityManager=false --tests TestGenerateBwcIndices
// -Ptests.useSecurityManager=false --tests TestGenerateBwcIndices --max-workers=1
//
// Also add testmethod with one of the index creation methods below, for example:
// -Ptestmethod=testCreateCFS
@ -82,14 +82,14 @@ public class TestGenerateBwcIndices extends LuceneTestCase {
sortedTest.createBWCIndex();
}
public void testCreateInt8HNSWIndices() throws IOException {
TestInt8HnswBackwardsCompatibility int8HnswBackwardsCompatibility =
new TestInt8HnswBackwardsCompatibility(
public void testCreateInt7HNSWIndices() throws IOException {
TestInt7HnswBackwardsCompatibility int7HnswBackwardsCompatibility =
new TestInt7HnswBackwardsCompatibility(
Version.LATEST,
createPattern(
TestInt8HnswBackwardsCompatibility.INDEX_NAME,
TestInt8HnswBackwardsCompatibility.SUFFIX));
int8HnswBackwardsCompatibility.createBWCIndex();
TestInt7HnswBackwardsCompatibility.INDEX_NAME,
TestInt7HnswBackwardsCompatibility.SUFFIX));
int7HnswBackwardsCompatibility.createBWCIndex();
}
private boolean isInitialMajorVersionRelease() {

View File

@ -55,7 +55,7 @@ public class TestIndexSortBackwardsCompatibility extends BackwardsCompatibilityT
static final String INDEX_NAME = "sorted";
static final String SUFFIX = "";
private static final Version FIRST_PARENT_DOC_VERSION = Version.LUCENE_9_11_0;
private static final Version FIRST_PARENT_DOC_VERSION = Version.fromBits(9, 11, 0);
private static final String PARENT_FIELD_NAME = "___parent";
public TestIndexSortBackwardsCompatibility(Version version, String pattern) {

View File

@ -23,17 +23,22 @@ import java.io.IOException;
import org.apache.lucene.backward_codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader;
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.KnnFloatVectorField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.search.IndexSearcher;
@ -41,23 +46,23 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.analysis.MockAnalyzer;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.quantization.QuantizedByteVectorValues;
public class TestInt8HnswBackwardsCompatibility extends BackwardsCompatibilityTestBase {
public class TestInt7HnswBackwardsCompatibility extends BackwardsCompatibilityTestBase {
static final String INDEX_NAME = "int8_hnsw";
static final String INDEX_NAME = "int7_hnsw";
static final String SUFFIX = "";
private static final Version FIRST_INT8_HNSW_VERSION = Version.LUCENE_9_10_0;
private static final Version FIRST_INT7_HNSW_VERSION = Version.fromBits(9, 10, 0);
private static final String KNN_VECTOR_FIELD = "knn_field";
private static final int DOC_COUNT = 30;
private static final FieldType KNN_VECTOR_FIELD_TYPE =
KnnFloatVectorField.createFieldType(3, VectorSimilarityFunction.COSINE);
private static final float[] KNN_VECTOR = {0.2f, -0.1f, 0.1f};
public TestInt8HnswBackwardsCompatibility(Version version, String pattern) {
public TestInt7HnswBackwardsCompatibility(Version version, String pattern) {
super(version, pattern);
}
/** Provides all sorted versions to the test-framework */
@ParametersFactory(argumentFormatting = "Lucene-Version:%1$s; Pattern: %2$s")
public static Iterable<Object[]> testVersionsFactory() throws IllegalAccessException {
return allVersion(INDEX_NAME, SUFFIX);
@ -76,7 +81,7 @@ public class TestInt8HnswBackwardsCompatibility extends BackwardsCompatibilityTe
@Override
protected boolean supportsVersion(Version version) {
return version.onOrAfter(FIRST_INT8_HNSW_VERSION);
return version.onOrAfter(FIRST_INT7_HNSW_VERSION);
}
@Override
@ -84,7 +89,7 @@ public class TestInt8HnswBackwardsCompatibility extends BackwardsCompatibilityTe
// We don't use the default codec
}
public void testInt8HnswIndexAndSearch() throws Exception {
public void testInt7HnswIndexAndSearch() throws Exception {
IndexWriterConfig indexWriterConfig =
newIndexWriterConfig(new MockAnalyzer(random()))
.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
@ -108,7 +113,6 @@ public class TestInt8HnswBackwardsCompatibility extends BackwardsCompatibilityTe
assertKNNSearch(searcher, KNN_VECTOR, 10, 10, "0");
}
}
// This will confirm the docs are really sorted
TestUtil.checkIndex(directory);
}
@ -117,7 +121,7 @@ public class TestInt8HnswBackwardsCompatibility extends BackwardsCompatibilityTe
IndexWriterConfig conf =
new IndexWriterConfig(new MockAnalyzer(random()))
.setMaxBufferedDocs(10)
.setCodec(TestUtil.getDefaultCodec())
.setCodec(getCodec())
.setMergePolicy(NoMergePolicy.INSTANCE);
try (IndexWriter writer = new IndexWriter(dir, conf)) {
for (int i = 0; i < DOC_COUNT; i++) {
@ -147,4 +151,29 @@ public class TestInt8HnswBackwardsCompatibility extends BackwardsCompatibilityTe
assertKNNSearch(searcher, KNN_VECTOR, 10, 10, "0");
}
}
// #13880: make sure the BWC index really contains quantized HNSW not float32
public void testIndexIsReallyQuantized() throws Exception {
try (DirectoryReader reader = DirectoryReader.open(directory)) {
for (LeafReaderContext leafContext : reader.leaves()) {
KnnVectorsReader knnVectorsReader = ((CodecReader) leafContext.reader()).getVectorReader();
assertTrue(
"expected PerFieldKnnVectorsFormat.FieldsReader but got: " + knnVectorsReader,
knnVectorsReader instanceof PerFieldKnnVectorsFormat.FieldsReader);
KnnVectorsReader forField =
((PerFieldKnnVectorsFormat.FieldsReader) knnVectorsReader)
.getFieldReader(KNN_VECTOR_FIELD);
assertTrue(forField instanceof Lucene99HnswVectorsReader);
QuantizedByteVectorValues quantized =
((Lucene99HnswVectorsReader) forField).getQuantizedVectorValues(KNN_VECTOR_FIELD);
assertNotNull(
"KnnVectorsReader should have quantized interface for field " + KNN_VECTOR_FIELD,
quantized);
}
}
}
}

View File

@ -31,13 +31,15 @@ import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.analysis.MockAnalyzer;
import org.apache.lucene.tests.util.LineFileDocs;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
@LuceneTestCase.AwaitsFix(bugUrl = "https://github.com/apache/lucene/issues/13847")
public class TestMoreTermsBackwardsCompatibility extends BackwardsCompatibilityTestBase {
static final String INDEX_NAME = "moreterms";
static final String INDEX_NAME = "unsupported.moreterms";
static final String SUFFIX = "";
@ -48,7 +50,7 @@ public class TestMoreTermsBackwardsCompatibility extends BackwardsCompatibilityT
@ParametersFactory(argumentFormatting = "Lucene-Version:%1$s; Pattern: %2$s")
public static Iterable<Object[]> testVersionsFactory() {
List<Object[]> params = new ArrayList<>();
params.add(new Object[] {Version.LUCENE_9_0_0, createPattern(INDEX_NAME, SUFFIX)});
params.add(new Object[] {Version.fromBits(9, 0, 0), createPattern(INDEX_NAME, SUFFIX)});
return params;
}

Some files were not shown because too many files have changed in this diff.