diff --git a/.github/workflows/run-checks-all.yml b/.github/workflows/run-checks-all.yml index 3f899903dce..1bc0a0b564b 100644 --- a/.github/workflows/run-checks-all.yml +++ b/.github/workflows/run-checks-all.yml @@ -10,7 +10,7 @@ on: push: branches: - 'main' - - 'branch_9x' + - 'branch_10x' env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} diff --git a/.github/workflows/run-checks-gradle-upgrade.yml b/.github/workflows/run-checks-gradle-upgrade.yml index 4e0ef65f03a..07b7210cf4e 100644 --- a/.github/workflows/run-checks-gradle-upgrade.yml +++ b/.github/workflows/run-checks-gradle-upgrade.yml @@ -6,7 +6,7 @@ on: pull_request: branches: - 'main' - - 'branch_9x' + - 'branch_10x' paths: - '.github/workflows/run-checks-gradle-upgrade.yml' - 'gradle/wrapper/**' @@ -14,7 +14,7 @@ on: push: branches: - 'main' - - 'branch_9x' + - 'branch_10x' paths: - '.github/workflows/run-checks-gradle-upgrade.yml' - 'gradle/wrapper/**' diff --git a/.github/workflows/run-checks-mod-analysis-common.yml b/.github/workflows/run-checks-mod-analysis-common.yml index df83212757c..a208039a99f 100644 --- a/.github/workflows/run-checks-mod-analysis-common.yml +++ b/.github/workflows/run-checks-mod-analysis-common.yml @@ -6,7 +6,7 @@ on: pull_request: branches: - 'main' - - 'branch_9x' + - 'branch_10x' paths: - '.github/workflows/run-checks-mod-analysis-common.yml' - 'lucene/analysis/common/**' @@ -14,7 +14,7 @@ on: push: branches: - 'main' - - 'branch_9x' + - 'branch_10x' paths: - '.github/workflows/run-checks-mod-analysis-common.yml' - 'lucene/analysis/common/**' diff --git a/.github/workflows/run-checks-mod-distribution.tests.yml b/.github/workflows/run-checks-mod-distribution.tests.yml index 497382d7579..e3af5812c80 100644 --- a/.github/workflows/run-checks-mod-distribution.tests.yml +++ b/.github/workflows/run-checks-mod-distribution.tests.yml @@ -6,12 +6,12 @@ on: pull_request: branches: - 'main' - - 'branch_9x' + - 'branch_10x' push: branches: - 'main' - - 'branch_9x' + - 'branch_10x' env: GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} diff --git a/NOTICE.txt b/NOTICE.txt index ea6903484c0..4b758e824d1 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -1,5 +1,5 @@ Apache Lucene -Copyright 2001-2022 The Apache Software Foundation +Copyright 2001-2024 The Apache Software Foundation This product includes software developed at The Apache Software Foundation (http://www.apache.org/). 
diff --git a/build.gradle b/build.gradle index d406f237541..81e61a35f13 100644 --- a/build.gradle +++ b/build.gradle @@ -41,7 +41,7 @@ apply from: file('gradle/globals.gradle') // Calculate project version: version = { // Release manager: update base version here after release: - String baseVersion = '10.0.0' + String baseVersion = '11.0.0' // On a release explicitly set release version in one go: // -Dversion.release=x.y.z diff --git a/dev-docs/working-between-major-versions.adoc b/dev-docs/working-between-major-versions.adoc index 41824740683..0a42299f252 100644 --- a/dev-docs/working-between-major-versions.adoc +++ b/dev-docs/working-between-major-versions.adoc @@ -51,7 +51,7 @@ cd lucene git clone git@github.com:apache/lucene.git main cd main # For each branch that you want a separate directory created for, add a worktree -git worktree add ../9x branch_9x +git worktree add ../10x branch_10x ---- === Using the Worktrees diff --git a/dev-tools/doap/lucene.rdf b/dev-tools/doap/lucene.rdf index 7c400eb545b..afc39dea5a3 100644 --- a/dev-tools/doap/lucene.rdf +++ b/dev-tools/doap/lucene.rdf @@ -67,13 +67,27 @@ + <release> + <Version> + <name>lucene-10.0.0</name> + <created>2024-10-14</created> + <revision>10.0.0</revision> + </Version> + </release> + <release> + <Version> + <name>lucene-9.12.0</name> + <created>2024-09-28</created> + <revision>9.12.0</revision> + </Version> + </release> <release> <Version> <name>lucene-9.11.1</name> <created>2024-06-27</created> <revision>9.11.1</revision> - </Version>. + </Version> </release> <release> <Version> <name>lucene-9.11.0</name> @@ -186,6 +200,13 @@ <revision>9.0.0</revision> + <release> + <Version> + <name>lucene-8.11.4</name> + <created>2024-09-24</created> + <revision>8.11.4</revision> + </Version> + </release> <release> <Version> <name>lucene-8.11.3</name> diff --git a/dev-tools/scripts/addBackcompatIndexes.py b/dev-tools/scripts/addBackcompatIndexes.py index 80272ec0f0c..3056c8268d4 100755 --- a/dev-tools/scripts/addBackcompatIndexes.py +++ b/dev-tools/scripts/addBackcompatIndexes.py @@ -40,7 +40,7 @@ def create_and_add_index(source, indextype, index_version, current_version, temp 'cfs': 'index', 'nocfs': 'index', 'sorted': 'sorted', - 'int8_hnsw': 'int8_hnsw', + 'int7_hnsw': 'int7_hnsw', 'moreterms': 'moreterms', 'dvupdates': 'dvupdates', 'emptyIndex': 'empty' @@ -61,7 +61,7 @@ def create_and_add_index(source, indextype, index_version, current_version, temp 'cfs': 'testCreateCFS', 'nocfs': 'testCreateNoCFS', 'sorted': 'testCreateSortedIndex', - 'int8_hnsw': 'testCreateInt8HNSWIndices', + 'int7_hnsw': 'testCreateInt7HNSWIndices', 'moreterms': 'testCreateMoreTermsIndex', 'dvupdates': 'testCreateIndexWithDocValuesUpdates', 'emptyIndex': 'testCreateEmptyIndex' @@ -206,7 +206,7 @@ def main(): current_version = scriptutil.Version.parse(scriptutil.find_current_version()) create_and_add_index(source, 'cfs', c.version, current_version, c.temp_dir) create_and_add_index(source, 'nocfs', c.version, current_version, c.temp_dir) - create_and_add_index(source, 'int8_hnsw', c.version, current_version, c.temp_dir) + create_and_add_index(source, 'int7_hnsw', c.version, current_version, c.temp_dir) should_make_sorted = current_version.is_back_compat_with(c.version) \ and (c.version.major > 6 or (c.version.major == 6 and c.version.minor >= 2)) if should_make_sorted: diff --git a/dev-tools/scripts/buildAndPushRelease.py b/dev-tools/scripts/buildAndPushRelease.py index 117d07af5f3..8985d77cec2 100755 --- a/dev-tools/scripts/buildAndPushRelease.py +++ b/dev-tools/scripts/buildAndPushRelease.py @@ -112,8 +112,10 @@ def prepare(root, version, pause_before_sign, gpg_key_id, gpg_password, gpg_home checkDOAPfiles(version) if not dev_mode: - print(' ./gradlew --stacktrace --no-daemon clean check') - run('./gradlew --stacktrace --no-daemon clean check') + print(' ./gradlew --stacktrace --no-daemon clean') + run('./gradlew --stacktrace --no-daemon clean') + print(' ./gradlew --stacktrace --no-daemon check') + run('./gradlew --stacktrace
--no-daemon check') else: print(' skipping precommit check due to dev-mode') diff --git a/dev-tools/scripts/releaseWizard.py b/dev-tools/scripts/releaseWizard.py index 562abf8f6e7..d599095619d 100755 --- a/dev-tools/scripts/releaseWizard.py +++ b/dev-tools/scripts/releaseWizard.py @@ -239,7 +239,7 @@ def maybe_remove_rc_from_svn(): logfile="svn_rm.log", tee=True, vars={ - 'dist_folder': """lucene-{{ release_version }}-RC{{ rc_number }}-rev{{ build_rc.git_rev | default("", True) }}""", + 'dist_folder': """lucene-{{ release_version }}-RC{{ rc_number }}-rev-{{ build_rc.git_rev | default("", True) }}""", 'dist_url': "{{ dist_url_base }}/{{ dist_folder }}" } )], diff --git a/gradle/hacks/gradle-archives.gradle b/gradle/hacks/gradle-archives.gradle index cc8561c47a0..363b6765140 100644 --- a/gradle/hacks/gradle-archives.gradle +++ b/gradle/hacks/gradle-archives.gradle @@ -19,6 +19,7 @@ allprojects { tasks.withType(AbstractArchiveTask).configureEach { task -> duplicatesStrategy = DuplicatesStrategy.FAIL + preserveFileTimestamps = false reproducibleFileOrder = true dirPermissions { it.unix(0755) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index fba7e1144c2..5d9343bca2a 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -3,6 +3,78 @@ Lucene Change Log For more information on past and future Lucene versions, please see: http://s.apache.org/luceneversions +======================= Lucene 11.0.0 ======================= + +API Changes +--------------------- +(No changes) + +New Features +--------------------- +(No changes) + +Improvements +--------------------- +(No changes) + +Optimizations +--------------------- +(No changes) + +Bug Fixes +--------------------- +(No changes) + +Other +--------------------- +(No changes) + +======================= Lucene 10.1.0 ======================= + +API Changes +--------------------- + +* GITHUB#13859: Allow open-ended ranges in Intervals range queries. (Mayya Sharipova) + + +New Features +--------------------- +(No changes) + +Improvements +--------------------- +(No changes) + +Optimizations +--------------------- + +* GITHUB#13828: Reduce long[] array allocation for bitset in readBitSetIterator. (Zhang Chao) + +* GITHUB#13800: MaxScoreBulkScorer now recomputes scorer partitions when the + minimum competitive allows for a more favorable partitioning. (Adrien Grand) + +Bug Fixes +--------------------- +* GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended + when they were not sorted by startOffset. (Seunghan Jung) +* GITHUB#13884: Remove broken .toArray from Long/CharObjectHashMap entirely. (Pan Guixin) +* GITHUB#12686: Added support for highlighting IndexOrDocValuesQuery. (Prudhvi Godithi) + +Build +--------------------- + +* Upgrade forbiddenapis to version 3.8. (Uwe Schindler) + +Other +--------------------- +(No changes) + +======================== Lucene 10.0.1 ======================= + +Bug Fixes +--------------------- + + ======================= Lucene 10.0.0 ======================= API Changes @@ -48,9 +120,9 @@ API Changes * GITHUB#12296: Make IndexReader and IndexReaderContext classes explicitly sealed. They have already been runtime-checked to only be implemented by the specific classes - so this is effectively a non-breaking change. + so this is effectively a non-breaking change. (Petr Portnov) -* GITHUB#12276: Rename DaciukMihovAutomatonBuilder to StringsToAutomaton +* GITHUB#12276: Rename DaciukMihovAutomatonBuilder to StringsToAutomaton. 
(Michael McCandless) * GITHUB#12321: Reduced visibility of StringsToAutomaton. Please use Automata#makeStringUnion instead. (Greg Miller) @@ -120,8 +192,17 @@ API Changes * GITHUB#13328: Convert many basic Lucene classes to record classes, including CollectionStatistics, TermStatistics and LeafMetadata. (Shubham Chaudhary) -* GITHUB#13780: Remove `IndexSearcher#search(List, Weight, Collector)` in favour of the newly - introduced `IndexSearcher#search(LeafReaderContextPartition[], Weight, Collector)` +* GITHUB#13780: Remove IndexSearcher#search(List, Weight, Collector) in favour of the newly + introduced IndexSearcher#search(LeafReaderContextPartition[], Weight, Collector). (Luca Cavanna) + +* GITHUB#13779: First-class random access API for KnnVectorValues + unifies Byte/FloatVectorValues incorporating RandomAccess* API and introduces + DocIndexIterator for iterative access in place of direct inheritance from DISI. (Michael Sokolov) + +* GITHUB#13845: Add missing with-discountOverlaps Similarity constructor variants. (Pierre Salagnac, Christine Poerschke, Robert Muir) + +* GITHUB#13820, GITHUB#13825, GITHUB#13830: Corrects DataInput.readGroupVInts to be public and not-final, removes the protected + DataInput.readGroupVInt method. (Zhang Chao, Robert Muir, Uwe Schindler, Dawid Weiss) New Features --------------------- @@ -209,7 +290,7 @@ Bug Fixes * LUCENE-10599: LogMergePolicy is more likely to keep merging segments until they reach the maximum merge size. (Adrien Grand) -* GITHUB#12220: Hunspell: disallow hidden title-case entries from compound middle/end +* GITHUB#12220: Hunspell: disallow hidden title-case entries from compound middle/end. (Peter Gromov) * GITHUB#12878: Fix the declared Exceptions of Expression#evaluate() to match those of DoubleValues#doubleValue(). (Uwe Schindler) @@ -292,9 +373,17 @@ Build ======================== Lucene 9.12.0 ======================= +Security Fixes +--------------------- + +* Deserialization of Untrusted Data vulnerability in Apache Lucene Replicator - CVE-2024-45772 + (Summ3r from Vidar-Team, Robert Muir, Paul Irwin) + API Changes --------------------- +* GITHUB#13806: Add TermInSetQuery#getBytesRefIterator to be able to iterate over query terms. (Christoph Büscher) + * GITHUB#13469: Expose FlatVectorsFormat as a first-class format; can be configured using a custom Codec. (Michael Sokolov) * GITHUB#13612: Hunspell: add Suggester#proceedPastRep to avoid losing relevant suggestions. (Peter Gromov) @@ -311,6 +400,9 @@ API Changes * GITHUB#13568, GITHUB#13750: Add DrillSideways#search method that supports any CollectorManagers for drill-sideways dimensions or drill-down. (Egor Potemkin) +* GITHUB#13757: For similarities, provide default computeNorm implementation and remove remaining discountOverlaps setters. + (Christine Poerschke, Adrien Grand, Robert Muir) + New Features --------------------- @@ -418,8 +510,6 @@ Optimizations * GITHUB#13742: Reorder checks in LRUQueryCache#count (Shubham Chaudhary) -* GITHUB#13686: Replace Map with IntObjectHashMap for DV producer (Pan Guixin) - * GITHUB#13697: Add a bulk scorer to ToParentBlockJoinQuery, which delegates to the bulk scorer of the child query. This should speed up query evaluation when the child query has a specialized bulk scorer, such as disjunctive queries. (Mike Pellegrini) @@ -470,6 +560,8 @@ Bug Fixes `IndexWriter.forceMerge` or `IndexWriter.addIndexes(CodecReader...)`, or reindexing entirely. +* GITHUB#13799: Disable intra-merge parallelism for all structures but kNN vectors. 
(Ben Trent) + Build --------------------- @@ -482,6 +574,8 @@ Other * GITHUB#13720: Add float comparison based on unit of least precision and use it to stop test failures caused by float summation not being associative in IEEE 754. (Alex Herbert, Stefan Vodita) +* Remove code triggering forbidden-apis regarding Java serialization. (Uwe Schindler, Robert Muir) + ======================== Lucene 9.11.1 ======================= Bug Fixes diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md index 3052c7319ac..456e867800f 100644 --- a/lucene/MIGRATE.md +++ b/lucene/MIGRATE.md @@ -19,6 +19,12 @@ ## Migration from Lucene 9.x to Lucene 10.0 +### Changes to DataInput.readGroupVInt and readGroupVInts methods + +As part of GITHUB#13820, GITHUB#13825, GITHUB#13830, this issue corrects DataInput.readGroupVInts +to be public and not-final, allowing subclasses to override it. This change also removes the protected +DataInput.readGroupVInt method: subclasses should delegate or reimplement it entirely. + ### OpenNLP dependency upgrade [Apache OpenNLP](https://opennlp.apache.org) 2.x opens the door to accessing various models via the ONNX runtime. To migrate you will need to update any deprecated OpenNLP methods that you may be using. @@ -888,3 +894,7 @@ additional vectors into the same field with either 4 or 7 bit quantization (or no quantization), and ensure all older (9.x written) segments are rewritten either via `IndexWriter.forceMerge` or `IndexWriter.addIndexes(CodecReader...)`, or reindexing entirely. + +### Vector values APIs switched to primarily random-access + +`{Byte/Float}VectorValues` no longer inherit from `DocIdSetIterator`. Rather they extend a common class, `KnnVectorValues`, that provides a random access API (previously provided by `RandomAccessVectorValues`, now removed), and an `iterator()` method for retrieving `DocIndexIterator`: an iterator which is a DISI that also provides an `index()` method. Therefore, any iteration over vector values must now be performed using the values' `iterator()`. Random access works as before, but does not require casting to `RandomAccessVectorValues`. 
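A minimal sketch of the iteration pattern this migration note describes, under the Lucene 10 API: the field name `knn_field` is invented for the example, and `getFloatVectorValues` returns null for segments that have no such field.

```java
import java.io.IOException;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.KnnVectorValues;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.search.DocIdSetIterator;

class VectorIterationExample {
  // 9.x style: the values object itself was the DocIdSetIterator.
  // 10.x style: ask the values for an iterator, and use index() for random access.
  static void consume(LeafReader leafReader) throws IOException {
    FloatVectorValues values = leafReader.getFloatVectorValues("knn_field");
    if (values == null) {
      return; // this segment has no vectors for the field
    }
    KnnVectorValues.DocIndexIterator it = values.iterator();
    for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
      float[] vector = values.vectorValue(it.index()); // random access by ordinal
      // ... use doc and vector
    }
  }
}
```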
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecModel.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecModel.java index 8189de4dd6c..68fd3b5884b 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecModel.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecModel.java @@ -18,10 +18,10 @@ package org.apache.lucene.analysis.synonym.word2vec; import java.io.IOException; +import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefHash; import org.apache.lucene.util.TermAndVector; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; /** * Word2VecModel is a class representing the parsed Word2Vec model containing the vectors for each @@ -29,7 +29,7 @@ import org.apache.lucene.util.hnsw.RandomAccessVectorValues; * * @lucene.experimental */ -public class Word2VecModel implements RandomAccessVectorValues.Floats { +public class Word2VecModel extends FloatVectorValues { private final int dictionarySize; private final int vectorDimension; diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java index ae2bcfdfd48..72aa96ed2a9 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java @@ -49,7 +49,7 @@ import org.apache.lucene.util.Version; public class TestCustomAnalyzer extends BaseTokenStreamTestCase { @SuppressWarnings("deprecation") - private static final Version LUCENE_9_0_0 = Version.LUCENE_9_0_0; + private static final Version LUCENE_10_0_0 = Version.LUCENE_10_0_0; // Test some examples (TODO: we only check behavior, we may need something like // TestRandomChains...) 
@@ -111,7 +111,7 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase { public void testVersionAwareFilter() throws Exception { CustomAnalyzer a = CustomAnalyzer.builder() - .withDefaultMatchVersion(LUCENE_9_0_0) + .withDefaultMatchVersion(LUCENE_10_0_0) .withTokenizer(StandardTokenizerFactory.class) .addTokenFilter(DummyVersionAwareTokenFilterFactory.class) .build(); @@ -128,7 +128,7 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase { public void testFactoryHtmlStripClassicFolding() throws Exception { CustomAnalyzer a = CustomAnalyzer.builder() - .withDefaultMatchVersion(LUCENE_9_0_0) + .withDefaultMatchVersion(LUCENE_10_0_0) .addCharFilter(HTMLStripCharFilterFactory.class) .withTokenizer(ClassicTokenizerFactory.class) .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "true") @@ -164,7 +164,7 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase { public void testHtmlStripClassicFolding() throws Exception { CustomAnalyzer a = CustomAnalyzer.builder() - .withDefaultMatchVersion(LUCENE_9_0_0) + .withDefaultMatchVersion(LUCENE_10_0_0) .addCharFilter("htmlstrip") .withTokenizer("classic") .addTokenFilter("asciifolding", "preserveOriginal", "true") @@ -513,7 +513,7 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase { @Override public TokenStream create(TokenStream input) { - if (luceneMatchVersion.equals(LUCENE_9_0_0)) { + if (luceneMatchVersion.equals(LUCENE_10_0_0)) { return input; } return new LowerCaseFilter(input); diff --git a/lucene/backward-codecs/src/java/module-info.java b/lucene/backward-codecs/src/java/module-info.java index fbc2cdba98e..4607c21eb7e 100644 --- a/lucene/backward-codecs/src/java/module-info.java +++ b/lucene/backward-codecs/src/java/module-info.java @@ -36,6 +36,7 @@ module org.apache.lucene.backward_codecs { exports org.apache.lucene.backward_codecs.lucene94; exports org.apache.lucene.backward_codecs.lucene95; exports org.apache.lucene.backward_codecs.lucene99; + exports org.apache.lucene.backward_codecs.lucene912; exports org.apache.lucene.backward_codecs.packed; exports org.apache.lucene.backward_codecs.store; @@ -62,5 +63,6 @@ module org.apache.lucene.backward_codecs { org.apache.lucene.backward_codecs.lucene92.Lucene92Codec, org.apache.lucene.backward_codecs.lucene94.Lucene94Codec, org.apache.lucene.backward_codecs.lucene95.Lucene95Codec, - org.apache.lucene.backward_codecs.lucene99.Lucene99Codec; + org.apache.lucene.backward_codecs.lucene99.Lucene99Codec, + org.apache.lucene.backward_codecs.lucene912.Lucene912Codec; } diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50CompoundFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50CompoundFormat.java index d473efa14a4..ca697b2948b 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50CompoundFormat.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50CompoundFormat.java @@ -77,9 +77,8 @@ public final class Lucene50CompoundFormat extends CompoundFormat { public Lucene50CompoundFormat() {} @Override - public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si, IOContext context) - throws IOException { - return new Lucene50CompoundReader(dir, si, context); + public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si) throws IOException { + return new Lucene50CompoundReader(dir, si); } @Override diff --git 
a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50CompoundReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50CompoundReader.java index d833ba7b342..8083a2de7d3 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50CompoundReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/Lucene50CompoundReader.java @@ -31,6 +31,7 @@ import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.util.CollectionUtil; import org.apache.lucene.util.IOUtils; @@ -57,8 +58,7 @@ final class Lucene50CompoundReader extends CompoundDirectory { /** Create a new CompoundFileDirectory. */ // TODO: we should just pre-strip "entries" and append segment name up-front like simpletext? // this need not be a "general purpose" directory anymore (it only writes index files) - public Lucene50CompoundReader(Directory directory, SegmentInfo si, IOContext context) - throws IOException { + public Lucene50CompoundReader(Directory directory, SegmentInfo si) throws IOException { this.directory = directory; this.segmentName = si.name; String dataFileName = @@ -74,7 +74,7 @@ final class Lucene50CompoundReader extends CompoundDirectory { } expectedLength += CodecUtil.footerLength(); - handle = directory.openInput(dataFileName, context); + handle = directory.openInput(dataFileName, IOContext.DEFAULT.withReadAdvice(ReadAdvice.NORMAL)); // DirectoryUtil.openInput(directory, dataFileName, context); try { CodecUtil.checkIndexHeader( @@ -170,7 +170,7 @@ final class Lucene50CompoundReader extends CompoundDirectory { + entries.keySet() + ")"); } - return handle.slice(name, entry.offset, entry.length); + return handle.slice(name, entry.offset, entry.length, context.readAdvice()); } /** Returns an array of strings, one for each file in the directory. 
*/ diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene80/Lucene80DocValuesProducer.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene80/Lucene80DocValuesProducer.java index 211267d4c03..c5754e5d1e5 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene80/Lucene80DocValuesProducer.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene80/Lucene80DocValuesProducer.java @@ -17,6 +17,8 @@ package org.apache.lucene.backward_codecs.lucene80; import java.io.IOException; +import java.util.HashMap; +import java.util.Map; import org.apache.lucene.backward_codecs.packed.LegacyDirectMonotonicReader; import org.apache.lucene.backward_codecs.packed.LegacyDirectReader; import org.apache.lucene.backward_codecs.store.EndiannessReverserUtil; @@ -39,7 +41,6 @@ import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum.SeekStatus; -import org.apache.lucene.internal.hppc.IntObjectHashMap; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.DataInput; @@ -52,11 +53,11 @@ import org.apache.lucene.util.compress.LZ4; /** reader for {@link Lucene80DocValuesFormat} */ final class Lucene80DocValuesProducer extends DocValuesProducer { - private final IntObjectHashMap numerics = new IntObjectHashMap<>(); - private final IntObjectHashMap binaries = new IntObjectHashMap<>(); - private final IntObjectHashMap sorted = new IntObjectHashMap<>(); - private final IntObjectHashMap sortedSets = new IntObjectHashMap<>(); - private final IntObjectHashMap sortedNumerics = new IntObjectHashMap<>(); + private final Map numerics = new HashMap<>(); + private final Map binaries = new HashMap<>(); + private final Map sorted = new HashMap<>(); + private final Map sortedSets = new HashMap<>(); + private final Map sortedNumerics = new HashMap<>(); private final IndexInput data; private final int maxDoc; private int version = -1; @@ -138,7 +139,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer { } byte type = meta.readByte(); if (type == Lucene80DocValuesFormat.NUMERIC) { - numerics.put(info.number, readNumeric(meta)); + numerics.put(info.name, readNumeric(meta)); } else if (type == Lucene80DocValuesFormat.BINARY) { final boolean compressed; if (version >= Lucene80DocValuesFormat.VERSION_CONFIGURABLE_COMPRESSION) { @@ -157,13 +158,13 @@ final class Lucene80DocValuesProducer extends DocValuesProducer { } else { compressed = version >= Lucene80DocValuesFormat.VERSION_BIN_COMPRESSED; } - binaries.put(info.number, readBinary(meta, compressed)); + binaries.put(info.name, readBinary(meta, compressed)); } else if (type == Lucene80DocValuesFormat.SORTED) { - sorted.put(info.number, readSorted(meta)); + sorted.put(info.name, readSorted(meta)); } else if (type == Lucene80DocValuesFormat.SORTED_SET) { - sortedSets.put(info.number, readSortedSet(meta)); + sortedSets.put(info.name, readSortedSet(meta)); } else if (type == Lucene80DocValuesFormat.SORTED_NUMERIC) { - sortedNumerics.put(info.number, readSortedNumeric(meta)); + sortedNumerics.put(info.name, readSortedNumeric(meta)); } else { throw new CorruptIndexException("invalid type: " + type, meta); } @@ -425,7 +426,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer { @Override public NumericDocValues getNumeric(FieldInfo field) throws 
IOException { - NumericEntry entry = numerics.get(field.number); + NumericEntry entry = numerics.get(field.name); return getNumeric(entry); } @@ -914,7 +915,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer { @Override public BinaryDocValues getBinary(FieldInfo field) throws IOException { - BinaryEntry entry = binaries.get(field.number); + BinaryEntry entry = binaries.get(field.name); if (entry.compressed) { return getCompressedBinary(entry); } else { @@ -972,7 +973,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer { @Override public SortedDocValues getSorted(FieldInfo field) throws IOException { - SortedEntry entry = sorted.get(field.number); + SortedEntry entry = sorted.get(field.name); return getSorted(entry); } @@ -1406,7 +1407,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer { @Override public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { - SortedNumericEntry entry = sortedNumerics.get(field.number); + SortedNumericEntry entry = sortedNumerics.get(field.name); if (entry.numValues == entry.numDocsWithField) { return DocValues.singleton(getNumeric(entry)); } @@ -1542,7 +1543,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer { @Override public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { - SortedSetEntry entry = sortedSets.get(field.number); + SortedSetEntry entry = sortedSets.get(field.name); if (entry.singleValueEntry != null) { return DocValues.singleton(getSorted(entry.singleValueEntry)); } diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java index 52972e9dcda..0d7fd520a30 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java @@ -22,10 +22,10 @@ import java.util.Locale; import java.util.Objects; import java.util.SplittableRandom; import java.util.concurrent.TimeUnit; +import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.util.InfoStream; import org.apache.lucene.util.hnsw.NeighborQueue; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; /** * Builder for HNSW graph. See {@link Lucene90OnHeapHnswGraph} for a gloss on the algorithm and the @@ -49,7 +49,7 @@ public final class Lucene90HnswGraphBuilder { private final Lucene90NeighborArray scratch; private final VectorSimilarityFunction similarityFunction; - private final RandomAccessVectorValues.Floats vectorValues; + private final FloatVectorValues vectorValues; private final SplittableRandom random; private final Lucene90BoundsChecker bound; final Lucene90OnHeapHnswGraph hnsw; @@ -58,7 +58,7 @@ public final class Lucene90HnswGraphBuilder { // we need two sources of vectors in order to perform diversity check comparisons without // colliding - private final RandomAccessVectorValues.Floats buildVectors; + private final FloatVectorValues buildVectors; /** * Reads all the vectors from vector values, builds a graph connecting them by their dense @@ -73,7 +73,7 @@ public final class Lucene90HnswGraphBuilder { * to ensure repeatable construction. 
*/ public Lucene90HnswGraphBuilder( - RandomAccessVectorValues.Floats vectors, + FloatVectorValues vectors, VectorSimilarityFunction similarityFunction, int maxConn, int beamWidth, @@ -97,14 +97,14 @@ public final class Lucene90HnswGraphBuilder { } /** - * Reads all the vectors from two copies of a {@link RandomAccessVectorValues}. Providing two - * copies enables efficient retrieval without extra data copying, while avoiding collision of the + * Reads all the vectors from two copies of a {@link FloatVectorValues}. Providing two copies + * enables efficient retrieval without extra data copying, while avoiding collision of the * returned values. * * @param vectors the vectors for which to build a nearest neighbors graph. Must be an independet * accessor for the vectors */ - public Lucene90OnHeapHnswGraph build(RandomAccessVectorValues.Floats vectors) throws IOException { + public Lucene90OnHeapHnswGraph build(FloatVectorValues vectors) throws IOException { if (vectors == vectorValues) { throw new IllegalArgumentException( "Vectors to build must be independent of the source of vectors provided to HnswGraphBuilder()"); @@ -230,7 +230,7 @@ public final class Lucene90HnswGraphBuilder { float[] candidate, float score, Lucene90NeighborArray neighbors, - RandomAccessVectorValues.Floats vectorValues) + FloatVectorValues vectorValues) throws IOException { bound.set(score); for (int i = 0; i < neighbors.size(); i++) { diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java index 665d3140321..3ffd4f4d75a 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java @@ -20,7 +20,6 @@ package org.apache.lucene.backward_codecs.lucene90; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import java.io.IOException; -import java.util.Arrays; import java.util.HashMap; import java.util.Map; import java.util.SplittableRandom; @@ -34,7 +33,6 @@ import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.VectorSimilarityFunction; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.search.VectorScorer; import org.apache.lucene.store.ChecksumIndexInput; @@ -44,7 +42,6 @@ import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.hnsw.HnswGraph; import org.apache.lucene.util.hnsw.NeighborQueue; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; /** * Reads vectors from the index segments along with index data structures supporting KNN search. @@ -263,7 +260,7 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader { int node = results.topNode(); float minSimilarity = results.topScore(); results.pop(); - knnCollector.collect(node, minSimilarity); + knnCollector.collect(vectorValues.ordToDoc(node), minSimilarity); } } @@ -355,8 +352,7 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader { } /** Read the vector values from the index input. This supports both iterated and random access. 
*/ - static class OffHeapFloatVectorValues extends FloatVectorValues - implements RandomAccessVectorValues.Floats { + static class OffHeapFloatVectorValues extends FloatVectorValues { final int dimension; final int[] ordToDoc; @@ -367,9 +363,6 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader { final float[] value; final VectorSimilarityFunction similarityFunction; - int ord = -1; - int doc = -1; - OffHeapFloatVectorValues( int dimension, int[] ordToDoc, @@ -394,42 +387,6 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader { return ordToDoc.length; } - @Override - public float[] vectorValue() throws IOException { - return vectorValue(ord); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() { - if (++ord >= size()) { - doc = NO_MORE_DOCS; - } else { - doc = ordToDoc[ord]; - } - return doc; - } - - @Override - public int advance(int target) { - assert docID() < target; - ord = Arrays.binarySearch(ordToDoc, ord + 1, ordToDoc.length, target); - if (ord < 0) { - ord = -(ord + 1); - } - assert ord <= ordToDoc.length; - if (ord == ordToDoc.length) { - doc = NO_MORE_DOCS; - } else { - doc = ordToDoc[ord]; - } - return doc; - } - @Override public OffHeapFloatVectorValues copy() { return new OffHeapFloatVectorValues(dimension, ordToDoc, similarityFunction, dataIn.clone()); @@ -446,21 +403,32 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader { return value; } + @Override + public int ordToDoc(int ord) { + return ordToDoc[ord]; + } + + @Override + public DocIndexIterator iterator() { + return createSparseIterator(); + } + @Override public VectorScorer scorer(float[] target) { if (size() == 0) { return null; } OffHeapFloatVectorValues values = this.copy(); + DocIndexIterator iterator = values.iterator(); return new VectorScorer() { @Override public float score() throws IOException { - return values.similarityFunction.compare(values.vectorValue(), target); + return values.similarityFunction.compare(values.vectorValue(iterator.index()), target); } @Override - public DocIdSetIterator iterator() { - return values; + public DocIndexIterator iterator() { + return iterator; } }; } diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90OnHeapHnswGraph.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90OnHeapHnswGraph.java index 52f2146e836..845987c2957 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90OnHeapHnswGraph.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90OnHeapHnswGraph.java @@ -23,12 +23,12 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.SplittableRandom; +import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.util.Bits; import org.apache.lucene.util.SparseFixedBitSet; import org.apache.lucene.util.hnsw.HnswGraph; import org.apache.lucene.util.hnsw.NeighborQueue; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; /** * An {@link HnswGraph} where all nodes and connections are held in memory. 
This class is used to @@ -74,7 +74,7 @@ public final class Lucene90OnHeapHnswGraph extends HnswGraph { float[] query, int topK, int numSeed, - RandomAccessVectorValues.Floats vectors, + FloatVectorValues vectors, VectorSimilarityFunction similarityFunction, HnswGraph graphValues, Bits acceptOrds, diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java index 81f8d97a9a0..a140b4fd7f3 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java @@ -46,7 +46,6 @@ import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.hnsw.HnswGraph; import org.apache.lucene.util.hnsw.HnswGraphSearcher; import org.apache.lucene.util.hnsw.OrdinalTranslatedKnnCollector; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; /** @@ -398,8 +397,7 @@ public final class Lucene91HnswVectorsReader extends KnnVectorsReader { } /** Read the vector values from the index input. This supports both iterated and random access. */ - static class OffHeapFloatVectorValues extends FloatVectorValues - implements RandomAccessVectorValues.Floats { + static class OffHeapFloatVectorValues extends FloatVectorValues { private final int dimension; private final int size; @@ -410,9 +408,6 @@ public final class Lucene91HnswVectorsReader extends KnnVectorsReader { private final float[] value; private final VectorSimilarityFunction similarityFunction; - private int ord = -1; - private int doc = -1; - OffHeapFloatVectorValues( int dimension, int size, @@ -439,49 +434,6 @@ public final class Lucene91HnswVectorsReader extends KnnVectorsReader { return size; } - @Override - public float[] vectorValue() throws IOException { - dataIn.seek((long) ord * byteSize); - dataIn.readFloats(value, 0, value.length); - return value; - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() { - if (++ord >= size) { - doc = NO_MORE_DOCS; - } else { - doc = ordToDocOperator.applyAsInt(ord); - } - return doc; - } - - @Override - public int advance(int target) { - assert docID() < target; - - if (ordToDoc == null) { - ord = target; - } else { - ord = Arrays.binarySearch(ordToDoc, ord + 1, ordToDoc.length, target); - if (ord < 0) { - ord = -(ord + 1); - } - } - - if (ord < size) { - doc = ordToDocOperator.applyAsInt(ord); - } else { - doc = NO_MORE_DOCS; - } - return doc; - } - @Override public OffHeapFloatVectorValues copy() { return new OffHeapFloatVectorValues( @@ -495,21 +447,32 @@ public final class Lucene91HnswVectorsReader extends KnnVectorsReader { return value; } + @Override + public int ordToDoc(int ord) { + return ordToDocOperator.applyAsInt(ord); + } + + @Override + public DocIndexIterator iterator() { + return createSparseIterator(); + } + @Override public VectorScorer scorer(float[] target) { if (size == 0) { return null; } OffHeapFloatVectorValues values = this.copy(); + DocIndexIterator iterator = values.iterator(); return new VectorScorer() { @Override public float score() throws IOException { - return values.similarityFunction.compare(values.vectorValue(), target); + return values.similarityFunction.compare(values.vectorValue(iterator.index()), target); } @Override public DocIdSetIterator iterator() 
{ - return values; + return iterator; } }; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/Lucene912Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/Lucene912Codec.java similarity index 98% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene912/Lucene912Codec.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/Lucene912Codec.java index cb4ef755a6b..d7b89d31081 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/Lucene912Codec.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/Lucene912Codec.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene912; +package org.apache.lucene.backward_codecs.lucene912; import java.util.Objects; import org.apache.lucene.codecs.Codec; @@ -37,6 +37,7 @@ import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat; import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat; import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat; import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat; +import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat; import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/package-info.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/package-info.java new file mode 100644 index 00000000000..aac717a3e6c --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/package-info.java @@ -0,0 +1,433 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Lucene 9.12 file format. + * + *

+ * Apache Lucene - Index File Formats
+ *
+ * Introduction
+ *
+ * This document defines the index file formats used in this version of Lucene. If you are using
+ * a different version of Lucene, please consult the copy of docs/ that was distributed
+ * with the version you are using.
+ *
+ * This document attempts to provide a high-level definition of the Apache Lucene file formats.
+ *
+ * Definitions
+ *
+ * The fundamental concepts in Lucene are index, document, field and term.
+ *
+ * An index contains a sequence of documents.
+ *
+ *   • A document is a sequence of fields.
+ *   • A field is a named sequence of terms.
+ *   • A term is a sequence of bytes.
+ *
+ * The same sequence of bytes in two different fields is considered a different term. Thus terms
+ * are represented as a pair: the string naming the field, and the bytes within the field.
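The field-plus-bytes pairing above is exactly what the Term class models; a tiny illustrative sketch (field names and term text invented for the example):

```java
import org.apache.lucene.index.Term;
import org.apache.lucene.util.BytesRef;

class TermPairExample {
  public static void main(String[] args) {
    // The same bytes under two different field names are two distinct terms.
    Term title = new Term("title", new BytesRef("lucene"));
    Term body = new Term("body", new BytesRef("lucene"));
    System.out.println(title.equals(body)); // false: the (field, bytes) pair differs
  }
}
```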

+ * Inverted Indexing
+ *
+ * Lucene's index stores terms and statistics about those terms in order to make term-based
+ * search more efficient. Lucene's terms index falls into the family of indexes known as an
+ * inverted index. This is because it can list, for a term, the documents that contain it.
+ * This is the inverse of the natural relationship, in which documents list terms.
+ *
+ * Types of Fields
+ *
+ * In Lucene, fields may be stored, in which case their text is stored in the index
+ * literally, in a non-inverted manner. Fields that are inverted are called indexed. A field
+ * may be both stored and indexed.
+ *
+ * The text of a field may be tokenized into terms to be indexed, or the text of a field
+ * may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is
+ * useful for certain identifier fields to be indexed literally.
+ *
+ * See the {@link org.apache.lucene.document.Field Field} java docs for more information on
+ * Fields.
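As a concrete illustration of the stored/indexed/tokenized distinctions, a short sketch using the document API (field names and values are made up):

```java
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;

class FieldTypesExample {
  static Document makeDoc() {
    Document doc = new Document();
    // Tokenized into terms for the inverted index, and also stored literally:
    doc.add(new TextField("title", "Apache Lucene - Index File Formats", Field.Store.YES));
    // Indexed literally as a single term (no tokenization), typical for identifiers:
    doc.add(new StringField("id", "doc-42", Field.Store.YES));
    return doc;
  }
}
```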

+ * Segments
+ *
+ * Lucene indexes may be composed of multiple sub-indexes, or segments. Each segment is a
+ * fully independent index, which could be searched separately. Indexes evolve by:
+ *
+ *   1. Creating new segments for newly added documents.
+ *   2. Merging existing segments.
+ *
+ * Searches may involve multiple segments and/or multiple indexes, each index potentially
+ * composed of a set of segments.

+ * Document Numbers
+ *
+ * Internally, Lucene refers to documents by an integer document number. The first
+ * document added to an index is numbered zero, and each subsequent document added gets a number
+ * one greater than the previous.
+ *
+ * Note that a document's number may change, so caution should be taken when storing these
+ * numbers outside of Lucene. In particular, numbers may change in the following situations:
+ *
+ *   • The numbers stored in each segment are unique only within the segment, and must be
+ *     converted before they can be used in a larger context. The standard technique is to
+ *     allocate each segment a range of values, based on the range of numbers used in that
+ *     segment. To convert a document number from a segment to an external value, the segment's
+ *     base document number is added. To convert an external value back to a
+ *     segment-specific value, the segment is identified by the range that the external value is
+ *     in, and the segment's base value is subtracted. For example two five document segments
+ *     might be combined, so that the first segment has a base value of zero, and the second of
+ *     five. Document three from the second segment would have an external value of eight.
+ *   • When documents are deleted, gaps are created in the numbering. These are eventually
+ *     removed as the index evolves through merging. Deleted documents are dropped when segments
+ *     are merged. A freshly-merged segment thus has no gaps in its numbering.
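The base-offset arithmetic described above is visible through the public reader API: each LeafReaderContext carries its segment's docBase, and ReaderUtil.subIndex locates the owning segment for an index-wide docID. A minimal sketch:

```java
import java.util.List;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.ReaderUtil;

class DocNumberConversion {
  // Segment-local docID -> index-wide ("external") docID: add the segment's base.
  static int toGlobal(LeafReaderContext leaf, int segmentDocId) {
    return leaf.docBase + segmentDocId;
  }

  // Index-wide docID -> segment-local docID: find the leaf whose range
  // contains the docID, then subtract that leaf's base.
  static int toSegmentLocal(IndexReader reader, int globalDocId) {
    List<LeafReaderContext> leaves = reader.leaves();
    LeafReaderContext leaf = leaves.get(ReaderUtil.subIndex(globalDocId, leaves));
    return globalDocId - leaf.docBase; // e.g. external 8 in a leaf with docBase 5 is local 3
  }
}
```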

+ * Index Structure Overview
+ *
+ * Each segment index maintains the following:
+ *
+ *   • {@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment info}. This
+ *     contains metadata about a segment, such as the number of documents, what files it uses,
+ *     and information about how the segment is sorted.
+ *   • {@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This
+ *     contains metadata about the set of named fields used in the index.
+ *   • {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}.
+ *     This contains, for each document, a list of attribute-value pairs, where the attributes
+ *     are field names. These are used to store auxiliary information about the document, such as
+ *     its title, url, or an identifier to access a database. The set of stored fields are what
+ *     is returned for each hit when searching. This is keyed by document number.
+ *   • {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term dictionary}. A
+ *     dictionary containing all of the terms used in all of the indexed fields of all of the
+ *     documents. The dictionary also contains the number of documents which contain the term,
+ *     and pointers to the term's frequency and proximity data.
+ *   • {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Frequency data}.
+ *     For each term in the dictionary, the numbers of all the documents that contain that term,
+ *     and the frequency of the term in that document, unless frequencies are omitted ({@link
+ *     org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS}).
+ *   • {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Proximity data}.
+ *     For each term in the dictionary, the positions that the term occurs in each document. Note
+ *     that this will not exist if all fields in all documents omit position data.
+ *   • {@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For
+ *     each field in each document, a value is stored that is multiplied into the score for hits
+ *     on that field.
+ *   • {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each
+ *     field in each document, the term vector (sometimes called document vector) may be stored.
+ *     A term vector consists of term text and term frequency. To add Term Vectors to your index
+ *     see the {@link org.apache.lucene.document.Field Field} constructors.
+ *   • {@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}.
+ *     Like stored values, these are also keyed by document number, but are generally intended to
+ *     be loaded into main memory for fast access. Whereas stored values are generally intended
+ *     for summary results from searches, per-document values are useful for things like scoring
+ *     factors.
+ *   • {@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An
+ *     optional file indicating which documents are live.
+ *   • {@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair
+ *     of files, recording dimensionally indexed fields, to enable fast numeric range filtering
+ *     and large numeric values like BigInteger and BigDecimal (1D) and geographic shape
+ *     intersection (2D, 3D).
+ *   • {@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}. The
+ *     vector format stores numeric vectors in a format optimized for random access and
+ *     computation, supporting high-dimensional nearest-neighbor search.
+ *
+ * Details on each of these are provided in their linked pages.

+ * File Naming
+ *
+ * All files belonging to a segment have the same name with varying extensions. The extensions
+ * correspond to the different file formats described below. When using the Compound File format
+ * (default for small segments) these files (except for the Segment info file, the Lock file, and
+ * Deleted documents file) are collapsed into a single .cfs file (see below for details).
+ *
+ * Typically, all segments in an index are stored in a single directory, although this is not
+ * required.
+ *
+ * File names are never re-used. That is, when any file is saved to the Directory it is given a
+ * never before used filename. This is achieved using a simple generations approach. For example,
+ * the first segments file is segments_1, then segments_2, etc. The generation is a sequential
+ * long integer represented in alpha-numeric (base 36) form.
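The "base 36" remark can be made concrete; this is a plain-Java sketch of how a sequential generation maps to a segments_N suffix (not Lucene's actual IndexFileNames code, just the same radix-36 rendering):

```java
class GenerationSuffixExample {
  // A long generation rendered in base 36 (Character.MAX_RADIX), as in segments_N.
  static String segmentsFileName(long generation) {
    return "segments_" + Long.toString(generation, Character.MAX_RADIX);
  }

  public static void main(String[] args) {
    System.out.println(segmentsFileName(1));  // segments_1
    System.out.println(segmentsFileName(36)); // segments_10
    System.out.println(segmentsFileName(71)); // segments_1z
  }
}
```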

+ * Summary of File Extensions
+ *
+ * The following table summarizes the names and extensions of the files in Lucene
+ * (lucene filenames by extension):
+ *
+ *   • {@link org.apache.lucene.index.SegmentInfos Segments File} (segments_N): Stores
+ *     information about a commit point.
+ *   • Lock File (write.lock): The Write lock prevents multiple IndexWriters from writing to the
+ *     same file.
+ *   • {@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment Info} (.si):
+ *     Stores metadata about a segment.
+ *   • {@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File}
+ *     (.cfs, .cfe): An optional "virtual" file consisting of all the other index files for
+ *     systems that frequently run out of file handles.
+ *   • {@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields} (.fnm): Stores
+ *     information about the fields.
+ *   • {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index} (.fdx):
+ *     Contains pointers to field data.
+ *   • {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data} (.fdt):
+ *     The stored fields for documents.
+ *   • {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Dictionary} (.tim):
+ *     The term dictionary, stores term info.
+ *   • {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Index} (.tip): The
+ *     index into the Term Dictionary.
+ *   • {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Frequencies} (.doc):
+ *     Contains the list of docs which contain each term along with frequency.
+ *   • {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Positions} (.pos):
+ *     Stores position information about where a term occurs in the index.
+ *   • {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Payloads} (.pay): Stores
+ *     additional per-position metadata information such as character offsets and user payloads.
+ *   • {@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms} (.nvd, .nvm): Encodes
+ *     length and boost factors for docs and fields.
+ *   • {@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values}
+ *     (.dvd, .dvm): Encodes additional scoring factors or other per-document information.
+ *   • {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}
+ *     (.tvx): Stores offset into the document data file.
+ *   • {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}
+ *     (.tvd): Contains term vector data.
+ *   • {@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents} (.liv):
+ *     Info about what documents are live.
+ *   • {@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}
+ *     (.kdd, .kdi, .kdm): Holds indexed points.
+ *   • {@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}
+ *     (.vec, .vem, .veq, .vex): Holds indexed vectors; .vec files contain the raw vector data,
+ *     .vem the vector metadata, .veq the quantized vector data, and .vex the hnsw graph data.

+ * Lock File
+ *
+ * The write lock, which is stored in the index directory by default, is named "write.lock". If
+ * the lock directory is different from the index directory then the write lock will be named
+ * "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index
+ * directory. When this file is present, a writer is currently modifying the index (adding or
+ * removing documents). This lock file ensures that only one writer is modifying the index at a
+ * time.
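The single-writer guarantee is easy to observe from user code; a minimal sketch (the index path is invented for the example):

```java
import java.nio.file.Path;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;

class WriteLockExample {
  public static void main(String[] args) throws Exception {
    Path indexPath = Path.of("/tmp/example-index"); // hypothetical location
    try (Directory dir = FSDirectory.open(indexPath);
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
      // "write.lock" now exists in the index directory; a second writer must fail.
      try (IndexWriter second = new IndexWriter(dir, new IndexWriterConfig())) {
        System.out.println("unexpected: acquired a second write.lock");
      } catch (LockObtainFailedException expected) {
        System.out.println("already locked: " + expected.getMessage());
      }
    }
  }
}
```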

History

+ * + *

Compatibility notes are provided in this document, describing how file formats have changed + * from prior versions: + * + *

  • In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching or adding/deleting of docs. When the new segments file is saved (committed), it will be written in the new file format (meaning no specific "upgrade" process is needed). But note that once a commit has occurred, pre-2.1 Lucene will not be able to read the index.
  • In version 2.3, the file format was changed to allow segments to share a single set of doc store (vectors & stored fields) files. This allows for faster indexing in certain cases. The change is fully backwards compatible (in the same way as the lock-less commits change in 2.1).
  • In version 2.4, Strings are now written as true UTF-8 byte sequences, not Java's modified UTF-8. See LUCENE-510 for details.
  • In version 2.9, an optional opaque Map<String,String> CommitUserData may be passed to IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N file. See LUCENE-1382 for details. Also, diagnostics were added to each segment written recording details about why it was written (due to flush, merge; which OS/JRE was used; etc.). See issue LUCENE-1654 for details.
  • In version 3.0, compressed fields are no longer written to the index (they can still be read, but on merge the new segment will write them, uncompressed). See issue LUCENE-1960 for details.
  • In version 3.1, segments record the code version that created them. See LUCENE-2720 for details. Additionally, segments track explicitly whether or not they have term vectors. See LUCENE-2811 for details.
  • In version 3.2, numeric fields are written natively to the stored fields file; previously they were stored in text format only.
  • In version 3.4, fields can omit position data while still indexing term frequencies.
  • In version 4.0, the format of the inverted index became extensible via the {@link org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues}) was introduced. Normalization factors need no longer be a single byte, they can be any {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into the postings lists. Payloads can be stored in the term vectors.
  • In version 4.1, the format of the postings list changed to use either FOR compression or variable-byte encoding, depending upon the frequency of the term. Terms appearing only once were changed to inline directly into the term dictionary. Stored fields are compressed by default.
  • In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields.
  • In version 4.5, DocValues were extended to explicitly represent missing values.
  • In version 4.6, FieldInfos were extended to support per-field DocValues generation, to allow updating NumericDocValues fields.
  • In version 4.8, checksum footers were added to the end of each index file for improved data integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32 checksum of the file (a verification sketch follows this list).
  • In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is suitable for faceting/sorting/analytics.
  • In version 5.4, DocValues have been improved to store more information on disk: addresses for binary fields and ord indexes for multi-valued fields.
  • In version 6.0, Points were added, for multi-dimensional range/distance search.
  • In version 6.2, a new segment info format reads/writes the index sort, to support index sorting.
  • In version 7.0, DocValues have been improved to better support sparse doc values thanks to an iterator API.
  • In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term freq, normalization factor) pairs that may trigger the maximum score of the block. This information is recorded alongside skip data in order to be able to skip blocks of doc ids if they may not produce high enough scores. Additionally, doc values and norms have been extended with jump-tables to make access O(1) instead of O(n), where n is the number of elements to skip when advancing in the data.
  • In version 8.4, postings, positions, offsets and payload lengths have moved to a more performant encoding that is vectorized.
  • In version 8.6, index sort serialization is delegated to the sorts themselves, to allow user-defined sorts to be used.
  • In version 8.6, points fields split the index tree and leaf data into separate files, to allow for different access patterns to the different data structures.
  • In version 8.7, stored fields compression became adaptive to better handle documents with smaller stored fields.
  • In version 9.0, vector-valued fields were added.
  • In version 9.1, vector-valued fields were modified to add a graph hierarchy.
  • In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by IndexDISI. ordToDoc mappings were added to .vem.
  • In version 9.5, HNSW graph connections were changed to be delta-encoded with vints. Additionally, metadata file size improvements were made by delta-encoding nodes by graph layer and not writing the node ids for the zeroth layer.
  • In version 9.9, vector scalar quantization support was added, allowing the HNSW vector format to utilize int8 quantized vectors for float32 vector search.
  • In version 9.12, skip data was refactored to have only two levels: every 128 docs and every 4,096 docs, and to be inlined in postings lists. This resulted in a speedup for queries that need skipping, especially conjunctions.
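For the checksum footers introduced in version 4.8 (referenced above), the following minimal sketch (not part of this patch) verifies a single index file by recomputing its checksum with CodecUtil.checksumEntireFile; the index path and file name are hypothetical placeholders.

import java.nio.file.Paths;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;

public class VerifyFooterDemo {
  public static void main(String[] args) throws Exception {
    // "/path/to/index" and "_0.cfs" are hypothetical placeholders.
    try (Directory dir = FSDirectory.open(Paths.get("/path/to/index"));
        IndexInput in = dir.openInput("_0.cfs", IOContext.READONCE)) {
      // Reads the entire file and compares the recomputed zlib-crc32 checksum
      // against the 8-byte footer; throws CorruptIndexException on mismatch.
      long checksum = CodecUtil.checksumEntireFile(in);
      System.out.println("footer checksum OK: " + checksum);
    }
  }
}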

Limitations


Lucene uses a Java int to refer to document numbers, and the index file format uses an Int32 on-disk to store document numbers. This is a limitation of both the index file format and the current implementation. Eventually these should be replaced with either UInt64 values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt VInt} values which have no limit.
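As a quick sketch of the variable-width encoding that DataOutput#writeVInt uses (assuming ByteBuffersDataOutput as a scratch DataOutput): small values occupy a single byte, larger ones up to five.

import org.apache.lucene.store.ByteBuffersDataOutput;

public class VIntDemo {
  public static void main(String[] args) throws Exception {
    // Encoded length grows with the value: 1 byte, 2 bytes, 5 bytes.
    for (int value : new int[] {1, 300, Integer.MAX_VALUE}) {
      ByteBuffersDataOutput out = new ByteBuffersDataOutput();
      out.writeVInt(value);
      System.out.println(value + " -> " + out.size() + " byte(s)");
    }
  }
}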

+ */ +package org.apache.lucene.backward_codecs.lucene912; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java index 19dc82cc46d..7c87bac5e54 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java @@ -26,12 +26,10 @@ import org.apache.lucene.search.VectorScorer; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.RandomAccessInput; import org.apache.lucene.util.Bits; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.packed.DirectMonotonicReader; /** Read the vector values from the index input. This supports both iterated and random access. */ -abstract class OffHeapFloatVectorValues extends FloatVectorValues - implements RandomAccessVectorValues.Floats { +abstract class OffHeapFloatVectorValues extends FloatVectorValues { protected final int dimension; protected final int size; @@ -95,8 +93,6 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues static class DenseOffHeapVectorValues extends OffHeapFloatVectorValues { - private int doc = -1; - public DenseOffHeapVectorValues( int dimension, int size, @@ -105,35 +101,16 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues super(dimension, size, vectorSimilarityFunction, slice); } - @Override - public float[] vectorValue() throws IOException { - return vectorValue(doc); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(doc + 1); - } - - @Override - public int advance(int target) throws IOException { - assert docID() < target; - if (target >= size) { - return doc = NO_MORE_DOCS; - } - return doc = target; - } - @Override public DenseOffHeapVectorValues copy() throws IOException { return new DenseOffHeapVectorValues(dimension, size, vectorSimilarityFunction, slice.clone()); } + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } + @Override public Bits getAcceptOrds(Bits acceptDocs) { return acceptDocs; @@ -142,15 +119,17 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues @Override public VectorScorer scorer(float[] query) throws IOException { DenseOffHeapVectorValues values = this.copy(); + DocIndexIterator iterator = values.iterator(); return new VectorScorer() { @Override public float score() throws IOException { - return values.vectorSimilarityFunction.compare(values.vectorValue(), query); + return values.vectorSimilarityFunction.compare( + values.vectorValue(iterator.index()), query); } @Override public DocIdSetIterator iterator() { - return values; + return iterator; } }; } @@ -186,33 +165,17 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues fieldEntry.size()); } - @Override - public float[] vectorValue() throws IOException { - return vectorValue(disi.index()); - } - - @Override - public int docID() { - return disi.docID(); - } - - @Override - public int nextDoc() throws IOException { - return disi.nextDoc(); - } - - @Override - public int advance(int target) throws IOException { - assert docID() < target; - return disi.advance(target); - } - @Override public SparseOffHeapVectorValues copy() throws IOException { return new SparseOffHeapVectorValues( 
fieldEntry, dataIn, vectorSimilarityFunction, slice.clone()); } + @Override + public DocIndexIterator iterator() { + return IndexedDISI.asDocIndexIterator(disi); + } + @Override public int ordToDoc(int ord) { return (int) ordToDoc.get(ord); @@ -239,15 +202,17 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues @Override public VectorScorer scorer(float[] query) throws IOException { SparseOffHeapVectorValues values = this.copy(); + DocIndexIterator iterator = values.iterator(); return new VectorScorer() { @Override public float score() throws IOException { - return values.vectorSimilarityFunction.compare(values.vectorValue(), query); + return values.vectorSimilarityFunction.compare( + values.vectorValue(iterator.index()), query); } @Override public DocIdSetIterator iterator() { - return values; + return iterator; } }; } @@ -259,8 +224,6 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues super(dimension, 0, VectorSimilarityFunction.COSINE, null); } - private int doc = -1; - @Override public int dimension() { return super.dimension(); @@ -271,26 +234,6 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues return 0; } - @Override - public float[] vectorValue() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(doc + 1); - } - - @Override - public int advance(int target) throws IOException { - return doc = NO_MORE_DOCS; - } - @Override public OffHeapFloatVectorValues copy() throws IOException { throw new UnsupportedOperationException(); @@ -306,6 +249,11 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues throw new UnsupportedOperationException(); } + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } + @Override public Bits getAcceptOrds(Bits acceptDocs) { return null; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapByteVectorValues.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapByteVectorValues.java index 0c909e3839d..0c428bb169f 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapByteVectorValues.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapByteVectorValues.java @@ -28,12 +28,10 @@ import org.apache.lucene.search.VectorScorer; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.RandomAccessInput; import org.apache.lucene.util.Bits; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.packed.DirectMonotonicReader; /** Read the vector values from the index input. This supports both iterated and random access. 
*/ -abstract class OffHeapByteVectorValues extends ByteVectorValues - implements RandomAccessVectorValues.Bytes { +abstract class OffHeapByteVectorValues extends ByteVectorValues { protected final int dimension; protected final int size; @@ -108,8 +106,6 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues static class DenseOffHeapVectorValues extends OffHeapByteVectorValues { - private int doc = -1; - public DenseOffHeapVectorValues( int dimension, int size, @@ -119,36 +115,17 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues super(dimension, size, slice, vectorSimilarityFunction, byteSize); } - @Override - public byte[] vectorValue() throws IOException { - return vectorValue(doc); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(doc + 1); - } - - @Override - public int advance(int target) throws IOException { - assert docID() < target; - if (target >= size) { - return doc = NO_MORE_DOCS; - } - return doc = target; - } - @Override public DenseOffHeapVectorValues copy() throws IOException { return new DenseOffHeapVectorValues( dimension, size, slice.clone(), vectorSimilarityFunction, byteSize); } + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } + @Override public Bits getAcceptOrds(Bits acceptDocs) { return acceptDocs; @@ -157,15 +134,16 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues @Override public VectorScorer scorer(byte[] query) throws IOException { DenseOffHeapVectorValues copy = this.copy(); + DocIndexIterator iterator = copy.iterator(); return new VectorScorer() { @Override public float score() throws IOException { - return vectorSimilarityFunction.compare(copy.vectorValue(), query); + return vectorSimilarityFunction.compare(copy.vectorValue(iterator.index()), query); } @Override public DocIdSetIterator iterator() { - return copy; + return iterator; } }; } @@ -202,27 +180,6 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues fieldEntry.size()); } - @Override - public byte[] vectorValue() throws IOException { - return vectorValue(disi.index()); - } - - @Override - public int docID() { - return disi.docID(); - } - - @Override - public int nextDoc() throws IOException { - return disi.nextDoc(); - } - - @Override - public int advance(int target) throws IOException { - assert docID() < target; - return disi.advance(target); - } - @Override public SparseOffHeapVectorValues copy() throws IOException { return new SparseOffHeapVectorValues( @@ -234,6 +191,11 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues return (int) ordToDoc.get(ord); } + @Override + public DocIndexIterator iterator() { + return fromDISI(disi); + } + @Override public Bits getAcceptOrds(Bits acceptDocs) { if (acceptDocs == null) { @@ -255,15 +217,16 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues @Override public VectorScorer scorer(byte[] query) throws IOException { SparseOffHeapVectorValues copy = this.copy(); + IndexedDISI disi = copy.disi; return new VectorScorer() { @Override public float score() throws IOException { - return vectorSimilarityFunction.compare(copy.vectorValue(), query); + return vectorSimilarityFunction.compare(copy.vectorValue(disi.index()), query); } @Override public DocIdSetIterator iterator() { - return copy; + return disi; } }; } @@ -275,8 +238,6 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues super(dimension, 0, null, VectorSimilarityFunction.COSINE, 0); 
} - private int doc = -1; - @Override public int dimension() { return super.dimension(); @@ -287,26 +248,6 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues return 0; } - @Override - public byte[] vectorValue() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(doc + 1); - } - - @Override - public int advance(int target) throws IOException { - return doc = NO_MORE_DOCS; - } - @Override public OffHeapByteVectorValues copy() throws IOException { throw new UnsupportedOperationException(); @@ -322,6 +263,11 @@ abstract class OffHeapByteVectorValues extends ByteVectorValues throw new UnsupportedOperationException(); } + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } + @Override public Bits getAcceptOrds(Bits acceptDocs) { return null; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java index 91f97b8a41f..b21df901ddb 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java @@ -26,12 +26,10 @@ import org.apache.lucene.search.VectorScorer; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.RandomAccessInput; import org.apache.lucene.util.Bits; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.packed.DirectMonotonicReader; /** Read the vector values from the index input. This supports both iterated and random access. 
*/ -abstract class OffHeapFloatVectorValues extends FloatVectorValues - implements RandomAccessVectorValues.Floats { +abstract class OffHeapFloatVectorValues extends FloatVectorValues { protected final int dimension; protected final int size; @@ -104,8 +102,6 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues static class DenseOffHeapVectorValues extends OffHeapFloatVectorValues { - private int doc = -1; - public DenseOffHeapVectorValues( int dimension, int size, @@ -115,36 +111,17 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues super(dimension, size, slice, vectorSimilarityFunction, byteSize); } - @Override - public float[] vectorValue() throws IOException { - return vectorValue(doc); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(doc + 1); - } - - @Override - public int advance(int target) throws IOException { - assert docID() < target; - if (target >= size) { - return doc = NO_MORE_DOCS; - } - return doc = target; - } - @Override public DenseOffHeapVectorValues copy() throws IOException { return new DenseOffHeapVectorValues( dimension, size, slice.clone(), vectorSimilarityFunction, byteSize); } + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } + @Override public Bits getAcceptOrds(Bits acceptDocs) { return acceptDocs; @@ -153,15 +130,18 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues @Override public VectorScorer scorer(float[] query) throws IOException { DenseOffHeapVectorValues values = this.copy(); + DocIndexIterator iterator = values.iterator(); + return new VectorScorer() { @Override public float score() throws IOException { - return values.vectorSimilarityFunction.compare(values.vectorValue(), query); + return values.vectorSimilarityFunction.compare( + values.vectorValue(iterator.index()), query); } @Override public DocIdSetIterator iterator() { - return values; + return iterator; } }; } @@ -198,33 +178,17 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues fieldEntry.size()); } - @Override - public float[] vectorValue() throws IOException { - return vectorValue(disi.index()); - } - - @Override - public int docID() { - return disi.docID(); - } - - @Override - public int nextDoc() throws IOException { - return disi.nextDoc(); - } - - @Override - public int advance(int target) throws IOException { - assert docID() < target; - return disi.advance(target); - } - @Override public SparseOffHeapVectorValues copy() throws IOException { return new SparseOffHeapVectorValues( fieldEntry, dataIn, slice.clone(), vectorSimilarityFunction, byteSize); } + @Override + public DocIndexIterator iterator() { + return IndexedDISI.asDocIndexIterator(disi); + } + @Override public int ordToDoc(int ord) { return (int) ordToDoc.get(ord); @@ -251,15 +215,17 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues @Override public VectorScorer scorer(float[] query) throws IOException { SparseOffHeapVectorValues values = this.copy(); + DocIndexIterator iterator = values.iterator(); return new VectorScorer() { @Override public float score() throws IOException { - return values.vectorSimilarityFunction.compare(values.vectorValue(), query); + return values.vectorSimilarityFunction.compare( + values.vectorValue(iterator.index()), query); } @Override public DocIdSetIterator iterator() { - return values; + return iterator; } }; } @@ -271,8 +237,6 @@ abstract class OffHeapFloatVectorValues extends 
FloatVectorValues super(dimension, 0, null, VectorSimilarityFunction.COSINE, 0); } - private int doc = -1; - @Override public int dimension() { return super.dimension(); @@ -283,26 +247,6 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues return 0; } - @Override - public float[] vectorValue() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(doc + 1); - } - - @Override - public int advance(int target) throws IOException { - return doc = NO_MORE_DOCS; - } - @Override public OffHeapFloatVectorValues copy() throws IOException { throw new UnsupportedOperationException(); @@ -318,6 +262,11 @@ abstract class OffHeapFloatVectorValues extends FloatVectorValues throw new UnsupportedOperationException(); } + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } + @Override public Bits getAcceptOrds(Bits acceptDocs) { return null; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/PostingsUtil.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/PostingsUtil.java index 7b95bada5bc..dce8c2b145d 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/PostingsUtil.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/PostingsUtil.java @@ -19,6 +19,7 @@ package org.apache.lucene.backward_codecs.lucene99; import java.io.IOException; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.GroupVIntUtil; /** Utility class to encode/decode postings block. */ final class PostingsUtil { @@ -35,7 +36,7 @@ final class PostingsUtil { boolean indexHasFreq, boolean decodeFreq) throws IOException { - docIn.readGroupVInts(docBuffer, num); + GroupVIntUtil.readGroupVInts(docIn, docBuffer, num); if (indexHasFreq && decodeFreq) { for (int i = 0; i < num; ++i) { freqBuffer[i] = docBuffer[i] & 0x01; diff --git a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index df14387fc68..a4638b5fcc7 100644 --- a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -23,3 +23,4 @@ org.apache.lucene.backward_codecs.lucene92.Lucene92Codec org.apache.lucene.backward_codecs.lucene94.Lucene94Codec org.apache.lucene.backward_codecs.lucene95.Lucene95Codec org.apache.lucene.backward_codecs.lucene99.Lucene99Codec +org.apache.lucene.backward_codecs.lucene912.Lucene912Codec diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/Lucene50RWCompoundFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/Lucene50RWCompoundFormat.java index 2817b19828d..cfd269d9ddd 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/Lucene50RWCompoundFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene50/Lucene50RWCompoundFormat.java @@ -81,9 +81,8 @@ public final class Lucene50RWCompoundFormat extends CompoundFormat { public Lucene50RWCompoundFormat() {} @Override - public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si, IOContext context) - throws IOException { - return new 
Lucene50CompoundReader(dir, si, context); + public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si) throws IOException { + return new Lucene50CompoundReader(dir, si); } @Override diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene60/Lucene60PointsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene60/Lucene60PointsWriter.java index bcfa479a058..ca148491753 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene60/Lucene60PointsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene60/Lucene60PointsWriter.java @@ -218,7 +218,7 @@ public class Lucene60PointsWriter extends PointsWriter { FieldInfos readerFieldInfos = mergeState.fieldInfos[i]; FieldInfo readerFieldInfo = readerFieldInfos.fieldInfo(fieldInfo.name); if (readerFieldInfo != null && readerFieldInfo.getPointDimensionCount() > 0) { - PointValues bkdReader = reader60.readers.get(readerFieldInfo.number); + PointValues bkdReader = reader60.getValues(readerFieldInfo.name); if (bkdReader != null) { bkdReaders.add(bkdReader); docMaps.add(mergeState.docMaps[i]); diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/Lucene86PointsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/Lucene86PointsWriter.java index 89555b370ea..94714111be1 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/Lucene86PointsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene86/Lucene86PointsWriter.java @@ -249,7 +249,7 @@ public class Lucene86PointsWriter extends PointsWriter { // we confirmed this up above assert reader instanceof Lucene86PointsReader; - Lucene86PointsReader reader60 = (Lucene86PointsReader) reader; + Lucene86PointsReader reader86 = (Lucene86PointsReader) reader; // NOTE: we cannot just use the merged fieldInfo.number (instead of resolving to // this @@ -259,7 +259,7 @@ public class Lucene86PointsWriter extends PointsWriter { FieldInfos readerFieldInfos = mergeState.fieldInfos[i]; FieldInfo readerFieldInfo = readerFieldInfos.fieldInfo(fieldInfo.name); if (readerFieldInfo != null && readerFieldInfo.getPointDimensionCount() > 0) { - PointValues aPointValues = reader60.readers.get(readerFieldInfo.number); + PointValues aPointValues = reader86.getValues(readerFieldInfo.name); if (aPointValues != null) { pointValues.add(aPointValues); docMaps.add(mergeState.docMaps[i]); diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsWriter.java index 39828524d26..f60411752d2 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsWriter.java @@ -29,13 +29,13 @@ import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.ArrayUtil; import 
org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; /** * Writes vector values and knn graphs to index segments. @@ -188,12 +188,13 @@ public final class Lucene90HnswVectorsWriter extends BufferingKnnVectorsWriter { int count = 0; ByteBuffer binaryVector = ByteBuffer.allocate(vectors.dimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN); - for (int docV = vectors.nextDoc(); docV != NO_MORE_DOCS; docV = vectors.nextDoc(), count++) { + KnnVectorValues.DocIndexIterator iter = vectors.iterator(); + for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - float[] vectorValue = vectors.vectorValue(); + float[] vectorValue = vectors.vectorValue(iter.index()); binaryVector.asFloatBuffer().put(vectorValue); output.writeBytes(binaryVector.array(), binaryVector.limit()); - docIds[count] = docV; + docIds[count++] = docV; } if (docIds.length > count) { @@ -234,7 +235,7 @@ public final class Lucene90HnswVectorsWriter extends BufferingKnnVectorsWriter { private void writeGraph( IndexOutput graphData, - RandomAccessVectorValues.Floats vectorValues, + FloatVectorValues vectorValues, VectorSimilarityFunction similarityFunction, long graphDataOffset, long[] offsets, diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90HnswVectorsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90HnswVectorsFormat.java index 2c689d5c0e5..b4840c9fd5b 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90HnswVectorsFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90HnswVectorsFormat.java @@ -12,7 +12,7 @@ * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.
*/ package org.apache.lucene.backward_codecs.lucene90; @@ -83,4 +83,9 @@ public class TestLucene90HnswVectorsFormat extends BaseKnnVectorsFormatTestCase public void testMergingWithDifferentByteKnnFields() { // unimplemented } + + @Override + public void testMismatchedFields() throws Exception { + // requires byte support + } } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90SegmentInfoFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90SegmentInfoFormat.java index 9ecc3490641..ec1b93499ae 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90SegmentInfoFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90SegmentInfoFormat.java @@ -25,7 +25,7 @@ public class TestLucene90SegmentInfoFormat extends BaseSegmentInfoFormatTestCase @Override protected Version[] getVersions() { - return new Version[] {Version.LUCENE_9_0_0}; + return new Version[] {Version.fromBits(9, 0, 0)}; } @Override diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswGraphBuilder.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswGraphBuilder.java index dbb9a71b421..8dd6fba689f 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswGraphBuilder.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswGraphBuilder.java @@ -25,6 +25,7 @@ import java.util.Objects; import java.util.SplittableRandom; import java.util.concurrent.TimeUnit; import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; +import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.InfoStream; @@ -32,7 +33,6 @@ import org.apache.lucene.util.hnsw.HnswGraph; import org.apache.lucene.util.hnsw.HnswGraphBuilder; import org.apache.lucene.util.hnsw.HnswGraphSearcher; import org.apache.lucene.util.hnsw.NeighborQueue; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; /** @@ -57,7 +57,7 @@ public final class Lucene91HnswGraphBuilder { private final DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer(); private final VectorSimilarityFunction similarityFunction; - private final RandomAccessVectorValues.Floats vectorValues; + private final FloatVectorValues vectorValues; private final SplittableRandom random; private final Lucene91BoundsChecker bound; private final HnswGraphSearcher graphSearcher; @@ -68,7 +68,7 @@ public final class Lucene91HnswGraphBuilder { // we need two sources of vectors in order to perform diversity check comparisons without // colliding - private RandomAccessVectorValues.Floats buildVectors; + private FloatVectorValues buildVectors; /** * Reads all the vectors from vector values, builds a graph connecting them by their dense @@ -83,7 +83,7 @@ public final class Lucene91HnswGraphBuilder { * to ensure repeatable construction. */ public Lucene91HnswGraphBuilder( - RandomAccessVectorValues.Floats vectors, + FloatVectorValues vectors, VectorSimilarityFunction similarityFunction, int maxConn, int beamWidth, @@ -113,14 +113,14 @@ public final class Lucene91HnswGraphBuilder { } /** - * Reads all the vectors from two copies of a {@link RandomAccessVectorValues}. 
Providing two - * copies enables efficient retrieval without extra data copying, while avoiding collision of the + * Reads all the vectors from two copies of a {@link FloatVectorValues}. Providing two copies + * enables efficient retrieval without extra data copying, while avoiding collision of the * returned values. * - * @param vectors the vectors for which to build a nearest neighbors graph. Must be an independet + * @param vectors the vectors for which to build a nearest neighbors graph. Must be an independent * accessor for the vectors */ - public Lucene91OnHeapHnswGraph build(RandomAccessVectorValues.Floats vectors) throws IOException { + public Lucene91OnHeapHnswGraph build(FloatVectorValues vectors) throws IOException { if (vectors == vectorValues) { throw new IllegalArgumentException( "Vectors to build must be independent of the source of vectors provided to HnswGraphBuilder()"); @@ -236,7 +236,7 @@ public final class Lucene91HnswGraphBuilder { // extract all the Neighbors from the queue into an array; these will now be // sorted from worst to best for (int i = 0; i < candidateCount; i++) { - float similarity = candidates.minCompetitiveSimilarity(); + float similarity = candidates.minimumScore(); scratch.add(candidates.popNode(), similarity); } } @@ -254,7 +254,7 @@ public final class Lucene91HnswGraphBuilder { float[] candidate, float score, Lucene91NeighborArray neighbors, - RandomAccessVectorValues.Floats vectorValues) + FloatVectorValues vectorValues) throws IOException { bound.set(score); for (int i = 0; i < neighbors.size(); i++) { diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsWriter.java index 37b75250381..a984a3ef1f8 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsWriter.java @@ -17,8 +17,6 @@ package org.apache.lucene.backward_codecs.lucene91; -import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; - import java.io.IOException; import java.nio.ByteBuffer; import java.nio.ByteOrder; @@ -30,6 +28,7 @@ import org.apache.lucene.index.DocsWithFieldSet; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.DocIdSetIterator; @@ -37,7 +36,6 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.hnsw.HnswGraph; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; /** * Writes vector values and knn graphs to index segments. 
@@ -183,9 +181,10 @@ public final class Lucene91HnswVectorsWriter extends BufferingKnnVectorsWriter { DocsWithFieldSet docsWithField = new DocsWithFieldSet(); ByteBuffer binaryVector = ByteBuffer.allocate(vectors.dimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN); - for (int docV = vectors.nextDoc(); docV != NO_MORE_DOCS; docV = vectors.nextDoc()) { + KnnVectorValues.DocIndexIterator iter = vectors.iterator(); + for (int docV = iter.nextDoc(); docV != DocIdSetIterator.NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - float[] vectorValue = vectors.vectorValue(); + float[] vectorValue = vectors.vectorValue(iter.index()); binaryVector.asFloatBuffer().put(vectorValue); output.writeBytes(binaryVector.array(), binaryVector.limit()); docsWithField.add(docV); @@ -243,7 +242,7 @@ public final class Lucene91HnswVectorsWriter extends BufferingKnnVectorsWriter { } private Lucene91OnHeapHnswGraph writeGraph( - RandomAccessVectorValues.Floats vectorValues, VectorSimilarityFunction similarityFunction) + FloatVectorValues vectorValues, VectorSimilarityFunction similarityFunction) throws IOException { // build graph diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/TestLucene91HnswVectorsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/TestLucene91HnswVectorsFormat.java index df79316db0a..7bf2d426eac 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/TestLucene91HnswVectorsFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/TestLucene91HnswVectorsFormat.java @@ -82,4 +82,9 @@ public class TestLucene91HnswVectorsFormat extends BaseKnnVectorsFormatTestCase public void testMergingWithDifferentByteKnnFields() { // unimplemented } + + @Override + public void testMismatchedFields() throws Exception { + // requires byte support + } } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsWriter.java index caa8fc3da14..bf1c89a536d 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsWriter.java @@ -18,7 +18,6 @@ package org.apache.lucene.backward_codecs.lucene92; import static org.apache.lucene.backward_codecs.lucene92.Lucene92RWHnswVectorsFormat.DIRECT_MONOTONIC_BLOCK_SHIFT; -import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import java.io.IOException; import java.nio.ByteBuffer; @@ -33,6 +32,7 @@ import org.apache.lucene.index.DocsWithFieldSet; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.DocIdSetIterator; @@ -43,7 +43,6 @@ import org.apache.lucene.util.hnsw.HnswGraph; import org.apache.lucene.util.hnsw.HnswGraphBuilder; import org.apache.lucene.util.hnsw.NeighborArray; import org.apache.lucene.util.hnsw.OnHeapHnswGraph; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; import org.apache.lucene.util.packed.DirectMonotonicWriter; @@ -190,9 +189,12 @@ public 
final class Lucene92HnswVectorsWriter extends BufferingKnnVectorsWriter { DocsWithFieldSet docsWithField = new DocsWithFieldSet(); ByteBuffer binaryVector = ByteBuffer.allocate(vectors.dimension() * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN); - for (int docV = vectors.nextDoc(); docV != NO_MORE_DOCS; docV = vectors.nextDoc()) { + KnnVectorValues.DocIndexIterator iterator = vectors.iterator(); + for (int docV = iterator.nextDoc(); + docV != DocIdSetIterator.NO_MORE_DOCS; + docV = iterator.nextDoc()) { // write vector - float[] vectorValue = vectors.vectorValue(); + float[] vectorValue = vectors.vectorValue(iterator.index()); binaryVector.asFloatBuffer().put(vectorValue); output.writeBytes(binaryVector.array(), binaryVector.limit()); docsWithField.add(docV); @@ -277,7 +279,7 @@ public final class Lucene92HnswVectorsWriter extends BufferingKnnVectorsWriter { } private OnHeapHnswGraph writeGraph( - RandomAccessVectorValues.Floats vectorValues, VectorSimilarityFunction similarityFunction) + FloatVectorValues vectorValues, VectorSimilarityFunction similarityFunction) throws IOException { DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer(); // build graph diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/TestLucene92HnswVectorsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/TestLucene92HnswVectorsFormat.java index 0e003dafc3b..192f70a6397 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/TestLucene92HnswVectorsFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/TestLucene92HnswVectorsFormat.java @@ -72,4 +72,9 @@ public class TestLucene92HnswVectorsFormat extends BaseKnnVectorsFormatTestCase public void testMergingWithDifferentByteKnnFields() { // unimplemented } + + @Override + public void testMismatchedFields() throws Exception { + // requires byte support + } } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java index 1cb445cab77..01698da7989 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java @@ -36,6 +36,7 @@ import org.apache.lucene.index.DocsWithFieldSet; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Sorter; @@ -52,7 +53,6 @@ import org.apache.lucene.util.hnsw.HnswGraph.NodesIterator; import org.apache.lucene.util.hnsw.HnswGraphBuilder; import org.apache.lucene.util.hnsw.NeighborArray; import org.apache.lucene.util.hnsw.OnHeapHnswGraph; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; import org.apache.lucene.util.packed.DirectMonotonicWriter; @@ -216,9 +216,7 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter { final int[] docIdOffsets = new int[sortMap.size()]; int offset = 1; // 0 means no vector for this (field, document) DocIdSetIterator iterator = fieldData.docsWithField.iterator(); - for (int 
docID = iterator.nextDoc(); - docID != DocIdSetIterator.NO_MORE_DOCS; - docID = iterator.nextDoc()) { + for (int docID = iterator.nextDoc(); docID != NO_MORE_DOCS; docID = iterator.nextDoc()) { int newDocID = sortMap.oldToNew(docID); docIdOffsets[newDocID] = offset++; } @@ -556,9 +554,7 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter { final DirectMonotonicWriter ordToDocWriter = DirectMonotonicWriter.getInstance(meta, vectorData, count, DIRECT_MONOTONIC_BLOCK_SHIFT); DocIdSetIterator iterator = docsWithField.iterator(); - for (int doc = iterator.nextDoc(); - doc != DocIdSetIterator.NO_MORE_DOCS; - doc = iterator.nextDoc()) { + for (int doc = iterator.nextDoc(); doc != NO_MORE_DOCS; doc = iterator.nextDoc()) { ordToDocWriter.add(doc); } ordToDocWriter.finish(); @@ -590,11 +586,10 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter { private static DocsWithFieldSet writeByteVectorData( IndexOutput output, ByteVectorValues byteVectorValues) throws IOException { DocsWithFieldSet docsWithField = new DocsWithFieldSet(); - for (int docV = byteVectorValues.nextDoc(); - docV != NO_MORE_DOCS; - docV = byteVectorValues.nextDoc()) { + KnnVectorValues.DocIndexIterator iter = byteVectorValues.iterator(); + for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - byte[] binaryValue = byteVectorValues.vectorValue(); + byte[] binaryValue = byteVectorValues.vectorValue(iter.index()); assert binaryValue.length == byteVectorValues.dimension() * VectorEncoding.BYTE.byteSize; output.writeBytes(binaryValue, binaryValue.length); docsWithField.add(docV); @@ -608,14 +603,13 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter { private static DocsWithFieldSet writeVectorData( IndexOutput output, FloatVectorValues floatVectorValues) throws IOException { DocsWithFieldSet docsWithField = new DocsWithFieldSet(); + KnnVectorValues.DocIndexIterator iter = floatVectorValues.iterator(); ByteBuffer binaryVector = ByteBuffer.allocate(floatVectorValues.dimension() * VectorEncoding.FLOAT32.byteSize) .order(ByteOrder.LITTLE_ENDIAN); - for (int docV = floatVectorValues.nextDoc(); - docV != NO_MORE_DOCS; - docV = floatVectorValues.nextDoc()) { + for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - float[] vectorValue = floatVectorValues.vectorValue(); + float[] vectorValue = floatVectorValues.vectorValue(iter.index()); binaryVector.asFloatBuffer().put(vectorValue); output.writeBytes(binaryVector.array(), binaryVector.limit()); docsWithField.add(docV); @@ -672,11 +666,11 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter { case BYTE -> defaultFlatVectorScorer.getRandomVectorScorerSupplier( fieldInfo.getVectorSimilarityFunction(), - RandomAccessVectorValues.fromBytes((List) vectors, dim)); + ByteVectorValues.fromBytes((List) vectors, dim)); case FLOAT32 -> defaultFlatVectorScorer.getRandomVectorScorerSupplier( fieldInfo.getVectorSimilarityFunction(), - RandomAccessVectorValues.fromFloats((List) vectors, dim)); + FloatVectorValues.fromFloats((List) vectors, dim)); }; hnswGraphBuilder = HnswGraphBuilder.create(scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed); diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java index 37c39d311d6..c855d8f5e07 100644 --- 
a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene95/Lucene95HnswVectorsWriter.java @@ -39,6 +39,7 @@ import org.apache.lucene.index.DocsWithFieldSet; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Sorter; @@ -56,7 +57,6 @@ import org.apache.lucene.util.hnsw.HnswGraphBuilder; import org.apache.lucene.util.hnsw.IncrementalHnswGraphMerger; import org.apache.lucene.util.hnsw.NeighborArray; import org.apache.lucene.util.hnsw.OnHeapHnswGraph; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; import org.apache.lucene.util.packed.DirectMonotonicWriter; @@ -221,9 +221,7 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter { final int[] docIdOffsets = new int[sortMap.size()]; int offset = 1; // 0 means no vector for this (field, document) DocIdSetIterator iterator = fieldData.docsWithField.iterator(); - for (int docID = iterator.nextDoc(); - docID != DocIdSetIterator.NO_MORE_DOCS; - docID = iterator.nextDoc()) { + for (int docID = iterator.nextDoc(); docID != NO_MORE_DOCS; docID = iterator.nextDoc()) { int newDocID = sortMap.oldToNew(docID); docIdOffsets[newDocID] = offset++; } @@ -482,18 +480,18 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter { mergeState.knnVectorsReaders[i], mergeState.docMaps[i], mergeState.liveDocs[i]); } } - DocIdSetIterator mergedVectorIterator = null; + KnnVectorValues mergedVectorValues = null; switch (fieldInfo.getVectorEncoding()) { case BYTE -> - mergedVectorIterator = + mergedVectorValues = KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState); case FLOAT32 -> - mergedVectorIterator = + mergedVectorValues = KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState); } graph = merger.merge( - mergedVectorIterator, segmentWriteState.infoStream, docsWithField.cardinality()); + mergedVectorValues, segmentWriteState.infoStream, docsWithField.cardinality()); vectorIndexNodeOffsets = writeGraph(graph); } long vectorIndexLength = vectorIndex.getFilePointer() - vectorIndexOffset; @@ -636,14 +634,13 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter { private static DocsWithFieldSet writeByteVectorData( IndexOutput output, ByteVectorValues byteVectorValues) throws IOException { DocsWithFieldSet docsWithField = new DocsWithFieldSet(); - for (int docV = byteVectorValues.nextDoc(); - docV != NO_MORE_DOCS; - docV = byteVectorValues.nextDoc()) { + KnnVectorValues.DocIndexIterator iter = byteVectorValues.iterator(); + for (int docId = iter.nextDoc(); docId != NO_MORE_DOCS; docId = iter.nextDoc()) { // write vector - byte[] binaryValue = byteVectorValues.vectorValue(); + byte[] binaryValue = byteVectorValues.vectorValue(iter.index()); assert binaryValue.length == byteVectorValues.dimension() * VectorEncoding.BYTE.byteSize; output.writeBytes(binaryValue, binaryValue.length); - docsWithField.add(docV); + docsWithField.add(docId); } return docsWithField; } @@ -657,11 +654,10 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter { ByteBuffer buffer = 
ByteBuffer.allocate(floatVectorValues.dimension() * VectorEncoding.FLOAT32.byteSize) .order(ByteOrder.LITTLE_ENDIAN); - for (int docV = floatVectorValues.nextDoc(); - docV != NO_MORE_DOCS; - docV = floatVectorValues.nextDoc()) { + KnnVectorValues.DocIndexIterator iter = floatVectorValues.iterator(); + for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - float[] value = floatVectorValues.vectorValue(); + float[] value = floatVectorValues.vectorValue(iter.index()); buffer.asFloatBuffer().put(value); output.writeBytes(buffer.array(), buffer.limit()); docsWithField.add(docV); @@ -718,11 +714,11 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter { case BYTE -> defaultFlatVectorScorer.getRandomVectorScorerSupplier( fieldInfo.getVectorSimilarityFunction(), - RandomAccessVectorValues.fromBytes((List) vectors, dim)); + ByteVectorValues.fromBytes((List) vectors, dim)); case FLOAT32 -> defaultFlatVectorScorer.getRandomVectorScorerSupplier( fieldInfo.getVectorSimilarityFunction(), - RandomAccessVectorValues.fromFloats((List) vectors, dim)); + FloatVectorValues.fromFloats((List) vectors, dim)); }; hnswGraphBuilder = HnswGraphBuilder.create(scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed); diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/BackwardsCompatibilityTestBase.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/BackwardsCompatibilityTestBase.java index ae5920de368..edbed96be75 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/BackwardsCompatibilityTestBase.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/BackwardsCompatibilityTestBase.java @@ -106,8 +106,8 @@ public abstract class BackwardsCompatibilityTestBase extends LuceneTestCase { * This is a base constructor for parameterized BWC tests. The constructor arguments are provided * by {@link com.carrotsearch.randomizedtesting.RandomizedRunner} during test execution. A {@link * com.carrotsearch.randomizedtesting.annotations.ParametersFactory} specified in a subclass - * provides a list lists of arguments for the tests and RandomizedRunner will execute the test for - * each of the argument list. + * provides a list of arguments for the tests and RandomizedRunner will execute the test for each + * of the argument list. 
* * @param version the version this test should run for * @param indexPattern an index pattern in order to open an index of see {@link diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestAncientIndicesCompatibility.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestAncientIndicesCompatibility.java index 88adfadf1c8..cf7df98345d 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestAncientIndicesCompatibility.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestAncientIndicesCompatibility.java @@ -198,7 +198,7 @@ public class TestAncientIndicesCompatibility extends LuceneTestCase { checker.setInfoStream(new PrintStream(bos, false, UTF_8)); checker.setLevel(CheckIndex.Level.MIN_LEVEL_FOR_INTEGRITY_CHECKS); CheckIndex.Status indexStatus = checker.checkIndex(); - if (version.startsWith("8.")) { + if (version.startsWith("8.") || version.startsWith("9.")) { assertTrue(indexStatus.clean); } else { assertFalse(indexStatus.clean); @@ -219,10 +219,11 @@ public class TestAncientIndicesCompatibility extends LuceneTestCase { // #12895: test on a carefully crafted 9.8.0 index (from a small contiguous subset // of wikibigall unique terms) that shows the read-time exception of // IntersectTermsEnum (used by WildcardQuery) + @AwaitsFix(bugUrl = "https://github.com/apache/lucene/issues/13847") public void testWildcardQueryExceptions990() throws IOException { Path path = createTempDir("12895"); - String name = "index.12895.9.8.0.zip"; + String name = "unsupported.12895.9.8.0.zip"; InputStream resource = TestAncientIndicesCompatibility.class.getResourceAsStream(name); assertNotNull("missing zip file to reproduce #12895", resource); TestUtil.unzip(resource, path); diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBasicBackwardsCompatibility.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBasicBackwardsCompatibility.java index 8d35a1128be..262567f9f76 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBasicBackwardsCompatibility.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestBasicBackwardsCompatibility.java @@ -17,7 +17,6 @@ package org.apache.lucene.backward_index; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; -import static org.apache.lucene.util.Version.LUCENE_9_0_0; import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; import java.io.IOException; @@ -52,6 +51,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.LogByteSizeMergePolicy; import org.apache.lucene.index.MultiBits; @@ -95,7 +95,7 @@ public class TestBasicBackwardsCompatibility extends BackwardsCompatibilityTestB private static final int DOCS_COUNT = 35; private static final int DELETED_ID = 7; - private static final int KNN_VECTOR_MIN_SUPPORTED_VERSION = LUCENE_9_0_0.major; + private static final int KNN_VECTOR_MIN_SUPPORTED_VERSION = Version.fromBits(9, 0, 0).major; private static final String KNN_VECTOR_FIELD = "knn_field"; private static final FieldType KNN_VECTOR_FIELD_TYPE = KnnFloatVectorField.createFieldType(3, VectorSimilarityFunction.COSINE); @@ -477,10 +477,14 @@ public class 
TestBasicBackwardsCompatibility extends BackwardsCompatibilityTestB FloatVectorValues values = ctx.reader().getFloatVectorValues(KNN_VECTOR_FIELD); if (values != null) { assertEquals(KNN_VECTOR_FIELD_TYPE.vectorDimension(), values.dimension()); - for (int doc = values.nextDoc(); doc != NO_MORE_DOCS; doc = values.nextDoc()) { + KnnVectorValues.DocIndexIterator it = values.iterator(); + for (int doc = it.nextDoc(); doc != NO_MORE_DOCS; doc = it.nextDoc()) { float[] expectedVector = {KNN_VECTOR[0], KNN_VECTOR[1], KNN_VECTOR[2] + 0.1f * cnt}; assertArrayEquals( - "vectors do not match for doc=" + cnt, expectedVector, values.vectorValue(), 0); + "vectors do not match for doc=" + cnt, + expectedVector, + values.vectorValue(it.index()), + 0); cnt++; } } @@ -828,7 +832,7 @@ public class TestBasicBackwardsCompatibility extends BackwardsCompatibilityTestB expectThrows(IllegalArgumentException.class, () -> TestUtil.addIndexesSlowly(w, reader)); assertEquals( e.getMessage(), - "Cannot merge a segment that has been created with major version 9 into this index which has been created by major version 10"); + "Cannot merge a segment that has been created with major version 10 into this index which has been created by major version 11"); w.close(); targetDir2.close(); diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestDVUpdateBackwardsCompatibility.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestDVUpdateBackwardsCompatibility.java index 332daa621ed..cfe29028cdb 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestDVUpdateBackwardsCompatibility.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestDVUpdateBackwardsCompatibility.java @@ -58,7 +58,7 @@ public class TestDVUpdateBackwardsCompatibility extends BackwardsCompatibilityTe public static Iterable testVersionsFactory() { List params = new ArrayList<>(); // TODO - WHY ONLY on the first major version? - params.add(new Object[] {Version.LUCENE_9_0_0, createPattern(INDEX_NAME, SUFFIX)}); + params.add(new Object[] {Version.LUCENE_10_0_0, createPattern(INDEX_NAME, SUFFIX)}); return params; } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestEmptyIndexBackwardsCompatibility.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestEmptyIndexBackwardsCompatibility.java index 40fcd4c59bf..2367e20d6a0 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestEmptyIndexBackwardsCompatibility.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestEmptyIndexBackwardsCompatibility.java @@ -53,14 +53,14 @@ public class TestEmptyIndexBackwardsCompatibility extends BackwardsCompatibility public static Iterable testVersionsFactory() { List params = new ArrayList<>(); // TODO - WHY ONLY on the first major version? 
- params.add(new Object[] {Version.LUCENE_9_0_0, createPattern(INDEX_NAME, SUFFIX)}); + params.add(new Object[] {Version.LUCENE_10_0_0, createPattern(INDEX_NAME, SUFFIX)}); return params; } public void testUpgradeEmptyOldIndex() throws Exception { try (Directory dir = newDirectory(directory)) { TestIndexUpgradeBackwardsCompatibility.newIndexUpgrader(dir).upgrade(); - TestIndexUpgradeBackwardsCompatibility.checkAllSegmentsUpgraded(dir, 9); + TestIndexUpgradeBackwardsCompatibility.checkAllSegmentsUpgraded(dir, 10); } } } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestGenerateBwcIndices.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestGenerateBwcIndices.java index 936a4c28cf2..6989731ae14 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestGenerateBwcIndices.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestGenerateBwcIndices.java @@ -39,7 +39,7 @@ public class TestGenerateBwcIndices extends LuceneTestCase { // To generate backcompat indexes with the current default codec, run the following gradle // command: // gradlew test -Ptests.bwcdir=/path/to/store/indexes -Ptests.codec=default - // -Ptests.useSecurityManager=false --tests TestGenerateBwcIndices + // -Ptests.useSecurityManager=false --tests TestGenerateBwcIndices --max-workers=1 // // Also add testmethod with one of the index creation methods below, for example: // -Ptestmethod=testCreateCFS @@ -82,14 +82,14 @@ public class TestGenerateBwcIndices extends LuceneTestCase { sortedTest.createBWCIndex(); } - public void testCreateInt8HNSWIndices() throws IOException { - TestInt8HnswBackwardsCompatibility int8HnswBackwardsCompatibility = - new TestInt8HnswBackwardsCompatibility( + public void testCreateInt7HNSWIndices() throws IOException { + TestInt7HnswBackwardsCompatibility int7HnswBackwardsCompatibility = + new TestInt7HnswBackwardsCompatibility( Version.LATEST, createPattern( - TestInt8HnswBackwardsCompatibility.INDEX_NAME, - TestInt8HnswBackwardsCompatibility.SUFFIX)); - int8HnswBackwardsCompatibility.createBWCIndex(); + TestInt7HnswBackwardsCompatibility.INDEX_NAME, + TestInt7HnswBackwardsCompatibility.SUFFIX)); + int7HnswBackwardsCompatibility.createBWCIndex(); } private boolean isInitialMajorVersionRelease() { diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestIndexSortBackwardsCompatibility.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestIndexSortBackwardsCompatibility.java index f7446011b26..ad5432b91ad 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestIndexSortBackwardsCompatibility.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestIndexSortBackwardsCompatibility.java @@ -55,7 +55,7 @@ public class TestIndexSortBackwardsCompatibility extends BackwardsCompatibilityT static final String INDEX_NAME = "sorted"; static final String SUFFIX = ""; - private static final Version FIRST_PARENT_DOC_VERSION = Version.LUCENE_9_11_0; + private static final Version FIRST_PARENT_DOC_VERSION = Version.fromBits(9, 11, 0); private static final String PARENT_FIELD_NAME = "___parent"; public TestIndexSortBackwardsCompatibility(Version version, String pattern) { diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestInt8HnswBackwardsCompatibility.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestInt7HnswBackwardsCompatibility.java similarity index 75% rename from 
lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestInt8HnswBackwardsCompatibility.java rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestInt7HnswBackwardsCompatibility.java index 8db406df992..7e4b59542fa 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestInt8HnswBackwardsCompatibility.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestInt7HnswBackwardsCompatibility.java @@ -23,17 +23,22 @@ import java.io.IOException; import org.apache.lucene.backward_codecs.lucene99.Lucene99Codec; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader; +import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.KnnFloatVectorField; import org.apache.lucene.document.StringField; +import org.apache.lucene.index.CodecReader; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.IndexSearcher; @@ -41,23 +46,23 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.Version; +import org.apache.lucene.util.quantization.QuantizedByteVectorValues; -public class TestInt8HnswBackwardsCompatibility extends BackwardsCompatibilityTestBase { +public class TestInt7HnswBackwardsCompatibility extends BackwardsCompatibilityTestBase { - static final String INDEX_NAME = "int8_hnsw"; + static final String INDEX_NAME = "int7_hnsw"; static final String SUFFIX = ""; - private static final Version FIRST_INT8_HNSW_VERSION = Version.LUCENE_9_10_0; + private static final Version FIRST_INT7_HNSW_VERSION = Version.fromBits(9, 10, 0); private static final String KNN_VECTOR_FIELD = "knn_field"; private static final int DOC_COUNT = 30; private static final FieldType KNN_VECTOR_FIELD_TYPE = KnnFloatVectorField.createFieldType(3, VectorSimilarityFunction.COSINE); private static final float[] KNN_VECTOR = {0.2f, -0.1f, 0.1f}; - public TestInt8HnswBackwardsCompatibility(Version version, String pattern) { + public TestInt7HnswBackwardsCompatibility(Version version, String pattern) { super(version, pattern); } - /** Provides all sorted versions to the test-framework */ @ParametersFactory(argumentFormatting = "Lucene-Version:%1$s; Pattern: %2$s") public static Iterable testVersionsFactory() throws IllegalAccessException { return allVersion(INDEX_NAME, SUFFIX); @@ -76,7 +81,7 @@ public class TestInt8HnswBackwardsCompatibility extends BackwardsCompatibilityTe @Override protected boolean supportsVersion(Version version) { - return version.onOrAfter(FIRST_INT8_HNSW_VERSION); + return version.onOrAfter(FIRST_INT7_HNSW_VERSION); } @Override @@ -84,7 +89,7 @@ public class TestInt8HnswBackwardsCompatibility extends BackwardsCompatibilityTe // We don't 
use the default codec } - public void testInt8HnswIndexAndSearch() throws Exception { + public void testInt7HnswIndexAndSearch() throws Exception { IndexWriterConfig indexWriterConfig = newIndexWriterConfig(new MockAnalyzer(random())) .setOpenMode(IndexWriterConfig.OpenMode.APPEND) @@ -108,7 +113,6 @@ public class TestInt8HnswBackwardsCompatibility extends BackwardsCompatibilityTe assertKNNSearch(searcher, KNN_VECTOR, 10, 10, "0"); } } - // This will confirm the docs are really sorted TestUtil.checkIndex(directory); } @@ -117,7 +121,7 @@ public class TestInt8HnswBackwardsCompatibility extends BackwardsCompatibilityTe IndexWriterConfig conf = new IndexWriterConfig(new MockAnalyzer(random())) .setMaxBufferedDocs(10) - .setCodec(TestUtil.getDefaultCodec()) + .setCodec(getCodec()) .setMergePolicy(NoMergePolicy.INSTANCE); try (IndexWriter writer = new IndexWriter(dir, conf)) { for (int i = 0; i < DOC_COUNT; i++) { @@ -147,4 +151,29 @@ public class TestInt8HnswBackwardsCompatibility extends BackwardsCompatibilityTe assertKNNSearch(searcher, KNN_VECTOR, 10, 10, "0"); } } + + // #13880: make sure the BWC index really contains quantized HNSW not float32 + public void testIndexIsReallyQuantized() throws Exception { + try (DirectoryReader reader = DirectoryReader.open(directory)) { + for (LeafReaderContext leafContext : reader.leaves()) { + KnnVectorsReader knnVectorsReader = ((CodecReader) leafContext.reader()).getVectorReader(); + assertTrue( + "expected PerFieldKnnVectorsFormat.FieldsReader but got: " + knnVectorsReader, + knnVectorsReader instanceof PerFieldKnnVectorsFormat.FieldsReader); + + KnnVectorsReader forField = + ((PerFieldKnnVectorsFormat.FieldsReader) knnVectorsReader) + .getFieldReader(KNN_VECTOR_FIELD); + + assertTrue(forField instanceof Lucene99HnswVectorsReader); + + QuantizedByteVectorValues quantized = + ((Lucene99HnswVectorsReader) forField).getQuantizedVectorValues(KNN_VECTOR_FIELD); + + assertNotNull( + "KnnVectorsReader should have quantized interface for field " + KNN_VECTOR_FIELD, + quantized); + } + } + } } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestMoreTermsBackwardsCompatibility.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestMoreTermsBackwardsCompatibility.java index 6bacb49dd65..6b33eeb5add 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestMoreTermsBackwardsCompatibility.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestMoreTermsBackwardsCompatibility.java @@ -31,13 +31,15 @@ import org.apache.lucene.index.LogByteSizeMergePolicy; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.util.LineFileDocs; +import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.Version; +@LuceneTestCase.AwaitsFix(bugUrl = "https://github.com/apache/lucene/issues/13847") public class TestMoreTermsBackwardsCompatibility extends BackwardsCompatibilityTestBase { - static final String INDEX_NAME = "moreterms"; + static final String INDEX_NAME = "unsupported.moreterms"; static final String SUFFIX = ""; @@ -48,7 +50,7 @@ public class TestMoreTermsBackwardsCompatibility extends BackwardsCompatibilityT @ParametersFactory(argumentFormatting = "Lucene-Version:%1$s; Pattern: %2$s") public static Iterable testVersionsFactory() { List params = new ArrayList<>(); - params.add(new Object[] 
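[Editor's note, for context on testIndexIsReallyQuantized above: the int7 BWC indexes are written with the HNSW scalar-quantized vectors format, whose default quantization is 7 bits per dimension (hence the int8 -> int7 rename), and the test unwraps the per-field reader to prove quantized storage is actually present. A hedged sketch of how a test codec can route every vector field through that format; the no-arg constructor defaults are an assumption here, not taken from this diff:

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
import org.apache.lucene.tests.util.TestUtil;

class QuantizedCodecSketch {
  static Codec quantizedHnswCodec() {
    // Route all vector fields through the scalar-quantized HNSW format, so
    // segments store quantized byte vectors alongside the HNSW graph.
    KnnVectorsFormat format = new Lucene99HnswScalarQuantizedVectorsFormat();
    return TestUtil.alwaysKnnVectorsFormat(format);
  }
}

At read time the assertion path mirrors the test above: CodecReader.getVectorReader() yields a PerFieldKnnVectorsFormat.FieldsReader, and getFieldReader(field) can be narrowed to Lucene99HnswVectorsReader to fetch the QuantizedByteVectorValues.]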
{Version.LUCENE_9_0_0, createPattern(INDEX_NAME, SUFFIX)}); + params.add(new Object[] {Version.fromBits(9, 0, 0), createPattern(INDEX_NAME, SUFFIX)}); return params; } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/dvupdates.10.0.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/dvupdates.10.0.0.zip new file mode 100644 index 00000000000..db5d5260bcc Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/dvupdates.10.0.0.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/empty.10.0.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/empty.10.0.0.zip new file mode 100644 index 00000000000..d906538645b Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/empty.10.0.0.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.10.0.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.10.0.0-cfs.zip new file mode 100644 index 00000000000..73c79500c85 Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.10.0.0-cfs.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.10.0.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.10.0.0-nocfs.zip new file mode 100644 index 00000000000..d8b8216c639 Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.10.0.0-nocfs.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int7_hnsw.10.0.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int7_hnsw.10.0.0.zip new file mode 100644 index 00000000000..99a28f7631c Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int7_hnsw.10.0.0.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int8_hnsw.9.10.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int8_hnsw.9.10.0.zip deleted file mode 100644 index 2799f04b65a..00000000000 Binary files a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int8_hnsw.9.10.0.zip and /dev/null differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int8_hnsw.9.11.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int8_hnsw.9.11.0.zip deleted file mode 100644 index 5fd94783427..00000000000 Binary files a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int8_hnsw.9.11.0.zip and /dev/null differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int8_hnsw.9.11.1.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int8_hnsw.9.11.1.zip deleted file mode 100644 index c4bb86b5f1b..00000000000 Binary files a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int8_hnsw.9.11.1.zip and /dev/null differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/moreterms.10.0.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/moreterms.10.0.0.zip new file mode 100644 index 00000000000..6ee086756cc Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/moreterms.10.0.0.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.10.0.0.zip 
b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.10.0.0.zip new file mode 100644 index 00000000000..e0896256896 Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.10.0.0.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.12895.9.8.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.12895.9.8.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.12895.9.8.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.12895.9.8.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.8.11.4-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.8.11.4-cfs.zip new file mode 100644 index 00000000000..bb3e4f01753 Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.8.11.4-cfs.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.8.11.4-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.8.11.4-nocfs.zip new file mode 100644 index 00000000000..a19fa717096 Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.8.11.4-nocfs.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.0.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.0.0-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.0.0-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.0.0-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.0.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.0.0-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.0.0-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.0.0-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.1.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.1.0-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.1.0-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.1.0-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.1.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.1.0-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.1.0-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.1.0-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.10.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.10.0-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.10.0-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.10.0-cfs.zip diff --git 
a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.10.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.10.0-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.10.0-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.10.0-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.11.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.11.0-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.11.0-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.11.0-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.11.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.11.0-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.11.0-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.11.0-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.11.1-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.11.1-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.11.1-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.11.1-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.11.1-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.11.1-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.11.1-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.11.1-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.12.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.12.0-cfs.zip new file mode 100644 index 00000000000..6fc0118f222 Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.12.0-cfs.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.12.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.12.0-nocfs.zip new file mode 100644 index 00000000000..56b5c1325c8 Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.12.0-nocfs.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.2.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.2.0-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.2.0-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.2.0-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.2.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.2.0-nocfs.zip similarity index 100% rename from 
lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.2.0-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.2.0-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.3.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.3.0-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.3.0-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.3.0-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.3.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.3.0-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.3.0-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.3.0-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.4.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.4.0-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.4.0-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.4.0-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.4.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.4.0-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.4.0-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.4.0-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.4.1-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.4.1-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.4.1-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.4.1-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.4.1-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.4.1-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.4.1-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.4.1-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.4.2-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.4.2-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.4.2-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.4.2-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.4.2-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.4.2-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.4.2-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.4.2-nocfs.zip diff --git 
a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.5.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.5.0-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.5.0-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.5.0-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.5.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.5.0-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.5.0-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.5.0-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.6.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.6.0-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.6.0-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.6.0-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.6.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.6.0-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.6.0-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.6.0-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.7.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.7.0-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.7.0-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.7.0-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.7.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.7.0-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.7.0-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.7.0-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.8.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.8.0-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.8.0-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.8.0-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.8.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.8.0-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.8.0-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.8.0-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.9.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.9.0-cfs.zip similarity index 100% rename from 
lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.9.0-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.9.0-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.9.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.9.0-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.9.0-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.9.0-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.9.1-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.9.1-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.9.1-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.9.1-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.9.1-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.9.1-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.9.1-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.9.1-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.9.2-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.9.2-cfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.9.2-cfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.9.2-cfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.9.2-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.9.2-nocfs.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.9.2-nocfs.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.9.9.2-nocfs.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/dvupdates.9.0.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.dvupdates.9.0.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/dvupdates.9.0.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.dvupdates.9.0.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/empty.9.0.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.empty.9.0.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/empty.9.0.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.empty.9.0.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.int7_hnsw.9.10.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.int7_hnsw.9.10.0.zip new file mode 100644 index 00000000000..0425b451fa0 Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.int7_hnsw.9.10.0.zip differ diff --git 
a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.int7_hnsw.9.11.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.int7_hnsw.9.11.0.zip new file mode 100644 index 00000000000..9dd53d92a99 Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.int7_hnsw.9.11.0.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.int7_hnsw.9.11.1.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.int7_hnsw.9.11.1.zip new file mode 100644 index 00000000000..29aef1b909f Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.int7_hnsw.9.11.1.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.int7_hnsw.9.12.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.int7_hnsw.9.12.0.zip new file mode 100644 index 00000000000..bfe07de8143 Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.int7_hnsw.9.12.0.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/moreterms.9.0.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.moreterms.9.0.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/moreterms.9.0.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.moreterms.9.0.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.8.11.4.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.8.11.4.zip new file mode 100644 index 00000000000..9736c6aca98 Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.8.11.4.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.0.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.0.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.0.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.0.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.1.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.1.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.1.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.1.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.10.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.10.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.10.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.10.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.11.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.11.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.11.0.zip rename to 
lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.11.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.11.1.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.11.1.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.11.1.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.11.1.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.12.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.12.0.zip new file mode 100644 index 00000000000..9ad1590e3e4 Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.12.0.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.2.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.2.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.2.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.2.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.3.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.3.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.3.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.3.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.4.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.4.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.4.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.4.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.4.1.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.4.1.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.4.1.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.4.1.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.4.2.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.4.2.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.4.2.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.4.2.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.5.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.5.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.5.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.5.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.6.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.6.0.zip 
similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.6.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.6.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.7.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.7.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.7.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.7.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.8.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.8.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.8.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.8.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.9.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.9.0.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.9.0.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.9.0.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.9.1.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.9.1.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.9.1.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.9.1.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.9.2.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.9.2.zip similarity index 100% rename from lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.9.2.zip rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported.sorted.9.9.2.zip diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported_versions.txt b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported_versions.txt index 8f298d3ae05..521f12c2804 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported_versions.txt +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/unsupported_versions.txt @@ -119,4 +119,23 @@ 8.11.0 8.11.1 8.11.2 -8.11.3 \ No newline at end of file +8.11.3 +8.11.4 +9.0.0 +9.1.0 +9.2.0 +9.3.0 +9.4.0 +9.4.1 +9.4.2 +9.5.0 +9.6.0 +9.7.0 +9.8.0 +9.9.0 +9.9.1 +9.9.2 +9.10.0 +9.11.0 +9.11.1 +9.12.0 \ No newline at end of file diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/versions.txt b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/versions.txt index 4572b6fadfe..7529186caca 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/versions.txt +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/versions.txt @@ -1,29 +1,3 @@ -8.0.0 -8.1.0 -8.1.1 -8.2.0 -8.3.0 -8.3.1 -8.4.0 -8.4.1 -8.5.0 -8.5.1 -8.5.2 -8.6.0 -8.6.1 -8.6.2 -8.6.3 -8.7.0 -8.8.0 -8.8.1 -8.8.2 -8.9.0 -8.10.0 -8.10.1 -8.11.0 -8.11.1 -8.11.2 -8.11.3 9.0.0 9.1.0 9.2.0 @@ 
-41,3 +15,5 @@ 9.10.0 9.11.0 9.11.1 +9.12.0 +10.0.0 \ No newline at end of file diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/GroupVIntBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/GroupVIntBenchmark.java index 0df0d7ecf50..48b95570694 100644 --- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/GroupVIntBenchmark.java +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/GroupVIntBenchmark.java @@ -186,7 +186,7 @@ public class GroupVIntBenchmark { @Benchmark public void benchMMapDirectoryInputs_readGroupVInt(Blackhole bh) throws IOException { byteBufferGVIntIn.seek(0); - byteBufferGVIntIn.readGroupVInts(values, size); + GroupVIntUtil.readGroupVInts(byteBufferGVIntIn, values, size); bh.consume(values); } @@ -209,14 +209,14 @@ public class GroupVIntBenchmark { @Benchmark public void benchByteArrayDataInput_readGroupVInt(Blackhole bh) throws IOException { byteArrayGVIntIn.rewind(); - byteArrayGVIntIn.readGroupVInts(values, size); + GroupVIntUtil.readGroupVInts(byteArrayGVIntIn, values, size); bh.consume(values); } @Benchmark public void benchNIOFSDirectoryInputs_readGroupVInt(Blackhole bh) throws IOException { nioGVIntIn.seek(0); - nioGVIntIn.readGroupVInts(values, size); + GroupVIntUtil.readGroupVInts(nioGVIntIn, values, size); bh.consume(values); } @@ -230,7 +230,7 @@ public class GroupVIntBenchmark { @Benchmark public void benchByteBuffersIndexInput_readGroupVInt(Blackhole bh) throws IOException { byteBuffersGVIntIn.seek(0); - byteBuffersGVIntIn.readGroupVInts(values, size); + GroupVIntUtil.readGroupVInts(byteBuffersGVIntIn, values, size); bh.consume(values); } diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java index c4d3040f283..0a4da1f4886 100644 --- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java @@ -25,6 +25,7 @@ import java.util.concurrent.TimeUnit; import org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil; import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; import org.apache.lucene.codecs.lucene95.OffHeapByteVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; @@ -32,7 +33,6 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.MMapDirectory; import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; import org.openjdk.jmh.annotations.*; @@ -55,7 +55,7 @@ public class VectorScorerBenchmark { Directory dir; IndexInput in; - RandomAccessVectorValues vectorValues; + KnnVectorValues vectorValues; byte[] vec1, vec2; RandomVectorScorer scorer; @@ -95,7 +95,7 @@ public class VectorScorerBenchmark { return scorer.score(1); } - static RandomAccessVectorValues vectorValues( + static KnnVectorValues vectorValues( int dims, int size, IndexInput in, VectorSimilarityFunction sim) throws IOException { return new OffHeapByteVectorValues.DenseOffHeapVectorValues( dims, size, in.slice("test", 0, in.length()), dims, new ThrowingFlatVectorScorer(), sim); @@ -105,23 
+105,19 @@ public class VectorScorerBenchmark { @Override public RandomVectorScorerSupplier getRandomVectorScorerSupplier( - VectorSimilarityFunction similarityFunction, RandomAccessVectorValues vectorValues) { + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues) { throw new UnsupportedOperationException(); } @Override public RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityFunction, - RandomAccessVectorValues vectorValues, - float[] target) { + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, float[] target) { throw new UnsupportedOperationException(); } @Override public RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityFunction, - RandomAccessVectorValues vectorValues, - byte[] target) { + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, byte[] target) { throw new UnsupportedOperationException(); } } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bitvectors/FlatBitVectorsScorer.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bitvectors/FlatBitVectorsScorer.java index b8ff37c2654..8ffcc1c8d50 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/bitvectors/FlatBitVectorsScorer.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bitvectors/FlatBitVectorsScorer.java @@ -19,10 +19,11 @@ package org.apache.lucene.codecs.bitvectors; import java.io.IOException; import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; +import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.util.Bits; import org.apache.lucene.util.VectorUtil; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; @@ -30,45 +31,39 @@ import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; public class FlatBitVectorsScorer implements FlatVectorsScorer { @Override public RandomVectorScorerSupplier getRandomVectorScorerSupplier( - VectorSimilarityFunction similarityFunction, RandomAccessVectorValues vectorValues) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues) throws IOException { - assert vectorValues instanceof RandomAccessVectorValues.Bytes; - if (vectorValues instanceof RandomAccessVectorValues.Bytes byteVectorValues) { + assert vectorValues instanceof ByteVectorValues; + if (vectorValues instanceof ByteVectorValues byteVectorValues) { return new BitRandomVectorScorerSupplier(byteVectorValues); } - throw new IllegalArgumentException( - "vectorValues must be an instance of RandomAccessVectorValues.Bytes"); + throw new IllegalArgumentException("vectorValues must be an instance of ByteVectorValues"); } @Override public RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityFunction, - RandomAccessVectorValues vectorValues, - float[] target) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, float[] target) throws IOException { throw new IllegalArgumentException("bit vectors do not support float[] targets"); } @Override public RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityFunction, - RandomAccessVectorValues vectorValues, - byte[] target) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, byte[] target) throws IOException { - assert vectorValues instanceof 
RandomAccessVectorValues.Bytes; - if (vectorValues instanceof RandomAccessVectorValues.Bytes byteVectorValues) { + assert vectorValues instanceof ByteVectorValues; + if (vectorValues instanceof ByteVectorValues byteVectorValues) { return new BitRandomVectorScorer(byteVectorValues, target); } - throw new IllegalArgumentException( - "vectorValues must be an instance of RandomAccessVectorValues.Bytes"); + throw new IllegalArgumentException("vectorValues must be an instance of ByteVectorValues"); } static class BitRandomVectorScorer implements RandomVectorScorer { - private final RandomAccessVectorValues.Bytes vectorValues; + private final ByteVectorValues vectorValues; private final int bitDimensions; private final byte[] query; - BitRandomVectorScorer(RandomAccessVectorValues.Bytes vectorValues, byte[] query) { + BitRandomVectorScorer(ByteVectorValues vectorValues, byte[] query) { this.query = query; this.bitDimensions = vectorValues.dimension() * Byte.SIZE; this.vectorValues = vectorValues; @@ -97,12 +92,11 @@ public class FlatBitVectorsScorer implements FlatVectorsScorer { } static class BitRandomVectorScorerSupplier implements RandomVectorScorerSupplier { - protected final RandomAccessVectorValues.Bytes vectorValues; - protected final RandomAccessVectorValues.Bytes vectorValues1; - protected final RandomAccessVectorValues.Bytes vectorValues2; + protected final ByteVectorValues vectorValues; + protected final ByteVectorValues vectorValues1; + protected final ByteVectorValues vectorValues2; - public BitRandomVectorScorerSupplier(RandomAccessVectorValues.Bytes vectorValues) - throws IOException { + public BitRandomVectorScorerSupplier(ByteVectorValues vectorValues) throws IOException { this.vectorValues = vectorValues; this.vectorValues1 = vectorValues.copy(); this.vectorValues2 = vectorValues.copy(); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java index 1daa1761fd8..2a0472fa028 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java @@ -54,8 +54,9 @@ import org.apache.lucene.util.automaton.CompiledAutomaton; * *
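[Editor's note: the hunks above replace RandomAccessVectorValues with KnnVectorValues across the scorer APIs, narrowing to ByteVectorValues via instanceof. A small sketch of driving the bit scorer under the new signatures; construction of the values is assumed to happen elsewhere, and the similarity argument is not consulted by the bit-vector path shown above:

import java.io.IOException;
import org.apache.lucene.codecs.bitvectors.FlatBitVectorsScorer;
import org.apache.lucene.index.ByteVectorValues;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.util.hnsw.RandomVectorScorer;

class BitScorerSketch {
  static float scoreOrdinal(ByteVectorValues values, byte[] query, int ord) throws IOException {
    FlatBitVectorsScorer scorer = new FlatBitVectorsScorer();
    // Passing anything other than ByteVectorValues now throws IllegalArgumentException.
    RandomVectorScorer rvs =
        scorer.getRandomVectorScorer(VectorSimilarityFunction.EUCLIDEAN, values, query);
    // Per BitRandomVectorScorer above: 1 - hammingDistance / bitDimensions.
    return rvs.score(ord);
  }
}]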

 * <p>A choice of {@link BloomFilterFactory} can be passed to tailor Bloom Filter settings on a
 * per-field basis. The default configuration is {@link DefaultBloomFilterFactory} which allocates a
- * ~8mb bitset and hashes values using {@link MurmurHash64}. This should be suitable for most
- * purposes.
+ * ~8mb bitset and hashes values using {@link
+ * org.apache.lucene.util.StringHelper#murmurhash3_x64_128(BytesRef)}. This should be suitable for
+ * most purposes.
 *

The format of the blm file is as follows: * diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/FuzzySet.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/FuzzySet.java index f1d2dee65c7..7d6fd1b64b5 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/FuzzySet.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/FuzzySet.java @@ -24,6 +24,7 @@ import org.apache.lucene.util.Accountable; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.StringHelper; /** * A class used to represent a set of many, potentially large, values (e.g. many long strings such @@ -53,7 +54,6 @@ public class FuzzySet implements Accountable { NO }; - private HashFunction hashFunction; private FixedBitSet filter; private int bloomSize; private final int hashCount; @@ -138,7 +138,6 @@ public class FuzzySet implements Accountable { super(); this.filter = filter; this.bloomSize = bloomSize; - this.hashFunction = MurmurHash64.INSTANCE; this.hashCount = hashCount; } @@ -150,11 +149,12 @@ public class FuzzySet implements Accountable { * @return NO or MAYBE */ public ContainsResult contains(BytesRef value) { - long hash = hashFunction.hash(value); - int msb = (int) (hash >>> Integer.SIZE); - int lsb = (int) hash; + long[] hash = StringHelper.murmurhash3_x64_128(value); + + long msb = hash[0]; + long lsb = hash[1]; for (int i = 0; i < hashCount; i++) { - int bloomPos = (lsb + i * msb); + int bloomPos = ((int) (lsb + i * msb)) & bloomSize; if (!mayContainValue(bloomPos)) { return ContainsResult.NO; } @@ -216,15 +216,14 @@ public class FuzzySet implements Accountable { * is modulo n'd where n is the chosen size of the internal bitset. * * @param value the key value to be hashed - * @throws IOException If there is a low-level I/O error */ - public void addValue(BytesRef value) throws IOException { - long hash = hashFunction.hash(value); - int msb = (int) (hash >>> Integer.SIZE); - int lsb = (int) hash; + public void addValue(BytesRef value) { + long[] hash = StringHelper.murmurhash3_x64_128(value); + long msb = hash[0]; + long lsb = hash[1]; for (int i = 0; i < hashCount; i++) { // Bitmasking using bloomSize is effectively a modulo operation. - int bloomPos = (lsb + i * msb) & bloomSize; + int bloomPos = ((int) (lsb + i * msb)) & bloomSize; filter.set(bloomPos); } } @@ -302,9 +301,7 @@ public class FuzzySet implements Accountable { @Override public String toString() { return getClass().getSimpleName() - + "(hash=" - + hashFunction - + ", k=" + + "(k=" + hashCount + ", bits=" + filter.cardinality() diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/HashFunction.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/HashFunction.java deleted file mode 100644 index eac514a7bb8..00000000000 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/HashFunction.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
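[Editor's note on the FuzzySet hunks just above: both contains and addValue now derive their k probe positions from the 128-bit MurmurHash3 already available in StringHelper, dropping the codec-private HashFunction/MurmurHash64 pair whose deletion follows here. A sketch of the shared position computation; as in FuzzySet, bloomSize is assumed to be one less than a power of two, so the mask acts as a cheap modulo:

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.StringHelper;

class BloomPositionSketch {
  // i-th probe position for a value, matching the arithmetic in the hunk above.
  static int bloomPosition(BytesRef value, int i, int bloomSize) {
    long[] hash = StringHelper.murmurhash3_x64_128(value);
    long msb = hash[0];
    long lsb = hash[1];
    // lsb + i * msb is the classic double-hashing scheme for generating k
    // probe positions from one hash; the mask folds it into the bitset.
    return ((int) (lsb + i * msb)) & bloomSize;
  }
}]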
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.codecs.bloom; - -import org.apache.lucene.util.BytesRef; - -/** - * Base class for hashing functions that can be referred to by name. Subclasses are expected to - * provide threadsafe implementations of the hash function on the range of bytes referenced in the - * provided {@link BytesRef} - * - * @lucene.experimental - */ -public abstract class HashFunction { - - /** - * Hashes the contents of the referenced bytes - * - * @param bytes the data to be hashed - * @return the hash of the bytes referenced by bytes.offset and length bytes.length - */ - public abstract long hash(BytesRef bytes); -} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/MurmurHash64.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/MurmurHash64.java deleted file mode 100644 index 1d189773143..00000000000 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/MurmurHash64.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.codecs.bloom; - -import org.apache.lucene.util.BitUtil; -import org.apache.lucene.util.BytesRef; - -/** - * This is a very fast, non-cryptographic hash suitable for general hash-based lookup. See - * http://murmurhash.googlepages.com/ for more details. - * - *
The code from Apache Commons was adapted in the form here to work with BytesRefs with offsets - * and lengths rather than raw byte arrays. - */ -public class MurmurHash64 extends HashFunction { - private static final long M64 = 0xc6a4a7935bd1e995L; - private static final int R64 = 47; - public static final HashFunction INSTANCE = new MurmurHash64(); - - /** - * Generates a 64-bit hash from byte array of the given length and seed. - * - * @param data The input byte array - * @param seed The initial seed value - * @param length The length of the array - * @return The 64-bit hash of the given array - */ - public static long hash64(byte[] data, int seed, int offset, int length) { - long h = (seed & 0xffffffffL) ^ (length * M64); - - final int nblocks = length >> 3; - - // body - for (int i = 0; i < nblocks; i++) { - - long k = (long) BitUtil.VH_LE_LONG.get(data, offset); - k *= M64; - k ^= k >>> R64; - k *= M64; - - h ^= k; - h *= M64; - - offset += Long.BYTES; - } - - int remaining = length & 0x07; - if (0 < remaining) { - for (int i = 0; i < remaining; i++) { - h ^= ((long) data[offset + i] & 0xff) << (Byte.SIZE * i); - } - h *= M64; - } - - h ^= h >>> R64; - h *= M64; - h ^= h >>> R64; - - return h; - } - - @Override - public final long hash(BytesRef br) { - return hash64(br.bytes, 0xe17a1465, br.offset, br.length); - } - - @Override - public String toString() { - return getClass().getSimpleName(); - } -} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextCompoundFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextCompoundFormat.java index bfb5888a56b..8cb48e36919 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextCompoundFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextCompoundFormat.java @@ -35,6 +35,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.StringHelper; @@ -52,10 +53,10 @@ public class SimpleTextCompoundFormat extends CompoundFormat { public SimpleTextCompoundFormat() {} @Override - public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si, IOContext context) - throws IOException { + public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si) throws IOException { String dataFile = IndexFileNames.segmentFileName(si.name, "", DATA_EXTENSION); - final IndexInput in = dir.openInput(dataFile, context); + final IndexInput in = + dir.openInput(dataFile, IOContext.DEFAULT.withReadAdvice(ReadAdvice.NORMAL)); BytesRefBuilder scratch = new BytesRefBuilder(); @@ -135,7 +136,11 @@ public class SimpleTextCompoundFormat extends CompoundFormat { public IndexInput openInput(String name, IOContext context) throws IOException { ensureOpen(); int index = getIndex(name); - return in.slice(name, startOffsets[index], endOffsets[index] - startOffsets[index]); + return in.slice( + name, + startOffsets[index], + endOffsets[index] - startOffsets[index], + context.readAdvice()); } @Override diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsReader.java index faba629715b..0a8c4836321 100644 --- 
a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsReader.java @@ -192,8 +192,8 @@ public class SimpleTextKnnVectorsReader extends KnnVectorsReader { } FieldInfo info = readState.fieldInfos.fieldInfo(field); VectorSimilarityFunction vectorSimilarity = info.getVectorSimilarityFunction(); - int doc; - while ((doc = values.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + for (int ord = 0; ord < values.size(); ord++) { + int doc = values.ordToDoc(ord); if (acceptDocs != null && acceptDocs.get(doc) == false) { continue; } @@ -202,7 +202,7 @@ public class SimpleTextKnnVectorsReader extends KnnVectorsReader { break; } - float[] vector = values.vectorValue(); + float[] vector = values.vectorValue(ord); float score = vectorSimilarity.compare(vector, target); knnCollector.collect(doc, score); knnCollector.incVisitedCount(1); @@ -223,8 +223,8 @@ public class SimpleTextKnnVectorsReader extends KnnVectorsReader { FieldInfo info = readState.fieldInfos.fieldInfo(field); VectorSimilarityFunction vectorSimilarity = info.getVectorSimilarityFunction(); - int doc; - while ((doc = values.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + for (int ord = 0; ord < values.size(); ord++) { + int doc = values.ordToDoc(ord); if (acceptDocs != null && acceptDocs.get(doc) == false) { continue; } @@ -233,7 +233,7 @@ public class SimpleTextKnnVectorsReader extends KnnVectorsReader { break; } - byte[] vector = values.vectorValue(); + byte[] vector = values.vectorValue(ord); float score = vectorSimilarity.compare(vector, target); knnCollector.collect(doc, score); knnCollector.incVisitedCount(1); @@ -327,35 +327,18 @@ public class SimpleTextKnnVectorsReader extends KnnVectorsReader { } @Override - public float[] vectorValue() { - return values[curOrd]; + public float[] vectorValue(int ord) { + return values[ord]; } @Override - public int docID() { - if (curOrd == -1) { - return -1; - } else if (curOrd >= entry.size()) { - // when call to advance / nextDoc below already returns NO_MORE_DOCS, calling docID - // immediately afterward should also return NO_MORE_DOCS - // this is needed for TestSimpleTextKnnVectorsFormat.testAdvance test case - return NO_MORE_DOCS; - } - - return entry.ordToDoc[curOrd]; + public int ordToDoc(int ord) { + return entry.ordToDoc[ord]; } @Override - public int nextDoc() throws IOException { - if (++curOrd < entry.size()) { - return docID(); - } - return NO_MORE_DOCS; - } - - @Override - public int advance(int target) throws IOException { - return slowAdvance(target); + public DocIndexIterator iterator() { + return createSparseIterator(); } @Override @@ -365,17 +348,19 @@ public class SimpleTextKnnVectorsReader extends KnnVectorsReader { } SimpleTextFloatVectorValues simpleTextFloatVectorValues = new SimpleTextFloatVectorValues(this); + DocIndexIterator iterator = simpleTextFloatVectorValues.iterator(); return new VectorScorer() { @Override public float score() throws IOException { + int ord = iterator.index(); return entry .similarityFunction() - .compare(simpleTextFloatVectorValues.vectorValue(), target); + .compare(simpleTextFloatVectorValues.vectorValue(ord), target); } @Override public DocIdSetIterator iterator() { - return simpleTextFloatVectorValues; + return iterator; } }; } @@ -397,6 +382,11 @@ public class SimpleTextKnnVectorsReader extends KnnVectorsReader { value[i] = Float.parseFloat(floatStrings[i]); } } + + @Override + public SimpleTextFloatVectorValues copy() { 
+ return this; + } } private static class SimpleTextByteVectorValues extends ByteVectorValues { @@ -439,36 +429,19 @@ public class SimpleTextKnnVectorsReader extends KnnVectorsReader { } @Override - public byte[] vectorValue() { - binaryValue.bytes = values[curOrd]; + public byte[] vectorValue(int ord) { + binaryValue.bytes = values[ord]; return binaryValue.bytes; } @Override - public int docID() { - if (curOrd == -1) { - return -1; - } else if (curOrd >= entry.size()) { - // when call to advance / nextDoc below already returns NO_MORE_DOCS, calling docID - // immediately afterward should also return NO_MORE_DOCS - // this is needed for TestSimpleTextKnnVectorsFormat.testAdvance test case - return NO_MORE_DOCS; - } - - return entry.ordToDoc[curOrd]; + public int ordToDoc(int ord) { + return entry.ordToDoc[ord]; } @Override - public int nextDoc() throws IOException { - if (++curOrd < entry.size()) { - return docID(); - } - return NO_MORE_DOCS; - } - - @Override - public int advance(int target) throws IOException { - return slowAdvance(target); + public DocIndexIterator iterator() { + return createSparseIterator(); } @Override @@ -478,16 +451,19 @@ public class SimpleTextKnnVectorsReader extends KnnVectorsReader { } SimpleTextByteVectorValues simpleTextByteVectorValues = new SimpleTextByteVectorValues(this); return new VectorScorer() { + DocIndexIterator it = simpleTextByteVectorValues.iterator(); + @Override public float score() throws IOException { + int ord = it.index(); return entry .similarityFunction() - .compare(simpleTextByteVectorValues.vectorValue(), target); + .compare(simpleTextByteVectorValues.vectorValue(ord), target); } @Override public DocIdSetIterator iterator() { - return simpleTextByteVectorValues; + return it; } }; } @@ -509,6 +485,11 @@ public class SimpleTextKnnVectorsReader extends KnnVectorsReader { value[i] = (byte) Float.parseFloat(floatStrings[i]); } } + + @Override + public SimpleTextByteVectorValues copy() { + return this; + } } private int readInt(IndexInput in, BytesRef field) throws IOException { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsWriter.java index a7a76ac1bb9..eaf4b657755 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextKnnVectorsWriter.java @@ -28,6 +28,7 @@ import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.BytesRef; @@ -77,19 +78,18 @@ public class SimpleTextKnnVectorsWriter extends BufferingKnnVectorsWriter { throws IOException { long vectorDataOffset = vectorData.getFilePointer(); List docIds = new ArrayList<>(); - for (int docV = floatVectorValues.nextDoc(); - docV != NO_MORE_DOCS; - docV = floatVectorValues.nextDoc()) { - writeFloatVectorValue(floatVectorValues); - docIds.add(docV); + KnnVectorValues.DocIndexIterator iter = floatVectorValues.iterator(); + for (int docId = iter.nextDoc(); docId != NO_MORE_DOCS; docId = iter.nextDoc()) { + writeFloatVectorValue(floatVectorValues, iter.index()); + docIds.add(docId); } long vectorDataLength = vectorData.getFilePointer() - 
vectorDataOffset; writeMeta(fieldInfo, vectorDataOffset, vectorDataLength, docIds); } - private void writeFloatVectorValue(FloatVectorValues vectors) throws IOException { + private void writeFloatVectorValue(FloatVectorValues vectors, int ord) throws IOException { // write vector value - float[] value = vectors.vectorValue(); + float[] value = vectors.vectorValue(ord); assert value.length == vectors.dimension(); write(vectorData, Arrays.toString(value)); newline(vectorData); @@ -100,19 +100,18 @@ public class SimpleTextKnnVectorsWriter extends BufferingKnnVectorsWriter { throws IOException { long vectorDataOffset = vectorData.getFilePointer(); List docIds = new ArrayList<>(); - for (int docV = byteVectorValues.nextDoc(); - docV != NO_MORE_DOCS; - docV = byteVectorValues.nextDoc()) { - writeByteVectorValue(byteVectorValues); + KnnVectorValues.DocIndexIterator it = byteVectorValues.iterator(); + for (int docV = it.nextDoc(); docV != NO_MORE_DOCS; docV = it.nextDoc()) { + writeByteVectorValue(byteVectorValues, it.index()); docIds.add(docV); } long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset; writeMeta(fieldInfo, vectorDataOffset, vectorDataLength, docIds); } - private void writeByteVectorValue(ByteVectorValues vectors) throws IOException { + private void writeByteVectorValue(ByteVectorValues vectors, int ord) throws IOException { // write vector value - byte[] value = vectors.vectorValue(); + byte[] value = vectors.vectorValue(ord); assert value.length == vectors.dimension(); write(vectorData, Arrays.toString(value)); newline(vectorData); diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/bitvectors/TestHnswBitVectorsFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/bitvectors/TestHnswBitVectorsFormat.java index ab20ee67c8c..a0ea5833e2e 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/bitvectors/TestHnswBitVectorsFormat.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/bitvectors/TestHnswBitVectorsFormat.java @@ -22,7 +22,7 @@ import java.io.IOException; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.KnnVectorsFormat; -import org.apache.lucene.codecs.lucene912.Lucene912Codec; +import org.apache.lucene.codecs.lucene100.Lucene100Codec; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.KnnByteVectorField; @@ -42,7 +42,7 @@ import org.apache.lucene.tests.index.BaseIndexFileFormatTestCase; public class TestHnswBitVectorsFormat extends BaseIndexFileFormatTestCase { @Override protected Codec getCodec() { - return new Lucene912Codec() { + return new Lucene100Codec() { @Override public KnnVectorsFormat getKnnVectorsFormatForField(String field) { return new HnswBitVectorsFormat(); diff --git a/lucene/core/src/generated/checksums/generateForDeltaUtil.json b/lucene/core/src/generated/checksums/generateForDeltaUtil.json index 6546e25c4be..26ebc1198d9 100644 --- a/lucene/core/src/generated/checksums/generateForDeltaUtil.json +++ b/lucene/core/src/generated/checksums/generateForDeltaUtil.json @@ -1,4 +1,4 @@ { - "lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForDeltaUtil.java": "5115b12ac31537ce31d73c0a279df92060749a3a", - "lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForDeltaUtil.py": "db6154406e68b80d2c90116b5d0bfa9ba220762a" + "lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForDeltaUtil.java": "f561578ccb6a95364bb62c5ed86b38ff0b4a009d", + 
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForDeltaUtil.py": "eea1a71be9da8a13fdd979354dc4a8c6edf21be1" } \ No newline at end of file diff --git a/lucene/core/src/java/module-info.java b/lucene/core/src/java/module-info.java index a0f0bad01eb..6fd1767aa34 100644 --- a/lucene/core/src/java/module-info.java +++ b/lucene/core/src/java/module-info.java @@ -15,7 +15,7 @@ * limitations under the License. */ -import org.apache.lucene.codecs.lucene912.Lucene912Codec; +import org.apache.lucene.codecs.lucene100.Lucene100Codec; /** Lucene Core. */ @SuppressWarnings("module") // the test framework is compiled after the core... @@ -34,6 +34,7 @@ module org.apache.lucene.core { exports org.apache.lucene.codecs.lucene95; exports org.apache.lucene.codecs.lucene99; exports org.apache.lucene.codecs.lucene912; + exports org.apache.lucene.codecs.lucene100; exports org.apache.lucene.codecs.perfield; exports org.apache.lucene.codecs; exports org.apache.lucene.document; @@ -72,7 +73,7 @@ module org.apache.lucene.core { provides org.apache.lucene.analysis.TokenizerFactory with org.apache.lucene.analysis.standard.StandardTokenizerFactory; provides org.apache.lucene.codecs.Codec with - Lucene912Codec; + Lucene100Codec; provides org.apache.lucene.codecs.DocValuesFormat with org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; provides org.apache.lucene.codecs.KnnVectorsFormat with diff --git a/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java index 8a9b4816571..96b0f75a259 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java @@ -20,14 +20,16 @@ package org.apache.lucene.codecs; import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.function.Supplier; import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.DocsWithFieldSet; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.Sorter; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.VectorScorer; +import org.apache.lucene.index.SortingCodecReader; +import org.apache.lucene.index.SortingCodecReader.SortingValuesIterator; +import org.apache.lucene.search.DocIdSet; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.RamUsageEstimator; @@ -80,24 +82,26 @@ public abstract class BufferingKnnVectorsWriter extends KnnVectorsWriter { case FLOAT32: BufferedFloatVectorValues bufferedFloatVectorValues = new BufferedFloatVectorValues( - fieldData.docsWithField, (List) fieldData.vectors, - fieldData.fieldInfo.getVectorDimension()); + fieldData.fieldInfo.getVectorDimension(), + fieldData.docsWithField); FloatVectorValues floatVectorValues = sortMap != null - ? new SortingFloatVectorValues(bufferedFloatVectorValues, sortMap) + ? 
new SortingFloatVectorValues( + bufferedFloatVectorValues, fieldData.docsWithField, sortMap) : bufferedFloatVectorValues; writeField(fieldData.fieldInfo, floatVectorValues, maxDoc); break; case BYTE: BufferedByteVectorValues bufferedByteVectorValues = new BufferedByteVectorValues( - fieldData.docsWithField, (List) fieldData.vectors, - fieldData.fieldInfo.getVectorDimension()); + fieldData.fieldInfo.getVectorDimension(), + fieldData.docsWithField); ByteVectorValues byteVectorValues = sortMap != null - ? new SortingByteVectorValues(bufferedByteVectorValues, sortMap) + ? new SortingByteVectorValues( + bufferedByteVectorValues, fieldData.docsWithField, sortMap) : bufferedByteVectorValues; writeField(fieldData.fieldInfo, byteVectorValues, maxDoc); break; @@ -107,125 +111,77 @@ public abstract class BufferingKnnVectorsWriter extends KnnVectorsWriter { /** Sorting FloatVectorValues that iterate over documents in the order of the provided sortMap */ private static class SortingFloatVectorValues extends FloatVectorValues { - private final BufferedFloatVectorValues randomAccess; - private final int[] docIdOffsets; - private int docId = -1; + private final BufferedFloatVectorValues delegate; + private final Supplier iteratorSupplier; - SortingFloatVectorValues(BufferedFloatVectorValues delegate, Sorter.DocMap sortMap) + SortingFloatVectorValues( + BufferedFloatVectorValues delegate, DocsWithFieldSet docsWithField, Sorter.DocMap sortMap) throws IOException { - this.randomAccess = delegate.copy(); - this.docIdOffsets = new int[sortMap.size()]; - - int offset = 1; // 0 means no vector for this (field, document) - int docID; - while ((docID = delegate.nextDoc()) != NO_MORE_DOCS) { - int newDocID = sortMap.oldToNew(docID); - docIdOffsets[newDocID] = offset++; - } + this.delegate = delegate.copy(); + iteratorSupplier = SortingCodecReader.iteratorSupplier(delegate, sortMap); } @Override - public int docID() { - return docId; - } - - @Override - public int nextDoc() throws IOException { - while (docId < docIdOffsets.length - 1) { - ++docId; - if (docIdOffsets[docId] != 0) { - return docId; - } - } - docId = NO_MORE_DOCS; - return docId; - } - - @Override - public float[] vectorValue() throws IOException { - return randomAccess.vectorValue(docIdOffsets[docId] - 1); + public float[] vectorValue(int ord) throws IOException { + return delegate.vectorValue(ord); } @Override public int dimension() { - return randomAccess.dimension(); + return delegate.dimension(); } @Override public int size() { - return randomAccess.size(); + return delegate.size(); } @Override - public int advance(int target) throws IOException { + public SortingFloatVectorValues copy() { throw new UnsupportedOperationException(); } @Override - public VectorScorer scorer(float[] target) { - throw new UnsupportedOperationException(); + public DocIndexIterator iterator() { + return iteratorSupplier.get(); } } - /** Sorting FloatVectorValues that iterate over documents in the order of the provided sortMap */ + /** Sorting ByteVectorValues that iterate over documents in the order of the provided sortMap */ private static class SortingByteVectorValues extends ByteVectorValues { - private final BufferedByteVectorValues randomAccess; - private final int[] docIdOffsets; - private int docId = -1; + private final BufferedByteVectorValues delegate; + private final Supplier iteratorSupplier; - SortingByteVectorValues(BufferedByteVectorValues delegate, Sorter.DocMap sortMap) + SortingByteVectorValues( + BufferedByteVectorValues delegate, DocsWithFieldSet 
docsWithField, Sorter.DocMap sortMap) throws IOException { - this.randomAccess = delegate.copy(); - this.docIdOffsets = new int[sortMap.size()]; - - int offset = 1; // 0 means no vector for this (field, document) - int docID; - while ((docID = delegate.nextDoc()) != NO_MORE_DOCS) { - int newDocID = sortMap.oldToNew(docID); - docIdOffsets[newDocID] = offset++; - } + this.delegate = delegate; + iteratorSupplier = SortingCodecReader.iteratorSupplier(delegate, sortMap); } @Override - public int docID() { - return docId; - } - - @Override - public int nextDoc() throws IOException { - while (docId < docIdOffsets.length - 1) { - ++docId; - if (docIdOffsets[docId] != 0) { - return docId; - } - } - docId = NO_MORE_DOCS; - return docId; - } - - @Override - public byte[] vectorValue() throws IOException { - return randomAccess.vectorValue(docIdOffsets[docId] - 1); + public byte[] vectorValue(int ord) throws IOException { + return delegate.vectorValue(ord); } @Override public int dimension() { - return randomAccess.dimension(); + return delegate.dimension(); } @Override public int size() { - return randomAccess.size(); + return delegate.size(); } @Override - public int advance(int target) throws IOException { + public SortingByteVectorValues copy() { throw new UnsupportedOperationException(); } @Override - public VectorScorer scorer(byte[] target) { - throw new UnsupportedOperationException(); + public DocIndexIterator iterator() { + return iteratorSupplier.get(); } } @@ -296,7 +252,9 @@ public abstract class BufferingKnnVectorsWriter extends KnnVectorsWriter { @Override public final long ramBytesUsed() { - if (vectors.size() == 0) return 0; + if (vectors.isEmpty()) { + return 0; + } return docsWithField.ramBytesUsed() + vectors.size() * (long) @@ -307,25 +265,18 @@ public abstract class BufferingKnnVectorsWriter extends KnnVectorsWriter { } private static class BufferedFloatVectorValues extends FloatVectorValues { - final DocsWithFieldSet docsWithField; - // These are always the vectors of a VectorValuesWriter, which are copied when added to it final List vectors; final int dimension; + private final DocIdSet docsWithField; + private final DocIndexIterator iterator; - DocIdSetIterator docsWithFieldIter; - int ord = -1; - - BufferedFloatVectorValues( - DocsWithFieldSet docsWithField, List vectors, int dimension) { - this.docsWithField = docsWithField; + BufferedFloatVectorValues(List vectors, int dimension, DocIdSet docsWithField) + throws IOException { this.vectors = vectors; this.dimension = dimension; - docsWithFieldIter = docsWithField.iterator(); - } - - public BufferedFloatVectorValues copy() { - return new BufferedFloatVectorValues(docsWithField, vectors, dimension); + this.docsWithField = docsWithField; + this.iterator = fromDISI(docsWithField.iterator()); } @Override @@ -339,58 +290,39 @@ public abstract class BufferingKnnVectorsWriter extends KnnVectorsWriter { } @Override - public float[] vectorValue() { - return vectors.get(ord); + public int ordToDoc(int ord) { + return ord; } - float[] vectorValue(int targetOrd) { + @Override + public float[] vectorValue(int targetOrd) { return vectors.get(targetOrd); } @Override - public int docID() { - return docsWithFieldIter.docID(); + public DocIndexIterator iterator() { + return iterator; } @Override - public int nextDoc() throws IOException { - int docID = docsWithFieldIter.nextDoc(); - if (docID != NO_MORE_DOCS) { - ++ord; - } - return docID; - } - - @Override - public int advance(int target) { - throw new UnsupportedOperationException(); - } - - 
@Override - public VectorScorer scorer(float[] target) { - throw new UnsupportedOperationException(); + public BufferedFloatVectorValues copy() throws IOException { + return new BufferedFloatVectorValues(vectors, dimension, docsWithField); } } private static class BufferedByteVectorValues extends ByteVectorValues { - final DocsWithFieldSet docsWithField; - // These are always the vectors of a VectorValuesWriter, which are copied when added to it final List vectors; final int dimension; + private final DocIdSet docsWithField; + private final DocIndexIterator iterator; - DocIdSetIterator docsWithFieldIter; - int ord = -1; - - BufferedByteVectorValues(DocsWithFieldSet docsWithField, List vectors, int dimension) { - this.docsWithField = docsWithField; + BufferedByteVectorValues(List vectors, int dimension, DocIdSet docsWithField) + throws IOException { this.vectors = vectors; this.dimension = dimension; - docsWithFieldIter = docsWithField.iterator(); - } - - public BufferedByteVectorValues copy() { - return new BufferedByteVectorValues(docsWithField, vectors, dimension); + this.docsWithField = docsWithField; + iterator = fromDISI(docsWithField.iterator()); } @Override @@ -404,36 +336,18 @@ public abstract class BufferingKnnVectorsWriter extends KnnVectorsWriter { } @Override - public byte[] vectorValue() { - return vectors.get(ord); - } - - byte[] vectorValue(int targetOrd) { + public byte[] vectorValue(int targetOrd) { return vectors.get(targetOrd); } @Override - public int docID() { - return docsWithFieldIter.docID(); + public DocIndexIterator iterator() { + return iterator; } @Override - public int nextDoc() throws IOException { - int docID = docsWithFieldIter.nextDoc(); - if (docID != NO_MORE_DOCS) { - ++ord; - } - return docID; - } - - @Override - public int advance(int target) { - throw new UnsupportedOperationException(); - } - - @Override - public VectorScorer scorer(byte[] target) { - throw new UnsupportedOperationException(); + public BufferedByteVectorValues copy() throws IOException { + return new BufferedByteVectorValues(vectors, dimension, docsWithField); } } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java index e5a5dac8ff5..ff5a5bb21c0 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java @@ -55,7 +55,7 @@ public abstract class Codec implements NamedSPILoader.NamedSPI { return LOADER; } - static Codec defaultCodec = LOADER.lookup("Lucene912"); + static Codec defaultCodec = LOADER.lookup("Lucene100"); } private final String name; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/CompoundFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/CompoundFormat.java index 371e192887b..6a7e75f267e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/CompoundFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/CompoundFormat.java @@ -35,8 +35,8 @@ public abstract class CompoundFormat { // we can add 'producer' classes. /** Returns a Directory view (read-only) for the compound files in this segment */ - public abstract CompoundDirectory getCompoundReader( - Directory dir, SegmentInfo si, IOContext context) throws IOException; + public abstract CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si) + throws IOException; /** * Packs the provided segment's files into a compound format. 
All files referenced by the provided diff --git a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java index cbb906788e5..08c08ec5075 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java @@ -613,7 +613,7 @@ public abstract class DocValuesConsumer implements Closeable { if (docValuesProducer != null) { FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(fieldInfo.name); if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.SORTED) { - values = docValuesProducer.getSorted(fieldInfo); + values = docValuesProducer.getSorted(readerFieldInfo); } } if (values == null) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java index 3b185fd13a0..50af32a7e16 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java @@ -30,6 +30,7 @@ import org.apache.lucene.index.DocsWithFieldSet; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.Sorter; import org.apache.lucene.index.VectorEncoding; @@ -55,28 +56,26 @@ public abstract class KnnVectorsWriter implements Accountable, Closeable { @SuppressWarnings("unchecked") public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { switch (fieldInfo.getVectorEncoding()) { - case BYTE: + case BYTE -> { KnnFieldVectorsWriter byteWriter = (KnnFieldVectorsWriter) addField(fieldInfo); ByteVectorValues mergedBytes = MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState); - for (int doc = mergedBytes.nextDoc(); - doc != DocIdSetIterator.NO_MORE_DOCS; - doc = mergedBytes.nextDoc()) { - byteWriter.addValue(doc, mergedBytes.vectorValue()); + KnnVectorValues.DocIndexIterator iter = mergedBytes.iterator(); + for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) { + byteWriter.addValue(doc, mergedBytes.vectorValue(iter.index())); } - break; - case FLOAT32: + } + case FLOAT32 -> { KnnFieldVectorsWriter floatWriter = (KnnFieldVectorsWriter) addField(fieldInfo); FloatVectorValues mergedFloats = MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState); - for (int doc = mergedFloats.nextDoc(); - doc != DocIdSetIterator.NO_MORE_DOCS; - doc = mergedFloats.nextDoc()) { - floatWriter.addValue(doc, mergedFloats.vectorValue()); + KnnVectorValues.DocIndexIterator iter = mergedFloats.iterator(); + for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) { + floatWriter.addValue(doc, mergedFloats.vectorValue(iter.index())); } - break; + } } } @@ -117,32 +116,44 @@ public abstract class KnnVectorsWriter implements Accountable, Closeable { private static class FloatVectorValuesSub extends DocIDMerger.Sub { final FloatVectorValues values; + final KnnVectorValues.DocIndexIterator iterator; FloatVectorValuesSub(MergeState.DocMap docMap, FloatVectorValues values) { super(docMap); this.values = values; - assert values.docID() == -1; + this.iterator = values.iterator(); + assert iterator.docID() == -1; } @Override public int nextDoc() throws IOException { - return 
values.nextDoc(); + return iterator.nextDoc(); + } + + public int index() { + return iterator.index(); } } private static class ByteVectorValuesSub extends DocIDMerger.Sub { final ByteVectorValues values; + final KnnVectorValues.DocIndexIterator iterator; ByteVectorValuesSub(MergeState.DocMap docMap, ByteVectorValues values) { super(docMap); this.values = values; - assert values.docID() == -1; + iterator = values.iterator(); + assert iterator.docID() == -1; } @Override public int nextDoc() throws IOException { - return values.nextDoc(); + return iterator.nextDoc(); + } + + int index() { + return iterator.index(); } } @@ -287,7 +298,8 @@ public abstract class KnnVectorsWriter implements Accountable, Closeable { private final List subs; private final DocIDMerger docIdMerger; private final int size; - private int docId; + private int docId = -1; + private int lastOrd = -1; FloatVectorValuesSub current; private MergedFloat32VectorValues(List subs, MergeState mergeState) @@ -299,33 +311,59 @@ public abstract class KnnVectorsWriter implements Accountable, Closeable { totalSize += sub.values.size(); } size = totalSize; - docId = -1; } @Override - public int docID() { - return docId; + public DocIndexIterator iterator() { + return new DocIndexIterator() { + private int index = -1; + + @Override + public int docID() { + return docId; + } + + @Override + public int index() { + return index; + } + + @Override + public int nextDoc() throws IOException { + current = docIdMerger.next(); + if (current == null) { + docId = NO_MORE_DOCS; + index = NO_MORE_DOCS; + } else { + docId = current.mappedDocID; + ++lastOrd; + ++index; + } + return docId; + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long cost() { + return size; + } + }; } @Override - public int nextDoc() throws IOException { - current = docIdMerger.next(); - if (current == null) { - docId = NO_MORE_DOCS; - } else { - docId = current.mappedDocID; + public float[] vectorValue(int ord) throws IOException { + if (ord != lastOrd) { + throw new IllegalStateException( + "only supports forward iteration with a single iterator: ord=" + + ord + + ", lastOrd=" + + lastOrd); } - return docId; - } - - @Override - public float[] vectorValue() throws IOException { - return current.values.vectorValue(); - } - - @Override - public int advance(int target) { - throw new UnsupportedOperationException(); + return current.values.vectorValue(current.index()); } @Override @@ -338,10 +376,20 @@ public abstract class KnnVectorsWriter implements Accountable, Closeable { return subs.get(0).values.dimension(); } + @Override + public int ordToDoc(int ord) { + throw new UnsupportedOperationException(); + } + @Override public VectorScorer scorer(float[] target) { throw new UnsupportedOperationException(); } + + @Override + public FloatVectorValues copy() { + throw new UnsupportedOperationException(); + } } static class MergedByteVectorValues extends ByteVectorValues { @@ -349,7 +397,8 @@ public abstract class KnnVectorsWriter implements Accountable, Closeable { private final DocIDMerger docIdMerger; private final int size; - private int docId; + private int lastOrd = -1; + private int docId = -1; ByteVectorValuesSub current; private MergedByteVectorValues(List subs, MergeState mergeState) @@ -361,33 +410,57 @@ public abstract class KnnVectorsWriter implements Accountable, Closeable { totalSize += sub.values.size(); } size = totalSize; - docId = -1; } @Override - public byte[] 
vectorValue() throws IOException { - return current.values.vectorValue(); - } - - @Override - public int docID() { - return docId; - } - - @Override - public int nextDoc() throws IOException { - current = docIdMerger.next(); - if (current == null) { - docId = NO_MORE_DOCS; + public byte[] vectorValue(int ord) throws IOException { + if (ord != lastOrd + 1) { + throw new IllegalStateException( + "only supports forward iteration: ord=" + ord + ", lastOrd=" + lastOrd); } else { - docId = current.mappedDocID; + lastOrd = ord; } - return docId; + return current.values.vectorValue(current.index()); } @Override - public int advance(int target) { - throw new UnsupportedOperationException(); + public DocIndexIterator iterator() { + return new DocIndexIterator() { + private int index = -1; + + @Override + public int docID() { + return docId; + } + + @Override + public int index() { + return index; + } + + @Override + public int nextDoc() throws IOException { + current = docIdMerger.next(); + if (current == null) { + docId = NO_MORE_DOCS; + index = NO_MORE_DOCS; + } else { + docId = current.mappedDocID; + ++index; + } + return docId; + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long cost() { + return size; + } + }; } @Override @@ -400,10 +473,20 @@ public abstract class KnnVectorsWriter implements Accountable, Closeable { return subs.get(0).values.dimension(); } + @Override + public int ordToDoc(int ord) { + throw new UnsupportedOperationException(); + } + @Override public VectorScorer scorer(byte[] target) { throw new UnsupportedOperationException(); } + + @Override + public ByteVectorValues copy() { + throw new UnsupportedOperationException(); + } } } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java index 1274e1c789e..3e506037969 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java @@ -18,8 +18,10 @@ package org.apache.lucene.codecs.hnsw; import java.io.IOException; +import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; @@ -34,24 +36,26 @@ public class DefaultFlatVectorScorer implements FlatVectorsScorer { @Override public RandomVectorScorerSupplier getRandomVectorScorerSupplier( - VectorSimilarityFunction similarityFunction, RandomAccessVectorValues vectorValues) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues) throws IOException { - if (vectorValues instanceof RandomAccessVectorValues.Floats floatVectorValues) { - return new FloatScoringSupplier(floatVectorValues, similarityFunction); - } else if (vectorValues instanceof RandomAccessVectorValues.Bytes byteVectorValues) { - return new ByteScoringSupplier(byteVectorValues, similarityFunction); + switch (vectorValues.getEncoding()) { + case FLOAT32 -> { + return new FloatScoringSupplier((FloatVectorValues) vectorValues, similarityFunction); + } + case BYTE -> { + return new ByteScoringSupplier((ByteVectorValues) vectorValues, similarityFunction); + } 
} throw new IllegalArgumentException( - "vectorValues must be an instance of RandomAccessVectorValues.Floats or RandomAccessVectorValues.Bytes"); + "vectorValues must be an instance of FloatVectorValues or ByteVectorValues, got a " + + vectorValues.getClass().getName()); } @Override public RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityFunction, - RandomAccessVectorValues vectorValues, - float[] target) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, float[] target) throws IOException { - assert vectorValues instanceof RandomAccessVectorValues.Floats; + assert vectorValues instanceof FloatVectorValues; if (target.length != vectorValues.dimension()) { throw new IllegalArgumentException( "vector query dimension: " @@ -59,17 +63,14 @@ public class DefaultFlatVectorScorer implements FlatVectorsScorer { + " differs from field dimension: " + vectorValues.dimension()); } - return new FloatVectorScorer( - (RandomAccessVectorValues.Floats) vectorValues, target, similarityFunction); + return new FloatVectorScorer((FloatVectorValues) vectorValues, target, similarityFunction); } @Override public RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityFunction, - RandomAccessVectorValues vectorValues, - byte[] target) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, byte[] target) throws IOException { - assert vectorValues instanceof RandomAccessVectorValues.Bytes; + assert vectorValues instanceof ByteVectorValues; if (target.length != vectorValues.dimension()) { throw new IllegalArgumentException( "vector query dimension: " @@ -77,8 +78,7 @@ public class DefaultFlatVectorScorer implements FlatVectorsScorer { + " differs from field dimension: " + vectorValues.dimension()); } - return new ByteVectorScorer( - (RandomAccessVectorValues.Bytes) vectorValues, target, similarityFunction); + return new ByteVectorScorer((ByteVectorValues) vectorValues, target, similarityFunction); } @Override @@ -88,14 +88,13 @@ public class DefaultFlatVectorScorer implements FlatVectorsScorer { /** RandomVectorScorerSupplier for bytes vector */ private static final class ByteScoringSupplier implements RandomVectorScorerSupplier { - private final RandomAccessVectorValues.Bytes vectors; - private final RandomAccessVectorValues.Bytes vectors1; - private final RandomAccessVectorValues.Bytes vectors2; + private final ByteVectorValues vectors; + private final ByteVectorValues vectors1; + private final ByteVectorValues vectors2; private final VectorSimilarityFunction similarityFunction; private ByteScoringSupplier( - RandomAccessVectorValues.Bytes vectors, VectorSimilarityFunction similarityFunction) - throws IOException { + ByteVectorValues vectors, VectorSimilarityFunction similarityFunction) throws IOException { this.vectors = vectors; vectors1 = vectors.copy(); vectors2 = vectors.copy(); @@ -125,14 +124,13 @@ public class DefaultFlatVectorScorer implements FlatVectorsScorer { /** RandomVectorScorerSupplier for Float vector */ private static final class FloatScoringSupplier implements RandomVectorScorerSupplier { - private final RandomAccessVectorValues.Floats vectors; - private final RandomAccessVectorValues.Floats vectors1; - private final RandomAccessVectorValues.Floats vectors2; + private final FloatVectorValues vectors; + private final FloatVectorValues vectors1; + private final FloatVectorValues vectors2; private final VectorSimilarityFunction similarityFunction; private FloatScoringSupplier( - 
RandomAccessVectorValues.Floats vectors, VectorSimilarityFunction similarityFunction) - throws IOException { + FloatVectorValues vectors, VectorSimilarityFunction similarityFunction) throws IOException { this.vectors = vectors; vectors1 = vectors.copy(); vectors2 = vectors.copy(); @@ -162,14 +160,12 @@ public class DefaultFlatVectorScorer implements FlatVectorsScorer { /** A {@link RandomVectorScorer} for float vectors. */ private static class FloatVectorScorer extends RandomVectorScorer.AbstractRandomVectorScorer { - private final RandomAccessVectorValues.Floats values; + private final FloatVectorValues values; private final float[] query; private final VectorSimilarityFunction similarityFunction; public FloatVectorScorer( - RandomAccessVectorValues.Floats values, - float[] query, - VectorSimilarityFunction similarityFunction) { + FloatVectorValues values, float[] query, VectorSimilarityFunction similarityFunction) { super(values); this.values = values; this.query = query; @@ -184,14 +180,12 @@ public class DefaultFlatVectorScorer implements FlatVectorsScorer { /** A {@link RandomVectorScorer} for byte vectors. */ private static class ByteVectorScorer extends RandomVectorScorer.AbstractRandomVectorScorer { - private final RandomAccessVectorValues.Bytes values; + private final ByteVectorValues values; private final byte[] query; private final VectorSimilarityFunction similarityFunction; public ByteVectorScorer( - RandomAccessVectorValues.Bytes values, - byte[] query, - VectorSimilarityFunction similarityFunction) { + ByteVectorValues values, byte[] query, VectorSimilarityFunction similarityFunction) { super(values); this.values = values; this.query = query; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorsScorer.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorsScorer.java index 17430c24f27..6ed170731de 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorsScorer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorsScorer.java @@ -18,8 +18,8 @@ package org.apache.lucene.codecs.hnsw; import java.io.IOException; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; @@ -40,7 +40,19 @@ public interface FlatVectorsScorer { * @throws IOException if an I/O error occurs */ RandomVectorScorerSupplier getRandomVectorScorerSupplier( - VectorSimilarityFunction similarityFunction, RandomAccessVectorValues vectorValues) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues) throws IOException; + + /** + * Returns a {@link RandomVectorScorer} for the given set of vectors and target vector. + * + * @param similarityFunction the similarity function to use + * @param vectorValues the vector values to score + * @param target the target vector + * @return a {@link RandomVectorScorer} for the given field and target vector. + * @throws IOException if an I/O error occurs when reading from the index. + */ + RandomVectorScorer getRandomVectorScorer( + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, float[] target) throws IOException; /** @@ -53,23 +65,6 @@ public interface FlatVectorsScorer { * @throws IOException if an I/O error occurs when reading from the index. 
*/ RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityFunction, - RandomAccessVectorValues vectorValues, - float[] target) - throws IOException; - - /** - * Returns a {@link RandomVectorScorer} for the given set of vectors and target vector. - * - * @param similarityFunction the similarity function to use - * @param vectorValues the vector values to score - * @param target the target vector - * @return a {@link RandomVectorScorer} for the given field and target vector. - * @throws IOException if an I/O error occurs when reading from the index. - */ - RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityFunction, - RandomAccessVectorValues vectorValues, - byte[] target) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, byte[] target) throws IOException; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/ScalarQuantizedVectorScorer.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/ScalarQuantizedVectorScorer.java index 4b73e1f7a4a..ceb826aa3a1 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/ScalarQuantizedVectorScorer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/ScalarQuantizedVectorScorer.java @@ -18,13 +18,13 @@ package org.apache.lucene.codecs.hnsw; import java.io.IOException; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.VectorUtil; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; -import org.apache.lucene.util.quantization.RandomAccessQuantizedByteVectorValues; +import org.apache.lucene.util.quantization.QuantizedByteVectorValues; import org.apache.lucene.util.quantization.ScalarQuantizedVectorSimilarity; import org.apache.lucene.util.quantization.ScalarQuantizer; @@ -60,9 +60,9 @@ public class ScalarQuantizedVectorScorer implements FlatVectorsScorer { @Override public RandomVectorScorerSupplier getRandomVectorScorerSupplier( - VectorSimilarityFunction similarityFunction, RandomAccessVectorValues vectorValues) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues) throws IOException { - if (vectorValues instanceof RandomAccessQuantizedByteVectorValues quantizedByteVectorValues) { + if (vectorValues instanceof QuantizedByteVectorValues quantizedByteVectorValues) { return new ScalarQuantizedRandomVectorScorerSupplier( similarityFunction, quantizedByteVectorValues.getScalarQuantizer(), @@ -74,11 +74,9 @@ public class ScalarQuantizedVectorScorer implements FlatVectorsScorer { @Override public RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityFunction, - RandomAccessVectorValues vectorValues, - float[] target) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, float[] target) throws IOException { - if (vectorValues instanceof RandomAccessQuantizedByteVectorValues quantizedByteVectorValues) { + if (vectorValues instanceof QuantizedByteVectorValues quantizedByteVectorValues) { ScalarQuantizer scalarQuantizer = quantizedByteVectorValues.getScalarQuantizer(); byte[] targetBytes = new byte[target.length]; float offsetCorrection = @@ -104,9 +102,7 @@ public class ScalarQuantizedVectorScorer implements FlatVectorsScorer { @Override public RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityFunction, - 
RandomAccessVectorValues vectorValues, - byte[] target) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, byte[] target) throws IOException { return nonQuantizedDelegate.getRandomVectorScorer(similarityFunction, vectorValues, target); } @@ -124,14 +120,14 @@ public class ScalarQuantizedVectorScorer implements FlatVectorsScorer { public static class ScalarQuantizedRandomVectorScorerSupplier implements RandomVectorScorerSupplier { - private final RandomAccessQuantizedByteVectorValues values; + private final QuantizedByteVectorValues values; private final ScalarQuantizedVectorSimilarity similarity; private final VectorSimilarityFunction vectorSimilarityFunction; public ScalarQuantizedRandomVectorScorerSupplier( VectorSimilarityFunction similarityFunction, ScalarQuantizer scalarQuantizer, - RandomAccessQuantizedByteVectorValues values) { + QuantizedByteVectorValues values) { this.similarity = ScalarQuantizedVectorSimilarity.fromVectorSimilarity( similarityFunction, @@ -144,7 +140,7 @@ public class ScalarQuantizedVectorScorer implements FlatVectorsScorer { private ScalarQuantizedRandomVectorScorerSupplier( ScalarQuantizedVectorSimilarity similarity, VectorSimilarityFunction vectorSimilarityFunction, - RandomAccessQuantizedByteVectorValues values) { + QuantizedByteVectorValues values) { this.similarity = similarity; this.values = values; this.vectorSimilarityFunction = vectorSimilarityFunction; @@ -152,7 +148,7 @@ public class ScalarQuantizedVectorScorer implements FlatVectorsScorer { @Override public RandomVectorScorer scorer(int ord) throws IOException { - final RandomAccessQuantizedByteVectorValues vectorsCopy = values.copy(); + final QuantizedByteVectorValues vectorsCopy = values.copy(); final byte[] queryVector = values.vectorValue(ord); final float queryOffset = values.getScoreCorrectionConstant(ord); return new RandomVectorScorer.AbstractRandomVectorScorer(vectorsCopy) {
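Context for the scorer and writer changes above: with RandomAccessVectorValues removed, FloatVectorValues and ByteVectorValues are themselves ordinal-addressed, and document iteration moves to a separate DocIndexIterator. A minimal sketch of the new read idiom, assuming a LeafReader in scope and an illustrative field name "vec" (neither is part of this patch):

  // imports: java.io.IOException,
  //          org.apache.lucene.index.{FloatVectorValues, KnnVectorValues, LeafReader},
  //          org.apache.lucene.search.DocIdSetIterator
  static void consumeVectors(LeafReader reader) throws IOException {
    FloatVectorValues values = reader.getFloatVectorValues("vec");
    if (values == null) {
      return; // this segment has no float vectors for the field
    }
    KnnVectorValues.DocIndexIterator it = values.iterator();
    for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
      float[] vector = values.vectorValue(it.index()); // fetch by ordinal, not by docID
      // ... use (doc, vector) ...
    }
  }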
+ */ +package org.apache.lucene.codecs.lucene100; + +import java.util.Objects; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.CompoundFormat; +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.FieldInfosFormat; +import org.apache.lucene.codecs.FilterCodec; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.LiveDocsFormat; +import org.apache.lucene.codecs.NormsFormat; +import org.apache.lucene.codecs.PointsFormat; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.SegmentInfoFormat; +import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.TermVectorsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat; +import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; +import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat; +import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat; +import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat; +import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat; +import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; +import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; +import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; + +/** + * Implements the Lucene 10.0 index format + * + *

If you want to reuse functionality of this codec in another codec, extend {@link FilterCodec}. + * + * @see org.apache.lucene.codecs.lucene100 package documentation for file format details. + * @lucene.experimental + */ +public class Lucene100Codec extends Codec { + + /** Configuration option for the codec. */ + public enum Mode { + /** Trade compression ratio for retrieval speed. */ + BEST_SPEED(Lucene90StoredFieldsFormat.Mode.BEST_SPEED), + /** Trade retrieval speed for compression ratio. */ + BEST_COMPRESSION(Lucene90StoredFieldsFormat.Mode.BEST_COMPRESSION); + + private final Lucene90StoredFieldsFormat.Mode storedMode; + + private Mode(Lucene90StoredFieldsFormat.Mode storedMode) { + this.storedMode = Objects.requireNonNull(storedMode); + } + } + + private final TermVectorsFormat vectorsFormat = new Lucene90TermVectorsFormat(); + private final FieldInfosFormat fieldInfosFormat = new Lucene94FieldInfosFormat(); + private final SegmentInfoFormat segmentInfosFormat = new Lucene99SegmentInfoFormat(); + private final LiveDocsFormat liveDocsFormat = new Lucene90LiveDocsFormat(); + private final CompoundFormat compoundFormat = new Lucene90CompoundFormat(); + private final NormsFormat normsFormat = new Lucene90NormsFormat(); + + private final PostingsFormat defaultPostingsFormat; + private final PostingsFormat postingsFormat = + new PerFieldPostingsFormat() { + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return Lucene100Codec.this.getPostingsFormatForField(field); + } + }; + + private final DocValuesFormat defaultDVFormat; + private final DocValuesFormat docValuesFormat = + new PerFieldDocValuesFormat() { + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return Lucene100Codec.this.getDocValuesFormatForField(field); + } + }; + + private final KnnVectorsFormat defaultKnnVectorsFormat; + private final KnnVectorsFormat knnVectorsFormat = + new PerFieldKnnVectorsFormat() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return Lucene100Codec.this.getKnnVectorsFormatForField(field); + } + }; + + private final StoredFieldsFormat storedFieldsFormat; + + /** Instantiates a new codec. */ + public Lucene100Codec() { + this(Mode.BEST_SPEED); + } + + /** + * Instantiates a new codec, specifying the stored fields compression mode to use. + * + * @param mode stored fields compression mode to use for newly flushed/merged segments. 
+ */ + public Lucene100Codec(Mode mode) { + super("Lucene100"); + this.storedFieldsFormat = + new Lucene90StoredFieldsFormat(Objects.requireNonNull(mode).storedMode); + this.defaultPostingsFormat = new Lucene912PostingsFormat(); + this.defaultDVFormat = new Lucene90DocValuesFormat(); + this.defaultKnnVectorsFormat = new Lucene99HnswVectorsFormat(); + } + + @Override + public final StoredFieldsFormat storedFieldsFormat() { + return storedFieldsFormat; + } + + @Override + public final TermVectorsFormat termVectorsFormat() { + return vectorsFormat; + } + + @Override + public final PostingsFormat postingsFormat() { + return postingsFormat; + } + + @Override + public final FieldInfosFormat fieldInfosFormat() { + return fieldInfosFormat; + } + + @Override + public final SegmentInfoFormat segmentInfoFormat() { + return segmentInfosFormat; + } + + @Override + public final LiveDocsFormat liveDocsFormat() { + return liveDocsFormat; + } + + @Override + public final CompoundFormat compoundFormat() { + return compoundFormat; + } + + @Override + public final PointsFormat pointsFormat() { + return new Lucene90PointsFormat(); + } + + @Override + public final KnnVectorsFormat knnVectorsFormat() { + return knnVectorsFormat; + } + + /** + * Returns the postings format that should be used for writing new segments of field. + * + *

The default implementation always returns "Lucene912". + * + *

      WARNING: if you subclass, you are responsible for index backwards compatibility: + * future versions of Lucene are only guaranteed to be able to read the default implementation. + */ + public PostingsFormat getPostingsFormatForField(String field) { + return defaultPostingsFormat; + } + + /** + * Returns the docvalues format that should be used for writing new segments of field. + * + *
      

The default implementation always returns "Lucene90". + * + *

      WARNING: if you subclass, you are responsible for index backwards compatibility: + * future versions of Lucene are only guaranteed to be able to read the default implementation. + */ + public DocValuesFormat getDocValuesFormatForField(String field) { + return defaultDVFormat; + } + + /** + * Returns the vectors format that should be used for writing new segments of field. + * + *
      

The default implementation always returns "Lucene99HnswVectorsFormat". + * + *

      WARNING: if you subclass, you are responsible for index backwards compatibility: + * future versions of Lucene are only guaranteed to be able to read the default implementation. + */ + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return defaultKnnVectorsFormat; + } + + @Override + public final DocValuesFormat docValuesFormat() { + return docValuesFormat; + } + + @Override + public final NormsFormat normsFormat() { + return normsFormat; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene100/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene100/package-info.java new file mode 100644 index 00000000000..64189bfa9d1 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene100/package-info.java @@ -0,0 +1,433 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Lucene 10.0 file format. + * + *
      

Apache Lucene - Index File Formats

+ * + * + * + *

Introduction

+ * + *
+ * + *

This document defines the index file formats used in this version of Lucene. If you are using + * a different version of Lucene, please consult the copy of docs/ that was distributed + * with the version you are using. + * + *

This document attempts to provide a high-level definition of the Apache Lucene file formats. + *

+ * + *

Definitions

+ * + *
+ * + *

The fundamental concepts in Lucene are index, document, field and term. + * + *

An index contains a sequence of documents. + * + *

    + *
  • A document is a sequence of fields. + *
  • A field is a named sequence of terms. + *
  • A term is a sequence of bytes. + *
+ * + *

The same sequence of bytes in two different fields is considered a different term. Thus terms + * are represented as a pair: the string naming the field, and the bytes within the field. + * + *
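      For illustration only (the field names and value below are made up), Lucene's Term API mirrors this pairing directly:

      import org.apache.lucene.index.Term;
      import org.apache.lucene.util.BytesRef;

      // Identical bytes under two different field names are two distinct terms.
      Term title = new Term("title", new BytesRef("lucene"));
      Term body = new Term("body", new BytesRef("lucene"));
      assert title.equals(body) == false; // the field name is part of term identity
      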

Inverted Indexing

+ * + *

Lucene's index stores terms and statistics about those terms in order to make term-based + * search more efficient. Lucene's terms index falls into the family of indexes known as an + * inverted index. This is because it can list, for a term, the documents that contain it. + * This is the inverse of the natural relationship, in which documents list terms. + * + *

Types of Fields

+ * + *

In Lucene, fields may be stored, in which case their text is stored in the index + * literally, in a non-inverted manner. Fields that are inverted are called indexed. A field + * may be both stored and indexed. + * + *

The text of a field may be tokenized into terms to be indexed, or the text of a field + * may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is + * useful for certain identifier fields to be indexed literally. + * + *

See the {@link org.apache.lucene.document.Field Field} java docs for more information on + * Fields. + * + *
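      A minimal sketch of the two styles (field names and values here are hypothetical), using the existing document API:

      import org.apache.lucene.document.Document;
      import org.apache.lucene.document.Field;
      import org.apache.lucene.document.StringField;
      import org.apache.lucene.document.TextField;

      Document doc = new Document();
      // Tokenized by the analyzer into terms, and also stored:
      doc.add(new TextField("body", "some searchable text", Field.Store.YES));
      // Indexed literally as a single term; useful for identifiers:
      doc.add(new StringField("id", "doc-42", Field.Store.YES));
      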

Segments

+ * + *

Lucene indexes may be composed of multiple sub-indexes, or segments. Each segment is a + * fully independent index, which could be searched separately. Indexes evolve by: + * + *

    + *
  1. Creating new segments for newly added documents. + *
  2. Merging existing segments. + *
+ * + *

Searches may involve multiple segments and/or multiple indexes, each index potentially + * composed of a set of segments. + * + *
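      A minimal sketch of such a search, assuming two existing index directories at hypothetical paths; each reader already spans all segments of its index, and MultiReader aggregates the two:

      import java.nio.file.Paths;
      import org.apache.lucene.index.DirectoryReader;
      import org.apache.lucene.index.IndexReader;
      import org.apache.lucene.index.MultiReader;
      import org.apache.lucene.search.IndexSearcher;
      import org.apache.lucene.store.FSDirectory;

      IndexReader reader =
          new MultiReader(
              DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index-a"))),
              DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index-b"))));
      IndexSearcher searcher = new IndexSearcher(reader);
      // searcher.search(query, 10) now sees every segment of both indexes
      reader.close(); // also closes the sub-readers
      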

Document Numbers

+ * + *

Internally, Lucene refers to documents by an integer document number. The first + * document added to an index is numbered zero, and each subsequent document added gets a number one + * greater than the previous. + * + *

Note that a document's number may change, so caution should be taken when storing these + * numbers outside of Lucene. In particular, numbers may change in the following situations: + * + *

    + *
  • + *

      The numbers stored in each segment are unique only within the segment, and must be + * converted before they can be used in a larger context. The standard technique is to + * allocate each segment a range of values, based on the range of numbers used in that + * segment. To convert a document number from a segment to an external value, the segment's + * base document number is added. To convert an external value back to a + * segment-specific value, the segment is identified by the range that the external value is + * in, and the segment's base value is subtracted. For example, two five-document segments + * might be combined, so that the first segment has a base value of zero, and the second of + * five. Document three from the second segment would have an external value of eight (see + * the sketch after this list). +
      

  • + *

    When documents are deleted, gaps are created in the numbering. These are eventually + * removed as the index evolves through merging. Deleted documents are dropped when segments + * are merged. A freshly-merged segment thus has no gaps in its numbering. + *

+ * + *
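      A minimal sketch of the base-value arithmetic described in the list above (the helper names toGlobal/toLocal are made up; LeafReaderContext.docBase and ReaderUtil.subIndex are the real hooks):

      import java.util.List;
      import org.apache.lucene.index.IndexReader;
      import org.apache.lucene.index.LeafReaderContext;
      import org.apache.lucene.index.ReaderUtil;

      // Segment-local -> index-wide: add the owning segment's base document number.
      static int toGlobal(IndexReader reader, int leafOrd, int localDoc) {
        return reader.leaves().get(leafOrd).docBase + localDoc;
      }

      // Index-wide -> segment-local: find the segment whose range contains the doc, subtract its base.
      static int toLocal(IndexReader reader, int globalDoc) {
        List<LeafReaderContext> leaves = reader.leaves();
        LeafReaderContext leaf = leaves.get(ReaderUtil.subIndex(globalDoc, leaves));
        return globalDoc - leaf.docBase;
      }
      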
+ * + *

Index Structure Overview

+ * + *
+ * + *

Each segment index maintains the following: + * + *

    + *
      • {@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment info}. This + * contains metadata about a segment, such as the number of documents, what files it uses, and + * information about how the segment is sorted.
      
  • {@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This + * contains metadata about the set of named fields used in the index. + *
      • {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}. + * This contains, for each document, a list of attribute-value pairs, where the attributes are + * field names. These are used to store auxiliary information about the document, such as its + * title, url, or an identifier to access a database. The set of stored fields is what is + * returned for each hit when searching. This is keyed by document number.
      
  • {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term dictionary}. A + * dictionary containing all of the terms used in all of the indexed fields of all of the + * documents. The dictionary also contains the number of documents which contain the term, and + * pointers to the term's frequency and proximity data. + *
  • {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Frequency data}. For + * each term in the dictionary, the numbers of all the documents that contain that term, and + * the frequency of the term in that document, unless frequencies are omitted ({@link + * org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS}) + *
  • {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Proximity data}. For + * each term in the dictionary, the positions that the term occurs in each document. Note that + * this will not exist if all fields in all documents omit position data. + *
  • {@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For + * each field in each document, a value is stored that is multiplied into the score for hits + * on that field. + *
  • {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each + * field in each document, the term vector (sometimes called document vector) may be stored. A + * term vector consists of term text and term frequency. To add Term Vectors to your index see + * the {@link org.apache.lucene.document.Field Field} constructors + *
  • {@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like + * stored values, these are also keyed by document number, but are generally intended to be + * loaded into main memory for fast access. Whereas stored values are generally intended for + * summary results from searches, per-document values are useful for things like scoring + * factors. + *
  • {@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An + * optional file indicating which documents are live. + *
  • {@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair + * of files, recording dimensionally indexed fields, to enable fast numeric range filtering + * and large numeric values like BigInteger and BigDecimal (1D) and geographic shape + * intersection (2D, 3D). + *
  • {@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}. The + * vector format stores numeric vectors in a format optimized for random access and + * computation, supporting high-dimensional nearest-neighbor search. + *
+ * + *

Details on each of these are provided in their linked pages.

+ * + *

File Naming

+ * + *
+ * + *

      All files belonging to a segment have the same name with varying extensions. The extensions + * correspond to the different file formats described below. When using the Compound File format + * (default for small segments) these files (except for the Segment info file, the Lock file, and + * Deleted documents file) are collapsed into a single .cfs file (see below for details). + * + *
      
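      Whether newly flushed segments are written as compound files is a per-writer setting; a minimal sketch (the merge policy may additionally decide for merged segments):

      import org.apache.lucene.index.IndexWriterConfig;

      IndexWriterConfig config = new IndexWriterConfig();
      config.setUseCompoundFile(true); // flushed segments are packed into .cfs/.cfe
      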

Typically, all segments in an index are stored in a single directory, although this is not + * required. + * + *

      File names are never re-used. That is, when any file is saved to the Directory it is given a + * never-before-used filename. This is achieved using a simple generations approach. For example, + * the first segments file is segments_1, then segments_2, etc. The generation is a sequential long + * integer represented in alphanumeric (base 36) form.
      
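      A small sketch of the generation-to-name rule; in base 36, generations count 1 ... 9, a ... z, 10 (= 36), and so on (Lucene's own helper for this is IndexFileNames.fileNameFromGeneration):

      long generation = 36;
      String segmentsFile = "segments_" + Long.toString(generation, Character.MAX_RADIX);
      // segmentsFile == "segments_10", since Character.MAX_RADIX == 36
      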

+ * + *

Summary of File Extensions

+ * + *
+ * + *

The following table summarizes the names and extensions of the files in Lucene: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
      lucene filenames by extension
      Name | Extension | Brief Description
      {@link org.apache.lucene.index.SegmentInfos Segments File} | segments_N | Stores information about a commit point
      Lock File | write.lock | The write lock prevents multiple IndexWriters from writing to the same index
      {@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment Info} | .si | Stores metadata about a segment
      {@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File} | .cfs, .cfe | An optional "virtual" file consisting of all the other index files, for systems that frequently run out of file handles
      {@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields} | .fnm | Stores information about the fields
      {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index} | .fdx | Contains pointers to field data
      {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data} | .fdt | The stored fields for documents
      {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Dictionary} | .tim | The term dictionary, stores term info
      {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Index} | .tip | The index into the Term Dictionary
      {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Frequencies} | .doc | Contains the list of docs which contain each term along with frequency
      {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Positions} | .pos | Stores position information about where a term occurs in the index
      {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Payloads} | .pay | Stores additional per-position metadata such as character offsets and user payloads
      {@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms} | .nvd, .nvm | Encodes length and boost factors for docs and fields
      {@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values} | .dvd, .dvm | Encodes additional scoring factors or other per-document information
      {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index} | .tvx | Stores offsets into the document data file
      {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data} | .tvd | Contains term vector data
      {@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents} | .liv | Info about what documents are live
      {@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values} | .kdd, .kdi, .kdm | Holds indexed points
      {@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values} | .vec, .vem, .veq, .vex | Holds indexed vectors; .vec files contain the raw vector data, .vem the vector metadata, .veq the quantized vector data, and .vex the HNSW graph data
      
+ * + *

+ * + *

Lock File

+ * + * The write lock, which is stored in the index directory by default, is named "write.lock". If the + * lock directory is different from the index directory then the write lock will be named + * "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index + * directory. When this file is present, a writer is currently modifying the index (adding or + * removing documents). This lock file ensures that only one writer is modifying the index at a + * time. + * + *
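      A minimal sketch of that guarantee (the index path is hypothetical): a second IndexWriter opened on a locked directory fails fast with LockObtainFailedException rather than corrupting the index.

      import java.nio.file.Paths;
      import org.apache.lucene.index.IndexWriter;
      import org.apache.lucene.index.IndexWriterConfig;
      import org.apache.lucene.store.Directory;
      import org.apache.lucene.store.FSDirectory;
      import org.apache.lucene.store.LockObtainFailedException;

      Directory dir = FSDirectory.open(Paths.get("/path/to/index"));
      try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
        // write.lock is now held by `writer`
        try {
          new IndexWriter(dir, new IndexWriterConfig()); // second writer, same directory
        } catch (LockObtainFailedException expected) {
          // expected: only one IndexWriter may modify the index at a time
        }
      }
      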

History

+ * + *

Compatibility notes are provided in this document, describing how file formats have changed + * from prior versions: + * + *

    + *
  • In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit + * lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching + * or adding/deleting of docs. When the new segments file is saved (committed), it will be + * written in the new file format (meaning no specific "upgrade" process is needed). But note + * that once a commit has occurred, pre-2.1 Lucene will not be able to read the index. + *
  • In version 2.3, the file format was changed to allow segments to share a single set of doc + * store (vectors & stored fields) files. This allows for faster indexing in certain + * cases. The change is fully backwards compatible (in the same way as the lock-less commits + * change in 2.1). + *
      • In version 2.4, Strings are now written as a true UTF-8 byte sequence, not Java's modified + * UTF-8. See LUCENE-510 for + * details.
      
  • In version 2.9, an optional opaque Map<String,String> CommitUserData may be passed to + * IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N + * file. See LUCENE-1382 for + * details. Also, diagnostics were added to each segment written recording details about why + * it was written (due to flush, merge; which OS/JRE was used; etc.). See issue LUCENE-1654 for details. + *
  • In version 3.0, compressed fields are no longer written to the index (they can still be + * read, but on merge the new segment will write them, uncompressed). See issue LUCENE-1960 for details. + *
      • In version 3.1, segments record the code version that created them. See LUCENE-2720 for details. + * Additionally, segments explicitly track whether or not they have term vectors. See LUCENE-2811 for details.
      • In version 3.2, numeric fields are written natively to the stored fields file; previously + * they were stored in text format only.
      
  • In version 3.4, fields can omit position data while still indexing term frequencies. + *
  • In version 4.0, the format of the inverted index became extensible via the {@link + * org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues}) + * was introduced. Normalization factors need no longer be a single byte, they can be any + * {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be + * unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into + * the postings lists. Payloads can be stored in the term vectors. + *
      • In version 4.1, the format of the postings list changed to use either FOR compression or + * variable-byte encoding, depending upon the frequency of the term. Terms appearing only once + * are now inlined directly into the term dictionary. Stored fields are compressed by + * default.
      
  • In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued + * type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields. + *
  • In version 4.5, DocValues were extended to explicitly represent missing values. + *
  • In version 4.6, FieldInfos were extended to support per-field DocValues generation, to + * allow updating NumericDocValues fields. + *
  • In version 4.8, checksum footers were added to the end of each index file for improved data + * integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32 + * checksum of the file. + *
  • In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is + * suitable for faceting/sorting/analytics. + *
  • In version 5.4, DocValues have been improved to store more information on disk: addresses + * for binary fields and ord indexes for multi-valued fields. + *
  • In version 6.0, Points were added, for multi-dimensional range/distance search. + *
      • In version 6.2, a new segment info format was introduced that reads/writes the index sort, to + * support index sorting.
      
  • In version 7.0, DocValues have been improved to better support sparse doc values thanks to + * an iterator API. + *
      • In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term + * freq, normalization factor) pairs that may trigger the maximum score of the block. This + * information is recorded alongside skip data in order to be able to skip blocks of doc ids + * if they may not produce high enough scores. Additionally, doc values and norms have been + * extended with jump-tables to make access O(1) instead of O(n), where n is the number of + * elements to skip when advancing in the data.
      • In version 8.4, postings, positions, offsets and payload lengths have moved to a more + * performant encoding that is vectorized.
      • In version 8.6, index sort serialization is delegated to the sorts themselves, to allow + * user-defined sorts to be used.
      • In version 8.6, points fields split the index tree and leaf data into separate files, to + * allow for different access patterns to the different data structures.
      
  • In version 8.7, stored fields compression became adaptive to better handle documents with + * smaller stored fields. + *
  • In version 9.0, vector-valued fields were added. + *
  • In version 9.1, vector-valued fields were modified to add a graph hierarchy. + *
      • In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by + * IndexedDISI. An ordToDoc mapping was added to .vem.
      
  • In version 9.5, HNSW graph connections were changed to be delta-encoded with vints. + * Additionally, metadata file size improvements were made by delta-encoding nodes by graph + * layer and not writing the node ids for the zeroth layer. + *
      • In version 9.9, vector scalar quantization support was added, allowing the HNSW vector + * format to use int8-quantized vectors for float32 vector search.
      • In version 9.12, skip data was refactored to have only two levels: every 128 docs and every + * 4,096 docs, and to be inlined in postings lists. This resulted in a speedup for queries that + * need skipping, especially conjunctions.
      
+ * + * + * + *

Limitations

+ * + *
+ * + *

Lucene uses a Java int to refer to document numbers, and the index file format + * uses an Int32 on-disk to store document numbers. This is a limitation of both the + * index file format and the current implementation. Eventually these should be replaced with either + * UInt64 values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt + * VInt} values which have no limit.

+ */ +package org.apache.lucene.codecs.lucene100; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/IndexedDISI.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/IndexedDISI.java index a2b2c84e12a..dbd56125fcd 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/IndexedDISI.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/IndexedDISI.java @@ -18,6 +18,7 @@ package org.apache.lucene.codecs.lucene90; import java.io.DataInput; import java.io.IOException; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; @@ -439,6 +440,40 @@ public final class IndexedDISI extends DocIdSetIterator { // ALL variables int gap; + /** + * Returns an iterator that delegates to the IndexedDISI. Advancing this iterator will advance the + * underlying IndexedDISI, and vice-versa. + */ + public static KnnVectorValues.DocIndexIterator asDocIndexIterator(IndexedDISI disi) { + // can we replace with fromDISI? + return new KnnVectorValues.DocIndexIterator() { + @Override + public int docID() { + return disi.docID(); + } + + @Override + public int index() { + return disi.index(); + } + + @Override + public int nextDoc() throws IOException { + return disi.nextDoc(); + } + + @Override + public int advance(int target) throws IOException { + return disi.advance(target); + } + + @Override + public long cost() { + return disi.cost(); + } + }; + } + @Override public int docID() { return doc; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java index fd9ec5f9c28..80b98e0a4c5 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java @@ -82,9 +82,8 @@ public final class Lucene90CompoundFormat extends CompoundFormat { public Lucene90CompoundFormat() {} @Override - public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si, IOContext context) - throws IOException { - return new Lucene90CompoundReader(dir, si, context); + public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si) throws IOException { + return new Lucene90CompoundReader(dir, si); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundReader.java index ee9c9ae40fa..8f6211bc959 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundReader.java @@ -30,6 +30,7 @@ import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.util.CollectionUtil; import org.apache.lucene.util.IOUtils; @@ -56,8 +57,7 @@ final class Lucene90CompoundReader extends CompoundDirectory { /** Create a new CompoundFileDirectory. */ // TODO: we should just pre-strip "entries" and append segment name up-front like simpletext? 
// this need not be a "general purpose" directory anymore (it only writes index files) - public Lucene90CompoundReader(Directory directory, SegmentInfo si, IOContext context) - throws IOException { + public Lucene90CompoundReader(Directory directory, SegmentInfo si) throws IOException { this.directory = directory; this.segmentName = si.name; String dataFileName = @@ -75,7 +75,7 @@ final class Lucene90CompoundReader extends CompoundDirectory { .orElseGet(() -> CodecUtil.indexHeaderLength(Lucene90CompoundFormat.DATA_CODEC, "")) + CodecUtil.footerLength(); - handle = directory.openInput(dataFileName, context); + handle = directory.openInput(dataFileName, IOContext.DEFAULT.withReadAdvice(ReadAdvice.NORMAL)); try { CodecUtil.checkIndexHeader( handle, Lucene90CompoundFormat.DATA_CODEC, version, version, si.getId(), ""); @@ -169,7 +169,7 @@ final class Lucene90CompoundReader extends CompoundDirectory { + entries.keySet() + ")"); } - return handle.slice(name, entry.offset, entry.length); + return handle.slice(name, entry.offset, entry.length, context.readAdvice()); } /** Returns an array of strings, one for each file in the directory. */ diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java index fb8d578acdf..da027a35f17 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java @@ -21,6 +21,8 @@ import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.SKIP_IND import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SHIFT; import java.io.IOException; +import java.util.HashMap; +import java.util.Map; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.index.BaseTermsEnum; @@ -41,7 +43,6 @@ import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum.SeekStatus; -import org.apache.lucene.internal.hppc.IntObjectHashMap; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ChecksumIndexInput; @@ -58,12 +59,12 @@ import org.apache.lucene.util.packed.DirectReader; /** reader for {@link Lucene90DocValuesFormat} */ final class Lucene90DocValuesProducer extends DocValuesProducer { - private final IntObjectHashMap numerics; - private final IntObjectHashMap binaries; - private final IntObjectHashMap sorted; - private final IntObjectHashMap sortedSets; - private final IntObjectHashMap sortedNumerics; - private final IntObjectHashMap skippers; + private final Map numerics; + private final Map binaries; + private final Map sorted; + private final Map sortedSets; + private final Map sortedNumerics; + private final Map skippers; private final IndexInput data; private final int maxDoc; private int version = -1; @@ -80,12 +81,12 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); this.maxDoc = state.segmentInfo.maxDoc(); - numerics = new IntObjectHashMap<>(); - binaries = new IntObjectHashMap<>(); - sorted = new IntObjectHashMap<>(); - sortedSets = new IntObjectHashMap<>(); - sortedNumerics = new 
IntObjectHashMap<>(); - skippers = new IntObjectHashMap<>(); + numerics = new HashMap<>(); + binaries = new HashMap<>(); + sorted = new HashMap<>(); + sortedSets = new HashMap<>(); + sortedNumerics = new HashMap<>(); + skippers = new HashMap<>(); merging = false; // read in the entries from the metadata file. @@ -148,12 +149,12 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { // Used for cloning private Lucene90DocValuesProducer( - IntObjectHashMap numerics, - IntObjectHashMap binaries, - IntObjectHashMap sorted, - IntObjectHashMap sortedSets, - IntObjectHashMap sortedNumerics, - IntObjectHashMap skippers, + Map numerics, + Map binaries, + Map sorted, + Map sortedSets, + Map sortedNumerics, + Map skippers, IndexInput data, int maxDoc, int version, @@ -193,18 +194,18 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { } byte type = meta.readByte(); if (info.docValuesSkipIndexType() != DocValuesSkipIndexType.NONE) { - skippers.put(info.number, readDocValueSkipperMeta(meta)); + skippers.put(info.name, readDocValueSkipperMeta(meta)); } if (type == Lucene90DocValuesFormat.NUMERIC) { - numerics.put(info.number, readNumeric(meta)); + numerics.put(info.name, readNumeric(meta)); } else if (type == Lucene90DocValuesFormat.BINARY) { - binaries.put(info.number, readBinary(meta)); + binaries.put(info.name, readBinary(meta)); } else if (type == Lucene90DocValuesFormat.SORTED) { - sorted.put(info.number, readSorted(meta)); + sorted.put(info.name, readSorted(meta)); } else if (type == Lucene90DocValuesFormat.SORTED_SET) { - sortedSets.put(info.number, readSortedSet(meta)); + sortedSets.put(info.name, readSortedSet(meta)); } else if (type == Lucene90DocValuesFormat.SORTED_NUMERIC) { - sortedNumerics.put(info.number, readSortedNumeric(meta)); + sortedNumerics.put(info.name, readSortedNumeric(meta)); } else { throw new CorruptIndexException("invalid type: " + type, meta); } @@ -429,7 +430,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { @Override public NumericDocValues getNumeric(FieldInfo field) throws IOException { - NumericEntry entry = numerics.get(field.number); + NumericEntry entry = numerics.get(field.name); return getNumeric(entry); } @@ -785,13 +786,13 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { @Override public BinaryDocValues getBinary(FieldInfo field) throws IOException { - BinaryEntry entry = binaries.get(field.number); + BinaryEntry entry = binaries.get(field.name); if (entry.docsWithFieldOffset == -2) { return DocValues.emptyBinary(); } - final IndexInput bytesSlice = data.slice("fixed-binary", entry.dataOffset, entry.dataLength); + final RandomAccessInput bytesSlice = data.randomAccessSlice(entry.dataOffset, entry.dataLength); // Prefetch the first page of data. Following pages are expected to get prefetched through // read-ahead. 
if (bytesSlice.length() > 0) { @@ -808,8 +809,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { @Override public BytesRef binaryValue() throws IOException { - bytesSlice.seek((long) doc * length); - bytesSlice.readBytes(bytes.bytes, 0, length); + bytesSlice.readBytes((long) doc * length, bytes.bytes, 0, length); return bytes; } }; @@ -831,8 +831,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { public BytesRef binaryValue() throws IOException { long startOffset = addresses.get(doc); bytes.length = (int) (addresses.get(doc + 1L) - startOffset); - bytesSlice.seek(startOffset); - bytesSlice.readBytes(bytes.bytes, 0, bytes.length); + bytesSlice.readBytes(startOffset, bytes.bytes, 0, bytes.length); return bytes; } }; @@ -855,8 +854,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { @Override public BytesRef binaryValue() throws IOException { - bytesSlice.seek((long) disi.index() * length); - bytesSlice.readBytes(bytes.bytes, 0, length); + bytesSlice.readBytes((long) disi.index() * length, bytes.bytes, 0, length); return bytes; } }; @@ -879,8 +877,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { final int index = disi.index(); long startOffset = addresses.get(index); bytes.length = (int) (addresses.get(index + 1L) - startOffset); - bytesSlice.seek(startOffset); - bytesSlice.readBytes(bytes.bytes, 0, bytes.length); + bytesSlice.readBytes(startOffset, bytes.bytes, 0, bytes.length); return bytes; } }; @@ -890,7 +887,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { @Override public SortedDocValues getSorted(FieldInfo field) throws IOException { - SortedEntry entry = sorted.get(field.number); + SortedEntry entry = sorted.get(field.name); return getSorted(entry); } @@ -1124,7 +1121,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { final IndexInput bytes; final long blockMask; final LongValues indexAddresses; - final IndexInput indexBytes; + final RandomAccessInput indexBytes; final BytesRef term; long ord = -1; @@ -1146,7 +1143,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { indexAddresses = DirectMonotonicReader.getInstance( entry.termsIndexAddressesMeta, indexAddressesSlice, merging); - indexBytes = data.slice("terms-index", entry.termsIndexOffset, entry.termsIndexLength); + indexBytes = data.randomAccessSlice(entry.termsIndexOffset, entry.termsIndexLength); term = new BytesRef(entry.maxTermLength); // add the max term length for the dictionary @@ -1204,8 +1201,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { assert index >= 0 && index <= (entry.termsDictSize - 1) >>> entry.termsDictIndexShift; final long start = indexAddresses.get(index); term.length = (int) (indexAddresses.get(index + 1) - start); - indexBytes.seek(start); - indexBytes.readBytes(term.bytes, 0, term.length); + indexBytes.readBytes(start, term.bytes, 0, term.length); return term; } @@ -1367,7 +1363,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { @Override public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { - SortedNumericEntry entry = sortedNumerics.get(field.number); + SortedNumericEntry entry = sortedNumerics.get(field.name); return getSortedNumeric(entry); } @@ -1512,7 +1508,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { @Override public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { - SortedSetEntry entry = sortedSets.get(field.number); + SortedSetEntry entry 
= sortedSets.get(field.name); if (entry.singleValueEntry != null) { return DocValues.singleton(getSorted(entry.singleValueEntry)); } @@ -1786,7 +1782,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { @Override public DocValuesSkipper getSkipper(FieldInfo field) throws IOException { - final DocValuesSkipperEntry entry = skippers.get(field.number); + final DocValuesSkipperEntry entry = skippers.get(field.name); final IndexInput input = data.slice("doc value skipper", entry.offset, entry.length); // Prefetch the first page of data. Following pages are expected to get prefetched through diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsReader.java index d3f256cbf00..82910e23ab9 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsReader.java @@ -33,9 +33,9 @@ import org.apache.lucene.util.bkd.BKDReader; /** Reads point values previously written with {@link Lucene90PointsWriter} */ public class Lucene90PointsReader extends PointsReader { - final IndexInput indexIn, dataIn; - final SegmentReadState readState; - final IntObjectHashMap readers = new IntObjectHashMap<>(); + private final IndexInput indexIn, dataIn; + private final SegmentReadState readState; + private final IntObjectHashMap readers = new IntObjectHashMap<>(); /** Sole constructor */ public Lucene90PointsReader(SegmentReadState readState) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsWriter.java index e50d6a0fdb5..45a946e8ac4 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsWriter.java @@ -253,7 +253,7 @@ public class Lucene90PointsWriter extends PointsWriter { FieldInfos readerFieldInfos = mergeState.fieldInfos[i]; FieldInfo readerFieldInfo = readerFieldInfos.fieldInfo(fieldInfo.name); if (readerFieldInfo != null && readerFieldInfo.getPointDimensionCount() > 0) { - PointValues aPointValues = reader90.readers.get(readerFieldInfo.number); + PointValues aPointValues = reader90.getValues(readerFieldInfo.name); if (aPointValues != null) { pointValues.add(aPointValues); docMaps.add(mergeState.docMaps[i]); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90StoredFieldsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90StoredFieldsFormat.java index ce0310d6396..9e367a3d9d8 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90StoredFieldsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90StoredFieldsFormat.java @@ -49,9 +49,9 @@ import org.apache.lucene.util.packed.DirectMonotonicWriter; * *
  *   // the default: for high performance
- *   indexWriterConfig.setCodec(new Lucene912Codec(Mode.BEST_SPEED));
+ *   indexWriterConfig.setCodec(new Lucene100Codec(Mode.BEST_SPEED));
      *   // instead for higher compression (but slower):
      
- *   // indexWriterConfig.setCodec(new Lucene912Codec(Mode.BEST_COMPRESSION));
+ *   // indexWriterConfig.setCodec(new Lucene100Codec(Mode.BEST_COMPRESSION));
  * 
* *

File formats diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java index 9988c45bdf7..85d23a489fe 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java @@ -598,7 +598,7 @@ final class SegmentTermsEnumFrame { startBytePos = suffixesReader.getPosition(); suffixesReader.skipBytes(suffixLength); - // Loop over bytes in the suffix, comparing to the target + // Compare suffix and target. final int cmp = Arrays.compareUnsigned( suffixBytes, @@ -686,7 +686,7 @@ final class SegmentTermsEnumFrame { nextEnt = mid + 1; startBytePos = mid * suffixLength; - // Binary search bytes in the suffix, comparing to the target. + // Compare suffix and target. cmp = Arrays.compareUnsigned( suffixBytes, @@ -792,6 +792,7 @@ final class SegmentTermsEnumFrame { lastSubFP = fp - subCode; } + // Compare suffix and target. final int cmp = Arrays.compareUnsigned( suffixBytes, diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForDeltaUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForDeltaUtil.java index 25601388a0f..f13b3cde69c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForDeltaUtil.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForDeltaUtil.java @@ -23,7 +23,6 @@ import static org.apache.lucene.codecs.lucene912.ForUtil.*; import java.io.IOException; import org.apache.lucene.internal.vectorization.PostingDecodingUtil; import org.apache.lucene.store.DataOutput; -import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.packed.PackedInts; /** @@ -282,11 +281,6 @@ public final class ForDeltaUtil { } } - void skip(IndexInput in) throws IOException { - final int bitsPerValue = Byte.toUnsignedInt(in.readByte()); - in.skipBytes(numBytes(bitsPerValue)); - } - /** Delta-decode 128 integers into {@code longs}. 
*/ void decodeAndPrefixSum(int bitsPerValue, PostingDecodingUtil pdu, long base, long[] longs) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/Lucene912PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene912/Lucene912PostingsReader.java index 9c65078cfa9..bdb4dc4db08 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/Lucene912PostingsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene912/Lucene912PostingsReader.java @@ -47,7 +47,6 @@ import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SlowImpactsEnum; import org.apache.lucene.internal.vectorization.PostingDecodingUtil; import org.apache.lucene.internal.vectorization.VectorizationProvider; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.DataInput; @@ -56,7 +55,6 @@ import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BitUtil; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.IOUtils; /** @@ -67,6 +65,12 @@ import org.apache.lucene.util.IOUtils; public final class Lucene912PostingsReader extends PostingsReaderBase { static final VectorizationProvider VECTORIZATION_PROVIDER = VectorizationProvider.getInstance(); + // Dummy impacts, composed of the maximum possible term frequency and the lowest possible + // (unsigned) norm value. This is typically used on tail blocks, which don't actually record + // impacts as the storage overhead would not be worth any query evaluation speedup, since there's + // less than 128 docs left to evaluate anyway. + private static final List DUMMY_IMPACTS = + Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); private final IndexInput docIn; private final IndexInput posIn; @@ -77,8 +81,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { private final int maxNumImpactsAtLevel1; private final int maxImpactNumBytesAtLevel1; - private final int version; - /** Sole constructor. 
*/ public Lucene912PostingsReader(SegmentReadState state) throws IOException { String metaName = @@ -87,6 +89,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { final long expectedDocFileLength, expectedPosFileLength, expectedPayFileLength; ChecksumIndexInput metaIn = null; boolean success = false; + int version; try { metaIn = state.directory.openChecksumInput(metaName); version = @@ -236,13 +239,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute) throws IOException { final IntBlockTermState termState = (IntBlockTermState) _termState; - final boolean fieldHasPositions = - fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; - final boolean fieldHasOffsets = - fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) - >= 0; - final boolean fieldHasPayloads = fieldInfo.hasPayloads(); - if (absolute) { termState.docStartFP = 0; termState.posStartFP = 0; @@ -263,9 +259,13 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { termState.singletonDocID += BitUtil.zigZagDecode(l >>> 1); } - if (fieldHasPositions) { + if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) { termState.posStartFP += in.readVLong(); - if (fieldHasOffsets || fieldHasPayloads) { + if (fieldInfo + .getIndexOptions() + .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) + >= 0 + || fieldInfo.hasPayloads()) { termState.payStartFP += in.readVLong(); } if (termState.totalTermFreq > BLOCK_SIZE) { @@ -280,156 +280,115 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { public PostingsEnum postings( FieldInfo fieldInfo, BlockTermState termState, PostingsEnum reuse, int flags) throws IOException { - - boolean indexHasPositions = - fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; - - if (indexHasPositions == false + if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0 || PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) == false) { - BlockDocsEnum docsEnum; - if (reuse instanceof BlockDocsEnum) { - docsEnum = (BlockDocsEnum) reuse; - if (!docsEnum.canReuse(docIn, fieldInfo)) { - docsEnum = new BlockDocsEnum(fieldInfo); - } - } else { - docsEnum = new BlockDocsEnum(fieldInfo); - } - return docsEnum.reset((IntBlockTermState) termState, flags); + return (reuse instanceof BlockDocsEnum blockDocsEnum + && blockDocsEnum.canReuse(docIn, fieldInfo) + ? blockDocsEnum + : new BlockDocsEnum(fieldInfo)) + .reset((IntBlockTermState) termState, flags); } else { - EverythingEnum everythingEnum; - if (reuse instanceof EverythingEnum) { - everythingEnum = (EverythingEnum) reuse; - if (!everythingEnum.canReuse(docIn, fieldInfo)) { - everythingEnum = new EverythingEnum(fieldInfo); - } - } else { - everythingEnum = new EverythingEnum(fieldInfo); - } - return everythingEnum.reset((IntBlockTermState) termState, flags); + return (reuse instanceof EverythingEnum everythingEnum + && everythingEnum.canReuse(docIn, fieldInfo) + ? 
everythingEnum + : new EverythingEnum(fieldInfo)) + .reset((IntBlockTermState) termState, flags); } } @Override public ImpactsEnum impacts(FieldInfo fieldInfo, BlockTermState state, int flags) throws IOException { - final boolean indexHasFreqs = - fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + final IndexOptions options = fieldInfo.getIndexOptions(); final boolean indexHasPositions = - fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; - if (state.docFreq >= BLOCK_SIZE - && indexHasFreqs - && (indexHasPositions == false - || PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) == false)) { - return new BlockImpactsDocsEnum(fieldInfo, (IntBlockTermState) state); - } + if (state.docFreq >= BLOCK_SIZE) { + if (options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0 + && (indexHasPositions == false + || PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) == false)) { + return new BlockImpactsDocsEnum(indexHasPositions, (IntBlockTermState) state); + } - final boolean indexHasOffsets = - fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) - >= 0; - final boolean indexHasPayloads = fieldInfo.hasPayloads(); - - if (state.docFreq >= BLOCK_SIZE - && indexHasPositions - && (indexHasOffsets == false - || PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS) == false) - && (indexHasPayloads == false - || PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS) == false)) { - return new BlockImpactsPostingsEnum(fieldInfo, (IntBlockTermState) state); + if (indexHasPositions + && (options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0 + || PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS) == false) + && (fieldInfo.hasPayloads() == false + || PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS) == false)) { + return new BlockImpactsPostingsEnum(fieldInfo, (IntBlockTermState) state); + } } return new SlowImpactsEnum(postings(fieldInfo, state, null, flags)); } - final class BlockDocsEnum extends PostingsEnum { + private static long sumOverRange(long[] arr, int start, int end) { + long res = 0L; + for (int i = start; i < end; i++) { + res += arr[i]; + } + return res; + } - final ForUtil forUtil = new ForUtil(); - final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(); - final PForUtil pforUtil = new PForUtil(forUtil); + private abstract class AbstractPostingsEnum extends PostingsEnum { - private final long[] docBuffer = new long[BLOCK_SIZE + 1]; - private final long[] freqBuffer = new long[BLOCK_SIZE]; + protected ForDeltaUtil forDeltaUtil; + protected PForUtil pforUtil; - private int docBufferUpto; + protected final long[] docBuffer = new long[BLOCK_SIZE + 1]; + protected final boolean indexHasFreq; - final IndexInput startDocIn; - - IndexInput docIn; - PostingDecodingUtil docInUtil; - final boolean indexHasFreq; - final boolean indexHasPos; - final boolean indexHasOffsetsOrPayloads; - - private int docFreq; // number of docs in this posting list - private long totalTermFreq; // sum of freqBuffer in this posting list (or docFreq when omitted) - private int docCountUpto; // number of docs in or before the current block - private int doc; // doc we last read - private long prevDocID; // last doc ID of the previous block + protected int doc; // doc we last read // level 0 skip data - private int level0LastDocID; + protected int level0LastDocID; + // level 1 skip data - private int 
level1LastDocID; - private long level1DocEndFP; - private int level1DocCountUpto; + protected int level1LastDocID; + protected long level1DocEndFP; + protected int level1DocCountUpto; - private boolean needsFreq; // true if the caller actually needs frequencies - private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 - private long freqFP; + protected int docFreq; // number of docs in this posting list + protected long + totalTermFreq; // sum of freqBuffer in this posting list (or docFreq when omitted) - public BlockDocsEnum(FieldInfo fieldInfo) throws IOException { - this.startDocIn = Lucene912PostingsReader.this.docIn; - this.docIn = null; + protected int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 + + protected int docCountUpto; // number of docs in or before the current block + protected long prevDocID; // last doc ID of the previous block + + protected int docBufferUpto; + + protected IndexInput docIn; + protected PostingDecodingUtil docInUtil; + + protected AbstractPostingsEnum(FieldInfo fieldInfo) { indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; - indexHasPos = - fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; - indexHasOffsetsOrPayloads = - fieldInfo - .getIndexOptions() - .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) - >= 0 - || fieldInfo.hasPayloads(); // We set the last element of docBuffer to NO_MORE_DOCS, it helps save conditionals in // advance() docBuffer[BLOCK_SIZE] = NO_MORE_DOCS; } - public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { - return docIn == startDocIn - && indexHasFreq - == (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0) - && indexHasPos - == (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) - >= 0) - && indexHasOffsetsOrPayloads - == (fieldInfo - .getIndexOptions() - .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) - >= 0 - || fieldInfo.hasPayloads()); + @Override + public int docID() { + return doc; } - public PostingsEnum reset(IntBlockTermState termState, int flags) throws IOException { + protected void resetIndexInput(IntBlockTermState termState) throws IOException { docFreq = termState.docFreq; - totalTermFreq = indexHasFreq ? termState.totalTermFreq : docFreq; singletonDocID = termState.singletonDocID; if (docFreq > 1) { if (docIn == null) { // lazy init - docIn = startDocIn.clone(); + docIn = Lucene912PostingsReader.this.docIn.clone(); docInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(docIn); } prefetchPostings(docIn, termState); } + } + protected PostingsEnum resetIdsAndLevelParams(IntBlockTermState termState) throws IOException { doc = -1; - this.needsFreq = PostingsEnum.featureRequested(flags, PostingsEnum.FREQS); - if (indexHasFreq == false || needsFreq == false) { - // Filling this buffer may not be cheap when doing primary key lookups, so we make sure to - // not fill more than `docFreq` entries. 
- Arrays.fill(freqBuffer, 0, Math.min(ForUtil.BLOCK_SIZE, docFreq), 1); - } prevDocID = -1; docCountUpto = 0; level0LastDocID = -1; @@ -444,9 +403,44 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { } level1DocCountUpto = 0; docBufferUpto = BLOCK_SIZE; - freqFP = -1; return this; } + } + + final class BlockDocsEnum extends AbstractPostingsEnum { + + private final long[] freqBuffer = new long[BLOCK_SIZE]; + + private boolean needsFreq; // true if the caller actually needs frequencies + private long freqFP; + + public BlockDocsEnum(FieldInfo fieldInfo) { + super(fieldInfo); + } + + public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { + final IndexOptions options = fieldInfo.getIndexOptions(); + return docIn == Lucene912PostingsReader.this.docIn + && indexHasFreq == (options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0); + } + + public PostingsEnum reset(IntBlockTermState termState, int flags) throws IOException { + resetIndexInput(termState); + if (pforUtil == null && docFreq >= BLOCK_SIZE) { + pforUtil = new PForUtil(new ForUtil()); + forDeltaUtil = new ForDeltaUtil(); + } + totalTermFreq = indexHasFreq ? termState.totalTermFreq : docFreq; + + this.needsFreq = PostingsEnum.featureRequested(flags, PostingsEnum.FREQS); + if (indexHasFreq == false || needsFreq == false) { + // Filling this buffer may not be cheap when doing primary key lookups, so we make sure to + // not fill more than `docFreq` entries. + Arrays.fill(freqBuffer, 0, Math.min(ForUtil.BLOCK_SIZE, docFreq), 1); + } + freqFP = -1; + return resetIdsAndLevelParams(termState); + } @Override public int freq() throws IOException { @@ -460,30 +454,25 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { } @Override - public int nextPosition() throws IOException { + public int nextPosition() { return -1; } @Override - public int startOffset() throws IOException { + public int startOffset() { return -1; } @Override - public int endOffset() throws IOException { + public int endOffset() { return -1; } @Override - public BytesRef getPayload() throws IOException { + public BytesRef getPayload() { return null; } - @Override - public int docID() { - return doc; - } - private void refillFullBlock() throws IOException { assert docFreq - docCountUpto >= BLOCK_SIZE; @@ -493,7 +482,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { if (needsFreq) { freqFP = docIn.getFilePointer(); } - pforUtil.skip(docIn); + PForUtil.skip(docIn); } docCountUpto += BLOCK_SIZE; prevDocID = docBuffer[BLOCK_SIZE - 1]; @@ -531,7 +520,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { level1DocCountUpto += LEVEL1_NUM_DOCS; if (docFreq - docCountUpto < LEVEL1_NUM_DOCS) { - level1LastDocID = DocIdSetIterator.NO_MORE_DOCS; + level1LastDocID = NO_MORE_DOCS; break; } @@ -567,7 +556,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { docIn.skipBytes(readVLong15(docIn)); docCountUpto += BLOCK_SIZE; } else { - level0LastDocID = DocIdSetIterator.NO_MORE_DOCS; + level0LastDocID = NO_MORE_DOCS; break; } } @@ -584,7 +573,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { refillFullBlock(); level0LastDocID = (int) docBuffer[BLOCK_SIZE - 1]; } else { - level0LastDocID = DocIdSetIterator.NO_MORE_DOCS; + level0LastDocID = NO_MORE_DOCS; refillRemainder(); } } @@ -627,13 +616,8 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { } } - final class EverythingEnum extends PostingsEnum { + final class 
EverythingEnum extends AbstractPostingsEnum { - final ForUtil forUtil = new ForUtil(); - final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(); - final PForUtil pforUtil = new PForUtil(forUtil); - - private final long[] docBuffer = new long[BLOCK_SIZE + 1]; private final long[] freqBuffer = new long[BLOCK_SIZE + 1]; private final long[] posDeltaBuffer = new long[BLOCK_SIZE]; @@ -649,30 +633,18 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { private int startOffset; private int endOffset; - private int docBufferUpto; private int posBufferUpto; - final IndexInput startDocIn; - - IndexInput docIn; - PostingDecodingUtil docInUtil; final IndexInput posIn; final PostingDecodingUtil posInUtil; final IndexInput payIn; final PostingDecodingUtil payInUtil; final BytesRef payload; - final boolean indexHasFreq; - final boolean indexHasPos; final boolean indexHasOffsets; final boolean indexHasPayloads; final boolean indexHasOffsetsOrPayloads; - private int docFreq; // number of docs in this posting list - private long totalTermFreq; // sum of freqBuffer in this posting list (or docFreq when omitted) - private int docCountUpto; // number of docs in or before the current block - private int doc; // doc we last read - private long prevDocID; // last doc ID of the previous block private int freq; // freq we last read private int position; // current position @@ -680,28 +652,16 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { // skip these to "catch up": private long posPendingCount; - // Where this term's postings start in the .pos file: - private long posTermStartFP; - - // Where this term's payloads/offsets start in the .pay - // file: - private long payTermStartFP; - // File pointer where the last (vInt encoded) pos delta // block is. 
We need this to know whether to bulk // decode vs vInt decode the block: private long lastPosBlockFP; - // level 0 skip data - private int level0LastDocID; private long level0PosEndFP; private int level0BlockPosUpto; private long level0PayEndFP; private int level0BlockPayUpto; - // level 1 skip data - private int level1LastDocID; - private long level1DocEndFP; - private int level1DocCountUpto; + private long level1PosEndFP; private int level1BlockPosUpto; private long level1PayEndFP; @@ -710,14 +670,8 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { private boolean needsOffsets; // true if we actually need offsets private boolean needsPayloads; // true if we actually need payloads - private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 - public EverythingEnum(FieldInfo fieldInfo) throws IOException { - this.startDocIn = Lucene912PostingsReader.this.docIn; - this.docIn = null; - indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; - indexHasPos = - fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + super(fieldInfo); indexHasOffsets = fieldInfo .getIndexOptions() @@ -754,14 +708,10 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { payloadBytes = null; payload = null; } - - // We set the last element of docBuffer to NO_MORE_DOCS, it helps save conditionals in - // advance() - docBuffer[BLOCK_SIZE] = NO_MORE_DOCS; } public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { - return docIn == startDocIn + return docIn == Lucene912PostingsReader.this.docIn && indexHasOffsets == (fieldInfo .getIndexOptions() @@ -771,19 +721,19 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { } public PostingsEnum reset(IntBlockTermState termState, int flags) throws IOException { - docFreq = termState.docFreq; - posTermStartFP = termState.posStartFP; - payTermStartFP = termState.payStartFP; - totalTermFreq = termState.totalTermFreq; - singletonDocID = termState.singletonDocID; - if (docFreq > 1) { - if (docIn == null) { - // lazy init - docIn = startDocIn.clone(); - docInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(docIn); - } - prefetchPostings(docIn, termState); + resetIndexInput(termState); + if (forDeltaUtil == null && docFreq >= BLOCK_SIZE) { + forDeltaUtil = new ForDeltaUtil(); } + totalTermFreq = termState.totalTermFreq; + if (pforUtil == null && totalTermFreq >= BLOCK_SIZE) { + pforUtil = new PForUtil(new ForUtil()); + } + // Where this term's postings start in the .pos file: + final long posTermStartFP = termState.posStartFP; + // Where this term's payloads/offsets start in the .pay + // file: + final long payTermStartFP = termState.payStartFP; posIn.seek(posTermStartFP); if (indexHasOffsetsOrPayloads) { payIn.seek(payTermStartFP); @@ -805,39 +755,20 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { this.needsOffsets = PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS); this.needsPayloads = PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS); - doc = -1; - prevDocID = -1; - docCountUpto = 0; - level0LastDocID = -1; - if (docFreq < LEVEL1_NUM_DOCS) { - level1LastDocID = NO_MORE_DOCS; - if (docFreq > 1) { - docIn.seek(termState.docStartFP); - } - } else { - level1LastDocID = -1; - level1DocEndFP = termState.docStartFP; - } - level1DocCountUpto = 0; level1BlockPosUpto = 0; level1BlockPayUpto = 0; level0BlockPosUpto = 0; level0BlockPayUpto = 0; - docBufferUpto = BLOCK_SIZE; 
posBufferUpto = BLOCK_SIZE; - return this; + + return resetIdsAndLevelParams(termState); } @Override - public int freq() throws IOException { + public int freq() { return freq; } - @Override - public int docID() { - return doc; - } - private void refillDocs() throws IOException { final int left = docFreq - docCountUpto; assert left >= 0; @@ -878,7 +809,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { level1DocCountUpto += LEVEL1_NUM_DOCS; if (docFreq - docCountUpto < LEVEL1_NUM_DOCS) { - level1LastDocID = DocIdSetIterator.NO_MORE_DOCS; + level1LastDocID = NO_MORE_DOCS; break; } @@ -936,7 +867,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { level0BlockPayUpto = docIn.readVInt(); } } else { - level0LastDocID = DocIdSetIterator.NO_MORE_DOCS; + level0LastDocID = NO_MORE_DOCS; } refillDocs(); @@ -975,9 +906,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { } posBufferUpto = BLOCK_SIZE; } else { - for (int i = docBufferUpto; i < BLOCK_SIZE; ++i) { - posPendingCount += freqBuffer[i]; - } + posPendingCount += sumOverRange(freqBuffer, docBufferUpto, BLOCK_SIZE); } if (docFreq - docCountUpto >= BLOCK_SIZE) { @@ -1003,7 +932,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { docIn.seek(blockEndFP); docCountUpto += BLOCK_SIZE; } else { - level0LastDocID = DocIdSetIterator.NO_MORE_DOCS; + level0LastDocID = NO_MORE_DOCS; break; } } @@ -1023,9 +952,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { } int next = findFirstGreater(docBuffer, target, docBufferUpto); - for (int i = docBufferUpto; i <= next; ++i) { - posPendingCount += freqBuffer[i]; - } + posPendingCount += sumOverRange(freqBuffer, docBufferUpto, next + 1); this.freq = (int) freqBuffer[next]; this.docBufferUpto = next + 1; position = 0; @@ -1045,20 +972,18 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { if (toSkip < leftInBlock) { int end = (int) (posBufferUpto + toSkip); if (indexHasPayloads) { - for (int i = posBufferUpto; i < end; ++i) { - payloadByteUpto += payloadLengthBuffer[i]; - } + payloadByteUpto += sumOverRange(payloadLengthBuffer, posBufferUpto, end); } posBufferUpto = end; } else { toSkip -= leftInBlock; while (toSkip >= BLOCK_SIZE) { assert posIn.getFilePointer() != lastPosBlockFP; - pforUtil.skip(posIn); + PForUtil.skip(posIn); if (indexHasPayloads) { // Skip payloadLength block: - pforUtil.skip(payIn); + PForUtil.skip(payIn); // Skip payloadBytes block: int numBytes = payIn.readVInt(); @@ -1066,19 +991,16 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { } if (indexHasOffsets) { - pforUtil.skip(payIn); - pforUtil.skip(payIn); + PForUtil.skip(payIn); + PForUtil.skip(payIn); } toSkip -= BLOCK_SIZE; } refillPositions(); payloadByteUpto = 0; - posBufferUpto = 0; final int toSkipInt = (int) toSkip; if (indexHasPayloads) { - for (int i = 0; i < toSkipInt; ++i) { - payloadByteUpto += payloadLengthBuffer[i]; - } + payloadByteUpto += sumOverRange(payloadLengthBuffer, 0, toSkipInt); } posBufferUpto = toSkipInt; } @@ -1137,7 +1059,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { } else { // this works, because when writing a vint block we always force the first length to be // written - pforUtil.skip(payIn); // skip over lengths + PForUtil.skip(payIn); // skip over lengths int numBytes = payIn.readVInt(); // read length of payloadBytes payIn.seek(payIn.getFilePointer() + numBytes); // skip over payloadBytes } @@ -1151,8 
+1073,8 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { } else { // this works, because when writing a vint block we always force the first length to be // written - pforUtil.skip(payIn); // skip over starts - pforUtil.skip(payIn); // skip over lengths + PForUtil.skip(payIn); // skip over starts + PForUtil.skip(payIn); // skip over lengths } } } @@ -1217,83 +1139,48 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { } } - final class BlockImpactsDocsEnum extends ImpactsEnum { + private abstract class BlockImpactsEnum extends ImpactsEnum { - final ForUtil forUtil = new ForUtil(); - final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(); - final PForUtil pforUtil = new PForUtil(forUtil); + protected final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(); + protected final PForUtil pforUtil = new PForUtil(new ForUtil()); - private final long[] docBuffer = new long[BLOCK_SIZE + 1]; - private final long[] freqBuffer = new long[BLOCK_SIZE]; + protected final long[] docBuffer = new long[BLOCK_SIZE + 1]; + protected final long[] freqBuffer = new long[BLOCK_SIZE]; - private int docBufferUpto; + protected final int docFreq; // number of docs in this posting list - final IndexInput startDocIn; + protected final IndexInput docIn; + protected final PostingDecodingUtil docInUtil; - final IndexInput docIn; - final PostingDecodingUtil docInUtil; - final boolean indexHasFreq; - final boolean indexHasPos; - final boolean indexHasOffsetsOrPayloads; - - private int docFreq; // number of docs in this posting list - private int docCountUpto; // number of docs in or before the current block - private int doc; // doc we last read - private long prevDocID; // last doc ID of the previous block - private long freqFP; + protected int docCountUpto; // number of docs in or before the current block + protected int doc = -1; // doc we last read + protected long prevDocID = -1; // last doc ID of the previous block + protected int docBufferUpto = BLOCK_SIZE; // true if we shallow-advanced to a new block that we have not decoded yet - private boolean needsRefilling; + protected boolean needsRefilling; // level 0 skip data - private int level0LastDocID; - private long level0DocEndFP; - private final BytesRef level0SerializedImpacts; - private final ByteArrayDataInput level0SerializedImpactsIn = new ByteArrayDataInput(); - private final MutableImpactList level0Impacts; + protected int level0LastDocID = -1; + protected long level0DocEndFP; + protected final BytesRef level0SerializedImpacts; + protected final MutableImpactList level0Impacts; // level 1 skip data - private int level1LastDocID; - private long level1DocEndFP; - private int level1DocCountUpto; - private final BytesRef level1SerializedImpacts; - private final ByteArrayDataInput level1SerializedImpactsIn = new ByteArrayDataInput(); - private final MutableImpactList level1Impacts; + protected int level1LastDocID; + protected long level1DocEndFP; + protected int level1DocCountUpto = 0; + protected final BytesRef level1SerializedImpacts; + protected final MutableImpactList level1Impacts; - public BlockImpactsDocsEnum(FieldInfo fieldInfo, IntBlockTermState termState) - throws IOException { - this.startDocIn = Lucene912PostingsReader.this.docIn; - indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; - indexHasPos = - fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; - indexHasOffsetsOrPayloads = - fieldInfo - .getIndexOptions() - 
.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) - >= 0 - || fieldInfo.hasPayloads(); - // We set the last element of docBuffer to NO_MORE_DOCS, it helps save conditionals in - // advance() - docBuffer[BLOCK_SIZE] = NO_MORE_DOCS; - - docFreq = termState.docFreq; - if (docFreq > 1) { - docIn = startDocIn.clone(); - docInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(docIn); - prefetchPostings(docIn, termState); - } else { - docIn = null; - docInUtil = null; - } - - doc = -1; - if (indexHasFreq == false) { - // Filling this buffer may not be cheap when doing primary key lookups, so we make sure to - // not fill more than `docFreq` entries. - Arrays.fill(freqBuffer, 0, Math.min(ForUtil.BLOCK_SIZE, docFreq), 1); - } - prevDocID = -1; - docCountUpto = 0; - level0LastDocID = -1; + private BlockImpactsEnum(IntBlockTermState termState) throws IOException { + this.docFreq = termState.docFreq; + this.docIn = Lucene912PostingsReader.this.docIn.clone(); + this.docInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(docIn); + prefetchPostings(docIn, termState); + level0SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel0); + level1SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel1); + level0Impacts = new MutableImpactList(maxNumImpactsAtLevel0); + level1Impacts = new MutableImpactList(maxNumImpactsAtLevel1); if (docFreq < LEVEL1_NUM_DOCS) { level1LastDocID = NO_MORE_DOCS; if (docFreq > 1) { @@ -1303,13 +1190,89 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { level1LastDocID = -1; level1DocEndFP = termState.docStartFP; } - level1DocCountUpto = 0; - docBufferUpto = BLOCK_SIZE; + // We set the last element of docBuffer to NO_MORE_DOCS, it helps save conditionals in + // advance() + docBuffer[BLOCK_SIZE] = NO_MORE_DOCS; + } + + @Override + public int docID() { + return doc; + } + + @Override + public int startOffset() { + return -1; + } + + @Override + public int endOffset() { + return -1; + } + + @Override + public BytesRef getPayload() { + return null; + } + + @Override + public long cost() { + return docFreq; + } + + private final Impacts impacts = + new Impacts() { + + private final ByteArrayDataInput scratch = new ByteArrayDataInput(); + + @Override + public int numLevels() { + return level1LastDocID == NO_MORE_DOCS ? 1 : 2; + } + + @Override + public int getDocIdUpTo(int level) { + if (level == 0) { + return level0LastDocID; + } + return level == 1 ? 
level1LastDocID : NO_MORE_DOCS; + } + + @Override + public List getImpacts(int level) { + if (level == 0 && level0LastDocID != NO_MORE_DOCS) { + return readImpacts(level0SerializedImpacts, level0Impacts); + } + if (level == 1) { + return readImpacts(level1SerializedImpacts, level1Impacts); + } + return DUMMY_IMPACTS; + } + + private List readImpacts(BytesRef serialized, MutableImpactList impactsList) { + var scratch = this.scratch; + scratch.reset(serialized.bytes, 0, serialized.length); + Lucene912PostingsReader.readImpacts(scratch, impactsList); + return impactsList; + } + }; + + @Override + public Impacts getImpacts() { + return impacts; + } + } + + final class BlockImpactsDocsEnum extends BlockImpactsEnum { + final boolean indexHasPos; + + private long freqFP; + + public BlockImpactsDocsEnum(boolean indexHasPos, IntBlockTermState termState) + throws IOException { + super(termState); + this.indexHasPos = indexHasPos; freqFP = -1; - level0SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel0); - level1SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel1); - level0Impacts = new MutableImpactList(maxNumImpactsAtLevel0); - level1Impacts = new MutableImpactList(maxNumImpactsAtLevel1); } @Override @@ -1323,45 +1286,22 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { } @Override - public int nextPosition() throws IOException { + public int nextPosition() { return -1; } - @Override - public int startOffset() throws IOException { - return -1; - } - - @Override - public int endOffset() throws IOException { - return -1; - } - - @Override - public BytesRef getPayload() throws IOException { - return null; - } - - @Override - public int docID() { - return doc; - } - private void refillDocs() throws IOException { final int left = docFreq - docCountUpto; assert left >= 0; if (left >= BLOCK_SIZE) { forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer); - - if (indexHasFreq) { - freqFP = docIn.getFilePointer(); - pforUtil.skip(docIn); - } + freqFP = docIn.getFilePointer(); + PForUtil.skip(docIn); docCountUpto += BLOCK_SIZE; } else { // Read vInts: - PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, true); + PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, true, true); prefixSum(docBuffer, left, prevDocID); docBuffer[left] = NO_MORE_DOCS; freqFP = -1; @@ -1381,7 +1321,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { level1DocCountUpto += LEVEL1_NUM_DOCS; if (docFreq - docCountUpto < LEVEL1_NUM_DOCS) { - level1LastDocID = DocIdSetIterator.NO_MORE_DOCS; + level1LastDocID = NO_MORE_DOCS; break; } @@ -1425,7 +1365,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { docIn.skipBytes(blockLength); docCountUpto += BLOCK_SIZE; } else { - level0LastDocID = DocIdSetIterator.NO_MORE_DOCS; + level0LastDocID = NO_MORE_DOCS; break; } } @@ -1468,7 +1408,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { level0SerializedImpacts.length = numImpactBytes; docIn.seek(skip0End); } else { - level0LastDocID = DocIdSetIterator.NO_MORE_DOCS; + level0LastDocID = NO_MORE_DOCS; } refillDocs(); @@ -1500,109 +1440,22 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { docBufferUpto = next + 1; return doc; } - - @Override - public Impacts getImpacts() throws IOException { - return new Impacts() { - - @Override - public int numLevels() { - int numLevels = 0; - if (level0LastDocID != NO_MORE_DOCS) { - numLevels++; - } - if (level1LastDocID != 
NO_MORE_DOCS) { - numLevels++; - } - if (numLevels == 0) { - numLevels++; - } - return numLevels; - } - - @Override - public int getDocIdUpTo(int level) { - if (level0LastDocID != NO_MORE_DOCS) { - if (level == 0) { - return level0LastDocID; - } - level--; - } - - if (level1LastDocID != NO_MORE_DOCS) { - if (level == 0) { - return level1LastDocID; - } - level--; - } - - return NO_MORE_DOCS; - } - - @Override - public List getImpacts(int level) { - if (level0LastDocID != NO_MORE_DOCS) { - if (level == 0) { - level0SerializedImpactsIn.reset( - level0SerializedImpacts.bytes, 0, level0SerializedImpacts.length); - readImpacts(level0SerializedImpactsIn, level0Impacts); - return level0Impacts; - } - level--; - } - - if (level1LastDocID != NO_MORE_DOCS) { - if (level == 0) { - level1SerializedImpactsIn.reset( - level1SerializedImpacts.bytes, 0, level1SerializedImpacts.length); - readImpacts(level1SerializedImpactsIn, level1Impacts); - return level1Impacts; - } - level--; - } - - return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); - } - }; - } - - @Override - public long cost() { - return docFreq; - } } - final class BlockImpactsPostingsEnum extends ImpactsEnum { - - final ForUtil forUtil = new ForUtil(); - final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(); - final PForUtil pforUtil = new PForUtil(forUtil); - - private final long[] docBuffer = new long[BLOCK_SIZE + 1]; - private final long[] freqBuffer = new long[BLOCK_SIZE]; + final class BlockImpactsPostingsEnum extends BlockImpactsEnum { private final long[] posDeltaBuffer = new long[BLOCK_SIZE]; - private int docBufferUpto; private int posBufferUpto; - - final IndexInput startDocIn; - - final IndexInput docIn; - final PostingDecodingUtil docInUtil; final IndexInput posIn; final PostingDecodingUtil posInUtil; final boolean indexHasFreq; - final boolean indexHasPos; final boolean indexHasOffsets; final boolean indexHasPayloads; final boolean indexHasOffsetsOrPayloads; - private int docFreq; // number of docs in this posting list - private long totalTermFreq; // sum of freqBuffer in this posting list (or docFreq when omitted) - private int docCountUpto; // number of docs in or before the current block - private int doc; // doc we last read - private long prevDocID; // last doc ID of the previous block + private final long + totalTermFreq; // sum of freqBuffer in this posting list (or docFreq when omitted) private int freq; // freq we last read private int position; // current position @@ -1610,70 +1463,37 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { // skip these to "catch up": private long posPendingCount; - // Where this term's postings start in the .pos file: - private long posTermStartFP; - // File pointer where the last (vInt encoded) pos delta // block is. 
We need this to know whether to bulk // decode vs vInt decode the block: - private long lastPosBlockFP; - - // true if we shallow-advanced to a new block that we have not decoded yet - private boolean needsRefilling; + private final long lastPosBlockFP; // level 0 skip data - private int level0LastDocID; - private long level0DocEndFP; private long level0PosEndFP; private int level0BlockPosUpto; - private final BytesRefBuilder level0SerializedImpacts = new BytesRefBuilder(); - private final ByteArrayDataInput level0SerializedImpactsIn = new ByteArrayDataInput(); - private final MutableImpactList level0Impacts; // level 1 skip data - private int level1LastDocID; - private long level1DocEndFP; - private int level1DocCountUpto; private long level1PosEndFP; private int level1BlockPosUpto; - private final BytesRefBuilder level1SerializedImpacts = new BytesRefBuilder(); - private final ByteArrayDataInput level1SerializedImpactsIn = new ByteArrayDataInput(); - private final MutableImpactList level1Impacts; - private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 + private final int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 public BlockImpactsPostingsEnum(FieldInfo fieldInfo, IntBlockTermState termState) throws IOException { - this.startDocIn = Lucene912PostingsReader.this.docIn; - indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; - indexHasPos = - fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + super(termState); + final IndexOptions options = fieldInfo.getIndexOptions(); + indexHasFreq = options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; indexHasOffsets = - fieldInfo - .getIndexOptions() - .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) - >= 0; + options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; indexHasPayloads = fieldInfo.hasPayloads(); indexHasOffsetsOrPayloads = indexHasOffsets || indexHasPayloads; this.posIn = Lucene912PostingsReader.this.posIn.clone(); posInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(posIn); - // We set the last element of docBuffer to NO_MORE_DOCS, it helps save conditionals in - // advance() - docBuffer[BLOCK_SIZE] = NO_MORE_DOCS; - - docFreq = termState.docFreq; - posTermStartFP = termState.posStartFP; + // Where this term's postings start in the .pos file: + final long posTermStartFP = termState.posStartFP; totalTermFreq = termState.totalTermFreq; singletonDocID = termState.singletonDocID; - if (docFreq > 1) { - docIn = startDocIn.clone(); - docInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(docIn); - prefetchPostings(docIn, termState); - } else { - docIn = null; - docInUtil = null; - } posIn.seek(posTermStartFP); level1PosEndFP = posTermStartFP; level0PosEndFP = posTermStartFP; @@ -1685,40 +1505,15 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { } else { lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset; } - - doc = -1; - prevDocID = -1; - docCountUpto = 0; - level0LastDocID = -1; - if (docFreq < LEVEL1_NUM_DOCS) { - level1LastDocID = NO_MORE_DOCS; - if (docFreq > 1) { - docIn.seek(termState.docStartFP); - } - } else { - level1LastDocID = -1; - level1DocEndFP = termState.docStartFP; - } - level1DocCountUpto = 0; level1BlockPosUpto = 0; - docBufferUpto = BLOCK_SIZE; posBufferUpto = BLOCK_SIZE; - level0SerializedImpacts.growNoCopy(maxImpactNumBytesAtLevel0); - level1SerializedImpacts.growNoCopy(maxImpactNumBytesAtLevel1); - 
level0Impacts = new MutableImpactList(maxNumImpactsAtLevel0); - level1Impacts = new MutableImpactList(maxNumImpactsAtLevel1); } @Override - public int freq() throws IOException { + public int freq() { return freq; } - @Override - public int docID() { - return doc; - } - private void refillDocs() throws IOException { final int left = docFreq - docCountUpto; assert left >= 0; @@ -1755,7 +1550,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { level1DocCountUpto += LEVEL1_NUM_DOCS; if (docFreq - docCountUpto < LEVEL1_NUM_DOCS) { - level1LastDocID = DocIdSetIterator.NO_MORE_DOCS; + level1LastDocID = NO_MORE_DOCS; break; } @@ -1765,8 +1560,8 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { long skip1EndFP = docIn.readShort() + docIn.getFilePointer(); int numImpactBytes = docIn.readShort(); if (level1LastDocID >= target) { - docIn.readBytes(level1SerializedImpacts.bytes(), 0, numImpactBytes); - level1SerializedImpacts.setLength(numImpactBytes); + docIn.readBytes(level1SerializedImpacts.bytes, 0, numImpactBytes); + level1SerializedImpacts.length = numImpactBytes; } else { docIn.skipBytes(numImpactBytes); } @@ -1794,9 +1589,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { posPendingCount = level0BlockPosUpto; posBufferUpto = BLOCK_SIZE; } else { - for (int i = docBufferUpto; i < BLOCK_SIZE; ++i) { - posPendingCount += freqBuffer[i]; - } + posPendingCount += sumOverRange(freqBuffer, docBufferUpto, BLOCK_SIZE); } if (docFreq - docCountUpto >= BLOCK_SIZE) { @@ -1809,8 +1602,8 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { if (target <= level0LastDocID) { int numImpactBytes = docIn.readVInt(); - docIn.readBytes(level0SerializedImpacts.bytes(), 0, numImpactBytes); - level0SerializedImpacts.setLength(numImpactBytes); + docIn.readBytes(level0SerializedImpacts.bytes, 0, numImpactBytes); + level0SerializedImpacts.length = numImpactBytes; level0PosEndFP += docIn.readVLong(); level0BlockPosUpto = docIn.readByte(); if (indexHasOffsetsOrPayloads) { @@ -1826,7 +1619,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { docIn.seek(level0DocEndFP); docCountUpto += BLOCK_SIZE; } else { - level0LastDocID = DocIdSetIterator.NO_MORE_DOCS; + level0LastDocID = NO_MORE_DOCS; break; } } @@ -1849,71 +1642,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { } } - @Override - public Impacts getImpacts() throws IOException { - return new Impacts() { - - @Override - public int numLevels() { - int numLevels = 0; - if (level0LastDocID != NO_MORE_DOCS) { - numLevels++; - } - if (level1LastDocID != NO_MORE_DOCS) { - numLevels++; - } - if (numLevels == 0) { - numLevels++; - } - return numLevels; - } - - @Override - public int getDocIdUpTo(int level) { - if (level0LastDocID != NO_MORE_DOCS) { - if (level == 0) { - return level0LastDocID; - } - level--; - } - - if (level1LastDocID != NO_MORE_DOCS) { - if (level == 0) { - return level1LastDocID; - } - level--; - } - - return NO_MORE_DOCS; - } - - @Override - public List getImpacts(int level) { - if (level0LastDocID != NO_MORE_DOCS) { - if (level == 0) { - level0SerializedImpactsIn.reset( - level0SerializedImpacts.bytes(), 0, level0SerializedImpacts.length()); - readImpacts(level0SerializedImpactsIn, level0Impacts); - return level0Impacts; - } - level--; - } - - if (level1LastDocID != NO_MORE_DOCS) { - if (level == 0) { - level1SerializedImpactsIn.reset( - level1SerializedImpacts.bytes(), 0, level1SerializedImpacts.length()); - 
readImpacts(level1SerializedImpactsIn, level1Impacts); - return level1Impacts; - } - level--; - } - - return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); - } - }; - } - @Override public int nextDoc() throws IOException { advanceShallow(doc + 1); @@ -1939,9 +1667,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { } int next = findFirstGreater(docBuffer, target, docBufferUpto); - for (int i = docBufferUpto; i <= next; ++i) { - posPendingCount += freqBuffer[i]; - } + posPendingCount += sumOverRange(freqBuffer, docBufferUpto, next + 1); freq = (int) freqBuffer[next]; docBufferUpto = next + 1; position = 0; @@ -1962,7 +1688,7 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { toSkip -= leftInBlock; while (toSkip >= BLOCK_SIZE) { assert posIn.getFilePointer() != lastPosBlockFP; - pforUtil.skip(posIn); + PForUtil.skip(posIn); toSkip -= BLOCK_SIZE; } refillPositions(); @@ -2021,26 +1747,6 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { posPendingCount--; return position; } - - @Override - public int startOffset() { - return -1; - } - - @Override - public int endOffset() { - return -1; - } - - @Override - public BytesRef getPayload() { - return null; - } - - @Override - public long cost() { - return docFreq; - } } /** @@ -2067,7 +1773,8 @@ public final class Lucene912PostingsReader extends PostingsReaderBase { } } - private void prefetchPostings(IndexInput docIn, IntBlockTermState state) throws IOException { + private static void prefetchPostings(IndexInput docIn, IntBlockTermState state) + throws IOException { assert state.docFreq > 1; // Singletons are inlined in the terms dict, nothing to prefetch if (docIn.getFilePointer() != state.docStartFP) { // Don't prefetch if the input is already positioned at the right offset, which suggests that diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/Lucene912PostingsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene912/Lucene912PostingsWriter.java index 3d493622c05..df34510de07 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/Lucene912PostingsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene912/Lucene912PostingsWriter.java @@ -342,7 +342,7 @@ public class Lucene912PostingsWriter extends PushPostingsWriterBase { } @Override - public void finishDoc() throws IOException { + public void finishDoc() { docBufferUpto++; docCount++; @@ -443,7 +443,6 @@ public class Lucene912PostingsWriter extends PushPostingsWriterBase { private void writeLevel1SkipData() throws IOException { docOut.writeVInt(docID - level1LastDocID); - long numImpactBytes = scratchOutput.size(); final long level1End; if (writeFreqs) { List impacts = level1CompetitiveFreqNormAccumulator.getCompetitiveFreqNormPairs(); @@ -451,7 +450,7 @@ public class Lucene912PostingsWriter extends PushPostingsWriterBase { maxNumImpactsAtLevel1 = impacts.size(); } writeImpacts(impacts, scratchOutput); - numImpactBytes = scratchOutput.size(); + long numImpactBytes = scratchOutput.size(); if (numImpactBytes > maxImpactNumBytesAtLevel1) { maxImpactNumBytesAtLevel1 = Math.toIntExact(numImpactBytes); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/PForUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene912/PForUtil.java index 798101b6531..3857eabbe44 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/PForUtil.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene912/PForUtil.java @@ -121,7 
+121,7 @@ final class PForUtil { } /** Skip 128 integers. */ - void skip(DataInput in) throws IOException { + static void skip(DataInput in) throws IOException { final int token = Byte.toUnsignedInt(in.readByte()); final int bitsPerValue = token & 0x1f; final int numExceptions = token >>> 5; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/PostingsUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene912/PostingsUtil.java index 4834dd73e22..1ae808d308f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/PostingsUtil.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene912/PostingsUtil.java @@ -19,6 +19,7 @@ package org.apache.lucene.codecs.lucene912; import java.io.IOException; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.GroupVIntUtil; /** Utility class to encode/decode postings block. */ final class PostingsUtil { @@ -35,7 +36,7 @@ final class PostingsUtil { boolean indexHasFreq, boolean decodeFreq) throws IOException { - docIn.readGroupVInts(docBuffer, num); + GroupVIntUtil.readGroupVInts(docIn, docBuffer, num); if (indexHasFreq && decodeFreq) { for (int i = 0; i < num; ++i) { freqBuffer[i] = docBuffer[i] & 0x01; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForDeltaUtil.py b/lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForDeltaUtil.py index b3bf493c86b..56c402372a6 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForDeltaUtil.py +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForDeltaUtil.py @@ -308,11 +308,6 @@ public final class ForDeltaUtil { } } - void skip(IndexInput in) throws IOException { - final int bitsPerValue = Byte.toUnsignedInt(in.readByte()); - in.skipBytes(numBytes(bitsPerValue)); - } - """ def primitive_size_for_bpv(bpv): diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene912/package-info.java index 853f86a855a..b9ddb1227b1 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene912/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene912/package-info.java @@ -15,417 +15,5 @@ * limitations under the License. */ -/** - * Lucene 9.12 file format. - * - *

- * Apache Lucene - Index File Formats
- *
- * Introduction
- *
- * This document defines the index file formats used in this version of Lucene. If you are using
- * a different version of Lucene, please consult the copy of docs/ that was distributed
- * with the version you are using.
- *
- * This document attempts to provide a high-level definition of the Apache Lucene file formats.
- *
- * Definitions
- *
- * The fundamental concepts in Lucene are index, document, field and term.
- *
- * An index contains a sequence of documents.
- *
- *   • A document is a sequence of fields.
- *   • A field is a named sequence of terms.
- *   • A term is a sequence of bytes.
- * The same sequence of bytes in two different fields is considered a different term. Thus terms
- * are represented as a pair: the string naming the field, and the bytes within the field.
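To make the field/bytes pairing concrete, here is a minimal sketch against Lucene's public Term API (the field names and values are invented for illustration): the same bytes under two different field names compare as two distinct terms.

    import org.apache.lucene.index.Term;
    import org.apache.lucene.util.BytesRef;

    public class TermPairDemo {
      public static void main(String[] args) {
        // A term is the pair (field name, bytes); identical bytes under
        // different field names are distinct terms.
        Term inTitle = new Term("title", new BytesRef("lucene"));
        Term inBody = new Term("body", new BytesRef("lucene"));
        System.out.println(inTitle.equals(inBody)); // false: fields differ
      }
    }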

- * Inverted Indexing
- *
- * Lucene's index stores terms and statistics about those terms in order to make term-based
- * search more efficient. Lucene's terms index falls into the family of indexes known as an
- * inverted index. This is because it can list, for a term, the documents that contain it.
- * This is the inverse of the natural relationship, in which documents list terms.
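As a rough illustration of the inversion, in plain Java and independent of any Lucene API: the forward view maps each document to its terms, while the inverted view maps each term to the documents that contain it.

    import java.util.ArrayList;
    import java.util.List;
    import java.util.Map;
    import java.util.TreeMap;

    public class InvertedIndexSketch {
      public static void main(String[] args) {
        // Forward relationship: each document lists its terms.
        // TreeMap keeps doc ids in ascending order so postings stay sorted.
        Map<Integer, List<String>> docs =
            new TreeMap<>(Map.of(0, List.of("fast", "search"), 1, List.of("fast", "index")));

        // Inverted relationship: each term lists the documents containing it.
        Map<String, List<Integer>> inverted = new TreeMap<>();
        docs.forEach((docId, terms) ->
            terms.forEach(t -> inverted.computeIfAbsent(t, k -> new ArrayList<>()).add(docId)));

        System.out.println(inverted); // {fast=[0, 1], index=[1], search=[0]}
      }
    }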

- * Types of Fields
- *
- * In Lucene, fields may be stored, in which case their text is stored in the index
- * literally, in a non-inverted manner. Fields that are inverted are called indexed. A field
- * may be both stored and indexed.
- *
- * The text of a field may be tokenized into terms to be indexed, or the text of a field
- * may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is
- * useful for certain identifier fields to be indexed literally.
- *
- * See the {@link org.apache.lucene.document.Field Field} java docs for more information on
- * Fields.
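A short sketch of the distinction using the Field family (the field names and values here are made up): TextField is tokenized into terms, StringField is indexed literally as a single term, and StoredField is stored but not inverted at all.

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.StoredField;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.document.TextField;

    public class FieldKindsSketch {
      public static void main(String[] args) {
        Document doc = new Document();
        // Tokenized into terms and inverted (indexed); also stored here.
        doc.add(new TextField("body", "Lucene is a search library", Field.Store.YES));
        // Indexed literally as one term; useful for identifier fields.
        doc.add(new StringField("id", "doc-42", Field.Store.YES));
        // Stored only: returned with hits, but not searchable.
        doc.add(new StoredField("sourceUrl", "https://example.invalid/doc-42"));
        System.out.println(doc.getFields().size()); // 3
      }
    }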

- * Segments
- *
- * Lucene indexes may be composed of multiple sub-indexes, or segments. Each segment is a
- * fully independent index, which could be searched separately. Indexes evolve by the two
- * operations below (see the sketch after this list):
- *
- *   1. Creating new segments for newly added documents.
- *   2. Merging existing segments.
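A minimal sketch of those two operations through IndexWriter (the directory path and analyzer choice are illustrative only): adding documents produces new segments as buffers flush, and forceMerge rewrites existing segments into fewer ones.

    import java.nio.file.Path;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.FSDirectory;

    public class SegmentLifecycleSketch {
      public static void main(String[] args) throws Exception {
        try (FSDirectory dir = FSDirectory.open(Path.of("/tmp/idx"));
            IndexWriter writer =
                new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
          Document doc = new Document();
          doc.add(new TextField("body", "hello segments", Field.Store.NO));
          writer.addDocument(doc); // 1. new documents end up in new segments
          writer.commit();
          writer.forceMerge(1); // 2. existing segments can be merged down
        }
      }
    }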
- * - *

- * Searches may involve multiple segments and/or multiple indexes, each index potentially
- * composed of a set of segments.
- * Document Numbers
- *
- * Internally, Lucene refers to documents by an integer document number. The first
- * document added to an index is numbered zero, and each subsequent document added gets a number one
- * greater than the previous.
- *
- * Note that a document's number may change, so caution should be taken when storing these
- * numbers outside of Lucene. In particular, numbers may change in the following situations
- * (a small sketch follows this list):
- *
- *   • The numbers stored in each segment are unique only within the segment, and must be
- *     converted before they can be used in a larger context. The standard technique is to
- *     allocate each segment a range of values, based on the range of numbers used in that
- *     segment. To convert a document number from a segment to an external value, the segment's
- *     base document number is added. To convert an external value back to a
- *     segment-specific value, the segment is identified by the range that the external value is
- *     in, and the segment's base value is subtracted. For example two five document segments
- *     might be combined, so that the first segment has a base value of zero, and the second of
- *     five. Document three from the second segment would have an external value of eight.
- *   • When documents are deleted, gaps are created in the numbering. These are eventually
- *     removed as the index evolves through merging. Deleted documents are dropped when segments
- *     are merged. A freshly-merged segment thus has no gaps in its numbering.
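The base-value bookkeeping in the first bullet can be sketched in a few lines of plain Java (the segment layout is invented for illustration; Lucene's own mapping lives in helpers such as LeafReaderContext.docBase):

    public class DocBaseSketch {
      public static void main(String[] args) {
        // Two five-document segments, as in the example above:
        // segment 0 has base 0, segment 1 has base 5.
        int[] bases = {0, 5};

        // Segment-local doc 3 in segment 1 -> external doc 8.
        int external = bases[1] + 3;
        System.out.println(external); // 8

        // Reverse: find the segment whose range contains the external id,
        // then subtract its base.
        int seg = 0;
        while (seg + 1 < bases.length && external >= bases[seg + 1]) seg++;
        System.out.println(seg + " " + (external - bases[seg])); // 1 3
      }
    }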

- * - *
- * - *

- * Index Structure Overview
- *
- * Each segment index maintains the following:
- *
- *   • {@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment info}. This
- *     contains metadata about a segment, such as the number of documents, what files it uses, and
- *     information about how the segment is sorted
- *   • {@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This
- *     contains metadata about the set of named fields used in the index.
- *   • {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}.
- *     This contains, for each document, a list of attribute-value pairs, where the attributes are
- *     field names. These are used to store auxiliary information about the document, such as its
- *     title, url, or an identifier to access a database. The set of stored fields are what is
- *     returned for each hit when searching. This is keyed by document number.
- *   • {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term dictionary}. A
- *     dictionary containing all of the terms used in all of the indexed fields of all of the
- *     documents. The dictionary also contains the number of documents which contain the term, and
- *     pointers to the term's frequency and proximity data.
- *   • {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Frequency data}. For
- *     each term in the dictionary, the numbers of all the documents that contain that term, and
- *     the frequency of the term in that document, unless frequencies are omitted ({@link
- *     org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
- *   • {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Proximity data}. For
- *     each term in the dictionary, the positions that the term occurs in each document. Note that
- *     this will not exist if all fields in all documents omit position data.
- *   • {@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For
- *     each field in each document, a value is stored that is multiplied into the score for hits
- *     on that field.
- *   • {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each
- *     field in each document, the term vector (sometimes called document vector) may be stored. A
- *     term vector consists of term text and term frequency. To add Term Vectors to your index see
- *     the {@link org.apache.lucene.document.Field Field} constructors
- *   • {@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like
- *     stored values, these are also keyed by document number, but are generally intended to be
- *     loaded into main memory for fast access. Whereas stored values are generally intended for
- *     summary results from searches, per-document values are useful for things like scoring
- *     factors.
- *   • {@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An
- *     optional file indicating which documents are live.
- *   • {@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair
- *     of files, recording dimensionally indexed fields, to enable fast numeric range filtering
- *     and large numeric values like BigInteger and BigDecimal (1D) and geographic shape
- *     intersection (2D, 3D).
- *   • {@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}. The
- *     vector format stores numeric vectors in a format optimized for random access and
- *     computation, supporting high-dimensional nearest-neighbor search.
- *
- * Details on each of these are provided in their linked pages.

- * - *

- * File Naming
- *
- * All files belonging to a segment have the same name with varying extensions. The extensions
- * correspond to the different file formats described below. When using the Compound File format
- * (default for small segments) these files (except for the Segment info file, the Lock file, and
- * Deleted documents file) are collapsed into a single .cfs file (see below for details)
- *
- * Typically, all segments in an index are stored in a single directory, although this is not
- * required.
- *
- * File names are never re-used. That is, when any file is saved to the Directory it is given a
- * never before used filename. This is achieved using a simple generations approach. For example,
- * the first segments file is segments_1, then segments_2, etc. The generation is a sequential long
- * integer represented in alpha-numeric (base 36) form.
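The base-36 generation scheme can be reproduced with standard-library calls; this is a sketch, not the helper Lucene itself uses for file naming:

    public class GenerationNameSketch {
      public static void main(String[] args) {
        // Generations are sequential longs rendered in base 36,
        // e.g. segments_1 ... segments_9, segments_a, segments_b, ...
        for (long gen : new long[] {1, 9, 10, 35, 36}) {
          // Character.MAX_RADIX is 36.
          System.out.println("segments_" + Long.toString(gen, Character.MAX_RADIX));
        }
        // Parsing goes the other way:
        System.out.println(Long.parseLong("a", Character.MAX_RADIX)); // 10
      }
    }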

- * - *

- * Summary of File Extensions
- *
- * The following table summarizes the names and extensions of the files in Lucene
- * (lucene filenames by extension):
- *
- * Name | Extension | Brief Description
- * {@link org.apache.lucene.index.SegmentInfos Segments File} | segments_N | Stores information about a commit point
- * Lock File | write.lock | The Write lock prevents multiple IndexWriters from writing to the same file.
- * {@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment Info} | .si | Stores metadata about a segment
- * {@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File} | .cfs, .cfe | An optional "virtual" file consisting of all the other index files for systems that frequently run out of file handles.
- * {@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields} | .fnm | Stores information about the fields
- * {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index} | .fdx | Contains pointers to field data
- * {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data} | .fdt | The stored fields for documents
- * {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Dictionary} | .tim | The term dictionary, stores term info
- * {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Term Index} | .tip | The index into the Term Dictionary
- * {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Frequencies} | .doc | Contains the list of docs which contain each term along with frequency
- * {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Positions} | .pos | Stores position information about where a term occurs in the index
- * {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat Payloads} | .pay | Stores additional per-position metadata information such as character offsets and user payloads
- * {@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms} | .nvd, .nvm | Encodes length and boost factors for docs and fields
- * {@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values} | .dvd, .dvm | Encodes additional scoring factors or other per-document information.
- * {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index} | .tvx | Stores offset into the document data file
- * {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data} | .tvd | Contains term vector data.
- * {@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents} | .liv | Info about what documents are live
- * {@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values} | .dii, .dim | Holds indexed points
- * {@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values} | .vec, .vem, .veq, vex | Holds indexed vectors; .vec files contain the raw vector data, .vem the vector metadata, .veq the quantized vector data, and .vex the hnsw graph data.
- * - *

- * - *

- * Lock File
- *
- * The write lock, which is stored in the index directory by default, is named "write.lock". If the
- * lock directory is different from the index directory then the write lock will be named
- * "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index
- * directory. When this file is present, a writer is currently modifying the index (adding or
- * removing documents). This lock file ensures that only one writer is modifying the index at a
- * time.

- * History
- *
- * Compatibility notes are provided in this document, describing how file formats have changed
- * from prior versions:

    - *
  • In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit - * lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching - * or adding/deleting of docs. When the new segments file is saved (committed), it will be - * written in the new file format (meaning no specific "upgrade" process is needed). But note - * that once a commit has occurred, pre-2.1 Lucene will not be able to read the index. - *
  • In version 2.3, the file format was changed to allow segments to share a single set of doc - * store (vectors & stored fields) files. This allows for faster indexing in certain - * cases. The change is fully backwards compatible (in the same way as the lock-less commits - * change in 2.1). - *
  • In version 2.4, Strings are now written as true UTF-8 byte sequence, not Java's modified - * UTF-8. See LUCENE-510 for - * details. - *
  • In version 2.9, an optional opaque Map<String,String> CommitUserData may be passed to - * IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N - * file. See LUCENE-1382 for - * details. Also, diagnostics were added to each segment written recording details about why - * it was written (due to flush, merge; which OS/JRE was used; etc.). See issue LUCENE-1654 for details. - *
  • In version 3.0, compressed fields are no longer written to the index (they can still be - * read, but on merge the new segment will write them, uncompressed). See issue LUCENE-1960 for details. - *
  • In version 3.1, segments records the code version that created them. See LUCENE-2720 for details. - * Additionally segments track explicitly whether or not they have term vectors. See LUCENE-2811 for details. - *
  • In version 3.2, numeric fields are written as natively to stored fields file, previously - * they were stored in text format only. - *
  • In version 3.4, fields can omit position data while still indexing term frequencies. - *
  • In version 4.0, the format of the inverted index became extensible via the {@link - * org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues}) - * was introduced. Normalization factors need no longer be a single byte, they can be any - * {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be - * unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into - * the postings lists. Payloads can be stored in the term vectors. - *
  • In version 4.1, the format of the postings list changed to use either of FOR compression or - * variable-byte encoding, depending upon the frequency of the term. Terms appearing only once - * were changed to inline directly into the term dictionary. Stored fields are compressed by - * default. - *
  • In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued - * type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields. - *
  • In version 4.5, DocValues were extended to explicitly represent missing values. - *
  • In version 4.6, FieldInfos were extended to support per-field DocValues generation, to - * allow updating NumericDocValues fields. - *
  • In version 4.8, checksum footers were added to the end of each index file for improved data - * integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32 - * checksum of the file. - *
  • In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is - * suitable for faceting/sorting/analytics. - *
  • In version 5.4, DocValues have been improved to store more information on disk: addresses - * for binary fields and ord indexes for multi-valued fields. - *
  • In version 6.0, Points were added, for multi-dimensional range/distance search. - *
  • In version 6.2, new Segment info format that reads/writes the index sort, to support index - * sorting. - *
  • In version 7.0, DocValues have been improved to better support sparse doc values thanks to - * an iterator API. - *
  • In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term - * freq, normalization factor) pairs that may trigger the maximum score of the block. This - * information is recorded alongside skip data in order to be able to skip blocks of doc ids - * if they may not produce high enough scores. Additionally doc values and norms has been - * extended with jump-tables to make access O(1) instead of O(n), where n is the number of - * elements to skip when advancing in the data. - *
  • In version 8.4, postings, positions, offsets and payload lengths have move to a more - * performant encoding that is vectorized. - *
  • In version 8.6, index sort serialization is delegated to the sorts themselves, to allow - * user-defined sorts to be used - *
  • In version 8.7, stored fields compression became adaptive to better handle documents with - * smaller stored fields. - *
  • In version 9.0, vector-valued fields were added. - *
  • In version 9.1, vector-valued fields were modified to add a graph hierarchy. - *
  • In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by - * IndexDISI. ordToDoc mappings was added to .vem. - *
  • In version 9.5, HNSW graph connections were changed to be delta-encoded with vints. - * Additionally, metadata file size improvements were made by delta-encoding nodes by graph - * layer and not writing the node ids for the zeroth layer. - *
  • In version 9.9, Vector scalar quantization support was added. Allowing the HNSW vector - * format to utilize int8 quantized vectors for float32 vector search. - *
- *   • In version 9.12, skip data was refactored to have only two levels: every 128 docs and every
- *     4,096 docs, and to be inlined in postings lists. This resulted in a speedup for queries that
- *     need skipping, especially conjunctions. (A sketch of this two-level layout follows below.)
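A simplified sketch of that two-level layout; the constants mirror BLOCK_SIZE and LEVEL1_NUM_DOCS in the reader code above, but the arithmetic is illustrative and assumes idealized dense doc ids, not the actual Lucene912PostingsReader logic:

    public class TwoLevelSkipSketch {
      static final int BLOCK_SIZE = 128;       // level 0: one entry per 128 docs
      static final int LEVEL1_NUM_DOCS = 4096; // level 1: one entry per 4,096 docs

      // Upper bound on skip entries consulted to shallow-advance from the
      // start of a posting list to `target`, under the dense-ids assumption.
      static int entriesTouched(int target) {
        int level1Steps = target / LEVEL1_NUM_DOCS;                 // coarse jumps
        int level0Steps = (target % LEVEL1_NUM_DOCS) / BLOCK_SIZE;  // fine jumps
        return level1Steps + level0Steps;
      }

      public static void main(String[] args) {
        System.out.println(entriesTouched(10_000)); // 2 level-1 + 14 level-0 = 16
      }
    }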
- * - * - * - *

- * Limitations
- *
- * Lucene uses a Java int to refer to document numbers, and the index file format
- * uses an Int32 on-disk to store document numbers. This is a limitation of both the
- * index file format and the current implementation. Eventually these should be replaced with either
- * UInt64 values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
- * VInt} values which have no limit.
Lucene uses a Java int to refer to document numbers, and the index file format - * uses an Int32 on-disk to store document numbers. This is a limitation of both the - * index file format and the current implementation. Eventually these should be replaced with either - * UInt64 values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt - * VInt} values which have no limit.

- */ +/** Lucene 9.12 file format. */ package org.apache.lucene.codecs.lucene912; diff --git a/lucene/core/src/java/org/apache/lucene/util/quantization/RandomAccessQuantizedByteVectorValues.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/HasIndexSlice.java similarity index 57% rename from lucene/core/src/java/org/apache/lucene/util/quantization/RandomAccessQuantizedByteVectorValues.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene95/HasIndexSlice.java index b86009a690e..2bfe72386a0 100644 --- a/lucene/core/src/java/org/apache/lucene/util/quantization/RandomAccessQuantizedByteVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/HasIndexSlice.java @@ -14,23 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.util.quantization; +package org.apache.lucene.codecs.lucene95; -import java.io.IOException; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; +import org.apache.lucene.store.IndexInput; /** - * Random access values for byte[], but also includes accessing the score correction - * constant for the current vector in the buffer. - * - * @lucene.experimental + * Implementors can return the IndexInput from which their values are read. For use by vector + * quantizers. */ -public interface RandomAccessQuantizedByteVectorValues extends RandomAccessVectorValues.Bytes { +public interface HasIndexSlice { - ScalarQuantizer getScalarQuantizer(); - - float getScoreCorrectionConstant(int vectorOrd) throws IOException; - - @Override - RandomAccessQuantizedByteVectorValues copy() throws IOException; + /** Returns an IndexInput from which to read this instance's values. */ + IndexInput getSlice(); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapByteVectorValues.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapByteVectorValues.java index f45158eadac..1e78c8ea7aa 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapByteVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapByteVectorValues.java @@ -29,13 +29,11 @@ import org.apache.lucene.search.VectorScorer; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.RandomAccessInput; import org.apache.lucene.util.Bits; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.packed.DirectMonotonicReader; /** Read the vector values from the index input. This supports both iterated and random access. */ -public abstract class OffHeapByteVectorValues extends ByteVectorValues - implements RandomAccessVectorValues.Bytes { +public abstract class OffHeapByteVectorValues extends ByteVectorValues implements HasIndexSlice { protected final int dimension; protected final int size; @@ -132,9 +130,6 @@ public abstract class OffHeapByteVectorValues extends ByteVectorValues * vector. 
*/ public static class DenseOffHeapVectorValues extends OffHeapByteVectorValues { - - private int doc = -1; - public DenseOffHeapVectorValues( int dimension, int size, @@ -145,36 +140,17 @@ public abstract class OffHeapByteVectorValues extends ByteVectorValues super(dimension, size, slice, byteSize, flatVectorsScorer, vectorSimilarityFunction); } - @Override - public byte[] vectorValue() throws IOException { - return vectorValue(doc); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(doc + 1); - } - - @Override - public int advance(int target) throws IOException { - assert docID() < target; - if (target >= size) { - return doc = NO_MORE_DOCS; - } - return doc = target; - } - @Override public DenseOffHeapVectorValues copy() throws IOException { return new DenseOffHeapVectorValues( dimension, size, slice.clone(), byteSize, flatVectorsScorer, similarityFunction); } + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } + @Override public Bits getAcceptOrds(Bits acceptDocs) { return acceptDocs; @@ -183,17 +159,18 @@ public abstract class OffHeapByteVectorValues extends ByteVectorValues @Override public VectorScorer scorer(byte[] query) throws IOException { DenseOffHeapVectorValues copy = copy(); + DocIndexIterator iterator = copy.iterator(); RandomVectorScorer scorer = flatVectorsScorer.getRandomVectorScorer(similarityFunction, copy, query); return new VectorScorer() { @Override public float score() throws IOException { - return scorer.score(copy.doc); + return scorer.score(iterator.docID()); } @Override public DocIdSetIterator iterator() { - return copy; + return iterator; } }; } @@ -238,27 +215,6 @@ public abstract class OffHeapByteVectorValues extends ByteVectorValues configuration.size); } - @Override - public byte[] vectorValue() throws IOException { - return vectorValue(disi.index()); - } - - @Override - public int docID() { - return disi.docID(); - } - - @Override - public int nextDoc() throws IOException { - return disi.nextDoc(); - } - - @Override - public int advance(int target) throws IOException { - assert docID() < target; - return disi.advance(target); - } - @Override public SparseOffHeapVectorValues copy() throws IOException { return new SparseOffHeapVectorValues( @@ -276,6 +232,11 @@ public abstract class OffHeapByteVectorValues extends ByteVectorValues return (int) ordToDoc.get(ord); } + @Override + public DocIndexIterator iterator() { + return IndexedDISI.asDocIndexIterator(disi); + } + @Override public Bits getAcceptOrds(Bits acceptDocs) { if (acceptDocs == null) { @@ -307,7 +268,7 @@ public abstract class OffHeapByteVectorValues extends ByteVectorValues @Override public DocIdSetIterator iterator() { - return copy; + return copy.disi; } }; } @@ -322,8 +283,6 @@ public abstract class OffHeapByteVectorValues extends ByteVectorValues super(dimension, 0, null, 0, flatVectorsScorer, vectorSimilarityFunction); } - private int doc = -1; - @Override public int dimension() { return super.dimension(); @@ -335,23 +294,13 @@ public abstract class OffHeapByteVectorValues extends ByteVectorValues } @Override - public byte[] vectorValue() throws IOException { + public byte[] vectorValue(int ord) throws IOException { throw new UnsupportedOperationException(); } @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(doc + 1); - } - - @Override - public int advance(int target) throws IOException { - return 
doc = NO_MORE_DOCS; + public DocIndexIterator iterator() { + return createDenseIterator(); } @Override @@ -359,11 +308,6 @@ public abstract class OffHeapByteVectorValues extends ByteVectorValues throw new UnsupportedOperationException(); } - @Override - public byte[] vectorValue(int targetOrd) throws IOException { - throw new UnsupportedOperationException(); - } - @Override public int ordToDoc(int ord) { throw new UnsupportedOperationException(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java index 1f61283b500..2384657e93e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapFloatVectorValues.java @@ -28,13 +28,11 @@ import org.apache.lucene.search.VectorScorer; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.RandomAccessInput; import org.apache.lucene.util.Bits; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.packed.DirectMonotonicReader; /** Read the vector values from the index input. This supports both iterated and random access. */ -public abstract class OffHeapFloatVectorValues extends FloatVectorValues - implements RandomAccessVectorValues.Floats { +public abstract class OffHeapFloatVectorValues extends FloatVectorValues implements HasIndexSlice { protected final int dimension; protected final int size; @@ -128,8 +126,6 @@ public abstract class OffHeapFloatVectorValues extends FloatVectorValues */ public static class DenseOffHeapVectorValues extends OffHeapFloatVectorValues { - private int doc = -1; - public DenseOffHeapVectorValues( int dimension, int size, @@ -140,55 +136,42 @@ public abstract class OffHeapFloatVectorValues extends FloatVectorValues super(dimension, size, slice, byteSize, flatVectorsScorer, similarityFunction); } - @Override - public float[] vectorValue() throws IOException { - return vectorValue(doc); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(doc + 1); - } - - @Override - public int advance(int target) throws IOException { - assert docID() < target; - if (target >= size) { - return doc = NO_MORE_DOCS; - } - return doc = target; - } - @Override public DenseOffHeapVectorValues copy() throws IOException { return new DenseOffHeapVectorValues( dimension, size, slice.clone(), byteSize, flatVectorsScorer, similarityFunction); } + @Override + public int ordToDoc(int ord) { + return ord; + } + @Override public Bits getAcceptOrds(Bits acceptDocs) { return acceptDocs; } + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } + @Override public VectorScorer scorer(float[] query) throws IOException { DenseOffHeapVectorValues copy = copy(); + DocIndexIterator iterator = copy.iterator(); RandomVectorScorer randomVectorScorer = flatVectorsScorer.getRandomVectorScorer(similarityFunction, copy, query); return new VectorScorer() { @Override public float score() throws IOException { - return randomVectorScorer.score(copy.doc); + return randomVectorScorer.score(iterator.docID()); } @Override public DocIdSetIterator iterator() { - return copy; + return iterator; } }; } @@ -227,27 +210,6 @@ public abstract class OffHeapFloatVectorValues extends FloatVectorValues configuration.size); } - @Override 
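
A note on the scorer hunks in this file (the float and quantized variants below follow the same shape): the anonymous VectorScorer now captures a DocIndexIterator from the copied values and positions itself exclusively through it. The calling side looks roughly like this, where vectorValues and query are placeholders for any concrete implementation and a query vector of matching encoding:

  VectorScorer scorer = vectorValues.scorer(query);
  if (scorer != null) { // per the javadoc, scorer may be null
    DocIdSetIterator disi = scorer.iterator(); // the copy's iterator, not the values themselves
    for (int doc = disi.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi.nextDoc()) {
      float score = scorer.score(); // scores the vector of the doc the iterator is positioned on
    }
  }
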
- public float[] vectorValue() throws IOException { - return vectorValue(disi.index()); - } - - @Override - public int docID() { - return disi.docID(); - } - - @Override - public int nextDoc() throws IOException { - return disi.nextDoc(); - } - - @Override - public int advance(int target) throws IOException { - assert docID() < target; - return disi.advance(target); - } - @Override public SparseOffHeapVectorValues copy() throws IOException { return new SparseOffHeapVectorValues( @@ -283,20 +245,26 @@ public abstract class OffHeapFloatVectorValues extends FloatVectorValues }; } + @Override + public DocIndexIterator iterator() { + return IndexedDISI.asDocIndexIterator(disi); + } + @Override public VectorScorer scorer(float[] query) throws IOException { SparseOffHeapVectorValues copy = copy(); + DocIndexIterator iterator = copy.iterator(); RandomVectorScorer randomVectorScorer = flatVectorsScorer.getRandomVectorScorer(similarityFunction, copy, query); return new VectorScorer() { @Override public float score() throws IOException { - return randomVectorScorer.score(copy.disi.index()); + return randomVectorScorer.score(iterator.index()); } @Override public DocIdSetIterator iterator() { - return copy; + return iterator; } }; } @@ -311,8 +279,6 @@ public abstract class OffHeapFloatVectorValues extends FloatVectorValues super(dimension, 0, null, 0, flatVectorsScorer, similarityFunction); } - private int doc = -1; - @Override public int dimension() { return super.dimension(); @@ -323,26 +289,6 @@ public abstract class OffHeapFloatVectorValues extends FloatVectorValues return 0; } - @Override - public float[] vectorValue() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(doc + 1); - } - - @Override - public int advance(int target) { - return doc = NO_MORE_DOCS; - } - @Override public EmptyOffHeapVectorValues copy() { throw new UnsupportedOperationException(); @@ -354,8 +300,8 @@ public abstract class OffHeapFloatVectorValues extends FloatVectorValues } @Override - public int ordToDoc(int ord) { - throw new UnsupportedOperationException(); + public DocIndexIterator iterator() { + return createDenseIterator(); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java index 1af68618d83..b731e758b7a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java @@ -39,6 +39,7 @@ import org.apache.lucene.index.DocsWithFieldSet; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Sorter; @@ -361,11 +362,10 @@ public final class Lucene99FlatVectorsWriter extends FlatVectorsWriter { private static DocsWithFieldSet writeByteVectorData( IndexOutput output, ByteVectorValues byteVectorValues) throws IOException { DocsWithFieldSet docsWithField = new DocsWithFieldSet(); - for (int docV = byteVectorValues.nextDoc(); - docV != NO_MORE_DOCS; - docV = byteVectorValues.nextDoc()) { + KnnVectorValues.DocIndexIterator iter = byteVectorValues.iterator(); 
+ for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - byte[] binaryValue = byteVectorValues.vectorValue(); + byte[] binaryValue = byteVectorValues.vectorValue(iter.index()); assert binaryValue.length == byteVectorValues.dimension() * VectorEncoding.BYTE.byteSize; output.writeBytes(binaryValue, binaryValue.length); docsWithField.add(docV); @@ -382,11 +382,10 @@ public final class Lucene99FlatVectorsWriter extends FlatVectorsWriter { ByteBuffer buffer = ByteBuffer.allocate(floatVectorValues.dimension() * VectorEncoding.FLOAT32.byteSize) .order(ByteOrder.LITTLE_ENDIAN); - for (int docV = floatVectorValues.nextDoc(); - docV != NO_MORE_DOCS; - docV = floatVectorValues.nextDoc()) { + KnnVectorValues.DocIndexIterator iter = floatVectorValues.iterator(); + for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - float[] value = floatVectorValues.vectorValue(); + float[] value = floatVectorValues.vectorValue(iter.index()); buffer.asFloatBuffer().put(value); output.writeBytes(buffer.array(), buffer.limit()); docsWithField.add(docV); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java index dc0fb7184c7..0f4e8196d52 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java @@ -32,14 +32,16 @@ import org.apache.lucene.codecs.KnnVectorsWriter; import org.apache.lucene.codecs.hnsw.FlatFieldVectorsWriter; import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; import org.apache.lucene.codecs.hnsw.FlatVectorsWriter; +import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.DocsWithFieldSet; import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Sorter; import org.apache.lucene.index.VectorSimilarityFunction; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.TaskExecutor; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.IOUtils; @@ -54,7 +56,6 @@ import org.apache.lucene.util.hnsw.HnswGraphMerger; import org.apache.lucene.util.hnsw.IncrementalHnswGraphMerger; import org.apache.lucene.util.hnsw.NeighborArray; import org.apache.lucene.util.hnsw.OnHeapHnswGraph; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; import org.apache.lucene.util.packed.DirectMonotonicWriter; @@ -359,18 +360,18 @@ public final class Lucene99HnswVectorsWriter extends KnnVectorsWriter { mergeState.knnVectorsReaders[i], mergeState.docMaps[i], mergeState.liveDocs[i]); } } - DocIdSetIterator mergedVectorIterator = null; + KnnVectorValues mergedVectorValues = null; switch (fieldInfo.getVectorEncoding()) { case BYTE -> - mergedVectorIterator = + mergedVectorValues = KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState); case FLOAT32 -> - mergedVectorIterator = + mergedVectorValues = KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState); } graph = merger.merge( - mergedVectorIterator, + mergedVectorValues, 
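
The two write loops above distill to the write-side idiom this change introduces throughout: a single DocIndexIterator supplies both the docid (for the docs-with-field set) and the ordinal (for vector access). A minimal sketch, eliding the actual output encoding:

  KnnVectorValues.DocIndexIterator iter = byteVectorValues.iterator();
  for (int docV = iter.nextDoc(); docV != DocIdSetIterator.NO_MORE_DOCS; docV = iter.nextDoc()) {
    byte[] vector = byteVectorValues.vectorValue(iter.index()); // ordinal-based access
    // ... write the vector bytes to the output ...
    docsWithField.add(docV); // record the docid this ordinal maps to
  }
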
segmentWriteState.infoStream, scorerSupplier.totalVectorCount()); vectorIndexNodeOffsets = writeGraph(graph); @@ -582,13 +583,13 @@ public final class Lucene99HnswVectorsWriter extends KnnVectorsWriter { case BYTE -> scorer.getRandomVectorScorerSupplier( fieldInfo.getVectorSimilarityFunction(), - RandomAccessVectorValues.fromBytes( + ByteVectorValues.fromBytes( (List) flatFieldVectorsWriter.getVectors(), fieldInfo.getVectorDimension())); case FLOAT32 -> scorer.getRandomVectorScorerSupplier( fieldInfo.getVectorSimilarityFunction(), - RandomAccessVectorValues.fromFloats( + FloatVectorValues.fromFloats( (List) flatFieldVectorsWriter.getVectors(), fieldInfo.getVectorDimension())); }; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java index 8443017d3f9..a4770f01f46 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java @@ -21,12 +21,12 @@ import static org.apache.lucene.codecs.hnsw.ScalarQuantizedVectorScorer.quantize import java.io.IOException; import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.util.VectorUtil; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; -import org.apache.lucene.util.quantization.RandomAccessQuantizedByteVectorValues; +import org.apache.lucene.util.quantization.QuantizedByteVectorValues; import org.apache.lucene.util.quantization.ScalarQuantizer; /** @@ -45,9 +45,9 @@ public class Lucene99ScalarQuantizedVectorScorer implements FlatVectorsScorer { @Override public RandomVectorScorerSupplier getRandomVectorScorerSupplier( - VectorSimilarityFunction similarityFunction, RandomAccessVectorValues vectorValues) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues) throws IOException { - if (vectorValues instanceof RandomAccessQuantizedByteVectorValues quantizedByteVectorValues) { + if (vectorValues instanceof QuantizedByteVectorValues quantizedByteVectorValues) { return new ScalarQuantizedRandomVectorScorerSupplier( quantizedByteVectorValues, similarityFunction); } @@ -57,11 +57,9 @@ public class Lucene99ScalarQuantizedVectorScorer implements FlatVectorsScorer { @Override public RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityFunction, - RandomAccessVectorValues vectorValues, - float[] target) + VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, float[] target) throws IOException { - if (vectorValues instanceof RandomAccessQuantizedByteVectorValues quantizedByteVectorValues) { + if (vectorValues instanceof QuantizedByteVectorValues quantizedByteVectorValues) { ScalarQuantizer scalarQuantizer = quantizedByteVectorValues.getScalarQuantizer(); byte[] targetBytes = new byte[target.length]; float offsetCorrection = @@ -79,9 +77,7 @@ public class Lucene99ScalarQuantizedVectorScorer implements FlatVectorsScorer { @Override public RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityFunction, - RandomAccessVectorValues vectorValues, - byte[] target) + VectorSimilarityFunction similarityFunction, KnnVectorValues 
vectorValues, byte[] target) throws IOException { return nonQuantizedDelegate.getRandomVectorScorer(similarityFunction, vectorValues, target); } @@ -96,7 +92,7 @@ public class Lucene99ScalarQuantizedVectorScorer implements FlatVectorsScorer { float offsetCorrection, VectorSimilarityFunction sim, float constMultiplier, - RandomAccessQuantizedByteVectorValues values) { + QuantizedByteVectorValues values) { return switch (sim) { case EUCLIDEAN -> new Euclidean(values, constMultiplier, targetBytes); case COSINE, DOT_PRODUCT -> @@ -120,7 +116,7 @@ public class Lucene99ScalarQuantizedVectorScorer implements FlatVectorsScorer { byte[] targetBytes, float offsetCorrection, float constMultiplier, - RandomAccessQuantizedByteVectorValues values, + QuantizedByteVectorValues values, FloatToFloatFunction scoreAdjustmentFunction) { if (values.getScalarQuantizer().getBits() <= 4) { if (values.getVectorByteLength() != values.dimension() && values.getSlice() != null) { @@ -137,10 +133,9 @@ public class Lucene99ScalarQuantizedVectorScorer implements FlatVectorsScorer { private static class Euclidean extends RandomVectorScorer.AbstractRandomVectorScorer { private final float constMultiplier; private final byte[] targetBytes; - private final RandomAccessQuantizedByteVectorValues values; + private final QuantizedByteVectorValues values; - private Euclidean( - RandomAccessQuantizedByteVectorValues values, float constMultiplier, byte[] targetBytes) { + private Euclidean(QuantizedByteVectorValues values, float constMultiplier, byte[] targetBytes) { super(values); this.values = values; this.constMultiplier = constMultiplier; @@ -159,13 +154,13 @@ public class Lucene99ScalarQuantizedVectorScorer implements FlatVectorsScorer { /** Calculates dot product on quantized vectors, applying the appropriate corrections */ private static class DotProduct extends RandomVectorScorer.AbstractRandomVectorScorer { private final float constMultiplier; - private final RandomAccessQuantizedByteVectorValues values; + private final QuantizedByteVectorValues values; private final byte[] targetBytes; private final float offsetCorrection; private final FloatToFloatFunction scoreAdjustmentFunction; public DotProduct( - RandomAccessQuantizedByteVectorValues values, + QuantizedByteVectorValues values, float constMultiplier, byte[] targetBytes, float offsetCorrection, @@ -193,14 +188,14 @@ public class Lucene99ScalarQuantizedVectorScorer implements FlatVectorsScorer { private static class CompressedInt4DotProduct extends RandomVectorScorer.AbstractRandomVectorScorer { private final float constMultiplier; - private final RandomAccessQuantizedByteVectorValues values; + private final QuantizedByteVectorValues values; private final byte[] compressedVector; private final byte[] targetBytes; private final float offsetCorrection; private final FloatToFloatFunction scoreAdjustmentFunction; private CompressedInt4DotProduct( - RandomAccessQuantizedByteVectorValues values, + QuantizedByteVectorValues values, float constMultiplier, byte[] targetBytes, float offsetCorrection, @@ -231,13 +226,13 @@ public class Lucene99ScalarQuantizedVectorScorer implements FlatVectorsScorer { private static class Int4DotProduct extends RandomVectorScorer.AbstractRandomVectorScorer { private final float constMultiplier; - private final RandomAccessQuantizedByteVectorValues values; + private final QuantizedByteVectorValues values; private final byte[] targetBytes; private final float offsetCorrection; private final FloatToFloatFunction scoreAdjustmentFunction; public 
Int4DotProduct( - RandomAccessQuantizedByteVectorValues values, + QuantizedByteVectorValues values, float constMultiplier, byte[] targetBytes, float offsetCorrection, @@ -271,13 +266,12 @@ public class Lucene99ScalarQuantizedVectorScorer implements FlatVectorsScorer { implements RandomVectorScorerSupplier { private final VectorSimilarityFunction vectorSimilarityFunction; - private final RandomAccessQuantizedByteVectorValues values; - private final RandomAccessQuantizedByteVectorValues values1; - private final RandomAccessQuantizedByteVectorValues values2; + private final QuantizedByteVectorValues values; + private final QuantizedByteVectorValues values1; + private final QuantizedByteVectorValues values2; public ScalarQuantizedRandomVectorScorerSupplier( - RandomAccessQuantizedByteVectorValues values, - VectorSimilarityFunction vectorSimilarityFunction) + QuantizedByteVectorValues values, VectorSimilarityFunction vectorSimilarityFunction) throws IOException { this.values = values; this.values1 = values.copy(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java index 40002fe06a6..32eea942e2a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java @@ -135,7 +135,7 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade } final long quantizedVectorBytes; - if (fieldEntry.compress) { + if (fieldEntry.bits <= 4 && fieldEntry.compress) { // two dimensions -> one byte quantizedVectorBytes = ((dimension + 1) >> 1) + Float.BYTES; } else { @@ -402,10 +402,10 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade private static final class QuantizedVectorValues extends FloatVectorValues { private final FloatVectorValues rawVectorValues; - private final OffHeapQuantizedByteVectorValues quantizedVectorValues; + private final QuantizedByteVectorValues quantizedVectorValues; QuantizedVectorValues( - FloatVectorValues rawVectorValues, OffHeapQuantizedByteVectorValues quantizedVectorValues) { + FloatVectorValues rawVectorValues, QuantizedByteVectorValues quantizedVectorValues) { this.rawVectorValues = rawVectorValues; this.quantizedVectorValues = quantizedVectorValues; } @@ -421,34 +421,28 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade } @Override - public float[] vectorValue() throws IOException { - return rawVectorValues.vectorValue(); + public float[] vectorValue(int ord) throws IOException { + return rawVectorValues.vectorValue(ord); } @Override - public int docID() { - return rawVectorValues.docID(); + public int ordToDoc(int ord) { + return rawVectorValues.ordToDoc(ord); } @Override - public int nextDoc() throws IOException { - int rawDocId = rawVectorValues.nextDoc(); - int quantizedDocId = quantizedVectorValues.nextDoc(); - assert rawDocId == quantizedDocId; - return quantizedDocId; - } - - @Override - public int advance(int target) throws IOException { - int rawDocId = rawVectorValues.advance(target); - int quantizedDocId = quantizedVectorValues.advance(target); - assert rawDocId == quantizedDocId; - return quantizedDocId; + public QuantizedVectorValues copy() throws IOException { + return new QuantizedVectorValues(rawVectorValues.copy(), quantizedVectorValues.copy()); } @Override public 
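
One detail worth noting in the supplier above: values1 and values2 are independent copies of the same QuantizedByteVectorValues. Per the copy() contract introduced on KnnVectorValues (see the new file further down), implementations may reuse a single internal buffer per instance, so scoring one ordinal against another needs two readers. Roughly:

  byte[] a = values1.vectorValue(ordA); // fills the first copy's shared buffer
  byte[] b = values2.vectorValue(ordB); // a second copy, so 'a' is not overwritten
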
VectorScorer scorer(float[] query) throws IOException { return quantizedVectorValues.scorer(query); } + + @Override + public DocIndexIterator iterator() { + return rawVectorValues.iterator(); + } } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java index bb333ad45c2..1a30b5271cd 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java @@ -19,9 +19,7 @@ package org.apache.lucene.codecs.lucene99; import static org.apache.lucene.codecs.KnnVectorsWriter.MergedVectorValues.hasVectorValues; import static org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat.DIRECT_MONOTONIC_BLOCK_SHIFT; -import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.DYNAMIC_CONFIDENCE_INTERVAL; -import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.QUANTIZED_VECTOR_COMPONENT; -import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.calculateDefaultConfidenceInterval; +import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.*; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOfInstance; @@ -45,6 +43,7 @@ import org.apache.lucene.index.DocsWithFieldSet; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Sorter; @@ -653,12 +652,11 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite || bits <= 4 || shouldRecomputeQuantiles(mergedQuantiles, quantizationStates)) { int numVectors = 0; - FloatVectorValues vectorValues = - KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState); + DocIdSetIterator iter = + KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState) + .iterator(); // iterate vectorValues and increment numVectors - for (int doc = vectorValues.nextDoc(); - doc != DocIdSetIterator.NO_MORE_DOCS; - doc = vectorValues.nextDoc()) { + for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) { numVectors++; } return buildScalarQuantizer( @@ -730,11 +728,10 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite ? 
OffHeapQuantizedByteVectorValues.compressedArray( quantizedByteVectorValues.dimension(), bits) : null; - for (int docV = quantizedByteVectorValues.nextDoc(); - docV != NO_MORE_DOCS; - docV = quantizedByteVectorValues.nextDoc()) { + KnnVectorValues.DocIndexIterator iter = quantizedByteVectorValues.iterator(); + for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { // write vector - byte[] binaryValue = quantizedByteVectorValues.vectorValue(); + byte[] binaryValue = quantizedByteVectorValues.vectorValue(iter.index()); assert binaryValue.length == quantizedByteVectorValues.dimension() : "dim=" + quantizedByteVectorValues.dimension() + " len=" + binaryValue.length; if (compressedVector != null) { @@ -743,7 +740,8 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite } else { output.writeBytes(binaryValue, binaryValue.length); } - output.writeInt(Float.floatToIntBits(quantizedByteVectorValues.getScoreCorrectionConstant())); + output.writeInt( + Float.floatToIntBits(quantizedByteVectorValues.getScoreCorrectionConstant(iter.index()))); docsWithField.add(docV); } return docsWithField; @@ -855,7 +853,6 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite static class FloatVectorWrapper extends FloatVectorValues { private final List vectorList; - protected int curDoc = -1; FloatVectorWrapper(List vectorList) { this.vectorList = vectorList; @@ -872,51 +869,42 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite } @Override - public float[] vectorValue() throws IOException { - if (curDoc == -1 || curDoc >= vectorList.size()) { - throw new IOException("Current doc not set or too many iterations"); + public FloatVectorValues copy() throws IOException { + return this; + } + + @Override + public float[] vectorValue(int ord) throws IOException { + if (ord < 0 || ord >= vectorList.size()) { + throw new IOException("vector ord " + ord + " out of bounds"); } - return vectorList.get(curDoc); + return vectorList.get(ord); } @Override - public int docID() { - if (curDoc >= vectorList.size()) { - return NO_MORE_DOCS; - } - return curDoc; - } - - @Override - public int nextDoc() throws IOException { - curDoc++; - return docID(); - } - - @Override - public int advance(int target) throws IOException { - curDoc = target; - return docID(); - } - - @Override - public VectorScorer scorer(float[] target) { - throw new UnsupportedOperationException(); + public DocIndexIterator iterator() { + return createDenseIterator(); } } static class QuantizedByteVectorValueSub extends DocIDMerger.Sub { private final QuantizedByteVectorValues values; + private final KnnVectorValues.DocIndexIterator iterator; QuantizedByteVectorValueSub(MergeState.DocMap docMap, QuantizedByteVectorValues values) { super(docMap); this.values = values; - assert values.docID() == -1; + iterator = values.iterator(); + assert iterator.docID() == -1; } @Override public int nextDoc() throws IOException { - return values.nextDoc(); + return iterator.nextDoc(); + } + + public int index() { + return iterator.index(); } } @@ -973,7 +961,6 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite private final DocIDMerger docIdMerger; private final int size; - private int docId; private QuantizedByteVectorValueSub current; private MergedQuantizedVectorValues( @@ -985,33 +972,16 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite totalSize += sub.values.size(); } size = totalSize; - docId = 
-1; } @Override - public byte[] vectorValue() throws IOException { - return current.values.vectorValue(); + public byte[] vectorValue(int ord) throws IOException { + return current.values.vectorValue(current.index()); } @Override - public int docID() { - return docId; - } - - @Override - public int nextDoc() throws IOException { - current = docIdMerger.next(); - if (current == null) { - docId = NO_MORE_DOCS; - } else { - docId = current.mappedDocID; - } - return docId; - } - - @Override - public int advance(int target) { - throw new UnsupportedOperationException(); + public DocIndexIterator iterator() { + return new CompositeIterator(); } @Override @@ -1025,13 +995,51 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite } @Override - public float getScoreCorrectionConstant() throws IOException { - return current.values.getScoreCorrectionConstant(); + public float getScoreCorrectionConstant(int ord) throws IOException { + return current.values.getScoreCorrectionConstant(current.index()); } - @Override - public VectorScorer scorer(float[] target) throws IOException { - throw new UnsupportedOperationException(); + private class CompositeIterator extends DocIndexIterator { + private int docId; + private int ord; + + public CompositeIterator() { + docId = -1; + ord = -1; + } + + @Override + public int index() { + return ord; + } + + @Override + public int docID() { + return docId; + } + + @Override + public int nextDoc() throws IOException { + current = docIdMerger.next(); + if (current == null) { + docId = NO_MORE_DOCS; + ord = NO_MORE_DOCS; + } else { + docId = current.mappedDocID; + ++ord; + } + return docId; + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long cost() { + return size; + } } } @@ -1039,6 +1047,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite private final FloatVectorValues values; private final ScalarQuantizer quantizer; private final byte[] quantizedVector; + private int lastOrd = -1; private float offsetValue = 0f; private final VectorSimilarityFunction vectorSimilarityFunction; @@ -1054,7 +1063,14 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite } @Override - public float getScoreCorrectionConstant() { + public float getScoreCorrectionConstant(int ord) { + if (ord != lastOrd) { + throw new IllegalStateException( + "attempt to retrieve score correction for different ord " + + ord + + " than the quantization was done for: " + + lastOrd); + } return offsetValue; } @@ -1069,41 +1085,31 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite } @Override - public byte[] vectorValue() throws IOException { + public byte[] vectorValue(int ord) throws IOException { + if (ord != lastOrd) { + offsetValue = quantize(ord); + lastOrd = ord; + } return quantizedVector; } - @Override - public int docID() { - return values.docID(); - } - - @Override - public int nextDoc() throws IOException { - int doc = values.nextDoc(); - if (doc != NO_MORE_DOCS) { - quantize(); - } - return doc; - } - - @Override - public int advance(int target) throws IOException { - int doc = values.advance(target); - if (doc != NO_MORE_DOCS) { - quantize(); - } - return doc; - } - @Override public VectorScorer scorer(float[] target) throws IOException { throw new UnsupportedOperationException(); } - private void quantize() throws IOException { - offsetValue = - 
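
The lastOrd bookkeeping added here makes the on-the-fly quantizer stateful: vectorValue(ord) performs the quantization and records the ordinal, and getScoreCorrectionConstant(ord) is valid only for that same ordinal, throwing IllegalStateException otherwise. Callers must pair the calls in order, as writeQuantizedVectorData does above:

  byte[] quantized = quantizedByteVectorValues.vectorValue(ord); // quantizes, caches offset for ord
  float correction = quantizedByteVectorValues.getScoreCorrectionConstant(ord); // same ord, afterwards
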
quantizer.quantize(values.vectorValue(), quantizedVector, vectorSimilarityFunction); + private float quantize(int ord) throws IOException { + return quantizer.quantize(values.vectorValue(ord), quantizedVector, vectorSimilarityFunction); + } + + @Override + public int ordToDoc(int ord) { + return values.ordToDoc(ord); + } + + @Override + public DocIndexIterator iterator() { + return values.iterator(); } } @@ -1160,9 +1166,9 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite } @Override - public float getScoreCorrectionConstant() throws IOException { + public float getScoreCorrectionConstant(int ord) throws IOException { return scalarQuantizer.recalculateCorrectiveOffset( - in.vectorValue(), oldScalarQuantizer, vectorSimilarityFunction); + in.vectorValue(ord), oldScalarQuantizer, vectorSimilarityFunction); } @Override @@ -1176,35 +1182,24 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite } @Override - public byte[] vectorValue() throws IOException { - return in.vectorValue(); + public byte[] vectorValue(int ord) throws IOException { + return in.vectorValue(ord); } @Override - public int docID() { - return in.docID(); + public int ordToDoc(int ord) { + return in.ordToDoc(ord); } @Override - public int nextDoc() throws IOException { - return in.nextDoc(); - } - - @Override - public int advance(int target) throws IOException { - return in.advance(target); - } - - @Override - public VectorScorer scorer(float[] target) throws IOException { - throw new UnsupportedOperationException(); + public DocIndexIterator iterator() { + return in.iterator(); } } static final class NormalizedFloatVectorValues extends FloatVectorValues { private final FloatVectorValues values; private final float[] normalizedVector; - int curDoc = -1; public NormalizedFloatVectorValues(FloatVectorValues values) { this.values = values; @@ -1222,38 +1217,25 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite } @Override - public float[] vectorValue() throws IOException { + public int ordToDoc(int ord) { + return values.ordToDoc(ord); + } + + @Override + public float[] vectorValue(int ord) throws IOException { + System.arraycopy(values.vectorValue(ord), 0, normalizedVector, 0, normalizedVector.length); + VectorUtil.l2normalize(normalizedVector); return normalizedVector; } @Override - public VectorScorer scorer(float[] query) throws IOException { - throw new UnsupportedOperationException(); + public DocIndexIterator iterator() { + return values.iterator(); } @Override - public int docID() { - return values.docID(); - } - - @Override - public int nextDoc() throws IOException { - curDoc = values.nextDoc(); - if (curDoc != NO_MORE_DOCS) { - System.arraycopy(values.vectorValue(), 0, normalizedVector, 0, normalizedVector.length); - VectorUtil.l2normalize(normalizedVector); - } - return curDoc; - } - - @Override - public int advance(int target) throws IOException { - curDoc = values.advance(target); - if (curDoc != NO_MORE_DOCS) { - System.arraycopy(values.vectorValue(), 0, normalizedVector, 0, normalizedVector.length); - VectorUtil.l2normalize(normalizedVector); - } - return curDoc; + public NormalizedFloatVectorValues copy() throws IOException { + return new NormalizedFloatVectorValues(values.copy()); } } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/OffHeapQuantizedByteVectorValues.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/OffHeapQuantizedByteVectorValues.java index 655dcca1166..051c926a679 
100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/OffHeapQuantizedByteVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/OffHeapQuantizedByteVectorValues.java @@ -30,15 +30,13 @@ import org.apache.lucene.util.Bits; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.packed.DirectMonotonicReader; import org.apache.lucene.util.quantization.QuantizedByteVectorValues; -import org.apache.lucene.util.quantization.RandomAccessQuantizedByteVectorValues; import org.apache.lucene.util.quantization.ScalarQuantizer; /** * Read the quantized vector values and their score correction values from the index input. This * supports both iterated and random access. */ -public abstract class OffHeapQuantizedByteVectorValues extends QuantizedByteVectorValues - implements RandomAccessQuantizedByteVectorValues { +public abstract class OffHeapQuantizedByteVectorValues extends QuantizedByteVectorValues { protected final int dimension; protected final int size; @@ -141,11 +139,6 @@ public abstract class OffHeapQuantizedByteVectorValues extends QuantizedByteVect return binaryValue; } - @Override - public float getScoreCorrectionConstant() { - return scoreCorrectionConstant[0]; - } - @Override public float getScoreCorrectionConstant(int targetOrd) throws IOException { if (lastOrd == targetOrd) { @@ -213,8 +206,6 @@ public abstract class OffHeapQuantizedByteVectorValues extends QuantizedByteVect */ public static class DenseOffHeapVectorValues extends OffHeapQuantizedByteVectorValues { - private int doc = -1; - public DenseOffHeapVectorValues( int dimension, int size, @@ -226,30 +217,6 @@ public abstract class OffHeapQuantizedByteVectorValues extends QuantizedByteVect super(dimension, size, scalarQuantizer, similarityFunction, vectorsScorer, compress, slice); } - @Override - public byte[] vectorValue() throws IOException { - return vectorValue(doc); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(doc + 1); - } - - @Override - public int advance(int target) throws IOException { - assert docID() < target; - if (target >= size) { - return doc = NO_MORE_DOCS; - } - return doc = target; - } - @Override public DenseOffHeapVectorValues copy() throws IOException { return new DenseOffHeapVectorValues( @@ -270,20 +237,26 @@ public abstract class OffHeapQuantizedByteVectorValues extends QuantizedByteVect @Override public VectorScorer scorer(float[] target) throws IOException { DenseOffHeapVectorValues copy = copy(); + DocIndexIterator iterator = copy.iterator(); RandomVectorScorer vectorScorer = vectorsScorer.getRandomVectorScorer(similarityFunction, copy, target); return new VectorScorer() { @Override public float score() throws IOException { - return vectorScorer.score(copy.doc); + return vectorScorer.score(iterator.index()); } @Override public DocIdSetIterator iterator() { - return copy; + return iterator; } }; } + + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } } private static class SparseOffHeapVectorValues extends OffHeapQuantizedByteVectorValues { @@ -312,24 +285,8 @@ public abstract class OffHeapQuantizedByteVectorValues extends QuantizedByteVect } @Override - public byte[] vectorValue() throws IOException { - return vectorValue(disi.index()); - } - - @Override - public int docID() { - return disi.docID(); - } - - @Override - public int nextDoc() throws IOException { - return disi.nextDoc(); - } - - @Override - public int 
advance(int target) throws IOException { - assert docID() < target; - return disi.advance(target); + public DocIndexIterator iterator() { + return IndexedDISI.asDocIndexIterator(disi); } @Override @@ -372,17 +329,18 @@ public abstract class OffHeapQuantizedByteVectorValues extends QuantizedByteVect @Override public VectorScorer scorer(float[] target) throws IOException { SparseOffHeapVectorValues copy = copy(); + DocIndexIterator iterator = copy.iterator(); RandomVectorScorer vectorScorer = vectorsScorer.getRandomVectorScorer(similarityFunction, copy, target); return new VectorScorer() { @Override public float score() throws IOException { - return vectorScorer.score(copy.disi.index()); + return vectorScorer.score(iterator.index()); } @Override public DocIdSetIterator iterator() { - return copy; + return iterator; } }; } @@ -404,8 +362,6 @@ public abstract class OffHeapQuantizedByteVectorValues extends QuantizedByteVect null); } - private int doc = -1; - @Override public int dimension() { return super.dimension(); @@ -417,23 +373,8 @@ public abstract class OffHeapQuantizedByteVectorValues extends QuantizedByteVect } @Override - public byte[] vectorValue() { - throw new UnsupportedOperationException(); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - return advance(doc + 1); - } - - @Override - public int advance(int target) { - return doc = NO_MORE_DOCS; + public DocIndexIterator iterator() { + return createDenseIterator(); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldDocValuesFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldDocValuesFormat.java index 9350c016f67..2e45e232b5f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldDocValuesFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldDocValuesFormat.java @@ -38,7 +38,6 @@ import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.SortedSetDocValues; -import org.apache.lucene.internal.hppc.IntObjectHashMap; import org.apache.lucene.util.IOUtils; /** @@ -257,7 +256,7 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat { private static class FieldsReader extends DocValuesProducer { - private final IntObjectHashMap fields = new IntObjectHashMap<>(); + private final Map fields = new HashMap<>(); private final Map formats = new HashMap<>(); // clone for merge @@ -271,10 +270,10 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat { } // Then rebuild fields: - for (IntObjectHashMap.IntObjectCursor ent : other.fields) { - DocValuesProducer producer = oldToNew.get(ent.value); + for (Map.Entry ent : other.fields.entrySet()) { + DocValuesProducer producer = oldToNew.get(ent.getValue()); assert producer != null; - fields.put(ent.key, producer); + fields.put(ent.getKey(), producer); } } @@ -303,7 +302,7 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat { segmentSuffix, format.fieldsProducer(new SegmentReadState(readState, segmentSuffix))); } - fields.put(fi.number, formats.get(segmentSuffix)); + fields.put(fieldName, formats.get(segmentSuffix)); } } } @@ -317,37 +316,37 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat { @Override public NumericDocValues getNumeric(FieldInfo field) throws IOException { - DocValuesProducer producer = 
fields.get(field.number); + DocValuesProducer producer = fields.get(field.name); return producer == null ? null : producer.getNumeric(field); } @Override public BinaryDocValues getBinary(FieldInfo field) throws IOException { - DocValuesProducer producer = fields.get(field.number); + DocValuesProducer producer = fields.get(field.name); return producer == null ? null : producer.getBinary(field); } @Override public SortedDocValues getSorted(FieldInfo field) throws IOException { - DocValuesProducer producer = fields.get(field.number); + DocValuesProducer producer = fields.get(field.name); return producer == null ? null : producer.getSorted(field); } @Override public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { - DocValuesProducer producer = fields.get(field.number); + DocValuesProducer producer = fields.get(field.name); return producer == null ? null : producer.getSortedNumeric(field); } @Override public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { - DocValuesProducer producer = fields.get(field.number); + DocValuesProducer producer = fields.get(field.name); return producer == null ? null : producer.getSortedSet(field); } @Override public DocValuesSkipper getSkipper(FieldInfo field) throws IOException { - DocValuesProducer producer = fields.get(field.number); + DocValuesProducer producer = fields.get(field.name); return producer == null ? null : producer.getSkipper(field); } diff --git a/lucene/core/src/java/org/apache/lucene/index/ByteVectorValues.java b/lucene/core/src/java/org/apache/lucene/index/ByteVectorValues.java index d33ca1ca354..e9be3423c18 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ByteVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/ByteVectorValues.java @@ -17,8 +17,8 @@ package org.apache.lucene.index; import java.io.IOException; +import java.util.List; import org.apache.lucene.document.KnnByteVectorField; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.VectorScorer; /** @@ -27,34 +27,21 @@ import org.apache.lucene.search.VectorScorer; * * @lucene.experimental */ -public abstract class ByteVectorValues extends DocIdSetIterator { +public abstract class ByteVectorValues extends KnnVectorValues { /** Sole constructor */ protected ByteVectorValues() {} - /** Return the dimension of the vectors */ - public abstract int dimension(); - /** - * Return the number of vectors for this field. - * - * @return the number of vectors returned by this iterator - */ - public abstract int size(); - - @Override - public final long cost() { - return size(); - } - - /** - * Return the vector value for the current document ID. It is illegal to call this method when the - * iterator is not positioned: before advancing, or after failing to advance. The returned array - * may be shared across calls, re-used, and modified as the iterator advances. + * Return the vector value for the given vector ordinal which must be in [0, size() - 1], + * otherwise IndexOutOfBoundsException is thrown. The returned array may be shared across calls. * * @return the vector value */ - public abstract byte[] vectorValue() throws IOException; + public abstract byte[] vectorValue(int ord) throws IOException; + + @Override + public abstract ByteVectorValues copy() throws IOException; /** * Checks the Vector Encoding of a field @@ -78,12 +65,53 @@ public abstract class ByteVectorValues extends DocIdSetIterator { } /** - * Return a {@link VectorScorer} for the given query vector. 
The iterator for the scorer is not - the same instance as the iterator for this {@link ByteVectorValues}. It is a copy, and - iteration over the scorer will not affect the iteration of this {@link ByteVectorValues}. + * Return a {@link VectorScorer} for the given query vector. * * @param query the query vector * @return a {@link VectorScorer} instance or null */ - public abstract VectorScorer scorer(byte[] query) throws IOException; + public VectorScorer scorer(byte[] query) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public VectorEncoding getEncoding() { + return VectorEncoding.BYTE; + } + + /** + * Creates a {@link ByteVectorValues} from a list of byte arrays. + * + * @param vectors the list of byte arrays + * @param dim the dimension of the vectors + * @return a {@link ByteVectorValues} instance + */ + public static ByteVectorValues fromBytes(List vectors, int dim) { + return new ByteVectorValues() { + @Override + public int size() { + return vectors.size(); + } + + @Override + public int dimension() { + return dim; + } + + @Override + public byte[] vectorValue(int targetOrd) { + return vectors.get(targetOrd); + } + + @Override + public ByteVectorValues copy() { + return this; + } + + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } + }; + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index b8256ecf587..becb00cbb5b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -2760,16 +2760,16 @@ public final class CheckIndex implements Closeable { CheckIndex.Status.VectorValuesStatus status, CodecReader codecReader) throws IOException { - int docCount = 0; + int count = 0; int everyNdoc = Math.max(values.size() / 64, 1); - while (values.nextDoc() != NO_MORE_DOCS) { + while (count < values.size()) { // search the first maxNumSearches vectors to exercise the graph - if (values.docID() % everyNdoc == 0) { + if (values.ordToDoc(count) % everyNdoc == 0) { KnnCollector collector = new TopKnnCollector(10, Integer.MAX_VALUE); if (vectorsReaderSupportsSearch(codecReader, fieldInfo.name)) { codecReader .getVectorReader() - .search(fieldInfo.name, values.vectorValue(), collector, null); + .search(fieldInfo.name, values.vectorValue(count), collector, null); TopDocs docs = collector.topDocs(); if (docs.scoreDocs.length == 0) { throw new CheckIndexException( @@ -2777,7 +2777,7 @@ public final class CheckIndex implements Closeable { } } } - int valueLength = values.vectorValue().length; + int valueLength = values.vectorValue(count).length; if (valueLength != fieldInfo.getVectorDimension()) { throw new CheckIndexException( "Field \"" @@ -2787,19 +2787,19 @@ public final class CheckIndex implements Closeable { + " not matching the field's dimension=" + fieldInfo.getVectorDimension()); } - ++docCount; + ++count; } - if (docCount != values.size()) { + if (count != values.size()) { throw new CheckIndexException( "Field \"" + fieldInfo.name + "\" has size=" + values.size() + " but when iterated, returns " - + docCount + + count + " docs with values"); } - status.totalVectorValues += docCount; + status.totalVectorValues += count; } private static void checkByteVectorValues( @@ -2808,21 +2808,23 @@ public final class CheckIndex implements Closeable { CheckIndex.Status.VectorValuesStatus status, CodecReader codecReader) throws IOException { - int
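
The new fromBytes factory provides a dense on-heap view over a list of vectors: ordinals equal docids, and copy() can return this because nothing is buffered between calls. A small usage sketch with placeholder values:

  List<byte[]> vectors = List.of(new byte[] {1, 2}, new byte[] {3, 4});
  ByteVectorValues onHeap = ByteVectorValues.fromBytes(vectors, 2);
  KnnVectorValues.DocIndexIterator it = onHeap.iterator(); // dense: ord == docid
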
docCount = 0; + int count = 0; int everyNdoc = Math.max(values.size() / 64, 1); boolean supportsSearch = vectorsReaderSupportsSearch(codecReader, fieldInfo.name); - while (values.nextDoc() != NO_MORE_DOCS) { + while (count < values.size()) { // search the first maxNumSearches vectors to exercise the graph - if (supportsSearch && values.docID() % everyNdoc == 0) { + if (supportsSearch && values.ordToDoc(count) % everyNdoc == 0) { KnnCollector collector = new TopKnnCollector(10, Integer.MAX_VALUE); - codecReader.getVectorReader().search(fieldInfo.name, values.vectorValue(), collector, null); + codecReader + .getVectorReader() + .search(fieldInfo.name, values.vectorValue(count), collector, null); TopDocs docs = collector.topDocs(); if (docs.scoreDocs.length == 0) { throw new CheckIndexException( "Field \"" + fieldInfo.name + "\" failed to search k nearest neighbors"); } } - int valueLength = values.vectorValue().length; + int valueLength = values.vectorValue(count).length; if (valueLength != fieldInfo.getVectorDimension()) { throw new CheckIndexException( "Field \"" @@ -2832,19 +2834,19 @@ public final class CheckIndex implements Closeable { + " not matching the field's dimension=" + fieldInfo.getVectorDimension()); } - ++docCount; + ++count; } - if (docCount != values.size()) { + if (count != values.size()) { throw new CheckIndexException( "Field \"" + fieldInfo.name + "\" has size=" + values.size() + " but when iterated, returns " - + docCount + + count + " docs with values"); } - status.totalVectorValues += docCount; + status.totalVectorValues += count; } /** diff --git a/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java b/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java index ca2cb1a27d4..614a652cd35 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java @@ -429,37 +429,10 @@ public class ExitableDirectoryReader extends FilterDirectoryReader { } private class ExitableFloatVectorValues extends FloatVectorValues { - private int docToCheck; private final FloatVectorValues vectorValues; public ExitableFloatVectorValues(FloatVectorValues vectorValues) { this.vectorValues = vectorValues; - docToCheck = 0; - } - - @Override - public int advance(int target) throws IOException { - final int advance = vectorValues.advance(target); - if (advance >= docToCheck) { - checkAndThrow(); - docToCheck = advance + DOCS_BETWEEN_TIMEOUT_CHECK; - } - return advance; - } - - @Override - public int docID() { - return vectorValues.docID(); - } - - @Override - public int nextDoc() throws IOException { - final int nextDoc = vectorValues.nextDoc(); - if (nextDoc >= docToCheck) { - checkAndThrow(); - docToCheck = nextDoc + DOCS_BETWEEN_TIMEOUT_CHECK; - } - return nextDoc; } @Override @@ -468,8 +441,13 @@ public class ExitableDirectoryReader extends FilterDirectoryReader { } @Override - public float[] vectorValue() throws IOException { - return vectorValues.vectorValue(); + public float[] vectorValue(int ord) throws IOException { + return vectorValues.vectorValue(ord); + } + + @Override + public int ordToDoc(int ord) { + return vectorValues.ordToDoc(ord); } @Override @@ -477,61 +455,27 @@ public class ExitableDirectoryReader extends FilterDirectoryReader { return vectorValues.size(); } + @Override + public DocIndexIterator iterator() { + return createExitableIterator(vectorValues.iterator(), queryTimeout); + } + @Override public VectorScorer scorer(float[] 
target) throws IOException { return vectorValues.scorer(target); } - /** - * Throws {@link ExitingReaderException} if {@link QueryTimeout#shouldExit()} returns true, or - * if {@link Thread#interrupted()} returns true. - */ - private void checkAndThrow() { - if (queryTimeout.shouldExit()) { - throw new ExitingReaderException( - "The request took too long to iterate over vector values. Timeout: " - + queryTimeout.toString() - + ", FloatVectorValues=" - + in); - } else if (Thread.interrupted()) { - throw new ExitingReaderException( - "Interrupted while iterating over vector values. FloatVectorValues=" + in); - } + @Override + public FloatVectorValues copy() { + throw new UnsupportedOperationException(); } } private class ExitableByteVectorValues extends ByteVectorValues { - private int docToCheck; private final ByteVectorValues vectorValues; public ExitableByteVectorValues(ByteVectorValues vectorValues) { this.vectorValues = vectorValues; - docToCheck = 0; - } - - @Override - public int advance(int target) throws IOException { - final int advance = vectorValues.advance(target); - if (advance >= docToCheck) { - checkAndThrow(); - docToCheck = advance + DOCS_BETWEEN_TIMEOUT_CHECK; - } - return advance; - } - - @Override - public int docID() { - return vectorValues.docID(); - } - - @Override - public int nextDoc() throws IOException { - final int nextDoc = vectorValues.nextDoc(); - if (nextDoc >= docToCheck) { - checkAndThrow(); - docToCheck = nextDoc + DOCS_BETWEEN_TIMEOUT_CHECK; - } - return nextDoc; } @Override @@ -545,8 +489,18 @@ public class ExitableDirectoryReader extends FilterDirectoryReader { } @Override - public byte[] vectorValue() throws IOException { - return vectorValues.vectorValue(); + public byte[] vectorValue(int ord) throws IOException { + return vectorValues.vectorValue(ord); + } + + @Override + public int ordToDoc(int ord) { + return vectorValues.ordToDoc(ord); + } + + @Override + public DocIndexIterator iterator() { + return createExitableIterator(vectorValues.iterator(), queryTimeout); } @Override @@ -554,23 +508,66 @@ public class ExitableDirectoryReader extends FilterDirectoryReader { return vectorValues.scorer(target); } - /** - * Throws {@link ExitingReaderException} if {@link QueryTimeout#shouldExit()} returns true, or - * if {@link Thread#interrupted()} returns true. - */ + @Override + public ByteVectorValues copy() { + throw new UnsupportedOperationException(); + } + } + } + + private static KnnVectorValues.DocIndexIterator createExitableIterator( + KnnVectorValues.DocIndexIterator delegate, QueryTimeout queryTimeout) { + return new KnnVectorValues.DocIndexIterator() { + private int nextCheck; + + @Override + public int index() { + return delegate.index(); + } + + @Override + public int docID() { + return delegate.docID(); + } + + @Override + public int nextDoc() throws IOException { + int doc = delegate.nextDoc(); + if (doc >= nextCheck) { + checkAndThrow(); + nextCheck = doc + ExitableFilterAtomicReader.DOCS_BETWEEN_TIMEOUT_CHECK; + } + return doc; + } + + @Override + public long cost() { + return delegate.cost(); + } + + @Override + public int advance(int target) throws IOException { + int doc = delegate.advance(target); + if (doc >= nextCheck) { + checkAndThrow(); + nextCheck = doc + ExitableFilterAtomicReader.DOCS_BETWEEN_TIMEOUT_CHECK; + } + return doc; + } + private void checkAndThrow() { if (queryTimeout.shouldExit()) { throw new ExitingReaderException( - "The request took too long to iterate over vector values. 
Timeout: " + "The request took too long to iterate over knn vector values. Timeout: " + queryTimeout.toString() - + ", ByteVectorValues=" - + in); + + ", KnnVectorValues=" + + delegate); } else if (Thread.interrupted()) { throw new ExitingReaderException( - "Interrupted while iterating over vector values. ByteVectorValues=" + in); + "Interrupted while iterating over knn vector values. KnnVectorValues=" + delegate); } } - } + }; } /** Wrapper class for another PointValues implementation that is used by ExitableFields. */ @@ -683,7 +680,7 @@ public class ExitableDirectoryReader extends FilterDirectoryReader { if (queryTimeout.shouldExit()) { throw new ExitingReaderException( "The request took too long to intersect point values. Timeout: " - + queryTimeout.toString() + + queryTimeout + ", PointValues=" + pointValues); } else if (Thread.interrupted()) { @@ -815,7 +812,7 @@ public class ExitableDirectoryReader extends FilterDirectoryReader { /** Wrapper class for another Terms implementation that is used by ExitableFields. */ public static class ExitableTerms extends FilterTerms { - private QueryTimeout queryTimeout; + private final QueryTimeout queryTimeout; /** Constructor * */ public ExitableTerms(Terms terms, QueryTimeout queryTimeout) { diff --git a/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java b/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java index e5dbc620f5c..aa840fc3931 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java @@ -17,8 +17,8 @@ package org.apache.lucene.index; import java.io.IOException; +import java.util.List; import org.apache.lucene.document.KnnFloatVectorField; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.VectorScorer; /** @@ -27,34 +27,21 @@ import org.apache.lucene.search.VectorScorer; * * @lucene.experimental */ -public abstract class FloatVectorValues extends DocIdSetIterator { +public abstract class FloatVectorValues extends KnnVectorValues { /** Sole constructor */ protected FloatVectorValues() {} - /** Return the dimension of the vectors */ - public abstract int dimension(); - /** - * Return the number of vectors for this field. - * - * @return the number of vectors returned by this iterator - */ - public abstract int size(); - - @Override - public final long cost() { - return size(); - } - - /** - * Return the vector value for the current document ID. It is illegal to call this method when the - * iterator is not positioned: before advancing, or after failing to advance. The returned array - * may be shared across calls, re-used, and modified as the iterator advances. + * Return the vector value for the given vector ordinal which must be in [0, size() - 1], + * otherwise IndexOutOfBoundsException is thrown. The returned array may be shared across calls. * * @return the vector value */ - public abstract float[] vectorValue() throws IOException; + public abstract float[] vectorValue(int ord) throws IOException; + + @Override + public abstract FloatVectorValues copy() throws IOException; /** * Checks the Vector Encoding of a field @@ -79,12 +66,53 @@ public abstract class FloatVectorValues extends DocIdSetIterator { /** * Return a {@link VectorScorer} for the given query vector and the current {@link - * FloatVectorValues}. The iterator for the scorer is not the same instance as the iterator for - * this {@link FloatVectorValues}. 
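
With the timeout logic centralized in createExitableIterator, only iteration is throttled; ordinal reads delegate straight through to the wrapped values. The effect, sketched with exitableValues standing for either wrapper above:

  KnnVectorValues.DocIndexIterator it = exitableValues.iterator(); // wraps the delegate's iterator
  while (it.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
    // nextDoc()/advance() call checkAndThrow() every DOCS_BETWEEN_TIMEOUT_CHECK docs
    float[] v = exitableValues.vectorValue(it.index()); // no timeout check on this path
  }
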
It is a copy, and iteration over the scorer will not affect the - * iteration of this {@link FloatVectorValues}. + * FloatVectorValues}. * - * @param query the query vector + * @param target the query vector * @return a {@link VectorScorer} instance or null */ - public abstract VectorScorer scorer(float[] query) throws IOException; + public VectorScorer scorer(float[] target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public VectorEncoding getEncoding() { + return VectorEncoding.FLOAT32; + } + + /** + * Creates a {@link FloatVectorValues} from a list of float arrays. + * + * @param vectors the list of float arrays + * @param dim the dimension of the vectors + * @return a {@link FloatVectorValues} instance + */ + public static FloatVectorValues fromFloats(List vectors, int dim) { + return new FloatVectorValues() { + @Override + public int size() { + return vectors.size(); + } + + @Override + public int dimension() { + return dim; + } + + @Override + public float[] vectorValue(int targetOrd) { + return vectors.get(targetOrd); + } + + @Override + public FloatVectorValues copy() { + return this; + } + + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } + }; + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/Impacts.java b/lucene/core/src/java/org/apache/lucene/index/Impacts.java index 35e8cca5c70..e366b6f6b0b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/Impacts.java +++ b/lucene/core/src/java/org/apache/lucene/index/Impacts.java @@ -40,7 +40,8 @@ public abstract class Impacts { /** * Return impacts on the given level. These impacts are sorted by increasing frequency and * increasing unsigned norm, and only valid until the doc ID returned by {@link - * #getDocIdUpTo(int)} for the same level, included. The returned list is never empty. NOTE: There + * #getDocIdUpTo(int)} for the same level, included. The returned list is never empty and should + * implement {@link java.util.RandomAccess} if it contains more than a single element. NOTE: There * is no guarantee that these impacts actually appear in postings, only that they trigger scores * that are greater than or equal to the impacts that actually appear in postings. */ diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index def9ef06fce..346da8a907e 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -1255,8 +1255,7 @@ public class IndexWriter return reader.read(si.info.dir, si.info, segmentSuffix, IOContext.READONCE); } else if (si.info.getUseCompoundFile()) { // cfs - try (Directory cfs = - codec.compoundFormat().getCompoundReader(si.info.dir, si.info, IOContext.DEFAULT)) { + try (Directory cfs = codec.compoundFormat().getCompoundReader(si.info.dir, si.info)) { return reader.read(cfs, si.info, "", IOContext.READONCE); } } else { diff --git a/lucene/core/src/java/org/apache/lucene/index/KnnVectorValues.java b/lucene/core/src/java/org/apache/lucene/index/KnnVectorValues.java new file mode 100644 index 00000000000..8e58f387a33 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/index/KnnVectorValues.java @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.index; + +import java.io.IOException; +import org.apache.lucene.document.KnnByteVectorField; +import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.Bits; + +/** + * This class abstracts addressing of document vector values indexed as {@link KnnFloatVectorField} + * or {@link KnnByteVectorField}. + * + * @lucene.experimental + */ +public abstract class KnnVectorValues { + + /** Return the dimension of the vectors */ + public abstract int dimension(); + + /** + * Return the number of vectors for this field. + * + * @return the number of vectors returned by this iterator + */ + public abstract int size(); + + /** + * Return the docid of the document indexed with the given vector ordinal. This default + * implementation returns the argument and is appropriate for dense values implementations where + * every doc has a single value. + */ + public int ordToDoc(int ord) { + return ord; + } + + /** + * Creates a new copy of this {@link KnnVectorValues}. This is helpful when you need to access + * different values at once, to avoid overwriting the underlying vector returned. + */ + public abstract KnnVectorValues copy() throws IOException; + + /** Returns the vector byte length, defaults to dimension multiplied by float byte size */ + public int getVectorByteLength() { + return dimension() * getEncoding().byteSize; + } + + /** The vector encoding of these values. */ + public abstract VectorEncoding getEncoding(); + + /** Returns a Bits accepting docs accepted by the argument and having a vector value */ + public Bits getAcceptOrds(Bits acceptDocs) { + // FIXME: change default to return acceptDocs and provide this impl + // somewhere more specialized (in every non-dense impl). + if (acceptDocs == null) { + return null; + } + return new Bits() { + @Override + public boolean get(int index) { + return acceptDocs.get(ordToDoc(index)); + } + + @Override + public int length() { + return size(); + } + }; + } + + /** Create an iterator for this instance. */ + public DocIndexIterator iterator() { + throw new UnsupportedOperationException(); + } + + /** + * A DocIdSetIterator that also provides an index() method tracking a distinct ordinal for a + * vector associated with each doc. + */ + public abstract static class DocIndexIterator extends DocIdSetIterator { + + /** return the value index (aka "ordinal" or "ord") corresponding to the current doc */ + public abstract int index(); + } + + /** + * Creates an iterator for instances where every doc has a value, and the value ordinals are equal + * to the docids. 
+ */ + protected DocIndexIterator createDenseIterator() { + return new DocIndexIterator() { + + int doc = -1; + + @Override + public int docID() { + return doc; + } + + @Override + public int index() { + return doc; + } + + @Override + public int nextDoc() throws IOException { + if (doc >= size() - 1) { + return doc = NO_MORE_DOCS; + } else { + return ++doc; + } + } + + @Override + public int advance(int target) { + if (target >= size()) { + return doc = NO_MORE_DOCS; + } + return doc = target; + } + + @Override + public long cost() { + return size(); + } + }; + } + + /** + * Creates an iterator from a DocIdSetIterator indicating which docs have values, and for which + * ordinals increase monotonically with docid. + */ + protected static DocIndexIterator fromDISI(DocIdSetIterator docsWithField) { + return new DocIndexIterator() { + + int ord = -1; + + @Override + public int docID() { + return docsWithField.docID(); + } + + @Override + public int index() { + return ord; + } + + @Override + public int nextDoc() throws IOException { + if (docID() == NO_MORE_DOCS) { + return NO_MORE_DOCS; + } + ord++; + return docsWithField.nextDoc(); + } + + @Override + public int advance(int target) throws IOException { + return docsWithField.advance(target); + } + + @Override + public long cost() { + return docsWithField.cost(); + } + }; + } + + /** + * Creates an iterator from this instance's ordinal-to-docid mapping which must be monotonic + * (docid increases when ordinal does). + */ + protected DocIndexIterator createSparseIterator() { + return new DocIndexIterator() { + private int ord = -1; + + @Override + public int docID() { + if (ord == -1) { + return -1; + } + if (ord == NO_MORE_DOCS) { + return NO_MORE_DOCS; + } + return ordToDoc(ord); + } + + @Override + public int index() { + return ord; + } + + @Override + public int nextDoc() throws IOException { + if (ord >= size() - 1) { + ord = NO_MORE_DOCS; + } else { + ++ord; + } + return docID(); + } + + @Override + public int advance(int target) throws IOException { + return slowAdvance(target); + } + + @Override + public long cost() { + return size(); + } + }; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/index/LeafMetaData.java b/lucene/core/src/java/org/apache/lucene/index/LeafMetaData.java index 4d9f753e2e3..4595560eff8 100644 --- a/lucene/core/src/java/org/apache/lucene/index/LeafMetaData.java +++ b/lucene/core/src/java/org/apache/lucene/index/LeafMetaData.java @@ -33,9 +33,9 @@ import org.apache.lucene.util.Version; * are in no particular order. * @param hasBlocks Returns true iff this index contains blocks created with {@link * IndexWriter#addDocument(Iterable)} or it's corresponding update methods with at least 2 or - * more documents per call. Note: This property was not recorded before {@link - * Version#LUCENE_9_9_0} this method will return false for all leaves written before {@link - * Version#LUCENE_9_9_0} + * more documents per call. Note: This property was not recorded before {@link Version + * LUCENE_9_9_0} this method will return false for all leaves written before {@link Version + * LUCENE_9_9_0} * @see IndexWriter#updateDocuments(Term, Iterable) * @see IndexWriter#updateDocuments(Query, Iterable) * @see IndexWriter#softUpdateDocuments(Term, Iterable, Field...) 
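For illustration (not part of the patch): the KnnVectorValues refactoring above splits vector access into a DocIndexIterator that walks documents and reports each document's vector ordinal via index(), plus random access to vectors by ordinal via vectorValue(ord). A minimal sketch of the new consumer pattern, assuming this patch's API is on the classpath; the class name, method name, and the reader/field arguments are hypothetical:

import java.io.IOException;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.KnnVectorValues;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.search.DocIdSetIterator;

class VectorScanSketch {
  // Visits every document in the leaf that has a float vector for the given field.
  static void visitVectors(LeafReader reader, String field) throws IOException {
    FloatVectorValues values = reader.getFloatVectorValues(field);
    if (values == null) {
      return; // no float vectors indexed for this field in this leaf
    }
    KnnVectorValues.DocIndexIterator it = values.iterator();
    for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
      int ord = it.index(); // ordinal of the current document's vector, in [0, size())
      float[] vector = values.vectorValue(ord); // random access; the array may be reused across calls
      // ... consume doc and vector ...
    }
  }
}

Under the old API the vector was read at the iterator's current position (vectorValue() with no argument); with ordinals, callers such as SortingCodecReader can fetch values in any order without re-iterating, which is what the sorted and merged wrappers later in this patch rely on.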
diff --git a/lucene/core/src/java/org/apache/lucene/index/MergeState.java b/lucene/core/src/java/org/apache/lucene/index/MergeState.java index dcf9923feb3..838699215f0 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MergeState.java +++ b/lucene/core/src/java/org/apache/lucene/index/MergeState.java @@ -41,7 +41,7 @@ import org.apache.lucene.util.packed.PackedLongValues; * * @lucene.experimental */ -public class MergeState implements Cloneable { +public class MergeState { /** Maps document IDs from old segments to document IDs in the new segment */ public final DocMap[] docMaps; @@ -302,55 +302,4 @@ public class MergeState implements Cloneable { this.intraMergeTaskExecutor = intraMergeTaskExecutor; this.needsIndexSort = needsIndexSort; } - - @Override - public MergeState clone() { - StoredFieldsReader[] storedFieldsReaders = this.storedFieldsReaders.clone(); - TermVectorsReader[] termVectorsReaders = this.termVectorsReaders.clone(); - NormsProducer[] normsProducers = this.normsProducers.clone(); - DocValuesProducer[] docValuesProducers = this.docValuesProducers.clone(); - FieldsProducer[] fieldsProducers = this.fieldsProducers.clone(); - PointsReader[] pointsReaders = this.pointsReaders.clone(); - KnnVectorsReader[] knnVectorsReaders = this.knnVectorsReaders.clone(); - for (int i = 0; i < storedFieldsReaders.length; ++i) { - if (storedFieldsReaders[i] != null) { - storedFieldsReaders[i] = storedFieldsReaders[i].getMergeInstance(); - } - if (termVectorsReaders[i] != null) { - termVectorsReaders[i] = termVectorsReaders[i].getMergeInstance(); - } - if (normsProducers[i] != null) { - normsProducers[i] = normsProducers[i].getMergeInstance(); - } - if (docValuesProducers[i] != null) { - docValuesProducers[i] = docValuesProducers[i].getMergeInstance(); - } - if (fieldsProducers[i] != null) { - fieldsProducers[i] = fieldsProducers[i].getMergeInstance(); - } - if (pointsReaders[i] != null) { - pointsReaders[i] = pointsReaders[i].getMergeInstance(); - } - if (knnVectorsReaders[i] != null) { - knnVectorsReaders[i] = knnVectorsReaders[i].getMergeInstance(); - } - } - return new MergeState( - docMaps, - segmentInfo, - mergeFieldInfos, - storedFieldsReaders, - termVectorsReaders, - normsProducers, - docValuesProducers, - fieldInfos, - liveDocs, - fieldsProducers, - pointsReaders, - knnVectorsReaders, - maxDocs, - infoStream, - intraMergeTaskExecutor, - needsIndexSort); - } } diff --git a/lucene/core/src/java/org/apache/lucene/index/PendingSoftDeletes.java b/lucene/core/src/java/org/apache/lucene/index/PendingSoftDeletes.java index 557d31ad441..63c021660c7 100644 --- a/lucene/core/src/java/org/apache/lucene/index/PendingSoftDeletes.java +++ b/lucene/core/src/java/org/apache/lucene/index/PendingSoftDeletes.java @@ -76,15 +76,14 @@ final class PendingSoftDeletes extends PendingDeletes { hardDeletes.onNewReader(reader, info); // only re-calculate this if we haven't seen this generation if (dvGeneration < info.getDocValuesGen()) { - final DocIdSetIterator iterator = - FieldExistsQuery.getDocValuesDocIdSetIterator(field, reader); - int newDelCount; - if (iterator - != null) { // nothing is deleted we don't have a soft deletes field in this segment - assert info.info.maxDoc() > 0 : "maxDoc is 0"; + final int newDelCount; + var iterator = FieldExistsQuery.getDocValuesDocIdSetIterator(field, reader); + if (iterator != null && iterator.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + iterator = FieldExistsQuery.getDocValuesDocIdSetIterator(field, reader); newDelCount = applySoftDeletes(iterator, 
getMutableBits()); assert newDelCount >= 0 : " illegal pending delete count: " + newDelCount; } else { + // nothing is deleted we don't have a soft deletes field in this segment newDelCount = 0; } assert info.getSoftDelCount() == newDelCount @@ -227,12 +226,7 @@ final class PendingSoftDeletes extends PendingDeletes { // updates always outside of CFS Closeable toClose; if (segInfo.getUseCompoundFile()) { - toClose = - dir = - segInfo - .getCodec() - .compoundFormat() - .getCompoundReader(segInfo.dir, segInfo, IOContext.READONCE); + toClose = dir = segInfo.getCodec().compoundFormat().getCompoundReader(segInfo.dir, segInfo); } else { toClose = null; dir = segInfo.dir; diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java b/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java index a29f734ea2f..7da6d77136c 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java @@ -80,7 +80,7 @@ final class SegmentCoreReaders { try { if (si.info.getUseCompoundFile()) { - cfsDir = cfsReader = codec.compoundFormat().getCompoundReader(dir, si.info, context); + cfsDir = cfsReader = codec.compoundFormat().getCompoundReader(dir, si.info); } else { cfsReader = null; cfsDir = dir; diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentDocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/index/SegmentDocValuesProducer.java index 0f4df818ddc..1d9878fe0db 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentDocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentDocValuesProducer.java @@ -18,10 +18,11 @@ package org.apache.lucene.index; import java.io.IOException; import java.util.Collections; +import java.util.HashMap; import java.util.IdentityHashMap; +import java.util.Map; import java.util.Set; import org.apache.lucene.codecs.DocValuesProducer; -import org.apache.lucene.internal.hppc.IntObjectHashMap; import org.apache.lucene.internal.hppc.LongArrayList; import org.apache.lucene.store.Directory; @@ -31,7 +32,7 @@ import org.apache.lucene.store.Directory; // producer? 
class SegmentDocValuesProducer extends DocValuesProducer {
-  final IntObjectHashMap<DocValuesProducer> dvProducersByField = new IntObjectHashMap<>();
+  final Map<String, DocValuesProducer> dvProducersByField = new HashMap<>();
   final Set<DocValuesProducer> dvProducers = Collections.newSetFromMap(new IdentityHashMap<>());
   final LongArrayList dvGens = new LongArrayList();
@@ -66,7 +67,7 @@ class SegmentDocValuesProducer extends DocValuesProducer {
           dvGens.add(docValuesGen);
           dvProducers.add(baseProducer);
         }
-        dvProducersByField.put(fi.number, baseProducer);
+        dvProducersByField.put(fi.name, baseProducer);
       } else {
         assert !dvGens.contains(docValuesGen);
         // otherwise, producer sees only the one fieldinfo it wrote
@@ -75,7 +76,7 @@ class SegmentDocValuesProducer extends DocValuesProducer {
                 docValuesGen, si, dir, new FieldInfos(new FieldInfo[] {fi}));
         dvGens.add(docValuesGen);
         dvProducers.add(dvp);
-        dvProducersByField.put(fi.number, dvp);
+        dvProducersByField.put(fi.name, dvp);
       }
     }
   } catch (Throwable t) {
@@ -90,42 +91,42 @@ class SegmentDocValuesProducer extends DocValuesProducer {
   @Override
   public NumericDocValues getNumeric(FieldInfo field) throws IOException {
-    DocValuesProducer dvProducer = dvProducersByField.get(field.number);
+    DocValuesProducer dvProducer = dvProducersByField.get(field.name);
     assert dvProducer != null;
     return dvProducer.getNumeric(field);
   }

   @Override
   public BinaryDocValues getBinary(FieldInfo field) throws IOException {
-    DocValuesProducer dvProducer = dvProducersByField.get(field.number);
+    DocValuesProducer dvProducer = dvProducersByField.get(field.name);
     assert dvProducer != null;
     return dvProducer.getBinary(field);
   }

   @Override
   public SortedDocValues getSorted(FieldInfo field) throws IOException {
-    DocValuesProducer dvProducer = dvProducersByField.get(field.number);
+    DocValuesProducer dvProducer = dvProducersByField.get(field.name);
     assert dvProducer != null;
     return dvProducer.getSorted(field);
   }

   @Override
   public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
-    DocValuesProducer dvProducer = dvProducersByField.get(field.number);
+    DocValuesProducer dvProducer = dvProducersByField.get(field.name);
     assert dvProducer != null;
     return dvProducer.getSortedNumeric(field);
   }

   @Override
   public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
-    DocValuesProducer dvProducer = dvProducersByField.get(field.number);
+    DocValuesProducer dvProducer = dvProducersByField.get(field.name);
     assert dvProducer != null;
     return dvProducer.getSortedSet(field);
   }

   @Override
   public DocValuesSkipper getSkipper(FieldInfo field) throws IOException {
-    DocValuesProducer dvProducer = dvProducersByField.get(field.number);
+    DocValuesProducer dvProducer = dvProducersByField.get(field.name);
     assert dvProducer != null;
     return dvProducer.getSkipper(field);
   }
diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java
index b9f14b4e39c..5e336c7fef0 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java
@@ -17,9 +17,7 @@ package org.apache.lucene.index;

 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.List;
-import java.util.concurrent.Callable;
 import java.util.concurrent.Executor;
 import java.util.concurrent.TimeUnit;
 import org.apache.lucene.codecs.Codec;
@@ -31,7 +29,6 @@ import org.apache.lucene.codecs.NormsProducer;
 import org.apache.lucene.codecs.PointsWriter;
 import org.apache.lucene.codecs.StoredFieldsWriter;
 import
org.apache.lucene.codecs.TermVectorsWriter; -import org.apache.lucene.search.TaskExecutor; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.util.InfoStream; @@ -102,12 +99,7 @@ final class SegmentMerger { } private MergeState mergeState() { - MergeState mergeState = this.mergeState; - if (Thread.currentThread() != mergeStateCreationThread) { - // Most merges, e.g. small merges, run in the same thread, so save the cost of pulling a clone - // in that case. - mergeState = mergeState.clone(); - } + assert Thread.currentThread() == mergeStateCreationThread; return mergeState; } @@ -147,8 +139,6 @@ final class SegmentMerger { IOContext.DEFAULT, segmentWriteState.segmentSuffix); - TaskExecutor taskExecutor = new TaskExecutor(mergeState.intraMergeTaskExecutor); - List> mergingTasks = new ArrayList<>(); if (mergeState.mergeFieldInfos.hasNorms()) { mergeWithLogging(this::mergeNorms, segmentWriteState, segmentReadState, "norms", numMerged); } @@ -161,12 +151,7 @@ final class SegmentMerger { } if (mergeState.mergeFieldInfos.hasPointValues()) { - mergingTasks.add( - () -> { - mergeWithLogging( - this::mergePoints, segmentWriteState, segmentReadState, "points", numMerged); - return null; - }); + mergeWithLogging(this::mergePoints, segmentWriteState, segmentReadState, "points", numMerged); } if (mergeState.mergeFieldInfos.hasVectorValues()) { @@ -179,14 +164,9 @@ final class SegmentMerger { } if (mergeState.mergeFieldInfos.hasTermVectors()) { - mergingTasks.add( - () -> { - mergeWithLogging(this::mergeTermVectors, "term vectors"); - return null; - }); + mergeWithLogging(this::mergeTermVectors, "term vectors"); } - taskExecutor.invokeAll(mergingTasks); // write the merged infos mergeWithLogging( this::mergeFieldInfos, segmentWriteState, segmentReadState, "field infos", numMerged); diff --git a/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java index 148ead9cb2e..69d557d270a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java @@ -20,8 +20,10 @@ import java.io.IOException; import java.io.UncheckedIOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.HashMap; import java.util.Iterator; import java.util.List; +import java.util.Map; import java.util.Objects; import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.codecs.FieldsProducer; @@ -32,10 +34,7 @@ import org.apache.lucene.codecs.StoredFieldsReader; import org.apache.lucene.codecs.TermVectorsReader; import org.apache.lucene.index.MultiDocValues.MultiSortedDocValues; import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues; -import org.apache.lucene.internal.hppc.IntObjectHashMap; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.KnnCollector; -import org.apache.lucene.search.VectorScorer; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; @@ -302,38 +301,21 @@ final class SlowCompositeCodecReaderWrapper extends CodecReader { } } - private record DocValuesSub(T sub, int docStart, int docEnd) {} + private record DocValuesSub(T sub, int docStart, int ordStart) {} - private static class MergedDocIdSetIterator extends DocIdSetIterator { + private static class MergedDocIterator + extends 
KnnVectorValues.DocIndexIterator { final Iterator> it; - final long cost; DocValuesSub current; - int currentIndex = 0; + KnnVectorValues.DocIndexIterator currentIterator; + int ord = -1; int doc = -1; - MergedDocIdSetIterator(List> subs) { - long cost = 0; - for (DocValuesSub sub : subs) { - if (sub.sub != null) { - cost += sub.sub.cost(); - } - } - this.cost = cost; + MergedDocIterator(List> subs) { this.it = subs.iterator(); current = it.next(); - } - - private boolean advanceSub(int target) { - while (current.sub == null || current.docEnd <= target) { - if (it.hasNext() == false) { - doc = NO_MORE_DOCS; - return false; - } - current = it.next(); - currentIndex++; - } - return true; + currentIterator = currentIterator(); } @Override @@ -341,41 +323,47 @@ final class SlowCompositeCodecReaderWrapper extends CodecReader { return doc; } + @Override + public int index() { + return ord; + } + @Override public int nextDoc() throws IOException { while (true) { if (current.sub != null) { - int next = current.sub.nextDoc(); + int next = currentIterator.nextDoc(); if (next != NO_MORE_DOCS) { + ++ord; return doc = current.docStart + next; } } if (it.hasNext() == false) { + ord = NO_MORE_DOCS; return doc = NO_MORE_DOCS; } current = it.next(); - currentIndex++; + currentIterator = currentIterator(); + ord = current.ordStart - 1; } } - @Override - public int advance(int target) throws IOException { - while (true) { - if (advanceSub(target) == false) { - return DocIdSetIterator.NO_MORE_DOCS; - } - int next = current.sub.advance(target - current.docStart); - if (next == DocIdSetIterator.NO_MORE_DOCS) { - target = current.docEnd; - } else { - return doc = current.docStart + next; - } + private KnnVectorValues.DocIndexIterator currentIterator() { + if (current.sub != null) { + return current.sub.iterator(); + } else { + return null; } } @Override public long cost() { - return cost; + throw new UnsupportedOperationException(); + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); } } @@ -389,7 +377,7 @@ final class SlowCompositeCodecReaderWrapper extends CodecReader { private final CodecReader[] codecReaders; private final DocValuesProducer[] producers; private final int[] docStarts; - private final IntObjectHashMap cachedOrdMaps = new IntObjectHashMap<>(); + private final Map cachedOrdMaps = new HashMap<>(); SlowCompositeDocValuesProducerWrapper(CodecReader[] codecReaders, int[] docStarts) { this.codecReaders = codecReaders; @@ -428,14 +416,14 @@ final class SlowCompositeCodecReaderWrapper extends CodecReader { public SortedDocValues getSorted(FieldInfo field) throws IOException { OrdinalMap map = null; synchronized (cachedOrdMaps) { - map = cachedOrdMaps.get(field.number); + map = cachedOrdMaps.get(field.name); if (map == null) { // uncached, or not a multi dv SortedDocValues dv = MultiDocValues.getSortedValues(new MultiReader(codecReaders), field.name); if (dv instanceof MultiSortedDocValues) { map = ((MultiSortedDocValues) dv).mapping; - cachedOrdMaps.put(field.number, map); + cachedOrdMaps.put(field.name, map); } return dv; } @@ -464,14 +452,14 @@ final class SlowCompositeCodecReaderWrapper extends CodecReader { public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { OrdinalMap map = null; synchronized (cachedOrdMaps) { - map = cachedOrdMaps.get(field.number); + map = cachedOrdMaps.get(field.name); if (map == null) { // uncached, or not a multi dv SortedSetDocValues dv = MultiDocValues.getSortedSetValues(new 
MultiReader(codecReaders), field.name); if (dv instanceof MultiSortedSetDocValues) { map = ((MultiSortedSetDocValues) dv).mapping; - cachedOrdMaps.put(field.number, map); + cachedOrdMaps.put(field.name, map); } return dv; } @@ -847,55 +835,75 @@ final class SlowCompositeCodecReaderWrapper extends CodecReader { int size = 0; for (CodecReader reader : codecReaders) { FloatVectorValues values = reader.getFloatVectorValues(field); + subs.add(new DocValuesSub<>(values, docStarts[i], size)); if (values != null) { if (dimension == -1) { dimension = values.dimension(); } size += values.size(); } - subs.add(new DocValuesSub<>(values, docStarts[i], docStarts[i + 1])); i++; } - final int finalDimension = dimension; - final int finalSize = size; - MergedDocIdSetIterator mergedIterator = new MergedDocIdSetIterator<>(subs); - return new FloatVectorValues() { + return new MergedFloatVectorValues(dimension, size, subs); + } - @Override - public int dimension() { - return finalDimension; - } + class MergedFloatVectorValues extends FloatVectorValues { + final int dimension; + final int size; + final DocValuesSub[] subs; + final MergedDocIterator iter; + final int[] starts; + int lastSubIndex; - @Override - public int size() { - return finalSize; + MergedFloatVectorValues(int dimension, int size, List> subs) { + this.dimension = dimension; + this.size = size; + this.subs = subs.toArray(new DocValuesSub[0]); + iter = new MergedDocIterator<>(subs); + // [0, start(1), ..., size] - we want the extra element + // to avoid checking for out-of-array bounds + starts = new int[subs.size() + 1]; + for (int i = 0; i < subs.size(); i++) { + starts[i] = subs.get(i).ordStart; } + starts[starts.length - 1] = size; + } - @Override - public float[] vectorValue() throws IOException { - return mergedIterator.current.sub.vectorValue(); - } + @Override + public MergedDocIterator iterator() { + return iter; + } - @Override - public int docID() { - return mergedIterator.docID(); - } + @Override + public int dimension() { + return dimension; + } - @Override - public int nextDoc() throws IOException { - return mergedIterator.nextDoc(); - } + @Override + public int size() { + return size; + } - @Override - public int advance(int target) throws IOException { - return mergedIterator.advance(target); + @SuppressWarnings("unchecked") + @Override + public FloatVectorValues copy() throws IOException { + List> subsCopy = new ArrayList<>(); + for (Object sub : subs) { + subsCopy.add((DocValuesSub) sub); } + return new MergedFloatVectorValues(dimension, size, subsCopy); + } - @Override - public VectorScorer scorer(float[] target) { - throw new UnsupportedOperationException(); - } - }; + @Override + public float[] vectorValue(int ord) throws IOException { + assert ord >= 0 && ord < size; + // We need to implement fully random-access API here in order to support callers like + // SortingCodecReader that rely on it. 
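+      // We maintain lastSubIndex since we expect some repetition: consecutive lookups usually land in the same sub, so the cached index typically avoids a fresh binary search.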
+ lastSubIndex = findSub(ord, lastSubIndex, starts); + assert subs[lastSubIndex].sub != null; + return ((FloatVectorValues) subs[lastSubIndex].sub) + .vectorValue(ord - subs[lastSubIndex].ordStart); + } } @Override @@ -906,55 +914,101 @@ final class SlowCompositeCodecReaderWrapper extends CodecReader { int size = 0; for (CodecReader reader : codecReaders) { ByteVectorValues values = reader.getByteVectorValues(field); + subs.add(new DocValuesSub<>(values, docStarts[i], size)); if (values != null) { if (dimension == -1) { dimension = values.dimension(); } size += values.size(); } - subs.add(new DocValuesSub<>(values, docStarts[i], docStarts[i + 1])); i++; } - final int finalDimension = dimension; - final int finalSize = size; - MergedDocIdSetIterator mergedIterator = new MergedDocIdSetIterator<>(subs); - return new ByteVectorValues() { + return new MergedByteVectorValues(dimension, size, subs); + } - @Override - public int dimension() { - return finalDimension; - } + class MergedByteVectorValues extends ByteVectorValues { + final int dimension; + final int size; + final DocValuesSub[] subs; + final MergedDocIterator iter; + final int[] starts; + int lastSubIndex; - @Override - public int size() { - return finalSize; + MergedByteVectorValues(int dimension, int size, List> subs) { + this.dimension = dimension; + this.size = size; + this.subs = subs.toArray(new DocValuesSub[0]); + iter = new MergedDocIterator<>(subs); + // [0, start(1), ..., size] - we want the extra element + // to avoid checking for out-of-array bounds + starts = new int[subs.size() + 1]; + for (int i = 0; i < subs.size(); i++) { + starts[i] = subs.get(i).ordStart; } + starts[starts.length - 1] = size; + } - @Override - public byte[] vectorValue() throws IOException { - return mergedIterator.current.sub.vectorValue(); - } + @Override + public MergedDocIterator iterator() { + return iter; + } - @Override - public int docID() { - return mergedIterator.docID(); - } + @Override + public int dimension() { + return dimension; + } - @Override - public int nextDoc() throws IOException { - return mergedIterator.nextDoc(); - } + @Override + public int size() { + return size; + } - @Override - public int advance(int target) throws IOException { - return mergedIterator.advance(target); - } + @Override + public byte[] vectorValue(int ord) throws IOException { + assert ord >= 0 && ord < size; + // We need to implement fully random-access API here in order to support callers like + // SortingCodecReader that rely on it. We maintain lastSubIndex since we expect some + // repetition. 
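+      // findSub first probes the cached range around lastSubIndex and only falls back to a binary search over starts on a miss.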
+ lastSubIndex = findSub(ord, lastSubIndex, starts); + return ((ByteVectorValues) subs[lastSubIndex].sub) + .vectorValue(ord - subs[lastSubIndex].ordStart); + } - @Override - public VectorScorer scorer(byte[] target) { - throw new UnsupportedOperationException(); + @SuppressWarnings("unchecked") + @Override + public ByteVectorValues copy() throws IOException { + List> newSubs = new ArrayList<>(); + for (Object sub : subs) { + newSubs.add((DocValuesSub) sub); } - }; + return new MergedByteVectorValues(dimension, size, newSubs); + } + } + + private static int findSub(int ord, int lastSubIndex, int[] starts) { + if (ord >= starts[lastSubIndex]) { + if (ord >= starts[lastSubIndex + 1]) { + return binarySearchStarts(starts, ord, lastSubIndex + 1, starts.length); + } + } else { + return binarySearchStarts(starts, ord, 0, lastSubIndex); + } + return lastSubIndex; + } + + private static int binarySearchStarts(int[] starts, int ord, int from, int to) { + int pos = Arrays.binarySearch(starts, from, to, ord); + if (pos < 0) { + // subtract one since binarySearch returns an *insertion point* + return -2 - pos; + } else { + while (pos < starts.length - 1 && starts[pos + 1] == ord) { + // Arrays.binarySearch can return any of a sequence of repeated value + // but we always want the last one + ++pos; + } + return pos; + } } @Override diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java index fee0fc2f730..daec0c197d6 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java @@ -25,6 +25,7 @@ import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Objects; +import java.util.function.Supplier; import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.KnnVectorsReader; @@ -32,10 +33,11 @@ import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.PointsReader; import org.apache.lucene.codecs.StoredFieldsReader; import org.apache.lucene.codecs.TermVectorsReader; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; -import org.apache.lucene.search.VectorScorer; +import org.apache.lucene.util.BitSetIterator; import org.apache.lucene.util.Bits; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOSupplier; @@ -206,121 +208,175 @@ public final class SortingCodecReader extends FilterCodecReader { } } - /** Sorting FloatVectorValues that iterate over documents in the order of the provided sortMap */ - private static class SortingFloatVectorValues extends FloatVectorValues { - final int size; - final int dimension; - final FixedBitSet docsWithField; - final float[][] vectors; + /** + * Factory for SortingValuesIterator. This enables us to create new iterators as needed without + * recomputing the sorting mappings. 
+ */ + static class SortingIteratorSupplier implements Supplier { + private final FixedBitSet docBits; + private final int[] docToOrd; + private final int size; - private int docId = -1; + SortingIteratorSupplier(FixedBitSet docBits, int[] docToOrd, int size) { + this.docBits = docBits; + this.docToOrd = docToOrd; + this.size = size; + } - SortingFloatVectorValues(FloatVectorValues delegate, Sorter.DocMap sortMap) throws IOException { - this.size = delegate.size(); - this.dimension = delegate.dimension(); - docsWithField = new FixedBitSet(sortMap.size()); - vectors = new float[sortMap.size()][]; - for (int doc = delegate.nextDoc(); doc != NO_MORE_DOCS; doc = delegate.nextDoc()) { - int newDocID = sortMap.oldToNew(doc); - docsWithField.set(newDocID); - vectors[newDocID] = delegate.vectorValue().clone(); + @Override + public SortingValuesIterator get() { + return new SortingValuesIterator(docBits, docToOrd, size); + } + + public int size() { + return size; + } + } + + /** + * Creates a factory for SortingValuesIterator. Does the work of computing the (new docId to old + * ordinal) mapping, and caches the result, enabling it to create new iterators cheaply. + * + * @param values the values over which to iterate + * @param docMap the mapping from "old" docIds to "new" (sorted) docIds. + */ + public static SortingIteratorSupplier iteratorSupplier( + KnnVectorValues values, Sorter.DocMap docMap) throws IOException { + + final int[] docToOrd = new int[docMap.size()]; + final FixedBitSet docBits = new FixedBitSet(docMap.size()); + int count = 0; + // Note: docToOrd will contain zero for docids that have no vector. This is OK though + // because the iterator cannot be positioned on such docs + KnnVectorValues.DocIndexIterator iter = values.iterator(); + for (int doc = iter.nextDoc(); doc != NO_MORE_DOCS; doc = iter.nextDoc()) { + int newDocId = docMap.oldToNew(doc); + if (newDocId != -1) { + docToOrd[newDocId] = iter.index(); + docBits.set(newDocId); + ++count; } } + return new SortingIteratorSupplier(docBits, docToOrd, count); + } + + /** + * Iterator over KnnVectorValues accepting a mapping to differently-sorted docs. Consequently + * index() may skip around, not increasing monotonically as iteration proceeds. 
+ */ + public static class SortingValuesIterator extends KnnVectorValues.DocIndexIterator { + private final FixedBitSet docBits; + private final DocIdSetIterator docsWithValues; + private final int[] docToOrd; + + int doc = -1; + + SortingValuesIterator(FixedBitSet docBits, int[] docToOrd, int size) { + this.docBits = docBits; + this.docToOrd = docToOrd; + docsWithValues = new BitSetIterator(docBits, size); + } @Override public int docID() { - return docId; + return doc; + } + + @Override + public int index() { + assert docBits.get(doc); + return docToOrd[doc]; } @Override public int nextDoc() throws IOException { - return advance(docId + 1); - } - - @Override - public float[] vectorValue() throws IOException { - return vectors[docId]; - } - - @Override - public int dimension() { - return dimension; - } - - @Override - public int size() { - return size; - } - - @Override - public int advance(int target) throws IOException { - if (target >= docsWithField.length()) { - return NO_MORE_DOCS; + if (doc != NO_MORE_DOCS) { + doc = docsWithValues.nextDoc(); } - return docId = docsWithField.nextSetBit(target); + return doc; } @Override - public VectorScorer scorer(float[] target) { + public long cost() { + return docBits.cardinality(); + } + + @Override + public int advance(int target) { throw new UnsupportedOperationException(); } } - private static class SortingByteVectorValues extends ByteVectorValues { - final int size; - final int dimension; - final FixedBitSet docsWithField; - final byte[][] vectors; + /** Sorting FloatVectorValues that maps ordinals using the provided sortMap */ + private static class SortingFloatVectorValues extends FloatVectorValues { + final FloatVectorValues delegate; + final SortingIteratorSupplier iteratorSupplier; - private int docId = -1; - - SortingByteVectorValues(ByteVectorValues delegate, Sorter.DocMap sortMap) throws IOException { - this.size = delegate.size(); - this.dimension = delegate.dimension(); - docsWithField = new FixedBitSet(sortMap.size()); - vectors = new byte[sortMap.size()][]; - for (int doc = delegate.nextDoc(); doc != NO_MORE_DOCS; doc = delegate.nextDoc()) { - int newDocID = sortMap.oldToNew(doc); - docsWithField.set(newDocID); - vectors[newDocID] = delegate.vectorValue().clone(); - } + SortingFloatVectorValues(FloatVectorValues delegate, Sorter.DocMap sortMap) throws IOException { + this.delegate = delegate; + // SortingValuesIterator consumes the iterator and records the docs and ord mapping + iteratorSupplier = iteratorSupplier(delegate, sortMap); } @Override - public int docID() { - return docId; - } - - @Override - public int nextDoc() throws IOException { - return advance(docId + 1); - } - - @Override - public byte[] vectorValue() throws IOException { - return vectors[docId]; + public float[] vectorValue(int ord) throws IOException { + // ords are interpreted in the delegate's ord-space. 
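+      // The sorted iterator's index() already returns delegate ordinals (computed into docToOrd by iteratorSupplier), so no remapping is needed here.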
+ return delegate.vectorValue(ord); } @Override public int dimension() { - return dimension; + return delegate.dimension(); } @Override public int size() { - return size; + return iteratorSupplier.size(); } @Override - public int advance(int target) throws IOException { - if (target >= docsWithField.length()) { - return NO_MORE_DOCS; - } - return docId = docsWithField.nextSetBit(target); + public FloatVectorValues copy() { + throw new UnsupportedOperationException(); } @Override - public VectorScorer scorer(byte[] target) { + public DocIndexIterator iterator() { + return iteratorSupplier.get(); + } + } + + private static class SortingByteVectorValues extends ByteVectorValues { + final ByteVectorValues delegate; + final SortingIteratorSupplier iteratorSupplier; + + SortingByteVectorValues(ByteVectorValues delegate, Sorter.DocMap sortMap) throws IOException { + this.delegate = delegate; + // SortingValuesIterator consumes the iterator and records the docs and ord mapping + iteratorSupplier = iteratorSupplier(delegate, sortMap); + } + + @Override + public byte[] vectorValue(int ord) throws IOException { + return delegate.vectorValue(ord); + } + + @Override + public DocIndexIterator iterator() { + return iteratorSupplier.get(); + } + + @Override + public int dimension() { + return delegate.dimension(); + } + + @Override + public int size() { + return iteratorSupplier.size(); + } + + @Override + public ByteVectorValues copy() { throw new UnsupportedOperationException(); } } diff --git a/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java index 702df660c44..2fb0c0783a2 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java +++ b/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java @@ -438,7 +438,10 @@ public class TieredMergePolicy extends MergePolicy { } // allowedSegCount may occasionally be less than segsPerTier // if segment sizes are below the floor size - allowedSegCount = Math.max(allowedSegCount, Math.max(segsPerTier, targetSearchConcurrency)); + allowedSegCount = Math.max(allowedSegCount, segsPerTier); + // No need to merge if the total number of segments (including too big segments) is less than or + // equal to the target search concurrency. + allowedSegCount = Math.max(allowedSegCount, targetSearchConcurrency - tooBigCount); int allowedDocCount = getMaxAllowedDocs(totalMaxDoc, totalDelDocs); if (verbose(mergeContext) && tooBigCount > 0) { diff --git a/lucene/core/src/java/org/apache/lucene/internal/hppc/CharObjectHashMap.java b/lucene/core/src/java/org/apache/lucene/internal/hppc/CharObjectHashMap.java index b1fad7017b5..40b32141f3f 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/hppc/CharObjectHashMap.java +++ b/lucene/core/src/java/org/apache/lucene/internal/hppc/CharObjectHashMap.java @@ -574,15 +574,6 @@ public class CharObjectHashMap public int size() { return CharObjectHashMap.this.size(); } - - public VType[] toArray() { - VType[] array = (VType[]) new Object[size()]; - int i = 0; - for (ObjectCursor cursor : this) { - array[i++] = cursor.value; - } - return array; - } } /** An iterator over the set of assigned values. 
*/ diff --git a/lucene/core/src/java/org/apache/lucene/internal/hppc/IntObjectHashMap.java b/lucene/core/src/java/org/apache/lucene/internal/hppc/IntObjectHashMap.java index 180bb3249f3..732b0ecb71c 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/hppc/IntObjectHashMap.java +++ b/lucene/core/src/java/org/apache/lucene/internal/hppc/IntObjectHashMap.java @@ -562,15 +562,6 @@ public class IntObjectHashMap public int size() { return IntObjectHashMap.this.size(); } - - public VType[] toArray() { - VType[] array = (VType[]) new Object[size()]; - int i = 0; - for (ObjectCursor cursor : this) { - array[i++] = cursor.value; - } - return array; - } } /** An iterator over the set of assigned values. */ diff --git a/lucene/core/src/java/org/apache/lucene/internal/hppc/LongObjectHashMap.java b/lucene/core/src/java/org/apache/lucene/internal/hppc/LongObjectHashMap.java index 4bc890b80b1..5f34625f675 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/hppc/LongObjectHashMap.java +++ b/lucene/core/src/java/org/apache/lucene/internal/hppc/LongObjectHashMap.java @@ -562,15 +562,6 @@ public class LongObjectHashMap public int size() { return LongObjectHashMap.this.size(); } - - public VType[] toArray() { - VType[] array = (VType[]) new Object[size()]; - int i = 0; - for (ObjectCursor cursor : this) { - array[i++] = cursor.value; - } - return array; - } } /** An iterator over the set of assigned values. */ diff --git a/lucene/core/src/java/org/apache/lucene/search/FieldExistsQuery.java b/lucene/core/src/java/org/apache/lucene/search/FieldExistsQuery.java index 409bcbc0b64..adaace27727 100644 --- a/lucene/core/src/java/org/apache/lucene/search/FieldExistsQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/FieldExistsQuery.java @@ -181,8 +181,8 @@ public class FieldExistsQuery extends Query { } else if (fieldInfo.getVectorDimension() != 0) { // the field indexes vectors iterator = switch (fieldInfo.getVectorEncoding()) { - case FLOAT32 -> context.reader().getFloatVectorValues(field); - case BYTE -> context.reader().getByteVectorValues(field); + case FLOAT32 -> context.reader().getFloatVectorValues(field).iterator(); + case BYTE -> context.reader().getByteVectorValues(field).iterator(); }; } else if (fieldInfo.getDocValuesType() != DocValuesType.NONE) { // the field indexes doc values diff --git a/lucene/core/src/java/org/apache/lucene/search/HitsThresholdChecker.java b/lucene/core/src/java/org/apache/lucene/search/HitsThresholdChecker.java index 43ff4fecdbb..78e1a589a77 100644 --- a/lucene/core/src/java/org/apache/lucene/search/HitsThresholdChecker.java +++ b/lucene/core/src/java/org/apache/lucene/search/HitsThresholdChecker.java @@ -46,7 +46,11 @@ abstract class HitsThresholdChecker { if (thresholdReached) { return true; } - return thresholdReached = globalHitCount.longValue() > getHitsThreshold(); + if (globalHitCount.longValue() > getHitsThreshold()) { + thresholdReached = true; + return true; + } + return false; } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/MaxScoreAccumulator.java b/lucene/core/src/java/org/apache/lucene/search/MaxScoreAccumulator.java index 5940a80a961..eac33dbf039 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MaxScoreAccumulator.java +++ b/lucene/core/src/java/org/apache/lucene/search/MaxScoreAccumulator.java @@ -35,8 +35,8 @@ final class MaxScoreAccumulator { } /** - * Return the max encoded DocAndScore in a way that is consistent with {@link - * DocAndScore#compareTo}. 
+ * Return the max encoded docId and score found in the two longs, following the encoding in {@link + * #accumulate}. */ private static long maxEncode(long v1, long v2) { float score1 = Float.intBitsToFloat((int) (v1 >> 32)); @@ -57,26 +57,15 @@ final class MaxScoreAccumulator { acc.accumulate(encode); } - DocAndScore get() { - long value = acc.get(); - if (value == Long.MIN_VALUE) { - return null; - } - float score = Float.intBitsToFloat((int) (value >> 32)); - int docId = (int) value; - return new DocAndScore(docId, score); + public static float toScore(long value) { + return Float.intBitsToFloat((int) (value >> 32)); } - record DocAndScore(int docId, float score) implements Comparable { + public static int docId(long value) { + return (int) value; + } - @Override - public int compareTo(DocAndScore o) { - int cmp = Float.compare(score, o.score); - if (cmp == 0) { - // tie-break on doc id, lower id has the priority - return Integer.compare(o.docId, docId); - } - return cmp; - } + long getRaw() { + return acc.get(); } } diff --git a/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java b/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java index 8786343ccec..18f5b83e93a 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java @@ -40,6 +40,8 @@ final class MaxScoreBulkScorer extends BulkScorer { // Index of the first scorer that is required, this scorer and all following scorers are required // for a document to match. int firstRequiredScorer; + // The minimum value of minCompetitiveScore that would produce a more favorable partitioning. + float nextMinCompetitiveScore; private final long cost; float minCompetitiveScore; private final Score scorable = new Score(); @@ -114,9 +116,14 @@ final class MaxScoreBulkScorer extends BulkScorer { while (top.doc < outerWindowMax) { scoreInnerWindow(collector, acceptDocs, outerWindowMax); top = essentialQueue.top(); + if (minCompetitiveScore >= nextMinCompetitiveScore) { + // The minimum competitive score increased substantially, so we can now partition scorers + // in a more favorable way. 
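+          // Exiting the inner loop early lets the next window be set up with scorers re-partitioned against the raised minCompetitiveScore.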
+ break; + } } - outerWindowMin = outerWindowMax; + outerWindowMin = Math.min(top.doc, outerWindowMax); } return nextCandidate(max); @@ -337,6 +344,7 @@ final class MaxScoreBulkScorer extends BulkScorer { }); double maxScoreSum = 0; firstEssentialScorer = 0; + nextMinCompetitiveScore = Float.POSITIVE_INFINITY; for (int i = 0; i < allScorers.length; ++i) { final DisiWrapper w = scratch[i]; double newMaxScoreSum = maxScoreSum + w.maxWindowScore; @@ -349,6 +357,7 @@ final class MaxScoreBulkScorer extends BulkScorer { firstEssentialScorer++; } else { allScorers[allScorers.length - 1 - (i - firstEssentialScorer)] = w; + nextMinCompetitiveScore = Math.min(maxScoreSumFloat, nextMinCompetitiveScore); } } diff --git a/lucene/core/src/java/org/apache/lucene/search/MaxScoreCache.java b/lucene/core/src/java/org/apache/lucene/search/MaxScoreCache.java index 2d38370e86a..a449f675daa 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MaxScoreCache.java +++ b/lucene/core/src/java/org/apache/lucene/search/MaxScoreCache.java @@ -71,7 +71,9 @@ public final class MaxScoreCache { private float computeMaxScore(List impacts) { float maxScore = 0; - for (Impact impact : impacts) { + var scorer = this.scorer; + for (int i = 0, length = impacts.size(); i < length; i++) { + Impact impact = impacts.get(i); maxScore = Math.max(scorer.score(impact.freq, impact.norm), maxScore); } return maxScore; diff --git a/lucene/core/src/java/org/apache/lucene/search/PointInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/PointInSetQuery.java index c2efa68a45b..f0e0cfd6bdb 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PointInSetQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PointInSetQuery.java @@ -181,7 +181,7 @@ public abstract class PointInSetQuery extends Query implements Accountable { @Override public Scorer get(long leadCost) throws IOException { DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field); - values.intersect(new MergePointVisitor(sortedPackedPoints, result)); + values.intersect(new MergePointVisitor(sortedPackedPoints.iterator(), result)); DocIdSetIterator iterator = result.build().iterator(); return new ConstantScoreScorer(score(), scoreMode, iterator); } @@ -192,7 +192,9 @@ public abstract class PointInSetQuery extends Query implements Accountable { if (cost == -1) { // Computing the cost may be expensive, so only do it if necessary DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field); - cost = values.estimateDocCount(new MergePointVisitor(sortedPackedPoints, result)); + cost = + values.estimateDocCount( + new MergePointVisitor(sortedPackedPoints.iterator(), result)); assert cost >= 0; } return cost; @@ -260,18 +262,15 @@ public abstract class PointInSetQuery extends Query implements Accountable { private class MergePointVisitor implements IntersectVisitor { private final DocIdSetBuilder result; - private TermIterator iterator; + private final TermIterator iterator; private BytesRef nextQueryPoint; private final ByteArrayComparator comparator; - private final PrefixCodedTerms sortedPackedPoints; private DocIdSetBuilder.BulkAdder adder; - public MergePointVisitor(PrefixCodedTerms sortedPackedPoints, DocIdSetBuilder result) - throws IOException { + public MergePointVisitor(TermIterator iterator, DocIdSetBuilder result) throws IOException { this.result = result; - this.sortedPackedPoints = sortedPackedPoints; this.comparator = ArrayUtil.getUnsignedComparator(bytesPerDim); - this.iterator = 
this.sortedPackedPoints.iterator(); + this.iterator = iterator; nextQueryPoint = iterator.next(); } diff --git a/lucene/core/src/java/org/apache/lucene/search/TaskExecutor.java b/lucene/core/src/java/org/apache/lucene/search/TaskExecutor.java index 331d692a854..6c89c267a52 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TaskExecutor.java +++ b/lucene/core/src/java/org/apache/lucene/search/TaskExecutor.java @@ -20,7 +20,6 @@ package org.apache.lucene.search; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; -import java.util.Collections; import java.util.List; import java.util.Objects; import java.util.concurrent.Callable; @@ -73,15 +72,68 @@ public final class TaskExecutor { /** * Execute all the callables provided as an argument, wait for them to complete and return the * obtained results. If an exception is thrown by more than one callable, the subsequent ones will - * be added as suppressed exceptions to the first one that was caught. + * be added as suppressed exceptions to the first one that was caught. Additionally, if one task + * throws an exception, all other tasks from the same group are cancelled, to avoid needless + * computation as their results would not be exposed anyways. * * @param callables the callables to execute * @return a list containing the results from the tasks execution * @param the return type of the task execution */ public List invokeAll(Collection> callables) throws IOException { - TaskGroup taskGroup = new TaskGroup<>(callables); - return taskGroup.invokeAll(executor); + List> futures = new ArrayList<>(callables.size()); + for (Callable callable : callables) { + futures.add(new Task<>(callable, futures)); + } + final int count = futures.size(); + // taskId provides the first index of an un-executed task in #futures + final AtomicInteger taskId = new AtomicInteger(0); + // we fork execution count - 1 tasks to execute at least one task on the current thread to + // minimize needless forking and blocking of the current thread + if (count > 1) { + final Runnable work = + () -> { + int id = taskId.getAndIncrement(); + if (id < count) { + futures.get(id).run(); + } + }; + for (int j = 0; j < count - 1; j++) { + executor.execute(work); + } + } + // try to execute as many tasks as possible on the current thread to minimize context + // switching in case of long running concurrent + // tasks as well as dead-locking if the current thread is part of #executor for executors that + // have limited or no parallelism + int id; + while ((id = taskId.getAndIncrement()) < count) { + futures.get(id).run(); + if (id >= count - 1) { + // save redundant CAS in case this was the last task + break; + } + } + return collectResults(futures); + } + + private static List collectResults(List> futures) throws IOException { + Throwable exc = null; + List results = new ArrayList<>(futures.size()); + for (Future future : futures) { + try { + results.add(future.get()); + } catch (InterruptedException e) { + exc = IOUtils.useOrSuppress(exc, new ThreadInterruptedException(e)); + } catch (ExecutionException e) { + exc = IOUtils.useOrSuppress(exc, e.getCause()); + } + } + assert assertAllFuturesCompleted(futures) : "Some tasks are still running?"; + if (exc != null) { + throw IOUtils.rethrowAlways(exc); + } + return results; } @Override @@ -89,128 +141,62 @@ public final class TaskExecutor { return "TaskExecutor(" + "executor=" + executor + ')'; } - /** - * Holds all the sub-tasks that a certain operation gets split into as it gets parallelized and - 
* exposes the ability to invoke such tasks and wait for them all to complete their execution and - * provide their results. Additionally, if one task throws an exception, all other tasks from the - * same group are cancelled, to avoid needless computation as their results would not be exposed - * anyways. Creates one {@link FutureTask} for each {@link Callable} provided - * - * @param the return type of all the callables - */ - private static final class TaskGroup { - private final List> futures; - - TaskGroup(Collection> callables) { - List> tasks = new ArrayList<>(callables.size()); - for (Callable callable : callables) { - tasks.add(createTask(callable)); + private static boolean assertAllFuturesCompleted(Collection> futures) { + for (Future future : futures) { + if (future.isDone() == false) { + return false; } - this.futures = Collections.unmodifiableList(tasks); + } + return true; + } + + private static void cancelAll(Collection> futures) { + for (Future future : futures) { + future.cancel(false); + } + } + + private static class Task extends FutureTask { + + private final AtomicBoolean startedOrCancelled = new AtomicBoolean(false); + + private final Collection> futures; + + public Task(Callable callable, Collection> futures) { + super(callable); + this.futures = futures; } - RunnableFuture createTask(Callable callable) { - return new FutureTask<>(callable) { - - private final AtomicBoolean startedOrCancelled = new AtomicBoolean(false); - - @Override - public void run() { - if (startedOrCancelled.compareAndSet(false, true)) { - super.run(); - } - } - - @Override - protected void setException(Throwable t) { - super.setException(t); - cancelAll(); - } - - @Override - public boolean cancel(boolean mayInterruptIfRunning) { - assert mayInterruptIfRunning == false - : "cancelling tasks that are running is not supported"; - /* - Future#get (called in invokeAll) throws CancellationException when invoked against a running task that has been cancelled but - leaves the task running. We rather want to make sure that invokeAll does not leave any running tasks behind when it returns. - Overriding cancel ensures that tasks that are already started will complete normally once cancelled, and Future#get will - wait for them to finish instead of throwing CancellationException. A cleaner way would have been to override FutureTask#get and - make it wait for cancelled tasks, but FutureTask#awaitDone is private. Tasks that are cancelled before they are started will be no-op. - */ - if (startedOrCancelled.compareAndSet(false, true)) { - // task is cancelled hence it has no results to return. That's fine: they would be - // ignored anyway. 
- set(null); - return true; - } - return false; - } - }; + @Override + public void run() { + if (startedOrCancelled.compareAndSet(false, true)) { + super.run(); + } } - List invokeAll(Executor executor) throws IOException { - final int count = futures.size(); - // taskId provides the first index of an un-executed task in #futures - final AtomicInteger taskId = new AtomicInteger(0); - // we fork execution count - 1 tasks to execute at least one task on the current thread to - // minimize needless forking and blocking of the current thread - if (count > 1) { - final Runnable work = - () -> { - int id = taskId.getAndIncrement(); - if (id < count) { - futures.get(id).run(); - } - }; - for (int j = 0; j < count - 1; j++) { - executor.execute(work); - } - } - // try to execute as many tasks as possible on the current thread to minimize context - // switching in case of long running concurrent - // tasks as well as dead-locking if the current thread is part of #executor for executors that - // have limited or no parallelism - int id; - while ((id = taskId.getAndIncrement()) < count) { - futures.get(id).run(); - if (id >= count - 1) { - // save redundant CAS in case this was the last task - break; - } - } - Throwable exc = null; - List results = new ArrayList<>(count); - for (int i = 0; i < count; i++) { - Future future = futures.get(i); - try { - results.add(future.get()); - } catch (InterruptedException e) { - exc = IOUtils.useOrSuppress(exc, new ThreadInterruptedException(e)); - } catch (ExecutionException e) { - exc = IOUtils.useOrSuppress(exc, e.getCause()); - } - } - assert assertAllFuturesCompleted() : "Some tasks are still running?"; - if (exc != null) { - throw IOUtils.rethrowAlways(exc); - } - return results; + @Override + protected void setException(Throwable t) { + super.setException(t); + cancelAll(futures); } - private boolean assertAllFuturesCompleted() { - for (RunnableFuture future : futures) { - if (future.isDone() == false) { - return false; - } - } - return true; - } - - private void cancelAll() { - for (Future future : futures) { - future.cancel(false); + @Override + public boolean cancel(boolean mayInterruptIfRunning) { + assert mayInterruptIfRunning == false : "cancelling tasks that are running is not supported"; + /* + Future#get (called in #collectResults) throws CancellationException when invoked against a running task that has been cancelled but + leaves the task running. We rather want to make sure that invokeAll does not leave any running tasks behind when it returns. + Overriding cancel ensures that tasks that are already started will complete normally once cancelled, and Future#get will + wait for them to finish instead of throwing CancellationException. A cleaner way would have been to override FutureTask#get and + make it wait for cancelled tasks, but FutureTask#awaitDone is private. Tasks that are cancelled before they are started will be no-op. + */ + if (startedOrCancelled.compareAndSet(false, true)) { + // task is cancelled hence it has no results to return. That's fine: they would be + // ignored anyway. 
+ set(null); + return true; } + return false; } } } diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java index da01b24f0bd..c82df0ac1eb 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java @@ -27,13 +27,7 @@ import org.apache.lucene.index.PrefixCodedTerms.TermIterator; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.util.Accountable; -import org.apache.lucene.util.AttributeSource; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.BytesRefBuilder; -import org.apache.lucene.util.BytesRefComparator; -import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.StringSorter; +import org.apache.lucene.util.*; import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.ByteRunAutomaton; @@ -141,6 +135,16 @@ public class TermInSetQuery extends MultiTermQuery implements Accountable { return termData.size(); } + /** + * Get an iterator over the encoded terms for query inspection. + * + * @lucene.experimental + */ + public BytesRefIterator getBytesRefIterator() { + final TermIterator iterator = this.termData.iterator(); + return () -> iterator.next(); + } + @Override public void visit(QueryVisitor visitor) { if (visitor.acceptField(field) == false) { diff --git a/lucene/core/src/java/org/apache/lucene/search/TopFieldCollector.java b/lucene/core/src/java/org/apache/lucene/search/TopFieldCollector.java index 114797f44cb..eac31bf89d0 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TopFieldCollector.java +++ b/lucene/core/src/java/org/apache/lucene/search/TopFieldCollector.java @@ -24,7 +24,6 @@ import java.util.Objects; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.search.FieldValueHitQueue.Entry; -import org.apache.lucene.search.MaxScoreAccumulator.DocAndScore; import org.apache.lucene.search.TotalHits.Relation; /** @@ -366,10 +365,12 @@ public abstract class TopFieldCollector extends TopDocsCollector { // we can start checking the global maximum score even // if the local queue is not full because the threshold // is reached. 
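The TaskExecutor rework above inlines the former TaskGroup helper into invokeAll itself: the caller forks count - 1 tasks to the executor, then drains the shared AtomicInteger counter on its own thread, so at least one task always runs locally and a caller that is itself a pool thread cannot deadlock on an exhausted executor. A minimal sketch of the resulting caller-visible behavior, assuming a plain fixed pool; the pool size, class name and failing callable are illustrative only, not part of the patch:

import java.io.IOException;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.lucene.search.TaskExecutor;

public class InvokeAllSketch {
  public static void main(String[] args) {
    ExecutorService pool = Executors.newFixedThreadPool(4); // illustrative pool
    TaskExecutor taskExecutor = new TaskExecutor(pool);
    List<Callable<Integer>> callables =
        List.of(
            () -> 1,
            () -> {
              throw new IOException("simulated failure"); // triggers group cancellation
            },
            () -> 3);
    try {
      // On failure, tasks that have not yet started are cancelled; the first
      // exception is rethrown and any later ones are attached as suppressed.
      taskExecutor.invokeAll(callables);
    } catch (IOException e) {
      System.out.println("first failure: " + e.getMessage());
    } finally {
      pool.shutdown();
    }
  }
}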
- DocAndScore maxMinScore = minScoreAcc.get(); - if (maxMinScore != null && maxMinScore.score() > minCompetitiveScore) { - scorer.setMinCompetitiveScore(maxMinScore.score()); - minCompetitiveScore = maxMinScore.score(); + long maxMinScore = minScoreAcc.getRaw(); + float score; + if (maxMinScore != Long.MIN_VALUE + && (score = MaxScoreAccumulator.toScore(maxMinScore)) > minCompetitiveScore) { + scorer.setMinCompetitiveScore(score); + minCompetitiveScore = score; totalHitsRelation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO; } } diff --git a/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java b/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java index b951aaa7f89..3469276982b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java +++ b/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java @@ -18,7 +18,6 @@ package org.apache.lucene.search; import java.io.IOException; import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.search.MaxScoreAccumulator.DocAndScore; /** * A {@link Collector} implementation that collects the top-scoring hits, returning them as a {@link @@ -226,13 +225,13 @@ public abstract class TopScoreDocCollector extends TopDocsCollector { protected void updateGlobalMinCompetitiveScore(Scorable scorer) throws IOException { assert minScoreAcc != null; - DocAndScore maxMinScore = minScoreAcc.get(); - if (maxMinScore != null) { + long maxMinScore = minScoreAcc.getRaw(); + if (maxMinScore != Long.MIN_VALUE) { // since we tie-break on doc id and collect in doc id order we can require // the next float if the global minimum score is set on a document id that is // smaller than the ids in the current leaf - float score = - docBase >= maxMinScore.docId() ? Math.nextUp(maxMinScore.score()) : maxMinScore.score(); + float score = MaxScoreAccumulator.toScore(maxMinScore); + score = docBase >= MaxScoreAccumulator.docId(maxMinScore) ? 
Math.nextUp(score) : score; if (score > minCompetitiveScore) { assert hitsThresholdChecker.isThresholdReached(); scorer.setMinCompetitiveScore(score); diff --git a/lucene/core/src/java/org/apache/lucene/search/knn/MultiLeafKnnCollector.java b/lucene/core/src/java/org/apache/lucene/search/knn/MultiLeafKnnCollector.java index 1ca979d6794..051cd9ed633 100644 --- a/lucene/core/src/java/org/apache/lucene/search/knn/MultiLeafKnnCollector.java +++ b/lucene/core/src/java/org/apache/lucene/search/knn/MultiLeafKnnCollector.java @@ -33,17 +33,17 @@ public final class MultiLeafKnnCollector implements KnnCollector { // greediness of globally non-competitive search: (0,1] private static final float DEFAULT_GREEDINESS = 0.9f; + private static final int DEFAULT_INTERVAL = 0xff; // the global queue of the highest similarities collected so far across all segments private final BlockingFloatHeap globalSimilarityQueue; // the local queue of the highest similarities if we are not competitive globally // the size of this queue is defined by greediness private final FloatHeap nonCompetitiveQueue; - private final float greediness; // the queue of the local similarities to periodically update with the global queue private final FloatHeap updatesQueue; private final float[] updatesScratch; // interval to synchronize the local and global queues, as a number of visited vectors - private final int interval = 0xff; // 255 + private final int interval; private boolean kResultsCollected = false; private float cachedGlobalMinSim = Float.NEGATIVE_INFINITY; private final AbstractKnnCollector subCollector; @@ -58,7 +58,32 @@ public final class MultiLeafKnnCollector implements KnnCollector { */ public MultiLeafKnnCollector( int k, BlockingFloatHeap globalSimilarityQueue, AbstractKnnCollector subCollector) { - this.greediness = DEFAULT_GREEDINESS; + this(k, DEFAULT_GREEDINESS, DEFAULT_INTERVAL, globalSimilarityQueue, subCollector); + } + + /** + * Create a new MultiLeafKnnCollector. + * + * @param k the number of neighbors to collect + * @param greediness the greediness of the global search + * @param interval (by number of collected values) the interval to synchronize the local and + * global queues + * @param globalSimilarityQueue the global queue of the highest similarities collected so far + * @param subCollector the local collector + */ + public MultiLeafKnnCollector( + int k, + float greediness, + int interval, + BlockingFloatHeap globalSimilarityQueue, + AbstractKnnCollector subCollector) { + if (greediness < 0 || greediness > 1) { + throw new IllegalArgumentException("greediness must be in [0,1]"); + } + if (interval <= 0) { + throw new IllegalArgumentException("interval must be positive"); + } + this.interval = interval; this.subCollector = subCollector; this.globalSimilarityQueue = globalSimilarityQueue; this.nonCompetitiveQueue = new FloatHeap(Math.max(1, Math.round((1 - greediness) * k))); diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java b/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java index 77f71782e31..b4546946acf 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java @@ -44,13 +44,26 @@ public abstract class Axiomatic extends SimilarityBase { protected final int queryLen; /** - * Constructor setting all Axiomatic hyperparameters + * Constructor setting all Axiomatic hyperparameters and using default discountOverlaps value. 
* * @param s hyperparam for the growth function * @param queryLen the query length * @param k hyperparam for the primitive weighting function */ public Axiomatic(float s, int queryLen, float k) { + this(true, s, queryLen, k); + } + + /** + * Constructor setting all Axiomatic hyperparameters + * + * @param discountOverlaps true if overlap tokens should not impact document length for scoring. + * @param s hyperparam for the growth function + * @param queryLen the query length + * @param k hyperparam for the primitive weighting function + */ + public Axiomatic(boolean discountOverlaps, float s, int queryLen, float k) { + super(discountOverlaps); if (Float.isFinite(s) == false || Float.isNaN(s) || s < 0 || s > 1) { throw new IllegalArgumentException("illegal s value: " + s + ", must be between 0 and 1"); } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java index b9c651008cc..34d619ea69f 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java @@ -46,11 +46,23 @@ public class DFISimilarity extends SimilarityBase { private final Independence independence; /** - * Create DFI with the specified divergence from independence measure + * Create DFI with the specified divergence from independence measure and using default + * discountOverlaps value * * @param independenceMeasure measure of divergence from independence */ public DFISimilarity(Independence independenceMeasure) { + this(independenceMeasure, true); + } + + /** + * Create DFI with the specified parameters + * + * @param independenceMeasure measure of divergence from independence + * @param discountOverlaps true if overlap tokens should not impact document length for scoring. + */ + public DFISimilarity(Independence independenceMeasure, boolean discountOverlaps) { + super(discountOverlaps); this.independence = independenceMeasure; } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java index 0b3c1a5e7f0..08e424b3230 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java @@ -83,7 +83,7 @@ public class DFRSimilarity extends SimilarityBase { protected final Normalization normalization; /** - * Creates DFRSimilarity from the three components. + * Creates DFRSimilarity from the three components and using default discountOverlaps value. * *
<p>
Note that null values are not allowed: if you want no normalization, instead * pass {@link NoNormalization}. @@ -98,7 +98,7 @@ public class DFRSimilarity extends SimilarityBase { } /** - * Creates DFRSimilarity from the three components. + * Creates DFRSimilarity from the three components and with the specified discountOverlaps value. * *
<p>
Note that null values are not allowed: if you want no normalization, instead * pass {@link NoNormalization}. diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/IBSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/IBSimilarity.java index 5b0e93571b1..d2325d20033 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/IBSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/IBSimilarity.java @@ -76,7 +76,7 @@ public class IBSimilarity extends SimilarityBase { protected final Normalization normalization; /** - * Creates IBSimilarity from the three components. + * Creates IBSimilarity from the three components and using default discountOverlaps value. * *
<p>
Note that null values are not allowed: if you want no normalization, instead * pass {@link NoNormalization}. @@ -86,6 +86,26 @@ public class IBSimilarity extends SimilarityBase { * @param normalization term frequency normalization */ public IBSimilarity(Distribution distribution, Lambda lambda, Normalization normalization) { + this(distribution, lambda, normalization, true); + } + + /** + * Creates IBSimilarity from the three components and with the specified discountOverlaps value. + * + *
<p>
Note that null values are not allowed: if you want no normalization, instead + * pass {@link NoNormalization}. + * + * @param distribution probabilistic distribution modeling term occurrence + * @param lambda distribution's λw parameter + * @param normalization term frequency normalization + * @param discountOverlaps true if overlap tokens should not impact document length for scoring. + */ + public IBSimilarity( + Distribution distribution, + Lambda lambda, + Normalization normalization, + boolean discountOverlaps) { + super(discountOverlaps); this.distribution = distribution; this.lambda = lambda; this.normalization = normalization; diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/IndriDirichletSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/IndriDirichletSimilarity.java index 9f708362bb5..b3994c5dc46 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/IndriDirichletSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/IndriDirichletSimilarity.java @@ -37,6 +37,13 @@ public class IndriDirichletSimilarity extends LMSimilarity { /** The μ parameter. */ private final float mu; + /** Instantiates the similarity with the provided parameters. */ + public IndriDirichletSimilarity( + CollectionModel collectionModel, boolean discountOverlaps, float mu) { + super(collectionModel, discountOverlaps); + this.mu = mu; + } + /** Instantiates the similarity with the provided μ parameter. */ public IndriDirichletSimilarity(CollectionModel collectionModel, float mu) { super(collectionModel); diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java index 51b1604aef1..ab80d0d337e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java @@ -39,7 +39,13 @@ public class LMDirichletSimilarity extends LMSimilarity { /** Instantiates the similarity with the provided μ parameter. */ public LMDirichletSimilarity(CollectionModel collectionModel, float mu) { - super(collectionModel); + this(collectionModel, true, mu); + } + + /** Instantiates the similarity with the provided parameters. */ + public LMDirichletSimilarity( + CollectionModel collectionModel, boolean discountOverlaps, float mu) { + super(collectionModel, discountOverlaps); if (Float.isFinite(mu) == false || mu < 0) { throw new IllegalArgumentException( "illegal mu value: " + mu + ", must be a non-negative finite value"); diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java index e1990f34b0b..7029fa8e133 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java @@ -43,7 +43,13 @@ public class LMJelinekMercerSimilarity extends LMSimilarity { /** Instantiates with the specified collectionModel and λ parameter. */ public LMJelinekMercerSimilarity(CollectionModel collectionModel, float lambda) { - super(collectionModel); + this(collectionModel, true, lambda); + } + + /** Instantiates with the specified collectionModel and parameters. 
*/ + public LMJelinekMercerSimilarity( + CollectionModel collectionModel, boolean discountOverlaps, float lambda) { + super(collectionModel, discountOverlaps); if (Float.isNaN(lambda) || lambda <= 0 || lambda > 1) { throw new IllegalArgumentException("lambda must be in the range (0 .. 1]"); } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/LMSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/LMSimilarity.java index e1536db268f..5bd48f37a34 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/LMSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/LMSimilarity.java @@ -43,6 +43,12 @@ public abstract class LMSimilarity extends SimilarityBase { /** Creates a new instance with the specified collection language model. */ public LMSimilarity(CollectionModel collectionModel) { + this(collectionModel, true); + } + + /** Creates a new instance with the specified collection language model and discountOverlaps. */ + public LMSimilarity(CollectionModel collectionModel, boolean discountOverlaps) { + super(discountOverlaps); this.collectionModel = collectionModel; } diff --git a/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java b/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java index 13151692bc0..7f2aadf54a5 100644 --- a/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java @@ -151,7 +151,7 @@ public abstract class BufferedIndexInput extends IndexInput implements RandomAcc } @Override - protected void readGroupVInt(long[] dst, int offset) throws IOException { + public void readGroupVInt(long[] dst, int offset) throws IOException { final int len = GroupVIntUtil.readGroupVInt( this, buffer.remaining(), p -> buffer.getInt((int) p), buffer.position(), dst, offset); diff --git a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java index 4b722b61689..a09f78e5f3a 100644 --- a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java @@ -204,7 +204,7 @@ public final class ByteBuffersDataInput extends DataInput } @Override - protected void readGroupVInt(long[] dst, int offset) throws IOException { + public void readGroupVInt(long[] dst, int offset) throws IOException { final ByteBuffer block = blocks[blockIndex(pos)]; final int blockOffset = blockOffset(pos); // We MUST save the return value to local variable, could not use pos += readGroupVInt(...). 
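The similarity changes above all thread a discountOverlaps flag through the SimilarityBase constructor, so Axiomatic, DFISimilarity, DFRSimilarity, IBSimilarity and the LM similarities can opt out of ignoring overlap tokens (tokens with position increment 0) when computing document length; every pre-existing constructor keeps the old behavior by delegating with true. A short sketch against the new constructors, assuming they land exactly as in the hunks above; the class and variable names are illustrative:

import org.apache.lucene.search.similarities.LMDirichletSimilarity;
import org.apache.lucene.search.similarities.LMSimilarity;

public class DiscountOverlapsSketch {
  public static void main(String[] args) {
    LMSimilarity.CollectionModel model = new LMSimilarity.DefaultCollectionModel();
    // Pre-existing constructor: delegates with discountOverlaps = true,
    // so overlap tokens do not contribute to document length.
    LMDirichletSimilarity discounting = new LMDirichletSimilarity(model, 2000f);
    // New constructor from this patch: count overlap tokens as well.
    LMDirichletSimilarity counting = new LMDirichletSimilarity(model, false, 2000f);
    System.out.println(discounting + " vs " + counting);
  }
}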
diff --git a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java index eaa0929848d..1c6bcd63629 100644 --- a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java +++ b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataOutput.java @@ -31,7 +31,6 @@ import java.util.function.Consumer; import java.util.function.IntFunction; import org.apache.lucene.util.Accountable; import org.apache.lucene.util.BitUtil; -import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.UnicodeUtil; @@ -415,12 +414,17 @@ public final class ByteBuffersDataOutput extends DataOutput implements Accountab @Override public void writeString(String v) { try { - if (v.length() <= MAX_CHARS_PER_WINDOW) { - final BytesRef utf8 = new BytesRef(v); - writeVInt(utf8.length); - writeBytes(utf8.bytes, utf8.offset, utf8.length); + final int charCount = v.length(); + final int byteLen = UnicodeUtil.calcUTF16toUTF8Length(v, 0, charCount); + writeVInt(byteLen); + ByteBuffer currentBlock = this.currentBlock; + if (currentBlock.hasArray() && currentBlock.remaining() >= byteLen) { + int startingPos = currentBlock.position(); + UnicodeUtil.UTF16toUTF8( + v, 0, charCount, currentBlock.array(), currentBlock.arrayOffset() + startingPos); + currentBlock.position(startingPos + byteLen); } else { - writeLongString(v); + writeLongString(byteLen, v); } } catch (IOException e) { throw new UncheckedIOException(e); @@ -541,9 +545,7 @@ public final class ByteBuffersDataOutput extends DataOutput implements Accountab } /** Writes a long string in chunks */ - private void writeLongString(final String s) throws IOException { - final int byteLen = UnicodeUtil.calcUTF16toUTF8Length(s, 0, s.length()); - writeVInt(byteLen); + private void writeLongString(int byteLen, final String s) throws IOException { final byte[] buf = new byte[Math.min(byteLen, UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * MAX_CHARS_PER_WINDOW)]; for (int i = 0, end = s.length(); i < end; ) { diff --git a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersIndexInput.java b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersIndexInput.java index c66d864d570..6aebb771b68 100644 --- a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersIndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersIndexInput.java @@ -206,7 +206,7 @@ public final class ByteBuffersIndexInput extends IndexInput implements RandomAcc } @Override - protected void readGroupVInt(long[] dst, int offset) throws IOException { + public void readGroupVInt(long[] dst, int offset) throws IOException { ensureOpen(); in.readGroupVInt(dst, offset); } diff --git a/lucene/core/src/java/org/apache/lucene/store/DataInput.java b/lucene/core/src/java/org/apache/lucene/store/DataInput.java index 427e81f2df2..70f9a96db9c 100644 --- a/lucene/core/src/java/org/apache/lucene/store/DataInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/DataInput.java @@ -100,28 +100,10 @@ public abstract class DataInput implements Cloneable { } /** - * Read all the group varints, including the tail vints. we need a long[] because this is what - * postings are using, all longs are actually required to be integers. - * - * @param dst the array to read ints into. - * @param limit the number of int values to read. 
- * @lucene.experimental - */ - public final void readGroupVInts(long[] dst, int limit) throws IOException { - int i; - for (i = 0; i <= limit - 4; i += 4) { - readGroupVInt(dst, i); - } - for (; i < limit; ++i) { - dst[i] = readVInt() & 0xFFFFFFFFL; - } - } - - /** - * Override if you have a efficient implementation. In general this is when the input supports + * Override if you have an efficient implementation. In general this is when the input supports * random access. */ - protected void readGroupVInt(long[] dst, int offset) throws IOException { + public void readGroupVInt(long[] dst, int offset) throws IOException { GroupVIntUtil.readGroupVInt(this, dst, offset); } diff --git a/lucene/core/src/java/org/apache/lucene/store/IndexInput.java b/lucene/core/src/java/org/apache/lucene/store/IndexInput.java index ee84d908838..38eb1dcbcee 100644 --- a/lucene/core/src/java/org/apache/lucene/store/IndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/IndexInput.java @@ -127,6 +127,10 @@ public abstract class IndexInput extends DataInput implements Closeable { * CompoundFormat} implementations to honor the {@link ReadAdvice} of each file within the * compound file. * + *
<p>
NOTE: it is only legal to call this method if this {@link IndexInput} has been open + * with {@link ReadAdvice#NORMAL}. However, this method accepts any {@link ReadAdvice} value but + * {@code null} as a read advice for the slice. + * *
<p>
The default implementation delegates to {@link #slice(String, long, long)} and ignores the * {@link ReadAdvice}. */ diff --git a/lucene/core/src/java/org/apache/lucene/util/GroupVIntUtil.java b/lucene/core/src/java/org/apache/lucene/util/GroupVIntUtil.java index e1b5466342a..1c5033172db 100644 --- a/lucene/core/src/java/org/apache/lucene/util/GroupVIntUtil.java +++ b/lucene/core/src/java/org/apache/lucene/util/GroupVIntUtil.java @@ -33,10 +33,29 @@ public final class GroupVIntUtil { private static final long[] MASKS = new long[] {0xFFL, 0xFFFFL, 0xFFFFFFL, 0xFFFFFFFFL}; /** - * Default implementation of read single group, for optimal performance, you should use {@link - * DataInput#readGroupVInts(long[], int)} instead. + * Read all the group varints, including the tail vints. we need a long[] because this is what + * postings are using, all longs are actually required to be integers. * * @param dst the array to read ints into. + * @param limit the number of int values to read. + * @lucene.experimental + */ + public static void readGroupVInts(DataInput in, long[] dst, int limit) throws IOException { + int i; + for (i = 0; i <= limit - 4; i += 4) { + in.readGroupVInt(dst, i); + } + for (; i < limit; ++i) { + dst[i] = in.readVInt() & 0xFFFFFFFFL; + } + } + + /** + * Default implementation of read single group, for optimal performance, you should use {@link + * GroupVIntUtil#readGroupVInts(DataInput, long[], int)} instead. + * + * @param in the input to use to read data. + * @param dst the array to read ints into. * @param offset the offset in the array to start storing ints. */ public static void readGroupVInt(DataInput in, long[] dst, int offset) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/util/StringHelper.java b/lucene/core/src/java/org/apache/lucene/util/StringHelper.java index d264c1da58d..5580f5fbe8e 100644 --- a/lucene/core/src/java/org/apache/lucene/util/StringHelper.java +++ b/lucene/core/src/java/org/apache/lucene/util/StringHelper.java @@ -209,6 +209,156 @@ public abstract class StringHelper { return murmurhash3_x86_32(bytes.bytes, bytes.offset, bytes.length, seed); } + /** + * Generates 128-bit hash from the byte array with the given offset, length and seed. + * + *
<p>
The code is adopted from Apache Commons (link) + * + * @param data The input byte array + * @param offset The first element of array + * @param length The length of array + * @param seed The initial seed value + * @return The 128-bit hash (2 longs) + */ + public static long[] murmurhash3_x64_128( + final byte[] data, final int offset, final int length, final int seed) { + // Use an unsigned 32-bit integer as the seed + return murmurhash3_x64_128(data, offset, length, seed & 0xFFFFFFFFL); + } + + @SuppressWarnings("fallthrough") + private static long[] murmurhash3_x64_128( + final byte[] data, final int offset, final int length, final long seed) { + long h1 = seed; + long h2 = seed; + final int nblocks = length >> 4; + + // Constants for 128-bit variant + final long C1 = 0x87c37b91114253d5L; + final long C2 = 0x4cf5ad432745937fL; + final int R1 = 31; + final int R2 = 27; + final int R3 = 33; + final int M = 5; + final int N1 = 0x52dce729; + final int N2 = 0x38495ab5; + + // body + for (int i = 0; i < nblocks; i++) { + final int index = offset + (i << 4); + long k1 = (long) BitUtil.VH_LE_LONG.get(data, index); + long k2 = (long) BitUtil.VH_LE_LONG.get(data, index + 8); + + // mix functions for k1 + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + h1 ^= k1; + h1 = Long.rotateLeft(h1, R2); + h1 += h2; + h1 = h1 * M + N1; + + // mix functions for k2 + k2 *= C2; + k2 = Long.rotateLeft(k2, R3); + k2 *= C1; + h2 ^= k2; + h2 = Long.rotateLeft(h2, R1); + h2 += h1; + h2 = h2 * M + N2; + } + + // tail + long k1 = 0; + long k2 = 0; + final int index = offset + (nblocks << 4); + switch (length & 0x0F) { + case 15: + k2 ^= ((long) data[index + 14] & 0xff) << 48; + case 14: + k2 ^= ((long) data[index + 13] & 0xff) << 40; + case 13: + k2 ^= ((long) data[index + 12] & 0xff) << 32; + case 12: + k2 ^= ((long) data[index + 11] & 0xff) << 24; + case 11: + k2 ^= ((long) data[index + 10] & 0xff) << 16; + case 10: + k2 ^= ((long) data[index + 9] & 0xff) << 8; + case 9: + k2 ^= data[index + 8] & 0xff; + k2 *= C2; + k2 = Long.rotateLeft(k2, R3); + k2 *= C1; + h2 ^= k2; + + case 8: + k1 ^= ((long) data[index + 7] & 0xff) << 56; + case 7: + k1 ^= ((long) data[index + 6] & 0xff) << 48; + case 6: + k1 ^= ((long) data[index + 5] & 0xff) << 40; + case 5: + k1 ^= ((long) data[index + 4] & 0xff) << 32; + case 4: + k1 ^= ((long) data[index + 3] & 0xff) << 24; + case 3: + k1 ^= ((long) data[index + 2] & 0xff) << 16; + case 2: + k1 ^= ((long) data[index + 1] & 0xff) << 8; + case 1: + k1 ^= data[index] & 0xff; + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + h1 ^= k1; + } + + // finalization + h1 ^= length; + h2 ^= length; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + return new long[] {h1, h2}; + } + + /** + * Performs the final avalanche mix step of the 64-bit hash function. + * + * @param hash The current hash + * @return The final hash + */ + private static long fmix64(long hash) { + hash ^= (hash >>> 33); + hash *= 0xff51afd7ed558ccdL; + hash ^= (hash >>> 33); + hash *= 0xc4ceb9fe1a85ec53L; + hash ^= (hash >>> 33); + return hash; + } + + /** + * Generates 128-bit hash from the byte array with the given offset, length and seed. + * + *
<p>
The code is adopted from Apache Commons (link) + * + * @param data The input data + * @return The 128-bit hash (2 longs) + */ + public static long[] murmurhash3_x64_128(BytesRef data) { + return murmurhash3_x64_128(data.bytes, data.offset, data.length, 104729); + } + // Holds 128 bit unsigned value: private static BigInteger nextId; private static final BigInteger mask128; diff --git a/lucene/core/src/java/org/apache/lucene/util/Version.java b/lucene/core/src/java/org/apache/lucene/util/Version.java index 91eb4649efc..e232f1ab6d2 100644 --- a/lucene/core/src/java/org/apache/lucene/util/Version.java +++ b/lucene/core/src/java/org/apache/lucene/util/Version.java @@ -32,140 +32,23 @@ import java.util.jar.Manifest; public final class Version { /** - * Match settings and bugs in Lucene's 9.0.0 release. - * - * @deprecated (9.1.0) Use latest + * @deprecated Use latest */ - @Deprecated public static final Version LUCENE_9_0_0 = new Version(9, 0, 0); + @Deprecated public static final Version LUCENE_10_0_0 = new Version(10, 0, 0); /** - * Match settings and bugs in Lucene's 9.1.0 release. - * - * @deprecated (9.2.0) Use latest - */ - @Deprecated public static final Version LUCENE_9_1_0 = new Version(9, 1, 0); - - /** - * Match settings and bugs in Lucene's 9.2.0 release. - * - * @deprecated (9.3.0) Use latest - */ - @Deprecated public static final Version LUCENE_9_2_0 = new Version(9, 2, 0); - - /** - * Match settings and bugs in Lucene's 9.3.0 release. - * - * @deprecated (9.4.0) Use latest - */ - @Deprecated public static final Version LUCENE_9_3_0 = new Version(9, 3, 0); - - /** - * Match settings and bugs in Lucene's 9.4.0 release. + * Match settings and bugs in Lucene's 10.1.0 release. * * @deprecated Use latest */ - @Deprecated public static final Version LUCENE_9_4_0 = new Version(9, 4, 0); + @Deprecated public static final Version LUCENE_10_1_0 = new Version(10, 1, 0); /** - * Match settings and bugs in Lucene's 9.4.1 release. - * - * @deprecated Use latest - * @deprecated (9.4.2) Use latest - */ - @Deprecated public static final Version LUCENE_9_4_1 = new Version(9, 4, 1); - - /** - * Match settings and bugs in Lucene's 9.4.2 release. - * - * @deprecated Use latest - */ - @Deprecated public static final Version LUCENE_9_4_2 = new Version(9, 4, 2); - - /** - * Match settings and bugs in Lucene's 9.5.0 release. - * - * @deprecated (9.6.0) Use latest - */ - @Deprecated public static final Version LUCENE_9_5_0 = new Version(9, 5, 0); - - /** - * Match settings and bugs in Lucene's 9.6.0 release. - * - * @deprecated (9.7.0) Use latest - */ - @Deprecated public static final Version LUCENE_9_6_0 = new Version(9, 6, 0); - - /** - * Match settings and bugs in Lucene's 9.7.0 release. - * - * @deprecated (9.8.0) Use latest - */ - @Deprecated public static final Version LUCENE_9_7_0 = new Version(9, 7, 0); - - /** - * Match settings and bugs in Lucene's 9.8.0 release. - * - * @deprecated (9.9.0) Use latest - */ - @Deprecated public static final Version LUCENE_9_8_0 = new Version(9, 8, 0); - - /** - * Match settings and bugs in Lucene's 9.9.0 release. - * - * @deprecated (9.9.1) Use latest - */ - @Deprecated public static final Version LUCENE_9_9_0 = new Version(9, 9, 0); - - /** - * Match settings and bugs in Lucene's 9.9.1 release. - * - * @deprecated (9.9.2) Use latest - */ - @Deprecated public static final Version LUCENE_9_9_1 = new Version(9, 9, 1); - - /** - * Match settings and bugs in Lucene's 9.9.2 release. 
- * - * @deprecated (9.10.0) Use latest - */ - @Deprecated public static final Version LUCENE_9_9_2 = new Version(9, 9, 2); - - /** - * Match settings and bugs in Lucene's 9.10.0 release. - * - * @deprecated (9.11.0) Use latest - */ - @Deprecated public static final Version LUCENE_9_10_0 = new Version(9, 10, 0); - - /** - * Match settings and bugs in Lucene's 9.11.0 release. - * - * @deprecated Use latest - * @deprecated (9.12.0) Use latest - * @deprecated (9.11.1) Use latest - */ - @Deprecated public static final Version LUCENE_9_11_0 = new Version(9, 11, 0); - - /** - * Match settings and bugs in Lucene's 9.11.1 release. - * - * @deprecated Use latest - */ - @Deprecated public static final Version LUCENE_9_11_1 = new Version(9, 11, 1); - - /** - * Match settings and bugs in Lucene's 9.12.0 release. - * - * @deprecated Use latest - */ - @Deprecated public static final Version LUCENE_9_12_0 = new Version(9, 12, 0); - - /** - * Match settings and bugs in Lucene's 10.0.0 release. + * Match settings and bugs in Lucene's 11.0.0 release. * *
<p>
Use this to get the latest & greatest settings, bug fixes, etc, for Lucene. */ - public static final Version LUCENE_10_0_0 = new Version(10, 0, 0); + public static final Version LUCENE_11_0_0 = new Version(11, 0, 0); // To add a new version: // * Only add above this comment @@ -181,7 +64,7 @@ public final class Version { * re-test your entire application to ensure it behaves as expected, as some defaults may * have changed and may break functionality in your application. */ - public static final Version LATEST = LUCENE_10_0_0; + public static final Version LATEST = LUCENE_11_0_0; /** * Constant for backwards compatibility. diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java index 9f6a10b9ddc..b9ea0d9aa08 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java @@ -17,13 +17,16 @@ package org.apache.lucene.util.bkd; import java.io.IOException; +import java.util.Arrays; import org.apache.lucene.index.PointValues.IntersectVisitor; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.DocBaseBitSetIterator; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.LongsRef; final class DocIdsWriter { @@ -36,6 +39,7 @@ final class DocIdsWriter { private static final byte LEGACY_DELTA_VINT = (byte) 0; private final int[] scratch; + private final LongsRef scratchLongs = new LongsRef(); /** * IntsRef to be used to iterate over the scratch buffer. A single instance is reused to avoid @@ -205,12 +209,17 @@ final class DocIdsWriter { } } - private static DocIdSetIterator readBitSetIterator(IndexInput in, int count) throws IOException { + private DocIdSetIterator readBitSetIterator(IndexInput in, int count) throws IOException { int offsetWords = in.readVInt(); int longLen = in.readVInt(); - long[] bits = new long[longLen]; - in.readLongs(bits, 0, longLen); - FixedBitSet bitSet = new FixedBitSet(bits, longLen << 6); + scratchLongs.longs = ArrayUtil.growNoCopy(scratchLongs.longs, longLen); + in.readLongs(scratchLongs.longs, 0, longLen); + // make ghost bits clear for FixedBitSet. 
+ if (longLen < scratchLongs.length) { + Arrays.fill(scratchLongs.longs, longLen, scratchLongs.longs.length, 0); + } + scratchLongs.length = longLen; + FixedBitSet bitSet = new FixedBitSet(scratchLongs.longs, longLen << 6); return new DocBaseBitSetIterator(bitSet, count, offsetWords << 6); } @@ -230,7 +239,7 @@ final class DocIdsWriter { } } - private static void readBitSet(IndexInput in, int count, int[] docIDs) throws IOException { + private void readBitSet(IndexInput in, int count, int[] docIDs) throws IOException { DocIdSetIterator iterator = readBitSetIterator(in, count); int docId, pos = 0; while ((docId = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { @@ -307,8 +316,7 @@ final class DocIdsWriter { } } - private static void readBitSet(IndexInput in, int count, IntersectVisitor visitor) - throws IOException { + private void readBitSet(IndexInput in, int count, IntersectVisitor visitor) throws IOException { DocIdSetIterator bitSetIterator = readBitSetIterator(in, count); visitor.visit(bitSetIterator); } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java index a6df78eaab5..c0f4bfeb572 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java @@ -775,10 +775,11 @@ public final class Util { /** Just takes unsigned byte values from the BytesRef and converts into an IntsRef. */ public static IntsRef toIntsRef(BytesRef input, IntsRefBuilder scratch) { - scratch.clear(); + scratch.growNoCopy(input.length); for (int i = 0; i < input.length; i++) { - scratch.append(input.bytes[i + input.offset] & 0xFF); + scratch.setIntAt(i, input.bytes[i + input.offset] & 0xFF); } + scratch.setLength(input.length); return scratch.get(); } diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/ConcurrentHnswMerger.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/ConcurrentHnswMerger.java index 392d83fa262..c4e7d159b48 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/ConcurrentHnswMerger.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/ConcurrentHnswMerger.java @@ -19,7 +19,7 @@ package org.apache.lucene.util.hnsw; import java.io.IOException; import org.apache.lucene.codecs.hnsw.HnswGraphProvider; import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.search.TaskExecutor; import org.apache.lucene.util.BitSet; import org.apache.lucene.util.FixedBitSet; @@ -46,7 +46,7 @@ public class ConcurrentHnswMerger extends IncrementalHnswGraphMerger { } @Override - protected HnswBuilder createBuilder(DocIdSetIterator mergedVectorIterator, int maxOrd) + protected HnswBuilder createBuilder(KnnVectorValues mergedVectorValues, int maxOrd) throws IOException { if (initReader == null) { return new HnswConcurrentMergeBuilder( @@ -61,7 +61,7 @@ public class ConcurrentHnswMerger extends IncrementalHnswGraphMerger { HnswGraph initializerGraph = ((HnswGraphProvider) initReader).getGraph(fieldInfo.name); BitSet initializedNodes = new FixedBitSet(maxOrd); - int[] oldToNewOrdinalMap = getNewOrdMapping(mergedVectorIterator, initializedNodes); + int[] oldToNewOrdinalMap = getNewOrdMapping(mergedVectorValues, initializedNodes); return new HnswConcurrentMergeBuilder( taskExecutor, diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java 
b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java index 1f5253ef7f8..bed1480e926 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java @@ -439,7 +439,11 @@ public class HnswGraphBuilder implements HnswBuilder { maxConn *= 2; } List components = HnswUtil.components(hnsw, level, notFullyConnected, maxConn); - // System.out.println("HnswGraphBuilder.connectComponents level=" + level + ": " + components); + if (infoStream.isEnabled(HNSW_COMPONENT)) { + infoStream.message( + HNSW_COMPONENT, "connect " + components.size() + " components on level=" + level); + } + // System.out.println("HnswGraphBuilder. level=" + level + ": " + components); boolean result = true; if (components.size() > 1) { // connect other components to the largest one @@ -457,12 +461,16 @@ public class HnswGraphBuilder implements HnswBuilder { if (c.start() == NO_MORE_DOCS) { continue; } + if (infoStream.isEnabled(HNSW_COMPONENT)) { + infoStream.message(HNSW_COMPONENT, "connect component " + c + " to " + c0); + } + beam.clear(); eps[0] = c0.start(); RandomVectorScorer scorer = scorerSupplier.scorer(c.start()); // find the closest node in the largest component to the lowest-numbered node in this // component that has room to make a connection - graphSearcher.searchLevel(beam, scorer, 0, eps, hnsw, notFullyConnected); + graphSearcher.searchLevel(beam, scorer, level, eps, hnsw, notFullyConnected); boolean linked = false; while (beam.size() > 0) { int c0node = beam.popNode(); @@ -475,8 +483,14 @@ public class HnswGraphBuilder implements HnswBuilder { // System.out.println("link " + c0 + "." + c0node + " to " + c + "." + c.start()); link(level, c0node, c.start(), score, notFullyConnected); linked = true; + if (infoStream.isEnabled(HNSW_COMPONENT)) { + infoStream.message(HNSW_COMPONENT, "connected ok " + c0node + " -> " + c.start()); + } } if (!linked) { + if (infoStream.isEnabled(HNSW_COMPONENT)) { + infoStream.message(HNSW_COMPONENT, "not connected; no free nodes found"); + } result = false; } } @@ -541,7 +555,7 @@ public class HnswGraphBuilder implements HnswBuilder { return queue.nodes(); } - float minimumScore() { + public float minimumScore() { return queue.topScore(); } diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphMerger.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphMerger.java index 7ed5dd142de..31e9c768dc0 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphMerger.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphMerger.java @@ -18,8 +18,8 @@ package org.apache.lucene.util.hnsw; import java.io.IOException; import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.MergeState; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.Bits; import org.apache.lucene.util.InfoStream; @@ -45,12 +45,12 @@ public interface HnswGraphMerger { /** * Merge and produce the on heap graph * - * @param mergedVectorIterator iterator over the vectors in the merged segment + * @param mergedVectorValues view of the vectors in the merged segment * @param infoStream optional info stream to set to builder * @param maxOrd max number of vectors that will be added to the graph * @return merged graph * @throws IOException during merge */ - OnHeapHnswGraph merge(DocIdSetIterator mergedVectorIterator, InfoStream infoStream, int maxOrd) + 
OnHeapHnswGraph merge(KnnVectorValues mergedVectorValues, InfoStream infoStream, int maxOrd) throws IOException; } diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/IncrementalHnswGraphMerger.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/IncrementalHnswGraphMerger.java index 7331111d45a..d64961a02ee 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/IncrementalHnswGraphMerger.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/IncrementalHnswGraphMerger.java @@ -25,9 +25,9 @@ import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.MergeState; import org.apache.lucene.internal.hppc.IntIntHashMap; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.BitSet; import org.apache.lucene.util.Bits; import org.apache.lucene.util.FixedBitSet; @@ -108,12 +108,12 @@ public class IncrementalHnswGraphMerger implements HnswGraphMerger { * Builds a new HnswGraphBuilder using the biggest graph from the merge state as a starting point. * If no valid readers were added to the merge state, a new graph is created. * - * @param mergedVectorIterator iterator over the vectors in the merged segment + * @param mergedVectorValues vector values in the merged segment * @param maxOrd max num of vectors that will be merged into the graph * @return HnswGraphBuilder * @throws IOException If an error occurs while reading from the merge state */ - protected HnswBuilder createBuilder(DocIdSetIterator mergedVectorIterator, int maxOrd) + protected HnswBuilder createBuilder(KnnVectorValues mergedVectorValues, int maxOrd) throws IOException { if (initReader == null) { return HnswGraphBuilder.create( @@ -123,7 +123,7 @@ public class IncrementalHnswGraphMerger implements HnswGraphMerger { HnswGraph initializerGraph = ((HnswGraphProvider) initReader).getGraph(fieldInfo.name); BitSet initializedNodes = new FixedBitSet(maxOrd); - int[] oldToNewOrdinalMap = getNewOrdMapping(mergedVectorIterator, initializedNodes); + int[] oldToNewOrdinalMap = getNewOrdMapping(mergedVectorValues, initializedNodes); return InitializedHnswGraphBuilder.fromGraph( scorerSupplier, M, @@ -137,8 +137,8 @@ public class IncrementalHnswGraphMerger implements HnswGraphMerger { @Override public OnHeapHnswGraph merge( - DocIdSetIterator mergedVectorIterator, InfoStream infoStream, int maxOrd) throws IOException { - HnswBuilder builder = createBuilder(mergedVectorIterator, maxOrd); + KnnVectorValues mergedVectorValues, InfoStream infoStream, int maxOrd) throws IOException { + HnswBuilder builder = createBuilder(mergedVectorValues, maxOrd); builder.setInfoStream(infoStream); return builder.build(maxOrd); } @@ -147,46 +147,45 @@ public class IncrementalHnswGraphMerger implements HnswGraphMerger { * Creates a new mapping from old ordinals to new ordinals and returns the total number of vectors * in the newly merged segment. 
* - * @param mergedVectorIterator iterator over the vectors in the merged segment + * @param mergedVectorValues vector values in the merged segment * @param initializedNodes track what nodes have been initialized * @return the mapping from old ordinals to new ordinals * @throws IOException If an error occurs while reading from the merge state */ protected final int[] getNewOrdMapping( - DocIdSetIterator mergedVectorIterator, BitSet initializedNodes) throws IOException { - DocIdSetIterator initializerIterator = null; + KnnVectorValues mergedVectorValues, BitSet initializedNodes) throws IOException { + KnnVectorValues.DocIndexIterator initializerIterator = null; switch (fieldInfo.getVectorEncoding()) { - case BYTE -> initializerIterator = initReader.getByteVectorValues(fieldInfo.name); - case FLOAT32 -> initializerIterator = initReader.getFloatVectorValues(fieldInfo.name); + case BYTE -> initializerIterator = initReader.getByteVectorValues(fieldInfo.name).iterator(); + case FLOAT32 -> + initializerIterator = initReader.getFloatVectorValues(fieldInfo.name).iterator(); } IntIntHashMap newIdToOldOrdinal = new IntIntHashMap(initGraphSize); - int oldOrd = 0; int maxNewDocID = -1; - for (int oldId = initializerIterator.nextDoc(); - oldId != NO_MORE_DOCS; - oldId = initializerIterator.nextDoc()) { - int newId = initDocMap.get(oldId); + for (int docId = initializerIterator.nextDoc(); + docId != NO_MORE_DOCS; + docId = initializerIterator.nextDoc()) { + int newId = initDocMap.get(docId); maxNewDocID = Math.max(newId, maxNewDocID); - newIdToOldOrdinal.put(newId, oldOrd); - oldOrd++; + newIdToOldOrdinal.put(newId, initializerIterator.index()); } if (maxNewDocID == -1) { return new int[0]; } final int[] oldToNewOrdinalMap = new int[initGraphSize]; - int newOrd = 0; + KnnVectorValues.DocIndexIterator mergedVectorIterator = mergedVectorValues.iterator(); for (int newDocId = mergedVectorIterator.nextDoc(); newDocId <= maxNewDocID; newDocId = mergedVectorIterator.nextDoc()) { int hashDocIndex = newIdToOldOrdinal.indexOf(newDocId); if (newIdToOldOrdinal.indexExists(hashDocIndex)) { + int newOrd = mergedVectorIterator.index(); initializedNodes.set(newOrd); oldToNewOrdinalMap[newIdToOldOrdinal.indexGet(hashDocIndex)] = newOrd; } - newOrd++; } return oldToNewOrdinalMap; } diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/RandomAccessVectorValues.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/RandomAccessVectorValues.java deleted file mode 100644 index e2c7372b667..00000000000 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/RandomAccessVectorValues.java +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.lucene.util.hnsw; - -import java.io.IOException; -import java.util.List; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.util.Bits; - -/** - * Provides random access to vectors by dense ordinal. This interface is used by HNSW-based - * implementations of KNN search. - * - * @lucene.experimental - */ -public interface RandomAccessVectorValues { - - /** Return the number of vector values */ - int size(); - - /** Return the dimension of the returned vector values */ - int dimension(); - - /** - * Creates a new copy of this {@link RandomAccessVectorValues}. This is helpful when you need to - * access different values at once, to avoid overwriting the underlying vector returned. - */ - RandomAccessVectorValues copy() throws IOException; - - /** - * Returns a slice of the underlying {@link IndexInput} that contains the vector values if - * available - */ - default IndexInput getSlice() { - return null; - } - - /** Returns the byte length of the vector values. */ - int getVectorByteLength(); - - /** - * Translates vector ordinal to the correct document ID. By default, this is an identity function. - * - * @param ord the vector ordinal - * @return the document Id for that vector ordinal - */ - default int ordToDoc(int ord) { - return ord; - } - - /** - * Returns the {@link Bits} representing live documents. By default, this is an identity function. - * - * @param acceptDocs the accept docs - * @return the accept docs - */ - default Bits getAcceptOrds(Bits acceptDocs) { - return acceptDocs; - } - - /** Float vector values. */ - interface Floats extends RandomAccessVectorValues { - @Override - RandomAccessVectorValues.Floats copy() throws IOException; - - /** - * Return the vector value indexed at the given ordinal. - * - * @param targetOrd a valid ordinal, ≥ 0 and < {@link #size()}. - */ - float[] vectorValue(int targetOrd) throws IOException; - - /** Returns the vector byte length, defaults to dimension multiplied by float byte size */ - @Override - default int getVectorByteLength() { - return dimension() * Float.BYTES; - } - } - - /** Byte vector values. */ - interface Bytes extends RandomAccessVectorValues { - @Override - RandomAccessVectorValues.Bytes copy() throws IOException; - - /** - * Return the vector value indexed at the given ordinal. - * - * @param targetOrd a valid ordinal, ≥ 0 and < {@link #size()}. - */ - byte[] vectorValue(int targetOrd) throws IOException; - - /** Returns the vector byte length, defaults to dimension multiplied by byte size */ - @Override - default int getVectorByteLength() { - return dimension() * Byte.BYTES; - } - } - - /** - * Creates a {@link RandomAccessVectorValues.Floats} from a list of float arrays. - * - * @param vectors the list of float arrays - * @param dim the dimension of the vectors - * @return a {@link RandomAccessVectorValues.Floats} instance - */ - static RandomAccessVectorValues.Floats fromFloats(List vectors, int dim) { - return new RandomAccessVectorValues.Floats() { - @Override - public int size() { - return vectors.size(); - } - - @Override - public int dimension() { - return dim; - } - - @Override - public float[] vectorValue(int targetOrd) { - return vectors.get(targetOrd); - } - - @Override - public RandomAccessVectorValues.Floats copy() { - return this; - } - }; - } - - /** - * Creates a {@link RandomAccessVectorValues.Bytes} from a list of byte arrays. 
- * - * @param vectors the list of byte arrays - * @param dim the dimension of the vectors - * @return a {@link RandomAccessVectorValues.Bytes} instance - */ - static RandomAccessVectorValues.Bytes fromBytes(List vectors, int dim) { - return new RandomAccessVectorValues.Bytes() { - @Override - public int size() { - return vectors.size(); - } - - @Override - public int dimension() { - return dim; - } - - @Override - public byte[] vectorValue(int targetOrd) { - return vectors.get(targetOrd); - } - - @Override - public RandomAccessVectorValues.Bytes copy() { - return this; - } - }; - } -} diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/RandomVectorScorer.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/RandomVectorScorer.java index fc8ed3d004a..a135df43699 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/RandomVectorScorer.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/RandomVectorScorer.java @@ -18,6 +18,7 @@ package org.apache.lucene.util.hnsw; import java.io.IOException; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.util.Bits; /** A {@link RandomVectorScorer} for scoring random nodes in batches against an abstract query. */ @@ -57,14 +58,14 @@ public interface RandomVectorScorer { /** Creates a default scorer for random access vectors. */ abstract class AbstractRandomVectorScorer implements RandomVectorScorer { - private final RandomAccessVectorValues values; + private final KnnVectorValues values; /** * Creates a new scorer for the given vector values. * * @param values the vector values */ - public AbstractRandomVectorScorer(RandomAccessVectorValues values) { + public AbstractRandomVectorScorer(KnnVectorValues values) { this.values = values; } diff --git a/lucene/core/src/java/org/apache/lucene/util/quantization/QuantizedByteVectorValues.java b/lucene/core/src/java/org/apache/lucene/util/quantization/QuantizedByteVectorValues.java index a0fe957fecb..b90ab8276dd 100644 --- a/lucene/core/src/java/org/apache/lucene/util/quantization/QuantizedByteVectorValues.java +++ b/lucene/core/src/java/org/apache/lucene/util/quantization/QuantizedByteVectorValues.java @@ -17,9 +17,10 @@ package org.apache.lucene.util.quantization; import java.io.IOException; +import org.apache.lucene.codecs.lucene95.HasIndexSlice; import org.apache.lucene.index.ByteVectorValues; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.VectorScorer; +import org.apache.lucene.store.IndexInput; /** * A version of {@link ByteVectorValues}, but additionally retrieving score correction offset for @@ -27,31 +28,31 @@ import org.apache.lucene.search.VectorScorer; * * @lucene.experimental */ -public abstract class QuantizedByteVectorValues extends DocIdSetIterator { - public abstract float getScoreCorrectionConstant() throws IOException; +public abstract class QuantizedByteVectorValues extends ByteVectorValues implements HasIndexSlice { - public abstract byte[] vectorValue() throws IOException; - - /** Return the dimension of the vectors */ - public abstract int dimension(); - - /** - * Return the number of vectors for this field. - * - * @return the number of vectors returned by this iterator - */ - public abstract int size(); - - @Override - public final long cost() { - return size(); + public ScalarQuantizer getScalarQuantizer() { + throw new UnsupportedOperationException(); } + public abstract float getScoreCorrectionConstant(int ord) throws IOException; + /** * Return a {@link VectorScorer} for the given query vector. 
* * @param query the query vector * @return a {@link VectorScorer} instance or null */ - public abstract VectorScorer scorer(float[] query) throws IOException; + public VectorScorer scorer(float[] query) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public QuantizedByteVectorValues copy() throws IOException { + return this; + } + + @Override + public IndexInput getSlice() { + return null; + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/quantization/ScalarQuantizer.java b/lucene/core/src/java/org/apache/lucene/util/quantization/ScalarQuantizer.java index ab8a911ddfa..3f7bcf6c5c4 100644 --- a/lucene/core/src/java/org/apache/lucene/util/quantization/ScalarQuantizer.java +++ b/lucene/core/src/java/org/apache/lucene/util/quantization/ScalarQuantizer.java @@ -25,6 +25,7 @@ import java.util.List; import java.util.Random; import java.util.stream.IntStream; import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.HitQueue; import org.apache.lucene.search.ScoreDoc; @@ -269,11 +270,12 @@ public class ScalarQuantizer { if (totalVectorCount == 0) { return new ScalarQuantizer(0f, 0f, bits); } + KnnVectorValues.DocIndexIterator iterator = floatVectorValues.iterator(); if (confidenceInterval == 1f) { float min = Float.POSITIVE_INFINITY; float max = Float.NEGATIVE_INFINITY; - while (floatVectorValues.nextDoc() != NO_MORE_DOCS) { - for (float v : floatVectorValues.vectorValue()) { + while (iterator.nextDoc() != NO_MORE_DOCS) { + for (float v : floatVectorValues.vectorValue(iterator.index())) { min = Math.min(min, v); max = Math.max(max, v); } @@ -289,8 +291,8 @@ public class ScalarQuantizer { if (totalVectorCount <= quantizationSampleSize) { int scratchSize = Math.min(SCRATCH_SIZE, totalVectorCount); int i = 0; - while (floatVectorValues.nextDoc() != NO_MORE_DOCS) { - float[] vectorValue = floatVectorValues.vectorValue(); + while (iterator.nextDoc() != NO_MORE_DOCS) { + float[] vectorValue = floatVectorValues.vectorValue(iterator.index()); System.arraycopy( vectorValue, 0, quantileGatheringScratch, i * vectorValue.length, vectorValue.length); i++; @@ -311,11 +313,11 @@ public class ScalarQuantizer { for (int i : vectorsToTake) { while (index <= i) { // We cannot use `advance(docId)` as MergedVectorValues does not support it - floatVectorValues.nextDoc(); + iterator.nextDoc(); index++; } - assert floatVectorValues.docID() != NO_MORE_DOCS; - float[] vectorValue = floatVectorValues.vectorValue(); + assert iterator.docID() != NO_MORE_DOCS; + float[] vectorValue = floatVectorValues.vectorValue(iterator.index()); System.arraycopy( vectorValue, 0, quantileGatheringScratch, idx * vectorValue.length, vectorValue.length); idx++; @@ -353,11 +355,16 @@ public class ScalarQuantizer { / (floatVectorValues.dimension() + 1), 1 - 1f / (floatVectorValues.dimension() + 1) }; + KnnVectorValues.DocIndexIterator iterator = floatVectorValues.iterator(); if (totalVectorCount <= sampleSize) { int scratchSize = Math.min(SCRATCH_SIZE, totalVectorCount); int i = 0; - while (floatVectorValues.nextDoc() != NO_MORE_DOCS) { - gatherSample(floatVectorValues, quantileGatheringScratch, sampledDocs, i); + while (iterator.nextDoc() != NO_MORE_DOCS) { + gatherSample( + floatVectorValues.vectorValue(iterator.index()), + quantileGatheringScratch, + sampledDocs, + i); i++; if (i == scratchSize) { extractQuantiles(confidenceIntervals, quantileGatheringScratch, 
upperSum, lowerSum); @@ -374,11 +381,15 @@ public class ScalarQuantizer { for (int i : vectorsToTake) { while (index <= i) { // We cannot use `advance(docId)` as MergedVectorValues does not support it - floatVectorValues.nextDoc(); + iterator.nextDoc(); index++; } - assert floatVectorValues.docID() != NO_MORE_DOCS; - gatherSample(floatVectorValues, quantileGatheringScratch, sampledDocs, idx); + assert iterator.docID() != NO_MORE_DOCS; + gatherSample( + floatVectorValues.vectorValue(iterator.index()), + quantileGatheringScratch, + sampledDocs, + idx); idx++; if (idx == SCRATCH_SIZE) { extractQuantiles(confidenceIntervals, quantileGatheringScratch, upperSum, lowerSum); @@ -437,12 +448,7 @@ public class ScalarQuantizer { } private static void gatherSample( - FloatVectorValues floatVectorValues, - float[] quantileGatheringScratch, - List sampledDocs, - int i) - throws IOException { - float[] vectorValue = floatVectorValues.vectorValue(); + float[] vectorValue, float[] quantileGatheringScratch, List sampledDocs, int i) { float[] copy = new float[vectorValue.length]; System.arraycopy(vectorValue, 0, copy, 0, vectorValue.length); sampledDocs.add(copy); diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorer.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorer.java index 0798885c906..b65f1e57092 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorer.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorer.java @@ -19,11 +19,12 @@ package org.apache.lucene.internal.vectorization; import java.io.IOException; import java.lang.foreign.MemorySegment; import java.util.Optional; +import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.FilterIndexInput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.MemorySegmentAccessInput; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; abstract sealed class Lucene99MemorySegmentByteVectorScorer @@ -39,10 +40,8 @@ abstract sealed class Lucene99MemorySegmentByteVectorScorer * returned. 
*/ public static Optional create( - VectorSimilarityFunction type, - IndexInput input, - RandomAccessVectorValues values, - byte[] queryVector) { + VectorSimilarityFunction type, IndexInput input, KnnVectorValues values, byte[] queryVector) { + assert values instanceof ByteVectorValues; input = FilterIndexInput.unwrapOnlyTest(input); if (!(input instanceof MemorySegmentAccessInput msInput)) { return Optional.empty(); @@ -58,7 +57,7 @@ abstract sealed class Lucene99MemorySegmentByteVectorScorer } Lucene99MemorySegmentByteVectorScorer( - MemorySegmentAccessInput input, RandomAccessVectorValues values, byte[] queryVector) { + MemorySegmentAccessInput input, KnnVectorValues values, byte[] queryVector) { super(values); this.input = input; this.vectorByteSize = values.getVectorByteLength(); @@ -92,7 +91,7 @@ abstract sealed class Lucene99MemorySegmentByteVectorScorer } static final class CosineScorer extends Lucene99MemorySegmentByteVectorScorer { - CosineScorer(MemorySegmentAccessInput input, RandomAccessVectorValues values, byte[] query) { + CosineScorer(MemorySegmentAccessInput input, KnnVectorValues values, byte[] query) { super(input, values, query); } @@ -105,8 +104,7 @@ abstract sealed class Lucene99MemorySegmentByteVectorScorer } static final class DotProductScorer extends Lucene99MemorySegmentByteVectorScorer { - DotProductScorer( - MemorySegmentAccessInput input, RandomAccessVectorValues values, byte[] query) { + DotProductScorer(MemorySegmentAccessInput input, KnnVectorValues values, byte[] query) { super(input, values, query); } @@ -120,7 +118,7 @@ abstract sealed class Lucene99MemorySegmentByteVectorScorer } static final class EuclideanScorer extends Lucene99MemorySegmentByteVectorScorer { - EuclideanScorer(MemorySegmentAccessInput input, RandomAccessVectorValues values, byte[] query) { + EuclideanScorer(MemorySegmentAccessInput input, KnnVectorValues values, byte[] query) { super(input, values, query); } @@ -133,8 +131,7 @@ abstract sealed class Lucene99MemorySegmentByteVectorScorer } static final class MaxInnerProductScorer extends Lucene99MemorySegmentByteVectorScorer { - MaxInnerProductScorer( - MemorySegmentAccessInput input, RandomAccessVectorValues values, byte[] query) { + MaxInnerProductScorer(MemorySegmentAccessInput input, KnnVectorValues values, byte[] query) { super(input, values, query); } diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java index 90b3bfb014c..02c71561122 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentByteVectorScorerSupplier.java @@ -19,11 +19,12 @@ package org.apache.lucene.internal.vectorization; import java.io.IOException; import java.lang.foreign.MemorySegment; import java.util.Optional; +import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.FilterIndexInput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.MemorySegmentAccessInput; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; @@ -33,7 +34,7 @@ public 
abstract sealed class Lucene99MemorySegmentByteVectorScorerSupplier final int vectorByteSize; final int maxOrd; final MemorySegmentAccessInput input; - final RandomAccessVectorValues values; // to support ordToDoc/getAcceptOrds + final KnnVectorValues values; // to support ordToDoc/getAcceptOrds byte[] scratch1, scratch2; /** @@ -41,7 +42,8 @@ public abstract sealed class Lucene99MemorySegmentByteVectorScorerSupplier * optional is returned. */ static Optional create( - VectorSimilarityFunction type, IndexInput input, RandomAccessVectorValues values) { + VectorSimilarityFunction type, IndexInput input, KnnVectorValues values) { + assert values instanceof ByteVectorValues; input = FilterIndexInput.unwrapOnlyTest(input); if (!(input instanceof MemorySegmentAccessInput msInput)) { return Optional.empty(); @@ -56,7 +58,7 @@ public abstract sealed class Lucene99MemorySegmentByteVectorScorerSupplier } Lucene99MemorySegmentByteVectorScorerSupplier( - MemorySegmentAccessInput input, RandomAccessVectorValues values) { + MemorySegmentAccessInput input, KnnVectorValues values) { this.input = input; this.values = values; this.vectorByteSize = values.getVectorByteLength(); @@ -103,7 +105,7 @@ public abstract sealed class Lucene99MemorySegmentByteVectorScorerSupplier static final class CosineSupplier extends Lucene99MemorySegmentByteVectorScorerSupplier { - CosineSupplier(MemorySegmentAccessInput input, RandomAccessVectorValues values) { + CosineSupplier(MemorySegmentAccessInput input, KnnVectorValues values) { super(input, values); } @@ -128,7 +130,7 @@ public abstract sealed class Lucene99MemorySegmentByteVectorScorerSupplier static final class DotProductSupplier extends Lucene99MemorySegmentByteVectorScorerSupplier { - DotProductSupplier(MemorySegmentAccessInput input, RandomAccessVectorValues values) { + DotProductSupplier(MemorySegmentAccessInput input, KnnVectorValues values) { super(input, values); } @@ -155,7 +157,7 @@ public abstract sealed class Lucene99MemorySegmentByteVectorScorerSupplier static final class EuclideanSupplier extends Lucene99MemorySegmentByteVectorScorerSupplier { - EuclideanSupplier(MemorySegmentAccessInput input, RandomAccessVectorValues values) { + EuclideanSupplier(MemorySegmentAccessInput input, KnnVectorValues values) { super(input, values); } @@ -181,7 +183,7 @@ public abstract sealed class Lucene99MemorySegmentByteVectorScorerSupplier static final class MaxInnerProductSupplier extends Lucene99MemorySegmentByteVectorScorerSupplier { - MaxInnerProductSupplier(MemorySegmentAccessInput input, RandomAccessVectorValues values) { + MaxInnerProductSupplier(MemorySegmentAccessInput input, KnnVectorValues values) { super(input, values); } diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentFlatVectorsScorer.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentFlatVectorsScorer.java index b085185fb11..bd8cbb2c388 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentFlatVectorsScorer.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentFlatVectorsScorer.java @@ -19,11 +19,13 @@ package org.apache.lucene.internal.vectorization; import java.io.IOException; import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; +import org.apache.lucene.codecs.lucene95.HasIndexSlice; +import org.apache.lucene.index.ByteVectorValues; +import 
org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; -import org.apache.lucene.util.quantization.RandomAccessQuantizedByteVectorValues; +import org.apache.lucene.util.quantization.QuantizedByteVectorValues; public class Lucene99MemorySegmentFlatVectorsScorer implements FlatVectorsScorer { @@ -38,15 +40,16 @@ public class Lucene99MemorySegmentFlatVectorsScorer implements FlatVectorsScorer @Override public RandomVectorScorerSupplier getRandomVectorScorerSupplier( - VectorSimilarityFunction similarityType, RandomAccessVectorValues vectorValues) - throws IOException { + VectorSimilarityFunction similarityType, KnnVectorValues vectorValues) throws IOException { // a quantized values here is a wrapping or delegation issue - assert !(vectorValues instanceof RandomAccessQuantizedByteVectorValues); + assert !(vectorValues instanceof QuantizedByteVectorValues); // currently only supports binary vectors - if (vectorValues instanceof RandomAccessVectorValues.Bytes && vectorValues.getSlice() != null) { + if (vectorValues instanceof ByteVectorValues bvv + && bvv instanceof HasIndexSlice byteVectorValues + && byteVectorValues.getSlice() != null) { var scorer = Lucene99MemorySegmentByteVectorScorerSupplier.create( - similarityType, vectorValues.getSlice(), vectorValues); + similarityType, byteVectorValues.getSlice(), vectorValues); if (scorer.isPresent()) { return scorer.get(); } @@ -56,9 +59,7 @@ public class Lucene99MemorySegmentFlatVectorsScorer implements FlatVectorsScorer @Override public RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityType, - RandomAccessVectorValues vectorValues, - float[] target) + VectorSimilarityFunction similarityType, KnnVectorValues vectorValues, float[] target) throws IOException { // currently only supports binary vectors, so always delegate return delegate.getRandomVectorScorer(similarityType, vectorValues, target); @@ -66,17 +67,17 @@ public class Lucene99MemorySegmentFlatVectorsScorer implements FlatVectorsScorer @Override public RandomVectorScorer getRandomVectorScorer( - VectorSimilarityFunction similarityType, - RandomAccessVectorValues vectorValues, - byte[] queryVector) + VectorSimilarityFunction similarityType, KnnVectorValues vectorValues, byte[] queryVector) throws IOException { checkDimensions(queryVector.length, vectorValues.dimension()); // a quantized values here is a wrapping or delegation issue - assert !(vectorValues instanceof RandomAccessQuantizedByteVectorValues); - if (vectorValues instanceof RandomAccessVectorValues.Bytes && vectorValues.getSlice() != null) { + assert !(vectorValues instanceof QuantizedByteVectorValues); + if (vectorValues instanceof ByteVectorValues bvv + && bvv instanceof HasIndexSlice byteVectorValues + && byteVectorValues.getSlice() != null) { var scorer = Lucene99MemorySegmentByteVectorScorer.create( - similarityType, vectorValues.getSlice(), vectorValues, queryVector); + similarityType, byteVectorValues.getSlice(), vectorValues, queryVector); if (scorer.isPresent()) { return scorer.get(); } diff --git a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java index c6ac3d23a12..832fa5f98e6 100644 --- a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java +++ 
b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java @@ -419,7 +419,7 @@ abstract class MemorySegmentIndexInput extends IndexInput } @Override - protected void readGroupVInt(long[] dst, int offset) throws IOException { + public void readGroupVInt(long[] dst, int offset) throws IOException { try { final int len = GroupVIntUtil.readGroupVInt( @@ -530,7 +530,29 @@ abstract class MemorySegmentIndexInput extends IndexInput @Override public final MemorySegmentIndexInput clone() { - final MemorySegmentIndexInput clone = buildSlice((String) null, 0L, this.length); + ensureOpen(); + ensureAccessible(); + final MemorySegmentIndexInput clone; + if (segments.length == 1) { + clone = + new SingleSegmentImpl( + toString(), + null, // clones don't have an Arena, as they can't close + segments[0], + length, + chunkSizePower, + confined); + } else { + clone = + new MultiSegmentImpl( + toString(), + null, // clones don't have an Arena, as they can't close + segments, + ((MultiSegmentImpl) this).offset, + length, + chunkSizePower, + confined); + } try { clone.seek(getFilePointer()); } catch (IOException ioe) { @@ -567,14 +589,23 @@ abstract class MemorySegmentIndexInput extends IndexInput public final MemorySegmentIndexInput slice( String sliceDescription, long offset, long length, ReadAdvice advice) throws IOException { MemorySegmentIndexInput slice = slice(sliceDescription, offset, length); - if (NATIVE_ACCESS.isPresent()) { + if (NATIVE_ACCESS.isPresent() && advice != ReadAdvice.NORMAL) { + // No need to madvise with the NORMAL advice, since it's the OS' default. final NativeAccess nativeAccess = NATIVE_ACCESS.get(); - slice.advise( - 0, - slice.length, - segment -> { - nativeAccess.madvise(segment, advice); - }); + if (length >= nativeAccess.getPageSize()) { + // Only set the read advice if the inner file is large enough. Otherwise the cons are likely + // outweighing the pros as we're: + // - potentially overriding the advice of other files that share the same pages, + // - paying the cost of a madvise system call for little value. + // We could align inner files with the page size to avoid the first issue, but again the + // pros don't clearly outweigh the cons. + slice.advise( + 0, + slice.length, + segment -> { + nativeAccess.madvise(segment, advice); + }); + } } return slice; } @@ -583,26 +614,30 @@ abstract class MemorySegmentIndexInput extends IndexInput MemorySegmentIndexInput buildSlice(String sliceDescription, long offset, long length) { ensureOpen(); ensureAccessible(); + final MemorySegment[] slices; + final boolean isClone = offset == 0 && length == this.length; + if (isClone) { + slices = segments; + } else { + final long sliceEnd = offset + length; + final int startIndex = (int) (offset >>> chunkSizePower); + final int endIndex = (int) (sliceEnd >>> chunkSizePower); + // we always allocate one more slice, the last one may be a 0 byte one after truncating with + // asSlice(): + slices = ArrayUtil.copyOfSubArray(segments, startIndex, endIndex + 1); - final long sliceEnd = offset + length; - final int startIndex = (int) (offset >>> chunkSizePower); - final int endIndex = (int) (sliceEnd >>> chunkSizePower); + // set the last segment's limit for the sliced view.
+ slices[slices.length - 1] = slices[slices.length - 1].asSlice(0L, sliceEnd & chunkSizeMask); - // we always allocate one more slice, the last one may be a 0 byte one after truncating with - // asSlice(): - final MemorySegment slices[] = ArrayUtil.copyOfSubArray(segments, startIndex, endIndex + 1); - - // set the last segment's limit for the sliced view. - slices[slices.length - 1] = slices[slices.length - 1].asSlice(0L, sliceEnd & chunkSizeMask); - - offset = offset & chunkSizeMask; + offset = offset & chunkSizeMask; + } final String newResourceDescription = getFullSliceDescription(sliceDescription); if (slices.length == 1) { return new SingleSegmentImpl( newResourceDescription, null, // clones don't have an Arena, as they can't close) - slices[0].asSlice(offset, length), + isClone ? slices[0] : slices[0].asSlice(offset, length), length, chunkSizePower, confined); diff --git a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInputProvider.java b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInputProvider.java index 1e5a305219b..7cbe376678b 100644 --- a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInputProvider.java +++ b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInputProvider.java @@ -129,7 +129,9 @@ final class MemorySegmentIndexInputProvider // internal FileChannel logic) if (preload) { segment.load(); - } else if (nativeAccess.filter(na -> segment.address() % na.getPageSize() == 0).isPresent()) { + } else if (readAdvice != ReadAdvice.NORMAL + && nativeAccess.filter(na -> segment.address() % na.getPageSize() == 0).isPresent()) { + // No need to madvise with ReadAdvice.NORMAL since it is the OS' default read advice. nativeAccess.get().madvise(segment, readAdvice); } segments[segNr] = segment; diff --git a/lucene/core/src/java21/org/apache/lucene/store/PosixNativeAccess.java b/lucene/core/src/java21/org/apache/lucene/store/PosixNativeAccess.java index 80c1665cdd1..05eb6157118 100644 --- a/lucene/core/src/java21/org/apache/lucene/store/PosixNativeAccess.java +++ b/lucene/core/src/java21/org/apache/lucene/store/PosixNativeAccess.java @@ -122,10 +122,7 @@ final class PosixNativeAccess extends NativeAccess { @Override public void madvise(MemorySegment segment, ReadAdvice readAdvice) throws IOException { - final Integer advice = mapReadAdvice(readAdvice); - if (advice == null) { - return; // do nothing - } + final int advice = mapReadAdvice(readAdvice); madvise(segment, advice); } @@ -156,7 +153,7 @@ final class PosixNativeAccess extends NativeAccess { } } - private Integer mapReadAdvice(ReadAdvice readAdvice) { + private int mapReadAdvice(ReadAdvice readAdvice) { return switch (readAdvice) { case NORMAL -> POSIX_MADV_NORMAL; case RANDOM -> POSIX_MADV_RANDOM; diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index 8b672496601..bd950aeaebd 100644 --- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-org.apache.lucene.codecs.lucene912.Lucene912Codec +org.apache.lucene.codecs.lucene100.Lucene100Codec diff --git a/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java b/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java index 9bce1f10a43..6fe9a685e1b 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/hnsw/TestFlatVectorScorer.java @@ -35,6 +35,8 @@ import java.util.concurrent.atomic.AtomicInteger; import org.apache.lucene.codecs.lucene95.OffHeapByteVectorValues; import org.apache.lucene.codecs.lucene95.OffHeapFloatVectorValues; import org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorScorer; +import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; @@ -42,7 +44,6 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.MMapDirectory; import org.apache.lucene.tests.util.LuceneTestCase; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.hamcrest.Matcher; import org.hamcrest.MatcherAssert; @@ -174,13 +175,13 @@ public class TestFlatVectorScorer extends LuceneTestCase { } } - RandomAccessVectorValues byteVectorValues( - int dims, int size, IndexInput in, VectorSimilarityFunction sim) throws IOException { + ByteVectorValues byteVectorValues(int dims, int size, IndexInput in, VectorSimilarityFunction sim) + throws IOException { return new OffHeapByteVectorValues.DenseOffHeapVectorValues( dims, size, in.slice("byteValues", 0, in.length()), dims, flatVectorsScorer, sim); } - RandomAccessVectorValues floatVectorValues( + FloatVectorValues floatVectorValues( int dims, int size, IndexInput in, VectorSimilarityFunction sim) throws IOException { return new OffHeapFloatVectorValues.DenseOffHeapVectorValues( dims, diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java index c72bcfeea46..fe6c82e73bb 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java @@ -18,7 +18,7 @@ package org.apache.lucene.codecs.lucene90; import com.carrotsearch.randomizedtesting.generators.RandomPicks; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.lucene912.Lucene912Codec; +import org.apache.lucene.codecs.lucene100.Lucene100Codec; import org.apache.lucene.document.Document; import org.apache.lucene.document.StoredField; import org.apache.lucene.index.DirectoryReader; @@ -31,7 +31,7 @@ import org.apache.lucene.tests.index.BaseStoredFieldsFormatTestCase; public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase { @Override protected Codec getCodec() { - return new Lucene912Codec(Lucene912Codec.Mode.BEST_COMPRESSION); + return new Lucene100Codec(Lucene100Codec.Mode.BEST_COMPRESSION); } /** @@ -42,7 +42,7 @@ public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFie for (int i = 0; i < 10; i++) { IndexWriterConfig iwc = newIndexWriterConfig(); iwc.setCodec( - new 
Lucene912Codec(RandomPicks.randomFrom(random(), Lucene912Codec.Mode.values()))); + new Lucene100Codec(RandomPicks.randomFrom(random(), Lucene100Codec.Mode.values()))); IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig()); Document doc = new Document(); doc.add(new StoredField("field1", "value1")); @@ -72,7 +72,7 @@ public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFie expectThrows( NullPointerException.class, () -> { - new Lucene912Codec(null); + new Lucene100Codec(null); }); expectThrows( diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene912/TestPForUtil.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene912/TestPForUtil.java index 0740a8a708b..ccc786293a1 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene912/TestPForUtil.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene912/TestPForUtil.java @@ -46,7 +46,7 @@ public class TestPForUtil extends LuceneTestCase { final PForUtil pforUtil = new PForUtil(forUtil); for (int i = 0; i < iterations; ++i) { if (random().nextInt(5) == 0) { - pforUtil.skip(in); + PForUtil.skip(in); continue; } final long[] restored = new long[ForUtil.BLOCK_SIZE]; diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java index 825de3ab725..ed70b2df002 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java @@ -17,7 +17,6 @@ package org.apache.lucene.codecs.lucene99; import static java.lang.String.format; -import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import static org.hamcrest.Matchers.is; import static org.hamcrest.Matchers.oneOf; @@ -29,7 +28,7 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; -import org.apache.lucene.codecs.lucene912.Lucene912Codec; +import org.apache.lucene.codecs.lucene100.Lucene100Codec; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.KnnFloatVectorField; @@ -75,7 +74,7 @@ public class TestLucene99HnswQuantizedVectorsFormat extends BaseKnnVectorsFormat @Override protected Codec getCodec() { - return new Lucene912Codec() { + return new Lucene100Codec() { @Override public KnnVectorsFormat getKnnVectorsFormatForField(String field) { return format; @@ -107,7 +106,7 @@ public class TestLucene99HnswQuantizedVectorsFormat extends BaseKnnVectorsFormat dir, newIndexWriterConfig() .setCodec( - new Lucene912Codec() { + new Lucene100Codec() { @Override public KnnVectorsFormat getKnnVectorsFormatForField(String field) { return getKnnFormat(4); @@ -127,7 +126,7 @@ public class TestLucene99HnswQuantizedVectorsFormat extends BaseKnnVectorsFormat dir, newIndexWriterConfig() .setCodec( - new Lucene912Codec() { + new Lucene100Codec() { @Override public KnnVectorsFormat getKnnVectorsFormatForField(String field) { return getKnnFormat(7); @@ -164,7 +163,7 @@ public class TestLucene99HnswQuantizedVectorsFormat extends BaseKnnVectorsFormat dir, newIndexWriterConfig() .setCodec( - new Lucene912Codec() { + new Lucene100Codec() { @Override public KnnVectorsFormat getKnnVectorsFormatForField(String field) { return new 
Lucene99HnswVectorsFormat(); @@ -184,7 +183,7 @@ public class TestLucene99HnswQuantizedVectorsFormat extends BaseKnnVectorsFormat dir, newIndexWriterConfig() .setCodec( - new Lucene912Codec() { + new Lucene100Codec() { @Override public KnnVectorsFormat getKnnVectorsFormatForField(String field) { return getKnnFormat(7); @@ -217,7 +216,7 @@ public class TestLucene99HnswQuantizedVectorsFormat extends BaseKnnVectorsFormat dir, newIndexWriterConfig() .setCodec( - new Lucene912Codec() { + new Lucene100Codec() { @Override public KnnVectorsFormat getKnnVectorsFormatForField(String field) { return new Lucene99HnswScalarQuantizedVectorsFormat( @@ -312,14 +311,13 @@ public class TestLucene99HnswQuantizedVectorsFormat extends BaseKnnVectorsFormat assertNotNull(hnswReader.getQuantizationState("f")); QuantizedByteVectorValues quantizedByteVectorValues = hnswReader.getQuantizedVectorValues("f"); - int docId = -1; - while ((docId = quantizedByteVectorValues.nextDoc()) != NO_MORE_DOCS) { - byte[] vector = quantizedByteVectorValues.vectorValue(); - float offset = quantizedByteVectorValues.getScoreCorrectionConstant(); + for (int ord = 0; ord < quantizedByteVectorValues.size(); ord++) { + byte[] vector = quantizedByteVectorValues.vectorValue(ord); + float offset = quantizedByteVectorValues.getScoreCorrectionConstant(ord); for (int i = 0; i < dim; i++) { - assertEquals(vector[i], expectedVectors[docId][i]); + assertEquals(vector[i], expectedVectors[ord][i]); } - assertEquals(offset, expectedCorrections[docId], 0.00001f); + assertEquals(offset, expectedCorrections[ord], 0.00001f); } } else { fail("reader is not Lucene99HnswVectorsReader"); diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorScorer.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorScorer.java index a0f640fa650..3b758de6ce6 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorScorer.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorScorer.java @@ -27,7 +27,7 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; -import org.apache.lucene.codecs.lucene912.Lucene912Codec; +import org.apache.lucene.codecs.lucene100.Lucene100Codec; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -46,13 +46,13 @@ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.VectorUtil; import org.apache.lucene.util.hnsw.RandomVectorScorer; -import org.apache.lucene.util.quantization.RandomAccessQuantizedByteVectorValues; +import org.apache.lucene.util.quantization.QuantizedByteVectorValues; import org.apache.lucene.util.quantization.ScalarQuantizer; public class TestLucene99ScalarQuantizedVectorScorer extends LuceneTestCase { private static Codec getCodec(int bits, boolean compress) { - return new Lucene912Codec() { + return new Lucene100Codec() { @Override public KnnVectorsFormat getKnnVectorsFormatForField(String field) { return new Lucene99HnswScalarQuantizedVectorsFormat( @@ -100,8 +100,8 @@ public class TestLucene99ScalarQuantizedVectorScorer extends LuceneTestCase { try (IndexInput in = dir.openInput(fileName, IOContext.DEFAULT)) { 
Lucene99ScalarQuantizedVectorScorer scorer = new Lucene99ScalarQuantizedVectorScorer(new DefaultFlatVectorScorer()); - RandomAccessQuantizedByteVectorValues values = - new RandomAccessQuantizedByteVectorValues() { + QuantizedByteVectorValues values = + new QuantizedByteVectorValues() { @Override public int dimension() { return 32; @@ -128,7 +128,7 @@ public class TestLucene99ScalarQuantizedVectorScorer extends LuceneTestCase { } @Override - public RandomAccessQuantizedByteVectorValues copy() throws IOException { + public QuantizedByteVectorValues copy() throws IOException { return this; } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java index 64df927c765..c7f4515c25c 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java @@ -28,7 +28,7 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; -import org.apache.lucene.codecs.lucene912.Lucene912Codec; +import org.apache.lucene.codecs.lucene100.Lucene100Codec; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.KnnFloatVectorField; @@ -37,6 +37,7 @@ import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.VectorSimilarityFunction; @@ -69,7 +70,7 @@ public class TestLucene99ScalarQuantizedVectorsFormat extends BaseKnnVectorsForm @Override protected Codec getCodec() { - return new Lucene912Codec() { + return new Lucene100Codec() { @Override public KnnVectorsFormat getKnnVectorsFormatForField(String field) { return format; @@ -101,6 +102,11 @@ public class TestLucene99ScalarQuantizedVectorsFormat extends BaseKnnVectorsForm } } + @Override + public void testRecall() { + // ignore this test since this class always returns no results from search + } + public void testQuantizedVectorsWriteAndRead() throws Exception { // create lucene directory with codec int numVectors = 1 + random().nextInt(50); @@ -173,9 +179,10 @@ public class TestLucene99ScalarQuantizedVectorsFormat extends BaseKnnVectorsForm QuantizedByteVectorValues quantizedByteVectorValues = quantizedReader.getQuantizedVectorValues("f"); int docId = -1; - while ((docId = quantizedByteVectorValues.nextDoc()) != NO_MORE_DOCS) { - byte[] vector = quantizedByteVectorValues.vectorValue(); - float offset = quantizedByteVectorValues.getScoreCorrectionConstant(); + KnnVectorValues.DocIndexIterator iter = quantizedByteVectorValues.iterator(); + for (docId = iter.nextDoc(); docId != NO_MORE_DOCS; docId = iter.nextDoc()) { + byte[] vector = quantizedByteVectorValues.vectorValue(iter.index()); + float offset = quantizedByteVectorValues.getScoreCorrectionConstant(iter.index()); for (int i = 0; i < dim; i++) { assertEquals(vector[i], expectedVectors[docId][i]); } diff --git a/lucene/core/src/test/org/apache/lucene/document/TestField.java 
b/lucene/core/src/test/org/apache/lucene/document/TestField.java index 6e3a855a0df..5c1b8f17294 100644 --- a/lucene/core/src/test/org/apache/lucene/document/TestField.java +++ b/lucene/core/src/test/org/apache/lucene/document/TestField.java @@ -18,6 +18,7 @@ package org.apache.lucene.document; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; +import java.io.IOException; import java.io.StringReader; import java.nio.charset.StandardCharsets; import org.apache.lucene.codecs.Codec; @@ -27,6 +28,7 @@ import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.Term; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.IndexSearcher; @@ -713,17 +715,21 @@ public class TestField extends LuceneTestCase { try (IndexReader r = DirectoryReader.open(w)) { ByteVectorValues binary = r.leaves().get(0).reader().getByteVectorValues("binary"); assertEquals(1, binary.size()); - assertNotEquals(NO_MORE_DOCS, binary.nextDoc()); - assertNotNull(binary.vectorValue()); - assertArrayEquals(b, binary.vectorValue()); - assertEquals(NO_MORE_DOCS, binary.nextDoc()); + KnnVectorValues.DocIndexIterator iterator = binary.iterator(); + assertNotEquals(NO_MORE_DOCS, iterator.nextDoc()); + assertNotNull(binary.vectorValue(0)); + assertArrayEquals(b, binary.vectorValue(0)); + assertEquals(NO_MORE_DOCS, iterator.nextDoc()); + expectThrows(IOException.class, () -> binary.vectorValue(1)); FloatVectorValues floatValues = r.leaves().get(0).reader().getFloatVectorValues("float"); assertEquals(1, floatValues.size()); - assertNotEquals(NO_MORE_DOCS, floatValues.nextDoc()); - assertEquals(vector.length, floatValues.vectorValue().length); - assertEquals(vector[0], floatValues.vectorValue()[0], 0); - assertEquals(NO_MORE_DOCS, floatValues.nextDoc()); + KnnVectorValues.DocIndexIterator iterator1 = floatValues.iterator(); + assertNotEquals(NO_MORE_DOCS, iterator1.nextDoc()); + assertEquals(vector.length, floatValues.vectorValue(0).length); + assertEquals(vector[0], floatValues.vectorValue(0)[0], 0); + assertEquals(NO_MORE_DOCS, iterator1.nextDoc()); + expectThrows(IOException.class, () -> floatValues.vectorValue(1)); } } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveChecksumFooter.java b/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveChecksumFooter.java index 3826962779a..9db1d305a74 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveChecksumFooter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveChecksumFooter.java @@ -65,10 +65,7 @@ public class TestAllFilesHaveChecksumFooter extends LuceneTestCase { } if (si.info.getUseCompoundFile()) { try (Directory cfsDir = - si.info - .getCodec() - .compoundFormat() - .getCompoundReader(dir, si.info, newIOContext(random()))) { + si.info.getCodec().compoundFormat().getCompoundReader(dir, si.info)) { for (String cfsFile : cfsDir.listAll()) { checkFooter(cfsDir, cfsFile); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveCodecHeader.java b/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveCodecHeader.java index 76c3ee75f25..e8857791c3a 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveCodecHeader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveCodecHeader.java @@ -70,10 
+70,7 @@ public class TestAllFilesHaveCodecHeader extends LuceneTestCase { } if (si.info.getUseCompoundFile()) { try (Directory cfsDir = - si.info - .getCodec() - .compoundFormat() - .getCompoundReader(dir, si.info, newIOContext(random()))) { + si.info.getCodec().compoundFormat().getCompoundReader(dir, si.info)) { for (String cfsFile : cfsDir.listAll()) { checkHeader(cfsDir, cfsFile, namesToExtensions, si.info.getId()); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDefaultCodecParallelizesIO.java b/lucene/core/src/test/org/apache/lucene/index/TestDefaultCodecParallelizesIO.java index 2c4351fa170..58579ab93a4 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDefaultCodecParallelizesIO.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDefaultCodecParallelizesIO.java @@ -40,7 +40,14 @@ public class TestDefaultCodecParallelizesIO extends LuceneTestCase { Directory bbDir = new ByteBuffersDirectory(); try (LineFileDocs docs = new LineFileDocs(random()); IndexWriter w = - new IndexWriter(bbDir, new IndexWriterConfig().setCodec(TestUtil.getDefaultCodec()))) { + new IndexWriter( + bbDir, + new IndexWriterConfig() + // Disable CFS; this test needs to know about files that are open with the + // RANDOM_PRELOAD advice, which CFS doesn't allow us to detect. + .setUseCompoundFile(false) + .setMergePolicy(newLogMergePolicy(false)) + .setCodec(TestUtil.getDefaultCodec()))) { final int numDocs = atLeast(10_000); for (int d = 0; d < numDocs; ++d) { Document doc = docs.nextDoc(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestExitableDirectoryReader.java b/lucene/core/src/test/org/apache/lucene/index/TestExitableDirectoryReader.java index 3c82cd6b33e..d03c8cf42b5 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestExitableDirectoryReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestExitableDirectoryReader.java @@ -459,8 +459,8 @@ public class TestExitableDirectoryReader extends LuceneTestCase { expectThrows( ExitingReaderException.class, () -> { - DocIdSetIterator iter = leaf.getFloatVectorValues("vector"); - scanAndRetrieve(leaf, iter); + KnnVectorValues values = leaf.getFloatVectorValues("vector"); + scanAndRetrieve(leaf, values); }); expectThrows( @@ -473,8 +473,8 @@ public class TestExitableDirectoryReader extends LuceneTestCase { leaf.getLiveDocs(), Integer.MAX_VALUE)); } else { - DocIdSetIterator iter = leaf.getFloatVectorValues("vector"); - scanAndRetrieve(leaf, iter); + KnnVectorValues values = leaf.getFloatVectorValues("vector"); + scanAndRetrieve(leaf, values); leaf.searchNearestVectors( "vector", @@ -534,8 +534,8 @@ public class TestExitableDirectoryReader extends LuceneTestCase { expectThrows( ExitingReaderException.class, () -> { - DocIdSetIterator iter = leaf.getByteVectorValues("vector"); - scanAndRetrieve(leaf, iter); + KnnVectorValues values = leaf.getByteVectorValues("vector"); + scanAndRetrieve(leaf, values); }); expectThrows( @@ -549,8 +549,8 @@ public class TestExitableDirectoryReader extends LuceneTestCase { Integer.MAX_VALUE)); } else { - DocIdSetIterator iter = leaf.getByteVectorValues("vector"); - scanAndRetrieve(leaf, iter); + KnnVectorValues values = leaf.getByteVectorValues("vector"); + scanAndRetrieve(leaf, values); leaf.searchNearestVectors( "vector", @@ -564,20 +564,24 @@ public class TestExitableDirectoryReader extends LuceneTestCase { directory.close(); } - private static void scanAndRetrieve(LeafReader leaf, DocIdSetIterator iter) throws IOException { + private static void
scanAndRetrieve(LeafReader leaf, KnnVectorValues values) throws IOException { + KnnVectorValues.DocIndexIterator iter = values.iterator(); for (iter.nextDoc(); iter.docID() != DocIdSetIterator.NO_MORE_DOCS && iter.docID() < leaf.maxDoc(); ) { - final int nextDocId = iter.docID() + 1; + int docId = iter.docID(); + if (docId >= leaf.maxDoc()) { + break; + } + final int nextDocId = docId + 1; if (random().nextBoolean() && nextDocId < leaf.maxDoc()) { iter.advance(nextDocId); } else { iter.nextDoc(); } - if (random().nextBoolean() && iter.docID() != DocIdSetIterator.NO_MORE_DOCS - && iter instanceof FloatVectorValues) { - ((FloatVectorValues) iter).vectorValue(); + && values instanceof FloatVectorValues) { + ((FloatVectorValues) values).vectorValue(iter.index()); } } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterForceMerge.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterForceMerge.java index 8186eda8462..52cd21630bc 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterForceMerge.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterForceMerge.java @@ -40,7 +40,6 @@ import org.apache.lucene.document.LongPoint; import org.apache.lucene.document.StringField; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.store.Directory; -import org.apache.lucene.store.IOContext; import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.analysis.MockTokenizer; import org.apache.lucene.tests.store.MockDirectoryWrapper; @@ -244,10 +243,7 @@ public class TestIndexWriterForceMerge extends LuceneTestCase { } if (info.info.getUseCompoundFile()) { try (Directory cfs = - info.info - .getCodec() - .compoundFormat() - .getCompoundReader(dir, info.info, IOContext.DEFAULT)) { + info.info.getCodec().compoundFormat().getCompoundReader(dir, info.info)) { for (String file : cfs.listAll()) { sb.append( String.format( diff --git a/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java b/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java index 72be0bd929f..5def0a26d84 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java @@ -413,11 +413,13 @@ public class TestKnnGraph extends LuceneTestCase { // stored vector values are the same as original int nextDocWithVectors = 0; StoredFields storedFields = reader.storedFields(); + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); for (int i = 0; i < reader.maxDoc(); i++) { - nextDocWithVectors = vectorValues.advance(i); + nextDocWithVectors = iterator.advance(i); while (i < nextDocWithVectors && i < reader.maxDoc()) { int id = Integer.parseInt(storedFields.document(i).get("id")); - assertNull("document " + id + " has no vector, but was expected to", values[id]); + assertNull( + "document " + id + ", expected to have no vector, does have one", values[id]); ++i; } if (nextDocWithVectors == NO_MORE_DOCS) { @@ -425,7 +427,7 @@ public class TestKnnGraph extends LuceneTestCase { } int id = Integer.parseInt(storedFields.document(i).get("id")); // documents with KnnGraphValues have the expected vectors - float[] scratch = vectorValues.vectorValue(); + float[] scratch = vectorValues.vectorValue(iterator.index()); assertArrayEquals( "vector did not match for doc " + i + ", id=" + id + ": " + Arrays.toString(scratch), values[id], @@ -435,9 +437,9 @@ public class TestKnnGraph extends LuceneTestCase { } // if IndexDisi.doc == 
NO_MORE_DOCS, we should not call IndexDisi.nextDoc() if (nextDocWithVectors != NO_MORE_DOCS) { - assertEquals(NO_MORE_DOCS, vectorValues.nextDoc()); + assertEquals(NO_MORE_DOCS, iterator.nextDoc()); } else { - assertEquals(NO_MORE_DOCS, vectorValues.docID()); + assertEquals(NO_MORE_DOCS, iterator.docID()); } // assert graph values: @@ -560,7 +562,6 @@ public class TestKnnGraph extends LuceneTestCase { String idString = Integer.toString(id); doc.add(new StringField("id", idString, Field.Store.YES)); doc.add(new SortedDocValuesField("id", new BytesRef(idString))); - // XSSystem.out.println("add " + idString + " " + Arrays.toString(vector)); iw.updateDocument(new Term("id", idString), doc); } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java b/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java index 90b0a07aa34..e222c20d639 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java @@ -71,8 +71,8 @@ public class TestSegmentInfos extends LuceneTestCase { SegmentInfo info = new SegmentInfo( dir, - Version.LUCENE_10_0_0, - Version.LUCENE_10_0_0, + Version.LUCENE_11_0_0, + Version.LUCENE_11_0_0, "_0", 1, false, @@ -90,7 +90,7 @@ public class TestSegmentInfos extends LuceneTestCase { sis.add(commitInfo); sis.commit(dir); sis = SegmentInfos.readLatestCommit(dir); - assertEquals(Version.LUCENE_10_0_0, sis.getMinSegmentLuceneVersion()); + assertEquals(Version.LUCENE_11_0_0, sis.getMinSegmentLuceneVersion()); assertEquals(Version.LATEST, sis.getCommitLuceneVersion()); dir.close(); } @@ -106,8 +106,8 @@ public class TestSegmentInfos extends LuceneTestCase { SegmentInfo info = new SegmentInfo( dir, - Version.LUCENE_10_0_0, - Version.LUCENE_10_0_0, + Version.LUCENE_11_0_0, + Version.LUCENE_11_0_0, "_0", 1, false, @@ -126,8 +126,8 @@ public class TestSegmentInfos extends LuceneTestCase { info = new SegmentInfo( dir, - Version.LUCENE_10_0_0, - Version.LUCENE_10_0_0, + Version.LUCENE_11_0_0, + Version.LUCENE_11_0_0, "_1", 1, false, @@ -146,7 +146,7 @@ public class TestSegmentInfos extends LuceneTestCase { byte[] commitInfoId0 = sis.info(0).getId(); byte[] commitInfoId1 = sis.info(1).getId(); sis = SegmentInfos.readLatestCommit(dir); - assertEquals(Version.LUCENE_10_0_0, sis.getMinSegmentLuceneVersion()); + assertEquals(Version.LUCENE_11_0_0, sis.getMinSegmentLuceneVersion()); assertEquals(Version.LATEST, sis.getCommitLuceneVersion()); assertEquals( StringHelper.idToString(commitInfoId0), StringHelper.idToString(sis.info(0).getId())); @@ -277,8 +277,8 @@ public class TestSegmentInfos extends LuceneTestCase { SegmentInfo info = new SegmentInfo( dir, - Version.LUCENE_9_0_0, - Version.LUCENE_9_0_0, + Version.LUCENE_10_0_0, + Version.LUCENE_10_0_0, "_0", 1, false, diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesDirectoryReaderWrapper.java b/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesDirectoryReaderWrapper.java index 2098f57910d..5214b97fdc5 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesDirectoryReaderWrapper.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSoftDeletesDirectoryReaderWrapper.java @@ -306,6 +306,12 @@ public class TestSoftDeletesDirectoryReaderWrapper extends LuceneTestCase { softDeletesField, MatchNoDocsQuery::new, mergePolicy)); writer.forceMerge(1); try (DirectoryReader reader = DirectoryReader.open(writer)) { + for (LeafReaderContext leafContext : reader.leaves()) { + 
assertThat(leafContext.reader(), instanceOf(SegmentReader.class)); + SegmentReader segmentReader = (SegmentReader) leafContext.reader(); + assertNull(segmentReader.getLiveDocs()); + assertNull(segmentReader.getHardLiveDocs()); + } SoftDeletesDirectoryReaderWrapper wrapped = new SoftDeletesDirectoryReaderWrapper(reader, softDeletesField); assertEquals(numDocs, wrapped.numDocs()); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java b/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java index 241fc0a5fe5..9663d676255 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java @@ -242,6 +242,7 @@ public class TestSortingCodecReader extends LuceneTestCase { NumericDocValues ids = leaf.getNumericDocValues("id"); long prevValue = -1; boolean usingAltIds = false; + KnnVectorValues.DocIndexIterator valuesIterator = vectorValues.iterator(); for (int i = 0; i < actualNumDocs; i++) { int idNext = ids.nextDoc(); if (idNext == DocIdSetIterator.NO_MORE_DOCS) { @@ -254,6 +255,7 @@ public class TestSortingCodecReader extends LuceneTestCase { sorted_set_dv = leaf.getSortedSetDocValues("sorted_set_dv"); binary_sorted_dv = leaf.getSortedDocValues("binary_sorted_dv"); vectorValues = leaf.getFloatVectorValues("vector"); + valuesIterator = vectorValues.iterator(); prevValue = -1; } assertTrue(prevValue + " < " + ids.longValue(), prevValue < ids.longValue()); @@ -262,7 +264,7 @@ public class TestSortingCodecReader extends LuceneTestCase { assertTrue(sorted_numeric_dv.advanceExact(idNext)); assertTrue(sorted_set_dv.advanceExact(idNext)); assertTrue(binary_sorted_dv.advanceExact(idNext)); - assertEquals(idNext, vectorValues.advance(idNext)); + assertEquals(idNext, valuesIterator.advance(idNext)); assertEquals(new BytesRef(ids.longValue() + ""), binary_dv.binaryValue()); assertEquals( new BytesRef(ids.longValue() + ""), @@ -274,7 +276,7 @@ public class TestSortingCodecReader extends LuceneTestCase { assertEquals(1, sorted_numeric_dv.docValueCount()); assertEquals(ids.longValue(), sorted_numeric_dv.nextValue()); - float[] vectorValue = vectorValues.vectorValue(); + float[] vectorValue = vectorValues.vectorValue(valuesIterator.index()); assertEquals(1, vectorValue.length); assertEquals((float) ids.longValue(), vectorValue[0], 0.001f); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java b/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java index eb24d964702..a2d678a3ec0 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java @@ -19,6 +19,7 @@ package org.apache.lucene.index; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Locale; @@ -39,6 +40,8 @@ import org.apache.lucene.util.Version; public class TestTieredMergePolicy extends BaseMergePolicyTestCase { + private record DocCountAndSizeInBytes(int docCount, long sizeInBytes) {} + @Override public TieredMergePolicy mergePolicy() { return newTieredMergePolicy(); @@ -54,7 +57,7 @@ public class TestTieredMergePolicy extends BaseMergePolicyTestCase { int totalDelCount = 0; int totalMaxDoc = 0; long totalBytes = 0; - List<Long> segmentSizes = new ArrayList<>(); + List<DocCountAndSizeInBytes> segmentSizes = new ArrayList<>(); for (SegmentCommitInfo sci : infos)
{ totalDelCount += sci.getDelCount(); totalMaxDoc += sci.info.maxDoc(); @@ -62,10 +65,11 @@ public class TestTieredMergePolicy extends BaseMergePolicyTestCase { double liveRatio = 1 - (double) sci.getDelCount() / sci.info.maxDoc(); long weightedByteSize = (long) (liveRatio * byteSize); totalBytes += weightedByteSize; - segmentSizes.add(weightedByteSize); + segmentSizes.add( + new DocCountAndSizeInBytes(sci.info.maxDoc() - sci.getDelCount(), weightedByteSize)); minSegmentBytes = Math.min(minSegmentBytes, weightedByteSize); } - Collections.sort(segmentSizes); + Collections.sort(segmentSizes, Comparator.comparingLong(DocCountAndSizeInBytes::sizeInBytes)); final double delPercentage = 100.0 * totalDelCount / totalMaxDoc; assertTrue( @@ -78,7 +82,7 @@ public class TestTieredMergePolicy extends BaseMergePolicyTestCase { long levelSizeBytes = Math.max(minSegmentBytes, (long) (tmp.getFloorSegmentMB() * 1024 * 1024)); long bytesLeft = totalBytes; double allowedSegCount = 0; - List<Long> biggestSegments = segmentSizes; + List<DocCountAndSizeInBytes> biggestSegments = segmentSizes; if (biggestSegments.size() > tmp.getTargetSearchConcurrency() - 1) { biggestSegments = biggestSegments.subList( @@ -86,11 +90,18 @@ public class TestTieredMergePolicy extends BaseMergePolicyTestCase { biggestSegments.size()); } // Allow whole segments for the targetSearchConcurrency-1 biggest segments - for (long size : biggestSegments) { - bytesLeft -= size; + for (DocCountAndSizeInBytes size : biggestSegments) { + bytesLeft -= size.sizeInBytes(); allowedSegCount++; } + int tooBigCount = 0; + for (DocCountAndSizeInBytes size : segmentSizes) { + if (size.sizeInBytes() >= maxMergedSegmentBytes / 2) { + tooBigCount++; + } + } + // below we make the assumption that segments that reached the max segment // size divided by 2 don't need merging anymore int mergeFactor = (int) Math.min(tmp.getSegmentsPerTier(), tmp.getMaxMergeAtOnce()); @@ -105,39 +116,31 @@ public class TestTieredMergePolicy extends BaseMergePolicyTestCase { bytesLeft -= tmp.getSegmentsPerTier() * levelSizeBytes; levelSizeBytes = Math.min(levelSizeBytes * mergeFactor, maxMergedSegmentBytes / 2); } - allowedSegCount = Math.max(allowedSegCount, tmp.getSegmentsPerTier()); + // Allow at least a full tier in addition to the too-big segments. + allowedSegCount = Math.max(allowedSegCount, tooBigCount + tmp.getSegmentsPerTier()); + // Allow at least `targetSearchConcurrency` segments. + allowedSegCount = Math.max(allowedSegCount, tmp.getTargetSearchConcurrency()); - // It's ok to be over the allowed segment count if none of the most balanced merges are balanced - // enough - boolean hasBalancedMerges = false; - for (int i = 0; i < segmentSizes.size() - mergeFactor; ++i) { - long maxMergeSegmentSize = segmentSizes.get(i + mergeFactor - 1); - if (maxMergeSegmentSize >= maxMergedSegmentBytes / 2) { - break; - } - long totalMergeSize = 0; - for (int j = 0; j < i + mergeFactor; ++j) { - totalMergeSize += segmentSizes.get(j); - } - if (maxMergedSegmentBytes * 1.5 <= totalMergeSize) { - hasBalancedMerges = true; + // It's ok to be over the allowed segment count if none of the merges are legal, because they + // are either not balanced or because they exceed the max merged segment doc count. + // We only check pairwise merges instead of every possible merge to keep things simple. If none + // of the pairwise merges are legal, chances are high that no merge is legal.
+ int maxDocsPerSegment = tmp.getMaxAllowedDocs(infos.totalMaxDoc(), totalDelCount); + boolean hasLegalMerges = false; + for (int i = 0; i < segmentSizes.size() - 1; ++i) { + DocCountAndSizeInBytes size1 = segmentSizes.get(i); + DocCountAndSizeInBytes size2 = segmentSizes.get(i + 1); + long mergedSegmentSizeInBytes = size1.sizeInBytes() + size2.sizeInBytes(); + int mergedSegmentDocCount = size1.docCount() + size2.docCount(); + + if (mergedSegmentSizeInBytes <= maxMergedSegmentBytes + && size2.sizeInBytes() * 1.5 <= mergedSegmentSizeInBytes + && mergedSegmentDocCount <= maxDocsPerSegment) { + hasLegalMerges = true; break; } } - // There can be more segments if we can't merge docs because they are balanced between segments. - // At least the - // 2 smallest segments should be mergeable. - // should be 2 segments to merge - int maxDocsPerSegment = tmp.getMaxAllowedDocs(infos.totalMaxDoc(), totalDelCount); - List segmentDocs = - infos.asList().stream() - .map(info -> info.info.maxDoc() - info.getDelCount()) - .sorted() - .toList(); - boolean eligibleDocsMerge = - segmentDocs.size() >= 2 && segmentDocs.get(0) + segmentDocs.get(1) < maxDocsPerSegment; - int numSegments = infos.asList().size(); assertTrue( String.format( @@ -154,7 +157,7 @@ public class TestTieredMergePolicy extends BaseMergePolicyTestCase { delPercentage, tmp.getDeletesPctAllowed(), tmp.getTargetSearchConcurrency()), - numSegments <= allowedSegCount || hasBalancedMerges == false || eligibleDocsMerge == false); + numSegments <= allowedSegCount || hasLegalMerges == false); } @Override diff --git a/lucene/core/src/test/org/apache/lucene/internal/hppc/TestCharObjectHashMap.java b/lucene/core/src/test/org/apache/lucene/internal/hppc/TestCharObjectHashMap.java index 4cc036dcfe6..f9a1a259ce8 100644 --- a/lucene/core/src/test/org/apache/lucene/internal/hppc/TestCharObjectHashMap.java +++ b/lucene/core/src/test/org/apache/lucene/internal/hppc/TestCharObjectHashMap.java @@ -17,6 +17,8 @@ package org.apache.lucene.internal.hppc; +import static org.apache.lucene.internal.hppc.TestIntObjectHashMap.toList; + import com.carrotsearch.randomizedtesting.RandomizedTest; import java.util.Arrays; import java.util.HashMap; @@ -24,6 +26,8 @@ import java.util.HashSet; import java.util.Random; import java.util.concurrent.atomic.AtomicInteger; import org.apache.lucene.tests.util.LuceneTestCase; +import org.hamcrest.MatcherAssert; +import org.hamcrest.Matchers; import org.junit.After; import org.junit.Test; @@ -66,13 +70,6 @@ public class TestCharObjectHashMap extends LuceneTestCase { assertArrayEquals(elements, array); } - /** Check if the array's content is identical to a given sequence of elements. */ - private static void assertSortedListEquals(Object[] array, Object... 
elements) { - assertEquals(elements.length, array.length); - Arrays.sort(array); - assertArrayEquals(elements, array); - } - private final int value0 = vcast(0); private final int value1 = vcast(1); private final int value2 = vcast(2); @@ -603,13 +600,15 @@ public class TestCharObjectHashMap extends LuceneTestCase { map.put(key1, value3); map.put(key2, value2); map.put(key3, value1); - assertSortedListEquals(map.values().toArray(), value1, value2, value3); + MatcherAssert.assertThat( + toList(map.values()), Matchers.containsInAnyOrder(value1, value2, value3)); map.clear(); map.put(key1, value1); map.put(key2, value2); map.put(key3, value2); - assertSortedListEquals(map.values().toArray(), value1, value2, value2); + MatcherAssert.assertThat( + toList(map.values()), Matchers.containsInAnyOrder(value1, value2, value2)); } /* */ diff --git a/lucene/core/src/test/org/apache/lucene/internal/hppc/TestIntObjectHashMap.java b/lucene/core/src/test/org/apache/lucene/internal/hppc/TestIntObjectHashMap.java index 6c6c0872ede..4144300ba55 100644 --- a/lucene/core/src/test/org/apache/lucene/internal/hppc/TestIntObjectHashMap.java +++ b/lucene/core/src/test/org/apache/lucene/internal/hppc/TestIntObjectHashMap.java @@ -18,12 +18,15 @@ package org.apache.lucene.internal.hppc; import com.carrotsearch.randomizedtesting.RandomizedTest; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; +import java.util.List; import java.util.Random; import java.util.concurrent.atomic.AtomicInteger; import org.apache.lucene.tests.util.LuceneTestCase; +import org.junit.Assert; import org.junit.Test; /** @@ -66,10 +69,8 @@ public class TestIntObjectHashMap extends LuceneTestCase { } /** Check if the array's content is identical to a given sequence of elements. */ - private static void assertSortedListEquals(Object[] array, Object... elements) { - assertEquals(elements.length, array.length); - Arrays.sort(array); - assertArrayEquals(elements, array); + private static void assertSortedListEquals(List array, Object... 
elements) { + Assert.assertEquals(Arrays.asList(elements), array.stream().sorted().toList()); } private final int value0 = vcast(0); @@ -584,13 +585,21 @@ public class TestIntObjectHashMap extends LuceneTestCase { map.put(key1, value3); map.put(key2, value2); map.put(key3, value1); - assertSortedListEquals(map.values().toArray(), value1, value2, value3); + assertSortedListEquals(toList(map.values()), value1, value2, value3); map.clear(); map.put(key1, value1); map.put(key2, value2); map.put(key3, value2); - assertSortedListEquals(map.values().toArray(), value1, value2, value2); + assertSortedListEquals(toList(map.values()), value1, value2, value2); + } + + static List toList(Iterable values) { + ArrayList list = new ArrayList<>(); + for (var c : values) { + list.add(c.value); + } + return list; + } /* */ diff --git a/lucene/core/src/test/org/apache/lucene/internal/hppc/TestLongObjectHashMap.java b/lucene/core/src/test/org/apache/lucene/internal/hppc/TestLongObjectHashMap.java index f5d6176e24b..df66561197d 100644 --- a/lucene/core/src/test/org/apache/lucene/internal/hppc/TestLongObjectHashMap.java +++ b/lucene/core/src/test/org/apache/lucene/internal/hppc/TestLongObjectHashMap.java @@ -17,6 +17,8 @@ package org.apache.lucene.internal.hppc; +import static org.apache.lucene.internal.hppc.TestIntObjectHashMap.toList; + import com.carrotsearch.randomizedtesting.RandomizedTest; import java.util.Arrays; import java.util.HashMap; @@ -24,6 +26,8 @@ import java.util.HashSet; import java.util.Random; import java.util.concurrent.atomic.AtomicInteger; import org.apache.lucene.tests.util.LuceneTestCase; +import org.hamcrest.MatcherAssert; +import org.hamcrest.Matchers; import org.junit.Test; /** @@ -65,13 +69,6 @@ public class TestLongObjectHashMap extends LuceneTestCase { assertArrayEquals(elements, array); } - /** Check if the array's content is identical to a given sequence of elements. */ - private static void assertSortedListEquals(Object[] array, Object... 
elements) { - assertEquals(elements.length, array.length); - Arrays.sort(array); - assertArrayEquals(elements, array); - } - private final int value0 = vcast(0); private final int value1 = vcast(1); private final int value2 = vcast(2); @@ -585,13 +582,15 @@ public class TestLongObjectHashMap extends LuceneTestCase { map.put(key1, value3); map.put(key2, value2); map.put(key3, value1); - assertSortedListEquals(map.values().toArray(), value1, value2, value3); + MatcherAssert.assertThat( + toList(map.values()), Matchers.containsInAnyOrder(value1, value2, value3)); map.clear(); map.put(key1, value1); map.put(key2, value2); map.put(key3, value2); - assertSortedListEquals(map.values().toArray(), value1, value2, value2); + MatcherAssert.assertThat( + toList(map.values()), Matchers.containsInAnyOrder(value1, value2, value2)); } /* */ diff --git a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java index da9c312ef96..bc3b6813a5b 100644 --- a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java +++ b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorScorer.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.internal.vectorization; +import static java.util.Locale.ROOT; import static org.apache.lucene.index.VectorSimilarityFunction.COSINE; import static org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT; import static org.apache.lucene.index.VectorSimilarityFunction.EUCLIDEAN; @@ -24,6 +25,8 @@ import static org.apache.lucene.index.VectorSimilarityFunction.MAXIMUM_INNER_PRO import com.carrotsearch.randomizedtesting.generators.RandomNumbers; import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.util.Arrays; import java.util.List; import java.util.Objects; @@ -39,6 +42,8 @@ import java.util.stream.IntStream; import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; import org.apache.lucene.codecs.hnsw.FlatVectorsScorer; import org.apache.lucene.codecs.lucene95.OffHeapByteVectorValues; +import org.apache.lucene.codecs.lucene95.OffHeapFloatVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; @@ -47,7 +52,6 @@ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.MMapDirectory; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.NamedThreadFactory; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; import org.apache.lucene.util.hnsw.RandomVectorScorer; import org.junit.BeforeClass; @@ -329,12 +333,63 @@ public class TestVectorScorer extends LuceneTestCase { } } - RandomAccessVectorValues vectorValues( - int dims, int size, IndexInput in, VectorSimilarityFunction sim) throws IOException { + // Tests that the FlatVectorsScorer handles float vectors correctly. 
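The new test below relies on a single-dimension float vector being nothing more than the four little-endian bytes of one float, written via the floatToByteArray helper added later in this file. A small self-contained sketch of that round trip, assuming only the byte order used by the test (class name and main method are illustrative only):

import java.nio.ByteBuffer;
import java.nio.ByteOrder;

class FloatBytesRoundTrip {
  // Same conversion as the floatToByteArray helper added below: one float
  // becomes its 4 bytes in little-endian order, i.e. a 1-dimension vector.
  static byte[] floatToByteArray(float value) {
    return ByteBuffer.allocate(4).order(ByteOrder.LITTLE_ENDIAN).putFloat(value).array();
  }

  public static void main(String[] args) {
    byte[] bytes = floatToByteArray(1f);
    // Reading the bytes back with the same byte order recovers the component.
    float component = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).getFloat();
    System.out.println(component); // prints 1.0
  }
}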
+ public void testWithFloatValues() throws IOException { + try (Directory dir = new MMapDirectory(createTempDir("testWithFloatValues"))) { + final String fileName = "floatvalues"; + try (IndexOutput out = dir.createOutput(fileName, IOContext.DEFAULT)) { + var vec = floatToByteArray(1f); // single vector, with one dimension + out.writeBytes(vec, 0, vec.length); + } + + try (IndexInput in = dir.openInput(fileName, IOContext.DEFAULT)) { + for (int times = 0; times < TIMES; times++) { + for (var sim : List.of(COSINE, EUCLIDEAN, DOT_PRODUCT, MAXIMUM_INNER_PRODUCT)) { + var vectorValues = floatVectorValues(1, 1, in, sim); + assert vectorValues.getEncoding().byteSize == 4; + + var supplier1 = DEFAULT_SCORER.getRandomVectorScorerSupplier(sim, vectorValues); + var supplier2 = MEMSEG_SCORER.getRandomVectorScorerSupplier(sim, vectorValues); + // these assertions assume that the supplier and scorer's toString will have float + // in it, since they are based on float vectors. + assertTrue(supplier1.toString().toLowerCase(ROOT).contains("float")); + assertTrue(supplier2.toString().toLowerCase(ROOT).contains("float")); + assertTrue(supplier1.scorer(0).toString().toLowerCase(ROOT).contains("float")); + assertTrue(supplier2.scorer(0).toString().toLowerCase(ROOT).contains("float")); + float expected = supplier1.scorer(0).score(0); + assertEquals(supplier2.scorer(0).score(0), expected, DELTA); + + var scorer1 = DEFAULT_SCORER.getRandomVectorScorer(sim, vectorValues, new float[] {1f}); + var scorer2 = MEMSEG_SCORER.getRandomVectorScorer(sim, vectorValues, new float[] {1f}); + assertTrue(scorer1.toString().toLowerCase(ROOT).contains("float")); + assertTrue(scorer2.toString().toLowerCase(ROOT).contains("float")); + expected = scorer1.score(0); + assertEquals(scorer2.score(0), expected, DELTA); + + expectThrows( + Throwable.class, + () -> DEFAULT_SCORER.getRandomVectorScorer(sim, vectorValues, new byte[] {1})); + expectThrows( + Throwable.class, + () -> MEMSEG_SCORER.getRandomVectorScorer(sim, vectorValues, new byte[] {1})); + } + } + } + } + } + + KnnVectorValues vectorValues(int dims, int size, IndexInput in, VectorSimilarityFunction sim) + throws IOException { return new OffHeapByteVectorValues.DenseOffHeapVectorValues( dims, size, in.slice("byteValues", 0, in.length()), dims, MEMSEG_SCORER, sim); } + KnnVectorValues floatVectorValues(int dims, int size, IndexInput in, VectorSimilarityFunction sim) + throws IOException { + return new OffHeapFloatVectorValues.DenseOffHeapVectorValues( + dims, size, in.slice("floatValues", 0, in.length()), dims, MEMSEG_SCORER, sim); + } + // creates the vector based on the given ordinal, which is reproducible given the ord and dims static byte[] vector(int ord, int dims) { var random = new Random(Objects.hash(ord, dims)); @@ -355,6 +410,11 @@ public class TestVectorScorer extends LuceneTestCase { } } + /** Converts a float value to a byte array. 
*/ + public static byte[] floatToByteArray(float value) { + return ByteBuffer.allocate(4).order(ByteOrder.LITTLE_ENDIAN).putFloat(value).array(); + } + static int randomIntBetween(int minInclusive, int maxInclusive) { return RandomNumbers.randomIntBetween(random(), minInclusive, maxInclusive); } diff --git a/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java b/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java index 21a33f9ca3e..afa150e387f 100644 --- a/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java +++ b/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java @@ -38,6 +38,7 @@ import org.apache.lucene.index.FilterLeafReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.QueryTimeout; @@ -740,7 +741,7 @@ abstract class BaseKnnVectorQueryTestCase extends LuceneTestCase { LeafReader leafReader = getOnlyLeafReader(reader); FieldInfo fi = leafReader.getFieldInfos().fieldInfo("field"); assertNotNull(fi); - DocIdSetIterator vectorValues; + KnnVectorValues vectorValues; switch (fi.getVectorEncoding()) { case BYTE: vectorValues = leafReader.getByteVectorValues("field"); @@ -752,7 +753,7 @@ abstract class BaseKnnVectorQueryTestCase extends LuceneTestCase { throw new AssertionError(); } assertNotNull(vectorValues); - assertEquals(NO_MORE_DOCS, vectorValues.nextDoc()); + assertEquals(NO_MORE_DOCS, vectorValues.iterator().nextDoc()); } } } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreAccumulator.java b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreAccumulator.java index d816d419c4c..56160971931 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreAccumulator.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreAccumulator.java @@ -23,44 +23,28 @@ public class TestMaxScoreAccumulator extends LuceneTestCase { public void testSimple() { MaxScoreAccumulator acc = new MaxScoreAccumulator(); acc.accumulate(0, 0f); - assertEquals(0f, acc.get().score(), 0); - assertEquals(0, acc.get().docId(), 0); + assertEquals(0f, MaxScoreAccumulator.toScore(acc.getRaw()), 0); + assertEquals(0, MaxScoreAccumulator.docId(acc.getRaw()), 0); acc.accumulate(10, 0f); - assertEquals(0f, acc.get().score(), 0); - assertEquals(0, acc.get().docId(), 0); + assertEquals(0f, MaxScoreAccumulator.toScore(acc.getRaw()), 0); + assertEquals(0, MaxScoreAccumulator.docId(acc.getRaw()), 0); acc.accumulate(100, 1000f); - assertEquals(1000f, acc.get().score(), 0); - assertEquals(100, acc.get().docId(), 0); + assertEquals(1000f, MaxScoreAccumulator.toScore(acc.getRaw()), 0); + assertEquals(100, MaxScoreAccumulator.docId(acc.getRaw()), 0); acc.accumulate(1000, 5f); - assertEquals(1000f, acc.get().score(), 0); - assertEquals(100, acc.get().docId(), 0); + assertEquals(1000f, MaxScoreAccumulator.toScore(acc.getRaw()), 0); + assertEquals(100, MaxScoreAccumulator.docId(acc.getRaw()), 0); acc.accumulate(99, 1000f); - assertEquals(1000f, acc.get().score(), 0); - assertEquals(99, acc.get().docId(), 0); + assertEquals(1000f, MaxScoreAccumulator.toScore(acc.getRaw()), 0); + assertEquals(99, MaxScoreAccumulator.docId(acc.getRaw()), 0); acc.accumulate(1000, 1001f); - assertEquals(1001f, acc.get().score(), 0); - 
assertEquals(1000, acc.get().docId(), 0); + assertEquals(1001f, MaxScoreAccumulator.toScore(acc.getRaw()), 0); + assertEquals(1000, MaxScoreAccumulator.docId(acc.getRaw()), 0); acc.accumulate(10, 1001f); - assertEquals(1001f, acc.get().score(), 0); - assertEquals(10, acc.get().docId(), 0); + assertEquals(1001f, MaxScoreAccumulator.toScore(acc.getRaw()), 0); + assertEquals(10, MaxScoreAccumulator.docId(acc.getRaw()), 0); acc.accumulate(100, 1001f); - assertEquals(1001f, acc.get().score(), 0); - assertEquals(10, acc.get().docId(), 0); - } - - public void testRandom() { - MaxScoreAccumulator acc = new MaxScoreAccumulator(); - int numDocs = atLeast(100); - int maxDocs = atLeast(10000); - MaxScoreAccumulator.DocAndScore max = new MaxScoreAccumulator.DocAndScore(-1, -1); - for (int i = 0; i < numDocs; i++) { - MaxScoreAccumulator.DocAndScore res = - new MaxScoreAccumulator.DocAndScore(random().nextInt(maxDocs), random().nextFloat()); - acc.accumulate(res.docId(), res.score()); - if (res.compareTo(max) > 0) { - max = res; - } - } - assertEquals(max, acc.get()); + assertEquals(1001f, MaxScoreAccumulator.toScore(acc.getRaw()), 0); + assertEquals(10, MaxScoreAccumulator.docId(acc.getRaw()), 0); } } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorer.java b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorer.java index c6920403c91..6973cc0025a 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorer.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorer.java @@ -38,23 +38,6 @@ import org.apache.lucene.util.Bits; // These basic tests are similar to some of the tests in TestWANDScorer, and may not need to be kept public class TestMaxScoreBulkScorer extends LuceneTestCase { - private static class CapMaxScoreWindowAt2048Scorer extends FilterScorer { - - public CapMaxScoreWindowAt2048Scorer(Scorer in) { - super(in); - } - - @Override - public int advanceShallow(int target) throws IOException { - return Math.min(target | 0x7FF, in.advanceShallow(target)); - } - - @Override - public float getMaxScore(int upTo) throws IOException { - return in.getMaxScore(upTo); - } - } - private void writeDocuments(Directory dir) throws IOException { try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()))) { @@ -96,12 +79,10 @@ public class TestMaxScoreBulkScorer extends LuceneTestCase { searcher .createWeight(searcher.rewrite(clause1), ScoreMode.TOP_SCORES, 1f) .scorer(context); - scorer1 = new CapMaxScoreWindowAt2048Scorer(scorer1); Scorer scorer2 = searcher .createWeight(searcher.rewrite(clause2), ScoreMode.TOP_SCORES, 1f) .scorer(context); - scorer2 = new CapMaxScoreWindowAt2048Scorer(scorer2); BulkScorer scorer = new MaxScoreBulkScorer(context.reader().maxDoc(), Arrays.asList(scorer1, scorer2)); @@ -168,12 +149,10 @@ public class TestMaxScoreBulkScorer extends LuceneTestCase { searcher .createWeight(searcher.rewrite(clause1), ScoreMode.TOP_SCORES, 1f) .scorer(context); - scorer1 = new CapMaxScoreWindowAt2048Scorer(scorer1); Scorer scorer2 = searcher .createWeight(searcher.rewrite(clause2), ScoreMode.TOP_SCORES, 1f) .scorer(context); - scorer2 = new CapMaxScoreWindowAt2048Scorer(scorer2); BulkScorer scorer = new MaxScoreBulkScorer(context.reader().maxDoc(), Arrays.asList(scorer1, scorer2)); @@ -237,17 +216,14 @@ public class TestMaxScoreBulkScorer extends LuceneTestCase { searcher .createWeight(searcher.rewrite(clause1), ScoreMode.TOP_SCORES, 1f) .scorer(context); - scorer1 = new 
CapMaxScoreWindowAt2048Scorer(scorer1); Scorer scorer2 = searcher .createWeight(searcher.rewrite(clause2), ScoreMode.TOP_SCORES, 1f) .scorer(context); - scorer2 = new CapMaxScoreWindowAt2048Scorer(scorer2); Scorer scorer3 = searcher .createWeight(searcher.rewrite(clause3), ScoreMode.TOP_SCORES, 1f) .scorer(context); - scorer3 = new CapMaxScoreWindowAt2048Scorer(scorer3); BulkScorer scorer = new MaxScoreBulkScorer( @@ -317,17 +293,14 @@ public class TestMaxScoreBulkScorer extends LuceneTestCase { searcher .createWeight(searcher.rewrite(clause1), ScoreMode.TOP_SCORES, 1f) .scorer(context); - scorer1 = new CapMaxScoreWindowAt2048Scorer(scorer1); Scorer scorer2 = searcher .createWeight(searcher.rewrite(clause2), ScoreMode.TOP_SCORES, 1f) .scorer(context); - scorer2 = new CapMaxScoreWindowAt2048Scorer(scorer2); Scorer scorer3 = searcher .createWeight(searcher.rewrite(clause3), ScoreMode.TOP_SCORES, 1f) .scorer(context); - scorer3 = new CapMaxScoreWindowAt2048Scorer(scorer3); BulkScorer scorer = new MaxScoreBulkScorer( diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTermInSetQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestTermInSetQuery.java index 7cfd0c5adde..b6503021617 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestTermInSetQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestTermInSetQuery.java @@ -52,6 +52,7 @@ import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.RamUsageTester; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.automaton.ByteRunAutomaton; @@ -527,4 +528,19 @@ public class TestTermInSetQuery extends LuceneTestCase { } }); } + + public void testTermsIterator() throws IOException { + TermInSetQuery empty = new TermInSetQuery("field", Collections.emptyList()); + BytesRefIterator it = empty.getBytesRefIterator(); + assertNull(it.next()); + + TermInSetQuery query = + new TermInSetQuery( + "field", List.of(newBytesRef("term1"), newBytesRef("term2"), newBytesRef("term3"))); + it = query.getBytesRefIterator(); + assertEquals(newBytesRef("term1"), it.next()); + assertEquals(newBytesRef("term2"), it.next()); + assertEquals(newBytesRef("term3"), it.next()); + assertNull(it.next()); + } } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTimeLimitingBulkScorer.java b/lucene/core/src/test/org/apache/lucene/search/TestTimeLimitingBulkScorer.java index cce82cd34ac..de5512a904a 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestTimeLimitingBulkScorer.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestTimeLimitingBulkScorer.java @@ -113,7 +113,7 @@ public class TestTimeLimitingBulkScorer extends LuceneTestCase { private static QueryTimeout countingQueryTimeout(int timeallowed) { return new QueryTimeout() { - static int counter = 0; + int counter = 0; @Override public boolean shouldExit() { diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTopDocsCollector.java b/lucene/core/src/test/org/apache/lucene/search/TestTopDocsCollector.java index d4df59f2f72..14b51ca214e 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestTopDocsCollector.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestTopDocsCollector.java @@ -519,47 +519,47 @@ public class TestTopDocsCollector extends LuceneTestCase { scorer.score = 3; leafCollector.collect(0); - assertNull(minValueChecker.get()); + 
assertEquals(Long.MIN_VALUE, minValueChecker.getRaw()); assertNull(scorer.minCompetitiveScore); scorer2.score = 6; leafCollector2.collect(0); - assertNull(minValueChecker.get()); + assertEquals(Long.MIN_VALUE, minValueChecker.getRaw()); assertNull(scorer2.minCompetitiveScore); scorer.score = 2; leafCollector.collect(1); - assertEquals(2f, minValueChecker.get().score(), 0f); + assertEquals(2f, MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 0f); assertEquals(Math.nextUp(2f), scorer.minCompetitiveScore, 0f); assertNull(scorer2.minCompetitiveScore); scorer2.score = 9; leafCollector2.collect(1); - assertEquals(6f, minValueChecker.get().score(), 0f); + assertEquals(6f, MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 0f); assertEquals(Math.nextUp(2f), scorer.minCompetitiveScore, 0f); assertEquals(Math.nextUp(6f), scorer2.minCompetitiveScore, 0f); scorer2.score = 7; leafCollector2.collect(2); - assertEquals(minValueChecker.get().score(), 7f, 0f); + assertEquals(MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 7f, 0f); assertEquals(Math.nextUp(2f), scorer.minCompetitiveScore, 0f); assertEquals(Math.nextUp(7f), scorer2.minCompetitiveScore, 0f); scorer2.score = 1; leafCollector2.collect(3); - assertEquals(minValueChecker.get().score(), 7f, 0f); + assertEquals(MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 7f, 0f); assertEquals(Math.nextUp(2f), scorer.minCompetitiveScore, 0f); assertEquals(Math.nextUp(7f), scorer2.minCompetitiveScore, 0f); scorer.score = 10; leafCollector.collect(2); - assertEquals(minValueChecker.get().score(), 7f, 0f); + assertEquals(MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 7f, 0f); assertEquals(7f, scorer.minCompetitiveScore, 0f); assertEquals(Math.nextUp(7f), scorer2.minCompetitiveScore, 0f); scorer.score = 11; leafCollector.collect(3); - assertEquals(minValueChecker.get().score(), 10, 0f); + assertEquals(MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 10, 0f); assertEquals(Math.nextUp(10f), scorer.minCompetitiveScore, 0f); assertEquals(Math.nextUp(7f), scorer2.minCompetitiveScore, 0f); @@ -571,19 +571,19 @@ public class TestTopDocsCollector extends LuceneTestCase { scorer3.score = 1f; leafCollector3.collect(0); - assertEquals(10f, minValueChecker.get().score(), 0f); + assertEquals(10f, MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 0f); assertEquals(Math.nextUp(10f), scorer3.minCompetitiveScore, 0f); scorer.score = 11; leafCollector.collect(4); - assertEquals(11f, minValueChecker.get().score(), 0f); + assertEquals(11f, MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 0f); assertEquals(Math.nextUp(11f), scorer.minCompetitiveScore, 0f); assertEquals(Math.nextUp(7f), scorer2.minCompetitiveScore, 0f); assertEquals(Math.nextUp(10f), scorer3.minCompetitiveScore, 0f); scorer3.score = 2f; leafCollector3.collect(1); - assertEquals(minValueChecker.get().score(), 11f, 0f); + assertEquals(MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 11f, 0f); assertEquals(Math.nextUp(11f), scorer.minCompetitiveScore, 0f); assertEquals(Math.nextUp(7f), scorer2.minCompetitiveScore, 0f); assertEquals(Math.nextUp(11f), scorer3.minCompetitiveScore, 0f); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java b/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java index cd6f0ac079d..c507eb0f647 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java @@ -577,47 +577,47 @@ public class 
TestTopFieldCollector extends LuceneTestCase { scorer.score = 3; leafCollector.collect(0); - assertNull(minValueChecker.get()); + assertEquals(Long.MIN_VALUE, minValueChecker.getRaw()); assertNull(scorer.minCompetitiveScore); scorer2.score = 6; leafCollector2.collect(0); - assertNull(minValueChecker.get()); + assertEquals(Long.MIN_VALUE, minValueChecker.getRaw()); assertNull(scorer2.minCompetitiveScore); scorer.score = 2; leafCollector.collect(1); - assertEquals(2f, minValueChecker.get().score(), 0f); + assertEquals(2f, MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 0f); assertEquals(2f, scorer.minCompetitiveScore, 0f); assertNull(scorer2.minCompetitiveScore); scorer2.score = 9; leafCollector2.collect(1); - assertEquals(6f, minValueChecker.get().score(), 0f); + assertEquals(6f, MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 0f); assertEquals(2f, scorer.minCompetitiveScore, 0f); assertEquals(6f, scorer2.minCompetitiveScore, 0f); scorer2.score = 7; leafCollector2.collect(2); - assertEquals(7f, minValueChecker.get().score(), 0f); + assertEquals(7f, MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 0f); assertEquals(2f, scorer.minCompetitiveScore, 0f); assertEquals(7f, scorer2.minCompetitiveScore, 0f); scorer2.score = 1; leafCollector2.collect(3); - assertEquals(7f, minValueChecker.get().score(), 0f); + assertEquals(7f, MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 0f); assertEquals(2f, scorer.minCompetitiveScore, 0f); assertEquals(7f, scorer2.minCompetitiveScore, 0f); scorer.score = 10; leafCollector.collect(2); - assertEquals(7f, minValueChecker.get().score(), 0f); + assertEquals(7f, MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 0f); assertEquals(7f, scorer.minCompetitiveScore, 0f); assertEquals(7f, scorer2.minCompetitiveScore, 0f); scorer.score = 11; leafCollector.collect(3); - assertEquals(10f, minValueChecker.get().score(), 0f); + assertEquals(10f, MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 0f); assertEquals(10f, scorer.minCompetitiveScore, 0f); assertEquals(7f, scorer2.minCompetitiveScore, 0f); @@ -629,19 +629,19 @@ public class TestTopFieldCollector extends LuceneTestCase { scorer3.score = 1f; leafCollector3.collect(0); - assertEquals(10f, minValueChecker.get().score(), 0f); + assertEquals(10f, MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 0f); assertEquals(10f, scorer3.minCompetitiveScore, 0f); scorer.score = 11; leafCollector.collect(4); - assertEquals(11f, minValueChecker.get().score(), 0f); + assertEquals(11f, MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 0f); assertEquals(11f, scorer.minCompetitiveScore, 0f); assertEquals(7f, scorer2.minCompetitiveScore, 0f); assertEquals(10f, scorer3.minCompetitiveScore, 0f); scorer3.score = 2f; leafCollector3.collect(1); - assertEquals(11f, minValueChecker.get().score(), 0f); + assertEquals(11f, MaxScoreAccumulator.toScore(minValueChecker.getRaw()), 0f); assertEquals(11f, scorer.minCompetitiveScore, 0f); assertEquals(7f, scorer2.minCompetitiveScore, 0f); assertEquals(11f, scorer3.minCompetitiveScore, 0f); diff --git a/lucene/core/src/test/org/apache/lucene/util/TestVersion.java b/lucene/core/src/test/org/apache/lucene/util/TestVersion.java index d34ee2f78db..b3a69b48fa4 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestVersion.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestVersion.java @@ -33,24 +33,27 @@ public class TestVersion extends LuceneTestCase { assertTrue("LATEST must be always onOrAfter(" + v + ")", Version.LATEST.onOrAfter(v)); } } - 
assertTrue(Version.LUCENE_10_0_0.onOrAfter(Version.LUCENE_9_0_0)); + assertTrue(Version.LUCENE_11_0_0.onOrAfter(Version.fromBits(9, 0, 0))); + assertTrue(Version.LUCENE_11_0_0.onOrAfter(Version.LUCENE_10_0_0)); + assertTrue(Version.LUCENE_11_0_0.onOrAfter(Version.LUCENE_10_1_0)); } public void testToString() { - assertEquals("9.0.0", Version.LUCENE_9_0_0.toString()); + assertEquals("9.0.0", Version.fromBits(9, 0, 0).toString()); assertEquals("10.0.0", Version.LUCENE_10_0_0.toString()); + assertEquals("10.1.0", Version.LUCENE_10_1_0.toString()); + assertEquals("11.0.0", Version.LUCENE_11_0_0.toString()); } public void testParseLeniently() throws Exception { + assertEquals(Version.LUCENE_11_0_0, Version.parseLeniently("11.0")); + assertEquals(Version.LUCENE_11_0_0, Version.parseLeniently("11.0.0")); + assertEquals(Version.LUCENE_11_0_0, Version.parseLeniently("LUCENE_11_0")); + assertEquals(Version.LUCENE_11_0_0, Version.parseLeniently("LUCENE_11_0_0")); assertEquals(Version.LUCENE_10_0_0, Version.parseLeniently("10.0")); assertEquals(Version.LUCENE_10_0_0, Version.parseLeniently("10.0.0")); assertEquals(Version.LUCENE_10_0_0, Version.parseLeniently("LUCENE_10_0")); assertEquals(Version.LUCENE_10_0_0, Version.parseLeniently("LUCENE_10_0_0")); - assertEquals(Version.LUCENE_9_0_0, Version.parseLeniently("9.0")); - assertEquals(Version.LUCENE_9_0_0, Version.parseLeniently("9.0.0")); - assertEquals(Version.LUCENE_9_0_0, Version.parseLeniently("LUCENE_90")); - assertEquals(Version.LUCENE_9_0_0, Version.parseLeniently("LUCENE_9_0")); - assertEquals(Version.LUCENE_9_0_0, Version.parseLeniently("LUCENE_9_0_0")); assertEquals(Version.LATEST, Version.parseLeniently("LATEST")); assertEquals(Version.LATEST, Version.parseLeniently("latest")); @@ -108,7 +111,7 @@ public class TestVersion extends LuceneTestCase { public void testParse() throws Exception { assertEquals(Version.LUCENE_10_0_0, Version.parse("10.0.0")); - assertEquals(Version.LUCENE_9_0_0, Version.parse("9.0.0")); + assertEquals(Version.LUCENE_11_0_0, Version.parse("11.0.0")); // Version does not pass judgement on the major version: assertEquals(1, Version.parse("1.0").major); @@ -116,7 +119,9 @@ public class TestVersion extends LuceneTestCase { } public void testForwardsCompatibility() throws Exception { - assertTrue(Version.parse("9.10.20").onOrAfter(Version.LUCENE_9_0_0)); + assertTrue(Version.parse("11.10.20").onOrAfter(Version.LUCENE_11_0_0)); + assertTrue(Version.parse("10.10.20").onOrAfter(Version.LUCENE_10_0_0)); + assertTrue(Version.parse("9.10.20").onOrAfter(Version.fromBits(9, 0, 0))); } public void testParseExceptions() { diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/AbstractMockVectorValues.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/AbstractMockVectorValues.java deleted file mode 100644 index 54de3919b51..00000000000 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/AbstractMockVectorValues.java +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.lucene.util.hnsw; - -import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; - -import java.io.IOException; -import org.apache.lucene.util.BytesRef; - -abstract class AbstractMockVectorValues implements RandomAccessVectorValues { - - protected final int dimension; - protected final T[] denseValues; - protected final T[] values; - protected final int numVectors; - protected final BytesRef binaryValue; - - protected int pos = -1; - - AbstractMockVectorValues(T[] values, int dimension, T[] denseValues, int numVectors) { - this.dimension = dimension; - this.values = values; - this.denseValues = denseValues; - // used by tests that build a graph from bytes rather than floats - binaryValue = new BytesRef(dimension); - binaryValue.length = dimension; - this.numVectors = numVectors; - } - - @Override - public int size() { - return numVectors; - } - - @Override - public int dimension() { - return dimension; - } - - public T vectorValue(int targetOrd) { - return denseValues[targetOrd]; - } - - @Override - public abstract AbstractMockVectorValues copy(); - - public abstract T vectorValue() throws IOException; - - private boolean seek(int target) { - if (target >= 0 && target < values.length && values[target] != null) { - pos = target; - return true; - } else { - return false; - } - } - - public int docID() { - return pos; - } - - public int nextDoc() { - return advance(pos + 1); - } - - public int advance(int target) { - while (++pos < values.length) { - if (seek(pos)) { - return pos; - } - } - return NO_MORE_DOCS; - } -} diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java index 4a6794b4994..41aeef2e5c8 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/HnswGraphTestCase.java @@ -56,6 +56,7 @@ import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.StoredFields; @@ -97,33 +98,28 @@ abstract class HnswGraphTestCase extends LuceneTestCase { abstract T randomVector(int dim); - abstract AbstractMockVectorValues vectorValues(int size, int dimension); + abstract KnnVectorValues vectorValues(int size, int dimension); - abstract AbstractMockVectorValues vectorValues(float[][] values); + abstract KnnVectorValues vectorValues(float[][] values); - abstract AbstractMockVectorValues vectorValues(LeafReader reader, String fieldName) - throws IOException; + abstract KnnVectorValues vectorValues(LeafReader reader, String fieldName) throws IOException; - abstract AbstractMockVectorValues vectorValues( - int size, - int dimension, - AbstractMockVectorValues pregeneratedVectorValues, - int pregeneratedOffset); + abstract KnnVectorValues vectorValues( + int size, int 
dimension, KnnVectorValues pregeneratedVectorValues, int pregeneratedOffset); abstract Field knnVectorField(String name, T vector, VectorSimilarityFunction similarityFunction); - abstract RandomAccessVectorValues circularVectorValues(int nDoc); + abstract KnnVectorValues circularVectorValues(int nDoc); abstract T getTargetVector(); - protected RandomVectorScorerSupplier buildScorerSupplier(RandomAccessVectorValues vectors) + protected RandomVectorScorerSupplier buildScorerSupplier(KnnVectorValues vectors) throws IOException { return flatVectorScorer.getRandomVectorScorerSupplier(similarityFunction, vectors); } - protected RandomVectorScorer buildScorer(RandomAccessVectorValues vectors, T query) - throws IOException { - RandomAccessVectorValues vectorsCopy = vectors.copy(); + protected RandomVectorScorer buildScorer(KnnVectorValues vectors, T query) throws IOException { + KnnVectorValues vectorsCopy = vectors.copy(); return switch (getVectorEncoding()) { case BYTE -> flatVectorScorer.getRandomVectorScorer(similarityFunction, vectorsCopy, (byte[]) query); @@ -134,6 +130,7 @@ abstract class HnswGraphTestCase extends LuceneTestCase { // Tests writing segments of various sizes and merging to ensure there are no errors // in the HNSW graph merging logic. + @SuppressWarnings("unchecked") public void testRandomReadWriteAndMerge() throws IOException { int dim = random().nextInt(100) + 1; int[] segmentSizes = @@ -148,7 +145,7 @@ abstract class HnswGraphTestCase extends LuceneTestCase { int M = random().nextInt(4) + 2; int beamWidth = random().nextInt(10) + 5; long seed = random().nextLong(); - AbstractMockVectorValues vectors = vectorValues(numVectors, dim); + KnnVectorValues vectors = vectorValues(numVectors, dim); HnswGraphBuilder.randSeed = seed; try (Directory dir = newDirectory()) { @@ -173,7 +170,7 @@ abstract class HnswGraphTestCase extends LuceneTestCase { try (IndexWriter iw = new IndexWriter(dir, iwc)) { for (int i = 0; i < segmentSizes.length; i++) { int size = segmentSizes[i]; - while (vectors.nextDoc() < size) { + for (int ord = 0; ord < size; ord++) { if (isSparse[i] && random().nextBoolean()) { int d = random().nextInt(10) + 1; for (int j = 0; j < d; j++) { @@ -182,8 +179,24 @@ abstract class HnswGraphTestCase extends LuceneTestCase { } } Document doc = new Document(); - doc.add(knnVectorField("field", vectors.vectorValue(), similarityFunction)); - doc.add(new StringField("id", Integer.toString(vectors.docID()), Field.Store.NO)); + switch (vectors.getEncoding()) { + case BYTE -> { + doc.add( + knnVectorField( + "field", + (T) ((ByteVectorValues) vectors).vectorValue(ord), + similarityFunction)); + } + case FLOAT32 -> { + doc.add( + knnVectorField( + "field", + (T) ((FloatVectorValues) vectors).vectorValue(ord), + similarityFunction)); + } + } + ; + doc.add(new StringField("id", Integer.toString(vectors.ordToDoc(ord)), Field.Store.NO)); iw.addDocument(doc); } iw.commit(); @@ -199,13 +212,26 @@ abstract class HnswGraphTestCase extends LuceneTestCase { } try (IndexReader reader = DirectoryReader.open(dir)) { for (LeafReaderContext ctx : reader.leaves()) { - AbstractMockVectorValues values = vectorValues(ctx.reader(), "field"); + KnnVectorValues values = vectorValues(ctx.reader(), "field"); assertEquals(dim, values.dimension()); } } } } + @SuppressWarnings("unchecked") + private T vectorValue(KnnVectorValues vectors, int ord) throws IOException { + switch (vectors.getEncoding()) { + case BYTE -> { + return (T) ((ByteVectorValues) vectors).vectorValue(ord); + } + case FLOAT32 -> { + return 
(T) ((FloatVectorValues) vectors).vectorValue(ord); + } + } + throw new AssertionError("unknown encoding " + vectors.getEncoding()); + } + // test writing out and reading in a graph gives the expected graph public void testReadWrite() throws IOException { int dim = random().nextInt(100) + 1; @@ -213,8 +239,8 @@ abstract class HnswGraphTestCase extends LuceneTestCase { int M = random().nextInt(4) + 2; int beamWidth = random().nextInt(10) + 5; long seed = random().nextLong(); - AbstractMockVectorValues vectors = vectorValues(nDoc, dim); - AbstractMockVectorValues v2 = vectors.copy(), v3 = vectors.copy(); + KnnVectorValues vectors = vectorValues(nDoc, dim); + KnnVectorValues v2 = vectors.copy(), v3 = vectors.copy(); RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, M, beamWidth, seed); HnswGraph hnsw = builder.build(vectors.size()); @@ -242,15 +268,16 @@ abstract class HnswGraphTestCase extends LuceneTestCase { } }); try (IndexWriter iw = new IndexWriter(dir, iwc)) { - while (v2.nextDoc() != NO_MORE_DOCS) { - while (indexedDoc < v2.docID()) { + KnnVectorValues.DocIndexIterator it2 = v2.iterator(); + while (it2.nextDoc() != NO_MORE_DOCS) { + while (indexedDoc < it2.docID()) { // increment docId in the index by adding empty documents iw.addDocument(new Document()); indexedDoc++; } Document doc = new Document(); - doc.add(knnVectorField("field", v2.vectorValue(), similarityFunction)); - doc.add(new StoredField("id", v2.docID())); + doc.add(knnVectorField("field", vectorValue(v2, it2.index()), similarityFunction)); + doc.add(new StoredField("id", it2.docID())); iw.addDocument(doc); nVec++; indexedDoc++; @@ -258,7 +285,7 @@ abstract class HnswGraphTestCase extends LuceneTestCase { } try (IndexReader reader = DirectoryReader.open(dir)) { for (LeafReaderContext ctx : reader.leaves()) { - AbstractMockVectorValues values = vectorValues(ctx.reader(), "field"); + KnnVectorValues values = vectorValues(ctx.reader(), "field"); assertEquals(dim, values.dimension()); assertEquals(nVec, values.size()); assertEquals(indexedDoc, ctx.reader().maxDoc()); @@ -280,7 +307,7 @@ abstract class HnswGraphTestCase extends LuceneTestCase { public void testSortedAndUnsortedIndicesReturnSameResults() throws IOException { int dim = random().nextInt(10) + 3; int nDoc = random().nextInt(200) + 100; - AbstractMockVectorValues vectors = vectorValues(nDoc, dim); + KnnVectorValues vectors = vectorValues(nDoc, dim); int M = random().nextInt(10) + 5; int beamWidth = random().nextInt(10) + 10; @@ -323,15 +350,15 @@ abstract class HnswGraphTestCase extends LuceneTestCase { int indexedDoc = 0; try (IndexWriter iw = new IndexWriter(dir, iwc); IndexWriter iw2 = new IndexWriter(dir2, iwc2)) { - while (vectors.nextDoc() != NO_MORE_DOCS) { - while (indexedDoc < vectors.docID()) { + for (int ord = 0; ord < vectors.size(); ord++) { + while (indexedDoc < vectors.ordToDoc(ord)) { // increment docId in the index by adding empty documents iw.addDocument(new Document()); indexedDoc++; } Document doc = new Document(); - doc.add(knnVectorField("vector", vectors.vectorValue(), similarityFunction)); - doc.add(new StoredField("id", vectors.docID())); + doc.add(knnVectorField("vector", vectorValue(vectors, ord), similarityFunction)); + doc.add(new StoredField("id", vectors.ordToDoc(ord))); doc.add(new NumericDocValuesField("sortkey", random().nextLong())); iw.addDocument(doc); iw2.addDocument(doc); @@ -461,7 +488,7 @@ abstract class HnswGraphTestCase extends 
LuceneTestCase { public void testAknnDiverse() throws IOException { int nDoc = 100; similarityFunction = VectorSimilarityFunction.DOT_PRODUCT; - RandomAccessVectorValues vectors = circularVectorValues(nDoc); + KnnVectorValues vectors = circularVectorValues(nDoc); RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 10, 100, random().nextInt()); OnHeapHnswGraph hnsw = builder.build(vectors.size()); @@ -493,7 +520,7 @@ abstract class HnswGraphTestCase extends LuceneTestCase { @SuppressWarnings("unchecked") public void testSearchWithAcceptOrds() throws IOException { int nDoc = 100; - RandomAccessVectorValues vectors = circularVectorValues(nDoc); + KnnVectorValues vectors = circularVectorValues(nDoc); similarityFunction = VectorSimilarityFunction.DOT_PRODUCT; RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 16, 100, random().nextInt()); @@ -518,7 +545,7 @@ abstract class HnswGraphTestCase extends LuceneTestCase { @SuppressWarnings("unchecked") public void testSearchWithSelectiveAcceptOrds() throws IOException { int nDoc = 100; - RandomAccessVectorValues vectors = circularVectorValues(nDoc); + KnnVectorValues vectors = circularVectorValues(nDoc); similarityFunction = VectorSimilarityFunction.DOT_PRODUCT; RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 16, 100, random().nextInt()); @@ -552,13 +579,13 @@ abstract class HnswGraphTestCase extends LuceneTestCase { int dim = atLeast(10); long seed = random().nextLong(); - AbstractMockVectorValues initializerVectors = vectorValues(initializerSize, dim); + KnnVectorValues initializerVectors = vectorValues(initializerSize, dim); RandomVectorScorerSupplier initialscorerSupplier = buildScorerSupplier(initializerVectors); HnswGraphBuilder initializerBuilder = HnswGraphBuilder.create(initialscorerSupplier, 10, 30, seed); OnHeapHnswGraph initializerGraph = initializerBuilder.build(initializerVectors.size()); - AbstractMockVectorValues finalVectorValues = + KnnVectorValues finalVectorValues = vectorValues(totalSize, dim, initializerVectors, docIdOffset); int[] initializerOrdMap = createOffsetOrdinalMap(initializerSize, finalVectorValues, docIdOffset); @@ -598,13 +625,13 @@ abstract class HnswGraphTestCase extends LuceneTestCase { int dim = atLeast(10); long seed = random().nextLong(); - AbstractMockVectorValues initializerVectors = vectorValues(initializerSize, dim); + KnnVectorValues initializerVectors = vectorValues(initializerSize, dim); RandomVectorScorerSupplier initialscorerSupplier = buildScorerSupplier(initializerVectors); HnswGraphBuilder initializerBuilder = HnswGraphBuilder.create(initialscorerSupplier, 10, 30, seed); OnHeapHnswGraph initializerGraph = initializerBuilder.build(initializerVectors.size()); - AbstractMockVectorValues finalVectorValues = + KnnVectorValues finalVectorValues = vectorValues(totalSize, dim, initializerVectors.copy(), docIdOffset); int[] initializerOrdMap = createOffsetOrdinalMap(initializerSize, finalVectorValues, docIdOffset); @@ -688,19 +715,17 @@ abstract class HnswGraphTestCase extends LuceneTestCase { } private int[] createOffsetOrdinalMap( - int docIdSize, AbstractMockVectorValues totalVectorValues, int docIdOffset) { + int docIdSize, KnnVectorValues totalVectorValues, int docIdOffset) throws IOException { // Compute the offset for the ordinal map 
to be the number of non-null vectors in the total - // vector values - // before the docIdOffset + // vector values before the docIdOffset int ordinalOffset = 0; - while (totalVectorValues.nextDoc() < docIdOffset) { + KnnVectorValues.DocIndexIterator it = totalVectorValues.iterator(); + while (it.nextDoc() < docIdOffset) { ordinalOffset++; } int[] offsetOrdinalMap = new int[docIdSize]; - for (int curr = 0; - totalVectorValues.docID() < docIdOffset + docIdSize; - totalVectorValues.nextDoc()) { + for (int curr = 0; it.docID() < docIdOffset + docIdSize; it.nextDoc()) { offsetOrdinalMap[curr] = ordinalOffset + curr++; } @@ -711,7 +736,7 @@ abstract class HnswGraphTestCase extends LuceneTestCase { public void testVisitedLimit() throws IOException { int nDoc = 500; similarityFunction = VectorSimilarityFunction.DOT_PRODUCT; - RandomAccessVectorValues vectors = circularVectorValues(nDoc); + KnnVectorValues vectors = circularVectorValues(nDoc); RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 16, 100, random().nextInt()); OnHeapHnswGraph hnsw = builder.build(vectors.size()); @@ -746,7 +771,7 @@ abstract class HnswGraphTestCase extends LuceneTestCase { int M = randomIntBetween(4, 96); similarityFunction = RandomizedTest.randomFrom(VectorSimilarityFunction.values()); - RandomAccessVectorValues vectors = vectorValues(size, dim); + KnnVectorValues vectors = vectorValues(size, dim); RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = @@ -771,7 +796,7 @@ abstract class HnswGraphTestCase extends LuceneTestCase { unitVector2d(0.77), unitVector2d(0.6) }; - AbstractMockVectorValues vectors = vectorValues(values); + KnnVectorValues vectors = vectorValues(values); // First add nodes until everybody gets a full neighbor list RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 2, 10, random().nextInt()); @@ -825,7 +850,7 @@ abstract class HnswGraphTestCase extends LuceneTestCase { {10, 0, 0}, {0, 4, 0} }; - AbstractMockVectorValues vectors = vectorValues(values); + KnnVectorValues vectors = vectorValues(values); // First add nodes until everybody gets a full neighbor list RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 1, 10, random().nextInt()); @@ -855,7 +880,7 @@ abstract class HnswGraphTestCase extends LuceneTestCase { {0, 0, 20}, {0, 9, 0} }; - AbstractMockVectorValues vectors = vectorValues(values); + KnnVectorValues vectors = vectorValues(values); // First add nodes until everybody gets a full neighbor list RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 1, 10, random().nextInt()); @@ -891,7 +916,7 @@ abstract class HnswGraphTestCase extends LuceneTestCase { public void testRandom() throws IOException { int size = atLeast(100); int dim = atLeast(10); - AbstractMockVectorValues vectors = vectorValues(size, dim); + KnnVectorValues vectors = vectorValues(size, dim); int topK = 5; RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 10, 30, random().nextLong()); @@ -908,15 +933,13 @@ abstract class HnswGraphTestCase extends LuceneTestCase { TopDocs topDocs = actual.topDocs(); NeighborQueue expected = new 
NeighborQueue(topK, false); for (int j = 0; j < size; j++) { - if (vectors.vectorValue(j) != null && (acceptOrds == null || acceptOrds.get(j))) { + if (vectorValue(vectors, j) != null && (acceptOrds == null || acceptOrds.get(j))) { if (getVectorEncoding() == VectorEncoding.BYTE) { - assert query instanceof byte[]; expected.add( - j, similarityFunction.compare((byte[]) query, (byte[]) vectors.vectorValue(j))); + j, similarityFunction.compare((byte[]) query, (byte[]) vectorValue(vectors, j))); } else { - assert query instanceof float[]; expected.add( - j, similarityFunction.compare((float[]) query, (float[]) vectors.vectorValue(j))); + j, similarityFunction.compare((float[]) query, (float[]) vectorValue(vectors, j))); } if (expected.size() > topK) { expected.pop(); @@ -940,7 +963,7 @@ abstract class HnswGraphTestCase extends LuceneTestCase { throws IOException, ExecutionException, InterruptedException, TimeoutException { int size = atLeast(100); int dim = atLeast(10); - AbstractMockVectorValues vectors = vectorValues(size, dim); + KnnVectorValues vectors = vectorValues(size, dim); RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 10, 30, random().nextLong()); OnHeapHnswGraph hnsw = builder.build(vectors.size()); @@ -1004,7 +1027,7 @@ abstract class HnswGraphTestCase extends LuceneTestCase { public void testConcurrentMergeBuilder() throws IOException { int size = atLeast(1000); int dim = atLeast(10); - AbstractMockVectorValues vectors = vectorValues(size, dim); + KnnVectorValues vectors = vectorValues(size, dim); RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); ExecutorService exec = Executors.newFixedThreadPool(4, new NamedThreadFactory("hnswMerge")); TaskExecutor taskExecutor = new TaskExecutor(exec); @@ -1033,7 +1056,7 @@ abstract class HnswGraphTestCase extends LuceneTestCase { // Search for a large number of results int topK = size - 1; - AbstractMockVectorValues docVectors = vectorValues(size, dim); + KnnVectorValues docVectors = vectorValues(size, dim); HnswGraph graph = HnswGraphBuilder.create(buildScorerSupplier(docVectors), 10, 30, random().nextLong()) .build(size); @@ -1047,8 +1070,8 @@ abstract class HnswGraphTestCase extends LuceneTestCase { } }; - AbstractMockVectorValues queryVectors = vectorValues(1, dim); - RandomVectorScorer queryScorer = buildScorer(docVectors, queryVectors.vectorValue(0)); + KnnVectorValues queryVectors = vectorValues(1, dim); + RandomVectorScorer queryScorer = buildScorer(docVectors, vectorValue(queryVectors, 0)); KnnCollector collector = new TopKnnCollector(topK, Integer.MAX_VALUE); HnswGraphSearcher.search(queryScorer, collector, singleLevelGraph, null); @@ -1076,8 +1099,7 @@ abstract class HnswGraphTestCase extends LuceneTestCase { } /** Returns vectors evenly distributed around the upper unit semicircle. 
*/ - static class CircularFloatVectorValues extends FloatVectorValues - implements RandomAccessVectorValues.Floats { + static class CircularFloatVectorValues extends FloatVectorValues { private final int size; private final float[] value; @@ -1103,22 +1125,18 @@ abstract class HnswGraphTestCase extends LuceneTestCase { return size; } - @Override public float[] vectorValue() { return vectorValue(doc); } - @Override public int docID() { return doc; } - @Override public int nextDoc() { return advance(doc + 1); } - @Override public int advance(int target) { if (target >= 0 && target < size) { doc = target; @@ -1140,8 +1158,7 @@ abstract class HnswGraphTestCase extends LuceneTestCase { } /** Returns vectors evenly distributed around the upper unit semicircle. */ - static class CircularByteVectorValues extends ByteVectorValues - implements RandomAccessVectorValues.Bytes { + static class CircularByteVectorValues extends ByteVectorValues { private final int size; private final float[] value; private final byte[] bValue; @@ -1169,22 +1186,18 @@ abstract class HnswGraphTestCase extends LuceneTestCase { return size; } - @Override public byte[] vectorValue() { return vectorValue(doc); } - @Override public int docID() { return doc; } - @Override public int nextDoc() { return advance(doc + 1); } - @Override public int advance(int target) { if (target >= 0 && target < size) { doc = target; @@ -1227,27 +1240,25 @@ abstract class HnswGraphTestCase extends LuceneTestCase { return neighbors; } - void assertVectorsEqual(AbstractMockVectorValues u, AbstractMockVectorValues v) - throws IOException { + void assertVectorsEqual(KnnVectorValues u, KnnVectorValues v) throws IOException { int uDoc, vDoc; - while (true) { - uDoc = u.nextDoc(); - vDoc = v.nextDoc(); + assertEquals(u.size(), v.size()); + for (int ord = 0; ord < u.size(); ord++) { + uDoc = u.ordToDoc(ord); + vDoc = v.ordToDoc(ord); assertEquals(uDoc, vDoc); - if (uDoc == NO_MORE_DOCS) { - break; - } + assertNotEquals(NO_MORE_DOCS, uDoc); switch (getVectorEncoding()) { case BYTE -> assertArrayEquals( "vectors do not match for doc=" + uDoc, - (byte[]) u.vectorValue(), - (byte[]) v.vectorValue()); + (byte[]) vectorValue(u, ord), + (byte[]) vectorValue(v, ord)); case FLOAT32 -> assertArrayEquals( "vectors do not match for doc=" + uDoc, - (float[]) u.vectorValue(), - (float[]) v.vectorValue(), + (float[]) vectorValue(u, ord), + (float[]) vectorValue(v, ord), 1e-4f); default -> throw new IllegalArgumentException("unknown vector encoding: " + getVectorEncoding()); diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/MockByteVectorValues.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/MockByteVectorValues.java index a3b17b9a621..4ab86c70781 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/MockByteVectorValues.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/MockByteVectorValues.java @@ -17,11 +17,17 @@ package org.apache.lucene.util.hnsw; +import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; -class MockByteVectorValues extends AbstractMockVectorValues - implements RandomAccessVectorValues.Bytes { +class MockByteVectorValues extends ByteVectorValues { + private final int dimension; + private final byte[][] denseValues; + protected final byte[][] values; + private final int numVectors; + private final BytesRef binaryValue; private final byte[] scratch; static MockByteVectorValues 
fromValues(byte[][] values) { @@ -43,10 +49,26 @@ class MockByteVectorValues extends AbstractMockVectorValues } MockByteVectorValues(byte[][] values, int dimension, byte[][] denseValues, int numVectors) { - super(values, dimension, denseValues, numVectors); + this.dimension = dimension; + this.values = values; + this.denseValues = denseValues; + this.numVectors = numVectors; + // used by tests that build a graph from bytes rather than floats + binaryValue = new BytesRef(dimension); + binaryValue.length = dimension; scratch = new byte[dimension]; } + @Override + public int size() { + return values.length; + } + + @Override + public int dimension() { + return dimension; + } + @Override public MockByteVectorValues copy() { return new MockByteVectorValues( @@ -55,20 +77,20 @@ class MockByteVectorValues extends AbstractMockVectorValues @Override public byte[] vectorValue(int ord) { - return values[ord]; - } - - @Override - public byte[] vectorValue() { if (LuceneTestCase.random().nextBoolean()) { - return values[pos]; + return values[ord]; } else { // Sometimes use the same scratch array repeatedly, mimicking what the codec will do. // This should help us catch cases of aliasing where the same ByteVectorValues source is used // twice in a // single computation. - System.arraycopy(values[pos], 0, scratch, 0, dimension); + System.arraycopy(values[ord], 0, scratch, 0, dimension); return scratch; } } + + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } } diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/MockVectorValues.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/MockVectorValues.java index f183f6c99a6..5411f2418de 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/MockVectorValues.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/MockVectorValues.java @@ -17,11 +17,15 @@ package org.apache.lucene.util.hnsw; +import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.ArrayUtil; -class MockVectorValues extends AbstractMockVectorValues - implements RandomAccessVectorValues.Floats { +class MockVectorValues extends FloatVectorValues { + private final int dimension; + private final float[][] denseValues; + protected final float[][] values; + private final int numVectors; private final float[] scratch; static MockVectorValues fromValues(float[][] values) { @@ -43,10 +47,23 @@ class MockVectorValues extends AbstractMockVectorValues } MockVectorValues(float[][] values, int dimension, float[][] denseValues, int numVectors) { - super(values, dimension, denseValues, numVectors); + this.dimension = dimension; + this.values = values; + this.denseValues = denseValues; + this.numVectors = numVectors; this.scratch = new float[dimension]; } + @Override + public int size() { + return values.length; + } + + @Override + public int dimension() { + return dimension; + } + @Override public MockVectorValues copy() { return new MockVectorValues( @@ -54,20 +71,20 @@ class MockVectorValues extends AbstractMockVectorValues } @Override - public float[] vectorValue() { + public float[] vectorValue(int ord) { if (LuceneTestCase.random().nextBoolean()) { - return values[pos]; + return values[ord]; } else { // Sometimes use the same scratch array repeatedly, mimicking what the codec will do. // This should help us catch cases of aliasing where the same vector values source is used // twice in a single computation. - System.arraycopy(values[pos], 0, scratch, 0, dimension); + System.arraycopy(values[ord], 0, scratch, 0, dimension); return scratch; } } @Override - public float[] vectorValue(int targetOrd) { - return denseValues[targetOrd]; + public DocIndexIterator iterator() { + return createDenseIterator(); } }
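Taken together, the two mock classes above now honor the same contract as the production FloatVectorValues/ByteVectorValues: random access by ordinal plus an explicit iterator. A minimal sketch of the consuming side of that contract, using only the APIs visible in this patch (the method name consumeAll is illustrative):

```java
import java.io.IOException;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.KnnVectorValues;
import org.apache.lucene.search.DocIdSetIterator;

// Sketch: walk every vector by ordinal instead of the removed
// docID()/nextDoc()/vectorValue() cursor methods.
static void consumeAll(FloatVectorValues values) throws IOException {
  KnnVectorValues.DocIndexIterator it = values.iterator();
  for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
    int ord = it.index(); // ordinal of the current vector
    float[] vector = values.vectorValue(ord); // may alias a shared scratch array
    // use doc and vector here; copy the array if it must outlive the next call
  }
}
```

The scratch-array randomization in the mocks exists precisely to catch callers that violate the aliasing caveat in the last comment.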
diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswByteVectorGraph.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswByteVectorGraph.java index 649bc1a6451..f0e6745211c 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswByteVectorGraph.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswByteVectorGraph.java @@ -17,13 +17,12 @@ package org.apache.lucene.util.hnsw; -import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; - import com.carrotsearch.randomizedtesting.RandomizedTest; import java.io.IOException; import org.apache.lucene.document.Field; import org.apache.lucene.document.KnnByteVectorField; import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; @@ -56,7 +55,7 @@ public class TestHnswByteVectorGraph extends HnswGraphTestCase { } @Override - AbstractMockVectorValues vectorValues(int size, int dimension) { + MockByteVectorValues vectorValues(int size, int dimension) { return MockByteVectorValues.fromValues(createRandomByteVectors(size, dimension, random())); } @@ -65,7 +64,7 @@ public class TestHnswByteVectorGraph extends HnswGraphTestCase { } @Override - AbstractMockVectorValues vectorValues(float[][] values) { + MockByteVectorValues vectorValues(float[][] values) { byte[][] bValues = new byte[values.length][]; // The case when all floats fit within a byte already.
boolean scaleSimple = fitsInByte(values[0][0]); @@ -86,42 +85,35 @@ public class TestHnswByteVectorGraph extends HnswGraphTestCase { } @Override - AbstractMockVectorValues vectorValues( - int size, - int dimension, - AbstractMockVectorValues pregeneratedVectorValues, - int pregeneratedOffset) { + MockByteVectorValues vectorValues( + int size, int dimension, KnnVectorValues pregeneratedVectorValues, int pregeneratedOffset) { + + MockByteVectorValues pvv = (MockByteVectorValues) pregeneratedVectorValues; byte[][] vectors = new byte[size][]; - byte[][] randomVectors = - createRandomByteVectors(size - pregeneratedVectorValues.values.length, dimension, random()); + byte[][] randomVectors = createRandomByteVectors(size - pvv.values.length, dimension, random()); for (int i = 0; i < pregeneratedOffset; i++) { vectors[i] = randomVectors[i]; } - int currentDoc; - while ((currentDoc = pregeneratedVectorValues.nextDoc()) != NO_MORE_DOCS) { - vectors[pregeneratedOffset + currentDoc] = pregeneratedVectorValues.values[currentDoc]; + for (int currentOrd = 0; currentOrd < pvv.size(); currentOrd++) { + vectors[pregeneratedOffset + currentOrd] = pvv.values[currentOrd]; } - for (int i = pregeneratedOffset + pregeneratedVectorValues.values.length; - i < vectors.length; - i++) { - vectors[i] = randomVectors[i - pregeneratedVectorValues.values.length]; + for (int i = pregeneratedOffset + pvv.values.length; i < vectors.length; i++) { + vectors[i] = randomVectors[i - pvv.values.length]; } return MockByteVectorValues.fromValues(vectors); } @Override - AbstractMockVectorValues vectorValues(LeafReader reader, String fieldName) - throws IOException { + MockByteVectorValues vectorValues(LeafReader reader, String fieldName) throws IOException { ByteVectorValues vectorValues = reader.getByteVectorValues(fieldName); byte[][] vectors = new byte[reader.maxDoc()][]; - while (vectorValues.nextDoc() != NO_MORE_DOCS) { - vectors[vectorValues.docID()] = - ArrayUtil.copyOfSubArray( - vectorValues.vectorValue(), 0, vectorValues.vectorValue().length); + for (int i = 0; i < vectorValues.size(); i++) { + vectors[vectorValues.ordToDoc(i)] = + ArrayUtil.copyOfSubArray(vectorValues.vectorValue(i), 0, vectorValues.dimension()); } return MockByteVectorValues.fromValues(vectors); } diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswFloatVectorGraph.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswFloatVectorGraph.java index 5621edc4b35..52d1da3dfa8 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswFloatVectorGraph.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswFloatVectorGraph.java @@ -17,13 +17,12 @@ package org.apache.lucene.util.hnsw; -import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; - import com.carrotsearch.randomizedtesting.RandomizedTest; import java.io.IOException; import org.apache.lucene.document.Field; import org.apache.lucene.document.KnnFloatVectorField; import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; @@ -60,52 +59,44 @@ public class TestHnswFloatVectorGraph extends HnswGraphTestCase { } @Override - AbstractMockVectorValues vectorValues(int size, int dimension) { + MockVectorValues vectorValues(int size, int dimension) { return MockVectorValues.fromValues(createRandomFloatVectors(size, dimension, random())); } @Override - 
AbstractMockVectorValues vectorValues(float[][] values) { + MockVectorValues vectorValues(float[][] values) { return MockVectorValues.fromValues(values); } @Override - AbstractMockVectorValues vectorValues(LeafReader reader, String fieldName) - throws IOException { + MockVectorValues vectorValues(LeafReader reader, String fieldName) throws IOException { FloatVectorValues vectorValues = reader.getFloatVectorValues(fieldName); float[][] vectors = new float[reader.maxDoc()][]; - while (vectorValues.nextDoc() != NO_MORE_DOCS) { - vectors[vectorValues.docID()] = - ArrayUtil.copyOfSubArray( - vectorValues.vectorValue(), 0, vectorValues.vectorValue().length); + for (int i = 0; i < vectorValues.size(); i++) { + vectors[vectorValues.ordToDoc(i)] = + ArrayUtil.copyOfSubArray(vectorValues.vectorValue(i), 0, vectorValues.dimension()); } return MockVectorValues.fromValues(vectors); } @Override - AbstractMockVectorValues vectorValues( - int size, - int dimension, - AbstractMockVectorValues pregeneratedVectorValues, - int pregeneratedOffset) { + MockVectorValues vectorValues( + int size, int dimension, KnnVectorValues pregeneratedVectorValues, int pregeneratedOffset) { + MockVectorValues pvv = (MockVectorValues) pregeneratedVectorValues; float[][] vectors = new float[size][]; float[][] randomVectors = - createRandomFloatVectors( - size - pregeneratedVectorValues.values.length, dimension, random()); + createRandomFloatVectors(size - pvv.values.length, dimension, random()); for (int i = 0; i < pregeneratedOffset; i++) { vectors[i] = randomVectors[i]; } - int currentDoc; - while ((currentDoc = pregeneratedVectorValues.nextDoc()) != NO_MORE_DOCS) { - vectors[pregeneratedOffset + currentDoc] = pregeneratedVectorValues.values[currentDoc]; + for (int currentOrd = 0; currentOrd < pvv.size(); currentOrd++) { + vectors[pregeneratedOffset + currentOrd] = pvv.values[currentOrd]; } - for (int i = pregeneratedOffset + pregeneratedVectorValues.values.length; - i < vectors.length; - i++) { - vectors[i] = randomVectors[i - pregeneratedVectorValues.values.length]; + for (int i = pregeneratedOffset + pvv.values.length; i < vectors.length; i++) { + vectors[i] = randomVectors[i - pvv.values.length]; } return MockVectorValues.fromValues(vectors); @@ -129,7 +120,7 @@ public class TestHnswFloatVectorGraph extends HnswGraphTestCase { public void testSearchWithSkewedAcceptOrds() throws IOException { int nDoc = 1000; similarityFunction = VectorSimilarityFunction.EUCLIDEAN; - RandomAccessVectorValues.Floats vectors = circularVectorValues(nDoc); + FloatVectorValues vectors = circularVectorValues(nDoc); RandomVectorScorerSupplier scorerSupplier = buildScorerSupplier(vectors); HnswGraphBuilder builder = HnswGraphBuilder.create(scorerSupplier, 16, 100, random().nextInt()); OnHeapHnswGraph hnsw = builder.build(vectors.size()); diff --git a/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswUtil.java b/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswUtil.java index 3ec133ac46e..316afff5ee2 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswUtil.java +++ b/lucene/core/src/test/org/apache/lucene/util/hnsw/TestHnswUtil.java @@ -138,12 +138,6 @@ public class TestHnswUtil extends LuceneTestCase { } } MockGraph graph = new MockGraph(nodes); - /**/ - if (i == 2) { - System.out.println("iter " + i); - System.out.print(graph.toString()); - } - /**/ assertEquals(isRooted(nodes), HnswUtil.isRooted(graph)); } } diff --git 
a/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizedVectorSimilarity.java b/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizedVectorSimilarity.java index bdba822d4ec..f2cc3ac35c0 100644 --- a/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizedVectorSimilarity.java +++ b/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizedVectorSimilarity.java @@ -59,8 +59,7 @@ public class TestScalarQuantizedVectorSimilarity extends LuceneTestCase { float error = Math.max((100 - confidenceInterval) * 0.01f, 0.01f); FloatVectorValues floatVectorValues = fromFloats(floats); ScalarQuantizer scalarQuantizer = - ScalarQuantizer.fromVectors( - floatVectorValues, confidenceInterval, floats.length, (byte) 7); + ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval, numVecs, (byte) 7); byte[][] quantized = new byte[floats.length][]; float[] offsets = quantizeVectors(scalarQuantizer, floats, quantized, VectorSimilarityFunction.EUCLIDEAN); @@ -92,8 +91,7 @@ public class TestScalarQuantizedVectorSimilarity extends LuceneTestCase { float error = Math.max((100 - confidenceInterval) * 0.01f, 0.01f); FloatVectorValues floatVectorValues = fromFloatsNormalized(floats, null); ScalarQuantizer scalarQuantizer = - ScalarQuantizer.fromVectors( - floatVectorValues, confidenceInterval, floats.length, (byte) 7); + ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval, numVecs, (byte) 7); byte[][] quantized = new byte[floats.length][]; float[] offsets = quantizeVectorsNormalized( @@ -129,8 +127,7 @@ public class TestScalarQuantizedVectorSimilarity extends LuceneTestCase { float error = Math.max((100 - confidenceInterval) * 0.01f, 0.01f); FloatVectorValues floatVectorValues = fromFloats(floats); ScalarQuantizer scalarQuantizer = - ScalarQuantizer.fromVectors( - floatVectorValues, confidenceInterval, floats.length, (byte) 7); + ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval, numVecs, (byte) 7); byte[][] quantized = new byte[floats.length][]; float[] offsets = quantizeVectors(scalarQuantizer, floats, quantized, VectorSimilarityFunction.DOT_PRODUCT); @@ -162,8 +159,7 @@ public class TestScalarQuantizedVectorSimilarity extends LuceneTestCase { float error = Math.max((100 - confidenceInterval) * 0.5f, 0.5f); FloatVectorValues floatVectorValues = fromFloats(floats); ScalarQuantizer scalarQuantizer = - ScalarQuantizer.fromVectors( - floatVectorValues, confidenceInterval, floats.length, (byte) 7); + ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval, numVecs, (byte) 7); byte[][] quantized = new byte[floats.length][]; float[] offsets = quantizeVectors( @@ -242,11 +238,8 @@ public class TestScalarQuantizedVectorSimilarity extends LuceneTestCase { float[][] floats, Set deletedVectors) { return new TestScalarQuantizer.TestSimpleFloatVectorValues(floats, deletedVectors) { @Override - public float[] vectorValue() throws IOException { - if (curDoc == -1 || curDoc >= floats.length) { - throw new IOException("Current doc not set or too many iterations"); - } - float[] v = ArrayUtil.copyArray(floats[curDoc]); + public float[] vectorValue(int ord) throws IOException { + float[] v = ArrayUtil.copyArray(floats[ordToDoc[ord]]); VectorUtil.l2normalize(v); return v; } }
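The repeated one-line fix above passes the live-vector count (numVecs) rather than floats.length as the quantizer's total vector count; the two diverge once some vectors are deleted. A sketch of the train-then-quantize pattern these tests exercise; the variable names and the quantize signature are taken from their usage in this file, not independently verified:

```java
// Train a 7-bit scalar quantizer; the count argument must be live vectors only.
ScalarQuantizer quantizer =
    ScalarQuantizer.fromVectors(floatVectorValues, confidenceInterval, numVecs, (byte) 7);

// Quantize one vector; the returned value is the per-vector offset correction
// that the quantized similarity adds back in (what the offsets[] arrays above collect).
byte[] dest = new byte[vector.length];
float offset = quantizer.quantize(vector, dest, VectorSimilarityFunction.EUCLIDEAN);
```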
diff --git a/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizer.java b/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizer.java index 48eb7ce651c..7f56688b799 --- a/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizer.java +++ b/lucene/core/src/test/org/apache/lucene/util/quantization/TestScalarQuantizer.java @@ -272,14 +272,27 @@ public class TestScalarQuantizer extends LuceneTestCase { static class TestSimpleFloatVectorValues extends FloatVectorValues { protected final float[][] floats; protected final Set deletedVectors; + protected final int[] ordToDoc; protected final int numLiveVectors; - protected int curDoc = -1; TestSimpleFloatVectorValues(float[][] values, Set deletedVectors) { this.floats = values; this.deletedVectors = deletedVectors; - this.numLiveVectors = + numLiveVectors = deletedVectors == null ? values.length : values.length - deletedVectors.size(); + ordToDoc = new int[numLiveVectors]; + if (deletedVectors == null) { + for (int i = 0; i < numLiveVectors; i++) { + ordToDoc[i] = i; + } + } else { + int ord = 0; + for (int doc = 0; doc < values.length; doc++) { + if (!deletedVectors.contains(doc)) { + ordToDoc[ord++] = doc; + } + } + } } @Override @@ -293,40 +306,64 @@ public class TestScalarQuantizer extends LuceneTestCase { } @Override - public float[] vectorValue() throws IOException { - if (curDoc == -1 || curDoc >= floats.length) { - throw new IOException("Current doc not set or too many iterations"); - } - return floats[curDoc]; + public float[] vectorValue(int ord) throws IOException { + return floats[ordToDoc(ord)]; } @Override - public int docID() { - if (curDoc >= floats.length) { - return NO_MORE_DOCS; - } - return curDoc; + public int ordToDoc(int ord) { + return ordToDoc[ord]; } @Override - public int nextDoc() throws IOException { - while (++curDoc < floats.length) { - if (deletedVectors == null || !deletedVectors.contains(curDoc)) { - return curDoc; + public DocIndexIterator iterator() { + return new DocIndexIterator() { + + int ord = -1; + int doc = -1; + + @Override + public int docID() { + return doc; } - } - return docID(); - } - @Override - public int advance(int target) throws IOException { - curDoc = target - 1; - return nextDoc(); + @Override + public int nextDoc() throws IOException { + while (doc < floats.length - 1) { + ++doc; + if (deletedVectors == null || !deletedVectors.contains(doc)) { + ++ord; + return doc; + } + } + return doc = NO_MORE_DOCS; + } + + @Override + public int index() { + return ord; + } + + @Override + public long cost() { + return deletedVectors == null ? floats.length : floats.length - deletedVectors.size(); + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + } + }; } @Override public VectorScorer scorer(float[] target) { throw new UnsupportedOperationException(); } + + @Override + public TestSimpleFloatVectorValues copy() { + return this; + } } } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java index c4cd50274f1..b94f73baefb 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java @@ -54,6 +54,7 @@ import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.ConstantScoreQuery; import org.apache.lucene.search.DisjunctionMaxQuery; import org.apache.lucene.search.FieldExistsQuery; +import org.apache.lucene.search.IndexOrDocValuesQuery; import org.apache.lucene.search.IndexSearcher; import
org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.MultiPhraseQuery; @@ -163,6 +164,11 @@ public class WeightedSpanTermExtractor { new SpanNearQuery(clauses, phraseQuery.getSlop() + positionGaps, inorder); extractWeightedSpanTerms(terms, sp, boost); } + } else if (query instanceof IndexOrDocValuesQuery) { + Query indexQuery = ((IndexOrDocValuesQuery) query).getIndexQuery(); + if (indexQuery != null) { + extract(indexQuery, boost, terms); + } } else if (query instanceof TermQuery || query instanceof SynonymQuery) { extractWeightedTerms(terms, query, boost); } else if (query instanceof SpanQuery) { diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java index 27281a91be7..4cd2b07fc1c 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java @@ -64,7 +64,7 @@ public class DefaultPassageFormatter extends PassageFormatter { int pos = 0; for (Passage passage : passages) { // don't add ellipsis if it's the first one, or if it's connected. - if (passage.getStartOffset() > pos && pos > 0) { + if (!sb.isEmpty() && passage.getStartOffset() != pos) { sb.append(ellipsis); } pos = passage.getStartOffset(); diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestHighlighter.java index cee95a0ee16..a379cf26e6c 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestHighlighter.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TestHighlighter.java @@ -59,12 +59,14 @@ import org.apache.lucene.queries.spans.SpanNotQuery; import org.apache.lucene.queries.spans.SpanOrQuery; import org.apache.lucene.queries.spans.SpanQuery; import org.apache.lucene.queries.spans.SpanTermQuery; +import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.ConstantScoreQuery; import org.apache.lucene.search.DoubleValuesSource; import org.apache.lucene.search.FieldExistsQuery; import org.apache.lucene.search.FuzzyQuery; +import org.apache.lucene.search.IndexOrDocValuesQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.MultiTermQuery; @@ -255,6 +257,27 @@ public class TestHighlighter extends BaseTokenStreamTestCase implements Formatte assertEquals("John Kennedy has been shot", fragment); } + public void testHighlightingIndexOrDocValuesQuery() throws Exception { + searcher = newSearcher(reader); + BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder(); + booleanQueryBuilder.add(new TermQuery(new Term(FIELD_NAME, "jfk")), BooleanClause.Occur.SHOULD); + booleanQueryBuilder.add( + new TermQuery(new Term(FIELD_NAME, "kennedy")), BooleanClause.Occur.SHOULD); + Query indexQuery = booleanQueryBuilder.build(); + Query dvQuery = TermRangeQuery.newStringRange(FIELD_NAME, "a", "z", true, true); + Query query = new IndexOrDocValuesQuery(indexQuery, dvQuery); + QueryScorer scorer = new QueryScorer(query, FIELD_NAME); + Highlighter highlighter = new Highlighter(scorer); + TokenStream stream = getAnyTokenStream(FIELD_NAME, 2); + String storedField =
searcher.storedFields().document(2).get(FIELD_NAME); + String fragment = highlighter.getBestFragment(stream, storedField); + assertEquals("JFK has been shot", fragment); + stream = getAnyTokenStream(FIELD_NAME, 3); + storedField = searcher.storedFields().document(3).get(FIELD_NAME); + fragment = highlighter.getBestFragment(stream, storedField); + assertEquals("John Kennedy has been shot", fragment); + } + public void testHighlightUnknownQueryAfterRewrite() throws IOException, InvalidTokenOffsetsException { Query query = diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestDefaultPassageFormatter.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestDefaultPassageFormatter.java index b59fea47453..617077c987c 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestDefaultPassageFormatter.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestDefaultPassageFormatter.java @@ -75,4 +75,30 @@ public class TestDefaultPassageFormatter extends LuceneTestCase { "Yin yang loooooooooong, yin gap yang yong", formatter.format(passages, content)); } + + public void testReversedStartOffsetOrder() { + String content = + "When indexing data in Solr, each document is composed of various fields. " + + "A document essentially represents a single record, and each document typically contains a unique ID field."; + + Passage[] passages = new Passage[2]; + passages[0] = new Passage(); + passages[0].setStartOffset(73); + passages[0].setEndOffset(179); + passages[0].setScore(1.8846991f); + passages[0].addMatch(75, 83, new BytesRef("document"), 1); + passages[0].addMatch(133, 141, new BytesRef("document"), 1); + + passages[1] = new Passage(); + passages[1].setStartOffset(0); + passages[1].setEndOffset(73); + passages[1].setScore(1.5923802f); + passages[1].addMatch(33, 41, new BytesRef("document"), 1); + + DefaultPassageFormatter formatter = new DefaultPassageFormatter("", "", "\n", false); + assertEquals( + "A document essentially represents a single record, and each document typically contains a unique ID field.\n" + + "When indexing data in Solr, each document is composed of various fields. ", formatter.format(passages, content)); } }
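To make the DefaultPassageFormatter change concrete: the old guard assumed passages arrived sorted by start offset, so when the highest-scoring passage started later in the text (the testReversedStartOffsetOrder case above), no separator was emitted between the snippets. Side by side, taken directly from the hunk:

```java
// Before: ellipsis only when offsets happen to arrive in increasing order.
if (passage.getStartOffset() > pos && pos > 0) {
  sb.append(ellipsis);
}
// After: ellipsis between any two rendered passages that are not contiguous,
// whatever order the passages were scored and returned in.
if (!sb.isEmpty() && passage.getStartOffset() != pos) {
  sb.append(ellipsis);
}
```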
", + formatter.format(passages, content)); + } } diff --git a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 2d46b243d83..04ac9285bab 100644 --- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -2285,7 +2285,6 @@ public class MemoryIndex { private static final class MemoryFloatVectorValues extends FloatVectorValues { private final Info info; - private int currentDoc = -1; MemoryFloatVectorValues(Info info) { this.info = info; @@ -2302,14 +2301,19 @@ public class MemoryIndex { } @Override - public float[] vectorValue() { - if (currentDoc == 0) { + public float[] vectorValue(int ord) { + if (ord == 0) { return info.floatVectorValues[0]; } else { return null; } } + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } + @Override public VectorScorer scorer(float[] query) { if (query.length != info.fieldInfo.getVectorDimension()) { @@ -2320,50 +2324,31 @@ public class MemoryIndex { + info.fieldInfo.getVectorDimension()); } MemoryFloatVectorValues vectorValues = new MemoryFloatVectorValues(info); + DocIndexIterator iterator = vectorValues.iterator(); return new VectorScorer() { @Override public float score() throws IOException { + assert iterator.docID() == 0; return info.fieldInfo .getVectorSimilarityFunction() - .compare(vectorValues.vectorValue(), query); + .compare(vectorValues.vectorValue(0), query); } @Override public DocIdSetIterator iterator() { - return vectorValues; + return iterator; } }; } @Override - public int docID() { - return currentDoc; - } - - @Override - public int nextDoc() { - int doc = ++currentDoc; - if (doc == 0) { - return doc; - } else { - return NO_MORE_DOCS; - } - } - - @Override - public int advance(int target) { - if (target == 0) { - currentDoc = target; - return target; - } else { - return NO_MORE_DOCS; - } + public MemoryFloatVectorValues copy() { + return this; } } private static final class MemoryByteVectorValues extends ByteVectorValues { private final Info info; - private int currentDoc = -1; MemoryByteVectorValues(Info info) { this.info = info; @@ -2380,14 +2365,19 @@ public class MemoryIndex { } @Override - public byte[] vectorValue() { - if (currentDoc == 0) { + public byte[] vectorValue(int ord) { + if (ord == 0) { return info.byteVectorValues[0]; } else { return null; } } + @Override + public DocIndexIterator iterator() { + return createDenseIterator(); + } + @Override public VectorScorer scorer(byte[] query) { if (query.length != info.fieldInfo.getVectorDimension()) { @@ -2398,44 +2388,26 @@ public class MemoryIndex { + info.fieldInfo.getVectorDimension()); } MemoryByteVectorValues vectorValues = new MemoryByteVectorValues(info); + DocIndexIterator iterator = vectorValues.iterator(); return new VectorScorer() { @Override public float score() { + assert iterator.docID() == 0; return info.fieldInfo .getVectorSimilarityFunction() - .compare(vectorValues.vectorValue(), query); + .compare(vectorValues.vectorValue(0), query); } @Override public DocIdSetIterator iterator() { - return vectorValues; + return iterator; } }; } @Override - public int docID() { - return currentDoc; - } - - @Override - public int nextDoc() { - int doc = ++currentDoc; - if (doc == 0) { - return doc; - } else { - return NO_MORE_DOCS; - } - } - - @Override - public int advance(int target) { - if (target == 0) { - currentDoc = target; - return target; 
- } else { - return NO_MORE_DOCS; - } + public MemoryByteVectorValues copy() { + return this; } } } diff --git a/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java b/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java index 18e97c67d9d..7c592868912 100644 --- a/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java +++ b/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java @@ -63,6 +63,7 @@ import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.IndexableFieldType; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.PostingsEnum; @@ -851,9 +852,10 @@ public class TestMemoryIndex extends LuceneTestCase { .reader() .getFloatVectorValues(fieldName); assertNotNull(fvv); - assertEquals(0, fvv.nextDoc()); - assertArrayEquals(expected, fvv.vectorValue(), 1e-6f); - assertEquals(DocIdSetIterator.NO_MORE_DOCS, fvv.nextDoc()); + KnnVectorValues.DocIndexIterator iterator = fvv.iterator(); + assertEquals(0, iterator.nextDoc()); + assertArrayEquals(expected, fvv.vectorValue(0), 1e-6f); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, iterator.nextDoc()); } private static void assertFloatVectorScore( @@ -868,7 +870,7 @@ public class TestMemoryIndex extends LuceneTestCase { .getFloatVectorValues(fieldName); assertNotNull(fvv); if (random().nextBoolean()) { - fvv.nextDoc(); + fvv.iterator().nextDoc(); } VectorScorer scorer = fvv.scorer(queryVector); assertEquals(0, scorer.iterator().nextDoc()); @@ -886,9 +888,10 @@ public class TestMemoryIndex extends LuceneTestCase { .reader() .getByteVectorValues(fieldName); assertNotNull(bvv); - assertEquals(0, bvv.nextDoc()); - assertArrayEquals(expected, bvv.vectorValue()); - assertEquals(DocIdSetIterator.NO_MORE_DOCS, bvv.nextDoc()); + KnnVectorValues.DocIndexIterator iterator = bvv.iterator(); + assertEquals(0, iterator.nextDoc()); + assertArrayEquals(expected, bvv.vectorValue(0)); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, iterator.nextDoc()); } private static void assertByteVectorScore( @@ -903,7 +906,7 @@ public class TestMemoryIndex extends LuceneTestCase { .getByteVectorValues(fieldName); assertNotNull(bvv); if (random().nextBoolean()) { - bvv.nextDoc(); + bvv.iterator().nextDoc(); } VectorScorer scorer = bvv.scorer(queryVector); assertEquals(0, scorer.iterator().nextDoc()); diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/ByteKnnVectorFieldSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/ByteKnnVectorFieldSource.java index 32517496d54..c95bf632a73 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/ByteKnnVectorFieldSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/ByteKnnVectorFieldSource.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.Map; import java.util.Objects; import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.VectorEncoding; @@ -63,11 +64,12 @@ public class ByteKnnVectorFieldSource extends ValueSource { } return new VectorFieldFunction(this) { + KnnVectorValues.DocIndexIterator iterator = 
vectorValues.iterator(); @Override public byte[] byteVectorVal(int doc) throws IOException { if (exists(doc)) { - return vectorValues.vectorValue(); + return vectorValues.vectorValue(iterator.index()); } else { return null; } @@ -75,7 +77,7 @@ public class ByteKnnVectorFieldSource extends ValueSource { @Override protected DocIdSetIterator getVectorIterator() { - return vectorValues; + return iterator; } }; } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatKnnVectorFieldSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatKnnVectorFieldSource.java index 43cc3aff880..f026d9537bc 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatKnnVectorFieldSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatKnnVectorFieldSource.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.Map; import java.util.Objects; import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.VectorEncoding; @@ -62,11 +63,12 @@ public class FloatKnnVectorFieldSource extends ValueSource { } return new VectorFieldFunction(this) { + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); @Override public float[] floatVectorVal(int doc) throws IOException { if (exists(doc)) { - return vectorValues.vectorValue(); + return vectorValues.vectorValue(iterator.index()); } else { return null; } @@ -74,7 +76,7 @@ public class FloatKnnVectorFieldSource extends ValueSource { @Override protected DocIdSetIterator getVectorIterator() { - return vectorValues; + return iterator; } }; } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/BlockIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/BlockIntervalsSource.java index 91a5e94d5f2..efb03b154ef 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/BlockIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/BlockIntervalsSource.java @@ -38,6 +38,7 @@ class BlockIntervalsSource extends ConjunctionIntervalsSource { List flattened = new ArrayList<>(); for (IntervalsSource s : sources) { if (s instanceof BlockIntervalsSource) { + // Block sources can be flattened because they do not increase the gap (gap = 0) flattened.addAll(((BlockIntervalsSource) s).subSources); } else { flattened.add(s); diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java index 57d5674b254..e3808f406cf 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java @@ -247,8 +247,10 @@ public final class Intervals { * Return an {@link IntervalsSource} over the disjunction of all terms that fall within the given * range * - * @param lowerTerm The term text at the lower end of the range - * @param upperTerm The term text at the upper end of the range + * @param lowerTerm The term text at the lower end of the range; can be {@code null} to indicate + * an open-ended range at this end + * @param upperTerm The term text at the upper end of the range; can be {@code null} to indicate + * an open-ended range at this end * @param includeLower If true, the lowerTerm is included in 
the range * @param includeUpper If true, the upperTerm is included in the range * @throws IllegalStateException if the range expands to more than {@link #DEFAULT_MAX_EXPANSIONS}
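A short usage sketch of the open-ended ranges documented above, mirroring the testOpenEndedRange cases added further down (the term values are illustrative):

```java
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queries.intervals.IntervalsSource;
import org.apache.lucene.util.BytesRef;

// All terms above "porridge" (exclusive): the upper end is left open with null.
IntervalsSource upperOpen = Intervals.range(new BytesRef("porridge"), null, false, false);
// All terms up to and including "anyone": the lower end is left open with null.
IntervalsSource lowerOpen = Intervals.range(null, new BytesRef("anyone"), false, true);
```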
@@ -266,8 +268,10 @@ public final class Intervals { *
WARNING: Setting {@code maxExpansions} to higher than the default value of {@link * #DEFAULT_MAX_EXPANSIONS} can be both slow and memory-intensive * - * @param lowerTerm The term text at the lower end of the range - * @param upperTerm The term text at the upper end of the range + * @param lowerTerm The term text at the lower end of the range; can be {@code null} to indicate + * an open-ended range at this end + * @param upperTerm The term text at the upper end of the range; can be {@code null} to indicate + * an open-ended range at this end * @param includeLower If true, the lowerTerm is included in the range * @param includeUpper If true, the upperTerm is included in the range * @param maxExpansions the maximum number of terms to expand to @@ -286,9 +290,9 @@ public final class Intervals { StringBuilder buffer = new StringBuilder(); buffer.append("{"); - buffer.append(lowerTerm.utf8ToString()); + buffer.append(lowerTerm == null ? "* " : lowerTerm.utf8ToString()); buffer.append(","); - buffer.append(upperTerm.utf8ToString()); + buffer.append(upperTerm == null ? "*" : upperTerm.utf8ToString()); buffer.append("}"); return new MultiTermIntervalsSource(ca, maxExpansions, buffer.toString()); } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java index 65fa6d03395..c855d61d051 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java @@ -30,25 +30,13 @@ class OrderedIntervalsSource extends MinimizingConjunctionIntervalsSource { if (sources.size() == 1) { return sources.get(0); } - List rewritten = deduplicate(flatten(sources)); + List rewritten = deduplicate(sources); if (rewritten.size() == 1) { return rewritten.get(0); } return new OrderedIntervalsSource(rewritten); } - private static List flatten(List sources) { - List flattened = new ArrayList<>(); - for (IntervalsSource s : sources) { - if (s instanceof OrderedIntervalsSource) { - flattened.addAll(((OrderedIntervalsSource) s).subSources); - } else { - flattened.add(s); - } - } - return flattened; - } - private static List deduplicate(List sources) { List deduplicated = new ArrayList<>(); List current = new ArrayList<>(); @@ -136,38 +124,54 @@ class OrderedIntervalsSource extends MinimizingConjunctionIntervalsSource { start = end = slop = IntervalIterator.NO_MORE_INTERVALS; int lastStart = Integer.MAX_VALUE; boolean minimizing = false; + final var subIterators = this.subIterators; + int currentIndex = i; while (true) { while (true) { - if (subIterators.get(i - 1).end() >= lastStart) { + var prev = subIterators.get(currentIndex - 1); + if (prev.end() >= lastStart) { + i = currentIndex; return start; } - if (i == subIterators.size() - || (minimizing && subIterators.get(i).start() > subIterators.get(i - 1).end())) { + if (currentIndex == subIterators.size()) { + break; + } + final IntervalIterator current = subIterators.get(currentIndex); + if (minimizing && (current.start() > prev.end())) { break; } do { - if (subIterators.get(i).end() >= lastStart - || subIterators.get(i).nextInterval() == IntervalIterator.NO_MORE_INTERVALS) { + if (current.end() >= lastStart + || current.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) { + i = currentIndex; return start; } - } while (subIterators.get(i).start() <= subIterators.get(i - 1).end()); - i++; + } while (current.start() <= 
prev.end()); + currentIndex++; } - start = subIterators.get(0).start(); + var first = subIterators.getFirst(); + final int start = first.start(); + this.start = start; if (start == NO_MORE_INTERVALS) { + i = currentIndex; return end = NO_MORE_INTERVALS; } - end = subIterators.get(subIterators.size() - 1).end(); - slop = end - start + 1; + var last = subIterators.getLast(); + + final int end = last.end(); + this.end = end; + int slop = end - start + 1; for (IntervalIterator subIterator : subIterators) { slop -= subIterator.width(); } + this.slop = slop; onMatch.onMatch(); - lastStart = subIterators.get(subIterators.size() - 1).start(); - i = 1; - if (subIterators.get(0).nextInterval() == IntervalIterator.NO_MORE_INTERVALS) { + currentIndex = 1; + if (first.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) { + i = currentIndex; return start; } + lastStart = last.start(); minimizing = true; } } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java index 132ab4b3976..d2c708b4fee 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java @@ -33,7 +33,7 @@ class UnorderedIntervalsSource extends MinimizingConjunctionIntervalsSource { if (sources.size() == 1) { return sources.get(0); } - List rewritten = deduplicate(flatten(sources)); + List rewritten = deduplicate(sources); if (rewritten.size() == 1) { return rewritten.get(0); } @@ -55,18 +55,6 @@ class UnorderedIntervalsSource extends MinimizingConjunctionIntervalsSource { return deduplicated; } - private static List flatten(List sources) { - List flattened = new ArrayList<>(); - for (IntervalsSource s : sources) { - if (s instanceof UnorderedIntervalsSource) { - flattened.addAll(((UnorderedIntervalsSource) s).subSources); - } else { - flattened.add(s); - } - } - return flattened; - } - private UnorderedIntervalsSource(List sources) { super(sources); } diff --git a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalQuery.java b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalQuery.java index af2ec0230c2..a0539a0f852 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalQuery.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalQuery.java @@ -338,6 +338,18 @@ public class TestIntervalQuery extends LuceneTestCase { checkHits(q, new int[] {6, 7}); } + public void testUnorderedWithNoGap() throws IOException { + Query q = + new IntervalQuery( + field, + Intervals.maxgaps( + 0, + Intervals.unordered( + Intervals.term("w3"), + Intervals.unordered(Intervals.term("w1"), Intervals.term("w5"))))); + checkHits(q, new int[] {0}); + } + public void testOrderedWithGaps() throws IOException { Query q = new IntervalQuery( @@ -360,6 +372,18 @@ public class TestIntervalQuery extends LuceneTestCase { checkHits(q, new int[] {12}); } + public void testOrderedWithNoGap() throws IOException { + Query q = + new IntervalQuery( + field, + Intervals.maxgaps( + 0, + Intervals.ordered( + Intervals.ordered(Intervals.term("w1"), Intervals.term("w4")), + Intervals.term("w5")))); + checkHits(q, new int[] {0}); + } + public void testNestedOrInContainedBy() throws IOException { Query q = new IntervalQuery(
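The flatten() removals above are what the two new maxgaps tests pin down: a nested ordered or unordered source matches as a single interval whose internal gaps are already accounted for by its own width, so folding its operands into the parent would wrongly charge those gaps against the parent's maxgaps budget. Block sources keep flattening because, as the comment added to BlockIntervalsSource notes, their gap is always 0. The testOrderedWithNoGap case, for reference:

```java
// From TestIntervalQuery above: "w1" and "w4" may be far apart inside the
// inner ordered source, yet maxgaps(0) only constrains the gap between the
// inner match, taken as one interval, and "w5".
Query q =
    new IntervalQuery(
        field,
        Intervals.maxgaps(
            0,
            Intervals.ordered(
                Intervals.ordered(Intervals.term("w1"), Intervals.term("w4")),
                Intervals.term("w5"))));
```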
diff --git a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java index 43fd8fb7fb8..3d9560858ac 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java @@ -1138,6 +1138,46 @@ public class TestIntervals extends LuceneTestCase { checkVisits(source, 1); } + public void testOpenEndedRange() throws IOException { + { + IntervalsSource source = Intervals.range(new BytesRef("porridge"), null, false, false); + checkIntervals( + source, + "field1", + 5, + new int[][] { + {3, 3}, + {9, 9, 10, 10, 14, 14, 18, 18, 22, 22, 26, 26, 27, 27}, + {9, 9, 10, 10, 11, 11, 14, 14, 18, 18, 22, 22, 26, 26}, + {8, 8}, + {9, 9, 10, 10, 12, 12, 14, 14, 18, 18, 21, 21}, + {} + }); + MatchesIterator mi = getMatches(source, 3, "field1"); + assertNotNull(mi); + assertMatch(mi, 8, 8, 37, 41); + } + + { + IntervalsSource source = Intervals.range(null, new BytesRef("anyone"), false, true); + checkIntervals( + source, + "field1", + 1, + new int[][] { + {4, 4}, + {}, + {}, + {}, + {}, + {} + }); + MatchesIterator mi = getMatches(source, 0, "field1"); + assertNotNull(mi); + assertMatch(mi, 4, 4, 23, 29); + } + } + public void testWrappedFilters() throws IOException { IntervalsSource source = Intervals.or( diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java index bb9d3ca63df..88d2adba5fa 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/KMeans.java @@ -25,11 +25,11 @@ import java.util.HashSet; import java.util.List; import java.util.Random; import java.util.Set; +import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.VectorUtil; import org.apache.lucene.util.hnsw.NeighborQueue; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; /** KMeans clustering algorithm for vectors */ public class KMeans { @@ -38,7 +38,7 @@ public class KMeans { public static final int DEFAULT_ITRS = 10; public static final int DEFAULT_SAMPLE_SIZE = 100_000; - private final RandomAccessVectorValues.Floats vectors; + private final FloatVectorValues vectors; private final int numVectors; private final int numCentroids; private final Random random; @@ -57,9 +57,7 @@ public class KMeans { * @throws IOException if there is an error accessing vectors */ public static Results cluster( - RandomAccessVectorValues.Floats vectors, - VectorSimilarityFunction similarityFunction, - int numClusters) + FloatVectorValues vectors, VectorSimilarityFunction similarityFunction, int numClusters) throws IOException { return cluster( vectors, @@ -93,7 +91,7 @@ public class KMeans { * @throws IOException if there is an error accessing vectors */ public static Results cluster( - RandomAccessVectorValues.Floats vectors, + FloatVectorValues vectors, int numClusters, boolean assignCentroidsToVectors, long seed, @@ -124,7 +122,7 @@ public class KMeans { if (numClusters == 1) { centroids = new float[1][vectors.dimension()]; } else { - RandomAccessVectorValues.Floats sampleVectors = + FloatVectorValues sampleVectors = vectors.size() <= sampleSize ?
vectors : createSampleReader(vectors, sampleSize, seed); KMeans kmeans = new KMeans(sampleVectors, numClusters, random, initializationMethod, restarts, iters); @@ -142,7 +140,7 @@ public class KMeans { } private KMeans( - RandomAccessVectorValues.Floats vectors, + FloatVectorValues vectors, int numCentroids, Random random, KmeansInitializationMethod initializationMethod, @@ -276,7 +274,7 @@ public class KMeans { * @throws IOException if there is an error accessing vector values */ private static double runKMeansStep( - RandomAccessVectorValues.Floats vectors, + FloatVectorValues vectors, float[][] centroids, short[] docCentroids, boolean useKahanSummation, @@ -348,9 +346,7 @@ public class KMeans { * descending distance to the current centroid set */ static void assignCentroids( - RandomAccessVectorValues.Floats vectors, - float[][] centroids, - List unassignedCentroidsIdxs) + FloatVectorValues vectors, float[][] centroids, List unassignedCentroidsIdxs) throws IOException { int[] assignedCentroidsIdxs = new int[centroids.length - unassignedCentroidsIdxs.size()]; int assignedIndex = 0; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/SampleReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/SampleReader.java index 9a718c81101..684c9fac838 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/SampleReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/quantization/SampleReader.java @@ -20,18 +20,18 @@ package org.apache.lucene.sandbox.codecs.quantization; import java.io.IOException; import java.util.Random; import java.util.function.IntUnaryOperator; +import org.apache.lucene.codecs.lucene95.HasIndexSlice; +import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.Bits; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; /** A reader of vector values that samples a subset of the vectors. 
*/ -public class SampleReader implements RandomAccessVectorValues.Floats { - private final RandomAccessVectorValues.Floats origin; +public class SampleReader extends FloatVectorValues implements HasIndexSlice { + private final FloatVectorValues origin; private final int sampleSize; private final IntUnaryOperator sampleFunction; - SampleReader( - RandomAccessVectorValues.Floats origin, int sampleSize, IntUnaryOperator sampleFunction) { + SampleReader(FloatVectorValues origin, int sampleSize, IntUnaryOperator sampleFunction) { this.origin = origin; this.sampleSize = sampleSize; this.sampleFunction = sampleFunction; @@ -48,13 +48,13 @@ public class SampleReader implements RandomAccessVectorValues.Floats { } @Override - public Floats copy() throws IOException { + public FloatVectorValues copy() throws IOException { throw new IllegalStateException("Not supported"); } @Override public IndexInput getSlice() { - return origin.getSlice(); + return ((HasIndexSlice) origin).getSlice(); } @Override @@ -77,8 +77,7 @@ public class SampleReader implements RandomAccessVectorValues.Floats { throw new IllegalStateException("Not supported"); } - public static SampleReader createSampleReader( - RandomAccessVectorValues.Floats origin, int k, long seed) { + public static SampleReader createSampleReader(FloatVectorValues origin, int k, long seed) { int[] samples = reservoirSample(origin.size(), k, seed); return new SampleReader(origin, samples.length, i -> samples[i]); } }
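createSampleReader relies on reservoirSample to pick k distinct ordinals; that helper's body falls outside this hunk, so the following is a standard Algorithm R sketch of what such a method does, offered as an assumption rather than the actual implementation:

```java
import java.util.Random;

// Uniformly sample min(k, n) distinct ordinals from [0, n) in one pass.
static int[] reservoirSample(int n, int k, long seed) {
  Random random = new Random(seed);
  int[] reservoir = new int[Math.min(k, n)];
  for (int i = 0; i < reservoir.length; i++) {
    reservoir[i] = i; // seed the reservoir with the first k ordinals
  }
  for (int i = reservoir.length; i < n; i++) {
    int j = random.nextInt(i + 1); // keep element i with probability k/(i+1)
    if (j < reservoir.length) {
      reservoir[j] = i;
    }
  }
  return reservoir;
}
```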
diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/idversion/TestIDVersionPostingsFormat.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/idversion/TestIDVersionPostingsFormat.java index 7f67e4767f6..a34fbfa6db5 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/idversion/TestIDVersionPostingsFormat.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/idversion/TestIDVersionPostingsFormat.java @@ -343,7 +343,7 @@ public class TestIDVersionPostingsFormat extends LuceneTestCase { /** Returns docID if found, else -1. */ public int lookup(BytesRef id, long version) throws IOException { - for (int seg = 0; seg < numSegs; seg++) { + for (int seg = 0; seg < numEnums; seg++) { if (((IDVersionSegmentTermsEnum) termsEnums[seg]).seekExact(id, version)) { if (VERBOSE) { System.out.println(" found in seg=" + termsEnums[seg]); diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/quantization/TestKMeans.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/quantization/TestKMeans.java index 61c0e58c91e..3669079b719 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/quantization/TestKMeans.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/quantization/TestKMeans.java @@ -20,9 +20,9 @@ package org.apache.lucene.sandbox.codecs.quantization; import java.io.IOException; import java.util.ArrayList; import java.util.List; +import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.tests.util.LuceneTestCase; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; public class TestKMeans extends LuceneTestCase { @@ -32,7 +32,7 @@ public class TestKMeans extends LuceneTestCase { int dims = random().nextInt(2, 20); int randIdx = random().nextInt(VectorSimilarityFunction.values().length); VectorSimilarityFunction similarityFunction = VectorSimilarityFunction.values()[randIdx]; - RandomAccessVectorValues.Floats vectors = generateData(nVectors, dims, nClusters); + FloatVectorValues vectors = generateData(nVectors, dims, nClusters); // default case { @@ -75,7 +75,7 @@ public class TestKMeans extends LuceneTestCase { // nClusters > nVectors int nClusters = 20; int nVectors = 10; - RandomAccessVectorValues.Floats vectors = generateData(nVectors, 5, nClusters); + FloatVectorValues vectors = generateData(nVectors, 5, nClusters); KMeans.Results results = KMeans.cluster(vectors, VectorSimilarityFunction.EUCLIDEAN, nClusters); // assert that we get 1 centroid, as nClusters will be adjusted @@ -87,7 +87,7 @@ public class TestKMeans extends LuceneTestCase { int sampleSize = 2; int nClusters = 2; int nVectors = 300; - RandomAccessVectorValues.Floats vectors = generateData(nVectors, 5, nClusters); + FloatVectorValues vectors = generateData(nVectors, 5, nClusters); KMeans.KmeansInitializationMethod initializationMethod = KMeans.KmeansInitializationMethod.PLUS_PLUS; KMeans.Results results = @@ -108,7 +108,7 @@ public class TestKMeans extends LuceneTestCase { // test unassigned centroids int nClusters = 4; int nVectors = 400; - RandomAccessVectorValues.Floats vectors = generateData(nVectors, 5, nClusters); + FloatVectorValues vectors = generateData(nVectors, 5, nClusters); KMeans.Results results = KMeans.cluster(vectors, VectorSimilarityFunction.EUCLIDEAN, nClusters); float[][] centroids = results.centroids(); @@ -118,8 +118,7 @@ public class TestKMeans extends LuceneTestCase { } } - private static RandomAccessVectorValues.Floats generateData( - int nSamples, int nDims, int nClusters) { + private static FloatVectorValues generateData(int nSamples, int nDims, int nClusters) { List vectors = new ArrayList<>(nSamples); float[][] centroids = new float[nClusters][nDims]; // Generate random centroids @@ -137,6 +136,6 @@ public class TestKMeans extends LuceneTestCase { } vectors.add(vector); } - return RandomAccessVectorValues.fromFloats(vectors, nDims); + return FloatVectorValues.fromFloats(vectors, nDims); } }
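With generateData returning a plain FloatVectorValues, a clustering round trip needs only the APIs visible in this patch. A sketch (the sizes and the empty list are illustrative):

```java
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.sandbox.codecs.quantization.KMeans;

// Wrap in-memory vectors and cluster them; centroids() comes from KMeans.Results.
List<float[]> vectors = new ArrayList<>(); // fill with nDims-length arrays
FloatVectorValues values = FloatVectorValues.fromFloats(vectors, nDims);
KMeans.Results results = KMeans.cluster(values, VectorSimilarityFunction.EUCLIDEAN, nClusters);
float[][] centroids = results.centroids();
```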
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingDocValuesFormat.java b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingDocValuesFormat.java index 619b5b02b80..046fd850304 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingDocValuesFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingDocValuesFormat.java @@ -27,6 +27,7 @@ import org.apache.lucene.index.DocValuesSkipIndexType; import org.apache.lucene.index.DocValuesSkipper; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; @@ -59,7 +60,8 @@ public class AssertingDocValuesFormat extends DocValuesFormat { assert state.fieldInfos.hasDocValues(); DocValuesProducer producer = in.fieldsProducer(state); assert producer != null; - return new AssertingDocValuesProducer(producer, state.segmentInfo.maxDoc(), false); + return new AssertingDocValuesProducer( + producer, state.fieldInfos, state.segmentInfo.maxDoc(), false); } static class AssertingDocValuesConsumer extends DocValuesConsumer { @@ -214,12 +216,15 @@ public class AssertingDocValuesFormat extends DocValuesFormat { static class AssertingDocValuesProducer extends DocValuesProducer { private final DocValuesProducer in; + private final FieldInfos fieldInfos; private final int maxDoc; private final boolean merging; private final Thread creationThread; - AssertingDocValuesProducer(DocValuesProducer in, int maxDoc, boolean merging) { + AssertingDocValuesProducer( + DocValuesProducer in, FieldInfos fieldInfos, int maxDoc, boolean merging) { this.in = in; + this.fieldInfos = fieldInfos; this.maxDoc = maxDoc; this.merging = merging; this.creationThread = Thread.currentThread(); @@ -229,6 +234,7 @@ public class AssertingDocValuesFormat extends DocValuesFormat { @Override public NumericDocValues getNumeric(FieldInfo field) throws IOException { + assert fieldInfos.fieldInfo(field.name).number == field.number; if (merging) { AssertingCodec.assertThread("DocValuesProducer", creationThread); } @@ -240,6 +246,7 @@ public class AssertingDocValuesFormat extends DocValuesFormat { @Override public BinaryDocValues getBinary(FieldInfo field) throws IOException { + assert fieldInfos.fieldInfo(field.name).number == field.number; if (merging) { AssertingCodec.assertThread("DocValuesProducer", creationThread); } @@ -251,6 +258,7 @@ public class AssertingDocValuesFormat extends DocValuesFormat { @Override public SortedDocValues getSorted(FieldInfo field) throws IOException { + assert fieldInfos.fieldInfo(field.name).number == field.number; if (merging) { AssertingCodec.assertThread("DocValuesProducer", creationThread); } @@ -262,6 +270,7 @@ public class AssertingDocValuesFormat extends DocValuesFormat { @Override public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { + assert fieldInfos.fieldInfo(field.name).number == field.number; if (merging) { AssertingCodec.assertThread("DocValuesProducer", creationThread); } @@ -273,6 +282,7 @@ public class AssertingDocValuesFormat extends DocValuesFormat { @Override public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { + assert fieldInfos.fieldInfo(field.name).number == field.number; if (merging) { AssertingCodec.assertThread("DocValuesProducer",
creationThread); } @@ -284,6 +294,7 @@ public class AssertingDocValuesFormat extends DocValuesFormat { @Override public DocValuesSkipper getSkipper(FieldInfo field) throws IOException { + assert fieldInfos.fieldInfo(field.name).number == field.number; assert field.docValuesSkipIndexType() != DocValuesSkipIndexType.NONE; DocValuesSkipper skipper = in.getSkipper(field); assert skipper != null; @@ -303,7 +314,7 @@ public class AssertingDocValuesFormat extends DocValuesFormat { @Override public DocValuesProducer getMergeInstance() { - return new AssertingDocValuesProducer(in.getMergeInstance(), maxDoc, true); + return new AssertingDocValuesProducer(in.getMergeInstance(), fieldInfos, maxDoc, true); } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingKnnVectorsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingKnnVectorsFormat.java index 501e2e5616f..21c62090a69 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingKnnVectorsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingKnnVectorsFormat.java @@ -125,7 +125,7 @@ public class AssertingKnnVectorsFormat extends KnnVectorsFormat { && fi.getVectorEncoding() == VectorEncoding.FLOAT32; FloatVectorValues floatValues = delegate.getFloatVectorValues(field); assert floatValues != null; - assert floatValues.docID() == -1; + assert floatValues.iterator().docID() == -1; assert floatValues.size() >= 0; assert floatValues.dimension() > 0; return floatValues; @@ -139,7 +139,7 @@ public class AssertingKnnVectorsFormat extends KnnVectorsFormat { && fi.getVectorEncoding() == VectorEncoding.BYTE; ByteVectorValues values = delegate.getByteVectorValues(field); assert values != null; - assert values.docID() == -1; + assert values.iterator().docID() == -1; assert values.size() >= 0; assert values.dimension() > 0; return values; diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/cranky/CrankyCompoundFormat.java b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/cranky/CrankyCompoundFormat.java index bced58d2a6f..4e0ac271859 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/cranky/CrankyCompoundFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/cranky/CrankyCompoundFormat.java @@ -34,9 +34,8 @@ class CrankyCompoundFormat extends CompoundFormat { } @Override - public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si, IOContext context) - throws IOException { - return delegate.getCompoundReader(dir, si, context); + public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si) throws IOException { + return delegate.getCompoundReader(dir, si); } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/AssertingLeafReader.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/AssertingLeafReader.java index 50300dc30bc..ae6e8813890 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/AssertingLeafReader.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/AssertingLeafReader.java @@ -21,6 +21,7 @@ import java.util.Arrays; import java.util.Iterator; import java.util.List; import java.util.Objects; +import java.util.RandomAccess; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.DocValuesSkipIndexType; @@ -700,7 +701,10 @@ 
public class AssertingLeafReader extends FilterLeafReader { public List<Impact> getImpacts(int level) { assert validFor == Math.max(impactsEnum.docID(), impactsEnum.lastShallowTarget) : "Cannot reuse impacts after advancing the iterator"; - return in.getImpacts(level); + List<Impact> impacts = in.getImpacts(level); + assert impacts.size() <= 1 || impacts instanceof RandomAccess + : "impact lists longer than 1 should implement RandomAccess but saw impacts = " + impacts; + return impacts; } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseCompoundFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseCompoundFormatTestCase.java index b0e30ef2272..0fe563d7550 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseCompoundFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseCompoundFormatTestCase.java @@ -64,7 +64,7 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest SegmentInfo si = newSegmentInfo(dir, "_123"); si.setFiles(Collections.emptySet()); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); assertEquals(0, cfs.listAll().length); cfs.close(); dir.close(); @@ -84,7 +84,7 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest si.setFiles(Collections.singleton(testfile)); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); IndexInput expected = dir.openInput(testfile, newIOContext(random())); IndexInput actual = cfs.openInput(testfile, newIOContext(random())); @@ -107,7 +107,7 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest si.setFiles(Arrays.asList(files)); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); for (String file : files) { IndexInput expected = dir.openInput(file, newIOContext(random())); @@ -136,7 +136,7 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest si.setFiles(Collections.singleton(testfile)); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); assertEquals(1, cfs.listAll().length); cfs.close(); cfs.close(); // second close should not throw exception @@ -215,10 +215,7 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest for (SegmentCommitInfo si : infos) { if (si.info.getUseCompoundFile()) { try (Directory cfsDir = - si.info - .getCodec() - .compoundFormat() - .getCompoundReader(dir, si.info, newIOContext(random()))) { + si.info.getCodec().compoundFormat().getCompoundReader(dir, si.info)) { for (String cfsFile : cfsDir.listAll()) { try (IndexInput cfsIn = cfsDir.openInput(cfsFile, IOContext.DEFAULT)) { assert cfsIn != null; @@ -237,7 +234,7 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest SegmentInfo si = newSegmentInfo(dir, "_123");
si.setFiles(Collections.emptyList()); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); expectThrows( UnsupportedOperationException.class, () -> { @@ -260,7 +257,7 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest SegmentInfo si = newSegmentInfo(dir, "_123"); si.setFiles(Collections.emptyList()); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); expectThrows( UnsupportedOperationException.class, () -> { @@ -283,7 +280,7 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest SegmentInfo si = newSegmentInfo(dir, "_123"); si.setFiles(Collections.emptyList()); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); expectThrows( UnsupportedOperationException.class, () -> { @@ -306,7 +303,7 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest SegmentInfo si = newSegmentInfo(dir, "_123"); si.setFiles(Collections.emptyList()); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); expectThrows( UnsupportedOperationException.class, () -> { @@ -329,7 +326,7 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest SegmentInfo si = newSegmentInfo(dir, "_123"); si.setFiles(Collections.emptyList()); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); expectThrows( UnsupportedOperationException.class, () -> { @@ -374,7 +371,7 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest si.setFiles(files); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); for (String file : files) { IndexInput check = dir.openInput(file, newIOContext(random())); @@ -411,7 +408,7 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest si.setFiles(files); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); final IndexInput[] ins = new IndexInput[FILE_COUNT]; for (int fileIdx = 0; fileIdx < FILE_COUNT; fileIdx++) { @@ -793,7 +790,7 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest si.setFiles(files); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); 
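The getCompoundReader rewrites repeated through this file are a pure signature change: the IOContext argument is dropped, so opening a compound directory is now just getCompoundReader(dir, si), and per-file contexts are presumably chosen when individual inputs are opened (the hunks show only the signature change, not the rationale). Before and after, schematically:

    // before: caller supplied an IOContext
    Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT);
    // after: no IOContext argument
    Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si);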
return cfs; } @@ -817,7 +814,7 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest si.setFiles(Collections.singletonList(subFile)); si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); - Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT); + Directory cfs = si.getCodec().compoundFormat().getCompoundReader(dir, si); IndexInput in = cfs.openInput(subFile, IOContext.DEFAULT); String desc = in.toString(); assertTrue( @@ -899,7 +896,7 @@ public abstract class BaseCompoundFormatTestCase extends BaseIndexFileFormatTest ReadBytesDirectoryWrapper readTrackingDir = new ReadBytesDirectoryWrapper(dir); CompoundDirectory compoundDir = - si.getCodec().compoundFormat().getCompoundReader(readTrackingDir, si, IOContext.DEFAULT); + si.getCodec().compoundFormat().getCompoundReader(readTrackingDir, si); compoundDir.checkIntegrity(); Map readBytes = readTrackingDir.getReadBytes(); assertEquals(createdFiles, readBytes.keySet()); diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseDocValuesFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseDocValuesFormatTestCase.java index a312b42a910..9b99aeecba6 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseDocValuesFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseDocValuesFormatTestCase.java @@ -24,6 +24,7 @@ import java.io.IOException; import java.io.PrintStream; import java.util.function.Supplier; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.NumericDocValuesField; @@ -31,22 +32,26 @@ import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.SortedNumericDocValuesField; import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.document.StringField; +import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.CheckIndex; import org.apache.lucene.index.CheckIndex.Status.DocValuesStatus; import org.apache.lucene.index.CodecReader; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.DocValuesSkipper; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.SerialMergeScheduler; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.util.TestUtil; @@ -832,4 +837,74 @@ public abstract class BaseDocValuesFormatTestCase extends LegacyBaseDocValuesFor int docID(); } + + public void testMismatchedFields() throws Exception { + Directory dir1 = newDirectory(); + IndexWriter w1 = new IndexWriter(dir1, newIndexWriterConfig()); + Document doc = new Document(); + doc.add(new BinaryDocValuesField("binary", new BytesRef("lucene"))); + doc.add(new 
NumericDocValuesField("numeric", 0L)); + doc.add(new SortedDocValuesField("sorted", new BytesRef("search"))); + doc.add(new SortedNumericDocValuesField("sorted_numeric", 1L)); + doc.add(new SortedSetDocValuesField("sorted_set", new BytesRef("engine"))); + w1.addDocument(doc); + + Directory dir2 = newDirectory(); + IndexWriter w2 = + new IndexWriter(dir2, newIndexWriterConfig().setMergeScheduler(new SerialMergeScheduler())); + w2.addDocument(doc); + w2.commit(); + + DirectoryReader reader = DirectoryReader.open(w1); + w1.close(); + w2.addIndexes(new MismatchedCodecReader((CodecReader) getOnlyLeafReader(reader), random())); + reader.close(); + w2.forceMerge(1); + reader = DirectoryReader.open(w2); + w2.close(); + + LeafReader leafReader = getOnlyLeafReader(reader); + + BinaryDocValues bdv = leafReader.getBinaryDocValues("binary"); + assertNotNull(bdv); + assertEquals(0, bdv.nextDoc()); + assertEquals(new BytesRef("lucene"), bdv.binaryValue()); + assertEquals(1, bdv.nextDoc()); + assertEquals(new BytesRef("lucene"), bdv.binaryValue()); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, bdv.nextDoc()); + + NumericDocValues ndv = leafReader.getNumericDocValues("numeric"); + assertNotNull(ndv); + assertEquals(0, ndv.nextDoc()); + assertEquals(0, ndv.longValue()); + assertEquals(1, ndv.nextDoc()); + assertEquals(0, ndv.longValue()); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, ndv.nextDoc()); + + SortedDocValues sdv = leafReader.getSortedDocValues("sorted"); + assertNotNull(sdv); + assertEquals(0, sdv.nextDoc()); + assertEquals(new BytesRef("search"), sdv.lookupOrd(sdv.ordValue())); + assertEquals(1, sdv.nextDoc()); + assertEquals(new BytesRef("search"), sdv.lookupOrd(sdv.ordValue())); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, sdv.nextDoc()); + + SortedNumericDocValues sndv = leafReader.getSortedNumericDocValues("sorted_numeric"); + assertNotNull(sndv); + assertEquals(0, sndv.nextDoc()); + assertEquals(1, sndv.nextValue()); + assertEquals(1, sndv.nextDoc()); + assertEquals(1, sndv.nextValue()); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, sndv.nextDoc()); + + SortedSetDocValues ssdv = leafReader.getSortedSetDocValues("sorted_set"); + assertNotNull(ssdv); + assertEquals(0, ssdv.nextDoc()); + assertEquals(new BytesRef("engine"), ssdv.lookupOrd(ssdv.nextOrd())); + assertEquals(1, ssdv.nextDoc()); + assertEquals(new BytesRef("engine"), ssdv.lookupOrd(ssdv.nextOrd())); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, ssdv.nextDoc()); + + IOUtils.close(reader, w2, dir1, dir2); + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java index 4c9165e1a10..752f21ea5d7 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java @@ -16,15 +16,21 @@ */ package org.apache.lucene.tests.index; +import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween; import static java.nio.charset.StandardCharsets.UTF_8; import static org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; +import java.io.BufferedReader; import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; +import 
java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicReference; @@ -55,6 +61,7 @@ import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.MergePolicy; @@ -63,11 +70,16 @@ import org.apache.lucene.index.MergeTrigger; import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.SerialMergeScheduler; import org.apache.lucene.index.StoredFields; import org.apache.lucene.index.Term; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.KnnFloatVectorQuery; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.TopDocs; @@ -78,6 +90,7 @@ import org.apache.lucene.store.FSDirectory; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.InfoStream; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.VectorUtil; @@ -435,9 +448,10 @@ public abstract class BaseKnnVectorsFormatTestCase extends BaseIndexFileFormatTe try (IndexReader reader = DirectoryReader.open(w2)) { LeafReader r = getOnlyLeafReader(reader); FloatVectorValues vectorValues = r.getFloatVectorValues(fieldName); - assertEquals(0, vectorValues.nextDoc()); - assertEquals(0, vectorValues.vectorValue()[0], 0); - assertEquals(NO_MORE_DOCS, vectorValues.nextDoc()); + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + assertEquals(0, iterator.nextDoc()); + assertEquals(0, vectorValues.vectorValue(0)[0], 0); + assertEquals(NO_MORE_DOCS, iterator.nextDoc()); } } } @@ -460,9 +474,10 @@ public abstract class BaseKnnVectorsFormatTestCase extends BaseIndexFileFormatTe try (IndexReader reader = DirectoryReader.open(w2)) { LeafReader r = getOnlyLeafReader(reader); FloatVectorValues vectorValues = r.getFloatVectorValues(fieldName); - assertNotEquals(NO_MORE_DOCS, vectorValues.nextDoc()); - assertEquals(0, vectorValues.vectorValue()[0], 0); - assertEquals(NO_MORE_DOCS, vectorValues.nextDoc()); + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + assertNotEquals(NO_MORE_DOCS, iterator.nextDoc()); + assertEquals(0, vectorValues.vectorValue(iterator.index())[0], 0); + assertEquals(NO_MORE_DOCS, iterator.nextDoc()); } } } @@ -487,12 +502,13 @@ public abstract class BaseKnnVectorsFormatTestCase extends BaseIndexFileFormatTe try (IndexReader reader = DirectoryReader.open(w2)) { LeafReader r = getOnlyLeafReader(reader); FloatVectorValues vectorValues = r.getFloatVectorValues(fieldName); - assertEquals(0, vectorValues.nextDoc()); + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + assertEquals(0, iterator.nextDoc()); // The merge order is randomized, we might get 0 first, or 1 - float value = vectorValues.vectorValue()[0]; + float value = 
vectorValues.vectorValue(0)[0]; assertTrue(value == 0 || value == 1); - assertEquals(1, vectorValues.nextDoc()); - value += vectorValues.vectorValue()[0]; + assertEquals(1, iterator.nextDoc()); + value += vectorValues.vectorValue(1)[0]; assertEquals(1, value, 0); } } @@ -877,8 +893,10 @@ public abstract class BaseKnnVectorsFormatTestCase extends BaseIndexFileFormatTe ByteVectorValues byteVectorValues = ctx.reader().getByteVectorValues(fieldName); if (byteVectorValues != null) { docCount += byteVectorValues.size(); - while (byteVectorValues.nextDoc() != NO_MORE_DOCS) { - checksum += byteVectorValues.vectorValue()[0]; + KnnVectorValues.DocIndexIterator iterator = byteVectorValues.iterator(); + while (iterator.nextDoc() != NO_MORE_DOCS) { + checksum += byteVectorValues.vectorValue(iterator.index())[0]; } } } @@ -888,8 +906,10 @@ public abstract class BaseKnnVectorsFormatTestCase extends BaseIndexFileFormatTe FloatVectorValues vectorValues = ctx.reader().getFloatVectorValues(fieldName); if (vectorValues != null) { docCount += vectorValues.size(); - while (vectorValues.nextDoc() != NO_MORE_DOCS) { - checksum += vectorValues.vectorValue()[0]; + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + while (iterator.nextDoc() != NO_MORE_DOCS) { + checksum += vectorValues.vectorValue(iterator.index())[0]; } } } @@ -948,10 +968,12 @@ public abstract class BaseKnnVectorsFormatTestCase extends BaseIndexFileFormatTe assertSame(iterator, scorer.iterator()); assertNotSame(iterator, scorer); // verify scorer iteration scores are valid & iteration with vectorValues is consistent - while (iterator.nextDoc() != NO_MORE_DOCS && vectorValues.nextDoc() != NO_MORE_DOCS) { + KnnVectorValues.DocIndexIterator valuesIterator = vectorValues.iterator(); + while (iterator.nextDoc() != NO_MORE_DOCS && valuesIterator.nextDoc() != NO_MORE_DOCS) { float score = scorer.score(); assertTrue(score >= 0f); - assertEquals(iterator.docID(), vectorValues.docID()); + assertEquals(iterator.docID(), valuesIterator.docID()); } // verify that a new scorer can be obtained after iteration VectorScorer newScorer = vectorValues.scorer(vectorToScore); @@ -1007,10 +1029,12 @@ public abstract class BaseKnnVectorsFormatTestCase extends BaseIndexFileFormatTe assertSame(iterator, scorer.iterator()); assertNotSame(iterator, scorer); // verify scorer iteration scores are valid & iteration with vectorValues is consistent - while (iterator.nextDoc() != NO_MORE_DOCS && vectorValues.nextDoc() != NO_MORE_DOCS) { + KnnVectorValues.DocIndexIterator valuesIterator = vectorValues.iterator(); + while (iterator.nextDoc() != NO_MORE_DOCS && valuesIterator.nextDoc() != NO_MORE_DOCS) { float score = scorer.score(); assertTrue(score >= 0f); - assertEquals(iterator.docID(), vectorValues.docID()); + assertEquals(iterator.docID(), valuesIterator.docID()); } // verify that a new scorer can be obtained after iteration VectorScorer newScorer = vectorValues.scorer(vectorToScore); @@ -1116,12 +1140,16 @@ public abstract class BaseKnnVectorsFormatTestCase extends BaseIndexFileFormatTe LeafReader r = getOnlyLeafReader(reader); FloatVectorValues vectorValues = r.getFloatVectorValues(fieldName); assertEquals(3, vectorValues.size()); - vectorValues.nextDoc(); - assertEquals(1, vectorValues.vectorValue()[0], 0); - vectorValues.nextDoc(); - assertEquals(1, vectorValues.vectorValue()[0], 0); - vectorValues.nextDoc(); - assertEquals(2, vectorValues.vectorValue()[0], 0); +
KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + iterator.nextDoc(); + assertEquals(0, iterator.index()); + assertEquals(1, vectorValues.vectorValue(0)[0], 0); + iterator.nextDoc(); + assertEquals(1, iterator.index()); + assertEquals(1, vectorValues.vectorValue(1)[0], 0); + iterator.nextDoc(); + assertEquals(2, iterator.index()); + assertEquals(2, vectorValues.vectorValue(2)[0], 0); } } } @@ -1144,13 +1172,14 @@ public abstract class BaseKnnVectorsFormatTestCase extends BaseIndexFileFormatTe FloatVectorValues vectorValues = leaf.getFloatVectorValues(fieldName); assertEquals(2, vectorValues.dimension()); assertEquals(3, vectorValues.size()); - assertEquals("1", storedFields.document(vectorValues.nextDoc()).get("id")); - assertEquals(-1f, vectorValues.vectorValue()[0], 0); - assertEquals("2", storedFields.document(vectorValues.nextDoc()).get("id")); - assertEquals(1, vectorValues.vectorValue()[0], 0); - assertEquals("4", storedFields.document(vectorValues.nextDoc()).get("id")); - assertEquals(0, vectorValues.vectorValue()[0], 0); - assertEquals(NO_MORE_DOCS, vectorValues.nextDoc()); + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + assertEquals("1", storedFields.document(iterator.nextDoc()).get("id")); + assertEquals(-1f, vectorValues.vectorValue(0)[0], 0); + assertEquals("2", storedFields.document(iterator.nextDoc()).get("id")); + assertEquals(1, vectorValues.vectorValue(1)[0], 0); + assertEquals("4", storedFields.document(iterator.nextDoc()).get("id")); + assertEquals(0, vectorValues.vectorValue(2)[0], 0); + assertEquals(NO_MORE_DOCS, iterator.nextDoc()); } } } @@ -1173,13 +1202,13 @@ public abstract class BaseKnnVectorsFormatTestCase extends BaseIndexFileFormatTe ByteVectorValues vectorValues = leaf.getByteVectorValues(fieldName); assertEquals(2, vectorValues.dimension()); assertEquals(3, vectorValues.size()); - assertEquals("1", storedFields.document(vectorValues.nextDoc()).get("id")); - assertEquals(-1, vectorValues.vectorValue()[0], 0); - assertEquals("2", storedFields.document(vectorValues.nextDoc()).get("id")); - assertEquals(1, vectorValues.vectorValue()[0], 0); - assertEquals("4", storedFields.document(vectorValues.nextDoc()).get("id")); - assertEquals(0, vectorValues.vectorValue()[0], 0); - assertEquals(NO_MORE_DOCS, vectorValues.nextDoc()); + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + assertEquals("1", storedFields.document(iterator.nextDoc()).get("id")); + assertEquals(-1, vectorValues.vectorValue(0)[0], 0); + assertEquals("2", storedFields.document(iterator.nextDoc()).get("id")); + assertEquals(1, vectorValues.vectorValue(1)[0], 0); + assertEquals("4", storedFields.document(iterator.nextDoc()).get("id")); + assertEquals(0, vectorValues.vectorValue(2)[0], 0); + assertEquals(NO_MORE_DOCS, iterator.nextDoc()); } } }
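The rewrites in this file all follow one recipe, worth stating once: float and byte vector values no longer act as their own DocIdSetIterator. Traversal moves to a KnnVectorValues.DocIndexIterator, and vectors are fetched by ordinal via vectorValue(ord), where the ordinal comes from iterator.index(). A minimal consumption loop in the new style (field name assumed for illustration; NO_MORE_DOCS statically imported as in the test above):

    FloatVectorValues values = leafReader.getFloatVectorValues("field");
    KnnVectorValues.DocIndexIterator it = values.iterator();
    for (int doc = it.nextDoc(); doc != NO_MORE_DOCS; doc = it.nextDoc()) {
      // it.index() is the vector ordinal, distinct from the doc id
      float[] vector = values.vectorValue(it.index());
    }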
@@ -1209,27 +1238,30 @@ public abstract class BaseKnnVectorsFormatTestCase extends BaseIndexFileFormatTe FloatVectorValues vectorValues = leaf.getFloatVectorValues("field1"); assertEquals(2, vectorValues.dimension()); assertEquals(2, vectorValues.size()); - vectorValues.nextDoc(); - assertEquals(1f, vectorValues.vectorValue()[0], 0); - vectorValues.nextDoc(); - assertEquals(2f, vectorValues.vectorValue()[0], 0); - assertEquals(NO_MORE_DOCS, vectorValues.nextDoc()); + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + iterator.nextDoc(); + assertEquals(1f, vectorValues.vectorValue(0)[0], 0); + iterator.nextDoc(); + assertEquals(2f, vectorValues.vectorValue(1)[0], 0); + assertEquals(NO_MORE_DOCS, iterator.nextDoc()); FloatVectorValues vectorValues2 = leaf.getFloatVectorValues("field2"); + KnnVectorValues.DocIndexIterator it2 = vectorValues2.iterator(); assertEquals(4, vectorValues2.dimension()); assertEquals(2, vectorValues2.size()); - vectorValues2.nextDoc(); - assertEquals(2f, vectorValues2.vectorValue()[1], 0); - vectorValues2.nextDoc(); - assertEquals(2f, vectorValues2.vectorValue()[1], 0); - assertEquals(NO_MORE_DOCS, vectorValues2.nextDoc()); + it2.nextDoc(); + assertEquals(2f, vectorValues2.vectorValue(0)[1], 0); + it2.nextDoc(); + assertEquals(2f, vectorValues2.vectorValue(1)[1], 0); + assertEquals(NO_MORE_DOCS, it2.nextDoc()); FloatVectorValues vectorValues3 = leaf.getFloatVectorValues("field3"); assertEquals(4, vectorValues3.dimension()); assertEquals(1, vectorValues3.size()); - vectorValues3.nextDoc(); - assertEquals(1f, vectorValues3.vectorValue()[0], 0.1); - assertEquals(NO_MORE_DOCS, vectorValues3.nextDoc()); + KnnVectorValues.DocIndexIterator it3 = vectorValues3.iterator(); + it3.nextDoc(); + assertEquals(1f, vectorValues3.vectorValue(0)[0], 0.1); + assertEquals(NO_MORE_DOCS, it3.nextDoc()); } } } @@ -1293,13 +1325,15 @@ public abstract class BaseKnnVectorsFormatTestCase extends BaseIndexFileFormatTe totalSize += vectorValues.size(); StoredFields storedFields = ctx.reader().storedFields(); int docId; - while ((docId = vectorValues.nextDoc()) != NO_MORE_DOCS) { - float[] v = vectorValues.vectorValue(); + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + while ((docId = iterator.nextDoc()) != NO_MORE_DOCS) { + float[] v = vectorValues.vectorValue(iterator.index()); assertEquals(dimension, v.length); String idString = storedFields.document(docId).getField("id").stringValue(); int id = Integer.parseInt(idString); if (ctx.reader().getLiveDocs() == null || ctx.reader().getLiveDocs().get(docId)) { - assertArrayEquals(idString, values[id], v, 0); + assertArrayEquals(idString + " " + docId, values[id], v, 0); ++valueCount; } else { ++numDeletes; @@ -1373,8 +1407,10 @@ public abstract class BaseKnnVectorsFormatTestCase extends BaseIndexFileFormatTe totalSize += vectorValues.size(); StoredFields storedFields = ctx.reader().storedFields(); int docId; - while ((docId = vectorValues.nextDoc()) != NO_MORE_DOCS) { - byte[] v = vectorValues.vectorValue(); + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + while ((docId = iterator.nextDoc()) != NO_MORE_DOCS) { + byte[] v = vectorValues.vectorValue(iterator.index()); assertEquals(dimension, v.length); String idString = storedFields.document(docId).getField("id").stringValue(); int id = Integer.parseInt(idString); @@ -1493,8 +1529,10 @@ public abstract class BaseKnnVectorsFormatTestCase extends BaseIndexFileFormatTe StoredFields storedFields = ctx.reader().storedFields(); int docId; int numLiveDocsWithVectors = 0; - while ((docId = vectorValues.nextDoc()) != NO_MORE_DOCS) { - float[] v = vectorValues.vectorValue(); + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + while ((docId = iterator.nextDoc()) != NO_MORE_DOCS) { + float[] v = vectorValues.vectorValue(iterator.index()); assertEquals(dimension, v.length); String idString = storedFields.document(docId).getField("id").stringValue(); int id = Integer.parseInt(idString); @@ -1701,25 +1739,27 @@ public abstract class BaseKnnVectorsFormatTestCase extends BaseIndexFileFormatTe FloatVectorValues vectorValues =
r.getFloatVectorValues(fieldName); int[] vectorDocs = new int[vectorValues.size() + 1]; int cur = -1; + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); while (++cur < vectorValues.size() + 1) { - vectorDocs[cur] = vectorValues.nextDoc(); + vectorDocs[cur] = iterator.nextDoc(); if (cur != 0) { assertTrue(vectorDocs[cur] > vectorDocs[cur - 1]); } } vectorValues = r.getFloatVectorValues(fieldName); + DocIdSetIterator iter = vectorValues.iterator(); cur = -1; for (int i = 0; i < numdocs; i++) { // randomly advance to i if (random().nextInt(4) == 3) { while (vectorDocs[++cur] < i) {} - assertEquals(vectorDocs[cur], vectorValues.advance(i)); - assertEquals(vectorDocs[cur], vectorValues.docID()); - if (vectorValues.docID() == NO_MORE_DOCS) { + assertEquals(vectorDocs[cur], iter.advance(i)); + assertEquals(vectorDocs[cur], iter.docID()); + if (iter.docID() == NO_MORE_DOCS) { break; } // make i equal to docid so that it is greater than docId in the next loop iteration - i = vectorValues.docID(); + i = iter.docID(); } } } @@ -1770,6 +1810,7 @@ public abstract class BaseKnnVectorsFormatTestCase extends BaseIndexFileFormatTe double checksum = 0; int docCount = 0; long sumDocIds = 0; + long sumOrdToDocIds = 0; switch (vectorEncoding) { case BYTE -> { for (LeafReaderContext ctx : r.leaves()) { @@ -1777,11 +1818,18 @@ public abstract class BaseKnnVectorsFormatTestCase extends BaseIndexFileFormatTe if (byteVectorValues != null) { docCount += byteVectorValues.size(); StoredFields storedFields = ctx.reader().storedFields(); - while (byteVectorValues.nextDoc() != NO_MORE_DOCS) { - checksum += byteVectorValues.vectorValue()[0]; - Document doc = storedFields.document(byteVectorValues.docID(), Set.of("id")); + KnnVectorValues.DocIndexIterator iter = byteVectorValues.iterator(); + for (iter.nextDoc(); iter.docID() != NO_MORE_DOCS; iter.nextDoc()) { + int ord = iter.index(); + checksum += byteVectorValues.vectorValue(ord)[0]; + Document doc = storedFields.document(iter.docID(), Set.of("id")); sumDocIds += Integer.parseInt(doc.get("id")); } + for (int ord = 0; ord < byteVectorValues.size(); ord++) { + Document doc = + storedFields.document(byteVectorValues.ordToDoc(ord), Set.of("id")); + sumOrdToDocIds += Integer.parseInt(doc.get("id")); + } } } } @@ -1791,11 +1839,17 @@ public abstract class BaseKnnVectorsFormatTestCase extends BaseIndexFileFormatTe if (vectorValues != null) { docCount += vectorValues.size(); StoredFields storedFields = ctx.reader().storedFields(); - while (vectorValues.nextDoc() != NO_MORE_DOCS) { - checksum += vectorValues.vectorValue()[0]; - Document doc = storedFields.document(vectorValues.docID(), Set.of("id")); + KnnVectorValues.DocIndexIterator iter = vectorValues.iterator(); + for (iter.nextDoc(); iter.docID() != NO_MORE_DOCS; iter.nextDoc()) { + int ord = iter.index(); + checksum += vectorValues.vectorValue(ord)[0]; + Document doc = storedFields.document(iter.docID(), Set.of("id")); sumDocIds += Integer.parseInt(doc.get("id")); } + for (int ord = 0; ord < vectorValues.size(); ord++) { + Document doc = storedFields.document(vectorValues.ordToDoc(ord), Set.of("id")); + sumOrdToDocIds += Integer.parseInt(doc.get("id")); + } } } } @@ -1807,7 +1861,223 @@ public abstract class BaseKnnVectorsFormatTestCase extends BaseIndexFileFormatTe vectorEncoding == VectorEncoding.BYTE ? 
numDocs * 0.2 : 1e-5); assertEquals(fieldDocCount, docCount); assertEquals(fieldSumDocIDs, sumDocIds); + assertEquals(fieldSumDocIDs, sumOrdToDocIds); } } } + + public void testMismatchedFields() throws Exception { + Directory dir1 = newDirectory(); + IndexWriter w1 = new IndexWriter(dir1, newIndexWriterConfig()); + Document doc = new Document(); + doc.add(new KnnFloatVectorField("float", new float[] {1f, 2f})); + doc.add(new KnnByteVectorField("byte", new byte[] {42})); + w1.addDocument(doc); + + Directory dir2 = newDirectory(); + IndexWriter w2 = + new IndexWriter(dir2, newIndexWriterConfig().setMergeScheduler(new SerialMergeScheduler())); + w2.addDocument(doc); + w2.commit(); + + DirectoryReader reader = DirectoryReader.open(w1); + w1.close(); + w2.addIndexes(new MismatchedCodecReader((CodecReader) getOnlyLeafReader(reader), random())); + reader.close(); + w2.forceMerge(1); + reader = DirectoryReader.open(w2); + w2.close(); + + LeafReader leafReader = getOnlyLeafReader(reader); + + ByteVectorValues byteVectors = leafReader.getByteVectorValues("byte"); + assertNotNull(byteVectors); + KnnVectorValues.DocIndexIterator iter = byteVectors.iterator(); + assertEquals(0, iter.nextDoc()); + assertArrayEquals(new byte[] {42}, byteVectors.vectorValue(0)); + assertEquals(1, iter.nextDoc()); + assertArrayEquals(new byte[] {42}, byteVectors.vectorValue(1)); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, iter.nextDoc()); + + FloatVectorValues floatVectors = leafReader.getFloatVectorValues("float"); + assertNotNull(floatVectors); + iter = floatVectors.iterator(); + assertEquals(0, iter.nextDoc()); + float[] vector = floatVectors.vectorValue(0); + assertEquals(2, vector.length); + assertEquals(1f, vector[0], 0f); + assertEquals(2f, vector[1], 0f); + assertEquals(1, iter.nextDoc()); + vector = floatVectors.vectorValue(1); + assertEquals(2, vector.length); + assertEquals(1f, vector[0], 0f); + assertEquals(2f, vector[1], 0f); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, iter.nextDoc()); + + IOUtils.close(reader, w2, dir1, dir2); + } + + /** + * Test that the query is a viable approximation to exact search. This test is designed to uncover + * gross failures only, not to represent the true expected recall. 
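+ * <p>Recall is measured as overlap between the approximate top-k and the exact top-k: with the 8 test queries below and topK = 10 there are 80 candidate hits, so the [0.5, 1.0] bounds assert that at least 40 and at most 80 of them are recalled.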
+ */ + public void testRecall() throws IOException { + VectorSimilarityFunction[] functions = { + VectorSimilarityFunction.EUCLIDEAN, + VectorSimilarityFunction.COSINE, + VectorSimilarityFunction.DOT_PRODUCT, + VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT + }; + for (VectorSimilarityFunction similarity : functions) { + assertRecall(similarity, 0.5, 1.0); + } + } + + protected void assertRecall(VectorSimilarityFunction similarity, double min, double max) + throws IOException { + int dim = 16; + int recalled = 0; + try (Directory indexStore = getKnownIndexStore("field", dim, similarity); + IndexReader reader = DirectoryReader.open(indexStore)) { + IndexSearcher searcher = newSearcher(reader); + float[] queryEmbedding = new float[dim]; + // indexed 421 lines from LICENSE.txt + // indexed 157 lines from NOTICE.txt + int topK = 10; + int numDocs = 578; + String[] testQueries = { + "Apache Lucene", + "Apache License", + "TERMS AND CONDITIONS", + "Copyright 2001", + "Permission is hereby", + "Copyright © 2003", + "The dictionary comes from Morfologik project", + "The levenshtein automata tables" + }; + for (String queryString : testQueries) { + computeLineEmbedding(queryString, queryEmbedding); + + // pass match-all "filter" to force full traversal, bypassing graph + KnnFloatVectorQuery exactQuery = + new KnnFloatVectorQuery("field", queryEmbedding, 1000, new MatchAllDocsQuery()); + assertEquals(numDocs, searcher.count(exactQuery)); // Same for exact search + + KnnFloatVectorQuery query = new KnnFloatVectorQuery("field", queryEmbedding, topK); + assertEquals(10, searcher.count(query)); // Expect some results without timeout + TopDocs results = searcher.search(query, topK); + Set<Integer> resultDocs = new HashSet<>(); + int i = 0; + for (ScoreDoc scoreDoc : results.scoreDocs) { + if (VERBOSE) { + System.out.println( + "result " + + i++ + + ": " + + reader.storedFields().document(scoreDoc.doc) + + " " + + scoreDoc); + } + resultDocs.add(scoreDoc.doc); + } + TopDocs expected = searcher.search(exactQuery, topK); + i = 0; + for (ScoreDoc scoreDoc : expected.scoreDocs) { + if (VERBOSE) { + System.out.println( + "expected " + + i++ + + ": " + + reader.storedFields().document(scoreDoc.doc) + + " " + + scoreDoc); + } + if (resultDocs.contains(scoreDoc.doc)) { + ++recalled; + } + } + } + int totalResults = testQueries.length * topK; + assertTrue( + "Average recall for " + + similarity + + " should be at least " + + (totalResults * min) + + " / " + + totalResults + + ", got " + + recalled, + recalled >= (int) (totalResults * min)); + assertTrue( + "Average recall for " + + similarity + + " should be no more than " + + (totalResults * max) + + " / " + + totalResults + + ", got " + + recalled, + recalled <= (int) (totalResults * max)); + } + } + + /** Creates a new directory and adds documents with the given vectors as kNN vector fields */ + Directory getKnownIndexStore( + String field, int dimension, VectorSimilarityFunction vectorSimilarityFunction) + throws IOException { + Directory indexStore = newDirectory(random()); + IndexWriter writer = new IndexWriter(indexStore, newIndexWriterConfig()); + float[] scratch = new float[dimension]; + for (String file : List.of("LICENSE.txt", "NOTICE.txt")) { + try (InputStream in = BaseKnnVectorsFormatTestCase.class.getResourceAsStream(file); + BufferedReader reader = new BufferedReader(new InputStreamReader(in, UTF_8))) { + String line; + int lineNo = -1; + while ((line = reader.readLine()) != null) { + line = line.strip(); + if (line.isEmpty()) { + continue; + } +
++lineNo; + Document doc = new Document(); + doc.add( + new KnnFloatVectorField( + field, computeLineEmbedding(line, scratch), vectorSimilarityFunction)); + doc.add(new StoredField("text", line)); + doc.add(new StringField("id", file + "." + lineNo, Field.Store.YES)); + writer.addDocument(doc); + if (random().nextBoolean()) { + // Add some documents without a vector + addDocuments(writer, "id" + lineNo + ".", randomIntBetween(1, 5)); + } + } + // System.out.println("indexed " + (lineNo + 1) + " lines from " + file); + } + } + // Add some documents without a vector nor an id + addDocuments(writer, null, 5); + writer.close(); + return indexStore; + } + + private float[] computeLineEmbedding(String line, float[] vector) { + Arrays.fill(vector, 0); + for (int i = 0; i < line.length(); i++) { + char c = line.charAt(i); + vector[i % vector.length] += c / ((float) (i + 1) / vector.length); + } + VectorUtil.l2normalize(vector, false); + return vector; + } + + private void addDocuments(IndexWriter writer, String idBase, int count) throws IOException { + for (int i = 0; i < count; i++) { + Document doc = new Document(); + doc.add(new StringField("other", "value", Field.Store.NO)); + if (idBase != null) { + doc.add(new StringField("id", idBase + i, Field.Store.YES)); + } + writer.addDocument(doc); + } + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePointsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePointsFormatTestCase.java index 265e3f073be..a15dd07a79e 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePointsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePointsFormatTestCase.java @@ -30,6 +30,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.IntPoint; +import org.apache.lucene.document.LongPoint; import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.StringField; import org.apache.lucene.index.CodecReader; @@ -46,6 +47,7 @@ import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.PointValues; import org.apache.lucene.index.PointValues.IntersectVisitor; import org.apache.lucene.index.PointValues.Relation; +import org.apache.lucene.index.SerialMergeScheduler; import org.apache.lucene.index.Term; import org.apache.lucene.internal.tests.ConcurrentMergeSchedulerAccess; import org.apache.lucene.internal.tests.TestSecrets; @@ -1408,4 +1410,80 @@ public abstract class BasePointsFormatTestCase extends BaseIndexFileFormatTestCa } }; } + + public void testMismatchedFields() throws Exception { + Directory dir1 = newDirectory(); + IndexWriter w1 = new IndexWriter(dir1, newIndexWriterConfig()); + Document doc = new Document(); + doc.add(new LongPoint("f", 1L)); + doc.add(new LongPoint("g", 42L, 43L)); + w1.addDocument(doc); + + Directory dir2 = newDirectory(); + IndexWriter w2 = + new IndexWriter(dir2, newIndexWriterConfig().setMergeScheduler(new SerialMergeScheduler())); + w2.addDocument(doc); + w2.commit(); + + DirectoryReader reader = DirectoryReader.open(w1); + w1.close(); + w2.addIndexes(new MismatchedCodecReader((CodecReader) getOnlyLeafReader(reader), random())); + reader.close(); + w2.forceMerge(1); + reader = DirectoryReader.open(w2); + w2.close(); + + LeafReader leafReader = getOnlyLeafReader(reader); + assertEquals(2, leafReader.maxDoc()); + + PointValues fPoints = 
leafReader.getPointValues("f"); + assertEquals(2, fPoints.size()); + fPoints.intersect( + new IntersectVisitor() { + + int expectedDoc = 0; + + @Override + public void visit(int docID, byte[] packedValue) throws IOException { + assertEquals(LongPoint.pack(1L), new BytesRef(packedValue)); + assertEquals(expectedDoc++, docID); + } + + @Override + public void visit(int docID) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { + return Relation.CELL_CROSSES_QUERY; + } + }); + + PointValues gPoints = leafReader.getPointValues("g"); + assertEquals(2, fPoints.size()); + gPoints.intersect( + new IntersectVisitor() { + + int expectedDoc = 0; + + @Override + public void visit(int docID, byte[] packedValue) throws IOException { + assertEquals(LongPoint.pack(42L, 43L), new BytesRef(packedValue)); + assertEquals(expectedDoc++, docID); + } + + @Override + public void visit(int docID) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { + return Relation.CELL_CROSSES_QUERY; + } + }); + + IOUtils.close(reader, w2, dir1, dir2); + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java index 8f8233ee680..8e0292b3f8d 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java @@ -42,6 +42,7 @@ import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; +import org.apache.lucene.index.CodecReader; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.Fields; import org.apache.lucene.index.IndexOptions; @@ -54,6 +55,7 @@ import org.apache.lucene.index.MultiTerms; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.SerialMergeScheduler; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; @@ -71,6 +73,7 @@ import org.apache.lucene.tests.util.LineFileDocs; import org.apache.lucene.tests.util.RamUsageTester; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; import org.junit.AfterClass; import org.junit.BeforeClass; @@ -1728,4 +1731,41 @@ public abstract class BasePostingsFormatTestCase extends BaseIndexFileFormatTest TestUtil.checkIndex(dir); } } + + public void testMismatchedFields() throws Exception { + Directory dir1 = newDirectory(); + IndexWriter w1 = new IndexWriter(dir1, newIndexWriterConfig()); + Document doc = new Document(); + doc.add(new StringField("f", "a", Store.NO)); + doc.add(new StringField("g", "b", Store.NO)); + w1.addDocument(doc); + + Directory dir2 = newDirectory(); + IndexWriter w2 = + new IndexWriter(dir2, newIndexWriterConfig().setMergeScheduler(new SerialMergeScheduler())); + w2.addDocument(doc); + w2.commit(); + + DirectoryReader reader = DirectoryReader.open(w1); + w1.close(); + w2.addIndexes(new MismatchedCodecReader((CodecReader) getOnlyLeafReader(reader), random())); + 
reader.close(); + w2.forceMerge(1); + reader = DirectoryReader.open(w2); + w2.close(); + + LeafReader leafReader = getOnlyLeafReader(reader); + + TermsEnum te = leafReader.terms("f").iterator(); + assertEquals("a", te.next().utf8ToString()); + assertEquals(2, te.docFreq()); + assertNull(te.next()); + + te = leafReader.terms("g").iterator(); + assertEquals("b", te.next().utf8ToString()); + assertEquals(2, te.docFreq()); + assertNull(te.next()); + + IOUtils.close(reader, w2, dir1, dir2); + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/MismatchedCodecReader.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/MismatchedCodecReader.java new file mode 100644 index 00000000000..8c856aafcba --- /dev/null +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/MismatchedCodecReader.java @@ -0,0 +1,216 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.tests.index; + +import java.io.IOException; +import java.util.Objects; +import java.util.Random; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.codecs.NormsProducer; +import org.apache.lucene.codecs.StoredFieldsReader; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.CodecReader; +import org.apache.lucene.index.DocValuesSkipper; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.FilterCodecReader; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.StoredFieldVisitor; + +/** + * Shuffles field numbers around to try to trip bugs where field numbers are assumed to always be + * consistent across segments. + */ +public class MismatchedCodecReader extends FilterCodecReader { + + private final FieldInfos shuffled; + + /** Sole constructor. 
*/ + public MismatchedCodecReader(CodecReader in, Random random) { + super(in); + shuffled = MismatchedLeafReader.shuffleInfos(in.getFieldInfos(), random); + } + + @Override + public FieldInfos getFieldInfos() { + return shuffled; + } + + @Override + public CacheHelper getCoreCacheHelper() { + return in.getCoreCacheHelper(); + } + + @Override + public CacheHelper getReaderCacheHelper() { + return in.getReaderCacheHelper(); + } + + @Override + public StoredFieldsReader getFieldsReader() { + StoredFieldsReader in = super.getFieldsReader(); + if (in == null) { + return null; + } + return new MismatchedStoredFieldsReader(in, shuffled); + } + + private static class MismatchedStoredFieldsReader extends StoredFieldsReader { + + private final StoredFieldsReader in; + private final FieldInfos shuffled; + + MismatchedStoredFieldsReader(StoredFieldsReader in, FieldInfos shuffled) { + this.in = Objects.requireNonNull(in); + this.shuffled = shuffled; + } + + @Override + public void close() throws IOException { + in.close(); + } + + @Override + public StoredFieldsReader clone() { + return new MismatchedStoredFieldsReader(in.clone(), shuffled); + } + + @Override + public void checkIntegrity() throws IOException { + in.checkIntegrity(); + } + + @Override + public void document(int docID, StoredFieldVisitor visitor) throws IOException { + in.document(docID, new MismatchedLeafReader.MismatchedVisitor(visitor, shuffled)); + } + } + + @Override + public DocValuesProducer getDocValuesReader() { + DocValuesProducer in = super.getDocValuesReader(); + if (in == null) { + return null; + } + return new MismatchedDocValuesProducer(in, shuffled, super.getFieldInfos()); + } + + private static class MismatchedDocValuesProducer extends DocValuesProducer { + + private final DocValuesProducer in; + private final FieldInfos shuffled; + private final FieldInfos orig; + + MismatchedDocValuesProducer(DocValuesProducer in, FieldInfos shuffled, FieldInfos orig) { + this.in = Objects.requireNonNull(in); + this.shuffled = shuffled; + this.orig = orig; + } + + @Override + public void close() throws IOException { + in.close(); + } + + private FieldInfo remapFieldInfo(FieldInfo field) { + FieldInfo fi = shuffled.fieldInfo(field.name); + assert fi != null && fi.number == field.number; + return orig.fieldInfo(field.name); + } + + @Override + public NumericDocValues getNumeric(FieldInfo field) throws IOException { + return in.getNumeric(remapFieldInfo(field)); + } + + @Override + public BinaryDocValues getBinary(FieldInfo field) throws IOException { + return in.getBinary(remapFieldInfo(field)); + } + + @Override + public SortedDocValues getSorted(FieldInfo field) throws IOException { + return in.getSorted(remapFieldInfo(field)); + } + + @Override + public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { + return in.getSortedNumeric(remapFieldInfo(field)); + } + + @Override + public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { + return in.getSortedSet(remapFieldInfo(field)); + } + + @Override + public DocValuesSkipper getSkipper(FieldInfo field) throws IOException { + return in.getSkipper(remapFieldInfo(field)); + } + + @Override + public void checkIntegrity() throws IOException { + in.checkIntegrity(); + } + } + + @Override + public NormsProducer getNormsReader() { + NormsProducer in = super.getNormsReader(); + if (in == null) { + return null; + } + return new MismatchedNormsProducer(in, shuffled, super.getFieldInfos()); + } + + private static class MismatchedNormsProducer extends 
NormsProducer { + + private final NormsProducer in; + private final FieldInfos shuffled; + private final FieldInfos orig; + + MismatchedNormsProducer(NormsProducer in, FieldInfos shuffled, FieldInfos orig) { + this.in = Objects.requireNonNull(in); + this.shuffled = shuffled; + this.orig = orig; + } + + @Override + public void close() throws IOException { + in.close(); + } + + private FieldInfo remapFieldInfo(FieldInfo field) { + FieldInfo fi = shuffled.fieldInfo(field.name); + assert fi != null && fi.number == field.number; + return orig.fieldInfo(field.name); + } + + @Override + public NumericDocValues getNorms(FieldInfo field) throws IOException { + return in.getNorms(remapFieldInfo(field)); + } + + @Override + public void checkIntegrity() throws IOException { + in.checkIntegrity(); + } + } +} diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/MismatchedLeafReader.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/MismatchedLeafReader.java index eddee35240f..46404f514c6 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/MismatchedLeafReader.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/MismatchedLeafReader.java @@ -28,8 +28,6 @@ import org.apache.lucene.index.FilterLeafReader; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.StoredFieldVisitor; import org.apache.lucene.index.StoredFields; -import org.apache.lucene.search.KnnCollector; -import org.apache.lucene.util.Bits; /** * Shuffles field numbers around to try to trip bugs where field numbers are assumed to always be @@ -55,7 +53,7 @@ public class MismatchedLeafReader extends FilterLeafReader { return new StoredFields() { @Override public void document(int docID, StoredFieldVisitor visitor) throws IOException { - inStoredFields.document(docID, new MismatchedVisitor(visitor)); + inStoredFields.document(docID, new MismatchedVisitor(visitor, shuffled)); } }; } @@ -70,18 +68,6 @@ public class MismatchedLeafReader extends FilterLeafReader { return in.getReaderCacheHelper(); } - @Override - public void searchNearestVectors( - String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { - in.searchNearestVectors(field, target, knnCollector, acceptDocs); - } - - @Override - public void searchNearestVectors( - String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { - in.searchNearestVectors(field, target, knnCollector, acceptDocs); - } - static FieldInfos shuffleInfos(FieldInfos infos, Random random) { // first, shuffle the order List shuffled = new ArrayList<>(); @@ -124,11 +110,13 @@ public class MismatchedLeafReader extends FilterLeafReader { /** StoredFieldsVisitor that remaps actual field numbers to our new shuffled ones. */ // TODO: its strange this part of our IR api exposes FieldInfo, // no other "user-accessible" codec apis do this? 
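MismatchedCodecReader above is the codec-level counterpart of this class: it advertises FieldInfos with shuffled field numbers while its producers remap each incoming FieldInfo back to the delegate's own numbering by name, never by number. Condensed from remapFieldInfo above, with comments added:

    private FieldInfo remapFieldInfo(FieldInfo field) {
      FieldInfo fi = shuffled.fieldInfo(field.name); // callers hand us the shuffled view
      assert fi != null && fi.number == field.number;
      return orig.fieldInfo(field.name); // give the delegate its original numbering
    }

A format that keys per-field state on FieldInfo.number rather than the field name should now trip the new field-number assertions in AssertingDocValuesFormat when MockRandomMergePolicy swaps this reader in for a merge.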
- class MismatchedVisitor extends StoredFieldVisitor { + static class MismatchedVisitor extends StoredFieldVisitor { final StoredFieldVisitor in; + final FieldInfos shuffled; - MismatchedVisitor(StoredFieldVisitor in) { + MismatchedVisitor(StoredFieldVisitor in, FieldInfos shuffled) { this.in = in; + this.shuffled = shuffled; } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/MockRandomMergePolicy.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/MockRandomMergePolicy.java index 74f3b87ed5d..d3f202ad9dc 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/MockRandomMergePolicy.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/MockRandomMergePolicy.java @@ -237,8 +237,7 @@ public class MockRandomMergePolicy extends MergePolicy { "NOTE: MockRandomMergePolicy now swaps in a MismatchedLeafReader for merging reader=" + reader); } - return SlowCodecReaderWrapper.wrap( - new MismatchedLeafReader(new MergeReaderWrapper(reader), r)); + return new MismatchedCodecReader(reader, r); } else { // otherwise, reader is unchanged return reader; diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/PerThreadPKLookup.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/PerThreadPKLookup.java index 5cbb9bc3f83..5db9a2409e8 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/PerThreadPKLookup.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/PerThreadPKLookup.java @@ -18,8 +18,13 @@ package org.apache.lucene.tests.index; import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; import java.util.List; +import java.util.Map; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexReader.CacheHelper; +import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Terms; @@ -35,17 +40,29 @@ import org.apache.lucene.util.BytesRef; */ public class PerThreadPKLookup { + private final String idFieldName; protected final TermsEnum[] termsEnums; protected final PostingsEnum[] postingsEnums; protected final Bits[] liveDocs; protected final int[] docBases; - protected final int numSegs; + protected final int numEnums; protected final boolean hasDeletions; + private final Map enumIndexes; - public PerThreadPKLookup(IndexReader r, String idFieldName) throws IOException { + public PerThreadPKLookup(IndexReader reader, String idFieldName) throws IOException { + this(reader, idFieldName, Collections.emptyMap(), null, null); + } - List leaves = new ArrayList<>(r.leaves()); + private PerThreadPKLookup( + IndexReader reader, + String idFieldName, + Map prevEnumIndexes, + TermsEnum[] reusableTermsEnums, + PostingsEnum[] reusablePostingsEnums) + throws IOException { + this.idFieldName = idFieldName; + List leaves = new ArrayList<>(reader.leaves()); // Larger segments are more likely to have the id, so we sort largest to smallest by numDocs: leaves.sort((c1, c2) -> c2.reader().numDocs() - c1.reader().numDocs()); @@ -53,26 +70,50 @@ public class PerThreadPKLookup { postingsEnums = new PostingsEnum[leaves.size()]; liveDocs = new Bits[leaves.size()]; docBases = new int[leaves.size()]; - int numSegs = 0; + enumIndexes = new HashMap<>(); + int numEnums = 0; boolean hasDeletions = false; + for (int i = 0; i < leaves.size(); i++) { - Terms terms = 
leaves.get(i).reader().terms(idFieldName); - if (terms != null) { - termsEnums[numSegs] = terms.iterator(); - assert termsEnums[numSegs] != null; - docBases[numSegs] = leaves.get(i).docBase; - liveDocs[numSegs] = leaves.get(i).reader().getLiveDocs(); - hasDeletions |= leaves.get(i).reader().hasDeletions(); - numSegs++; + LeafReaderContext context = leaves.get(i); + LeafReader leafReader = context.reader(); + CacheHelper cacheHelper = leafReader.getCoreCacheHelper(); + IndexReader.CacheKey cacheKey = cacheHelper == null ? null : cacheHelper.getKey(); + + if (cacheKey != null && prevEnumIndexes.containsKey(cacheKey)) { + // Reuse termsEnum, postingsEnum. + int seg = prevEnumIndexes.get(cacheKey); + termsEnums[numEnums] = reusableTermsEnums[seg]; + postingsEnums[numEnums] = reusablePostingsEnums[seg]; + } else { + // New or empty segment. + Terms terms = leafReader.terms(idFieldName); + if (terms != null) { + termsEnums[numEnums] = terms.iterator(); + assert termsEnums[numEnums] != null; + } + } + + if (termsEnums[numEnums] != null) { + if (cacheKey != null) { + enumIndexes.put(cacheKey, numEnums); + } + + docBases[numEnums] = context.docBase; + liveDocs[numEnums] = leafReader.getLiveDocs(); + hasDeletions |= leafReader.hasDeletions(); + + numEnums++; } } - this.numSegs = numSegs; + + this.numEnums = numEnums; this.hasDeletions = hasDeletions; } /** Returns docID if found, else -1. */ public int lookup(BytesRef id) throws IOException { - for (int seg = 0; seg < numSegs; seg++) { + for (int seg = 0; seg < numEnums; seg++) { if (termsEnums[seg].seekExact(id)) { postingsEnums[seg] = termsEnums[seg].postings(postingsEnums[seg], 0); int docID = -1; @@ -88,5 +129,12 @@ public class PerThreadPKLookup { return -1; } - // TODO: add reopen method to carry over re-used enums...? + /** Reuse previous PerThreadPKLookup's termsEnum and postingsEnum. 
*/ + public PerThreadPKLookup reopen(IndexReader reader) throws IOException { + if (reader == null) { + return null; + } + return new PerThreadPKLookup( + reader, this.idFieldName, this.enumIndexes, this.termsEnums, this.postingsEnums); + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorer.java b/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorer.java index 8badba0d12b..dd408befdbf 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorer.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorer.java @@ -183,7 +183,7 @@ public class AssertingScorer extends Scorer { } else { state = IteratorState.ITERATING; } - assert in.docID() == advanced; + assert in.docID() == advanced : in.docID() + " != " + advanced + " in " + in; assert AssertingScorer.this.in.docID() == in.docID(); return doc = advanced; } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseChunkedDirectoryTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseChunkedDirectoryTestCase.java index dd956c6c3fd..8de332eeec9 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseChunkedDirectoryTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseChunkedDirectoryTestCase.java @@ -33,6 +33,7 @@ import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.GroupVIntUtil; /** * Base class for Directories that "chunk" the input into blocks. @@ -77,7 +78,7 @@ public abstract class BaseChunkedDirectoryTestCase extends BaseDirectoryTestCase expectThrows( AlreadyClosedException.class, () -> { - two.readGroupVInts(values, values.length); + GroupVIntUtil.readGroupVInts(two, values, values.length); }); assertEquals(5, three.readVInt()); one.close(); @@ -105,7 +106,7 @@ public abstract class BaseChunkedDirectoryTestCase extends BaseDirectoryTestCase expectThrows( AlreadyClosedException.class, () -> { - one.readGroupVInts(values, values.length); + GroupVIntUtil.readGroupVInts(one, values, values.length); }); assertEquals(2, two.readInt()); // reopen a new slice "another": diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseDirectoryTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseDirectoryTestCase.java index 9cc271a9d61..41d72c509db 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseDirectoryTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseDirectoryTestCase.java @@ -59,6 +59,7 @@ import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BitUtil; +import org.apache.lucene.util.GroupVIntUtil; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.packed.PackedInts; import org.junit.Assert; @@ -1458,7 +1459,7 @@ public abstract class BaseDirectoryTestCase extends LuceneTestCase { assertEquals(43, in.readByte()); assertEquals(12345, in.readShort()); assertEquals(1234567890, in.readInt()); - in.readGroupVInts(restored, 4); + GroupVIntUtil.readGroupVInts(in, restored, 4); assertArrayEquals(values, restored); assertEquals(1234567890123456789L, in.readLong()); in.close(); @@ -1485,7 +1486,7 @@ public abstract class 
BaseDirectoryTestCase extends LuceneTestCase { out.writeGroupVInts(values, limit); out.close(); try (IndexInput in = dir.openInput("test", IOContext.DEFAULT)) { - in.readGroupVInts(restore, limit); + GroupVIntUtil.readGroupVInts(in, restore, limit); for (int i = 0; i < limit; i++) { assertEquals(values[i], restore[i]); } @@ -1533,7 +1534,7 @@ public abstract class BaseDirectoryTestCase extends LuceneTestCase { IndexInput groupVIntIn = dir.openInput("group-varint", IOContext.DEFAULT); IndexInput vIntIn = dir.openInput("vint", IOContext.DEFAULT); for (int iter = 0; iter < iterations; iter++) { - groupVIntIn.readGroupVInts(values, numValuesArray[iter]); + GroupVIntUtil.readGroupVInts(groupVIntIn, values, numValuesArray[iter]); for (int j = 0; j < numValuesArray[iter]; j++) { assertEquals(vIntIn.readVInt(), values[j]); } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/store/MockDirectoryWrapper.java b/lucene/test-framework/src/java/org/apache/lucene/tests/store/MockDirectoryWrapper.java index 2f30a8cda50..5f329209d80 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/store/MockDirectoryWrapper.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/store/MockDirectoryWrapper.java @@ -53,6 +53,7 @@ import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.Lock; +import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.tests.util.ThrottledIndexOutput; @@ -812,6 +813,8 @@ public class MockDirectoryWrapper extends BaseDirectoryWrapper { false); } + // record the read advice before randomizing the context + ReadAdvice readAdvice = context.readAdvice(); context = LuceneTestCase.newIOContext(randomState, context); final boolean confined = context == IOContext.READONCE; if (name.startsWith(IndexFileNames.SEGMENTS) && confined == false) { @@ -831,15 +834,15 @@ public class MockDirectoryWrapper extends BaseDirectoryWrapper { System.out.println( "MockDirectoryWrapper: using SlowClosingMockIndexInputWrapper for file " + name); } - ii = new SlowClosingMockIndexInputWrapper(this, name, delegateInput, confined); + ii = new SlowClosingMockIndexInputWrapper(this, name, delegateInput, readAdvice, confined); } else if (useSlowOpenClosers && randomInt == 1) { if (LuceneTestCase.VERBOSE) { System.out.println( "MockDirectoryWrapper: using SlowOpeningMockIndexInputWrapper for file " + name); } - ii = new SlowOpeningMockIndexInputWrapper(this, name, delegateInput, confined); + ii = new SlowOpeningMockIndexInputWrapper(this, name, delegateInput, readAdvice, confined); } else { - ii = new MockIndexInputWrapper(this, name, delegateInput, null, confined); + ii = new MockIndexInputWrapper(this, name, delegateInput, null, readAdvice, confined); } addFileHandle(ii, name, Handle.Input); return ii; diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/store/MockIndexInputWrapper.java b/lucene/test-framework/src/java/org/apache/lucene/tests/store/MockIndexInputWrapper.java index 87279008614..3171d8d2216 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/store/MockIndexInputWrapper.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/store/MockIndexInputWrapper.java @@ -23,6 +23,7 @@ import java.util.Set; import org.apache.lucene.internal.tests.TestSecrets; import org.apache.lucene.store.FilterIndexInput; import 
org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.ReadAdvice; /** * Used by MockDirectoryWrapper to create an input stream that keeps track of when it's been closed. @@ -39,6 +40,7 @@ public class MockIndexInputWrapper extends FilterIndexInput { // Which MockIndexInputWrapper we were cloned from, or null if we are not a clone: private final MockIndexInputWrapper parent; + private final ReadAdvice readAdvice; private final boolean confined; private final Thread thread; @@ -48,6 +50,7 @@ public class MockIndexInputWrapper extends FilterIndexInput { String name, IndexInput delegate, MockIndexInputWrapper parent, + ReadAdvice readAdvice, boolean confined) { super("MockIndexInputWrapper(name=" + name + " delegate=" + delegate + ")", delegate); @@ -57,6 +60,7 @@ public class MockIndexInputWrapper extends FilterIndexInput { this.parent = parent; this.name = name; this.dir = dir; + this.readAdvice = readAdvice; this.confined = confined; this.thread = Thread.currentThread(); } @@ -107,7 +111,8 @@ public class MockIndexInputWrapper extends FilterIndexInput { dir.inputCloneCount.incrementAndGet(); IndexInput iiclone = in.clone(); MockIndexInputWrapper clone = - new MockIndexInputWrapper(dir, name, iiclone, parent != null ? parent : this, confined); + new MockIndexInputWrapper( + dir, name, iiclone, parent != null ? parent : this, readAdvice, confined); // Pending resolution on LUCENE-686 we may want to // uncomment this code so that we also track that all // clones get closed: @@ -135,7 +140,26 @@ public class MockIndexInputWrapper extends FilterIndexInput { IndexInput slice = in.slice(sliceDescription, offset, length); MockIndexInputWrapper clone = new MockIndexInputWrapper( - dir, sliceDescription, slice, parent != null ? parent : this, confined); + dir, sliceDescription, slice, parent != null ? parent : this, readAdvice, confined); + return clone; + } + + @Override + public IndexInput slice(String sliceDescription, long offset, long length, ReadAdvice readAdvice) + throws IOException { + if (this.readAdvice != ReadAdvice.NORMAL) { + throw new IllegalStateException( + "slice() may only be called with a custom read advice on inputs that have been opened with ReadAdvice.NORMAL"); + } + ensureOpen(); + if (dir.verboseClone) { + new Exception("slice: " + this).printStackTrace(System.out); + } + dir.inputCloneCount.incrementAndGet(); + IndexInput slice = in.slice(sliceDescription, offset, length); + MockIndexInputWrapper clone = + new MockIndexInputWrapper( + dir, sliceDescription, slice, parent != null ?
parent : this, readAdvice, confined); return clone; } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/store/SlowClosingMockIndexInputWrapper.java b/lucene/test-framework/src/java/org/apache/lucene/tests/store/SlowClosingMockIndexInputWrapper.java index 1f9e61f5195..851860f1c64 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/store/SlowClosingMockIndexInputWrapper.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/store/SlowClosingMockIndexInputWrapper.java @@ -19,6 +19,7 @@ package org.apache.lucene.tests.store; import java.io.IOException; import org.apache.lucene.internal.tests.TestSecrets; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.util.SuppressForbidden; import org.apache.lucene.util.ThreadInterruptedException; @@ -35,8 +36,12 @@ class SlowClosingMockIndexInputWrapper extends MockIndexInputWrapper { } public SlowClosingMockIndexInputWrapper( - MockDirectoryWrapper dir, String name, IndexInput delegate, boolean confined) { - super(dir, name, delegate, null, confined); + MockDirectoryWrapper dir, + String name, + IndexInput delegate, + ReadAdvice readAdvice, + boolean confined) { + super(dir, name, delegate, null, readAdvice, confined); } @SuppressForbidden(reason = "Thread sleep") diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/store/SlowOpeningMockIndexInputWrapper.java b/lucene/test-framework/src/java/org/apache/lucene/tests/store/SlowOpeningMockIndexInputWrapper.java index 033785af9c7..0d75408ec8e 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/store/SlowOpeningMockIndexInputWrapper.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/store/SlowOpeningMockIndexInputWrapper.java @@ -19,6 +19,7 @@ package org.apache.lucene.tests.store; import java.io.IOException; import org.apache.lucene.internal.tests.TestSecrets; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.util.SuppressForbidden; import org.apache.lucene.util.ThreadInterruptedException; @@ -35,9 +36,13 @@ class SlowOpeningMockIndexInputWrapper extends MockIndexInputWrapper { @SuppressForbidden(reason = "Thread sleep") public SlowOpeningMockIndexInputWrapper( - MockDirectoryWrapper dir, String name, IndexInput delegate, boolean confined) + MockDirectoryWrapper dir, + String name, + IndexInput delegate, + ReadAdvice readAdvice, + boolean confined) throws IOException { - super(dir, name, delegate, null, confined); + super(dir, name, delegate, null, readAdvice, confined); try { Thread.sleep(50); } catch (InterruptedException ie) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java index 26dd29e27b9..84fa120b88b 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java @@ -182,6 +182,7 @@ import org.apache.lucene.tests.index.AssertingLeafReader; import org.apache.lucene.tests.index.FieldFilterLeafReader; import org.apache.lucene.tests.index.MergingCodecReader; import org.apache.lucene.tests.index.MergingDirectoryReaderWrapper; +import org.apache.lucene.tests.index.MismatchedCodecReader; import org.apache.lucene.tests.index.MismatchedDirectoryReader; import org.apache.lucene.tests.index.MismatchedLeafReader; import 
org.apache.lucene.tests.index.MockIndexWriterEventListener; @@ -1746,12 +1747,14 @@ public abstract class LuceneTestCase extends Assert { System.out.println( "NOTE: LuceneTestCase.wrapReader: wrapping previous reader=" + r - + " with MismatchedLeaf/DirectoryReader"); + + " with MismatchedLeaf/Directory/CodecReader"); } if (r instanceof LeafReader) { r = new MismatchedLeafReader((LeafReader) r, random); } else if (r instanceof DirectoryReader) { r = new MismatchedDirectoryReader((DirectoryReader) r, random); + } else if (r instanceof CodecReader) { + r = new MismatchedCodecReader((CodecReader) r, random); } break; case 4: diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestRuleSetupAndRestoreClassEnv.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestRuleSetupAndRestoreClassEnv.java index 378444e394a..44f28b817ad 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestRuleSetupAndRestoreClassEnv.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestRuleSetupAndRestoreClassEnv.java @@ -38,7 +38,7 @@ import java.util.TimeZone; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene912.Lucene912Codec; +import org.apache.lucene.codecs.lucene100.Lucene100Codec; import org.apache.lucene.codecs.simpletext.SimpleTextCodec; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.tests.codecs.asserting.AssertingCodec; @@ -188,9 +188,9 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule { } else if ("Compressing".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 6 && !shouldAvoidCodec("Compressing"))) { codec = CompressingCodec.randomInstance(random); - } else if ("Lucene912".equals(TEST_CODEC) - || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene912"))) { - codec = new Lucene912Codec(RandomPicks.randomFrom(random, Lucene912Codec.Mode.values())); + } else if ("Lucene100".equals(TEST_CODEC) + || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene100"))) { + codec = new Lucene100Codec(RandomPicks.randomFrom(random, Lucene100Codec.Mode.values())); } else if (!"random".equals(TEST_CODEC)) { codec = Codec.forName(TEST_CODEC); } else if ("random".equals(TEST_POSTINGSFORMAT)) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java index 0569bf9ae98..95f06ea5570 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java @@ -55,8 +55,8 @@ import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat; +import org.apache.lucene.codecs.lucene100.Lucene100Codec; import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; -import org.apache.lucene.codecs.lucene912.Lucene912Codec; import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; @@ -1315,7 +1315,7 @@ public final class TestUtil { * different from {@link Codec#getDefault()} because that is randomized. 
*/ public static Codec getDefaultCodec() { - return new Lucene912Codec(); + return new Lucene100Codec(); } /** diff --git a/lucene/test-framework/src/resources/org/apache/lucene/tests/index/LICENSE.txt b/lucene/test-framework/src/resources/org/apache/lucene/tests/index/LICENSE.txt new file mode 100644 index 00000000000..fc1b33ae9b3 --- /dev/null +++ b/lucene/test-framework/src/resources/org/apache/lucene/tests/index/LICENSE.txt @@ -0,0 +1,507 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. 
We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + + +Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was +derived from unicode conversion examples available at +http://www.unicode.org/Public/PROGRAMS/CVTUTF. Here is the copyright +from those sources: + +/* + * Copyright 2001-2004 Unicode, Inc. + * + * Disclaimer + * + * This source code is provided as is by Unicode, Inc. No claims are + * made as to fitness for any particular purpose. No warranties of any + * kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been + * purchased on magnetic or optical media from Unicode, Inc., the + * sole remedy for any claim will be exchange of defective media + * within 90 days of receipt. + * + * Limitations on Rights to Redistribute This Code + * + * Unicode, Inc. hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form + * for internal or external distribution as long as this notice + * remains attached. + */ + + +Some code in core/src/java/org/apache/lucene/util/ArrayUtil.java was +derived from Python 2.4.2 sources available at +http://www.python.org. Full license is here: + + http://www.python.org/download/releases/2.4.2/license/ + +Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was +derived from Python 3.1.2 sources available at +http://www.python.org. Full license is here: + + http://www.python.org/download/releases/3.1.2/license/ + +Some code in core/src/java/org/apache/lucene/util/automaton was +derived from Brics automaton sources available at +www.brics.dk/automaton/. Here is the copyright from those sources: + +/* + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +The levenshtein automata tables in core/src/java/org/apache/lucene/util/automaton +were automatically generated with the moman/finenight FSA package. +Here is the copyright for those sources: + +# Copyright (c) 2010, Jean-Philippe Barrette-LaPierre, +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was +derived from ICU (http://www.icu-project.org) +The full license is available here: + https://github.com/unicode-org/icu/blob/main/icu4c/LICENSE + +/* + * Copyright (C) 1999-2010, International Business Machines + * Corporation and others. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, and/or sell copies of the + * Software, and to permit persons to whom the Software is furnished to do so, + * provided that the above copyright notice(s) and this permission notice appear + * in all copies of the Software and that both the above copyright notice(s) and + * this permission notice appear in supporting documentation. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE + * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR + * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER + * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ * + * Except as contained in this notice, the name of a copyright holder shall not + * be used in advertising or otherwise to promote the sale, use or other + * dealings in this Software without prior written authorization of the + * copyright holder. + */ + +The following license applies to the Snowball stemmers: + +Copyright (c) 2001, Dr Martin Porter +Copyright (c) 2002, Richard Boulton +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holders nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The following license applies to the KStemmer: + +Copyright © 2003, +Center for Intelligent Information Retrieval, +University of Massachusetts, Amherst. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. The names "Center for Intelligent Information Retrieval" and +"University of Massachusetts" must not be used to endorse or promote products +derived from this software without prior written permission. To obtain +permission, contact info@ciir.cs.umass.edu. + +THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +SUCH DAMAGE. + +The following license applies to the Morfologik project: + +Copyright (c) 2006 Dawid Weiss +Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of Morfologik nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +--- + +The dictionary comes from Morfologik project. Morfologik uses data from +Polish ispell/myspell dictionary hosted at http://www.sjp.pl/slownik/en/ and +is licenced on the terms of (inter alia) LGPL and Creative Commons +ShareAlike. The part-of-speech tags were added in Morfologik project and +are not found in the data from sjp.pl. The tagset is similar to IPI PAN +tagset. + +--- + +The following license applies to the Morfeusz project, +used by org.apache.lucene.analysis.morfologik. + +BSD-licensed dictionary of Polish (SGJP) +http://sgjp.pl/morfeusz/ + +Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński, +Marcin Woliński, Robert Wołosz + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. + +THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS +OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN +IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +--- + +core/src/java/org/apache/lucene/util/compress/LZ4.java is a Java +implementation of the LZ4 (https://github.com/lz4/lz4/tree/dev/lib) +compression format for Lucene's DataInput/DataOutput abstractions. + +LZ4 Library +Copyright (c) 2011-2016, Yann Collet +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/lucene/test-framework/src/resources/org/apache/lucene/tests/index/NOTICE.txt b/lucene/test-framework/src/resources/org/apache/lucene/tests/index/NOTICE.txt new file mode 100644 index 00000000000..ea6903484c0 --- /dev/null +++ b/lucene/test-framework/src/resources/org/apache/lucene/tests/index/NOTICE.txt @@ -0,0 +1,197 @@ +Apache Lucene +Copyright 2001-2022 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +Includes software from other Apache Software Foundation projects, +including, but not limited to: + - Apache Jakarta Regexp + - Apache Commons + - Apache Xerces + +ICU4J, (under analysis/icu) is licensed under an MIT styles license +and Copyright (c) 1995-2008 International Business Machines Corporation and others + +Some data files (under analysis/icu/src/data) are derived from Unicode data such +as the Unicode Character Database. See http://unicode.org/copyright.html for more +details. + +Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is +BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/ + +The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were +automatically generated with the moman/finenight FSA library, created by +Jean-Philippe Barrette-LaPierre. 
This library is available under an MIT license, +see http://sites.google.com/site/rrettesite/moman and +http://bitbucket.org/jpbarrette/moman/overview/ + +The class org.apache.lucene.util.WeakIdentityMap was derived from +the Apache CXF project and is Apache License 2.0. + +The class org.apache.lucene.util.compress.LZ4 is a Java rewrite of the LZ4 +compression library (https://github.com/lz4/lz4/tree/dev/lib) that is licensed +under the 2-clause BSD license. +(https://opensource.org/licenses/bsd-license.php) + +The Google Code Prettify is Apache License 2.0. +See http://code.google.com/p/google-code-prettify/ + +This product includes code (JaspellTernarySearchTrie) from Java Spelling Checkin +g Package (jaspell): http://jaspell.sourceforge.net/ +License: The BSD License (http://www.opensource.org/licenses/bsd-license.php) + +The snowball stemmers in + analysis/common/src/java/net/sf/snowball +were developed by Martin Porter and Richard Boulton. +The snowball stopword lists in + analysis/common/src/resources/org/apache/lucene/analysis/snowball +were developed by Martin Porter and Richard Boulton. +The full snowball package is available from + https://snowballstem.org/ + +The KStem stemmer in + analysis/common/src/org/apache/lucene/analysis/en +was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) +under the BSD-license. + +The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default +stopword list that is BSD-licensed created by Jacques Savoy. These files reside in: +analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt +See http://members.unine.ch/jacques.savoy/clef/index.html. + +The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers +(common) are based on BSD-licensed reference implementations created by Jacques Savoy and +Ljiljana Dolamic. These files reside in: +analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java + +The Stempel analyzer (stempel) includes BSD-licensed software developed +by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil, +and Edmond Nolan. + +The Polish analyzer (stempel) comes with a default +stopword list that is BSD-licensed created by the Carrot2 project. 
The file resides +in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt. +See https://github.com/carrot2/carrot2. + +The SmartChineseAnalyzer source code (smartcn) was +provided by Xiaoping Gao and copyright 2009 by www.imdict.net. + +WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/) +is derived from Unicode data such as the Unicode Character Database. +See http://unicode.org/copyright.html for more details. + +The Morfologik analyzer (morfologik) includes BSD-licensed software +developed by Dawid Weiss and Marcin Miłkowski +(https://github.com/morfologik/morfologik-stemming) and uses +data from the BSD-licensed dictionary of Polish (SGJP, http://sgjp.pl/morfeusz/). + +=========================================================================== +Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ipadic-2.7.0-20070801 + +which can be obtained from + + http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz + +or + + http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz + +=========================================================================== +mecab-ipadic-2.7.0-20070801 Notice +=========================================================================== + +Nara Institute of Science and Technology (NAIST), +the copyright holders, disclaims all warranties with regard to this +software, including all implied warranties of merchantability and +fitness, in no event shall NAIST be liable for +any special, indirect or consequential damages or any damages +whatsoever resulting from loss of use, data or profits, whether in an +action of contract, negligence or other tortuous action, arising out +of or in connection with the use or performance of this software. + +A large portion of the dictionary entries +originate from ICOT Free Software. The following conditions for ICOT +Free Software applies to the current dictionary as well. + +Each User may also freely distribute the Program, whether in its +original form or modified, to any third party or parties, PROVIDED +that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear +on, or be attached to, the Program, which is distributed substantially +in the same form as set out herein and that such intended +distribution, if actually made, will neither violate or otherwise +contravene any of the laws and regulations of the countries having +jurisdiction over the User or the intended distribution itself. + +NO WARRANTY + +The program was produced on an experimental basis in the course of the +research and development conducted during the project and is provided +to users as so produced on an experimental basis. Accordingly, the +program is provided without any warranty whatsoever, whether express, +implied, statutory or otherwise. The term "warranty" used herein +includes, but is not limited to, any warranty of the quality, +performance, merchantability and fitness for a particular purpose of +the program and the nonexistence of any infringement or violation of +any right of any third party. + +Each user of the program will agree and understand, and be deemed to +have agreed and understood, that there is no warranty whatsoever for +the program and, accordingly, the entire risk arising from or +otherwise connected with the program is assumed by the user. 
+ +Therefore, neither ICOT, the copyright holder, or any other +organization that participated in or was otherwise related to the +development of the program and their respective officials, directors, +officers and other employees shall be held liable for any and all +damages, including, without limitation, general, special, incidental +and consequential damages, arising out of or otherwise in connection +with the use or inability to use the program or any product, material +or result produced or otherwise obtained by using the program, +regardless of whether they have been advised of, or otherwise had +knowledge of, the possibility of such damages at any time during the +project or thereafter. Each user will be deemed to have agreed to the +foregoing by his or her commencement of use of the program. The term +"use" as used herein includes, but is not limited to, the use, +modification, copying and distribution of the program and the +production of secondary products from the program. + +In the case where the program, whether in its original form or +modified, was distributed or delivered to or received by a user from +any person, organization or entity other than ICOT, unless it makes or +grants independently of ICOT any specific warranty to the user in +writing, such person, organization or entity, will also be exempted +from and not be held liable to the user for any such damages as noted +above as far as the program is concerned. + +=========================================================================== +Nori Korean Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ko-dic-2.1.1-20180720 + +which can be obtained from + + https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.1.1-20180720.tar.gz diff --git a/lucene/test-framework/src/test/org/apache/lucene/tests/search/TestPerThreadPKLookup.java b/lucene/test-framework/src/test/org/apache/lucene/tests/search/TestPerThreadPKLookup.java index 2136727838c..7807626c462 100644 --- a/lucene/test-framework/src/test/org/apache/lucene/tests/search/TestPerThreadPKLookup.java +++ b/lucene/test-framework/src/test/org/apache/lucene/tests/search/TestPerThreadPKLookup.java @@ -31,6 +31,94 @@ import org.apache.lucene.tests.util.LuceneTestCase; public class TestPerThreadPKLookup extends LuceneTestCase { + public void testReopen() throws Exception { + Directory dir = newDirectory(); + IndexWriter writer = + new IndexWriter( + dir, + new IndexWriterConfig(new MockAnalyzer(random())) + .setMergePolicy(NoMergePolicy.INSTANCE)); + + Document doc; + doc = new Document(); + doc.add(new KeywordField("PK", "1", Field.Store.NO)); + writer.addDocument(doc); + + doc = new Document(); + doc.add(new KeywordField("PK", "2", Field.Store.NO)); + writer.addDocument(doc); + writer.flush(); + + // Terms in PK is null. 
+ doc = new Document(); + doc.add(new KeywordField("PK2", "3", Field.Store.NO)); + writer.addDocument(doc); + + doc = new Document(); + doc.add(new KeywordField("PK2", "4", Field.Store.NO)); + writer.addDocument(doc); + writer.flush(); + + DirectoryReader reader1 = DirectoryReader.open(writer); + PerThreadPKLookup pkLookup1 = new PerThreadPKLookup(reader1, "PK"); + + doc = new Document(); + doc.add(new KeywordField("PK", "5", Field.Store.NO)); + writer.addDocument(doc); + + doc = new Document(); + doc.add(new KeywordField("PK", "6", Field.Store.NO)); + writer.addDocument(doc); + // Update liveDocs. + writer.deleteDocuments(new Term("PK", "1")); + writer.flush(); + + // Terms in PK is null. + doc = new Document(); + doc.add(new KeywordField("PK2", "7", Field.Store.NO)); + writer.addDocument(doc); + + doc = new Document(); + doc.add(new KeywordField("PK2", "8", Field.Store.NO)); + writer.addDocument(doc); + writer.flush(); + + assertEquals(0, pkLookup1.lookup(newBytesRef("1"))); + assertEquals(1, pkLookup1.lookup(newBytesRef("2"))); + assertEquals(-1, pkLookup1.lookup(newBytesRef("5"))); + assertEquals(-1, pkLookup1.lookup(newBytesRef("8"))); + DirectoryReader reader2 = DirectoryReader.openIfChanged(reader1); + PerThreadPKLookup pkLookup2 = pkLookup1.reopen(reader2); + + assertEquals(-1, pkLookup2.lookup(newBytesRef("1"))); + assertEquals(1, pkLookup2.lookup(newBytesRef("2"))); + assertEquals(4, pkLookup2.lookup(newBytesRef("5"))); + assertEquals(-1, pkLookup2.lookup(newBytesRef("8"))); + + doc = new Document(); + doc.add(new KeywordField("PK", "9", Field.Store.NO)); + writer.addDocument(doc); + + doc = new Document(); + doc.add(new KeywordField("PK", "10", Field.Store.NO)); + writer.addDocument(doc); + writer.flush(); + + assertEquals(-1, pkLookup2.lookup(newBytesRef("9"))); + DirectoryReader reader3 = DirectoryReader.openIfChanged(reader2); + PerThreadPKLookup pkLookup3 = pkLookup2.reopen(reader3); + assertEquals(8, pkLookup3.lookup(newBytesRef("9"))); + + DirectoryReader reader4 = DirectoryReader.openIfChanged(reader3); + assertNull(pkLookup3.reopen(reader4)); + + writer.close(); + reader1.close(); + reader2.close(); + reader3.close(); + dir.close(); + } + public void testPKLookupWithUpdate() throws Exception { Directory dir = newDirectory(); IndexWriter writer = diff --git a/versions.toml b/versions.toml index 96dfc797080..80dc51f39bf 100644 --- a/versions.toml +++ b/versions.toml @@ -76,7 +76,7 @@ zstd = { module = "com.github.luben:zstd-jni", version.ref = "zstd" } benmanes-versions = "com.github.ben-manes.versions:0.51.0" dependencychecks = "com.carrotsearch.gradle.dependencychecks:0.0.9" errorprone = "net.ltgt.errorprone:3.1.0" -forbiddenapis = "de.thetaphi.forbiddenapis:3.7" +forbiddenapis = "de.thetaphi.forbiddenapis:3.8" jacocolog = "org.barfuin.gradle.jacocolog:3.1.0" owasp-dependencycheck = "org.owasp.dependencycheck:7.2.0" randomizedtesting = "com.carrotsearch.gradle.randomizedtesting:0.0.6"
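A note on the PerThreadPKLookup change above: reopen keys enum reuse on each leaf's core cache key, so segments that survive a DirectoryReader reopen keep their already-created TermsEnum/PostingsEnum while new segments get fresh ones, and it returns null when handed a null reader, matching DirectoryReader.openIfChanged's contract. A minimal usage sketch under those assumptions (the "PK" field name and the lookupAcrossReopen wrapper are illustrative, not part of this patch):

    import java.io.IOException;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.tests.index.PerThreadPKLookup;
    import org.apache.lucene.util.BytesRef;

    final class PKLookupReopenExample {
      // Looks up a primary key, reopens the reader, and looks the key up again
      // while reusing the previous lookup's enums for unchanged segments.
      static int lookupAcrossReopen(IndexWriter writer, BytesRef pk) throws IOException {
        DirectoryReader reader = DirectoryReader.open(writer);
        try {
          PerThreadPKLookup lookup = new PerThreadPKLookup(reader, "PK");
          int docID = lookup.lookup(pk);

          DirectoryReader newReader = DirectoryReader.openIfChanged(reader);
          if (newReader != null) {
            reader.close();
            reader = newReader;
            lookup = lookup.reopen(reader); // carries over enums by core cache key
            docID = lookup.lookup(pk);
          }
          return docID;
        } finally {
          reader.close();
        }
      }
    }

Closing the old reader before the second lookup is safe because the shared segment cores are reference-counted by the reopened reader, so the reused enums remain valid.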
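Likewise, the directory test-case changes reflect that group-varint decoding now goes through the static GroupVIntUtil.readGroupVInts(DataInput, long[], int) helper instead of a readGroupVInts method on IndexInput, while writeGroupVInts remains on the output side. A minimal round-trip sketch (the in-memory ByteBuffersDirectory and the "ints" file name are illustrative choices):

    import java.io.IOException;
    import org.apache.lucene.store.ByteBuffersDirectory;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.IOContext;
    import org.apache.lucene.store.IndexInput;
    import org.apache.lucene.store.IndexOutput;
    import org.apache.lucene.util.GroupVIntUtil;

    public final class GroupVIntRoundTrip {
      public static void main(String[] args) throws IOException {
        long[] values = {3, 1, 4, 1, 5, 9, 2, 6};
        long[] restored = new long[values.length];
        try (Directory dir = new ByteBuffersDirectory()) {
          try (IndexOutput out = dir.createOutput("ints", IOContext.DEFAULT)) {
            out.writeGroupVInts(values, values.length); // write-side API is unchanged
          }
          try (IndexInput in = dir.openInput("ints", IOContext.DEFAULT)) {
            // read side now goes through the static utility, as in the tests above
            GroupVIntUtil.readGroupVInts(in, restored, values.length);
          }
        }
      }
    }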