Merge branch 'apache:main' into bpv21_main

expani1729 2024-08-29 19:47:04 +05:30 committed by GitHub
commit 0a0701995a
451 changed files with 28570 additions and 4825 deletions

View File

@ -23,6 +23,7 @@ Apache Lucene is a high-performance, full-featured text search engine library
written in Java.
[![Build Status](https://ci-builds.apache.org/job/Lucene/job/Lucene-Artifacts-main/badge/icon?subject=Lucene)](https://ci-builds.apache.org/job/Lucene/job/Lucene-Artifacts-main/)
[![Revved up by Develocity](https://img.shields.io/badge/Revved%20up%20by-Develocity-06A0CE?logo=Gradle&labelColor=02303A)](https://ge.apache.org/scans?search.buildToolType=gradle&search.rootProjectNames=lucene-root)
## Online Documentation

View File

@ -41,7 +41,7 @@ import jdk.jfr.consumer.RecordingFile;
*/
public class ProfileResults {
/** Formats a frame to a formatted line. This is deduplicated on! */
static String frameToString(RecordedFrame frame, boolean lineNumbers) {
static String frameToString(RecordedFrame frame, boolean lineNumbers, boolean frameTypes) {
StringBuilder builder = new StringBuilder();
RecordedMethod method = frame.getMethod();
RecordedClass clazz = method.getType();
@ -55,13 +55,14 @@ public class ProfileResults {
builder.append("#");
builder.append(method.getName());
builder.append("()");
if (lineNumbers) {
if (lineNumbers && frame.getLineNumber() != -1) {
builder.append(":");
if (frame.getLineNumber() == -1) {
builder.append("(" + frame.getType() + " code)");
} else {
builder.append(frame.getLineNumber());
}
builder.append(frame.getLineNumber());
}
if (clazz != null && frameTypes) {
builder.append(" [");
builder.append(frame.getType());
builder.append(" code]");
}
return builder.toString();
}
@ -77,6 +78,8 @@ public class ProfileResults {
public static final String COUNT_DEFAULT = "10";
public static final String LINENUMBERS_KEY = "tests.profile.linenumbers";
public static final String LINENUMBERS_DEFAULT = "false";
public static final String FRAMETYPES_KEY = "tests.profile.frametypes";
public static final String FRAMETYPES_DEFAULT = "true";
/**
* Driver method, for testing standalone.
@ -92,7 +95,8 @@ public class ProfileResults {
System.getProperty(MODE_KEY, MODE_DEFAULT),
Integer.parseInt(System.getProperty(STACKSIZE_KEY, STACKSIZE_DEFAULT)),
Integer.parseInt(System.getProperty(COUNT_KEY, COUNT_DEFAULT)),
Boolean.parseBoolean(System.getProperty(LINENUMBERS_KEY, LINENUMBERS_DEFAULT)));
Boolean.parseBoolean(System.getProperty(LINENUMBERS_KEY, LINENUMBERS_DEFAULT)),
Boolean.parseBoolean(System.getProperty(FRAMETYPES_KEY, FRAMETYPES_DEFAULT)));
}
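
For orientation, here is a minimal sketch of driving this report standalone with the new switch. The wrapper class and the .jfr path are made-up examples, and the assumption is that ProfileResults.main treats its arguments as recording files (as the "Driver method" comment above suggests); only the two property keys come from the constants defined in this diff.

public class ProfileReportDemo {
  public static void main(String[] args) throws Exception {
    // Illustrative only: same package as ProfileResults assumed; the recording path is invented.
    System.setProperty("tests.profile.frametypes", "false"); // omit the " [... code]" suffix
    System.setProperty("tests.profile.linenumbers", "true"); // append ":<line>" when available
    ProfileResults.main(new String[] {"build/recordings/test.jfr"});
  }
}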
/** true if we care about this event */
@ -152,7 +156,12 @@ public class ProfileResults {
/** Process all the JFR files passed in args and print a merged summary. */
public static void printReport(
List<String> files, String mode, int stacksize, int count, boolean lineNumbers)
List<String> files,
String mode,
int stacksize,
int count,
boolean lineNumbers,
boolean frameTypes)
throws IOException {
if (!"cpu".equals(mode) && !"heap".equals(mode)) {
throw new IllegalArgumentException("tests.profile.mode must be one of (cpu,heap)");
@ -181,7 +190,7 @@ public class ProfileResults {
if (stack.length() > 0) {
stack.append("\n").append(framePadding).append(" at ");
}
stack.append(frameToString(trace.getFrames().get(i), lineNumbers));
stack.append(frameToString(trace.getFrames().get(i), lineNumbers, frameTypes));
}
String line = stack.toString();
SimpleEntry<String, Long> entry =

View File

@ -231,8 +231,8 @@ public class MissingDoclet extends StandardDoclet {
case PACKAGE:
checkComment(element);
break;
// class-like elements, check them, then recursively check their children (fields and
// methods)
// class-like elements, check them, then recursively check their children (fields and
// methods)
case CLASS:
case INTERFACE:
case ENUM:
@ -257,7 +257,7 @@ public class MissingDoclet extends StandardDoclet {
}
}
break;
// method-like elements, check them if we are configured to do so
// method-like elements, check them if we are configured to do so
case METHOD:
case CONSTRUCTOR:
case FIELD:

View File

@ -80,6 +80,9 @@ ext {
// Minimum Java version required to compile and run Lucene.
minJavaVersion = JavaVersion.toVersion(deps.versions.minJava.get())
// also change this in extractor tool: ExtractForeignAPI
vectorIncubatorJavaVersions = [ JavaVersion.VERSION_21, JavaVersion.VERSION_22, JavaVersion.VERSION_23 ] as Set
// snapshot build marker used in scripts.
snapshotBuild = version.contains("SNAPSHOT")
@ -117,10 +120,6 @@ apply from: file('gradle/generation/local-settings.gradle')
// Make sure the build environment is consistent.
apply from: file('gradle/validation/check-environment.gradle')
// IDE support, settings and specials.
apply from: file('gradle/ide/intellij-idea.gradle')
apply from: file('gradle/ide/eclipse.gradle')
// Set up defaults and configure aspects for certain modules or functionality
// (java, tests)
apply from: file('gradle/java/folder-layout.gradle')
@ -133,6 +132,10 @@ apply from: file('gradle/testing/alternative-jdk-support.gradle')
apply from: file('gradle/java/jar-manifest.gradle')
apply from: file('gradle/java/modules.gradle')
// IDE support, settings and specials.
apply from: file('gradle/ide/intellij-idea.gradle')
apply from: file('gradle/ide/eclipse.gradle')
// Maven artifact publishing.
apply from: file('gradle/maven/publications.gradle')

View File

@ -67,6 +67,13 @@
</maintainer>
<!-- NOTE: please insert releases in numeric order, NOT chronologically. -->
<release>
<Version>
<name>lucene-9.11.1</name>
<created>2024-06-27</created>
<revision>9.11.1</revision>
</Version>
</release>
<release>
<Version>
<name>lucene-9.11.0</name>

View File

@ -0,0 +1,78 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
import subprocess
import sys
import tempfile
import urllib.request
'''
A simple tool to see diffs between main's version of CHANGES.txt entries for
a given release vs the stable branch's version. It's best to keep these 1)
identical and 2) matching what changes were actually backported to be honest
to users and avoid future annoying conflicts on backport.
'''
# e.g. python3 -u diff_lucene_changes.py branch_9_9 main 9.9.0
#
def get_changes_url(branch_name):
if os.path.isdir(branch_name):
url = f'file://{branch_name}/lucene/CHANGES.txt'
else:
url = f'https://raw.githubusercontent.com/apache/lucene/{branch_name}/lucene/CHANGES.txt'
print(f'NOTE: resolving {branch_name} --> {url}')
return url
def extract_release_section(changes_txt, release_name):
return re.search(f'=======+ Lucene {re.escape(release_name)} =======+(.*?)=======+ Lucene .*? =======+$',
changes_txt.decode('utf-8'), re.MULTILINE | re.DOTALL).group(1).encode('utf-8')
def main():
if len(sys.argv) < 3 or len(sys.argv) > 5:
print('\nUsage: python3 -u dev-tools/scripts/diff_lucene_changes.py <branch1-or-local-clone> <branch2-or-local-clone> <release-name> [diff-commandline-extras]\n')
print(' e.g.: python3 -u dev-tools/scripts/diff_lucene_changes.py branch_9_9 /l/trunk 9.9.0 "-w"\n')
sys.exit(1)
branch1 = sys.argv[1]
branch2 = sys.argv[2]
release_name = sys.argv[3]
if len(sys.argv) > 4:
diff_cl_extras = [sys.argv[4]]
else:
diff_cl_extras = []
branch1_changes = extract_release_section(urllib.request.urlopen(get_changes_url(branch1)).read(),
release_name)
branch2_changes = extract_release_section(urllib.request.urlopen(get_changes_url(branch2)).read(),
release_name)
with tempfile.NamedTemporaryFile() as f1, tempfile.NamedTemporaryFile() as f2:
f1.write(branch1_changes)
f2.write(branch2_changes)
command = ['diff'] + diff_cl_extras + [f1.name, f2.name]
# diff returns non-zero exit status when there are diffs, so don't pass check=True
print(subprocess.run(command, check=False, capture_output=True).stdout.decode('utf-8'))
if __name__ == '__main__':
main()

View File

@ -17,13 +17,6 @@
def resources = scriptResources(buildscript)
configure(rootProject) {
ext {
// also change this in extractor tool: ExtractForeignAPI
vectorIncubatorJavaVersions = [ JavaVersion.VERSION_21, JavaVersion.VERSION_22 ] as Set
}
}
configure(project(":lucene:core")) {
ext {
apijars = layout.projectDirectory.dir("src/generated/jdk")

View File

@ -23,7 +23,7 @@ configure(project(":lucene:core")) {
description "Regenerate gen_ForUtil.py"
group "generation"
def genDir = file("src/java/org/apache/lucene/codecs/lucene99")
def genDir = file("src/java/org/apache/lucene/codecs/lucene912")
def genScript = file("${genDir}/gen_ForUtil.py")
def genOutput = file("${genDir}/ForUtil.java")
@ -43,6 +43,31 @@ configure(project(":lucene:core")) {
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
mustRunBefore: [ "compileJava" ]
])
task generateForDeltaUtilInternal() {
description "Regenerate gen_ForDeltaUtil.py"
group "generation"
def genDir = file("src/java/org/apache/lucene/codecs/lucene912")
def genScript = file("${genDir}/gen_ForDeltaUtil.py")
def genOutput = file("${genDir}/ForDeltaUtil.java")
inputs.file genScript
outputs.file genOutput
doLast {
quietExec {
workingDir genDir
executable project.externalTool("python3")
args = [ '-B', genScript ]
}
}
}
regenerate.dependsOn wrapWithPersistentChecksums(generateForDeltaUtilInternal, [
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
mustRunBefore: [ "compileJava" ]
])
}
configure(project(":lucene:backward-codecs")) {
@ -96,5 +121,30 @@ configure(project(":lucene:backward-codecs")) {
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
mustRunBefore: [ "compileJava" ]
])
task generateForUtil99Internal() {
description "Regenerate gen_ForUtil.py"
group "generation"
def genDir = file("src/java/org/apache/lucene/backward_codecs/lucene99")
def genScript = file("${genDir}/gen_ForUtil.py")
def genOutput = file("${genDir}/ForUtil.java")
inputs.file genScript
outputs.file genOutput
doLast {
quietExec {
workingDir genDir
executable project.externalTool("python3")
args = [ '-B', genScript ]
}
}
}
regenerate.dependsOn wrapWithPersistentChecksums(generateForUtil99Internal, [
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
mustRunBefore: [ "compileJava" ]
])
}

View File

@ -65,10 +65,8 @@ configure(project(":lucene:analysis:icu")) {
icupkg = file("${icuBinDir}/icupkg")
}
// Resolve version lazily (can't resolve at configuration time).
def icu4jVersionProvider = project.provider { getVersion('com.ibm.icu', 'icu4j') }
// lazy gstring with ICU version.
def icu4jVersion = "${-> icu4jVersionProvider.get()}"
def icu4jVersion = deps.icu4j.get().version
def icuCompileTask = Os.isFamily(Os.FAMILY_WINDOWS) ? "compileIcuWindows" : "compileIcuLinux"

View File

@ -22,10 +22,11 @@ import org.gradle.plugins.ide.eclipse.model.ClasspathEntry
def resources = scriptResources(buildscript)
configure(rootProject) {
plugins.withType(JavaPlugin) {
apply plugin: "eclipse"
if (gradle.startParameter.taskNames.contains("eclipse")) {
project.pluginManager.apply("java-base")
project.pluginManager.apply("eclipse")
def eclipseJavaVersion = propertyOrDefault("eclipse.javaVersion", rootProject.minJavaVersion)
def eclipseJavaVersion = propertyOrDefault("eclipse.javaVersion", deps.versions.minJava.get())
def relativize = { other -> rootProject.rootDir.relativePath(other).toString() }
eclipse {
@ -105,9 +106,9 @@ configure(rootProject) {
}
}
eclipseJdt {
eclipseJdt {
enabled = false
dependsOn 'luceneEclipse'
dependsOn 'luceneEclipseJdt'
}
eclipseClasspath {

View File

@ -75,6 +75,18 @@ configure(rootProject) {
it.dependsOn(":versionCatalogFormatDeps")
}
// correct crlf/ default encoding after version catalog formatting finishes.
tasks.matching {
it.path in [
":versionCatalogFormatDeps"
]
}.configureEach {
it.doLast {
ant.fixcrlf(file: it.catalogFile.get().asFile,
eol: "lf", fixlast: "true", encoding: "UTF-8")
}
}
tasks.matching {
it.path in [
":versionCatalogUpdateDeps"

View File

@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
@defaultMessage Spawns threads with vague names; use a custom thread factory (Lucene's NamedThreadFactory, Solr's SolrNamedThreadFactory) and name threads so that you can tell (by its name) which executor it is associated with
@defaultMessage Spawns threads with vague names; use a custom thread factory (Lucene's NamedThreadFactory) and name threads so that you can tell (by its name) which executor it is associated with
java.util.concurrent.Executors#newFixedThreadPool(int)
java.util.concurrent.Executors#newSingleThreadExecutor()
java.util.concurrent.Executors#newCachedThreadPool()

View File

@ -20,6 +20,10 @@
// 2) notice file
// 3) checksum validation/ generation.
// WARNING: The tasks in this file share internal state between tasks without using files.
// Because of this all tasks here must always execute together, so they cannot define task outputs.
// TODO: Rewrite the internal state to use state files containing the ext.jarInfos and its referencedFiles
// This should be false only for debugging.
def failOnError = true
@ -194,13 +198,6 @@ subprojects {
description = "Validate license and notice files of dependencies"
dependsOn collectJarInfos
def outputFileName = 'validateJarLicenses'
inputs.dir(file(project.rootDir.path + '/lucene/licenses'))
.withPropertyName('licenses')
.withPathSensitivity(PathSensitivity.RELATIVE)
outputs.file(layout.buildDirectory.file(outputFileName))
.withPropertyName('validateJarLicensesResult')
doLast {
def errors = []
jarInfos.each { dep ->
@ -246,9 +243,7 @@ subprojects {
}
}
}
// Required to take advantage of incremental building and the build cache
def f = new File(project.buildDir.path + "/" + outputFileName)
f.write(errors.toString(), "UTF-8")
if (errors) {
def msg = "Certain license/ notice files are missing:\n - " + errors.join("\n - ")
if (failOnError) {

View File

@ -80,10 +80,6 @@ API Changes
* GITHUB#12875: Ensure token position is always increased in PathHierarchyTokenizer and ReversePathHierarchyTokenizer
and resulting tokens do not overlap. (Michael Froh, Lukáš Vlček)
* GITHUB#12624, GITHUB#12831: Allow FSTCompiler to stream to any DataOutput while building, and
make compile() only return the FSTMetadata. For on-heap (default) use case, please use
FST.fromFSTReader(fstMetadata, fstCompiler.getFSTReader()) to create the FST. (Anh Dung Bui)
* GITHUB#13146, GITHUB#13148: Remove ByteBufferIndexInput and only use MemorySegment APIs
for MMapDirectory. (Uwe Schindler)
@ -112,6 +108,11 @@ API Changes
* GITHUB#13410: Removed Scorer#getWeight (Sanjay Dutt, Adrien Grand)
* GITHUB#13499: Remove deprecated TopScoreDocCollector + TopFieldCollector methods (#create, #createSharedManager) (Jakub Slowinski)
* GITHUB#13632: CandidateMatcher public matching functions (Bryan Jacobowitz)
New Features
---------------------
@ -133,6 +134,16 @@ New Features
DocValuesSkipper abstraction. A new flag is added to FieldType.java that configures whether
to create a "skip index" for doc values. (Ignacio Vera)
* GITHUB#13563: Add levels to doc values skip index. (Ignacio Vera)
* GITHUB#13597: Align doc value skipper interval boundaries when an interval contains a constant
value. (Ignacio Vera)
* GITHUB#13604: Add Kmeans clustering on vectors (Mayya Sharipova, Jim Ferenczi, Tom Veasey)
* GITHUB#13592: Take advantage of the doc value skipper when it is primary sort in SortedNumericDocValuesRangeQuery
and SortedSetDocValuesRangeQuery. (Ignacio Vera)
Improvements
---------------------
@ -168,6 +179,8 @@ Optimizations
* GITHUB#12552: Make FSTPostingsFormat load FSTs off-heap. (Tony X)
* GITHUB#13672: Leverage doc value skip lists in DocValuesRewriteMethod if indexed. (Greg Miller)
Bug Fixes
---------------------
@ -205,6 +218,9 @@ Changes in Backwards Compatibility Policy
* GITHUB#13230: Remove the Kp and Lovins snowball algorithms which are not supported
or intended for general use. (Robert Muir)
* GITHUB#13602: SearchWithCollectorTask no longer supports the `collector.class` config parameter to load a custom
collector implementation. `collector.manager.class` allows users to load a collector manager instead. (Luca Cavanna)
Other
---------------------
@ -243,22 +259,71 @@ Other
* GITHUB#13332: Improve MissingDoclet linter to check records correctly. (Uwe Schindler)
* GITHUB#13499: Remove usage of TopScoreDocCollector + TopFieldCollector deprecated methods (#create, #createSharedManager) (Jakub Slowinski)
Build
---------------------
* GITHUB#13649: Fix eclipse ide settings generation. (Uwe Schindler, Dawid Weiss)
======================== Lucene 9.12.0 =======================
API Changes
---------------------
* GITHUB#13281: Mark COSINE VectorSimilarityFunction as deprecated. (Pulkit Gupta)
* GITHUB#13469: Expose FlatVectorsFormat as a first-class format; can be configured using a custom Codec. (Michael Sokolov)
* GITHUB#13612: Hunspell: add Suggester#proceedPastRep to avoid losing relevant suggestions. (Peter Gromov)
* GITHUB#13603: Introduced `IndexSearcher#searchLeaf(LeafReaderContext, Weight, Collector)` protected method to
facilitate customizing per-leaf behavior of search without having to override
`search(LeafReaderContext[], Weight, Collector)` which requires overriding the entire loop across the leaves (Luca Cavanna)
* GITHUB#13559: Add BitSet#nextSetBit(int, int) to get the index of the first set bit in range. (Egor Potemkin)
* GITHUB#13568: Add DoubleValuesSource#toSortableLongDoubleValuesSource and
MultiDoubleValuesSource#toSortableMultiLongValuesSource methods. (Shradha Shankar)
* GITHUB#13568: Add CollectorOwner class that wraps CollectorManager, and handles list of Collectors and results.
Add IndexSearcher#search method that takes CollectorOwner. (Egor Potemkin)
* GITHUB#13568: Add DrillSideways#search method that supports any collector types for any drill-sideways dimensions
or drill-down. (Egor Potemkin)
New Features
---------------------
(No changes)
* GITHUB#13430: Allow configuring the search concurrency via
TieredMergePolicy#setTargetSearchConcurrency. This in-turn instructs the
merge policy to try to have at least this number of segments on the highest
tier. (Adrien Grand, Carlos Delgado)
* GITHUB#13517: Allow configuring the search concurrency on LogDocMergePolicy
and LogByteSizeMergePolicy via a new #setTargetConcurrency setter.
(Adrien Grand)
* GITHUB#13568: Add sandbox facets module to compute facets while collecting. (Egor Potemkin, Shradha Shankar)
* GITHUB#13678: Add support for JDK 23 to the Panama Vectorization Provider. (Chris Hegarty)
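
Picking up the TieredMergePolicy entry above, a hedged sketch of the new knob; the concrete value of 8 is an illustrative assumption, only setTargetSearchConcurrency itself comes from the entry.

import org.apache.lucene.index.TieredMergePolicy;

public class TargetConcurrencyDemo {
  public static TieredMergePolicy newMergePolicy() {
    TieredMergePolicy mergePolicy = new TieredMergePolicy();
    // Ask the policy to keep roughly this many segments on the highest tier for search parallelism.
    mergePolicy.setTargetSearchConcurrency(8);
    return mergePolicy;
  }
}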
Improvements
---------------------
(No changes)
* GITHUB#13548: Refactor and javadoc update for KNN vector writer classes. (Patrick Zhai)
* GITHUB#13562: Add Intervals.regexp and Intervals.range methods to produce IntervalsSource
for regexp and range queries. (Mayya Sharipova)
* GITHUB#13625: Remove BitSet#nextSetBit code duplication. (Greg Miller)
* GITHUB#13285: Early terminate graph searches of AbstractVectorSimilarityQuery to follow timeout set from
IndexSearcher#setTimeout(QueryTimeout). (Kaival Parikh)
* GITHUB#13633: Add ability to read/write knn vector values to a MemoryIndex. (Ben Trent)
* GITHUB#12627: patch HNSW graphs to improve reachability of all nodes from entry points
* GITHUB#13201: Better cost estimation on MultiTermQuery over few terms. (Michael Froh)
Optimizations
---------------------
@ -277,16 +342,100 @@ Optimizations
* GITHUB#12941: Don't preserve auxiliary buffer contents in LSBRadixSorter if it grows. (Stefan Vodita)
* GITHUB#13175: Stop double-checking priority queue inserts in some FacetCount classes. (Jakub Slowinski)
* GITHUB#13538: Slightly reduce heap usage for HNSW and scalar quantized vector writers. (Ben Trent)
* GITHUB#12100: WordBreakSpellChecker.suggestWordBreaks now does a breadth first search, allowing it to return
better matches with fewer evaluations (hossman)
* GITHUB#13582: Stop requiring MaxScoreBulkScorer's outer window from having at
least INNER_WINDOW_SIZE docs. (Adrien Grand)
* GITHUB#13570, GITHUB#13574, GITHUB#13535: Avoid performance degradation with closing shared Arenas.
Closing many individual index files can potentially lead to a degradation in execution performance.
Index files are mmapped one-to-one with the JDK's foreign shared Arena. The JVM deoptimizes the top
few frames of all threads when closing a shared Arena (see JDK-8335480). We mitigate this situation
when running with JDK 21 and greater, by 1) using a confined Arena where appropriate, and 2) grouping
files from the same segment to a single shared Arena.
A system property has been added that allows to control the total maximum number of mmapped files
that may be associated with a single shared Arena. For example, to set the max number of permits to
256, pass the following on the command line
-Dorg.apache.lucene.store.MMapDirectory.sharedArenaMaxPermits=256. Setting a value of 1 associates
a single file to a single shared arena.
(Chris Hegarty, Michael Gibney, Uwe Schindler)
* GITHUB#13585: Lucene912PostingsFormat, the new default postings format, now
only has 2 levels of skip data, which are inlined into postings instead of
being stored at the end of postings lists. This translates into better
performance for queries that need skipping such as conjunctions.
(Adrien Grand)
* GITHUB#13581: OnHeapHnswGraph no longer allocates a lock for every graph node (Mike Sokolov)
* GITHUB#13636, GITHUB#13658: Optimizations to the decoding logic of blocks of
postings. (Adrien Grand, Uwe Schindler, Greg Miller)
* GITHUB#13644: Improve NumericComparator competitive iterator logic by comparing the missing value with the top
value even after the hit queue is full (Pan Guixin)
* GITHUB#13587: Use Max WAND optimizations with ToParentBlockJoinQuery when using ScoreMode.Max (Mike Pellegrini)
Changes in runtime behavior
---------------------
* GITHUB#13472: When an executor is provided to the IndexSearcher constructor, the searcher now executes tasks on the
thread that invoked a search as well as its configured executor. Users should reduce the executor's thread-count by 1
to retain the previous level of parallelism. Moreover, it is now possible to start searches from the same executor
that is configured in the IndexSearcher without risk of deadlocking. A separate executor for starting searches is no
longer required. (Armin Braun)
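
A hedged sketch of that sizing guidance; the index path and pool size below are illustrative assumptions, while IndexSearcher(reader, executor) and NamedThreadFactory are existing Lucene API.

import java.nio.file.Path;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.NamedThreadFactory;

public class SearcherSizingDemo {
  public static void main(String[] args) throws Exception {
    int targetParallelism = Runtime.getRuntime().availableProcessors();
    // The calling thread now runs search tasks too, so size the pool one below the target.
    ExecutorService executor =
        Executors.newFixedThreadPool(
            Math.max(1, targetParallelism - 1), new NamedThreadFactory("search"));
    try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Path.of("/tmp/index")))) {
      IndexSearcher searcher = new IndexSearcher(reader, executor);
      // ... run queries with searcher ...
    } finally {
      executor.shutdown();
    }
  }
}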
Bug Fixes
---------------------
* GITHUB#13384: Fix highlighter to use longer passages instead of shorter individual terms. (Zack Kendall)
* GITHUB#13463: Address bug in MultiLeafKnnCollector causing #minCompetitiveSimilarity to stay artificially low in
some corner cases. (Greg Miller)
* GITHUB#13553: Correct RamUsageEstimate for scalar quantized knn vector formats so that raw vectors are correctly
accounted for. (Ben Trent)
* GITHUB#13615: Correct scalar quantization when used in conjunction with COSINE similarity. Vectors are normalized
before quantization to ensure the cosine similarity is correctly calculated. (Ben Trent)
* GITHUB#13627: Fix race condition on flush for DWPT seqNo generation. (Ben Trent, Ao Li)
* GITHUB#13691: Fix incorrect exponent value in explain of SigmoidFunction. (Owais Kazi)
Build
---------------------
* GITHUB#13695, GITHUB#13696: Fix Gradle build sometimes gives spurious "unreferenced license file" warnings.
(Uwe Schindler)
Other
--------------------
(No changes)
======================== Lucene 9.11.1 =======================
Bug Fixes
---------------------
* GITHUB#13498: Avoid performance regression by constructing lazily the PointTree in NumericComparator. (Ignacio Vera)
* GITHUB#13501, GITHUB#13478: Remove intra-merge parallelism for everything except HNSW graph merges. (Ben Trent)
* GITHUB#13498, GITHUB#13340: Allow adding a parent field to an index with no fields (Michael Sokolov)
* GITHUB#12431: Fix IndexOutOfBoundsException thrown in DefaultPassageFormatter
by unordered matches. (Stephane Campinas)
* GITHUB#13493: StringValueFacetCounts stops throwing NPE when faceting over an empty match-set. (Grebennikov Roman,
Stefan Vodita)
======================== Lucene 9.11.0 =======================
API Changes
@ -494,6 +643,10 @@ API Changes
* GITHUB#12854: Mark DrillSideways#createDrillDownFacetsCollector as @Deprecated. (Greg Miller)
* GITHUB#12624, GITHUB#12831: Allow FSTCompiler to stream to any DataOutput while building, and
make compile() only return the FSTMetadata. For on-heap (default) use case, please use
FST.fromFSTReader(fstMetadata, fstCompiler.getFSTReader()) to create the FST. (Anh Dung Bui)
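
A hedged sketch of that on-heap migration; the builder setup and the single input below are assumptions about the surrounding FST API, while compile(), getFSTReader() and FST.fromFSTReader(...) are exactly what the entry names (the FieldReader diff later in this commit shows the off-heap OffHeapFSTStore variant).

import java.io.IOException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.Util;

public class OnHeapFstDemo {
  public static void main(String[] args) throws IOException {
    ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
    FSTCompiler<BytesRef> fstCompiler =
        new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
    // Inputs must be added in sorted order; a single entry keeps the sketch small.
    fstCompiler.add(Util.toIntsRef(new BytesRef("lucene"), new IntsRefBuilder()), new BytesRef("search"));
    FST.FSTMetadata<BytesRef> fstMetadata = fstCompiler.compile();
    FST<BytesRef> fst = FST.fromFSTReader(fstMetadata, fstCompiler.getFSTReader());
    System.out.println(Util.get(fst, new BytesRef("lucene")).utf8ToString()); // -> search
  }
}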
New Features
---------------------
* GITHUB#12679: Add support for similarity-based vector searches using [Byte|Float]VectorSimilarityQuery. Uses a new
@ -501,6 +654,12 @@ New Features
better-scoring nodes are available, or the best candidate is below a score of `traversalSimilarity` in the lowest
level. (Aditya Prakash, Kaival Parikh)
* GITHUB#12829: For indices newly created as of 9.10.0 onwards, IndexWriter preserves document blocks indexed via
IndexWriter#addDocuments or IndexWriter#updateDocuments also when index sorting is configured. Document blocks are
maintained alongside their parent documents during sort and merge. IndexWriterConfig accepts a parent field that is used
to maintain block orders if index sorting is used. Note, this is fully optional in Lucene 9.x but will be mandatory for
indices that use document blocks together with index sorting as of 10.0.0. (Simon Willnauer)
* GITHUB#12336: Index additional data per facet label in the taxonomy. (Shai Erera, Egor Potemkin, Mike McCandless,
Stefan Vodita)
@ -592,7 +751,6 @@ Build
Other
---------------------
* GITHUB#11023: Removing some dead code in CheckIndex. (Jakub Slowinski)
* GITHUB#11023: Removing @lucene.experimental tags in testXXX methods in CheckIndex. (Jakub Slowinski)

View File

@ -1,5 +1,5 @@
{
"gradle/generation/jflex/skeleton.default.txt": "58944f66c9113a940dfaf6a17210ec8219024390",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.java": "1f7a446f3483326385eef257cea8366c27da0850",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.java": "e62dcd8c25219d8f5d783823b228ffe38d2bacde",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.jflex": "f52109bb7d5701979fde90aeeeda726246a8d5fd"
}

View File

@ -1,5 +1,5 @@
{
"gradle/generation/jflex/skeleton.default.txt": "58944f66c9113a940dfaf6a17210ec8219024390",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java": "ac298e08bc5b96202efca0c01f9f0376fda976bd",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java": "2b5df5ff35543a6380c82f298225eb5fa06e4453",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex": "0b8c7774b98e8237702013e82c352d4711509bd0"
}

View File

@ -37,23 +37,23 @@ class BengaliNormalizer {
for (int i = 0; i < len; i++) {
switch (s[i]) {
// delete Chandrabindu
// delete Chandrabindu
case '\u0981':
len = delete(s, i, len);
i--;
break;
// DirghoI kar -> RosshoI kar
// DirghoI kar -> RosshoI kar
case '\u09C0':
s[i] = '\u09BF';
break;
// DirghoU kar -> RosshoU kar
// DirghoU kar -> RosshoU kar
case '\u09C2':
s[i] = '\u09C1';
break;
// Khio (Ka + Hoshonto + Murdorno Sh)
// Khio (Ka + Hoshonto + Murdorno Sh)
case '\u0995':
if (i + 2 < len && s[i + 1] == '\u09CD' && s[i + 2] == '\u09BF') {
if (i == 0) {
@ -67,12 +67,12 @@ class BengaliNormalizer {
}
break;
// Nga to Anusvara
// Nga to Anusvara
case '\u0999':
s[i] = '\u0982';
break;
// Ja Phala
// Ja Phala
case '\u09AF':
if (i - 2 == 0 && s[i - 1] == '\u09CD') {
s[i - 1] = '\u09C7';
@ -89,7 +89,7 @@ class BengaliNormalizer {
}
break;
// Ba Phalaa
// Ba Phalaa
case '\u09AC':
if ((i >= 1 && s[i - 1] != '\u09CD') || i == 0) {
break;
@ -109,7 +109,7 @@ class BengaliNormalizer {
}
break;
// Visarga
// Visarga
case '\u0983':
if (i == len - 1) {
if (len <= 3) {
@ -122,18 +122,18 @@ class BengaliNormalizer {
}
break;
// All sh
// All sh
case '\u09B6':
case '\u09B7':
s[i] = '\u09B8';
break;
// check na
// check na
case '\u09A3':
s[i] = '\u09A8';
break;
// check ra
// check ra
case '\u09DC':
case '\u09DD':
s[i] = '\u09B0';

View File

@ -747,70 +747,70 @@ class ClassicTokenizerImpl {
/* Break so we don't hit fall-through warning: */
break; /* ignore */
}
// fall through
// fall through
case 11:
break;
case 2:
{
return ALPHANUM;
}
// fall through
// fall through
case 12:
break;
case 3:
{
return CJ;
}
// fall through
// fall through
case 13:
break;
case 4:
{
return NUM;
}
// fall through
// fall through
case 14:
break;
case 5:
{
return HOST;
}
// fall through
// fall through
case 15:
break;
case 6:
{
return COMPANY;
}
// fall through
// fall through
case 16:
break;
case 7:
{
return APOSTROPHE;
}
// fall through
// fall through
case 17:
break;
case 8:
{
return ACRONYM_DEP;
}
// fall through
// fall through
case 18:
break;
case 9:
{
return ACRONYM;
}
// fall through
// fall through
case 19:
break;
case 10:
{
return EMAIL;
}
// fall through
// fall through
case 20:
break;
default:

View File

@ -53,18 +53,18 @@ public final class GreekLowerCaseFilter extends TokenFilter {
private int lowerCase(int codepoint) {
switch (codepoint) {
/* There are two lowercase forms of sigma:
* U+03C2: small final sigma (end of word)
* U+03C3: small sigma (otherwise)
*
* Standardize both to U+03C3
*/
/* There are two lowercase forms of sigma:
* U+03C2: small final sigma (end of word)
* U+03C3: small sigma (otherwise)
*
* Standardize both to U+03C3
*/
case '\u03C2': /* small final sigma */
return '\u03C3'; /* small sigma */
/* Some greek characters contain diacritics.
* This filter removes these, converting to the lowercase base form.
*/
/* Some greek characters contain diacritics.
* This filter removes these, converting to the lowercase base form.
*/
case '\u0386': /* capital alpha with tonos */
case '\u03AC': /* small alpha with tonos */
@ -100,9 +100,9 @@ public final class GreekLowerCaseFilter extends TokenFilter {
case '\u03CE': /* small omega with tonos */
return '\u03C9'; /* small omega */
/* The previous implementation did the conversion below.
* Only implemented for backwards compatibility with old indexes.
*/
/* The previous implementation did the conversion below.
* Only implemented for backwards compatibility with old indexes.
*/
case '\u03A2': /* reserved */
return '\u03C2'; /* small final sigma */

View File

@ -456,7 +456,7 @@ class PorterStemmer {
/* j >= 0 fixes Bug 2 */
if (ends("ou")) break;
return;
/* takes care of -ous */
/* takes care of -ous */
case 's':
if (ends("ism")) break;
return;

View File

@ -67,7 +67,7 @@ public final class IrishLowerCaseFilter extends TokenFilter {
case 'I':
case 'O':
case 'U':
// vowels with acute accent (fada)
// vowels with acute accent (fada)
case '\u00c1':
case '\u00c9':
case '\u00cd':

View File

@ -47,18 +47,18 @@ class HindiNormalizer {
for (int i = 0; i < len; i++) {
switch (s[i]) {
// dead n -> bindu
// dead n -> bindu
case '\u0928':
if (i + 1 < len && s[i + 1] == '\u094D') {
s[i] = '\u0902';
len = delete(s, i + 1, len);
}
break;
// candrabindu -> bindu
// candrabindu -> bindu
case '\u0901':
s[i] = '\u0902';
break;
// nukta deletions
// nukta deletions
case '\u093C':
len = delete(s, i, len);
i--;
@ -96,18 +96,18 @@ class HindiNormalizer {
case '\u095F':
s[i] = '\u092F';
break;
// zwj/zwnj -> delete
// zwj/zwnj -> delete
case '\u200D':
case '\u200C':
len = delete(s, i, len);
i--;
break;
// virama -> delete
// virama -> delete
case '\u094D':
len = delete(s, i, len);
i--;
break;
// chandra/short -> replace
// chandra/short -> replace
case '\u0945':
case '\u0946':
s[i] = '\u0947';
@ -127,7 +127,7 @@ class HindiNormalizer {
case '\u0972':
s[i] = '\u0905';
break;
// long -> short ind. vowels
// long -> short ind. vowels
case '\u0906':
s[i] = '\u0905';
break;
@ -149,7 +149,7 @@ class HindiNormalizer {
case '\u0914':
s[i] = '\u0913';
break;
// long -> short dep. vowels
// long -> short dep. vowels
case '\u0940':
s[i] = '\u093F';
break;

View File

@ -31,6 +31,7 @@ class ModifyingSuggester {
private final String misspelled;
private final WordCase wordCase;
private final FragmentChecker fragmentChecker;
private final boolean proceedPastRep;
private final char[] tryChars;
private final Hunspell speller;
@ -39,13 +40,15 @@ class ModifyingSuggester {
LinkedHashSet<Suggestion> result,
String misspelled,
WordCase wordCase,
FragmentChecker checker) {
FragmentChecker checker,
boolean proceedPastRep) {
this.speller = speller;
tryChars = speller.dictionary.tryChars.toCharArray();
this.result = result;
this.misspelled = misspelled;
this.wordCase = wordCase;
fragmentChecker = checker;
this.proceedPastRep = proceedPastRep;
}
/**
@ -125,9 +128,9 @@ class ModifyingSuggester {
boolean hasGoodSuggestions = trySuggestion(word.toUpperCase(Locale.ROOT));
GradedSuggestions repResult = tryRep(word);
if (repResult == GradedSuggestions.Best) return true;
if (repResult == GradedSuggestions.Best && !proceedPastRep) return true;
hasGoodSuggestions |= repResult == GradedSuggestions.Normal;
hasGoodSuggestions |= repResult != GradedSuggestions.None;
if (!speller.dictionary.mapTable.isEmpty()) {
enumerateMapReplacements(word, "", 0);

View File

@ -53,16 +53,21 @@ public class Suggester {
private final Dictionary dictionary;
private final SuggestibleEntryCache suggestibleCache;
private final FragmentChecker fragmentChecker;
private final boolean proceedPastRep;
public Suggester(Dictionary dictionary) {
this(dictionary, null, FragmentChecker.EVERYTHING_POSSIBLE);
this(dictionary, null, FragmentChecker.EVERYTHING_POSSIBLE, false);
}
private Suggester(
Dictionary dictionary, SuggestibleEntryCache suggestibleCache, FragmentChecker checker) {
Dictionary dictionary,
SuggestibleEntryCache suggestibleCache,
FragmentChecker checker,
boolean proceedPastRep) {
this.dictionary = dictionary;
this.suggestibleCache = suggestibleCache;
this.fragmentChecker = checker;
this.proceedPastRep = proceedPastRep;
}
/**
@ -71,8 +76,8 @@ public class Suggester {
* entries are stored as fast-to-iterate plain words instead of highly compressed prefix trees.
*/
public Suggester withSuggestibleEntryCache() {
return new Suggester(
dictionary, SuggestibleEntryCache.buildCache(dictionary.words), fragmentChecker);
SuggestibleEntryCache cache = SuggestibleEntryCache.buildCache(dictionary.words);
return new Suggester(dictionary, cache, fragmentChecker, proceedPastRep);
}
/**
@ -80,7 +85,17 @@ public class Suggester {
* the performance of the "Modification" phase.
*/
public Suggester withFragmentChecker(FragmentChecker checker) {
return new Suggester(dictionary, suggestibleCache, checker);
return new Suggester(dictionary, suggestibleCache, checker, proceedPastRep);
}
/**
* Returns a copy of this suggester instance that doesn't stop after encountering acceptable words
* after applying REP rules. By default, Hunspell stops when it finds any, but this behavior may
not always be desirable, e.g., with "REP i ea", "tims" would be replaced only by "teams" and
* not "times", which could also be meant.
*/
public Suggester proceedPastRep() {
return new Suggester(dictionary, suggestibleCache, fragmentChecker, true);
}
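
A hedged usage sketch of the new option; the Dictionary construction below (ByteBuffersDirectory plus affix/dic streams) and the file names are assumptions about the surrounding Hunspell API, while proceedPastRep() and suggestNoTimeout(...) are the pieces introduced or exercised by this change.

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.Suggester;
import org.apache.lucene.store.ByteBuffersDirectory;

public class ProceedPastRepDemo {
  public static void main(String[] args) throws Exception {
    try (InputStream aff = Files.newInputStream(Path.of("en.aff"));
        InputStream dic = Files.newInputStream(Path.of("en.dic"))) {
      Dictionary dictionary = new Dictionary(new ByteBuffersDirectory(), "hunspell", aff, dic);
      // By default the suggester stops once REP rules yield acceptable words; this keeps going.
      Suggester suggester = new Suggester(dictionary).proceedPastRep();
      List<String> suggestions = suggester.suggestNoTimeout("tims", () -> {});
      System.out.println(suggestions); // e.g. both "teams"- and "times"-style candidates
    }
  }
}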
/**
@ -174,7 +189,8 @@ public class Suggester {
}
boolean hasGoodSuggestions =
new ModifyingSuggester(suggestionSpeller, suggestions, word, wordCase, fragmentChecker)
new ModifyingSuggester(
suggestionSpeller, suggestions, word, wordCase, fragmentChecker, proceedPastRep)
.suggest();
if (!hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {

View File

@ -194,7 +194,7 @@ public final class WordDelimiterIterator {
int type = charType(text[current]);
switch (type) {
// return ALPHA word type for both lower and upper
// return ALPHA word type for both lower and upper
case LOWER:
case UPPER:
return ALPHA;
@ -332,27 +332,27 @@ public final class WordDelimiterIterator {
case Character.OTHER_NUMBER:
return DIGIT;
// case Character.SPACE_SEPARATOR:
// case Character.LINE_SEPARATOR:
// case Character.PARAGRAPH_SEPARATOR:
// case Character.CONTROL:
// case Character.FORMAT:
// case Character.PRIVATE_USE:
// case Character.SPACE_SEPARATOR:
// case Character.LINE_SEPARATOR:
// case Character.PARAGRAPH_SEPARATOR:
// case Character.CONTROL:
// case Character.FORMAT:
// case Character.PRIVATE_USE:
case Character.SURROGATE: // prevent splitting
return ALPHA | DIGIT;
// case Character.DASH_PUNCTUATION:
// case Character.START_PUNCTUATION:
// case Character.END_PUNCTUATION:
// case Character.CONNECTOR_PUNCTUATION:
// case Character.OTHER_PUNCTUATION:
// case Character.MATH_SYMBOL:
// case Character.CURRENCY_SYMBOL:
// case Character.MODIFIER_SYMBOL:
// case Character.OTHER_SYMBOL:
// case Character.INITIAL_QUOTE_PUNCTUATION:
// case Character.FINAL_QUOTE_PUNCTUATION:
// case Character.DASH_PUNCTUATION:
// case Character.START_PUNCTUATION:
// case Character.END_PUNCTUATION:
// case Character.CONNECTOR_PUNCTUATION:
// case Character.OTHER_PUNCTUATION:
// case Character.MATH_SYMBOL:
// case Character.CURRENCY_SYMBOL:
// case Character.MODIFIER_SYMBOL:
// case Character.OTHER_SYMBOL:
// case Character.INITIAL_QUOTE_PUNCTUATION:
// case Character.FINAL_QUOTE_PUNCTUATION:
default:
return SUBWORD_DELIM;

View File

@ -38,25 +38,25 @@ class TeluguNormalizer {
for (int i = 0; i < len; i++) {
switch (s[i]) {
// candrabindu ( and ) -> bindu ()
// candrabindu ( and ) -> bindu ()
case '\u0C00': //
case '\u0C01': //
s[i] = '\u0C02'; //
break;
// delete visarga ()
// delete visarga ()
case '\u0C03':
len = delete(s, i, len);
i--;
break;
// zwj/zwnj -> delete
// zwj/zwnj -> delete
case '\u200D':
case '\u200C':
len = delete(s, i, len);
i--;
break;
// long -> short vowels
// long -> short vowels
case '\u0C14': //
s[i] = '\u0C13'; //
break;
@ -73,7 +73,7 @@ class TeluguNormalizer {
s[i] = '\u0C09'; //
break;
// long -> short vowels matras
// long -> short vowels matras
case '\u0C40': //
s[i] = '\u0C3F'; // ి
break;
@ -86,14 +86,14 @@ class TeluguNormalizer {
case '\u0C4B': //
s[i] = '\u0C4A'; //
break;
// decomposed diphthong ( + ) -> precomposed diphthong vowel sign ()
// decomposed diphthong ( + ) -> precomposed diphthong vowel sign ()
case '\u0C46':
if (i + 1 < len && s[i + 1] == '\u0C56') {
s[i] = '\u0C48';
len = delete(s, i + 1, len);
}
break;
// composed oo or au -> oo or au
// composed oo or au -> oo or au
case '\u0C12':
if (i + 1 < len && s[i + 1] == '\u0C55') {
// ( + ) -> oo ()

View File

@ -61,12 +61,12 @@ public final class TurkishLowerCaseFilter extends TokenFilter {
if (iOrAfter) { // all the special I turkish handling happens here.
switch (ch) {
// remove COMBINING_DOT_ABOVE to mimic composed lowercase
// remove COMBINING_DOT_ABOVE to mimic composed lowercase
case COMBINING_DOT_ABOVE:
length = delete(buffer, i, length);
continue;
// i itself, it depends if it is followed by COMBINING_DOT_ABOVE
// if it is, we will make it small i and later remove the dot
// i itself, it depends if it is followed by COMBINING_DOT_ABOVE
// if it is, we will make it small i and later remove the dot
case LATIN_CAPITAL_LETTER_I:
if (isBeforeDot(buffer, i + 1, length)) {
buffer[i] = LATIN_SMALL_LETTER_I;

View File

@ -901,7 +901,7 @@ class WikipediaTokenizerImpl {
positionInc = 1; /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 47:
break;
case 2:
@ -909,7 +909,7 @@ class WikipediaTokenizerImpl {
positionInc = 1;
return ALPHANUM;
}
// fall through
// fall through
case 48:
break;
case 3:
@ -920,7 +920,7 @@ class WikipediaTokenizerImpl {
yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 49:
break;
case 4:
@ -928,7 +928,7 @@ class WikipediaTokenizerImpl {
positionInc = 1;
return CJ;
}
// fall through
// fall through
case 50:
break;
case 5:
@ -936,7 +936,7 @@ class WikipediaTokenizerImpl {
positionInc = 1; /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 51:
break;
case 6:
@ -945,7 +945,7 @@ class WikipediaTokenizerImpl {
numWikiTokensSeen++;
return currentTokType;
}
// fall through
// fall through
case 52:
break;
case 7:
@ -954,7 +954,7 @@ class WikipediaTokenizerImpl {
numWikiTokensSeen++;
return currentTokType;
}
// fall through
// fall through
case 53:
break;
case 8:
@ -962,7 +962,7 @@ class WikipediaTokenizerImpl {
/* Break so we don't hit fall-through warning: */
break; /* ignore */
}
// fall through
// fall through
case 54:
break;
case 9:
@ -978,7 +978,7 @@ class WikipediaTokenizerImpl {
numLinkToks++;
return currentTokType;
}
// fall through
// fall through
case 55:
break;
case 10:
@ -988,7 +988,7 @@ class WikipediaTokenizerImpl {
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 56:
break;
case 11:
@ -997,7 +997,7 @@ class WikipediaTokenizerImpl {
yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 57:
break;
case 12:
@ -1007,7 +1007,7 @@ class WikipediaTokenizerImpl {
yybegin(STRING);
return currentTokType; /*italics*/
}
// fall through
// fall through
case 58:
break;
case 13:
@ -1017,7 +1017,7 @@ class WikipediaTokenizerImpl {
yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 59:
break;
case 14:
@ -1026,7 +1026,7 @@ class WikipediaTokenizerImpl {
numWikiTokensSeen++;
return currentTokType;
}
// fall through
// fall through
case 60:
break;
case 15:
@ -1036,7 +1036,7 @@ class WikipediaTokenizerImpl {
numWikiTokensSeen++;
return currentTokType;
}
// fall through
// fall through
case 61:
break;
case 16:
@ -1046,7 +1046,7 @@ class WikipediaTokenizerImpl {
yybegin(STRING); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 62:
break;
case 17:
@ -1055,7 +1055,7 @@ class WikipediaTokenizerImpl {
numWikiTokensSeen = 0;
return currentTokType;
}
// fall through
// fall through
case 63:
break;
case 18:
@ -1063,7 +1063,7 @@ class WikipediaTokenizerImpl {
/* Break so we don't hit fall-through warning: */
break; /* ignore STRING */
}
// fall through
// fall through
case 64:
break;
case 19:
@ -1072,7 +1072,7 @@ class WikipediaTokenizerImpl {
numWikiTokensSeen++;
return currentTokType; /* STRING ALPHANUM*/
}
// fall through
// fall through
case 65:
break;
case 20:
@ -1083,7 +1083,7 @@ class WikipediaTokenizerImpl {
yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 66:
break;
case 21:
@ -1091,7 +1091,7 @@ class WikipediaTokenizerImpl {
yybegin(STRING);
return currentTokType; /*pipe*/
}
// fall through
// fall through
case 67:
break;
case 22:
@ -1106,7 +1106,7 @@ class WikipediaTokenizerImpl {
} /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 68:
break;
case 23:
@ -1116,7 +1116,7 @@ class WikipediaTokenizerImpl {
yybegin(DOUBLE_EQUALS_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 69:
break;
case 24:
@ -1127,7 +1127,7 @@ class WikipediaTokenizerImpl {
yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 70:
break;
case 25:
@ -1138,7 +1138,7 @@ class WikipediaTokenizerImpl {
yybegin(DOUBLE_BRACE_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 71:
break;
case 26:
@ -1146,7 +1146,7 @@ class WikipediaTokenizerImpl {
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 72:
break;
case 27:
@ -1155,7 +1155,7 @@ class WikipediaTokenizerImpl {
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 73:
break;
case 28:
@ -1165,7 +1165,7 @@ class WikipediaTokenizerImpl {
yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 74:
break;
case 29:
@ -1175,7 +1175,7 @@ class WikipediaTokenizerImpl {
yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 75:
break;
case 30:
@ -1183,7 +1183,7 @@ class WikipediaTokenizerImpl {
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 76:
break;
case 31:
@ -1193,7 +1193,7 @@ class WikipediaTokenizerImpl {
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
break; /*end italics*/
}
// fall through
// fall through
case 77:
break;
case 32:
@ -1204,7 +1204,7 @@ class WikipediaTokenizerImpl {
yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 78:
break;
case 33:
@ -1212,7 +1212,7 @@ class WikipediaTokenizerImpl {
positionInc = 1;
return NUM;
}
// fall through
// fall through
case 79:
break;
case 34:
@ -1220,7 +1220,7 @@ class WikipediaTokenizerImpl {
positionInc = 1;
return COMPANY;
}
// fall through
// fall through
case 80:
break;
case 35:
@ -1228,7 +1228,7 @@ class WikipediaTokenizerImpl {
positionInc = 1;
return APOSTROPHE;
}
// fall through
// fall through
case 81:
break;
case 36:
@ -1236,7 +1236,7 @@ class WikipediaTokenizerImpl {
positionInc = 1;
return HOST;
}
// fall through
// fall through
case 82:
break;
case 37:
@ -1245,7 +1245,7 @@ class WikipediaTokenizerImpl {
yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 83:
break;
case 38:
@ -1255,7 +1255,7 @@ class WikipediaTokenizerImpl {
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
break; /*end bold*/
}
// fall through
// fall through
case 84:
break;
case 39:
@ -1265,7 +1265,7 @@ class WikipediaTokenizerImpl {
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
break; /*end sub header*/
}
// fall through
// fall through
case 85:
break;
case 40:
@ -1273,7 +1273,7 @@ class WikipediaTokenizerImpl {
positionInc = 1;
return ACRONYM;
}
// fall through
// fall through
case 86:
break;
case 41:
@ -1281,7 +1281,7 @@ class WikipediaTokenizerImpl {
positionInc = 1;
return EMAIL;
}
// fall through
// fall through
case 87:
break;
case 42:
@ -1291,7 +1291,7 @@ class WikipediaTokenizerImpl {
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
break; /*end bold italics*/
}
// fall through
// fall through
case 88:
break;
case 43:
@ -1301,7 +1301,7 @@ class WikipediaTokenizerImpl {
yybegin(EXTERNAL_LINK_STATE);
return currentTokType;
}
// fall through
// fall through
case 89:
break;
case 44:
@ -1312,7 +1312,7 @@ class WikipediaTokenizerImpl {
yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 90:
break;
case 45:
@ -1322,7 +1322,7 @@ class WikipediaTokenizerImpl {
yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 91:
break;
case 46:
@ -1333,7 +1333,7 @@ class WikipediaTokenizerImpl {
yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 92:
break;
default:

View File

@ -59,6 +59,14 @@ public class TestSpellChecking extends LuceneTestCase {
public void testRepSuggestions() throws Exception {
doTest("rep");
//noinspection DataFlowIssue
Path aff = Path.of(getClass().getResource("rep.aff").toURI());
Dictionary dictionary = TestAllDictionaries.loadDictionary(aff);
Suggester suggester = new Suggester(dictionary);
assertEquals(List.of("auto's"), suggester.suggestNoTimeout("autos", () -> {}));
assertEquals(
List.of("auto's", "auto"), suggester.proceedPastRep().suggestNoTimeout("autos", () -> {}));
}
public void testPhSuggestions() throws Exception {

View File

@ -245,7 +245,7 @@ public class Diff {
deletes++;
x--;
break;
// delete
// delete
case Y:
if (deletes != base) {
result.append('D').append(deletes);
@ -258,7 +258,7 @@ public class Diff {
result.append('I');
result.append(b.charAt(--y));
break;
// insert
// insert
case R:
if (deletes != base) {
result.append('D').append(deletes);
@ -272,7 +272,7 @@ public class Diff {
result.append(b.charAt(--y));
x--;
break;
// replace
// replace
case D:
if (deletes != base) {
result.append('D').append(deletes);

View File

@ -0,0 +1,4 @@
{
"lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/ForUtil.java": "f31797842f047626df6a1a6b97167bec60269fec",
"lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/gen_ForUtil.py": "325f2610974b0e76e278b6445405a098a3763feb"
}

View File

@ -35,6 +35,7 @@ module org.apache.lucene.backward_codecs {
exports org.apache.lucene.backward_codecs.lucene92;
exports org.apache.lucene.backward_codecs.lucene94;
exports org.apache.lucene.backward_codecs.lucene95;
exports org.apache.lucene.backward_codecs.lucene99;
exports org.apache.lucene.backward_codecs.packed;
exports org.apache.lucene.backward_codecs.store;
@ -43,7 +44,8 @@ module org.apache.lucene.backward_codecs {
provides org.apache.lucene.codecs.PostingsFormat with
org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat,
org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat,
org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat;
org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat,
org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat;
provides org.apache.lucene.codecs.KnnVectorsFormat with
org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat,
org.apache.lucene.backward_codecs.lucene91.Lucene91HnswVectorsFormat,
@ -59,5 +61,6 @@ module org.apache.lucene.backward_codecs {
org.apache.lucene.backward_codecs.lucene91.Lucene91Codec,
org.apache.lucene.backward_codecs.lucene92.Lucene92Codec,
org.apache.lucene.backward_codecs.lucene94.Lucene94Codec,
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec;
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec,
org.apache.lucene.backward_codecs.lucene99.Lucene99Codec;
}

View File

@ -88,21 +88,17 @@ public final class FieldReader extends Terms {
(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong()
>>> Lucene40BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
// Initialize FST always off-heap.
final IndexInput clone = indexIn.clone();
clone.seek(indexStartFP);
final FST.FSTMetadata<BytesRef> fstMetadata;
if (metaIn == indexIn) { // Only true before Lucene 8.6
index =
new FST<>(
readMetadata(clone, ByteSequenceOutputs.getSingleton()),
clone,
new OffHeapFSTStore());
final IndexInput clone = indexIn.clone();
clone.seek(indexStartFP);
fstMetadata = readMetadata(clone, ByteSequenceOutputs.getSingleton());
// FST bytes actually only start after the metadata.
indexStartFP = clone.getFilePointer();
} else {
index =
new FST<>(
readMetadata(metaIn, ByteSequenceOutputs.getSingleton()),
clone,
new OffHeapFSTStore());
fstMetadata = readMetadata(metaIn, ByteSequenceOutputs.getSingleton());
}
index = FST.fromFSTReader(fstMetadata, new OffHeapFSTStore(indexIn, indexStartFP, fstMetadata));
/*
if (false) {
final String dotFileName = segment + "_" + fieldInfo.name + ".dot";

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException;
import org.apache.lucene.store.DataInput;

View File

@ -16,7 +16,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException;
import org.apache.lucene.store.DataInput;

View File

@ -14,12 +14,33 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import java.util.Objects;
import org.apache.lucene.codecs.*;
import org.apache.lucene.codecs.lucene90.*;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat;
import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
@ -98,7 +119,7 @@ public class Lucene99Codec extends Codec {
super("Lucene99");
this.storedFieldsFormat =
new Lucene90StoredFieldsFormat(Objects.requireNonNull(mode).storedMode);
this.defaultPostingsFormat = new Lucene99PostingsFormat();
this.defaultPostingsFormat = new Lucene912PostingsFormat();
this.defaultDVFormat = new Lucene90DocValuesFormat();
this.defaultKnnVectorsFormat = new Lucene99HnswVectorsFormat();
}

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
@ -24,7 +24,6 @@ import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.MultiLevelSkipListWriter;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
import org.apache.lucene.index.IndexOptions;
@ -339,7 +338,7 @@ import org.apache.lucene.util.packed.PackedInts;
*
* @lucene.experimental
*/
public final class Lucene99PostingsFormat extends PostingsFormat {
public class Lucene99PostingsFormat extends PostingsFormat {
/**
* Filename extension for document number, frequencies, and skip data. See chapter: <a
@ -374,28 +373,9 @@ public final class Lucene99PostingsFormat extends PostingsFormat {
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
private final int minTermBlockSize;
private final int maxTermBlockSize;
/** Creates {@code Lucene99PostingsFormat} with default settings. */
public Lucene99PostingsFormat() {
this(
Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
}
/**
* Creates {@code Lucene99PostingsFormat} with custom values for {@code minBlockSize} and {@code
* maxBlockSize} passed to block terms dictionary.
*
* @see
* Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)
*/
public Lucene99PostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
super("Lucene99");
Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize);
this.minTermBlockSize = minTermBlockSize;
this.maxTermBlockSize = maxTermBlockSize;
}
@Override
@ -405,19 +385,7 @@ public final class Lucene99PostingsFormat extends PostingsFormat {
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
boolean success = false;
try {
FieldsConsumer ret =
new Lucene90BlockTreeTermsWriter(
state, postingsWriter, minTermBlockSize, maxTermBlockSize);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsWriter);
}
}
throw new UnsupportedOperationException();
}
@Override

View File

@ -14,23 +14,23 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import static org.apache.lucene.codecs.lucene99.ForUtil.BLOCK_SIZE;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.DOC_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.MAX_SKIP_LEVELS;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.PAY_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.POS_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.TERMS_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.VERSION_CURRENT;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.VERSION_START;
import static org.apache.lucene.backward_codecs.lucene99.ForUtil.BLOCK_SIZE;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.DOC_CODEC;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.MAX_SKIP_LEVELS;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.PAY_CODEC;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.POS_CODEC;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.TERMS_CODEC;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.VERSION_CURRENT;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.VERSION_START;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Impacts;
import org.apache.lucene.index.ImpactsEnum;

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException;
import java.util.AbstractList;

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException;
import java.util.Arrays;
@ -61,6 +61,7 @@ public class Lucene99SkipReader extends MultiLevelSkipListReader {
private long lastDocPointer;
private int lastPosBufferUpto;
/** Sole constructor. */
public Lucene99SkipReader(
IndexInput skipStream,
int maxSkipLevels,
@ -98,6 +99,7 @@ public class Lucene99SkipReader extends MultiLevelSkipListReader {
return df % ForUtil.BLOCK_SIZE == 0 ? df - 1 : df;
}
/** Initialize state. */
public void init(
long skipPointer, long docBasePointer, long posBasePointer, long payBasePointer, int df)
throws IOException {
@ -125,22 +127,27 @@ public class Lucene99SkipReader extends MultiLevelSkipListReader {
return lastDocPointer;
}
/** Returns the pointer in the pos file. */
public long getPosPointer() {
return lastPosPointer;
}
/** Return the start offset in the position block. */
public int getPosBufferUpto() {
return lastPosBufferUpto;
}
/** Returns the pointer in the pay file. */
public long getPayPointer() {
return lastPayPointer;
}
/** Return the number of bytes in the pay block that belongs to docs from the previous block. */
public int getPayloadByteUpto() {
return lastPayloadByteUpto;
}
/** Return the next skip doc, no skipping can be performed until this doc. */
public int getNextSkipDoc() {
return skipDoc[0];
}
@ -199,7 +206,7 @@ public class Lucene99SkipReader extends MultiLevelSkipListReader {
return delta;
}
// The default impl skips impacts
/** Read impacts. The default implementation skips them. */
protected void readImpacts(int level, IndexInput skipStream) throws IOException {
skipStream.skipBytes(skipStream.readVInt());
}

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException;
import java.util.Arrays;
@ -46,10 +46,10 @@ import org.apache.lucene.store.IndexOutput;
* uptos(position, payload). 4. start offset.
*/
public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
private int[] lastSkipDoc;
private long[] lastSkipDocPointer;
private long[] lastSkipPosPointer;
private long[] lastSkipPayPointer;
private final int[] lastSkipDoc;
private final long[] lastSkipDocPointer;
private final long[] lastSkipPosPointer;
private final long[] lastSkipPayPointer;
private final IndexOutput docOut;
private final IndexOutput posOut;
@ -61,11 +61,12 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
private long curPayPointer;
private int curPosBufferUpto;
private int curPayloadByteUpto;
private CompetitiveImpactAccumulator[] curCompetitiveFreqNorms;
private final CompetitiveImpactAccumulator[] curCompetitiveFreqNorms;
private boolean fieldHasPositions;
private boolean fieldHasOffsets;
private boolean fieldHasPayloads;
/** Sole constructor. */
public Lucene99SkipWriter(
int maxSkipLevels,
int blockSize,
@ -84,7 +85,12 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
lastSkipPosPointer = new long[maxSkipLevels];
if (payOut != null) {
lastSkipPayPointer = new long[maxSkipLevels];
} else {
lastSkipPayPointer = null;
}
} else {
lastSkipPosPointer = null;
lastSkipPayPointer = null;
}
curCompetitiveFreqNorms = new CompetitiveImpactAccumulator[maxSkipLevels];
for (int i = 0; i < maxSkipLevels; ++i) {
@ -92,6 +98,7 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
}
}
/** Reset state for the given index options. */
public void setField(
boolean fieldHasPositions, boolean fieldHasOffsets, boolean fieldHasPayloads) {
this.fieldHasPositions = fieldHasPositions;
@ -211,6 +218,7 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
competitiveFreqNorms.clear();
}
/** Write impacts to the given output. */
public static void writeImpacts(CompetitiveImpactAccumulator acc, DataOutput out)
throws IOException {
Collection<Impact> impacts = acc.getCompetitiveFreqNormPairs();

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException;
import java.util.Arrays;

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException;
import org.apache.lucene.store.IndexInput;

View File

@ -40,7 +40,7 @@ HEADER = """// This file has been automatically generated, DO NOT EDIT
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException;
import org.apache.lucene.store.DataInput;

View File

@ -0,0 +1,428 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Lucene 9.9 file format.
*
* <h2>Apache Lucene - Index File Formats</h2>
*
* <div>
*
* <ul>
* <li><a href="#Introduction">Introduction</a>
* <li><a href="#Definitions">Definitions</a>
* <ul>
* <li><a href="#Inverted_Indexing">Inverted Indexing</a>
* <li><a href="#Types_of_Fields">Types of Fields</a>
* <li><a href="#Segments">Segments</a>
* <li><a href="#Document_Numbers">Document Numbers</a>
* </ul>
* <li><a href="#Overview">Index Structure Overview</a>
* <li><a href="#File_Naming">File Naming</a>
* <li><a href="#file-names">Summary of File Extensions</a>
* <ul>
* <li><a href="#Lock_File">Lock File</a>
* <li><a href="#History">History</a>
* <li><a href="#Limitations">Limitations</a>
* </ul>
* </ul>
*
* </div> <a id="Introduction"></a>
*
* <h3>Introduction</h3>
*
* <div>
*
* <p>This document defines the index file formats used in this version of Lucene. If you are using
* a different version of Lucene, please consult the copy of <code>docs/</code> that was distributed
* with the version you are using.
*
* <p>This document attempts to provide a high-level definition of the Apache Lucene file formats.
* </div> <a id="Definitions"></a>
*
* <h3>Definitions</h3>
*
* <div>
*
* <p>The fundamental concepts in Lucene are index, document, field and term.
*
* <p>An index contains a sequence of documents.
*
* <ul>
* <li>A document is a sequence of fields.
* <li>A field is a named sequence of terms.
* <li>A term is a sequence of bytes.
* </ul>
*
* <p>The same sequence of bytes in two different fields is considered a different term. Thus terms
* are represented as a pair: the string naming the field, and the bytes within the field. <a
* id="Inverted_Indexing"></a>
*
* <h4>Inverted Indexing</h4>
*
* <p>Lucene's index stores terms and statistics about those terms in order to make term-based
* search more efficient. Lucene's terms index falls into the family of indexes known as an
* <i>inverted index.</i> This is because it can list, for a term, the documents that contain it.
* This is the inverse of the natural relationship, in which documents list terms. <a
* id="Types_of_Fields"></a>
*
* <h4>Types of Fields</h4>
*
* <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored in the index
* literally, in a non-inverted manner. Fields that are inverted are called <i>indexed</i>. A field
* may be both stored and indexed.
*
* <p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the text of a field
* may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is
* useful for certain identifier fields to be indexed literally.
*
* <p>See the {@link org.apache.lucene.document.Field Field} java docs for more information on
* Fields. <a id="Segments"></a>
*
* <h4>Segments</h4>
*
* <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>. Each segment is a
* fully independent index, which could be searched separately. Indexes evolve by:
*
* <ol>
* <li>Creating new segments for newly added documents.
* <li>Merging existing segments.
* </ol>
*
* <p>Searches may involve multiple segments and/or multiple indexes, each index potentially
* composed of a set of segments. <a id="Document_Numbers"></a>
*
* <h4>Document Numbers</h4>
*
* <p>Internally, Lucene refers to documents by an integer <i>document number</i>. The first
* document added to an index is numbered zero, and each subsequent document added gets a number one
* greater than the previous.
*
* <p>Note that a document's number may change, so caution should be taken when storing these
* numbers outside of Lucene. In particular, numbers may change in the following situations:
*
* <ul>
* <li>
* <p>The numbers stored in each segment are unique only within the segment, and must be
* converted before they can be used in a larger context. The standard technique is to
* allocate each segment a range of values, based on the range of numbers used in that
* segment. To convert a document number from a segment to an external value, the segment's
* <i>base</i> document number is added. To convert an external value back to a
* segment-specific value, the segment is identified by the range that the external value is
*     in, and the segment's base value is subtracted. For example, two five-document segments
*     might be combined, so that the first segment has a base value of zero, and the second of
*     five. Document three from the second segment would have an external value of eight (a
*     small sketch of this conversion follows this list).
* <li>
* <p>When documents are deleted, gaps are created in the numbering. These are eventually
* removed as the index evolves through merging. Deleted documents are dropped when segments
* are merged. A freshly-merged segment thus has no gaps in its numbering.
* </ul>
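*
* <p>For illustration only (a minimal sketch, not a Lucene API; {@code docBase} is assumed to be
* the segment's base document number described above):
*
* <pre>{@code
* // hypothetical helpers converting between segment-local and index-wide document numbers
* static int toIndexWideDocId(int segmentDocId, int docBase) {
*   return docBase + segmentDocId; // e.g. docBase=5, segmentDocId=3 -> 8
* }
*
* static int toSegmentDocId(int indexWideDocId, int docBase) {
*   return indexWideDocId - docBase; // e.g. docBase=5, indexWideDocId=8 -> 3
* }
* }</pre>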
*
* </div> <a id="Overview"></a>
*
* <h3>Index Structure Overview</h3>
*
* <div>
*
* <p>Each segment index maintains the following:
*
* <ul>
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment info}. This
* contains metadata about a segment, such as the number of documents, what files it uses, and
* information about how the segment is sorted
* <li>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This
* contains metadata about the set of named fields used in the index.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}.
* This contains, for each document, a list of attribute-value pairs, where the attributes are
* field names. These are used to store auxiliary information about the document, such as its
*     title, url, or an identifier to access a database. The set of stored fields is what is
* returned for each hit when searching. This is keyed by document number.
* <li>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term dictionary}.
* A dictionary containing all of the terms used in all of the indexed fields of all of the
* documents. The dictionary also contains the number of documents which contain the term, and
* pointers to the term's frequency and proximity data.
* <li>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term Frequency
* data}. For each term in the dictionary, the numbers of all the documents that contain that
* term, and the frequency of the term in that document, unless frequencies are omitted
* ({@link org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
* <li>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term Proximity
* data}. For each term in the dictionary, the positions that the term occurs in each
* document. Note that this will not exist if all fields in all documents omit position data.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For
* each field in each document, a value is stored that is multiplied into the score for hits
* on that field.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each
* field in each document, the term vector (sometimes called document vector) may be stored. A
* term vector consists of term text and term frequency. To add Term Vectors to your index see
* the {@link org.apache.lucene.document.Field Field} constructors
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like
* stored values, these are also keyed by document number, but are generally intended to be
* loaded into main memory for fast access. Whereas stored values are generally intended for
* summary results from searches, per-document values are useful for things like scoring
* factors.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An
* optional file indicating which documents are live.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair
* of files, recording dimensionally indexed fields, to enable fast numeric range filtering
* and large numeric values like BigInteger and BigDecimal (1D) and geographic shape
* intersection (2D, 3D).
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}. The
* vector format stores numeric vectors in a format optimized for random access and
* computation, supporting high-dimensional nearest-neighbor search.
* </ul>
*
* <p>Details on each of these are provided in their linked pages. </div> <a id="File_Naming"></a>
*
* <h3>File Naming</h3>
*
* <div>
*
* <p>All files belonging to a segment have the same name with varying extensions. The extensions
* correspond to the different file formats described below. When using the Compound File format
* (default for small segments), these files (except for the Segment info file, the Lock file, and
* the Deleted documents file) are collapsed into a single .cfs file (see below for details).
*
* <p>Typically, all segments in an index are stored in a single directory, although this is not
* required.
*
* <p>File names are never re-used. That is, when any file is saved to the Directory it is given a
* never-before-used filename. This is achieved using a simple generations approach. For example,
* the first segments file is segments_1, then segments_2, etc. The generation is a sequential long
* integer represented in alpha-numeric (base 36) form. </div> <a id="file-names"></a>
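*
* <p>As an illustration only (a sketch, not the actual implementation), the base-36 generation
* suffix can be produced with {@code Long.toString(generation, Character.MAX_RADIX)}:
*
* <pre>{@code
* // hypothetical helper: generation 1 -> "segments_1", 35 -> "segments_z", 36 -> "segments_10"
* static String segmentsFileName(long generation) {
*   return "segments_" + Long.toString(generation, Character.MAX_RADIX); // MAX_RADIX == 36
* }
* }</pre>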
*
* <h3>Summary of File Extensions</h3>
*
* <div>
*
* <p>The following table summarizes the names and extensions of the files in Lucene:
*
* <table class="padding4" style="border-spacing: 1px; border-collapse: separate">
* <caption>lucene filenames by extension</caption>
* <tr>
* <th>Name</th>
* <th>Extension</th>
* <th>Brief Description</th>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
* <td>segments_N</td>
* <td>Stores information about a commit point</td>
* </tr>
* <tr>
* <td><a href="#Lock_File">Lock File</a></td>
* <td>write.lock</td>
* <td>The Write lock prevents multiple IndexWriters from writing to the same
* file.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment Info}</td>
* <td>.si</td>
* <td>Stores metadata about a segment</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File}</td>
* <td>.cfs, .cfe</td>
* <td>An optional "virtual" file consisting of all the other index files for
* systems that frequently run out of file handles.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields}</td>
* <td>.fnm</td>
* <td>Stores information about the fields</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index}</td>
* <td>.fdx</td>
* <td>Contains pointers to field data</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data}</td>
* <td>.fdt</td>
* <td>The stored fields for documents</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term Dictionary}</td>
* <td>.tim</td>
* <td>The term dictionary, stores term info</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term Index}</td>
* <td>.tip</td>
* <td>The index into the Term Dictionary</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Frequencies}</td>
* <td>.doc</td>
* <td>Contains the list of docs which contain each term along with frequency</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Positions}</td>
* <td>.pos</td>
* <td>Stores position information about where a term occurs in the index</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Payloads}</td>
* <td>.pay</td>
* <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms}</td>
* <td>.nvd, .nvm</td>
* <td>Encodes length and boost factors for docs and fields</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values}</td>
* <td>.dvd, .dvm</td>
* <td>Encodes additional scoring factors or other per-document information.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}</td>
* <td>.tvx</td>
* <td>Stores offset into the document data file</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}</td>
* <td>.tvd</td>
* <td>Contains term vector data.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents}</td>
* <td>.liv</td>
* <td>Info about what documents are live</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}</td>
* <td>.dii, .dim</td>
* <td>Holds indexed points</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}</td>
* <td>.vec, .vem, .veq, .vex</td>
* <td>Holds indexed vectors; <code>.vec</code> files contain the raw vector data,
* <code>.vem</code> the vector metadata, <code>.veq</code> the quantized vector data, and <code>.vex</code> the
* hnsw graph data.</td>
* </tr>
* </table>
*
* </div> <a id="Lock_File"></a>
*
* <h3>Lock File</h3>
*
* The write lock, which is stored in the index directory by default, is named "write.lock". If the
* lock directory is different from the index directory then the write lock will be named
* "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index
* directory. When this file is present, a writer is currently modifying the index (adding or
* removing documents). This lock file ensures that only one writer is modifying the index at a
* time. <a id="History"></a>
*
* <h3>History</h3>
*
* <p>Compatibility notes are provided in this document, describing how file formats have changed
* from prior versions:
*
* <ul>
* <li>In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit
* lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching
* or adding/deleting of docs. When the new segments file is saved (committed), it will be
* written in the new file format (meaning no specific "upgrade" process is needed). But note
* that once a commit has occurred, pre-2.1 Lucene will not be able to read the index.
* <li>In version 2.3, the file format was changed to allow segments to share a single set of doc
* store (vectors &amp; stored fields) files. This allows for faster indexing in certain
* cases. The change is fully backwards compatible (in the same way as the lock-less commits
* change in 2.1).
* <li>In version 2.4, Strings are now written as a true UTF-8 byte sequence, not Java's modified
* UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">LUCENE-510</a> for
* details.
* <li>In version 2.9, an optional opaque Map&lt;String,String&gt; CommitUserData may be passed to
* IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N
* file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">LUCENE-1382</a> for
* details. Also, diagnostics were added to each segment written recording details about why
* it was written (due to flush, merge; which OS/JRE was used; etc.). See issue <a
* href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.
* <li>In version 3.0, compressed fields are no longer written to the index (they can still be
* read, but on merge the new segment will write them, uncompressed). See issue <a
* href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a> for details.
* <li>In version 3.1, segments record the code version that created them. See <a
*     href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
*     Additionally, segments track explicitly whether or not they have term vectors. See <a
* href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a> for details.
* <li>In version 3.2, numeric fields are written natively to the stored fields file; previously
*     they were stored in text format only.
* <li>In version 3.4, fields can omit position data while still indexing term frequencies.
* <li>In version 4.0, the format of the inverted index became extensible via the {@link
* org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues})
* was introduced. Normalization factors need no longer be a single byte, they can be any
* {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be
* unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into
* the postings lists. Payloads can be stored in the term vectors.
* <li>In version 4.1, the format of the postings list changed to use either FOR compression or
*     variable-byte encoding, depending upon the frequency of the term. Terms appearing only once
*     are now inlined directly into the term dictionary. Stored fields are compressed by
* default.
* <li>In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued
* type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields.
* <li>In version 4.5, DocValues were extended to explicitly represent missing values.
* <li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
* allow updating NumericDocValues fields.
* <li>In version 4.8, checksum footers were added to the end of each index file for improved data
* integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32
*     checksum of the file (a small verification sketch follows this list).
* <li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is
* suitable for faceting/sorting/analytics.
* <li>In version 5.4, DocValues have been improved to store more information on disk: addresses
* for binary fields and ord indexes for multi-valued fields.
* <li>In version 6.0, Points were added, for multi-dimensional range/distance search.
* <li>In version 6.2, a new segment info format was added that reads/writes the index sort, to
*     support index sorting.
* <li>In version 7.0, DocValues have been improved to better support sparse doc values thanks to
* an iterator API.
* <li>In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term
* freq, normalization factor) pairs that may trigger the maximum score of the block. This
* information is recorded alongside skip data in order to be able to skip blocks of doc ids
*     if they may not produce high enough scores. Additionally, doc values and norms have been
* extended with jump-tables to make access O(1) instead of O(n), where n is the number of
* elements to skip when advancing in the data.
* <li>In version 8.4, postings, positions, offsets and payload lengths have moved to a more
* performant encoding that is vectorized.
* <li>In version 8.6, index sort serialization is delegated to the sorts themselves, to allow
*     user-defined sorts to be used.
* <li>In version 8.7, stored fields compression became adaptive to better handle documents with
* smaller stored fields.
* <li>In version 9.0, vector-valued fields were added.
* <li>In version 9.1, vector-valued fields were modified to add a graph hierarchy.
* <li>In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by
*     IndexDISI. ordToDoc mappings were added to .vem.
* <li>In version 9.5, HNSW graph connections were changed to be delta-encoded with vints.
* Additionally, metadata file size improvements were made by delta-encoding nodes by graph
* layer and not writing the node ids for the zeroth layer.
* <li>In version 9.9, vector scalar quantization support was added, allowing the HNSW vector
*     format to utilize int8-quantized vectors for float32 vector search.
* </ul>
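*
* <p>For illustration only (a sketch of the 4.8-era footer layout mentioned above, assuming the
* last 8 bytes hold the big-endian zlib-crc32 value; this is not the CodecUtil API):
*
* <pre>{@code
* // hypothetical check over a fully-read file's bytes
* static boolean checksumMatches(byte[] file) {
*   long stored = java.nio.ByteBuffer.wrap(file, file.length - 8, 8).getLong();
*   java.util.zip.CRC32 crc = new java.util.zip.CRC32();
*   crc.update(file, 0, file.length - 8);
*   return crc.getValue() == stored;
* }
* }</pre>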
*
* <a id="Limitations"></a>
*
* <h3>Limitations</h3>
*
* <div>
*
* <p>Lucene uses a Java <code>int</code> to refer to document numbers, and the index file format
* uses an <code>Int32</code> on-disk to store document numbers. This is a limitation of both the
* index file format and the current implementation. Eventually these should be replaced with either
* <code>UInt64</code> values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
* VInt} values which have no limit. </div>
*/
package org.apache.lucene.backward_codecs.lucene99;

View File

@ -22,3 +22,4 @@ org.apache.lucene.backward_codecs.lucene91.Lucene91Codec
org.apache.lucene.backward_codecs.lucene92.Lucene92Codec
org.apache.lucene.backward_codecs.lucene94.Lucene94Codec
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec
org.apache.lucene.backward_codecs.lucene99.Lucene99Codec

View File

@ -16,3 +16,4 @@
org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat
org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat
org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat
org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat

View File

@ -17,7 +17,7 @@
package org.apache.lucene.backward_codecs.lucene50;
import java.io.IOException;
import org.apache.lucene.backward_codecs.lucene40.blocktree.Lucene40BlockTreeTermsWriter;
import org.apache.lucene.backward_codecs.lucene40.blocktree.Lucene40BlockTreeTermsWriterV5;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.SegmentWriteState;
@ -31,11 +31,11 @@ public class Lucene50RWPostingsFormat extends Lucene50PostingsFormat {
boolean success = false;
try {
FieldsConsumer ret =
new Lucene40BlockTreeTermsWriter(
new Lucene40BlockTreeTermsWriterV5(
state,
postingsWriter,
Lucene40BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
Lucene40BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
Lucene40BlockTreeTermsWriterV5.DEFAULT_MIN_BLOCK_SIZE,
Lucene40BlockTreeTermsWriterV5.DEFAULT_MAX_BLOCK_SIZE);
success = true;
return ret;
} finally {

View File

@ -642,13 +642,13 @@ public class BKDWriter60 implements Closeable {
throws IOException {
assert docMaps == null || readers.size() == docMaps.size();
BKDMergeQueue queue = new BKDMergeQueue(config.bytesPerDim, readers.size());
BKDMergeQueue queue = new BKDMergeQueue(config.bytesPerDim(), readers.size());
for (int i = 0; i < readers.size(); i++) {
PointValues pointValues = readers.get(i);
assert pointValues.getNumDimensions() == config.numDims
&& pointValues.getBytesPerDimension() == config.bytesPerDim
&& pointValues.getNumIndexDimensions() == config.numIndexDims;
assert pointValues.getNumDimensions() == config.numDims()
&& pointValues.getBytesPerDimension() == config.bytesPerDim()
&& pointValues.getNumIndexDimensions() == config.numIndexDims();
MergeState.DocMap docMap;
if (docMaps == null) {
docMap = null;

View File

@ -23,12 +23,11 @@ import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.backward_codecs.lucene90.Lucene90ScoreSkipReader.MutableImpactList;
import org.apache.lucene.backward_codecs.lucene99.Lucene99SkipWriter;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
import org.apache.lucene.codecs.lucene90.blocktree.FieldReader;
import org.apache.lucene.codecs.lucene90.blocktree.Stats;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99SkipWriter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DirectoryReader;
@ -77,22 +76,6 @@ public class TestLucene90PostingsFormat extends BasePostingsFormatTestCase {
d.close();
}
private void shouldFail(int minItemsInBlock, int maxItemsInBlock) {
expectThrows(
IllegalArgumentException.class,
() -> {
new Lucene99PostingsFormat(minItemsInBlock, maxItemsInBlock);
});
}
public void testInvalidBlockSizes() throws Exception {
shouldFail(0, 0);
shouldFail(10, 8);
shouldFail(-1, 10);
shouldFail(10, -1);
shouldFail(10, 12);
}
public void testImpactSerialization() throws IOException {
// omit norms and omit freqs
doTestImpactSerialization(Collections.singletonList(new Impact(1, 1L)));

View File

@ -388,10 +388,14 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
// write the vector data to a temporary file
DocsWithFieldSet docsWithField =
switch (fieldInfo.getVectorEncoding()) {
case BYTE -> writeByteVectorData(
tempVectorData, MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
case FLOAT32 -> writeVectorData(
tempVectorData, MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
case BYTE ->
writeByteVectorData(
tempVectorData,
MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
case FLOAT32 ->
writeVectorData(
tempVectorData,
MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
};
CodecUtil.writeFooter(tempVectorData);
IOUtils.close(tempVectorData);
@ -638,18 +642,20 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
throws IOException {
int dim = fieldInfo.getVectorDimension();
return switch (fieldInfo.getVectorEncoding()) {
case BYTE -> new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
@Override
public byte[] copyValue(byte[] value) {
return ArrayUtil.copyOfSubArray(value, 0, dim);
}
};
case FLOAT32 -> new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
@Override
public float[] copyValue(float[] value) {
return ArrayUtil.copyOfSubArray(value, 0, dim);
}
};
case BYTE ->
new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
@Override
public byte[] copyValue(byte[] value) {
return ArrayUtil.copyOfSubArray(value, 0, dim);
}
};
case FLOAT32 ->
new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
@Override
public float[] copyValue(float[] value) {
return ArrayUtil.copyOfSubArray(value, 0, dim);
}
};
};
}
@ -663,12 +669,14 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer();
RandomVectorScorerSupplier scorerSupplier =
switch (fieldInfo.getVectorEncoding()) {
case BYTE -> defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
case FLOAT32 -> defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
case BYTE ->
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
case FLOAT32 ->
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
};
hnswGraphBuilder =
HnswGraphBuilder.create(scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed);
@ -693,9 +701,9 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
lastDocID = docID;
}
OnHeapHnswGraph getGraph() {
OnHeapHnswGraph getGraph() throws IOException {
if (vectors.size() > 0) {
return hnswGraphBuilder.getGraph();
return hnswGraphBuilder.getCompletedGraph();
} else {
return null;
}

View File

@ -414,10 +414,14 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
// write the vector data to a temporary file
DocsWithFieldSet docsWithField =
switch (fieldInfo.getVectorEncoding()) {
case BYTE -> writeByteVectorData(
tempVectorData, MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
case FLOAT32 -> writeVectorData(
tempVectorData, MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
case BYTE ->
writeByteVectorData(
tempVectorData,
MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
case FLOAT32 ->
writeVectorData(
tempVectorData,
MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
};
CodecUtil.writeFooter(tempVectorData);
IOUtils.close(tempVectorData);
@ -477,10 +481,12 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
}
DocIdSetIterator mergedVectorIterator = null;
switch (fieldInfo.getVectorEncoding()) {
case BYTE -> mergedVectorIterator =
KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState);
case FLOAT32 -> mergedVectorIterator =
KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState);
case BYTE ->
mergedVectorIterator =
KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState);
case FLOAT32 ->
mergedVectorIterator =
KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState);
}
graph =
merger.merge(
@ -680,18 +686,20 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
throws IOException {
int dim = fieldInfo.getVectorDimension();
return switch (fieldInfo.getVectorEncoding()) {
case BYTE -> new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
@Override
public byte[] copyValue(byte[] value) {
return ArrayUtil.copyOfSubArray(value, 0, dim);
}
};
case FLOAT32 -> new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
@Override
public float[] copyValue(float[] value) {
return ArrayUtil.copyOfSubArray(value, 0, dim);
}
};
case BYTE ->
new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
@Override
public byte[] copyValue(byte[] value) {
return ArrayUtil.copyOfSubArray(value, 0, dim);
}
};
case FLOAT32 ->
new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
@Override
public float[] copyValue(float[] value) {
return ArrayUtil.copyOfSubArray(value, 0, dim);
}
};
};
}
@ -704,12 +712,14 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
vectors = new ArrayList<>();
RandomVectorScorerSupplier scorerSupplier =
switch (fieldInfo.getVectorEncoding()) {
case BYTE -> defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
case FLOAT32 -> defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
case BYTE ->
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
case FLOAT32 ->
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
};
hnswGraphBuilder =
HnswGraphBuilder.create(scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed);
@ -732,9 +742,9 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
lastDocID = docID;
}
OnHeapHnswGraph getGraph() {
OnHeapHnswGraph getGraph() throws IOException {
if (vectors.size() > 0) {
return hnswGraphBuilder.getGraph();
return hnswGraphBuilder.getCompletedGraph();
} else {
return null;
}

View File

@ -14,22 +14,22 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import static org.apache.lucene.codecs.lucene99.ForUtil.BLOCK_SIZE;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.DOC_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.MAX_SKIP_LEVELS;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.PAY_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.POS_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.TERMS_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.VERSION_CURRENT;
import static org.apache.lucene.backward_codecs.lucene99.ForUtil.BLOCK_SIZE;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.DOC_CODEC;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.MAX_SKIP_LEVELS;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.PAY_CODEC;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.POS_CODEC;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.TERMS_CODEC;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.VERSION_CURRENT;
import java.io.IOException;
import org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
import org.apache.lucene.codecs.PushPostingsWriterBase;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;

View File

@ -0,0 +1,68 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
public class Lucene99RWPostingsFormat extends Lucene99PostingsFormat {
private final int minTermBlockSize;
private final int maxTermBlockSize;
/** Creates {@code Lucene99PostingsFormat} with default settings. */
public Lucene99RWPostingsFormat() {
this(
Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
}
/**
* Creates {@code Lucene99PostingsFormat} with custom values for {@code minBlockSize} and {@code
* maxBlockSize} passed to block terms dictionary.
*
* @see
* Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)
*/
public Lucene99RWPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
super();
Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize);
this.minTermBlockSize = minTermBlockSize;
this.maxTermBlockSize = maxTermBlockSize;
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
boolean success = false;
try {
FieldsConsumer ret =
new Lucene90BlockTreeTermsWriter(
state, postingsWriter, minTermBlockSize, maxTermBlockSize);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsWriter);
}
}
}
}

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import java.io.IOException;

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import java.io.IOException;

View File

@ -19,7 +19,6 @@ package org.apache.lucene.backward_codecs.lucene99;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase;
public class TestLucene99HnswScalarQuantizedVectorsFormat extends BaseKnnVectorsFormatTestCase {

View File

@ -14,22 +14,26 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import static org.apache.lucene.codecs.lucene99.Lucene99ScoreSkipReader.readImpacts;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99ScoreSkipReader.readImpacts;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.backward_codecs.lucene99.Lucene99ScoreSkipReader.MutableImpactList;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
import org.apache.lucene.codecs.lucene90.blocktree.FieldReader;
import org.apache.lucene.codecs.lucene90.blocktree.Stats;
import org.apache.lucene.codecs.lucene99.Lucene99ScoreSkipReader.MutableImpactList;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Impact;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@ -41,7 +45,7 @@ import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.BytesRef;
public class TestLucene99PostingsFormat extends BasePostingsFormatTestCase {
private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene99PostingsFormat());
private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene99RWPostingsFormat());
@Override
protected Codec getCodec() {
@ -77,7 +81,7 @@ public class TestLucene99PostingsFormat extends BasePostingsFormatTestCase {
expectThrows(
IllegalArgumentException.class,
() -> {
new Lucene99PostingsFormat(minItemsInBlock, maxItemsInBlock);
new Lucene99RWPostingsFormat(minItemsInBlock, maxItemsInBlock);
});
}

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import java.io.IOException;

View File

@ -0,0 +1,49 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.tests.util.LuceneTestCase;
public class TestPostingsUtil extends LuceneTestCase {
// checks for bug described in https://github.com/apache/lucene/issues/13373
public void testIntegerOverflow() throws IOException {
final int size = random().nextInt(1, ForUtil.BLOCK_SIZE);
final long[] docDeltaBuffer = new long[size];
final long[] freqBuffer = new long[size];
final int delta = 1 << 30;
docDeltaBuffer[0] = delta;
try (Directory dir = newDirectory()) {
try (IndexOutput out = dir.createOutput("test", IOContext.DEFAULT)) {
// In the old implementation, this would cause an integer overflow exception.
PostingsUtil.writeVIntBlock(out, docDeltaBuffer, freqBuffer, size, true);
}
long[] restoredDocs = new long[size];
long[] restoredFreqs = new long[size];
try (IndexInput in = dir.openInput("test", IOContext.DEFAULT)) {
PostingsUtil.readVIntBlock(in, restoredDocs, restoredFreqs, size, true, true);
}
assertEquals(delta, restoredDocs[0]);
}
}
}

View File

@ -196,6 +196,7 @@ public class TestAncientIndicesCompatibility extends LuceneTestCase {
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
CheckIndex checker = new CheckIndex(dir);
checker.setInfoStream(new PrintStream(bos, false, UTF_8));
checker.setLevel(CheckIndex.Level.MIN_LEVEL_FOR_INTEGRITY_CHECKS);
CheckIndex.Status indexStatus = checker.checkIndex();
if (version.startsWith("8.")) {
assertTrue(indexStatus.clean);

View File

@ -20,9 +20,9 @@ import static org.apache.lucene.backward_index.TestBasicBackwardsCompatibility.a
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
import java.io.IOException;
import org.apache.lucene.backward_codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.document.Document;

View File

@ -40,3 +40,4 @@
9.9.2
9.10.0
9.11.0
9.11.1

View File

@ -0,0 +1,376 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.jmh;
import java.util.Arrays;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.search.DocIdSetIterator;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.CompilerControl;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
@Warmup(iterations = 5, time = 1)
@Measurement(iterations = 5, time = 1)
@Fork(
value = 1,
jvmArgsAppend = {"-Xmx1g", "-Xms1g", "-XX:+AlwaysPreTouch"})
public class AdvanceBenchmark {
private final long[] values = new long[129];
private final int[] startIndexes = new int[1_000];
private final long[] targets = new long[startIndexes.length];
@Setup(Level.Trial)
public void setup() throws Exception {
for (int i = 0; i < 128; ++i) {
values[i] = i;
}
values[128] = DocIdSetIterator.NO_MORE_DOCS;
Random r = new Random(0);
for (int i = 0; i < startIndexes.length; ++i) {
startIndexes[i] = r.nextInt(64);
targets[i] = startIndexes[i] + 1 + r.nextInt(1 << r.nextInt(7));
}
}
@Benchmark
public void binarySearch() {
for (int i = 0; i < startIndexes.length; ++i) {
binarySearch(values, targets[i], startIndexes[i]);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int binarySearch(long[] values, long target, int startIndex) {
// Standard binary search
int i = Arrays.binarySearch(values, startIndex, values.length, target);
if (i < 0) {
i = -1 - i;
}
return i;
}
@Benchmark
public void binarySearch2() {
for (int i = 0; i < startIndexes.length; ++i) {
binarySearch2(values, targets[i], startIndexes[i]);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int binarySearch2(long[] values, long target, int startIndex) {
// Try to help the compiler by providing predictable start/end offsets.
int i = Arrays.binarySearch(values, 0, 128, target);
if (i < 0) {
i = -1 - i;
}
return i;
}
@Benchmark
public void binarySearch3() {
for (int i = 0; i < startIndexes.length; ++i) {
binarySearch3(values, targets[i], startIndexes[i]);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int binarySearch3(long[] values, long target, int startIndex) {
// Organize code the same way as suggested in https://quickwit.io/blog/search-a-sorted-block,
// which proved to help with LLVM.
int start = 0;
int length = 128;
while (length > 1) {
length /= 2;
if (values[start + length - 1] < target) {
start += length;
}
}
return start;
}
@Benchmark
public void binarySearch4() {
for (int i = 0; i < startIndexes.length; ++i) {
binarySearch4(values, targets[i], startIndexes[i]);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int binarySearch4(long[] values, long target, int startIndex) {
// Explicitly inline the binary-search logic to see if it helps the compiler.
int start = 0;
if (values[63] < target) {
start += 64;
}
if (values[start + 31] < target) {
start += 32;
}
if (values[start + 15] < target) {
start += 16;
}
if (values[start + 7] < target) {
start += 8;
}
if (values[start + 3] < target) {
start += 4;
}
if (values[start + 1] < target) {
start += 2;
}
if (values[start] < target) {
start += 1;
}
return start;
}
@Benchmark
public void binarySearch5() {
for (int i = 0; i < startIndexes.length; ++i) {
binarySearch5(values, targets[i], startIndexes[i]);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int binarySearch5(long[] values, long target, int startIndex) {
// Other way to write a binary search
int start = 0;
for (int shift = 6; shift >= 0; --shift) {
int halfRange = 1 << shift;
if (values[start + halfRange - 1] < target) {
start += halfRange;
}
}
return start;
}
@Benchmark
public void binarySearch6() {
for (int i = 0; i < startIndexes.length; ++i) {
binarySearch6(values, targets[i], startIndexes[i]);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int binarySearch6(long[] values, long target, int startIndex) {
// Other way to write a binary search
int start = 0;
for (int halfRange = 64; halfRange > 0; halfRange >>= 1) {
if (values[start + halfRange - 1] < target) {
start += halfRange;
}
}
return start;
}
@Benchmark
public void linearSearch() {
for (int i = 0; i < startIndexes.length; ++i) {
linearSearch(values, targets[i], startIndexes[i]);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int linearSearch(long[] values, long target, int startIndex) {
// Naive linear search.
for (int i = startIndex; i < values.length; ++i) {
if (values[i] >= target) {
return i;
}
}
return values.length;
}
@Benchmark
public void bruteForceSearch() {
for (int i = 0; i < startIndexes.length; ++i) {
bruteForceSearch(values, targets[i], startIndexes[i]);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int bruteForceSearch(long[] values, long target, int startIndex) {
// Linear search with predictable start/end offsets to see if it helps the compiler.
for (int i = 0; i < 128; ++i) {
if (values[i] >= target) {
return i;
}
}
return values.length;
}
@Benchmark
public void linearSearch2() {
for (int i = 0; i < startIndexes.length; ++i) {
linearSearch2(values, targets[i], startIndexes[i]);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int linearSearch2(long[] values, long target, int startIndex) {
// Two-level linear search, first checking every 8th value, then values within an 8-value range
int rangeStart = values.length - 8;
for (int i = startIndex; i + 8 <= values.length; i += 8) {
if (values[i + 7] >= target) {
rangeStart = i;
break;
}
}
for (int i = 0; i < 8; ++i) {
if (values[rangeStart + i] >= target) {
return rangeStart + i;
}
}
return values.length;
}
@Benchmark
public void linearSearch3() {
for (int i = 0; i < startIndexes.length; ++i) {
linearSearch3(values, targets[i], startIndexes[i]);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int linearSearch3(long[] values, long target, int startIndex) {
// Variation on linearSearch that tries to reduce branches
while (startIndex + 4 <= values.length) {
int count = values[startIndex] < target ? 1 : 0;
if (values[startIndex + 1] < target) {
count++;
}
if (values[startIndex + 2] < target) {
count++;
}
if (values[startIndex + 3] < target) {
count++;
}
if (count != 4) {
return startIndex + count;
}
startIndex += 4;
}
for (int i = startIndex; i < values.length; ++i) {
if (values[i] >= target) {
return i;
}
}
return values.length;
}
@Benchmark
public void hybridSearch() {
for (int i = 0; i < startIndexes.length; ++i) {
hybridSearch(values, targets[i], startIndexes[i]);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int hybridSearch(long[] values, long target, int startIndex) {
// Two-level linear search, first checking every 8th value, then values within an 8-value range
int rangeStart = values.length - 8;
for (int i = startIndex; i + 8 <= values.length; i += 8) {
if (values[i + 7] >= target) {
rangeStart = i;
break;
}
}
return binarySearchHelper8(values, target, rangeStart);
}
// branchless binary search over 8 values
private static int binarySearchHelper8(long[] values, long target, int start) {
if (values[start + 3] < target) {
start += 4;
}
if (values[start + 1] < target) {
start += 2;
}
if (values[start] < target) {
start += 1;
}
return start;
}
private static void assertEquals(int expected, int actual) {
if (expected != actual) {
throw new AssertionError("Expected: " + expected + ", got " + actual);
}
}
public static void main(String[] args) {
// For testing purposes
long[] values = new long[129];
for (int i = 0; i < 128; ++i) {
values[i] = i;
}
values[128] = DocIdSetIterator.NO_MORE_DOCS;
for (int start = 0; start < 128; ++start) {
for (int targetIndex = start; targetIndex < 128; ++targetIndex) {
int actualIndex = binarySearch(values, values[targetIndex], start);
assertEquals(targetIndex, actualIndex);
actualIndex = binarySearch2(values, values[targetIndex], start);
assertEquals(targetIndex, actualIndex);
actualIndex = binarySearch3(values, values[targetIndex], start);
assertEquals(targetIndex, actualIndex);
actualIndex = binarySearch4(values, values[targetIndex], start);
assertEquals(targetIndex, actualIndex);
actualIndex = binarySearch5(values, values[targetIndex], start);
assertEquals(targetIndex, actualIndex);
actualIndex = binarySearch6(values, values[targetIndex], start);
assertEquals(targetIndex, actualIndex);
actualIndex = bruteForceSearch(values, values[targetIndex], start);
assertEquals(targetIndex, actualIndex);
actualIndex = hybridSearch(values, values[targetIndex], start);
assertEquals(targetIndex, actualIndex);
actualIndex = linearSearch(values, values[targetIndex], start);
assertEquals(targetIndex, actualIndex);
actualIndex = linearSearch2(values, values[targetIndex], start);
assertEquals(targetIndex, actualIndex);
actualIndex = linearSearch3(values, values[targetIndex], start);
assertEquals(targetIndex, actualIndex);
}
}
}
}
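
The main method above is a standalone correctness harness: every search variant must agree on the index of the first value greater than or equal to the target. The @Benchmark methods themselves are driven by JMH. A minimal sketch of a programmatic launch follows; "SearchBenchmark" in the include pattern is a placeholder for the benchmark class name declared earlier in this file, not a name taken from this commit.

import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;

public class RunSearchBenchmark {
  public static void main(String[] args) throws RunnerException {
    // "SearchBenchmark" is a placeholder regex; substitute the actual benchmark class name.
    Options opts = new OptionsBuilder().include("SearchBenchmark").forks(1).build();
    new Runner(opts).run();
  }
}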

View File

@ -0,0 +1,75 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.jmh;
import java.io.IOException;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.util.VectorUtil;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
@Fork(1)
@Warmup(iterations = 3, time = 3)
@Measurement(iterations = 5, time = 3)
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.SECONDS)
@State(Scope.Benchmark)
public class HammingDistanceBenchmark {
@Param({"1000000"})
int nb = 1_000_000;
@Param({"1024"})
int dims = 1024;
byte[][] xb;
byte[] xq;
@Setup
public void setup() throws IOException {
Random rand = new Random();
this.xb = new byte[nb][dims / 8];
for (int i = 0; i < nb; i++) {
for (int j = 0; j < dims / 8; j++) {
xb[i][j] = (byte) rand.nextInt(0, 255);
}
}
this.xq = new byte[dims / 8];
for (int i = 0; i < xq.length; i++) {
xq[i] = (byte) rand.nextInt(0, 255);
}
}
@Benchmark
public int xorBitCount() {
int tot = 0;
for (int i = 0; i < nb; i++) {
tot += VectorUtil.xorBitCount(xb[i], xq);
}
return tot;
}
}
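
VectorUtil.xorBitCount computes the Hamming distance between two packed bit vectors: XOR the bytes, then count the set bits. A scalar sketch of that operation is below; it illustrates the semantics being benchmarked, not Lucene's optimized implementation, which works an int or long at a time.

// Scalar XOR-popcount over packed bit vectors; assumes a.length == b.length.
static int hammingDistance(byte[] a, byte[] b) {
  int distance = 0;
  for (int i = 0; i < a.length; i++) {
    distance += Integer.bitCount((a[i] ^ b[i]) & 0xFF);
  }
  return distance;
}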

View File

@ -0,0 +1,108 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.jmh;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.codecs.lucene912.ForDeltaUtil;
import org.apache.lucene.codecs.lucene912.ForUtil;
import org.apache.lucene.codecs.lucene912.PostingIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.IOUtils;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
@State(Scope.Benchmark)
@Warmup(iterations = 5, time = 1)
@Measurement(iterations = 5, time = 1)
@Fork(
value = 3,
jvmArgsAppend = {"-Xmx1g", "-Xms1g", "-XX:+AlwaysPreTouch"})
public class PostingIndexInputBenchmark {
private Path path;
private Directory dir;
private IndexInput in;
private PostingIndexInput postingIn;
private final ForUtil forUtil = new ForUtil();
private final ForDeltaUtil forDeltaUtil = new ForDeltaUtil();
private final long[] values = new long[128];
@Param({"2", "3", "4", "5", "6", "7", "8", "9", "10"})
public int bpv;
@Setup(Level.Trial)
public void setup() throws Exception {
path = Files.createTempDirectory("forUtil");
dir = MMapDirectory.open(path);
try (IndexOutput out = dir.createOutput("docs", IOContext.DEFAULT)) {
Random r = new Random(0);
// Write enough random data to not reach EOF while decoding
for (int i = 0; i < 100; ++i) {
out.writeLong(r.nextLong());
}
}
in = dir.openInput("docs", IOContext.DEFAULT);
postingIn = new PostingIndexInput(in, forUtil, forDeltaUtil);
}
@TearDown(Level.Trial)
public void tearDown() throws Exception {
if (dir != null) {
dir.deleteFile("docs");
}
IOUtils.close(in, dir);
in = null;
dir = null;
Files.deleteIfExists(path);
}
@Benchmark
public void decode(Blackhole bh) throws IOException {
in.seek(3); // random unaligned offset
postingIn.decode(bpv, values);
bh.consume(values);
}
@Benchmark
public void decodeAndPrefixSum(Blackhole bh) throws IOException {
in.seek(3); // random unaligned offset
postingIn.decodeAndPrefixSum(bpv, 100, values);
bh.consume(values);
}
}
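
decodeAndPrefixSum reads a 128-value block and turns the decoded deltas into absolute values by accumulating them on top of a base (the 100 passed above is presumably that base). A conceptual sketch of the prefix-sum step, independent of the SIMD decoding, is below.

// Conceptual prefix sum: deltas become absolute values anchored at `base`.
static void prefixSum(long[] deltas, long base) {
  long acc = base;
  for (int i = 0; i < deltas.length; i++) {
    acc += deltas[i];
    deltas[i] = acc;
  }
}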

View File

@ -17,11 +17,10 @@
# -------------------------------------------------------------------------------------
# multi val params are iterated by NewRound's, added to reports, start with column name.
# collector.class can be:
# Fully Qualified Class Name of a Collector with an empty constructor
# topScoreDocOrdered - Creates a TopScoreDocCollector that requires in-order docs
# topScoreDocUnordered - Like above, but allows out-of-order docs
collector.class=coll:topScoreDoc
# collector.manager.class can be:
# Fully Qualified Class Name of a CollectorManager with an empty constructor
# topScoreDoc - Creates a TopScoreDocCollectorManager
collector.manager.class=coll:topScoreDoc
analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer
directory=FSDirectory

View File

@ -17,11 +17,10 @@
# -------------------------------------------------------------------------------------
# multi val params are iterated by NewRound's, added to reports, start with column name.
# collector.class can be:
# Fully Qualified Class Name of a Collector with an empty constructor
# topScoreDocOrdered - Creates a TopScoreDocCollector that requires in-order docs
# topScoreDocUnordered - Like above, but allows out-of-order docs
collector.class=coll:topScoreDoc
# collector.manager.class can be:
# Fully Qualified Class Name of a CollectorManager with an empty constructor
# topScoreDoc - Creates a TopScoreDocCollectorManager
collector.manager.class=coll:topScoreDoc
analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer
directory=FSDirectory
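
With this change the benchmark config names a CollectorManager rather than a Collector, and the class must have a no-arg constructor so it can be instantiated reflectively. A sketch of such a class is below; the class name is hypothetical, but the CollectorManager methods shown are the real interface.

import java.io.IOException;
import java.util.Collection;
import org.apache.lucene.search.CollectorManager;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.TopScoreDocCollectorManager;

// Hypothetical example of a class that collector.manager.class could name.
public class MyTopDocsCollectorManager
    implements CollectorManager<TopScoreDocCollector, TopDocs> {
  private final TopScoreDocCollectorManager delegate =
      new TopScoreDocCollectorManager(10, Integer.MAX_VALUE);

  @Override
  public TopScoreDocCollector newCollector() throws IOException {
    return delegate.newCollector();
  }

  @Override
  public TopDocs reduce(Collection<TopScoreDocCollector> collectors) throws IOException {
    return delegate.reduce(collectors);
  }
}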

View File

@ -238,7 +238,7 @@ public class EnwikiContentSource extends ContentSource {
time = null;
id = null;
break;
// intentional fall-through.
// intentional fall-through.
case BODY:
case DATE:
case TITLE:

View File

@ -99,7 +99,7 @@ public class SpatialDocMaker extends DocMaker {
return makeRPTStrategy(SPATIAL_FIELD, config, configMap, ctx);
case "composite":
return makeCompositeStrategy(config, configMap, ctx);
// TODO add more as-needed
// TODO add more as-needed
default:
throw new IllegalStateException("Unknown spatial.strategy: " + strategyName);
}

View File

@ -24,7 +24,7 @@ import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiBits;
import org.apache.lucene.index.StoredFields;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.CollectorManager;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
@ -119,9 +119,7 @@ public abstract class ReadTask extends PerfTask {
hits = searcher.search(q, numHits);
}
} else {
Collector collector = createCollector();
searcher.search(q, collector);
searcher.search(q, createCollectorManager());
// hits = collector.topDocs();
}
@ -184,9 +182,8 @@ public abstract class ReadTask extends PerfTask {
return res;
}
protected Collector createCollector() throws Exception {
return new TopScoreDocCollectorManager(numHits(), withTotalHits() ? Integer.MAX_VALUE : 1)
.newCollector();
protected CollectorManager<?, ?> createCollectorManager() throws Exception {
return new TopScoreDocCollectorManager(numHits(), withTotalHits() ? Integer.MAX_VALUE : 1);
}
protected Document retrieveDoc(StoredFields storedFields, int id) throws IOException {

View File

@ -19,8 +19,8 @@ package org.apache.lucene.benchmark.byTask.tasks;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.CollectorManager;
import org.apache.lucene.search.TopScoreDocCollectorManager;
/** Does search w/ a custom collector */
public class SearchWithCollectorTask extends SearchTask {
@ -37,7 +37,11 @@ public class SearchWithCollectorTask extends SearchTask {
// check to make sure either the doc is being stored
PerfRunData runData = getRunData();
Config config = runData.getConfig();
clnName = config.get("collector.class", "");
if (config.get("collector.class", null) != null) {
throw new IllegalArgumentException(
"collector.class is no longer supported as a config parameter, use collector.manager.class instead to provide a CollectorManager class name");
}
clnName = config.get("collector.manager.class", "");
}
@Override
@ -46,17 +50,17 @@ public class SearchWithCollectorTask extends SearchTask {
}
@Override
protected Collector createCollector() throws Exception {
Collector collector = null;
protected CollectorManager<?, ?> createCollectorManager() throws Exception {
CollectorManager<?, ?> collectorManager;
if (clnName.equalsIgnoreCase("topScoreDoc") == true) {
collector = TopScoreDocCollector.create(numHits(), Integer.MAX_VALUE);
collectorManager = new TopScoreDocCollectorManager(numHits(), Integer.MAX_VALUE);
} else if (clnName.length() > 0) {
collector = Class.forName(clnName).asSubclass(Collector.class).getConstructor().newInstance();
collectorManager =
Class.forName(clnName).asSubclass(CollectorManager.class).getConstructor().newInstance();
} else {
collector = super.createCollector();
collectorManager = super.createCollectorManager();
}
return collector;
return collectorManager;
}
@Override

View File

@ -23,13 +23,13 @@ import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene99PostingsWriter}. */
/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene912PostingsWriter}. */
public class BlockTreeOrdsPostingsFormat extends PostingsFormat {
private final int minTermBlockSize;
@ -67,7 +67,7 @@ public class BlockTreeOrdsPostingsFormat extends PostingsFormat {
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
boolean success = false;
try {
@ -84,7 +84,7 @@ public class BlockTreeOrdsPostingsFormat extends PostingsFormat {
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new Lucene99PostingsReader(state);
PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
boolean success = false;
try {
FieldsProducer ret = new OrdsBlockTreeTermsReader(postingsReader, state);

View File

@ -43,6 +43,7 @@ import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOBooleanSupplier;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.automaton.CompiledAutomaton;
@ -315,12 +316,21 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat {
}
@Override
public boolean seekExact(BytesRef text) throws IOException {
public IOBooleanSupplier prepareSeekExact(BytesRef text) throws IOException {
// The magical fail-fast speed up that is the entire point of all of
// this code - save a disk seek if there is a match on an in-memory
// structure
// that may occasionally give a false positive but guaranteed no false
// negatives
if (filter.contains(text) == ContainsResult.NO) {
return null;
}
return delegate().prepareSeekExact(text);
}
@Override
public boolean seekExact(BytesRef text) throws IOException {
// See #prepareSeekExact
if (filter.contains(text) == ContainsResult.NO) {
return false;
}
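
prepareSeekExact gives the bloom filter a chance to reject a term before any seek is even prepared: a null return means the filter proved the term absent, otherwise the returned supplier performs the deferred seek. A sketch of how a caller might consume it is below, assuming IOBooleanSupplier exposes a single boolean get() that may throw IOException, as the delegation above suggests.

// Assumed imports: org.apache.lucene.index.TermsEnum, org.apache.lucene.util.BytesRef,
// org.apache.lucene.util.IOBooleanSupplier, java.io.IOException.
static boolean seekIfPossiblyPresent(TermsEnum termsEnum, BytesRef term) throws IOException {
  IOBooleanSupplier prepared = termsEnum.prepareSeekExact(term);
  // null: the in-memory filter guarantees the term is not in this segment.
  return prepared != null && prepared.get();
}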

View File

@ -24,7 +24,7 @@ import java.util.TreeMap;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.index.BaseTermsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Fields;
@ -54,7 +54,7 @@ import org.apache.lucene.util.automaton.TransitionAccessor;
// - or: longer dense skip lists than just next byte?
/**
* Wraps {@link Lucene99PostingsFormat} format for on-disk storage, but then at read time loads and
* Wraps {@link Lucene912PostingsFormat} format for on-disk storage, but then at read time loads and
* stores all terms and postings directly in RAM as byte[], int[].
*
* <p><b>WARNING</b>: This is exceptionally RAM intensive: it makes no effort to compress the
@ -97,12 +97,12 @@ public final class DirectPostingsFormat extends PostingsFormat {
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
return PostingsFormat.forName("Lucene99").fieldsConsumer(state);
return PostingsFormat.forName("Lucene912").fieldsConsumer(state);
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
FieldsProducer postings = PostingsFormat.forName("Lucene99").fieldsProducer(state);
FieldsProducer postings = PostingsFormat.forName("Lucene912").fieldsProducer(state);
if (state.context.context() != IOContext.Context.MERGE) {
FieldsProducer loadedPostings;
try {

View File

@ -22,8 +22,8 @@ import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
@ -41,7 +41,7 @@ public final class FSTPostingsFormat extends PostingsFormat {
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
boolean success = false;
try {
@ -57,7 +57,7 @@ public final class FSTPostingsFormat extends PostingsFormat {
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new Lucene99PostingsReader(state);
PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
boolean success = false;
try {
FieldsProducer ret = new FSTTermsReader(state, postingsReader);

View File

@ -195,9 +195,10 @@ public class FSTTermsReader extends FieldsProducer {
this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
OffHeapFSTStore offHeapFSTStore = new OffHeapFSTStore();
FSTTermOutputs outputs = new FSTTermOutputs(fieldInfo);
this.dict = new FST<>(FST.readMetadata(in, outputs), in, offHeapFSTStore);
final var fstMetadata = FST.readMetadata(in, outputs);
OffHeapFSTStore offHeapFSTStore = new OffHeapFSTStore(in, in.getFilePointer(), fstMetadata);
this.dict = FST.fromFSTReader(fstMetadata, offHeapFSTStore);
in.skipBytes(offHeapFSTStore.size());
}

View File

@ -71,8 +71,8 @@ final class SimpleTextBKDReader extends PointValues {
this.pointCount = pointCount;
this.docCount = docCount;
this.version = SimpleTextBKDWriter.VERSION_CURRENT;
assert minPackedValue.length == config.packedIndexBytesLength;
assert maxPackedValue.length == config.packedIndexBytesLength;
assert minPackedValue.length == config.packedIndexBytesLength();
assert maxPackedValue.length == config.packedIndexBytesLength();
}
@Override
@ -99,8 +99,8 @@ final class SimpleTextBKDReader extends PointValues {
private SimpleTextPointTree(
IndexInput in, int nodeID, int level, byte[] minPackedValue, byte[] maxPackedValue) {
this.in = in;
this.scratchDocIDs = new int[config.maxPointsInLeafNode];
this.scratchPackedValue = new byte[config.packedBytesLength];
this.scratchDocIDs = new int[config.maxPointsInLeafNode()];
this.scratchPackedValue = new byte[config.packedBytesLength()];
this.nodeID = nodeID;
this.rootNode = nodeID;
this.level = level;
@ -145,38 +145,39 @@ final class SimpleTextBKDReader extends PointValues {
private void pushLeft() {
int address = nodeID * bytesPerIndexEntry;
// final int splitDimPos;
if (config.numIndexDims == 1) {
if (config.numIndexDims() == 1) {
splitDims[level] = 0;
} else {
splitDims[level] = (splitPackedValues[address++] & 0xff);
}
final int splitDimPos = splitDims[level] * config.bytesPerDim;
final int splitDimPos = splitDims[level] * config.bytesPerDim();
if (splitDimValueStack[level] == null) {
splitDimValueStack[level] = new byte[config.bytesPerDim];
splitDimValueStack[level] = new byte[config.bytesPerDim()];
}
// save the dimension we are going to change
System.arraycopy(
maxPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim);
maxPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim());
assert Arrays.compareUnsigned(
maxPackedValue,
splitDimPos,
splitDimPos + config.bytesPerDim,
splitDimPos + config.bytesPerDim(),
splitPackedValues,
address,
address + config.bytesPerDim)
address + config.bytesPerDim())
>= 0
: "config.bytesPerDim="
+ config.bytesPerDim
: "config.bytesPerDim()="
+ config.bytesPerDim()
+ " splitDim="
+ splitDims[level]
+ " config.numIndexDims="
+ config.numIndexDims
+ " config.numIndexDims()="
+ config.numIndexDims()
+ " config.numDims="
+ config.numDims;
+ config.numDims();
nodeID *= 2;
level++;
// add the split dim value:
System.arraycopy(splitPackedValues, address, maxPackedValue, splitDimPos, config.bytesPerDim);
System.arraycopy(
splitPackedValues, address, maxPackedValue, splitDimPos, config.bytesPerDim());
}
@Override
@ -191,37 +192,38 @@ final class SimpleTextBKDReader extends PointValues {
private void pushRight() {
int address = nodeID * bytesPerIndexEntry;
if (config.numIndexDims == 1) {
if (config.numIndexDims() == 1) {
splitDims[level] = 0;
} else {
splitDims[level] = (splitPackedValues[address++] & 0xff);
}
final int splitDimPos = splitDims[level] * config.bytesPerDim;
final int splitDimPos = splitDims[level] * config.bytesPerDim();
// we should have already visited the left node
assert splitDimValueStack[level] != null;
// save the dimension we are going to change
System.arraycopy(
minPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim);
minPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim());
assert Arrays.compareUnsigned(
minPackedValue,
splitDimPos,
splitDimPos + config.bytesPerDim,
splitDimPos + config.bytesPerDim(),
splitPackedValues,
address,
address + config.bytesPerDim)
address + config.bytesPerDim())
<= 0
: "config.bytesPerDim="
+ config.bytesPerDim
: "config.bytesPerDim()="
+ config.bytesPerDim()
+ " splitDim="
+ splitDims[level]
+ " config.numIndexDims="
+ config.numIndexDims
+ " config.numIndexDims()="
+ config.numIndexDims()
+ " config.numDims="
+ config.numDims;
+ config.numDims();
nodeID = 2 * nodeID + 1;
level++;
// add the split dim value:
System.arraycopy(splitPackedValues, address, minPackedValue, splitDimPos, config.bytesPerDim);
System.arraycopy(
splitPackedValues, address, minPackedValue, splitDimPos, config.bytesPerDim());
}
@Override
@ -242,16 +244,16 @@ final class SimpleTextBKDReader extends PointValues {
splitDimValueStack[level],
0,
maxPackedValue,
splitDims[level] * config.bytesPerDim,
config.bytesPerDim);
splitDims[level] * config.bytesPerDim(),
config.bytesPerDim());
} else {
System.arraycopy(
splitDimValueStack[level],
0,
minPackedValue,
splitDims[level] * config.bytesPerDim,
config.bytesPerDim);
splitDims[level] * config.bytesPerDim(),
config.bytesPerDim());
}
}
@ -290,7 +292,7 @@ final class SimpleTextBKDReader extends PointValues {
private long sizeFromBalancedTree(int leftMostLeafNode, int rightMostLeafNode) {
// number of points that need to be distributed between leaves, one per leaf
final int extraPoints =
Math.toIntExact(((long) config.maxPointsInLeafNode * leafNodeOffset) - pointCount);
Math.toIntExact(((long) config.maxPointsInLeafNode() * leafNodeOffset) - pointCount);
assert extraPoints < leafNodeOffset : "point excess should be lower than leafNodeOffset";
// offset where we stop adding one point to the leaves
final int nodeOffset = leafNodeOffset - extraPoints;
@ -298,9 +300,9 @@ final class SimpleTextBKDReader extends PointValues {
for (int node = leftMostLeafNode; node <= rightMostLeafNode; node++) {
// offsetPosition provides which extra point will be added to this node
if (balanceTreeNodePosition(0, leafNodeOffset, node - leafNodeOffset, 0, 0) < nodeOffset) {
count += config.maxPointsInLeafNode;
count += config.maxPointsInLeafNode();
} else {
count += config.maxPointsInLeafNode - 1;
count += config.maxPointsInLeafNode() - 1;
}
}
return count;
@ -376,14 +378,14 @@ final class SimpleTextBKDReader extends PointValues {
// Again, this time reading values and checking with the visitor
visitor.grow(count);
// NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
assert scratchPackedValue.length == config.packedBytesLength;
assert scratchPackedValue.length == config.packedBytesLength();
BytesRefBuilder scratch = new BytesRefBuilder();
for (int i = 0; i < count; i++) {
readLine(in, scratch);
assert startsWith(scratch, BLOCK_VALUE);
BytesRef br = SimpleTextUtil.fromBytesRefString(stripPrefix(scratch, BLOCK_VALUE));
assert br.length == config.packedBytesLength;
System.arraycopy(br.bytes, br.offset, scratchPackedValue, 0, config.packedBytesLength);
assert br.length == config.packedBytesLength();
System.arraycopy(br.bytes, br.offset, scratchPackedValue, 0, config.packedBytesLength());
visitor.visit(scratchDocIDs[i], scratchPackedValue);
}
} else {
@ -443,17 +445,17 @@ final class SimpleTextBKDReader extends PointValues {
@Override
public int getNumDimensions() throws IOException {
return config.numDims;
return config.numDims();
}
@Override
public int getNumIndexDimensions() throws IOException {
return config.numIndexDims;
return config.numIndexDims();
}
@Override
public int getBytesPerDimension() throws IOException {
return config.bytesPerDim;
return config.bytesPerDim();
}
@Override

View File

@ -144,28 +144,28 @@ final class SimpleTextBKDWriter implements Closeable {
this.maxDoc = maxDoc;
docsSeen = new FixedBitSet(maxDoc);
scratchDiff = new byte[config.bytesPerDim];
scratch1 = new byte[config.packedBytesLength];
scratch2 = new byte[config.packedBytesLength];
commonPrefixLengths = new int[config.numDims];
scratchDiff = new byte[config.bytesPerDim()];
scratch1 = new byte[config.packedBytesLength()];
scratch2 = new byte[config.packedBytesLength()];
commonPrefixLengths = new int[config.numDims()];
minPackedValue = new byte[config.packedIndexBytesLength];
maxPackedValue = new byte[config.packedIndexBytesLength];
minPackedValue = new byte[config.packedIndexBytesLength()];
maxPackedValue = new byte[config.packedIndexBytesLength()];
// Maximum number of points we hold in memory at any time
maxPointsSortInHeap =
(int) ((maxMBSortInHeap * 1024 * 1024) / (config.bytesPerDoc * config.numDims));
(int) ((maxMBSortInHeap * 1024 * 1024) / (config.bytesPerDoc() * config.numDims()));
// Finally, we must be able to hold at least the leaf node in heap during build:
if (maxPointsSortInHeap < config.maxPointsInLeafNode) {
if (maxPointsSortInHeap < config.maxPointsInLeafNode()) {
throw new IllegalArgumentException(
"maxMBSortInHeap="
+ maxMBSortInHeap
+ " only allows for maxPointsSortInHeap="
+ maxPointsSortInHeap
+ ", but this is less than config.maxPointsInLeafNode="
+ config.maxPointsInLeafNode
+ "; either increase maxMBSortInHeap or decrease config.maxPointsInLeafNode");
+ ", but this is less than config.maxPointsInLeafNode()="
+ config.maxPointsInLeafNode()
+ "; either increase maxMBSortInHeap or decrease config.maxPointsInLeafNode()");
}
this.maxMBSortInHeap = maxMBSortInHeap;
@ -183,10 +183,10 @@ final class SimpleTextBKDWriter implements Closeable {
}
public void add(byte[] packedValue, int docID) throws IOException {
if (packedValue.length != config.packedBytesLength) {
if (packedValue.length != config.packedBytesLength()) {
throw new IllegalArgumentException(
"packedValue should be length="
+ config.packedBytesLength
+ config.packedBytesLength()
+ " (got: "
+ packedValue.length
+ ")");
@ -209,30 +209,30 @@ final class SimpleTextBKDWriter implements Closeable {
} else {
pointWriter = new HeapPointWriter(config, Math.toIntExact(totalPointCount));
}
System.arraycopy(packedValue, 0, minPackedValue, 0, config.packedIndexBytesLength);
System.arraycopy(packedValue, 0, maxPackedValue, 0, config.packedIndexBytesLength);
System.arraycopy(packedValue, 0, minPackedValue, 0, config.packedIndexBytesLength());
System.arraycopy(packedValue, 0, maxPackedValue, 0, config.packedIndexBytesLength());
} else {
for (int dim = 0; dim < config.numIndexDims; dim++) {
int offset = dim * config.bytesPerDim;
for (int dim = 0; dim < config.numIndexDims(); dim++) {
int offset = dim * config.bytesPerDim();
if (Arrays.compareUnsigned(
packedValue,
offset,
offset + config.bytesPerDim,
offset + config.bytesPerDim(),
minPackedValue,
offset,
offset + config.bytesPerDim)
offset + config.bytesPerDim())
< 0) {
System.arraycopy(packedValue, offset, minPackedValue, offset, config.bytesPerDim);
System.arraycopy(packedValue, offset, minPackedValue, offset, config.bytesPerDim());
}
if (Arrays.compareUnsigned(
packedValue,
offset,
offset + config.bytesPerDim,
offset + config.bytesPerDim(),
maxPackedValue,
offset,
offset + config.bytesPerDim)
offset + config.bytesPerDim())
> 0) {
System.arraycopy(packedValue, offset, maxPackedValue, offset, config.bytesPerDim);
System.arraycopy(packedValue, offset, maxPackedValue, offset, config.bytesPerDim());
}
}
}
@ -254,7 +254,7 @@ final class SimpleTextBKDWriter implements Closeable {
*/
public long writeField(IndexOutput out, String fieldName, MutablePointTree reader)
throws IOException {
if (config.numIndexDims == 1) {
if (config.numIndexDims() == 1) {
return writeField1Dim(out, fieldName, reader);
} else {
return writeFieldNDims(out, fieldName, reader);
@ -280,7 +280,7 @@ final class SimpleTextBKDWriter implements Closeable {
long countPerLeaf = pointCount = values.size();
long innerNodeCount = 1;
while (countPerLeaf > config.maxPointsInLeafNode) {
while (countPerLeaf > config.maxPointsInLeafNode()) {
countPerLeaf = (countPerLeaf + 1) / 2;
innerNodeCount *= 2;
}
@ -289,7 +289,7 @@ final class SimpleTextBKDWriter implements Closeable {
checkMaxLeafNodeCount(numLeaves);
final byte[] splitPackedValues = new byte[numLeaves * (config.bytesPerDim + 1)];
final byte[] splitPackedValues = new byte[numLeaves * (config.bytesPerDim() + 1)];
final long[] leafBlockFPs = new long[numLeaves];
// compute the min/max for this slice
@ -297,37 +297,37 @@ final class SimpleTextBKDWriter implements Closeable {
Arrays.fill(maxPackedValue, (byte) 0);
for (int i = 0; i < Math.toIntExact(pointCount); ++i) {
values.getValue(i, scratchBytesRef1);
for (int dim = 0; dim < config.numIndexDims; dim++) {
int offset = dim * config.bytesPerDim;
for (int dim = 0; dim < config.numIndexDims(); dim++) {
int offset = dim * config.bytesPerDim();
if (Arrays.compareUnsigned(
scratchBytesRef1.bytes,
scratchBytesRef1.offset + offset,
scratchBytesRef1.offset + offset + config.bytesPerDim,
scratchBytesRef1.offset + offset + config.bytesPerDim(),
minPackedValue,
offset,
offset + config.bytesPerDim)
offset + config.bytesPerDim())
< 0) {
System.arraycopy(
scratchBytesRef1.bytes,
scratchBytesRef1.offset + offset,
minPackedValue,
offset,
config.bytesPerDim);
config.bytesPerDim());
}
if (Arrays.compareUnsigned(
scratchBytesRef1.bytes,
scratchBytesRef1.offset + offset,
scratchBytesRef1.offset + offset + config.bytesPerDim,
scratchBytesRef1.offset + offset + config.bytesPerDim(),
maxPackedValue,
offset,
offset + config.bytesPerDim)
offset + config.bytesPerDim())
> 0) {
System.arraycopy(
scratchBytesRef1.bytes,
scratchBytesRef1.offset + offset,
maxPackedValue,
offset,
config.bytesPerDim);
config.bytesPerDim());
}
}
@ -345,7 +345,7 @@ final class SimpleTextBKDWriter implements Closeable {
maxPackedValue,
splitPackedValues,
leafBlockFPs,
new int[config.maxPointsInLeafNode]);
new int[config.maxPointsInLeafNode()]);
long indexFP = out.getFilePointer();
writeIndex(out, leafBlockFPs, splitPackedValues, Math.toIntExact(countPerLeaf));
@ -387,15 +387,15 @@ final class SimpleTextBKDWriter implements Closeable {
final IndexOutput out;
final List<Long> leafBlockFPs = new ArrayList<>();
final List<byte[]> leafBlockStartValues = new ArrayList<>();
final byte[] leafValues = new byte[config.maxPointsInLeafNode * config.packedBytesLength];
final int[] leafDocs = new int[config.maxPointsInLeafNode];
final byte[] leafValues = new byte[config.maxPointsInLeafNode() * config.packedBytesLength()];
final int[] leafDocs = new int[config.maxPointsInLeafNode()];
long valueCount;
int leafCount;
OneDimensionBKDWriter(IndexOutput out) {
if (config.numIndexDims != 1) {
if (config.numIndexDims() != 1) {
throw new UnsupportedOperationException(
"config.numIndexDims must be 1 but got " + config.numIndexDims);
"config.numIndexDims() must be 1 but got " + config.numIndexDims());
}
if (pointCount != 0) {
throw new IllegalStateException("cannot mix add and merge");
@ -411,7 +411,7 @@ final class SimpleTextBKDWriter implements Closeable {
this.out = out;
lastPackedValue = new byte[config.packedBytesLength];
lastPackedValue = new byte[config.packedBytesLength()];
}
// for asserts
@ -426,8 +426,8 @@ final class SimpleTextBKDWriter implements Closeable {
packedValue,
0,
leafValues,
leafCount * config.packedBytesLength,
config.packedBytesLength);
leafCount * config.packedBytesLength(),
config.packedBytesLength());
leafDocs[leafCount] = docID;
docsSeen.set(docID);
leafCount++;
@ -441,7 +441,7 @@ final class SimpleTextBKDWriter implements Closeable {
+ " values");
}
if (leafCount == config.maxPointsInLeafNode) {
if (leafCount == config.maxPointsInLeafNode()) {
// We write a block once we hit exactly the max count ... this is different from
// when we flush a new segment, where we write between max/2 and max per leaf block,
// so merged segments will behave differently from newly flushed segments:
@ -471,43 +471,44 @@ final class SimpleTextBKDWriter implements Closeable {
// System.out.println("BKDW: now rotate numInnerNodes=" + numInnerNodes + " leafBlockStarts="
// + leafBlockStartValues.size());
byte[] index = new byte[(1 + numInnerNodes) * (1 + config.bytesPerDim)];
byte[] index = new byte[(1 + numInnerNodes) * (1 + config.bytesPerDim())];
rotateToTree(1, 0, numInnerNodes, index, leafBlockStartValues);
long[] arr = new long[leafBlockFPs.size()];
for (int i = 0; i < leafBlockFPs.size(); i++) {
arr[i] = leafBlockFPs.get(i);
}
writeIndex(out, arr, index, config.maxPointsInLeafNode);
writeIndex(out, arr, index, config.maxPointsInLeafNode());
return indexFP;
}
private void writeLeafBlock() throws IOException {
assert leafCount != 0;
if (valueCount == 0) {
System.arraycopy(leafValues, 0, minPackedValue, 0, config.packedIndexBytesLength);
System.arraycopy(leafValues, 0, minPackedValue, 0, config.packedIndexBytesLength());
}
System.arraycopy(
leafValues,
(leafCount - 1) * config.packedBytesLength,
(leafCount - 1) * config.packedBytesLength(),
maxPackedValue,
0,
config.packedIndexBytesLength);
config.packedIndexBytesLength());
valueCount += leafCount;
if (leafBlockFPs.size() > 0) {
// Save the first (minimum) value in each leaf block except the first, to build the split
// value index in the end:
leafBlockStartValues.add(ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength));
leafBlockStartValues.add(
ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength()));
}
leafBlockFPs.add(out.getFilePointer());
checkMaxLeafNodeCount(leafBlockFPs.size());
Arrays.fill(commonPrefixLengths, config.bytesPerDim);
Arrays.fill(commonPrefixLengths, config.bytesPerDim());
// Find per-dim common prefix:
for (int dim = 0; dim < config.numDims; dim++) {
int offset1 = dim * config.bytesPerDim;
int offset2 = (leafCount - 1) * config.packedBytesLength + offset1;
for (int dim = 0; dim < config.numDims(); dim++) {
int offset1 = dim * config.bytesPerDim();
int offset2 = (leafCount - 1) * config.packedBytesLength() + offset1;
for (int j = 0; j < commonPrefixLengths[dim]; j++) {
if (leafValues[offset1 + j] != leafValues[offset2 + j]) {
commonPrefixLengths[dim] = j;
@ -523,24 +524,24 @@ final class SimpleTextBKDWriter implements Closeable {
final BytesRef scratch = new BytesRef();
{
scratch.length = config.packedBytesLength;
scratch.length = config.packedBytesLength();
scratch.bytes = leafValues;
}
@Override
public BytesRef apply(int i) {
scratch.offset = config.packedBytesLength * i;
scratch.offset = config.packedBytesLength() * i;
return scratch;
}
};
assert valuesInOrderAndBounds(
leafCount,
0,
ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength),
ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength()),
ArrayUtil.copyOfSubArray(
leafValues,
(leafCount - 1) * config.packedBytesLength,
leafCount * config.packedBytesLength),
(leafCount - 1) * config.packedBytesLength(),
leafCount * config.packedBytesLength()),
packedValues,
leafDocs,
0);
@ -552,7 +553,7 @@ final class SimpleTextBKDWriter implements Closeable {
private void rotateToTree(
int nodeID, int offset, int count, byte[] index, List<byte[]> leafBlockStartValues) {
// System.out.println("ROTATE: nodeID=" + nodeID + " offset=" + offset + " count=" + count + "
// bpd=" + config.bytesPerDim + " index.length=" + index.length);
// bpd=" + config.bytesPerDim() + " index.length=" + index.length);
if (count == 1) {
// Leaf index node
// System.out.println(" leaf index node");
@ -561,8 +562,8 @@ final class SimpleTextBKDWriter implements Closeable {
leafBlockStartValues.get(offset),
0,
index,
nodeID * (1 + config.bytesPerDim) + 1,
config.bytesPerDim);
nodeID * (1 + config.bytesPerDim()) + 1,
config.bytesPerDim());
} else if (count > 1) {
// Internal index node: binary partition of count
int countAtLevel = 1;
@ -587,8 +588,8 @@ final class SimpleTextBKDWriter implements Closeable {
leafBlockStartValues.get(rootOffset),
0,
index,
nodeID * (1 + config.bytesPerDim) + 1,
config.bytesPerDim);
nodeID * (1 + config.bytesPerDim()) + 1,
config.bytesPerDim());
// System.out.println(" index[" + nodeID + "] = blockStartValues[" + rootOffset + "]");
// TODO: we could optimize/specialize, when we know it's simply fully balanced binary tree
@ -611,10 +612,10 @@ final class SimpleTextBKDWriter implements Closeable {
}
private void checkMaxLeafNodeCount(int numLeaves) {
if ((1 + config.bytesPerDim) * (long) numLeaves > ArrayUtil.MAX_ARRAY_LENGTH) {
if ((1 + config.bytesPerDim()) * (long) numLeaves > ArrayUtil.MAX_ARRAY_LENGTH) {
throw new IllegalStateException(
"too many nodes; increase config.maxPointsInLeafNode (currently "
+ config.maxPointsInLeafNode
"too many nodes; increase config.maxPointsInLeafNode() (currently "
+ config.maxPointsInLeafNode()
+ ") and reindex");
}
}
@ -652,7 +653,7 @@ final class SimpleTextBKDWriter implements Closeable {
long countPerLeaf = pointCount;
long innerNodeCount = 1;
while (countPerLeaf > config.maxPointsInLeafNode) {
while (countPerLeaf > config.maxPointsInLeafNode()) {
countPerLeaf = (countPerLeaf + 1) / 2;
innerNodeCount *= 2;
}
@ -667,20 +668,20 @@ final class SimpleTextBKDWriter implements Closeable {
// Indexed by nodeID, but first (root) nodeID is 1. We do 1+ because the lead byte at each
// recursion says which dim we split on.
byte[] splitPackedValues = new byte[Math.multiplyExact(numLeaves, 1 + config.bytesPerDim)];
byte[] splitPackedValues = new byte[Math.multiplyExact(numLeaves, 1 + config.bytesPerDim())];
// +1 because leaf count is power of 2 (e.g. 8), and innerNodeCount is power of 2 minus 1 (e.g.
// 7)
long[] leafBlockFPs = new long[numLeaves];
// Make sure the math above "worked":
assert pointCount / numLeaves <= config.maxPointsInLeafNode
assert pointCount / numLeaves <= config.maxPointsInLeafNode()
: "pointCount="
+ pointCount
+ " numLeaves="
+ numLeaves
+ " config.maxPointsInLeafNode="
+ config.maxPointsInLeafNode;
+ " config.maxPointsInLeafNode()="
+ config.maxPointsInLeafNode();
// We re-use the selector so we do not need to create an object every time.
BKDRadixSelector radixSelector =
@ -699,7 +700,7 @@ final class SimpleTextBKDWriter implements Closeable {
maxPackedValue,
splitPackedValues,
leafBlockFPs,
new int[config.maxPointsInLeafNode]);
new int[config.maxPointsInLeafNode()]);
// If no exception, we should have cleaned everything up:
assert tempDir.getCreatedFiles().isEmpty();
@ -724,15 +725,15 @@ final class SimpleTextBKDWriter implements Closeable {
IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues, int maxPointsInLeafNode)
throws IOException {
write(out, NUM_DATA_DIMS);
writeInt(out, config.numDims);
writeInt(out, config.numDims());
newline(out);
write(out, NUM_INDEX_DIMS);
writeInt(out, config.numIndexDims);
writeInt(out, config.numIndexDims());
newline(out);
write(out, BYTES_PER_DIM);
writeInt(out, config.bytesPerDim);
writeInt(out, config.bytesPerDim());
newline(out);
write(out, MAX_LEAF_POINTS);
@ -767,8 +768,8 @@ final class SimpleTextBKDWriter implements Closeable {
newline(out);
}
assert (splitPackedValues.length % (1 + config.bytesPerDim)) == 0;
int count = splitPackedValues.length / (1 + config.bytesPerDim);
assert (splitPackedValues.length % (1 + config.bytesPerDim())) == 0;
int count = splitPackedValues.length / (1 + config.bytesPerDim());
assert count == leafBlockFPs.length;
write(out, SPLIT_COUNT);
@ -777,10 +778,12 @@ final class SimpleTextBKDWriter implements Closeable {
for (int i = 0; i < count; i++) {
write(out, SPLIT_DIM);
writeInt(out, splitPackedValues[i * (1 + config.bytesPerDim)] & 0xff);
writeInt(out, splitPackedValues[i * (1 + config.bytesPerDim())] & 0xff);
newline(out);
write(out, SPLIT_VALUE);
br = new BytesRef(splitPackedValues, 1 + (i * (1 + config.bytesPerDim)), config.bytesPerDim);
br =
new BytesRef(
splitPackedValues, 1 + (i * (1 + config.bytesPerDim())), config.bytesPerDim());
write(out, br.toString());
newline(out);
}
@ -852,25 +855,25 @@ final class SimpleTextBKDWriter implements Closeable {
/** Called only in assert */
private boolean valueInBounds(
BytesRef packedValue, byte[] minPackedValue, byte[] maxPackedValue) {
for (int dim = 0; dim < config.numIndexDims; dim++) {
int offset = config.bytesPerDim * dim;
for (int dim = 0; dim < config.numIndexDims(); dim++) {
int offset = config.bytesPerDim() * dim;
if (Arrays.compareUnsigned(
packedValue.bytes,
packedValue.offset + offset,
packedValue.offset + offset + config.bytesPerDim,
packedValue.offset + offset + config.bytesPerDim(),
minPackedValue,
offset,
offset + config.bytesPerDim)
offset + config.bytesPerDim())
< 0) {
return false;
}
if (Arrays.compareUnsigned(
packedValue.bytes,
packedValue.offset + offset,
packedValue.offset + offset + config.bytesPerDim,
packedValue.offset + offset + config.bytesPerDim(),
maxPackedValue,
offset,
offset + config.bytesPerDim)
offset + config.bytesPerDim())
> 0) {
return false;
}
@ -882,13 +885,13 @@ final class SimpleTextBKDWriter implements Closeable {
protected int split(byte[] minPackedValue, byte[] maxPackedValue) {
// Find which dim has the largest span so we can split on it:
int splitDim = -1;
for (int dim = 0; dim < config.numIndexDims; dim++) {
NumericUtils.subtract(config.bytesPerDim, dim, maxPackedValue, minPackedValue, scratchDiff);
for (int dim = 0; dim < config.numIndexDims(); dim++) {
NumericUtils.subtract(config.bytesPerDim(), dim, maxPackedValue, minPackedValue, scratchDiff);
if (splitDim == -1
|| Arrays.compareUnsigned(
scratchDiff, 0, config.bytesPerDim, scratch1, 0, config.bytesPerDim)
scratchDiff, 0, config.bytesPerDim(), scratch1, 0, config.bytesPerDim())
> 0) {
System.arraycopy(scratchDiff, 0, scratch1, 0, config.bytesPerDim);
System.arraycopy(scratchDiff, 0, scratch1, 0, config.bytesPerDim());
splitDim = dim;
}
}
@ -931,15 +934,15 @@ final class SimpleTextBKDWriter implements Closeable {
if (nodeID >= leafNodeOffset) {
// leaf node
final int count = to - from;
assert count <= config.maxPointsInLeafNode;
assert count <= config.maxPointsInLeafNode();
// Compute common prefixes
Arrays.fill(commonPrefixLengths, config.bytesPerDim);
Arrays.fill(commonPrefixLengths, config.bytesPerDim());
reader.getValue(from, scratchBytesRef1);
for (int i = from + 1; i < to; ++i) {
reader.getValue(i, scratchBytesRef2);
for (int dim = 0; dim < config.numDims; dim++) {
final int offset = dim * config.bytesPerDim;
for (int dim = 0; dim < config.numDims(); dim++) {
final int offset = dim * config.bytesPerDim();
for (int j = 0; j < commonPrefixLengths[dim]; j++) {
if (scratchBytesRef1.bytes[scratchBytesRef1.offset + offset + j]
!= scratchBytesRef2.bytes[scratchBytesRef2.offset + offset + j]) {
@ -951,23 +954,23 @@ final class SimpleTextBKDWriter implements Closeable {
}
// Find the dimension that has the least number of unique bytes at commonPrefixLengths[dim]
FixedBitSet[] usedBytes = new FixedBitSet[config.numDims];
for (int dim = 0; dim < config.numDims; ++dim) {
if (commonPrefixLengths[dim] < config.bytesPerDim) {
FixedBitSet[] usedBytes = new FixedBitSet[config.numDims()];
for (int dim = 0; dim < config.numDims(); ++dim) {
if (commonPrefixLengths[dim] < config.bytesPerDim()) {
usedBytes[dim] = new FixedBitSet(256);
}
}
for (int i = from + 1; i < to; ++i) {
for (int dim = 0; dim < config.numDims; dim++) {
for (int dim = 0; dim < config.numDims(); dim++) {
if (usedBytes[dim] != null) {
byte b = reader.getByteAt(i, dim * config.bytesPerDim + commonPrefixLengths[dim]);
byte b = reader.getByteAt(i, dim * config.bytesPerDim() + commonPrefixLengths[dim]);
usedBytes[dim].set(Byte.toUnsignedInt(b));
}
}
}
int sortedDim = 0;
int sortedDimCardinality = Integer.MAX_VALUE;
for (int dim = 0; dim < config.numDims; ++dim) {
for (int dim = 0; dim < config.numDims(); ++dim) {
if (usedBytes[dim] != null) {
final int cardinality = usedBytes[dim].cardinality();
if (cardinality < sortedDimCardinality) {
@ -1001,7 +1004,7 @@ final class SimpleTextBKDWriter implements Closeable {
// Write the common prefixes:
reader.getValue(from, scratchBytesRef1);
System.arraycopy(
scratchBytesRef1.bytes, scratchBytesRef1.offset, scratch1, 0, config.packedBytesLength);
scratchBytesRef1.bytes, scratchBytesRef1.offset, scratch1, 0, config.packedBytesLength());
// Write the full values:
IntFunction<BytesRef> packedValues =
@ -1023,10 +1026,10 @@ final class SimpleTextBKDWriter implements Closeable {
final int splitDim = split(minPackedValue, maxPackedValue);
final int mid = (from + to + 1) >>> 1;
int commonPrefixLen = config.bytesPerDim;
for (int i = 0; i < config.bytesPerDim; ++i) {
if (minPackedValue[splitDim * config.bytesPerDim + i]
!= maxPackedValue[splitDim * config.bytesPerDim + i]) {
int commonPrefixLen = config.bytesPerDim();
for (int i = 0; i < config.bytesPerDim(); ++i) {
if (minPackedValue[splitDim * config.bytesPerDim() + i]
!= maxPackedValue[splitDim * config.bytesPerDim() + i]) {
commonPrefixLen = i;
break;
}
@ -1044,32 +1047,32 @@ final class SimpleTextBKDWriter implements Closeable {
scratchBytesRef2);
// set the split value
final int address = nodeID * (1 + config.bytesPerDim);
final int address = nodeID * (1 + config.bytesPerDim());
splitPackedValues[address] = (byte) splitDim;
reader.getValue(mid, scratchBytesRef1);
System.arraycopy(
scratchBytesRef1.bytes,
scratchBytesRef1.offset + splitDim * config.bytesPerDim,
scratchBytesRef1.offset + splitDim * config.bytesPerDim(),
splitPackedValues,
address + 1,
config.bytesPerDim);
config.bytesPerDim());
byte[] minSplitPackedValue =
ArrayUtil.copyOfSubArray(minPackedValue, 0, config.packedIndexBytesLength);
ArrayUtil.copyOfSubArray(minPackedValue, 0, config.packedIndexBytesLength());
byte[] maxSplitPackedValue =
ArrayUtil.copyOfSubArray(maxPackedValue, 0, config.packedIndexBytesLength);
ArrayUtil.copyOfSubArray(maxPackedValue, 0, config.packedIndexBytesLength());
System.arraycopy(
scratchBytesRef1.bytes,
scratchBytesRef1.offset + splitDim * config.bytesPerDim,
scratchBytesRef1.offset + splitDim * config.bytesPerDim(),
minSplitPackedValue,
splitDim * config.bytesPerDim,
config.bytesPerDim);
splitDim * config.bytesPerDim(),
config.bytesPerDim());
System.arraycopy(
scratchBytesRef1.bytes,
scratchBytesRef1.offset + splitDim * config.bytesPerDim,
scratchBytesRef1.offset + splitDim * config.bytesPerDim(),
maxSplitPackedValue,
splitDim * config.bytesPerDim,
config.bytesPerDim);
splitDim * config.bytesPerDim(),
config.bytesPerDim());
// recurse
build(
@ -1137,17 +1140,17 @@ final class SimpleTextBKDWriter implements Closeable {
int sortedDim = 0;
int sortedDimCardinality = Integer.MAX_VALUE;
FixedBitSet[] usedBytes = new FixedBitSet[config.numDims];
for (int dim = 0; dim < config.numDims; ++dim) {
if (commonPrefixLengths[dim] < config.bytesPerDim) {
FixedBitSet[] usedBytes = new FixedBitSet[config.numDims()];
for (int dim = 0; dim < config.numDims(); ++dim) {
if (commonPrefixLengths[dim] < config.bytesPerDim()) {
usedBytes[dim] = new FixedBitSet(256);
}
}
// Find the dimension to compress
for (int dim = 0; dim < config.numDims; dim++) {
for (int dim = 0; dim < config.numDims(); dim++) {
int prefix = commonPrefixLengths[dim];
if (prefix < config.bytesPerDim) {
int offset = dim * config.bytesPerDim;
if (prefix < config.bytesPerDim()) {
int offset = dim * config.bytesPerDim();
for (int i = 0; i < heapSource.count(); ++i) {
PointValue value = heapSource.getPackedValueSlice(i);
BytesRef packedValue = value.packedValue();
@ -1190,7 +1193,7 @@ final class SimpleTextBKDWriter implements Closeable {
final BytesRef scratch = new BytesRef();
{
scratch.length = config.packedBytesLength;
scratch.length = config.packedBytesLength();
}
@Override
@ -1207,7 +1210,7 @@ final class SimpleTextBKDWriter implements Closeable {
// Inner node: partition/recurse
int splitDim;
if (config.numIndexDims > 1) {
if (config.numIndexDims() > 1) {
splitDim = split(minPackedValue, maxPackedValue);
} else {
splitDim = 0;
@ -1223,13 +1226,13 @@ final class SimpleTextBKDWriter implements Closeable {
int commonPrefixLen =
Arrays.mismatch(
minPackedValue,
splitDim * config.bytesPerDim,
splitDim * config.bytesPerDim + config.bytesPerDim,
splitDim * config.bytesPerDim(),
splitDim * config.bytesPerDim() + config.bytesPerDim(),
maxPackedValue,
splitDim * config.bytesPerDim,
splitDim * config.bytesPerDim + config.bytesPerDim);
splitDim * config.bytesPerDim(),
splitDim * config.bytesPerDim() + config.bytesPerDim());
if (commonPrefixLen == -1) {
commonPrefixLen = config.bytesPerDim;
commonPrefixLen = config.bytesPerDim();
}
BKDRadixSelector.PathSlice[] pathSlices = new BKDRadixSelector.PathSlice[2];
@ -1244,20 +1247,28 @@ final class SimpleTextBKDWriter implements Closeable {
splitDim,
commonPrefixLen);
int address = nodeID * (1 + config.bytesPerDim);
int address = nodeID * (1 + config.bytesPerDim());
splitPackedValues[address] = (byte) splitDim;
System.arraycopy(splitValue, 0, splitPackedValues, address + 1, config.bytesPerDim);
System.arraycopy(splitValue, 0, splitPackedValues, address + 1, config.bytesPerDim());
byte[] minSplitPackedValue = new byte[config.packedIndexBytesLength];
System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, config.packedIndexBytesLength);
byte[] minSplitPackedValue = new byte[config.packedIndexBytesLength()];
System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, config.packedIndexBytesLength());
byte[] maxSplitPackedValue = new byte[config.packedIndexBytesLength];
System.arraycopy(maxPackedValue, 0, maxSplitPackedValue, 0, config.packedIndexBytesLength);
byte[] maxSplitPackedValue = new byte[config.packedIndexBytesLength()];
System.arraycopy(maxPackedValue, 0, maxSplitPackedValue, 0, config.packedIndexBytesLength());
System.arraycopy(
splitValue, 0, minSplitPackedValue, splitDim * config.bytesPerDim, config.bytesPerDim);
splitValue,
0,
minSplitPackedValue,
splitDim * config.bytesPerDim(),
config.bytesPerDim());
System.arraycopy(
splitValue, 0, maxSplitPackedValue, splitDim * config.bytesPerDim, config.bytesPerDim);
splitValue,
0,
maxSplitPackedValue,
splitDim * config.bytesPerDim(),
config.bytesPerDim());
// Recurse on left tree:
build(
@ -1289,30 +1300,30 @@ final class SimpleTextBKDWriter implements Closeable {
}
private void computeCommonPrefixLength(HeapPointWriter heapPointWriter, byte[] commonPrefix) {
Arrays.fill(commonPrefixLengths, config.bytesPerDim);
Arrays.fill(commonPrefixLengths, config.bytesPerDim());
PointValue value = heapPointWriter.getPackedValueSlice(0);
BytesRef packedValue = value.packedValue();
for (int dim = 0; dim < config.numDims; dim++) {
for (int dim = 0; dim < config.numDims(); dim++) {
System.arraycopy(
packedValue.bytes,
packedValue.offset + dim * config.bytesPerDim,
packedValue.offset + dim * config.bytesPerDim(),
commonPrefix,
dim * config.bytesPerDim,
config.bytesPerDim);
dim * config.bytesPerDim(),
config.bytesPerDim());
}
for (int i = 1; i < heapPointWriter.count(); i++) {
value = heapPointWriter.getPackedValueSlice(i);
packedValue = value.packedValue();
for (int dim = 0; dim < config.numDims; dim++) {
for (int dim = 0; dim < config.numDims(); dim++) {
if (commonPrefixLengths[dim] != 0) {
int j =
Arrays.mismatch(
commonPrefix,
dim * config.bytesPerDim,
dim * config.bytesPerDim + commonPrefixLengths[dim],
dim * config.bytesPerDim(),
dim * config.bytesPerDim() + commonPrefixLengths[dim],
packedValue.bytes,
packedValue.offset + dim * config.bytesPerDim,
packedValue.offset + dim * config.bytesPerDim + commonPrefixLengths[dim]);
packedValue.offset + dim * config.bytesPerDim(),
packedValue.offset + dim * config.bytesPerDim() + commonPrefixLengths[dim]);
if (j != -1) {
commonPrefixLengths[dim] = j;
}
@ -1331,11 +1342,11 @@ final class SimpleTextBKDWriter implements Closeable {
int[] docs,
int docsOffset)
throws IOException {
byte[] lastPackedValue = new byte[config.packedBytesLength];
byte[] lastPackedValue = new byte[config.packedBytesLength()];
int lastDoc = -1;
for (int i = 0; i < count; i++) {
BytesRef packedValue = values.apply(i);
assert packedValue.length == config.packedBytesLength;
assert packedValue.length == config.packedBytesLength();
assert valueInOrder(
i,
sortedDim,
@ -1361,43 +1372,43 @@ final class SimpleTextBKDWriter implements Closeable {
int packedValueOffset,
int doc,
int lastDoc) {
int dimOffset = sortedDim * config.bytesPerDim;
int dimOffset = sortedDim * config.bytesPerDim();
if (ord > 0) {
int cmp =
Arrays.compareUnsigned(
lastPackedValue,
dimOffset,
dimOffset + config.bytesPerDim,
dimOffset + config.bytesPerDim(),
packedValue,
packedValueOffset + dimOffset,
packedValueOffset + dimOffset + config.bytesPerDim);
packedValueOffset + dimOffset + config.bytesPerDim());
if (cmp > 0) {
throw new AssertionError(
"values out of order: last value="
+ new BytesRef(lastPackedValue)
+ " current value="
+ new BytesRef(packedValue, packedValueOffset, config.packedBytesLength)
+ new BytesRef(packedValue, packedValueOffset, config.packedBytesLength())
+ " ord="
+ ord
+ " sortedDim="
+ sortedDim);
}
if (cmp == 0 && config.numDims > config.numIndexDims) {
int dataOffset = config.numIndexDims * config.bytesPerDim;
if (cmp == 0 && config.numDims() > config.numIndexDims()) {
int dataOffset = config.numIndexDims() * config.bytesPerDim();
cmp =
Arrays.compareUnsigned(
lastPackedValue,
dataOffset,
config.packedBytesLength,
config.packedBytesLength(),
packedValue,
packedValueOffset + dataOffset,
packedValueOffset + config.packedBytesLength);
packedValueOffset + config.packedBytesLength());
if (cmp > 0) {
throw new AssertionError(
"data values out of order: last value="
+ new BytesRef(lastPackedValue)
+ " current value="
+ new BytesRef(packedValue, packedValueOffset, config.packedBytesLength)
+ new BytesRef(packedValue, packedValueOffset, config.packedBytesLength())
+ " ord="
+ ord);
}
@ -1414,7 +1425,8 @@ final class SimpleTextBKDWriter implements Closeable {
+ sortedDim);
}
}
System.arraycopy(packedValue, packedValueOffset, lastPackedValue, 0, config.packedBytesLength);
System.arraycopy(
packedValue, packedValueOffset, lastPackedValue, 0, config.packedBytesLength());
return true;
}
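
Note on the call-site changes in this file: values such as bytesPerDim, numDims and packedBytesLength are now read through accessor methods (bytesPerDim(), packedIndexBytesLength(), ...) instead of public fields, consistent with BKDConfig exposing accessors (for example as a Java record). A minimal standalone sketch of that pattern, with illustrative names only, not the actual BKDConfig source:

// Before: plain fields, call sites read config.bytesPerDim
final class LegacyConfig {
  final int numDims;
  final int bytesPerDim;

  LegacyConfig(int numDims, int bytesPerDim) {
    this.numDims = numDims;
    this.bytesPerDim = bytesPerDim;
  }
}

// After: a record generates numDims()/bytesPerDim() accessors, so call
// sites become config.bytesPerDim() as in the diff above.
record Config(int numDims, int bytesPerDim) {
  int packedBytesLength() {
    return numDims * bytesPerDim; // derived value exposed the same way
  }
}

class ConfigDemo {
  public static void main(String[] args) {
    Config config = new Config(2, 4);
    System.out.println(config.bytesPerDim());       // 4
    System.out.println(config.packedBytesLength()); // 8
  }
}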

View File

@ -829,7 +829,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
clone.seek(0);
// checksum is fixed-width encoded with 20 bytes, plus 1 byte for newline (the space is included
// in SimpleTextUtil.CHECKSUM):
long footerStartPos = data.length() - (SimpleTextUtil.CHECKSUM.length + 21);
long footerStartPos = clone.length() - (SimpleTextUtil.CHECKSUM.length + 21);
ChecksumIndexInput input = new BufferedChecksumIndexInput(clone);
while (true) {
SimpleTextUtil.readLine(input, scratch);

View File

@ -227,7 +227,7 @@ class SimpleTextPointsReader extends PointsReader {
// checksum is fixed-width encoded with 20 bytes, plus 1 byte for newline (the space is included
// in SimpleTextUtil.CHECKSUM):
long footerStartPos = dataIn.length() - (SimpleTextUtil.CHECKSUM.length + 21);
long footerStartPos = clone.length() - (SimpleTextUtil.CHECKSUM.length + 21);
ChecksumIndexInput input = new BufferedChecksumIndexInput(clone);
while (true) {
SimpleTextUtil.readLine(input, scratch);
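
Both SimpleText readers receive the same fix: the footer start position must be computed from the input actually being read (the clone), not from another handle. The arithmetic itself is unchanged; a small self-contained sketch of it, with an assumed marker string standing in for SimpleTextUtil.CHECKSUM:

import java.nio.charset.StandardCharsets;

class FooterOffsetSketch {
  public static void main(String[] args) {
    // assumed marker; the real bytes come from SimpleTextUtil.CHECKSUM
    byte[] checksumMarker = "checksum ".getBytes(StandardCharsets.UTF_8);
    long cloneLength = 1_000_000L; // stands in for clone.length()
    // 20 fixed-width checksum digits plus 1 trailing newline follow the marker
    long footerStartPos = cloneLength - (checksumMarker.length + 21);
    System.out.println("footer starts at byte " + footerStartPos);
  }
}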

View File

@ -17,13 +17,13 @@
package org.apache.lucene.codecs.uniformsplit;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.BLOCK_SIZE;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.BLOCK_SIZE;
import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.TermState;
@ -34,7 +34,7 @@ import org.apache.lucene.util.RamUsageEstimator;
/**
* {@link TermState} serializer which encodes each file pointer as a delta relative to a base file
* pointer. It differs from {@link Lucene99PostingsWriter#encodeTerm} which encodes each file
* pointer. It differs from {@link Lucene912PostingsWriter#encodeTerm} which encodes each file
* pointer as a delta relative to the previous file pointer.
*
* <p>It automatically sets the base file pointer to the first valid file pointer for doc start FP,
@ -95,7 +95,7 @@ public class DeltaBaseTermStateSerializer implements Accountable {
/**
* Writes a {@link BlockTermState} to the provided {@link DataOutput}.
*
* <p>Simpler variant of {@link Lucene99PostingsWriter#encodeTerm(DataOutput, FieldInfo,
* <p>Simpler variant of {@link Lucene912PostingsWriter#encodeTerm(DataOutput, FieldInfo,
* BlockTermState, boolean)}.
*/
public void writeTermState(
@ -140,15 +140,12 @@ public class DeltaBaseTermStateSerializer implements Accountable {
termStatesOutput.writeVLong(intTermState.lastPosBlockOffset);
}
}
if (intTermState.skipOffset != -1) {
termStatesOutput.writeVLong(intTermState.skipOffset);
}
}
/**
* Reads a {@link BlockTermState} from the provided {@link DataInput}.
*
* <p>Simpler variant of {@link Lucene99PostingsReader#decodeTerm(DataInput, FieldInfo,
* <p>Simpler variant of {@link Lucene912PostingsReader#decodeTerm(DataInput, FieldInfo,
* BlockTermState, boolean)}.
*
* @param reuse {@link BlockTermState} to reuse; or null to create a new one.
@ -190,9 +187,6 @@ public class DeltaBaseTermStateSerializer implements Accountable {
intTermState.lastPosBlockOffset = termStatesInput.readVLong();
}
}
if (intTermState.docFreq > BLOCK_SIZE) {
intTermState.skipOffset = termStatesInput.readVLong();
}
return intTermState;
}
@ -210,7 +204,6 @@ public class DeltaBaseTermStateSerializer implements Accountable {
termState.docStartFP = 0;
termState.posStartFP = 0;
termState.payStartFP = 0;
termState.skipOffset = -1;
termState.lastPosBlockOffset = -1;
termState.singletonDocID = -1;
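
For context on the updated javadoc references: this serializer encodes each file pointer as a delta relative to a base file pointer, whereas the postings writer's encodeTerm encodes deltas relative to the previous file pointer. A standalone sketch contrasting the two schemes (not the Lucene implementation; names are illustrative):

import java.util.Arrays;

class DeltaEncodingSketch {
  // delta-from-base: each value decodes independently once the base is known
  static long[] encodeFromBase(long base, long[] filePointers) {
    return Arrays.stream(filePointers).map(fp -> fp - base).toArray();
  }

  // delta-from-previous: decoding value i needs all earlier deltas
  static long[] encodeFromPrevious(long[] filePointers) {
    long[] deltas = new long[filePointers.length];
    long prev = 0;
    for (int i = 0; i < filePointers.length; i++) {
      deltas[i] = filePointers[i] - prev;
      prev = filePointers[i];
    }
    return deltas;
  }

  public static void main(String[] args) {
    long[] fps = {1024, 1040, 1103};
    System.out.println(Arrays.toString(encodeFromBase(1024, fps)));  // [0, 16, 79]
    System.out.println(Arrays.toString(encodeFromPrevious(fps)));    // [1024, 16, 63]
  }
}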

View File

@ -90,10 +90,15 @@ public class FSTDictionary implements IndexDictionary {
}
PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton();
FST.FSTMetadata<Long> metadata = FST.readMetadata(fstDataInput, fstOutputs);
FST<Long> fst =
isFSTOnHeap
? new FST<>(metadata, fstDataInput)
: new FST<>(metadata, fstDataInput, new OffHeapFSTStore());
FST<Long> fst;
if (isFSTOnHeap) {
fst = new FST<>(metadata, fstDataInput);
} else {
final IndexInput indexInput = (IndexInput) fstDataInput;
fst =
FST.fromFSTReader(
metadata, new OffHeapFSTStore(indexInput, indexInput.getFilePointer(), metadata));
}
return new FSTDictionary(fst);
}

View File

@ -23,8 +23,8 @@ import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
@ -113,7 +113,7 @@ public class UniformSplitPostingsFormat extends PostingsFormat {
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
boolean success = false;
try {
FieldsConsumer termsWriter =
@ -130,7 +130,7 @@ public class UniformSplitPostingsFormat extends PostingsFormat {
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new Lucene99PostingsReader(state);
PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
boolean success = false;
try {
FieldsProducer termsReader =

View File

@ -28,7 +28,7 @@
* org.apache.lucene.search.PhraseQuery})
* <li>Quite efficient for {@link org.apache.lucene.search.PrefixQuery}
* <li>Not efficient for spell-check and {@link org.apache.lucene.search.FuzzyQuery}, in this case
* prefer {@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat}
* prefer {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat}
* </ul>
*/
package org.apache.lucene.codecs.uniformsplit;

View File

@ -20,11 +20,11 @@ package org.apache.lucene.codecs.uniformsplit.sharedterms;
import java.io.IOException;
import java.util.List;
import java.util.RandomAccess;
import org.apache.lucene.index.BaseTermsEnum;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
@ -34,7 +34,7 @@ import org.apache.lucene.util.BytesRef;
*
* @lucene.experimental
*/
class STMergingTermsEnum extends TermsEnum {
class STMergingTermsEnum extends BaseTermsEnum {
protected final String fieldName;
protected final MultiSegmentsPostingsEnum multiPostingsEnum;
@ -63,11 +63,6 @@ class STMergingTermsEnum extends TermsEnum {
throw new UnsupportedOperationException();
}
@Override
public boolean seekExact(BytesRef text) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public SeekStatus seekCeil(BytesRef text) {
throw new UnsupportedOperationException();
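
STMergingTermsEnum (and MergedTermsEnum in DocValuesConsumer further down) now extends BaseTermsEnum instead of TermsEnum, so the seekExact overrides that only threw UnsupportedOperationException can be dropped and the base class defaults apply. A generic, non-Lucene sketch of that skeletal-base-class pattern:

import java.util.Iterator;
import java.util.List;

// The abstract base supplies a workable default for the optional operation,
// implemented in terms of the one method subclasses must provide.
abstract class Enumerator {
  abstract String next();

  boolean seekExact(String target) {
    String t;
    while ((t = next()) != null) {
      if (t.equals(target)) return true;
    }
    return false;
  }
}

class MergedEnumerator extends Enumerator {
  private final Iterator<String> it = List.of("a", "b", "c").iterator();

  @Override
  String next() {
    return it.hasNext() ? it.next() : null;
  }
  // no seekExact override needed; the base class default applies
}

class EnumeratorDemo {
  public static void main(String[] args) {
    System.out.println(new MergedEnumerator().seekExact("b")); // true
  }
}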

View File

@ -22,7 +22,7 @@ import java.io.IOException;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.lucene912.Lucene912Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.KnnByteVectorField;
@ -42,7 +42,7 @@ import org.apache.lucene.tests.index.BaseIndexFileFormatTestCase;
public class TestHnswBitVectorsFormat extends BaseIndexFileFormatTestCase {
@Override
protected Codec getCodec() {
return new Lucene99Codec() {
return new Lucene912Codec() {
@Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
return new HnswBitVectorsFormat();

View File

@ -17,7 +17,7 @@
package org.apache.lucene.codecs.lucene90.tests;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
/** Test utility class to create mock {@link IntBlockTermState}. */
public class MockTermStateFactory {

View File

@ -0,0 +1,4 @@
{
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForDeltaUtil.java": "5115b12ac31537ce31d73c0a279df92060749a3a",
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForDeltaUtil.py": "db6154406e68b80d2c90116b5d0bfa9ba220762a"
}

View File

@ -1,4 +1,4 @@
{
"lucene/core/src/java/org/apache/lucene/codecs/lucene99/ForUtil.java": "1292ad354d255b1272ffd3db684aa2ddb2bc49ec",
"lucene/core/src/java/org/apache/lucene/codecs/lucene99/gen_ForUtil.py": "ab7b63a1b73986cc04e43de1c8f474b97aef5116"
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForUtil.java": "159e82388346fde147924d5e15ca65df4dd63b9a",
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForUtil.py": "66dc8813160feae2a37d8b50474f5f9830b6cb22"
}

View File

@ -15,7 +15,7 @@
* limitations under the License.
*/
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.lucene912.Lucene912Codec;
/** Lucene Core. */
@SuppressWarnings("module") // the test framework is compiled after the core...
@ -33,6 +33,7 @@ module org.apache.lucene.core {
exports org.apache.lucene.codecs.lucene94;
exports org.apache.lucene.codecs.lucene95;
exports org.apache.lucene.codecs.lucene99;
exports org.apache.lucene.codecs.lucene912;
exports org.apache.lucene.codecs.perfield;
exports org.apache.lucene.codecs;
exports org.apache.lucene.document;
@ -71,7 +72,7 @@ module org.apache.lucene.core {
provides org.apache.lucene.analysis.TokenizerFactory with
org.apache.lucene.analysis.standard.StandardTokenizerFactory;
provides org.apache.lucene.codecs.Codec with
Lucene99Codec;
Lucene912Codec;
provides org.apache.lucene.codecs.DocValuesFormat with
org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
provides org.apache.lucene.codecs.KnnVectorsFormat with
@ -79,7 +80,7 @@ module org.apache.lucene.core {
org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat,
org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat;
provides org.apache.lucene.codecs.PostingsFormat with
org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat;
org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
provides org.apache.lucene.index.SortFieldProvider with
org.apache.lucene.search.SortField.Provider,
org.apache.lucene.search.SortedNumericSortField.Provider,

View File

@ -55,7 +55,7 @@ public abstract class Codec implements NamedSPILoader.NamedSPI {
return LOADER;
}
static Codec defaultCodec = LOADER.lookup("Lucene99");
static Codec defaultCodec = LOADER.lookup("Lucene912");
}
private final String name;
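
With the module-info provides clause above and the default lookup switched to "Lucene912", the new codec is what Codec.getDefault() resolves to. A small usage sketch, assuming a lucene-core build that registers Lucene912Codec via SPI:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.IndexWriterConfig;

class CodecSelectionSketch {
  public static void main(String[] args) {
    // SPI lookup by name, the same mechanism as LOADER.lookup("Lucene912")
    Codec codec = Codec.forName("Lucene912");
    System.out.println("default codec: " + Codec.getDefault().getName());

    // an IndexWriterConfig uses the default codec unless one is set explicitly
    IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
    iwc.setCodec(codec);
  }
}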

View File

@ -18,8 +18,6 @@ package org.apache.lucene.codecs;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
@ -106,7 +104,7 @@ public final class CompetitiveImpactAccumulator {
}
/** Get the set of competitive freq and norm pairs, ordered by increasing freq and norm. */
public Collection<Impact> getCompetitiveFreqNormPairs() {
public List<Impact> getCompetitiveFreqNormPairs() {
List<Impact> impacts = new ArrayList<>();
int maxFreqForLowerNorms = 0;
for (int i = 0; i < maxFreqs.length; ++i) {
@ -126,7 +124,7 @@ public final class CompetitiveImpactAccumulator {
for (Impact impact : impacts) {
add(impact, freqNormPairs);
}
return Collections.unmodifiableSet(freqNormPairs);
return List.copyOf(freqNormPairs);
}
private void add(Impact newEntry, TreeSet<Impact> freqNormPairs) {
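
The return type change from Collection<Impact> to List<Impact> also swaps Collections.unmodifiableSet (a live read-only view over the TreeSet) for List.copyOf (an immutable snapshot). A minimal non-Lucene illustration of the difference:

import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;

class CopyVsViewSketch {
  public static void main(String[] args) {
    TreeSet<Integer> source = new TreeSet<>(List.of(1, 2));
    Set<Integer> view = Collections.unmodifiableSet(source); // old style: live view
    List<Integer> copy = List.copyOf(source);                // new style: snapshot
    source.add(3);
    System.out.println(view); // [1, 2, 3] - reflects the later mutation
    System.out.println(copy); // [1, 2]    - fixed at copy time
  }
}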

View File

@ -23,6 +23,7 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.index.BaseTermsEnum;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocIDMerger;
import org.apache.lucene.index.DocValues;
@ -498,7 +499,7 @@ public abstract class DocValuesConsumer implements Closeable {
* {@link SortedDocValues#lookupOrd(int)} or {@link SortedSetDocValues#lookupOrd(long)} on every
* call to {@link TermsEnum#next()}.
*/
private static class MergedTermsEnum extends TermsEnum {
private static class MergedTermsEnum extends BaseTermsEnum {
private final TermsEnum[] subs;
private final OrdinalMap ordinalMap;
@ -542,11 +543,6 @@ public abstract class DocValuesConsumer implements Closeable {
throw new UnsupportedOperationException();
}
@Override
public boolean seekExact(BytesRef text) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public SeekStatus seekCeil(BytesRef text) throws IOException {
throw new UnsupportedOperationException();
@ -557,11 +553,6 @@ public abstract class DocValuesConsumer implements Closeable {
throw new UnsupportedOperationException();
}
@Override
public void seekExact(BytesRef term, TermState state) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int docFreq() throws IOException {
throw new UnsupportedOperationException();

View File

@ -20,17 +20,23 @@ package org.apache.lucene.codecs;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.function.BiFunction;
import org.apache.lucene.index.ByteVectorValues;
import org.apache.lucene.index.DocIDMerger;
import org.apache.lucene.index.DocsWithFieldSet;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.Sorter;
import org.apache.lucene.index.VectorEncoding;
import org.apache.lucene.internal.hppc.IntIntHashMap;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.VectorScorer;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.IOFunction;
/** Writes vectors to an index. */
public abstract class KnnVectorsWriter implements Accountable, Closeable {
@ -107,11 +113,11 @@ public abstract class KnnVectorsWriter implements Accountable, Closeable {
}
/** Tracks state of one sub-reader that we are merging */
private static class VectorValuesSub extends DocIDMerger.Sub {
private static class FloatVectorValuesSub extends DocIDMerger.Sub {
final FloatVectorValues values;
VectorValuesSub(MergeState.DocMap docMap, FloatVectorValues values) {
FloatVectorValuesSub(MergeState.DocMap docMap, FloatVectorValues values) {
super(docMap);
this.values = values;
assert values.docID() == -1;
@ -139,65 +145,139 @@ public abstract class KnnVectorsWriter implements Accountable, Closeable {
}
}
/**
* Given old doc ids and an id mapping, maps old ordinals to new ordinals. Note: this method returns
* nothing; the outputs are written into the array and set parameters.
*
* @param oldDocIds the old or current document ordinals. Must not be null.
* @param sortMap the document sorting map for how to make the new ordinals. Must not be null.
* @param old2NewOrd int[] maps from old ord to new ord
* @param new2OldOrd int[] maps from new ord to old ord
* @param newDocsWithField set of new doc ids that have a value
*/
public static void mapOldOrdToNewOrd(
DocsWithFieldSet oldDocIds,
Sorter.DocMap sortMap,
int[] old2NewOrd,
int[] new2OldOrd,
DocsWithFieldSet newDocsWithField)
throws IOException {
// TODO: a similar function exists in IncrementalHnswGraphMerger#getNewOrdMapping
// maybe we can do a further refactoring
Objects.requireNonNull(oldDocIds);
Objects.requireNonNull(sortMap);
assert (old2NewOrd != null || new2OldOrd != null || newDocsWithField != null);
assert (old2NewOrd == null || old2NewOrd.length == oldDocIds.cardinality());
assert (new2OldOrd == null || new2OldOrd.length == oldDocIds.cardinality());
IntIntHashMap newIdToOldOrd = new IntIntHashMap();
DocIdSetIterator iterator = oldDocIds.iterator();
int[] newDocIds = new int[oldDocIds.cardinality()];
int oldOrd = 0;
for (int oldDocId = iterator.nextDoc();
oldDocId != DocIdSetIterator.NO_MORE_DOCS;
oldDocId = iterator.nextDoc()) {
int newId = sortMap.oldToNew(oldDocId);
newIdToOldOrd.put(newId, oldOrd);
newDocIds[oldOrd] = newId;
oldOrd++;
}
Arrays.sort(newDocIds);
int newOrd = 0;
for (int newDocId : newDocIds) {
int currOldOrd = newIdToOldOrd.get(newDocId);
if (old2NewOrd != null) {
old2NewOrd[currOldOrd] = newOrd;
}
if (new2OldOrd != null) {
new2OldOrd[newOrd] = currOldOrd;
}
if (newDocsWithField != null) {
newDocsWithField.add(newDocId);
}
newOrd++;
}
}
/** View over multiple vector values supporting iterator-style access via DocIdMerger. */
public static final class MergedVectorValues {
private MergedVectorValues() {}
/** Returns a merged view over all the segment's {@link FloatVectorValues}. */
public static FloatVectorValues mergeFloatVectorValues(
FieldInfo fieldInfo, MergeState mergeState) throws IOException {
private static void validateFieldEncoding(FieldInfo fieldInfo, VectorEncoding expected) {
assert fieldInfo != null && fieldInfo.hasVectorValues();
if (fieldInfo.getVectorEncoding() != VectorEncoding.FLOAT32) {
VectorEncoding fieldEncoding = fieldInfo.getVectorEncoding();
if (fieldEncoding != expected) {
throw new UnsupportedOperationException(
"Cannot merge vectors encoded as [" + fieldInfo.getVectorEncoding() + "] as FLOAT32");
"Cannot merge vectors encoded as [" + fieldEncoding + "] as " + expected);
}
List<VectorValuesSub> subs = new ArrayList<>();
for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) {
KnnVectorsReader knnVectorsReader = mergeState.knnVectorsReaders[i];
}
private static <V, S> List<S> mergeVectorValues(
KnnVectorsReader[] knnVectorsReaders,
MergeState.DocMap[] docMaps,
IOFunction<KnnVectorsReader, V> valuesSupplier,
BiFunction<MergeState.DocMap, V, S> newSub)
throws IOException {
List<S> subs = new ArrayList<>();
for (int i = 0; i < knnVectorsReaders.length; i++) {
KnnVectorsReader knnVectorsReader = knnVectorsReaders[i];
if (knnVectorsReader != null) {
FloatVectorValues values = knnVectorsReader.getFloatVectorValues(fieldInfo.name);
V values = valuesSupplier.apply(knnVectorsReader);
if (values != null) {
subs.add(new VectorValuesSub(mergeState.docMaps[i], values));
subs.add(newSub.apply(docMaps[i], values));
}
}
}
return new MergedFloat32VectorValues(subs, mergeState);
return subs;
}
/** Returns a merged view over all the segment's {@link FloatVectorValues}. */
public static FloatVectorValues mergeFloatVectorValues(
FieldInfo fieldInfo, MergeState mergeState) throws IOException {
validateFieldEncoding(fieldInfo, VectorEncoding.FLOAT32);
return new MergedFloat32VectorValues(
mergeVectorValues(
mergeState.knnVectorsReaders,
mergeState.docMaps,
knnVectorsReader -> {
return knnVectorsReader.getFloatVectorValues(fieldInfo.name);
},
(docMap, values) -> {
return new FloatVectorValuesSub(docMap, values);
}),
mergeState);
}
/** Returns a merged view over all the segment's {@link ByteVectorValues}. */
public static ByteVectorValues mergeByteVectorValues(FieldInfo fieldInfo, MergeState mergeState)
throws IOException {
assert fieldInfo != null && fieldInfo.hasVectorValues();
if (fieldInfo.getVectorEncoding() != VectorEncoding.BYTE) {
throw new UnsupportedOperationException(
"Cannot merge vectors encoded as [" + fieldInfo.getVectorEncoding() + "] as BYTE");
}
List<ByteVectorValuesSub> subs = new ArrayList<>();
for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) {
KnnVectorsReader knnVectorsReader = mergeState.knnVectorsReaders[i];
if (knnVectorsReader != null) {
ByteVectorValues values = knnVectorsReader.getByteVectorValues(fieldInfo.name);
if (values != null) {
subs.add(new ByteVectorValuesSub(mergeState.docMaps[i], values));
}
}
}
return new MergedByteVectorValues(subs, mergeState);
validateFieldEncoding(fieldInfo, VectorEncoding.BYTE);
return new MergedByteVectorValues(
mergeVectorValues(
mergeState.knnVectorsReaders,
mergeState.docMaps,
knnVectorsReader -> {
return knnVectorsReader.getByteVectorValues(fieldInfo.name);
},
(docMap, values) -> {
return new ByteVectorValuesSub(docMap, values);
}),
mergeState);
}
static class MergedFloat32VectorValues extends FloatVectorValues {
private final List<VectorValuesSub> subs;
private final DocIDMerger<VectorValuesSub> docIdMerger;
private final List<FloatVectorValuesSub> subs;
private final DocIDMerger<FloatVectorValuesSub> docIdMerger;
private final int size;
private int docId;
VectorValuesSub current;
FloatVectorValuesSub current;
private MergedFloat32VectorValues(List<VectorValuesSub> subs, MergeState mergeState)
private MergedFloat32VectorValues(List<FloatVectorValuesSub> subs, MergeState mergeState)
throws IOException {
this.subs = subs;
docIdMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);
int totalSize = 0;
for (VectorValuesSub sub : subs) {
for (FloatVectorValuesSub sub : subs) {
totalSize += sub.values.size();
}
size = totalSize;
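
The new mapOldOrdToNewOrd helper above assigns vector ordinals in ascending new-doc-id order after an index sort. A self-contained sketch of that remapping using plain arrays in place of DocsWithFieldSet/Sorter.DocMap (names are illustrative):

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.function.IntUnaryOperator;

class OrdRemapSketch {
  static void remap(int[] oldDocIds, IntUnaryOperator oldToNewDoc,
                    int[] old2NewOrd, int[] new2OldOrd) {
    Map<Integer, Integer> newDocToOldOrd = new HashMap<>();
    int[] newDocIds = new int[oldDocIds.length];
    for (int oldOrd = 0; oldOrd < oldDocIds.length; oldOrd++) {
      int newDoc = oldToNewDoc.applyAsInt(oldDocIds[oldOrd]);
      newDocToOldOrd.put(newDoc, oldOrd);
      newDocIds[oldOrd] = newDoc;
    }
    Arrays.sort(newDocIds); // new ordinals follow ascending new doc id
    for (int newOrd = 0; newOrd < newDocIds.length; newOrd++) {
      int oldOrd = newDocToOldOrd.get(newDocIds[newOrd]);
      old2NewOrd[oldOrd] = newOrd;
      new2OldOrd[newOrd] = oldOrd;
    }
  }

  public static void main(String[] args) {
    int[] oldDocs = {0, 1, 2};             // old doc ids that have vectors
    IntUnaryOperator sortMap = d -> 2 - d; // an index sort that reverses doc order
    int[] old2New = new int[3];
    int[] new2Old = new int[3];
    remap(oldDocs, sortMap, old2New, new2Old);
    System.out.println(Arrays.toString(old2New)); // [2, 1, 0]
    System.out.println(Arrays.toString(new2Old)); // [2, 1, 0]
  }
}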

Some files were not shown because too many files have changed in this diff.