Merge branch 'apache:main' into bpv21_main

This commit is contained in:
expani1729 2024-08-29 19:47:04 +05:30 committed by GitHub
commit 0a0701995a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
451 changed files with 28570 additions and 4825 deletions

View File

@ -23,6 +23,7 @@ Apache Lucene is a high-performance, full-featured text search engine library
written in Java.
[![Build Status](https://ci-builds.apache.org/job/Lucene/job/Lucene-Artifacts-main/badge/icon?subject=Lucene)](https://ci-builds.apache.org/job/Lucene/job/Lucene-Artifacts-main/)
[![Revved up by Develocity](https://img.shields.io/badge/Revved%20up%20by-Develocity-06A0CE?logo=Gradle&labelColor=02303A)](https://ge.apache.org/scans?search.buildToolType=gradle&search.rootProjectNames=lucene-root)
## Online Documentation

View File

@ -41,7 +41,7 @@ import jdk.jfr.consumer.RecordingFile;
*/
public class ProfileResults {
/** Formats a frame to a formatted line. This is deduplicated on! */
-static String frameToString(RecordedFrame frame, boolean lineNumbers) {
+static String frameToString(RecordedFrame frame, boolean lineNumbers, boolean frameTypes) {
StringBuilder builder = new StringBuilder();
RecordedMethod method = frame.getMethod();
RecordedClass clazz = method.getType();
@ -55,13 +55,14 @@ public class ProfileResults {
builder.append("#"); builder.append("#");
builder.append(method.getName()); builder.append(method.getName());
builder.append("()"); builder.append("()");
if (lineNumbers) { if (lineNumbers && frame.getLineNumber() != -1) {
builder.append(":"); builder.append(":");
if (frame.getLineNumber() == -1) { builder.append(frame.getLineNumber());
builder.append("(" + frame.getType() + " code)"); }
} else { if (clazz != null && frameTypes) {
builder.append(frame.getLineNumber()); builder.append(" [");
} builder.append(frame.getType());
builder.append(" code]");
} }
return builder.toString(); return builder.toString();
} }
@ -77,6 +78,8 @@ public class ProfileResults {
public static final String COUNT_DEFAULT = "10";
public static final String LINENUMBERS_KEY = "tests.profile.linenumbers";
public static final String LINENUMBERS_DEFAULT = "false";
public static final String FRAMETYPES_KEY = "tests.profile.frametypes";
public static final String FRAMETYPES_DEFAULT = "true";
/**
* Driver method, for testing standalone.
@ -92,7 +95,8 @@ public class ProfileResults {
System.getProperty(MODE_KEY, MODE_DEFAULT),
Integer.parseInt(System.getProperty(STACKSIZE_KEY, STACKSIZE_DEFAULT)),
Integer.parseInt(System.getProperty(COUNT_KEY, COUNT_DEFAULT)),
-Boolean.parseBoolean(System.getProperty(LINENUMBERS_KEY, LINENUMBERS_DEFAULT)));
+Boolean.parseBoolean(System.getProperty(LINENUMBERS_KEY, LINENUMBERS_DEFAULT)),
+Boolean.parseBoolean(System.getProperty(FRAMETYPES_KEY, FRAMETYPES_DEFAULT)));
}
/** true if we care about this event */
@ -152,7 +156,12 @@ public class ProfileResults {
/** Process all the JFR files passed in args and print a merged summary. */
public static void printReport(
-List<String> files, String mode, int stacksize, int count, boolean lineNumbers)
+List<String> files,
+String mode,
+int stacksize,
+int count,
+boolean lineNumbers,
+boolean frameTypes)
throws IOException {
if (!"cpu".equals(mode) && !"heap".equals(mode)) {
throw new IllegalArgumentException("tests.profile.mode must be one of (cpu,heap)");
@ -181,7 +190,7 @@ public class ProfileResults {
if (stack.length() > 0) {
stack.append("\n").append(framePadding).append(" at ");
}
-stack.append(frameToString(trace.getFrames().get(i), lineNumbers));
+stack.append(frameToString(trace.getFrames().get(i), lineNumbers, frameTypes));
}
String line = stack.toString();
SimpleEntry<String, Long> entry =
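For context on the hunks above: the new `tests.profile.frametypes` flag is read exactly like the existing `tests.profile.linenumbers` flag and, when enabled, appends a frame-type suffix to each formatted frame. A minimal, self-contained Java sketch of that plumbing (the class name and `main` method are illustrative only, not part of the patch):

import jdk.jfr.consumer.RecordedFrame;

public class FrameTypeFlagSketch {
  static final String FRAMETYPES_KEY = "tests.profile.frametypes";
  static final String FRAMETYPES_DEFAULT = "true";

  // Mirrors the suffix added by frameToString when frameTypes is enabled,
  // e.g. " [JIT compiled code]" or " [Interpreted code]".
  static String suffix(RecordedFrame frame, boolean frameTypes) {
    return frameTypes ? " [" + frame.getType() + " code]" : "";
  }

  public static void main(String[] args) {
    boolean frameTypes =
        Boolean.parseBoolean(System.getProperty(FRAMETYPES_KEY, FRAMETYPES_DEFAULT));
    System.out.println("frame type suffixes enabled: " + frameTypes);
  }
}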

View File

@ -231,8 +231,8 @@ public class MissingDoclet extends StandardDoclet {
case PACKAGE:
checkComment(element);
break;
// class-like elements, check them, then recursively check their children (fields and
// methods)
case CLASS:
case INTERFACE:
case ENUM:
@ -257,7 +257,7 @@ public class MissingDoclet extends StandardDoclet {
}
}
break;
// method-like elements, check them if we are configured to do so
case METHOD:
case CONSTRUCTOR:
case FIELD:

View File

@ -80,6 +80,9 @@ ext {
// Minimum Java version required to compile and run Lucene.
minJavaVersion = JavaVersion.toVersion(deps.versions.minJava.get())
// also change this in extractor tool: ExtractForeignAPI
vectorIncubatorJavaVersions = [ JavaVersion.VERSION_21, JavaVersion.VERSION_22, JavaVersion.VERSION_23 ] as Set
// snapshot build marker used in scripts.
snapshotBuild = version.contains("SNAPSHOT")
@ -117,10 +120,6 @@ apply from: file('gradle/generation/local-settings.gradle')
// Make sure the build environment is consistent.
apply from: file('gradle/validation/check-environment.gradle')
// IDE support, settings and specials.
apply from: file('gradle/ide/intellij-idea.gradle')
apply from: file('gradle/ide/eclipse.gradle')
// Set up defaults and configure aspects for certain modules or functionality
// (java, tests)
apply from: file('gradle/java/folder-layout.gradle')
@ -133,6 +132,10 @@ apply from: file('gradle/testing/alternative-jdk-support.gradle')
apply from: file('gradle/java/jar-manifest.gradle')
apply from: file('gradle/java/modules.gradle')
// IDE support, settings and specials.
apply from: file('gradle/ide/intellij-idea.gradle')
apply from: file('gradle/ide/eclipse.gradle')
// Maven artifact publishing.
apply from: file('gradle/maven/publications.gradle')

View File

@ -67,6 +67,13 @@
</maintainer>
<!-- NOTE: please insert releases in numeric order, NOT chronologically. -->
<release>
<Version>
<name>lucene-9.11.1</name>
<created>2024-06-27</created>
<revision>9.11.1</revision>
</Version>
</release>
<release>
<Version>
<name>lucene-9.11.0</name>

View File

@ -0,0 +1,78 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
import subprocess
import sys
import tempfile
import urllib.request
'''
A simple tool to see diffs between main's version of CHANGES.txt entries for
a given release vs the stable branch's version. It's best to keep these 1)
identical and 2) matching what changes were actually backported to be honest
to users and avoid future annoying conflicts on backport.
'''
# e.g. python3 -u diff_lucene_changes.py branch_9_9 main 9.9.0
#
def get_changes_url(branch_name):
if os.path.isdir(branch_name):
url = f'file://{branch_name}/lucene/CHANGES.txt'
else:
url = f'https://raw.githubusercontent.com/apache/lucene/{branch_name}/lucene/CHANGES.txt'
print(f'NOTE: resolving {branch_name} --> {url}')
return url
def extract_release_section(changes_txt, release_name):
return re.search(f'=======+ Lucene {re.escape(release_name)} =======+(.*?)=======+ Lucene .*? =======+$',
changes_txt.decode('utf-8'), re.MULTILINE | re.DOTALL).group(1).encode('utf-8')
def main():
if len(sys.argv) < 3 or len(sys.argv) > 5:
print('\nUsage: python3 -u dev-tools/scripts/diff_lucene_changes.py <branch1-or-local-clone> <branch2-or-local-clone> <release-name> [diff-commandline-extras]\n')
print(' e.g.: python3 -u dev-tools/scripts/diff_lucene_changes.py branch_9_9 /l/trunk 9.9.0 "-w"\n')
sys.exit(1)
branch1 = sys.argv[1]
branch2 = sys.argv[2]
release_name = sys.argv[3]
if len(sys.argv) > 4:
diff_cl_extras = [sys.argv[4]]
else:
diff_cl_extras = []
branch1_changes = extract_release_section(urllib.request.urlopen(get_changes_url(branch1)).read(),
release_name)
branch2_changes = extract_release_section(urllib.request.urlopen(get_changes_url(branch2)).read(),
release_name)
with tempfile.NamedTemporaryFile() as f1, tempfile.NamedTemporaryFile() as f2:
f1.write(branch1_changes)
f2.write(branch2_changes)
command = ['diff'] + diff_cl_extras + [f1.name, f2.name]
# diff returns non-zero exit status when there are diffs, so don't pass check=True
print(subprocess.run(command, check=False, capture_output=True).stdout.decode('utf-8'))
if __name__ == '__main__':
main()

View File

@ -17,13 +17,6 @@
def resources = scriptResources(buildscript)
configure(rootProject) {
ext {
// also change this in extractor tool: ExtractForeignAPI
vectorIncubatorJavaVersions = [ JavaVersion.VERSION_21, JavaVersion.VERSION_22 ] as Set
}
}
configure(project(":lucene:core")) { configure(project(":lucene:core")) {
ext { ext {
apijars = layout.projectDirectory.dir("src/generated/jdk") apijars = layout.projectDirectory.dir("src/generated/jdk")

View File

@ -23,7 +23,7 @@ configure(project(":lucene:core")) {
description "Regenerate gen_ForUtil.py" description "Regenerate gen_ForUtil.py"
group "generation" group "generation"
def genDir = file("src/java/org/apache/lucene/codecs/lucene99") def genDir = file("src/java/org/apache/lucene/codecs/lucene912")
def genScript = file("${genDir}/gen_ForUtil.py") def genScript = file("${genDir}/gen_ForUtil.py")
def genOutput = file("${genDir}/ForUtil.java") def genOutput = file("${genDir}/ForUtil.java")
@ -43,6 +43,31 @@ configure(project(":lucene:core")) {
andThenTasks: ["spotlessJava", "spotlessJavaApply"], andThenTasks: ["spotlessJava", "spotlessJavaApply"],
mustRunBefore: [ "compileJava" ] mustRunBefore: [ "compileJava" ]
]) ])
task generateForDeltaUtilInternal() {
description "Regenerate gen_ForDeltaUtil.py"
group "generation"
def genDir = file("src/java/org/apache/lucene/codecs/lucene912")
def genScript = file("${genDir}/gen_ForDeltaUtil.py")
def genOutput = file("${genDir}/ForDeltaUtil.java")
inputs.file genScript
outputs.file genOutput
doLast {
quietExec {
workingDir genDir
executable project.externalTool("python3")
args = [ '-B', genScript ]
}
}
}
regenerate.dependsOn wrapWithPersistentChecksums(generateForDeltaUtilInternal, [
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
mustRunBefore: [ "compileJava" ]
])
}
configure(project(":lucene:backward-codecs")) {
@ -96,5 +121,30 @@ configure(project(":lucene:backward-codecs")) {
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
mustRunBefore: [ "compileJava" ]
])
task generateForUtil99Internal() {
description "Regenerate gen_ForUtil.py"
group "generation"
def genDir = file("src/java/org/apache/lucene/backward_codecs/lucene99")
def genScript = file("${genDir}/gen_ForUtil.py")
def genOutput = file("${genDir}/ForUtil.java")
inputs.file genScript
outputs.file genOutput
doLast {
quietExec {
workingDir genDir
executable project.externalTool("python3")
args = [ '-B', genScript ]
}
}
}
regenerate.dependsOn wrapWithPersistentChecksums(generateForUtil99Internal, [
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
mustRunBefore: [ "compileJava" ]
])
}

View File

@ -65,10 +65,8 @@ configure(project(":lucene:analysis:icu")) {
icupkg = file("${icuBinDir}/icupkg")
}
// Resolve version lazily (can't resolve at configuration time).
def icu4jVersionProvider = project.provider { getVersion('com.ibm.icu', 'icu4j') }
// lazy gstring with ICU version.
-def icu4jVersion = "${-> icu4jVersionProvider.get()}"
+def icu4jVersion = deps.icu4j.get().version
def icuCompileTask = Os.isFamily(Os.FAMILY_WINDOWS) ? "compileIcuWindows" : "compileIcuLinux"

View File

@ -22,10 +22,11 @@ import org.gradle.plugins.ide.eclipse.model.ClasspathEntry
def resources = scriptResources(buildscript)
configure(rootProject) {
-plugins.withType(JavaPlugin) {
-apply plugin: "eclipse"
+if (gradle.startParameter.taskNames.contains("eclipse")) {
+project.pluginManager.apply("java-base")
+project.pluginManager.apply("eclipse")
-def eclipseJavaVersion = propertyOrDefault("eclipse.javaVersion", rootProject.minJavaVersion)
+def eclipseJavaVersion = propertyOrDefault("eclipse.javaVersion", deps.versions.minJava.get())
def relativize = { other -> rootProject.rootDir.relativePath(other).toString() }
eclipse {
@ -105,9 +106,9 @@ configure(rootProject) {
}
}
eclipseJdt {
enabled = false
-dependsOn 'luceneEclipse'
+dependsOn 'luceneEclipseJdt'
}
eclipseClasspath {

View File

@ -75,6 +75,18 @@ configure(rootProject) {
it.dependsOn(":versionCatalogFormatDeps")
}
// correct crlf/ default encoding after version catalog formatting finishes.
tasks.matching {
it.path in [
":versionCatalogFormatDeps"
]
}.configureEach {
it.doLast {
ant.fixcrlf(file: it.catalogFile.get().asFile,
eol: "lf", fixlast: "true", encoding: "UTF-8")
}
}
tasks.matching {
it.path in [
":versionCatalogUpdateDeps"

View File

@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-@defaultMessage Spawns threads with vague names; use a custom thread factory (Lucene's NamedThreadFactory, Solr's SolrNamedThreadFactory) and name threads so that you can tell (by its name) which executor it is associated with
+@defaultMessage Spawns threads with vague names; use a custom thread factory (Lucene's NamedThreadFactory) and name threads so that you can tell (by its name) which executor it is associated with
java.util.concurrent.Executors#newFixedThreadPool(int)
java.util.concurrent.Executors#newSingleThreadExecutor()
java.util.concurrent.Executors#newCachedThreadPool()

View File

@ -20,6 +20,10 @@
// 2) notice file
// 3) checksum validation/ generation.
// WARNING: The tasks in this file share internal state between tasks without using files.
// Because of this all tasks here must always execute together, so they cannot define task outputs.
// TODO: Rewrite the internal state to use state files containing the ext.jarInfos and its referencedFiles
// This should be false only for debugging.
def failOnError = true
@ -194,13 +198,6 @@ subprojects {
description = "Validate license and notice files of dependencies"
dependsOn collectJarInfos
def outputFileName = 'validateJarLicenses'
inputs.dir(file(project.rootDir.path + '/lucene/licenses'))
.withPropertyName('licenses')
.withPathSensitivity(PathSensitivity.RELATIVE)
outputs.file(layout.buildDirectory.file(outputFileName))
.withPropertyName('validateJarLicensesResult')
doLast {
def errors = []
jarInfos.each { dep ->
@ -246,9 +243,7 @@ subprojects {
}
}
}
// Required to take advantage of incremental building and the build cache
def f = new File(project.buildDir.path + "/" + outputFileName)
f.write(errors.toString(), "UTF-8")
if (errors) {
def msg = "Certain license/ notice files are missing:\n - " + errors.join("\n - ")
if (failOnError) {

View File

@ -80,10 +80,6 @@ API Changes
* GITHUB#12875: Ensure token position is always increased in PathHierarchyTokenizer and ReversePathHierarchyTokenizer
and resulting tokens do not overlap. (Michael Froh, Lukáš Vlček)
* GITHUB#12624, GITHUB#12831: Allow FSTCompiler to stream to any DataOutput while building, and
make compile() only return the FSTMetadata. For on-heap (default) use case, please use
FST.fromFSTReader(fstMetadata, fstCompiler.getFSTReader()) to create the FST. (Anh Dung Bui)
* GITHUB#13146, GITHUB#13148: Remove ByteBufferIndexInput and only use MemorySegment APIs
for MMapDirectory. (Uwe Schindler)
@ -112,6 +108,11 @@ API Changes
* GITHUB#13410: Removed Scorer#getWeight (Sanjay Dutt, Adrien Grand)
* GITHUB#13499: Remove deprecated TopScoreDocCollector + TopFieldCollector methods (#create, #createSharedManager) (Jakub Slowinski)
* GITHUB#13632: CandidateMatcher public matching functions (Bryan Jacobowitz)
New Features
---------------------
@ -133,6 +134,16 @@ New Features
DocValuesSkipper abstraction. A new flag is added to FieldType.java that configures whether
to create a "skip index" for doc values. (Ignacio Vera)
* GITHUB#13563: Add levels to doc values skip index. (Ignacio Vera)
* GITHUB#13597: Align doc value skipper interval boundaries when an interval contains a constant
value. (Ignacio Vera)
* GITHUB#13604: Add Kmeans clustering on vectors (Mayya Sharipova, Jim Ferenczi, Tom Veasey)
* GITHUB#13592: Take advantage of the doc value skipper when it is primary sort in SortedNumericDocValuesRangeQuery
and SortedSetDocValuesRangeQuery. (Ignacio Vera)
Improvements
---------------------
@ -168,6 +179,8 @@ Optimizations
* GITHUB#12552: Make FSTPostingsFormat load FSTs off-heap. (Tony X)
* GITHUB#13672: Leverage doc value skip lists in DocValuesRewriteMethod if indexed. (Greg Miller)
Bug Fixes
---------------------
@ -205,6 +218,9 @@ Changes in Backwards Compatibility Policy
* GITHUB#13230: Remove the Kp and Lovins snowball algorithms which are not supported
or intended for general use. (Robert Muir)
* GITHUB#13602: SearchWithCollectorTask no longer supports the `collector.class` config parameter to load a custom
collector implementation. `collector.manager.class` allows users to load a collector manager instead. (Luca Cavanna)
Other
---------------------
@ -243,22 +259,71 @@ Other
* GITHUB#13332: Improve MissingDoclet linter to check records correctly. (Uwe Schindler)
* GITHUB#13499: Remove usage of TopScoreDocCollector + TopFieldCollector deprecated methods (#create, #createSharedManager) (Jakub Slowinski)
Build
---------------------
* GITHUB#13649: Fix eclipse ide settings generation #13649 (Uwe Schindler, Dawid Weiss)
======================== Lucene 9.12.0 =======================
API Changes
---------------------
* GITHUB#13281: Mark COSINE VectorSimilarityFunction as deprecated. (Pulkit Gupta)
* GITHUB#13469: Expose FlatVectorsFormat as a first-class format; can be configured using a custom Codec. (Michael Sokolov)
* GITHUB#13612: Hunspell: add Suggester#proceedPastRep to avoid losing relevant suggestions. (Peter Gromov)
* GITHUB#13603: Introduced `IndexSearcher#searchLeaf(LeafReaderContext, Weight, Collector)` protected method to
facilitate customizing per-leaf behavior of search without requiring to override
`search(LeafReaderContext[], Weight, Collector)` which requires overriding the entire loop across the leaves (Luca Cavanna)
* GITHUB#13559: Add BitSet#nextSetBit(int, int) to get the index of the first set bit in range. (Egor Potemkin)
* GITHUB#13568: Add DoubleValuesSource#toSortableLongDoubleValuesSource and
MultiDoubleValuesSource#toSortableMultiLongValuesSource methods. (Shradha Shankar)
* GITHUB#13568: Add CollectorOwner class that wraps CollectorManager, and handles list of Collectors and results.
Add IndexSearcher#search method that takes CollectorOwner. (Egor Potemkin)
* GITHUB#13568: Add DrillSideways#search method that supports any collector types for any drill-sideways dimensions
or drill-down. (Egor Potemkin)
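A hedged sketch for the GITHUB#13603 entry above: a subclass can now hook per-leaf search without re-implementing the loop across leaves. The signature follows the entry; the class name and timing output are purely illustrative:

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Weight;

public class TimingIndexSearcher extends IndexSearcher {
  public TimingIndexSearcher(IndexReader reader) {
    super(reader);
  }

  @Override
  protected void searchLeaf(LeafReaderContext ctx, Weight weight, Collector collector)
      throws IOException {
    long start = System.nanoTime();
    super.searchLeaf(ctx, weight, collector); // default per-leaf behavior
    System.out.println("leaf " + ctx.ord + " searched in " + (System.nanoTime() - start) + " ns");
  }
}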
New Features
---------------------
(No changes)
* GITHUB#13430: Allow configuring the search concurrency via
TieredMergePolicy#setTargetSearchConcurrency. This in-turn instructs the
merge policy to try to have at least this number of segments on the highest
tier. (Adrien Grand, Carlos Delgado)
* GITHUB#13517: Allow configuring the search concurrency on LogDocMergePolicy
and LogByteSizeMergePolicy via a new #setTargetConcurrency setter.
(Adrien Grand)
* GITHUB#13568: Add sandbox facets module to compute facets while collecting. (Egor Potemkin, Shradha Shankar)
* GITHUB#13678: Add support JDK 23 to the Panama Vectorization Provider. (Chris Hegarty)
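A hedged usage sketch for the GITHUB#13430 entry above; the value 8 is arbitrary and only shows where the new setter plugs into an IndexWriterConfig:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.TieredMergePolicy;

public class TargetSearchConcurrencySketch {
  static IndexWriterConfig newConfig() {
    TieredMergePolicy mergePolicy = new TieredMergePolicy();
    // Keep at least ~8 segments on the highest tier so that up to 8 index
    // slices remain available for concurrent searching.
    mergePolicy.setTargetSearchConcurrency(8);
    return new IndexWriterConfig(new StandardAnalyzer()).setMergePolicy(mergePolicy);
  }
}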
Improvements
---------------------
(No changes)
* GITHUB#13548: Refactor and javadoc update for KNN vector writer classes. (Patrick Zhai)
* GITHUB#13562: Add Intervals.regexp and Intervals.range methods to produce IntervalsSource
for regexp and range queries. (Mayya Sharipova)
* GITHUB#13625: Remove BitSet#nextSetBit code duplication. (Greg Miller)
* GITHUB#13285: Early terminate graph searches of AbstractVectorSimilarityQuery to follow timeout set from
IndexSearcher#setTimeout(QueryTimeout). (Kaival Parikh)
* GITHUB#13633: Add ability to read/write knn vector values to a MemoryIndex. (Ben Trent)
* GITHUB#12627: patch HNSW graphs to improve reachability of all nodes from entry points
* GITHUB#13201: Better cost estimation on MultiTermQuery over few terms. (Michael Froh)
Optimizations
---------------------
@ -277,16 +342,100 @@ Optimizations
* GITHUB#12941: Don't preserve auxiliary buffer contents in LSBRadixSorter if it grows. (Stefan Vodita)
* GITHUB#13175: Stop double-checking priority queue inserts in some FacetCount classes. (Jakub Slowinski)
* GITHUB#13538: Slightly reduce heap usage for HNSW and scalar quantized vector writers. (Ben Trent)
* GITHUB#12100: WordBreakSpellChecker.suggestWordBreaks now does a breadth first search, allowing it to return
better matches with fewer evaluations (hossman)
* GITHUB#13582: Stop requiring MaxScoreBulkScorer's outer window from having at
least INNER_WINDOW_SIZE docs. (Adrien Grand)
* GITHUB#13570, GITHUB#13574, GITHUB#13535: Avoid performance degradation with closing shared Arenas.
Closing many individual index files can potentially lead to a degradation in execution performance.
Index files are mmapped one-to-one with the JDK's foreign shared Arena. The JVM deoptimizes the top
few frames of all threads when closing a shared Arena (see JDK-8335480). We mitigate this situation
when running with JDK 21 and greater, by 1) using a confined Arena where appropriate, and 2) grouping
files from the same segment to a single shared Arena.
A system property has been added that allows to control the total maximum number of mmapped files
that may be associated with a single shared Arena. For example, to set the max number of permits to
256, pass the following on the command line
-Dorg.apache.lucene.store.MMapDirectory.sharedArenaMaxPermits=256. Setting a value of 1 associates
a single file to a single shared arena.
(Chris Hegarty, Michael Gibney, Uwe Schindler)
* GITHUB#13585: Lucene912PostingsFormat, the new default postings format, now
only has 2 levels of skip data, which are inlined into postings instead of
being stored at the end of postings lists. This translates into better
performance for queries that need skipping such as conjunctions.
(Adrien Grand)
* GITHUB#13581: OnHeapHnswGraph no longer allocates a lock for every graph node (Mike Sokolov)
* GITHUB#13636, GITHUB#13658: Optimizations to the decoding logic of blocks of
postings. (Adrien Grand, Uwe Schindler, Greg Miller)
* GITHUB#13644: Improve NumericComparator competitive iterator logic by comparing the missing value with the top
value even after the hit queue is full (Pan Guixin)
* GITHUB#13587: Use Max WAND optimizations with ToParentBlockJoinQuery when using ScoreMode.Max (Mike Pellegrini)
Changes in runtime behavior
---------------------
* GITHUB#13472: When an executor is provided to the IndexSearcher constructor, the searcher now executes tasks on the
thread that invoked a search as well as its configured executor. Users should reduce the executor's thread-count by 1
to retain the previous level of parallelism. Moreover, it is now possible to start searches from the same executor
that is configured in the IndexSearcher without risk of deadlocking. A separate executor for starting searches is no
longer required. (Armin Braun)
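A hedged illustration of the sizing guidance above: because the calling thread now participates in search execution, an executor that previously needed N threads can be created with N - 1. Names and pool size below are illustrative:

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.NamedThreadFactory;

public class SearcherExecutorSketch {
  static IndexSearcher newSearcher(Directory dir) throws Exception {
    int n = Runtime.getRuntime().availableProcessors();
    // Previously this pool might have been sized with n threads; n - 1 keeps
    // the same overall parallelism now that the caller thread also runs tasks.
    ExecutorService executor =
        Executors.newFixedThreadPool(Math.max(1, n - 1), new NamedThreadFactory("search"));
    return new IndexSearcher(DirectoryReader.open(dir), executor);
  }
}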
Bug Fixes
---------------------
* GITHUB#13384: Fix highlighter to use longer passages instead of shorter individual terms. (Zack Kendall)
* GITHUB#13463: Address bug in MultiLeafKnnCollector causing #minCompetitiveSimilarity to stay artificially low in
some corner cases. (Greg Miller)
* GITHUB#13553: Correct RamUsageEstimate for scalar quantized knn vector formats so that raw vectors are correctly
accounted for. (Ben Trent)
* GITHUB#13615: Correct scalar quantization when used in conjunction with COSINE similarity. Vectors are normalized
before quantization to ensure the cosine similarity is correctly calculated. (Ben Trent)
* GITHUB#13627: Fix race condition on flush for DWPT seqNo generation. (Ben Trent, Ao Li)
* GITHUB#13691: Fix incorrect exponent value in explain of SigmoidFunction. (Owais Kazi)
Build
---------------------
* GITHUB#13695, GITHUB#13696: Fix Gradle build sometimes gives spurious "unreferenced license file" warnings.
(Uwe Schindler)
Other
--------------------
(No changes)
======================== Lucene 9.11.1 =======================
Bug Fixes
---------------------
* GITHUB#13498: Avoid performance regression by constructing lazily the PointTree in NumericComparator. (Ignacio Vera)
* GITHUB#13501, GITHUB#13478: Remove intra-merge parallelism for everything except HNSW graph merges. (Ben Trent)
* GITHUB#13498, GITHUB#13340: Allow adding a parent field to an index with no fields (Michael Sokolov)
* GITHUB#12431: Fix IndexOutOfBoundsException thrown in DefaultPassageFormatter
by unordered matches. (Stephane Campinas)
* GITHUB#13493: StringValueFacetCounts stops throwing NPE when faceting over an empty match-set. (Grebennikov Roman,
Stefan Vodita)
======================== Lucene 9.11.0 =======================
API Changes
@ -494,6 +643,10 @@ API Changes
* GITHUB#12854: Mark DrillSideways#createDrillDownFacetsCollector as @Deprecated. (Greg Miller)
* GITHUB#12624, GITHUB#12831: Allow FSTCompiler to stream to any DataOutput while building, and
make compile() only return the FSTMetadata. For on-heap (default) use case, please use
FST.fromFSTReader(fstMetadata, fstCompiler.getFSTReader()) to create the FST. (Anh Dung Bui)
New Features
---------------------
* GITHUB#12679: Add support for similarity-based vector searches using [Byte|Float]VectorSimilarityQuery. Uses a new
@ -501,6 +654,12 @@ New Features
better-scoring nodes are available, or the best candidate is below a score of `traversalSimilarity` in the lowest
level. (Aditya Prakash, Kaival Parikh)
* GITHUB#12829: For indices newly created as of 9.10.0 onwards, IndexWriter preserves document blocks indexed via
IndexWriter#addDocuments or IndexWriter#updateDocuments also when index sorting is configured. Document blocks are
maintained alongside their parent documents during sort and merge. IndexWriterConfig accepts a parent field that is used
to maintain block orders if index sorting is used. Note, this is fully optional in Lucene 9.x while will be mandatory for
indices that use document blocks together with index sorting as of 10.0.0. (Simon Willnauer)
* GITHUB#12336: Index additional data per facet label in the taxonomy. (Shai Erera, Egor Potemkin, Mike McCandless,
Stefan Vodita)
@ -592,7 +751,6 @@ Build
Other
---------------------
* GITHUB#11023: Removing some dead code in CheckIndex. (Jakub Slowinski)
* GITHUB#11023: Removing @lucene.experimental tags in testXXX methods in CheckIndex. (Jakub Slowinski)

View File

@ -1,5 +1,5 @@
{
"gradle/generation/jflex/skeleton.default.txt": "58944f66c9113a940dfaf6a17210ec8219024390",
-"lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.java": "1f7a446f3483326385eef257cea8366c27da0850",
+"lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.java": "e62dcd8c25219d8f5d783823b228ffe38d2bacde",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.jflex": "f52109bb7d5701979fde90aeeeda726246a8d5fd"
}

View File

@ -1,5 +1,5 @@
{
"gradle/generation/jflex/skeleton.default.txt": "58944f66c9113a940dfaf6a17210ec8219024390",
-"lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java": "ac298e08bc5b96202efca0c01f9f0376fda976bd",
+"lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java": "2b5df5ff35543a6380c82f298225eb5fa06e4453",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex": "0b8c7774b98e8237702013e82c352d4711509bd0"
}

View File

@ -37,23 +37,23 @@ class BengaliNormalizer {
for (int i = 0; i < len; i++) { for (int i = 0; i < len; i++) {
switch (s[i]) { switch (s[i]) {
// delete Chandrabindu // delete Chandrabindu
case '\u0981': case '\u0981':
len = delete(s, i, len); len = delete(s, i, len);
i--; i--;
break; break;
// DirghoI kar -> RosshoI kar // DirghoI kar -> RosshoI kar
case '\u09C0': case '\u09C0':
s[i] = '\u09BF'; s[i] = '\u09BF';
break; break;
// DirghoU kar -> RosshoU kar // DirghoU kar -> RosshoU kar
case '\u09C2': case '\u09C2':
s[i] = '\u09C1'; s[i] = '\u09C1';
break; break;
// Khio (Ka + Hoshonto + Murdorno Sh) // Khio (Ka + Hoshonto + Murdorno Sh)
case '\u0995': case '\u0995':
if (i + 2 < len && s[i + 1] == '\u09CD' && s[i + 2] == '\u09BF') { if (i + 2 < len && s[i + 1] == '\u09CD' && s[i + 2] == '\u09BF') {
if (i == 0) { if (i == 0) {
@ -67,12 +67,12 @@ class BengaliNormalizer {
} }
break; break;
// Nga to Anusvara // Nga to Anusvara
case '\u0999': case '\u0999':
s[i] = '\u0982'; s[i] = '\u0982';
break; break;
// Ja Phala // Ja Phala
case '\u09AF': case '\u09AF':
if (i - 2 == 0 && s[i - 1] == '\u09CD') { if (i - 2 == 0 && s[i - 1] == '\u09CD') {
s[i - 1] = '\u09C7'; s[i - 1] = '\u09C7';
@ -89,7 +89,7 @@ class BengaliNormalizer {
} }
break; break;
// Ba Phalaa // Ba Phalaa
case '\u09AC': case '\u09AC':
if ((i >= 1 && s[i - 1] != '\u09CD') || i == 0) { if ((i >= 1 && s[i - 1] != '\u09CD') || i == 0) {
break; break;
@ -109,7 +109,7 @@ class BengaliNormalizer {
} }
break; break;
// Visarga // Visarga
case '\u0983': case '\u0983':
if (i == len - 1) { if (i == len - 1) {
if (len <= 3) { if (len <= 3) {
@ -122,18 +122,18 @@ class BengaliNormalizer {
} }
break; break;
// All sh // All sh
case '\u09B6': case '\u09B6':
case '\u09B7': case '\u09B7':
s[i] = '\u09B8'; s[i] = '\u09B8';
break; break;
// check na // check na
case '\u09A3': case '\u09A3':
s[i] = '\u09A8'; s[i] = '\u09A8';
break; break;
// check ra // check ra
case '\u09DC': case '\u09DC':
case '\u09DD': case '\u09DD':
s[i] = '\u09B0'; s[i] = '\u09B0';

View File

@ -747,70 +747,70 @@ class ClassicTokenizerImpl {
/* Break so we don't hit fall-through warning: */ /* Break so we don't hit fall-through warning: */
break; /* ignore */ break; /* ignore */
} }
// fall through // fall through
case 11: case 11:
break; break;
case 2: case 2:
{ {
return ALPHANUM; return ALPHANUM;
} }
// fall through // fall through
case 12: case 12:
break; break;
case 3: case 3:
{ {
return CJ; return CJ;
} }
// fall through // fall through
case 13: case 13:
break; break;
case 4: case 4:
{ {
return NUM; return NUM;
} }
// fall through // fall through
case 14: case 14:
break; break;
case 5: case 5:
{ {
return HOST; return HOST;
} }
// fall through // fall through
case 15: case 15:
break; break;
case 6: case 6:
{ {
return COMPANY; return COMPANY;
} }
// fall through // fall through
case 16: case 16:
break; break;
case 7: case 7:
{ {
return APOSTROPHE; return APOSTROPHE;
} }
// fall through // fall through
case 17: case 17:
break; break;
case 8: case 8:
{ {
return ACRONYM_DEP; return ACRONYM_DEP;
} }
// fall through // fall through
case 18: case 18:
break; break;
case 9: case 9:
{ {
return ACRONYM; return ACRONYM;
} }
// fall through // fall through
case 19: case 19:
break; break;
case 10: case 10:
{ {
return EMAIL; return EMAIL;
} }
// fall through // fall through
case 20: case 20:
break; break;
default: default:

View File

@ -53,18 +53,18 @@ public final class GreekLowerCaseFilter extends TokenFilter {
private int lowerCase(int codepoint) {
switch (codepoint) {
/* There are two lowercase forms of sigma:
* U+03C2: small final sigma (end of word)
* U+03C3: small sigma (otherwise)
*
* Standardize both to U+03C3
*/
case '\u03C2': /* small final sigma */
return '\u03C3'; /* small sigma */
/* Some greek characters contain diacritics.
* This filter removes these, converting to the lowercase base form.
*/
case '\u0386': /* capital alpha with tonos */
case '\u03AC': /* small alpha with tonos */
@ -100,9 +100,9 @@ public final class GreekLowerCaseFilter extends TokenFilter {
case '\u03CE': /* small omega with tonos */
return '\u03C9'; /* small omega */
/* The previous implementation did the conversion below.
* Only implemented for backwards compatibility with old indexes.
*/
case '\u03A2': /* reserved */
return '\u03C2'; /* small final sigma */

View File

@ -456,7 +456,7 @@ class PorterStemmer {
/* j >= 0 fixes Bug 2 */
if (ends("ou")) break;
return;
/* takes care of -ous */
case 's':
if (ends("ism")) break;
return;

View File

@ -67,7 +67,7 @@ public final class IrishLowerCaseFilter extends TokenFilter {
case 'I':
case 'O':
case 'U':
// vowels with acute accent (fada)
case '\u00c1':
case '\u00c9':
case '\u00cd':

View File

@ -47,18 +47,18 @@ class HindiNormalizer {
for (int i = 0; i < len; i++) { for (int i = 0; i < len; i++) {
switch (s[i]) { switch (s[i]) {
// dead n -> bindu // dead n -> bindu
case '\u0928': case '\u0928':
if (i + 1 < len && s[i + 1] == '\u094D') { if (i + 1 < len && s[i + 1] == '\u094D') {
s[i] = '\u0902'; s[i] = '\u0902';
len = delete(s, i + 1, len); len = delete(s, i + 1, len);
} }
break; break;
// candrabindu -> bindu // candrabindu -> bindu
case '\u0901': case '\u0901':
s[i] = '\u0902'; s[i] = '\u0902';
break; break;
// nukta deletions // nukta deletions
case '\u093C': case '\u093C':
len = delete(s, i, len); len = delete(s, i, len);
i--; i--;
@ -96,18 +96,18 @@ class HindiNormalizer {
case '\u095F': case '\u095F':
s[i] = '\u092F'; s[i] = '\u092F';
break; break;
// zwj/zwnj -> delete // zwj/zwnj -> delete
case '\u200D': case '\u200D':
case '\u200C': case '\u200C':
len = delete(s, i, len); len = delete(s, i, len);
i--; i--;
break; break;
// virama -> delete // virama -> delete
case '\u094D': case '\u094D':
len = delete(s, i, len); len = delete(s, i, len);
i--; i--;
break; break;
// chandra/short -> replace // chandra/short -> replace
case '\u0945': case '\u0945':
case '\u0946': case '\u0946':
s[i] = '\u0947'; s[i] = '\u0947';
@ -127,7 +127,7 @@ class HindiNormalizer {
case '\u0972': case '\u0972':
s[i] = '\u0905'; s[i] = '\u0905';
break; break;
// long -> short ind. vowels // long -> short ind. vowels
case '\u0906': case '\u0906':
s[i] = '\u0905'; s[i] = '\u0905';
break; break;
@ -149,7 +149,7 @@ class HindiNormalizer {
case '\u0914': case '\u0914':
s[i] = '\u0913'; s[i] = '\u0913';
break; break;
// long -> short dep. vowels // long -> short dep. vowels
case '\u0940': case '\u0940':
s[i] = '\u093F'; s[i] = '\u093F';
break; break;

View File

@ -31,6 +31,7 @@ class ModifyingSuggester {
private final String misspelled;
private final WordCase wordCase;
private final FragmentChecker fragmentChecker;
private final boolean proceedPastRep;
private final char[] tryChars;
private final Hunspell speller;
@ -39,13 +40,15 @@
LinkedHashSet<Suggestion> result,
String misspelled,
WordCase wordCase,
-FragmentChecker checker) {
+FragmentChecker checker,
+boolean proceedPastRep) {
this.speller = speller;
tryChars = speller.dictionary.tryChars.toCharArray();
this.result = result;
this.misspelled = misspelled;
this.wordCase = wordCase;
fragmentChecker = checker;
this.proceedPastRep = proceedPastRep;
}
/**
@ -125,9 +128,9 @@ class ModifyingSuggester {
boolean hasGoodSuggestions = trySuggestion(word.toUpperCase(Locale.ROOT));
GradedSuggestions repResult = tryRep(word);
-if (repResult == GradedSuggestions.Best) return true;
+if (repResult == GradedSuggestions.Best && !proceedPastRep) return true;
-hasGoodSuggestions |= repResult == GradedSuggestions.Normal;
+hasGoodSuggestions |= repResult != GradedSuggestions.None;
if (!speller.dictionary.mapTable.isEmpty()) {
enumerateMapReplacements(word, "", 0);

View File

@ -53,16 +53,21 @@ public class Suggester {
private final Dictionary dictionary;
private final SuggestibleEntryCache suggestibleCache;
private final FragmentChecker fragmentChecker;
private final boolean proceedPastRep;
public Suggester(Dictionary dictionary) {
-this(dictionary, null, FragmentChecker.EVERYTHING_POSSIBLE);
+this(dictionary, null, FragmentChecker.EVERYTHING_POSSIBLE, false);
}
private Suggester(
-Dictionary dictionary, SuggestibleEntryCache suggestibleCache, FragmentChecker checker) {
+Dictionary dictionary,
+SuggestibleEntryCache suggestibleCache,
+FragmentChecker checker,
+boolean proceedPastRep) {
this.dictionary = dictionary;
this.suggestibleCache = suggestibleCache;
this.fragmentChecker = checker;
this.proceedPastRep = proceedPastRep;
}
/**
@ -71,8 +76,8 @@ public class Suggester {
* entries are stored as fast-to-iterate plain words instead of highly compressed prefix trees.
*/
public Suggester withSuggestibleEntryCache() {
-return new Suggester(
-dictionary, SuggestibleEntryCache.buildCache(dictionary.words), fragmentChecker);
+SuggestibleEntryCache cache = SuggestibleEntryCache.buildCache(dictionary.words);
+return new Suggester(dictionary, cache, fragmentChecker, proceedPastRep);
}
/**
@ -80,7 +85,17 @@ public class Suggester {
* the performance of the "Modification" phase performance.
*/
public Suggester withFragmentChecker(FragmentChecker checker) {
-return new Suggester(dictionary, suggestibleCache, checker);
+return new Suggester(dictionary, suggestibleCache, checker, proceedPastRep);
}
/**
* Returns a copy of this suggester instance that doesn't stop after encountering acceptable words
* after applying REP rules. By default, Hunspell stops when it finds any, but this behavior may
* not always be desirable, e.g., if we have "REP i ea", "tims" would be replaced only by "teams"
* and not "times", which could also be meant.
*/
public Suggester proceedPastRep() {
return new Suggester(dictionary, suggestibleCache, fragmentChecker, true);
}
/**
@ -174,7 +189,8 @@ public class Suggester {
}
boolean hasGoodSuggestions =
-new ModifyingSuggester(suggestionSpeller, suggestions, word, wordCase, fragmentChecker)
+new ModifyingSuggester(
+suggestionSpeller, suggestions, word, wordCase, fragmentChecker, proceedPastRep)
.suggest();
if (!hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
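A hedged usage sketch of the `proceedPastRep()` option introduced in this file; how the Dictionary is loaded is out of scope here, the point is only the fluent wiring described in the javadoc above:

import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.Suggester;

public class ProceedPastRepSketch {
  // Keeps generating modifications even after an acceptable REP-based suggestion
  // was found, so that for "REP i ea" the misspelling "tims" can yield both
  // "teams" and "times".
  static Suggester newSuggester(Dictionary dictionary) {
    return new Suggester(dictionary).withSuggestibleEntryCache().proceedPastRep();
  }
}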

View File

@ -194,7 +194,7 @@ public final class WordDelimiterIterator {
int type = charType(text[current]); int type = charType(text[current]);
switch (type) { switch (type) {
// return ALPHA word type for both lower and upper // return ALPHA word type for both lower and upper
case LOWER: case LOWER:
case UPPER: case UPPER:
return ALPHA; return ALPHA;
@ -332,27 +332,27 @@ public final class WordDelimiterIterator {
case Character.OTHER_NUMBER: case Character.OTHER_NUMBER:
return DIGIT; return DIGIT;
// case Character.SPACE_SEPARATOR: // case Character.SPACE_SEPARATOR:
// case Character.LINE_SEPARATOR: // case Character.LINE_SEPARATOR:
// case Character.PARAGRAPH_SEPARATOR: // case Character.PARAGRAPH_SEPARATOR:
// case Character.CONTROL: // case Character.CONTROL:
// case Character.FORMAT: // case Character.FORMAT:
// case Character.PRIVATE_USE: // case Character.PRIVATE_USE:
case Character.SURROGATE: // prevent splitting case Character.SURROGATE: // prevent splitting
return ALPHA | DIGIT; return ALPHA | DIGIT;
// case Character.DASH_PUNCTUATION: // case Character.DASH_PUNCTUATION:
// case Character.START_PUNCTUATION: // case Character.START_PUNCTUATION:
// case Character.END_PUNCTUATION: // case Character.END_PUNCTUATION:
// case Character.CONNECTOR_PUNCTUATION: // case Character.CONNECTOR_PUNCTUATION:
// case Character.OTHER_PUNCTUATION: // case Character.OTHER_PUNCTUATION:
// case Character.MATH_SYMBOL: // case Character.MATH_SYMBOL:
// case Character.CURRENCY_SYMBOL: // case Character.CURRENCY_SYMBOL:
// case Character.MODIFIER_SYMBOL: // case Character.MODIFIER_SYMBOL:
// case Character.OTHER_SYMBOL: // case Character.OTHER_SYMBOL:
// case Character.INITIAL_QUOTE_PUNCTUATION: // case Character.INITIAL_QUOTE_PUNCTUATION:
// case Character.FINAL_QUOTE_PUNCTUATION: // case Character.FINAL_QUOTE_PUNCTUATION:
default: default:
return SUBWORD_DELIM; return SUBWORD_DELIM;

View File

@ -38,25 +38,25 @@ class TeluguNormalizer {
for (int i = 0; i < len; i++) { for (int i = 0; i < len; i++) {
switch (s[i]) { switch (s[i]) {
// candrabindu ( and ) -> bindu () // candrabindu ( and ) -> bindu ()
case '\u0C00': // case '\u0C00': //
case '\u0C01': // case '\u0C01': //
s[i] = '\u0C02'; // s[i] = '\u0C02'; //
break; break;
// delete visarga () // delete visarga ()
case '\u0C03': case '\u0C03':
len = delete(s, i, len); len = delete(s, i, len);
i--; i--;
break; break;
// zwj/zwnj -> delete // zwj/zwnj -> delete
case '\u200D': case '\u200D':
case '\u200C': case '\u200C':
len = delete(s, i, len); len = delete(s, i, len);
i--; i--;
break; break;
// long -> short vowels // long -> short vowels
case '\u0C14': // case '\u0C14': //
s[i] = '\u0C13'; // s[i] = '\u0C13'; //
break; break;
@ -73,7 +73,7 @@ class TeluguNormalizer {
s[i] = '\u0C09'; // s[i] = '\u0C09'; //
break; break;
// long -> short vowels matras // long -> short vowels matras
case '\u0C40': // case '\u0C40': //
s[i] = '\u0C3F'; // ి s[i] = '\u0C3F'; // ి
break; break;
@ -86,14 +86,14 @@ class TeluguNormalizer {
case '\u0C4B': // case '\u0C4B': //
s[i] = '\u0C4A'; // s[i] = '\u0C4A'; //
break; break;
// decomposed dipthong ( + ) -> precomposed diphthong vowel sign () // decomposed dipthong ( + ) -> precomposed diphthong vowel sign ()
case '\u0C46': case '\u0C46':
if (i + 1 < len && s[i + 1] == '\u0C56') { if (i + 1 < len && s[i + 1] == '\u0C56') {
s[i] = '\u0C48'; s[i] = '\u0C48';
len = delete(s, i + 1, len); len = delete(s, i + 1, len);
} }
break; break;
// composed oo or au -> oo or au // composed oo or au -> oo or au
case '\u0C12': case '\u0C12':
if (i + 1 < len && s[i + 1] == '\u0C55') { if (i + 1 < len && s[i + 1] == '\u0C55') {
// ( + ) -> oo () // ( + ) -> oo ()

View File

@ -61,12 +61,12 @@ public final class TurkishLowerCaseFilter extends TokenFilter {
if (iOrAfter) { // all the special I turkish handling happens here.
switch (ch) {
// remove COMBINING_DOT_ABOVE to mimic composed lowercase
case COMBINING_DOT_ABOVE:
length = delete(buffer, i, length);
continue;
// i itself, it depends if it is followed by COMBINING_DOT_ABOVE
// if it is, we will make it small i and later remove the dot
case LATIN_CAPITAL_LETTER_I:
if (isBeforeDot(buffer, i + 1, length)) {
buffer[i] = LATIN_SMALL_LETTER_I;

View File

@ -901,7 +901,7 @@ class WikipediaTokenizerImpl {
positionInc = 1; /* Break so we don't hit fall-through warning: */ positionInc = 1; /* Break so we don't hit fall-through warning: */
break; break;
} }
// fall through // fall through
case 47: case 47:
break; break;
case 2: case 2:
@ -909,7 +909,7 @@ class WikipediaTokenizerImpl {
positionInc = 1; positionInc = 1;
return ALPHANUM; return ALPHANUM;
} }
// fall through // fall through
case 48: case 48:
break; break;
case 3: case 3:
@ -920,7 +920,7 @@ class WikipediaTokenizerImpl {
yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
break; break;
} }
// fall through // fall through
case 49: case 49:
break; break;
case 4: case 4:
@ -928,7 +928,7 @@ class WikipediaTokenizerImpl {
positionInc = 1; positionInc = 1;
return CJ; return CJ;
} }
// fall through // fall through
case 50: case 50:
break; break;
case 5: case 5:
@ -936,7 +936,7 @@ class WikipediaTokenizerImpl {
positionInc = 1; /* Break so we don't hit fall-through warning: */ positionInc = 1; /* Break so we don't hit fall-through warning: */
break; break;
} }
// fall through // fall through
case 51: case 51:
break; break;
case 6: case 6:
@ -945,7 +945,7 @@ class WikipediaTokenizerImpl {
numWikiTokensSeen++; numWikiTokensSeen++;
return currentTokType; return currentTokType;
} }
// fall through // fall through
case 52: case 52:
break; break;
case 7: case 7:
@ -954,7 +954,7 @@ class WikipediaTokenizerImpl {
numWikiTokensSeen++; numWikiTokensSeen++;
return currentTokType; return currentTokType;
} }
// fall through // fall through
case 53: case 53:
break; break;
case 8: case 8:
@ -962,7 +962,7 @@ class WikipediaTokenizerImpl {
/* Break so we don't hit fall-through warning: */ /* Break so we don't hit fall-through warning: */
break; /* ignore */ break; /* ignore */
} }
// fall through // fall through
case 54: case 54:
break; break;
case 9: case 9:
@ -978,7 +978,7 @@ class WikipediaTokenizerImpl {
numLinkToks++; numLinkToks++;
return currentTokType; return currentTokType;
} }
// fall through // fall through
case 55: case 55:
break; break;
case 10: case 10:
@ -988,7 +988,7 @@ class WikipediaTokenizerImpl {
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
break; break;
} }
// fall through // fall through
case 56: case 56:
break; break;
case 11: case 11:
@ -997,7 +997,7 @@ class WikipediaTokenizerImpl {
yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */
break; break;
} }
// fall through // fall through
case 57: case 57:
break; break;
case 12: case 12:
@ -1007,7 +1007,7 @@ class WikipediaTokenizerImpl {
yybegin(STRING); yybegin(STRING);
return currentTokType; /*italics*/ return currentTokType; /*italics*/
} }
// fall through // fall through
case 58: case 58:
break; break;
case 13: case 13:
@ -1017,7 +1017,7 @@ class WikipediaTokenizerImpl {
yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
break; break;
} }
// fall through // fall through
case 59: case 59:
break; break;
case 14: case 14:
@ -1026,7 +1026,7 @@ class WikipediaTokenizerImpl {
numWikiTokensSeen++; numWikiTokensSeen++;
return currentTokType; return currentTokType;
} }
// fall through // fall through
case 60: case 60:
break; break;
case 15: case 15:
@ -1036,7 +1036,7 @@ class WikipediaTokenizerImpl {
numWikiTokensSeen++; numWikiTokensSeen++;
return currentTokType; return currentTokType;
} }
// fall through // fall through
case 61: case 61:
break; break;
case 16: case 16:
@ -1046,7 +1046,7 @@ class WikipediaTokenizerImpl {
yybegin(STRING); /* Break so we don't hit fall-through warning: */ yybegin(STRING); /* Break so we don't hit fall-through warning: */
break; break;
} }
// fall through // fall through
case 62: case 62:
break; break;
case 17: case 17:
@ -1055,7 +1055,7 @@ class WikipediaTokenizerImpl {
numWikiTokensSeen = 0; numWikiTokensSeen = 0;
return currentTokType; return currentTokType;
} }
// fall through // fall through
case 63: case 63:
break; break;
case 18: case 18:
@ -1063,7 +1063,7 @@ class WikipediaTokenizerImpl {
/* Break so we don't hit fall-through warning: */ /* Break so we don't hit fall-through warning: */
break; /* ignore STRING */ break; /* ignore STRING */
} }
// fall through // fall through
case 64: case 64:
break; break;
case 19: case 19:
@ -1072,7 +1072,7 @@ class WikipediaTokenizerImpl {
numWikiTokensSeen++; numWikiTokensSeen++;
return currentTokType; /* STRING ALPHANUM*/ return currentTokType; /* STRING ALPHANUM*/
} }
// fall through // fall through
case 65: case 65:
break; break;
case 20: case 20:
@ -1083,7 +1083,7 @@ class WikipediaTokenizerImpl {
yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
break; break;
} }
// fall through // fall through
case 66: case 66:
break; break;
case 21: case 21:
@ -1091,7 +1091,7 @@ class WikipediaTokenizerImpl {
yybegin(STRING); yybegin(STRING);
return currentTokType; /*pipe*/ return currentTokType; /*pipe*/
} }
// fall through // fall through
case 67: case 67:
break; break;
case 22: case 22:
@ -1106,7 +1106,7 @@ class WikipediaTokenizerImpl {
} /* Break so we don't hit fall-through warning: */ } /* Break so we don't hit fall-through warning: */
break; break;
} }
// fall through // fall through
case 68: case 68:
break; break;
case 23: case 23:
@ -1116,7 +1116,7 @@ class WikipediaTokenizerImpl {
yybegin(DOUBLE_EQUALS_STATE); /* Break so we don't hit fall-through warning: */ yybegin(DOUBLE_EQUALS_STATE); /* Break so we don't hit fall-through warning: */
break; break;
} }
// fall through // fall through
case 69: case 69:
break; break;
case 24: case 24:
@ -1127,7 +1127,7 @@ class WikipediaTokenizerImpl {
yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
break; break;
} }
// fall through // fall through
case 70: case 70:
break; break;
case 25: case 25:
@ -1138,7 +1138,7 @@ class WikipediaTokenizerImpl {
yybegin(DOUBLE_BRACE_STATE); /* Break so we don't hit fall-through warning: */ yybegin(DOUBLE_BRACE_STATE); /* Break so we don't hit fall-through warning: */
break; break;
} }
// fall through // fall through
case 71: case 71:
break; break;
case 26: case 26:
@ -1146,7 +1146,7 @@ class WikipediaTokenizerImpl {
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
break; break;
} }
// fall through // fall through
case 72: case 72:
break; break;
case 27: case 27:
@ -1155,7 +1155,7 @@ class WikipediaTokenizerImpl {
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
break; break;
} }
// fall through // fall through
case 73: case 73:
break; break;
case 28: case 28:
@ -1165,7 +1165,7 @@ class WikipediaTokenizerImpl {
yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
break; break;
} }
// fall through // fall through
case 74: case 74:
break; break;
case 29: case 29:
@ -1175,7 +1175,7 @@ class WikipediaTokenizerImpl {
yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
break; break;
} }
// fall through // fall through
case 75: case 75:
break; break;
case 30: case 30:
@ -1183,7 +1183,7 @@ class WikipediaTokenizerImpl {
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
break; break;
} }
// fall through // fall through
case 76: case 76:
break; break;
case 31: case 31:
@ -1193,7 +1193,7 @@ class WikipediaTokenizerImpl {
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
break; /*end italics*/ break; /*end italics*/
} }
// fall through // fall through
case 77: case 77:
break; break;
case 32: case 32:
@ -1204,7 +1204,7 @@ class WikipediaTokenizerImpl {
yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
break; break;
} }
// fall through // fall through
case 78: case 78:
break; break;
case 33: case 33:
@ -1212,7 +1212,7 @@ class WikipediaTokenizerImpl {
positionInc = 1; positionInc = 1;
return NUM; return NUM;
} }
// fall through // fall through
case 79: case 79:
break; break;
case 34: case 34:
@ -1220,7 +1220,7 @@ class WikipediaTokenizerImpl {
positionInc = 1; positionInc = 1;
return COMPANY; return COMPANY;
} }
// fall through // fall through
case 80: case 80:
break; break;
case 35: case 35:
@ -1228,7 +1228,7 @@ class WikipediaTokenizerImpl {
positionInc = 1; positionInc = 1;
return APOSTROPHE; return APOSTROPHE;
} }
// fall through // fall through
case 81: case 81:
break; break;
case 36: case 36:
@ -1236,7 +1236,7 @@ class WikipediaTokenizerImpl {
positionInc = 1; positionInc = 1;
return HOST; return HOST;
} }
// fall through // fall through
case 82: case 82:
break; break;
case 37: case 37:
@ -1245,7 +1245,7 @@ class WikipediaTokenizerImpl {
yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */
break; break;
} }
// fall through // fall through
case 83: case 83:
break; break;
case 38: case 38:
@ -1255,7 +1255,7 @@ class WikipediaTokenizerImpl {
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
break; /*end bold*/ break; /*end bold*/
} }
// fall through // fall through
case 84: case 84:
break; break;
case 39: case 39:
@ -1265,7 +1265,7 @@ class WikipediaTokenizerImpl {
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
break; /*end sub header*/ break; /*end sub header*/
} }
// fall through // fall through
case 85: case 85:
break; break;
case 40: case 40:
@ -1273,7 +1273,7 @@ class WikipediaTokenizerImpl {
positionInc = 1; positionInc = 1;
return ACRONYM; return ACRONYM;
} }
// fall through // fall through
case 86: case 86:
break; break;
case 41: case 41:
@ -1281,7 +1281,7 @@ class WikipediaTokenizerImpl {
positionInc = 1; positionInc = 1;
return EMAIL; return EMAIL;
} }
// fall through // fall through
case 87: case 87:
break; break;
case 42: case 42:
@ -1291,7 +1291,7 @@ class WikipediaTokenizerImpl {
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
break; /*end bold italics*/ break; /*end bold italics*/
} }
// fall through // fall through
case 88: case 88:
break; break;
case 43: case 43:
@ -1301,7 +1301,7 @@ class WikipediaTokenizerImpl {
yybegin(EXTERNAL_LINK_STATE); yybegin(EXTERNAL_LINK_STATE);
return currentTokType; return currentTokType;
} }
// fall through // fall through
case 89: case 89:
break; break;
case 44: case 44:
@ -1312,7 +1312,7 @@ class WikipediaTokenizerImpl {
yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */
break; break;
} }
// fall through // fall through
case 90: case 90:
break; break;
case 45: case 45:
@ -1322,7 +1322,7 @@ class WikipediaTokenizerImpl {
yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */
break; break;
} }
// fall through // fall through
case 91: case 91:
break; break;
case 46: case 46:
@ -1333,7 +1333,7 @@ class WikipediaTokenizerImpl {
yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */
break; break;
} }
// fall through // fall through
case 92: case 92:
break; break;
default: default:

View File

@ -59,6 +59,14 @@ public class TestSpellChecking extends LuceneTestCase {
public void testRepSuggestions() throws Exception { public void testRepSuggestions() throws Exception {
doTest("rep"); doTest("rep");
//noinspection DataFlowIssue
Path aff = Path.of(getClass().getResource("rep.aff").toURI());
Dictionary dictionary = TestAllDictionaries.loadDictionary(aff);
Suggester suggester = new Suggester(dictionary);
assertEquals(List.of("auto's"), suggester.suggestNoTimeout("autos", () -> {}));
assertEquals(
List.of("auto's", "auto"), suggester.proceedPastRep().suggestNoTimeout("autos", () -> {}));
} }
public void testPhSuggestions() throws Exception { public void testPhSuggestions() throws Exception {
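The added assertions above pin down how the Hunspell Suggester treats REP-based corrections: by default a successful REP replacement ("autos" to "auto's") ends the search, while proceedPastRep() returns a suggester that keeps collecting further candidates ("auto"). A condensed, stand-alone sketch of the same calls, assuming the test's package so that TestAllDictionaries.loadDictionary is visible; the resource location is the one the test uses:

    // Load an affix file that declares REP rules and build a suggester from it.
    Path aff = Path.of(getClass().getResource("rep.aff").toURI());
    Dictionary dictionary = TestAllDictionaries.loadDictionary(aff);
    Suggester suggester = new Suggester(dictionary);

    // Default behavior: the REP hit is the only suggestion returned.
    List<String> byDefault = suggester.suggestNoTimeout("autos", () -> {});   // ["auto's"]

    // proceedPastRep() keeps searching after the REP hit and also surfaces "auto".
    List<String> extended =
        suggester.proceedPastRep().suggestNoTimeout("autos", () -> {});       // ["auto's", "auto"]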

View File

@ -245,7 +245,7 @@ public class Diff {
deletes++; deletes++;
x--; x--;
break; break;
// delete // delete
case Y: case Y:
if (deletes != base) { if (deletes != base) {
result.append('D').append(deletes); result.append('D').append(deletes);
@ -258,7 +258,7 @@ public class Diff {
result.append('I'); result.append('I');
result.append(b.charAt(--y)); result.append(b.charAt(--y));
break; break;
// insert // insert
case R: case R:
if (deletes != base) { if (deletes != base) {
result.append('D').append(deletes); result.append('D').append(deletes);
@ -272,7 +272,7 @@ public class Diff {
result.append(b.charAt(--y)); result.append(b.charAt(--y));
x--; x--;
break; break;
// replace // replace
case D: case D:
if (deletes != base) { if (deletes != base) {
result.append('D').append(deletes); result.append('D').append(deletes);

View File

@ -0,0 +1,4 @@
{
"lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/ForUtil.java": "f31797842f047626df6a1a6b97167bec60269fec",
"lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/gen_ForUtil.py": "325f2610974b0e76e278b6445405a098a3763feb"
}

View File

@ -35,6 +35,7 @@ module org.apache.lucene.backward_codecs {
exports org.apache.lucene.backward_codecs.lucene92; exports org.apache.lucene.backward_codecs.lucene92;
exports org.apache.lucene.backward_codecs.lucene94; exports org.apache.lucene.backward_codecs.lucene94;
exports org.apache.lucene.backward_codecs.lucene95; exports org.apache.lucene.backward_codecs.lucene95;
exports org.apache.lucene.backward_codecs.lucene99;
exports org.apache.lucene.backward_codecs.packed; exports org.apache.lucene.backward_codecs.packed;
exports org.apache.lucene.backward_codecs.store; exports org.apache.lucene.backward_codecs.store;
@ -43,7 +44,8 @@ module org.apache.lucene.backward_codecs {
provides org.apache.lucene.codecs.PostingsFormat with provides org.apache.lucene.codecs.PostingsFormat with
org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat, org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat,
org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat, org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat,
org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat; org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat,
org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat;
provides org.apache.lucene.codecs.KnnVectorsFormat with provides org.apache.lucene.codecs.KnnVectorsFormat with
org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat, org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat,
org.apache.lucene.backward_codecs.lucene91.Lucene91HnswVectorsFormat, org.apache.lucene.backward_codecs.lucene91.Lucene91HnswVectorsFormat,
@ -59,5 +61,6 @@ module org.apache.lucene.backward_codecs {
org.apache.lucene.backward_codecs.lucene91.Lucene91Codec, org.apache.lucene.backward_codecs.lucene91.Lucene91Codec,
org.apache.lucene.backward_codecs.lucene92.Lucene92Codec, org.apache.lucene.backward_codecs.lucene92.Lucene92Codec,
org.apache.lucene.backward_codecs.lucene94.Lucene94Codec, org.apache.lucene.backward_codecs.lucene94.Lucene94Codec,
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec; org.apache.lucene.backward_codecs.lucene95.Lucene95Codec,
org.apache.lucene.backward_codecs.lucene99.Lucene99Codec;
} }

View File

@ -88,21 +88,17 @@ public final class FieldReader extends Terms {
(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong() (new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong()
>>> Lucene40BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS; >>> Lucene40BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
// Initialize FST always off-heap. // Initialize FST always off-heap.
final IndexInput clone = indexIn.clone(); final FST.FSTMetadata<BytesRef> fstMetadata;
clone.seek(indexStartFP);
if (metaIn == indexIn) { // Only true before Lucene 8.6 if (metaIn == indexIn) { // Only true before Lucene 8.6
index = final IndexInput clone = indexIn.clone();
new FST<>( clone.seek(indexStartFP);
readMetadata(clone, ByteSequenceOutputs.getSingleton()), fstMetadata = readMetadata(clone, ByteSequenceOutputs.getSingleton());
clone, // FST bytes actually only start after the metadata.
new OffHeapFSTStore()); indexStartFP = clone.getFilePointer();
} else { } else {
index = fstMetadata = readMetadata(metaIn, ByteSequenceOutputs.getSingleton());
new FST<>(
readMetadata(metaIn, ByteSequenceOutputs.getSingleton()),
clone,
new OffHeapFSTStore());
} }
index = FST.fromFSTReader(fstMetadata, new OffHeapFSTStore(indexIn, indexStartFP, fstMetadata));
/* /*
if (false) { if (false) {
final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
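The FieldReader change above replaces the two removed FST constructor calls with a two-step pattern: read the FST metadata first, then hand the on-disk bytes to the FST through an OffHeapFSTStore. A minimal sketch of that pattern using the names from the hunk; the surrounding reader state (indexIn, metaIn, indexStartFP) and the resolution of readMetadata to FST.readMetadata are assumptions:

    final FST.FSTMetadata<BytesRef> fstMetadata;
    if (metaIn == indexIn) { // only true for pre-8.6 segments, where metadata sits in the index file
      IndexInput clone = indexIn.clone();
      clone.seek(indexStartFP);
      fstMetadata = FST.readMetadata(clone, ByteSequenceOutputs.getSingleton());
      indexStartFP = clone.getFilePointer(); // the FST bytes start right after the metadata
    } else {
      fstMetadata = FST.readMetadata(metaIn, ByteSequenceOutputs.getSingleton());
    }
    // The FST body stays off-heap, backed directly by the index input.
    FST<BytesRef> index =
        FST.fromFSTReader(fstMetadata, new OffHeapFSTStore(indexIn, indexStartFP, fstMetadata));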

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.apache.lucene.codecs.lucene99; package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataInput;

View File

@ -16,7 +16,7 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.apache.lucene.codecs.lucene99; package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataInput;

View File

@ -14,12 +14,33 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.apache.lucene.codecs.lucene99; package org.apache.lucene.backward_codecs.lucene99;
import java.util.Objects; import java.util.Objects;
import org.apache.lucene.codecs.*; import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene90.*; import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat;
import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat; import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
@ -98,7 +119,7 @@ public class Lucene99Codec extends Codec {
super("Lucene99"); super("Lucene99");
this.storedFieldsFormat = this.storedFieldsFormat =
new Lucene90StoredFieldsFormat(Objects.requireNonNull(mode).storedMode); new Lucene90StoredFieldsFormat(Objects.requireNonNull(mode).storedMode);
this.defaultPostingsFormat = new Lucene99PostingsFormat(); this.defaultPostingsFormat = new Lucene912PostingsFormat();
this.defaultDVFormat = new Lucene90DocValuesFormat(); this.defaultDVFormat = new Lucene90DocValuesFormat();
this.defaultKnnVectorsFormat = new Lucene99HnswVectorsFormat(); this.defaultKnnVectorsFormat = new Lucene99HnswVectorsFormat();
} }

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.apache.lucene.codecs.lucene99; package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.BlockTermState;
@ -24,7 +24,6 @@ import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.MultiLevelSkipListWriter; import org.apache.lucene.codecs.MultiLevelSkipListWriter;
import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader; import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexOptions;
@ -339,7 +338,7 @@ import org.apache.lucene.util.packed.PackedInts;
* *
* @lucene.experimental * @lucene.experimental
*/ */
public final class Lucene99PostingsFormat extends PostingsFormat { public class Lucene99PostingsFormat extends PostingsFormat {
/** /**
* Filename extension for document number, frequencies, and skip data. See chapter: <a * Filename extension for document number, frequencies, and skip data. See chapter: <a
@ -374,28 +373,9 @@ public final class Lucene99PostingsFormat extends PostingsFormat {
static final int VERSION_START = 0; static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START; static final int VERSION_CURRENT = VERSION_START;
private final int minTermBlockSize;
private final int maxTermBlockSize;
/** Creates {@code Lucene99PostingsFormat} with default settings. */ /** Creates {@code Lucene99PostingsFormat} with default settings. */
public Lucene99PostingsFormat() { public Lucene99PostingsFormat() {
this(
Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
}
/**
* Creates {@code Lucene99PostingsFormat} with custom values for {@code minBlockSize} and {@code
* maxBlockSize} passed to block terms dictionary.
*
* @see
* Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)
*/
public Lucene99PostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
super("Lucene99"); super("Lucene99");
Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize);
this.minTermBlockSize = minTermBlockSize;
this.maxTermBlockSize = maxTermBlockSize;
} }
@Override @Override
@ -405,19 +385,7 @@ public final class Lucene99PostingsFormat extends PostingsFormat {
@Override @Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state); throw new UnsupportedOperationException();
boolean success = false;
try {
FieldsConsumer ret =
new Lucene90BlockTreeTermsWriter(
state, postingsWriter, minTermBlockSize, maxTermBlockSize);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsWriter);
}
}
} }
@Override @Override
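With the move to backward-codecs, fieldsConsumer above now throws UnsupportedOperationException: the format is read-only on main. Write support for tests usually lives in a separate read-write subclass, as the Lucene50RWPostingsFormat hunk further down shows. A hedged sketch of what such a subclass could look like for this format, reusing the code removed above; the class name, its placement in a test source set, and the continued availability of Lucene99PostingsWriter are assumptions:

    import java.io.IOException;
    import org.apache.lucene.codecs.FieldsConsumer;
    import org.apache.lucene.codecs.PostingsWriterBase;
    import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
    import org.apache.lucene.index.SegmentWriteState;
    import org.apache.lucene.util.IOUtils;

    public class Lucene99RWPostingsFormat extends Lucene99PostingsFormat {
      @Override
      public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
        PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
        boolean success = false;
        try {
          FieldsConsumer ret =
              new Lucene90BlockTreeTermsWriter(
                  state,
                  postingsWriter,
                  Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
                  Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
          success = true;
          return ret;
        } finally {
          if (!success) {
            IOUtils.closeWhileHandlingException(postingsWriter);
          }
        }
      }
    }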

View File

@ -14,23 +14,23 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.apache.lucene.codecs.lucene99; package org.apache.lucene.backward_codecs.lucene99;
import static org.apache.lucene.codecs.lucene99.ForUtil.BLOCK_SIZE; import static org.apache.lucene.backward_codecs.lucene99.ForUtil.BLOCK_SIZE;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.DOC_CODEC; import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.DOC_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.MAX_SKIP_LEVELS; import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.MAX_SKIP_LEVELS;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.PAY_CODEC; import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.PAY_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.POS_CODEC; import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.POS_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.TERMS_CODEC; import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.TERMS_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.VERSION_CURRENT; import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.VERSION_CURRENT;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.VERSION_START; import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.VERSION_START;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;
import org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Impacts; import org.apache.lucene.index.Impacts;
import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.index.ImpactsEnum;

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.apache.lucene.codecs.lucene99; package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException; import java.io.IOException;
import java.util.AbstractList; import java.util.AbstractList;

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.apache.lucene.codecs.lucene99; package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;
@ -61,6 +61,7 @@ public class Lucene99SkipReader extends MultiLevelSkipListReader {
private long lastDocPointer; private long lastDocPointer;
private int lastPosBufferUpto; private int lastPosBufferUpto;
/** Sole constructor. */
public Lucene99SkipReader( public Lucene99SkipReader(
IndexInput skipStream, IndexInput skipStream,
int maxSkipLevels, int maxSkipLevels,
@ -98,6 +99,7 @@ public class Lucene99SkipReader extends MultiLevelSkipListReader {
return df % ForUtil.BLOCK_SIZE == 0 ? df - 1 : df; return df % ForUtil.BLOCK_SIZE == 0 ? df - 1 : df;
} }
/** Initialize state. */
public void init( public void init(
long skipPointer, long docBasePointer, long posBasePointer, long payBasePointer, int df) long skipPointer, long docBasePointer, long posBasePointer, long payBasePointer, int df)
throws IOException { throws IOException {
@ -125,22 +127,27 @@ public class Lucene99SkipReader extends MultiLevelSkipListReader {
return lastDocPointer; return lastDocPointer;
} }
/** Returns the pointer in the pos file. */
public long getPosPointer() { public long getPosPointer() {
return lastPosPointer; return lastPosPointer;
} }
/** Return the start offset in the position block. */
public int getPosBufferUpto() { public int getPosBufferUpto() {
return lastPosBufferUpto; return lastPosBufferUpto;
} }
/** Returns the pointer in the pay file. */
public long getPayPointer() { public long getPayPointer() {
return lastPayPointer; return lastPayPointer;
} }
  /** Return the number of bytes in the pay block that belong to docs from the previous block. */
public int getPayloadByteUpto() { public int getPayloadByteUpto() {
return lastPayloadByteUpto; return lastPayloadByteUpto;
} }
  /** Return the next skip doc; no skipping can be performed until this doc. */
public int getNextSkipDoc() { public int getNextSkipDoc() {
return skipDoc[0]; return skipDoc[0];
} }
@ -199,7 +206,7 @@ public class Lucene99SkipReader extends MultiLevelSkipListReader {
return delta; return delta;
} }
// The default impl skips impacts /** Read impacts. The default implementation skips them. */
protected void readImpacts(int level, IndexInput skipStream) throws IOException { protected void readImpacts(int level, IndexInput skipStream) throws IOException {
skipStream.skipBytes(skipStream.readVInt()); skipStream.skipBytes(skipStream.readVInt());
} }

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.apache.lucene.codecs.lucene99; package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;
@ -46,10 +46,10 @@ import org.apache.lucene.store.IndexOutput;
* uptos(position, payload). 4. start offset. * uptos(position, payload). 4. start offset.
*/ */
public final class Lucene99SkipWriter extends MultiLevelSkipListWriter { public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
private int[] lastSkipDoc; private final int[] lastSkipDoc;
private long[] lastSkipDocPointer; private final long[] lastSkipDocPointer;
private long[] lastSkipPosPointer; private final long[] lastSkipPosPointer;
private long[] lastSkipPayPointer; private final long[] lastSkipPayPointer;
private final IndexOutput docOut; private final IndexOutput docOut;
private final IndexOutput posOut; private final IndexOutput posOut;
@ -61,11 +61,12 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
private long curPayPointer; private long curPayPointer;
private int curPosBufferUpto; private int curPosBufferUpto;
private int curPayloadByteUpto; private int curPayloadByteUpto;
private CompetitiveImpactAccumulator[] curCompetitiveFreqNorms; private final CompetitiveImpactAccumulator[] curCompetitiveFreqNorms;
private boolean fieldHasPositions; private boolean fieldHasPositions;
private boolean fieldHasOffsets; private boolean fieldHasOffsets;
private boolean fieldHasPayloads; private boolean fieldHasPayloads;
/** Sole constructor. */
public Lucene99SkipWriter( public Lucene99SkipWriter(
int maxSkipLevels, int maxSkipLevels,
int blockSize, int blockSize,
@ -84,7 +85,12 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
lastSkipPosPointer = new long[maxSkipLevels]; lastSkipPosPointer = new long[maxSkipLevels];
if (payOut != null) { if (payOut != null) {
lastSkipPayPointer = new long[maxSkipLevels]; lastSkipPayPointer = new long[maxSkipLevels];
} else {
lastSkipPayPointer = null;
} }
} else {
lastSkipPosPointer = null;
lastSkipPayPointer = null;
} }
curCompetitiveFreqNorms = new CompetitiveImpactAccumulator[maxSkipLevels]; curCompetitiveFreqNorms = new CompetitiveImpactAccumulator[maxSkipLevels];
for (int i = 0; i < maxSkipLevels; ++i) { for (int i = 0; i < maxSkipLevels; ++i) {
@ -92,6 +98,7 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
} }
} }
/** Reset state for the given index options. */
public void setField( public void setField(
boolean fieldHasPositions, boolean fieldHasOffsets, boolean fieldHasPayloads) { boolean fieldHasPositions, boolean fieldHasOffsets, boolean fieldHasPayloads) {
this.fieldHasPositions = fieldHasPositions; this.fieldHasPositions = fieldHasPositions;
@ -211,6 +218,7 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
competitiveFreqNorms.clear(); competitiveFreqNorms.clear();
} }
/** Write impacts to the given output. */
public static void writeImpacts(CompetitiveImpactAccumulator acc, DataOutput out) public static void writeImpacts(CompetitiveImpactAccumulator acc, DataOutput out)
throws IOException { throws IOException {
Collection<Impact> impacts = acc.getCompetitiveFreqNormPairs(); Collection<Impact> impacts = acc.getCompetitiveFreqNormPairs();
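The SkipWriter hunk above makes the per-level pointer arrays final, which is why the constructor gains explicit else branches that set the unused arrays to null: a final field must be definitely assigned on every constructor path. A tiny, self-contained illustration of that rule with made-up names:

    class SkipPointers {
      private final long[] posPointers;
      private final long[] payPointers;

      SkipPointers(boolean hasPositions, boolean hasPayloadsOrOffsets, int maxSkipLevels) {
        if (hasPositions) {
          posPointers = new long[maxSkipLevels];
          // Without the null alternative the compiler rejects the constructor:
          // "variable payPointers might not have been initialized".
          payPointers = hasPayloadsOrOffsets ? new long[maxSkipLevels] : null;
        } else {
          posPointers = null;
          payPointers = null;
        }
      }
    }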

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.apache.lucene.codecs.lucene99; package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.apache.lucene.codecs.lucene99; package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexInput;

View File

@ -40,7 +40,7 @@ HEADER = """// This file has been automatically generated, DO NOT EDIT
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.apache.lucene.codecs.lucene99; package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataInput;

View File

@ -0,0 +1,428 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Lucene 9.9 file format.
*
* <h2>Apache Lucene - Index File Formats</h2>
*
* <div>
*
* <ul>
* <li><a href="#Introduction">Introduction</a>
* <li><a href="#Definitions">Definitions</a>
* <ul>
* <li><a href="#Inverted_Indexing">Inverted Indexing</a>
* <li><a href="#Types_of_Fields">Types of Fields</a>
* <li><a href="#Segments">Segments</a>
* <li><a href="#Document_Numbers">Document Numbers</a>
* </ul>
* <li><a href="#Overview">Index Structure Overview</a>
* <li><a href="#File_Naming">File Naming</a>
* <li><a href="#file-names">Summary of File Extensions</a>
* <ul>
* <li><a href="#Lock_File">Lock File</a>
* <li><a href="#History">History</a>
* <li><a href="#Limitations">Limitations</a>
* </ul>
* </ul>
*
* </div> <a id="Introduction"></a>
*
* <h3>Introduction</h3>
*
* <div>
*
* <p>This document defines the index file formats used in this version of Lucene. If you are using
* a different version of Lucene, please consult the copy of <code>docs/</code> that was distributed
* with the version you are using.
*
* <p>This document attempts to provide a high-level definition of the Apache Lucene file formats.
* </div> <a id="Definitions"></a>
*
* <h3>Definitions</h3>
*
* <div>
*
* <p>The fundamental concepts in Lucene are index, document, field and term.
*
* <p>An index contains a sequence of documents.
*
* <ul>
* <li>A document is a sequence of fields.
* <li>A field is a named sequence of terms.
* <li>A term is a sequence of bytes.
* </ul>
*
* <p>The same sequence of bytes in two different fields is considered a different term. Thus terms
* are represented as a pair: the string naming the field, and the bytes within the field. <a
* id="Inverted_Indexing"></a>
*
* <h4>Inverted Indexing</h4>
*
* <p>Lucene's index stores terms and statistics about those terms in order to make term-based
* search more efficient. Lucene's terms index falls into the family of indexes known as an
* <i>inverted index.</i> This is because it can list, for a term, the documents that contain it.
* This is the inverse of the natural relationship, in which documents list terms. <a
* id="Types_of_Fields"></a>
*
* <h4>Types of Fields</h4>
*
* <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored in the index
* literally, in a non-inverted manner. Fields that are inverted are called <i>indexed</i>. A field
* may be both stored and indexed.
*
* <p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the text of a field
* may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is
* useful for certain identifier fields to be indexed literally.
*
* <p>See the {@link org.apache.lucene.document.Field Field} java docs for more information on
* Fields. <a id="Segments"></a>
*
* <h4>Segments</h4>
*
* <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>. Each segment is a
* fully independent index, which could be searched separately. Indexes evolve by:
*
* <ol>
* <li>Creating new segments for newly added documents.
* <li>Merging existing segments.
* </ol>
*
* <p>Searches may involve multiple segments and/or multiple indexes, each index potentially
* composed of a set of segments. <a id="Document_Numbers"></a>
*
* <h4>Document Numbers</h4>
*
* <p>Internally, Lucene refers to documents by an integer <i>document number</i>. The first
* document added to an index is numbered zero, and each subsequent document added gets a number one
* greater than the previous.
*
* <p>Note that a document's number may change, so caution should be taken when storing these
* numbers outside of Lucene. In particular, numbers may change in the following situations:
*
* <ul>
* <li>
* <p>The numbers stored in each segment are unique only within the segment, and must be
* converted before they can be used in a larger context. The standard technique is to
* allocate each segment a range of values, based on the range of numbers used in that
* segment. To convert a document number from a segment to an external value, the segment's
* <i>base</i> document number is added. To convert an external value back to a
* segment-specific value, the segment is identified by the range that the external value is
* in, and the segment's base value is subtracted. For example two five document segments
* might be combined, so that the first segment has a base value of zero, and the second of
* five. Document three from the second segment would have an external value of eight.
* <li>
* <p>When documents are deleted, gaps are created in the numbering. These are eventually
* removed as the index evolves through merging. Deleted documents are dropped when segments
* are merged. A freshly-merged segment thus has no gaps in its numbering.
* </ul>
*
* </div> <a id="Overview"></a>
*
* <h3>Index Structure Overview</h3>
*
* <div>
*
* <p>Each segment index maintains the following:
*
* <ul>
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment info}. This
* contains metadata about a segment, such as the number of documents, what files it uses, and
* information about how the segment is sorted
* <li>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This
* contains metadata about the set of named fields used in the index.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}.
* This contains, for each document, a list of attribute-value pairs, where the attributes are
* field names. These are used to store auxiliary information about the document, such as its
* title, url, or an identifier to access a database. The set of stored fields are what is
* returned for each hit when searching. This is keyed by document number.
* <li>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term dictionary}.
* A dictionary containing all of the terms used in all of the indexed fields of all of the
* documents. The dictionary also contains the number of documents which contain the term, and
* pointers to the term's frequency and proximity data.
* <li>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term Frequency
* data}. For each term in the dictionary, the numbers of all the documents that contain that
* term, and the frequency of the term in that document, unless frequencies are omitted
* ({@link org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
* <li>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term Proximity
* data}. For each term in the dictionary, the positions that the term occurs in each
* document. Note that this will not exist if all fields in all documents omit position data.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For
* each field in each document, a value is stored that is multiplied into the score for hits
* on that field.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each
* field in each document, the term vector (sometimes called document vector) may be stored. A
* term vector consists of term text and term frequency. To add Term Vectors to your index see
* the {@link org.apache.lucene.document.Field Field} constructors
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like
* stored values, these are also keyed by document number, but are generally intended to be
* loaded into main memory for fast access. Whereas stored values are generally intended for
* summary results from searches, per-document values are useful for things like scoring
* factors.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An
* optional file indicating which documents are live.
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair
* of files, recording dimensionally indexed fields, to enable fast numeric range filtering
* and large numeric values like BigInteger and BigDecimal (1D) and geographic shape
* intersection (2D, 3D).
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}. The
* vector format stores numeric vectors in a format optimized for random access and
* computation, supporting high-dimensional nearest-neighbor search.
* </ul>
*
* <p>Details on each of these are provided in their linked pages. </div> <a id="File_Naming"></a>
*
* <h3>File Naming</h3>
*
* <div>
*
* <p>All files belonging to a segment have the same name with varying extensions. The extensions
* correspond to the different file formats described below. When using the Compound File format
* (default for small segments) these files (except for the Segment info file, the Lock file, and
* Deleted documents file) are collapsed into a single .cfs file (see below for details)
*
* <p>Typically, all segments in an index are stored in a single directory, although this is not
* required.
*
* <p>File names are never re-used. That is, when any file is saved to the Directory it is given a
* never before used filename. This is achieved using a simple generations approach. For example,
* the first segments file is segments_1, then segments_2, etc. The generation is a sequential long
* integer represented in alpha-numeric (base 36) form. </div> <a id="file-names"></a>
*
* <h3>Summary of File Extensions</h3>
*
* <div>
*
* <p>The following table summarizes the names and extensions of the files in Lucene:
*
* <table class="padding4" style="border-spacing: 1px; border-collapse: separate">
* <caption>lucene filenames by extension</caption>
* <tr>
* <th>Name</th>
* <th>Extension</th>
* <th>Brief Description</th>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
* <td>segments_N</td>
* <td>Stores information about a commit point</td>
* </tr>
* <tr>
* <td><a href="#Lock_File">Lock File</a></td>
* <td>write.lock</td>
* <td>The Write lock prevents multiple IndexWriters from writing to the same
* file.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment Info}</td>
* <td>.si</td>
* <td>Stores metadata about a segment</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File}</td>
* <td>.cfs, .cfe</td>
* <td>An optional "virtual" file consisting of all the other index files for
* systems that frequently run out of file handles.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields}</td>
* <td>.fnm</td>
* <td>Stores information about the fields</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index}</td>
* <td>.fdx</td>
* <td>Contains pointers to field data</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data}</td>
* <td>.fdt</td>
* <td>The stored fields for documents</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term Dictionary}</td>
* <td>.tim</td>
* <td>The term dictionary, stores term info</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term Index}</td>
* <td>.tip</td>
* <td>The index into the Term Dictionary</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Frequencies}</td>
* <td>.doc</td>
* <td>Contains the list of docs which contain each term along with frequency</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Positions}</td>
* <td>.pos</td>
* <td>Stores position information about where a term occurs in the index</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Payloads}</td>
* <td>.pay</td>
* <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms}</td>
* <td>.nvd, .nvm</td>
* <td>Encodes length and boost factors for docs and fields</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values}</td>
* <td>.dvd, .dvm</td>
* <td>Encodes additional scoring factors or other per-document information.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}</td>
* <td>.tvx</td>
* <td>Stores offset into the document data file</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}</td>
* <td>.tvd</td>
* <td>Contains term vector data.</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents}</td>
* <td>.liv</td>
* <td>Info about what documents are live</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}</td>
* <td>.dii, .dim</td>
* <td>Holds indexed points</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}</td>
 *       <td>.vec, .vem, .veq, .vex</td>
* <td>Holds indexed vectors; <code>.vec</code> files contain the raw vector data,
* <code>.vem</code> the vector metadata, <code>.veq</code> the quantized vector data, and <code>.vex</code> the
* hnsw graph data.</td>
* </tr>
* </table>
*
* </div> <a id="Lock_File"></a>
*
* <h3>Lock File</h3>
*
* The write lock, which is stored in the index directory by default, is named "write.lock". If the
* lock directory is different from the index directory then the write lock will be named
* "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index
* directory. When this file is present, a writer is currently modifying the index (adding or
* removing documents). This lock file ensures that only one writer is modifying the index at a
* time. <a id="History"></a>
*
* <h3>History</h3>
*
* <p>Compatibility notes are provided in this document, describing how file formats have changed
* from prior versions:
*
* <ul>
* <li>In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit
* lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching
* or adding/deleting of docs. When the new segments file is saved (committed), it will be
* written in the new file format (meaning no specific "upgrade" process is needed). But note
* that once a commit has occurred, pre-2.1 Lucene will not be able to read the index.
* <li>In version 2.3, the file format was changed to allow segments to share a single set of doc
* store (vectors &amp; stored fields) files. This allows for faster indexing in certain
* cases. The change is fully backwards compatible (in the same way as the lock-less commits
* change in 2.1).
* <li>In version 2.4, Strings are now written as true UTF-8 byte sequence, not Java's modified
* UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">LUCENE-510</a> for
* details.
* <li>In version 2.9, an optional opaque Map&lt;String,String&gt; CommitUserData may be passed to
* IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N
* file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">LUCENE-1382</a> for
* details. Also, diagnostics were added to each segment written recording details about why
* it was written (due to flush, merge; which OS/JRE was used; etc.). See issue <a
* href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.
* <li>In version 3.0, compressed fields are no longer written to the index (they can still be
* read, but on merge the new segment will write them, uncompressed). See issue <a
* href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a> for details.
* <li>In version 3.1, segments records the code version that created them. See <a
* href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
* Additionally segments track explicitly whether or not they have term vectors. See <a
* href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a> for details.
 * <li>In version 3.2, numeric fields are written natively to the stored fields file; previously
* they were stored in text format only.
* <li>In version 3.4, fields can omit position data while still indexing term frequencies.
* <li>In version 4.0, the format of the inverted index became extensible via the {@link
* org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues})
* was introduced. Normalization factors need no longer be a single byte, they can be any
* {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be
* unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into
* the postings lists. Payloads can be stored in the term vectors.
* <li>In version 4.1, the format of the postings list changed to use either of FOR compression or
* variable-byte encoding, depending upon the frequency of the term. Terms appearing only once
* were changed to inline directly into the term dictionary. Stored fields are compressed by
* default.
* <li>In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued
 *       type (SortedSet) that can be used for faceting/grouping/joining on multi-valued fields.
* <li>In version 4.5, DocValues were extended to explicitly represent missing values.
* <li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
* allow updating NumericDocValues fields.
* <li>In version 4.8, checksum footers were added to the end of each index file for improved data
* integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32
* checksum of the file.
* <li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is
* suitable for faceting/sorting/analytics.
* <li>In version 5.4, DocValues have been improved to store more information on disk: addresses
* for binary fields and ord indexes for multi-valued fields.
* <li>In version 6.0, Points were added, for multi-dimensional range/distance search.
* <li>In version 6.2, new Segment info format that reads/writes the index sort, to support index
* sorting.
* <li>In version 7.0, DocValues have been improved to better support sparse doc values thanks to
* an iterator API.
* <li>In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term
* freq, normalization factor) pairs that may trigger the maximum score of the block. This
* information is recorded alongside skip data in order to be able to skip blocks of doc ids
 *       if they may not produce high enough scores. Additionally doc values and norms have been
* extended with jump-tables to make access O(1) instead of O(n), where n is the number of
* elements to skip when advancing in the data.
 * <li>In version 8.4, postings, positions, offsets and payload lengths have moved to a more
* performant encoding that is vectorized.
* <li>In version 8.6, index sort serialization is delegated to the sorts themselves, to allow
 *       user-defined sorts to be used.
* <li>In version 8.7, stored fields compression became adaptive to better handle documents with
* smaller stored fields.
* <li>In version 9.0, vector-valued fields were added.
* <li>In version 9.1, vector-valued fields were modified to add a graph hierarchy.
* <li>In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by
 *       IndexDISI. ordToDoc mappings were added to .vem.
* <li>In version 9.5, HNSW graph connections were changed to be delta-encoded with vints.
* Additionally, metadata file size improvements were made by delta-encoding nodes by graph
* layer and not writing the node ids for the zeroth layer.
 * <li>In version 9.9, vector scalar quantization support was added, allowing the HNSW vector
* format to utilize int8 quantized vectors for float32 vector search.
* </ul>
*
* <a id="Limitations"></a>
*
* <h3>Limitations</h3>
*
* <div>
*
* <p>Lucene uses a Java <code>int</code> to refer to document numbers, and the index file format
* uses an <code>Int32</code> on-disk to store document numbers. This is a limitation of both the
* index file format and the current implementation. Eventually these should be replaced with either
* <code>UInt64</code> values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
* VInt} values which have no limit. </div>
*/
package org.apache.lucene.backward_codecs.lucene99;
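The "Document Numbers" section of the javadoc above describes segment-local numbering in prose; the arithmetic it sketches, using its own two-segment example, is just base addition and subtraction (variable names are illustrative):

    // Two five-document segments: the first gets base 0, the second base 5.
    int[] segmentBases = {0, 5};

    // Segment-local doc 3 in the second segment maps to external doc number 8.
    int external = segmentBases[1] + 3;

    // Converting back: pick the segment whose range contains the external number,
    // then subtract its base.
    int segment = (external < segmentBases[1]) ? 0 : 1;
    int local = external - segmentBases[segment]; // 8 - 5 = 3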

View File

@ -22,3 +22,4 @@ org.apache.lucene.backward_codecs.lucene91.Lucene91Codec
org.apache.lucene.backward_codecs.lucene92.Lucene92Codec org.apache.lucene.backward_codecs.lucene92.Lucene92Codec
org.apache.lucene.backward_codecs.lucene94.Lucene94Codec org.apache.lucene.backward_codecs.lucene94.Lucene94Codec
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec org.apache.lucene.backward_codecs.lucene95.Lucene95Codec
org.apache.lucene.backward_codecs.lucene99.Lucene99Codec

View File

@ -16,3 +16,4 @@
org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat
org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat
org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat
org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat
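The two service files above keep the moved classes discoverable through Java's ServiceLoader-based SPI, so existing indices can still resolve them by name. A minimal sanity-check sketch, not part of this change:

    import org.apache.lucene.codecs.Codec;
    import org.apache.lucene.codecs.PostingsFormat;

    public class CheckBackwardSpi {
      public static void main(String[] args) {
        // Both lookups read the META-INF/services entries registered above.
        Codec codec = Codec.forName("Lucene99");
        PostingsFormat postings = PostingsFormat.forName("Lucene99");
        System.out.println(codec + " / " + postings);
      }
    }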

View File

@ -17,7 +17,7 @@
package org.apache.lucene.backward_codecs.lucene50; package org.apache.lucene.backward_codecs.lucene50;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.backward_codecs.lucene40.blocktree.Lucene40BlockTreeTermsWriter; import org.apache.lucene.backward_codecs.lucene40.blocktree.Lucene40BlockTreeTermsWriterV5;
import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase; import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentWriteState;
@ -31,11 +31,11 @@ public class Lucene50RWPostingsFormat extends Lucene50PostingsFormat {
boolean success = false; boolean success = false;
try { try {
FieldsConsumer ret = FieldsConsumer ret =
new Lucene40BlockTreeTermsWriter( new Lucene40BlockTreeTermsWriterV5(
state, state,
postingsWriter, postingsWriter,
Lucene40BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, Lucene40BlockTreeTermsWriterV5.DEFAULT_MIN_BLOCK_SIZE,
Lucene40BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE); Lucene40BlockTreeTermsWriterV5.DEFAULT_MAX_BLOCK_SIZE);
success = true; success = true;
return ret; return ret;
} finally { } finally {

View File

@ -642,13 +642,13 @@ public class BKDWriter60 implements Closeable {
throws IOException { throws IOException {
assert docMaps == null || readers.size() == docMaps.size(); assert docMaps == null || readers.size() == docMaps.size();
BKDMergeQueue queue = new BKDMergeQueue(config.bytesPerDim, readers.size()); BKDMergeQueue queue = new BKDMergeQueue(config.bytesPerDim(), readers.size());
for (int i = 0; i < readers.size(); i++) { for (int i = 0; i < readers.size(); i++) {
PointValues pointValues = readers.get(i); PointValues pointValues = readers.get(i);
assert pointValues.getNumDimensions() == config.numDims assert pointValues.getNumDimensions() == config.numDims()
&& pointValues.getBytesPerDimension() == config.bytesPerDim && pointValues.getBytesPerDimension() == config.bytesPerDim()
&& pointValues.getNumIndexDimensions() == config.numIndexDims; && pointValues.getNumIndexDimensions() == config.numIndexDims();
MergeState.DocMap docMap; MergeState.DocMap docMap;
if (docMaps == null) { if (docMaps == null) {
docMap = null; docMap = null;

View File

@ -23,12 +23,11 @@ import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
import org.apache.lucene.backward_codecs.lucene90.Lucene90ScoreSkipReader.MutableImpactList; import org.apache.lucene.backward_codecs.lucene90.Lucene90ScoreSkipReader.MutableImpactList;
import org.apache.lucene.backward_codecs.lucene99.Lucene99SkipWriter;
import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompetitiveImpactAccumulator; import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
import org.apache.lucene.codecs.lucene90.blocktree.FieldReader; import org.apache.lucene.codecs.lucene90.blocktree.FieldReader;
import org.apache.lucene.codecs.lucene90.blocktree.Stats; import org.apache.lucene.codecs.lucene90.blocktree.Stats;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99SkipWriter;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.DirectoryReader;
@ -77,22 +76,6 @@ public class TestLucene90PostingsFormat extends BasePostingsFormatTestCase {
d.close(); d.close();
} }
private void shouldFail(int minItemsInBlock, int maxItemsInBlock) {
expectThrows(
IllegalArgumentException.class,
() -> {
new Lucene99PostingsFormat(minItemsInBlock, maxItemsInBlock);
});
}
public void testInvalidBlockSizes() throws Exception {
shouldFail(0, 0);
shouldFail(10, 8);
shouldFail(-1, 10);
shouldFail(10, -1);
shouldFail(10, 12);
}
public void testImpactSerialization() throws IOException { public void testImpactSerialization() throws IOException {
// omit norms and omit freqs // omit norms and omit freqs
doTestImpactSerialization(Collections.singletonList(new Impact(1, 1L))); doTestImpactSerialization(Collections.singletonList(new Impact(1, 1L)));

View File

@ -388,10 +388,14 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
// write the vector data to a temporary file // write the vector data to a temporary file
DocsWithFieldSet docsWithField = DocsWithFieldSet docsWithField =
switch (fieldInfo.getVectorEncoding()) { switch (fieldInfo.getVectorEncoding()) {
case BYTE -> writeByteVectorData( case BYTE ->
tempVectorData, MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState)); writeByteVectorData(
case FLOAT32 -> writeVectorData( tempVectorData,
tempVectorData, MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState)); MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
case FLOAT32 ->
writeVectorData(
tempVectorData,
MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
}; };
CodecUtil.writeFooter(tempVectorData); CodecUtil.writeFooter(tempVectorData);
IOUtils.close(tempVectorData); IOUtils.close(tempVectorData);
@ -638,18 +642,20 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
throws IOException { throws IOException {
int dim = fieldInfo.getVectorDimension(); int dim = fieldInfo.getVectorDimension();
return switch (fieldInfo.getVectorEncoding()) { return switch (fieldInfo.getVectorEncoding()) {
case BYTE -> new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) { case BYTE ->
@Override new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
public byte[] copyValue(byte[] value) { @Override
return ArrayUtil.copyOfSubArray(value, 0, dim); public byte[] copyValue(byte[] value) {
} return ArrayUtil.copyOfSubArray(value, 0, dim);
}; }
case FLOAT32 -> new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) { };
@Override case FLOAT32 ->
public float[] copyValue(float[] value) { new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
return ArrayUtil.copyOfSubArray(value, 0, dim); @Override
} public float[] copyValue(float[] value) {
}; return ArrayUtil.copyOfSubArray(value, 0, dim);
}
};
}; };
} }
@ -663,12 +669,14 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer(); DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer();
RandomVectorScorerSupplier scorerSupplier = RandomVectorScorerSupplier scorerSupplier =
switch (fieldInfo.getVectorEncoding()) { switch (fieldInfo.getVectorEncoding()) {
case BYTE -> defaultFlatVectorScorer.getRandomVectorScorerSupplier( case BYTE ->
fieldInfo.getVectorSimilarityFunction(), defaultFlatVectorScorer.getRandomVectorScorerSupplier(
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim)); fieldInfo.getVectorSimilarityFunction(),
case FLOAT32 -> defaultFlatVectorScorer.getRandomVectorScorerSupplier( RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
fieldInfo.getVectorSimilarityFunction(), case FLOAT32 ->
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim)); defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
}; };
hnswGraphBuilder = hnswGraphBuilder =
HnswGraphBuilder.create(scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed); HnswGraphBuilder.create(scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed);
@ -693,9 +701,9 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
lastDocID = docID; lastDocID = docID;
} }
OnHeapHnswGraph getGraph() { OnHeapHnswGraph getGraph() throws IOException {
if (vectors.size() > 0) { if (vectors.size() > 0) {
return hnswGraphBuilder.getGraph(); return hnswGraphBuilder.getCompletedGraph();
} else { } else {
return null; return null;
} }

View File

@ -414,10 +414,14 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
// write the vector data to a temporary file // write the vector data to a temporary file
DocsWithFieldSet docsWithField = DocsWithFieldSet docsWithField =
switch (fieldInfo.getVectorEncoding()) { switch (fieldInfo.getVectorEncoding()) {
case BYTE -> writeByteVectorData( case BYTE ->
tempVectorData, MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState)); writeByteVectorData(
case FLOAT32 -> writeVectorData( tempVectorData,
tempVectorData, MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState)); MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
case FLOAT32 ->
writeVectorData(
tempVectorData,
MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
}; };
CodecUtil.writeFooter(tempVectorData); CodecUtil.writeFooter(tempVectorData);
IOUtils.close(tempVectorData); IOUtils.close(tempVectorData);
@ -477,10 +481,12 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
} }
DocIdSetIterator mergedVectorIterator = null; DocIdSetIterator mergedVectorIterator = null;
switch (fieldInfo.getVectorEncoding()) { switch (fieldInfo.getVectorEncoding()) {
case BYTE -> mergedVectorIterator = case BYTE ->
KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState); mergedVectorIterator =
case FLOAT32 -> mergedVectorIterator = KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState);
KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState); case FLOAT32 ->
mergedVectorIterator =
KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState);
} }
graph = graph =
merger.merge( merger.merge(
@ -680,18 +686,20 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
throws IOException { throws IOException {
int dim = fieldInfo.getVectorDimension(); int dim = fieldInfo.getVectorDimension();
return switch (fieldInfo.getVectorEncoding()) { return switch (fieldInfo.getVectorEncoding()) {
case BYTE -> new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) { case BYTE ->
@Override new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
public byte[] copyValue(byte[] value) { @Override
return ArrayUtil.copyOfSubArray(value, 0, dim); public byte[] copyValue(byte[] value) {
} return ArrayUtil.copyOfSubArray(value, 0, dim);
}; }
case FLOAT32 -> new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) { };
@Override case FLOAT32 ->
public float[] copyValue(float[] value) { new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
return ArrayUtil.copyOfSubArray(value, 0, dim); @Override
} public float[] copyValue(float[] value) {
}; return ArrayUtil.copyOfSubArray(value, 0, dim);
}
};
}; };
} }
@ -704,12 +712,14 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
vectors = new ArrayList<>(); vectors = new ArrayList<>();
RandomVectorScorerSupplier scorerSupplier = RandomVectorScorerSupplier scorerSupplier =
switch (fieldInfo.getVectorEncoding()) { switch (fieldInfo.getVectorEncoding()) {
case BYTE -> defaultFlatVectorScorer.getRandomVectorScorerSupplier( case BYTE ->
fieldInfo.getVectorSimilarityFunction(), defaultFlatVectorScorer.getRandomVectorScorerSupplier(
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim)); fieldInfo.getVectorSimilarityFunction(),
case FLOAT32 -> defaultFlatVectorScorer.getRandomVectorScorerSupplier( RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
fieldInfo.getVectorSimilarityFunction(), case FLOAT32 ->
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim)); defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
}; };
hnswGraphBuilder = hnswGraphBuilder =
HnswGraphBuilder.create(scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed); HnswGraphBuilder.create(scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed);
@ -732,9 +742,9 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
lastDocID = docID; lastDocID = docID;
} }
OnHeapHnswGraph getGraph() { OnHeapHnswGraph getGraph() throws IOException {
if (vectors.size() > 0) { if (vectors.size() > 0) {
return hnswGraphBuilder.getGraph(); return hnswGraphBuilder.getCompletedGraph();
} else { } else {
return null; return null;
} }

View File

@ -14,22 +14,22 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.apache.lucene.codecs.lucene99; package org.apache.lucene.backward_codecs.lucene99;
import static org.apache.lucene.codecs.lucene99.ForUtil.BLOCK_SIZE; import static org.apache.lucene.backward_codecs.lucene99.ForUtil.BLOCK_SIZE;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.DOC_CODEC; import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.DOC_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.MAX_SKIP_LEVELS; import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.MAX_SKIP_LEVELS;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.PAY_CODEC; import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.PAY_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.POS_CODEC; import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.POS_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.TERMS_CODEC; import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.TERMS_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.VERSION_CURRENT; import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.VERSION_CURRENT;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.CompetitiveImpactAccumulator; import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
import org.apache.lucene.codecs.PushPostingsWriterBase; import org.apache.lucene.codecs.PushPostingsWriterBase;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.IndexFileNames;

View File

@ -0,0 +1,68 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
public class Lucene99RWPostingsFormat extends Lucene99PostingsFormat {
private final int minTermBlockSize;
private final int maxTermBlockSize;
/** Creates {@code Lucene99PostingsFormat} with default settings. */
public Lucene99RWPostingsFormat() {
this(
Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
}
/**
* Creates {@code Lucene99PostingsFormat} with custom values for {@code minBlockSize} and {@code
* maxBlockSize} passed to block terms dictionary.
*
* @see
* Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)
*/
public Lucene99RWPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
super();
Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize);
this.minTermBlockSize = minTermBlockSize;
this.maxTermBlockSize = maxTermBlockSize;
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
boolean success = false;
try {
FieldsConsumer ret =
new Lucene90BlockTreeTermsWriter(
state, postingsWriter, minTermBlockSize, maxTermBlockSize);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsWriter);
}
}
}
}

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.apache.lucene.codecs.lucene99; package org.apache.lucene.backward_codecs.lucene99;
import com.carrotsearch.randomizedtesting.generators.RandomNumbers; import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import java.io.IOException; import java.io.IOException;

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.apache.lucene.codecs.lucene99; package org.apache.lucene.backward_codecs.lucene99;
import com.carrotsearch.randomizedtesting.generators.RandomNumbers; import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import java.io.IOException; import java.io.IOException;

View File

@ -19,7 +19,6 @@ package org.apache.lucene.backward_codecs.lucene99;
import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase;
public class TestLucene99HnswScalarQuantizedVectorsFormat extends BaseKnnVectorsFormatTestCase { public class TestLucene99HnswScalarQuantizedVectorsFormat extends BaseKnnVectorsFormatTestCase {

View File

@ -14,22 +14,26 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.apache.lucene.codecs.lucene99; package org.apache.lucene.backward_codecs.lucene99;
import static org.apache.lucene.codecs.lucene99.Lucene99ScoreSkipReader.readImpacts; import static org.apache.lucene.backward_codecs.lucene99.Lucene99ScoreSkipReader.readImpacts;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
import org.apache.lucene.backward_codecs.lucene99.Lucene99ScoreSkipReader.MutableImpactList;
import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompetitiveImpactAccumulator; import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
import org.apache.lucene.codecs.lucene90.blocktree.FieldReader; import org.apache.lucene.codecs.lucene90.blocktree.FieldReader;
import org.apache.lucene.codecs.lucene90.blocktree.Stats; import org.apache.lucene.codecs.lucene90.blocktree.Stats;
import org.apache.lucene.codecs.lucene99.Lucene99ScoreSkipReader.MutableImpactList;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.index.*; import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Impact;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IOContext;
@ -41,7 +45,7 @@ import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
public class TestLucene99PostingsFormat extends BasePostingsFormatTestCase { public class TestLucene99PostingsFormat extends BasePostingsFormatTestCase {
private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene99PostingsFormat()); private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene99RWPostingsFormat());
@Override @Override
protected Codec getCodec() { protected Codec getCodec() {
@ -77,7 +81,7 @@ public class TestLucene99PostingsFormat extends BasePostingsFormatTestCase {
expectThrows( expectThrows(
IllegalArgumentException.class, IllegalArgumentException.class,
() -> { () -> {
new Lucene99PostingsFormat(minItemsInBlock, maxItemsInBlock); new Lucene99RWPostingsFormat(minItemsInBlock, maxItemsInBlock);
}); });
} }

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.apache.lucene.codecs.lucene99; package org.apache.lucene.backward_codecs.lucene99;
import com.carrotsearch.randomizedtesting.generators.RandomNumbers; import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import java.io.IOException; import java.io.IOException;

View File

@ -0,0 +1,49 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.backward_codecs.lucene99;
import java.io.IOException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.tests.util.LuceneTestCase;
public class TestPostingsUtil extends LuceneTestCase {
// checks for bug described in https://github.com/apache/lucene/issues/13373
public void testIntegerOverflow() throws IOException {
final int size = random().nextInt(1, ForUtil.BLOCK_SIZE);
final long[] docDeltaBuffer = new long[size];
final long[] freqBuffer = new long[size];
final int delta = 1 << 30;
docDeltaBuffer[0] = delta;
try (Directory dir = newDirectory()) {
try (IndexOutput out = dir.createOutput("test", IOContext.DEFAULT)) {
// In the old implementation, this would cause an integer overflow exception.
PostingsUtil.writeVIntBlock(out, docDeltaBuffer, freqBuffer, size, true);
}
long[] restoredDocs = new long[size];
long[] restoredFreqs = new long[size];
try (IndexInput in = dir.openInput("test", IOContext.DEFAULT)) {
PostingsUtil.readVIntBlock(in, restoredDocs, restoredFreqs, size, true, true);
}
assertEquals(delta, restoredDocs[0]);
}
}
}
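As background for the 1 << 30 delta used above (an inference from the linked issue, not something this change states): once a delta is shifted left one bit to pack a flag alongside it, a value of 1 << 30 no longer fits in an int, so int arithmetic wraps negative while long arithmetic keeps the value intact. A minimal sketch:

// Hypothetical illustration of the overflow mode guarded against above.
public class OverflowSketch {
  public static void main(String[] args) {
    long docDelta = 1L << 30;
    int packedAsInt = (int) docDelta << 1; // wraps to Integer.MIN_VALUE (negative)
    long packedAsLong = docDelta << 1;     // 2_147_483_648, still positive in a long
    System.out.println(packedAsInt + " vs " + packedAsLong);
  }
}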

View File

@ -196,6 +196,7 @@ public class TestAncientIndicesCompatibility extends LuceneTestCase {
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024); ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
CheckIndex checker = new CheckIndex(dir); CheckIndex checker = new CheckIndex(dir);
checker.setInfoStream(new PrintStream(bos, false, UTF_8)); checker.setInfoStream(new PrintStream(bos, false, UTF_8));
checker.setLevel(CheckIndex.Level.MIN_LEVEL_FOR_INTEGRITY_CHECKS);
CheckIndex.Status indexStatus = checker.checkIndex(); CheckIndex.Status indexStatus = checker.checkIndex();
if (version.startsWith("8.")) { if (version.startsWith("8.")) {
assertTrue(indexStatus.clean); assertTrue(indexStatus.clean);

View File

@ -20,9 +20,9 @@ import static org.apache.lucene.backward_index.TestBasicBackwardsCompatibility.a
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.backward_codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat; import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;

View File

@ -40,3 +40,4 @@
9.9.2 9.9.2
9.10.0 9.10.0
9.11.0 9.11.0
9.11.1

View File

@ -0,0 +1,376 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.jmh;
import java.util.Arrays;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.search.DocIdSetIterator;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.CompilerControl;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
@Warmup(iterations = 5, time = 1)
@Measurement(iterations = 5, time = 1)
@Fork(
value = 1,
jvmArgsAppend = {"-Xmx1g", "-Xms1g", "-XX:+AlwaysPreTouch"})
public class AdvanceBenchmark {
private final long[] values = new long[129];
private final int[] startIndexes = new int[1_000];
private final long[] targets = new long[startIndexes.length];
@Setup(Level.Trial)
public void setup() throws Exception {
for (int i = 0; i < 128; ++i) {
values[i] = i;
}
values[128] = DocIdSetIterator.NO_MORE_DOCS;
Random r = new Random(0);
for (int i = 0; i < startIndexes.length; ++i) {
startIndexes[i] = r.nextInt(64);
targets[i] = startIndexes[i] + 1 + r.nextInt(1 << r.nextInt(7));
}
}
@Benchmark
public void binarySearch() {
for (int i = 0; i < startIndexes.length; ++i) {
binarySearch(values, targets[i], startIndexes[i]);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int binarySearch(long[] values, long target, int startIndex) {
// Standard binary search
int i = Arrays.binarySearch(values, startIndex, values.length, target);
if (i < 0) {
i = -1 - i;
}
return i;
}
@Benchmark
public void binarySearch2() {
for (int i = 0; i < startIndexes.length; ++i) {
binarySearch2(values, targets[i], startIndexes[i]);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int binarySearch2(long[] values, long target, int startIndex) {
// Try to help the compiler by providing predictable start/end offsets.
int i = Arrays.binarySearch(values, 0, 128, target);
if (i < 0) {
i = -1 - i;
}
return i;
}
@Benchmark
public void binarySearch3() {
for (int i = 0; i < startIndexes.length; ++i) {
binarySearch3(values, targets[i], startIndexes[i]);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int binarySearch3(long[] values, long target, int startIndex) {
// Organize code the same way as suggested in https://quickwit.io/blog/search-a-sorted-block,
// which proved to help with LLVM.
int start = 0;
int length = 128;
while (length > 1) {
length /= 2;
if (values[start + length - 1] < target) {
start += length;
}
}
return start;
}
@Benchmark
public void binarySearch4() {
for (int i = 0; i < startIndexes.length; ++i) {
binarySearch4(values, targets[i], startIndexes[i]);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int binarySearch4(long[] values, long target, int startIndex) {
// Explicitly inline the binary-search logic to see if it helps the compiler.
int start = 0;
if (values[63] < target) {
start += 64;
}
if (values[start + 31] < target) {
start += 32;
}
if (values[start + 15] < target) {
start += 16;
}
if (values[start + 7] < target) {
start += 8;
}
if (values[start + 3] < target) {
start += 4;
}
if (values[start + 1] < target) {
start += 2;
}
if (values[start] < target) {
start += 1;
}
return start;
}
@Benchmark
public void binarySearch5() {
for (int i = 0; i < startIndexes.length; ++i) {
binarySearch5(values, targets[i], startIndexes[i]);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int binarySearch5(long[] values, long target, int startIndex) {
// Other way to write a binary search
int start = 0;
for (int shift = 6; shift >= 0; --shift) {
int halfRange = 1 << shift;
if (values[start + halfRange - 1] < target) {
start += halfRange;
}
}
return start;
}
@Benchmark
public void binarySearch6() {
for (int i = 0; i < startIndexes.length; ++i) {
binarySearch6(values, targets[i], startIndexes[i]);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int binarySearch6(long[] values, long target, int startIndex) {
// Other way to write a binary search
int start = 0;
for (int halfRange = 64; halfRange > 0; halfRange >>= 1) {
if (values[start + halfRange - 1] < target) {
start += halfRange;
}
}
return start;
}
@Benchmark
public void linearSearch() {
for (int i = 0; i < startIndexes.length; ++i) {
linearSearch(values, targets[i], startIndexes[i]);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int linearSearch(long[] values, long target, int startIndex) {
// Naive linear search.
for (int i = startIndex; i < values.length; ++i) {
if (values[i] >= target) {
return i;
}
}
return values.length;
}
@Benchmark
public void bruteForceSearch() {
for (int i = 0; i < startIndexes.length; ++i) {
bruteForceSearch(values, targets[i], startIndexes[i]);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int bruteForceSearch(long[] values, long target, int startIndex) {
// Linear search with predictable start/end offsets to see if it helps the compiler.
for (int i = 0; i < 128; ++i) {
if (values[i] >= target) {
return i;
}
}
return values.length;
}
@Benchmark
public void linearSearch2() {
for (int i = 0; i < startIndexes.length; ++i) {
linearSearch2(values, targets[i], startIndexes[i]);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int linearSearch2(long[] values, long target, int startIndex) {
// Two-level linear search, first checking every 8-th value, then values within an 8-value range
int rangeStart = values.length - 8;
for (int i = startIndex; i + 8 <= values.length; i += 8) {
if (values[i + 7] >= target) {
rangeStart = i;
break;
}
}
for (int i = 0; i < 8; ++i) {
if (values[rangeStart + i] >= target) {
return rangeStart + i;
}
}
return values.length;
}
@Benchmark
public void linearSearch3() {
for (int i = 0; i < startIndexes.length; ++i) {
linearSearch3(values, targets[i], startIndexes[i]);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int linearSearch3(long[] values, long target, int startIndex) {
// Iteration over linearSearch that tries to reduce branches
while (startIndex + 4 <= values.length) {
int count = values[startIndex] < target ? 1 : 0;
if (values[startIndex + 1] < target) {
count++;
}
if (values[startIndex + 2] < target) {
count++;
}
if (values[startIndex + 3] < target) {
count++;
}
if (count != 4) {
return startIndex + count;
}
startIndex += 4;
}
for (int i = startIndex; i < values.length; ++i) {
if (values[i] >= target) {
return i;
}
}
return values.length;
}
@Benchmark
public void hybridSearch() {
for (int i = 0; i < startIndexes.length; ++i) {
hybridSearch(values, targets[i], startIndexes[i]);
}
}
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
private static int hybridSearch(long[] values, long target, int startIndex) {
// Two-level linear search, first checking every 8-th value, then values within an 8-value range
int rangeStart = values.length - 8;
for (int i = startIndex; i + 8 <= values.length; i += 8) {
if (values[i + 7] >= target) {
rangeStart = i;
break;
}
}
return binarySearchHelper8(values, target, rangeStart);
}
// branchless binary search over 8 values
private static int binarySearchHelper8(long[] values, long target, int start) {
if (values[start + 3] < target) {
start += 4;
}
if (values[start + 1] < target) {
start += 2;
}
if (values[start] < target) {
start += 1;
}
return start;
}
private static void assertEquals(int expected, int actual) {
if (expected != actual) {
throw new AssertionError("Expected: " + expected + ", got " + actual);
}
}
public static void main(String[] args) {
// For testing purposes
long[] values = new long[129];
for (int i = 0; i < 128; ++i) {
values[i] = i;
}
values[128] = DocIdSetIterator.NO_MORE_DOCS;
for (int start = 0; start < 128; ++start) {
for (int targetIndex = start; targetIndex < 128; ++targetIndex) {
int actualIndex = binarySearch(values, values[targetIndex], start);
assertEquals(targetIndex, actualIndex);
actualIndex = binarySearch2(values, values[targetIndex], start);
assertEquals(targetIndex, actualIndex);
actualIndex = binarySearch3(values, values[targetIndex], start);
assertEquals(targetIndex, actualIndex);
actualIndex = binarySearch4(values, values[targetIndex], start);
assertEquals(targetIndex, actualIndex);
actualIndex = binarySearch5(values, values[targetIndex], start);
assertEquals(targetIndex, actualIndex);
actualIndex = binarySearch6(values, values[targetIndex], start);
assertEquals(targetIndex, actualIndex);
actualIndex = bruteForceSearch(values, values[targetIndex], start);
assertEquals(targetIndex, actualIndex);
actualIndex = hybridSearch(values, values[targetIndex], start);
assertEquals(targetIndex, actualIndex);
actualIndex = linearSearch(values, values[targetIndex], start);
assertEquals(targetIndex, actualIndex);
actualIndex = linearSearch2(values, values[targetIndex], start);
assertEquals(targetIndex, actualIndex);
actualIndex = linearSearch3(values, values[targetIndex], start);
assertEquals(targetIndex, actualIndex);
}
}
}
}
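One way to run a benchmark like this outside of Lucene's gradle tooling is through the standard JMH runner API; a sketch, with the include pattern chosen purely for illustration:

import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;

public class RunAdvanceBenchmark {
  public static void main(String[] args) throws RunnerException {
    Options opt =
        new OptionsBuilder()
            .include(AdvanceBenchmark.class.getSimpleName()) // run only the benchmarks above
            .build();
    new Runner(opt).run();
  }
}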

View File

@ -0,0 +1,75 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.jmh;
import java.io.IOException;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.util.VectorUtil;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
@Fork(1)
@Warmup(iterations = 3, time = 3)
@Measurement(iterations = 5, time = 3)
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.SECONDS)
@State(Scope.Benchmark)
public class HammingDistanceBenchmark {
@Param({"1000000"})
int nb = 1_000_000;
@Param({"1024"})
int dims = 1024;
byte[][] xb;
byte[] xq;
@Setup
public void setup() throws IOException {
Random rand = new Random();
this.xb = new byte[nb][dims / 8];
for (int i = 0; i < nb; i++) {
for (int j = 0; j < dims / 8; j++) {
xb[i][j] = (byte) rand.nextInt(0, 255);
}
}
this.xq = new byte[dims / 8];
for (int i = 0; i < xq.length; i++) {
xq[i] = (byte) rand.nextInt(0, 255);
}
}
@Benchmark
public int xorBitCount() {
int tot = 0;
for (int i = 0; i < nb; i++) {
tot += VectorUtil.xorBitCount(xb[i], xq);
}
return tot;
}
}
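VectorUtil.xorBitCount computes the Hamming distance between two packed bit vectors: XOR the bytes, then count the set bits. A minimal scalar sketch of the same computation (the real implementation may process wider words at a time):

// Scalar sketch; assumes both arrays have the same length, as in the benchmark.
static int xorBitCountScalar(byte[] a, byte[] b) {
  int distance = 0;
  for (int i = 0; i < a.length; i++) {
    distance += Integer.bitCount((a[i] ^ b[i]) & 0xFF);
  }
  return distance;
}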

View File

@ -0,0 +1,108 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.jmh;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.codecs.lucene912.ForDeltaUtil;
import org.apache.lucene.codecs.lucene912.ForUtil;
import org.apache.lucene.codecs.lucene912.PostingIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.IOUtils;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
@State(Scope.Benchmark)
@Warmup(iterations = 5, time = 1)
@Measurement(iterations = 5, time = 1)
@Fork(
value = 3,
jvmArgsAppend = {"-Xmx1g", "-Xms1g", "-XX:+AlwaysPreTouch"})
public class PostingIndexInputBenchmark {
private Path path;
private Directory dir;
private IndexInput in;
private PostingIndexInput postingIn;
private final ForUtil forUtil = new ForUtil();
private final ForDeltaUtil forDeltaUtil = new ForDeltaUtil();
private final long[] values = new long[128];
@Param({"2", "3", "4", "5", "6", "7", "8", "9", "10"})
public int bpv;
@Setup(Level.Trial)
public void setup() throws Exception {
path = Files.createTempDirectory("forUtil");
dir = MMapDirectory.open(path);
try (IndexOutput out = dir.createOutput("docs", IOContext.DEFAULT)) {
Random r = new Random(0);
// Write enough random data to not reach EOF while decoding
for (int i = 0; i < 100; ++i) {
out.writeLong(r.nextLong());
}
}
in = dir.openInput("docs", IOContext.DEFAULT);
postingIn = new PostingIndexInput(in, forUtil, forDeltaUtil);
}
@TearDown(Level.Trial)
public void tearDown() throws Exception {
if (dir != null) {
dir.deleteFile("docs");
}
IOUtils.close(in, dir);
in = null;
dir = null;
Files.deleteIfExists(path);
}
@Benchmark
public void decode(Blackhole bh) throws IOException {
in.seek(3); // random unaligned offset
postingIn.decode(bpv, values);
bh.consume(values);
}
@Benchmark
public void decodeAndPrefixSum(Blackhole bh) throws IOException {
in.seek(3); // random unaligned offset
postingIn.decodeAndPrefixSum(bpv, 100, values);
bh.consume(values);
}
}
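decodeAndPrefixSum(bpv, 100, values) decodes a block of deltas and turns them into absolute values on top of a base (100 here). A sketch of the prefix-sum step alone, assuming values already holds the decoded deltas; the actual ForDeltaUtil code is written to be vectorization-friendly rather than a plain loop:

// Turns deltas into running absolute values starting from base.
static void prefixSum(long[] values, long base) {
  long acc = base;
  for (int i = 0; i < values.length; i++) {
    acc += values[i];
    values[i] = acc;
  }
}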

View File

@ -17,11 +17,10 @@
# ------------------------------------------------------------------------------------- # -------------------------------------------------------------------------------------
# multi val params are iterated by NewRound's, added to reports, start with column name. # multi val params are iterated by NewRound's, added to reports, start with column name.
# collector.class can be: # collector.manager.class can be:
# Fully Qualified Class Name of a Collector with an empty constructor # Fully Qualified Class Name of a CollectorManager with an empty constructor
# topScoreDocOrdered - Creates a TopScoreDocCollector that requires in order docs # topScoreDoc - Creates a TopScoreDocCollectorManager
# topScoreDocUnordered - Like above, but allows out of order collector.manager.class=coll:topScoreDoc
collector.class=coll:topScoreDoc
analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer
directory=FSDirectory directory=FSDirectory

View File

@ -17,11 +17,10 @@
# ------------------------------------------------------------------------------------- # -------------------------------------------------------------------------------------
# multi val params are iterated by NewRound's, added to reports, start with column name. # multi val params are iterated by NewRound's, added to reports, start with column name.
# collector.class can be: # collector.manager.class can be:
# Fully Qualified Class Name of a Collector with an empty constructor # Fully Qualified Class Name of a CollectorManager with an empty constructor
# topScoreDocOrdered - Creates a TopScoreDocCollector that requires in order docs # topScoreDoc - Creates a TopScoreDocCollectorManager
# topScoreDocUnordered - Like above, but allows out of order collector.manager.class=coll:topScoreDoc
collector.class=coll:topScoreDoc
analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer
directory=FSDirectory directory=FSDirectory

View File

@ -238,7 +238,7 @@ public class EnwikiContentSource extends ContentSource {
time = null; time = null;
id = null; id = null;
break; break;
// intentional fall-through. // intentional fall-through.
case BODY: case BODY:
case DATE: case DATE:
case TITLE: case TITLE:

View File

@ -99,7 +99,7 @@ public class SpatialDocMaker extends DocMaker {
return makeRPTStrategy(SPATIAL_FIELD, config, configMap, ctx); return makeRPTStrategy(SPATIAL_FIELD, config, configMap, ctx);
case "composite": case "composite":
return makeCompositeStrategy(config, configMap, ctx); return makeCompositeStrategy(config, configMap, ctx);
// TODO add more as-needed // TODO add more as-needed
default: default:
throw new IllegalStateException("Unknown spatial.strategy: " + strategyName); throw new IllegalStateException("Unknown spatial.strategy: " + strategyName);
} }

View File

@ -24,7 +24,7 @@ import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiBits; import org.apache.lucene.index.MultiBits;
import org.apache.lucene.index.StoredFields; import org.apache.lucene.index.StoredFields;
import org.apache.lucene.search.Collector; import org.apache.lucene.search.CollectorManager;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.ScoreDoc;
@ -119,9 +119,7 @@ public abstract class ReadTask extends PerfTask {
hits = searcher.search(q, numHits); hits = searcher.search(q, numHits);
} }
} else { } else {
Collector collector = createCollector(); searcher.search(q, createCollectorManager());
searcher.search(q, collector);
// hits = collector.topDocs(); // hits = collector.topDocs();
} }
@ -184,9 +182,8 @@ public abstract class ReadTask extends PerfTask {
return res; return res;
} }
protected Collector createCollector() throws Exception { protected CollectorManager<?, ?> createCollectorManager() throws Exception {
return new TopScoreDocCollectorManager(numHits(), withTotalHits() ? Integer.MAX_VALUE : 1) return new TopScoreDocCollectorManager(numHits(), withTotalHits() ? Integer.MAX_VALUE : 1);
.newCollector();
} }
protected Document retrieveDoc(StoredFields storedFields, int id) throws IOException { protected Document retrieveDoc(StoredFields storedFields, int id) throws IOException {

View File

@ -19,8 +19,8 @@ package org.apache.lucene.benchmark.byTask.tasks;
import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.QueryMaker; import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.search.Collector; import org.apache.lucene.search.CollectorManager;
import org.apache.lucene.search.TopScoreDocCollector; import org.apache.lucene.search.TopScoreDocCollectorManager;
/** Does search w/ a custom collector */ /** Does search w/ a custom collector */
public class SearchWithCollectorTask extends SearchTask { public class SearchWithCollectorTask extends SearchTask {
@ -37,7 +37,11 @@ public class SearchWithCollectorTask extends SearchTask {
// check to make sure either the doc is being stored // check to make sure either the doc is being stored
PerfRunData runData = getRunData(); PerfRunData runData = getRunData();
Config config = runData.getConfig(); Config config = runData.getConfig();
clnName = config.get("collector.class", ""); if (config.get("collector.class", null) != null) {
throw new IllegalArgumentException(
"collector.class is no longer supported as a config parameter, use collector.manager.class instead to provide a CollectorManager class name");
}
clnName = config.get("collector.manager.class", "");
} }
@Override @Override
@ -46,17 +50,17 @@ public class SearchWithCollectorTask extends SearchTask {
} }
@Override @Override
protected Collector createCollector() throws Exception { protected CollectorManager<?, ?> createCollectorManager() throws Exception {
Collector collector = null; CollectorManager<?, ?> collectorManager;
if (clnName.equalsIgnoreCase("topScoreDoc") == true) { if (clnName.equalsIgnoreCase("topScoreDoc") == true) {
collector = TopScoreDocCollector.create(numHits(), Integer.MAX_VALUE); collectorManager = new TopScoreDocCollectorManager(numHits(), Integer.MAX_VALUE);
} else if (clnName.length() > 0) { } else if (clnName.length() > 0) {
collector = Class.forName(clnName).asSubclass(Collector.class).getConstructor().newInstance(); collectorManager =
Class.forName(clnName).asSubclass(CollectorManager.class).getConstructor().newInstance();
} else { } else {
collector = super.createCollector(); collectorManager = super.createCollectorManager();
} }
return collector; return collectorManager;
} }
@Override @Override

View File

@ -23,13 +23,13 @@ import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase; import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader; import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter; import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene99PostingsWriter}. */ /** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene912PostingsWriter}. */
public class BlockTreeOrdsPostingsFormat extends PostingsFormat { public class BlockTreeOrdsPostingsFormat extends PostingsFormat {
private final int minTermBlockSize; private final int minTermBlockSize;
@ -67,7 +67,7 @@ public class BlockTreeOrdsPostingsFormat extends PostingsFormat {
@Override @Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state); PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
boolean success = false; boolean success = false;
try { try {
@ -84,7 +84,7 @@ public class BlockTreeOrdsPostingsFormat extends PostingsFormat {
@Override @Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new Lucene99PostingsReader(state); PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
boolean success = false; boolean success = false;
try { try {
FieldsProducer ret = new OrdsBlockTreeTermsReader(postingsReader, state); FieldsProducer ret = new OrdsBlockTreeTermsReader(postingsReader, state);

View File

@ -43,6 +43,7 @@ import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOBooleanSupplier;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.automaton.CompiledAutomaton; import org.apache.lucene.util.automaton.CompiledAutomaton;
@ -315,12 +316,21 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat {
} }
@Override @Override
public boolean seekExact(BytesRef text) throws IOException { public IOBooleanSupplier prepareSeekExact(BytesRef text) throws IOException {
// The magical fail-fast speed up that is the entire point of all of // The magical fail-fast speed up that is the entire point of all of
// this code - save a disk seek if there is a match on an in-memory // this code - save a disk seek if there is a match on an in-memory
// structure // structure
// that may occasionally give a false positive but guaranteed no false // that may occasionally give a false positive but guaranteed no false
// negatives // negatives
if (filter.contains(text) == ContainsResult.NO) {
return null;
}
return delegate().prepareSeekExact(text);
}
@Override
public boolean seekExact(BytesRef text) throws IOException {
// See #prepareSeekExact
if (filter.contains(text) == ContainsResult.NO) { if (filter.contains(text) == ContainsResult.NO) {
return false; return false;
} }
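The new prepareSeekExact keeps the same fail-fast idea as seekExact: a Bloom filter can report false positives but never false negatives, so a definite NO answer makes the on-disk seek unnecessary. A generic sketch of that pattern with hypothetical predicate names:

import java.util.function.Predicate;

// Hypothetical names, illustrative only: mayContain is the in-memory filter,
// onDiskSeek is the expensive lookup it protects.
static boolean seekExactSketch(Predicate<String> mayContain, Predicate<String> onDiskSeek, String term) {
  if (!mayContain.test(term)) {
    return false; // no false negatives: the term is definitely absent
  }
  return onDiskSeek.test(term); // possible false positive: confirm with the real seek
}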

View File

@ -24,7 +24,7 @@ import java.util.TreeMap;
import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat; import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.index.BaseTermsEnum; import org.apache.lucene.index.BaseTermsEnum;
import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Fields; import org.apache.lucene.index.Fields;
@ -54,7 +54,7 @@ import org.apache.lucene.util.automaton.TransitionAccessor;
// - or: longer dense skip lists than just next byte? // - or: longer dense skip lists than just next byte?
/** /**
* Wraps {@link Lucene99PostingsFormat} format for on-disk storage, but then at read time loads and * Wraps {@link Lucene912PostingsFormat} format for on-disk storage, but then at read time loads and
* stores all terms and postings directly in RAM as byte[], int[]. * stores all terms and postings directly in RAM as byte[], int[].
* *
* <p><b>WARNING</b>: This is exceptionally RAM intensive: it makes no effort to compress the * <p><b>WARNING</b>: This is exceptionally RAM intensive: it makes no effort to compress the
@ -97,12 +97,12 @@ public final class DirectPostingsFormat extends PostingsFormat {
@Override @Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
return PostingsFormat.forName("Lucene99").fieldsConsumer(state); return PostingsFormat.forName("Lucene912").fieldsConsumer(state);
} }
@Override @Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
FieldsProducer postings = PostingsFormat.forName("Lucene99").fieldsProducer(state); FieldsProducer postings = PostingsFormat.forName("Lucene912").fieldsProducer(state);
if (state.context.context() != IOContext.Context.MERGE) { if (state.context.context() != IOContext.Context.MERGE) {
FieldsProducer loadedPostings; FieldsProducer loadedPostings;
try { try {

View File

@ -22,8 +22,8 @@ import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase; import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader; import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter; import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
@ -41,7 +41,7 @@ public final class FSTPostingsFormat extends PostingsFormat {
@Override @Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state); PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
boolean success = false; boolean success = false;
try { try {
@ -57,7 +57,7 @@ public final class FSTPostingsFormat extends PostingsFormat {
@Override @Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new Lucene99PostingsReader(state); PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
boolean success = false; boolean success = false;
try { try {
FieldsProducer ret = new FSTTermsReader(state, postingsReader); FieldsProducer ret = new FSTTermsReader(state, postingsReader);

View File

@ -195,9 +195,10 @@ public class FSTTermsReader extends FieldsProducer {
this.sumTotalTermFreq = sumTotalTermFreq; this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq; this.sumDocFreq = sumDocFreq;
this.docCount = docCount; this.docCount = docCount;
OffHeapFSTStore offHeapFSTStore = new OffHeapFSTStore();
FSTTermOutputs outputs = new FSTTermOutputs(fieldInfo); FSTTermOutputs outputs = new FSTTermOutputs(fieldInfo);
this.dict = new FST<>(FST.readMetadata(in, outputs), in, offHeapFSTStore); final var fstMetadata = FST.readMetadata(in, outputs);
OffHeapFSTStore offHeapFSTStore = new OffHeapFSTStore(in, in.getFilePointer(), fstMetadata);
this.dict = FST.fromFSTReader(fstMetadata, offHeapFSTStore);
in.skipBytes(offHeapFSTStore.size()); in.skipBytes(offHeapFSTStore.size());
} }

View File

@ -71,8 +71,8 @@ final class SimpleTextBKDReader extends PointValues {
this.pointCount = pointCount; this.pointCount = pointCount;
this.docCount = docCount; this.docCount = docCount;
this.version = SimpleTextBKDWriter.VERSION_CURRENT; this.version = SimpleTextBKDWriter.VERSION_CURRENT;
assert minPackedValue.length == config.packedIndexBytesLength; assert minPackedValue.length == config.packedIndexBytesLength();
assert maxPackedValue.length == config.packedIndexBytesLength; assert maxPackedValue.length == config.packedIndexBytesLength();
} }
@Override @Override
@ -99,8 +99,8 @@ final class SimpleTextBKDReader extends PointValues {
private SimpleTextPointTree( private SimpleTextPointTree(
IndexInput in, int nodeID, int level, byte[] minPackedValue, byte[] maxPackedValue) { IndexInput in, int nodeID, int level, byte[] minPackedValue, byte[] maxPackedValue) {
this.in = in; this.in = in;
this.scratchDocIDs = new int[config.maxPointsInLeafNode]; this.scratchDocIDs = new int[config.maxPointsInLeafNode()];
this.scratchPackedValue = new byte[config.packedBytesLength]; this.scratchPackedValue = new byte[config.packedBytesLength()];
this.nodeID = nodeID; this.nodeID = nodeID;
this.rootNode = nodeID; this.rootNode = nodeID;
this.level = level; this.level = level;
@ -145,38 +145,39 @@ final class SimpleTextBKDReader extends PointValues {
private void pushLeft() { private void pushLeft() {
int address = nodeID * bytesPerIndexEntry; int address = nodeID * bytesPerIndexEntry;
// final int splitDimPos; // final int splitDimPos;
if (config.numIndexDims == 1) { if (config.numIndexDims() == 1) {
splitDims[level] = 0; splitDims[level] = 0;
} else { } else {
splitDims[level] = (splitPackedValues[address++] & 0xff); splitDims[level] = (splitPackedValues[address++] & 0xff);
} }
final int splitDimPos = splitDims[level] * config.bytesPerDim; final int splitDimPos = splitDims[level] * config.bytesPerDim();
if (splitDimValueStack[level] == null) { if (splitDimValueStack[level] == null) {
splitDimValueStack[level] = new byte[config.bytesPerDim]; splitDimValueStack[level] = new byte[config.bytesPerDim()];
} }
// save the dimension we are going to change // save the dimension we are going to change
System.arraycopy( System.arraycopy(
maxPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim); maxPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim());
assert Arrays.compareUnsigned( assert Arrays.compareUnsigned(
maxPackedValue, maxPackedValue,
splitDimPos, splitDimPos,
splitDimPos + config.bytesPerDim, splitDimPos + config.bytesPerDim(),
splitPackedValues, splitPackedValues,
address, address,
address + config.bytesPerDim) address + config.bytesPerDim())
>= 0 >= 0
: "config.bytesPerDim=" : "config.bytesPerDim()="
+ config.bytesPerDim + config.bytesPerDim()
+ " splitDim=" + " splitDim="
+ splitDims[level] + splitDims[level]
+ " config.numIndexDims=" + " config.numIndexDims()="
+ config.numIndexDims + config.numIndexDims()
+ " config.numDims=" + " config.numDims="
+ config.numDims; + config.numDims();
nodeID *= 2; nodeID *= 2;
level++; level++;
// add the split dim value: // add the split dim value:
System.arraycopy(splitPackedValues, address, maxPackedValue, splitDimPos, config.bytesPerDim); System.arraycopy(
splitPackedValues, address, maxPackedValue, splitDimPos, config.bytesPerDim());
} }
@Override @Override
@ -191,37 +192,38 @@ final class SimpleTextBKDReader extends PointValues {
private void pushRight() { private void pushRight() {
int address = nodeID * bytesPerIndexEntry; int address = nodeID * bytesPerIndexEntry;
if (config.numIndexDims == 1) { if (config.numIndexDims() == 1) {
splitDims[level] = 0; splitDims[level] = 0;
} else { } else {
splitDims[level] = (splitPackedValues[address++] & 0xff); splitDims[level] = (splitPackedValues[address++] & 0xff);
} }
final int splitDimPos = splitDims[level] * config.bytesPerDim; final int splitDimPos = splitDims[level] * config.bytesPerDim();
// we should have already visited the left node // we should have already visited the left node
assert splitDimValueStack[level] != null; assert splitDimValueStack[level] != null;
// save the dimension we are going to change // save the dimension we are going to change
System.arraycopy( System.arraycopy(
minPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim); minPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim());
assert Arrays.compareUnsigned( assert Arrays.compareUnsigned(
minPackedValue, minPackedValue,
splitDimPos, splitDimPos,
splitDimPos + config.bytesPerDim, splitDimPos + config.bytesPerDim(),
splitPackedValues, splitPackedValues,
address, address,
address + config.bytesPerDim) address + config.bytesPerDim())
<= 0 <= 0
: "config.bytesPerDim=" : "config.bytesPerDim()="
+ config.bytesPerDim + config.bytesPerDim()
+ " splitDim=" + " splitDim="
+ splitDims[level] + splitDims[level]
+ " config.numIndexDims=" + " config.numIndexDims()="
+ config.numIndexDims + config.numIndexDims()
+ " config.numDims=" + " config.numDims="
+ config.numDims; + config.numDims();
nodeID = 2 * nodeID + 1; nodeID = 2 * nodeID + 1;
level++; level++;
// add the split dim value: // add the split dim value:
System.arraycopy(splitPackedValues, address, minPackedValue, splitDimPos, config.bytesPerDim); System.arraycopy(
splitPackedValues, address, minPackedValue, splitDimPos, config.bytesPerDim());
} }
@Override @Override
@ -242,16 +244,16 @@ final class SimpleTextBKDReader extends PointValues {
splitDimValueStack[level], splitDimValueStack[level],
0, 0,
maxPackedValue, maxPackedValue,
splitDims[level] * config.bytesPerDim, splitDims[level] * config.bytesPerDim(),
config.bytesPerDim); config.bytesPerDim());
} else { } else {
System.arraycopy( System.arraycopy(
splitDimValueStack[level], splitDimValueStack[level],
0, 0,
minPackedValue, minPackedValue,
splitDims[level] * config.bytesPerDim, splitDims[level] * config.bytesPerDim(),
config.bytesPerDim); config.bytesPerDim());
} }
} }
@ -290,7 +292,7 @@ final class SimpleTextBKDReader extends PointValues {
private long sizeFromBalancedTree(int leftMostLeafNode, int rightMostLeafNode) { private long sizeFromBalancedTree(int leftMostLeafNode, int rightMostLeafNode) {
// number of points that need to be distributed between leaves, one per leaf // number of points that need to be distributed between leaves, one per leaf
final int extraPoints = final int extraPoints =
Math.toIntExact(((long) config.maxPointsInLeafNode * leafNodeOffset) - pointCount); Math.toIntExact(((long) config.maxPointsInLeafNode() * leafNodeOffset) - pointCount);
assert extraPoints < leafNodeOffset : "point excess should be lower than leafNodeOffset"; assert extraPoints < leafNodeOffset : "point excess should be lower than leafNodeOffset";
// offset where we stop adding one point to the leaves // offset where we stop adding one point to the leaves
final int nodeOffset = leafNodeOffset - extraPoints; final int nodeOffset = leafNodeOffset - extraPoints;
@ -298,9 +300,9 @@ final class SimpleTextBKDReader extends PointValues {
for (int node = leftMostLeafNode; node <= rightMostLeafNode; node++) { for (int node = leftMostLeafNode; node <= rightMostLeafNode; node++) {
// offsetPosition provides which extra point will be added to this node // offsetPosition provides which extra point will be added to this node
if (balanceTreeNodePosition(0, leafNodeOffset, node - leafNodeOffset, 0, 0) < nodeOffset) { if (balanceTreeNodePosition(0, leafNodeOffset, node - leafNodeOffset, 0, 0) < nodeOffset) {
count += config.maxPointsInLeafNode; count += config.maxPointsInLeafNode();
} else { } else {
count += config.maxPointsInLeafNode - 1; count += config.maxPointsInLeafNode() - 1;
} }
} }
return count; return count;
@ -376,14 +378,14 @@ final class SimpleTextBKDReader extends PointValues {
// Again, this time reading values and checking with the visitor // Again, this time reading values and checking with the visitor
visitor.grow(count); visitor.grow(count);
// NOTE: we don't do prefix coding, so we ignore commonPrefixLengths // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
assert scratchPackedValue.length == config.packedBytesLength; assert scratchPackedValue.length == config.packedBytesLength();
BytesRefBuilder scratch = new BytesRefBuilder(); BytesRefBuilder scratch = new BytesRefBuilder();
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
readLine(in, scratch); readLine(in, scratch);
assert startsWith(scratch, BLOCK_VALUE); assert startsWith(scratch, BLOCK_VALUE);
BytesRef br = SimpleTextUtil.fromBytesRefString(stripPrefix(scratch, BLOCK_VALUE)); BytesRef br = SimpleTextUtil.fromBytesRefString(stripPrefix(scratch, BLOCK_VALUE));
assert br.length == config.packedBytesLength; assert br.length == config.packedBytesLength();
System.arraycopy(br.bytes, br.offset, scratchPackedValue, 0, config.packedBytesLength); System.arraycopy(br.bytes, br.offset, scratchPackedValue, 0, config.packedBytesLength());
visitor.visit(scratchDocIDs[i], scratchPackedValue); visitor.visit(scratchDocIDs[i], scratchPackedValue);
} }
} else { } else {
@ -443,17 +445,17 @@ final class SimpleTextBKDReader extends PointValues {
@Override @Override
public int getNumDimensions() throws IOException { public int getNumDimensions() throws IOException {
return config.numDims; return config.numDims();
} }
@Override @Override
public int getNumIndexDimensions() throws IOException { public int getNumIndexDimensions() throws IOException {
return config.numIndexDims; return config.numIndexDims();
} }
@Override @Override
public int getBytesPerDimension() throws IOException { public int getBytesPerDimension() throws IOException {
return config.bytesPerDim; return config.bytesPerDim();
} }
@Override @Override
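Throughout this reader (and SimpleTextBKDWriter below), direct field reads on the BKD configuration become accessor calls (config.bytesPerDim -> config.bytesPerDim(), and so on). The shape of the diff is consistent with BKDConfig exposing accessor methods, for example as a record; the definition below is only an illustrative assumption, with the derived lengths computed the way the surrounding code uses them:

// Hypothetical sketch only: accessor-style BKD configuration matching the calls above.
record BKDConfigSketch(int numDims, int numIndexDims, int bytesPerDim, int maxPointsInLeafNode) {
  int packedBytesLength() {
    return numDims * bytesPerDim;               // full packed value, all data dimensions
  }
  int packedIndexBytesLength() {
    return numIndexDims * bytesPerDim;          // only the indexed dimensions
  }
  int bytesPerDoc() {
    return packedBytesLength() + Integer.BYTES; // packed value plus the docID
  }
}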

View File

@ -144,28 +144,28 @@ final class SimpleTextBKDWriter implements Closeable {
this.maxDoc = maxDoc; this.maxDoc = maxDoc;
docsSeen = new FixedBitSet(maxDoc); docsSeen = new FixedBitSet(maxDoc);
scratchDiff = new byte[config.bytesPerDim]; scratchDiff = new byte[config.bytesPerDim()];
scratch1 = new byte[config.packedBytesLength]; scratch1 = new byte[config.packedBytesLength()];
scratch2 = new byte[config.packedBytesLength]; scratch2 = new byte[config.packedBytesLength()];
commonPrefixLengths = new int[config.numDims]; commonPrefixLengths = new int[config.numDims()];
minPackedValue = new byte[config.packedIndexBytesLength]; minPackedValue = new byte[config.packedIndexBytesLength()];
maxPackedValue = new byte[config.packedIndexBytesLength]; maxPackedValue = new byte[config.packedIndexBytesLength()];
// Maximum number of points we hold in memory at any time // Maximum number of points we hold in memory at any time
maxPointsSortInHeap = maxPointsSortInHeap =
(int) ((maxMBSortInHeap * 1024 * 1024) / (config.bytesPerDoc * config.numDims)); (int) ((maxMBSortInHeap * 1024 * 1024) / (config.bytesPerDoc() * config.numDims()));
// Finally, we must be able to hold at least the leaf node in heap during build: // Finally, we must be able to hold at least the leaf node in heap during build:
if (maxPointsSortInHeap < config.maxPointsInLeafNode) { if (maxPointsSortInHeap < config.maxPointsInLeafNode()) {
throw new IllegalArgumentException( throw new IllegalArgumentException(
"maxMBSortInHeap=" "maxMBSortInHeap="
+ maxMBSortInHeap + maxMBSortInHeap
+ " only allows for maxPointsSortInHeap=" + " only allows for maxPointsSortInHeap="
+ maxPointsSortInHeap + maxPointsSortInHeap
+ ", but this is less than config.maxPointsInLeafNode=" + ", but this is less than config.maxPointsInLeafNode()="
+ config.maxPointsInLeafNode + config.maxPointsInLeafNode()
+ "; either increase maxMBSortInHeap or decrease config.maxPointsInLeafNode"); + "; either increase maxMBSortInHeap or decrease config.maxPointsInLeafNode()");
} }
this.maxMBSortInHeap = maxMBSortInHeap; this.maxMBSortInHeap = maxMBSortInHeap;
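A quick worked example of the heap budget computed above; the numbers are illustrative and not taken from the patch:

// Illustrative: two 8-byte dimensions, so packedBytesLength() = 16 and bytesPerDoc() = 16 + 4 = 20.
public class HeapBudgetExample {
  public static void main(String[] args) {
    double maxMBSortInHeap = 16.0;
    int bytesPerDoc = 20;
    int numDims = 2;
    int maxPointsInLeafNode = 512;
    int maxPointsSortInHeap = (int) ((maxMBSortInHeap * 1024 * 1024) / (bytesPerDoc * numDims));
    // Prints 419430 >= 512: the constructor check above would accept this configuration.
    System.out.println(maxPointsSortInHeap + " >= " + maxPointsInLeafNode);
  }
}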
@ -183,10 +183,10 @@ final class SimpleTextBKDWriter implements Closeable {
} }
public void add(byte[] packedValue, int docID) throws IOException { public void add(byte[] packedValue, int docID) throws IOException {
if (packedValue.length != config.packedBytesLength) { if (packedValue.length != config.packedBytesLength()) {
throw new IllegalArgumentException( throw new IllegalArgumentException(
"packedValue should be length=" "packedValue should be length="
+ config.packedBytesLength + config.packedBytesLength()
+ " (got: " + " (got: "
+ packedValue.length + packedValue.length
+ ")"); + ")");
@ -209,30 +209,30 @@ final class SimpleTextBKDWriter implements Closeable {
} else { } else {
pointWriter = new HeapPointWriter(config, Math.toIntExact(totalPointCount)); pointWriter = new HeapPointWriter(config, Math.toIntExact(totalPointCount));
} }
System.arraycopy(packedValue, 0, minPackedValue, 0, config.packedIndexBytesLength); System.arraycopy(packedValue, 0, minPackedValue, 0, config.packedIndexBytesLength());
System.arraycopy(packedValue, 0, maxPackedValue, 0, config.packedIndexBytesLength); System.arraycopy(packedValue, 0, maxPackedValue, 0, config.packedIndexBytesLength());
} else { } else {
for (int dim = 0; dim < config.numIndexDims; dim++) { for (int dim = 0; dim < config.numIndexDims(); dim++) {
int offset = dim * config.bytesPerDim; int offset = dim * config.bytesPerDim();
if (Arrays.compareUnsigned( if (Arrays.compareUnsigned(
packedValue, packedValue,
offset, offset,
offset + config.bytesPerDim, offset + config.bytesPerDim(),
minPackedValue, minPackedValue,
offset, offset,
offset + config.bytesPerDim) offset + config.bytesPerDim())
< 0) { < 0) {
System.arraycopy(packedValue, offset, minPackedValue, offset, config.bytesPerDim); System.arraycopy(packedValue, offset, minPackedValue, offset, config.bytesPerDim());
} }
if (Arrays.compareUnsigned( if (Arrays.compareUnsigned(
packedValue, packedValue,
offset, offset,
offset + config.bytesPerDim, offset + config.bytesPerDim(),
maxPackedValue, maxPackedValue,
offset, offset,
offset + config.bytesPerDim) offset + config.bytesPerDim())
> 0) { > 0) {
System.arraycopy(packedValue, offset, maxPackedValue, offset, config.bytesPerDim); System.arraycopy(packedValue, offset, maxPackedValue, offset, config.bytesPerDim());
} }
} }
} }
@ -254,7 +254,7 @@ final class SimpleTextBKDWriter implements Closeable {
*/ */
public long writeField(IndexOutput out, String fieldName, MutablePointTree reader) public long writeField(IndexOutput out, String fieldName, MutablePointTree reader)
throws IOException { throws IOException {
if (config.numIndexDims == 1) { if (config.numIndexDims() == 1) {
return writeField1Dim(out, fieldName, reader); return writeField1Dim(out, fieldName, reader);
} else { } else {
return writeFieldNDims(out, fieldName, reader); return writeFieldNDims(out, fieldName, reader);
@ -280,7 +280,7 @@ final class SimpleTextBKDWriter implements Closeable {
long countPerLeaf = pointCount = values.size(); long countPerLeaf = pointCount = values.size();
long innerNodeCount = 1; long innerNodeCount = 1;
while (countPerLeaf > config.maxPointsInLeafNode) { while (countPerLeaf > config.maxPointsInLeafNode()) {
countPerLeaf = (countPerLeaf + 1) / 2; countPerLeaf = (countPerLeaf + 1) / 2;
innerNodeCount *= 2; innerNodeCount *= 2;
} }
@ -289,7 +289,7 @@ final class SimpleTextBKDWriter implements Closeable {
checkMaxLeafNodeCount(numLeaves); checkMaxLeafNodeCount(numLeaves);
final byte[] splitPackedValues = new byte[numLeaves * (config.bytesPerDim + 1)]; final byte[] splitPackedValues = new byte[numLeaves * (config.bytesPerDim() + 1)];
final long[] leafBlockFPs = new long[numLeaves]; final long[] leafBlockFPs = new long[numLeaves];
// compute the min/max for this slice // compute the min/max for this slice
@ -297,37 +297,37 @@ final class SimpleTextBKDWriter implements Closeable {
Arrays.fill(maxPackedValue, (byte) 0); Arrays.fill(maxPackedValue, (byte) 0);
for (int i = 0; i < Math.toIntExact(pointCount); ++i) { for (int i = 0; i < Math.toIntExact(pointCount); ++i) {
values.getValue(i, scratchBytesRef1); values.getValue(i, scratchBytesRef1);
for (int dim = 0; dim < config.numIndexDims; dim++) { for (int dim = 0; dim < config.numIndexDims(); dim++) {
int offset = dim * config.bytesPerDim; int offset = dim * config.bytesPerDim();
if (Arrays.compareUnsigned( if (Arrays.compareUnsigned(
scratchBytesRef1.bytes, scratchBytesRef1.bytes,
scratchBytesRef1.offset + offset, scratchBytesRef1.offset + offset,
scratchBytesRef1.offset + offset + config.bytesPerDim, scratchBytesRef1.offset + offset + config.bytesPerDim(),
minPackedValue, minPackedValue,
offset, offset,
offset + config.bytesPerDim) offset + config.bytesPerDim())
< 0) { < 0) {
System.arraycopy( System.arraycopy(
scratchBytesRef1.bytes, scratchBytesRef1.bytes,
scratchBytesRef1.offset + offset, scratchBytesRef1.offset + offset,
minPackedValue, minPackedValue,
offset, offset,
config.bytesPerDim); config.bytesPerDim());
} }
if (Arrays.compareUnsigned( if (Arrays.compareUnsigned(
scratchBytesRef1.bytes, scratchBytesRef1.bytes,
scratchBytesRef1.offset + offset, scratchBytesRef1.offset + offset,
scratchBytesRef1.offset + offset + config.bytesPerDim, scratchBytesRef1.offset + offset + config.bytesPerDim(),
maxPackedValue, maxPackedValue,
offset, offset,
offset + config.bytesPerDim) offset + config.bytesPerDim())
> 0) { > 0) {
System.arraycopy( System.arraycopy(
scratchBytesRef1.bytes, scratchBytesRef1.bytes,
scratchBytesRef1.offset + offset, scratchBytesRef1.offset + offset,
maxPackedValue, maxPackedValue,
offset, offset,
config.bytesPerDim); config.bytesPerDim());
} }
} }
@ -345,7 +345,7 @@ final class SimpleTextBKDWriter implements Closeable {
maxPackedValue, maxPackedValue,
splitPackedValues, splitPackedValues,
leafBlockFPs, leafBlockFPs,
new int[config.maxPointsInLeafNode]); new int[config.maxPointsInLeafNode()]);
long indexFP = out.getFilePointer(); long indexFP = out.getFilePointer();
writeIndex(out, leafBlockFPs, splitPackedValues, Math.toIntExact(countPerLeaf)); writeIndex(out, leafBlockFPs, splitPackedValues, Math.toIntExact(countPerLeaf));
@ -387,15 +387,15 @@ final class SimpleTextBKDWriter implements Closeable {
final IndexOutput out; final IndexOutput out;
final List<Long> leafBlockFPs = new ArrayList<>(); final List<Long> leafBlockFPs = new ArrayList<>();
final List<byte[]> leafBlockStartValues = new ArrayList<>(); final List<byte[]> leafBlockStartValues = new ArrayList<>();
final byte[] leafValues = new byte[config.maxPointsInLeafNode * config.packedBytesLength]; final byte[] leafValues = new byte[config.maxPointsInLeafNode() * config.packedBytesLength()];
final int[] leafDocs = new int[config.maxPointsInLeafNode]; final int[] leafDocs = new int[config.maxPointsInLeafNode()];
long valueCount; long valueCount;
int leafCount; int leafCount;
OneDimensionBKDWriter(IndexOutput out) { OneDimensionBKDWriter(IndexOutput out) {
if (config.numIndexDims != 1) { if (config.numIndexDims() != 1) {
throw new UnsupportedOperationException( throw new UnsupportedOperationException(
"config.numIndexDims must be 1 but got " + config.numIndexDims); "config.numIndexDims() must be 1 but got " + config.numIndexDims());
} }
if (pointCount != 0) { if (pointCount != 0) {
throw new IllegalStateException("cannot mix add and merge"); throw new IllegalStateException("cannot mix add and merge");
@ -411,7 +411,7 @@ final class SimpleTextBKDWriter implements Closeable {
this.out = out; this.out = out;
lastPackedValue = new byte[config.packedBytesLength]; lastPackedValue = new byte[config.packedBytesLength()];
} }
// for asserts // for asserts
@ -426,8 +426,8 @@ final class SimpleTextBKDWriter implements Closeable {
packedValue, packedValue,
0, 0,
leafValues, leafValues,
leafCount * config.packedBytesLength, leafCount * config.packedBytesLength(),
config.packedBytesLength); config.packedBytesLength());
leafDocs[leafCount] = docID; leafDocs[leafCount] = docID;
docsSeen.set(docID); docsSeen.set(docID);
leafCount++; leafCount++;
@ -441,7 +441,7 @@ final class SimpleTextBKDWriter implements Closeable {
+ " values"); + " values");
} }
if (leafCount == config.maxPointsInLeafNode) { if (leafCount == config.maxPointsInLeafNode()) {
// We write a block once we hit exactly the max count ... this is different from // We write a block once we hit exactly the max count ... this is different from
// when we flush a new segment, where we write between max/2 and max per leaf block, // when we flush a new segment, where we write between max/2 and max per leaf block,
// so merged segments will behave differently from newly flushed segments: // so merged segments will behave differently from newly flushed segments:
@ -471,43 +471,44 @@ final class SimpleTextBKDWriter implements Closeable {
// System.out.println("BKDW: now rotate numInnerNodes=" + numInnerNodes + " leafBlockStarts=" // System.out.println("BKDW: now rotate numInnerNodes=" + numInnerNodes + " leafBlockStarts="
// + leafBlockStartValues.size()); // + leafBlockStartValues.size());
byte[] index = new byte[(1 + numInnerNodes) * (1 + config.bytesPerDim)]; byte[] index = new byte[(1 + numInnerNodes) * (1 + config.bytesPerDim())];
rotateToTree(1, 0, numInnerNodes, index, leafBlockStartValues); rotateToTree(1, 0, numInnerNodes, index, leafBlockStartValues);
long[] arr = new long[leafBlockFPs.size()]; long[] arr = new long[leafBlockFPs.size()];
for (int i = 0; i < leafBlockFPs.size(); i++) { for (int i = 0; i < leafBlockFPs.size(); i++) {
arr[i] = leafBlockFPs.get(i); arr[i] = leafBlockFPs.get(i);
} }
writeIndex(out, arr, index, config.maxPointsInLeafNode); writeIndex(out, arr, index, config.maxPointsInLeafNode());
return indexFP; return indexFP;
} }
private void writeLeafBlock() throws IOException { private void writeLeafBlock() throws IOException {
assert leafCount != 0; assert leafCount != 0;
if (valueCount == 0) { if (valueCount == 0) {
System.arraycopy(leafValues, 0, minPackedValue, 0, config.packedIndexBytesLength); System.arraycopy(leafValues, 0, minPackedValue, 0, config.packedIndexBytesLength());
} }
System.arraycopy( System.arraycopy(
leafValues, leafValues,
(leafCount - 1) * config.packedBytesLength, (leafCount - 1) * config.packedBytesLength(),
maxPackedValue, maxPackedValue,
0, 0,
config.packedIndexBytesLength); config.packedIndexBytesLength());
valueCount += leafCount; valueCount += leafCount;
if (leafBlockFPs.size() > 0) { if (leafBlockFPs.size() > 0) {
// Save the first (minimum) value in each leaf block except the first, to build the split // Save the first (minimum) value in each leaf block except the first, to build the split
// value index in the end: // value index in the end:
leafBlockStartValues.add(ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength)); leafBlockStartValues.add(
ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength()));
} }
leafBlockFPs.add(out.getFilePointer()); leafBlockFPs.add(out.getFilePointer());
checkMaxLeafNodeCount(leafBlockFPs.size()); checkMaxLeafNodeCount(leafBlockFPs.size());
Arrays.fill(commonPrefixLengths, config.bytesPerDim); Arrays.fill(commonPrefixLengths, config.bytesPerDim());
// Find per-dim common prefix: // Find per-dim common prefix:
for (int dim = 0; dim < config.numDims; dim++) { for (int dim = 0; dim < config.numDims(); dim++) {
int offset1 = dim * config.bytesPerDim; int offset1 = dim * config.bytesPerDim();
int offset2 = (leafCount - 1) * config.packedBytesLength + offset1; int offset2 = (leafCount - 1) * config.packedBytesLength() + offset1;
for (int j = 0; j < commonPrefixLengths[dim]; j++) { for (int j = 0; j < commonPrefixLengths[dim]; j++) {
if (leafValues[offset1 + j] != leafValues[offset2 + j]) { if (leafValues[offset1 + j] != leafValues[offset2 + j]) {
commonPrefixLengths[dim] = j; commonPrefixLengths[dim] = j;
@ -523,24 +524,24 @@ final class SimpleTextBKDWriter implements Closeable {
final BytesRef scratch = new BytesRef(); final BytesRef scratch = new BytesRef();
{ {
scratch.length = config.packedBytesLength; scratch.length = config.packedBytesLength();
scratch.bytes = leafValues; scratch.bytes = leafValues;
} }
@Override @Override
public BytesRef apply(int i) { public BytesRef apply(int i) {
scratch.offset = config.packedBytesLength * i; scratch.offset = config.packedBytesLength() * i;
return scratch; return scratch;
} }
}; };
assert valuesInOrderAndBounds( assert valuesInOrderAndBounds(
leafCount, leafCount,
0, 0,
ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength), ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength()),
ArrayUtil.copyOfSubArray( ArrayUtil.copyOfSubArray(
leafValues, leafValues,
(leafCount - 1) * config.packedBytesLength, (leafCount - 1) * config.packedBytesLength(),
leafCount * config.packedBytesLength), leafCount * config.packedBytesLength()),
packedValues, packedValues,
leafDocs, leafDocs,
0); 0);
@ -552,7 +553,7 @@ final class SimpleTextBKDWriter implements Closeable {
private void rotateToTree( private void rotateToTree(
int nodeID, int offset, int count, byte[] index, List<byte[]> leafBlockStartValues) { int nodeID, int offset, int count, byte[] index, List<byte[]> leafBlockStartValues) {
// System.out.println("ROTATE: nodeID=" + nodeID + " offset=" + offset + " count=" + count + " // System.out.println("ROTATE: nodeID=" + nodeID + " offset=" + offset + " count=" + count + "
// bpd=" + config.bytesPerDim + " index.length=" + index.length); // bpd=" + config.bytesPerDim() + " index.length=" + index.length);
if (count == 1) { if (count == 1) {
// Leaf index node // Leaf index node
// System.out.println(" leaf index node"); // System.out.println(" leaf index node");
@ -561,8 +562,8 @@ final class SimpleTextBKDWriter implements Closeable {
leafBlockStartValues.get(offset), leafBlockStartValues.get(offset),
0, 0,
index, index,
nodeID * (1 + config.bytesPerDim) + 1, nodeID * (1 + config.bytesPerDim()) + 1,
config.bytesPerDim); config.bytesPerDim());
} else if (count > 1) { } else if (count > 1) {
// Internal index node: binary partition of count // Internal index node: binary partition of count
int countAtLevel = 1; int countAtLevel = 1;
@ -587,8 +588,8 @@ final class SimpleTextBKDWriter implements Closeable {
leafBlockStartValues.get(rootOffset), leafBlockStartValues.get(rootOffset),
0, 0,
index, index,
nodeID * (1 + config.bytesPerDim) + 1, nodeID * (1 + config.bytesPerDim()) + 1,
config.bytesPerDim); config.bytesPerDim());
// System.out.println(" index[" + nodeID + "] = blockStartValues[" + rootOffset + "]"); // System.out.println(" index[" + nodeID + "] = blockStartValues[" + rootOffset + "]");
// TODO: we could optimize/specialize, when we know it's simply fully balanced binary tree // TODO: we could optimize/specialize, when we know it's simply fully balanced binary tree
@ -611,10 +612,10 @@ final class SimpleTextBKDWriter implements Closeable {
} }
private void checkMaxLeafNodeCount(int numLeaves) { private void checkMaxLeafNodeCount(int numLeaves) {
if ((1 + config.bytesPerDim) * (long) numLeaves > ArrayUtil.MAX_ARRAY_LENGTH) { if ((1 + config.bytesPerDim()) * (long) numLeaves > ArrayUtil.MAX_ARRAY_LENGTH) {
throw new IllegalStateException( throw new IllegalStateException(
"too many nodes; increase config.maxPointsInLeafNode (currently " "too many nodes; increase config.maxPointsInLeafNode() (currently "
+ config.maxPointsInLeafNode + config.maxPointsInLeafNode()
+ ") and reindex"); + ") and reindex");
} }
} }
@ -652,7 +653,7 @@ final class SimpleTextBKDWriter implements Closeable {
long countPerLeaf = pointCount; long countPerLeaf = pointCount;
long innerNodeCount = 1; long innerNodeCount = 1;
while (countPerLeaf > config.maxPointsInLeafNode) { while (countPerLeaf > config.maxPointsInLeafNode()) {
countPerLeaf = (countPerLeaf + 1) / 2; countPerLeaf = (countPerLeaf + 1) / 2;
innerNodeCount *= 2; innerNodeCount *= 2;
} }
@ -667,20 +668,20 @@ final class SimpleTextBKDWriter implements Closeable {
// Indexed by nodeID, but first (root) nodeID is 1. We do 1+ because the lead byte at each // Indexed by nodeID, but first (root) nodeID is 1. We do 1+ because the lead byte at each
// recursion says which dim we split on. // recursion says which dim we split on.
byte[] splitPackedValues = new byte[Math.multiplyExact(numLeaves, 1 + config.bytesPerDim)]; byte[] splitPackedValues = new byte[Math.multiplyExact(numLeaves, 1 + config.bytesPerDim())];
// +1 because leaf count is power of 2 (e.g. 8), and innerNodeCount is power of 2 minus 1 (e.g. // +1 because leaf count is power of 2 (e.g. 8), and innerNodeCount is power of 2 minus 1 (e.g.
// 7) // 7)
long[] leafBlockFPs = new long[numLeaves]; long[] leafBlockFPs = new long[numLeaves];
// Make sure the math above "worked": // Make sure the math above "worked":
assert pointCount / numLeaves <= config.maxPointsInLeafNode assert pointCount / numLeaves <= config.maxPointsInLeafNode()
: "pointCount=" : "pointCount="
+ pointCount + pointCount
+ " numLeaves=" + " numLeaves="
+ numLeaves + numLeaves
+ " config.maxPointsInLeafNode=" + " config.maxPointsInLeafNode()="
+ config.maxPointsInLeafNode; + config.maxPointsInLeafNode();
// We re-use the selector so we do not need to create an object every time. // We re-use the selector so we do not need to create an object every time.
BKDRadixSelector radixSelector = BKDRadixSelector radixSelector =
@ -699,7 +700,7 @@ final class SimpleTextBKDWriter implements Closeable {
maxPackedValue, maxPackedValue,
splitPackedValues, splitPackedValues,
leafBlockFPs, leafBlockFPs,
new int[config.maxPointsInLeafNode]); new int[config.maxPointsInLeafNode()]);
// If no exception, we should have cleaned everything up: // If no exception, we should have cleaned everything up:
assert tempDir.getCreatedFiles().isEmpty(); assert tempDir.getCreatedFiles().isEmpty();
@ -724,15 +725,15 @@ final class SimpleTextBKDWriter implements Closeable {
IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues, int maxPointsInLeafNode) IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues, int maxPointsInLeafNode)
throws IOException { throws IOException {
write(out, NUM_DATA_DIMS); write(out, NUM_DATA_DIMS);
writeInt(out, config.numDims); writeInt(out, config.numDims());
newline(out); newline(out);
write(out, NUM_INDEX_DIMS); write(out, NUM_INDEX_DIMS);
writeInt(out, config.numIndexDims); writeInt(out, config.numIndexDims());
newline(out); newline(out);
write(out, BYTES_PER_DIM); write(out, BYTES_PER_DIM);
writeInt(out, config.bytesPerDim); writeInt(out, config.bytesPerDim());
newline(out); newline(out);
write(out, MAX_LEAF_POINTS); write(out, MAX_LEAF_POINTS);
@ -767,8 +768,8 @@ final class SimpleTextBKDWriter implements Closeable {
newline(out); newline(out);
} }
assert (splitPackedValues.length % (1 + config.bytesPerDim)) == 0; assert (splitPackedValues.length % (1 + config.bytesPerDim())) == 0;
int count = splitPackedValues.length / (1 + config.bytesPerDim); int count = splitPackedValues.length / (1 + config.bytesPerDim());
assert count == leafBlockFPs.length; assert count == leafBlockFPs.length;
write(out, SPLIT_COUNT); write(out, SPLIT_COUNT);
@ -777,10 +778,12 @@ final class SimpleTextBKDWriter implements Closeable {
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
write(out, SPLIT_DIM); write(out, SPLIT_DIM);
writeInt(out, splitPackedValues[i * (1 + config.bytesPerDim)] & 0xff); writeInt(out, splitPackedValues[i * (1 + config.bytesPerDim())] & 0xff);
newline(out); newline(out);
write(out, SPLIT_VALUE); write(out, SPLIT_VALUE);
br = new BytesRef(splitPackedValues, 1 + (i * (1 + config.bytesPerDim)), config.bytesPerDim); br =
new BytesRef(
splitPackedValues, 1 + (i * (1 + config.bytesPerDim())), config.bytesPerDim());
write(out, br.toString()); write(out, br.toString());
newline(out); newline(out);
} }
@ -852,25 +855,25 @@ final class SimpleTextBKDWriter implements Closeable {
/** Called only in assert */ /** Called only in assert */
private boolean valueInBounds( private boolean valueInBounds(
BytesRef packedValue, byte[] minPackedValue, byte[] maxPackedValue) { BytesRef packedValue, byte[] minPackedValue, byte[] maxPackedValue) {
for (int dim = 0; dim < config.numIndexDims; dim++) { for (int dim = 0; dim < config.numIndexDims(); dim++) {
int offset = config.bytesPerDim * dim; int offset = config.bytesPerDim() * dim;
if (Arrays.compareUnsigned( if (Arrays.compareUnsigned(
packedValue.bytes, packedValue.bytes,
packedValue.offset + offset, packedValue.offset + offset,
packedValue.offset + offset + config.bytesPerDim, packedValue.offset + offset + config.bytesPerDim(),
minPackedValue, minPackedValue,
offset, offset,
offset + config.bytesPerDim) offset + config.bytesPerDim())
< 0) { < 0) {
return false; return false;
} }
if (Arrays.compareUnsigned( if (Arrays.compareUnsigned(
packedValue.bytes, packedValue.bytes,
packedValue.offset + offset, packedValue.offset + offset,
packedValue.offset + offset + config.bytesPerDim, packedValue.offset + offset + config.bytesPerDim(),
maxPackedValue, maxPackedValue,
offset, offset,
offset + config.bytesPerDim) offset + config.bytesPerDim())
> 0) { > 0) {
return false; return false;
} }
@ -882,13 +885,13 @@ final class SimpleTextBKDWriter implements Closeable {
protected int split(byte[] minPackedValue, byte[] maxPackedValue) { protected int split(byte[] minPackedValue, byte[] maxPackedValue) {
// Find which dim has the largest span so we can split on it: // Find which dim has the largest span so we can split on it:
int splitDim = -1; int splitDim = -1;
for (int dim = 0; dim < config.numIndexDims; dim++) { for (int dim = 0; dim < config.numIndexDims(); dim++) {
NumericUtils.subtract(config.bytesPerDim, dim, maxPackedValue, minPackedValue, scratchDiff); NumericUtils.subtract(config.bytesPerDim(), dim, maxPackedValue, minPackedValue, scratchDiff);
if (splitDim == -1 if (splitDim == -1
|| Arrays.compareUnsigned( || Arrays.compareUnsigned(
scratchDiff, 0, config.bytesPerDim, scratch1, 0, config.bytesPerDim) scratchDiff, 0, config.bytesPerDim(), scratch1, 0, config.bytesPerDim())
> 0) { > 0) {
System.arraycopy(scratchDiff, 0, scratch1, 0, config.bytesPerDim); System.arraycopy(scratchDiff, 0, scratch1, 0, config.bytesPerDim());
splitDim = dim; splitDim = dim;
} }
} }
@ -931,15 +934,15 @@ final class SimpleTextBKDWriter implements Closeable {
if (nodeID >= leafNodeOffset) { if (nodeID >= leafNodeOffset) {
// leaf node // leaf node
final int count = to - from; final int count = to - from;
assert count <= config.maxPointsInLeafNode; assert count <= config.maxPointsInLeafNode();
// Compute common prefixes // Compute common prefixes
Arrays.fill(commonPrefixLengths, config.bytesPerDim); Arrays.fill(commonPrefixLengths, config.bytesPerDim());
reader.getValue(from, scratchBytesRef1); reader.getValue(from, scratchBytesRef1);
for (int i = from + 1; i < to; ++i) { for (int i = from + 1; i < to; ++i) {
reader.getValue(i, scratchBytesRef2); reader.getValue(i, scratchBytesRef2);
for (int dim = 0; dim < config.numDims; dim++) { for (int dim = 0; dim < config.numDims(); dim++) {
final int offset = dim * config.bytesPerDim; final int offset = dim * config.bytesPerDim();
for (int j = 0; j < commonPrefixLengths[dim]; j++) { for (int j = 0; j < commonPrefixLengths[dim]; j++) {
if (scratchBytesRef1.bytes[scratchBytesRef1.offset + offset + j] if (scratchBytesRef1.bytes[scratchBytesRef1.offset + offset + j]
!= scratchBytesRef2.bytes[scratchBytesRef2.offset + offset + j]) { != scratchBytesRef2.bytes[scratchBytesRef2.offset + offset + j]) {
@ -951,23 +954,23 @@ final class SimpleTextBKDWriter implements Closeable {
} }
// Find the dimension that has the least number of unique bytes at commonPrefixLengths[dim] // Find the dimension that has the least number of unique bytes at commonPrefixLengths[dim]
FixedBitSet[] usedBytes = new FixedBitSet[config.numDims]; FixedBitSet[] usedBytes = new FixedBitSet[config.numDims()];
for (int dim = 0; dim < config.numDims; ++dim) { for (int dim = 0; dim < config.numDims(); ++dim) {
if (commonPrefixLengths[dim] < config.bytesPerDim) { if (commonPrefixLengths[dim] < config.bytesPerDim()) {
usedBytes[dim] = new FixedBitSet(256); usedBytes[dim] = new FixedBitSet(256);
} }
} }
for (int i = from + 1; i < to; ++i) { for (int i = from + 1; i < to; ++i) {
for (int dim = 0; dim < config.numDims; dim++) { for (int dim = 0; dim < config.numDims(); dim++) {
if (usedBytes[dim] != null) { if (usedBytes[dim] != null) {
byte b = reader.getByteAt(i, dim * config.bytesPerDim + commonPrefixLengths[dim]); byte b = reader.getByteAt(i, dim * config.bytesPerDim() + commonPrefixLengths[dim]);
usedBytes[dim].set(Byte.toUnsignedInt(b)); usedBytes[dim].set(Byte.toUnsignedInt(b));
} }
} }
} }
int sortedDim = 0; int sortedDim = 0;
int sortedDimCardinality = Integer.MAX_VALUE; int sortedDimCardinality = Integer.MAX_VALUE;
for (int dim = 0; dim < config.numDims; ++dim) { for (int dim = 0; dim < config.numDims(); ++dim) {
if (usedBytes[dim] != null) { if (usedBytes[dim] != null) {
final int cardinality = usedBytes[dim].cardinality(); final int cardinality = usedBytes[dim].cardinality();
if (cardinality < sortedDimCardinality) { if (cardinality < sortedDimCardinality) {
@ -1001,7 +1004,7 @@ final class SimpleTextBKDWriter implements Closeable {
// Write the common prefixes: // Write the common prefixes:
reader.getValue(from, scratchBytesRef1); reader.getValue(from, scratchBytesRef1);
System.arraycopy( System.arraycopy(
scratchBytesRef1.bytes, scratchBytesRef1.offset, scratch1, 0, config.packedBytesLength); scratchBytesRef1.bytes, scratchBytesRef1.offset, scratch1, 0, config.packedBytesLength());
// Write the full values: // Write the full values:
IntFunction<BytesRef> packedValues = IntFunction<BytesRef> packedValues =
@ -1023,10 +1026,10 @@ final class SimpleTextBKDWriter implements Closeable {
final int splitDim = split(minPackedValue, maxPackedValue); final int splitDim = split(minPackedValue, maxPackedValue);
final int mid = (from + to + 1) >>> 1; final int mid = (from + to + 1) >>> 1;
int commonPrefixLen = config.bytesPerDim; int commonPrefixLen = config.bytesPerDim();
for (int i = 0; i < config.bytesPerDim; ++i) { for (int i = 0; i < config.bytesPerDim(); ++i) {
if (minPackedValue[splitDim * config.bytesPerDim + i] if (minPackedValue[splitDim * config.bytesPerDim() + i]
!= maxPackedValue[splitDim * config.bytesPerDim + i]) { != maxPackedValue[splitDim * config.bytesPerDim() + i]) {
commonPrefixLen = i; commonPrefixLen = i;
break; break;
} }
@ -1044,32 +1047,32 @@ final class SimpleTextBKDWriter implements Closeable {
scratchBytesRef2); scratchBytesRef2);
// set the split value // set the split value
final int address = nodeID * (1 + config.bytesPerDim); final int address = nodeID * (1 + config.bytesPerDim());
splitPackedValues[address] = (byte) splitDim; splitPackedValues[address] = (byte) splitDim;
reader.getValue(mid, scratchBytesRef1); reader.getValue(mid, scratchBytesRef1);
System.arraycopy( System.arraycopy(
scratchBytesRef1.bytes, scratchBytesRef1.bytes,
scratchBytesRef1.offset + splitDim * config.bytesPerDim, scratchBytesRef1.offset + splitDim * config.bytesPerDim(),
splitPackedValues, splitPackedValues,
address + 1, address + 1,
config.bytesPerDim); config.bytesPerDim());
byte[] minSplitPackedValue = byte[] minSplitPackedValue =
ArrayUtil.copyOfSubArray(minPackedValue, 0, config.packedIndexBytesLength); ArrayUtil.copyOfSubArray(minPackedValue, 0, config.packedIndexBytesLength());
byte[] maxSplitPackedValue = byte[] maxSplitPackedValue =
ArrayUtil.copyOfSubArray(maxPackedValue, 0, config.packedIndexBytesLength); ArrayUtil.copyOfSubArray(maxPackedValue, 0, config.packedIndexBytesLength());
System.arraycopy( System.arraycopy(
scratchBytesRef1.bytes, scratchBytesRef1.bytes,
scratchBytesRef1.offset + splitDim * config.bytesPerDim, scratchBytesRef1.offset + splitDim * config.bytesPerDim(),
minSplitPackedValue, minSplitPackedValue,
splitDim * config.bytesPerDim, splitDim * config.bytesPerDim(),
config.bytesPerDim); config.bytesPerDim());
System.arraycopy( System.arraycopy(
scratchBytesRef1.bytes, scratchBytesRef1.bytes,
scratchBytesRef1.offset + splitDim * config.bytesPerDim, scratchBytesRef1.offset + splitDim * config.bytesPerDim(),
maxSplitPackedValue, maxSplitPackedValue,
splitDim * config.bytesPerDim, splitDim * config.bytesPerDim(),
config.bytesPerDim); config.bytesPerDim());
// recurse // recurse
build( build(
@ -1137,17 +1140,17 @@ final class SimpleTextBKDWriter implements Closeable {
int sortedDim = 0; int sortedDim = 0;
int sortedDimCardinality = Integer.MAX_VALUE; int sortedDimCardinality = Integer.MAX_VALUE;
FixedBitSet[] usedBytes = new FixedBitSet[config.numDims]; FixedBitSet[] usedBytes = new FixedBitSet[config.numDims()];
for (int dim = 0; dim < config.numDims; ++dim) { for (int dim = 0; dim < config.numDims(); ++dim) {
if (commonPrefixLengths[dim] < config.bytesPerDim) { if (commonPrefixLengths[dim] < config.bytesPerDim()) {
usedBytes[dim] = new FixedBitSet(256); usedBytes[dim] = new FixedBitSet(256);
} }
} }
// Find the dimension to compress // Find the dimension to compress
for (int dim = 0; dim < config.numDims; dim++) { for (int dim = 0; dim < config.numDims(); dim++) {
int prefix = commonPrefixLengths[dim]; int prefix = commonPrefixLengths[dim];
if (prefix < config.bytesPerDim) { if (prefix < config.bytesPerDim()) {
int offset = dim * config.bytesPerDim; int offset = dim * config.bytesPerDim();
for (int i = 0; i < heapSource.count(); ++i) { for (int i = 0; i < heapSource.count(); ++i) {
PointValue value = heapSource.getPackedValueSlice(i); PointValue value = heapSource.getPackedValueSlice(i);
BytesRef packedValue = value.packedValue(); BytesRef packedValue = value.packedValue();
@ -1190,7 +1193,7 @@ final class SimpleTextBKDWriter implements Closeable {
final BytesRef scratch = new BytesRef(); final BytesRef scratch = new BytesRef();
{ {
scratch.length = config.packedBytesLength; scratch.length = config.packedBytesLength();
} }
@Override @Override
@ -1207,7 +1210,7 @@ final class SimpleTextBKDWriter implements Closeable {
// Inner node: partition/recurse // Inner node: partition/recurse
int splitDim; int splitDim;
if (config.numIndexDims > 1) { if (config.numIndexDims() > 1) {
splitDim = split(minPackedValue, maxPackedValue); splitDim = split(minPackedValue, maxPackedValue);
} else { } else {
splitDim = 0; splitDim = 0;
@ -1223,13 +1226,13 @@ final class SimpleTextBKDWriter implements Closeable {
int commonPrefixLen = int commonPrefixLen =
Arrays.mismatch( Arrays.mismatch(
minPackedValue, minPackedValue,
splitDim * config.bytesPerDim, splitDim * config.bytesPerDim(),
splitDim * config.bytesPerDim + config.bytesPerDim, splitDim * config.bytesPerDim() + config.bytesPerDim(),
maxPackedValue, maxPackedValue,
splitDim * config.bytesPerDim, splitDim * config.bytesPerDim(),
splitDim * config.bytesPerDim + config.bytesPerDim); splitDim * config.bytesPerDim() + config.bytesPerDim());
if (commonPrefixLen == -1) { if (commonPrefixLen == -1) {
commonPrefixLen = config.bytesPerDim; commonPrefixLen = config.bytesPerDim();
} }
BKDRadixSelector.PathSlice[] pathSlices = new BKDRadixSelector.PathSlice[2]; BKDRadixSelector.PathSlice[] pathSlices = new BKDRadixSelector.PathSlice[2];
@ -1244,20 +1247,28 @@ final class SimpleTextBKDWriter implements Closeable {
splitDim, splitDim,
commonPrefixLen); commonPrefixLen);
int address = nodeID * (1 + config.bytesPerDim); int address = nodeID * (1 + config.bytesPerDim());
splitPackedValues[address] = (byte) splitDim; splitPackedValues[address] = (byte) splitDim;
System.arraycopy(splitValue, 0, splitPackedValues, address + 1, config.bytesPerDim); System.arraycopy(splitValue, 0, splitPackedValues, address + 1, config.bytesPerDim());
byte[] minSplitPackedValue = new byte[config.packedIndexBytesLength]; byte[] minSplitPackedValue = new byte[config.packedIndexBytesLength()];
System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, config.packedIndexBytesLength); System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, config.packedIndexBytesLength());
byte[] maxSplitPackedValue = new byte[config.packedIndexBytesLength]; byte[] maxSplitPackedValue = new byte[config.packedIndexBytesLength()];
System.arraycopy(maxPackedValue, 0, maxSplitPackedValue, 0, config.packedIndexBytesLength); System.arraycopy(maxPackedValue, 0, maxSplitPackedValue, 0, config.packedIndexBytesLength());
System.arraycopy( System.arraycopy(
splitValue, 0, minSplitPackedValue, splitDim * config.bytesPerDim, config.bytesPerDim); splitValue,
0,
minSplitPackedValue,
splitDim * config.bytesPerDim(),
config.bytesPerDim());
System.arraycopy( System.arraycopy(
splitValue, 0, maxSplitPackedValue, splitDim * config.bytesPerDim, config.bytesPerDim); splitValue,
0,
maxSplitPackedValue,
splitDim * config.bytesPerDim(),
config.bytesPerDim());
// Recurse on left tree: // Recurse on left tree:
build( build(
@ -1289,30 +1300,30 @@ final class SimpleTextBKDWriter implements Closeable {
} }
private void computeCommonPrefixLength(HeapPointWriter heapPointWriter, byte[] commonPrefix) { private void computeCommonPrefixLength(HeapPointWriter heapPointWriter, byte[] commonPrefix) {
Arrays.fill(commonPrefixLengths, config.bytesPerDim); Arrays.fill(commonPrefixLengths, config.bytesPerDim());
PointValue value = heapPointWriter.getPackedValueSlice(0); PointValue value = heapPointWriter.getPackedValueSlice(0);
BytesRef packedValue = value.packedValue(); BytesRef packedValue = value.packedValue();
for (int dim = 0; dim < config.numDims; dim++) { for (int dim = 0; dim < config.numDims(); dim++) {
System.arraycopy( System.arraycopy(
packedValue.bytes, packedValue.bytes,
packedValue.offset + dim * config.bytesPerDim, packedValue.offset + dim * config.bytesPerDim(),
commonPrefix, commonPrefix,
dim * config.bytesPerDim, dim * config.bytesPerDim(),
config.bytesPerDim); config.bytesPerDim());
} }
for (int i = 1; i < heapPointWriter.count(); i++) { for (int i = 1; i < heapPointWriter.count(); i++) {
value = heapPointWriter.getPackedValueSlice(i); value = heapPointWriter.getPackedValueSlice(i);
packedValue = value.packedValue(); packedValue = value.packedValue();
for (int dim = 0; dim < config.numDims; dim++) { for (int dim = 0; dim < config.numDims(); dim++) {
if (commonPrefixLengths[dim] != 0) { if (commonPrefixLengths[dim] != 0) {
int j = int j =
Arrays.mismatch( Arrays.mismatch(
commonPrefix, commonPrefix,
dim * config.bytesPerDim, dim * config.bytesPerDim(),
dim * config.bytesPerDim + commonPrefixLengths[dim], dim * config.bytesPerDim() + commonPrefixLengths[dim],
packedValue.bytes, packedValue.bytes,
packedValue.offset + dim * config.bytesPerDim, packedValue.offset + dim * config.bytesPerDim(),
packedValue.offset + dim * config.bytesPerDim + commonPrefixLengths[dim]); packedValue.offset + dim * config.bytesPerDim() + commonPrefixLengths[dim]);
if (j != -1) { if (j != -1) {
commonPrefixLengths[dim] = j; commonPrefixLengths[dim] = j;
} }
@ -1331,11 +1342,11 @@ final class SimpleTextBKDWriter implements Closeable {
int[] docs, int[] docs,
int docsOffset) int docsOffset)
throws IOException { throws IOException {
byte[] lastPackedValue = new byte[config.packedBytesLength]; byte[] lastPackedValue = new byte[config.packedBytesLength()];
int lastDoc = -1; int lastDoc = -1;
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
BytesRef packedValue = values.apply(i); BytesRef packedValue = values.apply(i);
assert packedValue.length == config.packedBytesLength; assert packedValue.length == config.packedBytesLength();
assert valueInOrder( assert valueInOrder(
i, i,
sortedDim, sortedDim,
@ -1361,43 +1372,43 @@ final class SimpleTextBKDWriter implements Closeable {
int packedValueOffset, int packedValueOffset,
int doc, int doc,
int lastDoc) { int lastDoc) {
int dimOffset = sortedDim * config.bytesPerDim; int dimOffset = sortedDim * config.bytesPerDim();
if (ord > 0) { if (ord > 0) {
int cmp = int cmp =
Arrays.compareUnsigned( Arrays.compareUnsigned(
lastPackedValue, lastPackedValue,
dimOffset, dimOffset,
dimOffset + config.bytesPerDim, dimOffset + config.bytesPerDim(),
packedValue, packedValue,
packedValueOffset + dimOffset, packedValueOffset + dimOffset,
packedValueOffset + dimOffset + config.bytesPerDim); packedValueOffset + dimOffset + config.bytesPerDim());
if (cmp > 0) { if (cmp > 0) {
throw new AssertionError( throw new AssertionError(
"values out of order: last value=" "values out of order: last value="
+ new BytesRef(lastPackedValue) + new BytesRef(lastPackedValue)
+ " current value=" + " current value="
+ new BytesRef(packedValue, packedValueOffset, config.packedBytesLength) + new BytesRef(packedValue, packedValueOffset, config.packedBytesLength())
+ " ord=" + " ord="
+ ord + ord
+ " sortedDim=" + " sortedDim="
+ sortedDim); + sortedDim);
} }
if (cmp == 0 && config.numDims > config.numIndexDims) { if (cmp == 0 && config.numDims() > config.numIndexDims()) {
int dataOffset = config.numIndexDims * config.bytesPerDim; int dataOffset = config.numIndexDims() * config.bytesPerDim();
cmp = cmp =
Arrays.compareUnsigned( Arrays.compareUnsigned(
lastPackedValue, lastPackedValue,
dataOffset, dataOffset,
config.packedBytesLength, config.packedBytesLength(),
packedValue, packedValue,
packedValueOffset + dataOffset, packedValueOffset + dataOffset,
packedValueOffset + config.packedBytesLength); packedValueOffset + config.packedBytesLength());
if (cmp > 0) { if (cmp > 0) {
throw new AssertionError( throw new AssertionError(
"data values out of order: last value=" "data values out of order: last value="
+ new BytesRef(lastPackedValue) + new BytesRef(lastPackedValue)
+ " current value=" + " current value="
+ new BytesRef(packedValue, packedValueOffset, config.packedBytesLength) + new BytesRef(packedValue, packedValueOffset, config.packedBytesLength())
+ " ord=" + " ord="
+ ord); + ord);
} }
@ -1414,7 +1425,8 @@ final class SimpleTextBKDWriter implements Closeable {
+ sortedDim); + sortedDim);
} }
} }
System.arraycopy(packedValue, packedValueOffset, lastPackedValue, 0, config.packedBytesLength); System.arraycopy(
packedValue, packedValueOffset, lastPackedValue, 0, config.packedBytesLength());
return true; return true;
} }

View File

@ -829,7 +829,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
clone.seek(0); clone.seek(0);
// checksum is fixed-width encoded with 20 bytes, plus 1 byte for newline (the space is included // checksum is fixed-width encoded with 20 bytes, plus 1 byte for newline (the space is included
// in SimpleTextUtil.CHECKSUM): // in SimpleTextUtil.CHECKSUM):
long footerStartPos = data.length() - (SimpleTextUtil.CHECKSUM.length + 21); long footerStartPos = clone.length() - (SimpleTextUtil.CHECKSUM.length + 21);
ChecksumIndexInput input = new BufferedChecksumIndexInput(clone); ChecksumIndexInput input = new BufferedChecksumIndexInput(clone);
while (true) { while (true) {
SimpleTextUtil.readLine(input, scratch); SimpleTextUtil.readLine(input, scratch);
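The fix computes the footer start from the same input handle (clone) that is actually scanned; the SimpleTextPointsReader hunk below applies the same correction. A rough sketch of the footer-scan pattern, assuming the code lives inside a SimpleText reader (so SimpleTextUtil is accessible) and that data is the file's raw IndexInput; variable names are illustrative:

IndexInput clone = data.clone();
clone.seek(0);
// Footer = CHECKSUM marker + 20-byte fixed-width checksum + 1 byte for the newline.
long footerStartPos = clone.length() - (SimpleTextUtil.CHECKSUM.length + 21);
ChecksumIndexInput input = new BufferedChecksumIndexInput(clone);
BytesRefBuilder scratch = new BytesRefBuilder();
while (true) {
  SimpleTextUtil.readLine(input, scratch);
  if (input.getFilePointer() >= footerStartPos) {
    break; // everything that remains is the checksum footer
  }
  // ... decode the line held in scratch ...
}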

View File

@ -227,7 +227,7 @@ class SimpleTextPointsReader extends PointsReader {
// checksum is fixed-width encoded with 20 bytes, plus 1 byte for newline (the space is included // checksum is fixed-width encoded with 20 bytes, plus 1 byte for newline (the space is included
// in SimpleTextUtil.CHECKSUM): // in SimpleTextUtil.CHECKSUM):
long footerStartPos = dataIn.length() - (SimpleTextUtil.CHECKSUM.length + 21); long footerStartPos = clone.length() - (SimpleTextUtil.CHECKSUM.length + 21);
ChecksumIndexInput input = new BufferedChecksumIndexInput(clone); ChecksumIndexInput input = new BufferedChecksumIndexInput(clone);
while (true) { while (true) {
SimpleTextUtil.readLine(input, scratch); SimpleTextUtil.readLine(input, scratch);

View File

@ -17,13 +17,13 @@
package org.apache.lucene.codecs.uniformsplit; package org.apache.lucene.codecs.uniformsplit;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.BLOCK_SIZE; import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.BLOCK_SIZE;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader; import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter; import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.TermState; import org.apache.lucene.index.TermState;
@ -34,7 +34,7 @@ import org.apache.lucene.util.RamUsageEstimator;
/** /**
* {@link TermState} serializer which encodes each file pointer as a delta relative to a base file * {@link TermState} serializer which encodes each file pointer as a delta relative to a base file
* pointer. It differs from {@link Lucene99PostingsWriter#encodeTerm} which encodes each file * pointer. It differs from {@link Lucene912PostingsWriter#encodeTerm} which encodes each file
* pointer as a delta relative to the previous file pointer. * pointer as a delta relative to the previous file pointer.
* *
* <p>It automatically sets the base file pointer to the first valid file pointer for doc start FP, * <p>It automatically sets the base file pointer to the first valid file pointer for doc start FP,
@ -95,7 +95,7 @@ public class DeltaBaseTermStateSerializer implements Accountable {
/** /**
* Writes a {@link BlockTermState} to the provided {@link DataOutput}. * Writes a {@link BlockTermState} to the provided {@link DataOutput}.
* *
* <p>Simpler variant of {@link Lucene99PostingsWriter#encodeTerm(DataOutput, FieldInfo, * <p>Simpler variant of {@link Lucene912PostingsWriter#encodeTerm(DataOutput, FieldInfo,
* BlockTermState, boolean)}. * BlockTermState, boolean)}.
*/ */
public void writeTermState( public void writeTermState(
@ -140,15 +140,12 @@ public class DeltaBaseTermStateSerializer implements Accountable {
termStatesOutput.writeVLong(intTermState.lastPosBlockOffset); termStatesOutput.writeVLong(intTermState.lastPosBlockOffset);
} }
} }
if (intTermState.skipOffset != -1) {
termStatesOutput.writeVLong(intTermState.skipOffset);
}
} }
/** /**
* Reads a {@link BlockTermState} from the provided {@link DataInput}. * Reads a {@link BlockTermState} from the provided {@link DataInput}.
* *
* <p>Simpler variant of {@link Lucene99PostingsReader#decodeTerm(DataInput, FieldInfo, * <p>Simpler variant of {@link Lucene912PostingsReader#decodeTerm(DataInput, FieldInfo,
* BlockTermState, boolean)}. * BlockTermState, boolean)}.
* *
* @param reuse {@link BlockTermState} to reuse; or null to create a new one. * @param reuse {@link BlockTermState} to reuse; or null to create a new one.
@ -190,9 +187,6 @@ public class DeltaBaseTermStateSerializer implements Accountable {
intTermState.lastPosBlockOffset = termStatesInput.readVLong(); intTermState.lastPosBlockOffset = termStatesInput.readVLong();
} }
} }
if (intTermState.docFreq > BLOCK_SIZE) {
intTermState.skipOffset = termStatesInput.readVLong();
}
return intTermState; return intTermState;
} }
@ -210,7 +204,6 @@ public class DeltaBaseTermStateSerializer implements Accountable {
termState.docStartFP = 0; termState.docStartFP = 0;
termState.posStartFP = 0; termState.posStartFP = 0;
termState.payStartFP = 0; termState.payStartFP = 0;
termState.skipOffset = -1;
termState.lastPosBlockOffset = -1; termState.lastPosBlockOffset = -1;
termState.singletonDocID = -1; termState.singletonDocID = -1;
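Per the class javadoc above, this serializer writes each file pointer relative to a single base pointer, whereas Lucene912PostingsWriter#encodeTerm writes each pointer relative to the previous one. A tiny worked example with illustrative numbers:

// Three terms whose doc-start file pointers are 100, 140 and 190.
long baseDocStartFP = 100;
long[] docStartFPs = {100, 140, 190};
// Delta-base (this serializer): each pointer relative to the base -> written as 0, 40, 90.
// Delta-previous (Lucene912PostingsWriter#encodeTerm): relative to the previous -> 100, 40, 50.
for (long fp : docStartFPs) {
  System.out.println("delta-base=" + (fp - baseDocStartFP));
}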

View File

@ -90,10 +90,15 @@ public class FSTDictionary implements IndexDictionary {
} }
PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton(); PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton();
FST.FSTMetadata<Long> metadata = FST.readMetadata(fstDataInput, fstOutputs); FST.FSTMetadata<Long> metadata = FST.readMetadata(fstDataInput, fstOutputs);
FST<Long> fst = FST<Long> fst;
isFSTOnHeap if (isFSTOnHeap) {
? new FST<>(metadata, fstDataInput) fst = new FST<>(metadata, fstDataInput);
: new FST<>(metadata, fstDataInput, new OffHeapFSTStore()); } else {
final IndexInput indexInput = (IndexInput) fstDataInput;
fst =
FST.fromFSTReader(
metadata, new OffHeapFSTStore(indexInput, indexInput.getFilePointer(), metadata));
}
return new FSTDictionary(fst); return new FSTDictionary(fst);
} }

View File

@ -23,8 +23,8 @@ import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase; import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader; import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter; import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
@ -113,7 +113,7 @@ public class UniformSplitPostingsFormat extends PostingsFormat {
@Override @Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state); PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
boolean success = false; boolean success = false;
try { try {
FieldsConsumer termsWriter = FieldsConsumer termsWriter =
@ -130,7 +130,7 @@ public class UniformSplitPostingsFormat extends PostingsFormat {
@Override @Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new Lucene99PostingsReader(state); PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
boolean success = false; boolean success = false;
try { try {
FieldsProducer termsReader = FieldsProducer termsReader =

View File

@ -28,7 +28,7 @@
* org.apache.lucene.search.PhraseQuery}) * org.apache.lucene.search.PhraseQuery})
* <li>Quite efficient for {@link org.apache.lucene.search.PrefixQuery} * <li>Quite efficient for {@link org.apache.lucene.search.PrefixQuery}
* <li>Not efficient for spell-check and {@link org.apache.lucene.search.FuzzyQuery}, in this case * <li>Not efficient for spell-check and {@link org.apache.lucene.search.FuzzyQuery}, in this case
* prefer {@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat} * prefer {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat}
* </ul> * </ul>
*/ */
package org.apache.lucene.codecs.uniformsplit; package org.apache.lucene.codecs.uniformsplit;

View File

@ -20,11 +20,11 @@ package org.apache.lucene.codecs.uniformsplit.sharedterms;
import java.io.IOException; import java.io.IOException;
import java.util.List; import java.util.List;
import java.util.RandomAccess; import java.util.RandomAccess;
import org.apache.lucene.index.BaseTermsEnum;
import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.MergeState; import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermState; import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
@ -34,7 +34,7 @@ import org.apache.lucene.util.BytesRef;
* *
* @lucene.experimental * @lucene.experimental
*/ */
class STMergingTermsEnum extends TermsEnum { class STMergingTermsEnum extends BaseTermsEnum {
protected final String fieldName; protected final String fieldName;
protected final MultiSegmentsPostingsEnum multiPostingsEnum; protected final MultiSegmentsPostingsEnum multiPostingsEnum;
@ -63,11 +63,6 @@ class STMergingTermsEnum extends TermsEnum {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
@Override
public boolean seekExact(BytesRef text) throws IOException {
throw new UnsupportedOperationException();
}
@Override @Override
public SeekStatus seekCeil(BytesRef text) { public SeekStatus seekCeil(BytesRef text) {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
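Switching the superclass to BaseTermsEnum is what makes the removed seekExact override redundant: BaseTermsEnum already supplies a default built on seekCeil, roughly (paraphrased from BaseTermsEnum, not part of this patch):

    @Override
    public boolean seekExact(BytesRef text) throws IOException {
      // delegates to seekCeil, so enums that cannot seek only need to throw from seekCeil
      return seekCeil(text) == SeekStatus.FOUND;
    }

BaseTermsEnum provides a similar default for seekExact(BytesRef, TermState), which is why the same overrides also disappear from MergedTermsEnum in DocValuesConsumer further down.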

View File

@ -22,7 +22,7 @@ import java.io.IOException;
import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99Codec; import org.apache.lucene.codecs.lucene912.Lucene912Codec;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.document.KnnByteVectorField; import org.apache.lucene.document.KnnByteVectorField;
@ -42,7 +42,7 @@ import org.apache.lucene.tests.index.BaseIndexFileFormatTestCase;
public class TestHnswBitVectorsFormat extends BaseIndexFileFormatTestCase { public class TestHnswBitVectorsFormat extends BaseIndexFileFormatTestCase {
@Override @Override
protected Codec getCodec() { protected Codec getCodec() {
return new Lucene99Codec() { return new Lucene912Codec() {
@Override @Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) { public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
return new HnswBitVectorsFormat(); return new HnswBitVectorsFormat();

View File

@ -17,7 +17,7 @@
package org.apache.lucene.codecs.lucene90.tests; package org.apache.lucene.codecs.lucene90.tests;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState; import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
/** Test utility class to create mock {@link IntBlockTermState}. */ /** Test utility class to create mock {@link IntBlockTermState}. */
public class MockTermStateFactory { public class MockTermStateFactory {

View File

@ -0,0 +1,4 @@
{
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForDeltaUtil.java": "5115b12ac31537ce31d73c0a279df92060749a3a",
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForDeltaUtil.py": "db6154406e68b80d2c90116b5d0bfa9ba220762a"
}

View File

@ -1,4 +1,4 @@
{ {
"lucene/core/src/java/org/apache/lucene/codecs/lucene99/ForUtil.java": "1292ad354d255b1272ffd3db684aa2ddb2bc49ec", "lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForUtil.java": "159e82388346fde147924d5e15ca65df4dd63b9a",
"lucene/core/src/java/org/apache/lucene/codecs/lucene99/gen_ForUtil.py": "ab7b63a1b73986cc04e43de1c8f474b97aef5116" "lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForUtil.py": "66dc8813160feae2a37d8b50474f5f9830b6cb22"
} }

View File

@ -15,7 +15,7 @@
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.codecs.lucene99.Lucene99Codec; import org.apache.lucene.codecs.lucene912.Lucene912Codec;
/** Lucene Core. */ /** Lucene Core. */
@SuppressWarnings("module") // the test framework is compiled after the core... @SuppressWarnings("module") // the test framework is compiled after the core...
@ -33,6 +33,7 @@ module org.apache.lucene.core {
exports org.apache.lucene.codecs.lucene94; exports org.apache.lucene.codecs.lucene94;
exports org.apache.lucene.codecs.lucene95; exports org.apache.lucene.codecs.lucene95;
exports org.apache.lucene.codecs.lucene99; exports org.apache.lucene.codecs.lucene99;
exports org.apache.lucene.codecs.lucene912;
exports org.apache.lucene.codecs.perfield; exports org.apache.lucene.codecs.perfield;
exports org.apache.lucene.codecs; exports org.apache.lucene.codecs;
exports org.apache.lucene.document; exports org.apache.lucene.document;
@ -71,7 +72,7 @@ module org.apache.lucene.core {
provides org.apache.lucene.analysis.TokenizerFactory with provides org.apache.lucene.analysis.TokenizerFactory with
org.apache.lucene.analysis.standard.StandardTokenizerFactory; org.apache.lucene.analysis.standard.StandardTokenizerFactory;
provides org.apache.lucene.codecs.Codec with provides org.apache.lucene.codecs.Codec with
Lucene99Codec; Lucene912Codec;
provides org.apache.lucene.codecs.DocValuesFormat with provides org.apache.lucene.codecs.DocValuesFormat with
org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
provides org.apache.lucene.codecs.KnnVectorsFormat with provides org.apache.lucene.codecs.KnnVectorsFormat with
@ -79,7 +80,7 @@ module org.apache.lucene.core {
org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat, org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat,
org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat; org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat;
provides org.apache.lucene.codecs.PostingsFormat with provides org.apache.lucene.codecs.PostingsFormat with
org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat; org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
provides org.apache.lucene.index.SortFieldProvider with provides org.apache.lucene.index.SortFieldProvider with
org.apache.lucene.search.SortField.Provider, org.apache.lucene.search.SortField.Provider,
org.apache.lucene.search.SortedNumericSortField.Provider, org.apache.lucene.search.SortedNumericSortField.Provider,
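The provides ... with clauses are what the name-based SPI lookups resolve against; for example (existing lookup APIs, usage assumed):

    Codec codec = Codec.forName("Lucene912");                 // served by the Codec provider above
    PostingsFormat pf = PostingsFormat.forName("Lucene912");  // served by the PostingsFormat provider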

View File

@ -55,7 +55,7 @@ public abstract class Codec implements NamedSPILoader.NamedSPI {
return LOADER; return LOADER;
} }
static Codec defaultCodec = LOADER.lookup("Lucene99"); static Codec defaultCodec = LOADER.lookup("Lucene912");
} }
private final String name; private final String name;
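defaultCodec backs Codec.getDefault(); code that needs a particular codec for one index can still set it on the writer config, e.g. (a minimal sketch using existing APIs):

    Codec current = Codec.getDefault();  // resolves to the Lucene912 codec after this change
    IndexWriterConfig iwc = new IndexWriterConfig().setCodec(current);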

View File

@ -18,8 +18,6 @@ package org.apache.lucene.codecs;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator; import java.util.Comparator;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
@ -106,7 +104,7 @@ public final class CompetitiveImpactAccumulator {
} }
/** Get the set of competitive freq and norm pairs, ordered by increasing freq and norm. */ /** Get the set of competitive freq and norm pairs, ordered by increasing freq and norm. */
public Collection<Impact> getCompetitiveFreqNormPairs() { public List<Impact> getCompetitiveFreqNormPairs() {
List<Impact> impacts = new ArrayList<>(); List<Impact> impacts = new ArrayList<>();
int maxFreqForLowerNorms = 0; int maxFreqForLowerNorms = 0;
for (int i = 0; i < maxFreqs.length; ++i) { for (int i = 0; i < maxFreqs.length; ++i) {
@ -126,7 +124,7 @@ public final class CompetitiveImpactAccumulator {
for (Impact impact : impacts) { for (Impact impact : impacts) {
add(impact, freqNormPairs); add(impact, freqNormPairs);
} }
return Collections.unmodifiableSet(freqNormPairs); return List.copyOf(freqNormPairs);
} }
private void add(Impact newEntry, TreeSet<Impact> freqNormPairs) { private void add(Impact newEntry, TreeSet<Impact> freqNormPairs) {
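Besides the narrower return type, the switch from Collections.unmodifiableSet to List.copyOf trades view semantics for snapshot semantics. A self-contained, JDK-only illustration (not Lucene code):

    import java.util.*;

    public class CopyVsView {
      public static void main(String[] args) {
        TreeSet<Integer> backing = new TreeSet<>(List.of(1, 2));

        Set<Integer> view = Collections.unmodifiableSet(backing); // old style: live, read-only view
        List<Integer> copy = List.copyOf(backing);                // new style: immutable snapshot

        backing.add(3);
        System.out.println(view); // [1, 2, 3] -- reflects the later mutation
        System.out.println(copy); // [1, 2]    -- frozen at copy time
      }
    }

For callers of getCompetitiveFreqNormPairs the main practical difference is the explicitly ordered, immutable List contract.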

View File

@ -23,6 +23,7 @@ import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
import org.apache.lucene.index.BaseTermsEnum;
import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocIDMerger; import org.apache.lucene.index.DocIDMerger;
import org.apache.lucene.index.DocValues; import org.apache.lucene.index.DocValues;
@ -498,7 +499,7 @@ public abstract class DocValuesConsumer implements Closeable {
* {@link SortedDocValues#lookupOrd(int)} or {@link SortedSetDocValues#lookupOrd(long)} on every * {@link SortedDocValues#lookupOrd(int)} or {@link SortedSetDocValues#lookupOrd(long)} on every
* call to {@link TermsEnum#next()}. * call to {@link TermsEnum#next()}.
*/ */
private static class MergedTermsEnum extends TermsEnum { private static class MergedTermsEnum extends BaseTermsEnum {
private final TermsEnum[] subs; private final TermsEnum[] subs;
private final OrdinalMap ordinalMap; private final OrdinalMap ordinalMap;
@ -542,11 +543,6 @@ public abstract class DocValuesConsumer implements Closeable {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
@Override
public boolean seekExact(BytesRef text) throws IOException {
throw new UnsupportedOperationException();
}
@Override @Override
public SeekStatus seekCeil(BytesRef text) throws IOException { public SeekStatus seekCeil(BytesRef text) throws IOException {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
@ -557,11 +553,6 @@ public abstract class DocValuesConsumer implements Closeable {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
@Override
public void seekExact(BytesRef term, TermState state) throws IOException {
throw new UnsupportedOperationException();
}
@Override @Override
public int docFreq() throws IOException { public int docFreq() throws IOException {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();

View File

@ -20,17 +20,23 @@ package org.apache.lucene.codecs;
import java.io.Closeable; import java.io.Closeable;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.Objects;
import java.util.function.BiFunction;
import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.ByteVectorValues;
import org.apache.lucene.index.DocIDMerger; import org.apache.lucene.index.DocIDMerger;
import org.apache.lucene.index.DocsWithFieldSet;
import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.MergeState; import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.Sorter; import org.apache.lucene.index.Sorter;
import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorEncoding;
import org.apache.lucene.internal.hppc.IntIntHashMap;
import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.VectorScorer; import org.apache.lucene.search.VectorScorer;
import org.apache.lucene.util.Accountable; import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.IOFunction;
/** Writes vectors to an index. */ /** Writes vectors to an index. */
public abstract class KnnVectorsWriter implements Accountable, Closeable { public abstract class KnnVectorsWriter implements Accountable, Closeable {
@ -107,11 +113,11 @@ public abstract class KnnVectorsWriter implements Accountable, Closeable {
} }
/** Tracks state of one sub-reader that we are merging */ /** Tracks state of one sub-reader that we are merging */
private static class VectorValuesSub extends DocIDMerger.Sub { private static class FloatVectorValuesSub extends DocIDMerger.Sub {
final FloatVectorValues values; final FloatVectorValues values;
VectorValuesSub(MergeState.DocMap docMap, FloatVectorValues values) { FloatVectorValuesSub(MergeState.DocMap docMap, FloatVectorValues values) {
super(docMap); super(docMap);
this.values = values; this.values = values;
assert values.docID() == -1; assert values.docID() == -1;
@ -139,65 +145,139 @@ public abstract class KnnVectorsWriter implements Accountable, Closeable {
} }
} }
/**
* Given old doc ids and an id mapping, maps old ordinals to new ordinals. Note: this method returns
* nothing; outputs are written to the array and set parameters.
*
* @param oldDocIds the old or current document ordinals. Must not be null.
* @param sortMap the document sorting map for how to make the new ordinals. Must not be null.
* @param old2NewOrd int[] maps from old ord to new ord
* @param new2OldOrd int[] maps from new ord to old ord
* @param newDocsWithField set of new doc ids which have the value
*/
public static void mapOldOrdToNewOrd(
DocsWithFieldSet oldDocIds,
Sorter.DocMap sortMap,
int[] old2NewOrd,
int[] new2OldOrd,
DocsWithFieldSet newDocsWithField)
throws IOException {
// TODO: a similar function exists in IncrementalHnswGraphMerger#getNewOrdMapping
// maybe we can do a further refactoring
Objects.requireNonNull(oldDocIds);
Objects.requireNonNull(sortMap);
assert (old2NewOrd != null || new2OldOrd != null || newDocsWithField != null);
assert (old2NewOrd == null || old2NewOrd.length == oldDocIds.cardinality());
assert (new2OldOrd == null || new2OldOrd.length == oldDocIds.cardinality());
IntIntHashMap newIdToOldOrd = new IntIntHashMap();
DocIdSetIterator iterator = oldDocIds.iterator();
int[] newDocIds = new int[oldDocIds.cardinality()];
int oldOrd = 0;
for (int oldDocId = iterator.nextDoc();
oldDocId != DocIdSetIterator.NO_MORE_DOCS;
oldDocId = iterator.nextDoc()) {
int newId = sortMap.oldToNew(oldDocId);
newIdToOldOrd.put(newId, oldOrd);
newDocIds[oldOrd] = newId;
oldOrd++;
}
Arrays.sort(newDocIds);
int newOrd = 0;
for (int newDocId : newDocIds) {
int currOldOrd = newIdToOldOrd.get(newDocId);
if (old2NewOrd != null) {
old2NewOrd[currOldOrd] = newOrd;
}
if (new2OldOrd != null) {
new2OldOrd[newOrd] = currOldOrd;
}
if (newDocsWithField != null) {
newDocsWithField.add(newDocId);
}
newOrd++;
}
}
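What the helper computes can be seen on plain arrays; a self-contained toy example (not Lucene code) where a 16-doc segment is reversed by the sort:

    import java.util.Arrays;

    public class OrdRemapDemo {
      public static void main(String[] args) {
        int[] oldDocIds = {2, 5, 9};            // docs carrying a vector, old order; ordinals 0,1,2
        int[] oldToNewDoc = new int[16];
        for (int d = 0; d < 16; d++) {
          oldToNewDoc[d] = 15 - d;              // toy sort map: reverse the segment
        }

        int n = oldDocIds.length;
        int[][] pairs = new int[n][];           // {newDocId, oldOrd}
        for (int oldOrd = 0; oldOrd < n; oldOrd++) {
          pairs[oldOrd] = new int[] {oldToNewDoc[oldDocIds[oldOrd]], oldOrd};
        }
        Arrays.sort(pairs, (a, b) -> Integer.compare(a[0], b[0]));

        int[] old2New = new int[n];
        int[] new2Old = new int[n];
        for (int newOrd = 0; newOrd < n; newOrd++) {
          int oldOrd = pairs[newOrd][1];
          old2New[oldOrd] = newOrd;
          new2Old[newOrd] = oldOrd;
        }
        System.out.println(Arrays.toString(old2New)); // [2, 1, 0]
        System.out.println(Arrays.toString(new2Old)); // [2, 1, 0]
      }
    }

mapOldOrdToNewOrd performs the same renumbering, but streams the old doc ids from the DocsWithFieldSet iterator, uses an IntIntHashMap instead of a pair array, and fills only the outputs the caller passed in.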
/** View over multiple vector values supporting iterator-style access via DocIdMerger. */ /** View over multiple vector values supporting iterator-style access via DocIdMerger. */
public static final class MergedVectorValues { public static final class MergedVectorValues {
private MergedVectorValues() {} private MergedVectorValues() {}
/** Returns a merged view over all the segment's {@link FloatVectorValues}. */ private static void validateFieldEncoding(FieldInfo fieldInfo, VectorEncoding expected) {
public static FloatVectorValues mergeFloatVectorValues(
FieldInfo fieldInfo, MergeState mergeState) throws IOException {
assert fieldInfo != null && fieldInfo.hasVectorValues(); assert fieldInfo != null && fieldInfo.hasVectorValues();
if (fieldInfo.getVectorEncoding() != VectorEncoding.FLOAT32) { VectorEncoding fieldEncoding = fieldInfo.getVectorEncoding();
if (fieldEncoding != expected) {
throw new UnsupportedOperationException( throw new UnsupportedOperationException(
"Cannot merge vectors encoded as [" + fieldInfo.getVectorEncoding() + "] as FLOAT32"); "Cannot merge vectors encoded as [" + fieldEncoding + "] as " + expected);
} }
List<VectorValuesSub> subs = new ArrayList<>(); }
for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) {
KnnVectorsReader knnVectorsReader = mergeState.knnVectorsReaders[i]; private static <V, S> List<S> mergeVectorValues(
KnnVectorsReader[] knnVectorsReaders,
MergeState.DocMap[] docMaps,
IOFunction<KnnVectorsReader, V> valuesSupplier,
BiFunction<MergeState.DocMap, V, S> newSub)
throws IOException {
List<S> subs = new ArrayList<>();
for (int i = 0; i < knnVectorsReaders.length; i++) {
KnnVectorsReader knnVectorsReader = knnVectorsReaders[i];
if (knnVectorsReader != null) { if (knnVectorsReader != null) {
FloatVectorValues values = knnVectorsReader.getFloatVectorValues(fieldInfo.name); V values = valuesSupplier.apply(knnVectorsReader);
if (values != null) { if (values != null) {
subs.add(new VectorValuesSub(mergeState.docMaps[i], values)); subs.add(newSub.apply(docMaps[i], values));
} }
} }
} }
return new MergedFloat32VectorValues(subs, mergeState); return subs;
}
/** Returns a merged view over all the segment's {@link FloatVectorValues}. */
public static FloatVectorValues mergeFloatVectorValues(
FieldInfo fieldInfo, MergeState mergeState) throws IOException {
validateFieldEncoding(fieldInfo, VectorEncoding.FLOAT32);
return new MergedFloat32VectorValues(
mergeVectorValues(
mergeState.knnVectorsReaders,
mergeState.docMaps,
knnVectorsReader -> {
return knnVectorsReader.getFloatVectorValues(fieldInfo.name);
},
(docMap, values) -> {
return new FloatVectorValuesSub(docMap, values);
}),
mergeState);
} }
/** Returns a merged view over all the segment's {@link ByteVectorValues}. */ /** Returns a merged view over all the segment's {@link ByteVectorValues}. */
public static ByteVectorValues mergeByteVectorValues(FieldInfo fieldInfo, MergeState mergeState) public static ByteVectorValues mergeByteVectorValues(FieldInfo fieldInfo, MergeState mergeState)
throws IOException { throws IOException {
assert fieldInfo != null && fieldInfo.hasVectorValues(); validateFieldEncoding(fieldInfo, VectorEncoding.BYTE);
if (fieldInfo.getVectorEncoding() != VectorEncoding.BYTE) { return new MergedByteVectorValues(
throw new UnsupportedOperationException( mergeVectorValues(
"Cannot merge vectors encoded as [" + fieldInfo.getVectorEncoding() + "] as BYTE"); mergeState.knnVectorsReaders,
} mergeState.docMaps,
List<ByteVectorValuesSub> subs = new ArrayList<>(); knnVectorsReader -> {
for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) { return knnVectorsReader.getByteVectorValues(fieldInfo.name);
KnnVectorsReader knnVectorsReader = mergeState.knnVectorsReaders[i]; },
if (knnVectorsReader != null) { (docMap, values) -> {
ByteVectorValues values = knnVectorsReader.getByteVectorValues(fieldInfo.name); return new ByteVectorValuesSub(docMap, values);
if (values != null) { }),
subs.add(new ByteVectorValuesSub(mergeState.docMaps[i], values)); mergeState);
}
}
}
return new MergedByteVectorValues(subs, mergeState);
} }
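The refactor folds the two nearly identical reader loops into one generic helper that takes the value accessor and the sub-wrapper as functions. Reduced to a toy, self-contained form (not Lucene code), the pattern looks like this:

    import java.util.ArrayList;
    import java.util.List;
    import java.util.function.Function;

    public class MergeLoopDemo {
      // one generic loop handles the null checks; the accessor decides what gets collected
      static <V> List<V> collectNonNull(String[] readers, Function<String, V> valuesSupplier) {
        List<V> subs = new ArrayList<>();
        for (String reader : readers) {
          if (reader == null) continue;          // segment without a vectors reader
          V values = valuesSupplier.apply(reader);
          if (values != null) {                  // segment where the field has no vectors
            subs.add(values);
          }
        }
        return subs;
      }

      public static void main(String[] args) {
        String[] readers = {"seg0", null, "seg2"};
        System.out.println(collectNonNull(readers, String::length));      // float-like variant
        System.out.println(collectNonNull(readers, String::toUpperCase)); // byte-like variant
      }
    }

In the real helper the second function (newSub) additionally pairs each value with its per-segment DocMap, which is why it is a BiFunction rather than a plain Function.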
static class MergedFloat32VectorValues extends FloatVectorValues { static class MergedFloat32VectorValues extends FloatVectorValues {
private final List<VectorValuesSub> subs; private final List<FloatVectorValuesSub> subs;
private final DocIDMerger<VectorValuesSub> docIdMerger; private final DocIDMerger<FloatVectorValuesSub> docIdMerger;
private final int size; private final int size;
private int docId; private int docId;
VectorValuesSub current; FloatVectorValuesSub current;
private MergedFloat32VectorValues(List<VectorValuesSub> subs, MergeState mergeState) private MergedFloat32VectorValues(List<FloatVectorValuesSub> subs, MergeState mergeState)
throws IOException { throws IOException {
this.subs = subs; this.subs = subs;
docIdMerger = DocIDMerger.of(subs, mergeState.needsIndexSort); docIdMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);
int totalSize = 0; int totalSize = 0;
for (VectorValuesSub sub : subs) { for (FloatVectorValuesSub sub : subs) {
totalSize += sub.values.size(); totalSize += sub.values.size();
} }
size = totalSize; size = totalSize;

Some files were not shown because too many files have changed in this diff.