mirror of https://github.com/apache/lucene.git

Commit 0a0701995a: Merge branch 'apache:main' into bpv21_main

@@ -23,6 +23,7 @@ Apache Lucene is a high-performance, full-featured text search engine library
 written in Java.
 
 [![Build Status](https://ci-builds.apache.org/job/Lucene/job/Lucene-Artifacts-main/badge/icon?subject=Lucene)](https://ci-builds.apache.org/job/Lucene/job/Lucene-Artifacts-main/)
+[![Revved up by Develocity](https://img.shields.io/badge/Revved%20up%20by-Develocity-06A0CE?logo=Gradle&labelColor=02303A)](https://ge.apache.org/scans?search.buildToolType=gradle&search.rootProjectNames=lucene-root)
 
 ## Online Documentation
@@ -41,7 +41,7 @@ import jdk.jfr.consumer.RecordingFile;
  */
 public class ProfileResults {
   /** Formats a frame to a formatted line. This is deduplicated on! */
-  static String frameToString(RecordedFrame frame, boolean lineNumbers) {
+  static String frameToString(RecordedFrame frame, boolean lineNumbers, boolean frameTypes) {
     StringBuilder builder = new StringBuilder();
     RecordedMethod method = frame.getMethod();
     RecordedClass clazz = method.getType();
@@ -55,13 +55,14 @@ public class ProfileResults {
     builder.append("#");
     builder.append(method.getName());
     builder.append("()");
-    if (lineNumbers) {
+    if (lineNumbers && frame.getLineNumber() != -1) {
       builder.append(":");
-      if (frame.getLineNumber() == -1) {
-        builder.append("(" + frame.getType() + " code)");
-      } else {
-        builder.append(frame.getLineNumber());
-      }
+      builder.append(frame.getLineNumber());
+    }
+    if (clazz != null && frameTypes) {
+      builder.append(" [");
+      builder.append(frame.getType());
+      builder.append(" code]");
     }
     return builder.toString();
   }
@@ -77,6 +78,8 @@ public class ProfileResults {
   public static final String COUNT_DEFAULT = "10";
   public static final String LINENUMBERS_KEY = "tests.profile.linenumbers";
   public static final String LINENUMBERS_DEFAULT = "false";
+  public static final String FRAMETYPES_KEY = "tests.profile.frametypes";
+  public static final String FRAMETYPES_DEFAULT = "true";
 
   /**
    * Driver method, for testing standalone.
@@ -92,7 +95,8 @@ public class ProfileResults {
         System.getProperty(MODE_KEY, MODE_DEFAULT),
         Integer.parseInt(System.getProperty(STACKSIZE_KEY, STACKSIZE_DEFAULT)),
         Integer.parseInt(System.getProperty(COUNT_KEY, COUNT_DEFAULT)),
-        Boolean.parseBoolean(System.getProperty(LINENUMBERS_KEY, LINENUMBERS_DEFAULT)));
+        Boolean.parseBoolean(System.getProperty(LINENUMBERS_KEY, LINENUMBERS_DEFAULT)),
+        Boolean.parseBoolean(System.getProperty(FRAMETYPES_KEY, FRAMETYPES_DEFAULT)));
   }
 
   /** true if we care about this event */
@@ -152,7 +156,12 @@ public class ProfileResults {
 
   /** Process all the JFR files passed in args and print a merged summary. */
   public static void printReport(
-      List<String> files, String mode, int stacksize, int count, boolean lineNumbers)
+      List<String> files,
+      String mode,
+      int stacksize,
+      int count,
+      boolean lineNumbers,
+      boolean frameTypes)
       throws IOException {
     if (!"cpu".equals(mode) && !"heap".equals(mode)) {
       throw new IllegalArgumentException("tests.profile.mode must be one of (cpu,heap)");
@@ -181,7 +190,7 @@ public class ProfileResults {
       if (stack.length() > 0) {
         stack.append("\n").append(framePadding).append(" at ");
       }
-      stack.append(frameToString(trace.getFrames().get(i), lineNumbers));
+      stack.append(frameToString(trace.getFrames().get(i), lineNumbers, frameTypes));
     }
     String line = stack.toString();
     SimpleEntry<String, Long> entry =
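For orientation, a hedged sketch of calling the reporter standalone with the new flag. The JFR file path is hypothetical and the literal argument values simply mirror the defaults shown above; this is not part of the commit itself:

```java
import java.util.List;

// Standalone invocation sketch, assumed to sit next to ProfileResults.
public class PrintProfileReportDemo {
  public static void main(String[] args) throws Exception {
    ProfileResults.printReport(
        List.of("build/tmp/tests-profile/test.jfr"), // hypothetical recording file
        "cpu",                                       // tests.profile.mode: cpu or heap
        1,                                           // stacksize: frames aggregated per entry (assumed default)
        10,                                          // count: rows to print (COUNT_DEFAULT)
        false,                                       // lineNumbers (LINENUMBERS_DEFAULT)
        true);                                       // frameTypes: appends e.g. " [JIT compiled code]"
  }
}
```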
build.gradle (11 changes)
@@ -80,6 +80,9 @@ ext {
   // Minimum Java version required to compile and run Lucene.
   minJavaVersion = JavaVersion.toVersion(deps.versions.minJava.get())
 
+  // also change this in extractor tool: ExtractForeignAPI
+  vectorIncubatorJavaVersions = [ JavaVersion.VERSION_21, JavaVersion.VERSION_22, JavaVersion.VERSION_23 ] as Set
+
   // snapshot build marker used in scripts.
   snapshotBuild = version.contains("SNAPSHOT")
@@ -117,10 +120,6 @@ apply from: file('gradle/generation/local-settings.gradle')
 // Make sure the build environment is consistent.
 apply from: file('gradle/validation/check-environment.gradle')
 
-// IDE support, settings and specials.
-apply from: file('gradle/ide/intellij-idea.gradle')
-apply from: file('gradle/ide/eclipse.gradle')
-
 // Set up defaults and configure aspects for certain modules or functionality
 // (java, tests)
 apply from: file('gradle/java/folder-layout.gradle')
@@ -133,6 +132,10 @@ apply from: file('gradle/testing/alternative-jdk-support.gradle')
 apply from: file('gradle/java/jar-manifest.gradle')
 apply from: file('gradle/java/modules.gradle')
 
+// IDE support, settings and specials.
+apply from: file('gradle/ide/intellij-idea.gradle')
+apply from: file('gradle/ide/eclipse.gradle')
+
 // Maven artifact publishing.
 apply from: file('gradle/maven/publications.gradle')
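The `vectorIncubatorJavaVersions` set added to build.gradle (and removed from the vectorization script further below) lists the runtimes allowed to use the incubating Panama Vector API. A rough Java-side sketch of the kind of gate this enables; the class and method here are illustrative assumptions, not Lucene's actual vectorization provider:

```java
import java.util.Set;

// Illustrative gate only; Lucene's real check lives in its vectorization provider.
final class PanamaGateSketch {
  // Mirrors vectorIncubatorJavaVersions = [21, 22, 23] from build.gradle.
  private static final Set<Integer> SUPPORTED_FEATURE_VERSIONS = Set.of(21, 22, 23);

  static boolean panamaVectorizationAllowed() {
    return SUPPORTED_FEATURE_VERSIONS.contains(Runtime.version().feature());
  }
}
```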
@@ -67,6 +67,13 @@
   </maintainer>
 
   <!-- NOTE: please insert releases in numeric order, NOT chronologically. -->
+  <release>
+    <Version>
+      <name>lucene-9.11.1</name>
+      <created>2024-06-27</created>
+      <revision>9.11.1</revision>
+    </Version>
+  </release>
   <release>
     <Version>
       <name>lucene-9.11.0</name>
dev-tools/scripts/diff_lucene_changes.py (new file)
@@ -0,0 +1,78 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import re
import subprocess
import sys
import tempfile
import urllib.request

'''
A simple tool to see diffs between main's version of CHANGES.txt entries for
a given release vs the stable branch's version. It's best to keep these 1)
identical and 2) matching what changes were actually backported to be honest
to users and avoid future annoying conflicts on backport.
'''

# e.g. python3 -u diff_lucene_changes.py branch_9_9 main 9.9.0
#

def get_changes_url(branch_name):
  if os.path.isdir(branch_name):
    url = f'file://{branch_name}/lucene/CHANGES.txt'
  else:
    url = f'https://raw.githubusercontent.com/apache/lucene/{branch_name}/lucene/CHANGES.txt'
  print(f'NOTE: resolving {branch_name} --> {url}')
  return url

def extract_release_section(changes_txt, release_name):
  return re.search(f'=======+ Lucene {re.escape(release_name)} =======+(.*?)=======+ Lucene .*? =======+$',
                   changes_txt.decode('utf-8'), re.MULTILINE | re.DOTALL).group(1).encode('utf-8')

def main():
  if len(sys.argv) < 3 or len(sys.argv) > 5:
    print('\nUsage: python3 -u dev-tools/scripts/diff_lucene_changes.py <branch1-or-local-clone> <branch2-or-local-clone> <release-name> [diff-commandline-extras]\n')
    print('  e.g.: python3 -u dev-tools/scripts/diff_lucene_changes.py branch_9_9 /l/trunk 9.9.0 "-w"\n')
    sys.exit(1)

  branch1 = sys.argv[1]
  branch2 = sys.argv[2]
  release_name = sys.argv[3]

  if len(sys.argv) > 4:
    diff_cl_extras = [sys.argv[4]]
  else:
    diff_cl_extras = []

  branch1_changes = extract_release_section(urllib.request.urlopen(get_changes_url(branch1)).read(),
                                            release_name)
  branch2_changes = extract_release_section(urllib.request.urlopen(get_changes_url(branch2)).read(),
                                            release_name)

  with tempfile.NamedTemporaryFile() as f1, tempfile.NamedTemporaryFile() as f2:
    f1.write(branch1_changes)
    f2.write(branch2_changes)

    command = ['diff'] + diff_cl_extras + [f1.name, f2.name]

    # diff returns non-zero exit status when there are diffs, so don't pass check=True
    print(subprocess.run(command, check=False, capture_output=True).stdout.decode('utf-8'))

if __name__ == '__main__':
  main()
@@ -17,13 +17,6 @@
 
 def resources = scriptResources(buildscript)
 
-configure(rootProject) {
-  ext {
-    // also change this in extractor tool: ExtractForeignAPI
-    vectorIncubatorJavaVersions = [ JavaVersion.VERSION_21, JavaVersion.VERSION_22 ] as Set
-  }
-}
-
 configure(project(":lucene:core")) {
   ext {
     apijars = layout.projectDirectory.dir("src/generated/jdk")
@ -23,7 +23,7 @@ configure(project(":lucene:core")) {
|
|||
description "Regenerate gen_ForUtil.py"
|
||||
group "generation"
|
||||
|
||||
def genDir = file("src/java/org/apache/lucene/codecs/lucene99")
|
||||
def genDir = file("src/java/org/apache/lucene/codecs/lucene912")
|
||||
def genScript = file("${genDir}/gen_ForUtil.py")
|
||||
def genOutput = file("${genDir}/ForUtil.java")
|
||||
|
||||
|
@ -43,6 +43,31 @@ configure(project(":lucene:core")) {
|
|||
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
|
||||
mustRunBefore: [ "compileJava" ]
|
||||
])
|
||||
|
||||
task generateForDeltaUtilInternal() {
|
||||
description "Regenerate gen_ForDeltaUtil.py"
|
||||
group "generation"
|
||||
|
||||
def genDir = file("src/java/org/apache/lucene/codecs/lucene912")
|
||||
def genScript = file("${genDir}/gen_ForDeltaUtil.py")
|
||||
def genOutput = file("${genDir}/ForDeltaUtil.java")
|
||||
|
||||
inputs.file genScript
|
||||
outputs.file genOutput
|
||||
|
||||
doLast {
|
||||
quietExec {
|
||||
workingDir genDir
|
||||
executable project.externalTool("python3")
|
||||
args = [ '-B', genScript ]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
regenerate.dependsOn wrapWithPersistentChecksums(generateForDeltaUtilInternal, [
|
||||
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
|
||||
mustRunBefore: [ "compileJava" ]
|
||||
])
|
||||
}
|
||||
|
||||
configure(project(":lucene:backward-codecs")) {
|
||||
|
@ -96,5 +121,30 @@ configure(project(":lucene:backward-codecs")) {
|
|||
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
|
||||
mustRunBefore: [ "compileJava" ]
|
||||
])
|
||||
|
||||
task generateForUtil99Internal() {
|
||||
description "Regenerate gen_ForUtil.py"
|
||||
group "generation"
|
||||
|
||||
def genDir = file("src/java/org/apache/lucene/backward_codecs/lucene99")
|
||||
def genScript = file("${genDir}/gen_ForUtil.py")
|
||||
def genOutput = file("${genDir}/ForUtil.java")
|
||||
|
||||
inputs.file genScript
|
||||
outputs.file genOutput
|
||||
|
||||
doLast {
|
||||
quietExec {
|
||||
workingDir genDir
|
||||
executable project.externalTool("python3")
|
||||
args = [ '-B', genScript ]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
regenerate.dependsOn wrapWithPersistentChecksums(generateForUtil99Internal, [
|
||||
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
|
||||
mustRunBefore: [ "compileJava" ]
|
||||
])
|
||||
}
|
||||
|
||||
|
|
|
@ -65,10 +65,8 @@ configure(project(":lucene:analysis:icu")) {
|
|||
icupkg = file("${icuBinDir}/icupkg")
|
||||
}
|
||||
|
||||
// Resolve version lazily (can't resolve at configuration time).
|
||||
def icu4jVersionProvider = project.provider { getVersion('com.ibm.icu', 'icu4j') }
|
||||
// lazy gstring with ICU version.
|
||||
def icu4jVersion = "${-> icu4jVersionProvider.get()}"
|
||||
def icu4jVersion = deps.icu4j.get().version
|
||||
|
||||
def icuCompileTask = Os.isFamily(Os.FAMILY_WINDOWS) ? "compileIcuWindows" : "compileIcuLinux"
|
||||
|
||||
|
|
|
@ -22,10 +22,11 @@ import org.gradle.plugins.ide.eclipse.model.ClasspathEntry
|
|||
def resources = scriptResources(buildscript)
|
||||
|
||||
configure(rootProject) {
|
||||
plugins.withType(JavaPlugin) {
|
||||
apply plugin: "eclipse"
|
||||
if (gradle.startParameter.taskNames.contains("eclipse")) {
|
||||
project.pluginManager.apply("java-base")
|
||||
project.pluginManager.apply("eclipse")
|
||||
|
||||
def eclipseJavaVersion = propertyOrDefault("eclipse.javaVersion", rootProject.minJavaVersion)
|
||||
def eclipseJavaVersion = propertyOrDefault("eclipse.javaVersion", deps.versions.minJava.get())
|
||||
def relativize = { other -> rootProject.rootDir.relativePath(other).toString() }
|
||||
|
||||
eclipse {
|
||||
|
@ -107,7 +108,7 @@ configure(rootProject) {
|
|||
|
||||
eclipseJdt {
|
||||
enabled = false
|
||||
dependsOn 'luceneEclipse'
|
||||
dependsOn 'luceneEclipseJdt'
|
||||
}
|
||||
|
||||
eclipseClasspath {
|
||||
|
|
|
@ -75,6 +75,18 @@ configure(rootProject) {
|
|||
it.dependsOn(":versionCatalogFormatDeps")
|
||||
}
|
||||
|
||||
// correct crlf/ default encoding after version catalog formatting finishes.
|
||||
tasks.matching {
|
||||
it.path in [
|
||||
":versionCatalogFormatDeps"
|
||||
]
|
||||
}.configureEach {
|
||||
it.doLast {
|
||||
ant.fixcrlf(file: it.catalogFile.get().asFile,
|
||||
eol: "lf", fixlast: "true", encoding: "UTF-8")
|
||||
}
|
||||
}
|
||||
|
||||
tasks.matching {
|
||||
it.path in [
|
||||
":versionCatalogUpdateDeps"
|
||||
|
|
|
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-@defaultMessage Spawns threads with vague names; use a custom thread factory (Lucene's NamedThreadFactory, Solr's SolrNamedThreadFactory) and name threads so that you can tell (by its name) which executor it is associated with
+@defaultMessage Spawns threads with vague names; use a custom thread factory (Lucene's NamedThreadFactory) and name threads so that you can tell (by its name) which executor it is associated with
 java.util.concurrent.Executors#newFixedThreadPool(int)
 java.util.concurrent.Executors#newSingleThreadExecutor()
 java.util.concurrent.Executors#newCachedThreadPool()
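The forbidden-APIs message above points at Lucene's NamedThreadFactory; a minimal sketch of the compliant pattern (the pool size and name prefix are arbitrary examples):

```java
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.lucene.util.NamedThreadFactory;

class NamedPoolExample {
  // Instead of the forbidden Executors.newFixedThreadPool(int), supply a factory so
  // the pool's threads carry a recognizable "searcher" prefix in their names.
  static ExecutorService newSearchPool() {
    return Executors.newFixedThreadPool(4, new NamedThreadFactory("searcher"));
  }
}
```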
|
|
|
@ -20,6 +20,10 @@
|
|||
// 2) notice file
|
||||
// 3) checksum validation/ generation.
|
||||
|
||||
// WARNING: The tasks in this file share internal state between tasks without using files.
|
||||
// Because of this all tasks here must always execute together, so they cannot define task outputs.
|
||||
// TODO: Rewrite the internal state to use state files containing the ext.jarInfos and its referencedFiles
|
||||
|
||||
// This should be false only for debugging.
|
||||
def failOnError = true
|
||||
|
||||
|
@ -194,13 +198,6 @@ subprojects {
|
|||
description = "Validate license and notice files of dependencies"
|
||||
dependsOn collectJarInfos
|
||||
|
||||
def outputFileName = 'validateJarLicenses'
|
||||
inputs.dir(file(project.rootDir.path + '/lucene/licenses'))
|
||||
.withPropertyName('licenses')
|
||||
.withPathSensitivity(PathSensitivity.RELATIVE)
|
||||
outputs.file(layout.buildDirectory.file(outputFileName))
|
||||
.withPropertyName('validateJarLicensesResult')
|
||||
|
||||
doLast {
|
||||
def errors = []
|
||||
jarInfos.each { dep ->
|
||||
|
@ -246,9 +243,7 @@ subprojects {
|
|||
}
|
||||
}
|
||||
}
|
||||
// Required to take advantage of incremental building and the build cache
|
||||
def f = new File(project.buildDir.path + "/" + outputFileName)
|
||||
f.write(errors.toString(), "UTF-8")
|
||||
|
||||
if (errors) {
|
||||
def msg = "Certain license/ notice files are missing:\n - " + errors.join("\n - ")
|
||||
if (failOnError) {
|
||||
|
|
|
@ -80,10 +80,6 @@ API Changes
|
|||
* GITHUB#12875: Ensure token position is always increased in PathHierarchyTokenizer and ReversePathHierarchyTokenizer
|
||||
and resulting tokens do not overlap. (Michael Froh, Lukáš Vlček)
|
||||
|
||||
* GITHUB#12624, GITHUB#12831: Allow FSTCompiler to stream to any DataOutput while building, and
|
||||
make compile() only return the FSTMetadata. For on-heap (default) use case, please use
|
||||
FST.fromFSTReader(fstMetadata, fstCompiler.getFSTReader()) to create the FST. (Anh Dung Bui)
|
||||
|
||||
* GITHUB#13146, GITHUB#13148: Remove ByteBufferIndexInput and only use MemorySegment APIs
|
||||
for MMapDirectory. (Uwe Schindler)
|
||||
|
||||
|
@ -112,6 +108,11 @@ API Changes
|
|||
|
||||
* GITHUB#13410: Removed Scorer#getWeight (Sanjay Dutt, Adrien Grand)
|
||||
|
||||
* GITHUB#13499: Remove deprecated TopScoreDocCollector + TopFieldCollector methods (#create, #createSharedManager) (Jakub Slowinski)
|
||||
|
||||
* GITHUB#13632: CandidateMatcher public matching functions (Bryan Jacobowitz)
|
||||
|
||||
|
||||
New Features
|
||||
---------------------
|
||||
|
||||
|
@ -133,6 +134,16 @@ New Features
|
|||
DocValuesSkipper abstraction. A new flag is added to FieldType.java that configures whether
|
||||
to create a "skip index" for doc values. (Ignacio Vera)
|
||||
|
||||
* GITHUB#13563: Add levels to doc values skip index. (Ignacio Vera)
|
||||
|
||||
* GITHUB#13597: Align doc value skipper interval boundaries when an interval contains a constant
|
||||
value. (Ignacio Vera)
|
||||
|
||||
* GITHUB#13604: Add Kmeans clustering on vectors (Mayya Sharipova, Jim Ferenczi, Tom Veasey)
|
||||
|
||||
* GITHUB#13592: Take advantage of the doc value skipper when it is primary sort in SortedNumericDocValuesRangeQuery
|
||||
and SortedSetDocValuesRangeQuery. (Ignacio Vera)
|
||||
|
||||
Improvements
|
||||
---------------------
|
||||
|
||||
|
@ -168,6 +179,8 @@ Optimizations
|
|||
|
||||
* GITHUB#12552: Make FSTPostingsFormat load FSTs off-heap. (Tony X)
|
||||
|
||||
* GITHUB#13672: Leverage doc value skip lists in DocValuesRewriteMethod if indexed. (Greg Miller)
|
||||
|
||||
Bug Fixes
|
||||
---------------------
|
||||
|
||||
|
@ -205,6 +218,9 @@ Changes in Backwards Compatibility Policy
|
|||
* GITHUB#13230: Remove the Kp and Lovins snowball algorithms which are not supported
|
||||
or intended for general use. (Robert Muir)
|
||||
|
||||
* GITHUB#13602: SearchWithCollectorTask no longer supports the `collector.class` config parameter to load a custom
|
||||
collector implementation. `collector.manager.class` allows users to load a collector manager instead. (Luca Cavanna)
|
||||
|
||||
Other
|
||||
---------------------
|
||||
|
||||
|
@ -243,22 +259,71 @@ Other
|
|||
|
||||
* GITHUB#13332: Improve MissingDoclet linter to check records correctly. (Uwe Schindler)
|
||||
|
||||
* GITHUB#13499: Remove usage of TopScoreDocCollector + TopFieldCollector deprecated methods (#create, #createSharedManager) (Jakub Slowinski)
|
||||
|
||||
Build
|
||||
---------------------
|
||||
|
||||
* GITHUB#13649: Fix eclipse ide settings generation #13649 (Uwe Schindler, Dawid Weiss)
|
||||
|
||||
======================== Lucene 9.12.0 =======================
|
||||
|
||||
API Changes
|
||||
---------------------
|
||||
|
||||
* GITHUB#13281: Mark COSINE VectorSimilarityFunction as deprecated. (Pulkit Gupta)
|
||||
|
||||
* GITHUB#13469: Expose FlatVectorsFormat as a first-class format; can be configured using a custom Codec. (Michael Sokolov)
|
||||
|
||||
* GITHUB#13612: Hunspell: add Suggester#proceedPastRep to avoid losing relevant suggestions. (Peter Gromov)
|
||||
|
||||
* GITHUB#13603: Introduced `IndexSearcher#searchLeaf(LeafReaderContext, Weight, Collector)` protected method to
|
||||
facilitate customizing per-leaf behavior of search without requiring to override
|
||||
`search(LeafReaderContext[], Weight, Collector)` which requires overriding the entire loop across the leaves (Luca Cavanna)
|
||||
|
||||
* GITHUB#13559: Add BitSet#nextSetBit(int, int) to get the index of the first set bit in range. (Egor Potemkin)
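A small sketch of how the new range variant reads; FixedBitSet is one of the org.apache.lucene.util.BitSet implementations, the bounds are arbitrary, and the "no bit found" sentinel is assumed to follow the existing nextSetBit contract:

```java
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.FixedBitSet;

class NextSetBitInRangeExample {
  static void demo() {
    FixedBitSet bits = new FixedBitSet(1024);
    bits.set(130);

    // New in this release: search only within [from, to).
    int first = bits.nextSetBit(100, 200);
    if (first != DocIdSetIterator.NO_MORE_DOCS) {
      System.out.println("first set bit in window: " + first); // 130
    }
  }
}
```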
|
||||
|
||||
* GITHUB#13568: Add DoubleValuesSource#toSortableLongDoubleValuesSource and
|
||||
MultiDoubleValuesSource#toSortableMultiLongValuesSource methods. (Shradha Shankar)
|
||||
|
||||
* GITHUB#13568: Add CollectorOwner class that wraps CollectorManager, and handles list of Collectors and results.
|
||||
Add IndexSearcher#search method that takes CollectorOwner. (Egor Potemkin)
|
||||
|
||||
* GITHUB#13568: Add DrillSideways#search method that supports any collector types for any drill-sideways dimensions
|
||||
or drill-down. (Egor Potemkin)
|
||||
|
||||
New Features
|
||||
---------------------
|
||||
(No changes)
|
||||
|
||||
* GITHUB#13430: Allow configuring the search concurrency via
|
||||
TieredMergePolicy#setTargetSearchConcurrency. This in-turn instructs the
|
||||
merge policy to try to have at least this number of segments on the highest
|
||||
tier. (Adrien Grand, Carlos Delgado)
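A minimal configuration sketch for the new setting; the concurrency value is an arbitrary example and the IndexWriterConfig wiring is ordinary Lucene usage, not part of this entry:

```java
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.TieredMergePolicy;

class SearchConcurrencyConfig {
  static IndexWriterConfig newConfig() {
    TieredMergePolicy mergePolicy = new TieredMergePolicy();
    // Ask the policy to keep at least ~8 segments on the highest tier so that
    // 8 concurrent search slices each have a segment to work on.
    mergePolicy.setTargetSearchConcurrency(8);
    return new IndexWriterConfig().setMergePolicy(mergePolicy);
  }
}
```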
|
||||
|
||||
* GITHUB#13517: Allow configuring the search concurrency on LogDocMergePolicy
|
||||
and LogByteSizeMergePolicy via a new #setTargetConcurrency setter.
|
||||
(Adrien Grand)
|
||||
|
||||
* GITHUB#13568: Add sandbox facets module to compute facets while collecting. (Egor Potemkin, Shradha Shankar)
|
||||
|
||||
* GITHUB#13678: Add support JDK 23 to the Panama Vectorization Provider. (Chris Hegarty)
|
||||
|
||||
Improvements
|
||||
---------------------
|
||||
(No changes)
|
||||
|
||||
* GITHUB#13548: Refactor and javadoc update for KNN vector writer classes. (Patrick Zhai)
|
||||
|
||||
* GITHUB#13562: Add Intervals.regexp and Intervals.range methods to produce IntervalsSource
|
||||
for regexp and range queries. (Mayya Sharipova)
|
||||
|
||||
* GITHUB#13625: Remove BitSet#nextSetBit code duplication. (Greg Miller)
|
||||
|
||||
* GITHUB#13285: Early terminate graph searches of AbstractVectorSimilarityQuery to follow timeout set from
|
||||
IndexSearcher#setTimeout(QueryTimeout). (Kaival Parikh)
|
||||
|
||||
* GITHUB#13633: Add ability to read/write knn vector values to a MemoryIndex. (Ben Trent)
|
||||
|
||||
* GITHUB#12627: patch HNSW graphs to improve reachability of all nodes from entry points
|
||||
|
||||
* GITHUB#13201: Better cost estimation on MultiTermQuery over few terms. (Michael Froh)
|
||||
|
||||
Optimizations
|
||||
---------------------
|
||||
|
@ -277,16 +342,100 @@ Optimizations
|
|||
|
||||
* GITHUB#12941: Don't preserve auxiliary buffer contents in LSBRadixSorter if it grows. (Stefan Vodita)
|
||||
|
||||
* GITHUB#13175: Stop double-checking priority queue inserts in some FacetCount classes. (Jakub Slowinski)
|
||||
|
||||
* GITHUB#13538: Slightly reduce heap usage for HNSW and scalar quantized vector writers. (Ben Trent)
|
||||
|
||||
* GITHUB#12100: WordBreakSpellChecker.suggestWordBreaks now does a breadth first search, allowing it to return
|
||||
better matches with fewer evaluations (hossman)
|
||||
|
||||
* GITHUB#13582: Stop requiring MaxScoreBulkScorer's outer window from having at
|
||||
least INNER_WINDOW_SIZE docs. (Adrien Grand)
|
||||
|
||||
* GITHUB#13570, GITHUB#13574, GITHUB#13535: Avoid performance degradation with closing shared Arenas.
|
||||
Closing many individual index files can potentially lead to a degradation in execution performance.
|
||||
Index files are mmapped one-to-one with the JDK's foreign shared Arena. The JVM deoptimizes the top
|
||||
few frames of all threads when closing a shared Arena (see JDK-8335480). We mitigate this situation
|
||||
when running with JDK 21 and greater, by 1) using a confined Arena where appropriate, and 2) grouping
|
||||
files from the same segment to a single shared Arena.
|
||||
A system property has been added that allows to control the total maximum number of mmapped files
|
||||
that may be associated with a single shared Arena. For example, to set the max number of permits to
|
||||
256, pass the following on the command line
|
||||
-Dorg.apache.lucene.store.MMapDirectory.sharedArenaMaxPermits=256. Setting a value of 1 associates
|
||||
a single file to a single shared arena.
|
||||
(Chris Hegarty, Michael Gibney, Uwe Schindler)
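To make the new knob concrete, a small sketch of where it applies; the index path and permit value are arbitrary, and the property itself comes straight from the entry above:

```java
import java.io.IOException;
import java.nio.file.Path;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;

class SharedArenaPermitsExample {
  // Start the JVM with, for example:
  //   -Dorg.apache.lucene.store.MMapDirectory.sharedArenaMaxPermits=256
  // to cap how many mmapped files may share one Arena (1 = a dedicated arena per file).
  static Directory open(Path indexPath) throws IOException {
    return new MMapDirectory(indexPath); // illustrative path argument
  }
}
```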
|
||||
|
||||
* GITHUB#13585: Lucene912PostingsFormat, the new default postings format, now
|
||||
only has 2 levels of skip data, which are inlined into postings instead of
|
||||
being stored at the end of postings lists. This translates into better
|
||||
performance for queries that need skipping such as conjunctions.
|
||||
(Adrien Grand)
|
||||
|
||||
* GITHUB#13581: OnHeapHnswGraph no longer allocates a lock for every graph node (Mike Sokolov)
|
||||
|
||||
* GITHUB#13636, GITHUB#13658: Optimizations to the decoding logic of blocks of
|
||||
postings. (Adrien Grand, Uwe Schindler, Greg Miller)
|
||||
|
||||
* GITHUB##13644: Improve NumericComparator competitive iterator logic by comparing the missing value with the top
|
||||
value even after the hit queue is full (Pan Guixin)
|
||||
|
||||
* GITHUB#13587: Use Max WAND optimizations with ToParentBlockJoinQuery when using ScoreMode.Max (Mike Pellegrini)
|
||||
|
||||
Changes in runtime behavior
|
||||
---------------------
|
||||
|
||||
* GITHUB#13472: When an executor is provided to the IndexSearcher constructor, the searcher now executes tasks on the
|
||||
thread that invoked a search as well as its configured executor. Users should reduce the executor's thread-count by 1
|
||||
to retain the previous level of parallelism. Moreover, it is now possible to start searches from the same executor
|
||||
that is configured in the IndexSearcher without risk of deadlocking. A separate executor for starting searches is no
|
||||
longer required. (Armin Braun)
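A short sketch of the described setup, sized per the guidance above; reader creation is omitted and the thread count is an arbitrary example:

```java
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.NamedThreadFactory;

class SearcherWithExecutorExample {
  // For an overall parallelism of 8: the calling thread now participates in execution,
  // so the pool only needs 7 workers (one fewer than before this change).
  static IndexSearcher newSearcher(IndexReader reader) {
    ExecutorService executor = Executors.newFixedThreadPool(7, new NamedThreadFactory("search"));
    return new IndexSearcher(reader, executor);
  }
}
```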
|
||||
|
||||
Bug Fixes
|
||||
---------------------
|
||||
|
||||
* GITHUB#13384: Fix highlighter to use longer passages instead of shorter individual terms. (Zack Kendall)
|
||||
|
||||
* GITHUB#13463: Address bug in MultiLeafKnnCollector causing #minCompetitiveSimilarity to stay artificially low in
|
||||
some corner cases. (Greg Miller)
|
||||
|
||||
* GITHUB#13553: Correct RamUsageEstimate for scalar quantized knn vector formats so that raw vectors are correctly
|
||||
accounted for. (Ben Trent)
|
||||
|
||||
* GITHUB#13615: Correct scalar quantization when used in conjunction with COSINE similarity. Vectors are normalized
|
||||
before quantization to ensure the cosine similarity is correctly calculated. (Ben Trent)
|
||||
|
||||
* GITHUB#13627: Fix race condition on flush for DWPT seqNo generation. (Ben Trent, Ao Li)
|
||||
|
||||
* GITHUB#13691: Fix incorrect exponent value in explain of SigmoidFunction. (Owais Kazi)
|
||||
|
||||
Build
|
||||
---------------------
|
||||
|
||||
* GITHUB#13695, GITHUB#13696: Fix Gradle build sometimes gives spurious "unreferenced license file" warnings.
|
||||
(Uwe Schindler)
|
||||
|
||||
Other
|
||||
--------------------
|
||||
(No changes)
|
||||
|
||||
======================== Lucene 9.11.1 =======================
|
||||
|
||||
Bug Fixes
|
||||
---------------------
|
||||
|
||||
* GITHUB#13498: Avoid performance regression by constructing lazily the PointTree in NumericComparator. (Ignacio Vera)
|
||||
|
||||
* GITHUB#13501, GITHUB#13478: Remove intra-merge parallelism for everything except HNSW graph merges. (Ben Trent)
|
||||
|
||||
* GITHUB#13498, GITHUB#13340: Allow adding a parent field to an index with no fields (Michael Sokolov)
|
||||
|
||||
* GITHUB#12431: Fix IndexOutOfBoundsException thrown in DefaultPassageFormatter
|
||||
by unordered matches. (Stephane Campinas)
|
||||
|
||||
* GITHUB#13493: StringValueFacetCounts stops throwing NPE when faceting over an empty match-set. (Grebennikov Roman,
|
||||
Stefan Vodita)
|
||||
|
||||
|
||||
======================== Lucene 9.11.0 =======================
|
||||
|
||||
API Changes
|
||||
|
@ -494,6 +643,10 @@ API Changes
|
|||
|
||||
* GITHUB#12854: Mark DrillSideways#createDrillDownFacetsCollector as @Deprecated. (Greg Miller)
|
||||
|
||||
* GITHUB#12624, GITHUB#12831: Allow FSTCompiler to stream to any DataOutput while building, and
|
||||
make compile() only return the FSTMetadata. For on-heap (default) use case, please use
|
||||
FST.fromFSTReader(fstMetadata, fstCompiler.getFSTReader()) to create the FST. (Anh Dung Bui)
|
||||
|
||||
New Features
|
||||
---------------------
|
||||
* GITHUB#12679: Add support for similarity-based vector searches using [Byte|Float]VectorSimilarityQuery. Uses a new
|
||||
|
@ -501,6 +654,12 @@ New Features
|
|||
better-scoring nodes are available, or the best candidate is below a score of `traversalSimilarity` in the lowest
|
||||
level. (Aditya Prakash, Kaival Parikh)
|
||||
|
||||
* GITHUB#12829: For indices newly created as of 9.10.0 onwards, IndexWriter preserves document blocks indexed via
|
||||
IndexWriter#addDocuments or IndexWriter#updateDocuments also when index sorting is configured. Document blocks are
|
||||
maintained alongside their parent documents during sort and merge. IndexWriterConfig accepts a parent field that is used
|
||||
to maintain block orders if index sorting is used. Note, this is fully optional in Lucene 9.x while will be mandatory for
|
||||
indices that use document blocks together with index sorting as of 10.0.0. (Simon Willnauer)
|
||||
|
||||
* GITHUB#12336: Index additional data per facet label in the taxonomy. (Shai Erera, Egor Potemkin, Mike McCandless,
|
||||
Stefan Vodita)
|
||||
|
||||
|
@ -592,7 +751,6 @@ Build
|
|||
|
||||
Other
|
||||
---------------------
|
||||
|
||||
* GITHUB#11023: Removing some dead code in CheckIndex. (Jakub Slowinski)
|
||||
|
||||
* GITHUB#11023: Removing @lucene.experimental tags in testXXX methods in CheckIndex. (Jakub Slowinski)
|
||||
|
|
|
@@ -1,5 +1,5 @@
 {
   "gradle/generation/jflex/skeleton.default.txt": "58944f66c9113a940dfaf6a17210ec8219024390",
-  "lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.java": "1f7a446f3483326385eef257cea8366c27da0850",
+  "lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.java": "e62dcd8c25219d8f5d783823b228ffe38d2bacde",
   "lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.jflex": "f52109bb7d5701979fde90aeeeda726246a8d5fd"
 }
@@ -1,5 +1,5 @@
 {
   "gradle/generation/jflex/skeleton.default.txt": "58944f66c9113a940dfaf6a17210ec8219024390",
-  "lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java": "ac298e08bc5b96202efca0c01f9f0376fda976bd",
+  "lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java": "2b5df5ff35543a6380c82f298225eb5fa06e4453",
   "lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex": "0b8c7774b98e8237702013e82c352d4711509bd0"
 }
@@ -31,6 +31,7 @@ class ModifyingSuggester {
   private final String misspelled;
   private final WordCase wordCase;
   private final FragmentChecker fragmentChecker;
+  private final boolean proceedPastRep;
   private final char[] tryChars;
   private final Hunspell speller;
 
@@ -39,13 +40,15 @@ class ModifyingSuggester {
       LinkedHashSet<Suggestion> result,
       String misspelled,
       WordCase wordCase,
-      FragmentChecker checker) {
+      FragmentChecker checker,
+      boolean proceedPastRep) {
     this.speller = speller;
     tryChars = speller.dictionary.tryChars.toCharArray();
     this.result = result;
     this.misspelled = misspelled;
     this.wordCase = wordCase;
     fragmentChecker = checker;
+    this.proceedPastRep = proceedPastRep;
   }
 
   /**
@@ -125,9 +128,9 @@ class ModifyingSuggester {
     boolean hasGoodSuggestions = trySuggestion(word.toUpperCase(Locale.ROOT));
 
     GradedSuggestions repResult = tryRep(word);
-    if (repResult == GradedSuggestions.Best) return true;
+    if (repResult == GradedSuggestions.Best && !proceedPastRep) return true;
 
-    hasGoodSuggestions |= repResult == GradedSuggestions.Normal;
+    hasGoodSuggestions |= repResult != GradedSuggestions.None;
 
     if (!speller.dictionary.mapTable.isEmpty()) {
       enumerateMapReplacements(word, "", 0);
@@ -53,16 +53,21 @@ public class Suggester {
   private final Dictionary dictionary;
   private final SuggestibleEntryCache suggestibleCache;
   private final FragmentChecker fragmentChecker;
+  private final boolean proceedPastRep;
 
   public Suggester(Dictionary dictionary) {
-    this(dictionary, null, FragmentChecker.EVERYTHING_POSSIBLE);
+    this(dictionary, null, FragmentChecker.EVERYTHING_POSSIBLE, false);
   }
 
   private Suggester(
-      Dictionary dictionary, SuggestibleEntryCache suggestibleCache, FragmentChecker checker) {
+      Dictionary dictionary,
+      SuggestibleEntryCache suggestibleCache,
+      FragmentChecker checker,
+      boolean proceedPastRep) {
     this.dictionary = dictionary;
     this.suggestibleCache = suggestibleCache;
     this.fragmentChecker = checker;
+    this.proceedPastRep = proceedPastRep;
   }
 
   /**
@@ -71,8 +76,8 @@ public class Suggester {
    * entries are stored as fast-to-iterate plain words instead of highly compressed prefix trees.
    */
   public Suggester withSuggestibleEntryCache() {
-    return new Suggester(
-        dictionary, SuggestibleEntryCache.buildCache(dictionary.words), fragmentChecker);
+    SuggestibleEntryCache cache = SuggestibleEntryCache.buildCache(dictionary.words);
+    return new Suggester(dictionary, cache, fragmentChecker, proceedPastRep);
   }
 
   /**
@@ -80,7 +85,17 @@ public class Suggester {
    * the performance of the "Modification" phase performance.
    */
   public Suggester withFragmentChecker(FragmentChecker checker) {
-    return new Suggester(dictionary, suggestibleCache, checker);
+    return new Suggester(dictionary, suggestibleCache, checker, proceedPastRep);
+  }
+
+  /**
+   * Returns a copy of this suggester instance that doesn't stop after encountering acceptable
+   * words after applying REP rules. By default, Hunspell stops when it finds any, but this
+   * behavior may not always be desirable, e.g., if we have "REP i ea", "tims" would be replaced
+   * only by "teams" and not "times", which could also be meant.
+   */
+  public Suggester proceedPastRep() {
+    return new Suggester(dictionary, suggestibleCache, fragmentChecker, true);
   }
 
   /**
@@ -174,7 +189,8 @@ public class Suggester {
     }
 
     boolean hasGoodSuggestions =
-        new ModifyingSuggester(suggestionSpeller, suggestions, word, wordCase, fragmentChecker)
+        new ModifyingSuggester(
+                suggestionSpeller, suggestions, word, wordCase, fragmentChecker, proceedPastRep)
             .suggest();
 
     if (!hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
@@ -59,6 +59,14 @@ public class TestSpellChecking extends LuceneTestCase {
 
   public void testRepSuggestions() throws Exception {
     doTest("rep");
+
+    //noinspection DataFlowIssue
+    Path aff = Path.of(getClass().getResource("rep.aff").toURI());
+    Dictionary dictionary = TestAllDictionaries.loadDictionary(aff);
+    Suggester suggester = new Suggester(dictionary);
+    assertEquals(List.of("auto's"), suggester.suggestNoTimeout("autos", () -> {}));
+    assertEquals(
+        List.of("auto's", "auto"), suggester.proceedPastRep().suggestNoTimeout("autos", () -> {}));
   }
 
   public void testPhSuggestions() throws Exception {
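Beyond the bundled test, a minimal sketch of the new opt-in from application code; the dictionary is assumed to be loaded already, and the second argument of suggestNoTimeout is the cancellation check used in the test above:

```java
import java.util.List;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.Suggester;

class RepSuggestionsExample {
  // With an .aff file containing "REP i ea", the default Suggester stops at the
  // REP-derived correction; proceedPastRep() keeps collecting other candidates too.
  static List<String> suggest(Dictionary dictionary, String misspelled) {
    Suggester suggester = new Suggester(dictionary).proceedPastRep();
    return suggester.suggestNoTimeout(misspelled, () -> {});
  }
}
```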
@@ -0,0 +1,4 @@
+{
+  "lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/ForUtil.java": "f31797842f047626df6a1a6b97167bec60269fec",
+  "lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/gen_ForUtil.py": "325f2610974b0e76e278b6445405a098a3763feb"
+}
@@ -35,6 +35,7 @@ module org.apache.lucene.backward_codecs {
   exports org.apache.lucene.backward_codecs.lucene92;
   exports org.apache.lucene.backward_codecs.lucene94;
   exports org.apache.lucene.backward_codecs.lucene95;
+  exports org.apache.lucene.backward_codecs.lucene99;
   exports org.apache.lucene.backward_codecs.packed;
   exports org.apache.lucene.backward_codecs.store;
 
@@ -43,7 +44,8 @@ module org.apache.lucene.backward_codecs {
   provides org.apache.lucene.codecs.PostingsFormat with
       org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat,
       org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat,
-      org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat;
+      org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat,
+      org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat;
   provides org.apache.lucene.codecs.KnnVectorsFormat with
       org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat,
       org.apache.lucene.backward_codecs.lucene91.Lucene91HnswVectorsFormat,
@@ -59,5 +61,6 @@ module org.apache.lucene.backward_codecs {
       org.apache.lucene.backward_codecs.lucene91.Lucene91Codec,
       org.apache.lucene.backward_codecs.lucene92.Lucene92Codec,
       org.apache.lucene.backward_codecs.lucene94.Lucene94Codec,
-      org.apache.lucene.backward_codecs.lucene95.Lucene95Codec;
+      org.apache.lucene.backward_codecs.lucene95.Lucene95Codec,
+      org.apache.lucene.backward_codecs.lucene99.Lucene99Codec;
 }
@@ -88,21 +88,17 @@ public final class FieldReader extends Terms {
             (new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong()
                 >>> Lucene40BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
     // Initialize FST always off-heap.
+    final FST.FSTMetadata<BytesRef> fstMetadata;
+    if (metaIn == indexIn) { // Only true before Lucene 8.6
     final IndexInput clone = indexIn.clone();
     clone.seek(indexStartFP);
-    if (metaIn == indexIn) { // Only true before Lucene 8.6
-      index =
-          new FST<>(
-              readMetadata(clone, ByteSequenceOutputs.getSingleton()),
-              clone,
-              new OffHeapFSTStore());
+      fstMetadata = readMetadata(clone, ByteSequenceOutputs.getSingleton());
+      // FST bytes actually only start after the metadata.
+      indexStartFP = clone.getFilePointer();
     } else {
-      index =
-          new FST<>(
-              readMetadata(metaIn, ByteSequenceOutputs.getSingleton()),
-              clone,
-              new OffHeapFSTStore());
+      fstMetadata = readMetadata(metaIn, ByteSequenceOutputs.getSingleton());
     }
+    index = FST.fromFSTReader(fstMetadata, new OffHeapFSTStore(indexIn, indexStartFP, fstMetadata));
     /*
     if (false) {
       final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
@ -14,7 +14,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.store.DataInput;
|
|
@ -16,7 +16,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.store.DataInput;
|
|
@ -14,12 +14,33 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import java.util.Objects;
|
||||
import org.apache.lucene.codecs.*;
|
||||
import org.apache.lucene.codecs.lucene90.*;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.CompoundFormat;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.codecs.FieldInfosFormat;
|
||||
import org.apache.lucene.codecs.FilterCodec;
|
||||
import org.apache.lucene.codecs.KnnVectorsFormat;
|
||||
import org.apache.lucene.codecs.LiveDocsFormat;
|
||||
import org.apache.lucene.codecs.NormsFormat;
|
||||
import org.apache.lucene.codecs.PointsFormat;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.SegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.StoredFieldsFormat;
|
||||
import org.apache.lucene.codecs.TermVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
|
||||
|
@ -98,7 +119,7 @@ public class Lucene99Codec extends Codec {
|
|||
super("Lucene99");
|
||||
this.storedFieldsFormat =
|
||||
new Lucene90StoredFieldsFormat(Objects.requireNonNull(mode).storedMode);
|
||||
this.defaultPostingsFormat = new Lucene99PostingsFormat();
|
||||
this.defaultPostingsFormat = new Lucene912PostingsFormat();
|
||||
this.defaultDVFormat = new Lucene90DocValuesFormat();
|
||||
this.defaultKnnVectorsFormat = new Lucene99HnswVectorsFormat();
|
||||
}
|
|
@ -14,7 +14,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.codecs.BlockTermState;
|
||||
|
@ -24,7 +24,6 @@ import org.apache.lucene.codecs.FieldsProducer;
|
|||
import org.apache.lucene.codecs.MultiLevelSkipListWriter;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.PostingsReaderBase;
|
||||
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader;
|
||||
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
|
@ -339,7 +338,7 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public final class Lucene99PostingsFormat extends PostingsFormat {
|
||||
public class Lucene99PostingsFormat extends PostingsFormat {
|
||||
|
||||
/**
|
||||
* Filename extension for document number, frequencies, and skip data. See chapter: <a
|
||||
|
@ -374,28 +373,9 @@ public final class Lucene99PostingsFormat extends PostingsFormat {
|
|||
static final int VERSION_START = 0;
|
||||
static final int VERSION_CURRENT = VERSION_START;
|
||||
|
||||
private final int minTermBlockSize;
|
||||
private final int maxTermBlockSize;
|
||||
|
||||
/** Creates {@code Lucene99PostingsFormat} with default settings. */
|
||||
public Lucene99PostingsFormat() {
|
||||
this(
|
||||
Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
|
||||
Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates {@code Lucene99PostingsFormat} with custom values for {@code minBlockSize} and {@code
|
||||
* maxBlockSize} passed to block terms dictionary.
|
||||
*
|
||||
* @see
|
||||
* Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)
|
||||
*/
|
||||
public Lucene99PostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
|
||||
super("Lucene99");
|
||||
Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize);
|
||||
this.minTermBlockSize = minTermBlockSize;
|
||||
this.maxTermBlockSize = maxTermBlockSize;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -405,19 +385,7 @@ public final class Lucene99PostingsFormat extends PostingsFormat {
|
|||
|
||||
@Override
|
||||
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
|
||||
PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
|
||||
boolean success = false;
|
||||
try {
|
||||
FieldsConsumer ret =
|
||||
new Lucene90BlockTreeTermsWriter(
|
||||
state, postingsWriter, minTermBlockSize, maxTermBlockSize);
|
||||
success = true;
|
||||
return ret;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(postingsWriter);
|
||||
}
|
||||
}
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
|
@ -14,23 +14,23 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import static org.apache.lucene.codecs.lucene99.ForUtil.BLOCK_SIZE;
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.DOC_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.MAX_SKIP_LEVELS;
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.PAY_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.POS_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.TERMS_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.VERSION_CURRENT;
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.VERSION_START;
|
||||
import static org.apache.lucene.backward_codecs.lucene99.ForUtil.BLOCK_SIZE;
|
||||
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.DOC_CODEC;
|
||||
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.MAX_SKIP_LEVELS;
|
||||
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.PAY_CODEC;
|
||||
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.POS_CODEC;
|
||||
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.TERMS_CODEC;
|
||||
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.VERSION_CURRENT;
|
||||
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.VERSION_START;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
|
||||
import org.apache.lucene.codecs.BlockTermState;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.PostingsReaderBase;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.Impacts;
|
||||
import org.apache.lucene.index.ImpactsEnum;
|
|
@ -14,7 +14,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.AbstractList;
|
|
@ -14,7 +14,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
@ -61,6 +61,7 @@ public class Lucene99SkipReader extends MultiLevelSkipListReader {
|
|||
private long lastDocPointer;
|
||||
private int lastPosBufferUpto;
|
||||
|
||||
/** Sole constructor. */
|
||||
public Lucene99SkipReader(
|
||||
IndexInput skipStream,
|
||||
int maxSkipLevels,
|
||||
|
@ -98,6 +99,7 @@ public class Lucene99SkipReader extends MultiLevelSkipListReader {
|
|||
return df % ForUtil.BLOCK_SIZE == 0 ? df - 1 : df;
|
||||
}
|
||||
|
||||
/** Initialize state. */
|
||||
public void init(
|
||||
long skipPointer, long docBasePointer, long posBasePointer, long payBasePointer, int df)
|
||||
throws IOException {
|
||||
|
@ -125,22 +127,27 @@ public class Lucene99SkipReader extends MultiLevelSkipListReader {
|
|||
return lastDocPointer;
|
||||
}
|
||||
|
||||
/** Returns the pointer in the pos file. */
|
||||
public long getPosPointer() {
|
||||
return lastPosPointer;
|
||||
}
|
||||
|
||||
/** Return the start offset in the position block. */
|
||||
public int getPosBufferUpto() {
|
||||
return lastPosBufferUpto;
|
||||
}
|
||||
|
||||
/** Returns the pointer in the pay file. */
|
||||
public long getPayPointer() {
|
||||
return lastPayPointer;
|
||||
}
|
||||
|
||||
/** Return the number of bytes in the pay block that belongs to docs from the previous block. */
|
||||
public int getPayloadByteUpto() {
|
||||
return lastPayloadByteUpto;
|
||||
}
|
||||
|
||||
/** Return the next skip doc, no skipping can be performed until this doc. */
|
||||
public int getNextSkipDoc() {
|
||||
return skipDoc[0];
|
||||
}
|
||||
|
@ -199,7 +206,7 @@ public class Lucene99SkipReader extends MultiLevelSkipListReader {
|
|||
return delta;
|
||||
}
|
||||
|
||||
// The default impl skips impacts
|
||||
/** Read impacts. The default implementation skips them. */
|
||||
protected void readImpacts(int level, IndexInput skipStream) throws IOException {
|
||||
skipStream.skipBytes(skipStream.readVInt());
|
||||
}
|
|
@ -14,7 +14,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
@ -46,10 +46,10 @@ import org.apache.lucene.store.IndexOutput;
|
|||
* uptos(position, payload). 4. start offset.
|
||||
*/
|
||||
public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
|
||||
private int[] lastSkipDoc;
|
||||
private long[] lastSkipDocPointer;
|
||||
private long[] lastSkipPosPointer;
|
||||
private long[] lastSkipPayPointer;
|
||||
private final int[] lastSkipDoc;
|
||||
private final long[] lastSkipDocPointer;
|
||||
private final long[] lastSkipPosPointer;
|
||||
private final long[] lastSkipPayPointer;
|
||||
|
||||
private final IndexOutput docOut;
|
||||
private final IndexOutput posOut;
|
||||
|
@ -61,11 +61,12 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
|
|||
private long curPayPointer;
|
||||
private int curPosBufferUpto;
|
||||
private int curPayloadByteUpto;
|
||||
private CompetitiveImpactAccumulator[] curCompetitiveFreqNorms;
|
||||
private final CompetitiveImpactAccumulator[] curCompetitiveFreqNorms;
|
||||
private boolean fieldHasPositions;
|
||||
private boolean fieldHasOffsets;
|
||||
private boolean fieldHasPayloads;
|
||||
|
||||
/** Sole constructor. */
|
||||
public Lucene99SkipWriter(
|
||||
int maxSkipLevels,
|
||||
int blockSize,
|
||||
|
@ -84,7 +85,12 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
|
|||
lastSkipPosPointer = new long[maxSkipLevels];
|
||||
if (payOut != null) {
|
||||
lastSkipPayPointer = new long[maxSkipLevels];
|
||||
} else {
|
||||
lastSkipPayPointer = null;
|
||||
}
|
||||
} else {
|
||||
lastSkipPosPointer = null;
|
||||
lastSkipPayPointer = null;
|
||||
}
|
||||
curCompetitiveFreqNorms = new CompetitiveImpactAccumulator[maxSkipLevels];
|
||||
for (int i = 0; i < maxSkipLevels; ++i) {
|
||||
|
@ -92,6 +98,7 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
|
|||
}
|
||||
}
|
||||
|
||||
/** Reset state for the given index options. */
|
||||
public void setField(
|
||||
boolean fieldHasPositions, boolean fieldHasOffsets, boolean fieldHasPayloads) {
|
||||
this.fieldHasPositions = fieldHasPositions;
|
||||
|
@ -211,6 +218,7 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
|
|||
competitiveFreqNorms.clear();
|
||||
}
|
||||
|
||||
/** Write impacts to the given output. */
|
||||
public static void writeImpacts(CompetitiveImpactAccumulator acc, DataOutput out)
|
||||
throws IOException {
|
||||
Collection<Impact> impacts = acc.getCompetitiveFreqNormPairs();
|
|
@ -14,7 +14,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
|
@ -14,7 +14,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.store.IndexInput;
|
|
@ -40,7 +40,7 @@ HEADER = """// This file has been automatically generated, DO NOT EDIT
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.store.DataInput;
|
|
@ -0,0 +1,428 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Lucene 9.9 file format.
|
||||
*
|
||||
* <h2>Apache Lucene - Index File Formats</h2>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <ul>
|
||||
* <li><a href="#Introduction">Introduction</a>
|
||||
* <li><a href="#Definitions">Definitions</a>
|
||||
* <ul>
|
||||
* <li><a href="#Inverted_Indexing">Inverted Indexing</a>
|
||||
* <li><a href="#Types_of_Fields">Types of Fields</a>
|
||||
* <li><a href="#Segments">Segments</a>
|
||||
* <li><a href="#Document_Numbers">Document Numbers</a>
|
||||
* </ul>
|
||||
* <li><a href="#Overview">Index Structure Overview</a>
|
||||
* <li><a href="#File_Naming">File Naming</a>
|
||||
* <li><a href="#file-names">Summary of File Extensions</a>
|
||||
* <ul>
|
||||
* <li><a href="#Lock_File">Lock File</a>
|
||||
* <li><a href="#History">History</a>
|
||||
* <li><a href="#Limitations">Limitations</a>
|
||||
* </ul>
|
||||
* </ul>
|
||||
*
|
||||
* </div> <a id="Introduction"></a>
|
||||
*
|
||||
* <h3>Introduction</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>This document defines the index file formats used in this version of Lucene. If you are using
|
||||
* a different version of Lucene, please consult the copy of <code>docs/</code> that was distributed
|
||||
* with the version you are using.
|
||||
*
|
||||
* <p>This document attempts to provide a high-level definition of the Apache Lucene file formats.
|
||||
* </div> <a id="Definitions"></a>
|
||||
*
|
||||
* <h3>Definitions</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>The fundamental concepts in Lucene are index, document, field and term.
|
||||
*
|
||||
* <p>An index contains a sequence of documents.
|
||||
*
|
||||
* <ul>
|
||||
* <li>A document is a sequence of fields.
|
||||
* <li>A field is a named sequence of terms.
|
||||
* <li>A term is a sequence of bytes.
|
||||
* </ul>
|
||||
*
|
||||
* <p>The same sequence of bytes in two different fields is considered a different term. Thus terms
|
||||
* are represented as a pair: the string naming the field, and the bytes within the field. <a
|
||||
* id="Inverted_Indexing"></a>
|
||||
*
|
||||
* <h4>Inverted Indexing</h4>
|
||||
*
|
||||
* <p>Lucene's index stores terms and statistics about those terms in order to make term-based
|
||||
* search more efficient. Lucene's terms index falls into the family of indexes known as an
|
||||
* <i>inverted index.</i> This is because it can list, for a term, the documents that contain it.
|
||||
* This is the inverse of the natural relationship, in which documents list terms. <a
|
||||
* id="Types_of_Fields"></a>
|
||||
*
|
||||
* <h4>Types of Fields</h4>
*
* <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored in the index
* literally, in a non-inverted manner. Fields that are inverted are called <i>indexed</i>. A field
* may be both stored and indexed.
*
* <p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the text of a field
* may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is
* useful for certain identifier fields to be indexed literally.
*
* <p>See the {@link org.apache.lucene.document.Field Field} java docs for more information on
* Fields. <a id="Segments"></a>
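A minimal sketch of the stored/indexed/tokenized distinction above, using the standard
Document/Field/IndexWriter APIs (field names, analyzer, and index path are illustrative only;
imports are omitted):

    Document doc = new Document();
    // Tokenized and indexed; the original text is also stored for retrieval at search time.
    doc.add(new TextField("body", "a field is a named sequence of terms", Field.Store.YES));
    // Indexed literally as a single term (untokenized), useful for identifiers.
    doc.add(new StringField("id", "doc-42", Field.Store.YES));
    try (Directory dir = FSDirectory.open(Path.of("index"));
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
      writer.addDocument(doc);
    }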
*
|
||||
* <h4>Segments</h4>
|
||||
*
|
||||
* <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>. Each segment is a
|
||||
* fully independent index, which could be searched separately. Indexes evolve by:
|
||||
*
|
||||
* <ol>
|
||||
* <li>Creating new segments for newly added documents.
|
||||
* <li>Merging existing segments.
|
||||
* </ol>
|
||||
*
|
||||
* <p>Searches may involve multiple segments and/or multiple indexes, each index potentially
|
||||
* composed of a set of segments. <a id="Document_Numbers"></a>
|
||||
*
|
||||
* <h4>Document Numbers</h4>
*
* <p>Internally, Lucene refers to documents by an integer <i>document number</i>. The first
* document added to an index is numbered zero, and each subsequent document added gets a number one
* greater than the previous.
*
* <p>Note that a document's number may change, so caution should be taken when storing these
* numbers outside of Lucene. In particular, numbers may change in the following situations:
*
* <ul>
*   <li>
*       <p>The numbers stored in each segment are unique only within the segment, and must be
*       converted before they can be used in a larger context. The standard technique is to
*       allocate each segment a range of values, based on the range of numbers used in that
*       segment. To convert a document number from a segment to an external value, the segment's
*       <i>base</i> document number is added. To convert an external value back to a
*       segment-specific value, the segment is identified by the range that the external value is
*       in, and the segment's base value is subtracted. For example two five document segments
*       might be combined, so that the first segment has a base value of zero, and the second of
*       five. Document three from the second segment would have an external value of eight.
*   <li>
*       <p>When documents are deleted, gaps are created in the numbering. These are eventually
*       removed as the index evolves through merging. Deleted documents are dropped when segments
*       are merged. A freshly-merged segment thus has no gaps in its numbering.
* </ul>
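The base-offset arithmetic in the first bullet can be sketched as follows (a simplified model of
the example above; in real code the per-leaf docBase on LeafReaderContext plays the role of these
bases):

    // Two five-document segments combined: bases 0 and 5.
    int[] segmentBases = {0, 5};

    // Segment-local doc number -> index-wide ("external") doc number.
    int external = segmentBases[1] + 3; // doc 3 of the second segment -> 8

    // Index-wide doc number -> (segment, segment-local doc number).
    int segment = 0;
    while (segment + 1 < segmentBases.length && external >= segmentBases[segment + 1]) {
      segment++;
    }
    int local = external - segmentBases[segment]; // -> segment 1, local doc 3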
*
|
||||
* </div> <a id="Overview"></a>
|
||||
*
|
||||
* <h3>Index Structure Overview</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>Each segment index maintains the following:
|
||||
*
|
||||
* <ul>
|
||||
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment info}. This
|
||||
* contains metadata about a segment, such as the number of documents, what files it uses, and
|
||||
* information about how the segment is sorted
|
||||
* <li>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This
|
||||
* contains metadata about the set of named fields used in the index.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}.
|
||||
* This contains, for each document, a list of attribute-value pairs, where the attributes are
|
||||
* field names. These are used to store auxiliary information about the document, such as its
|
||||
* title, url, or an identifier to access a database. The set of stored fields are what is
|
||||
* returned for each hit when searching. This is keyed by document number.
|
||||
* <li>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term dictionary}.
|
||||
* A dictionary containing all of the terms used in all of the indexed fields of all of the
|
||||
* documents. The dictionary also contains the number of documents which contain the term, and
|
||||
* pointers to the term's frequency and proximity data.
|
||||
* <li>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term Frequency
|
||||
* data}. For each term in the dictionary, the numbers of all the documents that contain that
|
||||
* term, and the frequency of the term in that document, unless frequencies are omitted
|
||||
* ({@link org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
|
||||
* <li>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term Proximity
|
||||
* data}. For each term in the dictionary, the positions that the term occurs in each
|
||||
* document. Note that this will not exist if all fields in all documents omit position data.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For
|
||||
* each field in each document, a value is stored that is multiplied into the score for hits
|
||||
* on that field.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each
|
||||
* field in each document, the term vector (sometimes called document vector) may be stored. A
|
||||
* term vector consists of term text and term frequency. To add Term Vectors to your index see
|
||||
* the {@link org.apache.lucene.document.Field Field} constructors
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like
|
||||
* stored values, these are also keyed by document number, but are generally intended to be
|
||||
* loaded into main memory for fast access. Whereas stored values are generally intended for
|
||||
* summary results from searches, per-document values are useful for things like scoring
|
||||
* factors.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An
|
||||
* optional file indicating which documents are live.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair
|
||||
* of files, recording dimensionally indexed fields, to enable fast numeric range filtering
|
||||
* and large numeric values like BigInteger and BigDecimal (1D) and geographic shape
|
||||
* intersection (2D, 3D).
|
||||
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}. The
|
||||
* vector format stores numeric vectors in a format optimized for random access and
|
||||
* computation, supporting high-dimensional nearest-neighbor search.
|
||||
* </ul>
|
||||
*
|
||||
* <p>Details on each of these are provided in their linked pages. </div> <a id="File_Naming"></a>
|
||||
*
|
||||
* <h3>File Naming</h3>
*
* <div>
*
* <p>All files belonging to a segment have the same name with varying extensions. The extensions
* correspond to the different file formats described below. When using the Compound File format
* (default for small segments) these files (except for the Segment info file, the Lock file, and
* Deleted documents file) are collapsed into a single .cfs file (see below for details)
*
* <p>Typically, all segments in an index are stored in a single directory, although this is not
* required.
*
* <p>File names are never re-used. That is, when any file is saved to the Directory it is given a
* never before used filename. This is achieved using a simple generations approach. For example,
* the first segments file is segments_1, then segments_2, etc. The generation is a sequential long
* integer represented in alpha-numeric (base 36) form. </div> <a id="file-names"></a>
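The base-36 generation naming can be sketched as follows (Lucene has its own helper for this in
IndexFileNames; the snippet is only illustrative):

    // Generation 1 -> "segments_1", 35 -> "segments_z", 36 -> "segments_10".
    long generation = 36;
    String segmentsFileName = "segments_" + Long.toString(generation, Character.MAX_RADIX);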
*
|
||||
* <h3>Summary of File Extensions</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>The following table summarizes the names and extensions of the files in Lucene:
|
||||
*
|
||||
* <table class="padding4" style="border-spacing: 1px; border-collapse: separate">
|
||||
* <caption>lucene filenames by extension</caption>
|
||||
* <tr>
|
||||
* <th>Name</th>
|
||||
* <th>Extension</th>
|
||||
* <th>Brief Description</th>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
|
||||
* <td>segments_N</td>
|
||||
* <td>Stores information about a commit point</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td><a href="#Lock_File">Lock File</a></td>
|
||||
* <td>write.lock</td>
|
||||
* <td>The Write lock prevents multiple IndexWriters from writing to the same
|
||||
* file.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment Info}</td>
|
||||
* <td>.si</td>
|
||||
* <td>Stores metadata about a segment</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File}</td>
|
||||
* <td>.cfs, .cfe</td>
|
||||
* <td>An optional "virtual" file consisting of all the other index files for
|
||||
* systems that frequently run out of file handles.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields}</td>
|
||||
* <td>.fnm</td>
|
||||
* <td>Stores information about the fields</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index}</td>
|
||||
* <td>.fdx</td>
|
||||
* <td>Contains pointers to field data</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data}</td>
|
||||
* <td>.fdt</td>
|
||||
* <td>The stored fields for documents</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term Dictionary}</td>
|
||||
* <td>.tim</td>
|
||||
* <td>The term dictionary, stores term info</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term Index}</td>
|
||||
* <td>.tip</td>
|
||||
* <td>The index into the Term Dictionary</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Frequencies}</td>
|
||||
* <td>.doc</td>
|
||||
* <td>Contains the list of docs which contain each term along with frequency</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Positions}</td>
|
||||
* <td>.pos</td>
|
||||
* <td>Stores position information about where a term occurs in the index</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Payloads}</td>
|
||||
* <td>.pay</td>
|
||||
* <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms}</td>
|
||||
* <td>.nvd, .nvm</td>
|
||||
* <td>Encodes length and boost factors for docs and fields</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values}</td>
|
||||
* <td>.dvd, .dvm</td>
|
||||
* <td>Encodes additional scoring factors or other per-document information.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}</td>
|
||||
* <td>.tvx</td>
|
||||
* <td>Stores offset into the document data file</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}</td>
|
||||
* <td>.tvd</td>
|
||||
* <td>Contains term vector data.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents}</td>
|
||||
* <td>.liv</td>
|
||||
* <td>Info about what documents are live</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}</td>
|
||||
* <td>.dii, .dim</td>
|
||||
* <td>Holds indexed points</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}</td>
|
||||
* <td>.vec, .vem, .veq, vex</td>
|
||||
* <td>Holds indexed vectors; <code>.vec</code> files contain the raw vector data,
|
||||
* <code>.vem</code> the vector metadata, <code>.veq</code> the quantized vector data, and <code>.vex</code> the
|
||||
* hnsw graph data.</td>
|
||||
* </tr>
|
||||
* </table>
|
||||
*
|
||||
* </div> <a id="Lock_File"></a>
|
||||
*
|
||||
* <h3>Lock File</h3>
|
||||
*
|
||||
* The write lock, which is stored in the index directory by default, is named "write.lock". If the
|
||||
* lock directory is different from the index directory then the write lock will be named
|
||||
* "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index
|
||||
* directory. When this file is present, a writer is currently modifying the index (adding or
|
||||
* removing documents). This lock file ensures that only one writer is modifying the index at a
|
||||
* time. <a id="History"></a>
|
||||
*
|
||||
* <h3>History</h3>
|
||||
*
|
||||
* <p>Compatibility notes are provided in this document, describing how file formats have changed
|
||||
* from prior versions:
|
||||
*
|
||||
* <ul>
|
||||
* <li>In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit
|
||||
* lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching
|
||||
* or adding/deleting of docs. When the new segments file is saved (committed), it will be
|
||||
* written in the new file format (meaning no specific "upgrade" process is needed). But note
|
||||
* that once a commit has occurred, pre-2.1 Lucene will not be able to read the index.
|
||||
* <li>In version 2.3, the file format was changed to allow segments to share a single set of doc
|
||||
* store (vectors & stored fields) files. This allows for faster indexing in certain
|
||||
* cases. The change is fully backwards compatible (in the same way as the lock-less commits
|
||||
* change in 2.1).
|
||||
* <li>In version 2.4, Strings are now written as true UTF-8 byte sequence, not Java's modified
|
||||
* UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">LUCENE-510</a> for
|
||||
* details.
|
||||
* <li>In version 2.9, an optional opaque Map<String,String> CommitUserData may be passed to
|
||||
* IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N
|
||||
* file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">LUCENE-1382</a> for
|
||||
* details. Also, diagnostics were added to each segment written recording details about why
|
||||
* it was written (due to flush, merge; which OS/JRE was used; etc.). See issue <a
|
||||
* href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.
|
||||
* <li>In version 3.0, compressed fields are no longer written to the index (they can still be
|
||||
* read, but on merge the new segment will write them, uncompressed). See issue <a
|
||||
* href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a> for details.
|
||||
* <li>In version 3.1, segments records the code version that created them. See <a
|
||||
* href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
|
||||
* Additionally segments track explicitly whether or not they have term vectors. See <a
|
||||
* href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a> for details.
|
||||
* <li>In version 3.2, numeric fields are written as natively to stored fields file, previously
|
||||
* they were stored in text format only.
|
||||
* <li>In version 3.4, fields can omit position data while still indexing term frequencies.
|
||||
* <li>In version 4.0, the format of the inverted index became extensible via the {@link
|
||||
* org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues})
|
||||
* was introduced. Normalization factors need no longer be a single byte, they can be any
|
||||
* {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be
|
||||
* unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into
|
||||
* the postings lists. Payloads can be stored in the term vectors.
|
||||
* <li>In version 4.1, the format of the postings list changed to use either of FOR compression or
|
||||
* variable-byte encoding, depending upon the frequency of the term. Terms appearing only once
|
||||
* were changed to inline directly into the term dictionary. Stored fields are compressed by
|
||||
* default.
|
||||
* <li>In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued
|
||||
* type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields.
|
||||
* <li>In version 4.5, DocValues were extended to explicitly represent missing values.
|
||||
* <li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
|
||||
* allow updating NumericDocValues fields.
|
||||
* <li>In version 4.8, checksum footers were added to the end of each index file for improved data
|
||||
* integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32
|
||||
* checksum of the file.
|
||||
* <li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is
|
||||
* suitable for faceting/sorting/analytics.
|
||||
* <li>In version 5.4, DocValues have been improved to store more information on disk: addresses
|
||||
* for binary fields and ord indexes for multi-valued fields.
|
||||
* <li>In version 6.0, Points were added, for multi-dimensional range/distance search.
|
||||
* <li>In version 6.2, new Segment info format that reads/writes the index sort, to support index
|
||||
* sorting.
|
||||
* <li>In version 7.0, DocValues have been improved to better support sparse doc values thanks to
|
||||
* an iterator API.
|
||||
* <li>In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term
|
||||
* freq, normalization factor) pairs that may trigger the maximum score of the block. This
|
||||
* information is recorded alongside skip data in order to be able to skip blocks of doc ids
|
||||
* if they may not produce high enough scores. Additionally doc values and norms has been
|
||||
* extended with jump-tables to make access O(1) instead of O(n), where n is the number of
|
||||
* elements to skip when advancing in the data.
|
||||
* <li>In version 8.4, postings, positions, offsets and payload lengths have move to a more
|
||||
* performant encoding that is vectorized.
|
||||
* <li>In version 8.6, index sort serialization is delegated to the sorts themselves, to allow
|
||||
* user-defined sorts to be used
|
||||
* <li>In version 8.7, stored fields compression became adaptive to better handle documents with
|
||||
* smaller stored fields.
|
||||
* <li>In version 9.0, vector-valued fields were added.
|
||||
* <li>In version 9.1, vector-valued fields were modified to add a graph hierarchy.
|
||||
* <li>In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by
|
||||
* IndexDISI. ordToDoc mappings was added to .vem.
|
||||
* <li>In version 9.5, HNSW graph connections were changed to be delta-encoded with vints.
|
||||
* Additionally, metadata file size improvements were made by delta-encoding nodes by graph
|
||||
* layer and not writing the node ids for the zeroth layer.
|
||||
* <li>In version 9.9, Vector scalar quantization support was added. Allowing the HNSW vector
|
||||
* format to utilize int8 quantized vectors for float32 vector search.
|
||||
* </ul>
|
||||
*
|
||||
* <a id="Limitations"></a>
|
||||
*
|
||||
* <h3>Limitations</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>Lucene uses a Java <code>int</code> to refer to document numbers, and the index file format
|
||||
* uses an <code>Int32</code> on-disk to store document numbers. This is a limitation of both the
|
||||
* index file format and the current implementation. Eventually these should be replaced with either
|
||||
* <code>UInt64</code> values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
|
||||
* VInt} values which have no limit. </div>
|
||||
*/
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
|
@ -22,3 +22,4 @@ org.apache.lucene.backward_codecs.lucene91.Lucene91Codec
org.apache.lucene.backward_codecs.lucene92.Lucene92Codec
org.apache.lucene.backward_codecs.lucene94.Lucene94Codec
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec
org.apache.lucene.backward_codecs.lucene99.Lucene99Codec

@ -16,3 +16,4 @@
org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat
org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat
org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat
org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat

File diff suppressed because it is too large
@ -17,7 +17,7 @@
|
|||
package org.apache.lucene.backward_codecs.lucene50;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.backward_codecs.lucene40.blocktree.Lucene40BlockTreeTermsWriter;
|
||||
import org.apache.lucene.backward_codecs.lucene40.blocktree.Lucene40BlockTreeTermsWriterV5;
|
||||
import org.apache.lucene.codecs.FieldsConsumer;
|
||||
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
|
@ -31,11 +31,11 @@ public class Lucene50RWPostingsFormat extends Lucene50PostingsFormat {
|
|||
boolean success = false;
|
||||
try {
|
||||
FieldsConsumer ret =
|
||||
new Lucene40BlockTreeTermsWriter(
|
||||
new Lucene40BlockTreeTermsWriterV5(
|
||||
state,
|
||||
postingsWriter,
|
||||
Lucene40BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
|
||||
Lucene40BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
|
||||
Lucene40BlockTreeTermsWriterV5.DEFAULT_MIN_BLOCK_SIZE,
|
||||
Lucene40BlockTreeTermsWriterV5.DEFAULT_MAX_BLOCK_SIZE);
|
||||
success = true;
|
||||
return ret;
|
||||
} finally {
|
||||
|
|
|
@ -642,13 +642,13 @@ public class BKDWriter60 implements Closeable {
|
|||
throws IOException {
|
||||
assert docMaps == null || readers.size() == docMaps.size();
|
||||
|
||||
BKDMergeQueue queue = new BKDMergeQueue(config.bytesPerDim, readers.size());
|
||||
BKDMergeQueue queue = new BKDMergeQueue(config.bytesPerDim(), readers.size());
|
||||
|
||||
for (int i = 0; i < readers.size(); i++) {
|
||||
PointValues pointValues = readers.get(i);
|
||||
assert pointValues.getNumDimensions() == config.numDims
|
||||
&& pointValues.getBytesPerDimension() == config.bytesPerDim
|
||||
&& pointValues.getNumIndexDimensions() == config.numIndexDims;
|
||||
assert pointValues.getNumDimensions() == config.numDims()
|
||||
&& pointValues.getBytesPerDimension() == config.bytesPerDim()
|
||||
&& pointValues.getNumIndexDimensions() == config.numIndexDims();
|
||||
MergeState.DocMap docMap;
|
||||
if (docMaps == null) {
|
||||
docMap = null;
|
||||
|
|
|
@ -23,12 +23,11 @@ import java.util.Arrays;
|
|||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.backward_codecs.lucene90.Lucene90ScoreSkipReader.MutableImpactList;
|
||||
import org.apache.lucene.backward_codecs.lucene99.Lucene99SkipWriter;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
|
||||
import org.apache.lucene.codecs.lucene90.blocktree.FieldReader;
|
||||
import org.apache.lucene.codecs.lucene90.blocktree.Stats;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99SkipWriter;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
|
@ -77,22 +76,6 @@ public class TestLucene90PostingsFormat extends BasePostingsFormatTestCase {
|
|||
d.close();
|
||||
}
|
||||
|
||||
private void shouldFail(int minItemsInBlock, int maxItemsInBlock) {
|
||||
expectThrows(
|
||||
IllegalArgumentException.class,
|
||||
() -> {
|
||||
new Lucene99PostingsFormat(minItemsInBlock, maxItemsInBlock);
|
||||
});
|
||||
}
|
||||
|
||||
public void testInvalidBlockSizes() throws Exception {
|
||||
shouldFail(0, 0);
|
||||
shouldFail(10, 8);
|
||||
shouldFail(-1, 10);
|
||||
shouldFail(10, -1);
|
||||
shouldFail(10, 12);
|
||||
}
|
||||
|
||||
public void testImpactSerialization() throws IOException {
|
||||
// omit norms and omit freqs
|
||||
doTestImpactSerialization(Collections.singletonList(new Impact(1, 1L)));
|
||||
|
|
|
@ -388,10 +388,14 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
|
|||
// write the vector data to a temporary file
|
||||
DocsWithFieldSet docsWithField =
|
||||
switch (fieldInfo.getVectorEncoding()) {
|
||||
case BYTE -> writeByteVectorData(
|
||||
tempVectorData, MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
|
||||
case FLOAT32 -> writeVectorData(
|
||||
tempVectorData, MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
|
||||
case BYTE ->
|
||||
writeByteVectorData(
|
||||
tempVectorData,
|
||||
MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
|
||||
case FLOAT32 ->
|
||||
writeVectorData(
|
||||
tempVectorData,
|
||||
MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
|
||||
};
|
||||
CodecUtil.writeFooter(tempVectorData);
|
||||
IOUtils.close(tempVectorData);
|
||||
|
@ -638,13 +642,15 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
|
|||
throws IOException {
|
||||
int dim = fieldInfo.getVectorDimension();
|
||||
return switch (fieldInfo.getVectorEncoding()) {
|
||||
case BYTE -> new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
|
||||
case BYTE ->
|
||||
new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
|
||||
@Override
|
||||
public byte[] copyValue(byte[] value) {
|
||||
return ArrayUtil.copyOfSubArray(value, 0, dim);
|
||||
}
|
||||
};
|
||||
case FLOAT32 -> new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
|
||||
case FLOAT32 ->
|
||||
new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
|
||||
@Override
|
||||
public float[] copyValue(float[] value) {
|
||||
return ArrayUtil.copyOfSubArray(value, 0, dim);
|
||||
|
@ -663,10 +669,12 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
|
|||
DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer();
|
||||
RandomVectorScorerSupplier scorerSupplier =
|
||||
switch (fieldInfo.getVectorEncoding()) {
|
||||
case BYTE -> defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
||||
case BYTE ->
|
||||
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
||||
fieldInfo.getVectorSimilarityFunction(),
|
||||
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
|
||||
case FLOAT32 -> defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
||||
case FLOAT32 ->
|
||||
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
||||
fieldInfo.getVectorSimilarityFunction(),
|
||||
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
|
||||
};
|
||||
|
@ -693,9 +701,9 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
|
|||
lastDocID = docID;
|
||||
}
|
||||
|
||||
OnHeapHnswGraph getGraph() {
|
||||
OnHeapHnswGraph getGraph() throws IOException {
|
||||
if (vectors.size() > 0) {
|
||||
return hnswGraphBuilder.getGraph();
|
||||
return hnswGraphBuilder.getCompletedGraph();
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
|
|
|
@ -414,10 +414,14 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
|
|||
// write the vector data to a temporary file
|
||||
DocsWithFieldSet docsWithField =
|
||||
switch (fieldInfo.getVectorEncoding()) {
|
||||
case BYTE -> writeByteVectorData(
|
||||
tempVectorData, MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
|
||||
case FLOAT32 -> writeVectorData(
|
||||
tempVectorData, MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
|
||||
case BYTE ->
|
||||
writeByteVectorData(
|
||||
tempVectorData,
|
||||
MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
|
||||
case FLOAT32 ->
|
||||
writeVectorData(
|
||||
tempVectorData,
|
||||
MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
|
||||
};
|
||||
CodecUtil.writeFooter(tempVectorData);
|
||||
IOUtils.close(tempVectorData);
|
||||
|
@ -477,9 +481,11 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
|
|||
}
|
||||
DocIdSetIterator mergedVectorIterator = null;
|
||||
switch (fieldInfo.getVectorEncoding()) {
|
||||
case BYTE -> mergedVectorIterator =
|
||||
case BYTE ->
|
||||
mergedVectorIterator =
|
||||
KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState);
|
||||
case FLOAT32 -> mergedVectorIterator =
|
||||
case FLOAT32 ->
|
||||
mergedVectorIterator =
|
||||
KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState);
|
||||
}
|
||||
graph =
|
||||
|
@ -680,13 +686,15 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
|
|||
throws IOException {
|
||||
int dim = fieldInfo.getVectorDimension();
|
||||
return switch (fieldInfo.getVectorEncoding()) {
|
||||
case BYTE -> new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
|
||||
case BYTE ->
|
||||
new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
|
||||
@Override
|
||||
public byte[] copyValue(byte[] value) {
|
||||
return ArrayUtil.copyOfSubArray(value, 0, dim);
|
||||
}
|
||||
};
|
||||
case FLOAT32 -> new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
|
||||
case FLOAT32 ->
|
||||
new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
|
||||
@Override
|
||||
public float[] copyValue(float[] value) {
|
||||
return ArrayUtil.copyOfSubArray(value, 0, dim);
|
||||
|
@ -704,10 +712,12 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
|
|||
vectors = new ArrayList<>();
|
||||
RandomVectorScorerSupplier scorerSupplier =
|
||||
switch (fieldInfo.getVectorEncoding()) {
|
||||
case BYTE -> defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
||||
case BYTE ->
|
||||
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
||||
fieldInfo.getVectorSimilarityFunction(),
|
||||
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
|
||||
case FLOAT32 -> defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
||||
case FLOAT32 ->
|
||||
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
||||
fieldInfo.getVectorSimilarityFunction(),
|
||||
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
|
||||
};
|
||||
|
@ -732,9 +742,9 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
|
|||
lastDocID = docID;
|
||||
}
|
||||
|
||||
OnHeapHnswGraph getGraph() {
|
||||
OnHeapHnswGraph getGraph() throws IOException {
|
||||
if (vectors.size() > 0) {
|
||||
return hnswGraphBuilder.getGraph();
|
||||
return hnswGraphBuilder.getCompletedGraph();
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
|
|
|
@ -14,22 +14,22 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import static org.apache.lucene.codecs.lucene99.ForUtil.BLOCK_SIZE;
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.DOC_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.MAX_SKIP_LEVELS;
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.PAY_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.POS_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.TERMS_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.VERSION_CURRENT;
|
||||
import static org.apache.lucene.backward_codecs.lucene99.ForUtil.BLOCK_SIZE;
|
||||
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.DOC_CODEC;
|
||||
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.MAX_SKIP_LEVELS;
|
||||
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.PAY_CODEC;
|
||||
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.POS_CODEC;
|
||||
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.TERMS_CODEC;
|
||||
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.VERSION_CURRENT;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
|
||||
import org.apache.lucene.codecs.BlockTermState;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
|
||||
import org.apache.lucene.codecs.PushPostingsWriterBase;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
|
@ -0,0 +1,68 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.codecs.FieldsConsumer;
|
||||
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
public class Lucene99RWPostingsFormat extends Lucene99PostingsFormat {
|
||||
|
||||
private final int minTermBlockSize;
|
||||
private final int maxTermBlockSize;
|
||||
|
||||
/** Creates {@code Lucene99PostingsFormat} with default settings. */
|
||||
public Lucene99RWPostingsFormat() {
|
||||
this(
|
||||
Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
|
||||
Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates {@code Lucene99PostingsFormat} with custom values for {@code minBlockSize} and {@code
|
||||
* maxBlockSize} passed to block terms dictionary.
|
||||
*
|
||||
* @see
|
||||
* Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)
|
||||
*/
|
||||
public Lucene99RWPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
|
||||
super();
|
||||
Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize);
|
||||
this.minTermBlockSize = minTermBlockSize;
|
||||
this.maxTermBlockSize = maxTermBlockSize;
|
||||
}
|
||||
|
||||
@Override
|
||||
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
|
||||
PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
|
||||
boolean success = false;
|
||||
try {
|
||||
FieldsConsumer ret =
|
||||
new Lucene90BlockTreeTermsWriter(
|
||||
state, postingsWriter, minTermBlockSize, maxTermBlockSize);
|
||||
success = true;
|
||||
return ret;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(postingsWriter);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -14,7 +14,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
|
||||
import java.io.IOException;
|
|
@ -14,7 +14,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
|
||||
import java.io.IOException;
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.backward_codecs.lucene99;
|
|||
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.KnnVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
|
||||
import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase;
|
||||
|
||||
public class TestLucene99HnswScalarQuantizedVectorsFormat extends BaseKnnVectorsFormatTestCase {
|
||||
|
|
|
@ -14,22 +14,26 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99ScoreSkipReader.readImpacts;
|
||||
import static org.apache.lucene.backward_codecs.lucene99.Lucene99ScoreSkipReader.readImpacts;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.backward_codecs.lucene99.Lucene99ScoreSkipReader.MutableImpactList;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
|
||||
import org.apache.lucene.codecs.lucene90.blocktree.FieldReader;
|
||||
import org.apache.lucene.codecs.lucene90.blocktree.Stats;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99ScoreSkipReader.MutableImpactList;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.Impact;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
|
@ -41,7 +45,7 @@ import org.apache.lucene.tests.util.TestUtil;
|
|||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
public class TestLucene99PostingsFormat extends BasePostingsFormatTestCase {
|
||||
private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene99PostingsFormat());
|
||||
private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene99RWPostingsFormat());
|
||||
|
||||
@Override
|
||||
protected Codec getCodec() {
|
||||
|
@ -77,7 +81,7 @@ public class TestLucene99PostingsFormat extends BasePostingsFormatTestCase {
|
|||
expectThrows(
|
||||
IllegalArgumentException.class,
|
||||
() -> {
|
||||
new Lucene99PostingsFormat(minItemsInBlock, maxItemsInBlock);
|
||||
new Lucene99RWPostingsFormat(minItemsInBlock, maxItemsInBlock);
|
||||
});
|
||||
}
|
||||
|
|
@ -14,7 +14,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
|
||||
import java.io.IOException;
|
|
@ -0,0 +1,49 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||
|
||||
public class TestPostingsUtil extends LuceneTestCase {
|
||||
|
||||
// checks for bug described in https://github.com/apache/lucene/issues/13373
|
||||
public void testIntegerOverflow() throws IOException {
|
||||
final int size = random().nextInt(1, ForUtil.BLOCK_SIZE);
|
||||
final long[] docDeltaBuffer = new long[size];
|
||||
final long[] freqBuffer = new long[size];
|
||||
|
||||
final int delta = 1 << 30;
|
||||
docDeltaBuffer[0] = delta;
|
||||
try (Directory dir = newDirectory()) {
|
||||
try (IndexOutput out = dir.createOutput("test", IOContext.DEFAULT)) {
|
||||
// In old implementation, this would cause integer overflow exception.
|
||||
PostingsUtil.writeVIntBlock(out, docDeltaBuffer, freqBuffer, size, true);
|
||||
}
|
||||
long[] restoredDocs = new long[size];
|
||||
long[] restoredFreqs = new long[size];
|
||||
try (IndexInput in = dir.openInput("test", IOContext.DEFAULT)) {
|
||||
PostingsUtil.readVIntBlock(in, restoredDocs, restoredFreqs, size, true, true);
|
||||
}
|
||||
assertEquals(delta, restoredDocs[0]);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -196,6 +196,7 @@ public class TestAncientIndicesCompatibility extends LuceneTestCase {
    ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
    CheckIndex checker = new CheckIndex(dir);
    checker.setInfoStream(new PrintStream(bos, false, UTF_8));
    checker.setLevel(CheckIndex.Level.MIN_LEVEL_FOR_INTEGRITY_CHECKS);
    CheckIndex.Status indexStatus = checker.checkIndex();
    if (version.startsWith("8.")) {
      assertTrue(indexStatus.clean);

|
@ -20,9 +20,9 @@ import static org.apache.lucene.backward_index.TestBasicBackwardsCompatibility.a
|
|||
|
||||
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.backward_codecs.lucene99.Lucene99Codec;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.KnnVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
|
||||
import org.apache.lucene.document.Document;
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -40,3 +40,4 @@
9.9.2
9.10.0
9.11.0
9.11.1

|
|
|
@ -0,0 +1,376 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.benchmark.jmh;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.openjdk.jmh.annotations.Benchmark;
|
||||
import org.openjdk.jmh.annotations.BenchmarkMode;
|
||||
import org.openjdk.jmh.annotations.CompilerControl;
|
||||
import org.openjdk.jmh.annotations.Fork;
|
||||
import org.openjdk.jmh.annotations.Level;
|
||||
import org.openjdk.jmh.annotations.Measurement;
|
||||
import org.openjdk.jmh.annotations.Mode;
|
||||
import org.openjdk.jmh.annotations.OutputTimeUnit;
|
||||
import org.openjdk.jmh.annotations.Scope;
|
||||
import org.openjdk.jmh.annotations.Setup;
|
||||
import org.openjdk.jmh.annotations.State;
|
||||
import org.openjdk.jmh.annotations.Warmup;
|
||||
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
@OutputTimeUnit(TimeUnit.MILLISECONDS)
|
||||
@State(Scope.Benchmark)
|
||||
@Warmup(iterations = 5, time = 1)
|
||||
@Measurement(iterations = 5, time = 1)
|
||||
@Fork(
|
||||
value = 1,
|
||||
jvmArgsAppend = {"-Xmx1g", "-Xms1g", "-XX:+AlwaysPreTouch"})
|
||||
public class AdvanceBenchmark {
|
||||
|
||||
private final long[] values = new long[129];
|
||||
private final int[] startIndexes = new int[1_000];
|
||||
private final long[] targets = new long[startIndexes.length];
|
||||
|
||||
@Setup(Level.Trial)
|
||||
public void setup() throws Exception {
|
||||
for (int i = 0; i < 128; ++i) {
|
||||
values[i] = i;
|
||||
}
|
||||
values[128] = DocIdSetIterator.NO_MORE_DOCS;
|
||||
Random r = new Random(0);
|
||||
for (int i = 0; i < startIndexes.length; ++i) {
|
||||
startIndexes[i] = r.nextInt(64);
|
||||
targets[i] = startIndexes[i] + 1 + r.nextInt(1 << r.nextInt(7));
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void binarySearch() {
|
||||
for (int i = 0; i < startIndexes.length; ++i) {
|
||||
binarySearch(values, targets[i], startIndexes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
|
||||
private static int binarySearch(long[] values, long target, int startIndex) {
|
||||
// Standard binary search
|
||||
int i = Arrays.binarySearch(values, startIndex, values.length, target);
|
||||
if (i < 0) {
|
||||
i = -1 - i;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void binarySearch2() {
|
||||
for (int i = 0; i < startIndexes.length; ++i) {
|
||||
binarySearch2(values, targets[i], startIndexes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
|
||||
private static int binarySearch2(long[] values, long target, int startIndex) {
|
||||
// Try to help the compiler by providing predictable start/end offsets.
|
||||
int i = Arrays.binarySearch(values, 0, 128, target);
|
||||
if (i < 0) {
|
||||
i = -1 - i;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void binarySearch3() {
|
||||
for (int i = 0; i < startIndexes.length; ++i) {
|
||||
binarySearch3(values, targets[i], startIndexes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
|
||||
private static int binarySearch3(long[] values, long target, int startIndex) {
|
||||
// Organize code the same way as suggested in https://quickwit.io/blog/search-a-sorted-block,
|
||||
// which proved to help with LLVM.
|
||||
int start = 0;
|
||||
int length = 128;
|
||||
|
||||
while (length > 1) {
|
||||
length /= 2;
|
||||
if (values[start + length - 1] < target) {
|
||||
start += length;
|
||||
}
|
||||
}
|
||||
return start;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void binarySearch4() {
|
||||
for (int i = 0; i < startIndexes.length; ++i) {
|
||||
binarySearch4(values, targets[i], startIndexes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
|
||||
private static int binarySearch4(long[] values, long target, int startIndex) {
|
||||
// Explicitly inline the binary-search logic to see if it helps the compiler.
|
||||
int start = 0;
|
||||
|
||||
if (values[63] < target) {
|
||||
start += 64;
|
||||
}
|
||||
if (values[start + 31] < target) {
|
||||
start += 32;
|
||||
}
|
||||
if (values[start + 15] < target) {
|
||||
start += 16;
|
||||
}
|
||||
if (values[start + 7] < target) {
|
||||
start += 8;
|
||||
}
|
||||
if (values[start + 3] < target) {
|
||||
start += 4;
|
||||
}
|
||||
if (values[start + 1] < target) {
|
||||
start += 2;
|
||||
}
|
||||
if (values[start] < target) {
|
||||
start += 1;
|
||||
}
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void binarySearch5() {
|
||||
for (int i = 0; i < startIndexes.length; ++i) {
|
||||
binarySearch5(values, targets[i], startIndexes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
|
||||
private static int binarySearch5(long[] values, long target, int startIndex) {
|
||||
// Other way to write a binary search
|
||||
int start = 0;
|
||||
|
||||
for (int shift = 6; shift >= 0; --shift) {
|
||||
int halfRange = 1 << shift;
|
||||
if (values[start + halfRange - 1] < target) {
|
||||
start += halfRange;
|
||||
}
|
||||
}
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void binarySearch6() {
|
||||
for (int i = 0; i < startIndexes.length; ++i) {
|
||||
binarySearch6(values, targets[i], startIndexes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
|
||||
private static int binarySearch6(long[] values, long target, int startIndex) {
|
||||
// Other way to write a binary search
|
||||
int start = 0;
|
||||
|
||||
for (int halfRange = 64; halfRange > 0; halfRange >>= 1) {
|
||||
if (values[start + halfRange - 1] < target) {
|
||||
start += halfRange;
|
||||
}
|
||||
}
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void linearSearch() {
|
||||
for (int i = 0; i < startIndexes.length; ++i) {
|
||||
linearSearch(values, targets[i], startIndexes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
|
||||
private static int linearSearch(long[] values, long target, int startIndex) {
|
||||
// Naive linear search.
|
||||
for (int i = startIndex; i < values.length; ++i) {
|
      if (values[i] >= target) {
        return i;
      }
    }
    return values.length;
  }

  @Benchmark
  public void bruteForceSearch() {
    for (int i = 0; i < startIndexes.length; ++i) {
      bruteForceSearch(values, targets[i], startIndexes[i]);
    }
  }

  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
  private static int bruteForceSearch(long[] values, long target, int startIndex) {
    // Linear search with predictable start/end offsets to see if it helps the compiler.
    for (int i = 0; i < 128; ++i) {
      if (values[i] >= target) {
        return i;
      }
    }
    return values.length;
  }

  @Benchmark
  public void linearSearch2() {
    for (int i = 0; i < startIndexes.length; ++i) {
      linearSearch2(values, targets[i], startIndexes[i]);
    }
  }

  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
  private static int linearSearch2(long[] values, long target, int startIndex) {
    // Two-level linear search, first checking every 8-th value, then values within an 8-value range
    int rangeStart = values.length - 8;

    for (int i = startIndex; i + 8 <= values.length; i += 8) {
      if (values[i + 7] >= target) {
        rangeStart = i;
        break;
      }
    }

    for (int i = 0; i < 8; ++i) {
      if (values[rangeStart + i] >= target) {
        return rangeStart + i;
      }
    }

    return values.length;
  }

  @Benchmark
  public void linearSearch3() {
    for (int i = 0; i < startIndexes.length; ++i) {
      linearSearch3(values, targets[i], startIndexes[i]);
    }
  }

  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
  private static int linearSearch3(long[] values, long target, int startIndex) {
    // Iteration over linearSearch that tries to reduce branches
    while (startIndex + 4 <= values.length) {
      int count = values[startIndex] < target ? 1 : 0;
      if (values[startIndex + 1] < target) {
        count++;
      }
      if (values[startIndex + 2] < target) {
        count++;
      }
      if (values[startIndex + 3] < target) {
        count++;
      }
      if (count != 4) {
        return startIndex + count;
      }
      startIndex += 4;
    }

    for (int i = startIndex; i < values.length; ++i) {
      if (values[i] >= target) {
        return i;
      }
    }

    return values.length;
  }

  @Benchmark
  public void hybridSearch() {
    for (int i = 0; i < startIndexes.length; ++i) {
      hybridSearch(values, targets[i], startIndexes[i]);
    }
  }

  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
  private static int hybridSearch(long[] values, long target, int startIndex) {
    // Two-level linear search, first checking every 8-th value, then values within an 8-value range
    int rangeStart = values.length - 8;

    for (int i = startIndex; i + 8 <= values.length; i += 8) {
      if (values[i + 7] >= target) {
        rangeStart = i;
        break;
      }
    }

    return binarySearchHelper8(values, target, rangeStart);
  }

  // branchless binary search over 8 values
  private static int binarySearchHelper8(long[] values, long target, int start) {
    if (values[start + 3] < target) {
      start += 4;
    }
    if (values[start + 1] < target) {
      start += 2;
    }
    if (values[start] < target) {
      start += 1;
    }
    return start;
  }

  private static void assertEquals(int expected, int actual) {
    if (expected != actual) {
      throw new AssertionError("Expected: " + expected + ", got " + actual);
    }
  }

  public static void main(String[] args) {
    // For testing purposes
    long[] values = new long[129];
    for (int i = 0; i < 128; ++i) {
      values[i] = i;
    }
    values[128] = DocIdSetIterator.NO_MORE_DOCS;
    for (int start = 0; start < 128; ++start) {
      for (int targetIndex = start; targetIndex < 128; ++targetIndex) {
        int actualIndex = binarySearch(values, values[targetIndex], start);
        assertEquals(targetIndex, actualIndex);
        actualIndex = binarySearch2(values, values[targetIndex], start);
        assertEquals(targetIndex, actualIndex);
        actualIndex = binarySearch3(values, values[targetIndex], start);
        assertEquals(targetIndex, actualIndex);
        actualIndex = binarySearch4(values, values[targetIndex], start);
        assertEquals(targetIndex, actualIndex);
        actualIndex = binarySearch5(values, values[targetIndex], start);
        assertEquals(targetIndex, actualIndex);
        actualIndex = binarySearch6(values, values[targetIndex], start);
        assertEquals(targetIndex, actualIndex);
        actualIndex = bruteForceSearch(values, values[targetIndex], start);
        assertEquals(targetIndex, actualIndex);
        actualIndex = hybridSearch(values, values[targetIndex], start);
        assertEquals(targetIndex, actualIndex);
        actualIndex = linearSearch(values, values[targetIndex], start);
        assertEquals(targetIndex, actualIndex);
        actualIndex = linearSearch2(values, values[targetIndex], start);
        assertEquals(targetIndex, actualIndex);
        actualIndex = linearSearch3(values, values[targetIndex], start);
        assertEquals(targetIndex, actualIndex);
      }
    }
  }
}
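For reference, binarySearchHelper8 above is a fixed-size instance of a more general pattern: a branchless lower-bound search over a power-of-two window, where each comparison halves the remaining range and the JIT can often emit conditional moves instead of jumps. A minimal illustrative sketch, not part of the patch (the method name is made up):

  private static int branchlessLowerBound(long[] values, long target, int start, int windowSize) {
    // Assumes windowSize is a power of two, values[start, start + windowSize) is sorted,
    // and the answer lies inside the window, i.e. values[start + windowSize - 1] >= target
    // (hybridSearch guarantees this before calling the 8-wide helper).
    for (int step = windowSize >>> 1; step > 0; step >>>= 1) {
      if (values[start + step - 1] < target) {
        start += step;
      }
    }
    return start;
  }

With windowSize = 8 this performs exactly the three comparisons of binarySearchHelper8.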
@ -0,0 +1,75 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.benchmark.jmh;

import java.io.IOException;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.util.VectorUtil;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;

@Fork(1)
@Warmup(iterations = 3, time = 3)
@Measurement(iterations = 5, time = 3)
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.SECONDS)
@State(Scope.Benchmark)
public class HammingDistanceBenchmark {
  @Param({"1000000"})
  int nb = 1_000_000;

  @Param({"1024"})
  int dims = 1024;

  byte[][] xb;
  byte[] xq;

  @Setup
  public void setup() throws IOException {
    Random rand = new Random();
    this.xb = new byte[nb][dims / 8];
    for (int i = 0; i < nb; i++) {
      for (int j = 0; j < dims / 8; j++) {
        xb[i][j] = (byte) rand.nextInt(0, 255);
      }
    }
    this.xq = new byte[dims / 8];
    for (int i = 0; i < xq.length; i++) {
      xq[i] = (byte) rand.nextInt(0, 255);
    }
  }

  @Benchmark
  public int xorBitCount() {
    int tot = 0;
    for (int i = 0; i < nb; i++) {
      tot += VectorUtil.xorBitCount(xb[i], xq);
    }
    return tot;
  }
}
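The benchmark above drives VectorUtil.xorBitCount over packed binary vectors. The quantity it measures is plain Hamming distance: the number of bit positions where two bit vectors differ, which is the population count of their XOR. A minimal reference sketch of that definition (illustrative only; the Lucene implementation is optimized to work on wider words where it can):

  static int hammingDistance(byte[] a, byte[] b) {
    // count the bit positions where the two packed bit vectors differ
    int distance = 0;
    for (int i = 0; i < a.length; i++) {
      distance += Integer.bitCount((a[i] ^ b[i]) & 0xFF);
    }
    return distance;
  }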
@ -0,0 +1,108 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.benchmark.jmh;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.codecs.lucene912.ForDeltaUtil;
import org.apache.lucene.codecs.lucene912.ForUtil;
import org.apache.lucene.codecs.lucene912.PostingIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.IOUtils;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;

@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
@State(Scope.Benchmark)
@Warmup(iterations = 5, time = 1)
@Measurement(iterations = 5, time = 1)
@Fork(
    value = 3,
    jvmArgsAppend = {"-Xmx1g", "-Xms1g", "-XX:+AlwaysPreTouch"})
public class PostingIndexInputBenchmark {

  private Path path;
  private Directory dir;
  private IndexInput in;
  private PostingIndexInput postingIn;
  private final ForUtil forUtil = new ForUtil();
  private final ForDeltaUtil forDeltaUtil = new ForDeltaUtil();
  private final long[] values = new long[128];

  @Param({"2", "3", "4", "5", "6", "7", "8", "9", "10"})
  public int bpv;

  @Setup(Level.Trial)
  public void setup() throws Exception {
    path = Files.createTempDirectory("forUtil");
    dir = MMapDirectory.open(path);
    try (IndexOutput out = dir.createOutput("docs", IOContext.DEFAULT)) {
      Random r = new Random(0);
      // Write enough random data to not reach EOF while decoding
      for (int i = 0; i < 100; ++i) {
        out.writeLong(r.nextLong());
      }
    }
    in = dir.openInput("docs", IOContext.DEFAULT);
    postingIn = new PostingIndexInput(in, forUtil, forDeltaUtil);
  }

  @TearDown(Level.Trial)
  public void tearDown() throws Exception {
    if (dir != null) {
      dir.deleteFile("docs");
    }
    IOUtils.close(in, dir);
    in = null;
    dir = null;
    Files.deleteIfExists(path);
  }

  @Benchmark
  public void decode(Blackhole bh) throws IOException {
    in.seek(3); // random unaligned offset
    postingIn.decode(bpv, values);
    bh.consume(values);
  }

  @Benchmark
  public void decodeAndPrefixSum(Blackhole bh) throws IOException {
    in.seek(3); // random unaligned offset
    postingIn.decodeAndPrefixSum(bpv, 100, values);
    bh.consume(values);
  }
}
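To run one of these benchmarks in isolation, the JMH runner API can be invoked directly instead of the module's usual gradle tooling. A hedged sketch (the wrapper class name and the chosen bpv values are illustrative, not part of the patch):

  import org.openjdk.jmh.runner.Runner;
  import org.openjdk.jmh.runner.options.Options;
  import org.openjdk.jmh.runner.options.OptionsBuilder;

  public class RunPostingIndexInputBenchmark {
    public static void main(String[] args) throws Exception {
      // select the benchmark class by regex and pin the bpv parameter to a couple of values
      Options options =
          new OptionsBuilder()
              .include(PostingIndexInputBenchmark.class.getSimpleName())
              .param("bpv", "4", "8")
              .build();
      new Runner(options).run();
    }
  }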
@ -17,11 +17,10 @@
# -------------------------------------------------------------------------------------
# multi val params are iterated by NewRound's, added to reports, start with column name.

# collector.class can be:
#   Fully Qualified Class Name of a Collector with a empty constructor
#   topScoreDocOrdered - Creates a TopScoreDocCollector that requires in order docs
#   topScoreDocUnordered - Like above, but allows out of order
collector.class=coll:topScoreDoc
# collector.manager.class can be:
#   Fully Qualified Class Name of a CollectorManager with a empty constructor
#   topScoreDoc - Creates a TopScoreDocCollectorManager
collector.manager.class=coll:topScoreDoc

analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer
directory=FSDirectory

@ -17,11 +17,10 @@
# -------------------------------------------------------------------------------------
# multi val params are iterated by NewRound's, added to reports, start with column name.

# collector.class can be:
#   Fully Qualified Class Name of a Collector with a empty constructor
#   topScoreDocOrdered - Creates a TopScoreDocCollector that requires in order docs
#   topScoreDocUnordered - Like above, but allows out of order
collector.class=coll:topScoreDoc
# collector.manager.class can be:
#   Fully Qualified Class Name of a CollectorManager with a empty constructor
#   topScoreDoc - Creates a TopScoreDocCollectorManager
collector.manager.class=coll:topScoreDoc

analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer
directory=FSDirectory
@ -24,7 +24,7 @@ import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiBits;
import org.apache.lucene.index.StoredFields;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.CollectorManager;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;

@ -119,9 +119,7 @@ public abstract class ReadTask extends PerfTask {
          hits = searcher.search(q, numHits);
        }
      } else {
        Collector collector = createCollector();

        searcher.search(q, collector);
        searcher.search(q, createCollectorManager());
        // hits = collector.topDocs();
      }

@ -184,9 +182,8 @@ public abstract class ReadTask extends PerfTask {
    return res;
  }

  protected Collector createCollector() throws Exception {
    return new TopScoreDocCollectorManager(numHits(), withTotalHits() ? Integer.MAX_VALUE : 1)
        .newCollector();
  protected CollectorManager<?, ?> createCollectorManager() throws Exception {
    return new TopScoreDocCollectorManager(numHits(), withTotalHits() ? Integer.MAX_VALUE : 1);
  }

  protected Document retrieveDoc(StoredFields storedFields, int id) throws IOException {

@ -19,8 +19,8 @@ package org.apache.lucene.benchmark.byTask.tasks;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.CollectorManager;
import org.apache.lucene.search.TopScoreDocCollectorManager;

/** Does search w/ a custom collector */
public class SearchWithCollectorTask extends SearchTask {

@ -37,7 +41,11 @@ public class SearchWithCollectorTask extends SearchTask {
    // check to make sure either the doc is being stored
    PerfRunData runData = getRunData();
    Config config = runData.getConfig();
    clnName = config.get("collector.class", "");
    if (config.get("collector.class", null) != null) {
      throw new IllegalArgumentException(
          "collector.class is no longer supported as a config parameter, use collector.manager.class instead to provide a CollectorManager class name");
    }
    clnName = config.get("collector.manager.class", "");
  }

  @Override

@ -46,17 +50,17 @@ public class SearchWithCollectorTask extends SearchTask {
  }

  @Override
  protected Collector createCollector() throws Exception {
    Collector collector = null;
  protected CollectorManager<?, ?> createCollectorManager() throws Exception {
    CollectorManager<?, ?> collectorManager;
    if (clnName.equalsIgnoreCase("topScoreDoc") == true) {
      collector = TopScoreDocCollector.create(numHits(), Integer.MAX_VALUE);
      collectorManager = new TopScoreDocCollectorManager(numHits(), Integer.MAX_VALUE);
    } else if (clnName.length() > 0) {
      collector = Class.forName(clnName).asSubclass(Collector.class).getConstructor().newInstance();
      collectorManager =
          Class.forName(clnName).asSubclass(CollectorManager.class).getConstructor().newInstance();
    } else {
      collector = super.createCollector();
      collectorManager = super.createCollectorManager();
    }
    return collector;
    return collectorManager;
  }

  @Override
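The change above moves the benchmark tasks from the single-threaded Collector entry point to CollectorManager, which creates one collector per search slice and merges the per-slice results in reduce(). A minimal usage sketch of the manager-based API (the searcher and query are assumed to exist elsewhere; types come from org.apache.lucene.search):

  static TopDocs topTen(IndexSearcher searcher, Query query) throws IOException {
    // reduce() merges the per-slice collectors, so this works with concurrent segment search
    return searcher.search(query, new TopScoreDocCollectorManager(10, Integer.MAX_VALUE));
  }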
@ -23,13 +23,13 @@ import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;

/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene99PostingsWriter}. */
/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene912PostingsWriter}. */
public class BlockTreeOrdsPostingsFormat extends PostingsFormat {

  private final int minTermBlockSize;

@ -67,7 +67,7 @@ public class BlockTreeOrdsPostingsFormat extends PostingsFormat {

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
    PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);

    boolean success = false;
    try {

@ -84,7 +84,7 @@ public class BlockTreeOrdsPostingsFormat extends PostingsFormat {

  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
    PostingsReaderBase postingsReader = new Lucene99PostingsReader(state);
    PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
    boolean success = false;
    try {
      FieldsProducer ret = new OrdsBlockTreeTermsReader(postingsReader, state);

@ -43,6 +43,7 @@ import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOBooleanSupplier;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.automaton.CompiledAutomaton;

@ -315,12 +316,21 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat {
    }

    @Override
    public boolean seekExact(BytesRef text) throws IOException {
    public IOBooleanSupplier prepareSeekExact(BytesRef text) throws IOException {
      // The magical fail-fast speed up that is the entire point of all of
      // this code - save a disk seek if there is a match on an in-memory
      // structure
      // that may occasionally give a false positive but guaranteed no false
      // negatives
      if (filter.contains(text) == ContainsResult.NO) {
        return null;
      }
      return delegate().prepareSeekExact(text);
    }

    @Override
    public boolean seekExact(BytesRef text) throws IOException {
      // See #prepareSeekExact
      if (filter.contains(text) == ContainsResult.NO) {
        return false;
      }
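prepareSeekExact splits the term lookup into a cheap in-memory check and a deferred on-disk seek: a null return means the bloom filter proved the term absent, while the returned supplier only pays for the real seek when it is evaluated. A hedged caller-side sketch, assuming the two-phase TermsEnum API shown in this patch (the helper name is illustrative):

  static boolean seekIfPossiblyPresent(TermsEnum termsEnum, BytesRef term) throws IOException {
    IOBooleanSupplier deferredSeek = termsEnum.prepareSeekExact(term);
    if (deferredSeek == null) {
      return false; // the bloom filter guarantees the term does not exist
    }
    return deferredSeek.get(); // only now is the on-disk seek performed
  }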
@ -24,7 +24,7 @@ import java.util.TreeMap;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.index.BaseTermsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Fields;

@ -54,7 +54,7 @@ import org.apache.lucene.util.automaton.TransitionAccessor;
// - or: longer dense skip lists than just next byte?

/**
 * Wraps {@link Lucene99PostingsFormat} format for on-disk storage, but then at read time loads and
 * Wraps {@link Lucene912PostingsFormat} format for on-disk storage, but then at read time loads and
 * stores all terms and postings directly in RAM as byte[], int[].
 *
 * <p><b>WARNING</b>: This is exceptionally RAM intensive: it makes no effort to compress the

@ -97,12 +97,12 @@ public final class DirectPostingsFormat extends PostingsFormat {

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    return PostingsFormat.forName("Lucene99").fieldsConsumer(state);
    return PostingsFormat.forName("Lucene912").fieldsConsumer(state);
  }

  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
    FieldsProducer postings = PostingsFormat.forName("Lucene99").fieldsProducer(state);
    FieldsProducer postings = PostingsFormat.forName("Lucene912").fieldsProducer(state);
    if (state.context.context() != IOContext.Context.MERGE) {
      FieldsProducer loadedPostings;
      try {

@ -22,8 +22,8 @@ import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;

@ -41,7 +41,7 @@ public final class FSTPostingsFormat extends PostingsFormat {

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
    PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);

    boolean success = false;
    try {

@ -57,7 +57,7 @@ public final class FSTPostingsFormat extends PostingsFormat {

  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
    PostingsReaderBase postingsReader = new Lucene99PostingsReader(state);
    PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
    boolean success = false;
    try {
      FieldsProducer ret = new FSTTermsReader(state, postingsReader);

@ -195,9 +195,10 @@ public class FSTTermsReader extends FieldsProducer {
      this.sumTotalTermFreq = sumTotalTermFreq;
      this.sumDocFreq = sumDocFreq;
      this.docCount = docCount;
      OffHeapFSTStore offHeapFSTStore = new OffHeapFSTStore();
      FSTTermOutputs outputs = new FSTTermOutputs(fieldInfo);
      this.dict = new FST<>(FST.readMetadata(in, outputs), in, offHeapFSTStore);
      final var fstMetadata = FST.readMetadata(in, outputs);
      OffHeapFSTStore offHeapFSTStore = new OffHeapFSTStore(in, in.getFilePointer(), fstMetadata);
      this.dict = FST.fromFSTReader(fstMetadata, offHeapFSTStore);
      in.skipBytes(offHeapFSTStore.size());
    }
@ -71,8 +71,8 @@ final class SimpleTextBKDReader extends PointValues {
    this.pointCount = pointCount;
    this.docCount = docCount;
    this.version = SimpleTextBKDWriter.VERSION_CURRENT;
    assert minPackedValue.length == config.packedIndexBytesLength;
    assert maxPackedValue.length == config.packedIndexBytesLength;
    assert minPackedValue.length == config.packedIndexBytesLength();
    assert maxPackedValue.length == config.packedIndexBytesLength();
  }

  @Override

@ -99,8 +99,8 @@ final class SimpleTextBKDReader extends PointValues {
    private SimpleTextPointTree(
        IndexInput in, int nodeID, int level, byte[] minPackedValue, byte[] maxPackedValue) {
      this.in = in;
      this.scratchDocIDs = new int[config.maxPointsInLeafNode];
      this.scratchPackedValue = new byte[config.packedBytesLength];
      this.scratchDocIDs = new int[config.maxPointsInLeafNode()];
      this.scratchPackedValue = new byte[config.packedBytesLength()];
      this.nodeID = nodeID;
      this.rootNode = nodeID;
      this.level = level;
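The remaining hunks in this file, and in SimpleTextBKDWriter below, are mechanical: every BKDConfig field access gains parentheses because the configuration values are now read through accessor methods rather than public fields, in the style of a Java record. An illustrative shape only, with abbreviated names and not the real class:

  record BKDConfigSketch(int numDims, int numIndexDims, int bytesPerDim, int maxPointsInLeafNode) {
    // derived sizes become plain methods instead of precomputed public fields
    int packedBytesLength() {
      return numDims() * bytesPerDim();
    }

    int packedIndexBytesLength() {
      return numIndexDims() * bytesPerDim();
    }
  }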
@ -145,38 +145,39 @@ final class SimpleTextBKDReader extends PointValues {
|
|||
private void pushLeft() {
|
||||
int address = nodeID * bytesPerIndexEntry;
|
||||
// final int splitDimPos;
|
||||
if (config.numIndexDims == 1) {
|
||||
if (config.numIndexDims() == 1) {
|
||||
splitDims[level] = 0;
|
||||
} else {
|
||||
splitDims[level] = (splitPackedValues[address++] & 0xff);
|
||||
}
|
||||
final int splitDimPos = splitDims[level] * config.bytesPerDim;
|
||||
final int splitDimPos = splitDims[level] * config.bytesPerDim();
|
||||
if (splitDimValueStack[level] == null) {
|
||||
splitDimValueStack[level] = new byte[config.bytesPerDim];
|
||||
splitDimValueStack[level] = new byte[config.bytesPerDim()];
|
||||
}
|
||||
// save the dimension we are going to change
|
||||
System.arraycopy(
|
||||
maxPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim);
|
||||
maxPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim());
|
||||
assert Arrays.compareUnsigned(
|
||||
maxPackedValue,
|
||||
splitDimPos,
|
||||
splitDimPos + config.bytesPerDim,
|
||||
splitDimPos + config.bytesPerDim(),
|
||||
splitPackedValues,
|
||||
address,
|
||||
address + config.bytesPerDim)
|
||||
address + config.bytesPerDim())
|
||||
>= 0
|
||||
: "config.bytesPerDim="
|
||||
+ config.bytesPerDim
|
||||
: "config.bytesPerDim()="
|
||||
+ config.bytesPerDim()
|
||||
+ " splitDim="
|
||||
+ splitDims[level]
|
||||
+ " config.numIndexDims="
|
||||
+ config.numIndexDims
|
||||
+ " config.numIndexDims()="
|
||||
+ config.numIndexDims()
|
||||
+ " config.numDims="
|
||||
+ config.numDims;
|
||||
+ config.numDims();
|
||||
nodeID *= 2;
|
||||
level++;
|
||||
// add the split dim value:
|
||||
System.arraycopy(splitPackedValues, address, maxPackedValue, splitDimPos, config.bytesPerDim);
|
||||
System.arraycopy(
|
||||
splitPackedValues, address, maxPackedValue, splitDimPos, config.bytesPerDim());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -191,37 +192,38 @@ final class SimpleTextBKDReader extends PointValues {
|
|||
|
||||
private void pushRight() {
|
||||
int address = nodeID * bytesPerIndexEntry;
|
||||
if (config.numIndexDims == 1) {
|
||||
if (config.numIndexDims() == 1) {
|
||||
splitDims[level] = 0;
|
||||
} else {
|
||||
splitDims[level] = (splitPackedValues[address++] & 0xff);
|
||||
}
|
||||
final int splitDimPos = splitDims[level] * config.bytesPerDim;
|
||||
final int splitDimPos = splitDims[level] * config.bytesPerDim();
|
||||
// we should have already visit the left node
|
||||
assert splitDimValueStack[level] != null;
|
||||
// save the dimension we are going to change
|
||||
System.arraycopy(
|
||||
minPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim);
|
||||
minPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim());
|
||||
assert Arrays.compareUnsigned(
|
||||
minPackedValue,
|
||||
splitDimPos,
|
||||
splitDimPos + config.bytesPerDim,
|
||||
splitDimPos + config.bytesPerDim(),
|
||||
splitPackedValues,
|
||||
address,
|
||||
address + config.bytesPerDim)
|
||||
address + config.bytesPerDim())
|
||||
<= 0
|
||||
: "config.bytesPerDim="
|
||||
+ config.bytesPerDim
|
||||
: "config.bytesPerDim()="
|
||||
+ config.bytesPerDim()
|
||||
+ " splitDim="
|
||||
+ splitDims[level]
|
||||
+ " config.numIndexDims="
|
||||
+ config.numIndexDims
|
||||
+ " config.numIndexDims()="
|
||||
+ config.numIndexDims()
|
||||
+ " config.numDims="
|
||||
+ config.numDims;
|
||||
+ config.numDims();
|
||||
nodeID = 2 * nodeID + 1;
|
||||
level++;
|
||||
// add the split dim value:
|
||||
System.arraycopy(splitPackedValues, address, minPackedValue, splitDimPos, config.bytesPerDim);
|
||||
System.arraycopy(
|
||||
splitPackedValues, address, minPackedValue, splitDimPos, config.bytesPerDim());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -242,16 +244,16 @@ final class SimpleTextBKDReader extends PointValues {
|
|||
splitDimValueStack[level],
|
||||
0,
|
||||
maxPackedValue,
|
||||
splitDims[level] * config.bytesPerDim,
|
||||
config.bytesPerDim);
|
||||
splitDims[level] * config.bytesPerDim(),
|
||||
config.bytesPerDim());
|
||||
} else {
|
||||
|
||||
System.arraycopy(
|
||||
splitDimValueStack[level],
|
||||
0,
|
||||
minPackedValue,
|
||||
splitDims[level] * config.bytesPerDim,
|
||||
config.bytesPerDim);
|
||||
splitDims[level] * config.bytesPerDim(),
|
||||
config.bytesPerDim());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -290,7 +292,7 @@ final class SimpleTextBKDReader extends PointValues {
|
|||
private long sizeFromBalancedTree(int leftMostLeafNode, int rightMostLeafNode) {
|
||||
// number of points that need to be distributed between leaves, one per leaf
|
||||
final int extraPoints =
|
||||
Math.toIntExact(((long) config.maxPointsInLeafNode * leafNodeOffset) - pointCount);
|
||||
Math.toIntExact(((long) config.maxPointsInLeafNode() * leafNodeOffset) - pointCount);
|
||||
assert extraPoints < leafNodeOffset : "point excess should be lower than leafNodeOffset";
|
||||
// offset where we stop adding one point to the leaves
|
||||
final int nodeOffset = leafNodeOffset - extraPoints;
|
||||
|
@ -298,9 +300,9 @@ final class SimpleTextBKDReader extends PointValues {
|
|||
for (int node = leftMostLeafNode; node <= rightMostLeafNode; node++) {
|
||||
// offsetPosition provides which extra point will be added to this node
|
||||
if (balanceTreeNodePosition(0, leafNodeOffset, node - leafNodeOffset, 0, 0) < nodeOffset) {
|
||||
count += config.maxPointsInLeafNode;
|
||||
count += config.maxPointsInLeafNode();
|
||||
} else {
|
||||
count += config.maxPointsInLeafNode - 1;
|
||||
count += config.maxPointsInLeafNode() - 1;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
|
@ -376,14 +378,14 @@ final class SimpleTextBKDReader extends PointValues {
|
|||
// Again, this time reading values and checking with the visitor
|
||||
visitor.grow(count);
|
||||
// NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
|
||||
assert scratchPackedValue.length == config.packedBytesLength;
|
||||
assert scratchPackedValue.length == config.packedBytesLength();
|
||||
BytesRefBuilder scratch = new BytesRefBuilder();
|
||||
for (int i = 0; i < count; i++) {
|
||||
readLine(in, scratch);
|
||||
assert startsWith(scratch, BLOCK_VALUE);
|
||||
BytesRef br = SimpleTextUtil.fromBytesRefString(stripPrefix(scratch, BLOCK_VALUE));
|
||||
assert br.length == config.packedBytesLength;
|
||||
System.arraycopy(br.bytes, br.offset, scratchPackedValue, 0, config.packedBytesLength);
|
||||
assert br.length == config.packedBytesLength();
|
||||
System.arraycopy(br.bytes, br.offset, scratchPackedValue, 0, config.packedBytesLength());
|
||||
visitor.visit(scratchDocIDs[i], scratchPackedValue);
|
||||
}
|
||||
} else {
|
||||
|
@ -443,17 +445,17 @@ final class SimpleTextBKDReader extends PointValues {
|
|||
|
||||
@Override
|
||||
public int getNumDimensions() throws IOException {
|
||||
return config.numDims;
|
||||
return config.numDims();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getNumIndexDimensions() throws IOException {
|
||||
return config.numIndexDims;
|
||||
return config.numIndexDims();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getBytesPerDimension() throws IOException {
|
||||
return config.bytesPerDim;
|
||||
return config.bytesPerDim();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -144,28 +144,28 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
this.maxDoc = maxDoc;
|
||||
docsSeen = new FixedBitSet(maxDoc);
|
||||
|
||||
scratchDiff = new byte[config.bytesPerDim];
|
||||
scratch1 = new byte[config.packedBytesLength];
|
||||
scratch2 = new byte[config.packedBytesLength];
|
||||
commonPrefixLengths = new int[config.numDims];
|
||||
scratchDiff = new byte[config.bytesPerDim()];
|
||||
scratch1 = new byte[config.packedBytesLength()];
|
||||
scratch2 = new byte[config.packedBytesLength()];
|
||||
commonPrefixLengths = new int[config.numDims()];
|
||||
|
||||
minPackedValue = new byte[config.packedIndexBytesLength];
|
||||
maxPackedValue = new byte[config.packedIndexBytesLength];
|
||||
minPackedValue = new byte[config.packedIndexBytesLength()];
|
||||
maxPackedValue = new byte[config.packedIndexBytesLength()];
|
||||
|
||||
// Maximum number of points we hold in memory at any time
|
||||
maxPointsSortInHeap =
|
||||
(int) ((maxMBSortInHeap * 1024 * 1024) / (config.bytesPerDoc * config.numDims));
|
||||
(int) ((maxMBSortInHeap * 1024 * 1024) / (config.bytesPerDoc() * config.numDims()));
|
||||
|
||||
// Finally, we must be able to hold at least the leaf node in heap during build:
|
||||
if (maxPointsSortInHeap < config.maxPointsInLeafNode) {
|
||||
if (maxPointsSortInHeap < config.maxPointsInLeafNode()) {
|
||||
throw new IllegalArgumentException(
|
||||
"maxMBSortInHeap="
|
||||
+ maxMBSortInHeap
|
||||
+ " only allows for maxPointsSortInHeap="
|
||||
+ maxPointsSortInHeap
|
||||
+ ", but this is less than config.maxPointsInLeafNode="
|
||||
+ config.maxPointsInLeafNode
|
||||
+ "; either increase maxMBSortInHeap or decrease config.maxPointsInLeafNode");
|
||||
+ ", but this is less than config.maxPointsInLeafNode()="
|
||||
+ config.maxPointsInLeafNode()
|
||||
+ "; either increase maxMBSortInHeap or decrease config.maxPointsInLeafNode()");
|
||||
}
|
||||
|
||||
this.maxMBSortInHeap = maxMBSortInHeap;
|
||||
|
@ -183,10 +183,10 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
}
|
||||
|
||||
public void add(byte[] packedValue, int docID) throws IOException {
|
||||
if (packedValue.length != config.packedBytesLength) {
|
||||
if (packedValue.length != config.packedBytesLength()) {
|
||||
throw new IllegalArgumentException(
|
||||
"packedValue should be length="
|
||||
+ config.packedBytesLength
|
||||
+ config.packedBytesLength()
|
||||
+ " (got: "
|
||||
+ packedValue.length
|
||||
+ ")");
|
||||
|
@ -209,30 +209,30 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
} else {
|
||||
pointWriter = new HeapPointWriter(config, Math.toIntExact(totalPointCount));
|
||||
}
|
||||
System.arraycopy(packedValue, 0, minPackedValue, 0, config.packedIndexBytesLength);
|
||||
System.arraycopy(packedValue, 0, maxPackedValue, 0, config.packedIndexBytesLength);
|
||||
System.arraycopy(packedValue, 0, minPackedValue, 0, config.packedIndexBytesLength());
|
||||
System.arraycopy(packedValue, 0, maxPackedValue, 0, config.packedIndexBytesLength());
|
||||
} else {
|
||||
for (int dim = 0; dim < config.numIndexDims; dim++) {
|
||||
int offset = dim * config.bytesPerDim;
|
||||
for (int dim = 0; dim < config.numIndexDims(); dim++) {
|
||||
int offset = dim * config.bytesPerDim();
|
||||
if (Arrays.compareUnsigned(
|
||||
packedValue,
|
||||
offset,
|
||||
offset + config.bytesPerDim,
|
||||
offset + config.bytesPerDim(),
|
||||
minPackedValue,
|
||||
offset,
|
||||
offset + config.bytesPerDim)
|
||||
offset + config.bytesPerDim())
|
||||
< 0) {
|
||||
System.arraycopy(packedValue, offset, minPackedValue, offset, config.bytesPerDim);
|
||||
System.arraycopy(packedValue, offset, minPackedValue, offset, config.bytesPerDim());
|
||||
}
|
||||
if (Arrays.compareUnsigned(
|
||||
packedValue,
|
||||
offset,
|
||||
offset + config.bytesPerDim,
|
||||
offset + config.bytesPerDim(),
|
||||
maxPackedValue,
|
||||
offset,
|
||||
offset + config.bytesPerDim)
|
||||
offset + config.bytesPerDim())
|
||||
> 0) {
|
||||
System.arraycopy(packedValue, offset, maxPackedValue, offset, config.bytesPerDim);
|
||||
System.arraycopy(packedValue, offset, maxPackedValue, offset, config.bytesPerDim());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -254,7 +254,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
*/
|
||||
public long writeField(IndexOutput out, String fieldName, MutablePointTree reader)
|
||||
throws IOException {
|
||||
if (config.numIndexDims == 1) {
|
||||
if (config.numIndexDims() == 1) {
|
||||
return writeField1Dim(out, fieldName, reader);
|
||||
} else {
|
||||
return writeFieldNDims(out, fieldName, reader);
|
||||
|
@ -280,7 +280,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
long countPerLeaf = pointCount = values.size();
|
||||
long innerNodeCount = 1;
|
||||
|
||||
while (countPerLeaf > config.maxPointsInLeafNode) {
|
||||
while (countPerLeaf > config.maxPointsInLeafNode()) {
|
||||
countPerLeaf = (countPerLeaf + 1) / 2;
|
||||
innerNodeCount *= 2;
|
||||
}
|
||||
|
@ -289,7 +289,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
|
||||
checkMaxLeafNodeCount(numLeaves);
|
||||
|
||||
final byte[] splitPackedValues = new byte[numLeaves * (config.bytesPerDim + 1)];
|
||||
final byte[] splitPackedValues = new byte[numLeaves * (config.bytesPerDim() + 1)];
|
||||
final long[] leafBlockFPs = new long[numLeaves];
|
||||
|
||||
// compute the min/max for this slice
|
||||
|
@ -297,37 +297,37 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
Arrays.fill(maxPackedValue, (byte) 0);
|
||||
for (int i = 0; i < Math.toIntExact(pointCount); ++i) {
|
||||
values.getValue(i, scratchBytesRef1);
|
||||
for (int dim = 0; dim < config.numIndexDims; dim++) {
|
||||
int offset = dim * config.bytesPerDim;
|
||||
for (int dim = 0; dim < config.numIndexDims(); dim++) {
|
||||
int offset = dim * config.bytesPerDim();
|
||||
if (Arrays.compareUnsigned(
|
||||
scratchBytesRef1.bytes,
|
||||
scratchBytesRef1.offset + offset,
|
||||
scratchBytesRef1.offset + offset + config.bytesPerDim,
|
||||
scratchBytesRef1.offset + offset + config.bytesPerDim(),
|
||||
minPackedValue,
|
||||
offset,
|
||||
offset + config.bytesPerDim)
|
||||
offset + config.bytesPerDim())
|
||||
< 0) {
|
||||
System.arraycopy(
|
||||
scratchBytesRef1.bytes,
|
||||
scratchBytesRef1.offset + offset,
|
||||
minPackedValue,
|
||||
offset,
|
||||
config.bytesPerDim);
|
||||
config.bytesPerDim());
|
||||
}
|
||||
if (Arrays.compareUnsigned(
|
||||
scratchBytesRef1.bytes,
|
||||
scratchBytesRef1.offset + offset,
|
||||
scratchBytesRef1.offset + offset + config.bytesPerDim,
|
||||
scratchBytesRef1.offset + offset + config.bytesPerDim(),
|
||||
maxPackedValue,
|
||||
offset,
|
||||
offset + config.bytesPerDim)
|
||||
offset + config.bytesPerDim())
|
||||
> 0) {
|
||||
System.arraycopy(
|
||||
scratchBytesRef1.bytes,
|
||||
scratchBytesRef1.offset + offset,
|
||||
maxPackedValue,
|
||||
offset,
|
||||
config.bytesPerDim);
|
||||
config.bytesPerDim());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -345,7 +345,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
maxPackedValue,
|
||||
splitPackedValues,
|
||||
leafBlockFPs,
|
||||
new int[config.maxPointsInLeafNode]);
|
||||
new int[config.maxPointsInLeafNode()]);
|
||||
|
||||
long indexFP = out.getFilePointer();
|
||||
writeIndex(out, leafBlockFPs, splitPackedValues, Math.toIntExact(countPerLeaf));
|
||||
|
@ -387,15 +387,15 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
final IndexOutput out;
|
||||
final List<Long> leafBlockFPs = new ArrayList<>();
|
||||
final List<byte[]> leafBlockStartValues = new ArrayList<>();
|
||||
final byte[] leafValues = new byte[config.maxPointsInLeafNode * config.packedBytesLength];
|
||||
final int[] leafDocs = new int[config.maxPointsInLeafNode];
|
||||
final byte[] leafValues = new byte[config.maxPointsInLeafNode() * config.packedBytesLength()];
|
||||
final int[] leafDocs = new int[config.maxPointsInLeafNode()];
|
||||
long valueCount;
|
||||
int leafCount;
|
||||
|
||||
OneDimensionBKDWriter(IndexOutput out) {
|
||||
if (config.numIndexDims != 1) {
|
||||
if (config.numIndexDims() != 1) {
|
||||
throw new UnsupportedOperationException(
|
||||
"config.numIndexDims must be 1 but got " + config.numIndexDims);
|
||||
"config.numIndexDims() must be 1 but got " + config.numIndexDims());
|
||||
}
|
||||
if (pointCount != 0) {
|
||||
throw new IllegalStateException("cannot mix add and merge");
|
||||
|
@ -411,7 +411,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
|
||||
this.out = out;
|
||||
|
||||
lastPackedValue = new byte[config.packedBytesLength];
|
||||
lastPackedValue = new byte[config.packedBytesLength()];
|
||||
}
|
||||
|
||||
// for asserts
|
||||
|
@ -426,8 +426,8 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
packedValue,
|
||||
0,
|
||||
leafValues,
|
||||
leafCount * config.packedBytesLength,
|
||||
config.packedBytesLength);
|
||||
leafCount * config.packedBytesLength(),
|
||||
config.packedBytesLength());
|
||||
leafDocs[leafCount] = docID;
|
||||
docsSeen.set(docID);
|
||||
leafCount++;
|
||||
|
@ -441,7 +441,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
+ " values");
|
||||
}
|
||||
|
||||
if (leafCount == config.maxPointsInLeafNode) {
|
||||
if (leafCount == config.maxPointsInLeafNode()) {
|
||||
// We write a block once we hit exactly the max count ... this is different from
|
||||
// when we flush a new segment, where we write between max/2 and max per leaf block,
|
||||
// so merged segments will behave differently from newly flushed segments:
|
||||
|
@ -471,43 +471,44 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
// System.out.println("BKDW: now rotate numInnerNodes=" + numInnerNodes + " leafBlockStarts="
|
||||
// + leafBlockStartValues.size());
|
||||
|
||||
byte[] index = new byte[(1 + numInnerNodes) * (1 + config.bytesPerDim)];
|
||||
byte[] index = new byte[(1 + numInnerNodes) * (1 + config.bytesPerDim())];
|
||||
rotateToTree(1, 0, numInnerNodes, index, leafBlockStartValues);
|
||||
long[] arr = new long[leafBlockFPs.size()];
|
||||
for (int i = 0; i < leafBlockFPs.size(); i++) {
|
||||
arr[i] = leafBlockFPs.get(i);
|
||||
}
|
||||
writeIndex(out, arr, index, config.maxPointsInLeafNode);
|
||||
writeIndex(out, arr, index, config.maxPointsInLeafNode());
|
||||
return indexFP;
|
||||
}
|
||||
|
||||
private void writeLeafBlock() throws IOException {
|
||||
assert leafCount != 0;
|
||||
if (valueCount == 0) {
|
||||
System.arraycopy(leafValues, 0, minPackedValue, 0, config.packedIndexBytesLength);
|
||||
System.arraycopy(leafValues, 0, minPackedValue, 0, config.packedIndexBytesLength());
|
||||
}
|
||||
System.arraycopy(
|
||||
leafValues,
|
||||
(leafCount - 1) * config.packedBytesLength,
|
||||
(leafCount - 1) * config.packedBytesLength(),
|
||||
maxPackedValue,
|
||||
0,
|
||||
config.packedIndexBytesLength);
|
||||
config.packedIndexBytesLength());
|
||||
|
||||
valueCount += leafCount;
|
||||
|
||||
if (leafBlockFPs.size() > 0) {
|
||||
// Save the first (minimum) value in each leaf block except the first, to build the split
|
||||
// value index in the end:
|
||||
leafBlockStartValues.add(ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength));
|
||||
leafBlockStartValues.add(
|
||||
ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength()));
|
||||
}
|
||||
leafBlockFPs.add(out.getFilePointer());
|
||||
checkMaxLeafNodeCount(leafBlockFPs.size());
|
||||
|
||||
Arrays.fill(commonPrefixLengths, config.bytesPerDim);
|
||||
Arrays.fill(commonPrefixLengths, config.bytesPerDim());
|
||||
// Find per-dim common prefix:
|
||||
for (int dim = 0; dim < config.numDims; dim++) {
|
||||
int offset1 = dim * config.bytesPerDim;
|
||||
int offset2 = (leafCount - 1) * config.packedBytesLength + offset1;
|
||||
for (int dim = 0; dim < config.numDims(); dim++) {
|
||||
int offset1 = dim * config.bytesPerDim();
|
||||
int offset2 = (leafCount - 1) * config.packedBytesLength() + offset1;
|
||||
for (int j = 0; j < commonPrefixLengths[dim]; j++) {
|
||||
if (leafValues[offset1 + j] != leafValues[offset2 + j]) {
|
||||
commonPrefixLengths[dim] = j;
|
||||
|
@ -523,24 +524,24 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
final BytesRef scratch = new BytesRef();
|
||||
|
||||
{
|
||||
scratch.length = config.packedBytesLength;
|
||||
scratch.length = config.packedBytesLength();
|
||||
scratch.bytes = leafValues;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef apply(int i) {
|
||||
scratch.offset = config.packedBytesLength * i;
|
||||
scratch.offset = config.packedBytesLength() * i;
|
||||
return scratch;
|
||||
}
|
||||
};
|
||||
assert valuesInOrderAndBounds(
|
||||
leafCount,
|
||||
0,
|
||||
ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength),
|
||||
ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength()),
|
||||
ArrayUtil.copyOfSubArray(
|
||||
leafValues,
|
||||
(leafCount - 1) * config.packedBytesLength,
|
||||
leafCount * config.packedBytesLength),
|
||||
(leafCount - 1) * config.packedBytesLength(),
|
||||
leafCount * config.packedBytesLength()),
|
||||
packedValues,
|
||||
leafDocs,
|
||||
0);
|
||||
|
@ -552,7 +553,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
private void rotateToTree(
|
||||
int nodeID, int offset, int count, byte[] index, List<byte[]> leafBlockStartValues) {
|
||||
// System.out.println("ROTATE: nodeID=" + nodeID + " offset=" + offset + " count=" + count + "
|
||||
// bpd=" + config.bytesPerDim + " index.length=" + index.length);
|
||||
// bpd=" + config.bytesPerDim() + " index.length=" + index.length);
|
||||
if (count == 1) {
|
||||
// Leaf index node
|
||||
// System.out.println(" leaf index node");
|
||||
|
@ -561,8 +562,8 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
leafBlockStartValues.get(offset),
|
||||
0,
|
||||
index,
|
||||
nodeID * (1 + config.bytesPerDim) + 1,
|
||||
config.bytesPerDim);
|
||||
nodeID * (1 + config.bytesPerDim()) + 1,
|
||||
config.bytesPerDim());
|
||||
} else if (count > 1) {
|
||||
// Internal index node: binary partition of count
|
||||
int countAtLevel = 1;
|
||||
|
@ -587,8 +588,8 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
leafBlockStartValues.get(rootOffset),
|
||||
0,
|
||||
index,
|
||||
nodeID * (1 + config.bytesPerDim) + 1,
|
||||
config.bytesPerDim);
|
||||
nodeID * (1 + config.bytesPerDim()) + 1,
|
||||
config.bytesPerDim());
|
||||
// System.out.println(" index[" + nodeID + "] = blockStartValues[" + rootOffset + "]");
|
||||
|
||||
// TODO: we could optimize/specialize, when we know it's simply fully balanced binary tree
|
||||
|
@ -611,10 +612,10 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
}
|
||||
|
||||
private void checkMaxLeafNodeCount(int numLeaves) {
|
||||
if ((1 + config.bytesPerDim) * (long) numLeaves > ArrayUtil.MAX_ARRAY_LENGTH) {
|
||||
if ((1 + config.bytesPerDim()) * (long) numLeaves > ArrayUtil.MAX_ARRAY_LENGTH) {
|
||||
throw new IllegalStateException(
|
||||
"too many nodes; increase config.maxPointsInLeafNode (currently "
|
||||
+ config.maxPointsInLeafNode
|
||||
"too many nodes; increase config.maxPointsInLeafNode() (currently "
|
||||
+ config.maxPointsInLeafNode()
|
||||
+ ") and reindex");
|
||||
}
|
||||
}
|
||||
|
@ -652,7 +653,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
long countPerLeaf = pointCount;
|
||||
long innerNodeCount = 1;
|
||||
|
||||
while (countPerLeaf > config.maxPointsInLeafNode) {
|
||||
while (countPerLeaf > config.maxPointsInLeafNode()) {
|
||||
countPerLeaf = (countPerLeaf + 1) / 2;
|
||||
innerNodeCount *= 2;
|
||||
}
|
||||
|
@ -667,20 +668,20 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
|
||||
// Indexed by nodeID, but first (root) nodeID is 1. We do 1+ because the lead byte at each
|
||||
// recursion says which dim we split on.
|
||||
byte[] splitPackedValues = new byte[Math.multiplyExact(numLeaves, 1 + config.bytesPerDim)];
|
||||
byte[] splitPackedValues = new byte[Math.multiplyExact(numLeaves, 1 + config.bytesPerDim())];
|
||||
|
||||
// +1 because leaf count is power of 2 (e.g. 8), and innerNodeCount is power of 2 minus 1 (e.g.
|
||||
// 7)
|
||||
long[] leafBlockFPs = new long[numLeaves];
|
||||
|
||||
// Make sure the math above "worked":
|
||||
assert pointCount / numLeaves <= config.maxPointsInLeafNode
|
||||
assert pointCount / numLeaves <= config.maxPointsInLeafNode()
|
||||
: "pointCount="
|
||||
+ pointCount
|
||||
+ " numLeaves="
|
||||
+ numLeaves
|
||||
+ " config.maxPointsInLeafNode="
|
||||
+ config.maxPointsInLeafNode;
|
||||
+ " config.maxPointsInLeafNode()="
|
||||
+ config.maxPointsInLeafNode();
|
||||
|
||||
// We re-use the selector so we do not need to create an object every time.
|
||||
BKDRadixSelector radixSelector =
|
||||
|
@ -699,7 +700,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
maxPackedValue,
|
||||
splitPackedValues,
|
||||
leafBlockFPs,
|
||||
new int[config.maxPointsInLeafNode]);
|
||||
new int[config.maxPointsInLeafNode()]);
|
||||
|
||||
// If no exception, we should have cleaned everything up:
|
||||
assert tempDir.getCreatedFiles().isEmpty();
|
||||
|
@ -724,15 +725,15 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues, int maxPointsInLeafNode)
|
||||
throws IOException {
|
||||
write(out, NUM_DATA_DIMS);
|
||||
writeInt(out, config.numDims);
|
||||
writeInt(out, config.numDims());
|
||||
newline(out);
|
||||
|
||||
write(out, NUM_INDEX_DIMS);
|
||||
writeInt(out, config.numIndexDims);
|
||||
writeInt(out, config.numIndexDims());
|
||||
newline(out);
|
||||
|
||||
write(out, BYTES_PER_DIM);
|
||||
writeInt(out, config.bytesPerDim);
|
||||
writeInt(out, config.bytesPerDim());
|
||||
newline(out);
|
||||
|
||||
write(out, MAX_LEAF_POINTS);
|
||||
|
@ -767,8 +768,8 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
newline(out);
|
||||
}
|
||||
|
||||
assert (splitPackedValues.length % (1 + config.bytesPerDim)) == 0;
|
||||
int count = splitPackedValues.length / (1 + config.bytesPerDim);
|
||||
assert (splitPackedValues.length % (1 + config.bytesPerDim())) == 0;
|
||||
int count = splitPackedValues.length / (1 + config.bytesPerDim());
|
||||
assert count == leafBlockFPs.length;
|
||||
|
||||
write(out, SPLIT_COUNT);
|
||||
|
@ -777,10 +778,12 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
|
||||
for (int i = 0; i < count; i++) {
|
||||
write(out, SPLIT_DIM);
|
||||
writeInt(out, splitPackedValues[i * (1 + config.bytesPerDim)] & 0xff);
|
||||
writeInt(out, splitPackedValues[i * (1 + config.bytesPerDim())] & 0xff);
|
||||
newline(out);
|
||||
write(out, SPLIT_VALUE);
|
||||
br = new BytesRef(splitPackedValues, 1 + (i * (1 + config.bytesPerDim)), config.bytesPerDim);
|
||||
br =
|
||||
new BytesRef(
|
||||
splitPackedValues, 1 + (i * (1 + config.bytesPerDim())), config.bytesPerDim());
|
||||
write(out, br.toString());
|
||||
newline(out);
|
||||
}
|
||||
|
@ -852,25 +855,25 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
/** Called only in assert */
|
||||
private boolean valueInBounds(
|
||||
BytesRef packedValue, byte[] minPackedValue, byte[] maxPackedValue) {
|
||||
for (int dim = 0; dim < config.numIndexDims; dim++) {
|
||||
int offset = config.bytesPerDim * dim;
|
||||
for (int dim = 0; dim < config.numIndexDims(); dim++) {
|
||||
int offset = config.bytesPerDim() * dim;
|
||||
if (Arrays.compareUnsigned(
|
||||
packedValue.bytes,
|
||||
packedValue.offset + offset,
|
||||
packedValue.offset + offset + config.bytesPerDim,
|
||||
packedValue.offset + offset + config.bytesPerDim(),
|
||||
minPackedValue,
|
||||
offset,
|
||||
offset + config.bytesPerDim)
|
||||
offset + config.bytesPerDim())
|
||||
< 0) {
|
||||
return false;
|
||||
}
|
||||
if (Arrays.compareUnsigned(
|
||||
packedValue.bytes,
|
||||
packedValue.offset + offset,
|
||||
packedValue.offset + offset + config.bytesPerDim,
|
||||
packedValue.offset + offset + config.bytesPerDim(),
|
||||
maxPackedValue,
|
||||
offset,
|
||||
offset + config.bytesPerDim)
|
||||
offset + config.bytesPerDim())
|
||||
> 0) {
|
||||
return false;
|
||||
}
|
||||
|
@ -882,13 +885,13 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
protected int split(byte[] minPackedValue, byte[] maxPackedValue) {
|
||||
// Find which dim has the largest span so we can split on it:
|
||||
int splitDim = -1;
|
||||
for (int dim = 0; dim < config.numIndexDims; dim++) {
|
||||
NumericUtils.subtract(config.bytesPerDim, dim, maxPackedValue, minPackedValue, scratchDiff);
|
||||
for (int dim = 0; dim < config.numIndexDims(); dim++) {
|
||||
NumericUtils.subtract(config.bytesPerDim(), dim, maxPackedValue, minPackedValue, scratchDiff);
|
||||
if (splitDim == -1
|
||||
|| Arrays.compareUnsigned(
|
||||
scratchDiff, 0, config.bytesPerDim, scratch1, 0, config.bytesPerDim)
|
||||
scratchDiff, 0, config.bytesPerDim(), scratch1, 0, config.bytesPerDim())
|
||||
> 0) {
|
||||
System.arraycopy(scratchDiff, 0, scratch1, 0, config.bytesPerDim);
|
||||
System.arraycopy(scratchDiff, 0, scratch1, 0, config.bytesPerDim());
|
||||
splitDim = dim;
|
||||
}
|
||||
}
|
||||
|
@ -931,15 +934,15 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
if (nodeID >= leafNodeOffset) {
|
||||
// leaf node
|
||||
final int count = to - from;
|
||||
assert count <= config.maxPointsInLeafNode;
|
||||
assert count <= config.maxPointsInLeafNode();
|
||||
|
||||
// Compute common prefixes
|
||||
Arrays.fill(commonPrefixLengths, config.bytesPerDim);
|
||||
Arrays.fill(commonPrefixLengths, config.bytesPerDim());
|
||||
reader.getValue(from, scratchBytesRef1);
|
||||
for (int i = from + 1; i < to; ++i) {
|
||||
reader.getValue(i, scratchBytesRef2);
|
||||
for (int dim = 0; dim < config.numDims; dim++) {
|
||||
final int offset = dim * config.bytesPerDim;
|
||||
for (int dim = 0; dim < config.numDims(); dim++) {
|
||||
final int offset = dim * config.bytesPerDim();
|
||||
for (int j = 0; j < commonPrefixLengths[dim]; j++) {
|
||||
if (scratchBytesRef1.bytes[scratchBytesRef1.offset + offset + j]
|
||||
!= scratchBytesRef2.bytes[scratchBytesRef2.offset + offset + j]) {
|
||||
|
@ -951,23 +954,23 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
}
|
||||
|
||||
// Find the dimension that has the least number of unique bytes at commonPrefixLengths[dim]
|
||||
FixedBitSet[] usedBytes = new FixedBitSet[config.numDims];
|
||||
for (int dim = 0; dim < config.numDims; ++dim) {
|
||||
if (commonPrefixLengths[dim] < config.bytesPerDim) {
|
||||
FixedBitSet[] usedBytes = new FixedBitSet[config.numDims()];
|
||||
for (int dim = 0; dim < config.numDims(); ++dim) {
|
||||
if (commonPrefixLengths[dim] < config.bytesPerDim()) {
|
||||
usedBytes[dim] = new FixedBitSet(256);
|
||||
}
|
||||
}
|
||||
for (int i = from + 1; i < to; ++i) {
|
||||
for (int dim = 0; dim < config.numDims; dim++) {
|
||||
for (int dim = 0; dim < config.numDims(); dim++) {
|
||||
if (usedBytes[dim] != null) {
|
||||
byte b = reader.getByteAt(i, dim * config.bytesPerDim + commonPrefixLengths[dim]);
|
||||
byte b = reader.getByteAt(i, dim * config.bytesPerDim() + commonPrefixLengths[dim]);
|
||||
usedBytes[dim].set(Byte.toUnsignedInt(b));
|
||||
}
|
||||
}
|
||||
}
|
||||
int sortedDim = 0;
|
||||
int sortedDimCardinality = Integer.MAX_VALUE;
|
||||
for (int dim = 0; dim < config.numDims; ++dim) {
|
||||
for (int dim = 0; dim < config.numDims(); ++dim) {
|
||||
if (usedBytes[dim] != null) {
|
||||
final int cardinality = usedBytes[dim].cardinality();
|
||||
if (cardinality < sortedDimCardinality) {
|
||||
|
@ -1001,7 +1004,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
// Write the common prefixes:
|
||||
reader.getValue(from, scratchBytesRef1);
|
||||
System.arraycopy(
|
||||
scratchBytesRef1.bytes, scratchBytesRef1.offset, scratch1, 0, config.packedBytesLength);
|
||||
scratchBytesRef1.bytes, scratchBytesRef1.offset, scratch1, 0, config.packedBytesLength());
|
||||
|
||||
// Write the full values:
|
||||
IntFunction<BytesRef> packedValues =
|
||||
|
@ -1023,10 +1026,10 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
final int splitDim = split(minPackedValue, maxPackedValue);
|
||||
final int mid = (from + to + 1) >>> 1;
|
||||
|
||||
int commonPrefixLen = config.bytesPerDim;
|
||||
for (int i = 0; i < config.bytesPerDim; ++i) {
|
||||
if (minPackedValue[splitDim * config.bytesPerDim + i]
|
||||
!= maxPackedValue[splitDim * config.bytesPerDim + i]) {
|
||||
int commonPrefixLen = config.bytesPerDim();
|
||||
for (int i = 0; i < config.bytesPerDim(); ++i) {
|
||||
if (minPackedValue[splitDim * config.bytesPerDim() + i]
|
||||
!= maxPackedValue[splitDim * config.bytesPerDim() + i]) {
|
||||
commonPrefixLen = i;
|
||||
break;
|
||||
}
|
||||
|
@ -1044,32 +1047,32 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
scratchBytesRef2);
|
||||
|
||||
// set the split value
|
||||
final int address = nodeID * (1 + config.bytesPerDim);
|
||||
final int address = nodeID * (1 + config.bytesPerDim());
|
||||
splitPackedValues[address] = (byte) splitDim;
|
||||
reader.getValue(mid, scratchBytesRef1);
|
||||
System.arraycopy(
|
||||
scratchBytesRef1.bytes,
|
||||
scratchBytesRef1.offset + splitDim * config.bytesPerDim,
|
||||
scratchBytesRef1.offset + splitDim * config.bytesPerDim(),
|
||||
splitPackedValues,
|
||||
address + 1,
|
||||
config.bytesPerDim);
|
||||
config.bytesPerDim());
|
||||
|
||||
byte[] minSplitPackedValue =
|
||||
ArrayUtil.copyOfSubArray(minPackedValue, 0, config.packedIndexBytesLength);
|
||||
ArrayUtil.copyOfSubArray(minPackedValue, 0, config.packedIndexBytesLength());
|
||||
byte[] maxSplitPackedValue =
|
||||
ArrayUtil.copyOfSubArray(maxPackedValue, 0, config.packedIndexBytesLength);
|
||||
ArrayUtil.copyOfSubArray(maxPackedValue, 0, config.packedIndexBytesLength());
|
||||
System.arraycopy(
|
||||
scratchBytesRef1.bytes,
|
||||
scratchBytesRef1.offset + splitDim * config.bytesPerDim,
|
||||
scratchBytesRef1.offset + splitDim * config.bytesPerDim(),
|
||||
minSplitPackedValue,
|
||||
splitDim * config.bytesPerDim,
|
||||
config.bytesPerDim);
|
||||
splitDim * config.bytesPerDim(),
|
||||
config.bytesPerDim());
|
||||
System.arraycopy(
|
||||
scratchBytesRef1.bytes,
|
||||
scratchBytesRef1.offset + splitDim * config.bytesPerDim,
|
||||
scratchBytesRef1.offset + splitDim * config.bytesPerDim(),
|
||||
maxSplitPackedValue,
|
||||
splitDim * config.bytesPerDim,
|
||||
config.bytesPerDim);
|
||||
splitDim * config.bytesPerDim(),
|
||||
config.bytesPerDim());
|
||||
|
||||
// recurse
|
||||
build(
|
||||
|
@ -1137,17 +1140,17 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
|
||||
int sortedDim = 0;
|
||||
int sortedDimCardinality = Integer.MAX_VALUE;
|
||||
FixedBitSet[] usedBytes = new FixedBitSet[config.numDims];
|
||||
for (int dim = 0; dim < config.numDims; ++dim) {
|
||||
if (commonPrefixLengths[dim] < config.bytesPerDim) {
|
||||
FixedBitSet[] usedBytes = new FixedBitSet[config.numDims()];
|
||||
for (int dim = 0; dim < config.numDims(); ++dim) {
|
||||
if (commonPrefixLengths[dim] < config.bytesPerDim()) {
|
||||
usedBytes[dim] = new FixedBitSet(256);
|
||||
}
|
||||
}
|
||||
// Find the dimension to compress
|
||||
for (int dim = 0; dim < config.numDims; dim++) {
|
||||
for (int dim = 0; dim < config.numDims(); dim++) {
|
||||
int prefix = commonPrefixLengths[dim];
|
||||
if (prefix < config.bytesPerDim) {
|
||||
int offset = dim * config.bytesPerDim;
|
||||
if (prefix < config.bytesPerDim()) {
|
||||
int offset = dim * config.bytesPerDim();
|
||||
for (int i = 0; i < heapSource.count(); ++i) {
|
||||
PointValue value = heapSource.getPackedValueSlice(i);
|
||||
BytesRef packedValue = value.packedValue();
|
||||
|
@ -1190,7 +1193,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
final BytesRef scratch = new BytesRef();
|
||||
|
||||
{
|
||||
scratch.length = config.packedBytesLength;
|
||||
scratch.length = config.packedBytesLength();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -1207,7 +1210,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
// Inner node: partition/recurse
|
||||
|
||||
int splitDim;
|
||||
if (config.numIndexDims > 1) {
|
||||
if (config.numIndexDims() > 1) {
|
||||
splitDim = split(minPackedValue, maxPackedValue);
|
||||
} else {
|
||||
splitDim = 0;
|
||||
|
@ -1223,13 +1226,13 @@ final class SimpleTextBKDWriter implements Closeable {
int commonPrefixLen =
Arrays.mismatch(
minPackedValue,
splitDim * config.bytesPerDim,
splitDim * config.bytesPerDim + config.bytesPerDim,
splitDim * config.bytesPerDim(),
splitDim * config.bytesPerDim() + config.bytesPerDim(),
maxPackedValue,
splitDim * config.bytesPerDim,
splitDim * config.bytesPerDim + config.bytesPerDim);
splitDim * config.bytesPerDim(),
splitDim * config.bytesPerDim() + config.bytesPerDim());
if (commonPrefixLen == -1) {
commonPrefixLen = config.bytesPerDim;
commonPrefixLen = config.bytesPerDim();
}

BKDRadixSelector.PathSlice[] pathSlices = new BKDRadixSelector.PathSlice[2];
|
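Arrays.mismatch returns the index of the first differing byte in the two ranges, or -1 when the ranges are identical, which is why the code above falls back to bytesPerDim in the -1 case. A worked example with assumed values:

// Worked example (assumed values) of the Arrays.mismatch call above.
byte[] min = {0x01, 0x02, 0x03, 0x04};
byte[] max = {0x01, 0x02, 0x07, 0x08};
int bytesPerDim = 4;
int commonPrefixLen = java.util.Arrays.mismatch(min, 0, bytesPerDim, max, 0, bytesPerDim);
// commonPrefixLen == 2: the first two bytes match, the third differs.
if (commonPrefixLen == -1) {
  commonPrefixLen = bytesPerDim; // min and max are identical, the whole dimension is shared
}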
@ -1244,20 +1247,28 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
splitDim,
|
||||
commonPrefixLen);
|
||||
|
||||
int address = nodeID * (1 + config.bytesPerDim);
|
||||
int address = nodeID * (1 + config.bytesPerDim());
|
||||
splitPackedValues[address] = (byte) splitDim;
|
||||
System.arraycopy(splitValue, 0, splitPackedValues, address + 1, config.bytesPerDim);
|
||||
System.arraycopy(splitValue, 0, splitPackedValues, address + 1, config.bytesPerDim());
|
||||
|
||||
byte[] minSplitPackedValue = new byte[config.packedIndexBytesLength];
|
||||
System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, config.packedIndexBytesLength);
|
||||
byte[] minSplitPackedValue = new byte[config.packedIndexBytesLength()];
|
||||
System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, config.packedIndexBytesLength());
|
||||
|
||||
byte[] maxSplitPackedValue = new byte[config.packedIndexBytesLength];
|
||||
System.arraycopy(maxPackedValue, 0, maxSplitPackedValue, 0, config.packedIndexBytesLength);
|
||||
byte[] maxSplitPackedValue = new byte[config.packedIndexBytesLength()];
|
||||
System.arraycopy(maxPackedValue, 0, maxSplitPackedValue, 0, config.packedIndexBytesLength());
|
||||
|
||||
System.arraycopy(
|
||||
splitValue, 0, minSplitPackedValue, splitDim * config.bytesPerDim, config.bytesPerDim);
|
||||
splitValue,
|
||||
0,
|
||||
minSplitPackedValue,
|
||||
splitDim * config.bytesPerDim(),
|
||||
config.bytesPerDim());
|
||||
System.arraycopy(
|
||||
splitValue, 0, maxSplitPackedValue, splitDim * config.bytesPerDim, config.bytesPerDim);
|
||||
splitValue,
|
||||
0,
|
||||
maxSplitPackedValue,
|
||||
splitDim * config.bytesPerDim(),
|
||||
config.bytesPerDim());
|
||||
|
||||
// Recurse on left tree:
|
||||
build(
|
||||
|
@ -1289,30 +1300,30 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
}
|
||||
|
||||
private void computeCommonPrefixLength(HeapPointWriter heapPointWriter, byte[] commonPrefix) {
|
||||
Arrays.fill(commonPrefixLengths, config.bytesPerDim);
|
||||
Arrays.fill(commonPrefixLengths, config.bytesPerDim());
|
||||
PointValue value = heapPointWriter.getPackedValueSlice(0);
|
||||
BytesRef packedValue = value.packedValue();
|
||||
for (int dim = 0; dim < config.numDims; dim++) {
|
||||
for (int dim = 0; dim < config.numDims(); dim++) {
|
||||
System.arraycopy(
|
||||
packedValue.bytes,
|
||||
packedValue.offset + dim * config.bytesPerDim,
|
||||
packedValue.offset + dim * config.bytesPerDim(),
|
||||
commonPrefix,
|
||||
dim * config.bytesPerDim,
|
||||
config.bytesPerDim);
|
||||
dim * config.bytesPerDim(),
|
||||
config.bytesPerDim());
|
||||
}
|
||||
for (int i = 1; i < heapPointWriter.count(); i++) {
|
||||
value = heapPointWriter.getPackedValueSlice(i);
|
||||
packedValue = value.packedValue();
|
||||
for (int dim = 0; dim < config.numDims; dim++) {
|
||||
for (int dim = 0; dim < config.numDims(); dim++) {
|
||||
if (commonPrefixLengths[dim] != 0) {
|
||||
int j =
|
||||
Arrays.mismatch(
|
||||
commonPrefix,
|
||||
dim * config.bytesPerDim,
|
||||
dim * config.bytesPerDim + commonPrefixLengths[dim],
|
||||
dim * config.bytesPerDim(),
|
||||
dim * config.bytesPerDim() + commonPrefixLengths[dim],
|
||||
packedValue.bytes,
|
||||
packedValue.offset + dim * config.bytesPerDim,
|
||||
packedValue.offset + dim * config.bytesPerDim + commonPrefixLengths[dim]);
|
||||
packedValue.offset + dim * config.bytesPerDim(),
|
||||
packedValue.offset + dim * config.bytesPerDim() + commonPrefixLengths[dim]);
|
||||
if (j != -1) {
|
||||
commonPrefixLengths[dim] = j;
|
||||
}
|
||||
|
@ -1331,11 +1342,11 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
int[] docs,
|
||||
int docsOffset)
|
||||
throws IOException {
|
||||
byte[] lastPackedValue = new byte[config.packedBytesLength];
|
||||
byte[] lastPackedValue = new byte[config.packedBytesLength()];
|
||||
int lastDoc = -1;
|
||||
for (int i = 0; i < count; i++) {
|
||||
BytesRef packedValue = values.apply(i);
|
||||
assert packedValue.length == config.packedBytesLength;
|
||||
assert packedValue.length == config.packedBytesLength();
|
||||
assert valueInOrder(
|
||||
i,
|
||||
sortedDim,
|
||||
|
@ -1361,43 +1372,43 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
int packedValueOffset,
|
||||
int doc,
|
||||
int lastDoc) {
|
||||
int dimOffset = sortedDim * config.bytesPerDim;
|
||||
int dimOffset = sortedDim * config.bytesPerDim();
|
||||
if (ord > 0) {
|
||||
int cmp =
|
||||
Arrays.compareUnsigned(
|
||||
lastPackedValue,
|
||||
dimOffset,
|
||||
dimOffset + config.bytesPerDim,
|
||||
dimOffset + config.bytesPerDim(),
|
||||
packedValue,
|
||||
packedValueOffset + dimOffset,
|
||||
packedValueOffset + dimOffset + config.bytesPerDim);
|
||||
packedValueOffset + dimOffset + config.bytesPerDim());
|
||||
if (cmp > 0) {
|
||||
throw new AssertionError(
|
||||
"values out of order: last value="
|
||||
+ new BytesRef(lastPackedValue)
|
||||
+ " current value="
|
||||
+ new BytesRef(packedValue, packedValueOffset, config.packedBytesLength)
|
||||
+ new BytesRef(packedValue, packedValueOffset, config.packedBytesLength())
|
||||
+ " ord="
|
||||
+ ord
|
||||
+ " sortedDim="
|
||||
+ sortedDim);
|
||||
}
|
||||
if (cmp == 0 && config.numDims > config.numIndexDims) {
|
||||
int dataOffset = config.numIndexDims * config.bytesPerDim;
|
||||
if (cmp == 0 && config.numDims() > config.numIndexDims()) {
|
||||
int dataOffset = config.numIndexDims() * config.bytesPerDim();
|
||||
cmp =
|
||||
Arrays.compareUnsigned(
|
||||
lastPackedValue,
|
||||
dataOffset,
|
||||
config.packedBytesLength,
|
||||
config.packedBytesLength(),
|
||||
packedValue,
|
||||
packedValueOffset + dataOffset,
|
||||
packedValueOffset + config.packedBytesLength);
|
||||
packedValueOffset + config.packedBytesLength());
|
||||
if (cmp > 0) {
|
||||
throw new AssertionError(
|
||||
"data values out of order: last value="
|
||||
+ new BytesRef(lastPackedValue)
|
||||
+ " current value="
|
||||
+ new BytesRef(packedValue, packedValueOffset, config.packedBytesLength)
|
||||
+ new BytesRef(packedValue, packedValueOffset, config.packedBytesLength())
|
||||
+ " ord="
|
||||
+ ord);
|
||||
}
|
||||
|
@ -1414,7 +1425,8 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
+ sortedDim);
|
||||
}
|
||||
}
|
||||
System.arraycopy(packedValue, packedValueOffset, lastPackedValue, 0, config.packedBytesLength);
|
||||
System.arraycopy(
|
||||
packedValue, packedValueOffset, lastPackedValue, 0, config.packedBytesLength());
|
||||
return true;
|
||||
}
|
||||
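The order checks above rely on Arrays.compareUnsigned, so packed bytes are compared as values 0..255 rather than as signed Java bytes. A tiny illustration with assumed values:

// 0x7F is 127; (byte) 0x80 is -128 as a signed byte but 128 unsigned, so it sorts after 0x7F here.
byte[] a = {0x7F};
byte[] b = {(byte) 0x80};
int cmp = java.util.Arrays.compareUnsigned(a, 0, 1, b, 0, 1);
// cmp < 0: a precedes b in the BKD ordering, matching how packed values are meant to sort.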
@ -829,7 +829,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
clone.seek(0);
// checksum is fixed-width encoded with 20 bytes, plus 1 byte for newline (the space is included
// in SimpleTextUtil.CHECKSUM):
long footerStartPos = data.length() - (SimpleTextUtil.CHECKSUM.length + 21);
long footerStartPos = clone.length() - (SimpleTextUtil.CHECKSUM.length + 21);
ChecksumIndexInput input = new BufferedChecksumIndexInput(clone);
while (true) {
SimpleTextUtil.readLine(input, scratch);

@ -227,7 +227,7 @@ class SimpleTextPointsReader extends PointsReader {

// checksum is fixed-width encoded with 20 bytes, plus 1 byte for newline (the space is included
// in SimpleTextUtil.CHECKSUM):
long footerStartPos = dataIn.length() - (SimpleTextUtil.CHECKSUM.length + 21);
long footerStartPos = clone.length() - (SimpleTextUtil.CHECKSUM.length + 21);
ChecksumIndexInput input = new BufferedChecksumIndexInput(clone);
while (true) {
SimpleTextUtil.readLine(input, scratch);
|
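Both hunks now derive the footer position from the same stream that is subsequently read (clone) instead of a separate handle. A hedged sketch of the offset arithmetic, assuming SimpleTextUtil.CHECKSUM is the fixed marker that precedes the value (as the comment above states):

// Footer layout assumed above: <CHECKSUM marker><20-character fixed-width checksum><newline>.
long fileLength = clone.length();                            // length of the stream we will read
long footerLength = SimpleTextUtil.CHECKSUM.length + 20 + 1; // marker + fixed-width value + '\n'
long footerStartPos = fileLength - footerLength;             // same as CHECKSUM.length + 21 above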
@ -17,13 +17,13 @@
|
|||
|
||||
package org.apache.lucene.codecs.uniformsplit;
|
||||
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.BLOCK_SIZE;
|
||||
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.BLOCK_SIZE;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.codecs.BlockTermState;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
import org.apache.lucene.index.TermState;
|
||||
|
@ -34,7 +34,7 @@ import org.apache.lucene.util.RamUsageEstimator;
|
|||
|
||||
/**
* {@link TermState} serializer which encodes each file pointer as a delta relative to a base file
* pointer. It differs from {@link Lucene99PostingsWriter#encodeTerm} which encodes each file
* pointer. It differs from {@link Lucene912PostingsWriter#encodeTerm} which encodes each file
* pointer as a delta relative to the previous file pointer.
*
* <p>It automatically sets the base file pointer to the first valid file pointer for doc start FP,
|
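A small standalone sketch (not the Lucene implementation) of the two delta schemes the javadoc above contrasts, for an increasing sequence of file pointers:

static void deltaSchemes() {
  long[] filePointers = {1000L, 1040L, 1100L};

  // Delta relative to a base file pointer, as this serializer does:
  long base = filePointers[0];
  long[] fromBase = new long[filePointers.length];
  for (int i = 0; i < filePointers.length; i++) {
    fromBase[i] = filePointers[i] - base;          // 0, 40, 100
  }

  // Delta relative to the previous file pointer, as the postings writer does:
  long previous = 0;
  long[] fromPrevious = new long[filePointers.length];
  for (int i = 0; i < filePointers.length; i++) {
    fromPrevious[i] = filePointers[i] - previous;  // 1000, 40, 60
    previous = filePointers[i];
  }
}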
@ -95,7 +95,7 @@ public class DeltaBaseTermStateSerializer implements Accountable {
|
|||
/**
|
||||
* Writes a {@link BlockTermState} to the provided {@link DataOutput}.
|
||||
*
|
||||
* <p>Simpler variant of {@link Lucene99PostingsWriter#encodeTerm(DataOutput, FieldInfo,
|
||||
* <p>Simpler variant of {@link Lucene912PostingsWriter#encodeTerm(DataOutput, FieldInfo,
|
||||
* BlockTermState, boolean)}.
|
||||
*/
|
||||
public void writeTermState(
|
||||
|
@ -140,15 +140,12 @@ public class DeltaBaseTermStateSerializer implements Accountable {
|
|||
termStatesOutput.writeVLong(intTermState.lastPosBlockOffset);
|
||||
}
|
||||
}
|
||||
if (intTermState.skipOffset != -1) {
|
||||
termStatesOutput.writeVLong(intTermState.skipOffset);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a {@link BlockTermState} from the provided {@link DataInput}.
|
||||
*
|
||||
* <p>Simpler variant of {@link Lucene99PostingsReader#decodeTerm(DataInput, FieldInfo,
|
||||
* <p>Simpler variant of {@link Lucene912PostingsReader#decodeTerm(DataInput, FieldInfo,
|
||||
* BlockTermState, boolean)}.
|
||||
*
|
||||
* @param reuse {@link BlockTermState} to reuse; or null to create a new one.
|
||||
|
@ -190,9 +187,6 @@ public class DeltaBaseTermStateSerializer implements Accountable {
|
|||
intTermState.lastPosBlockOffset = termStatesInput.readVLong();
|
||||
}
|
||||
}
|
||||
if (intTermState.docFreq > BLOCK_SIZE) {
|
||||
intTermState.skipOffset = termStatesInput.readVLong();
|
||||
}
|
||||
return intTermState;
|
||||
}
|
||||
|
||||
|
@ -210,7 +204,6 @@ public class DeltaBaseTermStateSerializer implements Accountable {
|
|||
termState.docStartFP = 0;
|
||||
termState.posStartFP = 0;
|
||||
termState.payStartFP = 0;
|
||||
termState.skipOffset = -1;
|
||||
termState.lastPosBlockOffset = -1;
|
||||
termState.singletonDocID = -1;
|
||||
|
||||
|
|
|
@ -90,10 +90,15 @@ public class FSTDictionary implements IndexDictionary {
}
PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton();
FST.FSTMetadata<Long> metadata = FST.readMetadata(fstDataInput, fstOutputs);
FST<Long> fst =
isFSTOnHeap
? new FST<>(metadata, fstDataInput)
: new FST<>(metadata, fstDataInput, new OffHeapFSTStore());
FST<Long> fst;
if (isFSTOnHeap) {
fst = new FST<>(metadata, fstDataInput);
} else {
final IndexInput indexInput = (IndexInput) fstDataInput;
fst =
FST.fromFSTReader(
metadata, new OffHeapFSTStore(indexInput, indexInput.getFilePointer(), metadata));
}
return new FSTDictionary(fst);
}
||||
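The replacement branch above reads the FST metadata first and then either loads the body on heap or leaves it on disk behind an OffHeapFSTStore anchored at the input's current file pointer. A condensed sketch of the same calls, assuming `in` is an IndexInput positioned at a serialized FST and `loadOnHeap` is a caller-supplied flag:

PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
FST.FSTMetadata<Long> metadata = FST.readMetadata(in, outputs);
FST<Long> fst;
if (loadOnHeap) {
  fst = new FST<>(metadata, in);                     // body copied onto the heap
} else {
  fst = FST.fromFSTReader(
      metadata, new OffHeapFSTStore(in, in.getFilePointer(), metadata)); // body read lazily from disk
}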
@ -23,8 +23,8 @@ import org.apache.lucene.codecs.FieldsProducer;
|
|||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.PostingsReaderBase;
|
||||
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
@ -113,7 +113,7 @@ public class UniformSplitPostingsFormat extends PostingsFormat {
|
|||
|
||||
@Override
|
||||
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
|
||||
PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
|
||||
PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
|
||||
boolean success = false;
|
||||
try {
|
||||
FieldsConsumer termsWriter =
|
||||
|
@ -130,7 +130,7 @@ public class UniformSplitPostingsFormat extends PostingsFormat {
|
|||
|
||||
@Override
|
||||
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
|
||||
PostingsReaderBase postingsReader = new Lucene99PostingsReader(state);
|
||||
PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
|
||||
boolean success = false;
|
||||
try {
|
||||
FieldsProducer termsReader =
|
||||
|
|
|
@ -28,7 +28,7 @@
|
|||
* org.apache.lucene.search.PhraseQuery})
|
||||
* <li>Quite efficient for {@link org.apache.lucene.search.PrefixQuery}
|
||||
* <li>Not efficient for spell-check and {@link org.apache.lucene.search.FuzzyQuery}, in this case
|
||||
* prefer {@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat}
|
||||
* prefer {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat}
|
||||
* </ul>
|
||||
*/
|
||||
package org.apache.lucene.codecs.uniformsplit;
|
||||
|
|
|
@ -20,11 +20,11 @@ package org.apache.lucene.codecs.uniformsplit.sharedterms;
|
|||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.RandomAccess;
|
||||
import org.apache.lucene.index.BaseTermsEnum;
|
||||
import org.apache.lucene.index.ImpactsEnum;
|
||||
import org.apache.lucene.index.MergeState;
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
|
@ -34,7 +34,7 @@ import org.apache.lucene.util.BytesRef;
|
|||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
class STMergingTermsEnum extends TermsEnum {
|
||||
class STMergingTermsEnum extends BaseTermsEnum {
|
||||
|
||||
protected final String fieldName;
|
||||
protected final MultiSegmentsPostingsEnum multiPostingsEnum;
|
||||
|
@ -63,11 +63,6 @@ class STMergingTermsEnum extends TermsEnum {
|
|||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean seekExact(BytesRef text) throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public SeekStatus seekCeil(BytesRef text) {
|
||||
throw new UnsupportedOperationException();
|
||||
|
|
|
@ -22,7 +22,7 @@ import java.io.IOException;
|
|||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.FilterCodec;
|
||||
import org.apache.lucene.codecs.KnnVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912Codec;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.KnnByteVectorField;
|
||||
|
@ -42,7 +42,7 @@ import org.apache.lucene.tests.index.BaseIndexFileFormatTestCase;
|
|||
public class TestHnswBitVectorsFormat extends BaseIndexFileFormatTestCase {
|
||||
@Override
|
||||
protected Codec getCodec() {
|
||||
return new Lucene99Codec() {
|
||||
return new Lucene912Codec() {
|
||||
@Override
|
||||
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
|
||||
return new HnswBitVectorsFormat();
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
|
||||
package org.apache.lucene.codecs.lucene90.tests;
|
||||
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
|
||||
|
||||
/** Test utility class to create mock {@link IntBlockTermState}. */
|
||||
public class MockTermStateFactory {
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
{
|
||||
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForDeltaUtil.java": "5115b12ac31537ce31d73c0a279df92060749a3a",
|
||||
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForDeltaUtil.py": "db6154406e68b80d2c90116b5d0bfa9ba220762a"
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
{
|
||||
"lucene/core/src/java/org/apache/lucene/codecs/lucene99/ForUtil.java": "1292ad354d255b1272ffd3db684aa2ddb2bc49ec",
|
||||
"lucene/core/src/java/org/apache/lucene/codecs/lucene99/gen_ForUtil.py": "ab7b63a1b73986cc04e43de1c8f474b97aef5116"
|
||||
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForUtil.java": "159e82388346fde147924d5e15ca65df4dd63b9a",
|
||||
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForUtil.py": "66dc8813160feae2a37d8b50474f5f9830b6cb22"
|
||||
}
|
|
@ -15,7 +15,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912Codec;
|
||||
|
||||
/** Lucene Core. */
|
||||
@SuppressWarnings("module") // the test framework is compiled after the core...
|
||||
|
@ -33,6 +33,7 @@ module org.apache.lucene.core {
|
|||
exports org.apache.lucene.codecs.lucene94;
|
||||
exports org.apache.lucene.codecs.lucene95;
|
||||
exports org.apache.lucene.codecs.lucene99;
|
||||
exports org.apache.lucene.codecs.lucene912;
|
||||
exports org.apache.lucene.codecs.perfield;
|
||||
exports org.apache.lucene.codecs;
|
||||
exports org.apache.lucene.document;
|
||||
|
@ -71,7 +72,7 @@ module org.apache.lucene.core {
|
|||
provides org.apache.lucene.analysis.TokenizerFactory with
|
||||
org.apache.lucene.analysis.standard.StandardTokenizerFactory;
|
||||
provides org.apache.lucene.codecs.Codec with
|
||||
Lucene99Codec;
|
||||
Lucene912Codec;
|
||||
provides org.apache.lucene.codecs.DocValuesFormat with
|
||||
org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
|
||||
provides org.apache.lucene.codecs.KnnVectorsFormat with
|
||||
|
@ -79,7 +80,7 @@ module org.apache.lucene.core {
|
|||
org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat,
|
||||
org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat;
|
||||
provides org.apache.lucene.codecs.PostingsFormat with
|
||||
org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat;
|
||||
org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
|
||||
provides org.apache.lucene.index.SortFieldProvider with
|
||||
org.apache.lucene.search.SortField.Provider,
|
||||
org.apache.lucene.search.SortedNumericSortField.Provider,
|
||||
|
|
|
@ -55,7 +55,7 @@ public abstract class Codec implements NamedSPILoader.NamedSPI {
|
|||
return LOADER;
|
||||
}
|
||||
|
||||
static Codec defaultCodec = LOADER.lookup("Lucene99");
|
||||
static Codec defaultCodec = LOADER.lookup("Lucene912");
|
||||
}
|
||||
|
||||
private final String name;
|
||||
|
|
|
@ -18,8 +18,6 @@ package org.apache.lucene.codecs;
|
|||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
@ -106,7 +104,7 @@ public final class CompetitiveImpactAccumulator {
|
|||
}
|
||||
|
||||
/** Get the set of competitive freq and norm pairs, ordered by increasing freq and norm. */
|
||||
public Collection<Impact> getCompetitiveFreqNormPairs() {
|
||||
public List<Impact> getCompetitiveFreqNormPairs() {
|
||||
List<Impact> impacts = new ArrayList<>();
|
||||
int maxFreqForLowerNorms = 0;
|
||||
for (int i = 0; i < maxFreqs.length; ++i) {
|
||||
|
@ -126,7 +124,7 @@ public final class CompetitiveImpactAccumulator {
|
|||
for (Impact impact : impacts) {
|
||||
add(impact, freqNormPairs);
|
||||
}
|
||||
return Collections.unmodifiableSet(freqNormPairs);
|
||||
return List.copyOf(freqNormPairs);
|
||||
}
|
||||
|
||||
private void add(Impact newEntry, TreeSet<Impact> freqNormPairs) {
|
||||
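Returning List.copyOf instead of Collections.unmodifiableSet changes the contract from an unmodifiable view to an immutable snapshot. A JDK-level illustration (not Lucene-specific):

java.util.TreeSet<Integer> source = new java.util.TreeSet<>(java.util.List.of(1, 2));
java.util.List<Integer> snapshot = java.util.List.copyOf(source);                          // independent copy
java.util.Collection<Integer> view = java.util.Collections.unmodifiableCollection(source); // live view
source.add(3);
// snapshot.size() == 2, but view.size() == 3: the view tracks later mutations, the copy does not.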
|
|
|
@ -23,6 +23,7 @@ import java.io.IOException;
|
|||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.index.BaseTermsEnum;
|
||||
import org.apache.lucene.index.BinaryDocValues;
|
||||
import org.apache.lucene.index.DocIDMerger;
|
||||
import org.apache.lucene.index.DocValues;
|
||||
|
@ -498,7 +499,7 @@ public abstract class DocValuesConsumer implements Closeable {
|
|||
* {@link SortedDocValues#lookupOrd(int)} or {@link SortedSetDocValues#lookupOrd(long)} on every
|
||||
* call to {@link TermsEnum#next()}.
|
||||
*/
|
||||
private static class MergedTermsEnum extends TermsEnum {
|
||||
private static class MergedTermsEnum extends BaseTermsEnum {
|
||||
|
||||
private final TermsEnum[] subs;
|
||||
private final OrdinalMap ordinalMap;
|
||||
|
@ -542,11 +543,6 @@ public abstract class DocValuesConsumer implements Closeable {
|
|||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean seekExact(BytesRef text) throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public SeekStatus seekCeil(BytesRef text) throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
|
@ -557,11 +553,6 @@ public abstract class DocValuesConsumer implements Closeable {
|
|||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void seekExact(BytesRef term, TermState state) throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int docFreq() throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
|
|
|
@ -20,17 +20,23 @@ package org.apache.lucene.codecs;
|
|||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.function.BiFunction;
|
||||
import org.apache.lucene.index.ByteVectorValues;
|
||||
import org.apache.lucene.index.DocIDMerger;
|
||||
import org.apache.lucene.index.DocsWithFieldSet;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FloatVectorValues;
|
||||
import org.apache.lucene.index.MergeState;
|
||||
import org.apache.lucene.index.Sorter;
|
||||
import org.apache.lucene.index.VectorEncoding;
|
||||
import org.apache.lucene.internal.hppc.IntIntHashMap;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.VectorScorer;
|
||||
import org.apache.lucene.util.Accountable;
|
||||
import org.apache.lucene.util.IOFunction;
|
||||
|
||||
/** Writes vectors to an index. */
|
||||
public abstract class KnnVectorsWriter implements Accountable, Closeable {
|
||||
|
@ -107,11 +113,11 @@ public abstract class KnnVectorsWriter implements Accountable, Closeable {
|
|||
}
|
||||
|
||||
/** Tracks state of one sub-reader that we are merging */
|
||||
private static class VectorValuesSub extends DocIDMerger.Sub {
|
||||
private static class FloatVectorValuesSub extends DocIDMerger.Sub {
|
||||
|
||||
final FloatVectorValues values;
|
||||
|
||||
VectorValuesSub(MergeState.DocMap docMap, FloatVectorValues values) {
|
||||
FloatVectorValuesSub(MergeState.DocMap docMap, FloatVectorValues values) {
|
||||
super(docMap);
|
||||
this.values = values;
|
||||
assert values.docID() == -1;
|
||||
|
@ -139,65 +145,139 @@ public abstract class KnnVectorsWriter implements Accountable, Closeable {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Given old doc ids and an id mapping, maps old ordinal to new ordinal. Note: this method return
|
||||
* nothing and output are written to parameters
|
||||
*
|
||||
* @param oldDocIds the old or current document ordinals. Must not be null.
|
||||
* @param sortMap the document sorting map for how to make the new ordinals. Must not be null.
|
||||
* @param old2NewOrd int[] maps from old ord to new ord
|
||||
* @param new2OldOrd int[] maps from new ord to old ord
|
||||
* @param newDocsWithField set of new doc ids which has the value
|
||||
*/
|
||||
public static void mapOldOrdToNewOrd(
|
||||
DocsWithFieldSet oldDocIds,
|
||||
Sorter.DocMap sortMap,
|
||||
int[] old2NewOrd,
|
||||
int[] new2OldOrd,
|
||||
DocsWithFieldSet newDocsWithField)
|
||||
throws IOException {
|
||||
// TODO: a similar function exists in IncrementalHnswGraphMerger#getNewOrdMapping
|
||||
// maybe we can do a further refactoring
|
||||
Objects.requireNonNull(oldDocIds);
|
||||
Objects.requireNonNull(sortMap);
|
||||
assert (old2NewOrd != null || new2OldOrd != null || newDocsWithField != null);
|
||||
assert (old2NewOrd == null || old2NewOrd.length == oldDocIds.cardinality());
|
||||
assert (new2OldOrd == null || new2OldOrd.length == oldDocIds.cardinality());
|
||||
IntIntHashMap newIdToOldOrd = new IntIntHashMap();
|
||||
DocIdSetIterator iterator = oldDocIds.iterator();
|
||||
int[] newDocIds = new int[oldDocIds.cardinality()];
|
||||
int oldOrd = 0;
|
||||
for (int oldDocId = iterator.nextDoc();
|
||||
oldDocId != DocIdSetIterator.NO_MORE_DOCS;
|
||||
oldDocId = iterator.nextDoc()) {
|
||||
int newId = sortMap.oldToNew(oldDocId);
|
||||
newIdToOldOrd.put(newId, oldOrd);
|
||||
newDocIds[oldOrd] = newId;
|
||||
oldOrd++;
|
||||
}
|
||||
|
||||
Arrays.sort(newDocIds);
|
||||
int newOrd = 0;
|
||||
for (int newDocId : newDocIds) {
|
||||
int currOldOrd = newIdToOldOrd.get(newDocId);
|
||||
if (old2NewOrd != null) {
|
||||
old2NewOrd[currOldOrd] = newOrd;
|
||||
}
|
||||
if (new2OldOrd != null) {
|
||||
new2OldOrd[newOrd] = currOldOrd;
|
||||
}
|
||||
if (newDocsWithField != null) {
|
||||
newDocsWithField.add(newDocId);
|
||||
}
|
||||
newOrd++;
|
||||
}
|
||||
}
|
||||
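A hypothetical call to the new helper above, assuming `oldDocsWithField` and `sortMap` come from sorting a segment whose field has some number of vectors:

int n = oldDocsWithField.cardinality();
int[] old2New = new int[n];
int[] new2Old = new int[n];
DocsWithFieldSet newDocsWithField = new DocsWithFieldSet();
KnnVectorsWriter.mapOldOrdToNewOrd(oldDocsWithField, sortMap, old2New, new2Old, newDocsWithField);
// old2New[oldOrd] is the ordinal of the same vector after sorting; new2Old is the inverse mapping.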
|
||||
/** View over multiple vector values supporting iterator-style access via DocIdMerger. */
|
||||
public static final class MergedVectorValues {
|
||||
private MergedVectorValues() {}
|
||||
|
||||
private static void validateFieldEncoding(FieldInfo fieldInfo, VectorEncoding expected) {
|
||||
assert fieldInfo != null && fieldInfo.hasVectorValues();
|
||||
VectorEncoding fieldEncoding = fieldInfo.getVectorEncoding();
|
||||
if (fieldEncoding != expected) {
|
||||
throw new UnsupportedOperationException(
|
||||
"Cannot merge vectors encoded as [" + fieldEncoding + "] as " + expected);
|
||||
}
|
||||
}
|
||||
|
||||
private static <V, S> List<S> mergeVectorValues(
|
||||
KnnVectorsReader[] knnVectorsReaders,
|
||||
MergeState.DocMap[] docMaps,
|
||||
IOFunction<KnnVectorsReader, V> valuesSupplier,
|
||||
BiFunction<MergeState.DocMap, V, S> newSub)
|
||||
throws IOException {
|
||||
List<S> subs = new ArrayList<>();
|
||||
for (int i = 0; i < knnVectorsReaders.length; i++) {
|
||||
KnnVectorsReader knnVectorsReader = knnVectorsReaders[i];
|
||||
if (knnVectorsReader != null) {
|
||||
V values = valuesSupplier.apply(knnVectorsReader);
|
||||
if (values != null) {
|
||||
subs.add(newSub.apply(docMaps[i], values));
|
||||
}
|
||||
}
|
||||
}
|
||||
return subs;
|
||||
}
|
||||
|
||||
/** Returns a merged view over all the segment's {@link FloatVectorValues}. */
|
||||
public static FloatVectorValues mergeFloatVectorValues(
|
||||
FieldInfo fieldInfo, MergeState mergeState) throws IOException {
|
||||
assert fieldInfo != null && fieldInfo.hasVectorValues();
|
||||
if (fieldInfo.getVectorEncoding() != VectorEncoding.FLOAT32) {
|
||||
throw new UnsupportedOperationException(
|
||||
"Cannot merge vectors encoded as [" + fieldInfo.getVectorEncoding() + "] as FLOAT32");
|
||||
}
|
||||
List<VectorValuesSub> subs = new ArrayList<>();
|
||||
for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) {
|
||||
KnnVectorsReader knnVectorsReader = mergeState.knnVectorsReaders[i];
|
||||
if (knnVectorsReader != null) {
|
||||
FloatVectorValues values = knnVectorsReader.getFloatVectorValues(fieldInfo.name);
|
||||
if (values != null) {
|
||||
subs.add(new VectorValuesSub(mergeState.docMaps[i], values));
|
||||
}
|
||||
}
|
||||
}
|
||||
return new MergedFloat32VectorValues(subs, mergeState);
|
||||
validateFieldEncoding(fieldInfo, VectorEncoding.FLOAT32);
|
||||
return new MergedFloat32VectorValues(
|
||||
mergeVectorValues(
|
||||
mergeState.knnVectorsReaders,
|
||||
mergeState.docMaps,
|
||||
knnVectorsReader -> {
|
||||
return knnVectorsReader.getFloatVectorValues(fieldInfo.name);
|
||||
},
|
||||
(docMap, values) -> {
|
||||
return new FloatVectorValuesSub(docMap, values);
|
||||
}),
|
||||
mergeState);
|
||||
}
|
||||
|
||||
/** Returns a merged view over all the segment's {@link ByteVectorValues}. */
|
||||
public static ByteVectorValues mergeByteVectorValues(FieldInfo fieldInfo, MergeState mergeState)
|
||||
throws IOException {
|
||||
assert fieldInfo != null && fieldInfo.hasVectorValues();
|
||||
if (fieldInfo.getVectorEncoding() != VectorEncoding.BYTE) {
|
||||
throw new UnsupportedOperationException(
|
||||
"Cannot merge vectors encoded as [" + fieldInfo.getVectorEncoding() + "] as BYTE");
|
||||
}
|
||||
List<ByteVectorValuesSub> subs = new ArrayList<>();
|
||||
for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) {
|
||||
KnnVectorsReader knnVectorsReader = mergeState.knnVectorsReaders[i];
|
||||
if (knnVectorsReader != null) {
|
||||
ByteVectorValues values = knnVectorsReader.getByteVectorValues(fieldInfo.name);
|
||||
if (values != null) {
|
||||
subs.add(new ByteVectorValuesSub(mergeState.docMaps[i], values));
|
||||
}
|
||||
}
|
||||
}
|
||||
return new MergedByteVectorValues(subs, mergeState);
|
||||
validateFieldEncoding(fieldInfo, VectorEncoding.BYTE);
|
||||
return new MergedByteVectorValues(
|
||||
mergeVectorValues(
|
||||
mergeState.knnVectorsReaders,
|
||||
mergeState.docMaps,
|
||||
knnVectorsReader -> {
|
||||
return knnVectorsReader.getByteVectorValues(fieldInfo.name);
|
||||
},
|
||||
(docMap, values) -> {
|
||||
return new ByteVectorValuesSub(docMap, values);
|
||||
}),
|
||||
mergeState);
|
||||
}
|
||||
|
||||
static class MergedFloat32VectorValues extends FloatVectorValues {
|
||||
private final List<VectorValuesSub> subs;
|
||||
private final DocIDMerger<VectorValuesSub> docIdMerger;
|
||||
private final List<FloatVectorValuesSub> subs;
|
||||
private final DocIDMerger<FloatVectorValuesSub> docIdMerger;
|
||||
private final int size;
|
||||
private int docId;
|
||||
VectorValuesSub current;
|
||||
FloatVectorValuesSub current;
|
||||
|
||||
private MergedFloat32VectorValues(List<VectorValuesSub> subs, MergeState mergeState)
|
||||
private MergedFloat32VectorValues(List<FloatVectorValuesSub> subs, MergeState mergeState)
|
||||
throws IOException {
|
||||
this.subs = subs;
|
||||
docIdMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);
|
||||
int totalSize = 0;
|
||||
for (VectorValuesSub sub : subs) {
|
||||
for (FloatVectorValuesSub sub : subs) {
|
||||
totalSize += sub.values.size();
|
||||
}
|
||||
size = totalSize;
|
||||
|
|
|
@ -116,6 +116,11 @@ public class DefaultFlatVectorScorer implements FlatVectorsScorer {
|
|||
public RandomVectorScorerSupplier copy() throws IOException {
|
||||
return new ByteScoringSupplier(vectors, similarityFunction);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "ByteScoringSupplier(similarityFunction=" + similarityFunction + ")";
|
||||
}
|
||||
}
|
||||
|
||||
/** RandomVectorScorerSupplier for Float vector */
|
||||
|
@ -148,6 +153,11 @@ public class DefaultFlatVectorScorer implements FlatVectorsScorer {
|
|||
public RandomVectorScorerSupplier copy() throws IOException {
|
||||
return new FloatScoringSupplier(vectors, similarityFunction);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "FloatScoringSupplier(similarityFunction=" + similarityFunction + ")";
|
||||
}
|
||||
}
|
||||
|
||||
/** A {@link RandomVectorScorer} for float vectors. */
|
||||
|
|
|
@ -17,7 +17,10 @@
|
|||
|
||||
package org.apache.lucene.codecs.hnsw;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.codecs.KnnFieldVectorsWriter;
|
||||
import org.apache.lucene.index.DocsWithFieldSet;
|
||||
|
||||
/**
|
||||
* Vectors' writer for a field
|
||||
|
@ -26,20 +29,25 @@ import org.apache.lucene.codecs.KnnFieldVectorsWriter;
|
|||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class FlatFieldVectorsWriter<T> extends KnnFieldVectorsWriter<T> {
|
||||
|
||||
/**
|
||||
* The delegate to write to, can be null When non-null, all vectors seen should be written to the
|
||||
* delegate along with being written to the flat vectors.
|
||||
* @return a list of vectors to be written
|
||||
*/
|
||||
protected final KnnFieldVectorsWriter<T> indexingDelegate;
|
||||
public abstract List<T> getVectors();
|
||||
|
||||
/**
|
||||
* Sole constructor that expects some indexingDelegate. All vectors seen should be written to the
|
||||
* delegate along with being written to the flat vectors.
|
||||
* @return the docsWithFieldSet for the field writer
|
||||
*/
|
||||
public abstract DocsWithFieldSet getDocsWithFieldSet();
|
||||
|
||||
/**
|
||||
* indicates that this writer is done and no new vectors are allowed to be added
|
||||
*
|
||||
* @param indexingDelegate the delegate to write to, can be null
|
||||
* @throws IOException if an I/O error occurs
|
||||
*/
|
||||
protected FlatFieldVectorsWriter(KnnFieldVectorsWriter<T> indexingDelegate) {
|
||||
this.indexingDelegate = indexingDelegate;
|
||||
}
|
||||
public abstract void finish() throws IOException;
|
||||
|
||||
/**
|
||||
* @return true if the writer is done and no new vectors are allowed to be added
|
||||
*/
|
||||
public abstract boolean isFinished();
|
||||
}
|
||||
|
|
|
@ -18,7 +18,6 @@
|
|||
package org.apache.lucene.codecs.hnsw;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.codecs.KnnFieldVectorsWriter;
|
||||
import org.apache.lucene.codecs.KnnVectorsWriter;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.MergeState;
|
||||
|
@ -46,21 +45,14 @@ public abstract class FlatVectorsWriter extends KnnVectorsWriter {
|
|||
}
|
||||
|
||||
/**
|
||||
* Add a new field for indexing, allowing the user to provide a writer that the flat vectors
|
||||
* writer can delegate to if additional indexing logic is required.
|
||||
* Add a new field for indexing
|
||||
*
|
||||
* @param fieldInfo fieldInfo of the field to add
|
||||
* @param indexWriter the writer to delegate to, can be null
|
||||
* @return a writer for the field
|
||||
* @throws IOException if an I/O error occurs when adding the field
|
||||
*/
|
||||
public abstract FlatFieldVectorsWriter<?> addField(
|
||||
FieldInfo fieldInfo, KnnFieldVectorsWriter<?> indexWriter) throws IOException;
|
||||
|
||||
@Override
|
||||
public FlatFieldVectorsWriter<?> addField(FieldInfo fieldInfo) throws IOException {
|
||||
return addField(fieldInfo, null);
|
||||
}
|
||||
public abstract FlatFieldVectorsWriter<?> addField(FieldInfo fieldInfo) throws IOException;
|
||||
|
||||
/**
|
||||
* Write the field for merging, providing a scorer over the newly merged flat vectors. This way
|
||||
|
|
|
@ -170,5 +170,12 @@ public class ScalarQuantizedVectorScorer implements FlatVectorsScorer {
|
|||
return new ScalarQuantizedRandomVectorScorerSupplier(
|
||||
similarity, vectorSimilarityFunction, values.copy());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "ScalarQuantizedRandomVectorScorerSupplier(vectorSimilarityFunction="
|
||||
+ vectorSimilarityFunction
|
||||
+ ")";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,10 +19,13 @@ package org.apache.lucene.codecs.lucene90;
|
|||
import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
|
||||
import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.NUMERIC_BLOCK_SHIFT;
|
||||
import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.NUMERIC_BLOCK_SIZE;
|
||||
import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.SKIP_INDEX_INTERVAL_SIZE;
|
||||
import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.SKIP_INDEX_LEVEL_SHIFT;
|
||||
import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.SKIP_INDEX_MAX_LEVEL;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.DocValuesConsumer;
|
||||
import org.apache.lucene.codecs.DocValuesProducer;
|
||||
|
@ -44,7 +47,6 @@ import org.apache.lucene.search.SortedSetSelector;
|
|||
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||
import org.apache.lucene.store.ByteBuffersDataOutput;
|
||||
import org.apache.lucene.store.ByteBuffersIndexOutput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
@ -63,10 +65,12 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
|
|||
IndexOutput data, meta;
|
||||
final int maxDoc;
|
||||
private byte[] termsDictBuffer;
|
||||
private final int skipIndexIntervalSize;
|
||||
|
||||
/** expert: Creates a new writer */
|
||||
public Lucene90DocValuesConsumer(
|
||||
SegmentWriteState state,
|
||||
int skipIndexIntervalSize,
|
||||
String dataCodec,
|
||||
String dataExtension,
|
||||
String metaCodec,
|
||||
|
@ -96,6 +100,7 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
|
|||
state.segmentInfo.getId(),
|
||||
state.segmentSuffix);
|
||||
maxDoc = state.segmentInfo.maxDoc();
|
||||
this.skipIndexIntervalSize = skipIndexIntervalSize;
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
|
@ -200,70 +205,150 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
|
|||
docCount = 0;
|
||||
}
|
||||
|
||||
boolean isDone(int skipIndexIntervalSize, int valueCount, long nextValue, int nextDoc) {
|
||||
if (docCount < skipIndexIntervalSize) {
|
||||
return false;
|
||||
}
|
||||
// Once we reach the interval size, we will keep accepting documents if
|
||||
// - next doc value is not a multi-value
|
||||
// - current accumulator only contains a single value and next value is the same value
|
||||
// - the accumulator is dense and the next doc keeps the density (no gaps)
|
||||
return valueCount > 1
|
||||
|| minValue != maxValue
|
||||
|| minValue != nextValue
|
||||
|| docCount != nextDoc - minDocID;
|
||||
}
|
||||
|
||||
void accumulate(long value) {
|
||||
minValue = Math.min(minValue, value);
|
||||
maxValue = Math.max(maxValue, value);
|
||||
}
|
||||
|
||||
void accumulate(SkipAccumulator other) {
|
||||
assert minDocID <= other.minDocID && maxDocID < other.maxDocID;
|
||||
maxDocID = other.maxDocID;
|
||||
minValue = Math.min(minValue, other.minValue);
|
||||
maxValue = Math.max(maxValue, other.maxValue);
|
||||
docCount += other.docCount;
|
||||
}
|
||||
|
||||
void nextDoc(int docID) {
|
||||
maxDocID = docID;
|
||||
++docCount;
|
||||
}
|
||||
|
||||
void writeTo(DataOutput output) throws IOException {
|
||||
output.writeInt(maxDocID);
|
||||
output.writeInt(minDocID);
|
||||
output.writeLong(maxValue);
|
||||
output.writeLong(minValue);
|
||||
output.writeInt(docCount);
|
||||
public static SkipAccumulator merge(List<SkipAccumulator> list, int index, int length) {
|
||||
SkipAccumulator acc = new SkipAccumulator(list.get(index).minDocID);
|
||||
for (int i = 0; i < length; i++) {
|
||||
acc.accumulate(list.get(index + i));
|
||||
}
|
||||
return acc;
|
||||
}
|
||||
}
|
||||
|
||||
private void writeSkipIndex(FieldInfo field, DocValuesProducer valuesProducer)
|
||||
throws IOException {
|
||||
assert field.hasDocValuesSkipIndex();
|
||||
// TODO: This disk compression once we introduce levels
|
||||
long start = data.getFilePointer();
|
||||
SortedNumericDocValues values = valuesProducer.getSortedNumeric(field);
|
||||
final long start = data.getFilePointer();
|
||||
final SortedNumericDocValues values = valuesProducer.getSortedNumeric(field);
|
||||
long globalMaxValue = Long.MIN_VALUE;
|
||||
long globalMinValue = Long.MAX_VALUE;
|
||||
int globalDocCount = 0;
|
||||
int maxDocId = -1;
|
||||
final List<SkipAccumulator> accumulators = new ArrayList<>();
|
||||
SkipAccumulator accumulator = null;
|
||||
int counter = 0;
|
||||
final int maxAccumulators = 1 << (SKIP_INDEX_LEVEL_SHIFT * (SKIP_INDEX_MAX_LEVEL - 1));
|
||||
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
|
||||
if (counter == 0) {
|
||||
accumulator = new SkipAccumulator(doc);
|
||||
}
|
||||
accumulator.nextDoc(doc);
|
||||
for (int i = 0, end = values.docValueCount(); i < end; ++i) {
|
||||
accumulator.accumulate(values.nextValue());
|
||||
}
|
||||
if (++counter == SKIP_INDEX_INTERVAL_SIZE) {
|
||||
final long firstValue = values.nextValue();
|
||||
if (accumulator != null
|
||||
&& accumulator.isDone(skipIndexIntervalSize, values.docValueCount(), firstValue, doc)) {
|
||||
globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue);
|
||||
globalMinValue = Math.min(globalMinValue, accumulator.minValue);
|
||||
globalDocCount += accumulator.docCount;
|
||||
maxDocId = accumulator.maxDocID;
|
||||
accumulator.writeTo(data);
|
||||
counter = 0;
|
||||
accumulator = null;
|
||||
if (accumulators.size() == maxAccumulators) {
|
||||
writeLevels(accumulators);
|
||||
accumulators.clear();
|
||||
}
|
||||
}
|
||||
if (accumulator == null) {
|
||||
accumulator = new SkipAccumulator(doc);
|
||||
accumulators.add(accumulator);
|
||||
}
|
||||
accumulator.nextDoc(doc);
|
||||
accumulator.accumulate(firstValue);
|
||||
for (int i = 1, end = values.docValueCount(); i < end; ++i) {
|
||||
accumulator.accumulate(values.nextValue());
|
||||
}
|
||||
}
|
||||
|
||||
if (counter > 0) {
|
||||
if (accumulators.isEmpty() == false) {
|
||||
globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue);
|
||||
globalMinValue = Math.min(globalMinValue, accumulator.minValue);
|
||||
globalDocCount += accumulator.docCount;
|
||||
maxDocId = accumulator.maxDocID;
|
||||
accumulator.writeTo(data);
|
||||
writeLevels(accumulators);
|
||||
}
|
||||
meta.writeLong(start); // record the start in meta
|
||||
meta.writeLong(data.getFilePointer() - start); // record the length
|
||||
assert globalDocCount == 0 || globalMaxValue >= globalMinValue;
|
||||
meta.writeLong(globalMaxValue);
|
||||
meta.writeLong(globalMinValue);
|
||||
assert globalDocCount <= maxDocId + 1;
|
||||
meta.writeInt(globalDocCount);
|
||||
meta.writeInt(maxDocId);
|
||||
}
|
||||
|
||||
private void writeLevels(List<SkipAccumulator> accumulators) throws IOException {
|
||||
final List<List<SkipAccumulator>> accumulatorsLevels = new ArrayList<>(SKIP_INDEX_MAX_LEVEL);
|
||||
accumulatorsLevels.add(accumulators);
|
||||
for (int i = 0; i < SKIP_INDEX_MAX_LEVEL - 1; i++) {
|
||||
accumulatorsLevels.add(buildLevel(accumulatorsLevels.get(i)));
|
||||
}
|
||||
int totalAccumulators = accumulators.size();
|
||||
for (int index = 0; index < totalAccumulators; index++) {
|
||||
// compute how many levels we need to write for the current accumulator
|
||||
final int levels = getLevels(index, totalAccumulators);
|
||||
// write the number of levels
|
||||
data.writeByte((byte) levels);
|
||||
// write intervals in reverse order. This is done so we don't
|
||||
// need to read all of them in case of slipping
|
||||
for (int level = levels - 1; level >= 0; level--) {
|
||||
final SkipAccumulator accumulator =
|
||||
accumulatorsLevels.get(level).get(index >> (SKIP_INDEX_LEVEL_SHIFT * level));
|
||||
data.writeInt(accumulator.maxDocID);
|
||||
data.writeInt(accumulator.minDocID);
|
||||
data.writeLong(accumulator.maxValue);
|
||||
data.writeLong(accumulator.minValue);
|
||||
data.writeInt(accumulator.docCount);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static List<SkipAccumulator> buildLevel(List<SkipAccumulator> accumulators) {
final int levelSize = 1 << SKIP_INDEX_LEVEL_SHIFT;
final List<SkipAccumulator> collector = new ArrayList<>();
for (int i = 0; i < accumulators.size() - levelSize + 1; i += levelSize) {
collector.add(SkipAccumulator.merge(accumulators, i, levelSize));
}
return collector;
}

private static int getLevels(int index, int size) {
if (Integer.numberOfTrailingZeros(index) >= SKIP_INDEX_LEVEL_SHIFT) {
// TODO: can we do it in constant time rather than linearly with SKIP_INDEX_MAX_LEVEL?
final int left = size - index;
for (int level = SKIP_INDEX_MAX_LEVEL - 1; level > 0; level--) {
final int numberIntervals = 1 << (SKIP_INDEX_LEVEL_SHIFT * level);
if (left >= numberIntervals && index % numberIntervals == 0) {
return level + 1;
}
}
}
return 1;
}
||||
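Working getLevels through by hand with SKIP_INDEX_LEVEL_SHIFT = 3 and SKIP_INDEX_MAX_LEVEL = 4 (the values defined in the format class), for a block of 512 accumulators:

// getLevels(0, 512)  == 4   index 0 heads a full 8*8*8 group, so all four levels are written
// getLevels(8, 512)  == 2   a multiple of 8 but not of 64: one extra level above the base interval
// getLevels(64, 512) == 3   a multiple of 64 with at least 64 intervals remaining
// getLevels(3, 512)  == 1   not a multiple of 8: only the base interval is written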
|
||||
private long[] writeValues(FieldInfo field, DocValuesProducer valuesProducer, boolean ords)
|
||||
throws IOException {
|
||||
SortedNumericDocValues values = valuesProducer.getSortedNumeric(field);
|
||||
|
|
|
@ -138,15 +138,27 @@ import org.apache.lucene.util.packed.DirectWriter;
|
|||
*/
|
||||
public final class Lucene90DocValuesFormat extends DocValuesFormat {
|
||||
|
||||
private final int skipIndexIntervalSize;
|
||||
|
||||
/** Default constructor. */
|
||||
public Lucene90DocValuesFormat() {
|
||||
this(DEFAULT_SKIP_INDEX_INTERVAL_SIZE);
|
||||
}
|
||||
|
||||
/** Doc values fields format with specified skipIndexIntervalSize. */
|
||||
public Lucene90DocValuesFormat(int skipIndexIntervalSize) {
|
||||
super("Lucene90");
|
||||
if (skipIndexIntervalSize < 2) {
|
||||
throw new IllegalArgumentException(
|
||||
"skipIndexIntervalSize must be > 1, got [" + skipIndexIntervalSize + "]");
|
||||
}
|
||||
this.skipIndexIntervalSize = skipIndexIntervalSize;
|
||||
}
|
||||
|
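A hypothetical use of the new constructor above; any value below 2 is rejected:

// Illustrative only: choose a coarser skip-index granularity than the 4096-document default.
DocValuesFormat dvFormat = new Lucene90DocValuesFormat(8192);
// new Lucene90DocValuesFormat(1) would throw IllegalArgumentException ("skipIndexIntervalSize must be > 1").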
||||
@Override
|
||||
public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
|
||||
return new Lucene90DocValuesConsumer(
|
||||
state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
|
||||
state, skipIndexIntervalSize, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -182,6 +194,36 @@ public final class Lucene90DocValuesFormat extends DocValuesFormat {
|
|||
static final int TERMS_DICT_REVERSE_INDEX_SIZE = 1 << TERMS_DICT_REVERSE_INDEX_SHIFT;
|
||||
static final int TERMS_DICT_REVERSE_INDEX_MASK = TERMS_DICT_REVERSE_INDEX_SIZE - 1;
|
||||
|
||||
static final int SKIP_INDEX_INTERVAL_SHIFT = 12;
|
||||
static final int SKIP_INDEX_INTERVAL_SIZE = 1 << SKIP_INDEX_INTERVAL_SHIFT;
|
||||
// number of documents in an interval
|
||||
private static final int DEFAULT_SKIP_INDEX_INTERVAL_SIZE = 4096;
|
||||
// bytes on an interval:
|
||||
// * 1 byte : number of levels
|
||||
// * 16 bytes: min / max value,
|
||||
// * 8 bytes: min / max docID
|
||||
// * 4 bytes: number of documents
|
||||
private static final long SKIP_INDEX_INTERVAL_BYTES = 29L;
|
||||
// number of intervals represented as a shift to create a new level, this is 1 << 3 == 8
|
||||
// intervals.
|
||||
static final int SKIP_INDEX_LEVEL_SHIFT = 3;
|
||||
// max number of levels
|
||||
// Increasing this number, it increases how much heap we need at index time.
|
||||
// we currently need (1 * 8 * 8 * 8) = 512 accumulators on heap
|
||||
static final int SKIP_INDEX_MAX_LEVEL = 4;
|
||||
// number of bytes to skip when skipping a level. It does not take into account the
|
||||
// current interval that is being read.
|
||||
static final long[] SKIP_INDEX_JUMP_LENGTH_PER_LEVEL = new long[SKIP_INDEX_MAX_LEVEL];
|
||||
|
||||
static {
|
||||
// Size of the interval minus read bytes (1 byte for level and 4 bytes for maxDocID)
|
||||
SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[0] = SKIP_INDEX_INTERVAL_BYTES - 5L;
|
||||
for (int level = 1; level < SKIP_INDEX_MAX_LEVEL; level++) {
|
||||
// jump from previous level
|
||||
SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[level] = SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[level - 1];
|
||||
// nodes added by new level
|
||||
SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[level] +=
|
||||
(1 << (level * SKIP_INDEX_LEVEL_SHIFT)) * SKIP_INDEX_INTERVAL_BYTES;
|
||||
// remove the byte levels added in the previous level
|
||||
SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[level] -= (1 << ((level - 1) * SKIP_INDEX_LEVEL_SHIFT));
|
||||
}
|
||||
}
|
||||
}
|
||||
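Evaluating the static initializer above with SKIP_INDEX_INTERVAL_BYTES = 29, SKIP_INDEX_LEVEL_SHIFT = 3 and SKIP_INDEX_MAX_LEVEL = 4 gives the following jump lengths (bytes to skip once a level's maxDocID has been read):

// SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[0] = 29 - 5                 = 24
// SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[1] = 24   + 8   * 29 - 1    = 255
// SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[2] = 255  + 64  * 29 - 8    = 2103
// SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[3] = 2103 + 512 * 29 - 64   = 16887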
|
|
|
@ -16,6 +16,8 @@
|
|||
*/
|
||||
package org.apache.lucene.codecs.lucene90;
|
||||
|
||||
import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.SKIP_INDEX_JUMP_LENGTH_PER_LEVEL;
|
||||
import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.SKIP_INDEX_MAX_LEVEL;
|
||||
import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SHIFT;
|
||||
|
||||
import java.io.IOException;
|
||||
|
@ -1792,28 +1794,55 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
|
|||
if (input.length() > 0) {
|
||||
input.prefetch(0, 1);
|
||||
}
|
||||
// TODO: should we write to disk the actual max level for this segment?
|
||||
return new DocValuesSkipper() {
|
||||
int minDocID = -1;
|
||||
int maxDocID = -1;
|
||||
long minValue, maxValue;
|
||||
int docCount;
|
||||
final int[] minDocID = new int[SKIP_INDEX_MAX_LEVEL];
|
||||
final int[] maxDocID = new int[SKIP_INDEX_MAX_LEVEL];
|
||||
|
||||
{
|
||||
for (int i = 0; i < SKIP_INDEX_MAX_LEVEL; i++) {
|
||||
minDocID[i] = maxDocID[i] = -1;
|
||||
}
|
||||
}
|
||||
|
||||
final long[] minValue = new long[SKIP_INDEX_MAX_LEVEL];
|
||||
final long[] maxValue = new long[SKIP_INDEX_MAX_LEVEL];
|
||||
final int[] docCount = new int[SKIP_INDEX_MAX_LEVEL];
|
||||
int levels = 1;
|
||||
|
||||
@Override
|
||||
public void advance(int target) throws IOException {
|
||||
if (target > entry.maxDocId) {
|
||||
minDocID = DocIdSetIterator.NO_MORE_DOCS;
|
||||
maxDocID = DocIdSetIterator.NO_MORE_DOCS;
|
||||
// skipper is exhausted
|
||||
for (int i = 0; i < SKIP_INDEX_MAX_LEVEL; i++) {
|
||||
minDocID[i] = maxDocID[i] = DocIdSetIterator.NO_MORE_DOCS;
|
||||
}
|
||||
} else {
|
||||
// find next interval
|
||||
assert target > maxDocID[0] : "target must be bigger that current interval";
|
||||
while (true) {
|
||||
maxDocID = input.readInt();
|
||||
if (maxDocID >= target) {
|
||||
minDocID = input.readInt();
|
||||
maxValue = input.readLong();
|
||||
minValue = input.readLong();
|
||||
docCount = input.readInt();
|
||||
levels = input.readByte();
|
||||
assert levels <= SKIP_INDEX_MAX_LEVEL && levels > 0
|
||||
: "level out of range [" + levels + "]";
|
||||
boolean valid = true;
|
||||
// check if current interval is competitive or we can jump to the next position
|
||||
for (int level = levels - 1; level >= 0; level--) {
|
||||
if ((maxDocID[level] = input.readInt()) < target) {
|
||||
input.skipBytes(SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[level]); // the jump for the level
|
||||
valid = false;
|
||||
break;
|
||||
}
|
||||
minDocID[level] = input.readInt();
|
||||
maxValue[level] = input.readLong();
|
||||
minValue[level] = input.readLong();
|
||||
docCount[level] = input.readInt();
|
||||
}
|
||||
if (valid) {
|
||||
// adjust levels
|
||||
while (levels < SKIP_INDEX_MAX_LEVEL && maxDocID[levels] >= target) {
|
||||
levels++;
|
||||
}
|
||||
break;
|
||||
} else {
|
||||
input.skipBytes(24);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1821,32 +1850,32 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
|
|||
|
||||
@Override
|
||||
public int numLevels() {
|
||||
return 1;
|
||||
return levels;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int minDocID(int level) {
|
||||
return minDocID;
|
||||
return minDocID[level];
|
||||
}
|
||||
|
||||
@Override
|
||||
public int maxDocID(int level) {
|
||||
return maxDocID;
|
||||
return maxDocID[level];
|
||||
}
|
||||
|
||||
@Override
|
||||
public long minValue(int level) {
|
||||
return minValue;
|
||||
return minValue[level];
|
||||
}
|
||||
|
||||
@Override
|
||||
public long maxValue(int level) {
|
||||
return maxValue;
|
||||
return maxValue[level];
|
||||
}
|
||||
|
||||
@Override
|
||||
public int docCount(int level) {
|
||||
return docCount;
|
||||
return docCount[level];
|
||||
}
|
||||
|
||||
@Override
|
||||
|
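A hedged sketch (not Lucene code) of how a range filter could consume the per-level bounds exposed above: outer levels summarize wider doc ranges, so a non-competitive outer level lets the caller jump further in one step.

static int nextCandidateDoc(DocValuesSkipper skipper, long low, long high) {
  // Assumed convention: higher levels cover wider doc ranges (numLevels() - 1 is the widest).
  for (int level = skipper.numLevels() - 1; level >= 0; level--) {
    if (skipper.maxValue(level) < low || skipper.minValue(level) > high) {
      return skipper.maxDocID(level) + 1; // nothing in this whole interval can match
    }
  }
  return skipper.minDocID(0); // the narrowest interval may contain matches
}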
|
|
@ -49,9 +49,9 @@ import org.apache.lucene.util.packed.DirectMonotonicWriter;
|
|||
*
|
||||
* <pre class="prettyprint">
|
||||
* // the default: for high performance
|
||||
* indexWriterConfig.setCodec(new Lucene99Codec(Mode.BEST_SPEED));
|
||||
* indexWriterConfig.setCodec(new Lucene912Codec(Mode.BEST_SPEED));
|
||||
* // instead for higher performance (but slower):
|
||||
* // indexWriterConfig.setCodec(new Lucene99Codec(Mode.BEST_COMPRESSION));
|
||||
* // indexWriterConfig.setCodec(new Lucene912Codec(Mode.BEST_COMPRESSION));
|
||||
* </pre>
|
||||
*
|
||||
* <p><b>File formats</b>
|
||||
|
|
|
@ -78,7 +78,6 @@ public final class FieldReader extends Terms {
this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
this.rootCode = rootCode;
this.minTerm = minTerm;
this.maxTerm = maxTerm;
// if (DEBUG) {

@ -89,13 +88,8 @@ public final class FieldReader extends Terms {
readVLongOutput(new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length))
>>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
// Initialize FST always off-heap.
final IndexInput clone = indexIn.clone();
clone.seek(indexStartFP);
index =
new FST<>(
FST.readMetadata(metaIn, ByteSequenceOutputs.getSingleton()),
clone,
new OffHeapFSTStore());
var metadata = FST.readMetadata(metaIn, ByteSequenceOutputs.getSingleton());
index = FST.fromFSTReader(metadata, new OffHeapFSTStore(indexIn, indexStartFP, metadata));
/*
if (false) {
final String dotFileName = segment + "_" + fieldInfo.name + ".dot";

@ -105,6 +99,14 @@ public final class FieldReader extends Terms {
w.close();
}
*/
BytesRef emptyOutput = metadata.getEmptyOutput();
if (rootCode.equals(emptyOutput) == false) {
// TODO: this branch is never taken
assert false;
this.rootCode = rootCode;
} else {
this.rootCode = emptyOutput;
}
}

long readVLongOutput(DataInput in) throws IOException {

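A sketch (not part of the patch) spelling out the new construction path above with explicit types; it assumes the FST.FSTMetadata / FST.fromFSTReader / OffHeapFSTStore(IndexInput, long, FSTMetadata) API that the added lines rely on.

    // metaIn holds the FST metadata, indexIn holds the FST body starting at indexStartFP.
    FST.FSTMetadata<BytesRef> fstMetadata =
        FST.readMetadata(metaIn, ByteSequenceOutputs.getSingleton());
    // No clone()/seek() up front: the off-heap store reads lazily from indexIn as arcs are visited.
    FST<BytesRef> termsIndex =
        FST.fromFSTReader(fstMetadata, new OffHeapFSTStore(indexIn, indexStartFP, fstMetadata));
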
@ -200,6 +200,11 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer {
final int docCount = metaIn.readVInt();
BytesRef minTerm = readBytesRef(metaIn);
BytesRef maxTerm = readBytesRef(metaIn);
if (numTerms == 1) {
assert maxTerm.equals(minTerm);
// save heap for edge case of a single term only so min == max
maxTerm = minTerm;
}
if (docCount < 0
|| docCount > state.segmentInfo.maxDoc()) { // #docs with field must be <= #docs
throw new CorruptIndexException(

@ -270,9 +275,8 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer {
throw new CorruptIndexException("invalid bytes length: " + numBytes, in);
}

BytesRef bytes = new BytesRef();
BytesRef bytes = new BytesRef(numBytes);
bytes.length = numBytes;
bytes.bytes = new byte[numBytes];
in.readBytes(bytes.bytes, 0, numBytes);

return bytes;

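A sketch (not part of the patch) of the resulting helper, with an illustrative name: the BytesRef(int) constructor allocates the backing array directly, replacing the separate `bytes.bytes = new byte[numBytes]` assignment used before.

    static BytesRef readLengthPrefixedBytes(DataInput in) throws IOException {
      final int numBytes = in.readVInt();
      if (numBytes < 0) {
        throw new CorruptIndexException("invalid bytes length: " + numBytes, in);
      }
      BytesRef bytes = new BytesRef(numBytes); // backing array of exactly numBytes bytes
      bytes.length = numBytes;
      in.readBytes(bytes.bytes, 0, numBytes);
      return bytes;
    }
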
@ -598,8 +598,6 @@ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer {
private final ByteBuffersDataOutput scratchBytes = ByteBuffersDataOutput.newResettableInstance();
private final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();

static final BytesRef EMPTY_BYTES_REF = new BytesRef();

private static class StatsWriter {

private final DataOutput out;

@ -30,6 +30,7 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOBooleanSupplier;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Util;

@ -276,10 +277,10 @@ final class SegmentTermsEnum extends BaseTermsEnum {
// System.out.println(" skip rewind!");
// }
}
assert length == f.prefix;
assert length == f.prefixLength;
} else {
f.nextEnt = -1;
f.prefix = length;
f.prefixLength = length;
f.state.termBlockOrd = 0;
f.fpOrig = f.fp = fp;
f.lastSubFP = -1;

@ -307,15 +308,13 @@ final class SegmentTermsEnum extends BaseTermsEnum {
return true;
}

@Override
public boolean seekExact(BytesRef target) throws IOException {

private IOBooleanSupplier prepareSeekExact(BytesRef target, boolean prefetch) throws IOException {
if (fr.index == null) {
throw new IllegalStateException("terms index was not loaded");
}

if (fr.size() > 0 && (target.compareTo(fr.getMin()) < 0 || target.compareTo(fr.getMax()) > 0)) {
return false;
return null;
}

term.grow(1 + target.length);

@ -431,7 +430,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
// if (DEBUG) {
// System.out.println(" target is same as current; return true");
// }
return true;
return () -> true;
} else {
// if (DEBUG) {
// System.out.println(" target is same as current but term doesn't exist");

@ -489,7 +488,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
// toHex(targetLabel));
// }

validIndexPrefix = currentFrame.prefix;
validIndexPrefix = currentFrame.prefixLength;
// validIndexPrefix = targetUpto;

currentFrame.scanToFloorFrame(target);

@ -501,9 +500,14 @@ final class SegmentTermsEnum extends BaseTermsEnum {
// if (DEBUG) {
// System.out.println(" FAST NOT_FOUND term=" + ToStringUtils.bytesRefToString(term));
// }
return false;
return null;
}

if (prefetch) {
currentFrame.prefetchBlock();
}

return () -> {
currentFrame.loadBlock();

final SeekStatus result = currentFrame.scanToTerm(target, true);

@ -519,6 +523,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
// }
return false;
}
};
} else {
// Follow this arc
arc = nextArc;

@ -545,7 +550,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
}

// validIndexPrefix = targetUpto;
validIndexPrefix = currentFrame.prefix;
validIndexPrefix = currentFrame.prefixLength;

currentFrame.scanToFloorFrame(target);

@ -556,9 +561,14 @@ final class SegmentTermsEnum extends BaseTermsEnum {
// if (DEBUG) {
// System.out.println(" FAST NOT_FOUND term=" + ToStringUtils.bytesRefToString(term));
// }
return false;
return null;
}

if (prefetch) {
currentFrame.prefetchBlock();
}

return () -> {
currentFrame.loadBlock();

final SeekStatus result = currentFrame.scanToTerm(target, true);

@ -575,6 +585,18 @@ final class SegmentTermsEnum extends BaseTermsEnum {

return false;
}
};
}

@Override
public IOBooleanSupplier prepareSeekExact(BytesRef target) throws IOException {
return prepareSeekExact(target, true);
}

@Override
public boolean seekExact(BytesRef target) throws IOException {
IOBooleanSupplier termExistsSupplier = prepareSeekExact(target, false);
return termExistsSupplier != null && termExistsSupplier.get();
}

@Override

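A minimal caller-side sketch (not part of the patch) of the two-phase seek this hunk introduces. It assumes only the prepareSeekExact(BytesRef) method and the IOBooleanSupplier type shown above, a `termsEnum` variable holding this enum, and surrounding code that declares throws IOException.

    // Phase 1: resolve the terms-index lookup and prefetch the candidate block.
    IOBooleanSupplier pending = termsEnum.prepareSeekExact(new BytesRef("lucene"));
    if (pending == null) {
      // target is outside this segment's [minTerm, maxTerm]: the term cannot exist
    } else {
      // ... other work (e.g. preparing seeks against other segments) can overlap with the prefetch ...
      // Phase 2: load and scan the block, which is exactly what seekExact() used to do in one step.
      boolean found = pending.get();
    }
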
@ -750,7 +772,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
// targetLabel);
// }

validIndexPrefix = currentFrame.prefix;
validIndexPrefix = currentFrame.prefixLength;
// validIndexPrefix = targetUpto;

currentFrame.scanToFloorFrame(target);

@ -808,7 +830,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
}

// validIndexPrefix = targetUpto;
validIndexPrefix = currentFrame.prefix;
validIndexPrefix = currentFrame.prefixLength;

currentFrame.scanToFloorFrame(target);

@ -846,7 +868,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
while (true) {
SegmentTermsEnumFrame f = getFrame(ord);
assert f != null;
final BytesRef prefix = new BytesRef(term.get().bytes, 0, f.prefix);
final BytesRef prefix = new BytesRef(term.get().bytes, 0, f.prefixLength);
if (f.nextEnt == -1) {
out.println(
" frame "

@ -857,7 +879,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
+ f.fp
+ (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "")
+ " prefixLen="
+ f.prefix
+ f.prefixLength
+ " prefix="
+ prefix
+ (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")"))

@ -885,7 +907,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
+ f.fp
+ (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "")
+ " prefixLen="
+ f.prefix
+ f.prefixLength
+ " prefix="
+ prefix
+ " nextEnt="

@ -910,12 +932,14 @@ final class SegmentTermsEnum extends BaseTermsEnum {
}
if (fr.index != null) {
assert !isSeekFrame || f.arc != null : "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc;
if (f.prefix > 0 && isSeekFrame && f.arc.label() != (term.byteAt(f.prefix - 1) & 0xFF)) {
if (f.prefixLength > 0
&& isSeekFrame
&& f.arc.label() != (term.byteAt(f.prefixLength - 1) & 0xFF)) {
out.println(
" broken seek state: arc.label="
+ (char) f.arc.label()
+ " vs term byte="
+ (char) (term.byteAt(f.prefix - 1) & 0xFF));
+ (char) (term.byteAt(f.prefixLength - 1) & 0xFF));
throw new RuntimeException("seek state is broken");
}
BytesRef output = Util.get(fr.index, prefix);

@ -943,7 +967,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
if (f == currentFrame) {
break;
}
if (f.prefix == validIndexPrefix) {
if (f.prefixLength == validIndexPrefix) {
isSeekFrame = false;
}
ord++;

@ -1024,7 +1048,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {

// Note that the seek state (last seek) has been
// invalidated beyond this depth
validIndexPrefix = Math.min(validIndexPrefix, currentFrame.prefix);
validIndexPrefix = Math.min(validIndexPrefix, currentFrame.prefixLength);
// if (DEBUG) {
// System.out.println(" reset validIndexPrefix=" + validIndexPrefix);
// }

@ -59,7 +59,7 @@ final class SegmentTermsEnumFrame {
final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();

// Length of prefix shared by all terms in this block
int prefix;
int prefixLength;

// Number of entries (term or sub-block) in this block
int entCount;

@ -133,6 +133,21 @@ final class SegmentTermsEnumFrame {
loadBlock();
}

void prefetchBlock() throws IOException {
if (nextEnt != -1) {
// Already loaded
return;
}

// Clone the IndexInput lazily, so that consumers
// that just pull a TermsEnum to
// seekExact(TermState) don't pay this cost:
ste.initIndexInput();

// TODO: Could we know the number of bytes to prefetch?
ste.in.prefetch(fp, 1);
}

/* Does initial decode of next block of terms; this
doesn't actually decode the docFreq, totalTermFreq,
postings details (frq/prx offset, etc.) metadata;

@ -303,11 +318,11 @@ final class SegmentTermsEnumFrame {
assert nextEnt != -1 && nextEnt < entCount
: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
nextEnt++;
suffix = suffixLengthsReader.readVInt();
suffixLength = suffixLengthsReader.readVInt();
startBytePos = suffixesReader.getPosition();
ste.term.setLength(prefix + suffix);
ste.term.setLength(prefixLength + suffixLength);
ste.term.grow(ste.term.length());
suffixesReader.readBytes(ste.term.bytes(), prefix, suffix);
suffixesReader.readBytes(ste.term.bytes(), prefixLength, suffixLength);
ste.termExists = true;
}

@ -331,11 +346,11 @@ final class SegmentTermsEnumFrame {
: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
nextEnt++;
final int code = suffixLengthsReader.readVInt();
suffix = code >>> 1;
suffixLength = code >>> 1;
startBytePos = suffixesReader.getPosition();
ste.term.setLength(prefix + suffix);
ste.term.setLength(prefixLength + suffixLength);
ste.term.grow(ste.term.length());
suffixesReader.readBytes(ste.term.bytes(), prefix, suffix);
suffixesReader.readBytes(ste.term.bytes(), prefixLength, suffixLength);
if ((code & 1) == 0) {
// A normal term
ste.termExists = true;

@ -360,7 +375,7 @@ final class SegmentTermsEnumFrame {
// floor blocks we "typically" get
public void scanToFloorFrame(BytesRef target) {

if (!isFloor || target.length <= prefix) {
if (!isFloor || target.length <= prefixLength) {
// if (DEBUG) {
// System.out.println(" scanToFloorFrame skip: isFloor=" + isFloor + " target.length=" +
// target.length + " vs prefix=" + prefix);

@ -368,7 +383,7 @@ final class SegmentTermsEnumFrame {
return;
}

final int targetLabel = target.bytes[target.offset + prefix] & 0xFF;
final int targetLabel = target.bytes[target.offset + prefixLength] & 0xFF;

// if (DEBUG) {
// System.out.println(" scanToFloorFrame fpOrig=" + fpOrig + " targetLabel=" +

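A small sketch (not part of the patch) of the prefetch-then-load pattern that prefetchBlock() enables. Only IndexInput#prefetch/seek/readByte are real API here; prepareOtherBlocks() and the method name are illustrative, and prefetch is just a hint to the Directory implementation that the bytes at that file pointer will be read soon.

    void prefetchThenLoad(IndexInput in, long fp) throws IOException {
      in.prefetch(fp, 1);      // advisory: start bringing the block at fp into cache
      prepareOtherBlocks();    // illustrative: overlap other work with the pending I/O
      in.seek(fp);
      byte firstByte = in.readByte(); // the blocking read, ideally served from cache by now
    }
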
@ -482,7 +497,7 @@ final class SegmentTermsEnumFrame {

// Used only by assert
private boolean prefixMatches(BytesRef target) {
for (int bytePos = 0; bytePos < prefix; bytePos++) {
for (int bytePos = 0; bytePos < prefixLength; bytePos++) {
if (target.bytes[target.offset + bytePos] != ste.term.byteAt(bytePos)) {
return false;
}

@ -538,7 +553,7 @@ final class SegmentTermsEnumFrame {
}

private int startBytePos;
private int suffix;
private int suffixLength;
private long subCode;
CompressionAlgorithm compressionAlg = CompressionAlgorithm.NO_COMPRESSION;

@ -569,7 +584,7 @@ final class SegmentTermsEnumFrame {
do {
nextEnt++;

suffix = suffixLengthsReader.readVInt();
suffixLength = suffixLengthsReader.readVInt();

// if (DEBUG) {
// BytesRef suffixBytesRef = new BytesRef();

@ -581,16 +596,16 @@ final class SegmentTermsEnumFrame {
// }

startBytePos = suffixesReader.getPosition();
suffixesReader.skipBytes(suffix);
suffixesReader.skipBytes(suffixLength);

// Loop over bytes in the suffix, comparing to the target
final int cmp =
Arrays.compareUnsigned(
suffixBytes,
startBytePos,
startBytePos + suffix,
startBytePos + suffixLength,
target.bytes,
target.offset + prefix,
target.offset + prefixLength,
target.offset + target.length);

if (cmp < 0) {

@ -659,7 +674,7 @@ final class SegmentTermsEnumFrame {

assert prefixMatches(target);

suffix = suffixLengthsReader.readVInt();
suffixLength = suffixLengthsReader.readVInt();
// TODO early terminate when target length unequals suffix + prefix.
// But we need to keep the same status with scanToTermLeaf.
int start = nextEnt;

@ -669,16 +684,16 @@ final class SegmentTermsEnumFrame {
while (start <= end) {
int mid = (start + end) >>> 1;
nextEnt = mid + 1;
startBytePos = mid * suffix;
startBytePos = mid * suffixLength;

// Binary search bytes in the suffix, comparing to the target.
cmp =
Arrays.compareUnsigned(
suffixBytes,
startBytePos,
startBytePos + suffix,
startBytePos + suffixLength,
target.bytes,
target.offset + prefix,
target.offset + prefixLength,
target.offset + target.length);
if (cmp < 0) {
start = mid + 1;

@ -686,7 +701,7 @@ final class SegmentTermsEnumFrame {
end = mid - 1;
} else {
// Exact match!
suffixesReader.setPosition(startBytePos + suffix);
suffixesReader.setPosition(startBytePos + suffixLength);
fillTerm();
// if (DEBUG) System.out.println(" found!");
return SeekStatus.FOUND;

@ -709,14 +724,14 @@ final class SegmentTermsEnumFrame {
// If binary search ended at the less term, and greater term exists.
// We need to advance to the greater term.
if (cmp < 0) {
startBytePos += suffix;
startBytePos += suffixLength;
nextEnt++;
}
suffixesReader.setPosition(startBytePos + suffix);
suffixesReader.setPosition(startBytePos + suffixLength);
fillTerm();
} else {
seekStatus = SeekStatus.END;
suffixesReader.setPosition(startBytePos + suffix);
suffixesReader.setPosition(startBytePos + suffixLength);
if (exactOnly) {
fillTerm();
}

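A self-contained sketch (not part of the patch) of the search idea in the hunk above: when every entry in a block stores a suffix of the same length, the i-th suffix starts at i * suffixLength, so the block can be binary-searched with unsigned byte comparisons instead of scanned linearly. Names are illustrative; only java.util.Arrays is required.

    import java.util.Arrays;

    final class FixedWidthSuffixSearch {
      /** Returns the matching entry index, or (-(insertion point) - 1) as in Arrays.binarySearch. */
      static int find(byte[] suffixBytes, int suffixLength, int entryCount, byte[] target, int from, int to) {
        int lo = 0, hi = entryCount - 1;
        while (lo <= hi) {
          int mid = (lo + hi) >>> 1;
          int start = mid * suffixLength;
          int cmp = Arrays.compareUnsigned(
              suffixBytes, start, start + suffixLength, target, from, to);
          if (cmp < 0) {
            lo = mid + 1;        // mid-th suffix sorts before the target
          } else if (cmp > 0) {
            hi = mid - 1;        // mid-th suffix sorts after the target
          } else {
            return mid;          // exact match
          }
        }
        return -(lo + 1);
      }
    }
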
@ -754,7 +769,7 @@ final class SegmentTermsEnumFrame {
nextEnt++;

final int code = suffixLengthsReader.readVInt();
suffix = code >>> 1;
suffixLength = code >>> 1;

// if (DEBUG) {
// BytesRef suffixBytesRef = new BytesRef();

@ -767,7 +782,7 @@ final class SegmentTermsEnumFrame {
// }

startBytePos = suffixesReader.getPosition();
suffixesReader.skipBytes(suffix);
suffixesReader.skipBytes(suffixLength);
ste.termExists = (code & 1) == 0;
if (ste.termExists) {
state.termBlockOrd++;

@ -781,9 +796,9 @@ final class SegmentTermsEnumFrame {
Arrays.compareUnsigned(
suffixBytes,
startBytePos,
startBytePos + suffix,
startBytePos + suffixLength,
target.bytes,
target.offset + prefix,
target.offset + prefixLength,
target.offset + target.length);

if (cmp < 0) {

@ -804,7 +819,8 @@ final class SegmentTermsEnumFrame {
// us to position to the next term after
// the target, so we must recurse into the
// sub-frame(s):
ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, prefix + suffix);
ste.currentFrame =
ste.pushFrame(null, ste.currentFrame.lastSubFP, prefixLength + suffixLength);
ste.currentFrame.loadBlock();
while (ste.currentFrame.next()) {
ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, ste.term.length());

@ -849,9 +865,9 @@ final class SegmentTermsEnumFrame {
}

private void fillTerm() {
final int termLength = prefix + suffix;
final int termLength = prefixLength + suffixLength;
ste.term.setLength(termLength);
ste.term.grow(termLength);
System.arraycopy(suffixBytes, startBytePos, ste.term.bytes(), prefix, suffix);
System.arraycopy(suffixBytes, startBytePos, ste.term.bytes(), prefixLength, suffixLength);
}
}

@ -116,10 +116,10 @@ public class Stats {
nonFloorBlockCount++;
}

if (blockCountByPrefixLen.length <= frame.prefix) {
blockCountByPrefixLen = ArrayUtil.grow(blockCountByPrefixLen, 1 + frame.prefix);
if (blockCountByPrefixLen.length <= frame.prefixLength) {
blockCountByPrefixLen = ArrayUtil.grow(blockCountByPrefixLen, 1 + frame.prefixLength);
}
blockCountByPrefixLen[frame.prefix]++;
blockCountByPrefixLen[frame.prefixLength]++;
startBlockCount++;
totalBlockSuffixBytes += frame.totalSuffixBytes;
totalUncompressedBlockSuffixBytes += frame.suffixesReader.length();