mirror of https://github.com/apache/lucene.git

Merge branch 'apache:main' into bpv21_main

commit 0a0701995a

@@ -23,6 +23,7 @@ Apache Lucene is a high-performance, full-featured text search engine library
 written in Java.

 [![Build Status](https://ci-builds.apache.org/job/Lucene/job/Lucene-Artifacts-main/badge/icon?subject=Lucene)](https://ci-builds.apache.org/job/Lucene/job/Lucene-Artifacts-main/)
+[![Revved up by Develocity](https://img.shields.io/badge/Revved%20up%20by-Develocity-06A0CE?logo=Gradle&labelColor=02303A)](https://ge.apache.org/scans?search.buildToolType=gradle&search.rootProjectNames=lucene-root)

 ## Online Documentation

@@ -41,7 +41,7 @@ import jdk.jfr.consumer.RecordingFile;
  */
 public class ProfileResults {
   /** Formats a frame to a formatted line. This is deduplicated on! */
-  static String frameToString(RecordedFrame frame, boolean lineNumbers) {
+  static String frameToString(RecordedFrame frame, boolean lineNumbers, boolean frameTypes) {
     StringBuilder builder = new StringBuilder();
     RecordedMethod method = frame.getMethod();
     RecordedClass clazz = method.getType();
@@ -55,13 +55,14 @@ public class ProfileResults {
     builder.append("#");
     builder.append(method.getName());
     builder.append("()");
-    if (lineNumbers) {
+    if (lineNumbers && frame.getLineNumber() != -1) {
       builder.append(":");
-      if (frame.getLineNumber() == -1) {
-        builder.append("(" + frame.getType() + " code)");
-      } else {
-        builder.append(frame.getLineNumber());
-      }
+      builder.append(frame.getLineNumber());
     }
+    if (clazz != null && frameTypes) {
+      builder.append(" [");
+      builder.append(frame.getType());
+      builder.append(" code]");
+    }
     return builder.toString();
   }
@@ -77,6 +78,8 @@ public class ProfileResults {
   public static final String COUNT_DEFAULT = "10";
   public static final String LINENUMBERS_KEY = "tests.profile.linenumbers";
   public static final String LINENUMBERS_DEFAULT = "false";
+  public static final String FRAMETYPES_KEY = "tests.profile.frametypes";
+  public static final String FRAMETYPES_DEFAULT = "true";

   /**
    * Driver method, for testing standalone.
@@ -92,7 +95,8 @@ public class ProfileResults {
         System.getProperty(MODE_KEY, MODE_DEFAULT),
         Integer.parseInt(System.getProperty(STACKSIZE_KEY, STACKSIZE_DEFAULT)),
         Integer.parseInt(System.getProperty(COUNT_KEY, COUNT_DEFAULT)),
-        Boolean.parseBoolean(System.getProperty(LINENUMBERS_KEY, LINENUMBERS_DEFAULT)));
+        Boolean.parseBoolean(System.getProperty(LINENUMBERS_KEY, LINENUMBERS_DEFAULT)),
+        Boolean.parseBoolean(System.getProperty(FRAMETYPES_KEY, FRAMETYPES_DEFAULT)));
   }

   /** true if we care about this event */
@@ -152,7 +156,12 @@ public class ProfileResults {

   /** Process all the JFR files passed in args and print a merged summary. */
   public static void printReport(
-      List<String> files, String mode, int stacksize, int count, boolean lineNumbers)
+      List<String> files,
+      String mode,
+      int stacksize,
+      int count,
+      boolean lineNumbers,
+      boolean frameTypes)
       throws IOException {
     if (!"cpu".equals(mode) && !"heap".equals(mode)) {
       throw new IllegalArgumentException("tests.profile.mode must be one of (cpu,heap)");
@@ -181,7 +190,7 @@ public class ProfileResults {
       if (stack.length() > 0) {
         stack.append("\n").append(framePadding).append(" at ");
       }
-      stack.append(frameToString(trace.getFrames().get(i), lineNumbers));
+      stack.append(frameToString(trace.getFrames().get(i), lineNumbers, frameTypes));
     }
     String line = stack.toString();
     SimpleEntry<String, Long> entry =

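Note: the following standalone sketch is not part of the commit; it only illustrates, under assumed example values, what the new tests.profile.frametypes knob (default "true") adds to a formatted profile line. The class, method, and frame-type names are hypothetical.

public class FrameLineSketch {
  // Mirrors the shape of ProfileResults.frameToString after this change:
  // the line number is appended only when known, and the frame type is
  // appended in brackets when frameTypes is enabled.
  static String format(String clazz, String method, int line,
                       String frameType, boolean lineNumbers, boolean frameTypes) {
    StringBuilder b = new StringBuilder();
    b.append(clazz).append("#").append(method).append("()");
    if (lineNumbers && line != -1) {
      b.append(":").append(line);
    }
    if (frameTypes) {
      b.append(" [").append(frameType).append(" code]");
    }
    return b.toString();
  }

  public static void main(String[] args) {
    // prints: org.example.Foo#bar() [JIT compiled code]
    System.out.println(format("org.example.Foo", "bar", -1, "JIT compiled", false, true));
    // prints: org.example.Foo#bar():42
    System.out.println(format("org.example.Foo", "bar", 42, "Interpreted", true, false));
  }
}
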
@@ -231,8 +231,8 @@ public class MissingDoclet extends StandardDoclet {
       case PACKAGE:
         checkComment(element);
         break;
-        // class-like elements, check them, then recursively check their children (fields and
-        // methods)
+      // class-like elements, check them, then recursively check their children (fields and
+      // methods)
       case CLASS:
       case INTERFACE:
       case ENUM:
@@ -257,7 +257,7 @@ public class MissingDoclet extends StandardDoclet {
           }
         }
         break;
-        // method-like elements, check them if we are configured to do so
+      // method-like elements, check them if we are configured to do so
       case METHOD:
       case CONSTRUCTOR:
       case FIELD:

build.gradle
@@ -80,6 +80,9 @@ ext {
   // Minimum Java version required to compile and run Lucene.
   minJavaVersion = JavaVersion.toVersion(deps.versions.minJava.get())

+  // also change this in extractor tool: ExtractForeignAPI
+  vectorIncubatorJavaVersions = [ JavaVersion.VERSION_21, JavaVersion.VERSION_22, JavaVersion.VERSION_23 ] as Set
+
   // snapshot build marker used in scripts.
   snapshotBuild = version.contains("SNAPSHOT")

@@ -117,10 +120,6 @@ apply from: file('gradle/generation/local-settings.gradle')
 // Make sure the build environment is consistent.
 apply from: file('gradle/validation/check-environment.gradle')

-// IDE support, settings and specials.
-apply from: file('gradle/ide/intellij-idea.gradle')
-apply from: file('gradle/ide/eclipse.gradle')
-
 // Set up defaults and configure aspects for certain modules or functionality
 // (java, tests)
 apply from: file('gradle/java/folder-layout.gradle')

@@ -133,6 +132,10 @@ apply from: file('gradle/testing/alternative-jdk-support.gradle')
 apply from: file('gradle/java/jar-manifest.gradle')
 apply from: file('gradle/java/modules.gradle')

+// IDE support, settings and specials.
+apply from: file('gradle/ide/intellij-idea.gradle')
+apply from: file('gradle/ide/eclipse.gradle')
+
 // Maven artifact publishing.
 apply from: file('gradle/maven/publications.gradle')

@@ -67,6 +67,13 @@
   </maintainer>

   <!-- NOTE: please insert releases in numeric order, NOT chronologically. -->
+  <release>
+    <Version>
+      <name>lucene-9.11.1</name>
+      <created>2024-06-27</created>
+      <revision>9.11.1</revision>
+    </Version>
+  </release>
   <release>
     <Version>
       <name>lucene-9.11.0</name>

@@ -0,0 +1,78 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import re
import subprocess
import sys
import tempfile
import urllib.request

'''
A simple tool to see diffs between main's version of CHANGES.txt entries for
a given release vs the stable branch's version.  It's best to keep these 1)
identical and 2) matching what changes were actually backported to be honest
to users and avoid future annoying conflicts on backport.
'''

# e.g. python3 -u diff_lucene_changes.py branch_9_9 main 9.9.0
#

def get_changes_url(branch_name):
  if os.path.isdir(branch_name):
    url = f'file://{branch_name}/lucene/CHANGES.txt'
  else:
    url = f'https://raw.githubusercontent.com/apache/lucene/{branch_name}/lucene/CHANGES.txt'
  print(f'NOTE: resolving {branch_name} --> {url}')
  return url

def extract_release_section(changes_txt, release_name):
  return re.search(f'=======+ Lucene {re.escape(release_name)} =======+(.*?)=======+ Lucene .*? =======+$',
                   changes_txt.decode('utf-8'), re.MULTILINE | re.DOTALL).group(1).encode('utf-8')

def main():
  if len(sys.argv) < 3 or len(sys.argv) > 5:
    print('\nUsage: python3 -u dev-tools/scripts/diff_lucene_changes.py <branch1-or-local-clone> <branch2-or-local-clone> <release-name> [diff-commandline-extras]\n')
    print('  e.g.: python3 -u dev-tools/scripts/diff_lucene_changes.py branch_9_9 /l/trunk 9.9.0 "-w"\n')
    sys.exit(1)

  branch1 = sys.argv[1]
  branch2 = sys.argv[2]
  release_name = sys.argv[3]

  if len(sys.argv) > 4:
    diff_cl_extras = [sys.argv[4]]
  else:
    diff_cl_extras = []

  branch1_changes = extract_release_section(urllib.request.urlopen(get_changes_url(branch1)).read(),
                                            release_name)
  branch2_changes = extract_release_section(urllib.request.urlopen(get_changes_url(branch2)).read(),
                                            release_name)

  with tempfile.NamedTemporaryFile() as f1, tempfile.NamedTemporaryFile() as f2:
    f1.write(branch1_changes)
    f2.write(branch2_changes)

    command = ['diff'] + diff_cl_extras + [f1.name, f2.name]

    # diff returns non-zero exit status when there are diffs, so don't pass check=True
    print(subprocess.run(command, check=False, capture_output=True).stdout.decode('utf-8'))

if __name__ == '__main__':
  main()

@@ -17,13 +17,6 @@

 def resources = scriptResources(buildscript)

-configure(rootProject) {
-  ext {
-    // also change this in extractor tool: ExtractForeignAPI
-    vectorIncubatorJavaVersions = [ JavaVersion.VERSION_21, JavaVersion.VERSION_22 ] as Set
-  }
-}
-
 configure(project(":lucene:core")) {
   ext {
     apijars = layout.projectDirectory.dir("src/generated/jdk")

@@ -23,7 +23,7 @@ configure(project(":lucene:core")) {
     description "Regenerate gen_ForUtil.py"
     group "generation"

-    def genDir = file("src/java/org/apache/lucene/codecs/lucene99")
+    def genDir = file("src/java/org/apache/lucene/codecs/lucene912")
     def genScript = file("${genDir}/gen_ForUtil.py")
     def genOutput = file("${genDir}/ForUtil.java")

@@ -43,6 +43,31 @@ configure(project(":lucene:core")) {
     andThenTasks: ["spotlessJava", "spotlessJavaApply"],
     mustRunBefore: [ "compileJava" ]
   ])
+
+  task generateForDeltaUtilInternal() {
+    description "Regenerate gen_ForDeltaUtil.py"
+    group "generation"
+
+    def genDir = file("src/java/org/apache/lucene/codecs/lucene912")
+    def genScript = file("${genDir}/gen_ForDeltaUtil.py")
+    def genOutput = file("${genDir}/ForDeltaUtil.java")
+
+    inputs.file genScript
+    outputs.file genOutput
+
+    doLast {
+      quietExec {
+        workingDir genDir
+        executable project.externalTool("python3")
+        args = [ '-B', genScript ]
+      }
+    }
+  }
+
+  regenerate.dependsOn wrapWithPersistentChecksums(generateForDeltaUtilInternal, [
+    andThenTasks: ["spotlessJava", "spotlessJavaApply"],
+    mustRunBefore: [ "compileJava" ]
+  ])
 }

 configure(project(":lucene:backward-codecs")) {

@@ -96,5 +121,30 @@ configure(project(":lucene:backward-codecs")) {
     andThenTasks: ["spotlessJava", "spotlessJavaApply"],
     mustRunBefore: [ "compileJava" ]
   ])
+
+  task generateForUtil99Internal() {
+    description "Regenerate gen_ForUtil.py"
+    group "generation"
+
+    def genDir = file("src/java/org/apache/lucene/backward_codecs/lucene99")
+    def genScript = file("${genDir}/gen_ForUtil.py")
+    def genOutput = file("${genDir}/ForUtil.java")
+
+    inputs.file genScript
+    outputs.file genOutput
+
+    doLast {
+      quietExec {
+        workingDir genDir
+        executable project.externalTool("python3")
+        args = [ '-B', genScript ]
+      }
+    }
+  }
+
+  regenerate.dependsOn wrapWithPersistentChecksums(generateForUtil99Internal, [
+    andThenTasks: ["spotlessJava", "spotlessJavaApply"],
+    mustRunBefore: [ "compileJava" ]
+  ])
 }

@@ -65,10 +65,8 @@ configure(project(":lucene:analysis:icu")) {
     icupkg = file("${icuBinDir}/icupkg")
   }

-  // Resolve version lazily (can't resolve at configuration time).
-  def icu4jVersionProvider = project.provider { getVersion('com.ibm.icu', 'icu4j') }
-  // lazy gstring with ICU version.
-  def icu4jVersion = "${-> icu4jVersionProvider.get()}"
+  def icu4jVersion = deps.icu4j.get().version

   def icuCompileTask = Os.isFamily(Os.FAMILY_WINDOWS) ? "compileIcuWindows" : "compileIcuLinux"

@@ -22,10 +22,11 @@ import org.gradle.plugins.ide.eclipse.model.ClasspathEntry
 def resources = scriptResources(buildscript)

 configure(rootProject) {
-  plugins.withType(JavaPlugin) {
-    apply plugin: "eclipse"
+  if (gradle.startParameter.taskNames.contains("eclipse")) {
+    project.pluginManager.apply("java-base")
+    project.pluginManager.apply("eclipse")

-    def eclipseJavaVersion = propertyOrDefault("eclipse.javaVersion", rootProject.minJavaVersion)
+    def eclipseJavaVersion = propertyOrDefault("eclipse.javaVersion", deps.versions.minJava.get())
     def relativize = { other -> rootProject.rootDir.relativePath(other).toString() }

     eclipse {
@@ -105,9 +106,9 @@ configure(rootProject) {
       }
     }

-    eclipseJdt {
+    eclipseJdt {
       enabled = false
-      dependsOn 'luceneEclipse'
+      dependsOn 'luceneEclipseJdt'
     }

     eclipseClasspath {

@@ -75,6 +75,18 @@ configure(rootProject) {
     it.dependsOn(":versionCatalogFormatDeps")
   }

+  // correct crlf/ default encoding after version catalog formatting finishes.
+  tasks.matching {
+    it.path in [
+      ":versionCatalogFormatDeps"
+    ]
+  }.configureEach {
+    it.doLast {
+      ant.fixcrlf(file: it.catalogFile.get().asFile,
+        eol: "lf", fixlast: "true", encoding: "UTF-8")
+    }
+  }
+
   tasks.matching {
     it.path in [
       ":versionCatalogUpdateDeps"

|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
@defaultMessage Spawns threads with vague names; use a custom thread factory (Lucene's NamedThreadFactory, Solr's SolrNamedThreadFactory) and name threads so that you can tell (by its name) which executor it is associated with
|
||||
@defaultMessage Spawns threads with vague names; use a custom thread factory (Lucene's NamedThreadFactory) and name threads so that you can tell (by its name) which executor it is associated with
|
||||
java.util.concurrent.Executors#newFixedThreadPool(int)
|
||||
java.util.concurrent.Executors#newSingleThreadExecutor()
|
||||
java.util.concurrent.Executors#newCachedThreadPool()
|
||||
|
|
|
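For reference, the pattern the forbidden-apis rule above points to looks roughly like this sketch (not part of the commit; the pool size and the "my-searcher" prefix are made-up example values):

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.util.NamedThreadFactory;

public class NamedPoolSketch {
  public static void main(String[] args) throws InterruptedException {
    // Passing a NamedThreadFactory gives every worker a recognizable name,
    // so thread dumps show which executor a thread belongs to.
    ExecutorService pool = Executors.newFixedThreadPool(4, new NamedThreadFactory("my-searcher"));
    pool.submit(() -> System.out.println(Thread.currentThread().getName()));
    pool.shutdown();
    pool.awaitTermination(10, TimeUnit.SECONDS);
  }
}
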
@@ -20,6 +20,10 @@
 // 2) notice file
 // 3) checksum validation/ generation.

+// WARNING: The tasks in this file share internal state between tasks without using files.
+// Because of this all tasks here must always execute together, so they cannot define task outputs.
+// TODO: Rewrite the internal state to use state files containing the ext.jarInfos and its referencedFiles
+
 // This should be false only for debugging.
 def failOnError = true

@@ -194,13 +198,6 @@ subprojects {
       description = "Validate license and notice files of dependencies"
       dependsOn collectJarInfos

-      def outputFileName = 'validateJarLicenses'
-      inputs.dir(file(project.rootDir.path + '/lucene/licenses'))
-          .withPropertyName('licenses')
-          .withPathSensitivity(PathSensitivity.RELATIVE)
-      outputs.file(layout.buildDirectory.file(outputFileName))
-          .withPropertyName('validateJarLicensesResult')
-
       doLast {
         def errors = []
         jarInfos.each { dep ->
@@ -246,9 +243,7 @@ subprojects {
             }
           }
         }
-        // Required to take advantage of incremental building and the build cache
-        def f = new File(project.buildDir.path + "/" + outputFileName)
-        f.write(errors.toString(), "UTF-8")
+
        if (errors) {
          def msg = "Certain license/ notice files are missing:\n - " + errors.join("\n - ")
          if (failOnError) {

@@ -80,10 +80,6 @@ API Changes
 * GITHUB#12875: Ensure token position is always increased in PathHierarchyTokenizer and ReversePathHierarchyTokenizer
   and resulting tokens do not overlap. (Michael Froh, Lukáš Vlček)

-* GITHUB#12624, GITHUB#12831: Allow FSTCompiler to stream to any DataOutput while building, and
-  make compile() only return the FSTMetadata. For on-heap (default) use case, please use
-  FST.fromFSTReader(fstMetadata, fstCompiler.getFSTReader()) to create the FST. (Anh Dung Bui)
-
 * GITHUB#13146, GITHUB#13148: Remove ByteBufferIndexInput and only use MemorySegment APIs
   for MMapDirectory. (Uwe Schindler)

@@ -112,6 +108,11 @@ API Changes

 * GITHUB#13410: Removed Scorer#getWeight (Sanjay Dutt, Adrien Grand)

+* GITHUB#13499: Remove deprecated TopScoreDocCollector + TopFieldCollector methods (#create, #createSharedManager) (Jakub Slowinski)
+
+* GITHUB#13632: CandidateMatcher public matching functions (Bryan Jacobowitz)
+
+
 New Features
 ---------------------

@@ -133,6 +134,16 @@ New Features
   DocValuesSkipper abstraction. A new flag is added to FieldType.java that configures whether
   to create a "skip index" for doc values. (Ignacio Vera)

+* GITHUB#13563: Add levels to doc values skip index. (Ignacio Vera)
+
+* GITHUB#13597: Align doc value skipper interval boundaries when an interval contains a constant
+  value. (Ignacio Vera)
+
+* GITHUB#13604: Add Kmeans clustering on vectors (Mayya Sharipova, Jim Ferenczi, Tom Veasey)
+
+* GITHUB#13592: Take advantage of the doc value skipper when it is primary sort in SortedNumericDocValuesRangeQuery
+  and SortedSetDocValuesRangeQuery. (Ignacio Vera)
+
 Improvements
 ---------------------

@@ -168,6 +179,8 @@ Optimizations

 * GITHUB#12552: Make FSTPostingsFormat load FSTs off-heap. (Tony X)

+* GITHUB#13672: Leverage doc value skip lists in DocValuesRewriteMethod if indexed. (Greg Miller)
+
 Bug Fixes
 ---------------------

@@ -205,6 +218,9 @@ Changes in Backwards Compatibility Policy
 * GITHUB#13230: Remove the Kp and Lovins snowball algorithms which are not supported
   or intended for general use. (Robert Muir)

+* GITHUB#13602: SearchWithCollectorTask no longer supports the `collector.class` config parameter to load a custom
+  collector implementation. `collector.manager.class` allows users to load a collector manager instead. (Luca Cavanna)
+
 Other
 ---------------------

@ -243,22 +259,71 @@ Other
|
|||
|
||||
* GITHUB#13332: Improve MissingDoclet linter to check records correctly. (Uwe Schindler)
|
||||
|
||||
* GITHUB#13499: Remove usage of TopScoreDocCollector + TopFieldCollector deprecated methods (#create, #createSharedManager) (Jakub Slowinski)
|
||||
|
||||
Build
|
||||
---------------------
|
||||
|
||||
* GITHUB#13649: Fix eclipse ide settings generation #13649 (Uwe Schindler, Dawid Weiss)
|
||||
|
||||
======================== Lucene 9.12.0 =======================
|
||||
|
||||
API Changes
|
||||
---------------------
|
||||
|
||||
* GITHUB#13281: Mark COSINE VectorSimilarityFunction as deprecated. (Pulkit Gupta)
|
||||
|
||||
* GITHUB#13469: Expose FlatVectorsFormat as a first-class format; can be configured using a custom Codec. (Michael Sokolov)
|
||||
|
||||
* GITHUB#13612: Hunspell: add Suggester#proceedPastRep to avoid losing relevant suggestions. (Peter Gromov)
|
||||
|
||||
* GITHUB#13603: Introduced `IndexSearcher#searchLeaf(LeafReaderContext, Weight, Collector)` protected method to
|
||||
facilitate customizing per-leaf behavior of search without requiring to override
|
||||
`search(LeafReaderContext[], Weight, Collector)` which requires overriding the entire loop across the leaves (Luca Cavanna)
|
||||
|
||||
* GITHUB#13559: Add BitSet#nextSetBit(int, int) to get the index of the first set bit in range. (Egor Potemkin)
|
||||
|
||||
* GITHUB#13568: Add DoubleValuesSource#toSortableLongDoubleValuesSource and
|
||||
MultiDoubleValuesSource#toSortableMultiLongValuesSource methods. (Shradha Shankar)
|
||||
|
||||
* GITHUB#13568: Add CollectorOwner class that wraps CollectorManager, and handles list of Collectors and results.
|
||||
Add IndexSearcher#search method that takes CollectorOwner. (Egor Potemkin)
|
||||
|
||||
* GITHUB#13568: Add DrillSideways#search method that supports any collector types for any drill-sideways dimensions
|
||||
or drill-down. (Egor Potemkin)
|
||||
|
||||
New Features
|
||||
---------------------
|
||||
(No changes)
|
||||
|
||||
* GITHUB#13430: Allow configuring the search concurrency via
|
||||
TieredMergePolicy#setTargetSearchConcurrency. This in-turn instructs the
|
||||
merge policy to try to have at least this number of segments on the highest
|
||||
tier. (Adrien Grand, Carlos Delgado)
|
||||
|
||||
* GITHUB#13517: Allow configuring the search concurrency on LogDocMergePolicy
|
||||
and LogByteSizeMergePolicy via a new #setTargetConcurrency setter.
|
||||
(Adrien Grand)
|
||||
|
||||
* GITHUB#13568: Add sandbox facets module to compute facets while collecting. (Egor Potemkin, Shradha Shankar)
|
||||
|
||||
* GITHUB#13678: Add support JDK 23 to the Panama Vectorization Provider. (Chris Hegarty)
|
||||
|
||||
Improvements
|
||||
---------------------
|
||||
(No changes)
|
||||
|
||||
* GITHUB#13548: Refactor and javadoc update for KNN vector writer classes. (Patrick Zhai)
|
||||
|
||||
* GITHUB#13562: Add Intervals.regexp and Intervals.range methods to produce IntervalsSource
|
||||
for regexp and range queries. (Mayya Sharipova)
|
||||
|
||||
* GITHUB#13625: Remove BitSet#nextSetBit code duplication. (Greg Miller)
|
||||
|
||||
* GITHUB#13285: Early terminate graph searches of AbstractVectorSimilarityQuery to follow timeout set from
|
||||
IndexSearcher#setTimeout(QueryTimeout). (Kaival Parikh)
|
||||
|
||||
* GITHUB#13633: Add ability to read/write knn vector values to a MemoryIndex. (Ben Trent)
|
||||
|
||||
* GITHUB#12627: patch HNSW graphs to improve reachability of all nodes from entry points
|
||||
|
||||
* GITHUB#13201: Better cost estimation on MultiTermQuery over few terms. (Michael Froh)
|
||||
|
||||
Optimizations
|
||||
---------------------
|
||||
|
@ -277,16 +342,100 @@ Optimizations
|
|||
|
||||
* GITHUB#12941: Don't preserve auxiliary buffer contents in LSBRadixSorter if it grows. (Stefan Vodita)
|
||||
|
||||
* GITHUB#13175: Stop double-checking priority queue inserts in some FacetCount classes. (Jakub Slowinski)
|
||||
|
||||
* GITHUB#13538: Slightly reduce heap usage for HNSW and scalar quantized vector writers. (Ben Trent)
|
||||
|
||||
* GITHUB#12100: WordBreakSpellChecker.suggestWordBreaks now does a breadth first search, allowing it to return
|
||||
better matches with fewer evaluations (hossman)
|
||||
|
||||
* GITHUB#13582: Stop requiring MaxScoreBulkScorer's outer window from having at
|
||||
least INNER_WINDOW_SIZE docs. (Adrien Grand)
|
||||
|
||||
* GITHUB#13570, GITHUB#13574, GITHUB#13535: Avoid performance degradation with closing shared Arenas.
|
||||
Closing many individual index files can potentially lead to a degradation in execution performance.
|
||||
Index files are mmapped one-to-one with the JDK's foreign shared Arena. The JVM deoptimizes the top
|
||||
few frames of all threads when closing a shared Arena (see JDK-8335480). We mitigate this situation
|
||||
when running with JDK 21 and greater, by 1) using a confined Arena where appropriate, and 2) grouping
|
||||
files from the same segment to a single shared Arena.
|
||||
A system property has been added that allows to control the total maximum number of mmapped files
|
||||
that may be associated with a single shared Arena. For example, to set the max number of permits to
|
||||
256, pass the following on the command line
|
||||
-Dorg.apache.lucene.store.MMapDirectory.sharedArenaMaxPermits=256. Setting a value of 1 associates
|
||||
a single file to a single shared arena.
|
||||
(Chris Hegarty, Michael Gibney, Uwe Schindler)
|
||||
|
||||
* GITHUB#13585: Lucene912PostingsFormat, the new default postings format, now
|
||||
only has 2 levels of skip data, which are inlined into postings instead of
|
||||
being stored at the end of postings lists. This translates into better
|
||||
performance for queries that need skipping such as conjunctions.
|
||||
(Adrien Grand)
|
||||
|
||||
* GITHUB#13581: OnHeapHnswGraph no longer allocates a lock for every graph node (Mike Sokolov)
|
||||
|
||||
* GITHUB#13636, GITHUB#13658: Optimizations to the decoding logic of blocks of
|
||||
postings. (Adrien Grand, Uwe Schindler, Greg Miller)
|
||||
|
||||
* GITHUB##13644: Improve NumericComparator competitive iterator logic by comparing the missing value with the top
|
||||
value even after the hit queue is full (Pan Guixin)
|
||||
|
||||
* GITHUB#13587: Use Max WAND optimizations with ToParentBlockJoinQuery when using ScoreMode.Max (Mike Pellegrini)
|
||||
|
||||
Changes in runtime behavior
|
||||
---------------------
|
||||
|
||||
* GITHUB#13472: When an executor is provided to the IndexSearcher constructor, the searcher now executes tasks on the
|
||||
thread that invoked a search as well as its configured executor. Users should reduce the executor's thread-count by 1
|
||||
to retain the previous level of parallelism. Moreover, it is now possible to start searches from the same executor
|
||||
that is configured in the IndexSearcher without risk of deadlocking. A separate executor for starting searches is no
|
||||
longer required. (Armin Braun)
|
||||
|
||||
Bug Fixes
|
||||
---------------------
|
||||
|
||||
* GITHUB#13384: Fix highlighter to use longer passages instead of shorter individual terms. (Zack Kendall)
|
||||
|
||||
* GITHUB#13463: Address bug in MultiLeafKnnCollector causing #minCompetitiveSimilarity to stay artificially low in
|
||||
some corner cases. (Greg Miller)
|
||||
|
||||
* GITHUB#13553: Correct RamUsageEstimate for scalar quantized knn vector formats so that raw vectors are correctly
|
||||
accounted for. (Ben Trent)
|
||||
|
||||
* GITHUB#13615: Correct scalar quantization when used in conjunction with COSINE similarity. Vectors are normalized
|
||||
before quantization to ensure the cosine similarity is correctly calculated. (Ben Trent)
|
||||
|
||||
* GITHUB#13627: Fix race condition on flush for DWPT seqNo generation. (Ben Trent, Ao Li)
|
||||
|
||||
* GITHUB#13691: Fix incorrect exponent value in explain of SigmoidFunction. (Owais Kazi)
|
||||
|
||||
Build
|
||||
---------------------
|
||||
|
||||
* GITHUB#13695, GITHUB#13696: Fix Gradle build sometimes gives spurious "unreferenced license file" warnings.
|
||||
(Uwe Schindler)
|
||||
|
||||
Other
|
||||
--------------------
|
||||
(No changes)
|
||||
|
||||
======================== Lucene 9.11.1 =======================
|
||||
|
||||
Bug Fixes
|
||||
---------------------
|
||||
|
||||
* GITHUB#13498: Avoid performance regression by constructing lazily the PointTree in NumericComparator. (Ignacio Vera)
|
||||
|
||||
* GITHUB#13501, GITHUB#13478: Remove intra-merge parallelism for everything except HNSW graph merges. (Ben Trent)
|
||||
|
||||
* GITHUB#13498, GITHUB#13340: Allow adding a parent field to an index with no fields (Michael Sokolov)
|
||||
|
||||
* GITHUB#12431: Fix IndexOutOfBoundsException thrown in DefaultPassageFormatter
|
||||
by unordered matches. (Stephane Campinas)
|
||||
|
||||
* GITHUB#13493: StringValueFacetCounts stops throwing NPE when faceting over an empty match-set. (Grebennikov Roman,
|
||||
Stefan Vodita)
|
||||
|
||||
|
||||
======================== Lucene 9.11.0 =======================
|
||||
|
||||
API Changes
|
||||
|
@@ -494,6 +643,10 @@ API Changes

 * GITHUB#12854: Mark DrillSideways#createDrillDownFacetsCollector as @Deprecated. (Greg Miller)

+* GITHUB#12624, GITHUB#12831: Allow FSTCompiler to stream to any DataOutput while building, and
+  make compile() only return the FSTMetadata. For on-heap (default) use case, please use
+  FST.fromFSTReader(fstMetadata, fstCompiler.getFSTReader()) to create the FST. (Anh Dung Bui)
+
 New Features
 ---------------------
 * GITHUB#12679: Add support for similarity-based vector searches using [Byte|Float]VectorSimilarityQuery. Uses a new
||||
|
@@ -501,6 +654,12 @@ New Features
   better-scoring nodes are available, or the best candidate is below a score of `traversalSimilarity` in the lowest
   level. (Aditya Prakash, Kaival Parikh)

+* GITHUB#12829: For indices newly created as of 9.10.0 onwards, IndexWriter preserves document blocks indexed via
+  IndexWriter#addDocuments or IndexWriter#updateDocuments also when index sorting is configured. Document blocks are
+  maintained alongside their parent documents during sort and merge. IndexWriterConfig accepts a parent field that is used
+  to maintain block orders if index sorting is used. Note, this is fully optional in Lucene 9.x while will be mandatory for
+  indices that use document blocks together with index sorting as of 10.0.0. (Simon Willnauer)
+
 * GITHUB#12336: Index additional data per facet label in the taxonomy. (Shai Erera, Egor Potemkin, Mike McCandless,
   Stefan Vodita)

|
@@ -592,7 +751,6 @@ Build

 Other
 ---------------------

 * GITHUB#11023: Removing some dead code in CheckIndex. (Jakub Slowinski)

 * GITHUB#11023: Removing @lucene.experimental tags in testXXX methods in CheckIndex. (Jakub Slowinski)
||||
|
|
|
@@ -1,5 +1,5 @@
 {
   "gradle/generation/jflex/skeleton.default.txt": "58944f66c9113a940dfaf6a17210ec8219024390",
-  "lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.java": "1f7a446f3483326385eef257cea8366c27da0850",
+  "lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.java": "e62dcd8c25219d8f5d783823b228ffe38d2bacde",
   "lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.jflex": "f52109bb7d5701979fde90aeeeda726246a8d5fd"
 }
|
@@ -1,5 +1,5 @@
 {
   "gradle/generation/jflex/skeleton.default.txt": "58944f66c9113a940dfaf6a17210ec8219024390",
-  "lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java": "ac298e08bc5b96202efca0c01f9f0376fda976bd",
+  "lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java": "2b5df5ff35543a6380c82f298225eb5fa06e4453",
   "lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex": "0b8c7774b98e8237702013e82c352d4711509bd0"
 }
|
@ -37,23 +37,23 @@ class BengaliNormalizer {
|
|||
|
||||
for (int i = 0; i < len; i++) {
|
||||
switch (s[i]) {
|
||||
// delete Chandrabindu
|
||||
// delete Chandrabindu
|
||||
case '\u0981':
|
||||
len = delete(s, i, len);
|
||||
i--;
|
||||
break;
|
||||
|
||||
// DirghoI kar -> RosshoI kar
|
||||
// DirghoI kar -> RosshoI kar
|
||||
case '\u09C0':
|
||||
s[i] = '\u09BF';
|
||||
break;
|
||||
|
||||
// DirghoU kar -> RosshoU kar
|
||||
// DirghoU kar -> RosshoU kar
|
||||
case '\u09C2':
|
||||
s[i] = '\u09C1';
|
||||
break;
|
||||
|
||||
// Khio (Ka + Hoshonto + Murdorno Sh)
|
||||
// Khio (Ka + Hoshonto + Murdorno Sh)
|
||||
case '\u0995':
|
||||
if (i + 2 < len && s[i + 1] == '\u09CD' && s[i + 2] == '\u09BF') {
|
||||
if (i == 0) {
|
||||
|
@ -67,12 +67,12 @@ class BengaliNormalizer {
|
|||
}
|
||||
break;
|
||||
|
||||
// Nga to Anusvara
|
||||
// Nga to Anusvara
|
||||
case '\u0999':
|
||||
s[i] = '\u0982';
|
||||
break;
|
||||
|
||||
// Ja Phala
|
||||
// Ja Phala
|
||||
case '\u09AF':
|
||||
if (i - 2 == 0 && s[i - 1] == '\u09CD') {
|
||||
s[i - 1] = '\u09C7';
|
||||
|
@ -89,7 +89,7 @@ class BengaliNormalizer {
|
|||
}
|
||||
break;
|
||||
|
||||
// Ba Phalaa
|
||||
// Ba Phalaa
|
||||
case '\u09AC':
|
||||
if ((i >= 1 && s[i - 1] != '\u09CD') || i == 0) {
|
||||
break;
|
||||
|
@ -109,7 +109,7 @@ class BengaliNormalizer {
|
|||
}
|
||||
break;
|
||||
|
||||
// Visarga
|
||||
// Visarga
|
||||
case '\u0983':
|
||||
if (i == len - 1) {
|
||||
if (len <= 3) {
|
||||
|
@ -122,18 +122,18 @@ class BengaliNormalizer {
|
|||
}
|
||||
break;
|
||||
|
||||
// All sh
|
||||
// All sh
|
||||
case '\u09B6':
|
||||
case '\u09B7':
|
||||
s[i] = '\u09B8';
|
||||
break;
|
||||
|
||||
// check na
|
||||
// check na
|
||||
case '\u09A3':
|
||||
s[i] = '\u09A8';
|
||||
break;
|
||||
|
||||
// check ra
|
||||
// check ra
|
||||
case '\u09DC':
|
||||
case '\u09DD':
|
||||
s[i] = '\u09B0';
|
||||
|
|
|
@ -747,70 +747,70 @@ class ClassicTokenizerImpl {
|
|||
/* Break so we don't hit fall-through warning: */
|
||||
break; /* ignore */
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 11:
|
||||
break;
|
||||
case 2:
|
||||
{
|
||||
return ALPHANUM;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 12:
|
||||
break;
|
||||
case 3:
|
||||
{
|
||||
return CJ;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 13:
|
||||
break;
|
||||
case 4:
|
||||
{
|
||||
return NUM;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 14:
|
||||
break;
|
||||
case 5:
|
||||
{
|
||||
return HOST;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 15:
|
||||
break;
|
||||
case 6:
|
||||
{
|
||||
return COMPANY;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 16:
|
||||
break;
|
||||
case 7:
|
||||
{
|
||||
return APOSTROPHE;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 17:
|
||||
break;
|
||||
case 8:
|
||||
{
|
||||
return ACRONYM_DEP;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 18:
|
||||
break;
|
||||
case 9:
|
||||
{
|
||||
return ACRONYM;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 19:
|
||||
break;
|
||||
case 10:
|
||||
{
|
||||
return EMAIL;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 20:
|
||||
break;
|
||||
default:
|
||||
|
|
|
@ -53,18 +53,18 @@ public final class GreekLowerCaseFilter extends TokenFilter {
|
|||
|
||||
private int lowerCase(int codepoint) {
|
||||
switch (codepoint) {
|
||||
/* There are two lowercase forms of sigma:
|
||||
* U+03C2: small final sigma (end of word)
|
||||
* U+03C3: small sigma (otherwise)
|
||||
*
|
||||
* Standardize both to U+03C3
|
||||
*/
|
||||
/* There are two lowercase forms of sigma:
|
||||
* U+03C2: small final sigma (end of word)
|
||||
* U+03C3: small sigma (otherwise)
|
||||
*
|
||||
* Standardize both to U+03C3
|
||||
*/
|
||||
case '\u03C2': /* small final sigma */
|
||||
return '\u03C3'; /* small sigma */
|
||||
|
||||
/* Some greek characters contain diacritics.
|
||||
* This filter removes these, converting to the lowercase base form.
|
||||
*/
|
||||
/* Some greek characters contain diacritics.
|
||||
* This filter removes these, converting to the lowercase base form.
|
||||
*/
|
||||
|
||||
case '\u0386': /* capital alpha with tonos */
|
||||
case '\u03AC': /* small alpha with tonos */
|
||||
|
@ -100,9 +100,9 @@ public final class GreekLowerCaseFilter extends TokenFilter {
|
|||
case '\u03CE': /* small omega with tonos */
|
||||
return '\u03C9'; /* small omega */
|
||||
|
||||
/* The previous implementation did the conversion below.
|
||||
* Only implemented for backwards compatibility with old indexes.
|
||||
*/
|
||||
/* The previous implementation did the conversion below.
|
||||
* Only implemented for backwards compatibility with old indexes.
|
||||
*/
|
||||
|
||||
case '\u03A2': /* reserved */
|
||||
return '\u03C2'; /* small final sigma */
|
||||
|
|
|
@ -456,7 +456,7 @@ class PorterStemmer {
|
|||
/* j >= 0 fixes Bug 2 */
|
||||
if (ends("ou")) break;
|
||||
return;
|
||||
/* takes care of -ous */
|
||||
/* takes care of -ous */
|
||||
case 's':
|
||||
if (ends("ism")) break;
|
||||
return;
|
||||
|
|
|
@ -67,7 +67,7 @@ public final class IrishLowerCaseFilter extends TokenFilter {
|
|||
case 'I':
|
||||
case 'O':
|
||||
case 'U':
|
||||
// vowels with acute accent (fada)
|
||||
// vowels with acute accent (fada)
|
||||
case '\u00c1':
|
||||
case '\u00c9':
|
||||
case '\u00cd':
|
||||
|
|
|
@ -47,18 +47,18 @@ class HindiNormalizer {
|
|||
|
||||
for (int i = 0; i < len; i++) {
|
||||
switch (s[i]) {
|
||||
// dead n -> bindu
|
||||
// dead n -> bindu
|
||||
case '\u0928':
|
||||
if (i + 1 < len && s[i + 1] == '\u094D') {
|
||||
s[i] = '\u0902';
|
||||
len = delete(s, i + 1, len);
|
||||
}
|
||||
break;
|
||||
// candrabindu -> bindu
|
||||
// candrabindu -> bindu
|
||||
case '\u0901':
|
||||
s[i] = '\u0902';
|
||||
break;
|
||||
// nukta deletions
|
||||
// nukta deletions
|
||||
case '\u093C':
|
||||
len = delete(s, i, len);
|
||||
i--;
|
||||
|
@ -96,18 +96,18 @@ class HindiNormalizer {
|
|||
case '\u095F':
|
||||
s[i] = '\u092F';
|
||||
break;
|
||||
// zwj/zwnj -> delete
|
||||
// zwj/zwnj -> delete
|
||||
case '\u200D':
|
||||
case '\u200C':
|
||||
len = delete(s, i, len);
|
||||
i--;
|
||||
break;
|
||||
// virama -> delete
|
||||
// virama -> delete
|
||||
case '\u094D':
|
||||
len = delete(s, i, len);
|
||||
i--;
|
||||
break;
|
||||
// chandra/short -> replace
|
||||
// chandra/short -> replace
|
||||
case '\u0945':
|
||||
case '\u0946':
|
||||
s[i] = '\u0947';
|
||||
|
@ -127,7 +127,7 @@ class HindiNormalizer {
|
|||
case '\u0972':
|
||||
s[i] = '\u0905';
|
||||
break;
|
||||
// long -> short ind. vowels
|
||||
// long -> short ind. vowels
|
||||
case '\u0906':
|
||||
s[i] = '\u0905';
|
||||
break;
|
||||
|
@ -149,7 +149,7 @@ class HindiNormalizer {
|
|||
case '\u0914':
|
||||
s[i] = '\u0913';
|
||||
break;
|
||||
// long -> short dep. vowels
|
||||
// long -> short dep. vowels
|
||||
case '\u0940':
|
||||
s[i] = '\u093F';
|
||||
break;
|
||||
|
|
|
@@ -31,6 +31,7 @@ class ModifyingSuggester {
   private final String misspelled;
   private final WordCase wordCase;
   private final FragmentChecker fragmentChecker;
+  private final boolean proceedPastRep;
   private final char[] tryChars;
   private final Hunspell speller;

@@ -39,13 +40,15 @@ class ModifyingSuggester {
       LinkedHashSet<Suggestion> result,
       String misspelled,
       WordCase wordCase,
-      FragmentChecker checker) {
+      FragmentChecker checker,
+      boolean proceedPastRep) {
     this.speller = speller;
     tryChars = speller.dictionary.tryChars.toCharArray();
     this.result = result;
     this.misspelled = misspelled;
     this.wordCase = wordCase;
     fragmentChecker = checker;
+    this.proceedPastRep = proceedPastRep;
   }

   /**
@@ -125,9 +128,9 @@ class ModifyingSuggester {
     boolean hasGoodSuggestions = trySuggestion(word.toUpperCase(Locale.ROOT));

     GradedSuggestions repResult = tryRep(word);
-    if (repResult == GradedSuggestions.Best) return true;
+    if (repResult == GradedSuggestions.Best && !proceedPastRep) return true;

-    hasGoodSuggestions |= repResult == GradedSuggestions.Normal;
+    hasGoodSuggestions |= repResult != GradedSuggestions.None;

     if (!speller.dictionary.mapTable.isEmpty()) {
       enumerateMapReplacements(word, "", 0);

|
|
@ -53,16 +53,21 @@ public class Suggester {
|
|||
private final Dictionary dictionary;
|
||||
private final SuggestibleEntryCache suggestibleCache;
|
||||
private final FragmentChecker fragmentChecker;
|
||||
private final boolean proceedPastRep;
|
||||
|
||||
public Suggester(Dictionary dictionary) {
|
||||
this(dictionary, null, FragmentChecker.EVERYTHING_POSSIBLE);
|
||||
this(dictionary, null, FragmentChecker.EVERYTHING_POSSIBLE, false);
|
||||
}
|
||||
|
||||
private Suggester(
|
||||
Dictionary dictionary, SuggestibleEntryCache suggestibleCache, FragmentChecker checker) {
|
||||
Dictionary dictionary,
|
||||
SuggestibleEntryCache suggestibleCache,
|
||||
FragmentChecker checker,
|
||||
boolean proceedPastRep) {
|
||||
this.dictionary = dictionary;
|
||||
this.suggestibleCache = suggestibleCache;
|
||||
this.fragmentChecker = checker;
|
||||
this.proceedPastRep = proceedPastRep;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -71,8 +76,8 @@ public class Suggester {
|
|||
* entries are stored as fast-to-iterate plain words instead of highly compressed prefix trees.
|
||||
*/
|
||||
public Suggester withSuggestibleEntryCache() {
|
||||
return new Suggester(
|
||||
dictionary, SuggestibleEntryCache.buildCache(dictionary.words), fragmentChecker);
|
||||
SuggestibleEntryCache cache = SuggestibleEntryCache.buildCache(dictionary.words);
|
||||
return new Suggester(dictionary, cache, fragmentChecker, proceedPastRep);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -80,7 +85,17 @@ public class Suggester {
|
|||
* the performance of the "Modification" phase performance.
|
||||
*/
|
||||
public Suggester withFragmentChecker(FragmentChecker checker) {
|
||||
return new Suggester(dictionary, suggestibleCache, checker);
|
||||
return new Suggester(dictionary, suggestibleCache, checker, proceedPastRep);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a copy of this suggester instance that doesn't stop after encountering acceptable words
|
||||
* after applying REP rules. By default, Hunspell stops when it finds any, but this behavior may
|
||||
* not always be desirable, e.g., if we have "REP i ea", "tims" be replaced only by "teams" and
|
||||
* not "times", which could also be meant.
|
||||
*/
|
||||
public Suggester proceedPastRep() {
|
||||
return new Suggester(dictionary, suggestibleCache, fragmentChecker, true);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -174,7 +189,8 @@ public class Suggester {
|
|||
}
|
||||
|
||||
boolean hasGoodSuggestions =
|
||||
new ModifyingSuggester(suggestionSpeller, suggestions, word, wordCase, fragmentChecker)
|
||||
new ModifyingSuggester(
|
||||
suggestionSpeller, suggestions, word, wordCase, fragmentChecker, proceedPastRep)
|
||||
.suggest();
|
||||
|
||||
if (!hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
|
||||
|
|
|
@ -194,7 +194,7 @@ public final class WordDelimiterIterator {
|
|||
|
||||
int type = charType(text[current]);
|
||||
switch (type) {
|
||||
// return ALPHA word type for both lower and upper
|
||||
// return ALPHA word type for both lower and upper
|
||||
case LOWER:
|
||||
case UPPER:
|
||||
return ALPHA;
|
||||
|
@ -332,27 +332,27 @@ public final class WordDelimiterIterator {
|
|||
case Character.OTHER_NUMBER:
|
||||
return DIGIT;
|
||||
|
||||
// case Character.SPACE_SEPARATOR:
|
||||
// case Character.LINE_SEPARATOR:
|
||||
// case Character.PARAGRAPH_SEPARATOR:
|
||||
// case Character.CONTROL:
|
||||
// case Character.FORMAT:
|
||||
// case Character.PRIVATE_USE:
|
||||
// case Character.SPACE_SEPARATOR:
|
||||
// case Character.LINE_SEPARATOR:
|
||||
// case Character.PARAGRAPH_SEPARATOR:
|
||||
// case Character.CONTROL:
|
||||
// case Character.FORMAT:
|
||||
// case Character.PRIVATE_USE:
|
||||
|
||||
case Character.SURROGATE: // prevent splitting
|
||||
return ALPHA | DIGIT;
|
||||
|
||||
// case Character.DASH_PUNCTUATION:
|
||||
// case Character.START_PUNCTUATION:
|
||||
// case Character.END_PUNCTUATION:
|
||||
// case Character.CONNECTOR_PUNCTUATION:
|
||||
// case Character.OTHER_PUNCTUATION:
|
||||
// case Character.MATH_SYMBOL:
|
||||
// case Character.CURRENCY_SYMBOL:
|
||||
// case Character.MODIFIER_SYMBOL:
|
||||
// case Character.OTHER_SYMBOL:
|
||||
// case Character.INITIAL_QUOTE_PUNCTUATION:
|
||||
// case Character.FINAL_QUOTE_PUNCTUATION:
|
||||
// case Character.DASH_PUNCTUATION:
|
||||
// case Character.START_PUNCTUATION:
|
||||
// case Character.END_PUNCTUATION:
|
||||
// case Character.CONNECTOR_PUNCTUATION:
|
||||
// case Character.OTHER_PUNCTUATION:
|
||||
// case Character.MATH_SYMBOL:
|
||||
// case Character.CURRENCY_SYMBOL:
|
||||
// case Character.MODIFIER_SYMBOL:
|
||||
// case Character.OTHER_SYMBOL:
|
||||
// case Character.INITIAL_QUOTE_PUNCTUATION:
|
||||
// case Character.FINAL_QUOTE_PUNCTUATION:
|
||||
|
||||
default:
|
||||
return SUBWORD_DELIM;
|
||||
|
|
|
@ -38,25 +38,25 @@ class TeluguNormalizer {
|
|||
|
||||
for (int i = 0; i < len; i++) {
|
||||
switch (s[i]) {
|
||||
// candrabindu (ఀ and ఁ) -> bindu (ం)
|
||||
// candrabindu (ఀ and ఁ) -> bindu (ం)
|
||||
case '\u0C00': // ఀ
|
||||
case '\u0C01': // ఁ
|
||||
s[i] = '\u0C02'; // ం
|
||||
break;
|
||||
// delete visarga (ః)
|
||||
// delete visarga (ః)
|
||||
case '\u0C03':
|
||||
len = delete(s, i, len);
|
||||
i--;
|
||||
break;
|
||||
|
||||
// zwj/zwnj -> delete
|
||||
// zwj/zwnj -> delete
|
||||
case '\u200D':
|
||||
case '\u200C':
|
||||
len = delete(s, i, len);
|
||||
i--;
|
||||
break;
|
||||
|
||||
// long -> short vowels
|
||||
// long -> short vowels
|
||||
case '\u0C14': // ఔ
|
||||
s[i] = '\u0C13'; // ఓ
|
||||
break;
|
||||
|
@ -73,7 +73,7 @@ class TeluguNormalizer {
|
|||
s[i] = '\u0C09'; // ఉ
|
||||
break;
|
||||
|
||||
// long -> short vowels matras
|
||||
// long -> short vowels matras
|
||||
case '\u0C40': // ీ
|
||||
s[i] = '\u0C3F'; // ి
|
||||
break;
|
||||
|
@ -86,14 +86,14 @@ class TeluguNormalizer {
|
|||
case '\u0C4B': // ో
|
||||
s[i] = '\u0C4A'; // ొ
|
||||
break;
|
||||
// decomposed dipthong (ె + ౖ) -> precomposed diphthong vowel sign (ై)
|
||||
// decomposed dipthong (ె + ౖ) -> precomposed diphthong vowel sign (ై)
|
||||
case '\u0C46':
|
||||
if (i + 1 < len && s[i + 1] == '\u0C56') {
|
||||
s[i] = '\u0C48';
|
||||
len = delete(s, i + 1, len);
|
||||
}
|
||||
break;
|
||||
// composed oo or au -> oo or au
|
||||
// composed oo or au -> oo or au
|
||||
case '\u0C12':
|
||||
if (i + 1 < len && s[i + 1] == '\u0C55') {
|
||||
// (ఒ + ౕ) -> oo (ఓ)
|
||||
|
|
|
@ -61,12 +61,12 @@ public final class TurkishLowerCaseFilter extends TokenFilter {
|
|||
|
||||
if (iOrAfter) { // all the special I turkish handling happens here.
|
||||
switch (ch) {
|
||||
// remove COMBINING_DOT_ABOVE to mimic composed lowercase
|
||||
// remove COMBINING_DOT_ABOVE to mimic composed lowercase
|
||||
case COMBINING_DOT_ABOVE:
|
||||
length = delete(buffer, i, length);
|
||||
continue;
|
||||
// i itself, it depends if it is followed by COMBINING_DOT_ABOVE
|
||||
// if it is, we will make it small i and later remove the dot
|
||||
// i itself, it depends if it is followed by COMBINING_DOT_ABOVE
|
||||
// if it is, we will make it small i and later remove the dot
|
||||
case LATIN_CAPITAL_LETTER_I:
|
||||
if (isBeforeDot(buffer, i + 1, length)) {
|
||||
buffer[i] = LATIN_SMALL_LETTER_I;
|
||||
|
|
|
@ -901,7 +901,7 @@ class WikipediaTokenizerImpl {
|
|||
positionInc = 1; /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 47:
|
||||
break;
|
||||
case 2:
|
||||
|
@ -909,7 +909,7 @@ class WikipediaTokenizerImpl {
|
|||
positionInc = 1;
|
||||
return ALPHANUM;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 48:
|
||||
break;
|
||||
case 3:
|
||||
|
@ -920,7 +920,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 49:
|
||||
break;
|
||||
case 4:
|
||||
|
@ -928,7 +928,7 @@ class WikipediaTokenizerImpl {
|
|||
positionInc = 1;
|
||||
return CJ;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 50:
|
||||
break;
|
||||
case 5:
|
||||
|
@ -936,7 +936,7 @@ class WikipediaTokenizerImpl {
|
|||
positionInc = 1; /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 51:
|
||||
break;
|
||||
case 6:
|
||||
|
@ -945,7 +945,7 @@ class WikipediaTokenizerImpl {
|
|||
numWikiTokensSeen++;
|
||||
return currentTokType;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 52:
|
||||
break;
|
||||
case 7:
|
||||
|
@ -954,7 +954,7 @@ class WikipediaTokenizerImpl {
|
|||
numWikiTokensSeen++;
|
||||
return currentTokType;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 53:
|
||||
break;
|
||||
case 8:
|
||||
|
@ -962,7 +962,7 @@ class WikipediaTokenizerImpl {
|
|||
/* Break so we don't hit fall-through warning: */
|
||||
break; /* ignore */
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 54:
|
||||
break;
|
||||
case 9:
|
||||
|
@ -978,7 +978,7 @@ class WikipediaTokenizerImpl {
|
|||
numLinkToks++;
|
||||
return currentTokType;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 55:
|
||||
break;
|
||||
case 10:
|
||||
|
@ -988,7 +988,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 56:
|
||||
break;
|
||||
case 11:
|
||||
|
@ -997,7 +997,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 57:
|
||||
break;
|
||||
case 12:
|
||||
|
@ -1007,7 +1007,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(STRING);
|
||||
return currentTokType; /*italics*/
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 58:
|
||||
break;
|
||||
case 13:
|
||||
|
@ -1017,7 +1017,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 59:
|
||||
break;
|
||||
case 14:
|
||||
|
@ -1026,7 +1026,7 @@ class WikipediaTokenizerImpl {
|
|||
numWikiTokensSeen++;
|
||||
return currentTokType;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 60:
|
||||
break;
|
||||
case 15:
|
||||
|
@ -1036,7 +1036,7 @@ class WikipediaTokenizerImpl {
|
|||
numWikiTokensSeen++;
|
||||
return currentTokType;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 61:
|
||||
break;
|
||||
case 16:
|
||||
|
@ -1046,7 +1046,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(STRING); /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 62:
|
||||
break;
|
||||
case 17:
|
||||
|
@ -1055,7 +1055,7 @@ class WikipediaTokenizerImpl {
|
|||
numWikiTokensSeen = 0;
|
||||
return currentTokType;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 63:
|
||||
break;
|
||||
case 18:
|
||||
|
@ -1063,7 +1063,7 @@ class WikipediaTokenizerImpl {
|
|||
/* Break so we don't hit fall-through warning: */
|
||||
break; /* ignore STRING */
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 64:
|
||||
break;
|
||||
case 19:
|
||||
|
@ -1072,7 +1072,7 @@ class WikipediaTokenizerImpl {
|
|||
numWikiTokensSeen++;
|
||||
return currentTokType; /* STRING ALPHANUM*/
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 65:
|
||||
break;
|
||||
case 20:
|
||||
|
@ -1083,7 +1083,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 66:
|
||||
break;
|
||||
case 21:
|
||||
|
@ -1091,7 +1091,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(STRING);
|
||||
return currentTokType; /*pipe*/
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 67:
|
||||
break;
|
||||
case 22:
|
||||
|
@ -1106,7 +1106,7 @@ class WikipediaTokenizerImpl {
|
|||
} /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 68:
|
||||
break;
|
||||
case 23:
|
||||
|
@ -1116,7 +1116,7 @@ class WikipediaTokenizerImpl {
|
        yybegin(DOUBLE_EQUALS_STATE); /* Break so we don't hit fall-through warning: */
        break;
      }
      // fall through
    case 69:
      break;
    case 24:

@@ -1127,7 +1127,7 @@ class WikipediaTokenizerImpl {
        yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
        break;
      }
      // fall through
    case 70:
      break;
    case 25:

@@ -1138,7 +1138,7 @@ class WikipediaTokenizerImpl {
        yybegin(DOUBLE_BRACE_STATE); /* Break so we don't hit fall-through warning: */
        break;
      }
      // fall through
    case 71:
      break;
    case 26:

@@ -1146,7 +1146,7 @@ class WikipediaTokenizerImpl {
        yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
        break;
      }
      // fall through
    case 72:
      break;
    case 27:

@@ -1155,7 +1155,7 @@ class WikipediaTokenizerImpl {
        yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
        break;
      }
      // fall through
    case 73:
      break;
    case 28:

@@ -1165,7 +1165,7 @@ class WikipediaTokenizerImpl {
        yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
        break;
      }
      // fall through
    case 74:
      break;
    case 29:

@@ -1175,7 +1175,7 @@ class WikipediaTokenizerImpl {
        yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
        break;
      }
      // fall through
    case 75:
      break;
    case 30:

@@ -1183,7 +1183,7 @@ class WikipediaTokenizerImpl {
        yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
        break;
      }
      // fall through
    case 76:
      break;
    case 31:

@@ -1193,7 +1193,7 @@ class WikipediaTokenizerImpl {
        yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
        break; /*end italics*/
      }
      // fall through
    case 77:
      break;
    case 32:

@@ -1204,7 +1204,7 @@ class WikipediaTokenizerImpl {
        yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
        break;
      }
      // fall through
    case 78:
      break;
    case 33:

@@ -1212,7 +1212,7 @@ class WikipediaTokenizerImpl {
        positionInc = 1;
        return NUM;
      }
      // fall through
    case 79:
      break;
    case 34:

@@ -1220,7 +1220,7 @@ class WikipediaTokenizerImpl {
        positionInc = 1;
        return COMPANY;
      }
      // fall through
    case 80:
      break;
    case 35:

@@ -1228,7 +1228,7 @@ class WikipediaTokenizerImpl {
        positionInc = 1;
        return APOSTROPHE;
      }
      // fall through
    case 81:
      break;
    case 36:

@@ -1236,7 +1236,7 @@ class WikipediaTokenizerImpl {
        positionInc = 1;
        return HOST;
      }
      // fall through
    case 82:
      break;
    case 37:

@@ -1245,7 +1245,7 @@ class WikipediaTokenizerImpl {
        yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */
        break;
      }
      // fall through
    case 83:
      break;
    case 38:

@@ -1255,7 +1255,7 @@ class WikipediaTokenizerImpl {
        yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
        break; /*end bold*/
      }
      // fall through
    case 84:
      break;
    case 39:

@@ -1265,7 +1265,7 @@ class WikipediaTokenizerImpl {
        yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
        break; /*end sub header*/
      }
      // fall through
    case 85:
      break;
    case 40:

@@ -1273,7 +1273,7 @@ class WikipediaTokenizerImpl {
        positionInc = 1;
        return ACRONYM;
      }
      // fall through
    case 86:
      break;
    case 41:

@@ -1281,7 +1281,7 @@ class WikipediaTokenizerImpl {
        positionInc = 1;
        return EMAIL;
      }
      // fall through
    case 87:
      break;
    case 42:

@@ -1291,7 +1291,7 @@ class WikipediaTokenizerImpl {
        yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
        break; /*end bold italics*/
      }
      // fall through
    case 88:
      break;
    case 43:

@@ -1301,7 +1301,7 @@ class WikipediaTokenizerImpl {
        yybegin(EXTERNAL_LINK_STATE);
        return currentTokType;
      }
      // fall through
    case 89:
      break;
    case 44:

@@ -1312,7 +1312,7 @@ class WikipediaTokenizerImpl {
        yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */
        break;
      }
      // fall through
    case 90:
      break;
    case 45:

@@ -1322,7 +1322,7 @@ class WikipediaTokenizerImpl {
        yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */
        break;
      }
      // fall through
    case 91:
      break;
    case 46:

@@ -1333,7 +1333,7 @@ class WikipediaTokenizerImpl {
        yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */
        break;
      }
      // fall through
    case 92:
      break;
    default:

@@ -59,6 +59,14 @@ public class TestSpellChecking extends LuceneTestCase {

  public void testRepSuggestions() throws Exception {
    doTest("rep");

    //noinspection DataFlowIssue
    Path aff = Path.of(getClass().getResource("rep.aff").toURI());
    Dictionary dictionary = TestAllDictionaries.loadDictionary(aff);
    Suggester suggester = new Suggester(dictionary);
    assertEquals(List.of("auto's"), suggester.suggestNoTimeout("autos", () -> {}));
    assertEquals(
        List.of("auto's", "auto"), suggester.proceedPastRep().suggestNoTimeout("autos", () -> {}));
  }

  public void testPhSuggestions() throws Exception {

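The new test exercises Hunspell REP-table suggestions through the Suggester API. A minimal sketch of the same calls outside the test harness is below; it only uses methods visible in the hunk, and it takes an already-loaded Dictionary because the dictionary loading in the test relies on the test-only TestAllDictionaries helper.

import java.util.List;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.Suggester;

class RepSuggestionsSketch {
  // `dictionary` is assumed to be built from rep.aff / rep.dic by the caller.
  static void demo(Dictionary dictionary) {
    Suggester suggester = new Suggester(dictionary);
    // Plain suggestions stop at the REP-table correction ...
    List<String> basic = suggester.suggestNoTimeout("autos", () -> {}); // ["auto's"]
    // ... while proceedPastRep() keeps collecting candidates past the REP hit.
    List<String> more =
        suggester.proceedPastRep().suggestNoTimeout("autos", () -> {}); // ["auto's", "auto"]
    System.out.println(basic + " vs " + more);
  }
}
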
@@ -245,7 +245,7 @@ public class Diff {
          deletes++;
          x--;
          break;
        // delete
        case Y:
          if (deletes != base) {
            result.append('D').append(deletes);

@@ -258,7 +258,7 @@ public class Diff {
          result.append('I');
          result.append(b.charAt(--y));
          break;
        // insert
        case R:
          if (deletes != base) {
            result.append('D').append(deletes);

@@ -272,7 +272,7 @@ public class Diff {
          result.append(b.charAt(--y));
          x--;
          break;
        // replace
        case D:
          if (deletes != base) {
            result.append('D').append(deletes);

@@ -0,0 +1,4 @@
{
  "lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/ForUtil.java": "f31797842f047626df6a1a6b97167bec60269fec",
  "lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/gen_ForUtil.py": "325f2610974b0e76e278b6445405a098a3763feb"
}

@@ -35,6 +35,7 @@ module org.apache.lucene.backward_codecs {
  exports org.apache.lucene.backward_codecs.lucene92;
  exports org.apache.lucene.backward_codecs.lucene94;
  exports org.apache.lucene.backward_codecs.lucene95;
  exports org.apache.lucene.backward_codecs.lucene99;
  exports org.apache.lucene.backward_codecs.packed;
  exports org.apache.lucene.backward_codecs.store;

@@ -43,7 +44,8 @@ module org.apache.lucene.backward_codecs {
  provides org.apache.lucene.codecs.PostingsFormat with
      org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat,
      org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat,
      org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat;
      org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat,
      org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat;
  provides org.apache.lucene.codecs.KnnVectorsFormat with
      org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat,
      org.apache.lucene.backward_codecs.lucene91.Lucene91HnswVectorsFormat,

@@ -59,5 +61,6 @@ module org.apache.lucene.backward_codecs {
      org.apache.lucene.backward_codecs.lucene91.Lucene91Codec,
      org.apache.lucene.backward_codecs.lucene92.Lucene92Codec,
      org.apache.lucene.backward_codecs.lucene94.Lucene94Codec,
      org.apache.lucene.backward_codecs.lucene95.Lucene95Codec;
      org.apache.lucene.backward_codecs.lucene95.Lucene95Codec,
      org.apache.lucene.backward_codecs.lucene99.Lucene99Codec;
}

@@ -88,21 +88,17 @@ public final class FieldReader extends Terms {
            (new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong()
                >>> Lucene40BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
    // Initialize FST always off-heap.
    final IndexInput clone = indexIn.clone();
    clone.seek(indexStartFP);
    final FST.FSTMetadata<BytesRef> fstMetadata;
    if (metaIn == indexIn) { // Only true before Lucene 8.6
      index =
          new FST<>(
              readMetadata(clone, ByteSequenceOutputs.getSingleton()),
              clone,
              new OffHeapFSTStore());
      final IndexInput clone = indexIn.clone();
      clone.seek(indexStartFP);
      fstMetadata = readMetadata(clone, ByteSequenceOutputs.getSingleton());
      // FST bytes actually only start after the metadata.
      indexStartFP = clone.getFilePointer();
    } else {
      index =
          new FST<>(
              readMetadata(metaIn, ByteSequenceOutputs.getSingleton()),
              clone,
              new OffHeapFSTStore());
      fstMetadata = readMetadata(metaIn, ByteSequenceOutputs.getSingleton());
    }
    index = FST.fromFSTReader(fstMetadata, new OffHeapFSTStore(indexIn, indexStartFP, fstMetadata));
    /*
    if (false) {
      final String dotFileName = segment + "_" + fieldInfo.name + ".dot";

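The hunk above replaces the two FST constructor calls with a single FST.fromFSTReader(...) backed by an OffHeapFSTStore pointed at the terms-index input. A hedged sketch of the resulting construction path is below; it assumes the unqualified readMetadata call in the hunk is FST.readMetadata, and omits the rest of the FieldReader plumbing.

import java.io.IOException;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.OffHeapFSTStore;

class OffHeapTermsIndexSketch {
  // Read only the FST metadata first, then serve the FST bytes off-heap
  // directly from the index input instead of loading them.
  static FST<BytesRef> openIndex(IndexInput metaIn, IndexInput indexIn, long indexStartFP)
      throws IOException {
    final FST.FSTMetadata<BytesRef> fstMetadata;
    if (metaIn == indexIn) { // only true for pre-8.6 segments
      IndexInput clone = indexIn.clone();
      clone.seek(indexStartFP);
      fstMetadata = FST.readMetadata(clone, ByteSequenceOutputs.getSingleton());
      indexStartFP = clone.getFilePointer(); // FST bytes only start after the metadata
    } else {
      fstMetadata = FST.readMetadata(metaIn, ByteSequenceOutputs.getSingleton());
    }
    return FST.fromFSTReader(
        fstMetadata, new OffHeapFSTStore(indexIn, indexStartFP, fstMetadata));
  }
}
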
@@ -14,7 +14,7 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;

import java.io.IOException;
import org.apache.lucene.store.DataInput;

@@ -16,7 +16,7 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;

import java.io.IOException;
import org.apache.lucene.store.DataInput;

@@ -14,12 +14,33 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;

import java.util.Objects;
import org.apache.lucene.codecs.*;
import org.apache.lucene.codecs.lucene90.*;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat;
import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;

@@ -98,7 +119,7 @@ public class Lucene99Codec extends Codec {
    super("Lucene99");
    this.storedFieldsFormat =
        new Lucene90StoredFieldsFormat(Objects.requireNonNull(mode).storedMode);
    this.defaultPostingsFormat = new Lucene99PostingsFormat();
    this.defaultPostingsFormat = new Lucene912PostingsFormat();
    this.defaultDVFormat = new Lucene90DocValuesFormat();
    this.defaultKnnVectorsFormat = new Lucene99HnswVectorsFormat();
  }

@@ -14,7 +14,7 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;

import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;

@@ -24,7 +24,6 @@ import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.MultiLevelSkipListWriter;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
import org.apache.lucene.index.IndexOptions;

@@ -339,7 +338,7 @@ import org.apache.lucene.util.packed.PackedInts;
 *
 * @lucene.experimental
 */
public final class Lucene99PostingsFormat extends PostingsFormat {
public class Lucene99PostingsFormat extends PostingsFormat {

  /**
   * Filename extension for document number, frequencies, and skip data. See chapter: <a

@@ -374,28 +373,9 @@ public final class Lucene99PostingsFormat extends PostingsFormat {
  static final int VERSION_START = 0;
  static final int VERSION_CURRENT = VERSION_START;

  private final int minTermBlockSize;
  private final int maxTermBlockSize;

  /** Creates {@code Lucene99PostingsFormat} with default settings. */
  public Lucene99PostingsFormat() {
    this(
        Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
        Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
  }

  /**
   * Creates {@code Lucene99PostingsFormat} with custom values for {@code minBlockSize} and {@code
   * maxBlockSize} passed to block terms dictionary.
   *
   * @see
   *     Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)
   */
  public Lucene99PostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
    super("Lucene99");
    Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize);
    this.minTermBlockSize = minTermBlockSize;
    this.maxTermBlockSize = maxTermBlockSize;
  }

  @Override

@@ -405,19 +385,7 @@ public final class Lucene99PostingsFormat extends PostingsFormat {

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
    boolean success = false;
    try {
      FieldsConsumer ret =
          new Lucene90BlockTreeTermsWriter(
              state, postingsWriter, minTermBlockSize, maxTermBlockSize);
      success = true;
      return ret;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(postingsWriter);
      }
    }
    throw new UnsupportedOperationException();
  }

  @Override

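After the move to backward-codecs, Lucene99PostingsFormat keeps its reader but drops its writer: the block-size constructor goes away and fieldsConsumer now throws. A minimal sketch of that read-only pattern is below; the class name and the delegate are invented for illustration and are not part of this patch.

import java.io.IOException;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;

// Hypothetical example mirroring the change above: an old format that can still be
// read, but refuses to write new segments.
public class ReadOnlyExamplePostingsFormat extends PostingsFormat {
  private final PostingsFormat readerDelegate;

  public ReadOnlyExamplePostingsFormat(PostingsFormat readerDelegate) {
    super("ReadOnlyExample");
    this.readerDelegate = readerDelegate;
  }

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    throw new UnsupportedOperationException("Older formats can't be used for writing");
  }

  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
    return readerDelegate.fieldsProducer(state);
  }
}
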
@@ -14,23 +14,23 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;

import static org.apache.lucene.codecs.lucene99.ForUtil.BLOCK_SIZE;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.DOC_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.MAX_SKIP_LEVELS;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.PAY_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.POS_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.TERMS_CODEC;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.VERSION_CURRENT;
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.VERSION_START;
import static org.apache.lucene.backward_codecs.lucene99.ForUtil.BLOCK_SIZE;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.DOC_CODEC;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.MAX_SKIP_LEVELS;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.PAY_CODEC;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.POS_CODEC;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.TERMS_CODEC;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.VERSION_CURRENT;
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.VERSION_START;

import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Impacts;
import org.apache.lucene.index.ImpactsEnum;

@@ -14,7 +14,7 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;

import java.io.IOException;
import java.util.AbstractList;

@@ -14,7 +14,7 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;

import java.io.IOException;
import java.util.Arrays;

@@ -61,6 +61,7 @@ public class Lucene99SkipReader extends MultiLevelSkipListReader {
  private long lastDocPointer;
  private int lastPosBufferUpto;

  /** Sole constructor. */
  public Lucene99SkipReader(
      IndexInput skipStream,
      int maxSkipLevels,

@@ -98,6 +99,7 @@ public class Lucene99SkipReader extends MultiLevelSkipListReader {
    return df % ForUtil.BLOCK_SIZE == 0 ? df - 1 : df;
  }

  /** Initialize state. */
  public void init(
      long skipPointer, long docBasePointer, long posBasePointer, long payBasePointer, int df)
      throws IOException {

@@ -125,22 +127,27 @@ public class Lucene99SkipReader extends MultiLevelSkipListReader {
    return lastDocPointer;
  }

  /** Returns the pointer in the pos file. */
  public long getPosPointer() {
    return lastPosPointer;
  }

  /** Return the start offset in the position block. */
  public int getPosBufferUpto() {
    return lastPosBufferUpto;
  }

  /** Returns the pointer in the pay file. */
  public long getPayPointer() {
    return lastPayPointer;
  }

  /** Return the number of bytes in the pay block that belongs to docs from the previous block. */
  public int getPayloadByteUpto() {
    return lastPayloadByteUpto;
  }

  /** Return the next skip doc, no skipping can be performed until this doc. */
  public int getNextSkipDoc() {
    return skipDoc[0];
  }

@@ -199,7 +206,7 @@ public class Lucene99SkipReader extends MultiLevelSkipListReader {
    return delta;
  }

  // The default impl skips impacts
  /** Read impacts. The default implementation skips them. */
  protected void readImpacts(int level, IndexInput skipStream) throws IOException {
    skipStream.skipBytes(skipStream.readVInt());
  }

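The new javadoc above documents the accessors a postings reader queries after a successful skip. A hedged sketch of how they are typically consumed follows; skipTo comes from MultiLevelSkipListReader, and getDocPointer is assumed from the surrounding context of the hunk rather than shown in it.

import java.io.IOException;
import org.apache.lucene.backward_codecs.lucene99.Lucene99SkipReader;

class SkipStateSketch {
  // After skipping toward `target`, pull the file pointers and in-block offsets
  // recorded at the skip point so the doc/pos/pay streams can be repositioned.
  static void skipAndReadState(Lucene99SkipReader skipper, int target) throws IOException {
    int newDocUpto = skipper.skipTo(target) + 1;
    long docPointer = skipper.getDocPointer();
    long posPointer = skipper.getPosPointer();
    int posBufferUpto = skipper.getPosBufferUpto();
    long payPointer = skipper.getPayPointer();
    int payloadByteUpto = skipper.getPayloadByteUpto();
    System.out.println(
        newDocUpto + " " + docPointer + " " + posPointer + " "
            + posBufferUpto + " " + payPointer + " " + payloadByteUpto);
  }
}
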
@@ -14,7 +14,7 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;

import java.io.IOException;
import java.util.Arrays;

@@ -46,10 +46,10 @@ import org.apache.lucene.store.IndexOutput;
 * uptos(position, payload). 4. start offset.
 */
public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
  private int[] lastSkipDoc;
  private long[] lastSkipDocPointer;
  private long[] lastSkipPosPointer;
  private long[] lastSkipPayPointer;
  private final int[] lastSkipDoc;
  private final long[] lastSkipDocPointer;
  private final long[] lastSkipPosPointer;
  private final long[] lastSkipPayPointer;

  private final IndexOutput docOut;
  private final IndexOutput posOut;

@@ -61,11 +61,12 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
  private long curPayPointer;
  private int curPosBufferUpto;
  private int curPayloadByteUpto;
  private CompetitiveImpactAccumulator[] curCompetitiveFreqNorms;
  private final CompetitiveImpactAccumulator[] curCompetitiveFreqNorms;
  private boolean fieldHasPositions;
  private boolean fieldHasOffsets;
  private boolean fieldHasPayloads;

  /** Sole constructor. */
  public Lucene99SkipWriter(
      int maxSkipLevels,
      int blockSize,

@@ -84,7 +85,12 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
      lastSkipPosPointer = new long[maxSkipLevels];
      if (payOut != null) {
        lastSkipPayPointer = new long[maxSkipLevels];
      } else {
        lastSkipPayPointer = null;
      }
    } else {
      lastSkipPosPointer = null;
      lastSkipPayPointer = null;
    }
    curCompetitiveFreqNorms = new CompetitiveImpactAccumulator[maxSkipLevels];
    for (int i = 0; i < maxSkipLevels; ++i) {

@@ -92,6 +98,7 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
    }
  }

  /** Reset state for the given index options. */
  public void setField(
      boolean fieldHasPositions, boolean fieldHasOffsets, boolean fieldHasPayloads) {
    this.fieldHasPositions = fieldHasPositions;

@@ -211,6 +218,7 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
    competitiveFreqNorms.clear();
  }

  /** Write impacts to the given output. */
  public static void writeImpacts(CompetitiveImpactAccumulator acc, DataOutput out)
      throws IOException {
    Collection<Impact> impacts = acc.getCompetitiveFreqNormPairs();

@@ -14,7 +14,7 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;

import java.io.IOException;
import java.util.Arrays;

@@ -14,7 +14,7 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;

import java.io.IOException;
import org.apache.lucene.store.IndexInput;

@@ -40,7 +40,7 @@ HEADER = """// This file has been automatically generated, DO NOT EDIT
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene99;
package org.apache.lucene.backward_codecs.lucene99;

import java.io.IOException;
import org.apache.lucene.store.DataInput;

@ -0,0 +1,428 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Lucene 9.9 file format.
|
||||
*
|
||||
* <h2>Apache Lucene - Index File Formats</h2>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <ul>
|
||||
* <li><a href="#Introduction">Introduction</a>
|
||||
* <li><a href="#Definitions">Definitions</a>
|
||||
* <ul>
|
||||
* <li><a href="#Inverted_Indexing">Inverted Indexing</a>
|
||||
* <li><a href="#Types_of_Fields">Types of Fields</a>
|
||||
* <li><a href="#Segments">Segments</a>
|
||||
* <li><a href="#Document_Numbers">Document Numbers</a>
|
||||
* </ul>
|
||||
* <li><a href="#Overview">Index Structure Overview</a>
|
||||
* <li><a href="#File_Naming">File Naming</a>
|
||||
* <li><a href="#file-names">Summary of File Extensions</a>
|
||||
* <ul>
|
||||
* <li><a href="#Lock_File">Lock File</a>
|
||||
* <li><a href="#History">History</a>
|
||||
* <li><a href="#Limitations">Limitations</a>
|
||||
* </ul>
|
||||
* </ul>
|
||||
*
|
||||
* </div> <a id="Introduction"></a>
|
||||
*
|
||||
* <h3>Introduction</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>This document defines the index file formats used in this version of Lucene. If you are using
|
||||
* a different version of Lucene, please consult the copy of <code>docs/</code> that was distributed
|
||||
* with the version you are using.
|
||||
*
|
||||
* <p>This document attempts to provide a high-level definition of the Apache Lucene file formats.
|
||||
* </div> <a id="Definitions"></a>
|
||||
*
|
||||
* <h3>Definitions</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>The fundamental concepts in Lucene are index, document, field and term.
|
||||
*
|
||||
* <p>An index contains a sequence of documents.
|
||||
*
|
||||
* <ul>
|
||||
* <li>A document is a sequence of fields.
|
||||
* <li>A field is a named sequence of terms.
|
||||
* <li>A term is a sequence of bytes.
|
||||
* </ul>
|
||||
*
|
||||
* <p>The same sequence of bytes in two different fields is considered a different term. Thus terms
|
||||
* are represented as a pair: the string naming the field, and the bytes within the field. <a
|
||||
* id="Inverted_Indexing"></a>
|
||||
*
|
||||
* <h4>Inverted Indexing</h4>
|
||||
*
|
||||
* <p>Lucene's index stores terms and statistics about those terms in order to make term-based
|
||||
* search more efficient. Lucene's terms index falls into the family of indexes known as an
|
||||
* <i>inverted index.</i> This is because it can list, for a term, the documents that contain it.
|
||||
* This is the inverse of the natural relationship, in which documents list terms. <a
|
||||
* id="Types_of_Fields"></a>
|
||||
*
|
||||
* <h4>Types of Fields</h4>
|
||||
*
|
||||
* <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored in the index
|
||||
* literally, in a non-inverted manner. Fields that are inverted are called <i>indexed</i>. A field
|
||||
* may be both stored and indexed.
|
||||
*
|
||||
* <p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the text of a field
|
||||
* may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is
|
||||
* useful for certain identifier fields to be indexed literally.
|
||||
*
|
||||
* <p>See the {@link org.apache.lucene.document.Field Field} java docs for more information on
|
||||
* Fields. <a id="Segments"></a>
|
||||
*
|
||||
* <h4>Segments</h4>
|
||||
*
|
||||
* <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>. Each segment is a
|
||||
* fully independent index, which could be searched separately. Indexes evolve by:
|
||||
*
|
||||
* <ol>
|
||||
* <li>Creating new segments for newly added documents.
|
||||
* <li>Merging existing segments.
|
||||
* </ol>
|
||||
*
|
||||
* <p>Searches may involve multiple segments and/or multiple indexes, each index potentially
|
||||
* composed of a set of segments. <a id="Document_Numbers"></a>
|
||||
*
|
||||
* <h4>Document Numbers</h4>
|
||||
*
|
||||
* <p>Internally, Lucene refers to documents by an integer <i>document number</i>. The first
|
||||
* document added to an index is numbered zero, and each subsequent document added gets a number one
|
||||
* greater than the previous.
|
||||
*
|
||||
* <p>Note that a document's number may change, so caution should be taken when storing these
|
||||
* numbers outside of Lucene. In particular, numbers may change in the following situations:
|
||||
*
|
||||
* <ul>
|
||||
* <li>
|
||||
* <p>The numbers stored in each segment are unique only within the segment, and must be
|
||||
* converted before they can be used in a larger context. The standard technique is to
|
||||
* allocate each segment a range of values, based on the range of numbers used in that
|
||||
* segment. To convert a document number from a segment to an external value, the segment's
|
||||
* <i>base</i> document number is added. To convert an external value back to a
|
||||
* segment-specific value, the segment is identified by the range that the external value is
|
||||
* in, and the segment's base value is subtracted. For example two five document segments
|
||||
* might be combined, so that the first segment has a base value of zero, and the second of
|
||||
* five. Document three from the second segment would have an external value of eight.
|
||||
* <li>
|
||||
* <p>When documents are deleted, gaps are created in the numbering. These are eventually
|
||||
* removed as the index evolves through merging. Deleted documents are dropped when segments
|
||||
* are merged. A freshly-merged segment thus has no gaps in its numbering.
|
||||
* </ul>
|
||||
*
|
||||
* </div> <a id="Overview"></a>
|
||||
*
|
||||
* <h3>Index Structure Overview</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>Each segment index maintains the following:
|
||||
*
|
||||
* <ul>
|
||||
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment info}. This
|
||||
* contains metadata about a segment, such as the number of documents, what files it uses, and
|
||||
* information about how the segment is sorted
|
||||
* <li>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This
|
||||
* contains metadata about the set of named fields used in the index.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}.
|
||||
* This contains, for each document, a list of attribute-value pairs, where the attributes are
|
||||
* field names. These are used to store auxiliary information about the document, such as its
|
||||
* title, url, or an identifier to access a database. The set of stored fields are what is
|
||||
* returned for each hit when searching. This is keyed by document number.
|
||||
* <li>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term dictionary}.
|
||||
* A dictionary containing all of the terms used in all of the indexed fields of all of the
|
||||
* documents. The dictionary also contains the number of documents which contain the term, and
|
||||
* pointers to the term's frequency and proximity data.
|
||||
* <li>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term Frequency
|
||||
* data}. For each term in the dictionary, the numbers of all the documents that contain that
|
||||
* term, and the frequency of the term in that document, unless frequencies are omitted
|
||||
* ({@link org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
|
||||
* <li>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term Proximity
|
||||
* data}. For each term in the dictionary, the positions that the term occurs in each
|
||||
* document. Note that this will not exist if all fields in all documents omit position data.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For
|
||||
* each field in each document, a value is stored that is multiplied into the score for hits
|
||||
* on that field.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each
|
||||
* field in each document, the term vector (sometimes called document vector) may be stored. A
|
||||
* term vector consists of term text and term frequency. To add Term Vectors to your index see
|
||||
* the {@link org.apache.lucene.document.Field Field} constructors
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like
|
||||
* stored values, these are also keyed by document number, but are generally intended to be
|
||||
* loaded into main memory for fast access. Whereas stored values are generally intended for
|
||||
* summary results from searches, per-document values are useful for things like scoring
|
||||
* factors.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An
|
||||
* optional file indicating which documents are live.
|
||||
* <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair
|
||||
* of files, recording dimensionally indexed fields, to enable fast numeric range filtering
|
||||
* and large numeric values like BigInteger and BigDecimal (1D) and geographic shape
|
||||
* intersection (2D, 3D).
|
||||
* <li>{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}. The
|
||||
* vector format stores numeric vectors in a format optimized for random access and
|
||||
* computation, supporting high-dimensional nearest-neighbor search.
|
||||
* </ul>
|
||||
*
|
||||
* <p>Details on each of these are provided in their linked pages. </div> <a id="File_Naming"></a>
|
||||
*
|
||||
* <h3>File Naming</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>All files belonging to a segment have the same name with varying extensions. The extensions
|
||||
* correspond to the different file formats described below. When using the Compound File format
|
||||
* (default for small segments) these files (except for the Segment info file, the Lock file, and
|
||||
* Deleted documents file) are collapsed into a single .cfs file (see below for details)
|
||||
*
|
||||
* <p>Typically, all segments in an index are stored in a single directory, although this is not
|
||||
* required.
|
||||
*
|
||||
* <p>File names are never re-used. That is, when any file is saved to the Directory it is given a
|
||||
* never before used filename. This is achieved using a simple generations approach. For example,
|
||||
* the first segments file is segments_1, then segments_2, etc. The generation is a sequential long
|
||||
* integer represented in alpha-numeric (base 36) form. </div> <a id="file-names"></a>
|
||||
*
|
||||
* <h3>Summary of File Extensions</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>The following table summarizes the names and extensions of the files in Lucene:
|
||||
*
|
||||
* <table class="padding4" style="border-spacing: 1px; border-collapse: separate">
|
||||
* <caption>lucene filenames by extension</caption>
|
||||
* <tr>
|
||||
* <th>Name</th>
|
||||
* <th>Extension</th>
|
||||
* <th>Brief Description</th>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
|
||||
* <td>segments_N</td>
|
||||
* <td>Stores information about a commit point</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td><a href="#Lock_File">Lock File</a></td>
|
||||
* <td>write.lock</td>
|
||||
* <td>The Write lock prevents multiple IndexWriters from writing to the same
|
||||
* file.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment Info}</td>
|
||||
* <td>.si</td>
|
||||
* <td>Stores metadata about a segment</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File}</td>
|
||||
* <td>.cfs, .cfe</td>
|
||||
* <td>An optional "virtual" file consisting of all the other index files for
|
||||
* systems that frequently run out of file handles.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields}</td>
|
||||
* <td>.fnm</td>
|
||||
* <td>Stores information about the fields</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index}</td>
|
||||
* <td>.fdx</td>
|
||||
* <td>Contains pointers to field data</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data}</td>
|
||||
* <td>.fdt</td>
|
||||
* <td>The stored fields for documents</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term Dictionary}</td>
|
||||
* <td>.tim</td>
|
||||
* <td>The term dictionary, stores term info</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term Index}</td>
|
||||
* <td>.tip</td>
|
||||
* <td>The index into the Term Dictionary</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Frequencies}</td>
|
||||
* <td>.doc</td>
|
||||
* <td>Contains the list of docs which contain each term along with frequency</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Positions}</td>
|
||||
* <td>.pos</td>
|
||||
* <td>Stores position information about where a term occurs in the index</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Payloads}</td>
|
||||
* <td>.pay</td>
|
||||
* <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms}</td>
|
||||
* <td>.nvd, .nvm</td>
|
||||
* <td>Encodes length and boost factors for docs and fields</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values}</td>
|
||||
* <td>.dvd, .dvm</td>
|
||||
* <td>Encodes additional scoring factors or other per-document information.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}</td>
|
||||
* <td>.tvx</td>
|
||||
* <td>Stores offset into the document data file</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}</td>
|
||||
* <td>.tvd</td>
|
||||
* <td>Contains term vector data.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents}</td>
|
||||
* <td>.liv</td>
|
||||
* <td>Info about what documents are live</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}</td>
|
||||
* <td>.dii, .dim</td>
|
||||
* <td>Holds indexed points</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}</td>
|
||||
* <td>.vec, .vem, .veq, vex</td>
|
||||
* <td>Holds indexed vectors; <code>.vec</code> files contain the raw vector data,
|
||||
* <code>.vem</code> the vector metadata, <code>.veq</code> the quantized vector data, and <code>.vex</code> the
|
||||
* hnsw graph data.</td>
|
||||
* </tr>
|
||||
* </table>
|
||||
*
|
||||
* </div> <a id="Lock_File"></a>
|
||||
*
|
||||
* <h3>Lock File</h3>
|
||||
*
|
||||
* The write lock, which is stored in the index directory by default, is named "write.lock". If the
|
||||
* lock directory is different from the index directory then the write lock will be named
|
||||
* "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index
|
||||
* directory. When this file is present, a writer is currently modifying the index (adding or
|
||||
* removing documents). This lock file ensures that only one writer is modifying the index at a
|
||||
* time. <a id="History"></a>
|
||||
*
|
||||
* <h3>History</h3>
|
||||
*
|
||||
* <p>Compatibility notes are provided in this document, describing how file formats have changed
|
||||
* from prior versions:
|
||||
*
|
||||
* <ul>
|
||||
* <li>In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit
|
||||
* lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching
|
||||
* or adding/deleting of docs. When the new segments file is saved (committed), it will be
|
||||
* written in the new file format (meaning no specific "upgrade" process is needed). But note
|
||||
* that once a commit has occurred, pre-2.1 Lucene will not be able to read the index.
|
||||
* <li>In version 2.3, the file format was changed to allow segments to share a single set of doc
|
||||
* store (vectors & stored fields) files. This allows for faster indexing in certain
|
||||
* cases. The change is fully backwards compatible (in the same way as the lock-less commits
|
||||
* change in 2.1).
|
||||
* <li>In version 2.4, Strings are now written as true UTF-8 byte sequence, not Java's modified
|
||||
* UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">LUCENE-510</a> for
|
||||
* details.
|
||||
* <li>In version 2.9, an optional opaque Map<String,String> CommitUserData may be passed to
|
||||
* IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N
|
||||
* file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">LUCENE-1382</a> for
|
||||
* details. Also, diagnostics were added to each segment written recording details about why
|
||||
* it was written (due to flush, merge; which OS/JRE was used; etc.). See issue <a
|
||||
* href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.
|
||||
* <li>In version 3.0, compressed fields are no longer written to the index (they can still be
|
||||
* read, but on merge the new segment will write them, uncompressed). See issue <a
|
||||
* href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a> for details.
|
||||
* <li>In version 3.1, segments records the code version that created them. See <a
|
||||
* href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
|
||||
* Additionally segments track explicitly whether or not they have term vectors. See <a
|
||||
* href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a> for details.
|
||||
* <li>In version 3.2, numeric fields are written as natively to stored fields file, previously
|
||||
* they were stored in text format only.
|
||||
* <li>In version 3.4, fields can omit position data while still indexing term frequencies.
|
||||
* <li>In version 4.0, the format of the inverted index became extensible via the {@link
|
||||
* org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues})
|
||||
* was introduced. Normalization factors need no longer be a single byte, they can be any
|
||||
* {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be
|
||||
* unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into
|
||||
* the postings lists. Payloads can be stored in the term vectors.
|
||||
* <li>In version 4.1, the format of the postings list changed to use either of FOR compression or
|
||||
* variable-byte encoding, depending upon the frequency of the term. Terms appearing only once
|
||||
* were changed to inline directly into the term dictionary. Stored fields are compressed by
|
||||
* default.
|
||||
* <li>In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued
|
||||
* type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields.
|
||||
* <li>In version 4.5, DocValues were extended to explicitly represent missing values.
|
||||
* <li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
|
||||
* allow updating NumericDocValues fields.
|
||||
* <li>In version 4.8, checksum footers were added to the end of each index file for improved data
|
||||
* integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32
|
||||
* checksum of the file.
|
||||
* <li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is
|
||||
* suitable for faceting/sorting/analytics.
|
||||
* <li>In version 5.4, DocValues have been improved to store more information on disk: addresses
|
||||
* for binary fields and ord indexes for multi-valued fields.
|
||||
* <li>In version 6.0, Points were added, for multi-dimensional range/distance search.
|
||||
* <li>In version 6.2, new Segment info format that reads/writes the index sort, to support index
|
||||
* sorting.
|
||||
* <li>In version 7.0, DocValues have been improved to better support sparse doc values thanks to
|
||||
* an iterator API.
|
||||
* <li>In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term
|
||||
* freq, normalization factor) pairs that may trigger the maximum score of the block. This
|
||||
* information is recorded alongside skip data in order to be able to skip blocks of doc ids
|
||||
* if they may not produce high enough scores. Additionally doc values and norms has been
|
||||
* extended with jump-tables to make access O(1) instead of O(n), where n is the number of
|
||||
* elements to skip when advancing in the data.
|
||||
* <li>In version 8.4, postings, positions, offsets and payload lengths have move to a more
|
||||
* performant encoding that is vectorized.
|
||||
* <li>In version 8.6, index sort serialization is delegated to the sorts themselves, to allow
|
||||
* user-defined sorts to be used
|
||||
* <li>In version 8.7, stored fields compression became adaptive to better handle documents with
|
||||
* smaller stored fields.
|
||||
* <li>In version 9.0, vector-valued fields were added.
|
||||
* <li>In version 9.1, vector-valued fields were modified to add a graph hierarchy.
|
||||
* <li>In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by
|
||||
* IndexDISI. ordToDoc mappings was added to .vem.
|
||||
* <li>In version 9.5, HNSW graph connections were changed to be delta-encoded with vints.
|
||||
* Additionally, metadata file size improvements were made by delta-encoding nodes by graph
|
||||
* layer and not writing the node ids for the zeroth layer.
|
||||
* <li>In version 9.9, Vector scalar quantization support was added. Allowing the HNSW vector
|
||||
* format to utilize int8 quantized vectors for float32 vector search.
|
||||
* </ul>
|
||||
*
|
||||
* <a id="Limitations"></a>
|
||||
*
|
||||
* <h3>Limitations</h3>
|
||||
*
|
||||
* <div>
|
||||
*
|
||||
* <p>Lucene uses a Java <code>int</code> to refer to document numbers, and the index file format
|
||||
* uses an <code>Int32</code> on-disk to store document numbers. This is a limitation of both the
|
||||
* index file format and the current implementation. Eventually these should be replaced with either
|
||||
* <code>UInt64</code> values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
|
||||
* VInt} values which have no limit. </div>
|
||||
*/
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
|
@@ -22,3 +22,4 @@ org.apache.lucene.backward_codecs.lucene91.Lucene91Codec
org.apache.lucene.backward_codecs.lucene92.Lucene92Codec
org.apache.lucene.backward_codecs.lucene94.Lucene94Codec
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec
org.apache.lucene.backward_codecs.lucene99.Lucene99Codec

@@ -16,3 +16,4 @@
org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat
org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat
org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat
org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat

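These service entries are what keeps the relocated classes resolvable by name at runtime. A small sketch of the lookup path, using only names that appear in the entries above:

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;

class BackwardCodecLookupSketch {
  static void resolveByName() {
    // Both lookups go through Lucene's SPI loading, which reads the
    // META-INF/services entries registered above in the backward-codecs module.
    Codec lucene99Codec = Codec.forName("Lucene99");
    PostingsFormat lucene99Postings = PostingsFormat.forName("Lucene99");
    System.out.println(lucene99Codec.getName() + " / " + lucene99Postings.getName());
  }
}
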
File diff suppressed because it is too large
@@ -17,7 +17,7 @@
package org.apache.lucene.backward_codecs.lucene50;

import java.io.IOException;
import org.apache.lucene.backward_codecs.lucene40.blocktree.Lucene40BlockTreeTermsWriter;
import org.apache.lucene.backward_codecs.lucene40.blocktree.Lucene40BlockTreeTermsWriterV5;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.SegmentWriteState;

@@ -31,11 +31,11 @@ public class Lucene50RWPostingsFormat extends Lucene50PostingsFormat {
    boolean success = false;
    try {
      FieldsConsumer ret =
          new Lucene40BlockTreeTermsWriter(
          new Lucene40BlockTreeTermsWriterV5(
              state,
              postingsWriter,
              Lucene40BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
              Lucene40BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
              Lucene40BlockTreeTermsWriterV5.DEFAULT_MIN_BLOCK_SIZE,
              Lucene40BlockTreeTermsWriterV5.DEFAULT_MAX_BLOCK_SIZE);
      success = true;
      return ret;
    } finally {

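The hunk above swaps the terms writer but keeps the usual success-flag cleanup around resource creation. A generic, hedged sketch of that idiom is below; only IOUtils is a real Lucene class here, the rest of the names are invented for illustration.

import java.io.Closeable;
import org.apache.lucene.util.IOUtils;

class CloseOnFailureSketch {
  // If building the second resource fails part-way, close the one that was already
  // opened without masking the original exception.
  static <T> T buildOrClose(Closeable alreadyOpen, ThrowingSupplier<T> builder) throws Exception {
    boolean success = false;
    try {
      T result = builder.get();
      success = true;
      return result;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(alreadyOpen);
      }
    }
  }

  interface ThrowingSupplier<T> {
    T get() throws Exception;
  }
}
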
@@ -642,13 +642,13 @@ public class BKDWriter60 implements Closeable {
      throws IOException {
    assert docMaps == null || readers.size() == docMaps.size();

    BKDMergeQueue queue = new BKDMergeQueue(config.bytesPerDim, readers.size());
    BKDMergeQueue queue = new BKDMergeQueue(config.bytesPerDim(), readers.size());

    for (int i = 0; i < readers.size(); i++) {
      PointValues pointValues = readers.get(i);
      assert pointValues.getNumDimensions() == config.numDims
          && pointValues.getBytesPerDimension() == config.bytesPerDim
          && pointValues.getNumIndexDimensions() == config.numIndexDims;
      assert pointValues.getNumDimensions() == config.numDims()
          && pointValues.getBytesPerDimension() == config.bytesPerDim()
          && pointValues.getNumIndexDimensions() == config.numIndexDims();
      MergeState.DocMap docMap;
      if (docMaps == null) {
        docMap = null;

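The only change in this hunk is syntactic: field reads like config.bytesPerDim become accessor calls like config.bytesPerDim(), which is the shape you get when a plain holder class is converted into a record. A hedged illustration of that conversion (this is not the real BKDConfig, just the pattern):

// Before: a plain holder with public final fields, read as config.bytesPerDim
final class ConfigAsClass {
  public final int numDims;
  public final int numIndexDims;
  public final int bytesPerDim;

  ConfigAsClass(int numDims, int numIndexDims, int bytesPerDim) {
    this.numDims = numDims;
    this.numIndexDims = numIndexDims;
    this.bytesPerDim = bytesPerDim;
  }
}

// After: a record, so callers switch to the generated accessors, e.g. config.bytesPerDim()
record ConfigAsRecord(int numDims, int numIndexDims, int bytesPerDim) {}
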
@@ -23,12 +23,11 @@ import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.backward_codecs.lucene90.Lucene90ScoreSkipReader.MutableImpactList;
import org.apache.lucene.backward_codecs.lucene99.Lucene99SkipWriter;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
import org.apache.lucene.codecs.lucene90.blocktree.FieldReader;
import org.apache.lucene.codecs.lucene90.blocktree.Stats;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99SkipWriter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DirectoryReader;

@@ -77,22 +76,6 @@ public class TestLucene90PostingsFormat extends BasePostingsFormatTestCase {
    d.close();
  }

  private void shouldFail(int minItemsInBlock, int maxItemsInBlock) {
    expectThrows(
        IllegalArgumentException.class,
        () -> {
          new Lucene99PostingsFormat(minItemsInBlock, maxItemsInBlock);
        });
  }

  public void testInvalidBlockSizes() throws Exception {
    shouldFail(0, 0);
    shouldFail(10, 8);
    shouldFail(-1, 10);
    shouldFail(10, -1);
    shouldFail(10, 12);
  }

  public void testImpactSerialization() throws IOException {
    // omit norms and omit freqs
    doTestImpactSerialization(Collections.singletonList(new Impact(1, 1L)));

@ -388,10 +388,14 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
|
|||
// write the vector data to a temporary file
|
||||
DocsWithFieldSet docsWithField =
|
||||
switch (fieldInfo.getVectorEncoding()) {
|
||||
case BYTE -> writeByteVectorData(
|
||||
tempVectorData, MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
|
||||
case FLOAT32 -> writeVectorData(
|
||||
tempVectorData, MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
|
||||
case BYTE ->
|
||||
writeByteVectorData(
|
||||
tempVectorData,
|
||||
MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
|
||||
case FLOAT32 ->
|
||||
writeVectorData(
|
||||
tempVectorData,
|
||||
MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
|
||||
};
|
||||
CodecUtil.writeFooter(tempVectorData);
|
||||
IOUtils.close(tempVectorData);
|
||||
|
@ -638,18 +642,20 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
|
|||
throws IOException {
|
||||
int dim = fieldInfo.getVectorDimension();
|
||||
return switch (fieldInfo.getVectorEncoding()) {
|
||||
case BYTE -> new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
|
||||
@Override
|
||||
public byte[] copyValue(byte[] value) {
|
||||
return ArrayUtil.copyOfSubArray(value, 0, dim);
|
||||
}
|
||||
};
|
||||
case FLOAT32 -> new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
|
||||
@Override
|
||||
public float[] copyValue(float[] value) {
|
||||
return ArrayUtil.copyOfSubArray(value, 0, dim);
|
||||
}
|
||||
};
|
||||
case BYTE ->
|
||||
new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
|
||||
@Override
|
||||
public byte[] copyValue(byte[] value) {
|
||||
return ArrayUtil.copyOfSubArray(value, 0, dim);
|
||||
}
|
||||
};
|
||||
case FLOAT32 ->
|
||||
new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
|
||||
@Override
|
||||
public float[] copyValue(float[] value) {
|
||||
return ArrayUtil.copyOfSubArray(value, 0, dim);
|
||||
}
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -663,12 +669,14 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
|
|||
DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer();
|
||||
RandomVectorScorerSupplier scorerSupplier =
|
||||
switch (fieldInfo.getVectorEncoding()) {
|
||||
case BYTE -> defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
||||
fieldInfo.getVectorSimilarityFunction(),
|
||||
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
|
||||
case FLOAT32 -> defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
||||
fieldInfo.getVectorSimilarityFunction(),
|
||||
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
|
||||
case BYTE ->
|
||||
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
||||
fieldInfo.getVectorSimilarityFunction(),
|
||||
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
|
||||
case FLOAT32 ->
|
||||
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
||||
fieldInfo.getVectorSimilarityFunction(),
|
||||
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
|
||||
};
|
||||
hnswGraphBuilder =
|
||||
HnswGraphBuilder.create(scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed);
|
||||
|
@ -693,9 +701,9 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
|
|||
lastDocID = docID;
|
||||
}
|
||||
|
||||
OnHeapHnswGraph getGraph() {
|
||||
OnHeapHnswGraph getGraph() throws IOException {
|
||||
if (vectors.size() > 0) {
|
||||
return hnswGraphBuilder.getGraph();
|
||||
return hnswGraphBuilder.getCompletedGraph();
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
|
|
|
@ -414,10 +414,14 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
|
|||
// write the vector data to a temporary file
|
||||
DocsWithFieldSet docsWithField =
|
||||
switch (fieldInfo.getVectorEncoding()) {
|
||||
case BYTE -> writeByteVectorData(
|
||||
tempVectorData, MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
|
||||
case FLOAT32 -> writeVectorData(
|
||||
tempVectorData, MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
|
||||
case BYTE ->
|
||||
writeByteVectorData(
|
||||
tempVectorData,
|
||||
MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
|
||||
case FLOAT32 ->
|
||||
writeVectorData(
|
||||
tempVectorData,
|
||||
MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
|
||||
};
|
||||
CodecUtil.writeFooter(tempVectorData);
|
||||
IOUtils.close(tempVectorData);
|
||||
|
@ -477,10 +481,12 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
|
|||
}
|
||||
DocIdSetIterator mergedVectorIterator = null;
|
||||
switch (fieldInfo.getVectorEncoding()) {
|
||||
case BYTE -> mergedVectorIterator =
|
||||
KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState);
|
||||
case FLOAT32 -> mergedVectorIterator =
|
||||
KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState);
|
||||
case BYTE ->
|
||||
mergedVectorIterator =
|
||||
KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState);
|
||||
case FLOAT32 ->
|
||||
mergedVectorIterator =
|
||||
KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState);
|
||||
}
|
||||
graph =
|
||||
merger.merge(
|
||||
|
@ -680,18 +686,20 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
|
|||
throws IOException {
|
||||
int dim = fieldInfo.getVectorDimension();
|
||||
return switch (fieldInfo.getVectorEncoding()) {
|
||||
case BYTE -> new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
|
||||
@Override
|
||||
public byte[] copyValue(byte[] value) {
|
||||
return ArrayUtil.copyOfSubArray(value, 0, dim);
|
||||
}
|
||||
};
|
||||
case FLOAT32 -> new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
|
||||
@Override
|
||||
public float[] copyValue(float[] value) {
|
||||
return ArrayUtil.copyOfSubArray(value, 0, dim);
|
||||
}
|
||||
};
|
||||
case BYTE ->
|
||||
new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
|
||||
@Override
|
||||
public byte[] copyValue(byte[] value) {
|
||||
return ArrayUtil.copyOfSubArray(value, 0, dim);
|
||||
}
|
||||
};
|
||||
case FLOAT32 ->
|
||||
new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
|
||||
@Override
|
||||
public float[] copyValue(float[] value) {
|
||||
return ArrayUtil.copyOfSubArray(value, 0, dim);
|
||||
}
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -704,12 +712,14 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
|
|||
vectors = new ArrayList<>();
|
||||
RandomVectorScorerSupplier scorerSupplier =
|
||||
switch (fieldInfo.getVectorEncoding()) {
|
||||
case BYTE -> defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
||||
fieldInfo.getVectorSimilarityFunction(),
|
||||
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
|
||||
case FLOAT32 -> defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
||||
fieldInfo.getVectorSimilarityFunction(),
|
||||
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
|
||||
case BYTE ->
|
||||
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
||||
fieldInfo.getVectorSimilarityFunction(),
|
||||
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
|
||||
case FLOAT32 ->
|
||||
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
||||
fieldInfo.getVectorSimilarityFunction(),
|
||||
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
|
||||
};
|
||||
hnswGraphBuilder =
|
||||
HnswGraphBuilder.create(scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed);
|
||||
|
@ -732,9 +742,9 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
|
|||
lastDocID = docID;
|
||||
}
|
||||
|
||||
OnHeapHnswGraph getGraph() {
|
||||
OnHeapHnswGraph getGraph() throws IOException {
|
||||
if (vectors.size() > 0) {
|
||||
return hnswGraphBuilder.getGraph();
|
||||
return hnswGraphBuilder.getCompletedGraph();
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
|
|
|
@ -14,22 +14,22 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import static org.apache.lucene.codecs.lucene99.ForUtil.BLOCK_SIZE;
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.DOC_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.MAX_SKIP_LEVELS;
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.PAY_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.POS_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.TERMS_CODEC;
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.VERSION_CURRENT;
|
||||
import static org.apache.lucene.backward_codecs.lucene99.ForUtil.BLOCK_SIZE;
|
||||
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.DOC_CODEC;
|
||||
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.MAX_SKIP_LEVELS;
|
||||
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.PAY_CODEC;
|
||||
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.POS_CODEC;
|
||||
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.TERMS_CODEC;
|
||||
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.VERSION_CURRENT;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
|
||||
import org.apache.lucene.codecs.BlockTermState;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
|
||||
import org.apache.lucene.codecs.PushPostingsWriterBase;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
|
@ -0,0 +1,68 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.codecs.FieldsConsumer;
|
||||
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
public class Lucene99RWPostingsFormat extends Lucene99PostingsFormat {
|
||||
|
||||
private final int minTermBlockSize;
|
||||
private final int maxTermBlockSize;
|
||||
|
||||
/** Creates {@code Lucene99PostingsFormat} with default settings. */
|
||||
public Lucene99RWPostingsFormat() {
|
||||
this(
|
||||
Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
|
||||
Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates {@code Lucene99PostingsFormat} with custom values for {@code minBlockSize} and {@code
|
||||
* maxBlockSize} passed to block terms dictionary.
|
||||
*
|
||||
* @see
|
||||
* Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)
|
||||
*/
|
||||
public Lucene99RWPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
|
||||
super();
|
||||
Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize);
|
||||
this.minTermBlockSize = minTermBlockSize;
|
||||
this.maxTermBlockSize = maxTermBlockSize;
|
||||
}
|
||||
|
||||
@Override
|
||||
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
|
||||
PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
|
||||
boolean success = false;
|
||||
try {
|
||||
FieldsConsumer ret =
|
||||
new Lucene90BlockTreeTermsWriter(
|
||||
state, postingsWriter, minTermBlockSize, maxTermBlockSize);
|
||||
success = true;
|
||||
return ret;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(postingsWriter);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -14,7 +14,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
|
||||
import java.io.IOException;
|
|
@ -14,7 +14,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
|
||||
import java.io.IOException;
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.backward_codecs.lucene99;
|
|||
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.KnnVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
|
||||
import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase;
|
||||
|
||||
public class TestLucene99HnswScalarQuantizedVectorsFormat extends BaseKnnVectorsFormatTestCase {
|
||||
|
|
|
@ -14,22 +14,26 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99ScoreSkipReader.readImpacts;
|
||||
import static org.apache.lucene.backward_codecs.lucene99.Lucene99ScoreSkipReader.readImpacts;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.backward_codecs.lucene99.Lucene99ScoreSkipReader.MutableImpactList;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
|
||||
import org.apache.lucene.codecs.lucene90.blocktree.FieldReader;
|
||||
import org.apache.lucene.codecs.lucene90.blocktree.Stats;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99ScoreSkipReader.MutableImpactList;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.Impact;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
|
@ -41,7 +45,7 @@ import org.apache.lucene.tests.util.TestUtil;
|
|||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
public class TestLucene99PostingsFormat extends BasePostingsFormatTestCase {
|
||||
private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene99PostingsFormat());
|
||||
private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene99RWPostingsFormat());
|
||||
|
||||
@Override
|
||||
protected Codec getCodec() {
|
||||
|
@ -77,7 +81,7 @@ public class TestLucene99PostingsFormat extends BasePostingsFormatTestCase {
|
|||
expectThrows(
|
||||
IllegalArgumentException.class,
|
||||
() -> {
|
||||
new Lucene99PostingsFormat(minItemsInBlock, maxItemsInBlock);
|
||||
new Lucene99RWPostingsFormat(minItemsInBlock, maxItemsInBlock);
|
||||
});
|
||||
}
|
||||
|
|
@ -14,7 +14,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
|
||||
import java.io.IOException;
|
|
@ -0,0 +1,49 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.backward_codecs.lucene99;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||
|
||||
public class TestPostingsUtil extends LuceneTestCase {
|
||||
|
||||
// checks for bug described in https://github.com/apache/lucene/issues/13373
|
||||
public void testIntegerOverflow() throws IOException {
|
||||
final int size = random().nextInt(1, ForUtil.BLOCK_SIZE);
|
||||
final long[] docDeltaBuffer = new long[size];
|
||||
final long[] freqBuffer = new long[size];
|
||||
|
||||
final int delta = 1 << 30;
|
||||
docDeltaBuffer[0] = delta;
|
||||
try (Directory dir = newDirectory()) {
|
||||
try (IndexOutput out = dir.createOutput("test", IOContext.DEFAULT)) {
|
||||
// In old implementation, this would cause integer overflow exception.
|
||||
PostingsUtil.writeVIntBlock(out, docDeltaBuffer, freqBuffer, size, true);
|
||||
}
|
||||
long[] restoredDocs = new long[size];
|
||||
long[] restoredFreqs = new long[size];
|
||||
try (IndexInput in = dir.openInput("test", IOContext.DEFAULT)) {
|
||||
PostingsUtil.readVIntBlock(in, restoredDocs, restoredFreqs, size, true, true);
|
||||
}
|
||||
assertEquals(delta, restoredDocs[0]);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -196,6 +196,7 @@ public class TestAncientIndicesCompatibility extends LuceneTestCase {
|
|||
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
|
||||
CheckIndex checker = new CheckIndex(dir);
|
||||
checker.setInfoStream(new PrintStream(bos, false, UTF_8));
|
||||
checker.setLevel(CheckIndex.Level.MIN_LEVEL_FOR_INTEGRITY_CHECKS);
|
||||
CheckIndex.Status indexStatus = checker.checkIndex();
|
||||
if (version.startsWith("8.")) {
|
||||
assertTrue(indexStatus.clean);
|
||||
|
|
|
@ -20,9 +20,9 @@ import static org.apache.lucene.backward_index.TestBasicBackwardsCompatibility.a
|
|||
|
||||
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.backward_codecs.lucene99.Lucene99Codec;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.KnnVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
|
||||
import org.apache.lucene.document.Document;
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -40,3 +40,4 @@
|
|||
9.9.2
|
||||
9.10.0
|
||||
9.11.0
|
||||
9.11.1
|
||||
|
|
|
@ -0,0 +1,376 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.benchmark.jmh;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.openjdk.jmh.annotations.Benchmark;
|
||||
import org.openjdk.jmh.annotations.BenchmarkMode;
|
||||
import org.openjdk.jmh.annotations.CompilerControl;
|
||||
import org.openjdk.jmh.annotations.Fork;
|
||||
import org.openjdk.jmh.annotations.Level;
|
||||
import org.openjdk.jmh.annotations.Measurement;
|
||||
import org.openjdk.jmh.annotations.Mode;
|
||||
import org.openjdk.jmh.annotations.OutputTimeUnit;
|
||||
import org.openjdk.jmh.annotations.Scope;
|
||||
import org.openjdk.jmh.annotations.Setup;
|
||||
import org.openjdk.jmh.annotations.State;
|
||||
import org.openjdk.jmh.annotations.Warmup;
|
||||
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
@OutputTimeUnit(TimeUnit.MILLISECONDS)
|
||||
@State(Scope.Benchmark)
|
||||
@Warmup(iterations = 5, time = 1)
|
||||
@Measurement(iterations = 5, time = 1)
|
||||
@Fork(
|
||||
value = 1,
|
||||
jvmArgsAppend = {"-Xmx1g", "-Xms1g", "-XX:+AlwaysPreTouch"})
|
||||
public class AdvanceBenchmark {
|
||||
|
||||
private final long[] values = new long[129];
|
||||
private final int[] startIndexes = new int[1_000];
|
||||
private final long[] targets = new long[startIndexes.length];
|
||||
|
||||
@Setup(Level.Trial)
|
||||
public void setup() throws Exception {
|
||||
for (int i = 0; i < 128; ++i) {
|
||||
values[i] = i;
|
||||
}
|
||||
values[128] = DocIdSetIterator.NO_MORE_DOCS;
|
||||
Random r = new Random(0);
|
||||
for (int i = 0; i < startIndexes.length; ++i) {
|
||||
startIndexes[i] = r.nextInt(64);
|
||||
targets[i] = startIndexes[i] + 1 + r.nextInt(1 << r.nextInt(7));
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void binarySearch() {
|
||||
for (int i = 0; i < startIndexes.length; ++i) {
|
||||
binarySearch(values, targets[i], startIndexes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
|
||||
private static int binarySearch(long[] values, long target, int startIndex) {
|
||||
// Standard binary search
|
||||
int i = Arrays.binarySearch(values, startIndex, values.length, target);
|
||||
if (i < 0) {
|
||||
i = -1 - i;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void binarySearch2() {
|
||||
for (int i = 0; i < startIndexes.length; ++i) {
|
||||
binarySearch2(values, targets[i], startIndexes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
|
||||
private static int binarySearch2(long[] values, long target, int startIndex) {
|
||||
// Try to help the compiler by providing predictable start/end offsets.
|
||||
int i = Arrays.binarySearch(values, 0, 128, target);
|
||||
if (i < 0) {
|
||||
i = -1 - i;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void binarySearch3() {
|
||||
for (int i = 0; i < startIndexes.length; ++i) {
|
||||
binarySearch3(values, targets[i], startIndexes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
|
||||
private static int binarySearch3(long[] values, long target, int startIndex) {
|
||||
// Organize code the same way as suggested in https://quickwit.io/blog/search-a-sorted-block,
|
||||
// which proved to help with LLVM.
|
||||
int start = 0;
|
||||
int length = 128;
|
||||
|
||||
while (length > 1) {
|
||||
length /= 2;
|
||||
if (values[start + length - 1] < target) {
|
||||
start += length;
|
||||
}
|
||||
}
|
||||
return start;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void binarySearch4() {
|
||||
for (int i = 0; i < startIndexes.length; ++i) {
|
||||
binarySearch4(values, targets[i], startIndexes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
|
||||
private static int binarySearch4(long[] values, long target, int startIndex) {
|
||||
// Explicitly inline the binary-search logic to see if it helps the compiler.
|
||||
int start = 0;
|
||||
|
||||
if (values[63] < target) {
|
||||
start += 64;
|
||||
}
|
||||
if (values[start + 31] < target) {
|
||||
start += 32;
|
||||
}
|
||||
if (values[start + 15] < target) {
|
||||
start += 16;
|
||||
}
|
||||
if (values[start + 7] < target) {
|
||||
start += 8;
|
||||
}
|
||||
if (values[start + 3] < target) {
|
||||
start += 4;
|
||||
}
|
||||
if (values[start + 1] < target) {
|
||||
start += 2;
|
||||
}
|
||||
if (values[start] < target) {
|
||||
start += 1;
|
||||
}
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void binarySearch5() {
|
||||
for (int i = 0; i < startIndexes.length; ++i) {
|
||||
binarySearch5(values, targets[i], startIndexes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
|
||||
private static int binarySearch5(long[] values, long target, int startIndex) {
|
||||
// Other way to write a binary search
|
||||
int start = 0;
|
||||
|
||||
for (int shift = 6; shift >= 0; --shift) {
|
||||
int halfRange = 1 << shift;
|
||||
if (values[start + halfRange - 1] < target) {
|
||||
start += halfRange;
|
||||
}
|
||||
}
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void binarySearch6() {
|
||||
for (int i = 0; i < startIndexes.length; ++i) {
|
||||
binarySearch6(values, targets[i], startIndexes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
|
||||
private static int binarySearch6(long[] values, long target, int startIndex) {
|
||||
// Other way to write a binary search
|
||||
int start = 0;
|
||||
|
||||
for (int halfRange = 64; halfRange > 0; halfRange >>= 1) {
|
||||
if (values[start + halfRange - 1] < target) {
|
||||
start += halfRange;
|
||||
}
|
||||
}
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void linearSearch() {
|
||||
for (int i = 0; i < startIndexes.length; ++i) {
|
||||
linearSearch(values, targets[i], startIndexes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
|
||||
private static int linearSearch(long[] values, long target, int startIndex) {
|
||||
// Naive linear search.
|
||||
for (int i = startIndex; i < values.length; ++i) {
|
||||
if (values[i] >= target) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return values.length;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void bruteForceSearch() {
|
||||
for (int i = 0; i < startIndexes.length; ++i) {
|
||||
bruteForceSearch(values, targets[i], startIndexes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
|
||||
private static int bruteForceSearch(long[] values, long target, int startIndex) {
|
||||
// Linear search with predictable start/end offsets to see if it helps the compiler.
|
||||
for (int i = 0; i < 128; ++i) {
|
||||
if (values[i] >= target) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return values.length;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void linearSearch2() {
|
||||
for (int i = 0; i < startIndexes.length; ++i) {
|
||||
linearSearch2(values, targets[i], startIndexes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
|
||||
private static int linearSearch2(long[] values, long target, int startIndex) {
|
||||
// Two-level linear search, first checking every 8-th value, then values within an 8-value range
|
||||
int rangeStart = values.length - 8;
|
||||
|
||||
for (int i = startIndex; i + 8 <= values.length; i += 8) {
|
||||
if (values[i + 7] >= target) {
|
||||
rangeStart = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
if (values[rangeStart + i] >= target) {
|
||||
return rangeStart + i;
|
||||
}
|
||||
}
|
||||
|
||||
return values.length;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void linearSearch3() {
|
||||
for (int i = 0; i < startIndexes.length; ++i) {
|
||||
linearSearch3(values, targets[i], startIndexes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
|
||||
private static int linearSearch3(long[] values, long target, int startIndex) {
|
||||
// Iteration over linearSearch that tries to reduce branches
|
||||
while (startIndex + 4 <= values.length) {
|
||||
int count = values[startIndex] < target ? 1 : 0;
|
||||
if (values[startIndex + 1] < target) {
|
||||
count++;
|
||||
}
|
||||
if (values[startIndex + 2] < target) {
|
||||
count++;
|
||||
}
|
||||
if (values[startIndex + 3] < target) {
|
||||
count++;
|
||||
}
|
||||
if (count != 4) {
|
||||
return startIndex + count;
|
||||
}
|
||||
startIndex += 4;
|
||||
}
|
||||
|
||||
for (int i = startIndex; i < values.length; ++i) {
|
||||
if (values[i] >= target) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
return values.length;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void hybridSearch() {
|
||||
for (int i = 0; i < startIndexes.length; ++i) {
|
||||
hybridSearch(values, targets[i], startIndexes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
|
||||
private static int hybridSearch(long[] values, long target, int startIndex) {
|
||||
// Two-level linear search, first checking every 8-th value, then values within an 8-value range
|
||||
int rangeStart = values.length - 8;
|
||||
|
||||
for (int i = startIndex; i + 8 <= values.length; i += 8) {
|
||||
if (values[i + 7] >= target) {
|
||||
rangeStart = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return binarySearchHelper8(values, target, rangeStart);
|
||||
}
|
||||
|
||||
// branchless binary search over 8 values
|
||||
private static int binarySearchHelper8(long[] values, long target, int start) {
|
||||
if (values[start + 3] < target) {
|
||||
start += 4;
|
||||
}
|
||||
if (values[start + 1] < target) {
|
||||
start += 2;
|
||||
}
|
||||
if (values[start] < target) {
|
||||
start += 1;
|
||||
}
|
||||
return start;
|
||||
}
|
||||
|
||||
private static void assertEquals(int expected, int actual) {
|
||||
if (expected != actual) {
|
||||
throw new AssertionError("Expected: " + expected + ", got " + actual);
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
// For testing purposes
|
||||
long[] values = new long[129];
|
||||
for (int i = 0; i < 128; ++i) {
|
||||
values[i] = i;
|
||||
}
|
||||
values[128] = DocIdSetIterator.NO_MORE_DOCS;
|
||||
for (int start = 0; start < 128; ++start) {
|
||||
for (int targetIndex = start; targetIndex < 128; ++targetIndex) {
|
||||
int actualIndex = binarySearch(values, values[targetIndex], start);
|
||||
assertEquals(targetIndex, actualIndex);
|
||||
actualIndex = binarySearch2(values, values[targetIndex], start);
|
||||
assertEquals(targetIndex, actualIndex);
|
||||
actualIndex = binarySearch3(values, values[targetIndex], start);
|
||||
assertEquals(targetIndex, actualIndex);
|
||||
actualIndex = binarySearch4(values, values[targetIndex], start);
|
||||
assertEquals(targetIndex, actualIndex);
|
||||
actualIndex = binarySearch5(values, values[targetIndex], start);
|
||||
assertEquals(targetIndex, actualIndex);
|
||||
actualIndex = binarySearch6(values, values[targetIndex], start);
|
||||
assertEquals(targetIndex, actualIndex);
|
||||
actualIndex = bruteForceSearch(values, values[targetIndex], start);
|
||||
assertEquals(targetIndex, actualIndex);
|
||||
actualIndex = hybridSearch(values, values[targetIndex], start);
|
||||
assertEquals(targetIndex, actualIndex);
|
||||
actualIndex = linearSearch(values, values[targetIndex], start);
|
||||
assertEquals(targetIndex, actualIndex);
|
||||
actualIndex = linearSearch2(values, values[targetIndex], start);
|
||||
assertEquals(targetIndex, actualIndex);
|
||||
actualIndex = linearSearch3(values, values[targetIndex], start);
|
||||
assertEquals(targetIndex, actualIndex);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,75 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.benchmark.jmh;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.apache.lucene.util.VectorUtil;
|
||||
import org.openjdk.jmh.annotations.Benchmark;
|
||||
import org.openjdk.jmh.annotations.BenchmarkMode;
|
||||
import org.openjdk.jmh.annotations.Fork;
|
||||
import org.openjdk.jmh.annotations.Measurement;
|
||||
import org.openjdk.jmh.annotations.Mode;
|
||||
import org.openjdk.jmh.annotations.OutputTimeUnit;
|
||||
import org.openjdk.jmh.annotations.Param;
|
||||
import org.openjdk.jmh.annotations.Scope;
|
||||
import org.openjdk.jmh.annotations.Setup;
|
||||
import org.openjdk.jmh.annotations.State;
|
||||
import org.openjdk.jmh.annotations.Warmup;
|
||||
|
||||
@Fork(1)
|
||||
@Warmup(iterations = 3, time = 3)
|
||||
@Measurement(iterations = 5, time = 3)
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
@OutputTimeUnit(TimeUnit.SECONDS)
|
||||
@State(Scope.Benchmark)
|
||||
public class HammingDistanceBenchmark {
|
||||
@Param({"1000000"})
|
||||
int nb = 1_000_000;
|
||||
|
||||
@Param({"1024"})
|
||||
int dims = 1024;
|
||||
|
||||
byte[][] xb;
|
||||
byte[] xq;
|
||||
|
||||
@Setup
|
||||
public void setup() throws IOException {
|
||||
Random rand = new Random();
|
||||
this.xb = new byte[nb][dims / 8];
|
||||
for (int i = 0; i < nb; i++) {
|
||||
for (int j = 0; j < dims / 8; j++) {
|
||||
xb[i][j] = (byte) rand.nextInt(0, 255);
|
||||
}
|
||||
}
|
||||
this.xq = new byte[dims / 8];
|
||||
for (int i = 0; i < xq.length; i++) {
|
||||
xq[i] = (byte) rand.nextInt(0, 255);
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public int xorBitCount() {
|
||||
int tot = 0;
|
||||
for (int i = 0; i < nb; i++) {
|
||||
tot += VectorUtil.xorBitCount(xb[i], xq);
|
||||
}
|
||||
return tot;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,108 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.benchmark.jmh;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.apache.lucene.codecs.lucene912.ForDeltaUtil;
|
||||
import org.apache.lucene.codecs.lucene912.ForUtil;
|
||||
import org.apache.lucene.codecs.lucene912.PostingIndexInput;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.store.MMapDirectory;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.openjdk.jmh.annotations.Benchmark;
|
||||
import org.openjdk.jmh.annotations.BenchmarkMode;
|
||||
import org.openjdk.jmh.annotations.Fork;
|
||||
import org.openjdk.jmh.annotations.Level;
|
||||
import org.openjdk.jmh.annotations.Measurement;
|
||||
import org.openjdk.jmh.annotations.Mode;
|
||||
import org.openjdk.jmh.annotations.OutputTimeUnit;
|
||||
import org.openjdk.jmh.annotations.Param;
|
||||
import org.openjdk.jmh.annotations.Scope;
|
||||
import org.openjdk.jmh.annotations.Setup;
|
||||
import org.openjdk.jmh.annotations.State;
|
||||
import org.openjdk.jmh.annotations.TearDown;
|
||||
import org.openjdk.jmh.annotations.Warmup;
|
||||
import org.openjdk.jmh.infra.Blackhole;
|
||||
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
@OutputTimeUnit(TimeUnit.MICROSECONDS)
|
||||
@State(Scope.Benchmark)
|
||||
@Warmup(iterations = 5, time = 1)
|
||||
@Measurement(iterations = 5, time = 1)
|
||||
@Fork(
|
||||
value = 3,
|
||||
jvmArgsAppend = {"-Xmx1g", "-Xms1g", "-XX:+AlwaysPreTouch"})
|
||||
public class PostingIndexInputBenchmark {
|
||||
|
||||
private Path path;
|
||||
private Directory dir;
|
||||
private IndexInput in;
|
||||
private PostingIndexInput postingIn;
|
||||
private final ForUtil forUtil = new ForUtil();
|
||||
private final ForDeltaUtil forDeltaUtil = new ForDeltaUtil();
|
||||
private final long[] values = new long[128];
|
||||
|
||||
@Param({"2", "3", "4", "5", "6", "7", "8", "9", "10"})
|
||||
public int bpv;
|
||||
|
||||
@Setup(Level.Trial)
|
||||
public void setup() throws Exception {
|
||||
path = Files.createTempDirectory("forUtil");
|
||||
dir = MMapDirectory.open(path);
|
||||
try (IndexOutput out = dir.createOutput("docs", IOContext.DEFAULT)) {
|
||||
Random r = new Random(0);
|
||||
// Write enough random data to not reach EOF while decoding
|
||||
for (int i = 0; i < 100; ++i) {
|
||||
out.writeLong(r.nextLong());
|
||||
}
|
||||
}
|
||||
in = dir.openInput("docs", IOContext.DEFAULT);
|
||||
postingIn = new PostingIndexInput(in, forUtil, forDeltaUtil);
|
||||
}
|
||||
|
||||
@TearDown(Level.Trial)
|
||||
public void tearDown() throws Exception {
|
||||
if (dir != null) {
|
||||
dir.deleteFile("docs");
|
||||
}
|
||||
IOUtils.close(in, dir);
|
||||
in = null;
|
||||
dir = null;
|
||||
Files.deleteIfExists(path);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void decode(Blackhole bh) throws IOException {
|
||||
in.seek(3); // random unaligned offset
|
||||
postingIn.decode(bpv, values);
|
||||
bh.consume(values);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void decodeAndPrefixSum(Blackhole bh) throws IOException {
|
||||
in.seek(3); // random unaligned offset
|
||||
postingIn.decodeAndPrefixSum(bpv, 100, values);
|
||||
bh.consume(values);
|
||||
}
|
||||
}
|
|
@ -17,11 +17,10 @@
|
|||
# -------------------------------------------------------------------------------------
|
||||
# multi val params are iterated by NewRound's, added to reports, start with column name.
|
||||
|
||||
# collector.class can be:
|
||||
# Fully Qualified Class Name of a Collector with a empty constructor
|
||||
# topScoreDocOrdered - Creates a TopScoreDocCollector that requires in order docs
|
||||
# topScoreDocUnordered - Like above, but allows out of order
|
||||
collector.class=coll:topScoreDoc
|
||||
# collector.manager.class can be:
|
||||
# Fully Qualified Class Name of a CollectorManager with a empty constructor
|
||||
# topScoreDoc - Creates a TopScoreDocCollectorManager
|
||||
collector.manager.class=coll:topScoreDoc
|
||||
|
||||
analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer
|
||||
directory=FSDirectory
|
||||
|
|
|
@ -17,11 +17,10 @@
|
|||
# -------------------------------------------------------------------------------------
|
||||
# multi val params are iterated by NewRound's, added to reports, start with column name.
|
||||
|
||||
# collector.class can be:
|
||||
# Fully Qualified Class Name of a Collector with a empty constructor
|
||||
# topScoreDocOrdered - Creates a TopScoreDocCollector that requires in order docs
|
||||
# topScoreDocUnordered - Like above, but allows out of order
|
||||
collector.class=coll:topScoreDoc
|
||||
# collector.manager.class can be:
|
||||
# Fully Qualified Class Name of a CollectorManager with a empty constructor
|
||||
# topScoreDoc - Creates a TopScoreDocCollectorManager
|
||||
collector.manager.class=coll:topScoreDoc
|
||||
|
||||
analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer
|
||||
directory=FSDirectory
|
||||
|
|
|
@ -238,7 +238,7 @@ public class EnwikiContentSource extends ContentSource {
|
|||
time = null;
|
||||
id = null;
|
||||
break;
|
||||
// intentional fall-through.
|
||||
// intentional fall-through.
|
||||
case BODY:
|
||||
case DATE:
|
||||
case TITLE:
|
||||
|
|
|
@ -99,7 +99,7 @@ public class SpatialDocMaker extends DocMaker {
|
|||
return makeRPTStrategy(SPATIAL_FIELD, config, configMap, ctx);
|
||||
case "composite":
|
||||
return makeCompositeStrategy(config, configMap, ctx);
|
||||
// TODO add more as-needed
|
||||
// TODO add more as-needed
|
||||
default:
|
||||
throw new IllegalStateException("Unknown spatial.strategy: " + strategyName);
|
||||
}
|
||||
|
|
|
@ -24,7 +24,7 @@ import org.apache.lucene.index.DirectoryReader;
|
|||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.MultiBits;
|
||||
import org.apache.lucene.index.StoredFields;
|
||||
import org.apache.lucene.search.Collector;
|
||||
import org.apache.lucene.search.CollectorManager;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
|
@ -119,9 +119,7 @@ public abstract class ReadTask extends PerfTask {
|
|||
hits = searcher.search(q, numHits);
|
||||
}
|
||||
} else {
|
||||
Collector collector = createCollector();
|
||||
|
||||
searcher.search(q, collector);
|
||||
searcher.search(q, createCollectorManager());
|
||||
// hits = collector.topDocs();
|
||||
}
|
||||
|
||||
|
@ -184,9 +182,8 @@ public abstract class ReadTask extends PerfTask {
|
|||
return res;
|
||||
}
|
||||
|
||||
protected Collector createCollector() throws Exception {
|
||||
return new TopScoreDocCollectorManager(numHits(), withTotalHits() ? Integer.MAX_VALUE : 1)
|
||||
.newCollector();
|
||||
protected CollectorManager<?, ?> createCollectorManager() throws Exception {
|
||||
return new TopScoreDocCollectorManager(numHits(), withTotalHits() ? Integer.MAX_VALUE : 1);
|
||||
}
|
||||
|
||||
protected Document retrieveDoc(StoredFields storedFields, int id) throws IOException {
|
||||
|
|
|
@ -19,8 +19,8 @@ package org.apache.lucene.benchmark.byTask.tasks;
|
|||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
|
||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||
import org.apache.lucene.search.Collector;
|
||||
import org.apache.lucene.search.TopScoreDocCollector;
|
||||
import org.apache.lucene.search.CollectorManager;
|
||||
import org.apache.lucene.search.TopScoreDocCollectorManager;
|
||||
|
||||
/** Does search w/ a custom collector */
|
||||
public class SearchWithCollectorTask extends SearchTask {
|
||||
|
@ -37,7 +37,11 @@ public class SearchWithCollectorTask extends SearchTask {
|
|||
// check to make sure either the doc is being stored
|
||||
PerfRunData runData = getRunData();
|
||||
Config config = runData.getConfig();
|
||||
clnName = config.get("collector.class", "");
|
||||
if (config.get("collector.class", null) != null) {
|
||||
throw new IllegalArgumentException(
|
||||
"collector.class is no longer supported as a config parameter, use collector.manager.class instead to provide a CollectorManager class name");
|
||||
}
|
||||
clnName = config.get("collector.manager.class", "");
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -46,17 +50,17 @@ public class SearchWithCollectorTask extends SearchTask {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected Collector createCollector() throws Exception {
|
||||
Collector collector = null;
|
||||
protected CollectorManager<?, ?> createCollectorManager() throws Exception {
|
||||
CollectorManager<?, ?> collectorManager;
|
||||
if (clnName.equalsIgnoreCase("topScoreDoc") == true) {
|
||||
collector = TopScoreDocCollector.create(numHits(), Integer.MAX_VALUE);
|
||||
collectorManager = new TopScoreDocCollectorManager(numHits(), Integer.MAX_VALUE);
|
||||
} else if (clnName.length() > 0) {
|
||||
collector = Class.forName(clnName).asSubclass(Collector.class).getConstructor().newInstance();
|
||||
|
||||
collectorManager =
|
||||
Class.forName(clnName).asSubclass(CollectorManager.class).getConstructor().newInstance();
|
||||
} else {
|
||||
collector = super.createCollector();
|
||||
collectorManager = super.createCollectorManager();
|
||||
}
|
||||
return collector;
|
||||
return collectorManager;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -23,13 +23,13 @@ import org.apache.lucene.codecs.PostingsFormat;
|
|||
import org.apache.lucene.codecs.PostingsReaderBase;
|
||||
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene99PostingsWriter}. */
|
||||
/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene912PostingsWriter}. */
|
||||
public class BlockTreeOrdsPostingsFormat extends PostingsFormat {
|
||||
|
||||
private final int minTermBlockSize;
|
||||
|
@ -67,7 +67,7 @@ public class BlockTreeOrdsPostingsFormat extends PostingsFormat {
|
|||
|
||||
@Override
|
||||
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
|
||||
PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
|
||||
PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
|
||||
|
||||
boolean success = false;
|
||||
try {
|
||||
|
@ -84,7 +84,7 @@ public class BlockTreeOrdsPostingsFormat extends PostingsFormat {
|
|||
|
||||
@Override
|
||||
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
|
||||
PostingsReaderBase postingsReader = new Lucene99PostingsReader(state);
|
||||
PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
|
||||
boolean success = false;
|
||||
try {
|
||||
FieldsProducer ret = new OrdsBlockTreeTermsReader(postingsReader, state);
|
||||
|
|
|
@ -43,6 +43,7 @@ import org.apache.lucene.store.ChecksumIndexInput;
|
|||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IOBooleanSupplier;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||
|
||||
|
@ -315,12 +316,21 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat {
|
|||
}
|
||||
|
||||
@Override
|
||||
public boolean seekExact(BytesRef text) throws IOException {
|
||||
public IOBooleanSupplier prepareSeekExact(BytesRef text) throws IOException {
|
||||
// The magical fail-fast speed up that is the entire point of all of
|
||||
// this code - save a disk seek if there is a match on an in-memory
|
||||
// structure
|
||||
// that may occasionally give a false positive but guaranteed no false
|
||||
// negatives
|
||||
if (filter.contains(text) == ContainsResult.NO) {
|
||||
return null;
|
||||
}
|
||||
return delegate().prepareSeekExact(text);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean seekExact(BytesRef text) throws IOException {
|
||||
// See #prepareSeekExact
|
||||
if (filter.contains(text) == ContainsResult.NO) {
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -24,7 +24,7 @@ import java.util.TreeMap;
|
|||
import org.apache.lucene.codecs.FieldsConsumer;
|
||||
import org.apache.lucene.codecs.FieldsProducer;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
|
||||
import org.apache.lucene.index.BaseTermsEnum;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.Fields;
|
||||
|
@ -54,7 +54,7 @@ import org.apache.lucene.util.automaton.TransitionAccessor;
|
|||
// - or: longer dense skip lists than just next byte?
|
||||
|
||||
/**
|
||||
* Wraps {@link Lucene99PostingsFormat} format for on-disk storage, but then at read time loads and
|
||||
* Wraps {@link Lucene912PostingsFormat} format for on-disk storage, but then at read time loads and
|
||||
* stores all terms and postings directly in RAM as byte[], int[].
|
||||
*
|
||||
* <p><b>WARNING</b>: This is exceptionally RAM intensive: it makes no effort to compress the
|
||||
|
@ -97,12 +97,12 @@ public final class DirectPostingsFormat extends PostingsFormat {
|
|||
|
||||
@Override
|
||||
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
|
||||
return PostingsFormat.forName("Lucene99").fieldsConsumer(state);
|
||||
return PostingsFormat.forName("Lucene912").fieldsConsumer(state);
|
||||
}
|
||||
|
||||
@Override
|
||||
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
|
||||
FieldsProducer postings = PostingsFormat.forName("Lucene99").fieldsProducer(state);
|
||||
FieldsProducer postings = PostingsFormat.forName("Lucene912").fieldsProducer(state);
|
||||
if (state.context.context() != IOContext.Context.MERGE) {
|
||||
FieldsProducer loadedPostings;
|
||||
try {
|
||||
|
|
|
@ -22,8 +22,8 @@ import org.apache.lucene.codecs.FieldsProducer;
|
|||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.PostingsReaderBase;
|
||||
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
|
||||
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
|
||||
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
@ -41,7 +41,7 @@ public final class FSTPostingsFormat extends PostingsFormat {
|
|||
|
||||
@Override
|
||||
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
|
||||
PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
|
||||
PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
|
||||
|
||||
boolean success = false;
|
||||
try {
|
||||
|
@ -57,7 +57,7 @@ public final class FSTPostingsFormat extends PostingsFormat {
|
|||
|
||||
@Override
|
||||
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
|
||||
PostingsReaderBase postingsReader = new Lucene99PostingsReader(state);
|
||||
PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
|
||||
boolean success = false;
|
||||
try {
|
||||
FieldsProducer ret = new FSTTermsReader(state, postingsReader);
|
||||
|
|
|
@ -195,9 +195,10 @@ public class FSTTermsReader extends FieldsProducer {
|
|||
this.sumTotalTermFreq = sumTotalTermFreq;
|
||||
this.sumDocFreq = sumDocFreq;
|
||||
this.docCount = docCount;
|
||||
OffHeapFSTStore offHeapFSTStore = new OffHeapFSTStore();
|
||||
FSTTermOutputs outputs = new FSTTermOutputs(fieldInfo);
|
||||
this.dict = new FST<>(FST.readMetadata(in, outputs), in, offHeapFSTStore);
|
||||
final var fstMetadata = FST.readMetadata(in, outputs);
|
||||
OffHeapFSTStore offHeapFSTStore = new OffHeapFSTStore(in, in.getFilePointer(), fstMetadata);
|
||||
this.dict = FST.fromFSTReader(fstMetadata, offHeapFSTStore);
|
||||
in.skipBytes(offHeapFSTStore.size());
|
||||
}
|
||||
|
||||
|
|
|
@ -71,8 +71,8 @@ final class SimpleTextBKDReader extends PointValues {
|
|||
this.pointCount = pointCount;
|
||||
this.docCount = docCount;
|
||||
this.version = SimpleTextBKDWriter.VERSION_CURRENT;
|
||||
assert minPackedValue.length == config.packedIndexBytesLength;
|
||||
assert maxPackedValue.length == config.packedIndexBytesLength;
|
||||
assert minPackedValue.length == config.packedIndexBytesLength();
|
||||
assert maxPackedValue.length == config.packedIndexBytesLength();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -99,8 +99,8 @@ final class SimpleTextBKDReader extends PointValues {
|
|||
private SimpleTextPointTree(
|
||||
IndexInput in, int nodeID, int level, byte[] minPackedValue, byte[] maxPackedValue) {
|
||||
this.in = in;
|
||||
this.scratchDocIDs = new int[config.maxPointsInLeafNode];
|
||||
this.scratchPackedValue = new byte[config.packedBytesLength];
|
||||
this.scratchDocIDs = new int[config.maxPointsInLeafNode()];
|
||||
this.scratchPackedValue = new byte[config.packedBytesLength()];
|
||||
this.nodeID = nodeID;
|
||||
this.rootNode = nodeID;
|
||||
this.level = level;
|
||||
|
@ -145,38 +145,39 @@ final class SimpleTextBKDReader extends PointValues {
|
|||
private void pushLeft() {
|
||||
int address = nodeID * bytesPerIndexEntry;
|
||||
// final int splitDimPos;
|
||||
if (config.numIndexDims == 1) {
|
||||
if (config.numIndexDims() == 1) {
|
||||
splitDims[level] = 0;
|
||||
} else {
|
||||
splitDims[level] = (splitPackedValues[address++] & 0xff);
|
||||
}
|
||||
final int splitDimPos = splitDims[level] * config.bytesPerDim;
|
||||
final int splitDimPos = splitDims[level] * config.bytesPerDim();
|
||||
if (splitDimValueStack[level] == null) {
|
||||
splitDimValueStack[level] = new byte[config.bytesPerDim];
|
||||
splitDimValueStack[level] = new byte[config.bytesPerDim()];
|
||||
}
|
||||
// save the dimension we are going to change
|
||||
System.arraycopy(
|
||||
maxPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim);
|
||||
maxPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim());
|
||||
assert Arrays.compareUnsigned(
|
||||
maxPackedValue,
|
||||
splitDimPos,
|
||||
splitDimPos + config.bytesPerDim,
|
||||
splitDimPos + config.bytesPerDim(),
|
||||
splitPackedValues,
|
||||
address,
|
||||
address + config.bytesPerDim)
|
||||
address + config.bytesPerDim())
|
||||
>= 0
|
||||
: "config.bytesPerDim="
|
||||
+ config.bytesPerDim
|
||||
: "config.bytesPerDim()="
|
||||
+ config.bytesPerDim()
|
||||
+ " splitDim="
|
||||
+ splitDims[level]
|
||||
+ " config.numIndexDims="
|
||||
+ config.numIndexDims
|
||||
+ " config.numIndexDims()="
|
||||
+ config.numIndexDims()
|
||||
+ " config.numDims="
|
||||
+ config.numDims;
|
||||
+ config.numDims();
|
||||
nodeID *= 2;
|
||||
level++;
|
||||
// add the split dim value:
|
||||
System.arraycopy(splitPackedValues, address, maxPackedValue, splitDimPos, config.bytesPerDim);
|
||||
System.arraycopy(
|
||||
splitPackedValues, address, maxPackedValue, splitDimPos, config.bytesPerDim());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -191,37 +192,38 @@ final class SimpleTextBKDReader extends PointValues {
|
|||
|
||||
private void pushRight() {
|
||||
int address = nodeID * bytesPerIndexEntry;
|
||||
if (config.numIndexDims == 1) {
|
||||
if (config.numIndexDims() == 1) {
|
||||
splitDims[level] = 0;
|
||||
} else {
|
||||
splitDims[level] = (splitPackedValues[address++] & 0xff);
|
||||
}
|
||||
final int splitDimPos = splitDims[level] * config.bytesPerDim;
|
||||
final int splitDimPos = splitDims[level] * config.bytesPerDim();
|
||||
// we should have already visit the left node
|
||||
assert splitDimValueStack[level] != null;
|
||||
// save the dimension we are going to change
|
||||
System.arraycopy(
|
||||
minPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim);
|
||||
minPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim());
|
||||
assert Arrays.compareUnsigned(
|
||||
minPackedValue,
|
||||
splitDimPos,
|
||||
splitDimPos + config.bytesPerDim,
|
||||
splitDimPos + config.bytesPerDim(),
|
||||
splitPackedValues,
|
||||
address,
|
||||
address + config.bytesPerDim)
|
||||
address + config.bytesPerDim())
|
||||
<= 0
|
||||
: "config.bytesPerDim="
|
||||
+ config.bytesPerDim
|
||||
: "config.bytesPerDim()="
|
||||
+ config.bytesPerDim()
|
||||
+ " splitDim="
|
||||
+ splitDims[level]
|
||||
+ " config.numIndexDims="
|
||||
+ config.numIndexDims
|
||||
+ " config.numIndexDims()="
|
||||
+ config.numIndexDims()
|
||||
+ " config.numDims="
|
||||
+ config.numDims;
|
||||
+ config.numDims();
|
||||
nodeID = 2 * nodeID + 1;
|
||||
level++;
|
||||
// add the split dim value:
|
||||
      System.arraycopy(splitPackedValues, address, minPackedValue, splitDimPos, config.bytesPerDim);
      System.arraycopy(
          splitPackedValues, address, minPackedValue, splitDimPos, config.bytesPerDim());
    }

    @Override
@@ -242,16 +244,16 @@ final class SimpleTextBKDReader extends PointValues {
              splitDimValueStack[level],
              0,
              maxPackedValue,
              splitDims[level] * config.bytesPerDim,
              config.bytesPerDim);
              splitDims[level] * config.bytesPerDim(),
              config.bytesPerDim());
        } else {

          System.arraycopy(
              splitDimValueStack[level],
              0,
              minPackedValue,
              splitDims[level] * config.bytesPerDim,
              config.bytesPerDim);
              splitDims[level] * config.bytesPerDim(),
              config.bytesPerDim());
        }
      }

@@ -290,7 +292,7 @@ final class SimpleTextBKDReader extends PointValues {
  private long sizeFromBalancedTree(int leftMostLeafNode, int rightMostLeafNode) {
    // number of points that need to be distributed between leaves, one per leaf
    final int extraPoints =
        Math.toIntExact(((long) config.maxPointsInLeafNode * leafNodeOffset) - pointCount);
        Math.toIntExact(((long) config.maxPointsInLeafNode() * leafNodeOffset) - pointCount);
    assert extraPoints < leafNodeOffset : "point excess should be lower than leafNodeOffset";
    // offset where we stop adding one point to the leaves
    final int nodeOffset = leafNodeOffset - extraPoints;
@@ -298,9 +300,9 @@ final class SimpleTextBKDReader extends PointValues {
    for (int node = leftMostLeafNode; node <= rightMostLeafNode; node++) {
      // offsetPosition provides which extra point will be added to this node
      if (balanceTreeNodePosition(0, leafNodeOffset, node - leafNodeOffset, 0, 0) < nodeOffset) {
        count += config.maxPointsInLeafNode;
        count += config.maxPointsInLeafNode();
      } else {
        count += config.maxPointsInLeafNode - 1;
        count += config.maxPointsInLeafNode() - 1;
      }
    }
    return count;
@@ -376,14 +378,14 @@ final class SimpleTextBKDReader extends PointValues {
        // Again, this time reading values and checking with the visitor
        visitor.grow(count);
        // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
        assert scratchPackedValue.length == config.packedBytesLength;
        assert scratchPackedValue.length == config.packedBytesLength();
        BytesRefBuilder scratch = new BytesRefBuilder();
        for (int i = 0; i < count; i++) {
          readLine(in, scratch);
          assert startsWith(scratch, BLOCK_VALUE);
          BytesRef br = SimpleTextUtil.fromBytesRefString(stripPrefix(scratch, BLOCK_VALUE));
          assert br.length == config.packedBytesLength;
          System.arraycopy(br.bytes, br.offset, scratchPackedValue, 0, config.packedBytesLength);
          assert br.length == config.packedBytesLength();
          System.arraycopy(br.bytes, br.offset, scratchPackedValue, 0, config.packedBytesLength());
          visitor.visit(scratchDocIDs[i], scratchPackedValue);
        }
      } else {
@@ -443,17 +445,17 @@ final class SimpleTextBKDReader extends PointValues {

  @Override
  public int getNumDimensions() throws IOException {
    return config.numDims;
    return config.numDims();
  }

  @Override
  public int getNumIndexDimensions() throws IOException {
    return config.numIndexDims;
    return config.numIndexDims();
  }

  @Override
  public int getBytesPerDimension() throws IOException {
    return config.bytesPerDim;
    return config.bytesPerDim();
  }

  @Override
|
|
|
@ -144,28 +144,28 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
this.maxDoc = maxDoc;
|
||||
docsSeen = new FixedBitSet(maxDoc);
|
||||
|
||||
scratchDiff = new byte[config.bytesPerDim];
|
||||
scratch1 = new byte[config.packedBytesLength];
|
||||
scratch2 = new byte[config.packedBytesLength];
|
||||
commonPrefixLengths = new int[config.numDims];
|
||||
scratchDiff = new byte[config.bytesPerDim()];
|
||||
scratch1 = new byte[config.packedBytesLength()];
|
||||
scratch2 = new byte[config.packedBytesLength()];
|
||||
commonPrefixLengths = new int[config.numDims()];
|
||||
|
||||
minPackedValue = new byte[config.packedIndexBytesLength];
|
||||
maxPackedValue = new byte[config.packedIndexBytesLength];
|
||||
minPackedValue = new byte[config.packedIndexBytesLength()];
|
||||
maxPackedValue = new byte[config.packedIndexBytesLength()];
|
||||
|
||||
// Maximum number of points we hold in memory at any time
|
||||
maxPointsSortInHeap =
|
||||
(int) ((maxMBSortInHeap * 1024 * 1024) / (config.bytesPerDoc * config.numDims));
|
||||
(int) ((maxMBSortInHeap * 1024 * 1024) / (config.bytesPerDoc() * config.numDims()));
|
||||
|
||||
// Finally, we must be able to hold at least the leaf node in heap during build:
|
||||
if (maxPointsSortInHeap < config.maxPointsInLeafNode) {
|
||||
if (maxPointsSortInHeap < config.maxPointsInLeafNode()) {
|
||||
throw new IllegalArgumentException(
|
||||
"maxMBSortInHeap="
|
||||
+ maxMBSortInHeap
|
||||
+ " only allows for maxPointsSortInHeap="
|
||||
+ maxPointsSortInHeap
|
||||
+ ", but this is less than config.maxPointsInLeafNode="
|
||||
+ config.maxPointsInLeafNode
|
||||
+ "; either increase maxMBSortInHeap or decrease config.maxPointsInLeafNode");
|
||||
+ ", but this is less than config.maxPointsInLeafNode()="
|
||||
+ config.maxPointsInLeafNode()
|
||||
+ "; either increase maxMBSortInHeap or decrease config.maxPointsInLeafNode()");
|
||||
}
|
||||
|
||||
this.maxMBSortInHeap = maxMBSortInHeap;
|
||||
|
@ -183,10 +183,10 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
}
|
||||
|
||||
public void add(byte[] packedValue, int docID) throws IOException {
|
||||
if (packedValue.length != config.packedBytesLength) {
|
||||
if (packedValue.length != config.packedBytesLength()) {
|
||||
throw new IllegalArgumentException(
|
||||
"packedValue should be length="
|
||||
+ config.packedBytesLength
|
||||
+ config.packedBytesLength()
|
||||
+ " (got: "
|
||||
+ packedValue.length
|
||||
+ ")");
|
||||
|
@ -209,30 +209,30 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
} else {
|
||||
pointWriter = new HeapPointWriter(config, Math.toIntExact(totalPointCount));
|
||||
}
|
||||
System.arraycopy(packedValue, 0, minPackedValue, 0, config.packedIndexBytesLength);
|
||||
System.arraycopy(packedValue, 0, maxPackedValue, 0, config.packedIndexBytesLength);
|
||||
System.arraycopy(packedValue, 0, minPackedValue, 0, config.packedIndexBytesLength());
|
||||
System.arraycopy(packedValue, 0, maxPackedValue, 0, config.packedIndexBytesLength());
|
||||
} else {
|
||||
for (int dim = 0; dim < config.numIndexDims; dim++) {
|
||||
int offset = dim * config.bytesPerDim;
|
||||
for (int dim = 0; dim < config.numIndexDims(); dim++) {
|
||||
int offset = dim * config.bytesPerDim();
|
||||
if (Arrays.compareUnsigned(
|
||||
packedValue,
|
||||
offset,
|
||||
offset + config.bytesPerDim,
|
||||
offset + config.bytesPerDim(),
|
||||
minPackedValue,
|
||||
offset,
|
||||
offset + config.bytesPerDim)
|
||||
offset + config.bytesPerDim())
|
||||
< 0) {
|
||||
System.arraycopy(packedValue, offset, minPackedValue, offset, config.bytesPerDim);
|
||||
System.arraycopy(packedValue, offset, minPackedValue, offset, config.bytesPerDim());
|
||||
}
|
||||
if (Arrays.compareUnsigned(
|
||||
packedValue,
|
||||
offset,
|
||||
offset + config.bytesPerDim,
|
||||
offset + config.bytesPerDim(),
|
||||
maxPackedValue,
|
||||
offset,
|
||||
offset + config.bytesPerDim)
|
||||
offset + config.bytesPerDim())
|
||||
> 0) {
|
||||
System.arraycopy(packedValue, offset, maxPackedValue, offset, config.bytesPerDim);
|
||||
System.arraycopy(packedValue, offset, maxPackedValue, offset, config.bytesPerDim());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -254,7 +254,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
*/
|
||||
public long writeField(IndexOutput out, String fieldName, MutablePointTree reader)
|
||||
throws IOException {
|
||||
if (config.numIndexDims == 1) {
|
||||
if (config.numIndexDims() == 1) {
|
||||
return writeField1Dim(out, fieldName, reader);
|
||||
} else {
|
||||
return writeFieldNDims(out, fieldName, reader);
|
||||
|
@ -280,7 +280,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
long countPerLeaf = pointCount = values.size();
|
||||
long innerNodeCount = 1;
|
||||
|
||||
while (countPerLeaf > config.maxPointsInLeafNode) {
|
||||
while (countPerLeaf > config.maxPointsInLeafNode()) {
|
||||
countPerLeaf = (countPerLeaf + 1) / 2;
|
||||
innerNodeCount *= 2;
|
||||
}
|
||||
|
@ -289,7 +289,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
|
||||
checkMaxLeafNodeCount(numLeaves);
|
||||
|
||||
final byte[] splitPackedValues = new byte[numLeaves * (config.bytesPerDim + 1)];
|
||||
final byte[] splitPackedValues = new byte[numLeaves * (config.bytesPerDim() + 1)];
|
||||
final long[] leafBlockFPs = new long[numLeaves];
|
||||
|
||||
// compute the min/max for this slice
|
||||
|
@ -297,37 +297,37 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
Arrays.fill(maxPackedValue, (byte) 0);
|
||||
for (int i = 0; i < Math.toIntExact(pointCount); ++i) {
|
||||
values.getValue(i, scratchBytesRef1);
|
||||
for (int dim = 0; dim < config.numIndexDims; dim++) {
|
||||
int offset = dim * config.bytesPerDim;
|
||||
for (int dim = 0; dim < config.numIndexDims(); dim++) {
|
||||
int offset = dim * config.bytesPerDim();
|
||||
if (Arrays.compareUnsigned(
|
||||
scratchBytesRef1.bytes,
|
||||
scratchBytesRef1.offset + offset,
|
||||
scratchBytesRef1.offset + offset + config.bytesPerDim,
|
||||
scratchBytesRef1.offset + offset + config.bytesPerDim(),
|
||||
minPackedValue,
|
||||
offset,
|
||||
offset + config.bytesPerDim)
|
||||
offset + config.bytesPerDim())
|
||||
< 0) {
|
||||
System.arraycopy(
|
||||
scratchBytesRef1.bytes,
|
||||
scratchBytesRef1.offset + offset,
|
||||
minPackedValue,
|
||||
offset,
|
||||
config.bytesPerDim);
|
||||
config.bytesPerDim());
|
||||
}
|
||||
if (Arrays.compareUnsigned(
|
||||
scratchBytesRef1.bytes,
|
||||
scratchBytesRef1.offset + offset,
|
||||
scratchBytesRef1.offset + offset + config.bytesPerDim,
|
||||
scratchBytesRef1.offset + offset + config.bytesPerDim(),
|
||||
maxPackedValue,
|
||||
offset,
|
||||
offset + config.bytesPerDim)
|
||||
offset + config.bytesPerDim())
|
||||
> 0) {
|
||||
System.arraycopy(
|
||||
scratchBytesRef1.bytes,
|
||||
scratchBytesRef1.offset + offset,
|
||||
maxPackedValue,
|
||||
offset,
|
||||
config.bytesPerDim);
|
||||
config.bytesPerDim());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -345,7 +345,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
maxPackedValue,
|
||||
splitPackedValues,
|
||||
leafBlockFPs,
|
||||
new int[config.maxPointsInLeafNode]);
|
||||
new int[config.maxPointsInLeafNode()]);
|
||||
|
||||
long indexFP = out.getFilePointer();
|
||||
writeIndex(out, leafBlockFPs, splitPackedValues, Math.toIntExact(countPerLeaf));
|
||||
|
@ -387,15 +387,15 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
final IndexOutput out;
|
||||
final List<Long> leafBlockFPs = new ArrayList<>();
|
||||
final List<byte[]> leafBlockStartValues = new ArrayList<>();
|
||||
final byte[] leafValues = new byte[config.maxPointsInLeafNode * config.packedBytesLength];
|
||||
final int[] leafDocs = new int[config.maxPointsInLeafNode];
|
||||
final byte[] leafValues = new byte[config.maxPointsInLeafNode() * config.packedBytesLength()];
|
||||
final int[] leafDocs = new int[config.maxPointsInLeafNode()];
|
||||
long valueCount;
|
||||
int leafCount;
|
||||
|
||||
OneDimensionBKDWriter(IndexOutput out) {
|
||||
if (config.numIndexDims != 1) {
|
||||
if (config.numIndexDims() != 1) {
|
||||
throw new UnsupportedOperationException(
|
||||
"config.numIndexDims must be 1 but got " + config.numIndexDims);
|
||||
"config.numIndexDims() must be 1 but got " + config.numIndexDims());
|
||||
}
|
||||
if (pointCount != 0) {
|
||||
throw new IllegalStateException("cannot mix add and merge");
|
||||
|
@ -411,7 +411,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
|
||||
this.out = out;
|
||||
|
||||
lastPackedValue = new byte[config.packedBytesLength];
|
||||
lastPackedValue = new byte[config.packedBytesLength()];
|
||||
}
|
||||
|
||||
// for asserts
|
||||
|
@ -426,8 +426,8 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
packedValue,
|
||||
0,
|
||||
leafValues,
|
||||
leafCount * config.packedBytesLength,
|
||||
config.packedBytesLength);
|
||||
leafCount * config.packedBytesLength(),
|
||||
config.packedBytesLength());
|
||||
leafDocs[leafCount] = docID;
|
||||
docsSeen.set(docID);
|
||||
leafCount++;
|
||||
|
@ -441,7 +441,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
+ " values");
|
||||
}
|
||||
|
||||
if (leafCount == config.maxPointsInLeafNode) {
|
||||
if (leafCount == config.maxPointsInLeafNode()) {
|
||||
// We write a block once we hit exactly the max count ... this is different from
|
||||
// when we flush a new segment, where we write between max/2 and max per leaf block,
|
||||
// so merged segments will behave differently from newly flushed segments:
|
||||
|
@ -471,43 +471,44 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
// System.out.println("BKDW: now rotate numInnerNodes=" + numInnerNodes + " leafBlockStarts="
|
||||
// + leafBlockStartValues.size());
|
||||
|
||||
byte[] index = new byte[(1 + numInnerNodes) * (1 + config.bytesPerDim)];
|
||||
byte[] index = new byte[(1 + numInnerNodes) * (1 + config.bytesPerDim())];
|
||||
rotateToTree(1, 0, numInnerNodes, index, leafBlockStartValues);
|
||||
long[] arr = new long[leafBlockFPs.size()];
|
||||
for (int i = 0; i < leafBlockFPs.size(); i++) {
|
||||
arr[i] = leafBlockFPs.get(i);
|
||||
}
|
||||
writeIndex(out, arr, index, config.maxPointsInLeafNode);
|
||||
writeIndex(out, arr, index, config.maxPointsInLeafNode());
|
||||
return indexFP;
|
||||
}
|
||||
|
||||
private void writeLeafBlock() throws IOException {
|
||||
assert leafCount != 0;
|
||||
if (valueCount == 0) {
|
||||
System.arraycopy(leafValues, 0, minPackedValue, 0, config.packedIndexBytesLength);
|
||||
System.arraycopy(leafValues, 0, minPackedValue, 0, config.packedIndexBytesLength());
|
||||
}
|
||||
System.arraycopy(
|
||||
leafValues,
|
||||
(leafCount - 1) * config.packedBytesLength,
|
||||
(leafCount - 1) * config.packedBytesLength(),
|
||||
maxPackedValue,
|
||||
0,
|
||||
config.packedIndexBytesLength);
|
||||
config.packedIndexBytesLength());
|
||||
|
||||
valueCount += leafCount;
|
||||
|
||||
if (leafBlockFPs.size() > 0) {
|
||||
// Save the first (minimum) value in each leaf block except the first, to build the split
|
||||
// value index in the end:
|
||||
leafBlockStartValues.add(ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength));
|
||||
leafBlockStartValues.add(
|
||||
ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength()));
|
||||
}
|
||||
leafBlockFPs.add(out.getFilePointer());
|
||||
checkMaxLeafNodeCount(leafBlockFPs.size());
|
||||
|
||||
Arrays.fill(commonPrefixLengths, config.bytesPerDim);
|
||||
Arrays.fill(commonPrefixLengths, config.bytesPerDim());
|
||||
// Find per-dim common prefix:
|
||||
for (int dim = 0; dim < config.numDims; dim++) {
|
||||
int offset1 = dim * config.bytesPerDim;
|
||||
int offset2 = (leafCount - 1) * config.packedBytesLength + offset1;
|
||||
for (int dim = 0; dim < config.numDims(); dim++) {
|
||||
int offset1 = dim * config.bytesPerDim();
|
||||
int offset2 = (leafCount - 1) * config.packedBytesLength() + offset1;
|
||||
for (int j = 0; j < commonPrefixLengths[dim]; j++) {
|
||||
if (leafValues[offset1 + j] != leafValues[offset2 + j]) {
|
||||
commonPrefixLengths[dim] = j;
|
||||
|
@ -523,24 +524,24 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
final BytesRef scratch = new BytesRef();
|
||||
|
||||
{
|
||||
scratch.length = config.packedBytesLength;
|
||||
scratch.length = config.packedBytesLength();
|
||||
scratch.bytes = leafValues;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef apply(int i) {
|
||||
scratch.offset = config.packedBytesLength * i;
|
||||
scratch.offset = config.packedBytesLength() * i;
|
||||
return scratch;
|
||||
}
|
||||
};
|
||||
assert valuesInOrderAndBounds(
|
||||
leafCount,
|
||||
0,
|
||||
ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength),
|
||||
ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength()),
|
||||
ArrayUtil.copyOfSubArray(
|
||||
leafValues,
|
||||
(leafCount - 1) * config.packedBytesLength,
|
||||
leafCount * config.packedBytesLength),
|
||||
(leafCount - 1) * config.packedBytesLength(),
|
||||
leafCount * config.packedBytesLength()),
|
||||
packedValues,
|
||||
leafDocs,
|
||||
0);
|
||||
|
@ -552,7 +553,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
private void rotateToTree(
|
||||
int nodeID, int offset, int count, byte[] index, List<byte[]> leafBlockStartValues) {
|
||||
// System.out.println("ROTATE: nodeID=" + nodeID + " offset=" + offset + " count=" + count + "
|
||||
// bpd=" + config.bytesPerDim + " index.length=" + index.length);
|
||||
// bpd=" + config.bytesPerDim() + " index.length=" + index.length);
|
||||
if (count == 1) {
|
||||
// Leaf index node
|
||||
// System.out.println(" leaf index node");
|
||||
|
@ -561,8 +562,8 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
leafBlockStartValues.get(offset),
|
||||
0,
|
||||
index,
|
||||
nodeID * (1 + config.bytesPerDim) + 1,
|
||||
config.bytesPerDim);
|
||||
nodeID * (1 + config.bytesPerDim()) + 1,
|
||||
config.bytesPerDim());
|
||||
} else if (count > 1) {
|
||||
// Internal index node: binary partition of count
|
||||
int countAtLevel = 1;
|
||||
|
@ -587,8 +588,8 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
leafBlockStartValues.get(rootOffset),
|
||||
0,
|
||||
index,
|
||||
nodeID * (1 + config.bytesPerDim) + 1,
|
||||
config.bytesPerDim);
|
||||
nodeID * (1 + config.bytesPerDim()) + 1,
|
||||
config.bytesPerDim());
|
||||
// System.out.println(" index[" + nodeID + "] = blockStartValues[" + rootOffset + "]");
|
||||
|
||||
// TODO: we could optimize/specialize, when we know it's simply fully balanced binary tree
|
||||
|
@ -611,10 +612,10 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
}
|
||||
|
||||
private void checkMaxLeafNodeCount(int numLeaves) {
|
||||
if ((1 + config.bytesPerDim) * (long) numLeaves > ArrayUtil.MAX_ARRAY_LENGTH) {
|
||||
if ((1 + config.bytesPerDim()) * (long) numLeaves > ArrayUtil.MAX_ARRAY_LENGTH) {
|
||||
throw new IllegalStateException(
|
||||
"too many nodes; increase config.maxPointsInLeafNode (currently "
|
||||
+ config.maxPointsInLeafNode
|
||||
"too many nodes; increase config.maxPointsInLeafNode() (currently "
|
||||
+ config.maxPointsInLeafNode()
|
||||
+ ") and reindex");
|
||||
}
|
||||
}
|
||||
|
@ -652,7 +653,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
long countPerLeaf = pointCount;
|
||||
long innerNodeCount = 1;
|
||||
|
||||
while (countPerLeaf > config.maxPointsInLeafNode) {
|
||||
while (countPerLeaf > config.maxPointsInLeafNode()) {
|
||||
countPerLeaf = (countPerLeaf + 1) / 2;
|
||||
innerNodeCount *= 2;
|
||||
}
|
||||
|
@ -667,20 +668,20 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
|
||||
// Indexed by nodeID, but first (root) nodeID is 1. We do 1+ because the lead byte at each
|
||||
// recursion says which dim we split on.
|
||||
byte[] splitPackedValues = new byte[Math.multiplyExact(numLeaves, 1 + config.bytesPerDim)];
|
||||
byte[] splitPackedValues = new byte[Math.multiplyExact(numLeaves, 1 + config.bytesPerDim())];
|
||||
|
||||
// +1 because leaf count is power of 2 (e.g. 8), and innerNodeCount is power of 2 minus 1 (e.g.
|
||||
// 7)
|
||||
long[] leafBlockFPs = new long[numLeaves];
|
||||
|
||||
// Make sure the math above "worked":
|
||||
assert pointCount / numLeaves <= config.maxPointsInLeafNode
|
||||
assert pointCount / numLeaves <= config.maxPointsInLeafNode()
|
||||
: "pointCount="
|
||||
+ pointCount
|
||||
+ " numLeaves="
|
||||
+ numLeaves
|
||||
+ " config.maxPointsInLeafNode="
|
||||
+ config.maxPointsInLeafNode;
|
||||
+ " config.maxPointsInLeafNode()="
|
||||
+ config.maxPointsInLeafNode();
|
||||
|
||||
// We re-use the selector so we do not need to create an object every time.
|
||||
BKDRadixSelector radixSelector =
|
||||
|
@ -699,7 +700,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
maxPackedValue,
|
||||
splitPackedValues,
|
||||
leafBlockFPs,
|
||||
new int[config.maxPointsInLeafNode]);
|
||||
new int[config.maxPointsInLeafNode()]);
|
||||
|
||||
// If no exception, we should have cleaned everything up:
|
||||
assert tempDir.getCreatedFiles().isEmpty();
|
||||
|
@ -724,15 +725,15 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues, int maxPointsInLeafNode)
|
||||
throws IOException {
|
||||
write(out, NUM_DATA_DIMS);
|
||||
writeInt(out, config.numDims);
|
||||
writeInt(out, config.numDims());
|
||||
newline(out);
|
||||
|
||||
write(out, NUM_INDEX_DIMS);
|
||||
writeInt(out, config.numIndexDims);
|
||||
writeInt(out, config.numIndexDims());
|
||||
newline(out);
|
||||
|
||||
write(out, BYTES_PER_DIM);
|
||||
writeInt(out, config.bytesPerDim);
|
||||
writeInt(out, config.bytesPerDim());
|
||||
newline(out);
|
||||
|
||||
write(out, MAX_LEAF_POINTS);
|
||||
|
@ -767,8 +768,8 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
newline(out);
|
||||
}
|
||||
|
||||
assert (splitPackedValues.length % (1 + config.bytesPerDim)) == 0;
|
||||
int count = splitPackedValues.length / (1 + config.bytesPerDim);
|
||||
assert (splitPackedValues.length % (1 + config.bytesPerDim())) == 0;
|
||||
int count = splitPackedValues.length / (1 + config.bytesPerDim());
|
||||
assert count == leafBlockFPs.length;
|
||||
|
||||
write(out, SPLIT_COUNT);
|
||||
|
@ -777,10 +778,12 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
|
||||
for (int i = 0; i < count; i++) {
|
||||
write(out, SPLIT_DIM);
|
||||
writeInt(out, splitPackedValues[i * (1 + config.bytesPerDim)] & 0xff);
|
||||
writeInt(out, splitPackedValues[i * (1 + config.bytesPerDim())] & 0xff);
|
||||
newline(out);
|
||||
write(out, SPLIT_VALUE);
|
||||
br = new BytesRef(splitPackedValues, 1 + (i * (1 + config.bytesPerDim)), config.bytesPerDim);
|
||||
br =
|
||||
new BytesRef(
|
||||
splitPackedValues, 1 + (i * (1 + config.bytesPerDim())), config.bytesPerDim());
|
||||
write(out, br.toString());
|
||||
newline(out);
|
||||
}
|
||||
|
@ -852,25 +855,25 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
/** Called only in assert */
|
||||
private boolean valueInBounds(
|
||||
BytesRef packedValue, byte[] minPackedValue, byte[] maxPackedValue) {
|
||||
for (int dim = 0; dim < config.numIndexDims; dim++) {
|
||||
int offset = config.bytesPerDim * dim;
|
||||
for (int dim = 0; dim < config.numIndexDims(); dim++) {
|
||||
int offset = config.bytesPerDim() * dim;
|
||||
if (Arrays.compareUnsigned(
|
||||
packedValue.bytes,
|
||||
packedValue.offset + offset,
|
||||
packedValue.offset + offset + config.bytesPerDim,
|
||||
packedValue.offset + offset + config.bytesPerDim(),
|
||||
minPackedValue,
|
||||
offset,
|
||||
offset + config.bytesPerDim)
|
||||
offset + config.bytesPerDim())
|
||||
< 0) {
|
||||
return false;
|
||||
}
|
||||
if (Arrays.compareUnsigned(
|
||||
packedValue.bytes,
|
||||
packedValue.offset + offset,
|
||||
packedValue.offset + offset + config.bytesPerDim,
|
||||
packedValue.offset + offset + config.bytesPerDim(),
|
||||
maxPackedValue,
|
||||
offset,
|
||||
offset + config.bytesPerDim)
|
||||
offset + config.bytesPerDim())
|
||||
> 0) {
|
||||
return false;
|
||||
}
|
||||
|
@ -882,13 +885,13 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
protected int split(byte[] minPackedValue, byte[] maxPackedValue) {
|
||||
// Find which dim has the largest span so we can split on it:
|
||||
int splitDim = -1;
|
||||
for (int dim = 0; dim < config.numIndexDims; dim++) {
|
||||
NumericUtils.subtract(config.bytesPerDim, dim, maxPackedValue, minPackedValue, scratchDiff);
|
||||
for (int dim = 0; dim < config.numIndexDims(); dim++) {
|
||||
NumericUtils.subtract(config.bytesPerDim(), dim, maxPackedValue, minPackedValue, scratchDiff);
|
||||
if (splitDim == -1
|
||||
|| Arrays.compareUnsigned(
|
||||
scratchDiff, 0, config.bytesPerDim, scratch1, 0, config.bytesPerDim)
|
||||
scratchDiff, 0, config.bytesPerDim(), scratch1, 0, config.bytesPerDim())
|
||||
> 0) {
|
||||
System.arraycopy(scratchDiff, 0, scratch1, 0, config.bytesPerDim);
|
||||
System.arraycopy(scratchDiff, 0, scratch1, 0, config.bytesPerDim());
|
||||
splitDim = dim;
|
||||
}
|
||||
}
|
||||
|
@ -931,15 +934,15 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
if (nodeID >= leafNodeOffset) {
|
||||
// leaf node
|
||||
final int count = to - from;
|
||||
assert count <= config.maxPointsInLeafNode;
|
||||
assert count <= config.maxPointsInLeafNode();
|
||||
|
||||
// Compute common prefixes
|
||||
Arrays.fill(commonPrefixLengths, config.bytesPerDim);
|
||||
Arrays.fill(commonPrefixLengths, config.bytesPerDim());
|
||||
reader.getValue(from, scratchBytesRef1);
|
||||
for (int i = from + 1; i < to; ++i) {
|
||||
reader.getValue(i, scratchBytesRef2);
|
||||
for (int dim = 0; dim < config.numDims; dim++) {
|
||||
final int offset = dim * config.bytesPerDim;
|
||||
for (int dim = 0; dim < config.numDims(); dim++) {
|
||||
final int offset = dim * config.bytesPerDim();
|
||||
for (int j = 0; j < commonPrefixLengths[dim]; j++) {
|
||||
if (scratchBytesRef1.bytes[scratchBytesRef1.offset + offset + j]
|
||||
!= scratchBytesRef2.bytes[scratchBytesRef2.offset + offset + j]) {
|
||||
|
@ -951,23 +954,23 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
}
|
||||
|
||||
// Find the dimension that has the least number of unique bytes at commonPrefixLengths[dim]
|
||||
FixedBitSet[] usedBytes = new FixedBitSet[config.numDims];
|
||||
for (int dim = 0; dim < config.numDims; ++dim) {
|
||||
if (commonPrefixLengths[dim] < config.bytesPerDim) {
|
||||
FixedBitSet[] usedBytes = new FixedBitSet[config.numDims()];
|
||||
for (int dim = 0; dim < config.numDims(); ++dim) {
|
||||
if (commonPrefixLengths[dim] < config.bytesPerDim()) {
|
||||
usedBytes[dim] = new FixedBitSet(256);
|
||||
}
|
||||
}
|
||||
for (int i = from + 1; i < to; ++i) {
|
||||
for (int dim = 0; dim < config.numDims; dim++) {
|
||||
for (int dim = 0; dim < config.numDims(); dim++) {
|
||||
if (usedBytes[dim] != null) {
|
||||
byte b = reader.getByteAt(i, dim * config.bytesPerDim + commonPrefixLengths[dim]);
|
||||
byte b = reader.getByteAt(i, dim * config.bytesPerDim() + commonPrefixLengths[dim]);
|
||||
usedBytes[dim].set(Byte.toUnsignedInt(b));
|
||||
}
|
||||
}
|
||||
}
|
||||
int sortedDim = 0;
|
||||
int sortedDimCardinality = Integer.MAX_VALUE;
|
||||
for (int dim = 0; dim < config.numDims; ++dim) {
|
||||
for (int dim = 0; dim < config.numDims(); ++dim) {
|
||||
if (usedBytes[dim] != null) {
|
||||
final int cardinality = usedBytes[dim].cardinality();
|
||||
if (cardinality < sortedDimCardinality) {
|
||||
|
@ -1001,7 +1004,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
// Write the common prefixes:
|
||||
reader.getValue(from, scratchBytesRef1);
|
||||
System.arraycopy(
|
||||
scratchBytesRef1.bytes, scratchBytesRef1.offset, scratch1, 0, config.packedBytesLength);
|
||||
scratchBytesRef1.bytes, scratchBytesRef1.offset, scratch1, 0, config.packedBytesLength());
|
||||
|
||||
// Write the full values:
|
||||
IntFunction<BytesRef> packedValues =
|
||||
|
@ -1023,10 +1026,10 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
final int splitDim = split(minPackedValue, maxPackedValue);
|
||||
final int mid = (from + to + 1) >>> 1;
|
||||
|
||||
int commonPrefixLen = config.bytesPerDim;
|
||||
for (int i = 0; i < config.bytesPerDim; ++i) {
|
||||
if (minPackedValue[splitDim * config.bytesPerDim + i]
|
||||
!= maxPackedValue[splitDim * config.bytesPerDim + i]) {
|
||||
int commonPrefixLen = config.bytesPerDim();
|
||||
for (int i = 0; i < config.bytesPerDim(); ++i) {
|
||||
if (minPackedValue[splitDim * config.bytesPerDim() + i]
|
||||
!= maxPackedValue[splitDim * config.bytesPerDim() + i]) {
|
||||
commonPrefixLen = i;
|
||||
break;
|
||||
}
|
||||
|
@ -1044,32 +1047,32 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
scratchBytesRef2);
|
||||
|
||||
// set the split value
|
||||
final int address = nodeID * (1 + config.bytesPerDim);
|
||||
final int address = nodeID * (1 + config.bytesPerDim());
|
||||
splitPackedValues[address] = (byte) splitDim;
|
||||
reader.getValue(mid, scratchBytesRef1);
|
||||
System.arraycopy(
|
||||
scratchBytesRef1.bytes,
|
||||
scratchBytesRef1.offset + splitDim * config.bytesPerDim,
|
||||
scratchBytesRef1.offset + splitDim * config.bytesPerDim(),
|
||||
splitPackedValues,
|
||||
address + 1,
|
||||
config.bytesPerDim);
|
||||
config.bytesPerDim());
|
||||
|
||||
byte[] minSplitPackedValue =
|
||||
ArrayUtil.copyOfSubArray(minPackedValue, 0, config.packedIndexBytesLength);
|
||||
ArrayUtil.copyOfSubArray(minPackedValue, 0, config.packedIndexBytesLength());
|
||||
byte[] maxSplitPackedValue =
|
||||
ArrayUtil.copyOfSubArray(maxPackedValue, 0, config.packedIndexBytesLength);
|
||||
ArrayUtil.copyOfSubArray(maxPackedValue, 0, config.packedIndexBytesLength());
|
||||
System.arraycopy(
|
||||
scratchBytesRef1.bytes,
|
||||
scratchBytesRef1.offset + splitDim * config.bytesPerDim,
|
||||
scratchBytesRef1.offset + splitDim * config.bytesPerDim(),
|
||||
minSplitPackedValue,
|
||||
splitDim * config.bytesPerDim,
|
||||
config.bytesPerDim);
|
||||
splitDim * config.bytesPerDim(),
|
||||
config.bytesPerDim());
|
||||
System.arraycopy(
|
||||
scratchBytesRef1.bytes,
|
||||
scratchBytesRef1.offset + splitDim * config.bytesPerDim,
|
||||
scratchBytesRef1.offset + splitDim * config.bytesPerDim(),
|
||||
maxSplitPackedValue,
|
||||
splitDim * config.bytesPerDim,
|
||||
config.bytesPerDim);
|
||||
splitDim * config.bytesPerDim(),
|
||||
config.bytesPerDim());
|
||||
|
||||
// recurse
|
||||
build(
|
||||
|
@ -1137,17 +1140,17 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
|
||||
int sortedDim = 0;
|
||||
int sortedDimCardinality = Integer.MAX_VALUE;
|
||||
FixedBitSet[] usedBytes = new FixedBitSet[config.numDims];
|
||||
for (int dim = 0; dim < config.numDims; ++dim) {
|
||||
if (commonPrefixLengths[dim] < config.bytesPerDim) {
|
||||
FixedBitSet[] usedBytes = new FixedBitSet[config.numDims()];
|
||||
for (int dim = 0; dim < config.numDims(); ++dim) {
|
||||
if (commonPrefixLengths[dim] < config.bytesPerDim()) {
|
||||
usedBytes[dim] = new FixedBitSet(256);
|
||||
}
|
||||
}
|
||||
// Find the dimension to compress
|
||||
for (int dim = 0; dim < config.numDims; dim++) {
|
||||
for (int dim = 0; dim < config.numDims(); dim++) {
|
||||
int prefix = commonPrefixLengths[dim];
|
||||
if (prefix < config.bytesPerDim) {
|
||||
int offset = dim * config.bytesPerDim;
|
||||
if (prefix < config.bytesPerDim()) {
|
||||
int offset = dim * config.bytesPerDim();
|
||||
for (int i = 0; i < heapSource.count(); ++i) {
|
||||
PointValue value = heapSource.getPackedValueSlice(i);
|
||||
BytesRef packedValue = value.packedValue();
|
||||
|
@ -1190,7 +1193,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
final BytesRef scratch = new BytesRef();
|
||||
|
||||
{
|
||||
scratch.length = config.packedBytesLength;
|
||||
scratch.length = config.packedBytesLength();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -1207,7 +1210,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
// Inner node: partition/recurse
|
||||
|
||||
int splitDim;
|
||||
if (config.numIndexDims > 1) {
|
||||
if (config.numIndexDims() > 1) {
|
||||
splitDim = split(minPackedValue, maxPackedValue);
|
||||
} else {
|
||||
splitDim = 0;
|
||||
|
@ -1223,13 +1226,13 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
int commonPrefixLen =
|
||||
Arrays.mismatch(
|
||||
minPackedValue,
|
||||
splitDim * config.bytesPerDim,
|
||||
splitDim * config.bytesPerDim + config.bytesPerDim,
|
||||
splitDim * config.bytesPerDim(),
|
||||
splitDim * config.bytesPerDim() + config.bytesPerDim(),
|
||||
maxPackedValue,
|
||||
splitDim * config.bytesPerDim,
|
||||
splitDim * config.bytesPerDim + config.bytesPerDim);
|
||||
splitDim * config.bytesPerDim(),
|
||||
splitDim * config.bytesPerDim() + config.bytesPerDim());
|
||||
if (commonPrefixLen == -1) {
|
||||
commonPrefixLen = config.bytesPerDim;
|
||||
commonPrefixLen = config.bytesPerDim();
|
||||
}
|
||||
|
||||
BKDRadixSelector.PathSlice[] pathSlices = new BKDRadixSelector.PathSlice[2];
|
||||
|
@ -1244,20 +1247,28 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
splitDim,
|
||||
commonPrefixLen);
|
||||
|
||||
int address = nodeID * (1 + config.bytesPerDim);
|
||||
int address = nodeID * (1 + config.bytesPerDim());
|
||||
splitPackedValues[address] = (byte) splitDim;
|
||||
System.arraycopy(splitValue, 0, splitPackedValues, address + 1, config.bytesPerDim);
|
||||
System.arraycopy(splitValue, 0, splitPackedValues, address + 1, config.bytesPerDim());
|
||||
|
||||
byte[] minSplitPackedValue = new byte[config.packedIndexBytesLength];
|
||||
System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, config.packedIndexBytesLength);
|
||||
byte[] minSplitPackedValue = new byte[config.packedIndexBytesLength()];
|
||||
System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, config.packedIndexBytesLength());
|
||||
|
||||
byte[] maxSplitPackedValue = new byte[config.packedIndexBytesLength];
|
||||
System.arraycopy(maxPackedValue, 0, maxSplitPackedValue, 0, config.packedIndexBytesLength);
|
||||
byte[] maxSplitPackedValue = new byte[config.packedIndexBytesLength()];
|
||||
System.arraycopy(maxPackedValue, 0, maxSplitPackedValue, 0, config.packedIndexBytesLength());
|
||||
|
||||
System.arraycopy(
|
||||
splitValue, 0, minSplitPackedValue, splitDim * config.bytesPerDim, config.bytesPerDim);
|
||||
splitValue,
|
||||
0,
|
||||
minSplitPackedValue,
|
||||
splitDim * config.bytesPerDim(),
|
||||
config.bytesPerDim());
|
||||
System.arraycopy(
|
||||
splitValue, 0, maxSplitPackedValue, splitDim * config.bytesPerDim, config.bytesPerDim);
|
||||
splitValue,
|
||||
0,
|
||||
maxSplitPackedValue,
|
||||
splitDim * config.bytesPerDim(),
|
||||
config.bytesPerDim());
|
||||
|
||||
// Recurse on left tree:
|
||||
build(
|
||||
|
@ -1289,30 +1300,30 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
}
|
||||
|
||||
private void computeCommonPrefixLength(HeapPointWriter heapPointWriter, byte[] commonPrefix) {
|
||||
Arrays.fill(commonPrefixLengths, config.bytesPerDim);
|
||||
Arrays.fill(commonPrefixLengths, config.bytesPerDim());
|
||||
PointValue value = heapPointWriter.getPackedValueSlice(0);
|
||||
BytesRef packedValue = value.packedValue();
|
||||
for (int dim = 0; dim < config.numDims; dim++) {
|
||||
for (int dim = 0; dim < config.numDims(); dim++) {
|
||||
System.arraycopy(
|
||||
packedValue.bytes,
|
||||
packedValue.offset + dim * config.bytesPerDim,
|
||||
packedValue.offset + dim * config.bytesPerDim(),
|
||||
commonPrefix,
|
||||
dim * config.bytesPerDim,
|
||||
config.bytesPerDim);
|
||||
dim * config.bytesPerDim(),
|
||||
config.bytesPerDim());
|
||||
}
|
||||
for (int i = 1; i < heapPointWriter.count(); i++) {
|
||||
value = heapPointWriter.getPackedValueSlice(i);
|
||||
packedValue = value.packedValue();
|
||||
for (int dim = 0; dim < config.numDims; dim++) {
|
||||
for (int dim = 0; dim < config.numDims(); dim++) {
|
||||
if (commonPrefixLengths[dim] != 0) {
|
||||
int j =
|
||||
Arrays.mismatch(
|
||||
commonPrefix,
|
||||
dim * config.bytesPerDim,
|
||||
dim * config.bytesPerDim + commonPrefixLengths[dim],
|
||||
dim * config.bytesPerDim(),
|
||||
dim * config.bytesPerDim() + commonPrefixLengths[dim],
|
||||
packedValue.bytes,
|
||||
packedValue.offset + dim * config.bytesPerDim,
|
||||
packedValue.offset + dim * config.bytesPerDim + commonPrefixLengths[dim]);
|
||||
packedValue.offset + dim * config.bytesPerDim(),
|
||||
packedValue.offset + dim * config.bytesPerDim() + commonPrefixLengths[dim]);
|
||||
if (j != -1) {
|
||||
commonPrefixLengths[dim] = j;
|
||||
}
|
||||
|
@ -1331,11 +1342,11 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
int[] docs,
|
||||
int docsOffset)
|
||||
throws IOException {
|
||||
byte[] lastPackedValue = new byte[config.packedBytesLength];
|
||||
byte[] lastPackedValue = new byte[config.packedBytesLength()];
|
||||
int lastDoc = -1;
|
||||
for (int i = 0; i < count; i++) {
|
||||
BytesRef packedValue = values.apply(i);
|
||||
assert packedValue.length == config.packedBytesLength;
|
||||
assert packedValue.length == config.packedBytesLength();
|
||||
assert valueInOrder(
|
||||
i,
|
||||
sortedDim,
|
||||
|
@ -1361,43 +1372,43 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
int packedValueOffset,
|
||||
int doc,
|
||||
int lastDoc) {
|
||||
int dimOffset = sortedDim * config.bytesPerDim;
|
||||
int dimOffset = sortedDim * config.bytesPerDim();
|
||||
if (ord > 0) {
|
||||
int cmp =
|
||||
Arrays.compareUnsigned(
|
||||
lastPackedValue,
|
||||
dimOffset,
|
||||
dimOffset + config.bytesPerDim,
|
||||
dimOffset + config.bytesPerDim(),
|
||||
packedValue,
|
||||
packedValueOffset + dimOffset,
|
||||
packedValueOffset + dimOffset + config.bytesPerDim);
|
||||
packedValueOffset + dimOffset + config.bytesPerDim());
|
||||
if (cmp > 0) {
|
||||
throw new AssertionError(
|
||||
"values out of order: last value="
|
||||
+ new BytesRef(lastPackedValue)
|
||||
+ " current value="
|
||||
+ new BytesRef(packedValue, packedValueOffset, config.packedBytesLength)
|
||||
+ new BytesRef(packedValue, packedValueOffset, config.packedBytesLength())
|
||||
+ " ord="
|
||||
+ ord
|
||||
+ " sortedDim="
|
||||
+ sortedDim);
|
||||
}
|
||||
if (cmp == 0 && config.numDims > config.numIndexDims) {
|
||||
int dataOffset = config.numIndexDims * config.bytesPerDim;
|
||||
if (cmp == 0 && config.numDims() > config.numIndexDims()) {
|
||||
int dataOffset = config.numIndexDims() * config.bytesPerDim();
|
||||
cmp =
|
||||
Arrays.compareUnsigned(
|
||||
lastPackedValue,
|
||||
dataOffset,
|
||||
config.packedBytesLength,
|
||||
config.packedBytesLength(),
|
||||
packedValue,
|
||||
packedValueOffset + dataOffset,
|
||||
packedValueOffset + config.packedBytesLength);
|
||||
packedValueOffset + config.packedBytesLength());
|
||||
if (cmp > 0) {
|
||||
throw new AssertionError(
|
||||
"data values out of order: last value="
|
||||
+ new BytesRef(lastPackedValue)
|
||||
+ " current value="
|
||||
+ new BytesRef(packedValue, packedValueOffset, config.packedBytesLength)
|
||||
+ new BytesRef(packedValue, packedValueOffset, config.packedBytesLength())
|
||||
+ " ord="
|
||||
+ ord);
|
||||
}
|
||||
|
@ -1414,7 +1425,8 @@ final class SimpleTextBKDWriter implements Closeable {
|
|||
+ sortedDim);
|
||||
}
|
||||
}
|
||||
System.arraycopy(packedValue, packedValueOffset, lastPackedValue, 0, config.packedBytesLength);
|
||||
System.arraycopy(
|
||||
packedValue, packedValueOffset, lastPackedValue, 0, config.packedBytesLength());
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@@ -829,7 +829,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
    clone.seek(0);
    // checksum is fixed-width encoded with 20 bytes, plus 1 byte for newline (the space is included
    // in SimpleTextUtil.CHECKSUM):
    long footerStartPos = data.length() - (SimpleTextUtil.CHECKSUM.length + 21);
    long footerStartPos = clone.length() - (SimpleTextUtil.CHECKSUM.length + 21);
    ChecksumIndexInput input = new BufferedChecksumIndexInput(clone);
    while (true) {
      SimpleTextUtil.readLine(input, scratch);

@@ -227,7 +227,7 @@ class SimpleTextPointsReader extends PointsReader {

    // checksum is fixed-width encoded with 20 bytes, plus 1 byte for newline (the space is included
    // in SimpleTextUtil.CHECKSUM):
    long footerStartPos = dataIn.length() - (SimpleTextUtil.CHECKSUM.length + 21);
    long footerStartPos = clone.length() - (SimpleTextUtil.CHECKSUM.length + 21);
    ChecksumIndexInput input = new BufferedChecksumIndexInput(clone);
    while (true) {
      SimpleTextUtil.readLine(input, scratch);
|
|
|
@@ -17,13 +17,13 @@

package org.apache.lucene.codecs.uniformsplit;

import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.BLOCK_SIZE;
import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.BLOCK_SIZE;

import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.TermState;
@@ -34,7 +34,7 @@ import org.apache.lucene.util.RamUsageEstimator;

/**
 * {@link TermState} serializer which encodes each file pointer as a delta relative to a base file
 * pointer. It differs from {@link Lucene99PostingsWriter#encodeTerm} which encodes each file
 * pointer. It differs from {@link Lucene912PostingsWriter#encodeTerm} which encodes each file
 * pointer as a delta relative to the previous file pointer.
 *
 * <p>It automatically sets the base file pointer to the first valid file pointer for doc start FP,
@@ -95,7 +95,7 @@ public class DeltaBaseTermStateSerializer implements Accountable {
  /**
   * Writes a {@link BlockTermState} to the provided {@link DataOutput}.
   *
   * <p>Simpler variant of {@link Lucene99PostingsWriter#encodeTerm(DataOutput, FieldInfo,
   * <p>Simpler variant of {@link Lucene912PostingsWriter#encodeTerm(DataOutput, FieldInfo,
   * BlockTermState, boolean)}.
   */
  public void writeTermState(
@@ -140,15 +140,12 @@ public class DeltaBaseTermStateSerializer implements Accountable {
        termStatesOutput.writeVLong(intTermState.lastPosBlockOffset);
      }
    }
    if (intTermState.skipOffset != -1) {
      termStatesOutput.writeVLong(intTermState.skipOffset);
    }
  }

  /**
   * Reads a {@link BlockTermState} from the provided {@link DataInput}.
   *
   * <p>Simpler variant of {@link Lucene99PostingsReader#decodeTerm(DataInput, FieldInfo,
   * <p>Simpler variant of {@link Lucene912PostingsReader#decodeTerm(DataInput, FieldInfo,
   * BlockTermState, boolean)}.
   *
   * @param reuse {@link BlockTermState} to reuse; or null to create a new one.
@@ -190,9 +187,6 @@ public class DeltaBaseTermStateSerializer implements Accountable {
      intTermState.lastPosBlockOffset = termStatesInput.readVLong();
    }
  }
  if (intTermState.docFreq > BLOCK_SIZE) {
    intTermState.skipOffset = termStatesInput.readVLong();
  }
  return intTermState;
}

@@ -210,7 +204,6 @@ public class DeltaBaseTermStateSerializer implements Accountable {
    termState.docStartFP = 0;
    termState.posStartFP = 0;
    termState.payStartFP = 0;
    termState.skipOffset = -1;
    termState.lastPosBlockOffset = -1;
    termState.singletonDocID = -1;
|
||||
|
|
|
@@ -90,10 +90,15 @@ public class FSTDictionary implements IndexDictionary {
    }
    PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton();
    FST.FSTMetadata<Long> metadata = FST.readMetadata(fstDataInput, fstOutputs);
    FST<Long> fst =
        isFSTOnHeap
            ? new FST<>(metadata, fstDataInput)
            : new FST<>(metadata, fstDataInput, new OffHeapFSTStore());
    FST<Long> fst;
    if (isFSTOnHeap) {
      fst = new FST<>(metadata, fstDataInput);
    } else {
      final IndexInput indexInput = (IndexInput) fstDataInput;
      fst =
          FST.fromFSTReader(
              metadata, new OffHeapFSTStore(indexInput, indexInput.getFilePointer(), metadata));
    }
    return new FSTDictionary(fst);
  }
|
||||
|
|
|
@@ -23,8 +23,8 @@ import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
@@ -113,7 +113,7 @@ public class UniformSplitPostingsFormat extends PostingsFormat {

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
    PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
    boolean success = false;
    try {
      FieldsConsumer termsWriter =
@@ -130,7 +130,7 @@ public class UniformSplitPostingsFormat extends PostingsFormat {

  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
    PostingsReaderBase postingsReader = new Lucene99PostingsReader(state);
    PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
    boolean success = false;
    try {
      FieldsProducer termsReader =
|
|
|
@@ -28,7 +28,7 @@
 *       org.apache.lucene.search.PhraseQuery})
 *   <li>Quite efficient for {@link org.apache.lucene.search.PrefixQuery}
 *   <li>Not efficient for spell-check and {@link org.apache.lucene.search.FuzzyQuery}, in this case
 *       prefer {@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat}
 *       prefer {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat}
 * </ul>
 */
package org.apache.lucene.codecs.uniformsplit;
|
|
|
@@ -20,11 +20,11 @@ package org.apache.lucene.codecs.uniformsplit.sharedterms;
import java.io.IOException;
import java.util.List;
import java.util.RandomAccess;
import org.apache.lucene.index.BaseTermsEnum;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;

@@ -34,7 +34,7 @@ import org.apache.lucene.util.BytesRef;
 *
 * @lucene.experimental
 */
class STMergingTermsEnum extends TermsEnum {
class STMergingTermsEnum extends BaseTermsEnum {

  protected final String fieldName;
  protected final MultiSegmentsPostingsEnum multiPostingsEnum;
@@ -63,11 +63,6 @@ class STMergingTermsEnum extends TermsEnum {
    throw new UnsupportedOperationException();
  }

  @Override
  public boolean seekExact(BytesRef text) throws IOException {
    throw new UnsupportedOperationException();
  }

  @Override
  public SeekStatus seekCeil(BytesRef text) {
    throw new UnsupportedOperationException();
|
|
|
@@ -22,7 +22,7 @@ import java.io.IOException;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.lucene912.Lucene912Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.KnnByteVectorField;
@@ -42,7 +42,7 @@ import org.apache.lucene.tests.index.BaseIndexFileFormatTestCase;
public class TestHnswBitVectorsFormat extends BaseIndexFileFormatTestCase {
  @Override
  protected Codec getCodec() {
    return new Lucene99Codec() {
    return new Lucene912Codec() {
      @Override
      public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
        return new HnswBitVectorsFormat();
|
|
|
@@ -17,7 +17,7 @@

package org.apache.lucene.codecs.lucene90.tests;

import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;

/** Test utility class to create mock {@link IntBlockTermState}. */
public class MockTermStateFactory {
|
|
|
@@ -0,0 +1,4 @@
{
  "lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForDeltaUtil.java": "5115b12ac31537ce31d73c0a279df92060749a3a",
  "lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForDeltaUtil.py": "db6154406e68b80d2c90116b5d0bfa9ba220762a"
}

@@ -1,4 +1,4 @@
{
  "lucene/core/src/java/org/apache/lucene/codecs/lucene99/ForUtil.java": "1292ad354d255b1272ffd3db684aa2ddb2bc49ec",
  "lucene/core/src/java/org/apache/lucene/codecs/lucene99/gen_ForUtil.py": "ab7b63a1b73986cc04e43de1c8f474b97aef5116"
  "lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForUtil.java": "159e82388346fde147924d5e15ca65df4dd63b9a",
  "lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForUtil.py": "66dc8813160feae2a37d8b50474f5f9830b6cb22"
}
|
@@ -15,7 +15,7 @@
 * limitations under the License.
 */

import org.apache.lucene.codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.lucene912.Lucene912Codec;

/** Lucene Core. */
@SuppressWarnings("module") // the test framework is compiled after the core...
@@ -33,6 +33,7 @@ module org.apache.lucene.core {
  exports org.apache.lucene.codecs.lucene94;
  exports org.apache.lucene.codecs.lucene95;
  exports org.apache.lucene.codecs.lucene99;
  exports org.apache.lucene.codecs.lucene912;
  exports org.apache.lucene.codecs.perfield;
  exports org.apache.lucene.codecs;
  exports org.apache.lucene.document;
@@ -71,7 +72,7 @@ module org.apache.lucene.core {
  provides org.apache.lucene.analysis.TokenizerFactory with
      org.apache.lucene.analysis.standard.StandardTokenizerFactory;
  provides org.apache.lucene.codecs.Codec with
      Lucene99Codec;
      Lucene912Codec;
  provides org.apache.lucene.codecs.DocValuesFormat with
      org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
  provides org.apache.lucene.codecs.KnnVectorsFormat with
@@ -79,7 +80,7 @@ module org.apache.lucene.core {
      org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat,
      org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat;
  provides org.apache.lucene.codecs.PostingsFormat with
      org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat;
      org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
  provides org.apache.lucene.index.SortFieldProvider with
      org.apache.lucene.search.SortField.Provider,
      org.apache.lucene.search.SortedNumericSortField.Provider,
|
|
|
@@ -55,7 +55,7 @@ public abstract class Codec implements NamedSPILoader.NamedSPI {
      return LOADER;
    }

    static Codec defaultCodec = LOADER.lookup("Lucene99");
    static Codec defaultCodec = LOADER.lookup("Lucene912");
  }

  private final String name;
|
|
|
@@ -18,8 +18,6 @@ package org.apache.lucene.codecs;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
@@ -106,7 +104,7 @@ public final class CompetitiveImpactAccumulator {
  }

  /** Get the set of competitive freq and norm pairs, ordered by increasing freq and norm. */
  public Collection<Impact> getCompetitiveFreqNormPairs() {
  public List<Impact> getCompetitiveFreqNormPairs() {
    List<Impact> impacts = new ArrayList<>();
    int maxFreqForLowerNorms = 0;
    for (int i = 0; i < maxFreqs.length; ++i) {
@@ -126,7 +124,7 @@ public final class CompetitiveImpactAccumulator {
    for (Impact impact : impacts) {
      add(impact, freqNormPairs);
    }
    return Collections.unmodifiableSet(freqNormPairs);
    return List.copyOf(freqNormPairs);
  }

  private void add(Impact newEntry, TreeSet<Impact> freqNormPairs) {
|
|
|
@@ -23,6 +23,7 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.index.BaseTermsEnum;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocIDMerger;
import org.apache.lucene.index.DocValues;
@@ -498,7 +499,7 @@ public abstract class DocValuesConsumer implements Closeable {
   * {@link SortedDocValues#lookupOrd(int)} or {@link SortedSetDocValues#lookupOrd(long)} on every
   * call to {@link TermsEnum#next()}.
   */
  private static class MergedTermsEnum extends TermsEnum {
  private static class MergedTermsEnum extends BaseTermsEnum {

    private final TermsEnum[] subs;
    private final OrdinalMap ordinalMap;
@@ -542,11 +543,6 @@ public abstract class DocValuesConsumer implements Closeable {
      throw new UnsupportedOperationException();
    }

    @Override
    public boolean seekExact(BytesRef text) throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public SeekStatus seekCeil(BytesRef text) throws IOException {
      throw new UnsupportedOperationException();
@@ -557,11 +553,6 @@ public abstract class DocValuesConsumer implements Closeable {
      throw new UnsupportedOperationException();
    }

    @Override
    public void seekExact(BytesRef term, TermState state) throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public int docFreq() throws IOException {
      throw new UnsupportedOperationException();
|
|
|
@ -20,17 +20,23 @@ package org.apache.lucene.codecs;
|
|||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.function.BiFunction;
|
||||
import org.apache.lucene.index.ByteVectorValues;
|
||||
import org.apache.lucene.index.DocIDMerger;
|
||||
import org.apache.lucene.index.DocsWithFieldSet;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FloatVectorValues;
|
||||
import org.apache.lucene.index.MergeState;
|
||||
import org.apache.lucene.index.Sorter;
|
||||
import org.apache.lucene.index.VectorEncoding;
|
||||
import org.apache.lucene.internal.hppc.IntIntHashMap;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.VectorScorer;
|
||||
import org.apache.lucene.util.Accountable;
|
||||
import org.apache.lucene.util.IOFunction;
|
||||
|
||||
/** Writes vectors to an index. */
|
||||
public abstract class KnnVectorsWriter implements Accountable, Closeable {
|
||||
|
@ -107,11 +113,11 @@ public abstract class KnnVectorsWriter implements Accountable, Closeable {
|
|||
}
|
||||
|
||||
/** Tracks state of one sub-reader that we are merging */
|
||||
private static class VectorValuesSub extends DocIDMerger.Sub {
|
||||
private static class FloatVectorValuesSub extends DocIDMerger.Sub {
|
||||
|
||||
final FloatVectorValues values;
|
||||
|
||||
VectorValuesSub(MergeState.DocMap docMap, FloatVectorValues values) {
|
||||
FloatVectorValuesSub(MergeState.DocMap docMap, FloatVectorValues values) {
|
||||
super(docMap);
|
||||
this.values = values;
|
||||
assert values.docID() == -1;
|
||||
|
@ -139,65 +145,139 @@ public abstract class KnnVectorsWriter implements Accountable, Closeable {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Given old doc ids and an id mapping, maps old ordinal to new ordinal. Note: this method return
|
||||
* nothing and output are written to parameters
|
||||
*
|
||||
* @param oldDocIds the old or current document ordinals. Must not be null.
|
||||
* @param sortMap the document sorting map for how to make the new ordinals. Must not be null.
|
||||
* @param old2NewOrd int[] maps from old ord to new ord
|
||||
* @param new2OldOrd int[] maps from new ord to old ord
|
||||
* @param newDocsWithField set of new doc ids which has the value
|
||||
*/
|
||||
public static void mapOldOrdToNewOrd(
|
||||
DocsWithFieldSet oldDocIds,
|
||||
Sorter.DocMap sortMap,
|
||||
int[] old2NewOrd,
|
||||
int[] new2OldOrd,
|
||||
DocsWithFieldSet newDocsWithField)
|
||||
throws IOException {
|
||||
// TODO: a similar function exists in IncrementalHnswGraphMerger#getNewOrdMapping
|
||||
// maybe we can do a further refactoring
|
||||
Objects.requireNonNull(oldDocIds);
|
||||
Objects.requireNonNull(sortMap);
|
||||
assert (old2NewOrd != null || new2OldOrd != null || newDocsWithField != null);
|
||||
assert (old2NewOrd == null || old2NewOrd.length == oldDocIds.cardinality());
|
||||
assert (new2OldOrd == null || new2OldOrd.length == oldDocIds.cardinality());
|
||||
IntIntHashMap newIdToOldOrd = new IntIntHashMap();
|
||||
DocIdSetIterator iterator = oldDocIds.iterator();
|
||||
int[] newDocIds = new int[oldDocIds.cardinality()];
|
||||
int oldOrd = 0;
|
||||
for (int oldDocId = iterator.nextDoc();
|
||||
oldDocId != DocIdSetIterator.NO_MORE_DOCS;
|
||||
oldDocId = iterator.nextDoc()) {
|
||||
int newId = sortMap.oldToNew(oldDocId);
|
||||
newIdToOldOrd.put(newId, oldOrd);
|
||||
newDocIds[oldOrd] = newId;
|
||||
oldOrd++;
|
||||
}
|
||||
|
||||
Arrays.sort(newDocIds);
|
||||
int newOrd = 0;
|
||||
for (int newDocId : newDocIds) {
|
||||
int currOldOrd = newIdToOldOrd.get(newDocId);
|
||||
if (old2NewOrd != null) {
|
||||
old2NewOrd[currOldOrd] = newOrd;
|
||||
}
|
||||
if (new2OldOrd != null) {
|
||||
new2OldOrd[newOrd] = currOldOrd;
|
||||
}
|
||||
if (newDocsWithField != null) {
|
||||
newDocsWithField.add(newDocId);
|
||||
}
|
||||
newOrd++;
|
||||
}
|
||||
}

/** View over multiple vector values supporting iterator-style access via DocIdMerger. */
public static final class MergedVectorValues {
private MergedVectorValues() {}

/** Returns a merged view over all the segment's {@link FloatVectorValues}. */
public static FloatVectorValues mergeFloatVectorValues(
FieldInfo fieldInfo, MergeState mergeState) throws IOException {
private static void validateFieldEncoding(FieldInfo fieldInfo, VectorEncoding expected) {
assert fieldInfo != null && fieldInfo.hasVectorValues();
if (fieldInfo.getVectorEncoding() != VectorEncoding.FLOAT32) {
VectorEncoding fieldEncoding = fieldInfo.getVectorEncoding();
if (fieldEncoding != expected) {
throw new UnsupportedOperationException(
"Cannot merge vectors encoded as [" + fieldInfo.getVectorEncoding() + "] as FLOAT32");
"Cannot merge vectors encoded as [" + fieldEncoding + "] as " + expected);
}
List<VectorValuesSub> subs = new ArrayList<>();
for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) {
KnnVectorsReader knnVectorsReader = mergeState.knnVectorsReaders[i];
}

private static <V, S> List<S> mergeVectorValues(
KnnVectorsReader[] knnVectorsReaders,
MergeState.DocMap[] docMaps,
IOFunction<KnnVectorsReader, V> valuesSupplier,
BiFunction<MergeState.DocMap, V, S> newSub)
throws IOException {
List<S> subs = new ArrayList<>();
for (int i = 0; i < knnVectorsReaders.length; i++) {
KnnVectorsReader knnVectorsReader = knnVectorsReaders[i];
if (knnVectorsReader != null) {
FloatVectorValues values = knnVectorsReader.getFloatVectorValues(fieldInfo.name);
V values = valuesSupplier.apply(knnVectorsReader);
if (values != null) {
subs.add(new VectorValuesSub(mergeState.docMaps[i], values));
subs.add(newSub.apply(docMaps[i], values));
}
}
}
return new MergedFloat32VectorValues(subs, mergeState);
return subs;
}

/** Returns a merged view over all the segment's {@link FloatVectorValues}. */
public static FloatVectorValues mergeFloatVectorValues(
FieldInfo fieldInfo, MergeState mergeState) throws IOException {
validateFieldEncoding(fieldInfo, VectorEncoding.FLOAT32);
return new MergedFloat32VectorValues(
mergeVectorValues(
mergeState.knnVectorsReaders,
mergeState.docMaps,
knnVectorsReader -> {
return knnVectorsReader.getFloatVectorValues(fieldInfo.name);
},
(docMap, values) -> {
return new FloatVectorValuesSub(docMap, values);
}),
mergeState);
}
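Editor's sketch (not part of this commit): the merged view behaves like any other iterator-style FloatVectorValues, so a hypothetical codec merge path can consume it as below; the class and method names are invented for illustration.

    import java.io.IOException;
    import org.apache.lucene.codecs.KnnVectorsWriter.MergedVectorValues;
    import org.apache.lucene.index.FieldInfo;
    import org.apache.lucene.index.FloatVectorValues;
    import org.apache.lucene.index.MergeState;
    import org.apache.lucene.search.DocIdSetIterator;

    // Illustrative merge loop over the merged float vector view.
    final class MergeLoopSketch {
      static void writeMergedField(FieldInfo fieldInfo, MergeState mergeState) throws IOException {
        FloatVectorValues merged = MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState);
        for (int doc = merged.nextDoc();
            doc != DocIdSetIterator.NO_MORE_DOCS;
            doc = merged.nextDoc()) {
          float[] vector = merged.vectorValue(); // vector for merged doc id "doc"
          // a real codec would write (doc, vector) to its on-disk format here
        }
      }
    }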

/** Returns a merged view over all the segment's {@link ByteVectorValues}. */
public static ByteVectorValues mergeByteVectorValues(FieldInfo fieldInfo, MergeState mergeState)
throws IOException {
assert fieldInfo != null && fieldInfo.hasVectorValues();
if (fieldInfo.getVectorEncoding() != VectorEncoding.BYTE) {
throw new UnsupportedOperationException(
"Cannot merge vectors encoded as [" + fieldInfo.getVectorEncoding() + "] as BYTE");
}
List<ByteVectorValuesSub> subs = new ArrayList<>();
for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) {
KnnVectorsReader knnVectorsReader = mergeState.knnVectorsReaders[i];
if (knnVectorsReader != null) {
ByteVectorValues values = knnVectorsReader.getByteVectorValues(fieldInfo.name);
if (values != null) {
subs.add(new ByteVectorValuesSub(mergeState.docMaps[i], values));
}
}
}
return new MergedByteVectorValues(subs, mergeState);
validateFieldEncoding(fieldInfo, VectorEncoding.BYTE);
return new MergedByteVectorValues(
mergeVectorValues(
mergeState.knnVectorsReaders,
mergeState.docMaps,
knnVectorsReader -> {
return knnVectorsReader.getByteVectorValues(fieldInfo.name);
},
(docMap, values) -> {
return new ByteVectorValuesSub(docMap, values);
}),
mergeState);
}

static class MergedFloat32VectorValues extends FloatVectorValues {
private final List<VectorValuesSub> subs;
private final DocIDMerger<VectorValuesSub> docIdMerger;
private final List<FloatVectorValuesSub> subs;
private final DocIDMerger<FloatVectorValuesSub> docIdMerger;
private final int size;
private int docId;
VectorValuesSub current;
FloatVectorValuesSub current;

private MergedFloat32VectorValues(List<VectorValuesSub> subs, MergeState mergeState)
private MergedFloat32VectorValues(List<FloatVectorValuesSub> subs, MergeState mergeState)
throws IOException {
this.subs = subs;
docIdMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);
int totalSize = 0;
for (VectorValuesSub sub : subs) {
for (FloatVectorValuesSub sub : subs) {
totalSize += sub.values.size();
}
size = totalSize;
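The rest of MergedFloat32VectorValues is not shown in this truncated diff. As an editor's sketch only, a DocIDMerger-backed view of this shape typically advances as below; this assumes the docId/current fields shown above and is not a verbatim copy of the omitted code.

    // Sketch of a typical DocIDMerger-backed advance, inside MergedFloat32VectorValues:
    @Override
    public int nextDoc() throws IOException {
      current = docIdMerger.next(); // next sub in merged doc id order, or null when exhausted
      if (current == null) {
        docId = DocIdSetIterator.NO_MORE_DOCS;
      } else {
        docId = current.mappedDocID; // doc id remapped into the merged segment
      }
      return docId;
    }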