mirror of https://github.com/apache/lucene.git

commit 0a0701995a: Merge branch 'apache:main' into bpv21_main
@@ -23,6 +23,7 @@ Apache Lucene is a high-performance, full-featured text search engine library
 written in Java.
 
 [![Build Status](https://ci-builds.apache.org/job/Lucene/job/Lucene-Artifacts-main/badge/icon?subject=Lucene)](https://ci-builds.apache.org/job/Lucene/job/Lucene-Artifacts-main/)
+[![Revved up by Develocity](https://img.shields.io/badge/Revved%20up%20by-Develocity-06A0CE?logo=Gradle&labelColor=02303A)](https://ge.apache.org/scans?search.buildToolType=gradle&search.rootProjectNames=lucene-root)
 
 ## Online Documentation
 
@@ -41,7 +41,7 @@ import jdk.jfr.consumer.RecordingFile;
  */
 public class ProfileResults {
   /** Formats a frame to a formatted line. This is deduplicated on! */
-  static String frameToString(RecordedFrame frame, boolean lineNumbers) {
+  static String frameToString(RecordedFrame frame, boolean lineNumbers, boolean frameTypes) {
     StringBuilder builder = new StringBuilder();
     RecordedMethod method = frame.getMethod();
     RecordedClass clazz = method.getType();
@@ -55,13 +55,14 @@ public class ProfileResults {
     builder.append("#");
     builder.append(method.getName());
     builder.append("()");
-    if (lineNumbers) {
+    if (lineNumbers && frame.getLineNumber() != -1) {
       builder.append(":");
-      if (frame.getLineNumber() == -1) {
-        builder.append("(" + frame.getType() + " code)");
-      } else {
-        builder.append(frame.getLineNumber());
-      }
+      builder.append(frame.getLineNumber());
+    }
+    if (clazz != null && frameTypes) {
+      builder.append(" [");
+      builder.append(frame.getType());
+      builder.append(" code]");
     }
     return builder.toString();
   }
@@ -77,6 +78,8 @@ public class ProfileResults {
   public static final String COUNT_DEFAULT = "10";
   public static final String LINENUMBERS_KEY = "tests.profile.linenumbers";
   public static final String LINENUMBERS_DEFAULT = "false";
+  public static final String FRAMETYPES_KEY = "tests.profile.frametypes";
+  public static final String FRAMETYPES_DEFAULT = "true";
 
   /**
    * Driver method, for testing standalone.
@@ -92,7 +95,8 @@ public class ProfileResults {
         System.getProperty(MODE_KEY, MODE_DEFAULT),
         Integer.parseInt(System.getProperty(STACKSIZE_KEY, STACKSIZE_DEFAULT)),
         Integer.parseInt(System.getProperty(COUNT_KEY, COUNT_DEFAULT)),
-        Boolean.parseBoolean(System.getProperty(LINENUMBERS_KEY, LINENUMBERS_DEFAULT)));
+        Boolean.parseBoolean(System.getProperty(LINENUMBERS_KEY, LINENUMBERS_DEFAULT)),
+        Boolean.parseBoolean(System.getProperty(FRAMETYPES_KEY, FRAMETYPES_DEFAULT)));
   }
 
   /** true if we care about this event */
@@ -152,7 +156,12 @@ public class ProfileResults {
 
   /** Process all the JFR files passed in args and print a merged summary. */
   public static void printReport(
-      List<String> files, String mode, int stacksize, int count, boolean lineNumbers)
+      List<String> files,
+      String mode,
+      int stacksize,
+      int count,
+      boolean lineNumbers,
+      boolean frameTypes)
       throws IOException {
     if (!"cpu".equals(mode) && !"heap".equals(mode)) {
       throw new IllegalArgumentException("tests.profile.mode must be one of (cpu,heap)");
@@ -181,7 +190,7 @@ public class ProfileResults {
         if (stack.length() > 0) {
           stack.append("\n").append(framePadding).append(" at ");
         }
-        stack.append(frameToString(trace.getFrames().get(i), lineNumbers));
+        stack.append(frameToString(trace.getFrames().get(i), lineNumbers, frameTypes));
       }
       String line = stack.toString();
       SimpleEntry<String, Long> entry =
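The hunks above thread a new frameTypes option through the profiling report. A minimal, hedged sketch of calling the reworked printReport signature directly; the signature comes from the diff, while the wrapper class, JFR file path and option values are illustrative only:

    import java.io.IOException;
    import java.util.List;

    public class ProfileReportDemo {
      public static void main(String[] args) throws IOException {
        ProfileResults.printReport(
            List.of("build/jfr/profile-1.jfr"), // hypothetical recording produced by a test run
            "cpu",   // tests.profile.mode: "cpu" or "heap"
            1,       // tests.profile.stacksize
            10,      // tests.profile.count
            false,   // tests.profile.linenumbers
            true);   // tests.profile.frametypes: append the JFR frame type, e.g. " [JIT compiled code]"
      }
    }

The same toggles are exposed through the system properties shown above; tests.profile.frametypes defaults to "true".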
@@ -231,8 +231,8 @@ public class MissingDoclet extends StandardDoclet {
       case PACKAGE:
         checkComment(element);
         break;
         // class-like elements, check them, then recursively check their children (fields and
         // methods)
       case CLASS:
       case INTERFACE:
       case ENUM:
@@ -257,7 +257,7 @@ public class MissingDoclet extends StandardDoclet {
           }
         }
         break;
         // method-like elements, check them if we are configured to do so
       case METHOD:
       case CONSTRUCTOR:
       case FIELD:
build.gradle

@@ -80,6 +80,9 @@ ext {
   // Minimum Java version required to compile and run Lucene.
   minJavaVersion = JavaVersion.toVersion(deps.versions.minJava.get())
 
+  // also change this in extractor tool: ExtractForeignAPI
+  vectorIncubatorJavaVersions = [ JavaVersion.VERSION_21, JavaVersion.VERSION_22, JavaVersion.VERSION_23 ] as Set
+
   // snapshot build marker used in scripts.
   snapshotBuild = version.contains("SNAPSHOT")
 
@@ -117,10 +120,6 @@ apply from: file('gradle/generation/local-settings.gradle')
 // Make sure the build environment is consistent.
 apply from: file('gradle/validation/check-environment.gradle')
 
-// IDE support, settings and specials.
-apply from: file('gradle/ide/intellij-idea.gradle')
-apply from: file('gradle/ide/eclipse.gradle')
-
 // Set up defaults and configure aspects for certain modules or functionality
 // (java, tests)
 apply from: file('gradle/java/folder-layout.gradle')
@@ -133,6 +132,10 @@ apply from: file('gradle/testing/alternative-jdk-support.gradle')
 apply from: file('gradle/java/jar-manifest.gradle')
 apply from: file('gradle/java/modules.gradle')
 
+// IDE support, settings and specials.
+apply from: file('gradle/ide/intellij-idea.gradle')
+apply from: file('gradle/ide/eclipse.gradle')
+
 // Maven artifact publishing.
 apply from: file('gradle/maven/publications.gradle')
 
@@ -67,6 +67,13 @@
     </maintainer>
 
     <!-- NOTE: please insert releases in numeric order, NOT chronologically. -->
+    <release>
+      <Version>
+        <name>lucene-9.11.1</name>
+        <created>2024-06-27</created>
+        <revision>9.11.1</revision>
+      </Version>
+    </release>
     <release>
       <Version>
         <name>lucene-9.11.0</name>
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import re
+import subprocess
+import sys
+import tempfile
+import urllib.request
+
+'''
+A simple tool to see diffs between main's version of CHANGES.txt entries for
+a given release vs the stable branch's version.  It's best to keep these 1)
+identical and 2) matching what changes were actually backported to be honest
+to users and avoid future annoying conflicts on backport.
+'''
+
+# e.g. python3 -u diff_lucene_changes.py branch_9_9 main 9.9.0
+#
+
+def get_changes_url(branch_name):
+  if os.path.isdir(branch_name):
+    url = f'file://{branch_name}/lucene/CHANGES.txt'
+  else:
+    url = f'https://raw.githubusercontent.com/apache/lucene/{branch_name}/lucene/CHANGES.txt'
+  print(f'NOTE: resolving {branch_name} --> {url}')
+  return url
+
+def extract_release_section(changes_txt, release_name):
+  return re.search(f'=======+ Lucene {re.escape(release_name)} =======+(.*?)=======+ Lucene .*? =======+$',
+                   changes_txt.decode('utf-8'), re.MULTILINE | re.DOTALL).group(1).encode('utf-8')
+
+def main():
+  if len(sys.argv) < 3 or len(sys.argv) > 5:
+    print('\nUsage: python3 -u dev-tools/scripts/diff_lucene_changes.py <branch1-or-local-clone> <branch2-or-local-clone> <release-name> [diff-commandline-extras]\n')
+    print('  e.g.: python3 -u dev-tools/scripts/diff_lucene_changes.py branch_9_9 /l/trunk 9.9.0 "-w"\n')
+    sys.exit(1)
+
+  branch1 = sys.argv[1]
+  branch2 = sys.argv[2]
+  release_name = sys.argv[3]
+
+  if len(sys.argv) > 4:
+    diff_cl_extras = [sys.argv[4]]
+  else:
+    diff_cl_extras = []
+
+  branch1_changes = extract_release_section(urllib.request.urlopen(get_changes_url(branch1)).read(),
+                                            release_name)
+  branch2_changes = extract_release_section(urllib.request.urlopen(get_changes_url(branch2)).read(),
+                                            release_name)
+
+  with tempfile.NamedTemporaryFile() as f1, tempfile.NamedTemporaryFile() as f2:
+    f1.write(branch1_changes)
+    f2.write(branch2_changes)
+
+    command = ['diff'] + diff_cl_extras + [f1.name, f2.name]
+
+    # diff returns non-zero exit status when there are diffs, so don't pass check=True
+    print(subprocess.run(command, check=False, capture_output=True).stdout.decode('utf-8'))
+
+if __name__ == '__main__':
+  main()
@@ -17,13 +17,6 @@
 
 def resources = scriptResources(buildscript)
 
-configure(rootProject) {
-  ext {
-    // also change this in extractor tool: ExtractForeignAPI
-    vectorIncubatorJavaVersions = [ JavaVersion.VERSION_21, JavaVersion.VERSION_22 ] as Set
-  }
-}
-
 configure(project(":lucene:core")) {
   ext {
     apijars = layout.projectDirectory.dir("src/generated/jdk")
@@ -23,7 +23,7 @@ configure(project(":lucene:core")) {
     description "Regenerate gen_ForUtil.py"
     group "generation"
 
-    def genDir = file("src/java/org/apache/lucene/codecs/lucene99")
+    def genDir = file("src/java/org/apache/lucene/codecs/lucene912")
     def genScript = file("${genDir}/gen_ForUtil.py")
     def genOutput = file("${genDir}/ForUtil.java")
 
@@ -43,6 +43,31 @@ configure(project(":lucene:core")) {
       andThenTasks: ["spotlessJava", "spotlessJavaApply"],
       mustRunBefore: [ "compileJava" ]
   ])
+
+  task generateForDeltaUtilInternal() {
+    description "Regenerate gen_ForDeltaUtil.py"
+    group "generation"
+
+    def genDir = file("src/java/org/apache/lucene/codecs/lucene912")
+    def genScript = file("${genDir}/gen_ForDeltaUtil.py")
+    def genOutput = file("${genDir}/ForDeltaUtil.java")
+
+    inputs.file genScript
+    outputs.file genOutput
+
+    doLast {
+      quietExec {
+        workingDir genDir
+        executable project.externalTool("python3")
+        args = [ '-B', genScript ]
+      }
+    }
+  }
+
+  regenerate.dependsOn wrapWithPersistentChecksums(generateForDeltaUtilInternal, [
+      andThenTasks: ["spotlessJava", "spotlessJavaApply"],
+      mustRunBefore: [ "compileJava" ]
+  ])
 }
 
 configure(project(":lucene:backward-codecs")) {
@@ -96,5 +121,30 @@ configure(project(":lucene:backward-codecs")) {
       andThenTasks: ["spotlessJava", "spotlessJavaApply"],
       mustRunBefore: [ "compileJava" ]
   ])
+
+  task generateForUtil99Internal() {
+    description "Regenerate gen_ForUtil.py"
+    group "generation"
+
+    def genDir = file("src/java/org/apache/lucene/backward_codecs/lucene99")
+    def genScript = file("${genDir}/gen_ForUtil.py")
+    def genOutput = file("${genDir}/ForUtil.java")
+
+    inputs.file genScript
+    outputs.file genOutput
+
+    doLast {
+      quietExec {
+        workingDir genDir
+        executable project.externalTool("python3")
+        args = [ '-B', genScript ]
+      }
+    }
+  }
+
+  regenerate.dependsOn wrapWithPersistentChecksums(generateForUtil99Internal, [
+      andThenTasks: ["spotlessJava", "spotlessJavaApply"],
+      mustRunBefore: [ "compileJava" ]
+  ])
 }
@@ -65,10 +65,8 @@ configure(project(":lucene:analysis:icu")) {
     icupkg = file("${icuBinDir}/icupkg")
   }
 
-  // Resolve version lazily (can't resolve at configuration time).
-  def icu4jVersionProvider = project.provider { getVersion('com.ibm.icu', 'icu4j') }
   // lazy gstring with ICU version.
-  def icu4jVersion = "${-> icu4jVersionProvider.get()}"
+  def icu4jVersion = deps.icu4j.get().version
 
   def icuCompileTask = Os.isFamily(Os.FAMILY_WINDOWS) ? "compileIcuWindows" : "compileIcuLinux"
 
@@ -22,10 +22,11 @@ import org.gradle.plugins.ide.eclipse.model.ClasspathEntry
 def resources = scriptResources(buildscript)
 
 configure(rootProject) {
-  plugins.withType(JavaPlugin) {
-    apply plugin: "eclipse"
+  if (gradle.startParameter.taskNames.contains("eclipse")) {
+    project.pluginManager.apply("java-base")
+    project.pluginManager.apply("eclipse")
 
-    def eclipseJavaVersion = propertyOrDefault("eclipse.javaVersion", rootProject.minJavaVersion)
+    def eclipseJavaVersion = propertyOrDefault("eclipse.javaVersion", deps.versions.minJava.get())
     def relativize = { other -> rootProject.rootDir.relativePath(other).toString() }
 
     eclipse {
@@ -105,9 +106,9 @@ configure(rootProject) {
       }
     }
 
     eclipseJdt {
       enabled = false
-      dependsOn 'luceneEclipse'
+      dependsOn 'luceneEclipseJdt'
     }
 
     eclipseClasspath {
@@ -75,6 +75,18 @@ configure(rootProject) {
     it.dependsOn(":versionCatalogFormatDeps")
   }
 
+  // correct crlf/ default encoding after version catalog formatting finishes.
+  tasks.matching {
+    it.path in [
+        ":versionCatalogFormatDeps"
+    ]
+  }.configureEach {
+    it.doLast {
+      ant.fixcrlf(file: it.catalogFile.get().asFile,
+          eol: "lf", fixlast: "true", encoding: "UTF-8")
+    }
+  }
+
   tasks.matching {
     it.path in [
         ":versionCatalogUpdateDeps"
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-@defaultMessage Spawns threads with vague names; use a custom thread factory (Lucene's NamedThreadFactory, Solr's SolrNamedThreadFactory) and name threads so that you can tell (by its name) which executor it is associated with
+@defaultMessage Spawns threads with vague names; use a custom thread factory (Lucene's NamedThreadFactory) and name threads so that you can tell (by its name) which executor it is associated with
 java.util.concurrent.Executors#newFixedThreadPool(int)
 java.util.concurrent.Executors#newSingleThreadExecutor()
 java.util.concurrent.Executors#newCachedThreadPool()
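The forbidden-apis rule above only bans the Executors factory overloads that spawn anonymously named threads; the overloads that accept a ThreadFactory remain usable. A hedged sketch of the pattern the message recommends, assuming Lucene's NamedThreadFactory is on the classpath:

    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import org.apache.lucene.util.NamedThreadFactory;

    public class NamedPoolExample {
      public static void main(String[] args) throws Exception {
        // Name the pool so a thread dump shows which executor a thread belongs to.
        ExecutorService searchPool =
            Executors.newFixedThreadPool(4, new NamedThreadFactory("search"));
        try {
          searchPool.submit(() -> System.out.println(Thread.currentThread().getName())).get();
        } finally {
          searchPool.shutdown();
        }
      }
    }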
@@ -20,6 +20,10 @@
 // 2) notice file
 // 3) checksum validation/ generation.
 
+// WARNING: The tasks in this file share internal state between tasks without using files.
+// Because of this all tasks here must always execute together, so they cannot define task outputs.
+// TODO: Rewrite the internal state to use state files containing the ext.jarInfos and its referencedFiles
+
 // This should be false only for debugging.
 def failOnError = true
 
@@ -194,13 +198,6 @@ subprojects {
       description = "Validate license and notice files of dependencies"
       dependsOn collectJarInfos
 
-      def outputFileName = 'validateJarLicenses'
-      inputs.dir(file(project.rootDir.path + '/lucene/licenses'))
-          .withPropertyName('licenses')
-          .withPathSensitivity(PathSensitivity.RELATIVE)
-      outputs.file(layout.buildDirectory.file(outputFileName))
-          .withPropertyName('validateJarLicensesResult')
-
       doLast {
         def errors = []
         jarInfos.each { dep ->
@@ -246,9 +243,7 @@ subprojects {
            }
          }
        }
-        // Required to take advantage of incremental building and the build cache
-        def f = new File(project.buildDir.path + "/" + outputFileName)
-        f.write(errors.toString(), "UTF-8")
        if (errors) {
          def msg = "Certain license/ notice files are missing:\n - " + errors.join("\n - ")
          if (failOnError) {
@@ -80,10 +80,6 @@ API Changes
 * GITHUB#12875: Ensure token position is always increased in PathHierarchyTokenizer and ReversePathHierarchyTokenizer
   and resulting tokens do not overlap. (Michael Froh, Lukáš Vlček)
 
-* GITHUB#12624, GITHUB#12831: Allow FSTCompiler to stream to any DataOutput while building, and
-  make compile() only return the FSTMetadata. For on-heap (default) use case, please use
-  FST.fromFSTReader(fstMetadata, fstCompiler.getFSTReader()) to create the FST. (Anh Dung Bui)
-
 * GITHUB#13146, GITHUB#13148: Remove ByteBufferIndexInput and only use MemorySegment APIs
   for MMapDirectory. (Uwe Schindler)
 
@@ -112,6 +108,11 @@ API Changes
 
 * GITHUB#13410: Removed Scorer#getWeight (Sanjay Dutt, Adrien Grand)
 
+* GITHUB#13499: Remove deprecated TopScoreDocCollector + TopFieldCollector methods (#create, #createSharedManager) (Jakub Slowinski)
+
+* GITHUB#13632: CandidateMatcher public matching functions (Bryan Jacobowitz)
+
 New Features
 ---------------------
 
@@ -133,6 +134,16 @@ New Features
   DocValuesSkipper abstraction. A new flag is added to FieldType.java that configures whether
   to create a "skip index" for doc values. (Ignacio Vera)
 
+* GITHUB#13563: Add levels to doc values skip index. (Ignacio Vera)
+
+* GITHUB#13597: Align doc value skipper interval boundaries when an interval contains a constant
+  value. (Ignacio Vera)
+
+* GITHUB#13604: Add Kmeans clustering on vectors (Mayya Sharipova, Jim Ferenczi, Tom Veasey)
+
+* GITHUB#13592: Take advantage of the doc value skipper when it is primary sort in SortedNumericDocValuesRangeQuery
+  and SortedSetDocValuesRangeQuery. (Ignacio Vera)
+
 Improvements
 ---------------------
 
@@ -168,6 +179,8 @@ Optimizations
 
 * GITHUB#12552: Make FSTPostingsFormat load FSTs off-heap. (Tony X)
 
+* GITHUB#13672: Leverage doc value skip lists in DocValuesRewriteMethod if indexed. (Greg Miller)
+
 Bug Fixes
 ---------------------
 
@@ -205,6 +218,9 @@ Changes in Backwards Compatibility Policy
 * GITHUB#13230: Remove the Kp and Lovins snowball algorithms which are not supported
   or intended for general use. (Robert Muir)
 
+* GITHUB#13602: SearchWithCollectorTask no longer supports the `collector.class` config parameter to load a custom
+  collector implementation. `collector.manager.class` allows users to load a collector manager instead. (Luca Cavanna)
+
 Other
 ---------------------
 
@@ -243,22 +259,71 @@ Other
 
 * GITHUB#13332: Improve MissingDoclet linter to check records correctly. (Uwe Schindler)
 
+* GITHUB#13499: Remove usage of TopScoreDocCollector + TopFieldCollector deprecated methods (#create, #createSharedManager) (Jakub Slowinski)
+
+Build
+---------------------
+
+* GITHUB#13649: Fix eclipse ide settings generation #13649 (Uwe Schindler, Dawid Weiss)
+
 ======================== Lucene 9.12.0 =======================
 
 API Changes
 ---------------------
 
-* GITHUB#13281: Mark COSINE VectorSimilarityFunction as deprecated. (Pulkit Gupta)
-
 * GITHUB#13469: Expose FlatVectorsFormat as a first-class format; can be configured using a custom Codec. (Michael Sokolov)
 
+* GITHUB#13612: Hunspell: add Suggester#proceedPastRep to avoid losing relevant suggestions. (Peter Gromov)
+
+* GITHUB#13603: Introduced `IndexSearcher#searchLeaf(LeafReaderContext, Weight, Collector)` protected method to
+  facilitate customizing per-leaf behavior of search without requiring to override
+  `search(LeafReaderContext[], Weight, Collector)` which requires overriding the entire loop across the leaves (Luca Cavanna)
+
+* GITHUB#13559: Add BitSet#nextSetBit(int, int) to get the index of the first set bit in range. (Egor Potemkin)
+
+* GITHUB#13568: Add DoubleValuesSource#toSortableLongDoubleValuesSource and
+  MultiDoubleValuesSource#toSortableMultiLongValuesSource methods. (Shradha Shankar)
+
+* GITHUB#13568: Add CollectorOwner class that wraps CollectorManager, and handles list of Collectors and results.
+  Add IndexSearcher#search method that takes CollectorOwner. (Egor Potemkin)
+
+* GITHUB#13568: Add DrillSideways#search method that supports any collector types for any drill-sideways dimensions
+  or drill-down. (Egor Potemkin)
+
 New Features
 ---------------------
-(No changes)
+
+* GITHUB#13430: Allow configuring the search concurrency via
+  TieredMergePolicy#setTargetSearchConcurrency. This in-turn instructs the
+  merge policy to try to have at least this number of segments on the highest
+  tier. (Adrien Grand, Carlos Delgado)
+
+* GITHUB#13517: Allow configuring the search concurrency on LogDocMergePolicy
+  and LogByteSizeMergePolicy via a new #setTargetConcurrency setter.
+  (Adrien Grand)
+
+* GITHUB#13568: Add sandbox facets module to compute facets while collecting. (Egor Potemkin, Shradha Shankar)
+
+* GITHUB#13678: Add support JDK 23 to the Panama Vectorization Provider. (Chris Hegarty)
 
 Improvements
 ---------------------
-(No changes)
+
+* GITHUB#13548: Refactor and javadoc update for KNN vector writer classes. (Patrick Zhai)
+
+* GITHUB#13562: Add Intervals.regexp and Intervals.range methods to produce IntervalsSource
+  for regexp and range queries. (Mayya Sharipova)
+
+* GITHUB#13625: Remove BitSet#nextSetBit code duplication. (Greg Miller)
+
+* GITHUB#13285: Early terminate graph searches of AbstractVectorSimilarityQuery to follow timeout set from
+  IndexSearcher#setTimeout(QueryTimeout). (Kaival Parikh)
+
+* GITHUB#13633: Add ability to read/write knn vector values to a MemoryIndex. (Ben Trent)
+
+* GITHUB#12627: patch HNSW graphs to improve reachability of all nodes from entry points
+
+* GITHUB#13201: Better cost estimation on MultiTermQuery over few terms. (Michael Froh)
 
 Optimizations
 ---------------------
@@ -277,16 +342,100 @@ Optimizations
 
 * GITHUB#12941: Don't preserve auxiliary buffer contents in LSBRadixSorter if it grows. (Stefan Vodita)
 
+* GITHUB#13175: Stop double-checking priority queue inserts in some FacetCount classes. (Jakub Slowinski)
+
+* GITHUB#13538: Slightly reduce heap usage for HNSW and scalar quantized vector writers. (Ben Trent)
+
+* GITHUB#12100: WordBreakSpellChecker.suggestWordBreaks now does a breadth first search, allowing it to return
+  better matches with fewer evaluations (hossman)
+
+* GITHUB#13582: Stop requiring MaxScoreBulkScorer's outer window from having at
+  least INNER_WINDOW_SIZE docs. (Adrien Grand)
+
+* GITHUB#13570, GITHUB#13574, GITHUB#13535: Avoid performance degradation with closing shared Arenas.
+  Closing many individual index files can potentially lead to a degradation in execution performance.
+  Index files are mmapped one-to-one with the JDK's foreign shared Arena. The JVM deoptimizes the top
+  few frames of all threads when closing a shared Arena (see JDK-8335480). We mitigate this situation
+  when running with JDK 21 and greater, by 1) using a confined Arena where appropriate, and 2) grouping
+  files from the same segment to a single shared Arena.
+  A system property has been added that allows to control the total maximum number of mmapped files
+  that may be associated with a single shared Arena. For example, to set the max number of permits to
+  256, pass the following on the command line
+  -Dorg.apache.lucene.store.MMapDirectory.sharedArenaMaxPermits=256. Setting a value of 1 associates
+  a single file to a single shared arena.
+  (Chris Hegarty, Michael Gibney, Uwe Schindler)
+
+* GITHUB#13585: Lucene912PostingsFormat, the new default postings format, now
+  only has 2 levels of skip data, which are inlined into postings instead of
+  being stored at the end of postings lists. This translates into better
+  performance for queries that need skipping such as conjunctions.
+  (Adrien Grand)
+
+* GITHUB#13581: OnHeapHnswGraph no longer allocates a lock for every graph node (Mike Sokolov)
+
+* GITHUB#13636, GITHUB#13658: Optimizations to the decoding logic of blocks of
+  postings. (Adrien Grand, Uwe Schindler, Greg Miller)
+
+* GITHUB##13644: Improve NumericComparator competitive iterator logic by comparing the missing value with the top
+  value even after the hit queue is full (Pan Guixin)
+
+* GITHUB#13587: Use Max WAND optimizations with ToParentBlockJoinQuery when using ScoreMode.Max (Mike Pellegrini)
+
+Changes in runtime behavior
+---------------------
+
+* GITHUB#13472: When an executor is provided to the IndexSearcher constructor, the searcher now executes tasks on the
+  thread that invoked a search as well as its configured executor. Users should reduce the executor's thread-count by 1
+  to retain the previous level of parallelism. Moreover, it is now possible to start searches from the same executor
+  that is configured in the IndexSearcher without risk of deadlocking. A separate executor for starting searches is no
+  longer required. (Armin Braun)
+
 Bug Fixes
 ---------------------
 
+* GITHUB#13384: Fix highlighter to use longer passages instead of shorter individual terms. (Zack Kendall)
+
 * GITHUB#13463: Address bug in MultiLeafKnnCollector causing #minCompetitiveSimilarity to stay artificially low in
   some corner cases. (Greg Miller)
 
+* GITHUB#13553: Correct RamUsageEstimate for scalar quantized knn vector formats so that raw vectors are correctly
+  accounted for. (Ben Trent)
+
+* GITHUB#13615: Correct scalar quantization when used in conjunction with COSINE similarity. Vectors are normalized
+  before quantization to ensure the cosine similarity is correctly calculated. (Ben Trent)
+
+* GITHUB#13627: Fix race condition on flush for DWPT seqNo generation. (Ben Trent, Ao Li)
+
+* GITHUB#13691: Fix incorrect exponent value in explain of SigmoidFunction. (Owais Kazi)
+
+Build
+---------------------
+
+* GITHUB#13695, GITHUB#13696: Fix Gradle build sometimes gives spurious "unreferenced license file" warnings.
+  (Uwe Schindler)
+
 Other
 --------------------
 (No changes)
 
+======================== Lucene 9.11.1 =======================
+
+Bug Fixes
+---------------------
+
+* GITHUB#13498: Avoid performance regression by constructing lazily the PointTree in NumericComparator. (Ignacio Vera)
+
+* GITHUB#13501, GITHUB#13478: Remove intra-merge parallelism for everything except HNSW graph merges. (Ben Trent)
+
+* GITHUB#13498, GITHUB#13340: Allow adding a parent field to an index with no fields (Michael Sokolov)
+
+* GITHUB#12431: Fix IndexOutOfBoundsException thrown in DefaultPassageFormatter
+  by unordered matches. (Stephane Campinas)
+
+* GITHUB#13493: StringValueFacetCounts stops throwing NPE when faceting over an empty match-set. (Grebennikov Roman,
+  Stefan Vodita)
+
+
 ======================== Lucene 9.11.0 =======================
 
 API Changes
@@ -494,6 +643,10 @@ API Changes
 
 * GITHUB#12854: Mark DrillSideways#createDrillDownFacetsCollector as @Deprecated. (Greg Miller)
 
+* GITHUB#12624, GITHUB#12831: Allow FSTCompiler to stream to any DataOutput while building, and
+  make compile() only return the FSTMetadata. For on-heap (default) use case, please use
+  FST.fromFSTReader(fstMetadata, fstCompiler.getFSTReader()) to create the FST. (Anh Dung Bui)
+
 New Features
 ---------------------
 * GITHUB#12679: Add support for similarity-based vector searches using [Byte|Float]VectorSimilarityQuery. Uses a new
@@ -501,6 +654,12 @@ New Features
   better-scoring nodes are available, or the best candidate is below a score of `traversalSimilarity` in the lowest
   level. (Aditya Prakash, Kaival Parikh)
 
+* GITHUB#12829: For indices newly created as of 9.10.0 onwards, IndexWriter preserves document blocks indexed via
+  IndexWriter#addDocuments or IndexWriter#updateDocuments also when index sorting is configured. Document blocks are
+  maintained alongside their parent documents during sort and merge. IndexWriterConfig accepts a parent field that is used
+  to maintain block orders if index sorting is used. Note, this is fully optional in Lucene 9.x while will be mandatory for
+  indices that use document blocks together with index sorting as of 10.0.0. (Simon Willnauer)
+
 * GITHUB#12336: Index additional data per facet label in the taxonomy. (Shai Erera, Egor Potemkin, Mike McCandless,
   Stefan Vodita)
 
@@ -592,7 +751,6 @@ Build
 
 Other
 ---------------------
 
 * GITHUB#11023: Removing some dead code in CheckIndex. (Jakub Slowinski)
 
 * GITHUB#11023: Removing @lucene.experimental tags in testXXX methods in CheckIndex. (Jakub Slowinski)
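Several of the 9.12.0 entries above add extension points rather than behavior changes. For instance, the GITHUB#13603 entry introduces a protected IndexSearcher#searchLeaf(LeafReaderContext, Weight, Collector) hook. A hedged sketch of overriding it; the signature is quoted from that entry, while the subclass name and timing logic are illustrative:

    import java.io.IOException;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.LeafReaderContext;
    import org.apache.lucene.search.Collector;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Weight;

    public class TimingSearcher extends IndexSearcher {
      public TimingSearcher(IndexReader reader) {
        super(reader);
      }

      @Override
      protected void searchLeaf(LeafReaderContext ctx, Weight weight, Collector collector)
          throws IOException {
        long start = System.nanoTime();
        super.searchLeaf(ctx, weight, collector); // default per-leaf search
        System.out.println("leaf " + ctx.ord + " took " + (System.nanoTime() - start) + " ns");
      }
    }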
@@ -1,5 +1,5 @@
 {
   "gradle/generation/jflex/skeleton.default.txt": "58944f66c9113a940dfaf6a17210ec8219024390",
-  "lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.java": "1f7a446f3483326385eef257cea8366c27da0850",
+  "lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.java": "e62dcd8c25219d8f5d783823b228ffe38d2bacde",
   "lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.jflex": "f52109bb7d5701979fde90aeeeda726246a8d5fd"
 }
@@ -1,5 +1,5 @@
 {
   "gradle/generation/jflex/skeleton.default.txt": "58944f66c9113a940dfaf6a17210ec8219024390",
-  "lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java": "ac298e08bc5b96202efca0c01f9f0376fda976bd",
+  "lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java": "2b5df5ff35543a6380c82f298225eb5fa06e4453",
   "lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex": "0b8c7774b98e8237702013e82c352d4711509bd0"
 }
@@ -37,23 +37,23 @@ class BengaliNormalizer {
 
     for (int i = 0; i < len; i++) {
       switch (s[i]) {
          // delete Chandrabindu
        case '\u0981':
          len = delete(s, i, len);
          i--;
          break;
 
          // DirghoI kar -> RosshoI kar
        case '\u09C0':
          s[i] = '\u09BF';
          break;
 
          // DirghoU kar -> RosshoU kar
        case '\u09C2':
          s[i] = '\u09C1';
          break;
 
          // Khio (Ka + Hoshonto + Murdorno Sh)
        case '\u0995':
          if (i + 2 < len && s[i + 1] == '\u09CD' && s[i + 2] == '\u09BF') {
            if (i == 0) {
@@ -67,12 +67,12 @@ class BengaliNormalizer {
          }
          break;
 
          // Nga to Anusvara
        case '\u0999':
          s[i] = '\u0982';
          break;
 
          // Ja Phala
        case '\u09AF':
          if (i - 2 == 0 && s[i - 1] == '\u09CD') {
            s[i - 1] = '\u09C7';
@@ -89,7 +89,7 @@ class BengaliNormalizer {
          }
          break;
 
          // Ba Phalaa
        case '\u09AC':
          if ((i >= 1 && s[i - 1] != '\u09CD') || i == 0) {
            break;
@@ -109,7 +109,7 @@ class BengaliNormalizer {
          }
          break;
 
          // Visarga
        case '\u0983':
          if (i == len - 1) {
            if (len <= 3) {
@@ -122,18 +122,18 @@ class BengaliNormalizer {
          }
          break;
 
          // All sh
        case '\u09B6':
        case '\u09B7':
          s[i] = '\u09B8';
          break;
 
          // check na
        case '\u09A3':
          s[i] = '\u09A8';
          break;
 
          // check ra
        case '\u09DC':
        case '\u09DD':
          s[i] = '\u09B0';
@@ -747,70 +747,70 @@ class ClassicTokenizerImpl {
            /* Break so we don't hit fall-through warning: */
            break; /* ignore */
          }
          // fall through
        case 11:
          break;
        case 2:
          {
            return ALPHANUM;
          }
          // fall through
        case 12:
          break;
        case 3:
          {
            return CJ;
          }
          // fall through
        case 13:
          break;
        case 4:
          {
            return NUM;
          }
          // fall through
        case 14:
          break;
        case 5:
          {
            return HOST;
          }
          // fall through
        case 15:
          break;
        case 6:
          {
            return COMPANY;
          }
          // fall through
        case 16:
          break;
        case 7:
          {
            return APOSTROPHE;
          }
          // fall through
        case 17:
          break;
        case 8:
          {
            return ACRONYM_DEP;
          }
          // fall through
        case 18:
          break;
        case 9:
          {
            return ACRONYM;
          }
          // fall through
        case 19:
          break;
        case 10:
          {
            return EMAIL;
          }
          // fall through
        case 20:
          break;
        default:
@@ -53,18 +53,18 @@ public final class GreekLowerCaseFilter extends TokenFilter {
 
  private int lowerCase(int codepoint) {
    switch (codepoint) {
        /* There are two lowercase forms of sigma:
         *   U+03C2: small final sigma (end of word)
         *   U+03C3: small sigma (otherwise)
         *
         * Standardize both to U+03C3
         */
      case '\u03C2': /* small final sigma */
        return '\u03C3'; /* small sigma */
 
        /* Some greek characters contain diacritics.
         * This filter removes these, converting to the lowercase base form.
         */
 
      case '\u0386': /* capital alpha with tonos */
      case '\u03AC': /* small alpha with tonos */
@@ -100,9 +100,9 @@ public final class GreekLowerCaseFilter extends TokenFilter {
      case '\u03CE': /* small omega with tonos */
        return '\u03C9'; /* small omega */
 
        /* The previous implementation did the conversion below.
         * Only implemented for backwards compatibility with old indexes.
         */
 
      case '\u03A2': /* reserved */
        return '\u03C2'; /* small final sigma */
@@ -456,7 +456,7 @@ class PorterStemmer {
          /* j >= 0 fixes Bug 2 */
          if (ends("ou")) break;
          return;
          /* takes care of -ous */
        case 's':
          if (ends("ism")) break;
          return;
@@ -67,7 +67,7 @@ public final class IrishLowerCaseFilter extends TokenFilter {
      case 'I':
      case 'O':
      case 'U':
        // vowels with acute accent (fada)
      case '\u00c1':
      case '\u00c9':
      case '\u00cd':
@@ -47,18 +47,18 @@ class HindiNormalizer {
 
    for (int i = 0; i < len; i++) {
      switch (s[i]) {
          // dead n -> bindu
        case '\u0928':
          if (i + 1 < len && s[i + 1] == '\u094D') {
            s[i] = '\u0902';
            len = delete(s, i + 1, len);
          }
          break;
          // candrabindu -> bindu
        case '\u0901':
          s[i] = '\u0902';
          break;
          // nukta deletions
        case '\u093C':
          len = delete(s, i, len);
          i--;
@@ -96,18 +96,18 @@ class HindiNormalizer {
        case '\u095F':
          s[i] = '\u092F';
          break;
          // zwj/zwnj -> delete
        case '\u200D':
        case '\u200C':
          len = delete(s, i, len);
          i--;
          break;
          // virama -> delete
        case '\u094D':
          len = delete(s, i, len);
          i--;
          break;
          // chandra/short -> replace
        case '\u0945':
        case '\u0946':
          s[i] = '\u0947';
@@ -127,7 +127,7 @@ class HindiNormalizer {
        case '\u0972':
          s[i] = '\u0905';
          break;
          // long -> short ind. vowels
        case '\u0906':
          s[i] = '\u0905';
          break;
@@ -149,7 +149,7 @@ class HindiNormalizer {
        case '\u0914':
          s[i] = '\u0913';
          break;
          // long -> short dep. vowels
        case '\u0940':
          s[i] = '\u093F';
          break;
@@ -31,6 +31,7 @@ class ModifyingSuggester {
   private final String misspelled;
   private final WordCase wordCase;
   private final FragmentChecker fragmentChecker;
+  private final boolean proceedPastRep;
   private final char[] tryChars;
   private final Hunspell speller;
 
@@ -39,13 +40,15 @@ class ModifyingSuggester {
       LinkedHashSet<Suggestion> result,
       String misspelled,
       WordCase wordCase,
-      FragmentChecker checker) {
+      FragmentChecker checker,
+      boolean proceedPastRep) {
     this.speller = speller;
     tryChars = speller.dictionary.tryChars.toCharArray();
     this.result = result;
     this.misspelled = misspelled;
     this.wordCase = wordCase;
     fragmentChecker = checker;
+    this.proceedPastRep = proceedPastRep;
   }
 
   /**
@@ -125,9 +128,9 @@ class ModifyingSuggester {
     boolean hasGoodSuggestions = trySuggestion(word.toUpperCase(Locale.ROOT));
 
     GradedSuggestions repResult = tryRep(word);
-    if (repResult == GradedSuggestions.Best) return true;
+    if (repResult == GradedSuggestions.Best && !proceedPastRep) return true;
 
-    hasGoodSuggestions |= repResult == GradedSuggestions.Normal;
+    hasGoodSuggestions |= repResult != GradedSuggestions.None;
 
     if (!speller.dictionary.mapTable.isEmpty()) {
       enumerateMapReplacements(word, "", 0);
@@ -53,16 +53,21 @@ public class Suggester {
   private final Dictionary dictionary;
   private final SuggestibleEntryCache suggestibleCache;
   private final FragmentChecker fragmentChecker;
+  private final boolean proceedPastRep;

   public Suggester(Dictionary dictionary) {
-    this(dictionary, null, FragmentChecker.EVERYTHING_POSSIBLE);
+    this(dictionary, null, FragmentChecker.EVERYTHING_POSSIBLE, false);
   }

   private Suggester(
-      Dictionary dictionary, SuggestibleEntryCache suggestibleCache, FragmentChecker checker) {
+      Dictionary dictionary,
+      SuggestibleEntryCache suggestibleCache,
+      FragmentChecker checker,
+      boolean proceedPastRep) {
     this.dictionary = dictionary;
     this.suggestibleCache = suggestibleCache;
     this.fragmentChecker = checker;
+    this.proceedPastRep = proceedPastRep;
   }

   /**
@@ -71,8 +76,8 @@ public class Suggester {
    * entries are stored as fast-to-iterate plain words instead of highly compressed prefix trees.
    */
   public Suggester withSuggestibleEntryCache() {
-    return new Suggester(
-        dictionary, SuggestibleEntryCache.buildCache(dictionary.words), fragmentChecker);
+    SuggestibleEntryCache cache = SuggestibleEntryCache.buildCache(dictionary.words);
+    return new Suggester(dictionary, cache, fragmentChecker, proceedPastRep);
   }

   /**
@@ -80,7 +85,17 @@ public class Suggester {
    * the performance of the "Modification" phase performance.
    */
   public Suggester withFragmentChecker(FragmentChecker checker) {
-    return new Suggester(dictionary, suggestibleCache, checker);
+    return new Suggester(dictionary, suggestibleCache, checker, proceedPastRep);
+  }
+
+  /**
+   * Returns a copy of this suggester instance that doesn't stop after encountering acceptable words
+   * after applying REP rules. By default, Hunspell stops when it finds any, but this behavior may
+   * not always be desirable, e.g., if we have "REP i ea", "tims" be replaced only by "teams" and
+   * not "times", which could also be meant.
+   */
+  public Suggester proceedPastRep() {
+    return new Suggester(dictionary, suggestibleCache, fragmentChecker, true);
   }

   /**
@@ -174,7 +189,8 @@ public class Suggester {
     }

     boolean hasGoodSuggestions =
-        new ModifyingSuggester(suggestionSpeller, suggestions, word, wordCase, fragmentChecker)
+        new ModifyingSuggester(
+                suggestionSpeller, suggestions, word, wordCase, fragmentChecker, proceedPastRep)
             .suggest();

     if (!hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
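For context on how the new flag is meant to be used, here is a minimal usage sketch (not part of the patch; it assumes a Hunspell Dictionary instance named `dictionary` has already been loaded elsewhere, plus the usual java.util imports):

    // Default behavior: suggestion generation stops once a REP-based candidate is accepted.
    Suggester suggester = new Suggester(dictionary).withSuggestibleEntryCache();
    List<String> strict = suggester.suggestNoTimeout("autos", () -> {});

    // With proceedPastRep(), candidates beyond the REP match can still surface,
    // e.g. "auto" in addition to "auto's" (see TestSpellChecking#testRepSuggestions below).
    List<String> relaxed = suggester.proceedPastRep().suggestNoTimeout("autos", () -> {});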
@@ -194,7 +194,7 @@ public final class WordDelimiterIterator {

    int type = charType(text[current]);
    switch (type) {
        // return ALPHA word type for both lower and upper
      case LOWER:
      case UPPER:
        return ALPHA;
@@ -332,27 +332,27 @@ public final class WordDelimiterIterator {
      case Character.OTHER_NUMBER:
        return DIGIT;

        // case Character.SPACE_SEPARATOR:
        // case Character.LINE_SEPARATOR:
        // case Character.PARAGRAPH_SEPARATOR:
        // case Character.CONTROL:
        // case Character.FORMAT:
        // case Character.PRIVATE_USE:

      case Character.SURROGATE: // prevent splitting
        return ALPHA | DIGIT;

        // case Character.DASH_PUNCTUATION:
        // case Character.START_PUNCTUATION:
        // case Character.END_PUNCTUATION:
        // case Character.CONNECTOR_PUNCTUATION:
        // case Character.OTHER_PUNCTUATION:
        // case Character.MATH_SYMBOL:
        // case Character.CURRENCY_SYMBOL:
        // case Character.MODIFIER_SYMBOL:
        // case Character.OTHER_SYMBOL:
        // case Character.INITIAL_QUOTE_PUNCTUATION:
        // case Character.FINAL_QUOTE_PUNCTUATION:

      default:
        return SUBWORD_DELIM;
@@ -38,25 +38,25 @@ class TeluguNormalizer {

    for (int i = 0; i < len; i++) {
      switch (s[i]) {
          // candrabindu (ఀ and ఁ) -> bindu (ం)
        case '\u0C00': // ఀ
        case '\u0C01': // ఁ
          s[i] = '\u0C02'; // ం
          break;
          // delete visarga (ః)
        case '\u0C03':
          len = delete(s, i, len);
          i--;
          break;

          // zwj/zwnj -> delete
        case '\u200D':
        case '\u200C':
          len = delete(s, i, len);
          i--;
          break;

          // long -> short vowels
        case '\u0C14': // ఔ
          s[i] = '\u0C13'; // ఓ
          break;
@@ -73,7 +73,7 @@ class TeluguNormalizer {
          s[i] = '\u0C09'; // ఉ
          break;

          // long -> short vowels matras
        case '\u0C40': // ీ
          s[i] = '\u0C3F'; // ి
          break;
@@ -86,14 +86,14 @@ class TeluguNormalizer {
        case '\u0C4B': // ో
          s[i] = '\u0C4A'; // ొ
          break;
          // decomposed dipthong (ె + ౖ) -> precomposed diphthong vowel sign (ై)
        case '\u0C46':
          if (i + 1 < len && s[i + 1] == '\u0C56') {
            s[i] = '\u0C48';
            len = delete(s, i + 1, len);
          }
          break;
          // composed oo or au -> oo or au
        case '\u0C12':
          if (i + 1 < len && s[i + 1] == '\u0C55') {
            // (ఒ + ౕ) -> oo (ఓ)
@@ -61,12 +61,12 @@ public final class TurkishLowerCaseFilter extends TokenFilter {

      if (iOrAfter) { // all the special I turkish handling happens here.
        switch (ch) {
            // remove COMBINING_DOT_ABOVE to mimic composed lowercase
          case COMBINING_DOT_ABOVE:
            length = delete(buffer, i, length);
            continue;
            // i itself, it depends if it is followed by COMBINING_DOT_ABOVE
            // if it is, we will make it small i and later remove the dot
          case LATIN_CAPITAL_LETTER_I:
            if (isBeforeDot(buffer, i + 1, length)) {
              buffer[i] = LATIN_SMALL_LETTER_I;
@@ -901,7 +901,7 @@ class WikipediaTokenizerImpl {
            positionInc = 1; /* Break so we don't hit fall-through warning: */
            break;
          }
          // fall through
        case 47:
          break;
        case 2:
@@ -909,7 +909,7 @@ class WikipediaTokenizerImpl {
            positionInc = 1;
            return ALPHANUM;
          }
          // fall through
        case 48:
          break;
        case 3:
@@ -920,7 +920,7 @@ class WikipediaTokenizerImpl {
            yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
            break;
          }
          // fall through
        case 49:
          break;
        case 4:
@@ -928,7 +928,7 @@ class WikipediaTokenizerImpl {
            positionInc = 1;
            return CJ;
          }
          // fall through
        case 50:
          break;
        case 5:
@@ -936,7 +936,7 @@ class WikipediaTokenizerImpl {
            positionInc = 1; /* Break so we don't hit fall-through warning: */
            break;
          }
          // fall through
        case 51:
          break;
        case 6:
@@ -945,7 +945,7 @@ class WikipediaTokenizerImpl {
            numWikiTokensSeen++;
            return currentTokType;
          }
          // fall through
        case 52:
          break;
        case 7:
@@ -954,7 +954,7 @@ class WikipediaTokenizerImpl {
            numWikiTokensSeen++;
            return currentTokType;
          }
          // fall through
        case 53:
          break;
        case 8:
@@ -962,7 +962,7 @@ class WikipediaTokenizerImpl {
            /* Break so we don't hit fall-through warning: */
            break; /* ignore */
          }
          // fall through
        case 54:
          break;
        case 9:
@@ -978,7 +978,7 @@ class WikipediaTokenizerImpl {
            numLinkToks++;
            return currentTokType;
          }
          // fall through
        case 55:
          break;
        case 10:
@@ -988,7 +988,7 @@ class WikipediaTokenizerImpl {
            yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
            break;
          }
          // fall through
        case 56:
          break;
        case 11:
@@ -997,7 +997,7 @@ class WikipediaTokenizerImpl {
            yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */
            break;
          }
          // fall through
        case 57:
          break;
        case 12:
@@ -1007,7 +1007,7 @@ class WikipediaTokenizerImpl {
            yybegin(STRING);
            return currentTokType; /*italics*/
          }
          // fall through
        case 58:
          break;
        case 13:
@@ -1017,7 +1017,7 @@ class WikipediaTokenizerImpl {
            yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
            break;
          }
          // fall through
        case 59:
          break;
        case 14:
@@ -1026,7 +1026,7 @@ class WikipediaTokenizerImpl {
            numWikiTokensSeen++;
            return currentTokType;
          }
          // fall through
        case 60:
          break;
        case 15:
@@ -1036,7 +1036,7 @@ class WikipediaTokenizerImpl {
            numWikiTokensSeen++;
            return currentTokType;
          }
          // fall through
        case 61:
          break;
        case 16:
@@ -1046,7 +1046,7 @@ class WikipediaTokenizerImpl {
            yybegin(STRING); /* Break so we don't hit fall-through warning: */
            break;
          }
          // fall through
        case 62:
          break;
        case 17:
@@ -1055,7 +1055,7 @@ class WikipediaTokenizerImpl {
            numWikiTokensSeen = 0;
            return currentTokType;
          }
          // fall through
        case 63:
          break;
        case 18:
@@ -1063,7 +1063,7 @@ class WikipediaTokenizerImpl {
            /* Break so we don't hit fall-through warning: */
            break; /* ignore STRING */
          }
          // fall through
        case 64:
          break;
        case 19:
@@ -1072,7 +1072,7 @@ class WikipediaTokenizerImpl {
            numWikiTokensSeen++;
            return currentTokType; /* STRING ALPHANUM*/
          }
          // fall through
        case 65:
          break;
        case 20:
@@ -1083,7 +1083,7 @@ class WikipediaTokenizerImpl {
            yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
            break;
          }
          // fall through
        case 66:
          break;
        case 21:
@@ -1091,7 +1091,7 @@ class WikipediaTokenizerImpl {
            yybegin(STRING);
            return currentTokType; /*pipe*/
          }
          // fall through
        case 67:
          break;
        case 22:
@@ -1106,7 +1106,7 @@ class WikipediaTokenizerImpl {
            } /* Break so we don't hit fall-through warning: */
            break;
          }
          // fall through
        case 68:
          break;
        case 23:
@@ -1116,7 +1116,7 @@ class WikipediaTokenizerImpl {
            yybegin(DOUBLE_EQUALS_STATE); /* Break so we don't hit fall-through warning: */
            break;
          }
          // fall through
        case 69:
          break;
        case 24:
@@ -1127,7 +1127,7 @@ class WikipediaTokenizerImpl {
            yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
            break;
          }
          // fall through
        case 70:
          break;
        case 25:
@@ -1138,7 +1138,7 @@ class WikipediaTokenizerImpl {
            yybegin(DOUBLE_BRACE_STATE); /* Break so we don't hit fall-through warning: */
            break;
          }
          // fall through
        case 71:
          break;
        case 26:
@@ -1146,7 +1146,7 @@ class WikipediaTokenizerImpl {
            yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
            break;
          }
          // fall through
        case 72:
          break;
        case 27:
@@ -1155,7 +1155,7 @@ class WikipediaTokenizerImpl {
            yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
            break;
          }
          // fall through
        case 73:
          break;
        case 28:
@@ -1165,7 +1165,7 @@ class WikipediaTokenizerImpl {
            yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
            break;
          }
          // fall through
        case 74:
          break;
        case 29:
@@ -1175,7 +1175,7 @@ class WikipediaTokenizerImpl {
            yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
            break;
          }
          // fall through
        case 75:
          break;
        case 30:
@@ -1183,7 +1183,7 @@ class WikipediaTokenizerImpl {
            yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
            break;
          }
          // fall through
        case 76:
          break;
        case 31:
@@ -1193,7 +1193,7 @@ class WikipediaTokenizerImpl {
            yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
            break; /*end italics*/
          }
          // fall through
        case 77:
          break;
        case 32:
@@ -1204,7 +1204,7 @@ class WikipediaTokenizerImpl {
            yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
            break;
          }
          // fall through
        case 78:
          break;
        case 33:
@@ -1212,7 +1212,7 @@ class WikipediaTokenizerImpl {
            positionInc = 1;
            return NUM;
          }
          // fall through
        case 79:
          break;
        case 34:
@@ -1220,7 +1220,7 @@ class WikipediaTokenizerImpl {
            positionInc = 1;
            return COMPANY;
          }
          // fall through
        case 80:
          break;
        case 35:
@@ -1228,7 +1228,7 @@ class WikipediaTokenizerImpl {
            positionInc = 1;
            return APOSTROPHE;
          }
          // fall through
        case 81:
          break;
        case 36:
@@ -1236,7 +1236,7 @@ class WikipediaTokenizerImpl {
            positionInc = 1;
            return HOST;
          }
          // fall through
        case 82:
          break;
        case 37:
@@ -1245,7 +1245,7 @@ class WikipediaTokenizerImpl {
            yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */
            break;
          }
          // fall through
        case 83:
          break;
        case 38:
@@ -1255,7 +1255,7 @@ class WikipediaTokenizerImpl {
            yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
            break; /*end bold*/
          }
          // fall through
        case 84:
          break;
        case 39:
@@ -1265,7 +1265,7 @@ class WikipediaTokenizerImpl {
            yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
            break; /*end sub header*/
          }
          // fall through
        case 85:
          break;
        case 40:
@@ -1273,7 +1273,7 @@ class WikipediaTokenizerImpl {
            positionInc = 1;
            return ACRONYM;
          }
          // fall through
        case 86:
          break;
        case 41:
@@ -1281,7 +1281,7 @@ class WikipediaTokenizerImpl {
            positionInc = 1;
            return EMAIL;
          }
          // fall through
        case 87:
          break;
        case 42:
@@ -1291,7 +1291,7 @@ class WikipediaTokenizerImpl {
            yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
            break; /*end bold italics*/
          }
          // fall through
        case 88:
          break;
        case 43:
@@ -1301,7 +1301,7 @@ class WikipediaTokenizerImpl {
            yybegin(EXTERNAL_LINK_STATE);
            return currentTokType;
          }
          // fall through
        case 89:
          break;
        case 44:
@@ -1312,7 +1312,7 @@ class WikipediaTokenizerImpl {
            yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */
            break;
          }
          // fall through
        case 90:
          break;
        case 45:
@@ -1322,7 +1322,7 @@ class WikipediaTokenizerImpl {
            yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */
            break;
          }
          // fall through
        case 91:
          break;
        case 46:
@@ -1333,7 +1333,7 @@ class WikipediaTokenizerImpl {
            yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */
            break;
          }
          // fall through
        case 92:
          break;
        default:
@@ -59,6 +59,14 @@ public class TestSpellChecking extends LuceneTestCase {

   public void testRepSuggestions() throws Exception {
     doTest("rep");
+
+    //noinspection DataFlowIssue
+    Path aff = Path.of(getClass().getResource("rep.aff").toURI());
+    Dictionary dictionary = TestAllDictionaries.loadDictionary(aff);
+    Suggester suggester = new Suggester(dictionary);
+    assertEquals(List.of("auto's"), suggester.suggestNoTimeout("autos", () -> {}));
+    assertEquals(
+        List.of("auto's", "auto"), suggester.proceedPastRep().suggestNoTimeout("autos", () -> {}));
   }

   public void testPhSuggestions() throws Exception {
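The rep.aff fixture referenced by this test is not shown in this diff. For readers unfamiliar with the Hunspell affix format, a REP table generally has the shape sketched below (illustrative only, not the actual fixture): a count line followed by `REP <from> <to>` pairs that the suggester tries as replacement candidates, optionally anchored with ^ and $:

    REP 2
    REP i ea
    REP ^autos$ auto's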
@@ -245,7 +245,7 @@ public class Diff {
          deletes++;
          x--;
          break;
          // delete
        case Y:
          if (deletes != base) {
            result.append('D').append(deletes);
@@ -258,7 +258,7 @@ public class Diff {
          result.append('I');
          result.append(b.charAt(--y));
          break;
          // insert
        case R:
          if (deletes != base) {
            result.append('D').append(deletes);
@@ -272,7 +272,7 @@ public class Diff {
          result.append(b.charAt(--y));
          x--;
          break;
          // replace
        case D:
          if (deletes != base) {
            result.append('D').append(deletes);
@@ -0,0 +1,4 @@
+{
+  "lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/ForUtil.java": "f31797842f047626df6a1a6b97167bec60269fec",
+  "lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/gen_ForUtil.py": "325f2610974b0e76e278b6445405a098a3763feb"
+}
@@ -35,6 +35,7 @@ module org.apache.lucene.backward_codecs {
   exports org.apache.lucene.backward_codecs.lucene92;
   exports org.apache.lucene.backward_codecs.lucene94;
   exports org.apache.lucene.backward_codecs.lucene95;
+  exports org.apache.lucene.backward_codecs.lucene99;
   exports org.apache.lucene.backward_codecs.packed;
   exports org.apache.lucene.backward_codecs.store;

@@ -43,7 +44,8 @@ module org.apache.lucene.backward_codecs {
   provides org.apache.lucene.codecs.PostingsFormat with
       org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat,
       org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat,
-      org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat;
+      org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat,
+      org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat;
   provides org.apache.lucene.codecs.KnnVectorsFormat with
       org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat,
       org.apache.lucene.backward_codecs.lucene91.Lucene91HnswVectorsFormat,
@@ -59,5 +61,6 @@ module org.apache.lucene.backward_codecs {
       org.apache.lucene.backward_codecs.lucene91.Lucene91Codec,
       org.apache.lucene.backward_codecs.lucene92.Lucene92Codec,
       org.apache.lucene.backward_codecs.lucene94.Lucene94Codec,
-      org.apache.lucene.backward_codecs.lucene95.Lucene95Codec;
+      org.apache.lucene.backward_codecs.lucene95.Lucene95Codec,
+      org.apache.lucene.backward_codecs.lucene99.Lucene99Codec;
 }
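Because the codec and the postings format keep their service names ("Lucene99"), indexes written with them remain readable after the move; only the implementing package changes. A minimal sketch of the SPI lookup (assuming lucene-backward-codecs is on the classpath or module path):

    import org.apache.lucene.codecs.Codec;
    import org.apache.lucene.codecs.PostingsFormat;

    // Both names now resolve to the implementations under org.apache.lucene.backward_codecs.lucene99.
    Codec lucene99Codec = Codec.forName("Lucene99");
    PostingsFormat lucene99Postings = PostingsFormat.forName("Lucene99");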
@@ -88,21 +88,17 @@ public final class FieldReader extends Terms {
        (new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong()
            >>> Lucene40BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
     // Initialize FST always off-heap.
-    final IndexInput clone = indexIn.clone();
-    clone.seek(indexStartFP);
+    final FST.FSTMetadata<BytesRef> fstMetadata;
     if (metaIn == indexIn) { // Only true before Lucene 8.6
-      index =
-          new FST<>(
-              readMetadata(clone, ByteSequenceOutputs.getSingleton()),
-              clone,
-              new OffHeapFSTStore());
+      final IndexInput clone = indexIn.clone();
+      clone.seek(indexStartFP);
+      fstMetadata = readMetadata(clone, ByteSequenceOutputs.getSingleton());
+      // FST bytes actually only start after the metadata.
+      indexStartFP = clone.getFilePointer();
     } else {
-      index =
-          new FST<>(
-              readMetadata(metaIn, ByteSequenceOutputs.getSingleton()),
-              clone,
-              new OffHeapFSTStore());
+      fstMetadata = readMetadata(metaIn, ByteSequenceOutputs.getSingleton());
     }
+    index = FST.fromFSTReader(fstMetadata, new OffHeapFSTStore(indexIn, indexStartFP, fstMetadata));
     /*
     if (false) {
     final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.codecs.lucene99;
+package org.apache.lucene.backward_codecs.lucene99;

 import java.io.IOException;
 import org.apache.lucene.store.DataInput;
@@ -16,7 +16,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.codecs.lucene99;
+package org.apache.lucene.backward_codecs.lucene99;

 import java.io.IOException;
 import org.apache.lucene.store.DataInput;
@@ -14,12 +14,33 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.codecs.lucene99;
+package org.apache.lucene.backward_codecs.lucene99;

 import java.util.Objects;
-import org.apache.lucene.codecs.*;
-import org.apache.lucene.codecs.lucene90.*;
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.CompoundFormat;
+import org.apache.lucene.codecs.DocValuesFormat;
+import org.apache.lucene.codecs.FieldInfosFormat;
+import org.apache.lucene.codecs.FilterCodec;
+import org.apache.lucene.codecs.KnnVectorsFormat;
+import org.apache.lucene.codecs.LiveDocsFormat;
+import org.apache.lucene.codecs.NormsFormat;
+import org.apache.lucene.codecs.PointsFormat;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.SegmentInfoFormat;
+import org.apache.lucene.codecs.StoredFieldsFormat;
+import org.apache.lucene.codecs.TermVectorsFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat;
+import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat;
+import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
 import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat;
+import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
+import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat;
 import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
 import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
 import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
@@ -98,7 +119,7 @@ public class Lucene99Codec extends Codec {
     super("Lucene99");
     this.storedFieldsFormat =
         new Lucene90StoredFieldsFormat(Objects.requireNonNull(mode).storedMode);
-    this.defaultPostingsFormat = new Lucene99PostingsFormat();
+    this.defaultPostingsFormat = new Lucene912PostingsFormat();
     this.defaultDVFormat = new Lucene90DocValuesFormat();
     this.defaultKnnVectorsFormat = new Lucene99HnswVectorsFormat();
   }
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.codecs.lucene99;
+package org.apache.lucene.backward_codecs.lucene99;

 import java.io.IOException;
 import org.apache.lucene.codecs.BlockTermState;
@@ -24,7 +24,6 @@ import org.apache.lucene.codecs.FieldsProducer;
 import org.apache.lucene.codecs.MultiLevelSkipListWriter;
 import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.PostingsReaderBase;
-import org.apache.lucene.codecs.PostingsWriterBase;
 import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader;
 import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
 import org.apache.lucene.index.IndexOptions;
@@ -339,7 +338,7 @@ import org.apache.lucene.util.packed.PackedInts;
  *
  * @lucene.experimental
  */
-public final class Lucene99PostingsFormat extends PostingsFormat {
+public class Lucene99PostingsFormat extends PostingsFormat {

   /**
    * Filename extension for document number, frequencies, and skip data. See chapter: <a
@@ -374,28 +373,9 @@ public final class Lucene99PostingsFormat extends PostingsFormat {
   static final int VERSION_START = 0;
   static final int VERSION_CURRENT = VERSION_START;

-  private final int minTermBlockSize;
-  private final int maxTermBlockSize;
-
   /** Creates {@code Lucene99PostingsFormat} with default settings. */
   public Lucene99PostingsFormat() {
-    this(
-        Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
-        Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
-  }
-
-  /**
-   * Creates {@code Lucene99PostingsFormat} with custom values for {@code minBlockSize} and {@code
-   * maxBlockSize} passed to block terms dictionary.
-   *
-   * @see
-   *     Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)
-   */
-  public Lucene99PostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
     super("Lucene99");
-    Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize);
-    this.minTermBlockSize = minTermBlockSize;
-    this.maxTermBlockSize = maxTermBlockSize;
   }

   @Override
@@ -405,19 +385,7 @@ public final class Lucene99PostingsFormat extends PostingsFormat {

   @Override
   public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
-    PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
-    boolean success = false;
-    try {
-      FieldsConsumer ret =
-          new Lucene90BlockTreeTermsWriter(
-              state, postingsWriter, minTermBlockSize, maxTermBlockSize);
-      success = true;
-      return ret;
-    } finally {
-      if (!success) {
-        IOUtils.closeWhileHandlingException(postingsWriter);
-      }
-    }
+    throw new UnsupportedOperationException();
   }

   @Override
@@ -14,23 +14,23 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.codecs.lucene99;
+package org.apache.lucene.backward_codecs.lucene99;

-import static org.apache.lucene.codecs.lucene99.ForUtil.BLOCK_SIZE;
-import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.DOC_CODEC;
-import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.MAX_SKIP_LEVELS;
-import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.PAY_CODEC;
-import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.POS_CODEC;
-import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.TERMS_CODEC;
-import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.VERSION_CURRENT;
-import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.VERSION_START;
+import static org.apache.lucene.backward_codecs.lucene99.ForUtil.BLOCK_SIZE;
+import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.DOC_CODEC;
+import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.MAX_SKIP_LEVELS;
+import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.PAY_CODEC;
+import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.POS_CODEC;
+import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.TERMS_CODEC;
+import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.VERSION_CURRENT;
+import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.VERSION_START;

 import java.io.IOException;
 import java.util.Arrays;
+import org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
 import org.apache.lucene.codecs.BlockTermState;
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.PostingsReaderBase;
-import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.Impacts;
 import org.apache.lucene.index.ImpactsEnum;
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.codecs.lucene99;
+package org.apache.lucene.backward_codecs.lucene99;

 import java.io.IOException;
 import java.util.AbstractList;
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.codecs.lucene99;
+package org.apache.lucene.backward_codecs.lucene99;

 import java.io.IOException;
 import java.util.Arrays;
@@ -61,6 +61,7 @@ public class Lucene99SkipReader extends MultiLevelSkipListReader {
   private long lastDocPointer;
   private int lastPosBufferUpto;

+  /** Sole constructor. */
   public Lucene99SkipReader(
       IndexInput skipStream,
       int maxSkipLevels,
@@ -98,6 +99,7 @@ public class Lucene99SkipReader extends MultiLevelSkipListReader {
     return df % ForUtil.BLOCK_SIZE == 0 ? df - 1 : df;
   }

+  /** Initialize state. */
   public void init(
       long skipPointer, long docBasePointer, long posBasePointer, long payBasePointer, int df)
       throws IOException {
@@ -125,22 +127,27 @@ public class Lucene99SkipReader extends MultiLevelSkipListReader {
     return lastDocPointer;
   }

+  /** Returns the pointer in the pos file. */
   public long getPosPointer() {
     return lastPosPointer;
   }

+  /** Return the start offset in the position block. */
   public int getPosBufferUpto() {
     return lastPosBufferUpto;
   }

+  /** Returns the pointer in the pay file. */
   public long getPayPointer() {
     return lastPayPointer;
   }

+  /** Return the number of bytes in the pay block that belongs to docs from the previous block. */
   public int getPayloadByteUpto() {
     return lastPayloadByteUpto;
   }

+  /** Return the next skip doc, no skipping can be performed until this doc. */
   public int getNextSkipDoc() {
     return skipDoc[0];
   }
@@ -199,7 +206,7 @@ public class Lucene99SkipReader extends MultiLevelSkipListReader {
     return delta;
   }

-  // The default impl skips impacts
+  /** Read impacts. The default implementation skips them. */
   protected void readImpacts(int level, IndexInput skipStream) throws IOException {
     skipStream.skipBytes(skipStream.readVInt());
   }
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.codecs.lucene99;
+package org.apache.lucene.backward_codecs.lucene99;

 import java.io.IOException;
 import java.util.Arrays;
@@ -46,10 +46,10 @@ import org.apache.lucene.store.IndexOutput;
  * uptos(position, payload). 4. start offset.
  */
 public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
-  private int[] lastSkipDoc;
-  private long[] lastSkipDocPointer;
-  private long[] lastSkipPosPointer;
-  private long[] lastSkipPayPointer;
+  private final int[] lastSkipDoc;
+  private final long[] lastSkipDocPointer;
+  private final long[] lastSkipPosPointer;
+  private final long[] lastSkipPayPointer;

   private final IndexOutput docOut;
   private final IndexOutput posOut;
@@ -61,11 +61,12 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
   private long curPayPointer;
   private int curPosBufferUpto;
   private int curPayloadByteUpto;
-  private CompetitiveImpactAccumulator[] curCompetitiveFreqNorms;
+  private final CompetitiveImpactAccumulator[] curCompetitiveFreqNorms;
   private boolean fieldHasPositions;
   private boolean fieldHasOffsets;
   private boolean fieldHasPayloads;

+  /** Sole constructor. */
   public Lucene99SkipWriter(
       int maxSkipLevels,
       int blockSize,
@@ -84,7 +85,12 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
       lastSkipPosPointer = new long[maxSkipLevels];
       if (payOut != null) {
         lastSkipPayPointer = new long[maxSkipLevels];
+      } else {
+        lastSkipPayPointer = null;
       }
+    } else {
+      lastSkipPosPointer = null;
+      lastSkipPayPointer = null;
     }
     curCompetitiveFreqNorms = new CompetitiveImpactAccumulator[maxSkipLevels];
     for (int i = 0; i < maxSkipLevels; ++i) {
@@ -92,6 +98,7 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
     }
   }

+  /** Reset state for the given index options. */
   public void setField(
       boolean fieldHasPositions, boolean fieldHasOffsets, boolean fieldHasPayloads) {
     this.fieldHasPositions = fieldHasPositions;
@@ -211,6 +218,7 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
     competitiveFreqNorms.clear();
   }

+  /** Write impacts to the given output. */
   public static void writeImpacts(CompetitiveImpactAccumulator acc, DataOutput out)
       throws IOException {
     Collection<Impact> impacts = acc.getCompetitiveFreqNormPairs();
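The constructor change above follows the standard Java rule for final fields: a final field must be definitely assigned on every constructor path, so the branches that previously left an array unset now assign null explicitly. A minimal illustration of the idiom (generic sketch, not Lucene code):

    class SkipState {
      private final long[] payPointers; // final: assigned exactly once on every constructor path

      SkipState(boolean hasPayloads, int levels) {
        if (hasPayloads) {
          payPointers = new long[levels];
        } else {
          payPointers = null; // explicit assignment, otherwise the class would not compile
        }
      }
    }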
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.codecs.lucene99;
+package org.apache.lucene.backward_codecs.lucene99;

 import java.io.IOException;
 import java.util.Arrays;
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.codecs.lucene99;
+package org.apache.lucene.backward_codecs.lucene99;

 import java.io.IOException;
 import org.apache.lucene.store.IndexInput;
@@ -40,7 +40,7 @@ HEADER = """// This file has been automatically generated, DO NOT EDIT
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.codecs.lucene99;
+package org.apache.lucene.backward_codecs.lucene99;

 import java.io.IOException;
 import org.apache.lucene.store.DataInput;
@ -0,0 +1,428 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Lucene 9.9 file format.
|
||||||
|
*
|
||||||
|
* <h2>Apache Lucene - Index File Formats</h2>
|
||||||
|
*
|
||||||
|
* <div>
|
||||||
|
*
|
||||||
|
* <ul>
|
||||||
|
* <li><a href="#Introduction">Introduction</a>
|
||||||
|
* <li><a href="#Definitions">Definitions</a>
|
||||||
|
* <ul>
|
||||||
|
* <li><a href="#Inverted_Indexing">Inverted Indexing</a>
|
||||||
|
* <li><a href="#Types_of_Fields">Types of Fields</a>
|
||||||
|
* <li><a href="#Segments">Segments</a>
|
||||||
|
* <li><a href="#Document_Numbers">Document Numbers</a>
|
||||||
|
* </ul>
|
||||||
|
* <li><a href="#Overview">Index Structure Overview</a>
|
||||||
|
* <li><a href="#File_Naming">File Naming</a>
|
||||||
|
* <li><a href="#file-names">Summary of File Extensions</a>
|
||||||
|
* <ul>
|
||||||
|
* <li><a href="#Lock_File">Lock File</a>
|
||||||
|
* <li><a href="#History">History</a>
|
||||||
|
* <li><a href="#Limitations">Limitations</a>
|
||||||
|
* </ul>
|
||||||
|
* </ul>
|
||||||
|
*
|
||||||
|
* </div> <a id="Introduction"></a>
|
||||||
|
*
|
||||||
|
* <h3>Introduction</h3>
|
||||||
|
*
|
||||||
|
* <div>
|
||||||
|
*
|
||||||
|
* <p>This document defines the index file formats used in this version of Lucene. If you are using
|
||||||
|
* a different version of Lucene, please consult the copy of <code>docs/</code> that was distributed
|
||||||
|
* with the version you are using.
|
||||||
|
*
|
||||||
|
* <p>This document attempts to provide a high-level definition of the Apache Lucene file formats.
|
||||||
|
* </div> <a id="Definitions"></a>
|
||||||
|
*
|
||||||
|
* <h3>Definitions</h3>
|
||||||
|
*
|
||||||
|
* <div>
|
||||||
|
*
|
||||||
|
* <p>The fundamental concepts in Lucene are index, document, field and term.
|
||||||
|
*
|
||||||
|
* <p>An index contains a sequence of documents.
|
||||||
|
*
|
||||||
|
* <ul>
|
||||||
|
* <li>A document is a sequence of fields.
|
||||||
|
* <li>A field is a named sequence of terms.
|
||||||
|
* <li>A term is a sequence of bytes.
|
||||||
|
* </ul>
|
||||||
|
*
|
||||||
|
* <p>The same sequence of bytes in two different fields is considered a different term. Thus terms
|
||||||
|
* are represented as a pair: the string naming the field, and the bytes within the field. <a
|
||||||
|
* id="Inverted_Indexing"></a>
|
||||||
|
*
|
||||||
|
* <h4>Inverted Indexing</h4>
|
||||||
|
*
|
||||||
|
* <p>Lucene's index stores terms and statistics about those terms in order to make term-based
|
||||||
|
* search more efficient. Lucene's terms index falls into the family of indexes known as an
|
||||||
|
* <i>inverted index.</i> This is because it can list, for a term, the documents that contain it.
|
||||||
|
* This is the inverse of the natural relationship, in which documents list terms. <a
|
||||||
|
* id="Types_of_Fields"></a>
|
||||||
|
*
|
||||||
|
* <h4>Types of Fields</h4>
|
||||||
|
*
|
||||||
|
* <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored in the index
|
||||||
|
* literally, in a non-inverted manner. Fields that are inverted are called <i>indexed</i>. A field
|
||||||
|
* may be both stored and indexed.
|
||||||
|
*
|
||||||
|
* <p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the text of a field
|
||||||
|
* may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is
|
||||||
|
* useful for certain identifier fields to be indexed literally.
|
||||||
|
*
|
||||||
|
* <p>See the {@link org.apache.lucene.document.Field Field} java docs for more information on
|
||||||
|
* Fields. <a id="Segments"></a>
|
||||||
|
*
|
||||||
|
* <h4>Segments</h4>
|
||||||
|
*
|
||||||
|
* <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>. Each segment is a
|
||||||
|
* fully independent index, which could be searched separately. Indexes evolve by:
|
||||||
|
*
|
||||||
|
* <ol>
|
||||||
|
* <li>Creating new segments for newly added documents.
|
||||||
|
* <li>Merging existing segments.
|
||||||
|
* </ol>
|
||||||
|
*
|
||||||
|
* <p>Searches may involve multiple segments and/or multiple indexes, each index potentially
|
||||||
|
* composed of a set of segments. <a id="Document_Numbers"></a>
|
||||||
|
*
|
||||||
|
 * <h4>Document Numbers</h4>
 *
 * <p>Internally, Lucene refers to documents by an integer <i>document number</i>. The first
 * document added to an index is numbered zero, and each subsequent document added gets a number one
 * greater than the previous.
 *
 * <p>Note that a document's number may change, so caution should be taken when storing these
 * numbers outside of Lucene. In particular, numbers may change in the following situations:
 *
 * <ul>
 * <li>
 * <p>The numbers stored in each segment are unique only within the segment, and must be
 * converted before they can be used in a larger context. The standard technique is to
 * allocate each segment a range of values, based on the range of numbers used in that
 * segment. To convert a document number from a segment to an external value, the segment's
 * <i>base</i> document number is added. To convert an external value back to a
 * segment-specific value, the segment is identified by the range that the external value is
 * in, and the segment's base value is subtracted (see the sketch after this list). For
 * example, two five-document segments might be combined, so that the first segment has a base
 * value of zero, and the second of five. Document three from the second segment would have an
 * external value of eight.
 * <li>
 * <p>When documents are deleted, gaps are created in the numbering. These are eventually
 * removed as the index evolves through merging. Deleted documents are dropped when segments
 * are merged. A freshly-merged segment thus has no gaps in its numbering.
 * </ul>
 *
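 * <p>A hedged sketch of the conversion just described, using the {@code docBase} that Lucene
 * exposes on each {@link org.apache.lucene.index.LeafReaderContext} ({@code reader}, {@code leaf}
 * and {@code segmentLocalDoc} are assumed to be in scope):
 *
 * <pre>{@code
 * // segment-local number -> index-wide ("external") number
 * int externalDoc = leaf.docBase + segmentLocalDoc;
 *
 * // index-wide number -> owning segment plus segment-local number
 * List<LeafReaderContext> leaves = reader.leaves();
 * LeafReaderContext owner = leaves.get(ReaderUtil.subIndex(externalDoc, leaves));
 * int localDoc = externalDoc - owner.docBase;
 * // e.g. two five-document segments (docBase 0 and 5): local doc 3 of the second is external doc 8
 * }</pre>
 *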
 * </div> <a id="Overview"></a>
 *
 * <h3>Index Structure Overview</h3>
 *
 * <div>
 *
 * <p>Each segment index maintains the following:
 *
 * <ul>
 * <li>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment info}. This
 * contains metadata about a segment, such as the number of documents, what files it uses, and
 * information about how the segment is sorted.
 * <li>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This
 * contains metadata about the set of named fields used in the index.
 * <li>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}.
 * This contains, for each document, a list of attribute-value pairs, where the attributes are
 * field names. These are used to store auxiliary information about the document, such as its
 * title, url, or an identifier to access a database. The set of stored fields is what is
 * returned for each hit when searching. This is keyed by document number (see the sketch after
 * this list).
 * <li>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term dictionary}.
 * A dictionary containing all of the terms used in all of the indexed fields of all of the
 * documents. The dictionary also contains the number of documents which contain the term, and
 * pointers to the term's frequency and proximity data.
 * <li>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term Frequency
 * data}. For each term in the dictionary, the numbers of all the documents that contain that
 * term, and the frequency of the term in that document, unless frequencies are omitted
 * ({@link org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS}).
 * <li>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term Proximity
 * data}. For each term in the dictionary, the positions that the term occurs in each
 * document. Note that this will not exist if all fields in all documents omit position data.
 * <li>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For
 * each field in each document, a value is stored that is multiplied into the score for hits
 * on that field.
 * <li>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each
 * field in each document, the term vector (sometimes called document vector) may be stored. A
 * term vector consists of term text and term frequency. To add Term Vectors to your index, see
 * the {@link org.apache.lucene.document.Field Field} constructors.
 * <li>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like
 * stored values, these are also keyed by document number, but are generally intended to be
 * loaded into main memory for fast access. Whereas stored values are generally intended for
 * summary results from searches, per-document values are useful for things like scoring
 * factors.
 * <li>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An
 * optional file indicating which documents are live.
 * <li>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair
 * of files, recording dimensionally indexed fields, to enable fast numeric range filtering
 * and large numeric values like BigInteger and BigDecimal (1D) and geographic shape
 * intersection (2D, 3D).
 * <li>{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}. The
 * vector format stores numeric vectors in a format optimized for random access and
 * computation, supporting high-dimensional nearest-neighbor search.
 * </ul>
 *
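 * <p>For example, a hedged sketch (hypothetical field names; the {@code StoredFields} accessor
 * assumes a recent 9.x API) of fetching the stored fields for one search hit:
 *
 * <pre>{@code
 * try (DirectoryReader reader = DirectoryReader.open(dir)) {
 *   IndexSearcher searcher = new IndexSearcher(reader);
 *   TopDocs hits = searcher.search(new TermQuery(new Term("body", "search")), 10);
 *   StoredFields storedFields = reader.storedFields();
 *   for (ScoreDoc hit : hits.scoreDocs) {
 *     Document stored = storedFields.document(hit.doc);  // attribute-value pairs, keyed by doc number
 *     String title = stored.get("title");
 *   }
 * }
 * }</pre>
 *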
 * <p>Details on each of these are provided in their linked pages. </div> <a id="File_Naming"></a>
 *
 * <h3>File Naming</h3>
 *
 * <div>
 *
 * <p>All files belonging to a segment have the same name with varying extensions. The extensions
 * correspond to the different file formats described below. When using the Compound File format
 * (default for small segments) these files (except for the Segment info file, the Lock file, and
 * Deleted documents file) are collapsed into a single .cfs file (see below for details).
 *
 * <p>Typically, all segments in an index are stored in a single directory, although this is not
 * required.
 *
 * <p>File names are never re-used. That is, when any file is saved to the Directory it is given a
 * never before used filename. This is achieved using a simple generations approach. For example,
 * the first segments file is segments_1, then segments_2, etc. The generation is a sequential long
 * integer represented in alpha-numeric (base 36) form. </div> <a id="file-names"></a>
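 *
 * <p>A small illustrative sketch of the generation-to-name convention described above (the helper
 * below is hypothetical, not a Lucene API):
 *
 * <pre>{@code
 * // generation rendered in base 36, e.g. generation 35 -> "segments_z", 36 -> "segments_10"
 * static String segmentsFileName(long generation) {
 *   return "segments_" + Long.toString(generation, Character.MAX_RADIX);
 * }
 * }</pre>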
 *
 * <h3>Summary of File Extensions</h3>
 *
 * <div>
 *
 * <p>The following table summarizes the names and extensions of the files in Lucene:
 *
 * <table class="padding4" style="border-spacing: 1px; border-collapse: separate">
 * <caption>Lucene filenames by extension</caption>
 * <tr>
 * <th>Name</th>
 * <th>Extension</th>
 * <th>Brief Description</th>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
 * <td>segments_N</td>
 * <td>Stores information about a commit point</td>
 * </tr>
 * <tr>
 * <td><a href="#Lock_File">Lock File</a></td>
 * <td>write.lock</td>
 * <td>The Write lock prevents multiple IndexWriters from writing to the same
 * file.</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment Info}</td>
 * <td>.si</td>
 * <td>Stores metadata about a segment</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File}</td>
 * <td>.cfs, .cfe</td>
 * <td>An optional "virtual" file consisting of all the other index files for
 * systems that frequently run out of file handles.</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields}</td>
 * <td>.fnm</td>
 * <td>Stores information about the fields</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index}</td>
 * <td>.fdx</td>
 * <td>Contains pointers to field data</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data}</td>
 * <td>.fdt</td>
 * <td>The stored fields for documents</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term Dictionary}</td>
 * <td>.tim</td>
 * <td>The term dictionary, stores term info</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Term Index}</td>
 * <td>.tip</td>
 * <td>The index into the Term Dictionary</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Frequencies}</td>
 * <td>.doc</td>
 * <td>Contains the list of docs which contain each term along with frequency</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Positions}</td>
 * <td>.pos</td>
 * <td>Stores position information about where a term occurs in the index</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat Payloads}</td>
 * <td>.pay</td>
 * <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms}</td>
 * <td>.nvd, .nvm</td>
 * <td>Encodes length and boost factors for docs and fields</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values}</td>
 * <td>.dvd, .dvm</td>
 * <td>Encodes additional scoring factors or other per-document information.</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}</td>
 * <td>.tvx</td>
 * <td>Stores offset into the document data file</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}</td>
 * <td>.tvd</td>
 * <td>Contains term vector data.</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents}</td>
 * <td>.liv</td>
 * <td>Info about what documents are live</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}</td>
 * <td>.dii, .dim</td>
 * <td>Holds indexed points</td>
 * </tr>
 * <tr>
 * <td>{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}</td>
 * <td>.vec, .vem, .veq, .vex</td>
 * <td>Holds indexed vectors; <code>.vec</code> files contain the raw vector data,
 * <code>.vem</code> the vector metadata, <code>.veq</code> the quantized vector data, and <code>.vex</code> the
 * HNSW graph data.</td>
 * </tr>
 * </table>
 *
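 * <p>A hedged sketch (path hypothetical) of listing an index directory and grouping its files by
 * the extensions above:
 *
 * <pre>{@code
 * try (Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"))) {
 *   Map<String, List<String>> byExtension =
 *       Arrays.stream(dir.listAll())
 *           .collect(Collectors.groupingBy(
 *               name -> name.contains(".") ? name.substring(name.lastIndexOf('.')) : name));
 *   byExtension.forEach((ext, files) -> System.out.println(ext + " -> " + files));
 * }
 * }</pre>
 *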
 * </div> <a id="Lock_File"></a>
 *
 * <h3>Lock File</h3>
 *
 * The write lock, which is stored in the index directory by default, is named "write.lock". If the
 * lock directory is different from the index directory then the write lock will be named
 * "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index
 * directory. When this file is present, a writer is currently modifying the index (adding or
 * removing documents). This lock file ensures that only one writer is modifying the index at a
 * time. <a id="History"></a>
 *
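 * <p>As a non-normative sketch of what the lock guarantees in practice, a second writer on the
 * same directory fails fast instead of corrupting the index:
 *
 * <pre>{@code
 * try (IndexWriter first = new IndexWriter(dir, new IndexWriterConfig())) {
 *   try (IndexWriter second = new IndexWriter(dir, new IndexWriterConfig())) {
 *     // never reached
 *   } catch (LockObtainFailedException expected) {
 *     // write.lock is still held by "first", so the second writer cannot be opened
 *   }
 * }
 * }</pre>
 *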
 * <h3>History</h3>
 *
 * <p>Compatibility notes are provided in this document, describing how file formats have changed
 * from prior versions:
 *
 * <ul>
 * <li>In version 2.1, the file format was changed to allow lock-less commits (i.e., no more commit
 * lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching
 * or adding/deleting of docs. When the new segments file is saved (committed), it will be
 * written in the new file format (meaning no specific "upgrade" process is needed). But note
 * that once a commit has occurred, pre-2.1 Lucene will not be able to read the index.
 * <li>In version 2.3, the file format was changed to allow segments to share a single set of doc
 * store (vectors & stored fields) files. This allows for faster indexing in certain
 * cases. The change is fully backwards compatible (in the same way as the lock-less commits
 * change in 2.1).
 * <li>In version 2.4, Strings are now written as a true UTF-8 byte sequence, not Java's modified
 * UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">LUCENE-510</a> for
 * details.
 * <li>In version 2.9, an optional opaque Map<String,String> CommitUserData may be passed to
 * IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N
 * file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">LUCENE-1382</a> for
 * details. Also, diagnostics were added to each segment written recording details about why
 * it was written (due to flush, merge; which OS/JRE was used; etc.). See issue <a
 * href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.
 * <li>In version 3.0, compressed fields are no longer written to the index (they can still be
 * read, but on merge the new segment will write them, uncompressed). See issue <a
 * href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a> for details.
 * <li>In version 3.1, segments record the code version that created them. See <a
 * href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
 * Additionally segments track explicitly whether or not they have term vectors. See <a
 * href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a> for details.
 * <li>In version 3.2, numeric fields are written natively to the stored fields file; previously
 * they were stored in text format only.
 * <li>In version 3.4, fields can omit position data while still indexing term frequencies.
 * <li>In version 4.0, the format of the inverted index became extensible via the {@link
 * org.apache.lucene.codecs.Codec Codec} API. Fast per-document storage ({@code DocValues})
 * was introduced. Normalization factors need no longer be a single byte, they can be any
 * {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be
 * unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into
 * the postings lists. Payloads can be stored in the term vectors.
 * <li>In version 4.1, the format of the postings list changed to use either FOR compression or
 * variable-byte encoding, depending upon the frequency of the term. Terms appearing only once
 * are now inlined directly into the term dictionary. Stored fields are compressed by
 * default.
 * <li>In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued
 * type (SortedSet) that can be used for faceting/grouping/joining on multi-valued fields.
 * <li>In version 4.5, DocValues were extended to explicitly represent missing values.
 * <li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
 * allow updating NumericDocValues fields.
 * <li>In version 4.8, checksum footers were added to the end of each index file for improved data
 * integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32
 * checksum of the file.
 * <li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is
 * suitable for faceting/sorting/analytics.
 * <li>In version 5.4, DocValues have been improved to store more information on disk: addresses
 * for binary fields and ord indexes for multi-valued fields.
 * <li>In version 6.0, Points were added, for multi-dimensional range/distance search.
 * <li>In version 6.2, a new segment info format reads/writes the index sort, to support index
 * sorting.
 * <li>In version 7.0, DocValues have been improved to better support sparse doc values thanks to
 * an iterator API.
 * <li>In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term
 * freq, normalization factor) pairs that may trigger the maximum score of the block. This
 * information is recorded alongside skip data in order to be able to skip blocks of doc ids
 * if they may not produce high enough scores. Additionally doc values and norms have been
 * extended with jump-tables to make access O(1) instead of O(n), where n is the number of
 * elements to skip when advancing in the data.
 * <li>In version 8.4, postings, positions, offsets and payload lengths have moved to a more
 * performant, vectorized encoding.
 * <li>In version 8.6, index sort serialization is delegated to the sorts themselves, to allow
 * user-defined sorts to be used.
 * <li>In version 8.7, stored fields compression became adaptive to better handle documents with
 * smaller stored fields.
 * <li>In version 9.0, vector-valued fields were added.
 * <li>In version 9.1, vector-valued fields were modified to add a graph hierarchy.
 * <li>In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by
 * IndexDISI. ordToDoc mappings were added to .vem.
 * <li>In version 9.5, HNSW graph connections were changed to be delta-encoded with vints.
 * Additionally, metadata file size improvements were made by delta-encoding nodes by graph
 * layer and not writing the node ids for the zeroth layer.
 * <li>In version 9.9, vector scalar quantization support was added, allowing the HNSW vector
 * format to use int8 quantized vectors for float32 vector search.
 * </ul>
 *
 * <a id="Limitations"></a>
 *
 * <h3>Limitations</h3>
 *
 * <div>
 *
 * <p>Lucene uses a Java <code>int</code> to refer to document numbers, and the index file format
 * uses an <code>Int32</code> on-disk to store document numbers. This is a limitation of both the
 * index file format and the current implementation. Eventually these should be replaced with either
 * <code>UInt64</code> values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
 * VInt} values which have no limit. </div>
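 *
 * <p>A small sketch contrasting the two encodings mentioned above (assuming
 * {@link org.apache.lucene.store.ByteBuffersDataOutput} as a convenient in-memory sink):
 *
 * <pre>{@code
 * ByteBuffersDataOutput out = new ByteBuffersDataOutput();
 * out.writeInt(5);           // fixed-width Int32: always four bytes
 * out.writeVInt(5);          // vInt: one byte for small values, more only as the value grows
 * out.writeVLong(1L << 40);  // vLong: no 32-bit ceiling at all
 * }</pre>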
 */
package org.apache.lucene.backward_codecs.lucene99;
@ -22,3 +22,4 @@ org.apache.lucene.backward_codecs.lucene91.Lucene91Codec
|
||||||
org.apache.lucene.backward_codecs.lucene92.Lucene92Codec
|
org.apache.lucene.backward_codecs.lucene92.Lucene92Codec
|
||||||
org.apache.lucene.backward_codecs.lucene94.Lucene94Codec
|
org.apache.lucene.backward_codecs.lucene94.Lucene94Codec
|
||||||
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec
|
org.apache.lucene.backward_codecs.lucene95.Lucene95Codec
|
||||||
|
org.apache.lucene.backward_codecs.lucene99.Lucene99Codec
|
||||||
|
|
|
@ -16,3 +16,4 @@
|
||||||
org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat
|
org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat
|
||||||
org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat
|
org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat
|
||||||
org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat
|
org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat
|
||||||
|
org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat
|
||||||
|
|
File diff suppressed because it is too large
|
@ -17,7 +17,7 @@
|
||||||
package org.apache.lucene.backward_codecs.lucene50;
|
package org.apache.lucene.backward_codecs.lucene50;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import org.apache.lucene.backward_codecs.lucene40.blocktree.Lucene40BlockTreeTermsWriter;
|
import org.apache.lucene.backward_codecs.lucene40.blocktree.Lucene40BlockTreeTermsWriterV5;
|
||||||
import org.apache.lucene.codecs.FieldsConsumer;
|
import org.apache.lucene.codecs.FieldsConsumer;
|
||||||
import org.apache.lucene.codecs.PostingsWriterBase;
|
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||||
import org.apache.lucene.index.SegmentWriteState;
|
import org.apache.lucene.index.SegmentWriteState;
|
||||||
|
@ -31,11 +31,11 @@ public class Lucene50RWPostingsFormat extends Lucene50PostingsFormat {
|
||||||
boolean success = false;
|
boolean success = false;
|
||||||
try {
|
try {
|
||||||
FieldsConsumer ret =
|
FieldsConsumer ret =
|
||||||
new Lucene40BlockTreeTermsWriter(
|
new Lucene40BlockTreeTermsWriterV5(
|
||||||
state,
|
state,
|
||||||
postingsWriter,
|
postingsWriter,
|
||||||
Lucene40BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
|
Lucene40BlockTreeTermsWriterV5.DEFAULT_MIN_BLOCK_SIZE,
|
||||||
Lucene40BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
|
Lucene40BlockTreeTermsWriterV5.DEFAULT_MAX_BLOCK_SIZE);
|
||||||
success = true;
|
success = true;
|
||||||
return ret;
|
return ret;
|
||||||
} finally {
|
} finally {
|
||||||
|
|
|
@ -642,13 +642,13 @@ public class BKDWriter60 implements Closeable {
|
||||||
throws IOException {
|
throws IOException {
|
||||||
assert docMaps == null || readers.size() == docMaps.size();
|
assert docMaps == null || readers.size() == docMaps.size();
|
||||||
|
|
||||||
BKDMergeQueue queue = new BKDMergeQueue(config.bytesPerDim, readers.size());
|
BKDMergeQueue queue = new BKDMergeQueue(config.bytesPerDim(), readers.size());
|
||||||
|
|
||||||
for (int i = 0; i < readers.size(); i++) {
|
for (int i = 0; i < readers.size(); i++) {
|
||||||
PointValues pointValues = readers.get(i);
|
PointValues pointValues = readers.get(i);
|
||||||
assert pointValues.getNumDimensions() == config.numDims
|
assert pointValues.getNumDimensions() == config.numDims()
|
||||||
&& pointValues.getBytesPerDimension() == config.bytesPerDim
|
&& pointValues.getBytesPerDimension() == config.bytesPerDim()
|
||||||
&& pointValues.getNumIndexDimensions() == config.numIndexDims;
|
&& pointValues.getNumIndexDimensions() == config.numIndexDims();
|
||||||
MergeState.DocMap docMap;
|
MergeState.DocMap docMap;
|
||||||
if (docMaps == null) {
|
if (docMaps == null) {
|
||||||
docMap = null;
|
docMap = null;
|
||||||
|
|
|
@ -23,12 +23,11 @@ import java.util.Arrays;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import org.apache.lucene.backward_codecs.lucene90.Lucene90ScoreSkipReader.MutableImpactList;
|
import org.apache.lucene.backward_codecs.lucene90.Lucene90ScoreSkipReader.MutableImpactList;
|
||||||
|
import org.apache.lucene.backward_codecs.lucene99.Lucene99SkipWriter;
|
||||||
import org.apache.lucene.codecs.Codec;
|
import org.apache.lucene.codecs.Codec;
|
||||||
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
|
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
|
||||||
import org.apache.lucene.codecs.lucene90.blocktree.FieldReader;
|
import org.apache.lucene.codecs.lucene90.blocktree.FieldReader;
|
||||||
import org.apache.lucene.codecs.lucene90.blocktree.Stats;
|
import org.apache.lucene.codecs.lucene90.blocktree.Stats;
|
||||||
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat;
|
|
||||||
import org.apache.lucene.codecs.lucene99.Lucene99SkipWriter;
|
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
import org.apache.lucene.index.DirectoryReader;
|
import org.apache.lucene.index.DirectoryReader;
|
||||||
|
@ -77,22 +76,6 @@ public class TestLucene90PostingsFormat extends BasePostingsFormatTestCase {
|
||||||
d.close();
|
d.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
private void shouldFail(int minItemsInBlock, int maxItemsInBlock) {
|
|
||||||
expectThrows(
|
|
||||||
IllegalArgumentException.class,
|
|
||||||
() -> {
|
|
||||||
new Lucene99PostingsFormat(minItemsInBlock, maxItemsInBlock);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testInvalidBlockSizes() throws Exception {
|
|
||||||
shouldFail(0, 0);
|
|
||||||
shouldFail(10, 8);
|
|
||||||
shouldFail(-1, 10);
|
|
||||||
shouldFail(10, -1);
|
|
||||||
shouldFail(10, 12);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testImpactSerialization() throws IOException {
|
public void testImpactSerialization() throws IOException {
|
||||||
// omit norms and omit freqs
|
// omit norms and omit freqs
|
||||||
doTestImpactSerialization(Collections.singletonList(new Impact(1, 1L)));
|
doTestImpactSerialization(Collections.singletonList(new Impact(1, 1L)));
|
||||||
|
|
|
@ -388,10 +388,14 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
|
||||||
// write the vector data to a temporary file
|
// write the vector data to a temporary file
|
||||||
DocsWithFieldSet docsWithField =
|
DocsWithFieldSet docsWithField =
|
||||||
switch (fieldInfo.getVectorEncoding()) {
|
switch (fieldInfo.getVectorEncoding()) {
|
||||||
case BYTE -> writeByteVectorData(
|
case BYTE ->
|
||||||
tempVectorData, MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
|
writeByteVectorData(
|
||||||
case FLOAT32 -> writeVectorData(
|
tempVectorData,
|
||||||
tempVectorData, MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
|
MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
|
||||||
|
case FLOAT32 ->
|
||||||
|
writeVectorData(
|
||||||
|
tempVectorData,
|
||||||
|
MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
|
||||||
};
|
};
|
||||||
CodecUtil.writeFooter(tempVectorData);
|
CodecUtil.writeFooter(tempVectorData);
|
||||||
IOUtils.close(tempVectorData);
|
IOUtils.close(tempVectorData);
|
||||||
|
@ -638,18 +642,20 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
|
||||||
throws IOException {
|
throws IOException {
|
||||||
int dim = fieldInfo.getVectorDimension();
|
int dim = fieldInfo.getVectorDimension();
|
||||||
return switch (fieldInfo.getVectorEncoding()) {
|
return switch (fieldInfo.getVectorEncoding()) {
|
||||||
case BYTE -> new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
|
case BYTE ->
|
||||||
@Override
|
new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
|
||||||
public byte[] copyValue(byte[] value) {
|
@Override
|
||||||
return ArrayUtil.copyOfSubArray(value, 0, dim);
|
public byte[] copyValue(byte[] value) {
|
||||||
}
|
return ArrayUtil.copyOfSubArray(value, 0, dim);
|
||||||
};
|
}
|
||||||
case FLOAT32 -> new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
|
};
|
||||||
@Override
|
case FLOAT32 ->
|
||||||
public float[] copyValue(float[] value) {
|
new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
|
||||||
return ArrayUtil.copyOfSubArray(value, 0, dim);
|
@Override
|
||||||
}
|
public float[] copyValue(float[] value) {
|
||||||
};
|
return ArrayUtil.copyOfSubArray(value, 0, dim);
|
||||||
|
}
|
||||||
|
};
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -663,12 +669,14 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
|
||||||
DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer();
|
DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer();
|
||||||
RandomVectorScorerSupplier scorerSupplier =
|
RandomVectorScorerSupplier scorerSupplier =
|
||||||
switch (fieldInfo.getVectorEncoding()) {
|
switch (fieldInfo.getVectorEncoding()) {
|
||||||
case BYTE -> defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
case BYTE ->
|
||||||
fieldInfo.getVectorSimilarityFunction(),
|
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
||||||
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
|
fieldInfo.getVectorSimilarityFunction(),
|
||||||
case FLOAT32 -> defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
|
||||||
fieldInfo.getVectorSimilarityFunction(),
|
case FLOAT32 ->
|
||||||
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
|
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
||||||
|
fieldInfo.getVectorSimilarityFunction(),
|
||||||
|
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
|
||||||
};
|
};
|
||||||
hnswGraphBuilder =
|
hnswGraphBuilder =
|
||||||
HnswGraphBuilder.create(scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed);
|
HnswGraphBuilder.create(scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed);
|
||||||
|
@ -693,9 +701,9 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
|
||||||
lastDocID = docID;
|
lastDocID = docID;
|
||||||
}
|
}
|
||||||
|
|
||||||
OnHeapHnswGraph getGraph() {
|
OnHeapHnswGraph getGraph() throws IOException {
|
||||||
if (vectors.size() > 0) {
|
if (vectors.size() > 0) {
|
||||||
return hnswGraphBuilder.getGraph();
|
return hnswGraphBuilder.getCompletedGraph();
|
||||||
} else {
|
} else {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
|
@ -414,10 +414,14 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
|
||||||
// write the vector data to a temporary file
|
// write the vector data to a temporary file
|
||||||
DocsWithFieldSet docsWithField =
|
DocsWithFieldSet docsWithField =
|
||||||
switch (fieldInfo.getVectorEncoding()) {
|
switch (fieldInfo.getVectorEncoding()) {
|
||||||
case BYTE -> writeByteVectorData(
|
case BYTE ->
|
||||||
tempVectorData, MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
|
writeByteVectorData(
|
||||||
case FLOAT32 -> writeVectorData(
|
tempVectorData,
|
||||||
tempVectorData, MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
|
MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
|
||||||
|
case FLOAT32 ->
|
||||||
|
writeVectorData(
|
||||||
|
tempVectorData,
|
||||||
|
MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
|
||||||
};
|
};
|
||||||
CodecUtil.writeFooter(tempVectorData);
|
CodecUtil.writeFooter(tempVectorData);
|
||||||
IOUtils.close(tempVectorData);
|
IOUtils.close(tempVectorData);
|
||||||
|
@ -477,10 +481,12 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
|
||||||
}
|
}
|
||||||
DocIdSetIterator mergedVectorIterator = null;
|
DocIdSetIterator mergedVectorIterator = null;
|
||||||
switch (fieldInfo.getVectorEncoding()) {
|
switch (fieldInfo.getVectorEncoding()) {
|
||||||
case BYTE -> mergedVectorIterator =
|
case BYTE ->
|
||||||
KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState);
|
mergedVectorIterator =
|
||||||
case FLOAT32 -> mergedVectorIterator =
|
KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState);
|
||||||
KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState);
|
case FLOAT32 ->
|
||||||
|
mergedVectorIterator =
|
||||||
|
KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState);
|
||||||
}
|
}
|
||||||
graph =
|
graph =
|
||||||
merger.merge(
|
merger.merge(
|
||||||
|
@ -680,18 +686,20 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
|
||||||
throws IOException {
|
throws IOException {
|
||||||
int dim = fieldInfo.getVectorDimension();
|
int dim = fieldInfo.getVectorDimension();
|
||||||
return switch (fieldInfo.getVectorEncoding()) {
|
return switch (fieldInfo.getVectorEncoding()) {
|
||||||
case BYTE -> new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
|
case BYTE ->
|
||||||
@Override
|
new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
|
||||||
public byte[] copyValue(byte[] value) {
|
@Override
|
||||||
return ArrayUtil.copyOfSubArray(value, 0, dim);
|
public byte[] copyValue(byte[] value) {
|
||||||
}
|
return ArrayUtil.copyOfSubArray(value, 0, dim);
|
||||||
};
|
}
|
||||||
case FLOAT32 -> new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
|
};
|
||||||
@Override
|
case FLOAT32 ->
|
||||||
public float[] copyValue(float[] value) {
|
new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
|
||||||
return ArrayUtil.copyOfSubArray(value, 0, dim);
|
@Override
|
||||||
}
|
public float[] copyValue(float[] value) {
|
||||||
};
|
return ArrayUtil.copyOfSubArray(value, 0, dim);
|
||||||
|
}
|
||||||
|
};
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -704,12 +712,14 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
|
||||||
vectors = new ArrayList<>();
|
vectors = new ArrayList<>();
|
||||||
RandomVectorScorerSupplier scorerSupplier =
|
RandomVectorScorerSupplier scorerSupplier =
|
||||||
switch (fieldInfo.getVectorEncoding()) {
|
switch (fieldInfo.getVectorEncoding()) {
|
||||||
case BYTE -> defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
case BYTE ->
|
||||||
fieldInfo.getVectorSimilarityFunction(),
|
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
||||||
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
|
fieldInfo.getVectorSimilarityFunction(),
|
||||||
case FLOAT32 -> defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
|
||||||
fieldInfo.getVectorSimilarityFunction(),
|
case FLOAT32 ->
|
||||||
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
|
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
||||||
|
fieldInfo.getVectorSimilarityFunction(),
|
||||||
|
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
|
||||||
};
|
};
|
||||||
hnswGraphBuilder =
|
hnswGraphBuilder =
|
||||||
HnswGraphBuilder.create(scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed);
|
HnswGraphBuilder.create(scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed);
|
||||||
|
@ -732,9 +742,9 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
|
||||||
lastDocID = docID;
|
lastDocID = docID;
|
||||||
}
|
}
|
||||||
|
|
||||||
OnHeapHnswGraph getGraph() {
|
OnHeapHnswGraph getGraph() throws IOException {
|
||||||
if (vectors.size() > 0) {
|
if (vectors.size() > 0) {
|
||||||
return hnswGraphBuilder.getGraph();
|
return hnswGraphBuilder.getCompletedGraph();
|
||||||
} else {
|
} else {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,22 +14,22 @@
|
||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.codecs.lucene99;
|
package org.apache.lucene.backward_codecs.lucene99;
|
||||||
|
|
||||||
import static org.apache.lucene.codecs.lucene99.ForUtil.BLOCK_SIZE;
|
import static org.apache.lucene.backward_codecs.lucene99.ForUtil.BLOCK_SIZE;
|
||||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.DOC_CODEC;
|
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.DOC_CODEC;
|
||||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.MAX_SKIP_LEVELS;
|
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.MAX_SKIP_LEVELS;
|
||||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.PAY_CODEC;
|
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.PAY_CODEC;
|
||||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.POS_CODEC;
|
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.POS_CODEC;
|
||||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.TERMS_CODEC;
|
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.TERMS_CODEC;
|
||||||
import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.VERSION_CURRENT;
|
import static org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.VERSION_CURRENT;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
|
||||||
import org.apache.lucene.codecs.BlockTermState;
|
import org.apache.lucene.codecs.BlockTermState;
|
||||||
import org.apache.lucene.codecs.CodecUtil;
|
import org.apache.lucene.codecs.CodecUtil;
|
||||||
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
|
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
|
||||||
import org.apache.lucene.codecs.PushPostingsWriterBase;
|
import org.apache.lucene.codecs.PushPostingsWriterBase;
|
||||||
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
|
|
||||||
import org.apache.lucene.index.CorruptIndexException;
|
import org.apache.lucene.index.CorruptIndexException;
|
||||||
import org.apache.lucene.index.FieldInfo;
|
import org.apache.lucene.index.FieldInfo;
|
||||||
import org.apache.lucene.index.IndexFileNames;
|
import org.apache.lucene.index.IndexFileNames;
|
|
@ -0,0 +1,68 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.backward_codecs.lucene99;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import org.apache.lucene.codecs.FieldsConsumer;
|
||||||
|
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||||
|
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
|
||||||
|
import org.apache.lucene.index.SegmentWriteState;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
|
||||||
|
public class Lucene99RWPostingsFormat extends Lucene99PostingsFormat {
|
||||||
|
|
||||||
|
private final int minTermBlockSize;
|
||||||
|
private final int maxTermBlockSize;
|
||||||
|
|
||||||
|
/** Creates {@code Lucene99PostingsFormat} with default settings. */
|
||||||
|
public Lucene99RWPostingsFormat() {
|
||||||
|
this(
|
||||||
|
Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
|
||||||
|
Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates {@code Lucene99PostingsFormat} with custom values for {@code minBlockSize} and {@code
|
||||||
|
* maxBlockSize} passed to block terms dictionary.
|
||||||
|
*
|
||||||
|
* @see
|
||||||
|
* Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)
|
||||||
|
*/
|
||||||
|
public Lucene99RWPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
|
||||||
|
super();
|
||||||
|
Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize);
|
||||||
|
this.minTermBlockSize = minTermBlockSize;
|
||||||
|
this.maxTermBlockSize = maxTermBlockSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
|
||||||
|
PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
|
||||||
|
boolean success = false;
|
||||||
|
try {
|
||||||
|
FieldsConsumer ret =
|
||||||
|
new Lucene90BlockTreeTermsWriter(
|
||||||
|
state, postingsWriter, minTermBlockSize, maxTermBlockSize);
|
||||||
|
success = true;
|
||||||
|
return ret;
|
||||||
|
} finally {
|
||||||
|
if (!success) {
|
||||||
|
IOUtils.closeWhileHandlingException(postingsWriter);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -14,7 +14,7 @@
|
||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.codecs.lucene99;
|
package org.apache.lucene.backward_codecs.lucene99;
|
||||||
|
|
||||||
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
|
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
|
@ -14,7 +14,7 @@
|
||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.codecs.lucene99;
|
package org.apache.lucene.backward_codecs.lucene99;
|
||||||
|
|
||||||
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
|
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.backward_codecs.lucene99;
|
||||||
|
|
||||||
import org.apache.lucene.codecs.Codec;
|
import org.apache.lucene.codecs.Codec;
|
||||||
import org.apache.lucene.codecs.KnnVectorsFormat;
|
import org.apache.lucene.codecs.KnnVectorsFormat;
|
||||||
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
|
|
||||||
import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase;
|
import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase;
|
||||||
|
|
||||||
public class TestLucene99HnswScalarQuantizedVectorsFormat extends BaseKnnVectorsFormatTestCase {
|
public class TestLucene99HnswScalarQuantizedVectorsFormat extends BaseKnnVectorsFormatTestCase {
|
||||||
|
|
|
@ -14,22 +14,26 @@
|
||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.codecs.lucene99;
|
package org.apache.lucene.backward_codecs.lucene99;
|
||||||
|
|
||||||
import static org.apache.lucene.codecs.lucene99.Lucene99ScoreSkipReader.readImpacts;
|
import static org.apache.lucene.backward_codecs.lucene99.Lucene99ScoreSkipReader.readImpacts;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import org.apache.lucene.backward_codecs.lucene99.Lucene99ScoreSkipReader.MutableImpactList;
|
||||||
import org.apache.lucene.codecs.Codec;
|
import org.apache.lucene.codecs.Codec;
|
||||||
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
|
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
|
||||||
import org.apache.lucene.codecs.lucene90.blocktree.FieldReader;
|
import org.apache.lucene.codecs.lucene90.blocktree.FieldReader;
|
||||||
import org.apache.lucene.codecs.lucene90.blocktree.Stats;
|
import org.apache.lucene.codecs.lucene90.blocktree.Stats;
|
||||||
import org.apache.lucene.codecs.lucene99.Lucene99ScoreSkipReader.MutableImpactList;
|
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
import org.apache.lucene.index.*;
|
import org.apache.lucene.index.DirectoryReader;
|
||||||
|
import org.apache.lucene.index.Impact;
|
||||||
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
import org.apache.lucene.index.IndexWriterConfig;
|
||||||
|
import org.apache.lucene.index.TermsEnum;
|
||||||
import org.apache.lucene.store.ByteArrayDataInput;
|
import org.apache.lucene.store.ByteArrayDataInput;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.IOContext;
|
import org.apache.lucene.store.IOContext;
|
||||||
|
@ -41,7 +45,7 @@ import org.apache.lucene.tests.util.TestUtil;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
|
||||||
public class TestLucene99PostingsFormat extends BasePostingsFormatTestCase {
|
public class TestLucene99PostingsFormat extends BasePostingsFormatTestCase {
|
||||||
private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene99PostingsFormat());
|
private final Codec codec = TestUtil.alwaysPostingsFormat(new Lucene99RWPostingsFormat());
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Codec getCodec() {
|
protected Codec getCodec() {
|
||||||
|
@ -77,7 +81,7 @@ public class TestLucene99PostingsFormat extends BasePostingsFormatTestCase {
|
||||||
expectThrows(
|
expectThrows(
|
||||||
IllegalArgumentException.class,
|
IllegalArgumentException.class,
|
||||||
() -> {
|
() -> {
|
||||||
new Lucene99PostingsFormat(minItemsInBlock, maxItemsInBlock);
|
new Lucene99RWPostingsFormat(minItemsInBlock, maxItemsInBlock);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,7 +14,7 @@
|
||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.codecs.lucene99;
|
package org.apache.lucene.backward_codecs.lucene99;
|
||||||
|
|
||||||
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
|
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
|
@ -0,0 +1,49 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.backward_codecs.lucene99;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.store.IOContext;
|
||||||
|
import org.apache.lucene.store.IndexInput;
|
||||||
|
import org.apache.lucene.store.IndexOutput;
|
||||||
|
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||||
|
|
||||||
|
public class TestPostingsUtil extends LuceneTestCase {
|
||||||
|
|
||||||
|
// checks for bug described in https://github.com/apache/lucene/issues/13373
|
||||||
|
public void testIntegerOverflow() throws IOException {
|
||||||
|
final int size = random().nextInt(1, ForUtil.BLOCK_SIZE);
|
||||||
|
final long[] docDeltaBuffer = new long[size];
|
||||||
|
final long[] freqBuffer = new long[size];
|
||||||
|
|
||||||
|
final int delta = 1 << 30;
|
||||||
|
docDeltaBuffer[0] = delta;
|
||||||
|
try (Directory dir = newDirectory()) {
|
||||||
|
try (IndexOutput out = dir.createOutput("test", IOContext.DEFAULT)) {
|
||||||
|
// In old implementation, this would cause integer overflow exception.
|
||||||
|
PostingsUtil.writeVIntBlock(out, docDeltaBuffer, freqBuffer, size, true);
|
||||||
|
}
|
||||||
|
long[] restoredDocs = new long[size];
|
||||||
|
long[] restoredFreqs = new long[size];
|
||||||
|
try (IndexInput in = dir.openInput("test", IOContext.DEFAULT)) {
|
||||||
|
PostingsUtil.readVIntBlock(in, restoredDocs, restoredFreqs, size, true, true);
|
||||||
|
}
|
||||||
|
assertEquals(delta, restoredDocs[0]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -196,6 +196,7 @@ public class TestAncientIndicesCompatibility extends LuceneTestCase {
|
||||||
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
|
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
|
||||||
CheckIndex checker = new CheckIndex(dir);
|
CheckIndex checker = new CheckIndex(dir);
|
||||||
checker.setInfoStream(new PrintStream(bos, false, UTF_8));
|
checker.setInfoStream(new PrintStream(bos, false, UTF_8));
|
||||||
|
checker.setLevel(CheckIndex.Level.MIN_LEVEL_FOR_INTEGRITY_CHECKS);
|
||||||
CheckIndex.Status indexStatus = checker.checkIndex();
|
CheckIndex.Status indexStatus = checker.checkIndex();
|
||||||
if (version.startsWith("8.")) {
|
if (version.startsWith("8.")) {
|
||||||
assertTrue(indexStatus.clean);
|
assertTrue(indexStatus.clean);
|
||||||
|
|
|
@ -20,9 +20,9 @@ import static org.apache.lucene.backward_index.TestBasicBackwardsCompatibility.a
|
||||||
|
|
||||||
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
|
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import org.apache.lucene.backward_codecs.lucene99.Lucene99Codec;
|
||||||
import org.apache.lucene.codecs.Codec;
|
import org.apache.lucene.codecs.Codec;
|
||||||
import org.apache.lucene.codecs.KnnVectorsFormat;
|
import org.apache.lucene.codecs.KnnVectorsFormat;
|
||||||
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
|
|
||||||
import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
|
import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
|
||||||
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
|
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -40,3 +40,4 @@
|
||||||
9.9.2
|
9.9.2
|
||||||
9.10.0
|
9.10.0
|
||||||
9.11.0
|
9.11.0
|
||||||
|
9.11.1
|
||||||
|
|
|
@ -0,0 +1,376 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.benchmark.jmh;

import java.util.Arrays;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.search.DocIdSetIterator;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.CompilerControl;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;

@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
@Warmup(iterations = 5, time = 1)
@Measurement(iterations = 5, time = 1)
@Fork(
    value = 1,
    jvmArgsAppend = {"-Xmx1g", "-Xms1g", "-XX:+AlwaysPreTouch"})
public class AdvanceBenchmark {

  private final long[] values = new long[129];
  private final int[] startIndexes = new int[1_000];
  private final long[] targets = new long[startIndexes.length];

  @Setup(Level.Trial)
  public void setup() throws Exception {
    for (int i = 0; i < 128; ++i) {
      values[i] = i;
    }
    values[128] = DocIdSetIterator.NO_MORE_DOCS;
    Random r = new Random(0);
    for (int i = 0; i < startIndexes.length; ++i) {
      startIndexes[i] = r.nextInt(64);
      targets[i] = startIndexes[i] + 1 + r.nextInt(1 << r.nextInt(7));
    }
  }

  @Benchmark
  public void binarySearch() {
    for (int i = 0; i < startIndexes.length; ++i) {
      binarySearch(values, targets[i], startIndexes[i]);
    }
  }

  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
  private static int binarySearch(long[] values, long target, int startIndex) {
    // Standard binary search
    int i = Arrays.binarySearch(values, startIndex, values.length, target);
    if (i < 0) {
      i = -1 - i;
    }
    return i;
  }

  @Benchmark
  public void binarySearch2() {
    for (int i = 0; i < startIndexes.length; ++i) {
      binarySearch2(values, targets[i], startIndexes[i]);
    }
  }

  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
  private static int binarySearch2(long[] values, long target, int startIndex) {
    // Try to help the compiler by providing predictable start/end offsets.
    int i = Arrays.binarySearch(values, 0, 128, target);
    if (i < 0) {
      i = -1 - i;
    }
    return i;
  }

  @Benchmark
  public void binarySearch3() {
    for (int i = 0; i < startIndexes.length; ++i) {
      binarySearch3(values, targets[i], startIndexes[i]);
    }
  }

  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
  private static int binarySearch3(long[] values, long target, int startIndex) {
    // Organize code the same way as suggested in https://quickwit.io/blog/search-a-sorted-block,
    // which proved to help with LLVM.
    int start = 0;
    int length = 128;

    while (length > 1) {
      length /= 2;
      if (values[start + length - 1] < target) {
        start += length;
      }
    }
    return start;
  }

  @Benchmark
  public void binarySearch4() {
    for (int i = 0; i < startIndexes.length; ++i) {
      binarySearch4(values, targets[i], startIndexes[i]);
    }
  }

  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
  private static int binarySearch4(long[] values, long target, int startIndex) {
    // Explicitly inline the binary-search logic to see if it helps the compiler.
    int start = 0;

    if (values[63] < target) {
      start += 64;
    }
    if (values[start + 31] < target) {
      start += 32;
    }
    if (values[start + 15] < target) {
      start += 16;
    }
    if (values[start + 7] < target) {
      start += 8;
    }
    if (values[start + 3] < target) {
      start += 4;
    }
    if (values[start + 1] < target) {
      start += 2;
    }
    if (values[start] < target) {
      start += 1;
    }

    return start;
  }

  @Benchmark
  public void binarySearch5() {
    for (int i = 0; i < startIndexes.length; ++i) {
      binarySearch5(values, targets[i], startIndexes[i]);
    }
  }

  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
  private static int binarySearch5(long[] values, long target, int startIndex) {
    // Other way to write a binary search
    int start = 0;

    for (int shift = 6; shift >= 0; --shift) {
      int halfRange = 1 << shift;
      if (values[start + halfRange - 1] < target) {
        start += halfRange;
      }
    }

    return start;
  }

  @Benchmark
  public void binarySearch6() {
    for (int i = 0; i < startIndexes.length; ++i) {
      binarySearch6(values, targets[i], startIndexes[i]);
    }
  }

  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
  private static int binarySearch6(long[] values, long target, int startIndex) {
    // Other way to write a binary search
    int start = 0;

    for (int halfRange = 64; halfRange > 0; halfRange >>= 1) {
      if (values[start + halfRange - 1] < target) {
        start += halfRange;
      }
    }

    return start;
  }

  @Benchmark
  public void linearSearch() {
    for (int i = 0; i < startIndexes.length; ++i) {
      linearSearch(values, targets[i], startIndexes[i]);
    }
  }

  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
  private static int linearSearch(long[] values, long target, int startIndex) {
    // Naive linear search.
    for (int i = startIndex; i < values.length; ++i) {
      if (values[i] >= target) {
        return i;
      }
    }
    return values.length;
  }

  @Benchmark
  public void bruteForceSearch() {
    for (int i = 0; i < startIndexes.length; ++i) {
      bruteForceSearch(values, targets[i], startIndexes[i]);
    }
  }

  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
  private static int bruteForceSearch(long[] values, long target, int startIndex) {
    // Linear search with predictable start/end offsets to see if it helps the compiler.
    for (int i = 0; i < 128; ++i) {
      if (values[i] >= target) {
        return i;
      }
    }
    return values.length;
  }

  @Benchmark
  public void linearSearch2() {
    for (int i = 0; i < startIndexes.length; ++i) {
      linearSearch2(values, targets[i], startIndexes[i]);
    }
  }

  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
  private static int linearSearch2(long[] values, long target, int startIndex) {
    // Two-level linear search, first checking every 8-th value, then values within an 8-value range
    int rangeStart = values.length - 8;

    for (int i = startIndex; i + 8 <= values.length; i += 8) {
      if (values[i + 7] >= target) {
        rangeStart = i;
        break;
      }
    }

    for (int i = 0; i < 8; ++i) {
      if (values[rangeStart + i] >= target) {
        return rangeStart + i;
      }
    }

    return values.length;
  }

  @Benchmark
  public void linearSearch3() {
    for (int i = 0; i < startIndexes.length; ++i) {
      linearSearch3(values, targets[i], startIndexes[i]);
    }
  }

  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
  private static int linearSearch3(long[] values, long target, int startIndex) {
    // Iteration over linearSearch that tries to reduce branches
    while (startIndex + 4 <= values.length) {
      int count = values[startIndex] < target ? 1 : 0;
      if (values[startIndex + 1] < target) {
        count++;
      }
      if (values[startIndex + 2] < target) {
        count++;
      }
      if (values[startIndex + 3] < target) {
        count++;
      }
      if (count != 4) {
        return startIndex + count;
      }
      startIndex += 4;
    }

    for (int i = startIndex; i < values.length; ++i) {
      if (values[i] >= target) {
        return i;
      }
    }

    return values.length;
  }

  @Benchmark
  public void hybridSearch() {
    for (int i = 0; i < startIndexes.length; ++i) {
      hybridSearch(values, targets[i], startIndexes[i]);
    }
  }

  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
  private static int hybridSearch(long[] values, long target, int startIndex) {
    // Two-level linear search, first checking every 8-th value, then values within an 8-value range
    int rangeStart = values.length - 8;

    for (int i = startIndex; i + 8 <= values.length; i += 8) {
      if (values[i + 7] >= target) {
        rangeStart = i;
        break;
      }
    }

    return binarySearchHelper8(values, target, rangeStart);
  }

  // branchless binary search over 8 values
  private static int binarySearchHelper8(long[] values, long target, int start) {
    if (values[start + 3] < target) {
      start += 4;
    }
    if (values[start + 1] < target) {
      start += 2;
    }
    if (values[start] < target) {
      start += 1;
    }
    return start;
  }

  private static void assertEquals(int expected, int actual) {
    if (expected != actual) {
      throw new AssertionError("Expected: " + expected + ", got " + actual);
    }
  }

  public static void main(String[] args) {
    // For testing purposes
    long[] values = new long[129];
    for (int i = 0; i < 128; ++i) {
      values[i] = i;
    }
    values[128] = DocIdSetIterator.NO_MORE_DOCS;
    for (int start = 0; start < 128; ++start) {
      for (int targetIndex = start; targetIndex < 128; ++targetIndex) {
        int actualIndex = binarySearch(values, values[targetIndex], start);
        assertEquals(targetIndex, actualIndex);
        actualIndex = binarySearch2(values, values[targetIndex], start);
        assertEquals(targetIndex, actualIndex);
        actualIndex = binarySearch3(values, values[targetIndex], start);
        assertEquals(targetIndex, actualIndex);
        actualIndex = binarySearch4(values, values[targetIndex], start);
        assertEquals(targetIndex, actualIndex);
        actualIndex = binarySearch5(values, values[targetIndex], start);
        assertEquals(targetIndex, actualIndex);
        actualIndex = binarySearch6(values, values[targetIndex], start);
        assertEquals(targetIndex, actualIndex);
        actualIndex = bruteForceSearch(values, values[targetIndex], start);
        assertEquals(targetIndex, actualIndex);
        actualIndex = hybridSearch(values, values[targetIndex], start);
        assertEquals(targetIndex, actualIndex);
        actualIndex = linearSearch(values, values[targetIndex], start);
        assertEquals(targetIndex, actualIndex);
        actualIndex = linearSearch2(values, values[targetIndex], start);
        assertEquals(targetIndex, actualIndex);
        actualIndex = linearSearch3(values, values[targetIndex], start);
        assertEquals(targetIndex, actualIndex);
      }
    }
  }
}
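Note (added for illustration, not part of the commit): a JMH benchmark class like the one above is normally driven by Lucene's benchmark-jmh module; the exact gradle/CLI wiring is not shown here, but the standard JMH runner API can launch it directly. A minimal, hypothetical launcher sketch:

import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;

public class RunAdvanceBenchmark {
  public static void main(String[] args) throws Exception {
    // Select benchmarks whose name matches "AdvanceBenchmark" and run them with the
    // fork/warmup/measurement settings declared by the annotations on the class.
    Options opt = new OptionsBuilder().include(AdvanceBenchmark.class.getSimpleName()).build();
    new Runner(opt).run();
  }
}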
@@ -0,0 +1,75 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.benchmark.jmh;

import java.io.IOException;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.util.VectorUtil;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;

@Fork(1)
@Warmup(iterations = 3, time = 3)
@Measurement(iterations = 5, time = 3)
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.SECONDS)
@State(Scope.Benchmark)
public class HammingDistanceBenchmark {
  @Param({"1000000"})
  int nb = 1_000_000;

  @Param({"1024"})
  int dims = 1024;

  byte[][] xb;
  byte[] xq;

  @Setup
  public void setup() throws IOException {
    Random rand = new Random();
    this.xb = new byte[nb][dims / 8];
    for (int i = 0; i < nb; i++) {
      for (int j = 0; j < dims / 8; j++) {
        xb[i][j] = (byte) rand.nextInt(0, 255);
      }
    }
    this.xq = new byte[dims / 8];
    for (int i = 0; i < xq.length; i++) {
      xq[i] = (byte) rand.nextInt(0, 255);
    }
  }

  @Benchmark
  public int xorBitCount() {
    int tot = 0;
    for (int i = 0; i < nb; i++) {
      tot += VectorUtil.xorBitCount(xb[i], xq);
    }
    return tot;
  }
}
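Note (illustrative, not part of the commit): the benchmark above exercises VectorUtil.xorBitCount over packed bit vectors, i.e. a Hamming distance between two byte[] of equal length. Assuming those semantics, a plain scalar equivalent looks like this:

static int scalarXorBitCount(byte[] a, byte[] b) {
  int distance = 0;
  for (int i = 0; i < a.length; i++) {
    // XOR leaves a 1 bit wherever the two vectors differ; count those bits per byte.
    distance += Integer.bitCount((a[i] ^ b[i]) & 0xFF);
  }
  return distance;
}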
@@ -0,0 +1,108 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.benchmark.jmh;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.codecs.lucene912.ForDeltaUtil;
import org.apache.lucene.codecs.lucene912.ForUtil;
import org.apache.lucene.codecs.lucene912.PostingIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.IOUtils;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;

@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
@State(Scope.Benchmark)
@Warmup(iterations = 5, time = 1)
@Measurement(iterations = 5, time = 1)
@Fork(
    value = 3,
    jvmArgsAppend = {"-Xmx1g", "-Xms1g", "-XX:+AlwaysPreTouch"})
public class PostingIndexInputBenchmark {

  private Path path;
  private Directory dir;
  private IndexInput in;
  private PostingIndexInput postingIn;
  private final ForUtil forUtil = new ForUtil();
  private final ForDeltaUtil forDeltaUtil = new ForDeltaUtil();
  private final long[] values = new long[128];

  @Param({"2", "3", "4", "5", "6", "7", "8", "9", "10"})
  public int bpv;

  @Setup(Level.Trial)
  public void setup() throws Exception {
    path = Files.createTempDirectory("forUtil");
    dir = MMapDirectory.open(path);
    try (IndexOutput out = dir.createOutput("docs", IOContext.DEFAULT)) {
      Random r = new Random(0);
      // Write enough random data to not reach EOF while decoding
      for (int i = 0; i < 100; ++i) {
        out.writeLong(r.nextLong());
      }
    }
    in = dir.openInput("docs", IOContext.DEFAULT);
    postingIn = new PostingIndexInput(in, forUtil, forDeltaUtil);
  }

  @TearDown(Level.Trial)
  public void tearDown() throws Exception {
    if (dir != null) {
      dir.deleteFile("docs");
    }
    IOUtils.close(in, dir);
    in = null;
    dir = null;
    Files.deleteIfExists(path);
  }

  @Benchmark
  public void decode(Blackhole bh) throws IOException {
    in.seek(3); // random unaligned offset
    postingIn.decode(bpv, values);
    bh.consume(values);
  }

  @Benchmark
  public void decodeAndPrefixSum(Blackhole bh) throws IOException {
    in.seek(3); // random unaligned offset
    postingIn.decodeAndPrefixSum(bpv, 100, values);
    bh.consume(values);
  }
}
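Note (illustrative, not part of the commit): decodeAndPrefixSum(bpv, 100, values) is benchmarked separately because postings encode doc IDs as deltas; after decoding, a running sum rebased on the previous block's last doc ID (100 in the benchmark) restores absolute IDs. Assuming those semantics, the scalar version of the prefix-sum step is simply:

static void prefixSum(long[] deltas, long base) {
  long acc = base;
  for (int i = 0; i < deltas.length; i++) {
    acc += deltas[i]; // each output value is the base plus all deltas seen so far
    deltas[i] = acc;
  }
}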
@@ -17,11 +17,10 @@
 # -------------------------------------------------------------------------------------
 # multi val params are iterated by NewRound's, added to reports, start with column name.
 
-# collector.class can be:
-#  Fully Qualified Class Name of a Collector with a empty constructor
-#  topScoreDocOrdered - Creates a TopScoreDocCollector that requires in order docs
-#  topScoreDocUnordered - Like above, but allows out of order
-collector.class=coll:topScoreDoc
+# collector.manager.class can be:
+#  Fully Qualified Class Name of a CollectorManager with a empty constructor
+#  topScoreDoc - Creates a TopScoreDocCollectorManager
+collector.manager.class=coll:topScoreDoc
 
 analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer
 directory=FSDirectory

@@ -17,11 +17,10 @@
 # -------------------------------------------------------------------------------------
 # multi val params are iterated by NewRound's, added to reports, start with column name.
 
-# collector.class can be:
-#  Fully Qualified Class Name of a Collector with a empty constructor
-#  topScoreDocOrdered - Creates a TopScoreDocCollector that requires in order docs
-#  topScoreDocUnordered - Like above, but allows out of order
-collector.class=coll:topScoreDoc
+# collector.manager.class can be:
+#  Fully Qualified Class Name of a CollectorManager with a empty constructor
+#  topScoreDoc - Creates a TopScoreDocCollectorManager
+collector.manager.class=coll:topScoreDoc
 
 analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer
 directory=FSDirectory
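Note (illustrative, not part of the commit): collector.manager.class now has to name a CollectorManager with a no-arg constructor. A minimal hypothetical implementation that just counts hits could look like the sketch below (the class and field names are made up for this example):

import java.util.Collection;
import org.apache.lucene.search.CollectorManager;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.SimpleCollector;

public class HitCountCollectorManager
    implements CollectorManager<HitCountCollectorManager.HitCountCollector, Integer> {

  public HitCountCollectorManager() {}

  @Override
  public HitCountCollector newCollector() {
    return new HitCountCollector(); // one collector per index slice
  }

  @Override
  public Integer reduce(Collection<HitCountCollector> collectors) {
    int total = 0;
    for (HitCountCollector c : collectors) {
      total += c.count; // merge the per-slice counts
    }
    return total;
  }

  public static final class HitCountCollector extends SimpleCollector {
    int count;

    @Override
    public void collect(int doc) {
      count++;
    }

    @Override
    public ScoreMode scoreMode() {
      return ScoreMode.COMPLETE_NO_SCORES;
    }
  }
}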
@@ -238,7 +238,7 @@ public class EnwikiContentSource extends ContentSource {
           time = null;
           id = null;
           break;
           // intentional fall-through.
         case BODY:
         case DATE:
         case TITLE:
@@ -99,7 +99,7 @@ public class SpatialDocMaker extends DocMaker {
         return makeRPTStrategy(SPATIAL_FIELD, config, configMap, ctx);
       case "composite":
         return makeCompositeStrategy(config, configMap, ctx);
         // TODO add more as-needed
       default:
         throw new IllegalStateException("Unknown spatial.strategy: " + strategyName);
     }
@@ -24,7 +24,7 @@ import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiBits;
 import org.apache.lucene.index.StoredFields;
-import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.CollectorManager;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;

@@ -119,9 +119,7 @@ public abstract class ReadTask extends PerfTask {
             hits = searcher.search(q, numHits);
           }
         } else {
-          Collector collector = createCollector();
-
-          searcher.search(q, collector);
+          searcher.search(q, createCollectorManager());
           // hits = collector.topDocs();
         }
 

@@ -184,9 +182,8 @@ public abstract class ReadTask extends PerfTask {
     return res;
   }
 
-  protected Collector createCollector() throws Exception {
-    return new TopScoreDocCollectorManager(numHits(), withTotalHits() ? Integer.MAX_VALUE : 1)
-        .newCollector();
+  protected CollectorManager<?, ?> createCollectorManager() throws Exception {
+    return new TopScoreDocCollectorManager(numHits(), withTotalHits() ? Integer.MAX_VALUE : 1);
   }
 
   protected Document retrieveDoc(StoredFields storedFields, int id) throws IOException {
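Note (illustrative, not part of the commit): with a CollectorManager, IndexSearcher can collect each segment slice with its own Collector (possibly on several threads) and then reduce() the partial results into one value. A typical call site, using the TopScoreDocCollectorManager already referenced above, looks like:

// numHits and q as in ReadTask; reduce() of TopScoreDocCollectorManager yields a TopDocs.
TopDocs top = searcher.search(q, new TopScoreDocCollectorManager(numHits, Integer.MAX_VALUE));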
@@ -19,8 +19,8 @@ package org.apache.lucene.benchmark.byTask.tasks;
 import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
 import org.apache.lucene.benchmark.byTask.utils.Config;
-import org.apache.lucene.search.Collector;
-import org.apache.lucene.search.TopScoreDocCollector;
+import org.apache.lucene.search.CollectorManager;
+import org.apache.lucene.search.TopScoreDocCollectorManager;
 
 /** Does search w/ a custom collector */
 public class SearchWithCollectorTask extends SearchTask {

@@ -37,7 +37,11 @@ public class SearchWithCollectorTask extends SearchTask {
     // check to make sure either the doc is being stored
     PerfRunData runData = getRunData();
     Config config = runData.getConfig();
-    clnName = config.get("collector.class", "");
+    if (config.get("collector.class", null) != null) {
+      throw new IllegalArgumentException(
+          "collector.class is no longer supported as a config parameter, use collector.manager.class instead to provide a CollectorManager class name");
+    }
+    clnName = config.get("collector.manager.class", "");
   }
 
   @Override

@@ -46,17 +50,17 @@ public class SearchWithCollectorTask extends SearchTask {
   }
 
   @Override
-  protected Collector createCollector() throws Exception {
-    Collector collector = null;
+  protected CollectorManager<?, ?> createCollectorManager() throws Exception {
+    CollectorManager<?, ?> collectorManager;
     if (clnName.equalsIgnoreCase("topScoreDoc") == true) {
-      collector = TopScoreDocCollector.create(numHits(), Integer.MAX_VALUE);
+      collectorManager = new TopScoreDocCollectorManager(numHits(), Integer.MAX_VALUE);
     } else if (clnName.length() > 0) {
-      collector = Class.forName(clnName).asSubclass(Collector.class).getConstructor().newInstance();
+      collectorManager =
+          Class.forName(clnName).asSubclass(CollectorManager.class).getConstructor().newInstance();
     } else {
-      collector = super.createCollector();
+      collectorManager = super.createCollectorManager();
     }
-    return collector;
+    return collectorManager;
   }
 
   @Override
@@ -23,13 +23,13 @@ import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.PostingsReaderBase;
 import org.apache.lucene.codecs.PostingsWriterBase;
 import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
-import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
-import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
+import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
+import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.util.IOUtils;
 
-/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene99PostingsWriter}. */
+/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene912PostingsWriter}. */
 public class BlockTreeOrdsPostingsFormat extends PostingsFormat {
 
   private final int minTermBlockSize;

@@ -67,7 +67,7 @@ public class BlockTreeOrdsPostingsFormat extends PostingsFormat {
 
   @Override
   public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
-    PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
+    PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
 
     boolean success = false;
     try {

@@ -84,7 +84,7 @@ public class BlockTreeOrdsPostingsFormat extends PostingsFormat {
 
   @Override
   public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
-    PostingsReaderBase postingsReader = new Lucene99PostingsReader(state);
+    PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
     boolean success = false;
     try {
       FieldsProducer ret = new OrdsBlockTreeTermsReader(postingsReader, state);
@@ -43,6 +43,7 @@ import org.apache.lucene.store.ChecksumIndexInput;
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOBooleanSupplier;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.automaton.CompiledAutomaton;
 

@@ -315,12 +316,21 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat {
     }
 
     @Override
-    public boolean seekExact(BytesRef text) throws IOException {
+    public IOBooleanSupplier prepareSeekExact(BytesRef text) throws IOException {
       // The magical fail-fast speed up that is the entire point of all of
       // this code - save a disk seek if there is a match on an in-memory
       // structure
       // that may occasionally give a false positive but guaranteed no false
       // negatives
+      if (filter.contains(text) == ContainsResult.NO) {
+        return null;
+      }
+      return delegate().prepareSeekExact(text);
+    }
+
+    @Override
+    public boolean seekExact(BytesRef text) throws IOException {
+      // See #prepareSeekExact
       if (filter.contains(text) == ContainsResult.NO) {
         return false;
       }
@@ -24,7 +24,7 @@ import java.util.TreeMap;
 import org.apache.lucene.codecs.FieldsConsumer;
 import org.apache.lucene.codecs.FieldsProducer;
 import org.apache.lucene.codecs.PostingsFormat;
-import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat;
+import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
 import org.apache.lucene.index.BaseTermsEnum;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.Fields;

@@ -54,7 +54,7 @@ import org.apache.lucene.util.automaton.TransitionAccessor;
 // - or: longer dense skip lists than just next byte?
 
 /**
- * Wraps {@link Lucene99PostingsFormat} format for on-disk storage, but then at read time loads and
+ * Wraps {@link Lucene912PostingsFormat} format for on-disk storage, but then at read time loads and
 * stores all terms and postings directly in RAM as byte[], int[].
 *
 * <p><b>WARNING</b>: This is exceptionally RAM intensive: it makes no effort to compress the

@@ -97,12 +97,12 @@ public final class DirectPostingsFormat extends PostingsFormat {
 
   @Override
   public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
-    return PostingsFormat.forName("Lucene99").fieldsConsumer(state);
+    return PostingsFormat.forName("Lucene912").fieldsConsumer(state);
   }
 
   @Override
   public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
-    FieldsProducer postings = PostingsFormat.forName("Lucene99").fieldsProducer(state);
+    FieldsProducer postings = PostingsFormat.forName("Lucene912").fieldsProducer(state);
     if (state.context.context() != IOContext.Context.MERGE) {
       FieldsProducer loadedPostings;
       try {
@@ -22,8 +22,8 @@ import org.apache.lucene.codecs.FieldsProducer;
 import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.PostingsReaderBase;
 import org.apache.lucene.codecs.PostingsWriterBase;
-import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
-import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
+import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
+import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.util.IOUtils;

@@ -41,7 +41,7 @@ public final class FSTPostingsFormat extends PostingsFormat {
 
   @Override
   public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
-    PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
+    PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
 
     boolean success = false;
     try {

@@ -57,7 +57,7 @@ public final class FSTPostingsFormat extends PostingsFormat {
 
   @Override
   public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
-    PostingsReaderBase postingsReader = new Lucene99PostingsReader(state);
+    PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
     boolean success = false;
     try {
       FieldsProducer ret = new FSTTermsReader(state, postingsReader);
@@ -195,9 +195,10 @@ public class FSTTermsReader extends FieldsProducer {
       this.sumTotalTermFreq = sumTotalTermFreq;
       this.sumDocFreq = sumDocFreq;
       this.docCount = docCount;
-      OffHeapFSTStore offHeapFSTStore = new OffHeapFSTStore();
       FSTTermOutputs outputs = new FSTTermOutputs(fieldInfo);
-      this.dict = new FST<>(FST.readMetadata(in, outputs), in, offHeapFSTStore);
+      final var fstMetadata = FST.readMetadata(in, outputs);
+      OffHeapFSTStore offHeapFSTStore = new OffHeapFSTStore(in, in.getFilePointer(), fstMetadata);
+      this.dict = FST.fromFSTReader(fstMetadata, offHeapFSTStore);
       in.skipBytes(offHeapFSTStore.size());
     }
 
@@ -71,8 +71,8 @@ final class SimpleTextBKDReader extends PointValues {
     this.pointCount = pointCount;
     this.docCount = docCount;
     this.version = SimpleTextBKDWriter.VERSION_CURRENT;
-    assert minPackedValue.length == config.packedIndexBytesLength;
-    assert maxPackedValue.length == config.packedIndexBytesLength;
+    assert minPackedValue.length == config.packedIndexBytesLength();
+    assert maxPackedValue.length == config.packedIndexBytesLength();
   }
 
   @Override

@@ -99,8 +99,8 @@ final class SimpleTextBKDReader extends PointValues {
     private SimpleTextPointTree(
         IndexInput in, int nodeID, int level, byte[] minPackedValue, byte[] maxPackedValue) {
       this.in = in;
-      this.scratchDocIDs = new int[config.maxPointsInLeafNode];
-      this.scratchPackedValue = new byte[config.packedBytesLength];
+      this.scratchDocIDs = new int[config.maxPointsInLeafNode()];
+      this.scratchPackedValue = new byte[config.packedBytesLength()];
       this.nodeID = nodeID;
       this.rootNode = nodeID;
       this.level = level;

@@ -145,38 +145,39 @@ final class SimpleTextBKDReader extends PointValues {
     private void pushLeft() {
       int address = nodeID * bytesPerIndexEntry;
       // final int splitDimPos;
-      if (config.numIndexDims == 1) {
+      if (config.numIndexDims() == 1) {
         splitDims[level] = 0;
       } else {
         splitDims[level] = (splitPackedValues[address++] & 0xff);
       }
-      final int splitDimPos = splitDims[level] * config.bytesPerDim;
+      final int splitDimPos = splitDims[level] * config.bytesPerDim();
       if (splitDimValueStack[level] == null) {
-        splitDimValueStack[level] = new byte[config.bytesPerDim];
+        splitDimValueStack[level] = new byte[config.bytesPerDim()];
       }
       // save the dimension we are going to change
       System.arraycopy(
-          maxPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim);
+          maxPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim());
       assert Arrays.compareUnsigned(
                   maxPackedValue,
                   splitDimPos,
-                  splitDimPos + config.bytesPerDim,
+                  splitDimPos + config.bytesPerDim(),
                   splitPackedValues,
                   address,
-                  address + config.bytesPerDim)
+                  address + config.bytesPerDim())
               >= 0
-          : "config.bytesPerDim="
-              + config.bytesPerDim
+          : "config.bytesPerDim()="
+              + config.bytesPerDim()
               + " splitDim="
               + splitDims[level]
-              + " config.numIndexDims="
-              + config.numIndexDims
+              + " config.numIndexDims()="
+              + config.numIndexDims()
               + " config.numDims="
-              + config.numDims;
+              + config.numDims();
       nodeID *= 2;
       level++;
       // add the split dim value:
-      System.arraycopy(splitPackedValues, address, maxPackedValue, splitDimPos, config.bytesPerDim);
+      System.arraycopy(
+          splitPackedValues, address, maxPackedValue, splitDimPos, config.bytesPerDim());
     }
 
     @Override

@@ -191,37 +192,38 @@ final class SimpleTextBKDReader extends PointValues {
 
     private void pushRight() {
       int address = nodeID * bytesPerIndexEntry;
-      if (config.numIndexDims == 1) {
+      if (config.numIndexDims() == 1) {
         splitDims[level] = 0;
       } else {
         splitDims[level] = (splitPackedValues[address++] & 0xff);
       }
-      final int splitDimPos = splitDims[level] * config.bytesPerDim;
+      final int splitDimPos = splitDims[level] * config.bytesPerDim();
       // we should have already visit the left node
       assert splitDimValueStack[level] != null;
       // save the dimension we are going to change
       System.arraycopy(
-          minPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim);
+          minPackedValue, splitDimPos, splitDimValueStack[level], 0, config.bytesPerDim());
       assert Arrays.compareUnsigned(
                   minPackedValue,
                   splitDimPos,
-                  splitDimPos + config.bytesPerDim,
+                  splitDimPos + config.bytesPerDim(),
                   splitPackedValues,
                   address,
-                  address + config.bytesPerDim)
+                  address + config.bytesPerDim())
               <= 0
-          : "config.bytesPerDim="
-              + config.bytesPerDim
+          : "config.bytesPerDim()="
+              + config.bytesPerDim()
               + " splitDim="
               + splitDims[level]
-              + " config.numIndexDims="
-              + config.numIndexDims
+              + " config.numIndexDims()="
+              + config.numIndexDims()
               + " config.numDims="
-              + config.numDims;
+              + config.numDims();
       nodeID = 2 * nodeID + 1;
       level++;
       // add the split dim value:
-      System.arraycopy(splitPackedValues, address, minPackedValue, splitDimPos, config.bytesPerDim);
+      System.arraycopy(
+          splitPackedValues, address, minPackedValue, splitDimPos, config.bytesPerDim());
     }
 
     @Override

@@ -242,16 +244,16 @@ final class SimpleTextBKDReader extends PointValues {
             splitDimValueStack[level],
             0,
             maxPackedValue,
-            splitDims[level] * config.bytesPerDim,
-            config.bytesPerDim);
+            splitDims[level] * config.bytesPerDim(),
+            config.bytesPerDim());
       } else {
 
         System.arraycopy(
             splitDimValueStack[level],
             0,
             minPackedValue,
-            splitDims[level] * config.bytesPerDim,
-            config.bytesPerDim);
+            splitDims[level] * config.bytesPerDim(),
+            config.bytesPerDim());
       }
     }
 

@@ -290,7 +292,7 @@ final class SimpleTextBKDReader extends PointValues {
     private long sizeFromBalancedTree(int leftMostLeafNode, int rightMostLeafNode) {
       // number of points that need to be distributed between leaves, one per leaf
       final int extraPoints =
-          Math.toIntExact(((long) config.maxPointsInLeafNode * leafNodeOffset) - pointCount);
+          Math.toIntExact(((long) config.maxPointsInLeafNode() * leafNodeOffset) - pointCount);
       assert extraPoints < leafNodeOffset : "point excess should be lower than leafNodeOffset";
       // offset where we stop adding one point to the leaves
       final int nodeOffset = leafNodeOffset - extraPoints;

@@ -298,9 +300,9 @@ final class SimpleTextBKDReader extends PointValues {
       for (int node = leftMostLeafNode; node <= rightMostLeafNode; node++) {
         // offsetPosition provides which extra point will be added to this node
         if (balanceTreeNodePosition(0, leafNodeOffset, node - leafNodeOffset, 0, 0) < nodeOffset) {
-          count += config.maxPointsInLeafNode;
+          count += config.maxPointsInLeafNode();
         } else {
-          count += config.maxPointsInLeafNode - 1;
+          count += config.maxPointsInLeafNode() - 1;
         }
       }
       return count;

@@ -376,14 +378,14 @@ final class SimpleTextBKDReader extends PointValues {
         // Again, this time reading values and checking with the visitor
         visitor.grow(count);
         // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
-        assert scratchPackedValue.length == config.packedBytesLength;
+        assert scratchPackedValue.length == config.packedBytesLength();
         BytesRefBuilder scratch = new BytesRefBuilder();
         for (int i = 0; i < count; i++) {
           readLine(in, scratch);
           assert startsWith(scratch, BLOCK_VALUE);
           BytesRef br = SimpleTextUtil.fromBytesRefString(stripPrefix(scratch, BLOCK_VALUE));
-          assert br.length == config.packedBytesLength;
-          System.arraycopy(br.bytes, br.offset, scratchPackedValue, 0, config.packedBytesLength);
+          assert br.length == config.packedBytesLength();
+          System.arraycopy(br.bytes, br.offset, scratchPackedValue, 0, config.packedBytesLength());
           visitor.visit(scratchDocIDs[i], scratchPackedValue);
         }
       } else {

@@ -443,17 +445,17 @@ final class SimpleTextBKDReader extends PointValues {
 
     @Override
     public int getNumDimensions() throws IOException {
-      return config.numDims;
+      return config.numDims();
     }
 
     @Override
     public int getNumIndexDimensions() throws IOException {
-      return config.numIndexDims;
+      return config.numIndexDims();
     }
 
     @Override
     public int getBytesPerDimension() throws IOException {
-      return config.bytesPerDim;
+      return config.bytesPerDim();
    }
 
     @Override
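Note (illustrative, not part of the commit): the mechanical change running through this file and the next is that BKDConfig values such as bytesPerDim or numDims are now read through accessor methods (bytesPerDim(), numDims(), ...) instead of public fields. That is the calling convention one would get if the config object were turned into a Java record, though this commit excerpt does not show BKDConfig itself; a hypothetical stand-in just to show the accessor style:

// Not the real BKDConfig, only an example of record-style accessors.
record PointConfig(int numDims, int numIndexDims, int bytesPerDim, int maxPointsInLeafNode) {}
// new PointConfig(2, 2, 4, 512).bytesPerDim() == 4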
@ -144,28 +144,28 @@ final class SimpleTextBKDWriter implements Closeable {
|
||||||
this.maxDoc = maxDoc;
|
this.maxDoc = maxDoc;
|
||||||
docsSeen = new FixedBitSet(maxDoc);
|
docsSeen = new FixedBitSet(maxDoc);
|
||||||
|
|
||||||
scratchDiff = new byte[config.bytesPerDim];
|
scratchDiff = new byte[config.bytesPerDim()];
|
||||||
scratch1 = new byte[config.packedBytesLength];
|
scratch1 = new byte[config.packedBytesLength()];
|
||||||
scratch2 = new byte[config.packedBytesLength];
|
scratch2 = new byte[config.packedBytesLength()];
|
||||||
commonPrefixLengths = new int[config.numDims];
|
commonPrefixLengths = new int[config.numDims()];
|
||||||
|
|
||||||
minPackedValue = new byte[config.packedIndexBytesLength];
|
minPackedValue = new byte[config.packedIndexBytesLength()];
|
||||||
maxPackedValue = new byte[config.packedIndexBytesLength];
|
maxPackedValue = new byte[config.packedIndexBytesLength()];
|
||||||
|
|
||||||
// Maximum number of points we hold in memory at any time
|
// Maximum number of points we hold in memory at any time
|
||||||
maxPointsSortInHeap =
|
maxPointsSortInHeap =
|
||||||
(int) ((maxMBSortInHeap * 1024 * 1024) / (config.bytesPerDoc * config.numDims));
|
(int) ((maxMBSortInHeap * 1024 * 1024) / (config.bytesPerDoc() * config.numDims()));
|
||||||
|
|
||||||
// Finally, we must be able to hold at least the leaf node in heap during build:
|
// Finally, we must be able to hold at least the leaf node in heap during build:
|
||||||
if (maxPointsSortInHeap < config.maxPointsInLeafNode) {
|
if (maxPointsSortInHeap < config.maxPointsInLeafNode()) {
|
||||||
throw new IllegalArgumentException(
|
throw new IllegalArgumentException(
|
||||||
"maxMBSortInHeap="
|
"maxMBSortInHeap="
|
||||||
+ maxMBSortInHeap
|
+ maxMBSortInHeap
|
||||||
+ " only allows for maxPointsSortInHeap="
|
+ " only allows for maxPointsSortInHeap="
|
||||||
+ maxPointsSortInHeap
|
+ maxPointsSortInHeap
|
||||||
+ ", but this is less than config.maxPointsInLeafNode="
|
+ ", but this is less than config.maxPointsInLeafNode()="
|
||||||
+ config.maxPointsInLeafNode
|
+ config.maxPointsInLeafNode()
|
||||||
+ "; either increase maxMBSortInHeap or decrease config.maxPointsInLeafNode");
|
+ "; either increase maxMBSortInHeap or decrease config.maxPointsInLeafNode()");
|
||||||
}
|
}
|
||||||
|
|
||||||
this.maxMBSortInHeap = maxMBSortInHeap;
|
this.maxMBSortInHeap = maxMBSortInHeap;
|
||||||
|
@ -183,10 +183,10 @@ final class SimpleTextBKDWriter implements Closeable {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void add(byte[] packedValue, int docID) throws IOException {
|
public void add(byte[] packedValue, int docID) throws IOException {
|
||||||
if (packedValue.length != config.packedBytesLength) {
|
if (packedValue.length != config.packedBytesLength()) {
|
||||||
throw new IllegalArgumentException(
|
throw new IllegalArgumentException(
|
||||||
"packedValue should be length="
|
"packedValue should be length="
|
||||||
+ config.packedBytesLength
|
+ config.packedBytesLength()
|
||||||
+ " (got: "
|
+ " (got: "
|
||||||
+ packedValue.length
|
+ packedValue.length
|
||||||
+ ")");
|
+ ")");
|
||||||
|
@ -209,30 +209,30 @@ final class SimpleTextBKDWriter implements Closeable {
|
||||||
} else {
|
} else {
|
||||||
pointWriter = new HeapPointWriter(config, Math.toIntExact(totalPointCount));
|
pointWriter = new HeapPointWriter(config, Math.toIntExact(totalPointCount));
|
||||||
}
|
}
|
||||||
System.arraycopy(packedValue, 0, minPackedValue, 0, config.packedIndexBytesLength);
|
System.arraycopy(packedValue, 0, minPackedValue, 0, config.packedIndexBytesLength());
|
||||||
System.arraycopy(packedValue, 0, maxPackedValue, 0, config.packedIndexBytesLength);
|
System.arraycopy(packedValue, 0, maxPackedValue, 0, config.packedIndexBytesLength());
|
||||||
} else {
|
} else {
|
||||||
for (int dim = 0; dim < config.numIndexDims; dim++) {
|
for (int dim = 0; dim < config.numIndexDims(); dim++) {
|
||||||
int offset = dim * config.bytesPerDim;
|
int offset = dim * config.bytesPerDim();
|
||||||
if (Arrays.compareUnsigned(
|
if (Arrays.compareUnsigned(
|
||||||
packedValue,
|
packedValue,
|
||||||
offset,
|
offset,
|
||||||
offset + config.bytesPerDim,
|
offset + config.bytesPerDim(),
|
||||||
minPackedValue,
|
minPackedValue,
|
||||||
offset,
|
offset,
|
||||||
offset + config.bytesPerDim)
|
offset + config.bytesPerDim())
|
||||||
< 0) {
|
< 0) {
|
||||||
System.arraycopy(packedValue, offset, minPackedValue, offset, config.bytesPerDim);
|
System.arraycopy(packedValue, offset, minPackedValue, offset, config.bytesPerDim());
|
||||||
}
|
}
|
||||||
if (Arrays.compareUnsigned(
|
if (Arrays.compareUnsigned(
|
||||||
packedValue,
|
packedValue,
|
||||||
offset,
|
offset,
|
||||||
offset + config.bytesPerDim,
|
offset + config.bytesPerDim(),
|
||||||
maxPackedValue,
|
maxPackedValue,
|
||||||
offset,
|
offset,
|
||||||
offset + config.bytesPerDim)
|
offset + config.bytesPerDim())
|
||||||
> 0) {
|
> 0) {
|
||||||
System.arraycopy(packedValue, offset, maxPackedValue, offset, config.bytesPerDim);
|
System.arraycopy(packedValue, offset, maxPackedValue, offset, config.bytesPerDim());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -254,7 +254,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
||||||
*/
|
*/
|
||||||
public long writeField(IndexOutput out, String fieldName, MutablePointTree reader)
|
public long writeField(IndexOutput out, String fieldName, MutablePointTree reader)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
if (config.numIndexDims == 1) {
|
if (config.numIndexDims() == 1) {
|
||||||
return writeField1Dim(out, fieldName, reader);
|
return writeField1Dim(out, fieldName, reader);
|
||||||
} else {
|
} else {
|
||||||
return writeFieldNDims(out, fieldName, reader);
|
return writeFieldNDims(out, fieldName, reader);
|
||||||
|
@ -280,7 +280,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
||||||
long countPerLeaf = pointCount = values.size();
|
long countPerLeaf = pointCount = values.size();
|
||||||
long innerNodeCount = 1;
|
long innerNodeCount = 1;
|
||||||
|
|
||||||
while (countPerLeaf > config.maxPointsInLeafNode) {
|
while (countPerLeaf > config.maxPointsInLeafNode()) {
|
||||||
countPerLeaf = (countPerLeaf + 1) / 2;
|
countPerLeaf = (countPerLeaf + 1) / 2;
|
||||||
innerNodeCount *= 2;
|
innerNodeCount *= 2;
|
||||||
}
|
}
|
||||||
|
@@ -289,7 +289,7 @@ final class SimpleTextBKDWriter implements Closeable {

 checkMaxLeafNodeCount(numLeaves);

-final byte[] splitPackedValues = new byte[numLeaves * (config.bytesPerDim + 1)];
+final byte[] splitPackedValues = new byte[numLeaves * (config.bytesPerDim() + 1)];
 final long[] leafBlockFPs = new long[numLeaves];

 // compute the min/max for this slice
@@ -297,37 +297,37 @@ final class SimpleTextBKDWriter implements Closeable {
 Arrays.fill(maxPackedValue, (byte) 0);
 for (int i = 0; i < Math.toIntExact(pointCount); ++i) {
 values.getValue(i, scratchBytesRef1);
-for (int dim = 0; dim < config.numIndexDims; dim++) {
+for (int dim = 0; dim < config.numIndexDims(); dim++) {
-int offset = dim * config.bytesPerDim;
+int offset = dim * config.bytesPerDim();
 if (Arrays.compareUnsigned(
 scratchBytesRef1.bytes,
 scratchBytesRef1.offset + offset,
-scratchBytesRef1.offset + offset + config.bytesPerDim,
+scratchBytesRef1.offset + offset + config.bytesPerDim(),
 minPackedValue,
 offset,
-offset + config.bytesPerDim)
+offset + config.bytesPerDim())
 < 0) {
 System.arraycopy(
 scratchBytesRef1.bytes,
 scratchBytesRef1.offset + offset,
 minPackedValue,
 offset,
-config.bytesPerDim);
+config.bytesPerDim());
 }
 if (Arrays.compareUnsigned(
 scratchBytesRef1.bytes,
 scratchBytesRef1.offset + offset,
-scratchBytesRef1.offset + offset + config.bytesPerDim,
+scratchBytesRef1.offset + offset + config.bytesPerDim(),
 maxPackedValue,
 offset,
-offset + config.bytesPerDim)
+offset + config.bytesPerDim())
 > 0) {
 System.arraycopy(
 scratchBytesRef1.bytes,
 scratchBytesRef1.offset + offset,
 maxPackedValue,
 offset,
-config.bytesPerDim);
+config.bytesPerDim());
 }
 }

@@ -345,7 +345,7 @@ final class SimpleTextBKDWriter implements Closeable {
 maxPackedValue,
 splitPackedValues,
 leafBlockFPs,
-new int[config.maxPointsInLeafNode]);
+new int[config.maxPointsInLeafNode()]);

 long indexFP = out.getFilePointer();
 writeIndex(out, leafBlockFPs, splitPackedValues, Math.toIntExact(countPerLeaf));
@@ -387,15 +387,15 @@ final class SimpleTextBKDWriter implements Closeable {
 final IndexOutput out;
 final List<Long> leafBlockFPs = new ArrayList<>();
 final List<byte[]> leafBlockStartValues = new ArrayList<>();
-final byte[] leafValues = new byte[config.maxPointsInLeafNode * config.packedBytesLength];
+final byte[] leafValues = new byte[config.maxPointsInLeafNode() * config.packedBytesLength()];
-final int[] leafDocs = new int[config.maxPointsInLeafNode];
+final int[] leafDocs = new int[config.maxPointsInLeafNode()];
 long valueCount;
 int leafCount;

 OneDimensionBKDWriter(IndexOutput out) {
-if (config.numIndexDims != 1) {
+if (config.numIndexDims() != 1) {
 throw new UnsupportedOperationException(
-"config.numIndexDims must be 1 but got " + config.numIndexDims);
+"config.numIndexDims() must be 1 but got " + config.numIndexDims());
 }
 if (pointCount != 0) {
 throw new IllegalStateException("cannot mix add and merge");
@@ -411,7 +411,7 @@ final class SimpleTextBKDWriter implements Closeable {

 this.out = out;

-lastPackedValue = new byte[config.packedBytesLength];
+lastPackedValue = new byte[config.packedBytesLength()];
 }

 // for asserts
@@ -426,8 +426,8 @@ final class SimpleTextBKDWriter implements Closeable {
 packedValue,
 0,
 leafValues,
-leafCount * config.packedBytesLength,
+leafCount * config.packedBytesLength(),
-config.packedBytesLength);
+config.packedBytesLength());
 leafDocs[leafCount] = docID;
 docsSeen.set(docID);
 leafCount++;
@@ -441,7 +441,7 @@ final class SimpleTextBKDWriter implements Closeable {
 + " values");
 }

-if (leafCount == config.maxPointsInLeafNode) {
+if (leafCount == config.maxPointsInLeafNode()) {
 // We write a block once we hit exactly the max count ... this is different from
 // when we flush a new segment, where we write between max/2 and max per leaf block,
 // so merged segments will behave differently from newly flushed segments:
@@ -471,43 +471,44 @@ final class SimpleTextBKDWriter implements Closeable {
 // System.out.println("BKDW: now rotate numInnerNodes=" + numInnerNodes + " leafBlockStarts="
 // + leafBlockStartValues.size());

-byte[] index = new byte[(1 + numInnerNodes) * (1 + config.bytesPerDim)];
+byte[] index = new byte[(1 + numInnerNodes) * (1 + config.bytesPerDim())];
 rotateToTree(1, 0, numInnerNodes, index, leafBlockStartValues);
 long[] arr = new long[leafBlockFPs.size()];
 for (int i = 0; i < leafBlockFPs.size(); i++) {
 arr[i] = leafBlockFPs.get(i);
 }
-writeIndex(out, arr, index, config.maxPointsInLeafNode);
+writeIndex(out, arr, index, config.maxPointsInLeafNode());
 return indexFP;
 }

 private void writeLeafBlock() throws IOException {
 assert leafCount != 0;
 if (valueCount == 0) {
-System.arraycopy(leafValues, 0, minPackedValue, 0, config.packedIndexBytesLength);
+System.arraycopy(leafValues, 0, minPackedValue, 0, config.packedIndexBytesLength());
 }
 System.arraycopy(
 leafValues,
-(leafCount - 1) * config.packedBytesLength,
+(leafCount - 1) * config.packedBytesLength(),
 maxPackedValue,
 0,
-config.packedIndexBytesLength);
+config.packedIndexBytesLength());

 valueCount += leafCount;

 if (leafBlockFPs.size() > 0) {
 // Save the first (minimum) value in each leaf block except the first, to build the split
 // value index in the end:
-leafBlockStartValues.add(ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength));
+leafBlockStartValues.add(
+    ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength()));
 }
 leafBlockFPs.add(out.getFilePointer());
 checkMaxLeafNodeCount(leafBlockFPs.size());

-Arrays.fill(commonPrefixLengths, config.bytesPerDim);
+Arrays.fill(commonPrefixLengths, config.bytesPerDim());
 // Find per-dim common prefix:
-for (int dim = 0; dim < config.numDims; dim++) {
+for (int dim = 0; dim < config.numDims(); dim++) {
-int offset1 = dim * config.bytesPerDim;
+int offset1 = dim * config.bytesPerDim();
-int offset2 = (leafCount - 1) * config.packedBytesLength + offset1;
+int offset2 = (leafCount - 1) * config.packedBytesLength() + offset1;
 for (int j = 0; j < commonPrefixLengths[dim]; j++) {
 if (leafValues[offset1 + j] != leafValues[offset2 + j]) {
 commonPrefixLengths[dim] = j;
@@ -523,24 +524,24 @@ final class SimpleTextBKDWriter implements Closeable {
 final BytesRef scratch = new BytesRef();

 {
-scratch.length = config.packedBytesLength;
+scratch.length = config.packedBytesLength();
 scratch.bytes = leafValues;
 }

 @Override
 public BytesRef apply(int i) {
-scratch.offset = config.packedBytesLength * i;
+scratch.offset = config.packedBytesLength() * i;
 return scratch;
 }
 };
 assert valuesInOrderAndBounds(
 leafCount,
 0,
-ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength),
+ArrayUtil.copyOfSubArray(leafValues, 0, config.packedBytesLength()),
 ArrayUtil.copyOfSubArray(
 leafValues,
-(leafCount - 1) * config.packedBytesLength,
+(leafCount - 1) * config.packedBytesLength(),
-leafCount * config.packedBytesLength),
+leafCount * config.packedBytesLength()),
 packedValues,
 leafDocs,
 0);
@@ -552,7 +553,7 @@ final class SimpleTextBKDWriter implements Closeable {
 private void rotateToTree(
 int nodeID, int offset, int count, byte[] index, List<byte[]> leafBlockStartValues) {
 // System.out.println("ROTATE: nodeID=" + nodeID + " offset=" + offset + " count=" + count + "
-// bpd=" + config.bytesPerDim + " index.length=" + index.length);
+// bpd=" + config.bytesPerDim() + " index.length=" + index.length);
 if (count == 1) {
 // Leaf index node
 // System.out.println(" leaf index node");
@@ -561,8 +562,8 @@ final class SimpleTextBKDWriter implements Closeable {
 leafBlockStartValues.get(offset),
 0,
 index,
-nodeID * (1 + config.bytesPerDim) + 1,
+nodeID * (1 + config.bytesPerDim()) + 1,
-config.bytesPerDim);
+config.bytesPerDim());
 } else if (count > 1) {
 // Internal index node: binary partition of count
 int countAtLevel = 1;
@@ -587,8 +588,8 @@ final class SimpleTextBKDWriter implements Closeable {
 leafBlockStartValues.get(rootOffset),
 0,
 index,
-nodeID * (1 + config.bytesPerDim) + 1,
+nodeID * (1 + config.bytesPerDim()) + 1,
-config.bytesPerDim);
+config.bytesPerDim());
 // System.out.println(" index[" + nodeID + "] = blockStartValues[" + rootOffset + "]");

 // TODO: we could optimize/specialize, when we know it's simply fully balanced binary tree
@@ -611,10 +612,10 @@ final class SimpleTextBKDWriter implements Closeable {
 }

 private void checkMaxLeafNodeCount(int numLeaves) {
-if ((1 + config.bytesPerDim) * (long) numLeaves > ArrayUtil.MAX_ARRAY_LENGTH) {
+if ((1 + config.bytesPerDim()) * (long) numLeaves > ArrayUtil.MAX_ARRAY_LENGTH) {
 throw new IllegalStateException(
-"too many nodes; increase config.maxPointsInLeafNode (currently "
+"too many nodes; increase config.maxPointsInLeafNode() (currently "
-+ config.maxPointsInLeafNode
++ config.maxPointsInLeafNode()
 + ") and reindex");
 }
 }
@@ -652,7 +653,7 @@ final class SimpleTextBKDWriter implements Closeable {
 long countPerLeaf = pointCount;
 long innerNodeCount = 1;

-while (countPerLeaf > config.maxPointsInLeafNode) {
+while (countPerLeaf > config.maxPointsInLeafNode()) {
 countPerLeaf = (countPerLeaf + 1) / 2;
 innerNodeCount *= 2;
 }
@@ -667,20 +668,20 @@ final class SimpleTextBKDWriter implements Closeable {

 // Indexed by nodeID, but first (root) nodeID is 1. We do 1+ because the lead byte at each
 // recursion says which dim we split on.
-byte[] splitPackedValues = new byte[Math.multiplyExact(numLeaves, 1 + config.bytesPerDim)];
+byte[] splitPackedValues = new byte[Math.multiplyExact(numLeaves, 1 + config.bytesPerDim())];

 // +1 because leaf count is power of 2 (e.g. 8), and innerNodeCount is power of 2 minus 1 (e.g.
 // 7)
 long[] leafBlockFPs = new long[numLeaves];

 // Make sure the math above "worked":
-assert pointCount / numLeaves <= config.maxPointsInLeafNode
+assert pointCount / numLeaves <= config.maxPointsInLeafNode()
 : "pointCount="
 + pointCount
 + " numLeaves="
 + numLeaves
-+ " config.maxPointsInLeafNode="
++ " config.maxPointsInLeafNode()="
-+ config.maxPointsInLeafNode;
++ config.maxPointsInLeafNode();

 // We re-use the selector so we do not need to create an object every time.
 BKDRadixSelector radixSelector =
@@ -699,7 +700,7 @@ final class SimpleTextBKDWriter implements Closeable {
 maxPackedValue,
 splitPackedValues,
 leafBlockFPs,
-new int[config.maxPointsInLeafNode]);
+new int[config.maxPointsInLeafNode()]);

 // If no exception, we should have cleaned everything up:
 assert tempDir.getCreatedFiles().isEmpty();
@@ -724,15 +725,15 @@ final class SimpleTextBKDWriter implements Closeable {
 IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues, int maxPointsInLeafNode)
 throws IOException {
 write(out, NUM_DATA_DIMS);
-writeInt(out, config.numDims);
+writeInt(out, config.numDims());
 newline(out);

 write(out, NUM_INDEX_DIMS);
-writeInt(out, config.numIndexDims);
+writeInt(out, config.numIndexDims());
 newline(out);

 write(out, BYTES_PER_DIM);
-writeInt(out, config.bytesPerDim);
+writeInt(out, config.bytesPerDim());
 newline(out);

 write(out, MAX_LEAF_POINTS);
@@ -767,8 +768,8 @@ final class SimpleTextBKDWriter implements Closeable {
 newline(out);
 }

-assert (splitPackedValues.length % (1 + config.bytesPerDim)) == 0;
+assert (splitPackedValues.length % (1 + config.bytesPerDim())) == 0;
-int count = splitPackedValues.length / (1 + config.bytesPerDim);
+int count = splitPackedValues.length / (1 + config.bytesPerDim());
 assert count == leafBlockFPs.length;

 write(out, SPLIT_COUNT);
@@ -777,10 +778,12 @@ final class SimpleTextBKDWriter implements Closeable {

 for (int i = 0; i < count; i++) {
 write(out, SPLIT_DIM);
-writeInt(out, splitPackedValues[i * (1 + config.bytesPerDim)] & 0xff);
+writeInt(out, splitPackedValues[i * (1 + config.bytesPerDim())] & 0xff);
 newline(out);
 write(out, SPLIT_VALUE);
-br = new BytesRef(splitPackedValues, 1 + (i * (1 + config.bytesPerDim)), config.bytesPerDim);
+br =
+    new BytesRef(
+        splitPackedValues, 1 + (i * (1 + config.bytesPerDim())), config.bytesPerDim());
 write(out, br.toString());
 newline(out);
 }
@@ -852,25 +855,25 @@ final class SimpleTextBKDWriter implements Closeable {
 /** Called only in assert */
 private boolean valueInBounds(
 BytesRef packedValue, byte[] minPackedValue, byte[] maxPackedValue) {
-for (int dim = 0; dim < config.numIndexDims; dim++) {
+for (int dim = 0; dim < config.numIndexDims(); dim++) {
-int offset = config.bytesPerDim * dim;
+int offset = config.bytesPerDim() * dim;
 if (Arrays.compareUnsigned(
 packedValue.bytes,
 packedValue.offset + offset,
-packedValue.offset + offset + config.bytesPerDim,
+packedValue.offset + offset + config.bytesPerDim(),
 minPackedValue,
 offset,
-offset + config.bytesPerDim)
+offset + config.bytesPerDim())
 < 0) {
 return false;
 }
 if (Arrays.compareUnsigned(
 packedValue.bytes,
 packedValue.offset + offset,
-packedValue.offset + offset + config.bytesPerDim,
+packedValue.offset + offset + config.bytesPerDim(),
 maxPackedValue,
 offset,
-offset + config.bytesPerDim)
+offset + config.bytesPerDim())
 > 0) {
 return false;
 }
@@ -882,13 +885,13 @@ final class SimpleTextBKDWriter implements Closeable {
 protected int split(byte[] minPackedValue, byte[] maxPackedValue) {
 // Find which dim has the largest span so we can split on it:
 int splitDim = -1;
-for (int dim = 0; dim < config.numIndexDims; dim++) {
+for (int dim = 0; dim < config.numIndexDims(); dim++) {
-NumericUtils.subtract(config.bytesPerDim, dim, maxPackedValue, minPackedValue, scratchDiff);
+NumericUtils.subtract(config.bytesPerDim(), dim, maxPackedValue, minPackedValue, scratchDiff);
 if (splitDim == -1
 || Arrays.compareUnsigned(
-scratchDiff, 0, config.bytesPerDim, scratch1, 0, config.bytesPerDim)
+scratchDiff, 0, config.bytesPerDim(), scratch1, 0, config.bytesPerDim())
 > 0) {
-System.arraycopy(scratchDiff, 0, scratch1, 0, config.bytesPerDim);
+System.arraycopy(scratchDiff, 0, scratch1, 0, config.bytesPerDim());
 splitDim = dim;
 }
 }
@@ -931,15 +934,15 @@ final class SimpleTextBKDWriter implements Closeable {
 if (nodeID >= leafNodeOffset) {
 // leaf node
 final int count = to - from;
-assert count <= config.maxPointsInLeafNode;
+assert count <= config.maxPointsInLeafNode();

 // Compute common prefixes
-Arrays.fill(commonPrefixLengths, config.bytesPerDim);
+Arrays.fill(commonPrefixLengths, config.bytesPerDim());
 reader.getValue(from, scratchBytesRef1);
 for (int i = from + 1; i < to; ++i) {
 reader.getValue(i, scratchBytesRef2);
-for (int dim = 0; dim < config.numDims; dim++) {
+for (int dim = 0; dim < config.numDims(); dim++) {
-final int offset = dim * config.bytesPerDim;
+final int offset = dim * config.bytesPerDim();
 for (int j = 0; j < commonPrefixLengths[dim]; j++) {
 if (scratchBytesRef1.bytes[scratchBytesRef1.offset + offset + j]
 != scratchBytesRef2.bytes[scratchBytesRef2.offset + offset + j]) {
@@ -951,23 +954,23 @@ final class SimpleTextBKDWriter implements Closeable {
 }

 // Find the dimension that has the least number of unique bytes at commonPrefixLengths[dim]
-FixedBitSet[] usedBytes = new FixedBitSet[config.numDims];
+FixedBitSet[] usedBytes = new FixedBitSet[config.numDims()];
-for (int dim = 0; dim < config.numDims; ++dim) {
+for (int dim = 0; dim < config.numDims(); ++dim) {
-if (commonPrefixLengths[dim] < config.bytesPerDim) {
+if (commonPrefixLengths[dim] < config.bytesPerDim()) {
 usedBytes[dim] = new FixedBitSet(256);
 }
 }
 for (int i = from + 1; i < to; ++i) {
-for (int dim = 0; dim < config.numDims; dim++) {
+for (int dim = 0; dim < config.numDims(); dim++) {
 if (usedBytes[dim] != null) {
-byte b = reader.getByteAt(i, dim * config.bytesPerDim + commonPrefixLengths[dim]);
+byte b = reader.getByteAt(i, dim * config.bytesPerDim() + commonPrefixLengths[dim]);
 usedBytes[dim].set(Byte.toUnsignedInt(b));
 }
 }
 }
 int sortedDim = 0;
 int sortedDimCardinality = Integer.MAX_VALUE;
-for (int dim = 0; dim < config.numDims; ++dim) {
+for (int dim = 0; dim < config.numDims(); ++dim) {
 if (usedBytes[dim] != null) {
 final int cardinality = usedBytes[dim].cardinality();
 if (cardinality < sortedDimCardinality) {
@@ -1001,7 +1004,7 @@ final class SimpleTextBKDWriter implements Closeable {
 // Write the common prefixes:
 reader.getValue(from, scratchBytesRef1);
 System.arraycopy(
-scratchBytesRef1.bytes, scratchBytesRef1.offset, scratch1, 0, config.packedBytesLength);
+scratchBytesRef1.bytes, scratchBytesRef1.offset, scratch1, 0, config.packedBytesLength());

 // Write the full values:
 IntFunction<BytesRef> packedValues =
@@ -1023,10 +1026,10 @@ final class SimpleTextBKDWriter implements Closeable {
 final int splitDim = split(minPackedValue, maxPackedValue);
 final int mid = (from + to + 1) >>> 1;

-int commonPrefixLen = config.bytesPerDim;
+int commonPrefixLen = config.bytesPerDim();
-for (int i = 0; i < config.bytesPerDim; ++i) {
+for (int i = 0; i < config.bytesPerDim(); ++i) {
-if (minPackedValue[splitDim * config.bytesPerDim + i]
+if (minPackedValue[splitDim * config.bytesPerDim() + i]
-!= maxPackedValue[splitDim * config.bytesPerDim + i]) {
+!= maxPackedValue[splitDim * config.bytesPerDim() + i]) {
 commonPrefixLen = i;
 break;
 }
@@ -1044,32 +1047,32 @@ final class SimpleTextBKDWriter implements Closeable {
 scratchBytesRef2);

 // set the split value
-final int address = nodeID * (1 + config.bytesPerDim);
+final int address = nodeID * (1 + config.bytesPerDim());
 splitPackedValues[address] = (byte) splitDim;
 reader.getValue(mid, scratchBytesRef1);
 System.arraycopy(
 scratchBytesRef1.bytes,
-scratchBytesRef1.offset + splitDim * config.bytesPerDim,
+scratchBytesRef1.offset + splitDim * config.bytesPerDim(),
 splitPackedValues,
 address + 1,
-config.bytesPerDim);
+config.bytesPerDim());

 byte[] minSplitPackedValue =
-ArrayUtil.copyOfSubArray(minPackedValue, 0, config.packedIndexBytesLength);
+ArrayUtil.copyOfSubArray(minPackedValue, 0, config.packedIndexBytesLength());
 byte[] maxSplitPackedValue =
-ArrayUtil.copyOfSubArray(maxPackedValue, 0, config.packedIndexBytesLength);
+ArrayUtil.copyOfSubArray(maxPackedValue, 0, config.packedIndexBytesLength());
 System.arraycopy(
 scratchBytesRef1.bytes,
-scratchBytesRef1.offset + splitDim * config.bytesPerDim,
+scratchBytesRef1.offset + splitDim * config.bytesPerDim(),
 minSplitPackedValue,
-splitDim * config.bytesPerDim,
+splitDim * config.bytesPerDim(),
-config.bytesPerDim);
+config.bytesPerDim());
 System.arraycopy(
 scratchBytesRef1.bytes,
-scratchBytesRef1.offset + splitDim * config.bytesPerDim,
+scratchBytesRef1.offset + splitDim * config.bytesPerDim(),
 maxSplitPackedValue,
-splitDim * config.bytesPerDim,
+splitDim * config.bytesPerDim(),
-config.bytesPerDim);
+config.bytesPerDim());

 // recurse
 build(
@@ -1137,17 +1140,17 @@ final class SimpleTextBKDWriter implements Closeable {

 int sortedDim = 0;
 int sortedDimCardinality = Integer.MAX_VALUE;
-FixedBitSet[] usedBytes = new FixedBitSet[config.numDims];
+FixedBitSet[] usedBytes = new FixedBitSet[config.numDims()];
-for (int dim = 0; dim < config.numDims; ++dim) {
+for (int dim = 0; dim < config.numDims(); ++dim) {
-if (commonPrefixLengths[dim] < config.bytesPerDim) {
+if (commonPrefixLengths[dim] < config.bytesPerDim()) {
 usedBytes[dim] = new FixedBitSet(256);
 }
 }
 // Find the dimension to compress
-for (int dim = 0; dim < config.numDims; dim++) {
+for (int dim = 0; dim < config.numDims(); dim++) {
 int prefix = commonPrefixLengths[dim];
-if (prefix < config.bytesPerDim) {
+if (prefix < config.bytesPerDim()) {
-int offset = dim * config.bytesPerDim;
+int offset = dim * config.bytesPerDim();
 for (int i = 0; i < heapSource.count(); ++i) {
 PointValue value = heapSource.getPackedValueSlice(i);
 BytesRef packedValue = value.packedValue();
@@ -1190,7 +1193,7 @@ final class SimpleTextBKDWriter implements Closeable {
 final BytesRef scratch = new BytesRef();

 {
-scratch.length = config.packedBytesLength;
+scratch.length = config.packedBytesLength();
 }

 @Override
@@ -1207,7 +1210,7 @@ final class SimpleTextBKDWriter implements Closeable {
 // Inner node: partition/recurse

 int splitDim;
-if (config.numIndexDims > 1) {
+if (config.numIndexDims() > 1) {
 splitDim = split(minPackedValue, maxPackedValue);
 } else {
 splitDim = 0;
@@ -1223,13 +1226,13 @@ final class SimpleTextBKDWriter implements Closeable {
 int commonPrefixLen =
 Arrays.mismatch(
 minPackedValue,
-splitDim * config.bytesPerDim,
+splitDim * config.bytesPerDim(),
-splitDim * config.bytesPerDim + config.bytesPerDim,
+splitDim * config.bytesPerDim() + config.bytesPerDim(),
 maxPackedValue,
-splitDim * config.bytesPerDim,
+splitDim * config.bytesPerDim(),
-splitDim * config.bytesPerDim + config.bytesPerDim);
+splitDim * config.bytesPerDim() + config.bytesPerDim());
 if (commonPrefixLen == -1) {
-commonPrefixLen = config.bytesPerDim;
+commonPrefixLen = config.bytesPerDim();
 }

 BKDRadixSelector.PathSlice[] pathSlices = new BKDRadixSelector.PathSlice[2];
@@ -1244,20 +1247,28 @@ final class SimpleTextBKDWriter implements Closeable {
 splitDim,
 commonPrefixLen);

-int address = nodeID * (1 + config.bytesPerDim);
+int address = nodeID * (1 + config.bytesPerDim());
 splitPackedValues[address] = (byte) splitDim;
-System.arraycopy(splitValue, 0, splitPackedValues, address + 1, config.bytesPerDim);
+System.arraycopy(splitValue, 0, splitPackedValues, address + 1, config.bytesPerDim());

-byte[] minSplitPackedValue = new byte[config.packedIndexBytesLength];
+byte[] minSplitPackedValue = new byte[config.packedIndexBytesLength()];
-System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, config.packedIndexBytesLength);
+System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, config.packedIndexBytesLength());

-byte[] maxSplitPackedValue = new byte[config.packedIndexBytesLength];
+byte[] maxSplitPackedValue = new byte[config.packedIndexBytesLength()];
-System.arraycopy(maxPackedValue, 0, maxSplitPackedValue, 0, config.packedIndexBytesLength);
+System.arraycopy(maxPackedValue, 0, maxSplitPackedValue, 0, config.packedIndexBytesLength());

 System.arraycopy(
-splitValue, 0, minSplitPackedValue, splitDim * config.bytesPerDim, config.bytesPerDim);
+splitValue,
+    0,
+    minSplitPackedValue,
+    splitDim * config.bytesPerDim(),
+    config.bytesPerDim());
 System.arraycopy(
-splitValue, 0, maxSplitPackedValue, splitDim * config.bytesPerDim, config.bytesPerDim);
+splitValue,
+    0,
+    maxSplitPackedValue,
+    splitDim * config.bytesPerDim(),
+    config.bytesPerDim());

 // Recurse on left tree:
 build(
@@ -1289,30 +1300,30 @@ final class SimpleTextBKDWriter implements Closeable {
 }

 private void computeCommonPrefixLength(HeapPointWriter heapPointWriter, byte[] commonPrefix) {
-Arrays.fill(commonPrefixLengths, config.bytesPerDim);
+Arrays.fill(commonPrefixLengths, config.bytesPerDim());
 PointValue value = heapPointWriter.getPackedValueSlice(0);
 BytesRef packedValue = value.packedValue();
-for (int dim = 0; dim < config.numDims; dim++) {
+for (int dim = 0; dim < config.numDims(); dim++) {
 System.arraycopy(
 packedValue.bytes,
-packedValue.offset + dim * config.bytesPerDim,
+packedValue.offset + dim * config.bytesPerDim(),
 commonPrefix,
-dim * config.bytesPerDim,
+dim * config.bytesPerDim(),
-config.bytesPerDim);
+config.bytesPerDim());
 }
 for (int i = 1; i < heapPointWriter.count(); i++) {
 value = heapPointWriter.getPackedValueSlice(i);
 packedValue = value.packedValue();
-for (int dim = 0; dim < config.numDims; dim++) {
+for (int dim = 0; dim < config.numDims(); dim++) {
 if (commonPrefixLengths[dim] != 0) {
 int j =
 Arrays.mismatch(
 commonPrefix,
-dim * config.bytesPerDim,
+dim * config.bytesPerDim(),
-dim * config.bytesPerDim + commonPrefixLengths[dim],
+dim * config.bytesPerDim() + commonPrefixLengths[dim],
 packedValue.bytes,
-packedValue.offset + dim * config.bytesPerDim,
+packedValue.offset + dim * config.bytesPerDim(),
-packedValue.offset + dim * config.bytesPerDim + commonPrefixLengths[dim]);
+packedValue.offset + dim * config.bytesPerDim() + commonPrefixLengths[dim]);
 if (j != -1) {
 commonPrefixLengths[dim] = j;
 }
@@ -1331,11 +1342,11 @@ final class SimpleTextBKDWriter implements Closeable {
 int[] docs,
 int docsOffset)
 throws IOException {
-byte[] lastPackedValue = new byte[config.packedBytesLength];
+byte[] lastPackedValue = new byte[config.packedBytesLength()];
 int lastDoc = -1;
 for (int i = 0; i < count; i++) {
 BytesRef packedValue = values.apply(i);
-assert packedValue.length == config.packedBytesLength;
+assert packedValue.length == config.packedBytesLength();
 assert valueInOrder(
 i,
 sortedDim,
@@ -1361,43 +1372,43 @@ final class SimpleTextBKDWriter implements Closeable {
 int packedValueOffset,
 int doc,
 int lastDoc) {
-int dimOffset = sortedDim * config.bytesPerDim;
+int dimOffset = sortedDim * config.bytesPerDim();
 if (ord > 0) {
 int cmp =
 Arrays.compareUnsigned(
 lastPackedValue,
 dimOffset,
-dimOffset + config.bytesPerDim,
+dimOffset + config.bytesPerDim(),
 packedValue,
 packedValueOffset + dimOffset,
-packedValueOffset + dimOffset + config.bytesPerDim);
+packedValueOffset + dimOffset + config.bytesPerDim());
 if (cmp > 0) {
 throw new AssertionError(
 "values out of order: last value="
 + new BytesRef(lastPackedValue)
 + " current value="
-+ new BytesRef(packedValue, packedValueOffset, config.packedBytesLength)
++ new BytesRef(packedValue, packedValueOffset, config.packedBytesLength())
 + " ord="
 + ord
 + " sortedDim="
 + sortedDim);
 }
-if (cmp == 0 && config.numDims > config.numIndexDims) {
+if (cmp == 0 && config.numDims() > config.numIndexDims()) {
-int dataOffset = config.numIndexDims * config.bytesPerDim;
+int dataOffset = config.numIndexDims() * config.bytesPerDim();
 cmp =
 Arrays.compareUnsigned(
 lastPackedValue,
 dataOffset,
-config.packedBytesLength,
+config.packedBytesLength(),
 packedValue,
 packedValueOffset + dataOffset,
-packedValueOffset + config.packedBytesLength);
+packedValueOffset + config.packedBytesLength());
 if (cmp > 0) {
 throw new AssertionError(
 "data values out of order: last value="
 + new BytesRef(lastPackedValue)
 + " current value="
-+ new BytesRef(packedValue, packedValueOffset, config.packedBytesLength)
++ new BytesRef(packedValue, packedValueOffset, config.packedBytesLength())
 + " ord="
 + ord);
 }
@@ -1414,7 +1425,8 @@ final class SimpleTextBKDWriter implements Closeable {
 + sortedDim);
 }
 }
-System.arraycopy(packedValue, packedValueOffset, lastPackedValue, 0, config.packedBytesLength);
+System.arraycopy(
+    packedValue, packedValueOffset, lastPackedValue, 0, config.packedBytesLength());
 return true;
 }

@@ -829,7 +829,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
 clone.seek(0);
 // checksum is fixed-width encoded with 20 bytes, plus 1 byte for newline (the space is included
 // in SimpleTextUtil.CHECKSUM):
-long footerStartPos = data.length() - (SimpleTextUtil.CHECKSUM.length + 21);
+long footerStartPos = clone.length() - (SimpleTextUtil.CHECKSUM.length + 21);
 ChecksumIndexInput input = new BufferedChecksumIndexInput(clone);
 while (true) {
 SimpleTextUtil.readLine(input, scratch);

@@ -227,7 +227,7 @@ class SimpleTextPointsReader extends PointsReader {

 // checksum is fixed-width encoded with 20 bytes, plus 1 byte for newline (the space is included
 // in SimpleTextUtil.CHECKSUM):
-long footerStartPos = dataIn.length() - (SimpleTextUtil.CHECKSUM.length + 21);
+long footerStartPos = clone.length() - (SimpleTextUtil.CHECKSUM.length + 21);
 ChecksumIndexInput input = new BufferedChecksumIndexInput(clone);
 while (true) {
 SimpleTextUtil.readLine(input, scratch);

@@ -17,13 +17,13 @@

 package org.apache.lucene.codecs.uniformsplit;

-import static org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.BLOCK_SIZE;
+import static org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.BLOCK_SIZE;

 import java.io.IOException;
 import org.apache.lucene.codecs.BlockTermState;
-import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
+import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
-import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
+import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
-import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
+import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.TermState;
@@ -34,7 +34,7 @@ import org.apache.lucene.util.RamUsageEstimator;

 /**
 * {@link TermState} serializer which encodes each file pointer as a delta relative to a base file
-* pointer. It differs from {@link Lucene99PostingsWriter#encodeTerm} which encodes each file
+* pointer. It differs from {@link Lucene912PostingsWriter#encodeTerm} which encodes each file
 * pointer as a delta relative to the previous file pointer.
 *
 * <p>It automatically sets the base file pointer to the first valid file pointer for doc start FP,
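The javadoc in the hunk above is the crux of this serializer: file pointers are written as deltas against a per-block base pointer rather than against the previous term's pointer, which presumably lets a term state be decoded without walking all the terms before it in the block. A small, self-contained sketch of the two schemes; the values are made up and this is not the actual Lucene serialization code:

```java
public class DeltaEncodingSketch {
  public static void main(String[] args) {
    // Hypothetical doc-start file pointers for four consecutive terms in one block.
    long[] docStartFPs = {1_000, 1_130, 1_190, 1_260};

    // Delta relative to the previous pointer (the style the javadoc attributes to
    // Lucene912PostingsWriter#encodeTerm): decoding term i means summing deltas 0..i.
    long prev = 0;
    for (long fp : docStartFPs) {
      System.out.println("delta to previous: " + (fp - prev));
      prev = fp;
    }

    // Delta relative to a base pointer (the DeltaBaseTermStateSerializer style):
    // each delta depends only on the block's base pointer.
    long base = docStartFPs[0];
    for (long fp : docStartFPs) {
      System.out.println("delta to base: " + (fp - base));
    }
  }
}
```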
@@ -95,7 +95,7 @@ public class DeltaBaseTermStateSerializer implements Accountable {
 /**
 * Writes a {@link BlockTermState} to the provided {@link DataOutput}.
 *
-* <p>Simpler variant of {@link Lucene99PostingsWriter#encodeTerm(DataOutput, FieldInfo,
+* <p>Simpler variant of {@link Lucene912PostingsWriter#encodeTerm(DataOutput, FieldInfo,
 * BlockTermState, boolean)}.
 */
 public void writeTermState(
@@ -140,15 +140,12 @@ public class DeltaBaseTermStateSerializer implements Accountable {
 termStatesOutput.writeVLong(intTermState.lastPosBlockOffset);
 }
 }
-if (intTermState.skipOffset != -1) {
-termStatesOutput.writeVLong(intTermState.skipOffset);
-}
 }

 /**
 * Reads a {@link BlockTermState} from the provided {@link DataInput}.
 *
-* <p>Simpler variant of {@link Lucene99PostingsReader#decodeTerm(DataInput, FieldInfo,
+* <p>Simpler variant of {@link Lucene912PostingsReader#decodeTerm(DataInput, FieldInfo,
 * BlockTermState, boolean)}.
 *
 * @param reuse {@link BlockTermState} to reuse; or null to create a new one.
@@ -190,9 +187,6 @@ public class DeltaBaseTermStateSerializer implements Accountable {
 intTermState.lastPosBlockOffset = termStatesInput.readVLong();
 }
 }
-if (intTermState.docFreq > BLOCK_SIZE) {
-intTermState.skipOffset = termStatesInput.readVLong();
-}
 return intTermState;
 }

@@ -210,7 +204,6 @@ public class DeltaBaseTermStateSerializer implements Accountable {
 termState.docStartFP = 0;
 termState.posStartFP = 0;
 termState.payStartFP = 0;
-termState.skipOffset = -1;
 termState.lastPosBlockOffset = -1;
 termState.singletonDocID = -1;

@@ -90,10 +90,15 @@ public class FSTDictionary implements IndexDictionary {
 }
 PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton();
 FST.FSTMetadata<Long> metadata = FST.readMetadata(fstDataInput, fstOutputs);
-FST<Long> fst =
+FST<Long> fst;
-isFSTOnHeap
+if (isFSTOnHeap) {
-? new FST<>(metadata, fstDataInput)
+fst = new FST<>(metadata, fstDataInput);
-: new FST<>(metadata, fstDataInput, new OffHeapFSTStore());
+} else {
+final IndexInput indexInput = (IndexInput) fstDataInput;
+fst =
+    FST.fromFSTReader(
+        metadata, new OffHeapFSTStore(indexInput, indexInput.getFilePointer(), metadata));
+}
 return new FSTDictionary(fst);
 }
@ -23,8 +23,8 @@ import org.apache.lucene.codecs.FieldsProducer;
|
||||||
import org.apache.lucene.codecs.PostingsFormat;
|
import org.apache.lucene.codecs.PostingsFormat;
|
||||||
import org.apache.lucene.codecs.PostingsReaderBase;
|
import org.apache.lucene.codecs.PostingsReaderBase;
|
||||||
import org.apache.lucene.codecs.PostingsWriterBase;
|
import org.apache.lucene.codecs.PostingsWriterBase;
|
||||||
import org.apache.lucene.codecs.lucene99.Lucene99PostingsReader;
|
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
|
||||||
import org.apache.lucene.codecs.lucene99.Lucene99PostingsWriter;
|
import org.apache.lucene.codecs.lucene912.Lucene912PostingsWriter;
|
||||||
import org.apache.lucene.index.SegmentReadState;
|
import org.apache.lucene.index.SegmentReadState;
|
||||||
import org.apache.lucene.index.SegmentWriteState;
|
import org.apache.lucene.index.SegmentWriteState;
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
@ -113,7 +113,7 @@ public class UniformSplitPostingsFormat extends PostingsFormat {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
|
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
|
||||||
PostingsWriterBase postingsWriter = new Lucene99PostingsWriter(state);
|
PostingsWriterBase postingsWriter = new Lucene912PostingsWriter(state);
|
||||||
boolean success = false;
|
boolean success = false;
|
||||||
try {
|
try {
|
||||||
FieldsConsumer termsWriter =
|
FieldsConsumer termsWriter =
|
||||||
|
@ -130,7 +130,7 @@ public class UniformSplitPostingsFormat extends PostingsFormat {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
|
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
|
||||||
PostingsReaderBase postingsReader = new Lucene99PostingsReader(state);
|
PostingsReaderBase postingsReader = new Lucene912PostingsReader(state);
|
||||||
boolean success = false;
|
boolean success = false;
|
||||||
try {
|
try {
|
||||||
FieldsProducer termsReader =
|
FieldsProducer termsReader =
|
||||||
|
|
|
@ -28,7 +28,7 @@
|
||||||
* org.apache.lucene.search.PhraseQuery})
|
* org.apache.lucene.search.PhraseQuery})
|
||||||
* <li>Quite efficient for {@link org.apache.lucene.search.PrefixQuery}
|
* <li>Quite efficient for {@link org.apache.lucene.search.PrefixQuery}
|
||||||
* <li>Not efficient for spell-check and {@link org.apache.lucene.search.FuzzyQuery}, in this case
|
* <li>Not efficient for spell-check and {@link org.apache.lucene.search.FuzzyQuery}, in this case
|
||||||
* prefer {@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat}
|
* prefer {@link org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat}
|
||||||
* </ul>
|
* </ul>
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.codecs.uniformsplit;
|
package org.apache.lucene.codecs.uniformsplit;
|
||||||
|
|
|
@ -20,11 +20,11 @@ package org.apache.lucene.codecs.uniformsplit.sharedterms;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.RandomAccess;
|
import java.util.RandomAccess;
|
||||||
|
import org.apache.lucene.index.BaseTermsEnum;
|
||||||
import org.apache.lucene.index.ImpactsEnum;
|
import org.apache.lucene.index.ImpactsEnum;
|
||||||
import org.apache.lucene.index.MergeState;
|
import org.apache.lucene.index.MergeState;
|
||||||
import org.apache.lucene.index.PostingsEnum;
|
import org.apache.lucene.index.PostingsEnum;
|
||||||
import org.apache.lucene.index.TermState;
|
import org.apache.lucene.index.TermState;
|
||||||
import org.apache.lucene.index.TermsEnum;
|
|
||||||
import org.apache.lucene.util.AttributeSource;
|
import org.apache.lucene.util.AttributeSource;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
|
||||||
|
@ -34,7 +34,7 @@ import org.apache.lucene.util.BytesRef;
|
||||||
*
|
*
|
||||||
* @lucene.experimental
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
class STMergingTermsEnum extends TermsEnum {
|
class STMergingTermsEnum extends BaseTermsEnum {
|
||||||
|
|
||||||
protected final String fieldName;
|
protected final String fieldName;
|
||||||
protected final MultiSegmentsPostingsEnum multiPostingsEnum;
|
protected final MultiSegmentsPostingsEnum multiPostingsEnum;
|
||||||
|
@ -63,11 +63,6 @@ class STMergingTermsEnum extends TermsEnum {
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean seekExact(BytesRef text) throws IOException {
|
|
||||||
throw new UnsupportedOperationException();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public SeekStatus seekCeil(BytesRef text) {
|
public SeekStatus seekCeil(BytesRef text) {
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
|
|
|
@ -22,7 +22,7 @@ import java.io.IOException;
|
||||||
import org.apache.lucene.codecs.Codec;
|
import org.apache.lucene.codecs.Codec;
|
||||||
import org.apache.lucene.codecs.FilterCodec;
|
import org.apache.lucene.codecs.FilterCodec;
|
||||||
import org.apache.lucene.codecs.KnnVectorsFormat;
|
import org.apache.lucene.codecs.KnnVectorsFormat;
|
||||||
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
|
import org.apache.lucene.codecs.lucene912.Lucene912Codec;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
import org.apache.lucene.document.KnnByteVectorField;
|
import org.apache.lucene.document.KnnByteVectorField;
|
||||||
|
@ -42,7 +42,7 @@ import org.apache.lucene.tests.index.BaseIndexFileFormatTestCase;
|
||||||
public class TestHnswBitVectorsFormat extends BaseIndexFileFormatTestCase {
|
public class TestHnswBitVectorsFormat extends BaseIndexFileFormatTestCase {
|
||||||
@Override
|
@Override
|
||||||
protected Codec getCodec() {
|
protected Codec getCodec() {
|
||||||
return new Lucene99Codec() {
|
return new Lucene912Codec() {
|
||||||
@Override
|
@Override
|
||||||
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
|
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
|
||||||
return new HnswBitVectorsFormat();
|
return new HnswBitVectorsFormat();
|
||||||
|
|
|
@ -17,7 +17,7 @@
|
||||||
|
|
||||||
package org.apache.lucene.codecs.lucene90.tests;
|
package org.apache.lucene.codecs.lucene90.tests;
|
||||||
|
|
||||||
import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat.IntBlockTermState;
|
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat.IntBlockTermState;
|
||||||
|
|
||||||
/** Test utility class to create mock {@link IntBlockTermState}. */
|
/** Test utility class to create mock {@link IntBlockTermState}. */
|
||||||
public class MockTermStateFactory {
|
public class MockTermStateFactory {
|
||||||
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
{
|
||||||
|
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForDeltaUtil.java": "5115b12ac31537ce31d73c0a279df92060749a3a",
|
||||||
|
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForDeltaUtil.py": "db6154406e68b80d2c90116b5d0bfa9ba220762a"
|
||||||
|
}
|
|
@ -1,4 +1,4 @@
|
||||||
{
|
{
|
||||||
"lucene/core/src/java/org/apache/lucene/codecs/lucene99/ForUtil.java": "1292ad354d255b1272ffd3db684aa2ddb2bc49ec",
|
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/ForUtil.java": "159e82388346fde147924d5e15ca65df4dd63b9a",
|
||||||
"lucene/core/src/java/org/apache/lucene/codecs/lucene99/gen_ForUtil.py": "ab7b63a1b73986cc04e43de1c8f474b97aef5116"
|
"lucene/core/src/java/org/apache/lucene/codecs/lucene912/gen_ForUtil.py": "66dc8813160feae2a37d8b50474f5f9830b6cb22"
|
||||||
}
|
}
|
|
@ -15,7 +15,7 @@
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
|
import org.apache.lucene.codecs.lucene912.Lucene912Codec;
|
||||||
|
|
||||||
/** Lucene Core. */
|
/** Lucene Core. */
|
||||||
@SuppressWarnings("module") // the test framework is compiled after the core...
|
@SuppressWarnings("module") // the test framework is compiled after the core...
|
||||||
|
@ -33,6 +33,7 @@ module org.apache.lucene.core {
|
||||||
exports org.apache.lucene.codecs.lucene94;
|
exports org.apache.lucene.codecs.lucene94;
|
||||||
exports org.apache.lucene.codecs.lucene95;
|
exports org.apache.lucene.codecs.lucene95;
|
||||||
exports org.apache.lucene.codecs.lucene99;
|
exports org.apache.lucene.codecs.lucene99;
|
||||||
|
exports org.apache.lucene.codecs.lucene912;
|
||||||
exports org.apache.lucene.codecs.perfield;
|
exports org.apache.lucene.codecs.perfield;
|
||||||
exports org.apache.lucene.codecs;
|
exports org.apache.lucene.codecs;
|
||||||
exports org.apache.lucene.document;
|
exports org.apache.lucene.document;
|
||||||
|
@ -71,7 +72,7 @@ module org.apache.lucene.core {
|
||||||
provides org.apache.lucene.analysis.TokenizerFactory with
|
provides org.apache.lucene.analysis.TokenizerFactory with
|
||||||
org.apache.lucene.analysis.standard.StandardTokenizerFactory;
|
org.apache.lucene.analysis.standard.StandardTokenizerFactory;
|
||||||
provides org.apache.lucene.codecs.Codec with
|
provides org.apache.lucene.codecs.Codec with
|
||||||
Lucene99Codec;
|
Lucene912Codec;
|
||||||
provides org.apache.lucene.codecs.DocValuesFormat with
|
provides org.apache.lucene.codecs.DocValuesFormat with
|
||||||
org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
|
org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
|
||||||
provides org.apache.lucene.codecs.KnnVectorsFormat with
|
provides org.apache.lucene.codecs.KnnVectorsFormat with
|
||||||
|
@@ -79,7 +80,7 @@ module org.apache.lucene.core {
       org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat,
       org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat;
   provides org.apache.lucene.codecs.PostingsFormat with
-      org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat;
+      org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
   provides org.apache.lucene.index.SortFieldProvider with
       org.apache.lucene.search.SortField.Provider,
       org.apache.lucene.search.SortedNumericSortField.Provider,
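Taken together, the module-info hunks above switch the descriptor to the lucene912 generation: the new package is exported and the new codec and postings format are registered with the JDK service loader via `provides ... with`. A minimal lookup sketch is below; it assumes both implementations register under the SPI name "Lucene912", which is consistent with the `LOADER.lookup("Lucene912")` change further down but not spelled out in these hunks:

    // Hedged sketch: resolve the SPI-registered codec and postings format by name.
    // Assumes both register under "Lucene912"; adjust if the actual SPI names differ.
    import org.apache.lucene.codecs.Codec;
    import org.apache.lucene.codecs.PostingsFormat;

    public class Lucene912SpiLookupSketch {
      public static void main(String[] args) {
        Codec codec = Codec.forName("Lucene912");
        PostingsFormat postings = PostingsFormat.forName("Lucene912");
        System.out.println(codec.getName() + " / " + postings.getName());
      }
    }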
@@ -55,7 +55,7 @@ public abstract class Codec implements NamedSPILoader.NamedSPI {
       return LOADER;
     }
 
-    static Codec defaultCodec = LOADER.lookup("Lucene99");
+    static Codec defaultCodec = LOADER.lookup("Lucene912");
   }
 
   private final String name;
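With the default flipped in `Codec`, new indexes pick up `Lucene912` automatically, while older codecs remain selectable per writer. A small sketch under that assumption, using the public `Codec.getDefault()` and `IndexWriterConfig.setCodec(...)` APIs:

    // Hedged sketch: the flipped default is picked up automatically, but a writer can still
    // pin a specific codec. The "Lucene99" name below is an assumption about what stays registered.
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.codecs.Codec;
    import org.apache.lucene.index.IndexWriterConfig;

    public class DefaultCodecSketch {
      public static void main(String[] args) {
        IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
        System.out.println("default codec: " + Codec.getDefault().getName()); // "Lucene912" after this change
        config.setCodec(Codec.forName("Lucene99")); // opt back into the previous codec explicitly
      }
    }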
@@ -18,8 +18,6 @@ package org.apache.lucene.codecs;
 
 import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
 import java.util.Comparator;
 import java.util.Iterator;
 import java.util.List;
@@ -106,7 +104,7 @@ public final class CompetitiveImpactAccumulator {
   }
 
   /** Get the set of competitive freq and norm pairs, ordered by increasing freq and norm. */
-  public Collection<Impact> getCompetitiveFreqNormPairs() {
+  public List<Impact> getCompetitiveFreqNormPairs() {
     List<Impact> impacts = new ArrayList<>();
     int maxFreqForLowerNorms = 0;
     for (int i = 0; i < maxFreqs.length; ++i) {
@@ -126,7 +124,7 @@ public final class CompetitiveImpactAccumulator {
     for (Impact impact : impacts) {
       add(impact, freqNormPairs);
     }
-    return Collections.unmodifiableSet(freqNormPairs);
+    return List.copyOf(freqNormPairs);
   }
 
   private void add(Impact newEntry, TreeSet<Impact> freqNormPairs) {
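The return type change from `Collection` to `List` goes hand in hand with swapping `Collections.unmodifiableSet` for `List.copyOf`: callers now receive an independent immutable snapshot in the `TreeSet`'s sorted order instead of a read-only view over the accumulator's internal set. A tiny illustration of that difference with plain integers (not Lucene's `Impact` type):

    // Hedged sketch: view vs. snapshot semantics, shown with simple integers.
    import java.util.Collection;
    import java.util.Collections;
    import java.util.List;
    import java.util.TreeSet;

    public class CopyVsViewSketch {
      public static void main(String[] args) {
        TreeSet<Integer> internal = new TreeSet<>(List.of(3, 1, 2));

        Collection<Integer> view = Collections.unmodifiableSet(internal);
        List<Integer> snapshot = List.copyOf(internal); // TreeSet iteration order: 1, 2, 3

        internal.add(4);
        System.out.println(view.contains(4));     // true  - the view tracks later mutations
        System.out.println(snapshot.contains(4)); // false - the copy is frozen at call time
      }
    }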
@@ -23,6 +23,7 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
+import org.apache.lucene.index.BaseTermsEnum;
 import org.apache.lucene.index.BinaryDocValues;
 import org.apache.lucene.index.DocIDMerger;
 import org.apache.lucene.index.DocValues;
@@ -498,7 +499,7 @@ public abstract class DocValuesConsumer implements Closeable {
    * {@link SortedDocValues#lookupOrd(int)} or {@link SortedSetDocValues#lookupOrd(long)} on every
    * call to {@link TermsEnum#next()}.
    */
-  private static class MergedTermsEnum extends TermsEnum {
+  private static class MergedTermsEnum extends BaseTermsEnum {
 
     private final TermsEnum[] subs;
     private final OrdinalMap ordinalMap;
@@ -542,11 +543,6 @@ public abstract class DocValuesConsumer implements Closeable {
       throw new UnsupportedOperationException();
     }
 
-    @Override
-    public boolean seekExact(BytesRef text) throws IOException {
-      throw new UnsupportedOperationException();
-    }
-
     @Override
     public SeekStatus seekCeil(BytesRef text) throws IOException {
       throw new UnsupportedOperationException();
@@ -557,11 +553,6 @@ public abstract class DocValuesConsumer implements Closeable {
       throw new UnsupportedOperationException();
     }
 
-    @Override
-    public void seekExact(BytesRef term, TermState state) throws IOException {
-      throw new UnsupportedOperationException();
-    }
-
     @Override
     public int docFreq() throws IOException {
       throw new UnsupportedOperationException();
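Both `seekExact` overrides can be deleted because `MergedTermsEnum` now extends `BaseTermsEnum`, which derives them from the abstract `seekCeil`; since this enum's `seekCeil` throws `UnsupportedOperationException`, the inherited methods fail the same way the removed ones did. A rough paraphrase of those defaults follows (simplified stand-in types, not Lucene's real hierarchy):

    // Hedged sketch, paraphrasing how BaseTermsEnum-style defaults route seekExact through seekCeil.
    // The types here are simplified stand-ins, not Lucene's actual TermsEnum classes.
    abstract class SeekByCeilSketch {
      enum SeekStatus { FOUND, NOT_FOUND, END }

      abstract SeekStatus seekCeil(String text);

      boolean seekExact(String text) {
        return seekCeil(text) == SeekStatus.FOUND; // exact seek expressed via seekCeil
      }

      void seekExact(String term, Object state) {
        if (!seekExact(term)) { // the TermState variant delegates to the plain exact seek
          throw new IllegalArgumentException("term=" + term + " does not exist");
        }
      }
    }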
@@ -20,17 +20,23 @@ package org.apache.lucene.codecs;
 import java.io.Closeable;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
+import java.util.Objects;
+import java.util.function.BiFunction;
 import org.apache.lucene.index.ByteVectorValues;
 import org.apache.lucene.index.DocIDMerger;
+import org.apache.lucene.index.DocsWithFieldSet;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FloatVectorValues;
 import org.apache.lucene.index.MergeState;
 import org.apache.lucene.index.Sorter;
 import org.apache.lucene.index.VectorEncoding;
+import org.apache.lucene.internal.hppc.IntIntHashMap;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.VectorScorer;
 import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.IOFunction;
 
 /** Writes vectors to an index. */
 public abstract class KnnVectorsWriter implements Accountable, Closeable {
@@ -107,11 +113,11 @@ public abstract class KnnVectorsWriter implements Accountable, Closeable {
   }
 
   /** Tracks state of one sub-reader that we are merging */
-  private static class VectorValuesSub extends DocIDMerger.Sub {
+  private static class FloatVectorValuesSub extends DocIDMerger.Sub {
 
     final FloatVectorValues values;
 
-    VectorValuesSub(MergeState.DocMap docMap, FloatVectorValues values) {
+    FloatVectorValuesSub(MergeState.DocMap docMap, FloatVectorValues values) {
       super(docMap);
       this.values = values;
       assert values.docID() == -1;
@@ -139,65 +145,139 @@ public abstract class KnnVectorsWriter implements Accountable, Closeable {
     }
   }
 
+  /**
+   * Given old doc ids and an id mapping, maps old ordinal to new ordinal. Note: this method return
+   * nothing and output are written to parameters
+   *
+   * @param oldDocIds the old or current document ordinals. Must not be null.
+   * @param sortMap the document sorting map for how to make the new ordinals. Must not be null.
+   * @param old2NewOrd int[] maps from old ord to new ord
+   * @param new2OldOrd int[] maps from new ord to old ord
+   * @param newDocsWithField set of new doc ids which has the value
+   */
+  public static void mapOldOrdToNewOrd(
+      DocsWithFieldSet oldDocIds,
+      Sorter.DocMap sortMap,
+      int[] old2NewOrd,
+      int[] new2OldOrd,
+      DocsWithFieldSet newDocsWithField)
+      throws IOException {
+    // TODO: a similar function exists in IncrementalHnswGraphMerger#getNewOrdMapping
+    // maybe we can do a further refactoring
+    Objects.requireNonNull(oldDocIds);
+    Objects.requireNonNull(sortMap);
+    assert (old2NewOrd != null || new2OldOrd != null || newDocsWithField != null);
+    assert (old2NewOrd == null || old2NewOrd.length == oldDocIds.cardinality());
+    assert (new2OldOrd == null || new2OldOrd.length == oldDocIds.cardinality());
+    IntIntHashMap newIdToOldOrd = new IntIntHashMap();
+    DocIdSetIterator iterator = oldDocIds.iterator();
+    int[] newDocIds = new int[oldDocIds.cardinality()];
+    int oldOrd = 0;
+    for (int oldDocId = iterator.nextDoc();
+        oldDocId != DocIdSetIterator.NO_MORE_DOCS;
+        oldDocId = iterator.nextDoc()) {
+      int newId = sortMap.oldToNew(oldDocId);
+      newIdToOldOrd.put(newId, oldOrd);
+      newDocIds[oldOrd] = newId;
+      oldOrd++;
+    }
+
+    Arrays.sort(newDocIds);
+    int newOrd = 0;
+    for (int newDocId : newDocIds) {
+      int currOldOrd = newIdToOldOrd.get(newDocId);
+      if (old2NewOrd != null) {
+        old2NewOrd[currOldOrd] = newOrd;
+      }
+      if (new2OldOrd != null) {
+        new2OldOrd[newOrd] = currOldOrd;
+      }
+      if (newDocsWithField != null) {
+        newDocsWithField.add(newDocId);
+      }
+      newOrd++;
+    }
+  }
+
   /** View over multiple vector values supporting iterator-style access via DocIdMerger. */
   public static final class MergedVectorValues {
     private MergedVectorValues() {}
 
-    /** Returns a merged view over all the segment's {@link FloatVectorValues}. */
-    public static FloatVectorValues mergeFloatVectorValues(
-        FieldInfo fieldInfo, MergeState mergeState) throws IOException {
+    private static void validateFieldEncoding(FieldInfo fieldInfo, VectorEncoding expected) {
       assert fieldInfo != null && fieldInfo.hasVectorValues();
-      if (fieldInfo.getVectorEncoding() != VectorEncoding.FLOAT32) {
+      VectorEncoding fieldEncoding = fieldInfo.getVectorEncoding();
+      if (fieldEncoding != expected) {
         throw new UnsupportedOperationException(
-            "Cannot merge vectors encoded as [" + fieldInfo.getVectorEncoding() + "] as FLOAT32");
+            "Cannot merge vectors encoded as [" + fieldEncoding + "] as " + expected);
       }
-      List<VectorValuesSub> subs = new ArrayList<>();
-      for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) {
-        KnnVectorsReader knnVectorsReader = mergeState.knnVectorsReaders[i];
+    }
+
+    private static <V, S> List<S> mergeVectorValues(
+        KnnVectorsReader[] knnVectorsReaders,
+        MergeState.DocMap[] docMaps,
+        IOFunction<KnnVectorsReader, V> valuesSupplier,
+        BiFunction<MergeState.DocMap, V, S> newSub)
+        throws IOException {
+      List<S> subs = new ArrayList<>();
+      for (int i = 0; i < knnVectorsReaders.length; i++) {
+        KnnVectorsReader knnVectorsReader = knnVectorsReaders[i];
         if (knnVectorsReader != null) {
-          FloatVectorValues values = knnVectorsReader.getFloatVectorValues(fieldInfo.name);
+          V values = valuesSupplier.apply(knnVectorsReader);
          if (values != null) {
-            subs.add(new VectorValuesSub(mergeState.docMaps[i], values));
+            subs.add(newSub.apply(docMaps[i], values));
          }
        }
      }
-      return new MergedFloat32VectorValues(subs, mergeState);
+      return subs;
+    }
+
+    /** Returns a merged view over all the segment's {@link FloatVectorValues}. */
+    public static FloatVectorValues mergeFloatVectorValues(
+        FieldInfo fieldInfo, MergeState mergeState) throws IOException {
+      validateFieldEncoding(fieldInfo, VectorEncoding.FLOAT32);
+      return new MergedFloat32VectorValues(
+          mergeVectorValues(
+              mergeState.knnVectorsReaders,
+              mergeState.docMaps,
+              knnVectorsReader -> {
+                return knnVectorsReader.getFloatVectorValues(fieldInfo.name);
+              },
+              (docMap, values) -> {
+                return new FloatVectorValuesSub(docMap, values);
+              }),
+          mergeState);
     }
 
     /** Returns a merged view over all the segment's {@link ByteVectorValues}. */
     public static ByteVectorValues mergeByteVectorValues(FieldInfo fieldInfo, MergeState mergeState)
         throws IOException {
-      assert fieldInfo != null && fieldInfo.hasVectorValues();
-      if (fieldInfo.getVectorEncoding() != VectorEncoding.BYTE) {
-        throw new UnsupportedOperationException(
-            "Cannot merge vectors encoded as [" + fieldInfo.getVectorEncoding() + "] as BYTE");
-      }
-      List<ByteVectorValuesSub> subs = new ArrayList<>();
-      for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) {
-        KnnVectorsReader knnVectorsReader = mergeState.knnVectorsReaders[i];
-        if (knnVectorsReader != null) {
-          ByteVectorValues values = knnVectorsReader.getByteVectorValues(fieldInfo.name);
-          if (values != null) {
-            subs.add(new ByteVectorValuesSub(mergeState.docMaps[i], values));
-          }
-        }
-      }
-      return new MergedByteVectorValues(subs, mergeState);
+      validateFieldEncoding(fieldInfo, VectorEncoding.BYTE);
+      return new MergedByteVectorValues(
+          mergeVectorValues(
+              mergeState.knnVectorsReaders,
+              mergeState.docMaps,
+              knnVectorsReader -> {
+                return knnVectorsReader.getByteVectorValues(fieldInfo.name);
+              },
+              (docMap, values) -> {
+                return new ByteVectorValuesSub(docMap, values);
+              }),
+          mergeState);
     }
 
     static class MergedFloat32VectorValues extends FloatVectorValues {
-      private final List<VectorValuesSub> subs;
-      private final DocIDMerger<VectorValuesSub> docIdMerger;
+      private final List<FloatVectorValuesSub> subs;
+      private final DocIDMerger<FloatVectorValuesSub> docIdMerger;
       private final int size;
       private int docId;
-      VectorValuesSub current;
+      FloatVectorValuesSub current;
 
-      private MergedFloat32VectorValues(List<VectorValuesSub> subs, MergeState mergeState)
+      private MergedFloat32VectorValues(List<FloatVectorValuesSub> subs, MergeState mergeState)
           throws IOException {
         this.subs = subs;
         docIdMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);
         int totalSize = 0;
-        for (VectorValuesSub sub : subs) {
+        for (FloatVectorValuesSub sub : subs) {
          totalSize += sub.values.size();
        }
        size = totalSize;
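This hunk adds two reusable pieces: `mapOldOrdToNewOrd`, which converts a sort-induced doc-id permutation into old-to-new and new-to-old vector ordinal maps, and the generic `mergeVectorValues` helper, which lets the float and byte merge paths share one reader loop by injecting the value lookup (`IOFunction`) and the sub-wrapper constructor (`BiFunction`). Below is a hedged sketch of how a sorting-aware vector writer might call the ordinal-mapping helper; the inputs `oldDocsWithField` and `sortMap` are hypothetical flush-time state, not taken from this diff:

    // Hedged sketch: building old<->new ordinal maps for a sorted segment.
    // The inputs are assumed to come from the writer's own flush state.
    import java.io.IOException;
    import org.apache.lucene.codecs.KnnVectorsWriter;
    import org.apache.lucene.index.DocsWithFieldSet;
    import org.apache.lucene.index.Sorter;

    class OrdRemapSketch {
      static void remapForSort(DocsWithFieldSet oldDocsWithField, Sorter.DocMap sortMap)
          throws IOException {
        int count = oldDocsWithField.cardinality();
        int[] old2NewOrd = new int[count]; // old vector ordinal -> position after sorting
        int[] new2OldOrd = new int[count]; // inverse mapping
        DocsWithFieldSet newDocsWithField = new DocsWithFieldSet();
        KnnVectorsWriter.mapOldOrdToNewOrd(
            oldDocsWithField, sortMap, old2NewOrd, new2OldOrd, newDocsWithField);
        // The arrays can now drive reordering of the stored vectors, and newDocsWithField
        // holds the remapped doc ids that carry a value.
      }
    }

Per the helper's asserts, any of the three outputs may be passed as null so long as at least one is non-null, so a caller that only needs one direction of the mapping can skip the others.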
Some files were not shown because too many files have changed in this diff.