mirror of https://github.com/apache/lucene.git
Merge branch 'main' into java_21
This commit is contained in:
commit
40c03b0e6c
|
@ -117,6 +117,9 @@ apply from: file('buildSrc/scriptDepVersions.gradle')
|
||||||
|
|
||||||
apply from: file('gradle/generation/local-settings.gradle')
|
apply from: file('gradle/generation/local-settings.gradle')
|
||||||
|
|
||||||
|
// Make sure the build environment is consistent.
|
||||||
|
apply from: file('gradle/validation/check-environment.gradle')
|
||||||
|
|
||||||
// IDE support, settings and specials.
|
// IDE support, settings and specials.
|
||||||
apply from: file('gradle/ide/intellij-idea.gradle')
|
apply from: file('gradle/ide/intellij-idea.gradle')
|
||||||
apply from: file('gradle/ide/eclipse.gradle')
|
apply from: file('gradle/ide/eclipse.gradle')
|
||||||
|
|
|
@ -38,3 +38,9 @@ dependencies {
|
||||||
implementation "commons-codec:commons-codec:${scriptDepVersions['commons-codec']}"
|
implementation "commons-codec:commons-codec:${scriptDepVersions['commons-codec']}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!rootProject.hasJavaFlightRecorder) {
|
||||||
|
logger.warn('Module jdk.jfr is not available; skipping compilation of Java Flight Recorder support.')
|
||||||
|
tasks.named('compileJava').configure {
|
||||||
|
exclude('**/ProfileResults.java')
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -24,7 +24,7 @@ ext {
|
||||||
"apache-rat": "0.14",
|
"apache-rat": "0.14",
|
||||||
"asm": "9.6",
|
"asm": "9.6",
|
||||||
"commons-codec": "1.13",
|
"commons-codec": "1.13",
|
||||||
"ecj": "3.36.0-SNAPSHOT",
|
"ecj": "3.36.0",
|
||||||
"flexmark": "0.61.24",
|
"flexmark": "0.61.24",
|
||||||
"javacc": "7.0.12",
|
"javacc": "7.0.12",
|
||||||
"jflex": "1.8.2",
|
"jflex": "1.8.2",
|
||||||
|
|
|
@ -15,20 +15,18 @@
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.gradle.ProfileResults;
|
|
||||||
|
|
||||||
def recordings = files()
|
def recordings = files()
|
||||||
|
|
||||||
allprojects {
|
allprojects {
|
||||||
plugins.withType(JavaPlugin) {
|
plugins.withType(JavaPlugin) {
|
||||||
ext {
|
ext {
|
||||||
testOptions += [
|
testOptions += [
|
||||||
[propName: 'tests.profile', value: false, description: "Enable java flight recorder profiling."]
|
[propName: 'tests.profile', value: false, description: "Enable Java Flight Recorder profiling."]
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
if (resolvedTestOption("tests.profile").toBoolean()) {
|
if (resolvedTestOption("tests.profile").toBoolean()) {
|
||||||
allprojects {
|
if (rootProject.hasJavaFlightRecorder) {
|
||||||
tasks.withType(Test) {
|
tasks.withType(Test) {
|
||||||
jvmArgs("-XX:StartFlightRecording=dumponexit=true,maxsize=250M,settings=" + rootProject.file("gradle/testing/profiling.jfc"),
|
jvmArgs("-XX:StartFlightRecording=dumponexit=true,maxsize=250M,settings=" + rootProject.file("gradle/testing/profiling.jfc"),
|
||||||
"-XX:+UnlockDiagnosticVMOptions",
|
"-XX:+UnlockDiagnosticVMOptions",
|
||||||
|
@ -41,6 +39,8 @@ allprojects {
|
||||||
recordings = recordings.plus fileTree(dir: workingDir, include: '*.jfr')
|
recordings = recordings.plus fileTree(dir: workingDir, include: '*.jfr')
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
throw new GradleException('Module jdk.jfr is not available; Java Flight Recorder profiles cannot be enabled.')
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -48,10 +48,11 @@ allprojects {
|
||||||
|
|
||||||
gradle.buildFinished {
|
gradle.buildFinished {
|
||||||
if (!recordings.isEmpty()) {
|
if (!recordings.isEmpty()) {
|
||||||
ProfileResults.printReport(recordings.getFiles().collect { it.toString() },
|
def pr = org.apache.lucene.gradle.ProfileResults;
|
||||||
propertyOrDefault(ProfileResults.MODE_KEY, ProfileResults.MODE_DEFAULT) as String,
|
pr.printReport(recordings.getFiles().collect { it.toString() },
|
||||||
Integer.parseInt(propertyOrDefault(ProfileResults.STACKSIZE_KEY, ProfileResults.STACKSIZE_DEFAULT)),
|
propertyOrDefault(pr.MODE_KEY, pr.MODE_DEFAULT) as String,
|
||||||
Integer.parseInt(propertyOrDefault(ProfileResults.COUNT_KEY, ProfileResults.COUNT_DEFAULT)),
|
Integer.parseInt(propertyOrDefault(pr.STACKSIZE_KEY, pr.STACKSIZE_DEFAULT)),
|
||||||
Boolean.parseBoolean(propertyOrDefault(ProfileResults.LINENUMBERS_KEY, ProfileResults.LINENUMBERS_DEFAULT)))
|
Integer.parseInt(propertyOrDefault(pr.COUNT_KEY, pr.COUNT_DEFAULT)),
|
||||||
|
Boolean.parseBoolean(propertyOrDefault(pr.LINENUMBERS_KEY, pr.LINENUMBERS_DEFAULT)))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,8 +23,6 @@ grant {
|
||||||
// jetty-specific:
|
// jetty-specific:
|
||||||
permission java.lang.RuntimePermission "getenv.JETTY_AVAILABLE_PROCESSORS";
|
permission java.lang.RuntimePermission "getenv.JETTY_AVAILABLE_PROCESSORS";
|
||||||
permission java.lang.RuntimePermission "getenv.JETTY_WORKER_INSTANCE";
|
permission java.lang.RuntimePermission "getenv.JETTY_WORKER_INSTANCE";
|
||||||
// servlet stuff
|
|
||||||
permission java.lang.RuntimePermission "setContextClassLoader";
|
|
||||||
// allow TestNRTReplication fork its jvm
|
// allow TestNRTReplication fork its jvm
|
||||||
permission java.io.FilePermission "${java.home}${/}-", "read,execute";
|
permission java.io.FilePermission "${java.home}${/}-", "read,execute";
|
||||||
// read/write access to all system properties (required by jetty in these tests)
|
// read/write access to all system properties (required by jetty in these tests)
|
||||||
|
|
|
@ -50,14 +50,11 @@ grant {
|
||||||
permission java.lang.RuntimePermission "getStackTrace";
|
permission java.lang.RuntimePermission "getStackTrace";
|
||||||
// needed for mock filesystems in tests
|
// needed for mock filesystems in tests
|
||||||
permission java.lang.RuntimePermission "fileSystemProvider";
|
permission java.lang.RuntimePermission "fileSystemProvider";
|
||||||
// analyzers/uima: needed by lucene expressions' JavascriptCompiler
|
|
||||||
permission java.lang.RuntimePermission "createClassLoader";
|
|
||||||
// needed to test unmap hack on platforms that support it
|
// needed to test unmap hack on platforms that support it
|
||||||
permission java.lang.RuntimePermission "accessClassInPackage.sun.misc";
|
permission java.lang.RuntimePermission "accessClassInPackage.sun.misc";
|
||||||
permission java.lang.reflect.ReflectPermission "suppressAccessChecks";
|
permission java.lang.reflect.ReflectPermission "suppressAccessChecks";
|
||||||
// needed by cyberneko usage by benchmarks on J9
|
// needed by cyberneko usage by benchmarks on J9
|
||||||
permission java.lang.RuntimePermission "accessClassInPackage.org.apache.xerces.util";
|
permission java.lang.RuntimePermission "accessClassInPackage.org.apache.xerces.util";
|
||||||
permission java.lang.RuntimePermission "getClassLoader";
|
|
||||||
|
|
||||||
// Needed for loading native library (lucene:misc:native) in lucene:misc
|
// Needed for loading native library (lucene:misc:native) in lucene:misc
|
||||||
permission java.lang.RuntimePermission "getFileStoreAttributes";
|
permission java.lang.RuntimePermission "getFileStoreAttributes";
|
||||||
|
@ -111,6 +108,8 @@ grant {
|
||||||
permission java.lang.RuntimePermission "shutdownHooks";
|
permission java.lang.RuntimePermission "shutdownHooks";
|
||||||
// needed by jacoco to instrument classes
|
// needed by jacoco to instrument classes
|
||||||
permission java.lang.RuntimePermission "defineClass";
|
permission java.lang.RuntimePermission "defineClass";
|
||||||
|
// needed by jacoco for God knows what.
|
||||||
|
permission java.lang.RuntimePermission "createClassLoader";
|
||||||
};
|
};
|
||||||
|
|
||||||
// Grant all permissions to Gradle test runner classes.
|
// Grant all permissions to Gradle test runner classes.
|
||||||
|
|
|
@ -23,6 +23,7 @@ import org.gradle.util.GradleVersion
|
||||||
configure(rootProject) {
|
configure(rootProject) {
|
||||||
ext {
|
ext {
|
||||||
expectedGradleVersion = '8.4'
|
expectedGradleVersion = '8.4'
|
||||||
|
hasJavaFlightRecorder = ModuleLayer.boot().findModule('jdk.jfr').map(this.class.module::canRead).orElse(false)
|
||||||
}
|
}
|
||||||
|
|
||||||
wrapper {
|
wrapper {
|
||||||
|
|
|
@ -17,8 +17,8 @@
|
||||||
|
|
||||||
def skipReason
|
def skipReason
|
||||||
|
|
||||||
if (rootProject.usesAltJvm && rootProject.runtimeJavaVersion > JavaVersion.VERSION_15) {
|
if (rootProject.usesAltJvm) {
|
||||||
skipReason = "won't work with JDK ${rootProject.runtimeJavaVersion} if used as alternative java toolchain"
|
skipReason = "won't work with alternative java toolchain"
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!propertyOrDefault("validation.errorprone", isCIBuild).asBoolean()) {
|
if (!propertyOrDefault("validation.errorprone", isCIBuild).asBoolean()) {
|
||||||
|
@ -37,7 +37,7 @@ if (skipReason) {
|
||||||
|
|
||||||
allprojects { prj ->
|
allprojects { prj ->
|
||||||
plugins.withType(JavaPlugin) {
|
plugins.withType(JavaPlugin) {
|
||||||
// LUCENE-9650: Errorprone on master/gradle does not work with JDK-16+ when running as plugin
|
// LUCENE-9650: Errorprone on master/gradle does not work when running as plugin
|
||||||
// inside a forked Javac process. Javac running inside Gradle works, because we have
|
// inside a forked Javac process. Javac running inside Gradle works, because we have
|
||||||
// additional module system opens in place.
|
// additional module system opens in place.
|
||||||
// This is a hack to keep the dependency (so that palantir's version check doesn't complain)
|
// This is a hack to keep the dependency (so that palantir's version check doesn't complain)
|
||||||
|
|
|
@ -59,6 +59,9 @@ allprojects {
|
||||||
}
|
}
|
||||||
|
|
||||||
subprojects {
|
subprojects {
|
||||||
|
// initialize empty, because no checks for benchmark-jmh module.
|
||||||
|
ext.jarInfos = []
|
||||||
|
|
||||||
// Configure jarValidation configuration for all projects. Any dependency
|
// Configure jarValidation configuration for all projects. Any dependency
|
||||||
// declared on this configuration (or any configuration it extends from) will
|
// declared on this configuration (or any configuration it extends from) will
|
||||||
// be verified.
|
// be verified.
|
||||||
|
|
|
@ -61,6 +61,7 @@ Otherwise you are stuck wrestling down full dependencies of OpenJDK (metal etc)
|
||||||
Also you must run benchmarks as root to use dtrace, but it works.
|
Also you must run benchmarks as root to use dtrace, but it works.
|
||||||
|
|
||||||
$ git clone --depth 1 https://github.com/openjdk/jdk/
|
$ git clone --depth 1 https://github.com/openjdk/jdk/
|
||||||
|
$ curl -f https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz | tar -zxf -
|
||||||
$ curl -fo jdk/src/utils/hsdis/binutils/Makefile https://raw.githubusercontent.com/openjdk/jdk/3c7ae1225f0d5575fd927a9b76fb40dc30e208cd/src/utils/hsdis/Makefile
|
$ curl -fo jdk/src/utils/hsdis/binutils/Makefile https://raw.githubusercontent.com/openjdk/jdk/3c7ae1225f0d5575fd927a9b76fb40dc30e208cd/src/utils/hsdis/Makefile
|
||||||
$ vi jdk/src/utils/hsdis/binutils/Makefile, change SOURCE = hsdis.c to SOURCE = hsdis-binutils.c
|
$ vi jdk/src/utils/hsdis/binutils/Makefile, change SOURCE = hsdis.c to SOURCE = hsdis-binutils.c
|
||||||
$ vi jdk/src/utils/hsdis/binutils/hsdis-binutils.c, change #include "hsdis.h" to #include "../hsdis.h"
|
$ vi jdk/src/utils/hsdis/binutils/hsdis-binutils.c, change #include "hsdis.h" to #include "../hsdis.h"
|
||||||
|
|
|
@ -7,7 +7,6 @@ http://s.apache.org/luceneversions
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
* LUCENE-12092: Remove deprecated UTF8TaxonomyWriterCache. Please use LruTaxonomyWriterCache
|
* LUCENE-12092: Remove deprecated UTF8TaxonomyWriterCache. Please use LruTaxonomyWriterCache
|
||||||
instead. (Vigya Sharma)
|
instead. (Vigya Sharma)
|
||||||
|
|
||||||
|
@ -62,10 +61,21 @@ API Changes
|
||||||
|
|
||||||
* GITHUB#12599: Add RandomAccessInput#readBytes method to the RandomAccessInput interface. (Ignacio Vera)
|
* GITHUB#12599: Add RandomAccessInput#readBytes method to the RandomAccessInput interface. (Ignacio Vera)
|
||||||
|
|
||||||
* GITHUB#12709: Consolidate FSTStore and BytesStore in FST. Created FSTReader which contains the common methods
|
* GITHUB#11023: Adding -level param to CheckIndex, making the old -fast param the default behaviour. (Jakub Slowinski)
|
||||||
of the two (Anh Dung Bui)
|
|
||||||
|
|
||||||
* GITHUB#12735: Remove FSTCompiler#getTermCount() and FSTCompiler.UnCompiledNode#inputCount (Anh Dung Bui)
|
* GITHUB#12873: Expressions module now uses MethodHandles to define custom functions. Support for
|
||||||
|
custom classloaders was removed. (Uwe Schindler)
|
||||||
|
|
||||||
|
* GITHUB#12243: Remove TermInSetQuery ctors taking varargs param. SortedSetDocValuesField#newSlowSetQuery,
|
||||||
|
SortedDocValuesField#newSlowSetQuery, KeywordField#newSetQuery now take a collection. (Jakub Slowinski)
|
||||||
|
|
||||||
|
* GITHUB#12881: Performance improvements to MatchHighlighter and MatchRegionRetriever. MatchRegionRetriever can be
|
||||||
|
configured to not load matches (or content) of certain fields and to force-load other fields so that stored fields
|
||||||
|
of a document are accessed once. A configurable limit of field matches placed in the priority queue was added
|
||||||
|
(allows handling long fields with lots of hits more gracefully). MatchRegionRetriever utilizes IndexSearcher's
|
||||||
|
executor to extract hit offsets concurrently. (Dawid Weiss)
|
||||||
|
|
||||||
|
* GITHUB#12855: Remove deprecated DrillSideways#createDrillDownFacetsCollector extension method. (Greg Miller)
|
||||||
|
|
||||||
New Features
|
New Features
|
||||||
---------------------
|
---------------------
|
||||||
|
@ -89,18 +99,17 @@ Improvements
|
||||||
|
|
||||||
* GITHUB#12447: Hunspell: speed up the dictionary enumeration on suggestion (Peter Gromov)
|
* GITHUB#12447: Hunspell: speed up the dictionary enumeration on suggestion (Peter Gromov)
|
||||||
|
|
||||||
* GITHUB#12542: FSTCompiler can now approximately limit how much RAM it uses to share
|
* GITHUB#12873: Expressions module now uses JEP 371 "Hidden Classes" with JEP 309
|
||||||
suffixes during FST construction using the suffixRAMLimitMB method. Larger values
|
"Dynamic Class-File Constants" to implement Javascript expressions. (Uwe Schindler)
|
||||||
result in a more minimal FST (more common suffixes are shared). Pass
|
|
||||||
Double.POSITIVE_INFINITY to use as much RAM as is needed to create a purely
|
|
||||||
minimal FST. Inspired by this Rust FST implementation:
|
|
||||||
https://blog.burntsushi.net/transducers (Mike McCandless)
|
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
* GITHUB#11857, GITHUB#11859, GITHUB#11893, GITHUB#11909: Hunspell: improved suggestion performance (Peter Gromov)
|
* GITHUB#11857, GITHUB#11859, GITHUB#11893, GITHUB#11909: Hunspell: improved suggestion performance (Peter Gromov)
|
||||||
|
|
||||||
|
* GITHUB#12825, GITHUB#12834: Hunspell: improved dictionary loading performance, allowed in-memory entry sorting.
|
||||||
|
(Peter Gromov)
|
||||||
|
|
||||||
* GITHUB#12372: Reduce allocation during HNSW construction (Jonathan Ellis)
|
* GITHUB#12372: Reduce allocation during HNSW construction (Jonathan Ellis)
|
||||||
|
|
||||||
* GITHUB#12408: Lazy initialization improvements for Facets implementations when there are segments with no hits
|
* GITHUB#12408: Lazy initialization improvements for Facets implementations when there are segments with no hits
|
||||||
|
@ -116,6 +125,9 @@ Bug Fixes
|
||||||
|
|
||||||
* GITHUB#12220: Hunspell: disallow hidden title-case entries from compound middle/end
|
* GITHUB#12220: Hunspell: disallow hidden title-case entries from compound middle/end
|
||||||
|
|
||||||
|
* GITHUB#12878: Fix the declared Exceptions of Expression#evaluate() to match those
|
||||||
|
of DoubleValues#doubleValue(). (Uwe Schindler)
|
||||||
|
|
||||||
Other
|
Other
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
|
@ -142,6 +154,48 @@ Other
|
||||||
|
|
||||||
* GITHUB#12239: Hunspell: reduced suggestion set dependency on the hash table order (Peter Gromov)
|
* GITHUB#12239: Hunspell: reduced suggestion set dependency on the hash table order (Peter Gromov)
|
||||||
|
|
||||||
|
* GITHUB#9049: Fixing bug in UnescapedCharSequence#toStringEscaped() (Jakub Slowinski)
|
||||||
|
|
||||||
|
======================== Lucene 9.10.0 =======================
|
||||||
|
|
||||||
|
API Changes
|
||||||
|
---------------------
|
||||||
|
* GITHUB#12243: Mark TermInSetQuery ctors with varargs terms as @Deprecated. SortedSetDocValuesField#newSlowSetQuery,
|
||||||
|
SortedDocValuesField#newSlowSetQuery, KeywordField#newSetQuery now take a collection of terms as a param. (Jakub Slowinski)
|
||||||
|
|
||||||
|
* GITHUB#11041: Deprecate IndexSearcher#search(Query, Collector) in favor of
|
||||||
|
IndexSearcher#search(Query, CollectorManager) for TopFieldCollectorManager
|
||||||
|
and TopScoreDocCollectorManager. (Zach Chen, Adrien Grand, Michael McCandless, Greg Miller, Luca Cavanna)
|
||||||
|
|
||||||
|
* GITHUB#12854: Mark DrillSideways#createDrillDownFacetsCollector as @Deprecated. (Greg Miller)
|
||||||
|
|
||||||
|
New Features
|
||||||
|
---------------------
|
||||||
|
(No changes)
|
||||||
|
|
||||||
|
Improvements
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
* GITHUB#12870: Tighten synchronized loop in DirectoryTaxonomyReader#getOrdinal. (Stefan Vodita)
|
||||||
|
|
||||||
|
* GITHUB#12812: Avoid overflows and false negatives in int slice buffer filled-with-zeros assertion. (Stefan Vodita)
|
||||||
|
|
||||||
|
Optimizations
|
||||||
|
---------------------
|
||||||
|
(No changes)
|
||||||
|
|
||||||
|
Bug Fixes
|
||||||
|
---------------------
|
||||||
|
* GITHUB#12866: Prevent extra similarity computation for single-level HNSW graphs. (Kaival Parikh)
|
||||||
|
|
||||||
|
* GITHUB#12558: Ensure #finish is called on all drill-sideways FacetsCollectors even when no hits are scored.
|
||||||
|
(Greg Miller)
|
||||||
|
|
||||||
|
Other
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
* GITHUB#11023: Removing some dead code in CheckIndex. (Jakub Slowinski)
|
||||||
|
|
||||||
======================== Lucene 9.9.0 =======================
|
======================== Lucene 9.9.0 =======================
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
@ -157,9 +211,6 @@ API Changes
|
||||||
* GITHUB#12592: Add RandomAccessInput#length method to the RandomAccessInput interface. In addition deprecate
|
* GITHUB#12592: Add RandomAccessInput#length method to the RandomAccessInput interface. In addition deprecate
|
||||||
ByteBuffersDataInput#size in favour of this new method. (Ignacio Vera)
|
ByteBuffersDataInput#size in favour of this new method. (Ignacio Vera)
|
||||||
|
|
||||||
* GITHUB#12646, GITHUB#12690: Move FST#addNode to FSTCompiler to avoid a circular dependency
|
|
||||||
between FST and FSTCompiler (Anh Dung Bui)
|
|
||||||
|
|
||||||
* GITHUB#12718: Make IndexSearcher#getSlices final as it is not expected to be overridden (Luca Cavanna)
|
* GITHUB#12718: Make IndexSearcher#getSlices final as it is not expected to be overridden (Luca Cavanna)
|
||||||
|
|
||||||
* GITHUB#12427: Automata#makeStringUnion #makeBinaryStringUnion now accept Iterable<BytesRef> instead of
|
* GITHUB#12427: Automata#makeStringUnion #makeBinaryStringUnion now accept Iterable<BytesRef> instead of
|
||||||
|
@ -169,6 +220,25 @@ API Changes
|
||||||
* GITHUB#12180: Add TaxonomyReader#getBulkOrdinals method to more efficiently retrieve facet ordinals for multiple
|
* GITHUB#12180: Add TaxonomyReader#getBulkOrdinals method to more efficiently retrieve facet ordinals for multiple
|
||||||
FacetLabel at once. (Egor Potemkin)
|
FacetLabel at once. (Egor Potemkin)
|
||||||
|
|
||||||
|
* GITHUB#12816: Add HumanReadableQuery which takes a description parameter for debugging purposes. (Jakub Slowinski)
|
||||||
|
|
||||||
|
* GITHUB#12646, GITHUB#12690: Move FST#addNode to FSTCompiler to avoid a circular dependency
|
||||||
|
between FST and FSTCompiler (Anh Dung Bui)
|
||||||
|
|
||||||
|
* GITHUB#12709: Consolidate FSTStore and BytesStore in FST. Created FSTReader which contains the common methods
|
||||||
|
of the two (Anh Dung Bui)
|
||||||
|
|
||||||
|
* GITHUB#12735: Remove FSTCompiler#getTermCount() and FSTCompiler.UnCompiledNode#inputCount (Anh Dung Bui)
|
||||||
|
|
||||||
|
* GITHUB#12695: Remove public constructor of FSTCompiler. Please use FSTCompiler.Builder
|
||||||
|
instead. (Juan M. Caicedo)
|
||||||
|
|
||||||
|
* GITHUB#12799: Make TaskExecutor constructor public and use TaskExecutor for concurrent
|
||||||
|
HNSW graph build. (Shubham Chaudhary)
|
||||||
|
|
||||||
|
* GITHUB#12758, GITHUB#12803: Remove FST constructor with DataInput for metadata. Please
|
||||||
|
use the constructor with FSTMetadata instead. (Anh Dung Bui)
|
||||||
|
|
||||||
New Features
|
New Features
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
|
@ -180,7 +250,7 @@ New Features
|
||||||
|
|
||||||
* GITHUB#12582: Add int8 scalar quantization to the HNSW vector format. This optionally allows for more compact lossy
|
* GITHUB#12582: Add int8 scalar quantization to the HNSW vector format. This optionally allows for more compact lossy
|
||||||
storage for the vectors, requiring about 75% memory for fast HNSW search. (Ben Trent)
|
storage for the vectors, requiring about 75% memory for fast HNSW search. (Ben Trent)
|
||||||
|
|
||||||
* GITHUB#12660: HNSW graph can now be merged with multiple threads. Configurable in Lucene99HnswVectorsFormat.
|
* GITHUB#12660: HNSW graph can now be merged with multiple threads. Configurable in Lucene99HnswVectorsFormat.
|
||||||
(Patrick Zhai)
|
(Patrick Zhai)
|
||||||
|
|
||||||
|
@ -225,6 +295,22 @@ Improvements
|
||||||
* GITHUB#12754: Refactor lookup of Hotspot VM options and do not initialize constants with NULL
|
* GITHUB#12754: Refactor lookup of Hotspot VM options and do not initialize constants with NULL
|
||||||
if SecurityManager prevents access. (Uwe Schindler)
|
if SecurityManager prevents access. (Uwe Schindler)
|
||||||
|
|
||||||
|
* GITHUB#12801: Remove possible contention on a ReentrantReadWriteLock in
|
||||||
|
Monitor which could result in searches waiting for commits. (Davis Cook)
|
||||||
|
|
||||||
|
* GITHUB#11277, LUCENE-10241: Upgrade OpenNLP to 1.9.4. (Jeff Zemerick)
|
||||||
|
|
||||||
|
* GITHUB#12542: FSTCompiler can now approximately limit how much RAM it uses to share
|
||||||
|
suffixes during FST construction using the suffixRAMLimitMB method. Larger values
|
||||||
|
result in a more minimal FST (more common suffixes are shared). Pass
|
||||||
|
Double.POSITIVE_INFINITY to use as much RAM as is needed to create a purely
|
||||||
|
minimal FST. Inspired by this Rust FST implementation:
|
||||||
|
https://blog.burntsushi.net/transducers (Mike McCandless)
|
||||||
|
|
||||||
|
* GITHUB#12738: NodeHash now stores the FST nodes data instead of just node addresses (Anh Dung Bui)
|
||||||
|
|
||||||
|
* GITHUB#12847: Test2BFST now reports the time it took to build the FST and the real FST size (Anh Dung Bui)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
---------------------
|
---------------------
|
||||||
* GITHUB#12183: Make TermStates#build concurrent. (Shubham Chaudhary)
|
* GITHUB#12183: Make TermStates#build concurrent. (Shubham Chaudhary)
|
||||||
|
@ -276,10 +362,14 @@ Optimizations
|
||||||
|
|
||||||
* GITHUB#12381: Skip docs with DocValues in NumericLeafComparator. (Lu Xugang, Adrien Grand)
|
* GITHUB#12381: Skip docs with DocValues in NumericLeafComparator. (Lu Xugang, Adrien Grand)
|
||||||
|
|
||||||
* GITHUB#12748: Specialize arc store for continuous label in FST. (Guo Feng, Chao Zhang)
|
|
||||||
|
|
||||||
* GITHUB#12784: Cache buckets to speed up BytesRefHash#sort. (Guo Feng)
|
* GITHUB#12784: Cache buckets to speed up BytesRefHash#sort. (Guo Feng)
|
||||||
|
|
||||||
|
* GITHUB#12806: Utilize exact kNN search when gathering k >= numVectors in a segment (Ben Trent)
|
||||||
|
|
||||||
|
* GITHUB#12782: Use group-varint encoding for the tail of postings. (Adrien Grand, Zhang Chao)
|
||||||
|
|
||||||
|
* GITHUB#12748: Specialize arc store for continuous label in FST. (Guo Feng, Zhang Chao)
|
||||||
|
|
||||||
Changes in runtime behavior
|
Changes in runtime behavior
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
|
@ -311,22 +401,33 @@ Bug Fixes
|
||||||
|
|
||||||
* GITHUB#12770: Stop exploring HNSW graph if scores are not getting better. (Ben Trent)
|
* GITHUB#12770: Stop exploring HNSW graph if scores are not getting better. (Ben Trent)
|
||||||
|
|
||||||
|
* GITHUB#12640: Ensure #finish is called on all drill-sideways collectors even if one throws a
|
||||||
|
CollectionTerminatedException (Greg Miller)
|
||||||
|
|
||||||
|
* GITHUB#12626: Fix segmentInfos replace to set userData (Shibi Balamurugan, Uwe Schindler, Marcus Eagan, Michael Froh)
|
||||||
|
|
||||||
Build
|
Build
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
* GITHUB#12752: tests.multiplier could be omitted in test failure reproduce lines (esp. in
|
* GITHUB#12752: tests.multiplier could be omitted in test failure reproduce lines (esp. in
|
||||||
nightly mode). (Dawid Weiss)
|
nightly mode). (Dawid Weiss)
|
||||||
|
|
||||||
* GITHUB#12742: JavaCompile tasks may be in up-to-date state when modular dependencies have changed
|
* GITHUB#12742: JavaCompile tasks may be in up-to-date state when modular dependencies have changed
|
||||||
leading to odd runtime errors (Chris Hostetter, Dawid Weiss)
|
leading to odd runtime errors (Chris Hostetter, Dawid Weiss)
|
||||||
|
|
||||||
* GITHUB#12612: Upgrade forbiddenapis to version 3.6 and ASM for APIJAR extraction to 9.6. (Uwe Schindler)
|
* GITHUB#12612: Upgrade forbiddenapis to version 3.6 and ASM for APIJAR extraction to 9.6. (Uwe Schindler)
|
||||||
|
|
||||||
* GITHUB#12655: Upgrade to Gradle 8.4 (Kevin Risden)
|
* GITHUB#12655: Upgrade to Gradle 8.4 (Kevin Risden)
|
||||||
|
|
||||||
|
* GITHUB#12845: Only enable support for tests.profile if jdk.jfr module is available
|
||||||
|
in Gradle runtime. (Uwe Schindler)
|
||||||
|
|
||||||
Other
|
Other
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
|
* GITHUB#12817: Add demo for faceting with StringValueFacetCounts over KeywordField and SortedDocValuesField.
|
||||||
|
(Stefan Vodita)
|
||||||
|
|
||||||
* GITHUB#12657: Internal refactor of HNSW graph merging (Ben Trent).
|
* GITHUB#12657: Internal refactor of HNSW graph merging (Ben Trent).
|
||||||
|
|
||||||
* GITHUB#12625: Refactor ByteBlockPool so it is just a "shift/mask big array". (Ignacio Vera)
|
* GITHUB#12625: Refactor ByteBlockPool so it is just a "shift/mask big array". (Ignacio Vera)
|
||||||
|
@ -336,6 +437,8 @@ Other
|
||||||
overflows and slices that are too large. Some bits of code are simplified. Documentation is updated and expanded.
|
overflows and slices that are too large. Some bits of code are simplified. Documentation is updated and expanded.
|
||||||
(Stefan Vodita)
|
(Stefan Vodita)
|
||||||
|
|
||||||
|
* GITHUB#12762: Refactor BKD HeapPointWriter to hide the internal data structure. (Ignacio Vera)
|
||||||
|
|
||||||
======================== Lucene 9.8.0 =======================
|
======================== Lucene 9.8.0 =======================
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
@ -364,6 +467,8 @@ New Features
|
||||||
* GITHUB#12479: Add new Maximum Inner Product vector similarity function for non-normalized dot-product
|
* GITHUB#12479: Add new Maximum Inner Product vector similarity function for non-normalized dot-product
|
||||||
vector search. (Jack Mazanec, Ben Trent)
|
vector search. (Jack Mazanec, Ben Trent)
|
||||||
|
|
||||||
|
* GITHUB#12525: `WordDelimiterGraphFilterFactory` now supports the `ignoreKeywords` flag (Thomas De Craemer)
|
||||||
|
|
||||||
* GITHUB#12489: Add support for recursive graph bisection, also called
|
* GITHUB#12489: Add support for recursive graph bisection, also called
|
||||||
bipartite graph partitioning, and often abbreviated BP, an algorithm for
|
bipartite graph partitioning, and often abbreviated BP, an algorithm for
|
||||||
reordering doc IDs that results in more compact postings and faster queries,
|
reordering doc IDs that results in more compact postings and faster queries,
|
||||||
|
@ -386,7 +491,7 @@ Improvements
|
||||||
Optimizations
|
Optimizations
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
* GITHUB#12377: Avoid redundant loop for compute min value in DirectMonotonicWriter. (Chao Zhang)
|
* GITHUB#12377: Avoid redundant loop for compute min value in DirectMonotonicWriter. (Zhang Chao)
|
||||||
|
|
||||||
* GITHUB#12361: Faster top-level disjunctions sorted by descending score.
|
* GITHUB#12361: Faster top-level disjunctions sorted by descending score.
|
||||||
(Adrien Grand)
|
(Adrien Grand)
|
||||||
|
@ -401,7 +506,7 @@ Optimizations
|
||||||
|
|
||||||
* GITHUB#12385: Restore parallel knn query rewrite across segments rather than slices (Luca Cavanna)
|
* GITHUB#12385: Restore parallel knn query rewrite across segments rather than slices (Luca Cavanna)
|
||||||
|
|
||||||
* GITHUB#12381: Speed up NumericDocValuesWriter with index sorting. (Chao Zhang)
|
* GITHUB#12381: Speed up NumericDocValuesWriter with index sorting. (Zhang Chao)
|
||||||
|
|
||||||
* GITHUB#12453: Faster bulk numeric reads from BufferedIndexInput (Armin Braun)
|
* GITHUB#12453: Faster bulk numeric reads from BufferedIndexInput (Armin Braun)
|
||||||
|
|
||||||
|
@ -468,7 +573,7 @@ Other
|
||||||
* GITHUB#12428: Replace consecutive close() calls and close() calls with null checks with IOUtils.close().
|
* GITHUB#12428: Replace consecutive close() calls and close() calls with null checks with IOUtils.close().
|
||||||
(Shubham Chaudhary)
|
(Shubham Chaudhary)
|
||||||
|
|
||||||
* GITHUB#12512: Remove unused variable in BKDWriter. (Chao Zhang)
|
* GITHUB#12512: Remove unused variable in BKDWriter. (Zhang Chao)
|
||||||
|
|
||||||
======================== Lucene 9.7.0 =======================
|
======================== Lucene 9.7.0 =======================
|
||||||
|
|
||||||
|
|
|
@ -19,6 +19,11 @@
|
||||||
|
|
||||||
## Migration from Lucene 9.x to Lucene 10.0
|
## Migration from Lucene 9.x to Lucene 10.0
|
||||||
|
|
||||||
|
### Minor API changes in MatchHighlighter and MatchRegionRetriever. (GITHUB#12881)
|
||||||
|
|
||||||
|
The API of interfaces for accepting highlights has changed to allow performance improvements. Look at the issue and the PR diff to get
|
||||||
|
a sense of what's changed (changes are minor).
|
||||||
|
|
||||||
### Removed deprecated IndexSearcher.doc, IndexReader.document, IndexReader.getTermVectors (GITHUB#11998)
|
### Removed deprecated IndexSearcher.doc, IndexReader.document, IndexReader.getTermVectors (GITHUB#11998)
|
||||||
|
|
||||||
The deprecated Stored Fields and Term Vectors apis relied upon threadlocal storage and have been removed.
|
The deprecated Stored Fields and Term Vectors apis relied upon threadlocal storage and have been removed.
|
||||||
|
@ -101,6 +106,34 @@ The deprecated getter for the `Executor` that was optionally provided to the `In
|
||||||
has been removed. Users that want to execute concurrent tasks should rely instead on the `TaskExecutor`
|
has been removed. Users that want to execute concurrent tasks should rely instead on the `TaskExecutor`
|
||||||
that the searcher holds, retrieved via `IndexSearcher#getTaskExecutor`.
|
that the searcher holds, retrieved via `IndexSearcher#getTaskExecutor`.
|
||||||
|
|
||||||
|
### CheckIndex params -slow and -fast are deprecated, replaced by -level X (GITHUB#11023)
|
||||||
|
|
||||||
|
The former `-fast` behaviour of `CheckIndex`, which performs checksum checks only, is now the default.
|
||||||
|
Added a new parameter: `-level X`, to set the detail level of the index check. The higher the value, the more checks are performed.
|
||||||
|
Sample `-level` usage: `1` (Default) - Checksum checks only, `2` - all level 1 checks as well as logical integrity checks, `3` - all
|
||||||
|
level 2 checks as well as slow checks.
|
||||||
|
|
||||||
|
### Expressions module now uses `MethodHandle` and hidden classes (GITHUB#12873)
|
||||||
|
|
||||||
|
Custom functions in the expressions module must now be passed in a `Map` using `MethodHandle` as values.
|
||||||
|
To convert legacy code using maps of reflective `java.lang.reflect.Method`, use the converter method
|
||||||
|
`JavascriptCompiler#convertLegacyFunctions`. This should make the mapping mostly compatible.
|
||||||
|
The use of `MethodHandle` and [Dynamic Class-File Constants (JEP 309)](https://openjdk.org/jeps/309)
|
||||||
|
now also allows passing private methods or methods from different classloaders. It is also possible
|
||||||
|
to adapt guards or filters using the `MethodHandles` class.
|
||||||
|
|
||||||
|
The new implementation of the Javascript expressions compiler no longer supports the use of a custom
|
||||||
|
`ClassLoader`, because it uses the new JDK 15 feature [hidden classes (JEP 371)](https://openjdk.org/jeps/371).
|
||||||
|
Due to the use of `MethodHandle`, classloader isolation is no longer needed, because JS code can only call
|
||||||
|
MHs that were resolved by the application before using the expressions module.
|
||||||
|
|
||||||
|
### `Expression#evaluate()` declares to throw IOException (GITHUB#12878)
|
||||||
|
|
||||||
|
The expressions module has changed the `Expression#evaluate()` method signature:
|
||||||
|
It now declares that it may throw `IOException`. This was an oversight because
|
||||||
|
compiled expressions call `DoubleValues#doubleValue` behind the scenes, which
|
||||||
|
may throw `IOException` on index problems, bubbling up unexpectedly to the caller.
|
||||||
|
|
||||||
## Migration from Lucene 9.0 to Lucene 9.1
|
## Migration from Lucene 9.0 to Lucene 9.1
|
||||||
|
|
||||||
### Test framework package migration and module (LUCENE-10301)
|
### Test framework package migration and module (LUCENE-10301)
|
||||||
|
|
|
@ -105,7 +105,8 @@ public class NormalizeCharMap {
|
||||||
final FST<CharsRef> map;
|
final FST<CharsRef> map;
|
||||||
try {
|
try {
|
||||||
final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
|
final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
|
||||||
final FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
|
final FSTCompiler<CharsRef> fstCompiler =
|
||||||
|
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, outputs).build();
|
||||||
final IntsRefBuilder scratch = new IntsRefBuilder();
|
final IntsRefBuilder scratch = new IntsRefBuilder();
|
||||||
for (Map.Entry<String, String> ent : pendingPairs.entrySet()) {
|
for (Map.Entry<String, String> ent : pendingPairs.entrySet()) {
|
||||||
fstCompiler.add(Util.toUTF16(ent.getKey(), scratch), new CharsRef(ent.getValue()));
|
fstCompiler.add(Util.toUTF16(ent.getKey(), scratch), new CharsRef(ent.getValue()));
|
||||||
|
|
|
@ -777,7 +777,6 @@ class KStemmer {
|
||||||
private int stemLength() {
|
private int stemLength() {
|
||||||
return j + 1;
|
return j + 1;
|
||||||
}
|
}
|
||||||
;
|
|
||||||
|
|
||||||
private boolean endsIn(char[] s) {
|
private boolean endsIn(char[] s) {
|
||||||
if (s.length > k) return false;
|
if (s.length > k) return false;
|
||||||
|
|
|
@ -40,7 +40,8 @@ class ConvTable {
|
||||||
|
|
||||||
try {
|
try {
|
||||||
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
|
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
|
||||||
FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
|
FSTCompiler<CharsRef> fstCompiler =
|
||||||
|
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, outputs).build();
|
||||||
IntsRefBuilder scratchInts = new IntsRefBuilder();
|
IntsRefBuilder scratchInts = new IntsRefBuilder();
|
||||||
for (Map.Entry<String, String> entry : mappings.entrySet()) {
|
for (Map.Entry<String, String> entry : mappings.entrySet()) {
|
||||||
String key = entry.getKey();
|
String key = entry.getKey();
|
||||||
|
|
|
@ -50,18 +50,12 @@ import java.util.Set;
|
||||||
import java.util.TreeMap;
|
import java.util.TreeMap;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
import org.apache.lucene.codecs.CodecUtil;
|
import org.apache.lucene.analysis.hunspell.SortingStrategy.EntryAccumulator;
|
||||||
|
import org.apache.lucene.analysis.hunspell.SortingStrategy.EntrySupplier;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.IOContext;
|
|
||||||
import org.apache.lucene.store.IndexOutput;
|
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.BytesRef;
|
|
||||||
import org.apache.lucene.util.IOUtils;
|
|
||||||
import org.apache.lucene.util.IntsRef;
|
import org.apache.lucene.util.IntsRef;
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
import org.apache.lucene.util.OfflineSorter;
|
|
||||||
import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
|
|
||||||
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
|
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
import org.apache.lucene.util.fst.FSTCompiler;
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.IntSequenceOutputs;
|
import org.apache.lucene.util.fst.IntSequenceOutputs;
|
||||||
|
@ -215,6 +209,25 @@ public class Dictionary {
|
||||||
List<InputStream> dictionaries,
|
List<InputStream> dictionaries,
|
||||||
boolean ignoreCase)
|
boolean ignoreCase)
|
||||||
throws IOException, ParseException {
|
throws IOException, ParseException {
|
||||||
|
this(affix, dictionaries, ignoreCase, SortingStrategy.offline(tempDir, tempFileNamePrefix));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new Dictionary containing the information read from the provided InputStreams to
|
||||||
|
* hunspell affix and dictionary files. You have to close the provided InputStreams yourself.
|
||||||
|
*
|
||||||
|
* @param affix InputStream for reading the hunspell affix file (won't be closed).
|
||||||
|
* @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed).
|
||||||
|
* @param sortingStrategy the entry strategy for the dictionary loading
|
||||||
|
* @throws IOException Can be thrown while reading from the InputStreams
|
||||||
|
* @throws ParseException Can be thrown if the content of the files does not meet expected formats
|
||||||
|
*/
|
||||||
|
public Dictionary(
|
||||||
|
InputStream affix,
|
||||||
|
List<InputStream> dictionaries,
|
||||||
|
boolean ignoreCase,
|
||||||
|
SortingStrategy sortingStrategy)
|
||||||
|
throws IOException, ParseException {
|
||||||
this.ignoreCase = ignoreCase;
|
this.ignoreCase = ignoreCase;
|
||||||
|
|
||||||
try (BufferedInputStream affixStream =
|
try (BufferedInputStream affixStream =
|
||||||
|
@ -250,10 +263,11 @@ public class Dictionary {
|
||||||
readAffixFile(affixStream, decoder, flagEnumerator);
|
readAffixFile(affixStream, decoder, flagEnumerator);
|
||||||
|
|
||||||
// read dictionary entries
|
// read dictionary entries
|
||||||
IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
|
EntryAccumulator acc = sortingStrategy.start();
|
||||||
int wordCount = mergeDictionaries(dictionaries, decoder, unsorted);
|
mergeDictionaries(dictionaries, decoder, acc);
|
||||||
String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted);
|
try (EntrySupplier sorted = acc.finishAndSort()) {
|
||||||
words = readSortedDictionaries(tempDir, sortedFile, flagEnumerator, wordCount);
|
words = readSortedDictionaries(flagEnumerator, sorted);
|
||||||
|
}
|
||||||
flagLookup = flagEnumerator.finish();
|
flagLookup = flagEnumerator.finish();
|
||||||
aliases = null; // no longer needed
|
aliases = null; // no longer needed
|
||||||
morphAliases = null; // no longer needed
|
morphAliases = null; // no longer needed
|
||||||
|
@ -631,7 +645,8 @@ public class Dictionary {
|
||||||
|
|
||||||
private FST<IntsRef> affixFST(TreeMap<String, List<Integer>> affixes) throws IOException {
|
private FST<IntsRef> affixFST(TreeMap<String, List<Integer>> affixes) throws IOException {
|
||||||
IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
|
IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
|
||||||
FSTCompiler<IntsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
|
FSTCompiler<IntsRef> fstCompiler =
|
||||||
|
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).build();
|
||||||
IntsRefBuilder scratch = new IntsRefBuilder();
|
IntsRefBuilder scratch = new IntsRefBuilder();
|
||||||
for (Map.Entry<String, List<Integer>> entry : affixes.entrySet()) {
|
for (Map.Entry<String, List<Integer>> entry : affixes.entrySet()) {
|
||||||
Util.toUTF32(entry.getKey(), scratch);
|
Util.toUTF32(entry.getKey(), scratch);
|
||||||
|
@ -984,52 +999,43 @@ public class Dictionary {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private int mergeDictionaries(
|
private void mergeDictionaries(
|
||||||
List<InputStream> dictionaries, CharsetDecoder decoder, IndexOutput output)
|
List<InputStream> dictionaries, CharsetDecoder decoder, EntryAccumulator acc)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
int wordCount = 0;
|
for (InputStream dictionary : dictionaries) {
|
||||||
try (ByteSequencesWriter writer = new ByteSequencesWriter(output)) {
|
BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
|
||||||
for (InputStream dictionary : dictionaries) {
|
lines.readLine(); // first line is number of entries (approximately, sometimes)
|
||||||
BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
|
|
||||||
lines.readLine(); // first line is number of entries (approximately, sometimes)
|
|
||||||
|
|
||||||
String line;
|
String line;
|
||||||
while ((line = lines.readLine()) != null) {
|
while ((line = lines.readLine()) != null) {
|
||||||
// wild and unpredictable code comment rules
|
// wild and unpredictable code comment rules
|
||||||
if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '\t') {
|
if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '\t') {
|
||||||
continue;
|
continue;
|
||||||
}
|
|
||||||
line = unescapeEntry(line);
|
|
||||||
// if we haven't seen any custom morphological data, try to parse one
|
|
||||||
if (!hasCustomMorphData) {
|
|
||||||
int morphStart = line.indexOf(MORPH_SEPARATOR);
|
|
||||||
if (morphStart >= 0) {
|
|
||||||
String data = line.substring(morphStart + 1);
|
|
||||||
hasCustomMorphData =
|
|
||||||
splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
wordCount += writeNormalizedWordEntry(sb, writer, line);
|
|
||||||
}
|
}
|
||||||
|
line = unescapeEntry(line);
|
||||||
|
// if we haven't seen any custom morphological data, try to parse one
|
||||||
|
if (!hasCustomMorphData) {
|
||||||
|
int morphStart = line.indexOf(MORPH_SEPARATOR);
|
||||||
|
if (morphStart >= 0) {
|
||||||
|
String data = line.substring(morphStart + 1);
|
||||||
|
hasCustomMorphData = splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
writeNormalizedWordEntry(sb, line, acc);
|
||||||
}
|
}
|
||||||
CodecUtil.writeFooter(output);
|
|
||||||
}
|
}
|
||||||
return wordCount;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
private void writeNormalizedWordEntry(StringBuilder reuse, String line, EntryAccumulator acc)
|
||||||
* @return the number of word entries written
|
|
||||||
*/
|
|
||||||
private int writeNormalizedWordEntry(StringBuilder reuse, ByteSequencesWriter writer, String line)
|
|
||||||
throws IOException {
|
throws IOException {
|
||||||
int flagSep = line.indexOf(FLAG_SEPARATOR);
|
int flagSep = line.indexOf(FLAG_SEPARATOR);
|
||||||
int morphSep = line.indexOf(MORPH_SEPARATOR);
|
int morphSep = line.indexOf(MORPH_SEPARATOR);
|
||||||
assert morphSep > 0;
|
assert morphSep > 0;
|
||||||
assert morphSep > flagSep;
|
assert morphSep > flagSep;
|
||||||
int sep = flagSep < 0 ? morphSep : flagSep;
|
int sep = flagSep < 0 ? morphSep : flagSep;
|
||||||
if (sep == 0) return 0;
|
if (sep == 0) return;
|
||||||
|
|
||||||
CharSequence toWrite;
|
CharSequence toWrite;
|
||||||
String beforeSep = line.substring(0, sep);
|
String beforeSep = line.substring(0, sep);
|
||||||
|
@ -1043,19 +1049,16 @@ public class Dictionary {
|
||||||
|
|
||||||
String written = toWrite.toString();
|
String written = toWrite.toString();
|
||||||
sep = written.length() - (line.length() - sep);
|
sep = written.length() - (line.length() - sep);
|
||||||
writer.write(written.getBytes(StandardCharsets.UTF_8));
|
acc.addEntry(written);
|
||||||
|
|
||||||
WordCase wordCase = WordCase.caseOf(written, sep);
|
WordCase wordCase = WordCase.caseOf(written, sep);
|
||||||
if (wordCase == WordCase.MIXED || wordCase == WordCase.UPPER && flagSep > 0) {
|
if (wordCase == WordCase.MIXED || wordCase == WordCase.UPPER && flagSep > 0) {
|
||||||
addHiddenCapitalizedWord(reuse, writer, written.substring(0, sep), written.substring(sep));
|
addHiddenCapitalizedWord(reuse, acc, written.substring(0, sep), written.substring(sep));
|
||||||
return 2;
|
|
||||||
}
|
}
|
||||||
return 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void addHiddenCapitalizedWord(
|
private void addHiddenCapitalizedWord(
|
||||||
StringBuilder reuse, ByteSequencesWriter writer, String word, String afterSep)
|
StringBuilder reuse, EntryAccumulator acc, String word, String afterSep) throws IOException {
|
||||||
throws IOException {
|
|
||||||
reuse.setLength(0);
|
reuse.setLength(0);
|
||||||
reuse.append(Character.toUpperCase(word.charAt(0)));
|
reuse.append(Character.toUpperCase(word.charAt(0)));
|
||||||
for (int i = 1; i < word.length(); i++) {
|
for (int i = 1; i < word.length(); i++) {
|
||||||
|
@ -1064,7 +1067,7 @@ public class Dictionary {
|
||||||
reuse.append(FLAG_SEPARATOR);
|
reuse.append(FLAG_SEPARATOR);
|
||||||
reuse.append(HIDDEN_FLAG);
|
reuse.append(HIDDEN_FLAG);
|
||||||
reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length());
|
reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length());
|
||||||
writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
|
acc.addEntry(reuse.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
String toLowerCase(String word) {
|
String toLowerCase(String word) {
|
||||||
|
@ -1084,137 +1087,66 @@ public class Dictionary {
|
||||||
return new String(chars);
|
return new String(chars);
|
||||||
}
|
}
|
||||||
|
|
||||||
private String sortWordsOffline(
|
private WordStorage readSortedDictionaries(FlagEnumerator flags, EntrySupplier sorted)
|
||||||
Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException {
|
throws IOException {
|
||||||
OfflineSorter sorter =
|
|
||||||
new OfflineSorter(
|
|
||||||
tempDir,
|
|
||||||
tempFileNamePrefix,
|
|
||||||
new Comparator<>() {
|
|
||||||
final BytesRef scratch1 = new BytesRef();
|
|
||||||
final BytesRef scratch2 = new BytesRef();
|
|
||||||
|
|
||||||
private void initScratch(BytesRef o, BytesRef scratch) {
|
|
||||||
scratch.bytes = o.bytes;
|
|
||||||
scratch.offset = o.offset;
|
|
||||||
scratch.length = o.length;
|
|
||||||
|
|
||||||
for (int i = scratch.length - 1; i >= 0; i--) {
|
|
||||||
if (scratch.bytes[scratch.offset + i] == FLAG_SEPARATOR
|
|
||||||
|| scratch.bytes[scratch.offset + i] == MORPH_SEPARATOR) {
|
|
||||||
scratch.length = i;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int compare(BytesRef o1, BytesRef o2) {
|
|
||||||
initScratch(o1, scratch1);
|
|
||||||
initScratch(o2, scratch2);
|
|
||||||
|
|
||||||
int cmp = scratch1.compareTo(scratch2);
|
|
||||||
if (cmp == 0) {
|
|
||||||
// tie break on whole row
|
|
||||||
return o1.compareTo(o2);
|
|
||||||
} else {
|
|
||||||
return cmp;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
String sorted;
|
|
||||||
boolean success = false;
|
|
||||||
try {
|
|
||||||
sorted = sorter.sort(unsorted.getName());
|
|
||||||
success = true;
|
|
||||||
} finally {
|
|
||||||
if (success) {
|
|
||||||
tempDir.deleteFile(unsorted.getName());
|
|
||||||
} else {
|
|
||||||
IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return sorted;
|
|
||||||
}
|
|
||||||
|
|
||||||
private WordStorage readSortedDictionaries(
|
|
||||||
Directory tempDir, String sorted, FlagEnumerator flags, int wordCount) throws IOException {
|
|
||||||
boolean success = false;
|
|
||||||
|
|
||||||
Map<String, Integer> morphIndices = new HashMap<>();
|
Map<String, Integer> morphIndices = new HashMap<>();
|
||||||
|
|
||||||
WordStorage.Builder builder =
|
WordStorage.Builder builder =
|
||||||
new WordStorage.Builder(
|
new WordStorage.Builder(
|
||||||
wordCount, hashFactor(), hasCustomMorphData, flags, allNonSuggestibleFlags());
|
sorted.wordCount(), hashFactor(), hasCustomMorphData, flags, allNonSuggestibleFlags());
|
||||||
|
|
||||||
try (ByteSequencesReader reader =
|
// TODO: the flags themselves can be double-chars (long) or also numeric
|
||||||
new ByteSequencesReader(tempDir.openChecksumInput(sorted), sorted)) {
|
// either way the trick is to encode them as char... but they must be parsed differently
|
||||||
|
|
||||||
// TODO: the flags themselves can be double-chars (long) or also numeric
|
while (true) {
|
||||||
// either way the trick is to encode them as char... but they must be parsed differently
|
String line = sorted.next();
|
||||||
|
if (line == null) break;
|
||||||
|
|
||||||
while (true) {
|
String entry;
|
||||||
BytesRef scratch = reader.next();
|
char[] wordForm;
|
||||||
if (scratch == null) {
|
int end;
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
String line = scratch.utf8ToString();
|
int flagSep = line.indexOf(FLAG_SEPARATOR);
|
||||||
String entry;
|
if (flagSep == -1) {
|
||||||
char[] wordForm;
|
wordForm = NOFLAGS;
|
||||||
int end;
|
end = line.indexOf(MORPH_SEPARATOR);
|
||||||
|
entry = line.substring(0, end);
|
||||||
int flagSep = line.indexOf(FLAG_SEPARATOR);
|
|
||||||
if (flagSep == -1) {
|
|
||||||
wordForm = NOFLAGS;
|
|
||||||
end = line.indexOf(MORPH_SEPARATOR);
|
|
||||||
entry = line.substring(0, end);
|
|
||||||
} else {
|
|
||||||
end = line.indexOf(MORPH_SEPARATOR);
|
|
||||||
boolean hidden = line.charAt(flagSep + 1) == HIDDEN_FLAG;
|
|
||||||
String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end).strip();
|
|
||||||
if (aliasCount > 0 && !flagPart.isEmpty()) {
|
|
||||||
flagPart = getAliasValue(Integer.parseInt(flagPart));
|
|
||||||
}
|
|
||||||
|
|
||||||
wordForm = flagParsingStrategy.parseFlags(flagPart);
|
|
||||||
if (hidden) {
|
|
||||||
wordForm = ArrayUtil.growExact(wordForm, wordForm.length + 1);
|
|
||||||
wordForm[wordForm.length - 1] = HIDDEN_FLAG;
|
|
||||||
}
|
|
||||||
entry = line.substring(0, flagSep);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (entry.isEmpty()) continue;
|
|
||||||
|
|
||||||
int morphDataID = 0;
|
|
||||||
if (end + 1 < line.length()) {
|
|
||||||
List<String> morphFields = readMorphFields(entry, line.substring(end + 1));
|
|
||||||
if (!morphFields.isEmpty()) {
|
|
||||||
morphFields.sort(Comparator.naturalOrder());
|
|
||||||
morphDataID = addMorphFields(morphIndices, String.join(" ", morphFields));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
builder.add(entry, wordForm, morphDataID);
|
|
||||||
}
|
|
||||||
|
|
||||||
// finalize last entry
|
|
||||||
success = true;
|
|
||||||
return new WordStorage(builder) {
|
|
||||||
@Override
|
|
||||||
char caseFold(char c) {
|
|
||||||
return Dictionary.this.caseFold(c);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
} finally {
|
|
||||||
if (success) {
|
|
||||||
tempDir.deleteFile(sorted);
|
|
||||||
} else {
|
} else {
|
||||||
IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
|
end = line.indexOf(MORPH_SEPARATOR);
|
||||||
|
boolean hidden = line.charAt(flagSep + 1) == HIDDEN_FLAG;
|
||||||
|
String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end).strip();
|
||||||
|
if (aliasCount > 0 && !flagPart.isEmpty()) {
|
||||||
|
flagPart = getAliasValue(Integer.parseInt(flagPart));
|
||||||
|
}
|
||||||
|
|
||||||
|
wordForm = flagParsingStrategy.parseFlags(flagPart);
|
||||||
|
if (hidden) {
|
||||||
|
wordForm = ArrayUtil.growExact(wordForm, wordForm.length + 1);
|
||||||
|
wordForm[wordForm.length - 1] = HIDDEN_FLAG;
|
||||||
|
}
|
||||||
|
entry = line.substring(0, flagSep);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (entry.isEmpty()) continue;
|
||||||
|
|
||||||
|
int morphDataID = 0;
|
||||||
|
if (end + 1 < line.length()) {
|
||||||
|
List<String> morphFields = readMorphFields(entry, line.substring(end + 1));
|
||||||
|
if (!morphFields.isEmpty()) {
|
||||||
|
morphFields.sort(Comparator.naturalOrder());
|
||||||
|
morphDataID = addMorphFields(morphIndices, String.join(" ", morphFields));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
builder.add(entry, wordForm, morphDataID);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return new WordStorage(builder) {
|
||||||
|
@Override
|
||||||
|
char caseFold(char c) {
|
||||||
|
return Dictionary.this.caseFold(c);
|
||||||
|
}
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -0,0 +1,181 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
|
||||||
|
import java.io.Closeable;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.List;
|
||||||
|
import org.apache.lucene.codecs.CodecUtil;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.store.IOContext;
|
||||||
|
import org.apache.lucene.store.IndexOutput;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.BytesRefComparator;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
import org.apache.lucene.util.OfflineSorter;
|
||||||
|
import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
|
||||||
|
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The strategy defining how a Hunspell dictionary should be loaded, with different tradeoffs. The
|
||||||
|
* entries should be sorted in a special way, and this can be done either in-memory (faster, but
|
||||||
|
* temporarily allocating more memory) or using disk (slower, but not needing much memory).
|
||||||
|
*
|
||||||
|
* @see #offline(Directory, String)
|
||||||
|
* @see #inMemory()
|
||||||
|
*/
|
||||||
|
public abstract class SortingStrategy {
|
||||||
|
|
||||||
|
abstract EntryAccumulator start() throws IOException;
|
||||||
|
|
||||||
|
interface EntryAccumulator {
|
||||||
|
|
||||||
|
void addEntry(String entry) throws IOException;
|
||||||
|
|
||||||
|
EntrySupplier finishAndSort() throws IOException;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface EntrySupplier extends Closeable {
|
||||||
|
int wordCount();
|
||||||
|
|
||||||
|
/** The next line or {@code null} if the end is reached */
|
||||||
|
String next() throws IOException;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An "offline" strategy that creates temporary files in the given directory and uses them for
|
||||||
|
* sorting with {@link OfflineSorter}. It's slower than {@link #inMemory()}, but doesn't need to
|
||||||
|
* load the entire dictionary into memory.
|
||||||
|
*/
|
||||||
|
public static SortingStrategy offline(Directory tempDir, String tempFileNamePrefix) {
|
||||||
|
return new SortingStrategy() {
|
||||||
|
@Override
|
||||||
|
EntryAccumulator start() throws IOException {
|
||||||
|
IndexOutput output = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
|
||||||
|
ByteSequencesWriter writer = new ByteSequencesWriter(output);
|
||||||
|
return new EntryAccumulator() {
|
||||||
|
int wordCount = 0;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void addEntry(String entry) throws IOException {
|
||||||
|
wordCount++;
|
||||||
|
writer.write(entry.getBytes(StandardCharsets.UTF_8));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public EntrySupplier finishAndSort() throws IOException {
|
||||||
|
CodecUtil.writeFooter(output);
|
||||||
|
writer.close();
|
||||||
|
String sortedFile = sortWordsOffline();
|
||||||
|
ByteSequencesReader reader =
|
||||||
|
new ByteSequencesReader(tempDir.openChecksumInput(sortedFile), sortedFile);
|
||||||
|
return new EntrySupplier() {
|
||||||
|
boolean success = false;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int wordCount() {
|
||||||
|
return wordCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String next() throws IOException {
|
||||||
|
BytesRef scratch = reader.next();
|
||||||
|
if (scratch == null) {
|
||||||
|
success = true;
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return scratch.utf8ToString();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() throws IOException {
|
||||||
|
reader.close();
|
||||||
|
if (success) {
|
||||||
|
tempDir.deleteFile(sortedFile);
|
||||||
|
} else {
|
||||||
|
IOUtils.deleteFilesIgnoringExceptions(tempDir, sortedFile);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private String sortWordsOffline() throws IOException {
|
||||||
|
var sorter = new OfflineSorter(tempDir, tempFileNamePrefix, BytesRefComparator.NATURAL);
|
||||||
|
|
||||||
|
String sorted;
|
||||||
|
boolean success = false;
|
||||||
|
try {
|
||||||
|
sorted = sorter.sort(output.getName());
|
||||||
|
success = true;
|
||||||
|
} finally {
|
||||||
|
if (success) {
|
||||||
|
tempDir.deleteFile(output.getName());
|
||||||
|
} else {
|
||||||
|
IOUtils.deleteFilesIgnoringExceptions(tempDir, output.getName());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sorted;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The strategy that loads all entries as {@link String} objects and sorts them in memory. The
|
||||||
|
* entries are then stored in a more compressed way, and the strings are gc-ed, but the loading
|
||||||
|
* itself needs {@code O(dictionary_size)} memory.
|
||||||
|
*/
|
||||||
|
public static SortingStrategy inMemory() {
|
||||||
|
return new SortingStrategy() {
|
||||||
|
@Override
|
||||||
|
EntryAccumulator start() {
|
||||||
|
List<String> entries = new ArrayList<>();
|
||||||
|
return new EntryAccumulator() {
|
||||||
|
@Override
|
||||||
|
public void addEntry(String entry) {
|
||||||
|
entries.add(entry);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public EntrySupplier finishAndSort() {
|
||||||
|
entries.sort(Comparator.naturalOrder());
|
||||||
|
return new EntrySupplier() {
|
||||||
|
int i = 0;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int wordCount() {
|
||||||
|
return entries.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String next() {
|
||||||
|
return i < entries.size() ? entries.get(i++) : null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() {}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
|
@ -350,16 +350,19 @@ abstract class WordStorage {
|
||||||
|
|
||||||
currentOrds.clear();
|
currentOrds.clear();
|
||||||
boolean hasNonHidden = false;
|
boolean hasNonHidden = false;
|
||||||
|
boolean isSuggestible = false;
|
||||||
for (char[] flags : group) {
|
for (char[] flags : group) {
|
||||||
if (!hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
|
if (!hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
|
||||||
hasNonHidden = true;
|
hasNonHidden = true;
|
||||||
break;
|
}
|
||||||
|
if (!hasNoSuggestFlag(flags)) {
|
||||||
|
isSuggestible = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < group.size(); i++) {
|
for (int i = 0; i < group.size(); i++) {
|
||||||
char[] flags = group.get(i);
|
char[] flags = group.get(i);
|
||||||
if (hasNonHidden && hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
|
if (hasNonHidden && group.size() > 1 && hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -388,7 +391,7 @@ abstract class WordStorage {
|
||||||
|
|
||||||
int mask =
|
int mask =
|
||||||
(prevCode == 0 ? 0 : COLLISION_MASK)
|
(prevCode == 0 ? 0 : COLLISION_MASK)
|
||||||
| (group.stream().anyMatch(flags -> !hasNoSuggestFlag(flags)) ? SUGGESTIBLE_MASK : 0)
|
| (isSuggestible ? SUGGESTIBLE_MASK : 0)
|
||||||
| Math.min(currentEntry.length(), MAX_STORED_LENGTH);
|
| Math.min(currentEntry.length(), MAX_STORED_LENGTH);
|
||||||
hashTable[hash] = (mask << OFFSET_BITS) | pos;
|
hashTable[hash] = (mask << OFFSET_BITS) | pos;
|
||||||
|
|
||||||
|
|
|
@ -210,7 +210,8 @@ public final class StemmerOverrideFilter extends TokenFilter {
|
||||||
*/
|
*/
|
||||||
public StemmerOverrideMap build() throws IOException {
|
public StemmerOverrideMap build() throws IOException {
|
||||||
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
||||||
FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
|
FSTCompiler<BytesRef> fstCompiler =
|
||||||
|
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).build();
|
||||||
final int[] sort = hash.sort();
|
final int[] sort = hash.sort();
|
||||||
IntsRefBuilder intsSpare = new IntsRefBuilder();
|
IntsRefBuilder intsSpare = new IntsRefBuilder();
|
||||||
final int size = hash.size();
|
final int size = hash.size();
|
||||||
|
|
|
@ -46,11 +46,11 @@ public class TruncateTokenFilterFactory extends TokenFilterFactory {
|
||||||
public static final String NAME = "truncate";
|
public static final String NAME = "truncate";
|
||||||
|
|
||||||
public static final String PREFIX_LENGTH_KEY = "prefixLength";
|
public static final String PREFIX_LENGTH_KEY = "prefixLength";
|
||||||
private final byte prefixLength;
|
private final int prefixLength;
|
||||||
|
|
||||||
public TruncateTokenFilterFactory(Map<String, String> args) {
|
public TruncateTokenFilterFactory(Map<String, String> args) {
|
||||||
super(args);
|
super(args);
|
||||||
prefixLength = Byte.parseByte(get(args, PREFIX_LENGTH_KEY, "5"));
|
prefixLength = Integer.parseInt(get(args, PREFIX_LENGTH_KEY, "5"));
|
||||||
if (prefixLength < 1)
|
if (prefixLength < 1)
|
||||||
throw new IllegalArgumentException(
|
throw new IllegalArgumentException(
|
||||||
PREFIX_LENGTH_KEY + " parameter must be a positive number: " + prefixLength);
|
PREFIX_LENGTH_KEY + " parameter must be a positive number: " + prefixLength);
|
||||||
|
|
|
@ -163,7 +163,6 @@ public final class WordDelimiterFilter extends TokenFilter {
|
||||||
|
|
||||||
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
|
||||||
private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
|
private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
|
||||||
;
|
|
||||||
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
|
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
|
||||||
private final PositionIncrementAttribute posIncAttribute =
|
private final PositionIncrementAttribute posIncAttribute =
|
||||||
addAttribute(PositionIncrementAttribute.class);
|
addAttribute(PositionIncrementAttribute.class);
|
||||||
|
|
|
@ -164,7 +164,6 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
|
||||||
|
|
||||||
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
|
||||||
private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
|
private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
|
||||||
;
|
|
||||||
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
|
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
|
||||||
private final PositionIncrementAttribute posIncAttribute =
|
private final PositionIncrementAttribute posIncAttribute =
|
||||||
addAttribute(PositionIncrementAttribute.class);
|
addAttribute(PositionIncrementAttribute.class);
|
||||||
|
|
|
@ -45,7 +45,7 @@ import org.apache.lucene.util.ResourceLoaderAware;
|
||||||
* preserveOriginal="0" splitOnNumerics="1" splitOnCaseChange="1"
|
* preserveOriginal="0" splitOnNumerics="1" splitOnCaseChange="1"
|
||||||
* catenateWords="0" catenateNumbers="0" catenateAll="0"
|
* catenateWords="0" catenateNumbers="0" catenateAll="0"
|
||||||
* generateWordParts="1" generateNumberParts="1" stemEnglishPossessive="1"
|
* generateWordParts="1" generateNumberParts="1" stemEnglishPossessive="1"
|
||||||
* types="wdfftypes.txt" />
|
* types="wdfftypes.txt" ignoreKeywords="0" />
|
||||||
* </analyzer>
|
* </analyzer>
|
||||||
* </fieldType></pre>
|
* </fieldType></pre>
|
||||||
*
|
*
|
||||||
|
@ -100,6 +100,9 @@ public class WordDelimiterGraphFilterFactory extends TokenFilterFactory
|
||||||
if (getInt(args, "stemEnglishPossessive", 1) != 0) {
|
if (getInt(args, "stemEnglishPossessive", 1) != 0) {
|
||||||
flags |= STEM_ENGLISH_POSSESSIVE;
|
flags |= STEM_ENGLISH_POSSESSIVE;
|
||||||
}
|
}
|
||||||
|
if (getInt(args, "ignoreKeywords", 0) != 0) {
|
||||||
|
flags |= IGNORE_KEYWORDS;
|
||||||
|
}
|
||||||
wordFiles = get(args, PROTECTED_TOKENS);
|
wordFiles = get(args, PROTECTED_TOKENS);
|
||||||
types = get(args, TYPES);
|
types = get(args, TYPES);
|
||||||
this.flags = flags;
|
this.flags = flags;
|
||||||
|
|
|
@ -216,7 +216,6 @@ public final class SynonymFilter extends TokenFilter {
|
||||||
count++;
|
count++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
;
|
|
||||||
|
|
||||||
private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
|
private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
|
||||||
|
|
||||||
|
|
|
@ -222,7 +222,8 @@ public class SynonymMap {
|
||||||
public SynonymMap build() throws IOException {
|
public SynonymMap build() throws IOException {
|
||||||
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
||||||
// TODO: are we using the best sharing options?
|
// TODO: are we using the best sharing options?
|
||||||
FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
|
FSTCompiler<BytesRef> fstCompiler =
|
||||||
|
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).build();
|
||||||
|
|
||||||
BytesRefBuilder scratch = new BytesRefBuilder();
|
BytesRefBuilder scratch = new BytesRefBuilder();
|
||||||
ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
|
ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
|
||||||
|
|
|
@ -595,8 +595,7 @@ public class TestHTMLStripCharFilter extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Reader reader = new HTMLStripCharFilter(new StringReader(text.toString()));
|
Reader reader = new HTMLStripCharFilter(new StringReader(text.toString()));
|
||||||
while (reader.read() != -1)
|
while (reader.read() != -1) {}
|
||||||
;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testUTF16Surrogates() throws Exception {
|
public void testUTF16Surrogates() throws Exception {
|
||||||
|
|
|
@ -230,7 +230,6 @@ public class TestDuelingAnalyzers extends BaseTokenStreamTestCase {
|
||||||
assertEquals(
|
assertEquals(
|
||||||
"wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
|
"wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
|
||||||
}
|
}
|
||||||
;
|
|
||||||
assertFalse("wrong number of tokens for input: " + s, right.incrementToken());
|
assertFalse("wrong number of tokens for input: " + s, right.incrementToken());
|
||||||
left.end();
|
left.end();
|
||||||
right.end();
|
right.end();
|
||||||
|
|
|
@ -41,7 +41,6 @@ import java.util.concurrent.atomic.AtomicLong;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
import org.apache.lucene.tests.store.BaseDirectoryWrapper;
|
|
||||||
import org.apache.lucene.tests.util.LuceneTestCase;
|
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||||
import org.apache.lucene.tests.util.LuceneTestCase.SuppressSysoutChecks;
|
import org.apache.lucene.tests.util.LuceneTestCase.SuppressSysoutChecks;
|
||||||
import org.apache.lucene.tests.util.RamUsageTester;
|
import org.apache.lucene.tests.util.RamUsageTester;
|
||||||
|
@ -72,9 +71,8 @@ public class TestAllDictionaries extends LuceneTestCase {
|
||||||
Path dic = Path.of(affPath.substring(0, affPath.length() - 4) + ".dic");
|
Path dic = Path.of(affPath.substring(0, affPath.length() - 4) + ".dic");
|
||||||
assert Files.exists(dic) : dic;
|
assert Files.exists(dic) : dic;
|
||||||
try (InputStream dictionary = Files.newInputStream(dic);
|
try (InputStream dictionary = Files.newInputStream(dic);
|
||||||
InputStream affix = Files.newInputStream(aff);
|
InputStream affix = Files.newInputStream(aff)) {
|
||||||
BaseDirectoryWrapper tempDir = newDirectory()) {
|
return new Dictionary(affix, List.of(dictionary), false, SortingStrategy.inMemory()) {
|
||||||
return new Dictionary(tempDir, "dictionary", affix, dictionary) {
|
|
||||||
@Override
|
@Override
|
||||||
protected boolean tolerateAffixRuleCountMismatches() {
|
protected boolean tolerateAffixRuleCountMismatches() {
|
||||||
return true;
|
return true;
|
||||||
|
|
|
@ -256,15 +256,22 @@ public class TestSpellChecking extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
static void checkSpellCheckerExpectations(Path basePath) throws IOException, ParseException {
|
static void checkSpellCheckerExpectations(Path basePath) throws IOException, ParseException {
|
||||||
InputStream affixStream = Files.newInputStream(Path.of(basePath.toString() + ".aff"));
|
checkSpellCheckerExpectations(
|
||||||
|
basePath, SortingStrategy.offline(new ByteBuffersDirectory(), "dictionary"));
|
||||||
|
checkSpellCheckerExpectations(basePath, SortingStrategy.inMemory());
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void checkSpellCheckerExpectations(Path basePath, SortingStrategy strategy)
|
||||||
|
throws IOException, ParseException {
|
||||||
|
Path affFile = Path.of(basePath + ".aff");
|
||||||
Path dicFile = Path.of(basePath + ".dic");
|
Path dicFile = Path.of(basePath + ".dic");
|
||||||
|
InputStream affixStream = Files.newInputStream(affFile);
|
||||||
InputStream dictStream = Files.newInputStream(dicFile);
|
InputStream dictStream = Files.newInputStream(dicFile);
|
||||||
|
|
||||||
Hunspell speller;
|
Hunspell speller;
|
||||||
Map<String, Suggester> suggesters = new LinkedHashMap<>();
|
Map<String, Suggester> suggesters = new LinkedHashMap<>();
|
||||||
try {
|
try {
|
||||||
Dictionary dictionary =
|
Dictionary dictionary = new Dictionary(affixStream, List.of(dictStream), false, strategy);
|
||||||
new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream);
|
|
||||||
speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
|
speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
|
||||||
Suggester suggester = new Suggester(dictionary);
|
Suggester suggester = new Suggester(dictionary);
|
||||||
suggesters.put("default", suggester);
|
suggesters.put("default", suggester);
|
||||||
|
|
|
@ -41,7 +41,6 @@ public class TestIndicNormalizer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
private void check(String input, String output) throws IOException {
|
private void check(String input, String output) throws IOException {
|
||||||
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||||
;
|
|
||||||
tokenizer.setReader(new StringReader(input));
|
tokenizer.setReader(new StringReader(input));
|
||||||
TokenFilter tf = new IndicNormalizationFilter(tokenizer);
|
TokenFilter tf = new IndicNormalizationFilter(tokenizer);
|
||||||
assertTokenStreamContents(tf, new String[] {output});
|
assertTokenStreamContents(tf, new String[] {output});
|
||||||
|
|
|
@ -89,7 +89,6 @@ public class TestKeywordMarkerFilterFactory extends BaseTokenStreamFactoryTestCa
|
||||||
stream =
|
stream =
|
||||||
tokenFilterFactory("KeywordMarker", "pattern", "Cats", "ignoreCase", "true").create(stream);
|
tokenFilterFactory("KeywordMarker", "pattern", "Cats", "ignoreCase", "true").create(stream);
|
||||||
stream = tokenFilterFactory("PorterStem").create(stream);
|
stream = tokenFilterFactory("PorterStem").create(stream);
|
||||||
;
|
|
||||||
assertTokenStreamContents(stream, new String[] {"dog", "cats", "Cats"});
|
assertTokenStreamContents(stream, new String[] {"dog", "cats", "Cats"});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -68,4 +68,23 @@ public class TestTruncateTokenFilterFactory extends BaseTokenStreamFactoryTestCa
|
||||||
TruncateTokenFilterFactory.PREFIX_LENGTH_KEY
|
TruncateTokenFilterFactory.PREFIX_LENGTH_KEY
|
||||||
+ " parameter must be a positive number: -5"));
|
+ " parameter must be a positive number: -5"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Test that takes length greater than byte limit accepts it */
|
||||||
|
public void testLengthGreaterThanByteLimitArgument() throws Exception {
|
||||||
|
Reader reader =
|
||||||
|
new StringReader(
|
||||||
|
"abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvw128characters From here");
|
||||||
|
TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||||
|
((Tokenizer) stream).setReader(reader);
|
||||||
|
stream =
|
||||||
|
tokenFilterFactory("Truncate", TruncateTokenFilterFactory.PREFIX_LENGTH_KEY, "128")
|
||||||
|
.create(stream);
|
||||||
|
assertTokenStreamContents(
|
||||||
|
stream,
|
||||||
|
new String[] {
|
||||||
|
"abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvw1",
|
||||||
|
"From",
|
||||||
|
"here"
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -69,7 +69,6 @@ public class TestEdgeNGramTokenizer extends BaseTokenStreamTestCase {
|
||||||
public void testOversizedNgrams() throws Exception {
|
public void testOversizedNgrams() throws Exception {
|
||||||
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(6, 6);
|
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(6, 6);
|
||||||
tokenizer.setReader(input);
|
tokenizer.setReader(input);
|
||||||
;
|
|
||||||
assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
|
assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -156,7 +156,6 @@ public class TestCharArrayIterator extends LuceneTestCase {
|
||||||
|
|
||||||
private void consume(BreakIterator bi, CharacterIterator ci) {
|
private void consume(BreakIterator bi, CharacterIterator ci) {
|
||||||
bi.setText(ci);
|
bi.setText(ci);
|
||||||
while (bi.next() != BreakIterator.DONE)
|
while (bi.next() != BreakIterator.DONE) {}
|
||||||
;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,6 +16,8 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.analysis.ja.dict;
|
package org.apache.lucene.analysis.ja.dict;
|
||||||
|
|
||||||
|
import static org.apache.lucene.util.fst.FST.readMetadata;
|
||||||
|
|
||||||
import java.io.BufferedInputStream;
|
import java.io.BufferedInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
@ -103,7 +105,7 @@ public final class TokenInfoDictionary extends BinaryDictionary<TokenInfoMorphDa
|
||||||
FST<Long> fst;
|
FST<Long> fst;
|
||||||
try (InputStream is = new BufferedInputStream(fstResource.get())) {
|
try (InputStream is = new BufferedInputStream(fstResource.get())) {
|
||||||
DataInput in = new InputStreamDataInput(is);
|
DataInput in = new InputStreamDataInput(is);
|
||||||
fst = new FST<>(in, in, PositiveIntOutputs.getSingleton());
|
fst = new FST<>(readMetadata(in, PositiveIntOutputs.getSingleton()), in);
|
||||||
}
|
}
|
||||||
// TODO: some way to configure?
|
// TODO: some way to configure?
|
||||||
this.fst = new TokenInfoFST(fst, true);
|
this.fst = new TokenInfoFST(fst, true);
|
||||||
|
|
|
@ -101,7 +101,8 @@ class TokenInfoDictionaryBuilder {
|
||||||
lines.sort(Comparator.comparing(entry -> entry[0]));
|
lines.sort(Comparator.comparing(entry -> entry[0]));
|
||||||
|
|
||||||
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
|
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
|
||||||
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
|
FSTCompiler<Long> fstCompiler =
|
||||||
|
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput).build();
|
||||||
IntsRefBuilder scratch = new IntsRefBuilder();
|
IntsRefBuilder scratch = new IntsRefBuilder();
|
||||||
long ord = -1; // first ord will be 0
|
long ord = -1; // first ord will be 0
|
||||||
String lastValue = null;
|
String lastValue = null;
|
||||||
|
|
|
@ -93,7 +93,8 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
|
||||||
List<int[]> segmentations = new ArrayList<>(featureEntries.size());
|
List<int[]> segmentations = new ArrayList<>(featureEntries.size());
|
||||||
|
|
||||||
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
|
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
|
||||||
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
|
FSTCompiler<Long> fstCompiler =
|
||||||
|
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput).build();
|
||||||
IntsRefBuilder scratch = new IntsRefBuilder();
|
IntsRefBuilder scratch = new IntsRefBuilder();
|
||||||
long ord = 0;
|
long ord = 0;
|
||||||
|
|
||||||
|
|
|
@ -758,8 +758,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
|
||||||
for (int i = 0; i < numIterations; i++) {
|
for (int i = 0; i < numIterations; i++) {
|
||||||
try (TokenStream ts = analyzer.tokenStream("ignored", line)) {
|
try (TokenStream ts = analyzer.tokenStream("ignored", line)) {
|
||||||
ts.reset();
|
ts.reset();
|
||||||
while (ts.incrementToken())
|
while (ts.incrementToken()) {}
|
||||||
;
|
|
||||||
ts.end();
|
ts.end();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -775,8 +774,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
|
||||||
for (String sentence : sentences) {
|
for (String sentence : sentences) {
|
||||||
try (TokenStream ts = analyzer.tokenStream("ignored", sentence)) {
|
try (TokenStream ts = analyzer.tokenStream("ignored", sentence)) {
|
||||||
ts.reset();
|
ts.reset();
|
||||||
while (ts.incrementToken())
|
while (ts.incrementToken()) {}
|
||||||
;
|
|
||||||
ts.end();
|
ts.end();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -831,8 +829,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
|
||||||
new JapaneseTokenizer(newAttributeFactory(), readDict(), false, Mode.NORMAL);
|
new JapaneseTokenizer(newAttributeFactory(), readDict(), false, Mode.NORMAL);
|
||||||
tokenizer.setReader(new StringReader(doc));
|
tokenizer.setReader(new StringReader(doc));
|
||||||
tokenizer.reset();
|
tokenizer.reset();
|
||||||
while (tokenizer.incrementToken())
|
while (tokenizer.incrementToken()) {}
|
||||||
;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testPatchedSystemDict() throws Exception {
|
public void testPatchedSystemDict() throws Exception {
|
||||||
|
|
|
@ -16,6 +16,8 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.analysis.ko.dict;
|
package org.apache.lucene.analysis.ko.dict;
|
||||||
|
|
||||||
|
import static org.apache.lucene.util.fst.FST.readMetadata;
|
||||||
|
|
||||||
import java.io.BufferedInputStream;
|
import java.io.BufferedInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
@ -102,7 +104,7 @@ public final class TokenInfoDictionary extends BinaryDictionary<TokenInfoMorphDa
|
||||||
FST<Long> fst;
|
FST<Long> fst;
|
||||||
try (InputStream is = new BufferedInputStream(fstResource.get())) {
|
try (InputStream is = new BufferedInputStream(fstResource.get())) {
|
||||||
DataInput in = new InputStreamDataInput(is);
|
DataInput in = new InputStreamDataInput(is);
|
||||||
fst = new FST<>(in, in, PositiveIntOutputs.getSingleton());
|
fst = new FST<>(readMetadata(in, PositiveIntOutputs.getSingleton()), in);
|
||||||
}
|
}
|
||||||
this.fst = new TokenInfoFST(fst);
|
this.fst = new TokenInfoFST(fst);
|
||||||
}
|
}
|
||||||
|
|
|
@ -94,7 +94,8 @@ class TokenInfoDictionaryBuilder {
|
||||||
lines.sort(Comparator.comparing(left -> left[0]));
|
lines.sort(Comparator.comparing(left -> left[0]));
|
||||||
|
|
||||||
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
|
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
|
||||||
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
|
FSTCompiler<Long> fstCompiler =
|
||||||
|
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput).build();
|
||||||
IntsRefBuilder scratch = new IntsRefBuilder();
|
IntsRefBuilder scratch = new IntsRefBuilder();
|
||||||
long ord = -1; // first ord will be 0
|
long ord = -1; // first ord will be 0
|
||||||
String lastValue = null;
|
String lastValue = null;
|
||||||
|
|
|
@ -75,7 +75,8 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
|
||||||
entries.sort(Comparator.comparing(e -> e.split("\\s+")[0]));
|
entries.sort(Comparator.comparing(e -> e.split("\\s+")[0]));
|
||||||
|
|
||||||
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
|
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
|
||||||
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
|
FSTCompiler<Long> fstCompiler =
|
||||||
|
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput).build();
|
||||||
IntsRefBuilder scratch = new IntsRefBuilder();
|
IntsRefBuilder scratch = new IntsRefBuilder();
|
||||||
|
|
||||||
String lastToken = null;
|
String lastToken = null;
|
||||||
|
|
|
@ -41,7 +41,6 @@ public class TestPolishAnalyzer extends BaseTokenStreamTestCase {
|
||||||
/** test use of exclusion set */
|
/** test use of exclusion set */
|
||||||
public void testExclude() throws IOException {
|
public void testExclude() throws IOException {
|
||||||
CharArraySet exclusionSet = new CharArraySet(asSet("studenta"), false);
|
CharArraySet exclusionSet = new CharArraySet(asSet("studenta"), false);
|
||||||
;
|
|
||||||
Analyzer a = new PolishAnalyzer(PolishAnalyzer.getDefaultStopSet(), exclusionSet);
|
Analyzer a = new PolishAnalyzer(PolishAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
checkOneTerm(a, "studenta", "studenta");
|
checkOneTerm(a, "studenta", "studenta");
|
||||||
checkOneTerm(a, "studenci", "student");
|
checkOneTerm(a, "studenci", "student");
|
||||||
|
|
|
@ -16,6 +16,8 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.backward_codecs.lucene40.blocktree;
|
package org.apache.lucene.backward_codecs.lucene40.blocktree;
|
||||||
|
|
||||||
|
import static org.apache.lucene.util.fst.FST.readMetadata;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import org.apache.lucene.index.FieldInfo;
|
import org.apache.lucene.index.FieldInfo;
|
||||||
import org.apache.lucene.index.IndexOptions;
|
import org.apache.lucene.index.IndexOptions;
|
||||||
|
@ -89,9 +91,17 @@ public final class FieldReader extends Terms {
|
||||||
final IndexInput clone = indexIn.clone();
|
final IndexInput clone = indexIn.clone();
|
||||||
clone.seek(indexStartFP);
|
clone.seek(indexStartFP);
|
||||||
if (metaIn == indexIn) { // Only true before Lucene 8.6
|
if (metaIn == indexIn) { // Only true before Lucene 8.6
|
||||||
index = new FST<>(clone, clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore());
|
index =
|
||||||
|
new FST<>(
|
||||||
|
readMetadata(clone, ByteSequenceOutputs.getSingleton()),
|
||||||
|
clone,
|
||||||
|
new OffHeapFSTStore());
|
||||||
} else {
|
} else {
|
||||||
index = new FST<>(metaIn, clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore());
|
index =
|
||||||
|
new FST<>(
|
||||||
|
readMetadata(metaIn, ByteSequenceOutputs.getSingleton()),
|
||||||
|
clone,
|
||||||
|
new OffHeapFSTStore());
|
||||||
}
|
}
|
||||||
/*
|
/*
|
||||||
if (false) {
|
if (false) {
|
||||||
|
|
|
@ -22,6 +22,7 @@ import java.nio.file.Path;
|
||||||
import java.nio.file.Paths;
|
import java.nio.file.Paths;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.IntPoint;
|
import org.apache.lucene.document.IntPoint;
|
||||||
|
import org.apache.lucene.index.CheckIndex;
|
||||||
import org.apache.lucene.index.IndexWriter;
|
import org.apache.lucene.index.IndexWriter;
|
||||||
import org.apache.lucene.index.IndexWriterConfig;
|
import org.apache.lucene.index.IndexWriterConfig;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
|
@ -70,7 +71,7 @@ public class TestManyPointsInOldIndex extends LuceneTestCase {
|
||||||
dir.setCheckIndexOnClose(false);
|
dir.setCheckIndexOnClose(false);
|
||||||
|
|
||||||
// ... because we check ourselves here:
|
// ... because we check ourselves here:
|
||||||
TestUtil.checkIndex(dir, false, true, true, null);
|
TestUtil.checkIndex(dir, CheckIndex.Level.MIN_LEVEL_FOR_INTEGRITY_CHECKS, true, true, null);
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,6 +23,7 @@ description = 'Lucene JMH micro-benchmarking module'
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
moduleImplementation project(':lucene:core')
|
moduleImplementation project(':lucene:core')
|
||||||
|
moduleImplementation project(':lucene:expressions')
|
||||||
|
|
||||||
moduleImplementation "org.openjdk.jmh:jmh-core:1.37"
|
moduleImplementation "org.openjdk.jmh:jmh-core:1.37"
|
||||||
annotationProcessor "org.openjdk.jmh:jmh-generator-annprocess:1.37"
|
annotationProcessor "org.openjdk.jmh:jmh-generator-annprocess:1.37"
|
||||||
|
@ -42,7 +43,7 @@ tasks.matching { it.name == "forbiddenApisMain" }.configureEach {
|
||||||
tasks.matching { it.name in [
|
tasks.matching { it.name in [
|
||||||
// Turn off JMH dependency checksums and licensing (it's GPL w/ classpath exception
|
// Turn off JMH dependency checksums and licensing (it's GPL w/ classpath exception
|
||||||
// but this seems fine for test/build only tools).
|
// but this seems fine for test/build only tools).
|
||||||
"validateJarChecksums", "validateJarLicenses",
|
"validateJarChecksums", "validateJarLicenses", "collectJarInfos",
|
||||||
// No special javadocs for JMH benchmarks.
|
// No special javadocs for JMH benchmarks.
|
||||||
"renderSiteJavadoc",
|
"renderSiteJavadoc",
|
||||||
"renderJavadoc",
|
"renderJavadoc",
|
||||||
|
|
|
@ -20,6 +20,7 @@ module org.apache.lucene.benchmark.jmh {
|
||||||
requires jmh.core;
|
requires jmh.core;
|
||||||
requires jdk.unsupported;
|
requires jdk.unsupported;
|
||||||
requires org.apache.lucene.core;
|
requires org.apache.lucene.core;
|
||||||
|
requires org.apache.lucene.expressions;
|
||||||
|
|
||||||
exports org.apache.lucene.benchmark.jmh;
|
exports org.apache.lucene.benchmark.jmh;
|
||||||
exports org.apache.lucene.benchmark.jmh.jmh_generated;
|
exports org.apache.lucene.benchmark.jmh.jmh_generated;
|
||||||
|
|
|
@ -0,0 +1,148 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.benchmark.jmh;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.lang.invoke.MethodHandle;
|
||||||
|
import java.lang.invoke.MethodHandles;
|
||||||
|
import java.lang.invoke.MethodType;
|
||||||
|
import java.text.ParseException;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.concurrent.ThreadLocalRandom;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import org.apache.lucene.expressions.Expression;
|
||||||
|
import org.apache.lucene.expressions.js.JavascriptCompiler;
|
||||||
|
import org.apache.lucene.search.DoubleValues;
|
||||||
|
import org.openjdk.jmh.annotations.Benchmark;
|
||||||
|
import org.openjdk.jmh.annotations.BenchmarkMode;
|
||||||
|
import org.openjdk.jmh.annotations.Fork;
|
||||||
|
import org.openjdk.jmh.annotations.Level;
|
||||||
|
import org.openjdk.jmh.annotations.Measurement;
|
||||||
|
import org.openjdk.jmh.annotations.Mode;
|
||||||
|
import org.openjdk.jmh.annotations.OutputTimeUnit;
|
||||||
|
import org.openjdk.jmh.annotations.Param;
|
||||||
|
import org.openjdk.jmh.annotations.Scope;
|
||||||
|
import org.openjdk.jmh.annotations.Setup;
|
||||||
|
import org.openjdk.jmh.annotations.State;
|
||||||
|
import org.openjdk.jmh.annotations.Warmup;
|
||||||
|
|
||||||
|
@BenchmarkMode(Mode.Throughput)
|
||||||
|
@OutputTimeUnit(TimeUnit.MILLISECONDS)
|
||||||
|
@State(Scope.Benchmark)
|
||||||
|
@Warmup(iterations = 5, time = 5)
|
||||||
|
@Measurement(iterations = 12, time = 8)
|
||||||
|
@Fork(value = 1)
|
||||||
|
public class ExpressionsBenchmark {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Some extra functions to bench "identity" in various variants, another one is named
|
||||||
|
* "native_identity" (see below).
|
||||||
|
*/
|
||||||
|
private static final Map<String, MethodHandle> FUNCTIONS = getFunctions();
|
||||||
|
|
||||||
|
private static final String NATIVE_IDENTITY_NAME = "native_identity";
|
||||||
|
|
||||||
|
private static Map<String, MethodHandle> getFunctions() {
|
||||||
|
try {
|
||||||
|
var lookup = MethodHandles.lookup();
|
||||||
|
Map<String, MethodHandle> m = new HashMap<>(JavascriptCompiler.DEFAULT_FUNCTIONS);
|
||||||
|
m.put(
|
||||||
|
"func_identity",
|
||||||
|
lookup.findStatic(
|
||||||
|
lookup.lookupClass(), "ident", MethodType.methodType(double.class, double.class)));
|
||||||
|
m.put("mh_identity", MethodHandles.identity(double.class));
|
||||||
|
return m;
|
||||||
|
} catch (ReflectiveOperationException e) {
|
||||||
|
throw new AssertionError(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings("unused")
|
||||||
|
private static double ident(double v) {
|
||||||
|
return v;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** A native implementation of an expression to compare performance */
|
||||||
|
private static final Expression NATIVE_IDENTITY_EXPRESSION =
|
||||||
|
new Expression(NATIVE_IDENTITY_NAME, new String[] {"x"}) {
|
||||||
|
@Override
|
||||||
|
public double evaluate(DoubleValues[] functionValues) throws IOException {
|
||||||
|
return functionValues[0].doubleValue();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
private double[] randomData;
|
||||||
|
private Expression expression;
|
||||||
|
|
||||||
|
@Param({"x", "func_identity(x)", "mh_identity", "native_identity", "cos(x)", "cos(x) + sin(x)"})
|
||||||
|
String js;
|
||||||
|
|
||||||
|
@Setup(Level.Iteration)
|
||||||
|
public void init() throws ParseException {
|
||||||
|
ThreadLocalRandom random = ThreadLocalRandom.current();
|
||||||
|
randomData = random.doubles().limit(1024).toArray();
|
||||||
|
expression =
|
||||||
|
Objects.equals(js, NATIVE_IDENTITY_NAME)
|
||||||
|
? NATIVE_IDENTITY_EXPRESSION
|
||||||
|
: JavascriptCompiler.compile(js, FUNCTIONS);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public double expression() throws IOException {
|
||||||
|
var it = new ValuesIterator(randomData);
|
||||||
|
var values = it.getDoubleValues();
|
||||||
|
double result = 0d;
|
||||||
|
while (it.next()) {
|
||||||
|
result += expression.evaluate(values);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
static final class ValuesIterator {
|
||||||
|
final double[] data;
|
||||||
|
final DoubleValues[] dv;
|
||||||
|
int pos = -1;
|
||||||
|
|
||||||
|
ValuesIterator(double[] data) {
|
||||||
|
this.data = data;
|
||||||
|
var dv =
|
||||||
|
new DoubleValues() {
|
||||||
|
@Override
|
||||||
|
public double doubleValue() throws IOException {
|
||||||
|
return data[pos];
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean advanceExact(int doc) throws IOException {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
this.dv = new DoubleValues[] {dv};
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean next() {
|
||||||
|
pos++;
|
||||||
|
return (pos < data.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
DoubleValues[] getDoubleValues() {
|
||||||
|
return dv;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,176 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.benchmark.jmh;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Random;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import org.apache.lucene.codecs.lucene99.GroupVIntReader;
|
||||||
|
import org.apache.lucene.codecs.lucene99.GroupVIntWriter;
|
||||||
|
import org.apache.lucene.store.ByteArrayDataInput;
|
||||||
|
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.store.IOContext;
|
||||||
|
import org.apache.lucene.store.IndexInput;
|
||||||
|
import org.apache.lucene.store.IndexOutput;
|
||||||
|
import org.apache.lucene.store.MMapDirectory;
|
||||||
|
import org.openjdk.jmh.annotations.Benchmark;
|
||||||
|
import org.openjdk.jmh.annotations.BenchmarkMode;
|
||||||
|
import org.openjdk.jmh.annotations.Fork;
|
||||||
|
import org.openjdk.jmh.annotations.Level;
|
||||||
|
import org.openjdk.jmh.annotations.Measurement;
|
||||||
|
import org.openjdk.jmh.annotations.Mode;
|
||||||
|
import org.openjdk.jmh.annotations.OutputTimeUnit;
|
||||||
|
import org.openjdk.jmh.annotations.Param;
|
||||||
|
import org.openjdk.jmh.annotations.Scope;
|
||||||
|
import org.openjdk.jmh.annotations.Setup;
|
||||||
|
import org.openjdk.jmh.annotations.State;
|
||||||
|
import org.openjdk.jmh.annotations.Warmup;
|
||||||
|
import org.openjdk.jmh.infra.Blackhole;
|
||||||
|
|
||||||
|
@BenchmarkMode(Mode.Throughput)
|
||||||
|
@OutputTimeUnit(TimeUnit.MICROSECONDS)
|
||||||
|
@State(Scope.Benchmark)
|
||||||
|
@Warmup(iterations = 3, time = 3)
|
||||||
|
@Measurement(iterations = 5, time = 5)
|
||||||
|
@Fork(
|
||||||
|
value = 1,
|
||||||
|
jvmArgsPrepend = {"--add-modules=jdk.unsupported"})
|
||||||
|
public class GroupVIntBenchmark {
|
||||||
|
|
||||||
|
// Cumulative frequency for each number of bits per value used by doc deltas of tail postings on
|
||||||
|
// wikibigall.
|
||||||
|
private static final float[] CUMULATIVE_FREQUENCY_BY_BITS_REQUIRED =
|
||||||
|
new float[] {
|
||||||
|
0.0f,
|
||||||
|
0.01026574f,
|
||||||
|
0.021453038f,
|
||||||
|
0.03342156f,
|
||||||
|
0.046476692f,
|
||||||
|
0.060890317f,
|
||||||
|
0.07644147f,
|
||||||
|
0.093718216f,
|
||||||
|
0.11424741f,
|
||||||
|
0.13989712f,
|
||||||
|
0.17366524f,
|
||||||
|
0.22071244f,
|
||||||
|
0.2815692f,
|
||||||
|
0.3537585f,
|
||||||
|
0.43655503f,
|
||||||
|
0.52308f,
|
||||||
|
0.6104675f,
|
||||||
|
0.7047371f,
|
||||||
|
0.78155357f,
|
||||||
|
0.8671179f,
|
||||||
|
0.9740598f,
|
||||||
|
1.0f
|
||||||
|
};
|
||||||
|
|
||||||
|
final int maxSize = 256;
|
||||||
|
final long[] values = new long[maxSize];
|
||||||
|
|
||||||
|
IndexInput byteBufferGVIntIn;
|
||||||
|
IndexInput byteBufferVIntIn;
|
||||||
|
|
||||||
|
ByteArrayDataInput byteArrayVIntIn;
|
||||||
|
ByteArrayDataInput byteArrayGVIntIn;
|
||||||
|
|
||||||
|
// @Param({"16", "32", "64", "128", "248"})
|
||||||
|
@Param({"64"})
|
||||||
|
public int size;
|
||||||
|
|
||||||
|
void initArrayInput(long[] docs) throws Exception {
|
||||||
|
byte[] gVIntBytes = new byte[Integer.BYTES * maxSize * 2];
|
||||||
|
byte[] vIntBytes = new byte[Integer.BYTES * maxSize * 2];
|
||||||
|
ByteArrayDataOutput vIntOut = new ByteArrayDataOutput(vIntBytes);
|
||||||
|
GroupVIntWriter w = new GroupVIntWriter();
|
||||||
|
w.writeValues(new ByteArrayDataOutput(gVIntBytes), docs, docs.length);
|
||||||
|
for (long v : docs) {
|
||||||
|
vIntOut.writeVInt((int) v);
|
||||||
|
}
|
||||||
|
byteArrayVIntIn = new ByteArrayDataInput(vIntBytes);
|
||||||
|
byteArrayGVIntIn = new ByteArrayDataInput(gVIntBytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
void initByteBufferInput(long[] docs) throws Exception {
|
||||||
|
Directory dir = MMapDirectory.open(Files.createTempDirectory("groupvintdata"));
|
||||||
|
IndexOutput vintOut = dir.createOutput("vint", IOContext.DEFAULT);
|
||||||
|
IndexOutput gvintOut = dir.createOutput("gvint", IOContext.DEFAULT);
|
||||||
|
|
||||||
|
GroupVIntWriter w = new GroupVIntWriter();
|
||||||
|
w.writeValues(gvintOut, docs, docs.length);
|
||||||
|
for (long v : docs) {
|
||||||
|
vintOut.writeVInt((int) v);
|
||||||
|
}
|
||||||
|
vintOut.close();
|
||||||
|
gvintOut.close();
|
||||||
|
byteBufferGVIntIn = dir.openInput("gvint", IOContext.DEFAULT);
|
||||||
|
byteBufferVIntIn = dir.openInput("vint", IOContext.DEFAULT);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Setup(Level.Trial)
|
||||||
|
public void init() throws Exception {
|
||||||
|
long[] docs = new long[maxSize];
|
||||||
|
Random r = new Random(0);
|
||||||
|
for (int i = 0; i < maxSize; ++i) {
|
||||||
|
float randomFloat = r.nextFloat();
|
||||||
|
// Reproduce the distribution of the number of bits per values that we're observing for tail
|
||||||
|
// postings on wikibigall.
|
||||||
|
int numBits = 1 + Arrays.binarySearch(CUMULATIVE_FREQUENCY_BY_BITS_REQUIRED, randomFloat);
|
||||||
|
if (numBits < 0) {
|
||||||
|
numBits = -numBits;
|
||||||
|
}
|
||||||
|
docs[i] = r.nextInt(1 << (numBits - 1), 1 << numBits);
|
||||||
|
}
|
||||||
|
initByteBufferInput(docs);
|
||||||
|
initArrayInput(docs);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public void byteBufferReadVInt(Blackhole bh) throws IOException {
|
||||||
|
byteBufferVIntIn.seek(0);
|
||||||
|
for (int i = 0; i < size; i++) {
|
||||||
|
values[i] = byteBufferVIntIn.readVInt();
|
||||||
|
}
|
||||||
|
bh.consume(values);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public void byteBufferReadGroupVInt(Blackhole bh) throws IOException {
|
||||||
|
byteBufferGVIntIn.seek(0);
|
||||||
|
GroupVIntReader.readValues(byteBufferGVIntIn, values, size);
|
||||||
|
bh.consume(values);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public void byteArrayReadVInt(Blackhole bh) {
|
||||||
|
byteArrayVIntIn.rewind();
|
||||||
|
for (int i = 0; i < size; i++) {
|
||||||
|
values[i] = byteArrayVIntIn.readVInt();
|
||||||
|
}
|
||||||
|
bh.consume(values);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public void byteArrayReadGroupVInt(Blackhole bh) throws IOException {
|
||||||
|
byteArrayGVIntIn.rewind();
|
||||||
|
GroupVIntReader.readValues(byteArrayGVIntIn, values, size);
|
||||||
|
bh.consume(values);
|
||||||
|
}
|
||||||
|
}
|
|
@ -30,8 +30,8 @@ import org.apache.lucene.search.Query;
|
||||||
import org.apache.lucene.search.ScoreDoc;
|
import org.apache.lucene.search.ScoreDoc;
|
||||||
import org.apache.lucene.search.Sort;
|
import org.apache.lucene.search.Sort;
|
||||||
import org.apache.lucene.search.TopDocs;
|
import org.apache.lucene.search.TopDocs;
|
||||||
import org.apache.lucene.search.TopFieldCollector;
|
import org.apache.lucene.search.TopFieldCollectorManager;
|
||||||
import org.apache.lucene.search.TopScoreDocCollector;
|
import org.apache.lucene.search.TopScoreDocCollectorManager;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.util.Bits;
|
import org.apache.lucene.util.Bits;
|
||||||
|
|
||||||
|
@ -110,15 +110,17 @@ public abstract class ReadTask extends PerfTask {
|
||||||
// the IndexSearcher search methods that take
|
// the IndexSearcher search methods that take
|
||||||
// Weight public again, we can go back to
|
// Weight public again, we can go back to
|
||||||
// pulling the Weight ourselves:
|
// pulling the Weight ourselves:
|
||||||
TopFieldCollector collector =
|
int totalHitsThreshold = withTotalHits() ? Integer.MAX_VALUE : 1;
|
||||||
TopFieldCollector.create(sort, numHits, withTotalHits() ? Integer.MAX_VALUE : 1);
|
TopFieldCollectorManager collectorManager =
|
||||||
searcher.search(q, collector);
|
new TopFieldCollectorManager(
|
||||||
hits = collector.topDocs();
|
sort, numHits, null, totalHitsThreshold, searcher.getSlices().length > 1);
|
||||||
|
hits = searcher.search(q, collectorManager);
|
||||||
} else {
|
} else {
|
||||||
hits = searcher.search(q, numHits);
|
hits = searcher.search(q, numHits);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
Collector collector = createCollector();
|
Collector collector = createCollector();
|
||||||
|
|
||||||
searcher.search(q, collector);
|
searcher.search(q, collector);
|
||||||
// hits = collector.topDocs();
|
// hits = collector.topDocs();
|
||||||
}
|
}
|
||||||
|
@ -183,7 +185,8 @@ public abstract class ReadTask extends PerfTask {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Collector createCollector() throws Exception {
|
protected Collector createCollector() throws Exception {
|
||||||
return TopScoreDocCollector.create(numHits(), withTotalHits() ? Integer.MAX_VALUE : 1);
|
return new TopScoreDocCollectorManager(numHits(), withTotalHits() ? Integer.MAX_VALUE : 1)
|
||||||
|
.newCollector();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Document retrieveDoc(StoredFields storedFields, int id) throws IOException {
|
protected Document retrieveDoc(StoredFields storedFields, int id) throws IOException {
|
||||||
|
|
|
@ -207,7 +207,8 @@ public class BooleanPerceptronClassifier implements Classifier<Boolean> {
|
||||||
|
|
||||||
private void updateFST(SortedMap<String, Double> weights) throws IOException {
|
private void updateFST(SortedMap<String, Double> weights) throws IOException {
|
||||||
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||||
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
FSTCompiler<Long> fstCompiler =
|
||||||
|
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
|
||||||
BytesRefBuilder scratchBytes = new BytesRefBuilder();
|
BytesRefBuilder scratchBytes = new BytesRefBuilder();
|
||||||
IntsRefBuilder scratchInts = new IntsRefBuilder();
|
IntsRefBuilder scratchInts = new IntsRefBuilder();
|
||||||
for (Map.Entry<String, Double> entry : weights.entrySet()) {
|
for (Map.Entry<String, Double> entry : weights.entrySet()) {
|
||||||
|
|
|
@ -16,6 +16,8 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.codecs.blockterms;
|
package org.apache.lucene.codecs.blockterms;
|
||||||
|
|
||||||
|
import static org.apache.lucene.util.fst.FST.readMetadata;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
|
@ -154,7 +156,7 @@ public class VariableGapTermsIndexReader extends TermsIndexReaderBase {
|
||||||
public FieldIndexData(IndexInput in, FieldInfo fieldInfo, long indexStart) throws IOException {
|
public FieldIndexData(IndexInput in, FieldInfo fieldInfo, long indexStart) throws IOException {
|
||||||
IndexInput clone = in.clone();
|
IndexInput clone = in.clone();
|
||||||
clone.seek(indexStart);
|
clone.seek(indexStart);
|
||||||
fst = new FST<>(clone, clone, fstOutputs);
|
fst = new FST<>(readMetadata(clone, fstOutputs), clone);
|
||||||
clone.close();
|
clone.close();
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -238,7 +238,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
||||||
public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
|
public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
|
||||||
this.fieldInfo = fieldInfo;
|
this.fieldInfo = fieldInfo;
|
||||||
fstOutputs = PositiveIntOutputs.getSingleton();
|
fstOutputs = PositiveIntOutputs.getSingleton();
|
||||||
fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, fstOutputs);
|
fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, fstOutputs).build();
|
||||||
indexStart = out.getFilePointer();
|
indexStart = out.getFilePointer();
|
||||||
//// System.out.println("VGW: field=" + fieldInfo.name);
|
//// System.out.println("VGW: field=" + fieldInfo.name);
|
||||||
|
|
||||||
|
|
|
@ -16,6 +16,8 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.codecs.blocktreeords;
|
package org.apache.lucene.codecs.blocktreeords;
|
||||||
|
|
||||||
|
import static org.apache.lucene.util.fst.FST.readMetadata;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output;
|
import org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output;
|
||||||
import org.apache.lucene.index.FieldInfo;
|
import org.apache.lucene.index.FieldInfo;
|
||||||
|
@ -85,7 +87,7 @@ final class OrdsFieldReader extends Terms {
|
||||||
final IndexInput clone = indexIn.clone();
|
final IndexInput clone = indexIn.clone();
|
||||||
// System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name);
|
// System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name);
|
||||||
clone.seek(indexStartFP);
|
clone.seek(indexStartFP);
|
||||||
index = new FST<>(clone, clone, OrdsBlockTreeTermsWriter.FST_OUTPUTS);
|
index = new FST<>(readMetadata(clone, OrdsBlockTreeTermsWriter.FST_OUTPUTS), clone);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
if (true) {
|
if (true) {
|
||||||
|
|
|
@ -194,7 +194,8 @@ public class FSTTermsReader extends FieldsProducer {
|
||||||
this.sumDocFreq = sumDocFreq;
|
this.sumDocFreq = sumDocFreq;
|
||||||
this.docCount = docCount;
|
this.docCount = docCount;
|
||||||
OffHeapFSTStore offHeapFSTStore = new OffHeapFSTStore();
|
OffHeapFSTStore offHeapFSTStore = new OffHeapFSTStore();
|
||||||
this.dict = new FST<>(in, in, new FSTTermOutputs(fieldInfo), offHeapFSTStore);
|
FSTTermOutputs outputs = new FSTTermOutputs(fieldInfo);
|
||||||
|
this.dict = new FST<>(FST.readMetadata(in, outputs), in, offHeapFSTStore);
|
||||||
in.skipBytes(offHeapFSTStore.size());
|
in.skipBytes(offHeapFSTStore.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -251,12 +251,12 @@ public class FSTTermsWriter extends FieldsConsumer {
|
||||||
private final IntsRefBuilder scratchTerm = new IntsRefBuilder();
|
private final IntsRefBuilder scratchTerm = new IntsRefBuilder();
|
||||||
private final ByteBuffersDataOutput metaWriter = ByteBuffersDataOutput.newResettableInstance();
|
private final ByteBuffersDataOutput metaWriter = ByteBuffersDataOutput.newResettableInstance();
|
||||||
|
|
||||||
TermsWriter(FieldInfo fieldInfo) {
|
TermsWriter(FieldInfo fieldInfo) throws IOException {
|
||||||
this.numTerms = 0;
|
this.numTerms = 0;
|
||||||
this.fieldInfo = fieldInfo;
|
this.fieldInfo = fieldInfo;
|
||||||
postingsWriter.setField(fieldInfo);
|
postingsWriter.setField(fieldInfo);
|
||||||
this.outputs = new FSTTermOutputs(fieldInfo);
|
this.outputs = new FSTTermOutputs(fieldInfo);
|
||||||
this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
this.fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
|
||||||
}
|
}
|
||||||
|
|
||||||
public void finishTerm(BytesRef text, BlockTermState state) throws IOException {
|
public void finishTerm(BytesRef text, BlockTermState state) throws IOException {
|
||||||
|
|
|
@ -683,7 +683,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
final PairOutputs<Long, Long> outputsInner = new PairOutputs<>(posIntOutputs, posIntOutputs);
|
final PairOutputs<Long, Long> outputsInner = new PairOutputs<>(posIntOutputs, posIntOutputs);
|
||||||
final PairOutputs<PairOutputs.Pair<Long, Long>, PairOutputs.Pair<Long, Long>> outputs =
|
final PairOutputs<PairOutputs.Pair<Long, Long>, PairOutputs.Pair<Long, Long>> outputs =
|
||||||
new PairOutputs<>(outputsOuter, outputsInner);
|
new PairOutputs<>(outputsOuter, outputsInner);
|
||||||
fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
|
||||||
IndexInput in = SimpleTextFieldsReader.this.in.clone();
|
IndexInput in = SimpleTextFieldsReader.this.in.clone();
|
||||||
in.seek(termsStart);
|
in.seek(termsStart);
|
||||||
final BytesRefBuilder lastTerm = new BytesRefBuilder();
|
final BytesRefBuilder lastTerm = new BytesRefBuilder();
|
||||||
|
|
|
@ -37,7 +37,6 @@ public class SimpleTextStoredFieldsFormat extends StoredFieldsFormat {
|
||||||
@Override
|
@Override
|
||||||
public StoredFieldsReader fieldsReader(
|
public StoredFieldsReader fieldsReader(
|
||||||
Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException {
|
Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException {
|
||||||
;
|
|
||||||
return new SimpleTextStoredFieldsReader(directory, si, fn, context);
|
return new SimpleTextStoredFieldsReader(directory, si, fn, context);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -89,10 +89,11 @@ public class FSTDictionary implements IndexDictionary {
|
||||||
isFSTOnHeap = true;
|
isFSTOnHeap = true;
|
||||||
}
|
}
|
||||||
PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton();
|
PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton();
|
||||||
|
FST.FSTMetadata<Long> metadata = FST.readMetadata(fstDataInput, fstOutputs);
|
||||||
FST<Long> fst =
|
FST<Long> fst =
|
||||||
isFSTOnHeap
|
isFSTOnHeap
|
||||||
? new FST<>(fstDataInput, fstDataInput, fstOutputs)
|
? new FST<>(metadata, fstDataInput)
|
||||||
: new FST<>(fstDataInput, fstDataInput, fstOutputs, new OffHeapFSTStore());
|
: new FST<>(metadata, fstDataInput, new OffHeapFSTStore());
|
||||||
return new FSTDictionary(fst);
|
return new FSTDictionary(fst);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -171,9 +172,9 @@ public class FSTDictionary implements IndexDictionary {
|
||||||
protected final FSTCompiler<Long> fstCompiler;
|
protected final FSTCompiler<Long> fstCompiler;
|
||||||
protected final IntsRefBuilder scratchInts;
|
protected final IntsRefBuilder scratchInts;
|
||||||
|
|
||||||
public Builder() {
|
public Builder() throws IOException {
|
||||||
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||||
fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
|
||||||
scratchInts = new IntsRefBuilder();
|
scratchInts = new IntsRefBuilder();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -100,5 +100,4 @@ public abstract class DelegatingAnalyzerWrapper extends AnalyzerWrapper {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
;
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -70,7 +70,6 @@ public abstract class TermVectorsWriter implements Closeable, Accountable {
|
||||||
|
|
||||||
/** Called after a doc and all its fields have been added. */
|
/** Called after a doc and all its fields have been added. */
|
||||||
public void finishDocument() throws IOException {}
|
public void finishDocument() throws IOException {}
|
||||||
;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Called before writing the terms of the field. {@link #startTerm(BytesRef, int)} will be called
|
* Called before writing the terms of the field. {@link #startTerm(BytesRef, int)} will be called
|
||||||
|
@ -82,7 +81,6 @@ public abstract class TermVectorsWriter implements Closeable, Accountable {
|
||||||
|
|
||||||
/** Called after a field and all its terms have been added. */
|
/** Called after a field and all its terms have been added. */
|
||||||
public void finishField() throws IOException {}
|
public void finishField() throws IOException {}
|
||||||
;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Adds a term and its term frequency <code>freq</code>. If this field has positions and/or
|
* Adds a term and its term frequency <code>freq</code>. If this field has positions and/or
|
||||||
|
|
|
@ -91,7 +91,11 @@ public final class FieldReader extends Terms {
|
||||||
// Initialize FST always off-heap.
|
// Initialize FST always off-heap.
|
||||||
final IndexInput clone = indexIn.clone();
|
final IndexInput clone = indexIn.clone();
|
||||||
clone.seek(indexStartFP);
|
clone.seek(indexStartFP);
|
||||||
index = new FST<>(metaIn, clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore());
|
index =
|
||||||
|
new FST<>(
|
||||||
|
FST.readMetadata(metaIn, ByteSequenceOutputs.getSingleton()),
|
||||||
|
clone,
|
||||||
|
new OffHeapFSTStore());
|
||||||
/*
|
/*
|
||||||
if (false) {
|
if (false) {
|
||||||
final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
|
final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
|
||||||
|
|
|
@ -30,9 +30,7 @@ import org.apache.lucene.util.StringHelper;
|
||||||
import org.apache.lucene.util.automaton.ByteRunnable;
|
import org.apache.lucene.util.automaton.ByteRunnable;
|
||||||
import org.apache.lucene.util.automaton.Transition;
|
import org.apache.lucene.util.automaton.Transition;
|
||||||
import org.apache.lucene.util.automaton.TransitionAccessor;
|
import org.apache.lucene.util.automaton.TransitionAccessor;
|
||||||
import org.apache.lucene.util.fst.ByteSequenceOutputs;
|
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
import org.apache.lucene.util.fst.Outputs;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This is used to implement efficient {@link Terms#intersect} for block-tree. Note that it cannot
|
* This is used to implement efficient {@link Terms#intersect} for block-tree. Note that it cannot
|
||||||
|
@ -46,7 +44,6 @@ final class IntersectTermsEnum extends BaseTermsEnum {
|
||||||
// static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
|
// static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
|
||||||
|
|
||||||
final IndexInput in;
|
final IndexInput in;
|
||||||
static final Outputs<BytesRef> fstOutputs = ByteSequenceOutputs.getSingleton();
|
|
||||||
|
|
||||||
IntersectTermsEnumFrame[] stack;
|
IntersectTermsEnumFrame[] stack;
|
||||||
|
|
||||||
|
@ -68,6 +65,9 @@ final class IntersectTermsEnum extends BaseTermsEnum {
|
||||||
|
|
||||||
private BytesRef savedStartTerm;
|
private BytesRef savedStartTerm;
|
||||||
|
|
||||||
|
private final SegmentTermsEnum.OutputAccumulator outputAccumulator =
|
||||||
|
new SegmentTermsEnum.OutputAccumulator();
|
||||||
|
|
||||||
// TODO: in some cases we can filter by length? eg
|
// TODO: in some cases we can filter by length? eg
|
||||||
// regexp foo*bar must be at least length 6 bytes
|
// regexp foo*bar must be at least length 6 bytes
|
||||||
public IntersectTermsEnum(
|
public IntersectTermsEnum(
|
||||||
|
@ -114,7 +114,6 @@ final class IntersectTermsEnum extends BaseTermsEnum {
|
||||||
f.prefix = 0;
|
f.prefix = 0;
|
||||||
f.setState(0);
|
f.setState(0);
|
||||||
f.arc = arc;
|
f.arc = arc;
|
||||||
f.outputPrefix = arc.output();
|
|
||||||
f.load(fr.rootCode);
|
f.load(fr.rootCode);
|
||||||
|
|
||||||
// for assert:
|
// for assert:
|
||||||
|
@ -184,7 +183,9 @@ final class IntersectTermsEnum extends BaseTermsEnum {
|
||||||
FST.Arc<BytesRef> arc = currentFrame.arc;
|
FST.Arc<BytesRef> arc = currentFrame.arc;
|
||||||
int idx = currentFrame.prefix;
|
int idx = currentFrame.prefix;
|
||||||
assert currentFrame.suffix > 0;
|
assert currentFrame.suffix > 0;
|
||||||
BytesRef output = currentFrame.outputPrefix;
|
|
||||||
|
outputAccumulator.reset();
|
||||||
|
outputAccumulator.push(arc.output());
|
||||||
while (idx < f.prefix) {
|
while (idx < f.prefix) {
|
||||||
final int target = term.bytes[idx] & 0xff;
|
final int target = term.bytes[idx] & 0xff;
|
||||||
// TODO: we could be more efficient for the next()
|
// TODO: we could be more efficient for the next()
|
||||||
|
@ -192,14 +193,14 @@ final class IntersectTermsEnum extends BaseTermsEnum {
|
||||||
// passed to findTargetArc
|
// passed to findTargetArc
|
||||||
arc = fr.index.findTargetArc(target, arc, getArc(1 + idx), fstReader);
|
arc = fr.index.findTargetArc(target, arc, getArc(1 + idx), fstReader);
|
||||||
assert arc != null;
|
assert arc != null;
|
||||||
output = fstOutputs.add(output, arc.output());
|
outputAccumulator.push(arc.output());
|
||||||
idx++;
|
idx++;
|
||||||
}
|
}
|
||||||
|
|
||||||
f.arc = arc;
|
f.arc = arc;
|
||||||
f.outputPrefix = output;
|
|
||||||
assert arc.isFinal();
|
assert arc.isFinal();
|
||||||
f.load(fstOutputs.add(output, arc.nextFinalOutput()));
|
outputAccumulator.push(arc.nextFinalOutput());
|
||||||
|
f.load(outputAccumulator);
|
||||||
return f;
|
return f;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -55,7 +55,6 @@ final class IntersectTermsEnumFrame {
|
||||||
int statsSingletonRunLength = 0;
|
int statsSingletonRunLength = 0;
|
||||||
final ByteArrayDataInput statsReader = new ByteArrayDataInput();
|
final ByteArrayDataInput statsReader = new ByteArrayDataInput();
|
||||||
|
|
||||||
byte[] floorData = new byte[32];
|
|
||||||
final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();
|
final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();
|
||||||
|
|
||||||
// Length of prefix shared by all terms in this block
|
// Length of prefix shared by all terms in this block
|
||||||
|
@ -90,9 +89,6 @@ final class IntersectTermsEnumFrame {
|
||||||
|
|
||||||
final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
|
final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
|
||||||
|
|
||||||
// Cumulative output so far
|
|
||||||
BytesRef outputPrefix;
|
|
||||||
|
|
||||||
int startBytePos;
|
int startBytePos;
|
||||||
int suffix;
|
int suffix;
|
||||||
|
|
||||||
|
@ -120,7 +116,7 @@ final class IntersectTermsEnumFrame {
|
||||||
}
|
}
|
||||||
} while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min);
|
} while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min);
|
||||||
|
|
||||||
load(null);
|
load((Long) null);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setState(int state) {
|
public void setState(int state) {
|
||||||
|
@ -142,12 +138,22 @@ final class IntersectTermsEnumFrame {
|
||||||
}
|
}
|
||||||
|
|
||||||
void load(BytesRef frameIndexData) throws IOException {
|
void load(BytesRef frameIndexData) throws IOException {
|
||||||
if (frameIndexData != null) {
|
floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
|
||||||
floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
|
load(ite.fr.readVLongOutput(floorDataReader));
|
||||||
// Skip first long -- has redundant fp, hasTerms
|
}
|
||||||
// flag, isFloor flag
|
|
||||||
final long code = ite.fr.readVLongOutput(floorDataReader);
|
void load(SegmentTermsEnum.OutputAccumulator outputAccumulator) throws IOException {
|
||||||
if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
|
outputAccumulator.prepareRead();
|
||||||
|
long code = ite.fr.readVLongOutput(outputAccumulator);
|
||||||
|
outputAccumulator.setFloorData(floorDataReader);
|
||||||
|
load(code);
|
||||||
|
}
|
||||||
|
|
||||||
|
void load(Long blockCode) throws IOException {
|
||||||
|
if (blockCode != null) {
|
||||||
|
// This block is the first one in a possible sequence of floor blocks corresponding to a
|
||||||
|
// single seek point from the FST terms index
|
||||||
|
if ((blockCode & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
|
||||||
// Floor frame
|
// Floor frame
|
||||||
numFollowFloorBlocks = floorDataReader.readVInt();
|
numFollowFloorBlocks = floorDataReader.readVInt();
|
||||||
nextFloorLabel = floorDataReader.readByte() & 0xff;
|
nextFloorLabel = floorDataReader.readByte() & 0xff;
|
||||||
|
|
|
@ -16,6 +16,8 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.codecs.lucene90.blocktree;
|
package org.apache.lucene.codecs.lucene90.blocktree;
|
||||||
|
|
||||||
|
import static org.apache.lucene.util.fst.FSTCompiler.getOnHeapReaderWriter;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
@ -525,7 +527,7 @@ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer {
|
||||||
// Disable suffixes sharing for block tree index because suffixes are mostly dropped
|
// Disable suffixes sharing for block tree index because suffixes are mostly dropped
|
||||||
// from the FST index and left in the term blocks.
|
// from the FST index and left in the term blocks.
|
||||||
.suffixRAMLimitMB(0d)
|
.suffixRAMLimitMB(0d)
|
||||||
.bytesPageBits(pageBits)
|
.dataOutput(getOnHeapReaderWriter(pageBits))
|
||||||
.build();
|
.build();
|
||||||
// if (DEBUG) {
|
// if (DEBUG) {
|
||||||
// System.out.println(" compile index for prefix=" + prefix);
|
// System.out.println(" compile index for prefix=" + prefix);
|
||||||
|
|
|
@ -24,6 +24,7 @@ import org.apache.lucene.index.ImpactsEnum;
|
||||||
import org.apache.lucene.index.PostingsEnum;
|
import org.apache.lucene.index.PostingsEnum;
|
||||||
import org.apache.lucene.index.TermState;
|
import org.apache.lucene.index.TermState;
|
||||||
import org.apache.lucene.store.ByteArrayDataInput;
|
import org.apache.lucene.store.ByteArrayDataInput;
|
||||||
|
import org.apache.lucene.store.DataInput;
|
||||||
import org.apache.lucene.store.IndexInput;
|
import org.apache.lucene.store.IndexInput;
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
@ -48,7 +49,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
||||||
|
|
||||||
// static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
|
// static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
|
||||||
|
|
||||||
private final ByteArrayDataInput scratchReader = new ByteArrayDataInput();
|
private final OutputAccumulator outputAccumulator = new OutputAccumulator();
|
||||||
|
|
||||||
// What prefix of the current term was present in the index; when we only next() through the
|
// What prefix of the current term was present in the index; when we only next() through the
|
||||||
// index, this stays at 0. It's only set when
|
// index, this stays at 0. It's only set when
|
||||||
|
@ -232,18 +233,24 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
||||||
return arcs[ord];
|
return arcs[ord];
|
||||||
}
|
}
|
||||||
|
|
||||||
// Pushes a frame we seek'd to
|
|
||||||
SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length)
|
SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
scratchReader.reset(frameData.bytes, frameData.offset, frameData.length);
|
outputAccumulator.reset();
|
||||||
final long code = fr.readVLongOutput(scratchReader);
|
outputAccumulator.push(frameData);
|
||||||
|
return pushFrame(arc, length);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pushes a frame we seek'd to
|
||||||
|
SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, int length) throws IOException {
|
||||||
|
outputAccumulator.prepareRead();
|
||||||
|
final long code = fr.readVLongOutput(outputAccumulator);
|
||||||
final long fpSeek = code >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
|
final long fpSeek = code >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
|
||||||
final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord);
|
final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord);
|
||||||
f.hasTerms = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0;
|
f.hasTerms = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0;
|
||||||
f.hasTermsOrig = f.hasTerms;
|
f.hasTermsOrig = f.hasTerms;
|
||||||
f.isFloor = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0;
|
f.isFloor = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0;
|
||||||
if (f.isFloor) {
|
if (f.isFloor) {
|
||||||
f.setFloorData(scratchReader, frameData);
|
f.setFloorData(outputAccumulator);
|
||||||
}
|
}
|
||||||
pushFrame(arc, fpSeek, length);
|
pushFrame(arc, fpSeek, length);
|
||||||
|
|
||||||
|
@ -344,9 +351,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
||||||
|
|
||||||
FST.Arc<BytesRef> arc;
|
FST.Arc<BytesRef> arc;
|
||||||
int targetUpto;
|
int targetUpto;
|
||||||
BytesRef output;
|
|
||||||
|
|
||||||
targetBeforeCurrentLength = currentFrame.ord;
|
targetBeforeCurrentLength = currentFrame.ord;
|
||||||
|
outputAccumulator.reset();
|
||||||
|
|
||||||
if (currentFrame != staticFrame) {
|
if (currentFrame != staticFrame) {
|
||||||
|
|
||||||
|
@ -363,7 +370,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
||||||
|
|
||||||
arc = arcs[0];
|
arc = arcs[0];
|
||||||
assert arc.isFinal();
|
assert arc.isFinal();
|
||||||
output = arc.output();
|
outputAccumulator.push(arc.output());
|
||||||
targetUpto = 0;
|
targetUpto = 0;
|
||||||
|
|
||||||
SegmentTermsEnumFrame lastFrame = stack[0];
|
SegmentTermsEnumFrame lastFrame = stack[0];
|
||||||
|
@ -373,9 +380,6 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
||||||
|
|
||||||
int cmp = 0;
|
int cmp = 0;
|
||||||
|
|
||||||
// TODO: reverse vLong byte order for better FST
|
|
||||||
// prefix output sharing
|
|
||||||
|
|
||||||
// First compare up to valid seek frames:
|
// First compare up to valid seek frames:
|
||||||
while (targetUpto < targetLimit) {
|
while (targetUpto < targetLimit) {
|
||||||
cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
|
cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
|
||||||
|
@ -394,9 +398,8 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
||||||
+ (char) arc.label()
|
+ (char) arc.label()
|
||||||
+ " targetLabel="
|
+ " targetLabel="
|
||||||
+ (char) (target.bytes[target.offset + targetUpto] & 0xFF);
|
+ (char) (target.bytes[target.offset + targetUpto] & 0xFF);
|
||||||
if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
|
outputAccumulator.push(arc.output());
|
||||||
output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
|
|
||||||
}
|
|
||||||
if (arc.isFinal()) {
|
if (arc.isFinal()) {
|
||||||
lastFrame = stack[1 + lastFrame.ord];
|
lastFrame = stack[1 + lastFrame.ord];
|
||||||
}
|
}
|
||||||
|
@ -484,15 +487,15 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
||||||
// System.out.println(" no seek state; push root frame");
|
// System.out.println(" no seek state; push root frame");
|
||||||
// }
|
// }
|
||||||
|
|
||||||
output = arc.output();
|
outputAccumulator.push(arc.output());
|
||||||
|
|
||||||
currentFrame = staticFrame;
|
currentFrame = staticFrame;
|
||||||
|
|
||||||
// term.length = 0;
|
// term.length = 0;
|
||||||
targetUpto = 0;
|
targetUpto = 0;
|
||||||
currentFrame =
|
outputAccumulator.push(arc.nextFinalOutput());
|
||||||
pushFrame(
|
currentFrame = pushFrame(arc, 0);
|
||||||
arc, Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), 0);
|
outputAccumulator.pop();
|
||||||
}
|
}
|
||||||
|
|
||||||
// if (DEBUG) {
|
// if (DEBUG) {
|
||||||
|
@ -554,9 +557,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
||||||
term.setByteAt(targetUpto, (byte) targetLabel);
|
term.setByteAt(targetUpto, (byte) targetLabel);
|
||||||
// Aggregate output as we go:
|
// Aggregate output as we go:
|
||||||
assert arc.output() != null;
|
assert arc.output() != null;
|
||||||
if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
|
outputAccumulator.push(arc.output());
|
||||||
output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
|
|
||||||
}
|
|
||||||
|
|
||||||
// if (DEBUG) {
|
// if (DEBUG) {
|
||||||
// System.out.println(" index: follow label=" + toHex(target.bytes[target.offset +
|
// System.out.println(" index: follow label=" + toHex(target.bytes[target.offset +
|
||||||
|
@ -566,11 +567,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
||||||
|
|
||||||
if (arc.isFinal()) {
|
if (arc.isFinal()) {
|
||||||
// if (DEBUG) System.out.println(" arc is final!");
|
// if (DEBUG) System.out.println(" arc is final!");
|
||||||
currentFrame =
|
outputAccumulator.push(arc.nextFinalOutput());
|
||||||
pushFrame(
|
currentFrame = pushFrame(arc, targetUpto);
|
||||||
arc,
|
outputAccumulator.pop();
|
||||||
Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()),
|
|
||||||
targetUpto);
|
|
||||||
// if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" +
|
// if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" +
|
||||||
// currentFrame.hasTerms);
|
// currentFrame.hasTerms);
|
||||||
}
|
}
|
||||||
|
@ -630,9 +629,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
||||||
|
|
||||||
FST.Arc<BytesRef> arc;
|
FST.Arc<BytesRef> arc;
|
||||||
int targetUpto;
|
int targetUpto;
|
||||||
BytesRef output;
|
|
||||||
|
|
||||||
targetBeforeCurrentLength = currentFrame.ord;
|
targetBeforeCurrentLength = currentFrame.ord;
|
||||||
|
outputAccumulator.reset();
|
||||||
|
|
||||||
if (currentFrame != staticFrame) {
|
if (currentFrame != staticFrame) {
|
||||||
|
|
||||||
|
@ -649,7 +648,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
||||||
|
|
||||||
arc = arcs[0];
|
arc = arcs[0];
|
||||||
assert arc.isFinal();
|
assert arc.isFinal();
|
||||||
output = arc.output();
|
outputAccumulator.push(arc.output());
|
||||||
targetUpto = 0;
|
targetUpto = 0;
|
||||||
|
|
||||||
SegmentTermsEnumFrame lastFrame = stack[0];
|
SegmentTermsEnumFrame lastFrame = stack[0];
|
||||||
|
@ -659,9 +658,6 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
||||||
|
|
||||||
int cmp = 0;
|
int cmp = 0;
|
||||||
|
|
||||||
// TODO: we should write our vLong backwards (MSB
|
|
||||||
// first) to get better sharing from the FST
|
|
||||||
|
|
||||||
// First compare up to valid seek frames:
|
// First compare up to valid seek frames:
|
||||||
while (targetUpto < targetLimit) {
|
while (targetUpto < targetLimit) {
|
||||||
cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
|
cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
|
||||||
|
@ -680,14 +676,8 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
||||||
+ (char) arc.label()
|
+ (char) arc.label()
|
||||||
+ " targetLabel="
|
+ " targetLabel="
|
||||||
+ (char) (target.bytes[target.offset + targetUpto] & 0xFF);
|
+ (char) (target.bytes[target.offset + targetUpto] & 0xFF);
|
||||||
// TODO: we could save the outputs in local
|
|
||||||
// byte[][] instead of making new objs ever
|
outputAccumulator.push(arc.output());
|
||||||
// seek; but, often the FST doesn't have any
|
|
||||||
// shared bytes (but this could change if we
|
|
||||||
// reverse vLong byte order)
|
|
||||||
if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
|
|
||||||
output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
|
|
||||||
}
|
|
||||||
if (arc.isFinal()) {
|
if (arc.isFinal()) {
|
||||||
lastFrame = stack[1 + lastFrame.ord];
|
lastFrame = stack[1 + lastFrame.ord];
|
||||||
}
|
}
|
||||||
|
@ -769,15 +759,15 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
||||||
// System.out.println(" no seek state; push root frame");
|
// System.out.println(" no seek state; push root frame");
|
||||||
// }
|
// }
|
||||||
|
|
||||||
output = arc.output();
|
outputAccumulator.push(arc.output());
|
||||||
|
|
||||||
currentFrame = staticFrame;
|
currentFrame = staticFrame;
|
||||||
|
|
||||||
// term.length = 0;
|
// term.length = 0;
|
||||||
targetUpto = 0;
|
targetUpto = 0;
|
||||||
currentFrame =
|
outputAccumulator.push(arc.nextFinalOutput());
|
||||||
pushFrame(
|
currentFrame = pushFrame(arc, 0);
|
||||||
arc, Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), 0);
|
outputAccumulator.pop();
|
||||||
}
|
}
|
||||||
|
|
||||||
// if (DEBUG) {
|
// if (DEBUG) {
|
||||||
|
@ -839,9 +829,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
||||||
arc = nextArc;
|
arc = nextArc;
|
||||||
// Aggregate output as we go:
|
// Aggregate output as we go:
|
||||||
assert arc.output() != null;
|
assert arc.output() != null;
|
||||||
if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
|
outputAccumulator.push(arc.output());
|
||||||
output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
|
|
||||||
}
|
|
||||||
|
|
||||||
// if (DEBUG) {
|
// if (DEBUG) {
|
||||||
// System.out.println(" index: follow label=" + (target.bytes[target.offset +
|
// System.out.println(" index: follow label=" + (target.bytes[target.offset +
|
||||||
|
@ -851,11 +839,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
||||||
|
|
||||||
if (arc.isFinal()) {
|
if (arc.isFinal()) {
|
||||||
// if (DEBUG) System.out.println(" arc is final!");
|
// if (DEBUG) System.out.println(" arc is final!");
|
||||||
currentFrame =
|
outputAccumulator.push(arc.nextFinalOutput());
|
||||||
pushFrame(
|
currentFrame = pushFrame(arc, targetUpto);
|
||||||
arc,
|
outputAccumulator.pop();
|
||||||
Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()),
|
|
||||||
targetUpto);
|
|
||||||
// if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" +
|
// if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" +
|
||||||
// currentFrame.hasTerms);
|
// currentFrame.hasTerms);
|
||||||
}
|
}
|
||||||
|
@ -1190,4 +1176,68 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
||||||
public long ord() {
|
public long ord() {
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static class OutputAccumulator extends DataInput {
|
||||||
|
|
||||||
|
BytesRef[] outputs = new BytesRef[16];
|
||||||
|
BytesRef current;
|
||||||
|
int num;
|
||||||
|
int outputIndex;
|
||||||
|
int index;
|
||||||
|
|
||||||
|
void push(BytesRef output) {
|
||||||
|
if (output != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
|
||||||
|
outputs = ArrayUtil.grow(outputs, num + 1);
|
||||||
|
outputs[num++] = output;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void pop() {
|
||||||
|
assert num > 0;
|
||||||
|
num--;
|
||||||
|
}
|
||||||
|
|
||||||
|
void reset() {
|
||||||
|
num = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void prepareRead() {
|
||||||
|
index = 0;
|
||||||
|
outputIndex = 0;
|
||||||
|
current = outputs[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the last arc as the source of the floorData. This won't change the reading position of
|
||||||
|
* this {@link OutputAccumulator}
|
||||||
|
*/
|
||||||
|
void setFloorData(ByteArrayDataInput floorData) {
|
||||||
|
assert outputIndex == num - 1
|
||||||
|
: "floor data should be stored in last arc, get outputIndex: "
|
||||||
|
+ outputIndex
|
||||||
|
+ ", num: "
|
||||||
|
+ num;
|
||||||
|
BytesRef output = outputs[outputIndex];
|
||||||
|
floorData.reset(output.bytes, output.offset + index, output.length - index);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public byte readByte() throws IOException {
|
||||||
|
if (index >= current.length) {
|
||||||
|
current = outputs[++outputIndex];
|
||||||
|
index = 0;
|
||||||
|
}
|
||||||
|
return current.bytes[current.offset + index++];
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void readBytes(byte[] b, int offset, int len) throws IOException {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void skipBytes(long numBytes) throws IOException {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -55,7 +55,7 @@ final class SegmentTermsEnumFrame {
|
||||||
int statsSingletonRunLength = 0;
|
int statsSingletonRunLength = 0;
|
||||||
final ByteArrayDataInput statsReader = new ByteArrayDataInput();
|
final ByteArrayDataInput statsReader = new ByteArrayDataInput();
|
||||||
|
|
||||||
byte[] floorData = new byte[32];
|
int rewindPos;
|
||||||
final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();
|
final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();
|
||||||
|
|
||||||
// Length of prefix shared by all terms in this block
|
// Length of prefix shared by all terms in this block
|
||||||
|
@ -104,13 +104,9 @@ final class SegmentTermsEnumFrame {
|
||||||
suffixLengthsReader = new ByteArrayDataInput();
|
suffixLengthsReader = new ByteArrayDataInput();
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setFloorData(ByteArrayDataInput in, BytesRef source) {
|
public void setFloorData(SegmentTermsEnum.OutputAccumulator outputAccumulator) {
|
||||||
final int numBytes = source.length - (in.getPosition() - source.offset);
|
outputAccumulator.setFloorData(floorDataReader);
|
||||||
if (numBytes > floorData.length) {
|
rewindPos = floorDataReader.getPosition();
|
||||||
floorData = new byte[ArrayUtil.oversize(numBytes, 1)];
|
|
||||||
}
|
|
||||||
System.arraycopy(source.bytes, source.offset + in.getPosition(), floorData, 0, numBytes);
|
|
||||||
floorDataReader.reset(floorData, 0, numBytes);
|
|
||||||
numFollowFloorBlocks = floorDataReader.readVInt();
|
numFollowFloorBlocks = floorDataReader.readVInt();
|
||||||
nextFloorLabel = floorDataReader.readByte() & 0xff;
|
nextFloorLabel = floorDataReader.readByte() & 0xff;
|
||||||
// if (DEBUG) {
|
// if (DEBUG) {
|
||||||
|
@ -247,7 +243,7 @@ final class SegmentTermsEnumFrame {
|
||||||
nextEnt = -1;
|
nextEnt = -1;
|
||||||
hasTerms = hasTermsOrig;
|
hasTerms = hasTermsOrig;
|
||||||
if (isFloor) {
|
if (isFloor) {
|
||||||
floorDataReader.rewind();
|
floorDataReader.setPosition(rewindPos);
|
||||||
numFollowFloorBlocks = floorDataReader.readVInt();
|
numFollowFloorBlocks = floorDataReader.readVInt();
|
||||||
assert numFollowFloorBlocks > 0;
|
assert numFollowFloorBlocks > 0;
|
||||||
nextFloorLabel = floorDataReader.readByte() & 0xff;
|
nextFloorLabel = floorDataReader.readByte() & 0xff;
|
||||||
|
|
|
@ -0,0 +1,57 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.codecs.lucene99;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import org.apache.lucene.store.DataInput;
|
||||||
|
|
||||||
|
/** Decode integers using group-varint. */
|
||||||
|
public class GroupVIntReader {
|
||||||
|
|
||||||
|
public static void readValues(DataInput in, long[] docs, int limit) throws IOException {
|
||||||
|
int i;
|
||||||
|
for (i = 0; i <= limit - 4; i += 4) {
|
||||||
|
final int flag = in.readByte() & 0xFF;
|
||||||
|
|
||||||
|
final int n1Minus1 = flag >> 6;
|
||||||
|
final int n2Minus1 = (flag >> 4) & 0x03;
|
||||||
|
final int n3Minus1 = (flag >> 2) & 0x03;
|
||||||
|
final int n4Minus1 = flag & 0x03;
|
||||||
|
|
||||||
|
docs[i] = readLong(in, n1Minus1);
|
||||||
|
docs[i + 1] = readLong(in, n2Minus1);
|
||||||
|
docs[i + 2] = readLong(in, n3Minus1);
|
||||||
|
docs[i + 3] = readLong(in, n4Minus1);
|
||||||
|
}
|
||||||
|
for (; i < limit; ++i) {
|
||||||
|
docs[i] = in.readVInt();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static long readLong(DataInput in, int numBytesMinus1) throws IOException {
|
||||||
|
switch (numBytesMinus1) {
|
||||||
|
case 0:
|
||||||
|
return in.readByte() & 0xFFL;
|
||||||
|
case 1:
|
||||||
|
return in.readShort() & 0xFFFFL;
|
||||||
|
case 2:
|
||||||
|
return (in.readShort() & 0xFFFFL) | ((in.readByte() & 0xFFL) << 16);
|
||||||
|
default:
|
||||||
|
return in.readInt() & 0xFFFFFFFFL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,63 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.codecs.lucene99;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import org.apache.lucene.store.DataOutput;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Encode integers using group-varint. It uses VInt to encode tail values that are not enough for a
|
||||||
|
* group
|
||||||
|
*/
|
||||||
|
public class GroupVIntWriter {
|
||||||
|
|
||||||
|
// the maximum size of one group is 4 integers + 1 byte flag.
|
||||||
|
private byte[] bytes = new byte[17];
|
||||||
|
private int byteOffset = 0;
|
||||||
|
|
||||||
|
public GroupVIntWriter() {}
|
||||||
|
|
||||||
|
private int encodeValue(int v) {
|
||||||
|
int lastOff = byteOffset;
|
||||||
|
do {
|
||||||
|
bytes[byteOffset++] = (byte) (v & 0xFF);
|
||||||
|
v >>>= 8;
|
||||||
|
} while (v != 0);
|
||||||
|
return byteOffset - lastOff;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void writeValues(DataOutput out, long[] values, int limit) throws IOException {
|
||||||
|
int off = 0;
|
||||||
|
|
||||||
|
// encode each group
|
||||||
|
while ((limit - off) >= 4) {
|
||||||
|
byte flag = 0;
|
||||||
|
byteOffset = 1;
|
||||||
|
flag |= (encodeValue((int) values[off++]) - 1) << 6;
|
||||||
|
flag |= (encodeValue((int) values[off++]) - 1) << 4;
|
||||||
|
flag |= (encodeValue((int) values[off++]) - 1) << 2;
|
||||||
|
flag |= (encodeValue((int) values[off++]) - 1);
|
||||||
|
bytes[0] = flag;
|
||||||
|
out.writeBytes(bytes, byteOffset);
|
||||||
|
}
|
||||||
|
|
||||||
|
// tail vints
|
||||||
|
for (; off < limit; off++) {
|
||||||
|
out.writeVInt((int) values[off]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -31,6 +31,7 @@ import org.apache.lucene.codecs.KnnVectorsReader;
|
||||||
import org.apache.lucene.codecs.KnnVectorsWriter;
|
import org.apache.lucene.codecs.KnnVectorsWriter;
|
||||||
import org.apache.lucene.index.SegmentReadState;
|
import org.apache.lucene.index.SegmentReadState;
|
||||||
import org.apache.lucene.index.SegmentWriteState;
|
import org.apache.lucene.index.SegmentWriteState;
|
||||||
|
import org.apache.lucene.search.TaskExecutor;
|
||||||
import org.apache.lucene.util.hnsw.HnswGraph;
|
import org.apache.lucene.util.hnsw.HnswGraph;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -60,7 +61,7 @@ public final class Lucene99HnswScalarQuantizedVectorsFormat extends KnnVectorsFo
|
||||||
private final FlatVectorsFormat flatVectorsFormat;
|
private final FlatVectorsFormat flatVectorsFormat;
|
||||||
|
|
||||||
private final int numMergeWorkers;
|
private final int numMergeWorkers;
|
||||||
private final ExecutorService mergeExec;
|
private final TaskExecutor mergeExec;
|
||||||
|
|
||||||
/** Constructs a format using default graph construction parameters */
|
/** Constructs a format using default graph construction parameters */
|
||||||
public Lucene99HnswScalarQuantizedVectorsFormat() {
|
public Lucene99HnswScalarQuantizedVectorsFormat() {
|
||||||
|
@ -84,8 +85,8 @@ public final class Lucene99HnswScalarQuantizedVectorsFormat extends KnnVectorsFo
|
||||||
* @param beamWidth the size of the queue maintained during graph construction.
|
* @param beamWidth the size of the queue maintained during graph construction.
|
||||||
* @param numMergeWorkers number of workers (threads) that will be used when doing merge. If
|
* @param numMergeWorkers number of workers (threads) that will be used when doing merge. If
|
||||||
* larger than 1, a non-null {@link ExecutorService} must be passed as mergeExec
|
* larger than 1, a non-null {@link ExecutorService} must be passed as mergeExec
|
||||||
* @param configuredQuantile the quantile for scalar quantizing the vectors, when `null` it is
|
* @param confidenceInterval the confidenceInterval for scalar quantizing the vectors, when `null`
|
||||||
* calculated based on the vector field dimensions.
|
* it is calculated based on the vector field dimensions.
|
||||||
* @param mergeExec the {@link ExecutorService} that will be used by ALL vector writers that are
|
* @param mergeExec the {@link ExecutorService} that will be used by ALL vector writers that are
|
||||||
* generated by this format to do the merge
|
* generated by this format to do the merge
|
||||||
*/
|
*/
|
||||||
|
@ -93,7 +94,7 @@ public final class Lucene99HnswScalarQuantizedVectorsFormat extends KnnVectorsFo
|
||||||
int maxConn,
|
int maxConn,
|
||||||
int beamWidth,
|
int beamWidth,
|
||||||
int numMergeWorkers,
|
int numMergeWorkers,
|
||||||
Float configuredQuantile,
|
Float confidenceInterval,
|
||||||
ExecutorService mergeExec) {
|
ExecutorService mergeExec) {
|
||||||
super("Lucene99HnswScalarQuantizedVectorsFormat");
|
super("Lucene99HnswScalarQuantizedVectorsFormat");
|
||||||
if (maxConn <= 0 || maxConn > MAXIMUM_MAX_CONN) {
|
if (maxConn <= 0 || maxConn > MAXIMUM_MAX_CONN) {
|
||||||
|
@ -121,8 +122,12 @@ public final class Lucene99HnswScalarQuantizedVectorsFormat extends KnnVectorsFo
|
||||||
"No executor service is needed as we'll use single thread to merge");
|
"No executor service is needed as we'll use single thread to merge");
|
||||||
}
|
}
|
||||||
this.numMergeWorkers = numMergeWorkers;
|
this.numMergeWorkers = numMergeWorkers;
|
||||||
this.mergeExec = mergeExec;
|
if (mergeExec != null) {
|
||||||
this.flatVectorsFormat = new Lucene99ScalarQuantizedVectorsFormat(configuredQuantile);
|
this.mergeExec = new TaskExecutor(mergeExec);
|
||||||
|
} else {
|
||||||
|
this.mergeExec = null;
|
||||||
|
}
|
||||||
|
this.flatVectorsFormat = new Lucene99ScalarQuantizedVectorsFormat(confidenceInterval);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -27,6 +27,7 @@ import org.apache.lucene.codecs.lucene90.IndexedDISI;
|
||||||
import org.apache.lucene.index.SegmentReadState;
|
import org.apache.lucene.index.SegmentReadState;
|
||||||
import org.apache.lucene.index.SegmentWriteState;
|
import org.apache.lucene.index.SegmentWriteState;
|
||||||
import org.apache.lucene.search.DocIdSetIterator;
|
import org.apache.lucene.search.DocIdSetIterator;
|
||||||
|
import org.apache.lucene.search.TaskExecutor;
|
||||||
import org.apache.lucene.store.IndexOutput;
|
import org.apache.lucene.store.IndexOutput;
|
||||||
import org.apache.lucene.util.hnsw.HnswGraph;
|
import org.apache.lucene.util.hnsw.HnswGraph;
|
||||||
|
|
||||||
|
@ -137,7 +138,7 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat {
|
||||||
private static final FlatVectorsFormat flatVectorsFormat = new Lucene99FlatVectorsFormat();
|
private static final FlatVectorsFormat flatVectorsFormat = new Lucene99FlatVectorsFormat();
|
||||||
|
|
||||||
private final int numMergeWorkers;
|
private final int numMergeWorkers;
|
||||||
private final ExecutorService mergeExec;
|
private final TaskExecutor mergeExec;
|
||||||
|
|
||||||
/** Constructs a format using default graph construction parameters */
|
/** Constructs a format using default graph construction parameters */
|
||||||
public Lucene99HnswVectorsFormat() {
|
public Lucene99HnswVectorsFormat() {
|
||||||
|
@ -192,7 +193,11 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat {
|
||||||
"No executor service is needed as we'll use single thread to merge");
|
"No executor service is needed as we'll use single thread to merge");
|
||||||
}
|
}
|
||||||
this.numMergeWorkers = numMergeWorkers;
|
this.numMergeWorkers = numMergeWorkers;
|
||||||
this.mergeExec = mergeExec;
|
if (mergeExec != null) {
|
||||||
|
this.mergeExec = new TaskExecutor(mergeExec);
|
||||||
|
} else {
|
||||||
|
this.mergeExec = null;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -92,18 +92,8 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
|
||||||
} catch (Throwable exception) {
|
} catch (Throwable exception) {
|
||||||
priorE = exception;
|
priorE = exception;
|
||||||
} finally {
|
} finally {
|
||||||
try {
|
CodecUtil.checkFooter(meta, priorE);
|
||||||
CodecUtil.checkFooter(meta, priorE);
|
|
||||||
success = true;
|
|
||||||
} finally {
|
|
||||||
if (success == false) {
|
|
||||||
IOUtils.close(flatVectorsReader);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
success = false;
|
|
||||||
try {
|
|
||||||
vectorIndex =
|
vectorIndex =
|
||||||
openDataInput(
|
openDataInput(
|
||||||
state,
|
state,
|
||||||
|
@ -237,12 +227,22 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
|
||||||
|| fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
|
|| fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
RandomVectorScorer scorer = flatVectorsReader.getRandomVectorScorer(field, target);
|
final RandomVectorScorer scorer = flatVectorsReader.getRandomVectorScorer(field, target);
|
||||||
HnswGraphSearcher.search(
|
final KnnCollector collector =
|
||||||
scorer,
|
new OrdinalTranslatedKnnCollector(knnCollector, scorer::ordToDoc);
|
||||||
new OrdinalTranslatedKnnCollector(knnCollector, scorer::ordToDoc),
|
final Bits acceptedOrds = scorer.getAcceptOrds(acceptDocs);
|
||||||
getGraph(fieldEntry),
|
if (knnCollector.k() < scorer.maxOrd()) {
|
||||||
scorer.getAcceptOrds(acceptDocs));
|
HnswGraphSearcher.search(scorer, collector, getGraph(fieldEntry), acceptedOrds);
|
||||||
|
} else {
|
||||||
|
// if k is larger than the number of vectors, we can just iterate over all vectors
|
||||||
|
// and collect them
|
||||||
|
for (int i = 0; i < scorer.maxOrd(); i++) {
|
||||||
|
if (acceptedOrds == null || acceptedOrds.get(i)) {
|
||||||
|
knnCollector.incVisitedCount(1);
|
||||||
|
knnCollector.collect(scorer.ordToDoc(i), scorer.score(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -255,12 +255,22 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
|
||||||
|| fieldEntry.vectorEncoding != VectorEncoding.BYTE) {
|
|| fieldEntry.vectorEncoding != VectorEncoding.BYTE) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
RandomVectorScorer scorer = flatVectorsReader.getRandomVectorScorer(field, target);
|
final RandomVectorScorer scorer = flatVectorsReader.getRandomVectorScorer(field, target);
|
||||||
HnswGraphSearcher.search(
|
final KnnCollector collector =
|
||||||
scorer,
|
new OrdinalTranslatedKnnCollector(knnCollector, scorer::ordToDoc);
|
||||||
new OrdinalTranslatedKnnCollector(knnCollector, scorer::ordToDoc),
|
final Bits acceptedOrds = scorer.getAcceptOrds(acceptDocs);
|
||||||
getGraph(fieldEntry),
|
if (knnCollector.k() < scorer.maxOrd()) {
|
||||||
scorer.getAcceptOrds(acceptDocs));
|
HnswGraphSearcher.search(scorer, collector, getGraph(fieldEntry), acceptedOrds);
|
||||||
|
} else {
|
||||||
|
// if k is larger than the number of vectors, we can just iterate over all vectors
|
||||||
|
// and collect them
|
||||||
|
for (int i = 0; i < scorer.maxOrd(); i++) {
|
||||||
|
if (acceptedOrds == null || acceptedOrds.get(i)) {
|
||||||
|
knnCollector.incVisitedCount(1);
|
||||||
|
knnCollector.collect(scorer.ordToDoc(i), scorer.score(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -23,7 +23,6 @@ import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.concurrent.ExecutorService;
|
|
||||||
import org.apache.lucene.codecs.CodecUtil;
|
import org.apache.lucene.codecs.CodecUtil;
|
||||||
import org.apache.lucene.codecs.FlatVectorsWriter;
|
import org.apache.lucene.codecs.FlatVectorsWriter;
|
||||||
import org.apache.lucene.codecs.KnnFieldVectorsWriter;
|
import org.apache.lucene.codecs.KnnFieldVectorsWriter;
|
||||||
|
@ -35,6 +34,7 @@ import org.apache.lucene.index.MergeState;
|
||||||
import org.apache.lucene.index.SegmentWriteState;
|
import org.apache.lucene.index.SegmentWriteState;
|
||||||
import org.apache.lucene.index.Sorter;
|
import org.apache.lucene.index.Sorter;
|
||||||
import org.apache.lucene.search.DocIdSetIterator;
|
import org.apache.lucene.search.DocIdSetIterator;
|
||||||
|
import org.apache.lucene.search.TaskExecutor;
|
||||||
import org.apache.lucene.store.IndexOutput;
|
import org.apache.lucene.store.IndexOutput;
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
import org.apache.lucene.util.InfoStream;
|
import org.apache.lucene.util.InfoStream;
|
||||||
|
@ -67,7 +67,7 @@ public final class Lucene99HnswVectorsWriter extends KnnVectorsWriter {
|
||||||
private final int beamWidth;
|
private final int beamWidth;
|
||||||
private final FlatVectorsWriter flatVectorWriter;
|
private final FlatVectorsWriter flatVectorWriter;
|
||||||
private final int numMergeWorkers;
|
private final int numMergeWorkers;
|
||||||
private final ExecutorService mergeExec;
|
private final TaskExecutor mergeExec;
|
||||||
|
|
||||||
private final List<FieldWriter<?>> fields = new ArrayList<>();
|
private final List<FieldWriter<?>> fields = new ArrayList<>();
|
||||||
private boolean finished;
|
private boolean finished;
|
||||||
|
@ -78,7 +78,7 @@ public final class Lucene99HnswVectorsWriter extends KnnVectorsWriter {
|
||||||
int beamWidth,
|
int beamWidth,
|
||||||
FlatVectorsWriter flatVectorWriter,
|
FlatVectorsWriter flatVectorWriter,
|
||||||
int numMergeWorkers,
|
int numMergeWorkers,
|
||||||
ExecutorService mergeExec)
|
TaskExecutor mergeExec)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
this.M = M;
|
this.M = M;
|
||||||
this.flatVectorWriter = flatVectorWriter;
|
this.flatVectorWriter = flatVectorWriter;
|
||||||
|
|
|
@ -158,8 +158,8 @@ import org.apache.lucene.util.packed.PackedInts;
|
||||||
* <dd><b>Frequencies and Skip Data</b>
|
* <dd><b>Frequencies and Skip Data</b>
|
||||||
* <p>The .doc file contains the lists of documents which contain each term, along with the
|
* <p>The .doc file contains the lists of documents which contain each term, along with the
|
||||||
* frequency of the term in that document (except when frequencies are omitted: {@link
|
* frequency of the term in that document (except when frequencies are omitted: {@link
|
||||||
* IndexOptions#DOCS}). It also saves skip data to the beginning of each packed or VInt block,
|
* IndexOptions#DOCS}). Skip data is saved at the end of each term's postings. The skip data
|
||||||
* when the length of document list is larger than packed block size.
|
* is saved once for the entire postings list.
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li>docFile(.doc) --> Header, <TermFreqs, SkipData?><sup>TermCount</sup>, Footer
|
* <li>docFile(.doc) --> Header, <TermFreqs, SkipData?><sup>TermCount</sup>, Footer
|
||||||
* <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
|
* <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
|
||||||
|
@ -174,7 +174,8 @@ import org.apache.lucene.util.packed.PackedInts;
|
||||||
* <li>SkipDatum --> DocSkip, DocFPSkip, <PosFPSkip, PosBlockOffset, PayLength?,
|
* <li>SkipDatum --> DocSkip, DocFPSkip, <PosFPSkip, PosBlockOffset, PayLength?,
|
||||||
* PayFPSkip?>?, ImpactLength, <CompetitiveFreqDelta, CompetitiveNormDelta?>
|
* PayFPSkip?>?, ImpactLength, <CompetitiveFreqDelta, CompetitiveNormDelta?>
|
||||||
* <sup>ImpactCount</sup>, SkipChildLevelPointer?
|
* <sup>ImpactCount</sup>, SkipChildLevelPointer?
|
||||||
* <li>PackedDocDeltaBlock, PackedFreqBlock --> {@link PackedInts PackedInts}
|
* <li>PackedFreqBlock --> {@link PackedInts PackedInts}, uses patching
|
||||||
|
* <li>PackedDocDeltaBlock --> {@link PackedInts PackedInts}, does not use patching
|
||||||
* <li>DocDelta, Freq, DocSkip, DocFPSkip, PosFPSkip, PosBlockOffset, PayByteUpto,
|
* <li>DocDelta, Freq, DocSkip, DocFPSkip, PosFPSkip, PosBlockOffset, PayByteUpto,
|
||||||
* PayFPSkip, ImpactLength, CompetitiveFreqDelta --> {@link DataOutput#writeVInt
|
* PayFPSkip, ImpactLength, CompetitiveFreqDelta --> {@link DataOutput#writeVInt
|
||||||
* VInt}
|
* VInt}
|
||||||
|
|
|
@ -142,21 +142,25 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
||||||
|
|
||||||
/** Read values that have been written using variable-length encoding instead of bit-packing. */
|
/** Read values that have been written using variable-length encoding instead of bit-packing. */
|
||||||
static void readVIntBlock(
|
static void readVIntBlock(
|
||||||
IndexInput docIn, long[] docBuffer, long[] freqBuffer, int num, boolean indexHasFreq)
|
IndexInput docIn,
|
||||||
|
long[] docBuffer,
|
||||||
|
long[] freqBuffer,
|
||||||
|
int num,
|
||||||
|
boolean indexHasFreq,
|
||||||
|
boolean decodeFreq)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
if (indexHasFreq) {
|
GroupVIntReader.readValues(docIn, docBuffer, num);
|
||||||
for (int i = 0; i < num; i++) {
|
if (indexHasFreq && decodeFreq) {
|
||||||
final int code = docIn.readVInt();
|
for (int i = 0; i < num; ++i) {
|
||||||
docBuffer[i] = code >>> 1;
|
freqBuffer[i] = docBuffer[i] & 0x01;
|
||||||
if ((code & 1) != 0) {
|
docBuffer[i] >>= 1;
|
||||||
freqBuffer[i] = 1;
|
if (freqBuffer[i] == 0) {
|
||||||
} else {
|
|
||||||
freqBuffer[i] = docIn.readVInt();
|
freqBuffer[i] = docIn.readVInt();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else if (indexHasFreq) {
|
||||||
for (int i = 0; i < num; i++) {
|
for (int i = 0; i < num; ++i) {
|
||||||
docBuffer[i] = docIn.readVInt();
|
docBuffer[i] >>= 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -471,7 +475,7 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
||||||
blockUpto++;
|
blockUpto++;
|
||||||
} else {
|
} else {
|
||||||
// Read vInts:
|
// Read vInts:
|
||||||
readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq);
|
readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, needsFreq);
|
||||||
prefixSum(docBuffer, left, accum);
|
prefixSum(docBuffer, left, accum);
|
||||||
docBuffer[left] = NO_MORE_DOCS;
|
docBuffer[left] = NO_MORE_DOCS;
|
||||||
blockUpto += left;
|
blockUpto += left;
|
||||||
|
@ -764,7 +768,7 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
||||||
docBuffer[1] = NO_MORE_DOCS;
|
docBuffer[1] = NO_MORE_DOCS;
|
||||||
blockUpto++;
|
blockUpto++;
|
||||||
} else {
|
} else {
|
||||||
readVIntBlock(docIn, docBuffer, freqBuffer, left, true);
|
readVIntBlock(docIn, docBuffer, freqBuffer, left, true, true);
|
||||||
prefixSum(docBuffer, left, accum);
|
prefixSum(docBuffer, left, accum);
|
||||||
docBuffer[left] = NO_MORE_DOCS;
|
docBuffer[left] = NO_MORE_DOCS;
|
||||||
blockUpto += left;
|
blockUpto += left;
|
||||||
|
@ -1073,8 +1077,6 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
||||||
|
|
||||||
private int nextSkipDoc = -1;
|
private int nextSkipDoc = -1;
|
||||||
|
|
||||||
private long seekTo = -1;
|
|
||||||
|
|
||||||
// as we read freqBuffer lazily, isFreqsRead shows if freqBuffer are read for the current block
|
// as we read freqBuffer lazily, isFreqsRead shows if freqBuffer are read for the current block
|
||||||
// always true when we don't have freqBuffer (indexHasFreq=false) or don't need freqBuffer
|
// always true when we don't have freqBuffer (indexHasFreq=false) or don't need freqBuffer
|
||||||
// (needsFreq=false)
|
// (needsFreq=false)
|
||||||
|
@ -1153,7 +1155,7 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
||||||
}
|
}
|
||||||
blockUpto += BLOCK_SIZE;
|
blockUpto += BLOCK_SIZE;
|
||||||
} else {
|
} else {
|
||||||
readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreqs);
|
readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreqs, true);
|
||||||
prefixSum(docBuffer, left, accum);
|
prefixSum(docBuffer, left, accum);
|
||||||
docBuffer[left] = NO_MORE_DOCS;
|
docBuffer[left] = NO_MORE_DOCS;
|
||||||
blockUpto += left;
|
blockUpto += left;
|
||||||
|
@ -1178,7 +1180,8 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
||||||
// Force to read next block
|
// Force to read next block
|
||||||
docBufferUpto = BLOCK_SIZE;
|
docBufferUpto = BLOCK_SIZE;
|
||||||
accum = skipper.getDoc();
|
accum = skipper.getDoc();
|
||||||
seekTo = skipper.getDocPointer(); // delay the seek
|
docIn.seek(skipper.getDocPointer());
|
||||||
|
isFreqsRead = true;
|
||||||
}
|
}
|
||||||
// next time we call advance, this is used to
|
// next time we call advance, this is used to
|
||||||
// foresee whether skipper is necessary.
|
// foresee whether skipper is necessary.
|
||||||
|
@ -1198,11 +1201,6 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
||||||
@Override
|
@Override
|
||||||
public int nextDoc() throws IOException {
|
public int nextDoc() throws IOException {
|
||||||
if (docBufferUpto == BLOCK_SIZE) {
|
if (docBufferUpto == BLOCK_SIZE) {
|
||||||
if (seekTo >= 0) {
|
|
||||||
docIn.seek(seekTo);
|
|
||||||
isFreqsRead = true; // reset isFreqsRead
|
|
||||||
seekTo = -1;
|
|
||||||
}
|
|
||||||
refillDocs();
|
refillDocs();
|
||||||
}
|
}
|
||||||
return this.doc = (int) docBuffer[docBufferUpto++];
|
return this.doc = (int) docBuffer[docBufferUpto++];
|
||||||
|
@ -1214,11 +1212,6 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
||||||
advanceShallow(target);
|
advanceShallow(target);
|
||||||
}
|
}
|
||||||
if (docBufferUpto == BLOCK_SIZE) {
|
if (docBufferUpto == BLOCK_SIZE) {
|
||||||
if (seekTo >= 0) {
|
|
||||||
docIn.seek(seekTo);
|
|
||||||
isFreqsRead = true; // reset isFreqsRead
|
|
||||||
seekTo = -1;
|
|
||||||
}
|
|
||||||
refillDocs();
|
refillDocs();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1307,8 +1300,6 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
||||||
|
|
||||||
private int nextSkipDoc = -1;
|
private int nextSkipDoc = -1;
|
||||||
|
|
||||||
private long seekTo = -1;
|
|
||||||
|
|
||||||
public BlockImpactsPostingsEnum(FieldInfo fieldInfo, IntBlockTermState termState)
|
public BlockImpactsPostingsEnum(FieldInfo fieldInfo, IntBlockTermState termState)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
indexHasOffsets =
|
indexHasOffsets =
|
||||||
|
@ -1372,7 +1363,7 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
||||||
forDeltaUtil.decodeAndPrefixSum(docIn, accum, docBuffer);
|
forDeltaUtil.decodeAndPrefixSum(docIn, accum, docBuffer);
|
||||||
pforUtil.decode(docIn, freqBuffer);
|
pforUtil.decode(docIn, freqBuffer);
|
||||||
} else {
|
} else {
|
||||||
readVIntBlock(docIn, docBuffer, freqBuffer, left, true);
|
readVIntBlock(docIn, docBuffer, freqBuffer, left, true, true);
|
||||||
prefixSum(docBuffer, left, accum);
|
prefixSum(docBuffer, left, accum);
|
||||||
docBuffer[left] = NO_MORE_DOCS;
|
docBuffer[left] = NO_MORE_DOCS;
|
||||||
}
|
}
|
||||||
|
@ -1426,7 +1417,7 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
||||||
accum = skipper.getDoc();
|
accum = skipper.getDoc();
|
||||||
posPendingFP = skipper.getPosPointer();
|
posPendingFP = skipper.getPosPointer();
|
||||||
posPendingCount = skipper.getPosBufferUpto();
|
posPendingCount = skipper.getPosBufferUpto();
|
||||||
seekTo = skipper.getDocPointer(); // delay the seek
|
docIn.seek(skipper.getDocPointer());
|
||||||
}
|
}
|
||||||
// next time we call advance, this is used to
|
// next time we call advance, this is used to
|
||||||
// foresee whether skipper is necessary.
|
// foresee whether skipper is necessary.
|
||||||
|
@ -1452,10 +1443,6 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
||||||
advanceShallow(target);
|
advanceShallow(target);
|
||||||
}
|
}
|
||||||
if (docBufferUpto == BLOCK_SIZE) {
|
if (docBufferUpto == BLOCK_SIZE) {
|
||||||
if (seekTo >= 0) {
|
|
||||||
docIn.seek(seekTo);
|
|
||||||
seekTo = -1;
|
|
||||||
}
|
|
||||||
refillDocs();
|
refillDocs();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1766,7 +1753,7 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
||||||
false; // freq block will be loaded lazily when necessary, we don't load it here
|
false; // freq block will be loaded lazily when necessary, we don't load it here
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq);
|
readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, true);
|
||||||
prefixSum(docBuffer, left, accum);
|
prefixSum(docBuffer, left, accum);
|
||||||
docBuffer[left] = NO_MORE_DOCS;
|
docBuffer[left] = NO_MORE_DOCS;
|
||||||
}
|
}
|
||||||
|
|
|
@ -92,6 +92,7 @@ public final class Lucene99PostingsWriter extends PushPostingsWriterBase {
|
||||||
private final PForUtil pforUtil;
|
private final PForUtil pforUtil;
|
||||||
private final ForDeltaUtil forDeltaUtil;
|
private final ForDeltaUtil forDeltaUtil;
|
||||||
private final Lucene99SkipWriter skipWriter;
|
private final Lucene99SkipWriter skipWriter;
|
||||||
|
private final GroupVIntWriter docGroupVIntWriter;
|
||||||
|
|
||||||
private boolean fieldHasNorms;
|
private boolean fieldHasNorms;
|
||||||
private NumericDocValues norms;
|
private NumericDocValues norms;
|
||||||
|
@ -172,6 +173,7 @@ public final class Lucene99PostingsWriter extends PushPostingsWriterBase {
|
||||||
skipWriter =
|
skipWriter =
|
||||||
new Lucene99SkipWriter(
|
new Lucene99SkipWriter(
|
||||||
MAX_SKIP_LEVELS, BLOCK_SIZE, state.segmentInfo.maxDoc(), docOut, posOut, payOut);
|
MAX_SKIP_LEVELS, BLOCK_SIZE, state.segmentInfo.maxDoc(), docOut, posOut, payOut);
|
||||||
|
docGroupVIntWriter = new GroupVIntWriter();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -370,17 +372,19 @@ public final class Lucene99PostingsWriter extends PushPostingsWriterBase {
|
||||||
singletonDocID = (int) docDeltaBuffer[0];
|
singletonDocID = (int) docDeltaBuffer[0];
|
||||||
} else {
|
} else {
|
||||||
singletonDocID = -1;
|
singletonDocID = -1;
|
||||||
// vInt encode the remaining doc deltas and freqs:
|
// Group vInt encode the remaining doc deltas and freqs:
|
||||||
for (int i = 0; i < docBufferUpto; i++) {
|
if (writeFreqs) {
|
||||||
final int docDelta = (int) docDeltaBuffer[i];
|
for (int i = 0; i < docBufferUpto; i++) {
|
||||||
final int freq = (int) freqBuffer[i];
|
docDeltaBuffer[i] = (docDeltaBuffer[i] << 1) | (freqBuffer[i] == 1 ? 1 : 0);
|
||||||
if (!writeFreqs) {
|
}
|
||||||
docOut.writeVInt(docDelta);
|
}
|
||||||
} else if (freq == 1) {
|
docGroupVIntWriter.writeValues(docOut, docDeltaBuffer, docBufferUpto);
|
||||||
docOut.writeVInt((docDelta << 1) | 1);
|
if (writeFreqs) {
|
||||||
} else {
|
for (int i = 0; i < docBufferUpto; i++) {
|
||||||
docOut.writeVInt(docDelta << 1);
|
final int freq = (int) freqBuffer[i];
|
||||||
docOut.writeVInt(freq);
|
if (freq != 1) {
|
||||||
|
docOut.writeVInt(freq);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -43,17 +43,17 @@ public final class Lucene99ScalarQuantizedVectorsFormat extends FlatVectorsForma
|
||||||
|
|
||||||
private static final FlatVectorsFormat rawVectorFormat = new Lucene99FlatVectorsFormat();
|
private static final FlatVectorsFormat rawVectorFormat = new Lucene99FlatVectorsFormat();
|
||||||
|
|
||||||
/** The minimum quantile */
|
/** The minimum confidence interval */
|
||||||
private static final float MINIMUM_QUANTILE = 0.9f;
|
private static final float MINIMUM_CONFIDENCE_INTERVAL = 0.9f;
|
||||||
|
|
||||||
/** The maximum quantile */
|
/** The maximum confidence interval */
|
||||||
private static final float MAXIMUM_QUANTILE = 1f;
|
private static final float MAXIMUM_CONFIDENCE_INTERVAL = 1f;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Controls the quantile used to scalar quantize the vectors the default quantile is calculated as
|
* Controls the confidence interval used to scalar quantize the vectors the default value is
|
||||||
* `1-1/(vector_dimensions + 1)`
|
* calculated as `1-1/(vector_dimensions + 1)`
|
||||||
*/
|
*/
|
||||||
final Float quantile;
|
final Float confidenceInterval;
|
||||||
|
|
||||||
/** Constructs a format using default graph construction parameters */
|
/** Constructs a format using default graph construction parameters */
|
||||||
public Lucene99ScalarQuantizedVectorsFormat() {
|
public Lucene99ScalarQuantizedVectorsFormat() {
|
||||||
|
@ -63,24 +63,26 @@ public final class Lucene99ScalarQuantizedVectorsFormat extends FlatVectorsForma
|
||||||
/**
|
/**
|
||||||
* Constructs a format using the given graph construction parameters.
|
* Constructs a format using the given graph construction parameters.
|
||||||
*
|
*
|
||||||
* @param quantile the quantile for scalar quantizing the vectors, when `null` it is calculated
|
* @param confidenceInterval the confidenceInterval for scalar quantizing the vectors, when `null`
|
||||||
* based on the vector field dimensions.
|
* it is calculated based on the vector field dimensions.
|
||||||
*/
|
*/
|
||||||
public Lucene99ScalarQuantizedVectorsFormat(Float quantile) {
|
public Lucene99ScalarQuantizedVectorsFormat(Float confidenceInterval) {
|
||||||
if (quantile != null && (quantile < MINIMUM_QUANTILE || quantile > MAXIMUM_QUANTILE)) {
|
if (confidenceInterval != null
|
||||||
|
&& (confidenceInterval < MINIMUM_CONFIDENCE_INTERVAL
|
||||||
|
|| confidenceInterval > MAXIMUM_CONFIDENCE_INTERVAL)) {
|
||||||
throw new IllegalArgumentException(
|
throw new IllegalArgumentException(
|
||||||
"quantile must be between "
|
"confidenceInterval must be between "
|
||||||
+ MINIMUM_QUANTILE
|
+ MINIMUM_CONFIDENCE_INTERVAL
|
||||||
+ " and "
|
+ " and "
|
||||||
+ MAXIMUM_QUANTILE
|
+ MAXIMUM_CONFIDENCE_INTERVAL
|
||||||
+ "; quantile="
|
+ "; confidenceInterval="
|
||||||
+ quantile);
|
+ confidenceInterval);
|
||||||
}
|
}
|
||||||
this.quantile = quantile;
|
this.confidenceInterval = confidenceInterval;
|
||||||
}
|
}
|
||||||
|
|
||||||
static float calculateDefaultQuantile(int vectorDimension) {
|
static float calculateDefaultConfidenceInterval(int vectorDimension) {
|
||||||
return Math.max(MINIMUM_QUANTILE, 1f - (1f / (vectorDimension + 1)));
|
return Math.max(MINIMUM_CONFIDENCE_INTERVAL, 1f - (1f / (vectorDimension + 1)));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -88,8 +90,8 @@ public final class Lucene99ScalarQuantizedVectorsFormat extends FlatVectorsForma
|
||||||
return NAME
|
return NAME
|
||||||
+ "(name="
|
+ "(name="
|
||||||
+ NAME
|
+ NAME
|
||||||
+ ", quantile="
|
+ ", confidenceInterval="
|
||||||
+ quantile
|
+ confidenceInterval
|
||||||
+ ", rawVectorFormat="
|
+ ", rawVectorFormat="
|
||||||
+ rawVectorFormat
|
+ rawVectorFormat
|
||||||
+ ")";
|
+ ")";
|
||||||
|
@ -98,7 +100,7 @@ public final class Lucene99ScalarQuantizedVectorsFormat extends FlatVectorsForma
|
||||||
@Override
|
@Override
|
||||||
public FlatVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException {
|
public FlatVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException {
|
||||||
return new Lucene99ScalarQuantizedVectorsWriter(
|
return new Lucene99ScalarQuantizedVectorsWriter(
|
||||||
state, quantile, rawVectorFormat.fieldsWriter(state));
|
state, confidenceInterval, rawVectorFormat.fieldsWriter(state));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -58,6 +58,7 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
|
||||||
|
|
||||||
Lucene99ScalarQuantizedVectorsReader(SegmentReadState state, FlatVectorsReader rawVectorsReader)
|
Lucene99ScalarQuantizedVectorsReader(SegmentReadState state, FlatVectorsReader rawVectorsReader)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
this.rawVectorsReader = rawVectorsReader;
|
||||||
int versionMeta = -1;
|
int versionMeta = -1;
|
||||||
String metaFileName =
|
String metaFileName =
|
||||||
IndexFileNames.segmentFileName(
|
IndexFileNames.segmentFileName(
|
||||||
|
@ -80,19 +81,8 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
|
||||||
} catch (Throwable exception) {
|
} catch (Throwable exception) {
|
||||||
priorE = exception;
|
priorE = exception;
|
||||||
} finally {
|
} finally {
|
||||||
try {
|
CodecUtil.checkFooter(meta, priorE);
|
||||||
CodecUtil.checkFooter(meta, priorE);
|
|
||||||
success = true;
|
|
||||||
} finally {
|
|
||||||
if (success == false) {
|
|
||||||
IOUtils.close(rawVectorsReader);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
success = false;
|
|
||||||
this.rawVectorsReader = rawVectorsReader;
|
|
||||||
try {
|
|
||||||
quantizedVectorData =
|
quantizedVectorData =
|
||||||
openDataInput(
|
openDataInput(
|
||||||
state,
|
state,
|
||||||
|
@ -313,10 +303,10 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
|
||||||
dimension = input.readVInt();
|
dimension = input.readVInt();
|
||||||
size = input.readInt();
|
size = input.readInt();
|
||||||
if (size > 0) {
|
if (size > 0) {
|
||||||
float configuredQuantile = Float.intBitsToFloat(input.readInt());
|
float confidenceInterval = Float.intBitsToFloat(input.readInt());
|
||||||
float minQuantile = Float.intBitsToFloat(input.readInt());
|
float minQuantile = Float.intBitsToFloat(input.readInt());
|
||||||
float maxQuantile = Float.intBitsToFloat(input.readInt());
|
float maxQuantile = Float.intBitsToFloat(input.readInt());
|
||||||
scalarQuantizer = new ScalarQuantizer(minQuantile, maxQuantile, configuredQuantile);
|
scalarQuantizer = new ScalarQuantizer(minQuantile, maxQuantile, confidenceInterval);
|
||||||
} else {
|
} else {
|
||||||
scalarQuantizer = null;
|
scalarQuantizer = null;
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,7 +19,7 @@ package org.apache.lucene.codecs.lucene99;
|
||||||
|
|
||||||
import static org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
|
import static org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
|
||||||
import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.QUANTIZED_VECTOR_COMPONENT;
|
import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.QUANTIZED_VECTOR_COMPONENT;
|
||||||
import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.calculateDefaultQuantile;
|
import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.calculateDefaultConfidenceInterval;
|
||||||
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
|
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
|
||||||
import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOfInstance;
|
import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOfInstance;
|
||||||
|
|
||||||
|
@ -91,14 +91,14 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
||||||
|
|
||||||
private final List<FieldWriter> fields = new ArrayList<>();
|
private final List<FieldWriter> fields = new ArrayList<>();
|
||||||
private final IndexOutput meta, quantizedVectorData;
|
private final IndexOutput meta, quantizedVectorData;
|
||||||
private final Float quantile;
|
private final Float confidenceInterval;
|
||||||
private final FlatVectorsWriter rawVectorDelegate;
|
private final FlatVectorsWriter rawVectorDelegate;
|
||||||
private boolean finished;
|
private boolean finished;
|
||||||
|
|
||||||
Lucene99ScalarQuantizedVectorsWriter(
|
Lucene99ScalarQuantizedVectorsWriter(
|
||||||
SegmentWriteState state, Float quantile, FlatVectorsWriter rawVectorDelegate)
|
SegmentWriteState state, Float confidenceInterval, FlatVectorsWriter rawVectorDelegate)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
this.quantile = quantile;
|
this.confidenceInterval = confidenceInterval;
|
||||||
segmentWriteState = state;
|
segmentWriteState = state;
|
||||||
String metaFileName =
|
String metaFileName =
|
||||||
IndexFileNames.segmentFileName(
|
IndexFileNames.segmentFileName(
|
||||||
|
@ -142,12 +142,12 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
||||||
public FlatFieldVectorsWriter<?> addField(
|
public FlatFieldVectorsWriter<?> addField(
|
||||||
FieldInfo fieldInfo, KnnFieldVectorsWriter<?> indexWriter) throws IOException {
|
FieldInfo fieldInfo, KnnFieldVectorsWriter<?> indexWriter) throws IOException {
|
||||||
if (fieldInfo.getVectorEncoding().equals(VectorEncoding.FLOAT32)) {
|
if (fieldInfo.getVectorEncoding().equals(VectorEncoding.FLOAT32)) {
|
||||||
float quantile =
|
float confidenceInterval =
|
||||||
this.quantile == null
|
this.confidenceInterval == null
|
||||||
? calculateDefaultQuantile(fieldInfo.getVectorDimension())
|
? calculateDefaultConfidenceInterval(fieldInfo.getVectorDimension())
|
||||||
: this.quantile;
|
: this.confidenceInterval;
|
||||||
FieldWriter quantizedWriter =
|
FieldWriter quantizedWriter =
|
||||||
new FieldWriter(quantile, fieldInfo, segmentWriteState.infoStream, indexWriter);
|
new FieldWriter(confidenceInterval, fieldInfo, segmentWriteState.infoStream, indexWriter);
|
||||||
fields.add(quantizedWriter);
|
fields.add(quantizedWriter);
|
||||||
indexWriter = quantizedWriter;
|
indexWriter = quantizedWriter;
|
||||||
}
|
}
|
||||||
|
@ -169,16 +169,16 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
||||||
DocsWithFieldSet docsWithField =
|
DocsWithFieldSet docsWithField =
|
||||||
writeQuantizedVectorData(quantizedVectorData, byteVectorValues);
|
writeQuantizedVectorData(quantizedVectorData, byteVectorValues);
|
||||||
long vectorDataLength = quantizedVectorData.getFilePointer() - vectorDataOffset;
|
long vectorDataLength = quantizedVectorData.getFilePointer() - vectorDataOffset;
|
||||||
float quantile =
|
float confidenceInterval =
|
||||||
this.quantile == null
|
this.confidenceInterval == null
|
||||||
? calculateDefaultQuantile(fieldInfo.getVectorDimension())
|
? calculateDefaultConfidenceInterval(fieldInfo.getVectorDimension())
|
||||||
: this.quantile;
|
: this.confidenceInterval;
|
||||||
writeMeta(
|
writeMeta(
|
||||||
fieldInfo,
|
fieldInfo,
|
||||||
segmentWriteState.segmentInfo.maxDoc(),
|
segmentWriteState.segmentInfo.maxDoc(),
|
||||||
vectorDataOffset,
|
vectorDataOffset,
|
||||||
vectorDataLength,
|
vectorDataLength,
|
||||||
quantile,
|
confidenceInterval,
|
||||||
mergedQuantizationState.getLowerQuantile(),
|
mergedQuantizationState.getLowerQuantile(),
|
||||||
mergedQuantizationState.getUpperQuantile(),
|
mergedQuantizationState.getUpperQuantile(),
|
||||||
docsWithField);
|
docsWithField);
|
||||||
|
@ -251,7 +251,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
||||||
maxDoc,
|
maxDoc,
|
||||||
vectorDataOffset,
|
vectorDataOffset,
|
||||||
vectorDataLength,
|
vectorDataLength,
|
||||||
quantile,
|
confidenceInterval,
|
||||||
fieldData.minQuantile,
|
fieldData.minQuantile,
|
||||||
fieldData.maxQuantile,
|
fieldData.maxQuantile,
|
||||||
fieldData.docsWithField);
|
fieldData.docsWithField);
|
||||||
|
@ -262,7 +262,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
||||||
int maxDoc,
|
int maxDoc,
|
||||||
long vectorDataOffset,
|
long vectorDataOffset,
|
||||||
long vectorDataLength,
|
long vectorDataLength,
|
||||||
Float configuredQuantizationQuantile,
|
Float confidenceInterval,
|
||||||
Float lowerQuantile,
|
Float lowerQuantile,
|
||||||
Float upperQuantile,
|
Float upperQuantile,
|
||||||
DocsWithFieldSet docsWithField)
|
DocsWithFieldSet docsWithField)
|
||||||
|
@ -279,9 +279,9 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
||||||
assert Float.isFinite(lowerQuantile) && Float.isFinite(upperQuantile);
|
assert Float.isFinite(lowerQuantile) && Float.isFinite(upperQuantile);
|
||||||
meta.writeInt(
|
meta.writeInt(
|
||||||
Float.floatToIntBits(
|
Float.floatToIntBits(
|
||||||
configuredQuantizationQuantile != null
|
confidenceInterval != null
|
||||||
? configuredQuantizationQuantile
|
? confidenceInterval
|
||||||
: calculateDefaultQuantile(field.getVectorDimension())));
|
: calculateDefaultConfidenceInterval(field.getVectorDimension())));
|
||||||
meta.writeInt(Float.floatToIntBits(lowerQuantile));
|
meta.writeInt(Float.floatToIntBits(lowerQuantile));
|
||||||
meta.writeInt(Float.floatToIntBits(upperQuantile));
|
meta.writeInt(Float.floatToIntBits(upperQuantile));
|
||||||
}
|
}
|
||||||
|
@ -344,7 +344,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
||||||
maxDoc,
|
maxDoc,
|
||||||
vectorDataOffset,
|
vectorDataOffset,
|
||||||
quantizedVectorLength,
|
quantizedVectorLength,
|
||||||
quantile,
|
confidenceInterval,
|
||||||
fieldData.minQuantile,
|
fieldData.minQuantile,
|
||||||
fieldData.maxQuantile,
|
fieldData.maxQuantile,
|
||||||
newDocsWithField);
|
newDocsWithField);
|
||||||
|
@ -374,11 +374,11 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
||||||
private ScalarQuantizer mergeQuantiles(FieldInfo fieldInfo, MergeState mergeState)
|
private ScalarQuantizer mergeQuantiles(FieldInfo fieldInfo, MergeState mergeState)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
assert fieldInfo.getVectorEncoding() == VectorEncoding.FLOAT32;
|
assert fieldInfo.getVectorEncoding() == VectorEncoding.FLOAT32;
|
||||||
float quantile =
|
float confidenceInterval =
|
||||||
this.quantile == null
|
this.confidenceInterval == null
|
||||||
? calculateDefaultQuantile(fieldInfo.getVectorDimension())
|
? calculateDefaultConfidenceInterval(fieldInfo.getVectorDimension())
|
||||||
: this.quantile;
|
: this.confidenceInterval;
|
||||||
return mergeAndRecalculateQuantiles(mergeState, fieldInfo, quantile);
|
return mergeAndRecalculateQuantiles(mergeState, fieldInfo, confidenceInterval);
|
||||||
}
|
}
|
||||||
|
|
||||||
private ScalarQuantizedCloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
|
private ScalarQuantizedCloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
|
||||||
|
@ -408,16 +408,16 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
||||||
quantizationDataInput, quantizationDataInput.length() - CodecUtil.footerLength());
|
quantizationDataInput, quantizationDataInput.length() - CodecUtil.footerLength());
|
||||||
long vectorDataLength = quantizedVectorData.getFilePointer() - vectorDataOffset;
|
long vectorDataLength = quantizedVectorData.getFilePointer() - vectorDataOffset;
|
||||||
CodecUtil.retrieveChecksum(quantizationDataInput);
|
CodecUtil.retrieveChecksum(quantizationDataInput);
|
||||||
float quantile =
|
float confidenceInterval =
|
||||||
this.quantile == null
|
this.confidenceInterval == null
|
||||||
? calculateDefaultQuantile(fieldInfo.getVectorDimension())
|
? calculateDefaultConfidenceInterval(fieldInfo.getVectorDimension())
|
||||||
: this.quantile;
|
: this.confidenceInterval;
|
||||||
writeMeta(
|
writeMeta(
|
||||||
fieldInfo,
|
fieldInfo,
|
||||||
segmentWriteState.segmentInfo.maxDoc(),
|
segmentWriteState.segmentInfo.maxDoc(),
|
||||||
vectorDataOffset,
|
vectorDataOffset,
|
||||||
vectorDataLength,
|
vectorDataLength,
|
||||||
quantile,
|
confidenceInterval,
|
||||||
mergedQuantizationState.getLowerQuantile(),
|
mergedQuantizationState.getLowerQuantile(),
|
||||||
mergedQuantizationState.getUpperQuantile(),
|
mergedQuantizationState.getUpperQuantile(),
|
||||||
docsWithField);
|
docsWithField);
|
||||||
|
@ -446,7 +446,9 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
||||||
}
|
}
|
||||||
|
|
||||||
static ScalarQuantizer mergeQuantiles(
|
static ScalarQuantizer mergeQuantiles(
|
||||||
List<ScalarQuantizer> quantizationStates, List<Integer> segmentSizes, float quantile) {
|
List<ScalarQuantizer> quantizationStates,
|
||||||
|
List<Integer> segmentSizes,
|
||||||
|
float confidenceInterval) {
|
||||||
assert quantizationStates.size() == segmentSizes.size();
|
assert quantizationStates.size() == segmentSizes.size();
|
||||||
if (quantizationStates.isEmpty()) {
|
if (quantizationStates.isEmpty()) {
|
||||||
return null;
|
return null;
|
||||||
|
@ -464,7 +466,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
||||||
}
|
}
|
||||||
lowerQuantile /= totalCount;
|
lowerQuantile /= totalCount;
|
||||||
upperQuantile /= totalCount;
|
upperQuantile /= totalCount;
|
||||||
return new ScalarQuantizer(lowerQuantile, upperQuantile, quantile);
|
return new ScalarQuantizer(lowerQuantile, upperQuantile, confidenceInterval);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -521,7 +523,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
||||||
}
|
}
|
||||||
|
|
||||||
static ScalarQuantizer mergeAndRecalculateQuantiles(
|
static ScalarQuantizer mergeAndRecalculateQuantiles(
|
||||||
MergeState mergeState, FieldInfo fieldInfo, float quantile) throws IOException {
|
MergeState mergeState, FieldInfo fieldInfo, float confidenceInterval) throws IOException {
|
||||||
List<ScalarQuantizer> quantizationStates = new ArrayList<>(mergeState.liveDocs.length);
|
List<ScalarQuantizer> quantizationStates = new ArrayList<>(mergeState.liveDocs.length);
|
||||||
List<Integer> segmentSizes = new ArrayList<>(mergeState.liveDocs.length);
|
List<Integer> segmentSizes = new ArrayList<>(mergeState.liveDocs.length);
|
||||||
for (int i = 0; i < mergeState.liveDocs.length; i++) {
|
for (int i = 0; i < mergeState.liveDocs.length; i++) {
|
||||||
|
@ -536,7 +538,8 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
||||||
segmentSizes.add(fvv.size());
|
segmentSizes.add(fvv.size());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ScalarQuantizer mergedQuantiles = mergeQuantiles(quantizationStates, segmentSizes, quantile);
|
ScalarQuantizer mergedQuantiles =
|
||||||
|
mergeQuantiles(quantizationStates, segmentSizes, confidenceInterval);
|
||||||
// Segments no providing quantization state indicates that their quantiles were never
|
// Segments no providing quantization state indicates that their quantiles were never
|
||||||
// calculated.
|
// calculated.
|
||||||
// To be safe, we should always recalculate given a sample set over all the float vectors in the
|
// To be safe, we should always recalculate given a sample set over all the float vectors in the
|
||||||
|
@ -545,7 +548,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
||||||
if (mergedQuantiles == null || shouldRecomputeQuantiles(mergedQuantiles, quantizationStates)) {
|
if (mergedQuantiles == null || shouldRecomputeQuantiles(mergedQuantiles, quantizationStates)) {
|
||||||
FloatVectorValues vectorValues =
|
FloatVectorValues vectorValues =
|
||||||
KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState);
|
KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState);
|
||||||
mergedQuantiles = ScalarQuantizer.fromVectors(vectorValues, quantile);
|
mergedQuantiles = ScalarQuantizer.fromVectors(vectorValues, confidenceInterval);
|
||||||
}
|
}
|
||||||
return mergedQuantiles;
|
return mergedQuantiles;
|
||||||
}
|
}
|
||||||
|
@ -599,7 +602,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
||||||
private static final long SHALLOW_SIZE = shallowSizeOfInstance(FieldWriter.class);
|
private static final long SHALLOW_SIZE = shallowSizeOfInstance(FieldWriter.class);
|
||||||
private final List<float[]> floatVectors;
|
private final List<float[]> floatVectors;
|
||||||
private final FieldInfo fieldInfo;
|
private final FieldInfo fieldInfo;
|
||||||
private final float quantile;
|
private final float confidenceInterval;
|
||||||
private final InfoStream infoStream;
|
private final InfoStream infoStream;
|
||||||
private final boolean normalize;
|
private final boolean normalize;
|
||||||
private float minQuantile = Float.POSITIVE_INFINITY;
|
private float minQuantile = Float.POSITIVE_INFINITY;
|
||||||
|
@ -609,12 +612,12 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
||||||
|
|
||||||
@SuppressWarnings("unchecked")
|
@SuppressWarnings("unchecked")
|
||||||
FieldWriter(
|
FieldWriter(
|
||||||
float quantile,
|
float confidenceInterval,
|
||||||
FieldInfo fieldInfo,
|
FieldInfo fieldInfo,
|
||||||
InfoStream infoStream,
|
InfoStream infoStream,
|
||||||
KnnFieldVectorsWriter<?> indexWriter) {
|
KnnFieldVectorsWriter<?> indexWriter) {
|
||||||
super((KnnFieldVectorsWriter<float[]>) indexWriter);
|
super((KnnFieldVectorsWriter<float[]>) indexWriter);
|
||||||
this.quantile = quantile;
|
this.confidenceInterval = confidenceInterval;
|
||||||
this.fieldInfo = fieldInfo;
|
this.fieldInfo = fieldInfo;
|
||||||
this.normalize = fieldInfo.getVectorSimilarityFunction() == VectorSimilarityFunction.COSINE;
|
this.normalize = fieldInfo.getVectorSimilarityFunction() == VectorSimilarityFunction.COSINE;
|
||||||
this.floatVectors = new ArrayList<>();
|
this.floatVectors = new ArrayList<>();
|
||||||
|
@ -635,15 +638,15 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
||||||
new FloatVectorWrapper(
|
new FloatVectorWrapper(
|
||||||
floatVectors,
|
floatVectors,
|
||||||
fieldInfo.getVectorSimilarityFunction() == VectorSimilarityFunction.COSINE),
|
fieldInfo.getVectorSimilarityFunction() == VectorSimilarityFunction.COSINE),
|
||||||
quantile);
|
confidenceInterval);
|
||||||
minQuantile = quantizer.getLowerQuantile();
|
minQuantile = quantizer.getLowerQuantile();
|
||||||
maxQuantile = quantizer.getUpperQuantile();
|
maxQuantile = quantizer.getUpperQuantile();
|
||||||
if (infoStream.isEnabled(QUANTIZED_VECTOR_COMPONENT)) {
|
if (infoStream.isEnabled(QUANTIZED_VECTOR_COMPONENT)) {
|
||||||
infoStream.message(
|
infoStream.message(
|
||||||
QUANTIZED_VECTOR_COMPONENT,
|
QUANTIZED_VECTOR_COMPONENT,
|
||||||
"quantized field="
|
"quantized field="
|
||||||
+ " quantile="
|
+ " confidenceInterval="
|
||||||
+ quantile
|
+ confidenceInterval
|
||||||
+ " minQuantile="
|
+ " minQuantile="
|
||||||
+ minQuantile
|
+ minQuantile
|
||||||
+ " maxQuantile="
|
+ " maxQuantile="
|
||||||
|
@ -654,7 +657,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
||||||
|
|
||||||
ScalarQuantizer createQuantizer() {
|
ScalarQuantizer createQuantizer() {
|
||||||
assert finished;
|
assert finished;
|
||||||
return new ScalarQuantizer(minQuantile, maxQuantile, quantile);
|
return new ScalarQuantizer(minQuantile, maxQuantile, confidenceInterval);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -119,7 +119,6 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
;
|
|
||||||
|
|
||||||
static String getSuffix(String formatName, String suffix) {
|
static String getSuffix(String formatName, String suffix) {
|
||||||
return formatName + "_" + suffix;
|
return formatName + "_" + suffix;
|
||||||
|
|
|
@ -272,7 +272,6 @@ public final class FeatureField extends Field {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
;
|
|
||||||
|
|
||||||
static final class LogFunction extends FeatureFunction {
|
static final class LogFunction extends FeatureFunction {
|
||||||
|
|
||||||
|
|
|
@ -16,6 +16,7 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.document;
|
package org.apache.lucene.document;
|
||||||
|
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import org.apache.lucene.index.DocValuesType;
|
import org.apache.lucene.index.DocValuesType;
|
||||||
import org.apache.lucene.index.IndexOptions;
|
import org.apache.lucene.index.IndexOptions;
|
||||||
|
@ -171,7 +172,7 @@ public class KeywordField extends Field {
|
||||||
* @throws NullPointerException if {@code field} is null.
|
* @throws NullPointerException if {@code field} is null.
|
||||||
* @return a query matching documents with this exact value
|
* @return a query matching documents with this exact value
|
||||||
*/
|
*/
|
||||||
public static Query newSetQuery(String field, BytesRef... values) {
|
public static Query newSetQuery(String field, Collection<BytesRef> values) {
|
||||||
Objects.requireNonNull(field, "field must not be null");
|
Objects.requireNonNull(field, "field must not be null");
|
||||||
Objects.requireNonNull(values, "values must not be null");
|
Objects.requireNonNull(values, "values must not be null");
|
||||||
Query indexQuery = new TermInSetQuery(field, values);
|
Query indexQuery = new TermInSetQuery(field, values);
|
||||||
|
|
|
@ -16,6 +16,7 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.document;
|
package org.apache.lucene.document;
|
||||||
|
|
||||||
|
import java.util.Collection;
|
||||||
import org.apache.lucene.index.DocValuesType;
|
import org.apache.lucene.index.DocValuesType;
|
||||||
import org.apache.lucene.search.IndexOrDocValuesQuery;
|
import org.apache.lucene.search.IndexOrDocValuesQuery;
|
||||||
import org.apache.lucene.search.MultiTermQuery;
|
import org.apache.lucene.search.MultiTermQuery;
|
||||||
|
@ -99,7 +100,7 @@ public class SortedDocValuesField extends Field {
|
||||||
* in an {@link IndexOrDocValuesQuery}, alongside a set query that executes on postings, such as
|
* in an {@link IndexOrDocValuesQuery}, alongside a set query that executes on postings, such as
|
||||||
* {@link TermInSetQuery}.
|
* {@link TermInSetQuery}.
|
||||||
*/
|
*/
|
||||||
public static Query newSlowSetQuery(String field, BytesRef... values) {
|
public static Query newSlowSetQuery(String field, Collection<BytesRef> values) {
|
||||||
return new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, field, values);
|
return new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, field, values);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,6 +16,7 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.document;
|
package org.apache.lucene.document;
|
||||||
|
|
||||||
|
import java.util.Collection;
|
||||||
import org.apache.lucene.index.DocValuesType;
|
import org.apache.lucene.index.DocValuesType;
|
||||||
import org.apache.lucene.search.IndexOrDocValuesQuery;
|
import org.apache.lucene.search.IndexOrDocValuesQuery;
|
||||||
import org.apache.lucene.search.MultiTermQuery;
|
import org.apache.lucene.search.MultiTermQuery;
|
||||||
|
@ -103,7 +104,7 @@ public class SortedSetDocValuesField extends Field {
|
||||||
* in an {@link IndexOrDocValuesQuery}, alongside a set query that executes on postings, such as
|
* in an {@link IndexOrDocValuesQuery}, alongside a set query that executes on postings, such as
|
||||||
* {@link TermInSetQuery}.
|
* {@link TermInSetQuery}.
|
||||||
*/
|
*/
|
||||||
public static Query newSlowSetQuery(String field, BytesRef... values) {
|
public static Query newSlowSetQuery(String field, Collection<BytesRef> values) {
|
||||||
return new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, field, values);
|
return new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, field, values);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -694,7 +694,7 @@ abstract class SpatialQuery extends Query {
|
||||||
final SpatialVisitor spatialVisitor, QueryRelation queryRelation, final FixedBitSet result) {
|
final SpatialVisitor spatialVisitor, QueryRelation queryRelation, final FixedBitSet result) {
|
||||||
final BiFunction<byte[], byte[], Relation> innerFunction =
|
final BiFunction<byte[], byte[], Relation> innerFunction =
|
||||||
spatialVisitor.getInnerFunction(queryRelation);
|
spatialVisitor.getInnerFunction(queryRelation);
|
||||||
;
|
|
||||||
return new IntersectVisitor() {
|
return new IntersectVisitor() {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -1254,8 +1254,7 @@ public final class Tessellator {
|
||||||
++numMerges;
|
++numMerges;
|
||||||
// step 'insize' places along from p
|
// step 'insize' places along from p
|
||||||
q = p;
|
q = p;
|
||||||
for (i = 0, pSize = 0; i < inSize && q != null; ++i, ++pSize, q = q.nextZ)
|
for (i = 0, pSize = 0; i < inSize && q != null; ++i, ++pSize, q = q.nextZ) {}
|
||||||
;
|
|
||||||
// if q hasn't fallen off end, we have two lists to merge
|
// if q hasn't fallen off end, we have two lists to merge
|
||||||
qSize = inSize;
|
qSize = inSize;
|
||||||
|
|
||||||
|
|
|
@ -22,11 +22,11 @@ import org.apache.lucene.store.DataOutput;
|
||||||
import org.apache.lucene.util.BitUtil;
|
import org.apache.lucene.util.BitUtil;
|
||||||
import org.apache.lucene.util.ByteBlockPool;
|
import org.apache.lucene.util.ByteBlockPool;
|
||||||
|
|
||||||
/* IndexInput that knows how to read the byte slices written
|
/**
|
||||||
* by Posting and PostingVector. We read the bytes in
|
* IndexInput that knows how to read the byte slices written by Posting and PostingVector. We read
|
||||||
* each slice until we hit the end of that slice at which
|
* the bytes in each slice until we hit the end of that slice at which point we read the forwarding
|
||||||
* point we read the forwarding address of the next slice
|
* address of the next slice and then jump to it.
|
||||||
* and then jump to it.*/
|
*/
|
||||||
final class ByteSliceReader extends DataInput {
|
final class ByteSliceReader extends DataInput {
|
||||||
ByteBlockPool pool;
|
ByteBlockPool pool;
|
||||||
int bufferUpto;
|
int bufferUpto;
|
||||||
|
|
|
@ -28,7 +28,7 @@ import java.nio.file.Paths;
|
||||||
import java.text.NumberFormat;
|
import java.text.NumberFormat;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Collections;
|
import java.util.Comparator;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -96,11 +96,11 @@ import org.apache.lucene.util.Version;
|
||||||
*/
|
*/
|
||||||
public final class CheckIndex implements Closeable {
|
public final class CheckIndex implements Closeable {
|
||||||
|
|
||||||
|
private final Directory dir;
|
||||||
|
private final Lock writeLock;
|
||||||
|
private final NumberFormat nf = NumberFormat.getInstance(Locale.ROOT);
|
||||||
private PrintStream infoStream;
|
private PrintStream infoStream;
|
||||||
private Directory dir;
|
|
||||||
private Lock writeLock;
|
|
||||||
private volatile boolean closed;
|
private volatile boolean closed;
|
||||||
private NumberFormat nf = NumberFormat.getInstance(Locale.ROOT);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returned from {@link #checkIndex()} detailing the health and status of the index.
|
* Returned from {@link #checkIndex()} detailing the health and status of the index.
|
||||||
|
@ -441,19 +441,20 @@ public final class CheckIndex implements Closeable {
|
||||||
IOUtils.close(writeLock);
|
IOUtils.close(writeLock);
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean doSlowChecks;
|
private int level;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* If true, additional slow checks are performed. This will likely drastically increase time it
|
* Sets Level, the higher the value, the more additional checks are performed. This will likely
|
||||||
* takes to run CheckIndex!
|
* drastically increase time it takes to run CheckIndex! See {@link Level}
|
||||||
*/
|
*/
|
||||||
public void setDoSlowChecks(boolean v) {
|
public void setLevel(int v) {
|
||||||
doSlowChecks = v;
|
Level.checkIfLevelInBounds(v);
|
||||||
|
level = v;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** See {@link #setDoSlowChecks}. */
|
/** See {@link #setLevel}. */
|
||||||
public boolean doSlowChecks() {
|
public int getLevel() {
|
||||||
return doSlowChecks;
|
return level;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean failFast;
|
private boolean failFast;
|
||||||
|
@ -473,21 +474,6 @@ public final class CheckIndex implements Closeable {
|
||||||
|
|
||||||
private boolean verbose;
|
private boolean verbose;
|
||||||
|
|
||||||
/** See {@link #getChecksumsOnly}. */
|
|
||||||
public boolean getChecksumsOnly() {
|
|
||||||
return checksumsOnly;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* If true, only validate physical integrity for all files. Note that the returned nested status
|
|
||||||
* objects (e.g. storedFieldStatus) will be null.
|
|
||||||
*/
|
|
||||||
public void setChecksumsOnly(boolean v) {
|
|
||||||
checksumsOnly = v;
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean checksumsOnly;
|
|
||||||
|
|
||||||
/** Set threadCount used for parallelizing index integrity checking. */
|
/** Set threadCount used for parallelizing index integrity checking. */
|
||||||
public void setThreadCount(int tc) {
|
public void setThreadCount(int tc) {
|
||||||
if (tc <= 0) {
|
if (tc <= 0) {
|
||||||
|
@ -586,7 +572,6 @@ public final class CheckIndex implements Closeable {
|
||||||
ensureOpen();
|
ensureOpen();
|
||||||
long startNS = System.nanoTime();
|
long startNS = System.nanoTime();
|
||||||
|
|
||||||
SegmentInfos sis = null;
|
|
||||||
Status result = new Status();
|
Status result = new Status();
|
||||||
result.dir = dir;
|
result.dir = dir;
|
||||||
String[] files = dir.listAll();
|
String[] files = dir.listAll();
|
||||||
|
@ -595,43 +580,115 @@ public final class CheckIndex implements Closeable {
|
||||||
throw new IndexNotFoundException(
|
throw new IndexNotFoundException(
|
||||||
"no segments* file found in " + dir + ": files: " + Arrays.toString(files));
|
"no segments* file found in " + dir + ": files: " + Arrays.toString(files));
|
||||||
}
|
}
|
||||||
try {
|
|
||||||
// Do not use SegmentInfos.read(Directory) since the spooky
|
// https://github.com/apache/lucene/issues/7820: also attempt to open any older commit
|
||||||
// retrying it does is not necessary here (we hold the write lock):
|
// points (segments_N), which will catch certain corruption like missing _N.si files
|
||||||
sis =
|
// for segments not also referenced by the newest commit point (which was already
|
||||||
SegmentInfos.readCommit(
|
// loaded, successfully, above). Note that we do not do a deeper check of segments
|
||||||
dir, lastSegmentsFile, 0 /* always open old indices if codecs are around */);
|
// referenced ONLY by these older commit points, because such corruption would not
|
||||||
} catch (Throwable t) {
|
// prevent a new IndexWriter from opening on the newest commit point. but it is still
|
||||||
if (failFast) {
|
// corruption, e.g. a reader opened on those old commit points can hit corruption
|
||||||
throw IOUtils.rethrowAlways(t);
|
// exceptions which we (still) will not detect here. progress not perfection!
|
||||||
|
|
||||||
|
SegmentInfos lastCommit = null;
|
||||||
|
|
||||||
|
List<String> allSegmentsFiles = new ArrayList<>();
|
||||||
|
for (String fileName : files) {
|
||||||
|
if (fileName.startsWith(IndexFileNames.SEGMENTS)
|
||||||
|
&& fileName.equals(SegmentInfos.OLD_SEGMENTS_GEN) == false) {
|
||||||
|
allSegmentsFiles.add(fileName);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sort descending by generation so that we always attempt to read the last commit first. This
|
||||||
|
// way if an index has a broken last commit AND a broken old commit, we report the last commit
|
||||||
|
// error first:
|
||||||
|
allSegmentsFiles.sort(
|
||||||
|
new Comparator<String>() {
|
||||||
|
@Override
|
||||||
|
public int compare(String a, String b) {
|
||||||
|
long genA = SegmentInfos.generationFromSegmentsFileName(a);
|
||||||
|
long genB = SegmentInfos.generationFromSegmentsFileName(b);
|
||||||
|
|
||||||
|
// reversed natural sort (largest generation first):
|
||||||
|
return -Long.compare(genA, genB);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
for (String fileName : allSegmentsFiles) {
|
||||||
|
|
||||||
|
boolean isLastCommit = fileName.equals(lastSegmentsFile);
|
||||||
|
|
||||||
|
SegmentInfos infos;
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Do not use SegmentInfos.read(Directory) since the spooky
|
||||||
|
// retrying it does is not necessary here (we hold the write lock):
|
||||||
|
// always open old indices if codecs are around
|
||||||
|
infos = SegmentInfos.readCommit(dir, fileName, 0);
|
||||||
|
} catch (Throwable t) {
|
||||||
|
if (failFast) {
|
||||||
|
throw IOUtils.rethrowAlways(t);
|
||||||
|
}
|
||||||
|
|
||||||
|
String message;
|
||||||
|
|
||||||
|
if (isLastCommit) {
|
||||||
|
message =
|
||||||
|
"ERROR: could not read latest commit point from segments file \""
|
||||||
|
+ fileName
|
||||||
|
+ "\" in directory";
|
||||||
|
} else {
|
||||||
|
message =
|
||||||
|
"ERROR: could not read old (not latest) commit point segments file \""
|
||||||
|
+ fileName
|
||||||
|
+ "\" in directory";
|
||||||
|
}
|
||||||
|
msg(infoStream, message);
|
||||||
|
result.missingSegments = true;
|
||||||
|
if (infoStream != null) {
|
||||||
|
t.printStackTrace(infoStream);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isLastCommit) {
|
||||||
|
// record the latest commit point: we will deeply check all segments referenced by it
|
||||||
|
lastCommit = infos;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// we know there is a lastSegmentsFileName, so we must've attempted to load it in the above for
|
||||||
|
// loop. if it failed to load, we threw the exception (fastFail == true) or we returned the
|
||||||
|
// failure (fastFail == false). so if we get here, we should // always have a valid lastCommit:
|
||||||
|
assert lastCommit != null;
|
||||||
|
|
||||||
|
if (lastCommit == null) {
|
||||||
msg(infoStream, "ERROR: could not read any segments file in directory");
|
msg(infoStream, "ERROR: could not read any segments file in directory");
|
||||||
result.missingSegments = true;
|
result.missingSegments = true;
|
||||||
if (infoStream != null) t.printStackTrace(infoStream);
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (infoStream != null) {
|
if (infoStream != null) {
|
||||||
int maxDoc = 0;
|
int maxDoc = 0;
|
||||||
int delCount = 0;
|
int delCount = 0;
|
||||||
for (SegmentCommitInfo info : sis) {
|
for (SegmentCommitInfo info : lastCommit) {
|
||||||
maxDoc += info.info.maxDoc();
|
maxDoc += info.info.maxDoc();
|
||||||
delCount += info.getDelCount();
|
delCount += info.getDelCount();
|
||||||
}
|
}
|
||||||
infoStream.println(
|
infoStream.printf(
|
||||||
String.format(
|
Locale.ROOT,
|
||||||
Locale.ROOT,
|
"%.2f%% total deletions; %d documents; %d deletions%n",
|
||||||
"%.2f%% total deletions; %d documents; %d deletions",
|
100. * delCount / maxDoc,
|
||||||
100. * delCount / maxDoc,
|
maxDoc,
|
||||||
maxDoc,
|
delCount);
|
||||||
delCount));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// find the oldest and newest segment versions
|
// find the oldest and newest segment versions
|
||||||
Version oldest = null;
|
Version oldest = null;
|
||||||
Version newest = null;
|
Version newest = null;
|
||||||
String oldSegs = null;
|
String oldSegs = null;
|
||||||
for (SegmentCommitInfo si : sis) {
|
for (SegmentCommitInfo si : lastCommit) {
|
||||||
Version version = si.info.getVersion();
|
Version version = si.info.getVersion();
|
||||||
if (version == null) {
|
if (version == null) {
|
||||||
// pre-3.1 segment
|
// pre-3.1 segment
|
||||||
|
@ -646,14 +703,14 @@ public final class CheckIndex implements Closeable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
final int numSegments = sis.size();
|
final int numSegments = lastCommit.size();
|
||||||
final String segmentsFileName = sis.getSegmentsFileName();
|
final String segmentsFileName = lastCommit.getSegmentsFileName();
|
||||||
result.segmentsFileName = segmentsFileName;
|
result.segmentsFileName = segmentsFileName;
|
||||||
result.numSegments = numSegments;
|
result.numSegments = numSegments;
|
||||||
result.userData = sis.getUserData();
|
result.userData = lastCommit.getUserData();
|
||||||
String userDataString;
|
String userDataString;
|
||||||
if (sis.getUserData().size() > 0) {
|
if (lastCommit.getUserData().size() > 0) {
|
||||||
userDataString = " userData=" + sis.getUserData();
|
userDataString = " userData=" + lastCommit.getUserData();
|
||||||
} else {
|
} else {
|
||||||
userDataString = "";
|
userDataString = "";
|
||||||
}
|
}
|
||||||
|
@ -681,7 +738,7 @@ public final class CheckIndex implements Closeable {
|
||||||
+ " "
|
+ " "
|
||||||
+ versionString
|
+ versionString
|
||||||
+ " id="
|
+ " id="
|
||||||
+ StringHelper.idToString(sis.getId())
|
+ StringHelper.idToString(lastCommit.getId())
|
||||||
+ userDataString);
|
+ userDataString);
|
||||||
|
|
||||||
if (onlySegments != null) {
|
if (onlySegments != null) {
|
||||||
|
@ -696,14 +753,14 @@ public final class CheckIndex implements Closeable {
|
||||||
msg(infoStream, ":");
|
msg(infoStream, ":");
|
||||||
}
|
}
|
||||||
|
|
||||||
result.newSegments = sis.clone();
|
result.newSegments = lastCommit.clone();
|
||||||
result.newSegments.clear();
|
result.newSegments.clear();
|
||||||
result.maxSegmentName = -1;
|
result.maxSegmentName = -1;
|
||||||
|
|
||||||
// checks segments sequentially
|
// checks segments sequentially
|
||||||
if (executorService == null) {
|
if (executorService == null) {
|
||||||
for (int i = 0; i < numSegments; i++) {
|
for (int i = 0; i < numSegments; i++) {
|
||||||
final SegmentCommitInfo info = sis.info(i);
|
final SegmentCommitInfo info = lastCommit.info(i);
|
||||||
updateMaxSegmentName(result, info);
|
updateMaxSegmentName(result, info);
|
||||||
if (onlySegments != null && !onlySegments.contains(info.info.name)) {
|
if (onlySegments != null && !onlySegments.contains(info.info.name)) {
|
||||||
continue;
|
continue;
|
||||||
|
@ -718,7 +775,7 @@ public final class CheckIndex implements Closeable {
|
||||||
+ info.info.name
|
+ info.info.name
|
||||||
+ " maxDoc="
|
+ " maxDoc="
|
||||||
+ info.info.maxDoc());
|
+ info.info.maxDoc());
|
||||||
Status.SegmentInfoStatus segmentInfoStatus = testSegment(sis, info, infoStream);
|
Status.SegmentInfoStatus segmentInfoStatus = testSegment(lastCommit, info, infoStream);
|
||||||
|
|
||||||
processSegmentInfoStatusResult(result, info, segmentInfoStatus);
|
processSegmentInfoStatusResult(result, info, segmentInfoStatus);
|
||||||
}
|
}
|
||||||
|
@ -729,14 +786,13 @@ public final class CheckIndex implements Closeable {
|
||||||
|
|
||||||
// checks segments concurrently
|
// checks segments concurrently
|
||||||
List<SegmentCommitInfo> segmentCommitInfos = new ArrayList<>();
|
List<SegmentCommitInfo> segmentCommitInfos = new ArrayList<>();
|
||||||
for (SegmentCommitInfo sci : sis) {
|
for (SegmentCommitInfo sci : lastCommit) {
|
||||||
segmentCommitInfos.add(sci);
|
segmentCommitInfos.add(sci);
|
||||||
}
|
}
|
||||||
|
|
||||||
// sort segmentCommitInfos by segment size, as smaller segment tends to finish faster, and
|
// sort segmentCommitInfos by segment size, as smaller segment tends to finish faster, and
|
||||||
// hence its output can be printed out faster
|
// hence its output can be printed out faster
|
||||||
Collections.sort(
|
segmentCommitInfos.sort(
|
||||||
segmentCommitInfos,
|
|
||||||
(info1, info2) -> {
|
(info1, info2) -> {
|
||||||
try {
|
try {
|
||||||
return Long.compare(info1.sizeInBytes(), info2.sizeInBytes());
|
return Long.compare(info1.sizeInBytes(), info2.sizeInBytes());
|
||||||
|
@ -757,7 +813,7 @@ public final class CheckIndex implements Closeable {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
SegmentInfos finalSis = sis;
|
SegmentInfos finalSis = lastCommit;
|
||||||
|
|
||||||
ByteArrayOutputStream output = new ByteArrayOutputStream();
|
ByteArrayOutputStream output = new ByteArrayOutputStream();
|
||||||
PrintStream stream = new PrintStream(output, true, IOUtils.UTF_8);
|
PrintStream stream = new PrintStream(output, true, IOUtils.UTF_8);
|
||||||
|
@ -813,7 +869,7 @@ public final class CheckIndex implements Closeable {
|
||||||
|
|
||||||
if (0 == result.numBadSegments) {
|
if (0 == result.numBadSegments) {
|
||||||
result.clean = true;
|
result.clean = true;
|
||||||
} else
|
} else {
|
||||||
msg(
|
msg(
|
||||||
infoStream,
|
infoStream,
|
||||||
"WARNING: "
|
"WARNING: "
|
||||||
|
@ -821,14 +877,16 @@ public final class CheckIndex implements Closeable {
|
||||||
+ " broken segments (containing "
|
+ " broken segments (containing "
|
||||||
+ result.totLoseDocCount
|
+ result.totLoseDocCount
|
||||||
+ " documents) detected");
|
+ " documents) detected");
|
||||||
|
}
|
||||||
|
|
||||||
if (!(result.validCounter = (result.maxSegmentName < sis.counter))) {
|
result.validCounter = result.maxSegmentName < lastCommit.counter;
|
||||||
|
if (result.validCounter == false) {
|
||||||
result.clean = false;
|
result.clean = false;
|
||||||
result.newSegments.counter = result.maxSegmentName + 1;
|
result.newSegments.counter = result.maxSegmentName + 1;
|
||||||
msg(
|
msg(
|
||||||
infoStream,
|
infoStream,
|
||||||
"ERROR: Next segment name counter "
|
"ERROR: Next segment name counter "
|
||||||
+ sis.counter
|
+ lastCommit.counter
|
||||||
+ " is not greater than max segment name "
|
+ " is not greater than max segment name "
|
||||||
+ result.maxSegmentName);
|
+ result.maxSegmentName);
|
||||||
}
|
}
|
||||||
|
@ -921,7 +979,7 @@ public final class CheckIndex implements Closeable {
|
||||||
msg(infoStream, " diagnostics = " + diagnostics);
|
msg(infoStream, " diagnostics = " + diagnostics);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!info.hasDeletions()) {
|
if (info.hasDeletions() == false) {
|
||||||
msg(infoStream, " no deletions");
|
msg(infoStream, " no deletions");
|
||||||
segInfoStat.hasDeletions = false;
|
segInfoStat.hasDeletions = false;
|
||||||
} else {
|
} else {
|
||||||
|
@ -960,26 +1018,26 @@ public final class CheckIndex implements Closeable {
|
||||||
toLoseDocCount = numDocs;
|
toLoseDocCount = numDocs;
|
||||||
|
|
||||||
if (reader.hasDeletions()) {
|
if (reader.hasDeletions()) {
|
||||||
if (reader.numDocs() != info.info.maxDoc() - info.getDelCount()) {
|
if (numDocs != info.info.maxDoc() - info.getDelCount()) {
|
||||||
throw new CheckIndexException(
|
throw new CheckIndexException(
|
||||||
"delete count mismatch: info="
|
"delete count mismatch: info="
|
||||||
+ (info.info.maxDoc() - info.getDelCount())
|
+ (info.info.maxDoc() - info.getDelCount())
|
||||||
+ " vs reader="
|
+ " vs reader="
|
||||||
+ reader.numDocs());
|
+ numDocs);
|
||||||
}
|
}
|
||||||
if ((info.info.maxDoc() - reader.numDocs()) > reader.maxDoc()) {
|
if ((info.info.maxDoc() - numDocs) > reader.maxDoc()) {
|
||||||
throw new CheckIndexException(
|
throw new CheckIndexException(
|
||||||
"too many deleted docs: maxDoc()="
|
"too many deleted docs: maxDoc()="
|
||||||
+ reader.maxDoc()
|
+ reader.maxDoc()
|
||||||
+ " vs del count="
|
+ " vs del count="
|
||||||
+ (info.info.maxDoc() - reader.numDocs()));
|
+ (info.info.maxDoc() - numDocs));
|
||||||
}
|
}
|
||||||
if (info.info.maxDoc() - reader.numDocs() != info.getDelCount()) {
|
if (info.info.maxDoc() - numDocs != info.getDelCount()) {
|
||||||
throw new CheckIndexException(
|
throw new CheckIndexException(
|
||||||
"delete count mismatch: info="
|
"delete count mismatch: info="
|
||||||
+ info.getDelCount()
|
+ info.getDelCount()
|
||||||
+ " vs reader="
|
+ " vs reader="
|
||||||
+ (info.info.maxDoc() - reader.numDocs()));
|
+ (info.info.maxDoc() - numDocs));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (info.getDelCount() != 0) {
|
if (info.getDelCount() != 0) {
|
||||||
|
@ -987,11 +1045,10 @@ public final class CheckIndex implements Closeable {
|
||||||
"delete count mismatch: info="
|
"delete count mismatch: info="
|
||||||
+ info.getDelCount()
|
+ info.getDelCount()
|
||||||
+ " vs reader="
|
+ " vs reader="
|
||||||
+ (info.info.maxDoc() - reader.numDocs()));
|
+ (info.info.maxDoc() - numDocs));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (level >= Level.MIN_LEVEL_FOR_INTEGRITY_CHECKS) {
|
||||||
if (checksumsOnly == false) {
|
|
||||||
// Test Livedocs
|
// Test Livedocs
|
||||||
segInfoStat.liveDocStatus = testLiveDocs(reader, infoStream, failFast);
|
segInfoStat.liveDocStatus = testLiveDocs(reader, infoStream, failFast);
|
||||||
|
|
||||||
|
@ -1002,15 +1059,14 @@ public final class CheckIndex implements Closeable {
|
||||||
segInfoStat.fieldNormStatus = testFieldNorms(reader, infoStream, failFast);
|
segInfoStat.fieldNormStatus = testFieldNorms(reader, infoStream, failFast);
|
||||||
|
|
||||||
// Test the Term Index
|
// Test the Term Index
|
||||||
segInfoStat.termIndexStatus =
|
segInfoStat.termIndexStatus = testPostings(reader, infoStream, verbose, level, failFast);
|
||||||
testPostings(reader, infoStream, verbose, doSlowChecks, failFast);
|
|
||||||
|
|
||||||
// Test Stored Fields
|
// Test Stored Fields
|
||||||
segInfoStat.storedFieldStatus = testStoredFields(reader, infoStream, failFast);
|
segInfoStat.storedFieldStatus = testStoredFields(reader, infoStream, failFast);
|
||||||
|
|
||||||
// Test Term Vectors
|
// Test Term Vectors
|
||||||
segInfoStat.termVectorStatus =
|
segInfoStat.termVectorStatus =
|
||||||
testTermVectors(reader, infoStream, verbose, doSlowChecks, failFast);
|
testTermVectors(reader, infoStream, verbose, level, failFast);
|
||||||
|
|
||||||
// Test Docvalues
|
// Test Docvalues
|
||||||
segInfoStat.docValuesStatus = testDocValues(reader, infoStream, failFast);
|
segInfoStat.docValuesStatus = testDocValues(reader, infoStream, failFast);
|
||||||
|
@ -1213,7 +1269,7 @@ public final class CheckIndex implements Closeable {
|
||||||
if (liveDocs != null) {
|
if (liveDocs != null) {
|
||||||
// it's ok for it to be non-null here, as long as none are set right?
|
// it's ok for it to be non-null here, as long as none are set right?
|
||||||
for (int j = 0; j < liveDocs.length(); j++) {
|
for (int j = 0; j < liveDocs.length(); j++) {
|
||||||
if (!liveDocs.get(j)) {
|
if (liveDocs.get(j) == false) {
|
||||||
throw new CheckIndexException(
|
throw new CheckIndexException(
|
||||||
"liveDocs mismatch: info says no deletions but doc " + j + " is deleted.");
|
"liveDocs mismatch: info says no deletions but doc " + j + " is deleted.");
|
||||||
}
|
}
|
||||||
|
@ -1341,7 +1397,7 @@ public final class CheckIndex implements Closeable {
|
||||||
boolean isVectors,
|
boolean isVectors,
|
||||||
PrintStream infoStream,
|
PrintStream infoStream,
|
||||||
boolean verbose,
|
boolean verbose,
|
||||||
boolean doSlowChecks)
|
int level)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
// TODO: we should probably return our own stats thing...?!
|
// TODO: we should probably return our own stats thing...?!
|
||||||
long startNS;
|
long startNS;
|
||||||
|
@ -1450,7 +1506,7 @@ public final class CheckIndex implements Closeable {
|
||||||
+ hasFreqs);
|
+ hasFreqs);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!isVectors) {
|
if (isVectors == false) {
|
||||||
final boolean expectedHasPositions =
|
final boolean expectedHasPositions =
|
||||||
fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||||
if (hasPositions != expectedHasPositions) {
|
if (hasPositions != expectedHasPositions) {
|
||||||
|
@ -1810,7 +1866,7 @@ public final class CheckIndex implements Closeable {
|
||||||
// free-for-all before?
|
// free-for-all before?
|
||||||
// but for offsets in the postings lists these checks are fine: they were always
|
// but for offsets in the postings lists these checks are fine: they were always
|
||||||
// enforced by IndexWriter
|
// enforced by IndexWriter
|
||||||
if (!isVectors) {
|
if (isVectors == false) {
|
||||||
if (startOffset < 0) {
|
if (startOffset < 0) {
|
||||||
throw new CheckIndexException(
|
throw new CheckIndexException(
|
||||||
"term "
|
"term "
|
||||||
|
@ -1924,14 +1980,13 @@ public final class CheckIndex implements Closeable {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Checking score blocks is heavy, we only do it on long postings lists, on every 1024th
|
// Checking score blocks is heavy, we only do it on long postings lists, on every 1024th
|
||||||
// term
|
// term or if slow checks are enabled.
|
||||||
// or if slow checks are enabled.
|
if (level >= Level.MIN_LEVEL_FOR_SLOW_CHECKS
|
||||||
if (doSlowChecks
|
|
||||||
|| docFreq > 1024
|
|| docFreq > 1024
|
||||||
|| (status.termCount + status.delTermCount) % 1024 == 0) {
|
|| (status.termCount + status.delTermCount) % 1024 == 0) {
|
||||||
// First check max scores and block uptos
|
// First check max scores and block uptos
|
||||||
// But only if slok checks are enabled since we visit all docs
|
// But only if slow checks are enabled since we visit all docs
|
||||||
if (doSlowChecks) {
|
if (level >= Level.MIN_LEVEL_FOR_SLOW_CHECKS) {
|
||||||
int max = -1;
|
int max = -1;
|
||||||
int maxFreq = 0;
|
int maxFreq = 0;
|
||||||
ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS);
|
ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS);
|
||||||
|
@ -1998,9 +2053,9 @@ public final class CheckIndex implements Closeable {
|
||||||
Impacts impacts = impactsEnum.getImpacts();
|
Impacts impacts = impactsEnum.getImpacts();
|
||||||
checkImpacts(impacts, doc);
|
checkImpacts(impacts, doc);
|
||||||
maxFreq = Integer.MAX_VALUE;
|
maxFreq = Integer.MAX_VALUE;
|
||||||
for (int level = 0; level < impacts.numLevels(); ++level) {
|
for (int impactsLevel = 0; impactsLevel < impacts.numLevels(); ++impactsLevel) {
|
||||||
if (impacts.getDocIdUpTo(level) >= max) {
|
if (impacts.getDocIdUpTo(impactsLevel) >= max) {
|
||||||
List<Impact> perLevelImpacts = impacts.getImpacts(level);
|
List<Impact> perLevelImpacts = impacts.getImpacts(impactsLevel);
|
||||||
maxFreq = perLevelImpacts.get(perLevelImpacts.size() - 1).freq;
|
maxFreq = perLevelImpacts.get(perLevelImpacts.size() - 1).freq;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -2040,9 +2095,9 @@ public final class CheckIndex implements Closeable {
|
||||||
Impacts impacts = impactsEnum.getImpacts();
|
Impacts impacts = impactsEnum.getImpacts();
|
||||||
checkImpacts(impacts, doc);
|
checkImpacts(impacts, doc);
|
||||||
maxFreq = Integer.MAX_VALUE;
|
maxFreq = Integer.MAX_VALUE;
|
||||||
for (int level = 0; level < impacts.numLevels(); ++level) {
|
for (int impactsLevel = 0; impactsLevel < impacts.numLevels(); ++impactsLevel) {
|
||||||
if (impacts.getDocIdUpTo(level) >= max) {
|
if (impacts.getDocIdUpTo(impactsLevel) >= max) {
|
||||||
List<Impact> perLevelImpacts = impacts.getImpacts(level);
|
List<Impact> perLevelImpacts = impacts.getImpacts(impactsLevel);
|
||||||
maxFreq = perLevelImpacts.get(perLevelImpacts.size() - 1).freq;
|
maxFreq = perLevelImpacts.get(perLevelImpacts.size() - 1).freq;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -2151,7 +2206,7 @@ public final class CheckIndex implements Closeable {
|
||||||
+ " doesn't have terms according to postings but has a norm value that is not zero: "
|
+ " doesn't have terms according to postings but has a norm value that is not zero: "
|
||||||
+ Long.toUnsignedString(norm));
|
+ Long.toUnsignedString(norm));
|
||||||
}
|
}
|
||||||
} else if (norm == 0 && visitedDocs.get(doc)) {
|
} else if (visitedDocs.get(doc)) {
|
||||||
throw new CheckIndexException(
|
throw new CheckIndexException(
|
||||||
"Document "
|
"Document "
|
||||||
+ doc
|
+ doc
|
||||||
|
@ -2307,7 +2362,7 @@ public final class CheckIndex implements Closeable {
|
||||||
static void checkImpacts(Impacts impacts, int lastTarget) {
|
static void checkImpacts(Impacts impacts, int lastTarget) {
|
||||||
final int numLevels = impacts.numLevels();
|
final int numLevels = impacts.numLevels();
|
||||||
if (numLevels < 1) {
|
if (numLevels < 1) {
|
||||||
throw new CheckIndexException("The number of levels must be >= 1, got " + numLevels);
|
throw new CheckIndexException("The number of impact levels must be >= 1, got " + numLevels);
|
||||||
}
|
}
|
||||||
|
|
||||||
int docIdUpTo0 = impacts.getDocIdUpTo(0);
|
int docIdUpTo0 = impacts.getDocIdUpTo(0);
|
||||||
|
@ -2319,17 +2374,17 @@ public final class CheckIndex implements Closeable {
|
||||||
+ lastTarget);
|
+ lastTarget);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int level = 1; level < numLevels; ++level) {
|
for (int impactsLevel = 1; impactsLevel < numLevels; ++impactsLevel) {
|
||||||
int docIdUpTo = impacts.getDocIdUpTo(level);
|
int docIdUpTo = impacts.getDocIdUpTo(impactsLevel);
|
||||||
int previousDocIdUpTo = impacts.getDocIdUpTo(level - 1);
|
int previousDocIdUpTo = impacts.getDocIdUpTo(impactsLevel - 1);
|
||||||
if (docIdUpTo < previousDocIdUpTo) {
|
if (docIdUpTo < previousDocIdUpTo) {
|
||||||
throw new CheckIndexException(
|
throw new CheckIndexException(
|
||||||
"Decreasing return for getDocIdUpTo: level "
|
"Decreasing return for getDocIdUpTo: level "
|
||||||
+ (level - 1)
|
+ (impactsLevel - 1)
|
||||||
+ " returned "
|
+ " returned "
|
||||||
+ previousDocIdUpTo
|
+ previousDocIdUpTo
|
||||||
+ " but level "
|
+ " but level "
|
||||||
+ level
|
+ impactsLevel
|
||||||
+ " returned "
|
+ " returned "
|
||||||
+ docIdUpTo
|
+ docIdUpTo
|
||||||
+ " for target "
|
+ " for target "
|
||||||
|
@ -2337,10 +2392,10 @@ public final class CheckIndex implements Closeable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int level = 0; level < numLevels; ++level) {
|
for (int impactsLevel = 0; impactsLevel < numLevels; ++impactsLevel) {
|
||||||
List<Impact> perLevelImpacts = impacts.getImpacts(level);
|
List<Impact> perLevelImpacts = impacts.getImpacts(impactsLevel);
|
||||||
if (perLevelImpacts.isEmpty()) {
|
if (perLevelImpacts.isEmpty()) {
|
||||||
throw new CheckIndexException("Got empty list of impacts on level " + level);
|
throw new CheckIndexException("Got empty list of impacts on level " + impactsLevel);
|
||||||
}
|
}
|
||||||
Impact first = perLevelImpacts.get(0);
|
Impact first = perLevelImpacts.get(0);
|
||||||
if (first.freq < 1) {
|
if (first.freq < 1) {
|
||||||
|
@ -2358,9 +2413,9 @@ public final class CheckIndex implements Closeable {
|
||||||
"Impacts are not ordered or contain dups, got " + previous + " then " + impact);
|
"Impacts are not ordered or contain dups, got " + previous + " then " + impact);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (level > 0) {
|
if (impactsLevel > 0) {
|
||||||
// Make sure that impacts at level N trigger better scores than an level N-1
|
// Make sure that impacts at level N trigger better scores than at level N-1
|
||||||
Iterator<Impact> previousIt = impacts.getImpacts(level - 1).iterator();
|
Iterator<Impact> previousIt = impacts.getImpacts(impactsLevel - 1).iterator();
|
||||||
previous = previousIt.next();
|
previous = previousIt.next();
|
||||||
Iterator<Impact> it = perLevelImpacts.iterator();
|
Iterator<Impact> it = perLevelImpacts.iterator();
|
||||||
Impact impact = it.next();
|
Impact impact = it.next();
|
||||||
|
@ -2376,9 +2431,9 @@ public final class CheckIndex implements Closeable {
|
||||||
"Found impact "
|
"Found impact "
|
||||||
+ previous
|
+ previous
|
||||||
+ " on level "
|
+ " on level "
|
||||||
+ (level - 1)
|
+ (impactsLevel - 1)
|
||||||
+ " but no impact on level "
|
+ " but no impact on level "
|
||||||
+ level
|
+ impactsLevel
|
||||||
+ " triggers a better score: "
|
+ " triggers a better score: "
|
||||||
+ perLevelImpacts);
|
+ perLevelImpacts);
|
||||||
}
|
}
|
||||||
|
@ -2395,7 +2450,7 @@ public final class CheckIndex implements Closeable {
|
||||||
*/
|
*/
|
||||||
public static Status.TermIndexStatus testPostings(CodecReader reader, PrintStream infoStream)
|
public static Status.TermIndexStatus testPostings(CodecReader reader, PrintStream infoStream)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
return testPostings(reader, infoStream, false, true, false);
|
return testPostings(reader, infoStream, false, Level.MIN_LEVEL_FOR_SLOW_CHECKS, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -2404,15 +2459,11 @@ public final class CheckIndex implements Closeable {
|
||||||
* @lucene.experimental
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
public static Status.TermIndexStatus testPostings(
|
public static Status.TermIndexStatus testPostings(
|
||||||
CodecReader reader,
|
CodecReader reader, PrintStream infoStream, boolean verbose, int level, boolean failFast)
|
||||||
PrintStream infoStream,
|
|
||||||
boolean verbose,
|
|
||||||
boolean doSlowChecks,
|
|
||||||
boolean failFast)
|
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
// TODO: we should go and verify term vectors match, if
|
// TODO: we should go and verify term vectors match, if the Level is high enough to
|
||||||
// doSlowChecks is on...
|
// include slow checks
|
||||||
Status.TermIndexStatus status;
|
Status.TermIndexStatus status;
|
||||||
final int maxDoc = reader.maxDoc();
|
final int maxDoc = reader.maxDoc();
|
||||||
|
|
||||||
|
@ -2443,7 +2494,7 @@ public final class CheckIndex implements Closeable {
|
||||||
false,
|
false,
|
||||||
infoStream,
|
infoStream,
|
||||||
verbose,
|
verbose,
|
||||||
doSlowChecks);
|
level);
|
||||||
} catch (Throwable e) {
|
} catch (Throwable e) {
|
||||||
if (failFast) {
|
if (failFast) {
|
||||||
throw IOUtils.rethrowAlways(e);
|
throw IOUtils.rethrowAlways(e);
|
||||||
|
@ -3132,7 +3183,7 @@ public final class CheckIndex implements Closeable {
|
||||||
for (FieldInfo fieldInfo : reader.getFieldInfos()) {
|
for (FieldInfo fieldInfo : reader.getFieldInfos()) {
|
||||||
if (fieldInfo.getDocValuesType() != DocValuesType.NONE) {
|
if (fieldInfo.getDocValuesType() != DocValuesType.NONE) {
|
||||||
status.totalValueFields++;
|
status.totalValueFields++;
|
||||||
checkDocValues(fieldInfo, dvReader, reader.maxDoc(), infoStream, status);
|
checkDocValues(fieldInfo, dvReader, status);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3162,11 +3213,11 @@ public final class CheckIndex implements Closeable {
|
||||||
}
|
}
|
||||||
|
|
||||||
@FunctionalInterface
|
@FunctionalInterface
|
||||||
private static interface DocValuesIteratorSupplier {
|
private interface DocValuesIteratorSupplier {
|
||||||
DocValuesIterator get(FieldInfo fi) throws IOException;
|
DocValuesIterator get(FieldInfo fi) throws IOException;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void checkDVIterator(FieldInfo fi, int maxDoc, DocValuesIteratorSupplier producer)
|
private static void checkDVIterator(FieldInfo fi, DocValuesIteratorSupplier producer)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
String field = fi.name;
|
String field = fi.name;
|
||||||
|
|
||||||
|
@ -3284,7 +3335,7 @@ public final class CheckIndex implements Closeable {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void checkBinaryDocValues(
|
private static void checkBinaryDocValues(
|
||||||
String fieldName, int maxDoc, BinaryDocValues bdv, BinaryDocValues bdv2) throws IOException {
|
String fieldName, BinaryDocValues bdv, BinaryDocValues bdv2) throws IOException {
|
||||||
if (bdv.docID() != -1) {
|
if (bdv.docID() != -1) {
|
||||||
throw new CheckIndexException(
|
throw new CheckIndexException(
|
||||||
"binary dv iterator for field: "
|
"binary dv iterator for field: "
|
||||||
|
@ -3309,7 +3360,7 @@ public final class CheckIndex implements Closeable {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void checkSortedDocValues(
|
private static void checkSortedDocValues(
|
||||||
String fieldName, int maxDoc, SortedDocValues dv, SortedDocValues dv2) throws IOException {
|
String fieldName, SortedDocValues dv, SortedDocValues dv2) throws IOException {
|
||||||
if (dv.docID() != -1) {
|
if (dv.docID() != -1) {
|
||||||
throw new CheckIndexException(
|
throw new CheckIndexException(
|
||||||
"sorted dv iterator for field: "
|
"sorted dv iterator for field: "
|
||||||
|
@ -3373,8 +3424,7 @@ public final class CheckIndex implements Closeable {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void checkSortedSetDocValues(
|
private static void checkSortedSetDocValues(
|
||||||
String fieldName, int maxDoc, SortedSetDocValues dv, SortedSetDocValues dv2)
|
String fieldName, SortedSetDocValues dv, SortedSetDocValues dv2) throws IOException {
|
||||||
throws IOException {
|
|
||||||
final long maxOrd = dv.getValueCount() - 1;
|
final long maxOrd = dv.getValueCount() - 1;
|
||||||
LongBitSet seenOrds = new LongBitSet(dv.getValueCount());
|
LongBitSet seenOrds = new LongBitSet(dv.getValueCount());
|
||||||
long maxOrd2 = -1;
|
long maxOrd2 = -1;
|
||||||
|
@ -3470,7 +3520,7 @@ public final class CheckIndex implements Closeable {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void checkSortedNumericDocValues(
|
private static void checkSortedNumericDocValues(
|
||||||
String fieldName, int maxDoc, SortedNumericDocValues ndv, SortedNumericDocValues ndv2)
|
String fieldName, SortedNumericDocValues ndv, SortedNumericDocValues ndv2)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
if (ndv.docID() != -1) {
|
if (ndv.docID() != -1) {
|
||||||
throw new CheckIndexException(
|
throw new CheckIndexException(
|
||||||
|
@ -3539,38 +3589,32 @@ public final class CheckIndex implements Closeable {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void checkDocValues(
|
private static void checkDocValues(
|
||||||
FieldInfo fi,
|
FieldInfo fi, DocValuesProducer dvReader, DocValuesStatus status) throws Exception {
|
||||||
DocValuesProducer dvReader,
|
|
||||||
int maxDoc,
|
|
||||||
PrintStream infoStream,
|
|
||||||
DocValuesStatus status)
|
|
||||||
throws Exception {
|
|
||||||
switch (fi.getDocValuesType()) {
|
switch (fi.getDocValuesType()) {
|
||||||
case SORTED:
|
case SORTED:
|
||||||
status.totalSortedFields++;
|
status.totalSortedFields++;
|
||||||
checkDVIterator(fi, maxDoc, dvReader::getSorted);
|
checkDVIterator(fi, dvReader::getSorted);
|
||||||
checkSortedDocValues(fi.name, maxDoc, dvReader.getSorted(fi), dvReader.getSorted(fi));
|
checkSortedDocValues(fi.name, dvReader.getSorted(fi), dvReader.getSorted(fi));
|
||||||
break;
|
break;
|
||||||
case SORTED_NUMERIC:
|
case SORTED_NUMERIC:
|
||||||
status.totalSortedNumericFields++;
|
status.totalSortedNumericFields++;
|
||||||
checkDVIterator(fi, maxDoc, dvReader::getSortedNumeric);
|
checkDVIterator(fi, dvReader::getSortedNumeric);
|
||||||
checkSortedNumericDocValues(
|
checkSortedNumericDocValues(
|
||||||
fi.name, maxDoc, dvReader.getSortedNumeric(fi), dvReader.getSortedNumeric(fi));
|
fi.name, dvReader.getSortedNumeric(fi), dvReader.getSortedNumeric(fi));
|
||||||
break;
|
break;
|
||||||
case SORTED_SET:
|
case SORTED_SET:
|
||||||
status.totalSortedSetFields++;
|
status.totalSortedSetFields++;
|
||||||
checkDVIterator(fi, maxDoc, dvReader::getSortedSet);
|
checkDVIterator(fi, dvReader::getSortedSet);
|
||||||
checkSortedSetDocValues(
|
checkSortedSetDocValues(fi.name, dvReader.getSortedSet(fi), dvReader.getSortedSet(fi));
|
||||||
fi.name, maxDoc, dvReader.getSortedSet(fi), dvReader.getSortedSet(fi));
|
|
||||||
break;
|
break;
|
||||||
case BINARY:
|
case BINARY:
|
||||||
status.totalBinaryFields++;
|
status.totalBinaryFields++;
|
||||||
checkDVIterator(fi, maxDoc, dvReader::getBinary);
|
checkDVIterator(fi, dvReader::getBinary);
|
||||||
checkBinaryDocValues(fi.name, maxDoc, dvReader.getBinary(fi), dvReader.getBinary(fi));
|
checkBinaryDocValues(fi.name, dvReader.getBinary(fi), dvReader.getBinary(fi));
|
||||||
break;
|
break;
|
||||||
case NUMERIC:
|
case NUMERIC:
|
||||||
status.totalNumericFields++;
|
status.totalNumericFields++;
|
||||||
checkDVIterator(fi, maxDoc, dvReader::getNumeric);
|
checkDVIterator(fi, dvReader::getNumeric);
|
||||||
checkNumericDocValues(fi.name, dvReader.getNumeric(fi), dvReader.getNumeric(fi));
|
checkNumericDocValues(fi.name, dvReader.getNumeric(fi), dvReader.getNumeric(fi));
|
||||||
break;
|
break;
|
||||||
case NONE:
|
case NONE:
|
||||||
|
@ -3586,7 +3630,7 @@ public final class CheckIndex implements Closeable {
|
||||||
*/
|
*/
|
||||||
public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream)
|
public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
return testTermVectors(reader, infoStream, false, false, false);
|
return testTermVectors(reader, infoStream, false, Level.MIN_LEVEL_FOR_INTEGRITY_CHECKS, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -3595,11 +3639,7 @@ public final class CheckIndex implements Closeable {
|
||||||
* @lucene.experimental
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
public static Status.TermVectorStatus testTermVectors(
|
public static Status.TermVectorStatus testTermVectors(
|
||||||
CodecReader reader,
|
CodecReader reader, PrintStream infoStream, boolean verbose, int level, boolean failFast)
|
||||||
PrintStream infoStream,
|
|
||||||
boolean verbose,
|
|
||||||
boolean doSlowChecks,
|
|
||||||
boolean failFast)
|
|
||||||
throws IOException {
|
throws IOException {
|
||||||
long startNS = System.nanoTime();
|
long startNS = System.nanoTime();
|
||||||
final Status.TermVectorStatus status = new Status.TermVectorStatus();
|
final Status.TermVectorStatus status = new Status.TermVectorStatus();
|
||||||
|
@ -3612,14 +3652,14 @@ public final class CheckIndex implements Closeable {
|
||||||
|
|
||||||
PostingsEnum postings = null;
|
PostingsEnum postings = null;
|
||||||
|
|
||||||
// Only used if doSlowChecks is true:
|
// Only used if the Level is high enough to include slow checks:
|
||||||
PostingsEnum postingsDocs = null;
|
PostingsEnum postingsDocs = null;
|
||||||
|
|
||||||
final Bits liveDocs = reader.getLiveDocs();
|
final Bits liveDocs = reader.getLiveDocs();
|
||||||
|
|
||||||
FieldsProducer postingsFields;
|
FieldsProducer postingsFields;
|
||||||
// TODO: testTermsIndex
|
// TODO: testTermsIndex
|
||||||
if (doSlowChecks) {
|
if (level >= Level.MIN_LEVEL_FOR_SLOW_CHECKS) {
|
||||||
postingsFields = reader.getPostingsReader();
|
postingsFields = reader.getPostingsReader();
|
||||||
if (postingsFields != null) {
|
if (postingsFields != null) {
|
||||||
postingsFields = postingsFields.getMergeInstance();
|
postingsFields = postingsFields.getMergeInstance();
|
||||||
|
@ -3643,8 +3683,7 @@ public final class CheckIndex implements Closeable {
|
||||||
|
|
||||||
if (tfv != null) {
|
if (tfv != null) {
|
||||||
// First run with no deletions:
|
// First run with no deletions:
|
||||||
checkFields(
|
checkFields(tfv, null, 1, fieldInfos, null, false, true, infoStream, verbose, level);
|
||||||
tfv, null, 1, fieldInfos, null, false, true, infoStream, verbose, doSlowChecks);
|
|
||||||
|
|
||||||
// Only agg stats if the doc is live:
|
// Only agg stats if the doc is live:
|
||||||
final boolean doStats = liveDocs == null || liveDocs.get(j);
|
final boolean doStats = liveDocs == null || liveDocs.get(j);
|
||||||
|
@ -3660,7 +3699,7 @@ public final class CheckIndex implements Closeable {
|
||||||
|
|
||||||
// Make sure FieldInfo thinks this field is vector'd:
|
// Make sure FieldInfo thinks this field is vector'd:
|
||||||
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
|
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
|
||||||
if (!fieldInfo.hasVectors()) {
|
if (fieldInfo.hasVectors() == false) {
|
||||||
throw new CheckIndexException(
|
throw new CheckIndexException(
|
||||||
"docID="
|
"docID="
|
||||||
+ j
|
+ j
|
||||||
|
@ -3669,7 +3708,7 @@ public final class CheckIndex implements Closeable {
|
||||||
+ " but FieldInfo has storeTermVector=false");
|
+ " but FieldInfo has storeTermVector=false");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (doSlowChecks) {
|
if (level >= Level.MIN_LEVEL_FOR_SLOW_CHECKS) {
|
||||||
Terms terms = tfv.terms(field);
|
Terms terms = tfv.terms(field);
|
||||||
TermsEnum termsEnum = terms.iterator();
|
TermsEnum termsEnum = terms.iterator();
|
||||||
final boolean postingsHasFreq =
|
final boolean postingsHasFreq =
|
||||||
|
@ -3696,7 +3735,7 @@ public final class CheckIndex implements Closeable {
|
||||||
postings = termsEnum.postings(postings, PostingsEnum.ALL);
|
postings = termsEnum.postings(postings, PostingsEnum.ALL);
|
||||||
assert postings != null;
|
assert postings != null;
|
||||||
|
|
||||||
if (!postingsTermsEnum.seekExact(term)) {
|
if (postingsTermsEnum.seekExact(term) == false) {
|
||||||
throw new CheckIndexException(
|
throw new CheckIndexException(
|
||||||
"vector term="
|
"vector term="
|
||||||
+ term
|
+ term
|
||||||
|
@ -3852,7 +3891,7 @@ public final class CheckIndex implements Closeable {
|
||||||
+ " but postings does not.");
|
+ " but postings does not.");
|
||||||
}
|
}
|
||||||
BytesRef postingsPayload = postingsDocs.getPayload();
|
BytesRef postingsPayload = postingsDocs.getPayload();
|
||||||
if (!payload.equals(postingsPayload)) {
|
if (payload.equals(postingsPayload) == false) {
|
||||||
throw new CheckIndexException(
|
throw new CheckIndexException(
|
||||||
"vector term="
|
"vector term="
|
||||||
+ term
|
+ term
|
||||||
|
@ -3972,9 +4011,8 @@ public final class CheckIndex implements Closeable {
|
||||||
/** Run-time configuration options for CheckIndex commands. */
|
/** Run-time configuration options for CheckIndex commands. */
|
||||||
public static class Options {
|
public static class Options {
|
||||||
boolean doExorcise = false;
|
boolean doExorcise = false;
|
||||||
boolean doSlowChecks = false;
|
|
||||||
boolean verbose = false;
|
boolean verbose = false;
|
||||||
boolean doChecksumsOnly = false;
|
int level = Level.DEFAULT_VALUE;
|
||||||
int threadCount;
|
int threadCount;
|
||||||
List<String> onlySegments = new ArrayList<>();
|
List<String> onlySegments = new ArrayList<>();
|
||||||
String indexPath = null;
|
String indexPath = null;
|
||||||
|
@ -4011,9 +4049,10 @@ public final class CheckIndex implements Closeable {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!assertsOn())
|
if (assertsOn() == false) {
|
||||||
System.out.println(
|
System.out.println(
|
||||||
"\nNOTE: testing will be more thorough if you run java with '-ea:org.apache.lucene...', so assertions are enabled");
|
"\nNOTE: testing will be more thorough if you run java with '-ea:org.apache.lucene...', so assertions are enabled");
|
||||||
|
}
|
||||||
|
|
||||||
System.out.println("\nOpening index @ " + opts.indexPath + "\n");
|
System.out.println("\nOpening index @ " + opts.indexPath + "\n");
|
||||||
Directory directory = null;
|
Directory directory = null;
|
||||||
|
@ -4037,6 +4076,42 @@ public final class CheckIndex implements Closeable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Holder for the constants that define CheckIndex's -level parameter, plus bounds validation. */
|
||||||
|
public static class Level {
|
||||||
|
private Level() {}
|
||||||
|
|
||||||
|
/** Minimum valid level. */
|
||||||
|
public static final int MIN_VALUE = 1;
|
||||||
|
|
||||||
|
/** Maximum valid level. */
|
||||||
|
public static final int MAX_VALUE = 3;
|
||||||
|
|
||||||
|
/** The default level if none is specified. */
|
||||||
|
public static final int DEFAULT_VALUE = MIN_VALUE;
|
||||||
|
|
||||||
|
/** Minimum level required to run checksum checks. */
|
||||||
|
public static final int MIN_LEVEL_FOR_CHECKSUM_CHECKS = 1;
|
||||||
|
|
||||||
|
/** Minimum level required to run integrity checks. */
|
||||||
|
public static final int MIN_LEVEL_FOR_INTEGRITY_CHECKS = 2;
|
||||||
|
|
||||||
|
/** Minimum level required to run slow checks. */
|
||||||
|
public static final int MIN_LEVEL_FOR_SLOW_CHECKS = 3;
|
||||||
|
|
||||||
|
/** Checks that the given level value is within the allowed bounds; throws an IllegalArgumentException otherwise. */
|
||||||
|
public static void checkIfLevelInBounds(int levelVal) throws IllegalArgumentException {
|
||||||
|
if (levelVal < Level.MIN_VALUE || levelVal > Level.MAX_VALUE) {
|
||||||
|
throw new IllegalArgumentException(
|
||||||
|
String.format(
|
||||||
|
Locale.ROOT,
|
||||||
|
"ERROR: given value: '%d' for -level option is out of bounds. Please use a value from '%d'->'%d'",
|
||||||
|
levelVal,
|
||||||
|
Level.MIN_VALUE,
|
||||||
|
Level.MAX_VALUE));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse command line args into fields
|
* Parse command line args into fields
|
||||||
*
|
*
|
||||||
|
@ -4051,15 +4126,29 @@ public final class CheckIndex implements Closeable {
|
||||||
int i = 0;
|
int i = 0;
|
||||||
while (i < args.length) {
|
while (i < args.length) {
|
||||||
String arg = args[i];
|
String arg = args[i];
|
||||||
if ("-fast".equals(arg)) {
|
if ("-level".equals(arg)) {
|
||||||
opts.doChecksumsOnly = true;
|
if (i == args.length - 1) {
|
||||||
|
throw new IllegalArgumentException("ERROR: missing value for -level option");
|
||||||
|
}
|
||||||
|
i++;
|
||||||
|
int level = Integer.parseInt(args[i]);
|
||||||
|
Level.checkIfLevelInBounds(level);
|
||||||
|
opts.level = level;
|
||||||
|
} else if ("-fast".equals(arg)) {
|
||||||
|
// Deprecated. Remove in Lucene 11.
|
||||||
|
System.err.println(
|
||||||
|
"-fast is deprecated, use '-level 1' for explicitly verifying file checksums only. This is also now the default "
|
||||||
|
+ "behaviour!");
|
||||||
|
} else if ("-slow".equals(arg)) {
|
||||||
|
// Deprecated. Remove in Lucene 11.
|
||||||
|
System.err.println("-slow is deprecated, use '-level 3' instead for slow checks");
|
||||||
|
opts.level = Level.MIN_LEVEL_FOR_SLOW_CHECKS;
|
||||||
} else if ("-exorcise".equals(arg)) {
|
} else if ("-exorcise".equals(arg)) {
|
||||||
opts.doExorcise = true;
|
opts.doExorcise = true;
|
||||||
} else if ("-crossCheckTermVectors".equals(arg)) {
|
} else if ("-crossCheckTermVectors".equals(arg)) {
|
||||||
System.err.println("-crossCheckTermVectors is deprecated, use -slow instead");
|
// Deprecated. Remove in Lucene 11.
|
||||||
opts.doSlowChecks = true;
|
System.err.println("-crossCheckTermVectors is deprecated, use '-level 3' instead");
|
||||||
} else if ("-slow".equals(arg)) {
|
opts.level = Level.MAX_VALUE;
|
||||||
opts.doSlowChecks = true;
|
|
||||||
} else if (arg.equals("-verbose")) {
|
} else if (arg.equals("-verbose")) {
|
||||||
opts.verbose = true;
|
opts.verbose = true;
|
||||||
} else if (arg.equals("-segment")) {
|
} else if (arg.equals("-segment")) {
|
||||||
|
@ -4096,11 +4185,13 @@ public final class CheckIndex implements Closeable {
|
||||||
if (opts.indexPath == null) {
|
if (opts.indexPath == null) {
|
||||||
throw new IllegalArgumentException(
|
throw new IllegalArgumentException(
|
||||||
"\nERROR: index path not specified"
|
"\nERROR: index path not specified"
|
||||||
+ "\nUsage: java org.apache.lucene.index.CheckIndex pathToIndex [-exorcise] [-slow] [-segment X] [-segment Y] [-threadCount X] [-dir-impl X]\n"
|
+ "\nUsage: java org.apache.lucene.index.CheckIndex pathToIndex [-exorcise] [-level X] [-segment X] [-segment Y] [-threadCount X] [-dir-impl X]\n"
|
||||||
+ "\n"
|
+ "\n"
|
||||||
+ " -exorcise: actually write a new segments_N file, removing any problematic segments\n"
|
+ " -exorcise: actually write a new segments_N file, removing any problematic segments\n"
|
||||||
+ " -fast: just verify file checksums, omitting logical integrity checks\n"
|
+ " -level X: sets the detail level of the check. The higher the value, the more checks are done.\n"
|
||||||
+ " -slow: do additional slow checks; THIS IS VERY SLOW!\n"
|
+ " 1 - (Default) Checksum checks only.\n"
|
||||||
|
+ " 2 - All level 1 checks + logical integrity checks.\n"
|
||||||
|
+ " 3 - All level 2 checks + slow checks.\n"
|
||||||
+ " -codec X: when exorcising, codec to write the new segments_N file with\n"
|
+ " -codec X: when exorcising, codec to write the new segments_N file with\n"
|
||||||
+ " -verbose: print additional details\n"
|
+ " -verbose: print additional details\n"
|
||||||
+ " -segment X: only check the specified segments. This can be specified multiple\n"
|
+ " -segment X: only check the specified segments. This can be specified multiple\n"
|
||||||
|
@ -4115,7 +4206,8 @@ public final class CheckIndex implements Closeable {
|
||||||
+ "If no package is specified the "
|
+ "If no package is specified the "
|
||||||
+ FSDirectory.class.getPackage().getName()
|
+ FSDirectory.class.getPackage().getName()
|
||||||
+ " package will be used.\n"
|
+ " package will be used.\n"
|
||||||
+ "\n"
|
+ "CheckIndex only verifies file checksums as default.\n"
|
||||||
|
+ "Use -level with value of '2' or higher if you also want to check segment file contents.\n\n"
|
||||||
+ "**WARNING**: -exorcise *LOSES DATA*. This should only be used on an emergency basis as it will cause\n"
|
+ "**WARNING**: -exorcise *LOSES DATA*. This should only be used on an emergency basis as it will cause\n"
|
||||||
+ "documents (perhaps many) to be permanently removed from the index. Always make\n"
|
+ "documents (perhaps many) to be permanently removed from the index. Always make\n"
|
||||||
+ "a backup copy of your index before running this! Do not run this tool on an index\n"
|
+ "a backup copy of your index before running this! Do not run this tool on an index\n"
|
||||||
|
@ -4137,10 +4229,6 @@ public final class CheckIndex implements Closeable {
|
||||||
throw new IllegalArgumentException("ERROR: cannot specify both -exorcise and -segment");
|
throw new IllegalArgumentException("ERROR: cannot specify both -exorcise and -segment");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (opts.doChecksumsOnly && opts.doSlowChecks) {
|
|
||||||
throw new IllegalArgumentException("ERROR: cannot specify both -fast and -slow");
|
|
||||||
}
|
|
||||||
|
|
||||||
return opts;
|
return opts;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -4151,8 +4239,7 @@ public final class CheckIndex implements Closeable {
|
||||||
* @return 0 iff the index is clean, 1 otherwise
|
* @return 0 iff the index is clean, 1 otherwise
|
||||||
*/
|
*/
|
||||||
public int doCheck(Options opts) throws IOException, InterruptedException {
|
public int doCheck(Options opts) throws IOException, InterruptedException {
|
||||||
setDoSlowChecks(opts.doSlowChecks);
|
setLevel(opts.level);
|
||||||
setChecksumsOnly(opts.doChecksumsOnly);
|
|
||||||
setInfoStream(opts.out, opts.verbose);
|
setInfoStream(opts.out, opts.verbose);
|
||||||
// user provided thread count via command line argument, overriding the default with user
|
// user provided thread count via command line argument, overriding the default with user
|
||||||
// provided value
|
// provided value
|
||||||
|
@ -4166,8 +4253,8 @@ public final class CheckIndex implements Closeable {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!result.clean) {
|
if (result.clean == false) {
|
||||||
if (!opts.doExorcise) {
|
if (opts.doExorcise == false) {
|
||||||
opts.out.println(
|
opts.out.println(
|
||||||
"WARNING: would write new segments file, and "
|
"WARNING: would write new segments file, and "
|
||||||
+ result.totLoseDocCount
|
+ result.totLoseDocCount
|
||||||
|
|
|
@ -270,7 +270,6 @@ final class FieldUpdatesBuffer {
|
||||||
static class BufferedUpdate {
|
static class BufferedUpdate {
|
||||||
|
|
||||||
private BufferedUpdate() {}
|
private BufferedUpdate() {}
|
||||||
;
|
|
||||||
|
|
||||||
/** the max document ID this update should be applied to */
|
/** the max document ID this update should be applied to */
|
||||||
int docUpTo;
|
int docUpTo;
|
||||||
|
|
|
@ -33,6 +33,7 @@ import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Objects;
|
||||||
import java.util.Queue;
|
import java.util.Queue;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.concurrent.ConcurrentLinkedQueue;
|
import java.util.concurrent.ConcurrentLinkedQueue;
|
||||||
|
@ -55,6 +56,8 @@ import org.apache.lucene.index.DocValuesUpdate.BinaryDocValuesUpdate;
|
||||||
import org.apache.lucene.index.DocValuesUpdate.NumericDocValuesUpdate;
|
import org.apache.lucene.index.DocValuesUpdate.NumericDocValuesUpdate;
|
||||||
import org.apache.lucene.index.FieldInfos.FieldNumbers;
|
import org.apache.lucene.index.FieldInfos.FieldNumbers;
|
||||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||||
|
import org.apache.lucene.index.MergePolicy.MergeReader;
|
||||||
|
import org.apache.lucene.index.Sorter.DocMap;
|
||||||
import org.apache.lucene.internal.tests.IndexPackageAccess;
|
import org.apache.lucene.internal.tests.IndexPackageAccess;
|
||||||
import org.apache.lucene.internal.tests.IndexWriterAccess;
|
import org.apache.lucene.internal.tests.IndexWriterAccess;
|
||||||
import org.apache.lucene.internal.tests.TestSecrets;
|
import org.apache.lucene.internal.tests.TestSecrets;
|
||||||
|
@ -3413,8 +3416,20 @@ public class IndexWriter
|
||||||
Collections.emptyMap(),
|
Collections.emptyMap(),
|
||||||
config.getIndexSort());
|
config.getIndexSort());
|
||||||
|
|
||||||
List<CodecReader> readers =
|
List<CodecReader> readers = new ArrayList<>();
|
||||||
merge.getMergeReader().stream().map(r -> r.codecReader).collect(Collectors.toList());
|
for (MergeReader mr : merge.getMergeReader()) {
|
||||||
|
CodecReader reader = merge.wrapForMerge(mr.codecReader);
|
||||||
|
readers.add(reader);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (config.getIndexSort() == null && readers.isEmpty() == false) {
|
||||||
|
CodecReader mergedReader = SlowCompositeCodecReaderWrapper.wrap(readers);
|
||||||
|
DocMap docMap = merge.reorder(mergedReader, directory);
|
||||||
|
if (docMap != null) {
|
||||||
|
readers = Collections.singletonList(SortingCodecReader.wrap(mergedReader, docMap, null));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
SegmentMerger merger =
|
SegmentMerger merger =
|
||||||
new SegmentMerger(readers, segInfo, infoStream, trackingDir, globalFieldNumberMap, context);
|
new SegmentMerger(readers, segInfo, infoStream, trackingDir, globalFieldNumberMap, context);
|
||||||
|
|
||||||
|
@ -3464,6 +3479,8 @@ public class IndexWriter
|
||||||
merge.getMergeInfo().info.setUseCompoundFile(true);
|
merge.getMergeInfo().info.setUseCompoundFile(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
merge.setMergeInfo(merge.info);
|
||||||
|
|
||||||
// Have codec write SegmentInfo. Must do this after
|
// Have codec write SegmentInfo. Must do this after
|
||||||
// creating CFS so that 1) .si isn't slurped into CFS,
|
// creating CFS so that 1) .si isn't slurped into CFS,
|
||||||
// and 2) .si reflects useCompoundFile=true change
|
// and 2) .si reflects useCompoundFile=true change
|
||||||
|
@ -3791,7 +3808,7 @@ public class IndexWriter
|
||||||
new OneMergeWrappingMergePolicy(
|
new OneMergeWrappingMergePolicy(
|
||||||
config.getMergePolicy(),
|
config.getMergePolicy(),
|
||||||
toWrap ->
|
toWrap ->
|
||||||
new MergePolicy.OneMerge(toWrap.segments) {
|
new MergePolicy.OneMerge(toWrap) {
|
||||||
SegmentCommitInfo origInfo;
|
SegmentCommitInfo origInfo;
|
||||||
final AtomicBoolean onlyOnce = new AtomicBoolean(false);
|
final AtomicBoolean onlyOnce = new AtomicBoolean(false);
|
||||||
|
|
||||||
|
@ -3890,6 +3907,18 @@ public class IndexWriter
|
||||||
public CodecReader wrapForMerge(CodecReader reader) throws IOException {
|
public CodecReader wrapForMerge(CodecReader reader) throws IOException {
|
||||||
return toWrap.wrapForMerge(reader); // must delegate
|
return toWrap.wrapForMerge(reader); // must delegate
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Sorter.DocMap reorder(CodecReader reader, Directory dir)
|
||||||
|
throws IOException {
|
||||||
|
return toWrap.reorder(reader, dir); // must delegate
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setMergeInfo(SegmentCommitInfo info) {
|
||||||
|
super.setMergeInfo(info);
|
||||||
|
toWrap.setMergeInfo(info);
|
||||||
|
}
|
||||||
}),
|
}),
|
||||||
trigger,
|
trigger,
|
||||||
UNBOUNDED_MAX_MERGE_SEGMENTS);
|
UNBOUNDED_MAX_MERGE_SEGMENTS);
|
||||||
|
@ -4312,7 +4341,7 @@ public class IndexWriter
|
||||||
* merge.info). If no deletes were flushed, no new deletes file is saved.
|
* merge.info). If no deletes were flushed, no new deletes file is saved.
|
||||||
*/
|
*/
|
||||||
private synchronized ReadersAndUpdates commitMergedDeletesAndUpdates(
|
private synchronized ReadersAndUpdates commitMergedDeletesAndUpdates(
|
||||||
MergePolicy.OneMerge merge, MergeState mergeState) throws IOException {
|
MergePolicy.OneMerge merge, MergeState.DocMap[] docMaps) throws IOException {
|
||||||
|
|
||||||
mergeFinishedGen.incrementAndGet();
|
mergeFinishedGen.incrementAndGet();
|
||||||
|
|
||||||
|
@ -4336,7 +4365,7 @@ public class IndexWriter
|
||||||
|
|
||||||
boolean anyDVUpdates = false;
|
boolean anyDVUpdates = false;
|
||||||
|
|
||||||
assert sourceSegments.size() == mergeState.docMaps.length;
|
assert sourceSegments.size() == docMaps.length;
|
||||||
for (int i = 0; i < sourceSegments.size(); i++) {
|
for (int i = 0; i < sourceSegments.size(); i++) {
|
||||||
SegmentCommitInfo info = sourceSegments.get(i);
|
SegmentCommitInfo info = sourceSegments.get(i);
|
||||||
minGen = Math.min(info.getBufferedDeletesGen(), minGen);
|
minGen = Math.min(info.getBufferedDeletesGen(), minGen);
|
||||||
|
@ -4346,12 +4375,11 @@ public class IndexWriter
|
||||||
// the pool:
|
// the pool:
|
||||||
assert rld != null : "seg=" + info.info.name;
|
assert rld != null : "seg=" + info.info.name;
|
||||||
|
|
||||||
MergeState.DocMap segDocMap = mergeState.docMaps[i];
|
MergeState.DocMap segDocMap = docMaps[i];
|
||||||
|
|
||||||
carryOverHardDeletes(
|
carryOverHardDeletes(
|
||||||
mergedDeletesAndUpdates,
|
mergedDeletesAndUpdates,
|
||||||
maxDoc,
|
maxDoc,
|
||||||
mergeState.liveDocs[i],
|
|
||||||
merge.getMergeReader().get(i).hardLiveDocs,
|
merge.getMergeReader().get(i).hardLiveDocs,
|
||||||
rld.getHardLiveDocs(),
|
rld.getHardLiveDocs(),
|
||||||
segDocMap);
|
segDocMap);
|
||||||
|
@ -4454,26 +4482,21 @@ public class IndexWriter
|
||||||
private static void carryOverHardDeletes(
|
private static void carryOverHardDeletes(
|
||||||
ReadersAndUpdates mergedReadersAndUpdates,
|
ReadersAndUpdates mergedReadersAndUpdates,
|
||||||
int maxDoc,
|
int maxDoc,
|
||||||
Bits mergeLiveDocs, // the liveDocs used to build the segDocMaps
|
|
||||||
Bits prevHardLiveDocs, // the hard deletes when the merge reader was pulled
|
Bits prevHardLiveDocs, // the hard deletes when the merge reader was pulled
|
||||||
Bits currentHardLiveDocs, // the current hard deletes
|
Bits currentHardLiveDocs, // the current hard deletes
|
||||||
MergeState.DocMap segDocMap)
|
MergeState.DocMap segDocMap)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
assert mergeLiveDocs == null || mergeLiveDocs.length() == maxDoc;
|
|
||||||
// if we mix soft and hard deletes we need to make sure that we only carry over deletes
|
// if we mix soft and hard deletes we need to make sure that we only carry over deletes
|
||||||
// that were not deleted before. Otherwise the segDocMap doesn't contain a mapping.
|
// that were not deleted before. Otherwise the segDocMap doesn't contain a mapping.
|
||||||
// yet this is also required if any MergePolicy modifies the liveDocs since this is
|
// yet this is also required if any MergePolicy modifies the liveDocs since this is
|
||||||
// what the segDocMap is build on.
|
// what the segDocMap is build on.
|
||||||
final IntPredicate carryOverDelete =
|
final IntPredicate carryOverDelete =
|
||||||
mergeLiveDocs == null || mergeLiveDocs == prevHardLiveDocs
|
docId -> segDocMap.get(docId) != -1 && currentHardLiveDocs.get(docId) == false;
|
||||||
? docId -> currentHardLiveDocs.get(docId) == false
|
|
||||||
: docId -> mergeLiveDocs.get(docId) && currentHardLiveDocs.get(docId) == false;
|
|
||||||
if (prevHardLiveDocs != null) {
|
if (prevHardLiveDocs != null) {
|
||||||
// If we had deletions on starting the merge we must
|
// If we had deletions on starting the merge we must
|
||||||
// still have deletions now:
|
// still have deletions now:
|
||||||
assert currentHardLiveDocs != null;
|
assert currentHardLiveDocs != null;
|
||||||
assert mergeLiveDocs != null;
|
|
||||||
assert prevHardLiveDocs.length() == maxDoc;
|
assert prevHardLiveDocs.length() == maxDoc;
|
||||||
assert currentHardLiveDocs.length() == maxDoc;
|
assert currentHardLiveDocs.length() == maxDoc;
|
||||||
|
|
||||||
|
@ -4516,7 +4539,7 @@ public class IndexWriter
|
||||||
}
|
}
|
||||||
|
|
||||||
@SuppressWarnings("try")
|
@SuppressWarnings("try")
|
||||||
private synchronized boolean commitMerge(MergePolicy.OneMerge merge, MergeState mergeState)
|
private synchronized boolean commitMerge(MergePolicy.OneMerge merge, MergeState.DocMap[] docMaps)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
merge.onMergeComplete();
|
merge.onMergeComplete();
|
||||||
testPoint("startCommitMerge");
|
testPoint("startCommitMerge");
|
||||||
|
@ -4559,7 +4582,7 @@ public class IndexWriter
|
||||||
}
|
}
|
||||||
|
|
||||||
final ReadersAndUpdates mergedUpdates =
|
final ReadersAndUpdates mergedUpdates =
|
||||||
merge.info.info.maxDoc() == 0 ? null : commitMergedDeletesAndUpdates(merge, mergeState);
|
merge.info.info.maxDoc() == 0 ? null : commitMergedDeletesAndUpdates(merge, docMaps);
|
||||||
|
|
||||||
// If the doc store we are using has been closed and
|
// If the doc store we are using has been closed and
|
||||||
// is in now compound format (but wasn't when we
|
// is in now compound format (but wasn't when we
|
||||||
|
@ -5163,12 +5186,57 @@ public class IndexWriter
|
||||||
}
|
}
|
||||||
mergeReaders.add(wrappedReader);
|
mergeReaders.add(wrappedReader);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
MergeState.DocMap[] reorderDocMaps = null;
|
||||||
|
if (config.getIndexSort() == null) {
|
||||||
|
// Create a merged view of the input segments. This effectively does the merge.
|
||||||
|
CodecReader mergedView = SlowCompositeCodecReaderWrapper.wrap(mergeReaders);
|
||||||
|
Sorter.DocMap docMap = merge.reorder(mergedView, directory);
|
||||||
|
if (docMap != null) {
|
||||||
|
reorderDocMaps = new MergeState.DocMap[mergeReaders.size()];
|
||||||
|
int docBase = 0;
|
||||||
|
int i = 0;
|
||||||
|
for (CodecReader reader : mergeReaders) {
|
||||||
|
final int currentDocBase = docBase;
|
||||||
|
reorderDocMaps[i] =
|
||||||
|
docID -> {
|
||||||
|
Objects.checkIndex(docID, reader.maxDoc());
|
||||||
|
return docMap.oldToNew(currentDocBase + docID);
|
||||||
|
};
|
||||||
|
i++;
|
||||||
|
docBase += reader.maxDoc();
|
||||||
|
}
|
||||||
|
// This makes merging more expensive as it disables some bulk merging optimizations, so
|
||||||
|
// only do this if a non-null DocMap is returned.
|
||||||
|
mergeReaders =
|
||||||
|
Collections.singletonList(SortingCodecReader.wrap(mergedView, docMap, null));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
final SegmentMerger merger =
|
final SegmentMerger merger =
|
||||||
new SegmentMerger(
|
new SegmentMerger(
|
||||||
mergeReaders, merge.info.info, infoStream, dirWrapper, globalFieldNumberMap, context);
|
mergeReaders, merge.info.info, infoStream, dirWrapper, globalFieldNumberMap, context);
|
||||||
merge.info.setSoftDelCount(Math.toIntExact(softDeleteCount.get()));
|
merge.info.setSoftDelCount(Math.toIntExact(softDeleteCount.get()));
|
||||||
merge.checkAborted();
|
merge.checkAborted();
|
||||||
|
|
||||||
|
MergeState mergeState = merger.mergeState;
|
||||||
|
MergeState.DocMap[] docMaps;
|
||||||
|
if (reorderDocMaps == null) {
|
||||||
|
docMaps = mergeState.docMaps;
|
||||||
|
} else {
|
||||||
|
// Since the reader was reordered, we passed a merged view to MergeState and from its
|
||||||
|
// perspective there is a single input segment to the merge and the
|
||||||
|
// SlowCompositeCodecReaderWrapper is effectively doing the merge.
|
||||||
|
assert mergeState.docMaps.length == 1
|
||||||
|
: "Got " + mergeState.docMaps.length + " docMaps, but expected 1";
|
||||||
|
MergeState.DocMap compactionDocMap = mergeState.docMaps[0];
|
||||||
|
docMaps = new MergeState.DocMap[reorderDocMaps.length];
|
||||||
|
for (int i = 0; i < docMaps.length; ++i) {
|
||||||
|
MergeState.DocMap reorderDocMap = reorderDocMaps[i];
|
||||||
|
docMaps[i] = docID -> compactionDocMap.get(reorderDocMap.get(docID));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
merge.mergeStartNS = System.nanoTime();
|
merge.mergeStartNS = System.nanoTime();
|
||||||
|
|
||||||
// This is where all the work happens:
|
// This is where all the work happens:
|
||||||
|
@ -5176,7 +5244,6 @@ public class IndexWriter
|
||||||
merger.merge();
|
merger.merge();
|
||||||
}
|
}
|
||||||
|
|
||||||
MergeState mergeState = merger.mergeState;
|
|
||||||
assert mergeState.segmentInfo == merge.info.info;
|
assert mergeState.segmentInfo == merge.info.info;
|
||||||
merge.info.info.setFiles(new HashSet<>(dirWrapper.getCreatedFiles()));
|
merge.info.info.setFiles(new HashSet<>(dirWrapper.getCreatedFiles()));
|
||||||
Codec codec = config.getCodec();
|
Codec codec = config.getCodec();
|
||||||
|
@ -5229,7 +5296,7 @@ public class IndexWriter
|
||||||
// Merge would produce a 0-doc segment, so we do nothing except commit the merge to remove
|
// Merge would produce a 0-doc segment, so we do nothing except commit the merge to remove
|
||||||
// all the 0-doc segments that we "merged":
|
// all the 0-doc segments that we "merged":
|
||||||
assert merge.info.info.maxDoc() == 0;
|
assert merge.info.info.maxDoc() == 0;
|
||||||
success = commitMerge(merge, mergeState);
|
success = commitMerge(merge, docMaps);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -5309,6 +5376,8 @@ public class IndexWriter
|
||||||
success = false;
|
success = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
merge.setMergeInfo(merge.info);
|
||||||
|
|
||||||
// Have codec write SegmentInfo. Must do this after
|
// Have codec write SegmentInfo. Must do this after
|
||||||
// creating CFS so that 1) .si isn't slurped into CFS,
|
// creating CFS so that 1) .si isn't slurped into CFS,
|
||||||
// and 2) .si reflects useCompoundFile=true change
|
// and 2) .si reflects useCompoundFile=true change
|
||||||
|
@ -5352,7 +5421,7 @@ public class IndexWriter
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!commitMerge(merge, mergeState)) {
|
if (!commitMerge(merge, docMaps)) {
|
||||||
// commitMerge will return false if this merge was
|
// commitMerge will return false if this merge was
|
||||||
// aborted
|
// aborted
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -255,6 +255,15 @@ public abstract class MergePolicy {
|
||||||
usesPooledReaders = false;
|
usesPooledReaders = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Constructor for wrapping. */
|
||||||
|
protected OneMerge(OneMerge oneMerge) {
|
||||||
|
this.segments = oneMerge.segments;
|
||||||
|
this.mergeReaders = oneMerge.mergeReaders;
|
||||||
|
this.totalMaxDoc = oneMerge.totalMaxDoc;
|
||||||
|
this.mergeProgress = new OneMergeProgress();
|
||||||
|
this.usesPooledReaders = oneMerge.usesPooledReaders;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Called by {@link IndexWriter} after the merge started and from the thread that will be
|
* Called by {@link IndexWriter} after the merge started and from the thread that will be
|
||||||
* executing the merge.
|
* executing the merge.
|
||||||
|
@ -288,11 +297,32 @@ public abstract class MergePolicy {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Wrap the reader in order to add/remove information to the merged segment. */
|
/**
|
||||||
|
* Wrap a reader prior to merging in order to add/remove fields or documents.
|
||||||
|
*
|
||||||
|
* <p><b>NOTE:</b> It is illegal to reorder doc IDs here, use {@link
|
||||||
|
* #reorder(CodecReader,Directory)} instead.
|
||||||
|
*/
|
||||||
public CodecReader wrapForMerge(CodecReader reader) throws IOException {
|
public CodecReader wrapForMerge(CodecReader reader) throws IOException {
|
||||||
return reader;
|
return reader;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extend this method if you wish to renumber doc IDs. This method will be called when index
|
||||||
|
* sorting is disabled on a merged view of the {@link OneMerge}. A {@code null} return value
|
||||||
|
* indicates that doc IDs should not be reordered.
|
||||||
|
*
|
||||||
|
* <p><b>NOTE:</b> Returning a non-null value here disables several optimizations and increases
|
||||||
|
* the merging overhead.
|
||||||
|
*
|
||||||
|
* @param reader The reader to reorder.
|
||||||
|
* @param dir The {@link Directory} of the index, which may be used to create temporary files.
|
||||||
|
* @lucene.experimental
|
||||||
|
*/
|
||||||
|
public Sorter.DocMap reorder(CodecReader reader, Directory dir) throws IOException {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Expert: Sets the {@link SegmentCommitInfo} of the merged segment. Allows sub-classes to e.g.
|
* Expert: Sets the {@link SegmentCommitInfo} of the merged segment. Allows sub-classes to e.g.
|
||||||
* {@link SegmentInfo#addDiagnostics(Map) add diagnostic} properties.
|
* {@link SegmentInfo#addDiagnostics(Map) add diagnostic} properties.
|
||||||
|
@ -355,11 +385,7 @@ public abstract class MergePolicy {
|
||||||
* not indicate the number of documents after the merge.
|
* not indicate the number of documents after the merge.
|
||||||
*/
|
*/
|
||||||
public int totalNumDocs() {
|
public int totalNumDocs() {
|
||||||
int total = 0;
|
return totalMaxDoc;
|
||||||
for (SegmentCommitInfo info : segments) {
|
|
||||||
total += info.info.maxDoc();
|
|
||||||
}
|
|
||||||
return total;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Return {@link MergeInfo} describing this merge. */
|
/** Return {@link MergeInfo} describing this merge. */
|
||||||
|
|
|
@ -177,16 +177,13 @@ public class MergeState {
|
||||||
|
|
||||||
final int docBase = totalDocs;
|
final int docBase = totalDocs;
|
||||||
docMaps[i] =
|
docMaps[i] =
|
||||||
new DocMap() {
|
docID -> {
|
||||||
@Override
|
if (liveDocs == null) {
|
||||||
public int get(int docID) {
|
return docBase + docID;
|
||||||
if (liveDocs == null) {
|
} else if (liveDocs.get(docID)) {
|
||||||
return docBase + docID;
|
return docBase + (int) delDocMap.get(docID);
|
||||||
} else if (liveDocs.get(docID)) {
|
} else {
|
||||||
return docBase + (int) delDocMap.get(docID);
|
return -1;
|
||||||
} else {
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
totalDocs += reader.numDocs();
|
totalDocs += reader.numDocs();
|
||||||
|
@ -242,13 +239,10 @@ public class MergeState {
|
||||||
}
|
}
|
||||||
|
|
||||||
/** A map of doc IDs. */
|
/** A map of doc IDs. */
|
||||||
public abstract static class DocMap {
|
@FunctionalInterface
|
||||||
/** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
|
public interface DocMap {
|
||||||
// Explicitly declared so that we have non-empty javadoc
|
|
||||||
protected DocMap() {}
|
|
||||||
|
|
||||||
/** Return the mapped docID or -1 if the given doc is not mapped. */
|
/** Return the mapped docID or -1 if the given doc is not mapped. */
|
||||||
public abstract int get(int docID);
|
int get(int docID);
|
||||||
}
|
}
|
||||||
|
|
||||||
static PackedLongValues removeDeletes(final int maxDoc, final Bits liveDocs) {
|
static PackedLongValues removeDeletes(final int maxDoc, final Bits liveDocs) {
|
||||||
|
|
|
@ -122,14 +122,11 @@ final class MultiSorter {
|
||||||
final PackedLongValues remapped = builders[i].build();
|
final PackedLongValues remapped = builders[i].build();
|
||||||
final Bits liveDocs = readers.get(i).getLiveDocs();
|
final Bits liveDocs = readers.get(i).getLiveDocs();
|
||||||
docMaps[i] =
|
docMaps[i] =
|
||||||
new MergeState.DocMap() {
|
docID -> {
|
||||||
@Override
|
if (liveDocs == null || liveDocs.get(docID)) {
|
||||||
public int get(int docID) {
|
return (int) remapped.get(docID);
|
||||||
if (liveDocs == null || liveDocs.get(docID)) {
|
} else {
|
||||||
return (int) remapped.get(docID);
|
return -1;
|
||||||
} else {
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
|
@ -325,7 +325,6 @@ public abstract class PointValues {
|
||||||
|
|
||||||
/** Notifies the caller that this many documents are about to be visited */
|
/** Notifies the caller that this many documents are about to be visited */
|
||||||
default void grow(int count) {}
|
default void grow(int count) {}
|
||||||
;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -526,7 +526,6 @@ final class ReadersAndUpdates {
|
||||||
return docIDOut;
|
return docIDOut;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
;
|
|
||||||
|
|
||||||
private synchronized Set<String> writeFieldInfosGen(
|
private synchronized Set<String> writeFieldInfosGen(
|
||||||
FieldInfos fieldInfos, Directory dir, FieldInfosFormat infosFormat) throws IOException {
|
FieldInfos fieldInfos, Directory dir, FieldInfosFormat infosFormat) throws IOException {
|
||||||
|
|
|
@ -122,7 +122,7 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
|
||||||
static final int VERSION_CURRENT = VERSION_86;
|
static final int VERSION_CURRENT = VERSION_86;
|
||||||
|
|
||||||
/** Name of the generation reference file name */
|
/** Name of the generation reference file name */
|
||||||
private static final String OLD_SEGMENTS_GEN = "segments.gen";
|
static final String OLD_SEGMENTS_GEN = "segments.gen";
|
||||||
|
|
||||||
/** Used to name new segments. */
|
/** Used to name new segments. */
|
||||||
public long counter;
|
public long counter;
|
||||||
|
@ -146,7 +146,7 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
|
||||||
*
|
*
|
||||||
* @see #setInfoStream
|
* @see #setInfoStream
|
||||||
*/
|
*/
|
||||||
private static PrintStream infoStream = null;
|
private static PrintStream infoStream;
|
||||||
|
|
||||||
/** Id for this commit; only written starting with Lucene 5.0 */
|
/** Id for this commit; only written starting with Lucene 5.0 */
|
||||||
private byte[] id;
|
private byte[] id;
|
||||||
|
@ -1010,6 +1010,7 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
|
||||||
void replace(SegmentInfos other) {
|
void replace(SegmentInfos other) {
|
||||||
rollbackSegmentInfos(other.asList());
|
rollbackSegmentInfos(other.asList());
|
||||||
lastGeneration = other.lastGeneration;
|
lastGeneration = other.lastGeneration;
|
||||||
|
userData = other.userData;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns sum of all segment's maxDocs. Note that this does not include deletions */
|
/** Returns sum of all segment's maxDocs. Note that this does not include deletions */
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -24,6 +24,7 @@ import java.util.Arrays;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Objects;
|
||||||
import org.apache.lucene.codecs.DocValuesProducer;
|
import org.apache.lucene.codecs.DocValuesProducer;
|
||||||
import org.apache.lucene.codecs.FieldsProducer;
|
import org.apache.lucene.codecs.FieldsProducer;
|
||||||
import org.apache.lucene.codecs.KnnVectorsReader;
|
import org.apache.lucene.codecs.KnnVectorsReader;
|
||||||
|
@ -77,7 +78,7 @@ public final class SortingCodecReader extends FilterCodecReader {
|
||||||
private final Sorter.DocMap docMap;
|
private final Sorter.DocMap docMap;
|
||||||
|
|
||||||
SortingPointValues(final PointValues in, Sorter.DocMap docMap) {
|
SortingPointValues(final PointValues in, Sorter.DocMap docMap) {
|
||||||
this.in = in;
|
this.in = Objects.requireNonNull(in);
|
||||||
this.docMap = docMap;
|
this.docMap = docMap;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -472,6 +473,10 @@ public final class SortingCodecReader extends FilterCodecReader {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public PointValues getValues(String field) throws IOException {
|
public PointValues getValues(String field) throws IOException {
|
||||||
|
var values = delegate.getValues(field);
|
||||||
|
if (values == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
return new SortingPointValues(delegate.getValues(field), docMap);
|
return new SortingPointValues(delegate.getValues(field), docMap);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -85,7 +85,11 @@ public final class IndexOrDocValuesQuery extends Query {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString(String field) {
|
public String toString(String field) {
|
||||||
return indexQuery.toString(field);
|
return "IndexOrDocValuesQuery(indexQuery="
|
||||||
|
+ indexQuery.toString(field)
|
||||||
|
+ ", dvQuery="
|
||||||
|
+ dvQuery.toString(field)
|
||||||
|
+ ")";
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.search;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -62,9 +61,9 @@ import org.apache.lucene.util.automaton.ByteRunAutomaton;
|
||||||
* match lots of documents, counting the number of hits may take much longer than computing the top
|
* match lots of documents, counting the number of hits may take much longer than computing the top
|
||||||
* hits so this trade-off allows to get some minimal information about the hit count without slowing
|
* hits so this trade-off allows to get some minimal information about the hit count without slowing
|
||||||
* down search too much. The {@link TopDocs#scoreDocs} array is always accurate however. If this
|
* down search too much. The {@link TopDocs#scoreDocs} array is always accurate however. If this
|
||||||
* behavior doesn't suit your needs, you should create collectors manually with either {@link
|
* behavior doesn't suit your needs, you should create collectorManagers manually with either {@link
|
||||||
* TopScoreDocCollector#create} or {@link TopFieldCollector#create} and call {@link #search(Query,
|
* TopScoreDocCollectorManager} or {@link TopFieldCollectorManager} and call {@link #search(Query,
|
||||||
* Collector)}.
|
* CollectorManager)}.
|
||||||
*
|
*
|
||||||
* <p><a id="thread-safety"></a>
|
* <p><a id="thread-safety"></a>
|
||||||
*
|
*
|
||||||
|
@ -455,35 +454,10 @@ public class IndexSearcher {
|
||||||
}
|
}
|
||||||
|
|
||||||
final int cappedNumHits = Math.min(numHits, limit);
|
final int cappedNumHits = Math.min(numHits, limit);
|
||||||
|
final boolean supportsConcurrency = getSlices().length > 1;
|
||||||
final LeafSlice[] leafSlices = getSlices();
|
CollectorManager<TopScoreDocCollector, TopDocs> manager =
|
||||||
final CollectorManager<TopScoreDocCollector, TopDocs> manager =
|
new TopScoreDocCollectorManager(
|
||||||
new CollectorManager<TopScoreDocCollector, TopDocs>() {
|
cappedNumHits, after, TOTAL_HITS_THRESHOLD, supportsConcurrency);
|
||||||
|
|
||||||
private final HitsThresholdChecker hitsThresholdChecker =
|
|
||||||
leafSlices.length <= 1
|
|
||||||
? HitsThresholdChecker.create(Math.max(TOTAL_HITS_THRESHOLD, numHits))
|
|
||||||
: HitsThresholdChecker.createShared(Math.max(TOTAL_HITS_THRESHOLD, numHits));
|
|
||||||
|
|
||||||
private final MaxScoreAccumulator minScoreAcc =
|
|
||||||
leafSlices.length <= 1 ? null : new MaxScoreAccumulator();
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public TopScoreDocCollector newCollector() throws IOException {
|
|
||||||
return TopScoreDocCollector.create(
|
|
||||||
cappedNumHits, after, hitsThresholdChecker, minScoreAcc);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public TopDocs reduce(Collection<TopScoreDocCollector> collectors) throws IOException {
|
|
||||||
final TopDocs[] topDocs = new TopDocs[collectors.size()];
|
|
||||||
int i = 0;
|
|
||||||
for (TopScoreDocCollector collector : collectors) {
|
|
||||||
topDocs[i++] = collector.topDocs();
|
|
||||||
}
|
|
||||||
return TopDocs.merge(0, cappedNumHits, topDocs);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
return search(query, manager);
|
return search(query, manager);
|
||||||
}
|
}
|
||||||
|
@ -510,7 +484,10 @@ public class IndexSearcher {
|
||||||
*
|
*
|
||||||
* @throws TooManyClauses If a query would exceed {@link IndexSearcher#getMaxClauseCount()}
|
* @throws TooManyClauses If a query would exceed {@link IndexSearcher#getMaxClauseCount()}
|
||||||
* clauses.
|
* clauses.
|
||||||
|
* @deprecated This method is being deprecated in favor of {@link IndexSearcher#search(Query,
|
||||||
|
* CollectorManager)} due to its support for concurrency in IndexSearcher
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
public void search(Query query, Collector results) throws IOException {
|
public void search(Query query, Collector results) throws IOException {
|
||||||
query = rewrite(query, results.scoreMode().needsScores());
|
query = rewrite(query, results.scoreMode().needsScores());
|
||||||
search(leafContexts, createWeight(query, results.scoreMode(), 1), results);
|
search(leafContexts, createWeight(query, results.scoreMode(), 1), results);
|
||||||
|
@ -602,34 +579,10 @@ public class IndexSearcher {
|
||||||
final Sort rewrittenSort = sort.rewrite(this);
|
final Sort rewrittenSort = sort.rewrite(this);
|
||||||
final LeafSlice[] leafSlices = getSlices();
|
final LeafSlice[] leafSlices = getSlices();
|
||||||
|
|
||||||
|
final boolean supportsConcurrency = leafSlices.length > 1;
|
||||||
final CollectorManager<TopFieldCollector, TopFieldDocs> manager =
|
final CollectorManager<TopFieldCollector, TopFieldDocs> manager =
|
||||||
new CollectorManager<>() {
|
new TopFieldCollectorManager(
|
||||||
|
rewrittenSort, cappedNumHits, after, TOTAL_HITS_THRESHOLD, supportsConcurrency);
|
||||||
private final HitsThresholdChecker hitsThresholdChecker =
|
|
||||||
leafSlices.length <= 1
|
|
||||||
? HitsThresholdChecker.create(Math.max(TOTAL_HITS_THRESHOLD, numHits))
|
|
||||||
: HitsThresholdChecker.createShared(Math.max(TOTAL_HITS_THRESHOLD, numHits));
|
|
||||||
|
|
||||||
private final MaxScoreAccumulator minScoreAcc =
|
|
||||||
leafSlices.length <= 1 ? null : new MaxScoreAccumulator();
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public TopFieldCollector newCollector() throws IOException {
|
|
||||||
// TODO: don't pay the price for accurate hit counts by default
|
|
||||||
return TopFieldCollector.create(
|
|
||||||
rewrittenSort, cappedNumHits, after, hitsThresholdChecker, minScoreAcc);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public TopFieldDocs reduce(Collection<TopFieldCollector> collectors) throws IOException {
|
|
||||||
final TopFieldDocs[] topDocs = new TopFieldDocs[collectors.size()];
|
|
||||||
int i = 0;
|
|
||||||
for (TopFieldCollector collector : collectors) {
|
|
||||||
topDocs[i++] = collector.topDocs();
|
|
||||||
}
|
|
||||||
return TopDocs.merge(rewrittenSort, 0, cappedNumHits, topDocs);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
TopFieldDocs topDocs = search(query, manager);
|
TopFieldDocs topDocs = search(query, manager);
|
||||||
if (doDocScores) {
|
if (doDocScores) {
|
||||||
|
|
|
@ -69,7 +69,6 @@ public abstract class PointInSetQuery extends Query implements Accountable {
|
||||||
@Override
|
@Override
|
||||||
public abstract BytesRef next();
|
public abstract BytesRef next();
|
||||||
}
|
}
|
||||||
;
|
|
||||||
|
|
||||||
/** The {@code packedPoints} iterator must be in sorted order. */
|
/** The {@code packedPoints} iterator must be in sorted order. */
|
||||||
protected PointInSetQuery(String field, int numDims, int bytesPerDim, Stream packedPoints) {
|
protected PointInSetQuery(String field, int numDims, int bytesPerDim, Stream packedPoints) {
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue