Merge branch 'main' into java_21

This commit is contained in:
ChrisHegarty 2023-12-11 14:18:04 +00:00
commit 40c03b0e6c
300 changed files with 8039 additions and 4021 deletions

View File

@ -117,6 +117,9 @@ apply from: file('buildSrc/scriptDepVersions.gradle')
apply from: file('gradle/generation/local-settings.gradle') apply from: file('gradle/generation/local-settings.gradle')
// Make sure the build environment is consistent.
apply from: file('gradle/validation/check-environment.gradle')
// IDE support, settings and specials. // IDE support, settings and specials.
apply from: file('gradle/ide/intellij-idea.gradle') apply from: file('gradle/ide/intellij-idea.gradle')
apply from: file('gradle/ide/eclipse.gradle') apply from: file('gradle/ide/eclipse.gradle')

View File

@ -38,3 +38,9 @@ dependencies {
implementation "commons-codec:commons-codec:${scriptDepVersions['commons-codec']}" implementation "commons-codec:commons-codec:${scriptDepVersions['commons-codec']}"
} }
if (!rootProject.hasJavaFlightRecorder) {
logger.warn('Module jdk.jfr is not available; skipping compilation of Java Flight Recorder support.')
tasks.named('compileJava').configure {
exclude('**/ProfileResults.java')
}
}

View File

@ -24,7 +24,7 @@ ext {
"apache-rat": "0.14", "apache-rat": "0.14",
"asm": "9.6", "asm": "9.6",
"commons-codec": "1.13", "commons-codec": "1.13",
"ecj": "3.36.0-SNAPSHOT", "ecj": "3.36.0",
"flexmark": "0.61.24", "flexmark": "0.61.24",
"javacc": "7.0.12", "javacc": "7.0.12",
"jflex": "1.8.2", "jflex": "1.8.2",

View File

@ -15,20 +15,18 @@
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.gradle.ProfileResults;
def recordings = files() def recordings = files()
allprojects { allprojects {
plugins.withType(JavaPlugin) { plugins.withType(JavaPlugin) {
ext { ext {
testOptions += [ testOptions += [
[propName: 'tests.profile', value: false, description: "Enable java flight recorder profiling."] [propName: 'tests.profile', value: false, description: "Enable Java Flight Recorder profiling."]
] ]
} }
if (resolvedTestOption("tests.profile").toBoolean()) { if (resolvedTestOption("tests.profile").toBoolean()) {
allprojects { if (rootProject.hasJavaFlightRecorder) {
tasks.withType(Test) { tasks.withType(Test) {
jvmArgs("-XX:StartFlightRecording=dumponexit=true,maxsize=250M,settings=" + rootProject.file("gradle/testing/profiling.jfc"), jvmArgs("-XX:StartFlightRecording=dumponexit=true,maxsize=250M,settings=" + rootProject.file("gradle/testing/profiling.jfc"),
"-XX:+UnlockDiagnosticVMOptions", "-XX:+UnlockDiagnosticVMOptions",
@ -41,6 +39,8 @@ allprojects {
recordings = recordings.plus fileTree(dir: workingDir, include: '*.jfr') recordings = recordings.plus fileTree(dir: workingDir, include: '*.jfr')
} }
} }
} else {
throw new GradleException('Module jdk.jfr is not available; Java Flight Recorder profiles cannot be enabled.')
} }
} }
} }
@ -48,10 +48,11 @@ allprojects {
gradle.buildFinished { gradle.buildFinished {
if (!recordings.isEmpty()) { if (!recordings.isEmpty()) {
ProfileResults.printReport(recordings.getFiles().collect { it.toString() }, def pr = org.apache.lucene.gradle.ProfileResults;
propertyOrDefault(ProfileResults.MODE_KEY, ProfileResults.MODE_DEFAULT) as String, pr.printReport(recordings.getFiles().collect { it.toString() },
Integer.parseInt(propertyOrDefault(ProfileResults.STACKSIZE_KEY, ProfileResults.STACKSIZE_DEFAULT)), propertyOrDefault(pr.MODE_KEY, pr.MODE_DEFAULT) as String,
Integer.parseInt(propertyOrDefault(ProfileResults.COUNT_KEY, ProfileResults.COUNT_DEFAULT)), Integer.parseInt(propertyOrDefault(pr.STACKSIZE_KEY, pr.STACKSIZE_DEFAULT)),
Boolean.parseBoolean(propertyOrDefault(ProfileResults.LINENUMBERS_KEY, ProfileResults.LINENUMBERS_DEFAULT))) Integer.parseInt(propertyOrDefault(pr.COUNT_KEY, pr.COUNT_DEFAULT)),
Boolean.parseBoolean(propertyOrDefault(pr.LINENUMBERS_KEY, pr.LINENUMBERS_DEFAULT)))
} }
} }

View File

@ -23,8 +23,6 @@ grant {
// jetty-specific: // jetty-specific:
permission java.lang.RuntimePermission "getenv.JETTY_AVAILABLE_PROCESSORS"; permission java.lang.RuntimePermission "getenv.JETTY_AVAILABLE_PROCESSORS";
permission java.lang.RuntimePermission "getenv.JETTY_WORKER_INSTANCE"; permission java.lang.RuntimePermission "getenv.JETTY_WORKER_INSTANCE";
// servlet stuff
permission java.lang.RuntimePermission "setContextClassLoader";
// allow TestNRTReplication fork its jvm // allow TestNRTReplication fork its jvm
permission java.io.FilePermission "${java.home}${/}-", "read,execute"; permission java.io.FilePermission "${java.home}${/}-", "read,execute";
// read/write access to all system properties (required by jetty in these tests) // read/write access to all system properties (required by jetty in these tests)

View File

@ -50,14 +50,11 @@ grant {
permission java.lang.RuntimePermission "getStackTrace"; permission java.lang.RuntimePermission "getStackTrace";
// needed for mock filesystems in tests // needed for mock filesystems in tests
permission java.lang.RuntimePermission "fileSystemProvider"; permission java.lang.RuntimePermission "fileSystemProvider";
// analyzers/uima: needed by lucene expressions' JavascriptCompiler
permission java.lang.RuntimePermission "createClassLoader";
// needed to test unmap hack on platforms that support it // needed to test unmap hack on platforms that support it
permission java.lang.RuntimePermission "accessClassInPackage.sun.misc"; permission java.lang.RuntimePermission "accessClassInPackage.sun.misc";
permission java.lang.reflect.ReflectPermission "suppressAccessChecks"; permission java.lang.reflect.ReflectPermission "suppressAccessChecks";
// needed by cyberneko usage by benchmarks on J9 // needed by cyberneko usage by benchmarks on J9
permission java.lang.RuntimePermission "accessClassInPackage.org.apache.xerces.util"; permission java.lang.RuntimePermission "accessClassInPackage.org.apache.xerces.util";
permission java.lang.RuntimePermission "getClassLoader";
// Needed for loading native library (lucene:misc:native) in lucene:misc // Needed for loading native library (lucene:misc:native) in lucene:misc
permission java.lang.RuntimePermission "getFileStoreAttributes"; permission java.lang.RuntimePermission "getFileStoreAttributes";
@ -111,6 +108,8 @@ grant {
permission java.lang.RuntimePermission "shutdownHooks"; permission java.lang.RuntimePermission "shutdownHooks";
// needed by jacoco to instrument classes // needed by jacoco to instrument classes
permission java.lang.RuntimePermission "defineClass"; permission java.lang.RuntimePermission "defineClass";
// needed by jacoco for God knows what.
permission java.lang.RuntimePermission "createClassLoader";
}; };
// Grant all permissions to Gradle test runner classes. // Grant all permissions to Gradle test runner classes.

View File

@ -23,6 +23,7 @@ import org.gradle.util.GradleVersion
configure(rootProject) { configure(rootProject) {
ext { ext {
expectedGradleVersion = '8.4' expectedGradleVersion = '8.4'
hasJavaFlightRecorder = ModuleLayer.boot().findModule('jdk.jfr').map(this.class.module::canRead).orElse(false)
} }
wrapper { wrapper {

View File

@ -17,8 +17,8 @@
def skipReason def skipReason
if (rootProject.usesAltJvm && rootProject.runtimeJavaVersion > JavaVersion.VERSION_15) { if (rootProject.usesAltJvm) {
skipReason = "won't work with JDK ${rootProject.runtimeJavaVersion} if used as alternative java toolchain" skipReason = "won't work with alternative java toolchain"
} }
if (!propertyOrDefault("validation.errorprone", isCIBuild).asBoolean()) { if (!propertyOrDefault("validation.errorprone", isCIBuild).asBoolean()) {
@ -37,7 +37,7 @@ if (skipReason) {
allprojects { prj -> allprojects { prj ->
plugins.withType(JavaPlugin) { plugins.withType(JavaPlugin) {
// LUCENE-9650: Errorprone on master/gradle does not work with JDK-16+ when running as plugin // LUCENE-9650: Errorprone on master/gradle does not work when running as plugin
// inside a forked Javac process. Javac running inside Gradle works, because we have // inside a forked Javac process. Javac running inside Gradle works, because we have
// additional module system opens in place. // additional module system opens in place.
// This is a hack to keep the dependency (so that palantir's version check doesn't complain) // This is a hack to keep the dependency (so that palantir's version check doesn't complain)

View File

@ -59,6 +59,9 @@ allprojects {
} }
subprojects { subprojects {
// initialize empty, because no checks for benchmark-jmh module.
ext.jarInfos = []
// Configure jarValidation configuration for all projects. Any dependency // Configure jarValidation configuration for all projects. Any dependency
// declared on this configuration (or any configuration it extends from) will // declared on this configuration (or any configuration it extends from) will
// be verified. // be verified.

View File

@ -61,6 +61,7 @@ Otherwise you are stuck wrestling down full dependencies of OpenJDK (metal etc)
Also you must run benchmarks as root to use dtrace, but it works. Also you must run benchmarks as root to use dtrace, but it works.
$ git clone --depth 1 https://github.com/openjdk/jdk/ $ git clone --depth 1 https://github.com/openjdk/jdk/
$ curl -f https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz | tar -zxf -
$ curl -fo jdk/src/utils/hsdis/binutils/Makefile https://raw.githubusercontent.com/openjdk/jdk/3c7ae1225f0d5575fd927a9b76fb40dc30e208cd/src/utils/hsdis/Makefile $ curl -fo jdk/src/utils/hsdis/binutils/Makefile https://raw.githubusercontent.com/openjdk/jdk/3c7ae1225f0d5575fd927a9b76fb40dc30e208cd/src/utils/hsdis/Makefile
$ vi jdk/src/utils/hsdis/binutils/Makefile, change SOURCE = hsdis.c to SOURCE = hsdis-binutils.c $ vi jdk/src/utils/hsdis/binutils/Makefile, change SOURCE = hsdis.c to SOURCE = hsdis-binutils.c
$ vi jdk/src/utils/hsdis/binutils/hsdis-binutils.c, change #include "hsdis.h" to #include "../hsdis.h" $ vi jdk/src/utils/hsdis/binutils/hsdis-binutils.c, change #include "hsdis.h" to #include "../hsdis.h"

View File

@ -7,7 +7,6 @@ http://s.apache.org/luceneversions
API Changes API Changes
--------------------- ---------------------
* LUCENE-12092: Remove deprecated UTF8TaxonomyWriterCache. Please use LruTaxonomyWriterCache * LUCENE-12092: Remove deprecated UTF8TaxonomyWriterCache. Please use LruTaxonomyWriterCache
instead. (Vigya Sharma) instead. (Vigya Sharma)
@ -62,10 +61,21 @@ API Changes
* GITHUB#12599: Add RandomAccessInput#readBytes method to the RandomAccessInput interface. (Ignacio Vera) * GITHUB#12599: Add RandomAccessInput#readBytes method to the RandomAccessInput interface. (Ignacio Vera)
* GITHUB#12709: Consolidate FSTStore and BytesStore in FST. Created FSTReader which contains the common methods * GITHUB#11023: Adding -level param to CheckIndex, making the old -fast param the default behaviour. (Jakub Slowinski)
of the two (Anh Dung Bui)
* GITHUB#12735: Remove FSTCompiler#getTermCount() and FSTCompiler.UnCompiledNode#inputCount (Anh Dung Bui) * GITHUB#12873: Expressions module now uses MethodHandles to define custom functions. Support for
custom classloaders was removed. (Uwe Schindler)
* GITHUB#12243: Remove TermInSetQuery ctors taking varargs param. SortedSetDocValuesField#newSlowSetQuery,
SortedDocValuesField#newSlowSetQuery, KeywordField#newSetQuery now take a collection. (Jakub Slowinski)
* GITHUB#12881: Performance improvements to MatchHighlighter and MatchRegionRetriever. MatchRegionRetriever can be
configured to not load matches (or content) of certain fields and to force-load other fields so that stored fields
of a document are accessed once. A configurable limit of field matches placed in the priority queue was added
(allows handling long fields with lots of hits more gracefully). MatchRegionRetriever utilizes IndexSearcher's
executor to extract hit offsets concurrently. (Dawid Weiss)
* GITHUB#12855: Remove deprecated DrillSideways#createDrillDownFacetsCollector extension method. (Greg Miller)
New Features New Features
--------------------- ---------------------
@ -89,18 +99,17 @@ Improvements
* GITHUB#12447: Hunspell: speed up the dictionary enumeration on suggestion (Peter Gromov) * GITHUB#12447: Hunspell: speed up the dictionary enumeration on suggestion (Peter Gromov)
* GITHUB#12542: FSTCompiler can now approximately limit how much RAM it uses to share * GITHUB#12873: Expressions module now uses JEP 371 "Hidden Classes" with JEP 309
suffixes during FST construction using the suffixRAMLimitMB method. Larger values "Dynamic Class-File Constants" to implement Javascript expressions. (Uwe Schindler)
result in a more minimal FST (more common suffixes are shared). Pass
Double.POSITIVE_INFINITY to use as much RAM as is needed to create a purely
minimal FST. Inspired by this Rust FST implementation:
https://blog.burntsushi.net/transducers (Mike McCandless)
Optimizations Optimizations
--------------------- ---------------------
* GITHUB#11857, GITHUB#11859, GITHUB#11893, GITHUB#11909: Hunspell: improved suggestion performance (Peter Gromov) * GITHUB#11857, GITHUB#11859, GITHUB#11893, GITHUB#11909: Hunspell: improved suggestion performance (Peter Gromov)
* GITHUB#12825, GITHUB#12834: Hunspell: improved dictionary loading performance, allowed in-memory entry sorting.
(Peter Gromov)
* GITHUB#12372: Reduce allocation during HNSW construction (Jonathan Ellis) * GITHUB#12372: Reduce allocation during HNSW construction (Jonathan Ellis)
* GITHUB#12408: Lazy initialization improvements for Facets implementations when there are segments with no hits * GITHUB#12408: Lazy initialization improvements for Facets implementations when there are segments with no hits
@ -116,6 +125,9 @@ Bug Fixes
* GITHUB#12220: Hunspell: disallow hidden title-case entries from compound middle/end * GITHUB#12220: Hunspell: disallow hidden title-case entries from compound middle/end
* GITHUB#12878: Fix the declared Exceptions of Expression#evaluate() to match those
of DoubleValues#doubleValue(). (Uwe Schindler)
Other Other
--------------------- ---------------------
@ -142,6 +154,48 @@ Other
* GITHUB#12239: Hunspell: reduced suggestion set dependency on the hash table order (Peter Gromov) * GITHUB#12239: Hunspell: reduced suggestion set dependency on the hash table order (Peter Gromov)
* GITHUB#9049: Fixing bug in UnescapedCharSequence#toStringEscaped() (Jakub Slowinski)
======================== Lucene 9.10.0 =======================
API Changes
---------------------
* GITHUB#12243: Mark TermInSetQuery ctors with varargs terms as @Deprecated. SortedSetDocValuesField#newSlowSetQuery,
SortedDocValuesField#newSlowSetQuery, KeywordField#newSetQuery now take a collection of terms as a param. (Jakub Slowinski)
* GITHUB#11041: Deprecate IndexSearcher#search(Query, Collector) in favor of
IndexSearcher#search(Query, CollectorManager) for TopFieldCollectorManager
and TopScoreDocCollectorManager. (Zach Chen, Adrien Grand, Michael McCandless, Greg Miller, Luca Cavanna)
* GITHUB#12854: Mark DrillSideways#createDrillDownFacetsCollector as @Deprecated. (Greg Miller)
New Features
---------------------
(No changes)
Improvements
---------------------
* GITHUB#12870: Tighten synchronized loop in DirectoryTaxonomyReader#getOrdinal. (Stefan Vodita)
* GITHUB#12812: Avoid overflows and false negatives in int slice buffer filled-with-zeros assertion. (Stefan Vodita)
Optimizations
---------------------
(No changes)
Bug Fixes
---------------------
* GITHUB#12866: Prevent extra similarity computation for single-level HNSW graphs. (Kaival Parikh)
* GITHUB#12558: Ensure #finish is called on all drill-sideways FacetsCollectors even when no hits are scored.
(Greg Miller)
Other
---------------------
* GITHUB#11023: Removing some dead code in CheckIndex. (Jakub Slowinski)
======================== Lucene 9.9.0 ======================= ======================== Lucene 9.9.0 =======================
API Changes API Changes
@ -157,9 +211,6 @@ API Changes
* GITHUB#12592: Add RandomAccessInput#length method to the RandomAccessInput interface. In addition deprecate * GITHUB#12592: Add RandomAccessInput#length method to the RandomAccessInput interface. In addition deprecate
ByteBuffersDataInput#size in favour of this new method. (Ignacio Vera) ByteBuffersDataInput#size in favour of this new method. (Ignacio Vera)
* GITHUB#12646, GITHUB#12690: Move FST#addNode to FSTCompiler to avoid a circular dependency
between FST and FSTCompiler (Anh Dung Bui)
* GITHUB#12718: Make IndexSearcher#getSlices final as it is not expected to be overridden (Luca Cavanna) * GITHUB#12718: Make IndexSearcher#getSlices final as it is not expected to be overridden (Luca Cavanna)
* GITHUB#12427: Automata#makeStringUnion #makeBinaryStringUnion now accept Iterable<BytesRef> instead of * GITHUB#12427: Automata#makeStringUnion #makeBinaryStringUnion now accept Iterable<BytesRef> instead of
@ -169,6 +220,25 @@ API Changes
* GITHUB#12180: Add TaxonomyReader#getBulkOrdinals method to more efficiently retrieve facet ordinals for multiple * GITHUB#12180: Add TaxonomyReader#getBulkOrdinals method to more efficiently retrieve facet ordinals for multiple
FacetLabel at once. (Egor Potemkin) FacetLabel at once. (Egor Potemkin)
* GITHUB#12816: Add HumanReadableQuery which takes a description parameter for debugging purposes. (Jakub Slowinski)
* GITHUB#12646, GITHUB#12690: Move FST#addNode to FSTCompiler to avoid a circular dependency
between FST and FSTCompiler (Anh Dung Bui)
* GITHUB#12709: Consolidate FSTStore and BytesStore in FST. Created FSTReader which contains the common methods
of the two (Anh Dung Bui)
* GITHUB#12735: Remove FSTCompiler#getTermCount() and FSTCompiler.UnCompiledNode#inputCount (Anh Dung Bui)
* GITHUB#12695: Remove public constructor of FSTCompiler. Please use FSTCompiler.Builder
instead. (Juan M. Caicedo)
* GITHUB#12799: Make TaskExecutor constructor public and use TaskExecutor for concurrent
HNSW graph build. (Shubham Chaudhary)
* GITHUB#12758, GITHUB#12803: Remove FST constructor with DataInput for metadata. Please
use the constructor with FSTMetadata instead. (Anh Dung Bui)
New Features New Features
--------------------- ---------------------
@ -180,7 +250,7 @@ New Features
* GITHUB#12582: Add int8 scalar quantization to the HNSW vector format. This optionally allows for more compact lossy * GITHUB#12582: Add int8 scalar quantization to the HNSW vector format. This optionally allows for more compact lossy
storage for the vectors, requiring about 75% memory for fast HNSW search. (Ben Trent) storage for the vectors, requiring about 75% memory for fast HNSW search. (Ben Trent)
* GITHUB#12660: HNSW graph can now be merged with multiple threads. Configurable in Lucene99HnswVectorsFormat. * GITHUB#12660: HNSW graph can now be merged with multiple threads. Configurable in Lucene99HnswVectorsFormat.
(Patrick Zhai) (Patrick Zhai)
@ -225,6 +295,22 @@ Improvements
* GITHUB#12754: Refactor lookup of Hotspot VM options and do not initialize constants with NULL * GITHUB#12754: Refactor lookup of Hotspot VM options and do not initialize constants with NULL
if SecurityManager prevents access. (Uwe Schindler) if SecurityManager prevents access. (Uwe Schindler)
* GITHUB#12801: Remove possible contention on a ReentrantReadWriteLock in
Monitor which could result in searches waiting for commits. (Davis Cook)
* GITHUB#11277, LUCENE-10241: Upgrade OpenNLP to 1.9.4. (Jeff Zemerick)
* GITHUB#12542: FSTCompiler can now approximately limit how much RAM it uses to share
suffixes during FST construction using the suffixRAMLimitMB method. Larger values
result in a more minimal FST (more common suffixes are shared). Pass
Double.POSITIVE_INFINITY to use as much RAM as is needed to create a purely
minimal FST. Inspired by this Rust FST implementation:
https://blog.burntsushi.net/transducers (Mike McCandless)
* GITHUB#12738: NodeHash now stores the FST nodes data instead of just node addresses (Anh Dung Bui)
* GITHUB#12847: Test2BFST now reports the time it took to build the FST and the real FST size (Anh Dung Bui)
Optimizations Optimizations
--------------------- ---------------------
* GITHUB#12183: Make TermStates#build concurrent. (Shubham Chaudhary) * GITHUB#12183: Make TermStates#build concurrent. (Shubham Chaudhary)
@ -276,10 +362,14 @@ Optimizations
* GITHUB#12381: Skip docs with DocValues in NumericLeafComparator. (Lu Xugang, Adrien Grand) * GITHUB#12381: Skip docs with DocValues in NumericLeafComparator. (Lu Xugang, Adrien Grand)
* GITHUB#12748: Specialize arc store for continuous label in FST. (Guo Feng, Chao Zhang)
* GITHUB#12784: Cache buckets to speed up BytesRefHash#sort. (Guo Feng) * GITHUB#12784: Cache buckets to speed up BytesRefHash#sort. (Guo Feng)
* GITHUB#12806: Utilize exact kNN search when gathering k >= numVectors in a segment (Ben Trent)
* GITHUB#12782: Use group-varint encoding for the tail of postings. (Adrien Grand, Zhang Chao)
* GITHUB#12748: Specialize arc store for continuous label in FST. (Guo Feng, Zhang Chao)
Changes in runtime behavior Changes in runtime behavior
--------------------- ---------------------
@ -311,22 +401,33 @@ Bug Fixes
* GITHUB#12770: Stop exploring HNSW graph if scores are not getting better. (Ben Trent) * GITHUB#12770: Stop exploring HNSW graph if scores are not getting better. (Ben Trent)
* GITHUB#12640: Ensure #finish is called on all drill-sideways collectors even if one throws a
CollectionTerminatedException (Greg Miller)
* GITHUB#12626: Fix segmentInfos replace to set userData (Shibi Balamurugan, Uwe Schindler, Marcus Eagan, Michael Froh)
Build Build
--------------------- ---------------------
* GITHUB#12752: tests.multiplier could be omitted in test failure reproduce lines (esp. in * GITHUB#12752: tests.multiplier could be omitted in test failure reproduce lines (esp. in
nightly mode). (Dawid Weiss) nightly mode). (Dawid Weiss)
* GITHUB#12742: JavaCompile tasks may be in up-to-date state when modular dependencies have changed * GITHUB#12742: JavaCompile tasks may be in up-to-date state when modular dependencies have changed
leading to odd runtime errors (Chris Hostetter, Dawid Weiss) leading to odd runtime errors (Chris Hostetter, Dawid Weiss)
* GITHUB#12612: Upgrade forbiddenapis to version 3.6 and ASM for APIJAR extraction to 9.6. (Uwe Schindler) * GITHUB#12612: Upgrade forbiddenapis to version 3.6 and ASM for APIJAR extraction to 9.6. (Uwe Schindler)
* GITHUB#12655: Upgrade to Gradle 8.4 (Kevin Risden) * GITHUB#12655: Upgrade to Gradle 8.4 (Kevin Risden)
* GITHUB#12845: Only enable support for tests.profile if jdk.jfr module is available
in Gradle runtime. (Uwe Schindler)
Other Other
--------------------- ---------------------
* GITHUB#12817: Add demo for faceting with StringValueFacetCounts over KeywordField and SortedDocValuesField.
(Stefan Vodita)
* GITHUB#12657: Internal refactor of HNSW graph merging (Ben Trent). * GITHUB#12657: Internal refactor of HNSW graph merging (Ben Trent).
* GITHUB#12625: Refactor ByteBlockPool so it is just a "shift/mask big array". (Ignacio Vera) * GITHUB#12625: Refactor ByteBlockPool so it is just a "shift/mask big array". (Ignacio Vera)
@ -336,6 +437,8 @@ Other
overflows and slices that are too large. Some bits of code are simplified. Documentation is updated and expanded. overflows and slices that are too large. Some bits of code are simplified. Documentation is updated and expanded.
(Stefan Vodita) (Stefan Vodita)
* GITHUB#12762: Refactor BKD HeapPointWriter to hide the internal data structure. (Ignacio Vera)
======================== Lucene 9.8.0 ======================= ======================== Lucene 9.8.0 =======================
API Changes API Changes
@ -364,6 +467,8 @@ New Features
* GITHUB#12479: Add new Maximum Inner Product vector similarity function for non-normalized dot-product * GITHUB#12479: Add new Maximum Inner Product vector similarity function for non-normalized dot-product
vector search. (Jack Mazanec, Ben Trent) vector search. (Jack Mazanec, Ben Trent)
* GITHUB#12525: `WordDelimiterGraphFilterFactory` now supports the `ignoreKeywords` flag (Thomas De Craemer)
* GITHUB#12489: Add support for recursive graph bisection, also called * GITHUB#12489: Add support for recursive graph bisection, also called
bipartite graph partitioning, and often abbreviated BP, an algorithm for bipartite graph partitioning, and often abbreviated BP, an algorithm for
reordering doc IDs that results in more compact postings and faster queries, reordering doc IDs that results in more compact postings and faster queries,
@ -386,7 +491,7 @@ Improvements
Optimizations Optimizations
--------------------- ---------------------
* GITHUB#12377: Avoid redundant loop for compute min value in DirectMonotonicWriter. (Chao Zhang) * GITHUB#12377: Avoid redundant loop for compute min value in DirectMonotonicWriter. (Zhang Chao)
* GITHUB#12361: Faster top-level disjunctions sorted by descending score. * GITHUB#12361: Faster top-level disjunctions sorted by descending score.
(Adrien Grand) (Adrien Grand)
@ -401,7 +506,7 @@ Optimizations
* GITHUB#12385: Restore parallel knn query rewrite across segments rather than slices (Luca Cavanna) * GITHUB#12385: Restore parallel knn query rewrite across segments rather than slices (Luca Cavanna)
* GITHUB#12381: Speed up NumericDocValuesWriter with index sorting. (Chao Zhang) * GITHUB#12381: Speed up NumericDocValuesWriter with index sorting. (Zhang Chao)
* GITHUB#12453: Faster bulk numeric reads from BufferedIndexInput (Armin Braun) * GITHUB#12453: Faster bulk numeric reads from BufferedIndexInput (Armin Braun)
@ -468,7 +573,7 @@ Other
* GITHUB#12428: Replace consecutive close() calls and close() calls with null checks with IOUtils.close(). * GITHUB#12428: Replace consecutive close() calls and close() calls with null checks with IOUtils.close().
(Shubham Chaudhary) (Shubham Chaudhary)
* GITHUB#12512: Remove unused variable in BKDWriter. (Chao Zhang) * GITHUB#12512: Remove unused variable in BKDWriter. (Zhang Chao)
======================== Lucene 9.7.0 ======================= ======================== Lucene 9.7.0 =======================

View File

@ -19,6 +19,11 @@
## Migration from Lucene 9.x to Lucene 10.0 ## Migration from Lucene 9.x to Lucene 10.0
### Minor API changes in MatchHighlighter and MatchRegionRetriever. (GITHUB#12881)
The API of interfaces for accepting highlights has changed to allow performance improvements. Look at the issue and the PR diff to get
a sense of what's changed (changes are minor).
### Removed deprecated IndexSearcher.doc, IndexReader.document, IndexReader.getTermVectors (GITHUB#11998) ### Removed deprecated IndexSearcher.doc, IndexReader.document, IndexReader.getTermVectors (GITHUB#11998)
The deprecated Stored Fields and Term Vectors apis relied upon threadlocal storage and have been removed. The deprecated Stored Fields and Term Vectors apis relied upon threadlocal storage and have been removed.
@ -101,6 +106,34 @@ The deprecated getter for the `Executor` that was optionally provided to the `In
has been removed. Users that want to execute concurrent tasks should rely instead on the `TaskExecutor` has been removed. Users that want to execute concurrent tasks should rely instead on the `TaskExecutor`
that the searcher holds, retrieved via `IndexSearcher#getTaskExecutor`. that the searcher holds, retrieved via `IndexSearcher#getTaskExecutor`.
### CheckIndex params -slow and -fast are deprecated, replaced by -level X (GITHUB#11023)
The former `-fast` behaviour of `CheckIndex` (performing checksum checks only) is now the default.
Added a new parameter: `-level X`, to set the detail level of the index check. The higher the value, the more checks are performed.
Sample `-level` usage: `1` (Default) - Checksum checks only, `2` - all level 1 checks as well as logical integrity checks, `3` - all
level 2 checks as well as slow checks.
### Expressions module now uses `MethodHandle` and hidden classes (GITHUB#12873)
Custom functions in the expressions module must now be passed in a `Map` using `MethodHandle` as values.
To convert legacy code using maps of reflective `java.lang.reflect.Method`, use the converter method
`JavascriptCompiler#convertLegacyFunctions`. This should make the mapping mostly compatible.
The use of `MethodHandle` and [Dynamic Class-File Constants (JEP 309)](https://openjdk.org/jeps/309)
now also allows passing private methods or methods from different classloaders. It is also possible
to adapt guards or filters using the `MethodHandles` class.
The new implementation of the Javascript expressions compiler no longer supports use of custom
`ClassLoader`, because it uses the new JDK 15 feature [hidden classes (JEP 371)](https://openjdk.org/jeps/371).
Due to the use of `MethodHandle`, classloader isolation is no longer needed, because JS code can only call
MHs that were resolved by the application before using the expressions module.
### `Expression#evaluate()` declares to throw IOException (GITHUB#12878)
The expressions module has changed the `Expression#evaluate()` method signature:
It now declares that it may throw `IOException`. This was an oversight because
compiled expressions call `DoubleValues#doubleValue` behind the scenes, which
may throw `IOException` on index problems, bubbling up unexpectedly to the caller.
## Migration from Lucene 9.0 to Lucene 9.1 ## Migration from Lucene 9.0 to Lucene 9.1
### Test framework package migration and module (LUCENE-10301) ### Test framework package migration and module (LUCENE-10301)

View File

@ -105,7 +105,8 @@ public class NormalizeCharMap {
final FST<CharsRef> map; final FST<CharsRef> map;
try { try {
final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
final FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs); final FSTCompiler<CharsRef> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, outputs).build();
final IntsRefBuilder scratch = new IntsRefBuilder(); final IntsRefBuilder scratch = new IntsRefBuilder();
for (Map.Entry<String, String> ent : pendingPairs.entrySet()) { for (Map.Entry<String, String> ent : pendingPairs.entrySet()) {
fstCompiler.add(Util.toUTF16(ent.getKey(), scratch), new CharsRef(ent.getValue())); fstCompiler.add(Util.toUTF16(ent.getKey(), scratch), new CharsRef(ent.getValue()));

View File

@ -777,7 +777,6 @@ class KStemmer {
private int stemLength() { private int stemLength() {
return j + 1; return j + 1;
} }
;
private boolean endsIn(char[] s) { private boolean endsIn(char[] s) {
if (s.length > k) return false; if (s.length > k) return false;

View File

@ -40,7 +40,8 @@ class ConvTable {
try { try {
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs); FSTCompiler<CharsRef> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, outputs).build();
IntsRefBuilder scratchInts = new IntsRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder();
for (Map.Entry<String, String> entry : mappings.entrySet()) { for (Map.Entry<String, String> entry : mappings.entrySet()) {
String key = entry.getKey(); String key = entry.getKey();

View File

@ -50,18 +50,12 @@ import java.util.Set;
import java.util.TreeMap; import java.util.TreeMap;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.analysis.hunspell.SortingStrategy.EntryAccumulator;
import org.apache.lucene.analysis.hunspell.SortingStrategy.EntrySupplier;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.IntSequenceOutputs; import org.apache.lucene.util.fst.IntSequenceOutputs;
@ -215,6 +209,25 @@ public class Dictionary {
List<InputStream> dictionaries, List<InputStream> dictionaries,
boolean ignoreCase) boolean ignoreCase)
throws IOException, ParseException { throws IOException, ParseException {
this(affix, dictionaries, ignoreCase, SortingStrategy.offline(tempDir, tempFileNamePrefix));
}
/**
* Creates a new Dictionary containing the information read from the provided InputStreams to
* hunspell affix and dictionary files. You have to close the provided InputStreams yourself.
*
* @param affix InputStream for reading the hunspell affix file (won't be closed).
* @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed).
* @param sortingStrategy the strategy used to accumulate and sort the dictionary entries during loading
* @throws IOException Can be thrown while reading from the InputStreams
* @throws ParseException Can be thrown if the content of the files does not meet expected formats
*/
public Dictionary(
InputStream affix,
List<InputStream> dictionaries,
boolean ignoreCase,
SortingStrategy sortingStrategy)
throws IOException, ParseException {
this.ignoreCase = ignoreCase; this.ignoreCase = ignoreCase;
try (BufferedInputStream affixStream = try (BufferedInputStream affixStream =
@ -250,10 +263,11 @@ public class Dictionary {
readAffixFile(affixStream, decoder, flagEnumerator); readAffixFile(affixStream, decoder, flagEnumerator);
// read dictionary entries // read dictionary entries
IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT); EntryAccumulator acc = sortingStrategy.start();
int wordCount = mergeDictionaries(dictionaries, decoder, unsorted); mergeDictionaries(dictionaries, decoder, acc);
String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted); try (EntrySupplier sorted = acc.finishAndSort()) {
words = readSortedDictionaries(tempDir, sortedFile, flagEnumerator, wordCount); words = readSortedDictionaries(flagEnumerator, sorted);
}
flagLookup = flagEnumerator.finish(); flagLookup = flagEnumerator.finish();
aliases = null; // no longer needed aliases = null; // no longer needed
morphAliases = null; // no longer needed morphAliases = null; // no longer needed
@ -631,7 +645,8 @@ public class Dictionary {
private FST<IntsRef> affixFST(TreeMap<String, List<Integer>> affixes) throws IOException { private FST<IntsRef> affixFST(TreeMap<String, List<Integer>> affixes) throws IOException {
IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
FSTCompiler<IntsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs); FSTCompiler<IntsRef> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).build();
IntsRefBuilder scratch = new IntsRefBuilder(); IntsRefBuilder scratch = new IntsRefBuilder();
for (Map.Entry<String, List<Integer>> entry : affixes.entrySet()) { for (Map.Entry<String, List<Integer>> entry : affixes.entrySet()) {
Util.toUTF32(entry.getKey(), scratch); Util.toUTF32(entry.getKey(), scratch);
@ -984,52 +999,43 @@ public class Dictionary {
} }
} }
private int mergeDictionaries( private void mergeDictionaries(
List<InputStream> dictionaries, CharsetDecoder decoder, IndexOutput output) List<InputStream> dictionaries, CharsetDecoder decoder, EntryAccumulator acc)
throws IOException { throws IOException {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
int wordCount = 0; for (InputStream dictionary : dictionaries) {
try (ByteSequencesWriter writer = new ByteSequencesWriter(output)) { BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
for (InputStream dictionary : dictionaries) { lines.readLine(); // first line is number of entries (approximately, sometimes)
BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
lines.readLine(); // first line is number of entries (approximately, sometimes)
String line; String line;
while ((line = lines.readLine()) != null) { while ((line = lines.readLine()) != null) {
// wild and unpredictable code comment rules // wild and unpredictable code comment rules
if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '\t') { if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '\t') {
continue; continue;
}
line = unescapeEntry(line);
// if we haven't seen any custom morphological data, try to parse one
if (!hasCustomMorphData) {
int morphStart = line.indexOf(MORPH_SEPARATOR);
if (morphStart >= 0) {
String data = line.substring(morphStart + 1);
hasCustomMorphData =
splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
}
}
wordCount += writeNormalizedWordEntry(sb, writer, line);
} }
line = unescapeEntry(line);
// if we haven't seen any custom morphological data, try to parse one
if (!hasCustomMorphData) {
int morphStart = line.indexOf(MORPH_SEPARATOR);
if (morphStart >= 0) {
String data = line.substring(morphStart + 1);
hasCustomMorphData = splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
}
}
writeNormalizedWordEntry(sb, line, acc);
} }
CodecUtil.writeFooter(output);
} }
return wordCount;
} }
/** private void writeNormalizedWordEntry(StringBuilder reuse, String line, EntryAccumulator acc)
* @return the number of word entries written
*/
private int writeNormalizedWordEntry(StringBuilder reuse, ByteSequencesWriter writer, String line)
throws IOException { throws IOException {
int flagSep = line.indexOf(FLAG_SEPARATOR); int flagSep = line.indexOf(FLAG_SEPARATOR);
int morphSep = line.indexOf(MORPH_SEPARATOR); int morphSep = line.indexOf(MORPH_SEPARATOR);
assert morphSep > 0; assert morphSep > 0;
assert morphSep > flagSep; assert morphSep > flagSep;
int sep = flagSep < 0 ? morphSep : flagSep; int sep = flagSep < 0 ? morphSep : flagSep;
if (sep == 0) return 0; if (sep == 0) return;
CharSequence toWrite; CharSequence toWrite;
String beforeSep = line.substring(0, sep); String beforeSep = line.substring(0, sep);
@ -1043,19 +1049,16 @@ public class Dictionary {
String written = toWrite.toString(); String written = toWrite.toString();
sep = written.length() - (line.length() - sep); sep = written.length() - (line.length() - sep);
writer.write(written.getBytes(StandardCharsets.UTF_8)); acc.addEntry(written);
WordCase wordCase = WordCase.caseOf(written, sep); WordCase wordCase = WordCase.caseOf(written, sep);
if (wordCase == WordCase.MIXED || wordCase == WordCase.UPPER && flagSep > 0) { if (wordCase == WordCase.MIXED || wordCase == WordCase.UPPER && flagSep > 0) {
addHiddenCapitalizedWord(reuse, writer, written.substring(0, sep), written.substring(sep)); addHiddenCapitalizedWord(reuse, acc, written.substring(0, sep), written.substring(sep));
return 2;
} }
return 1;
} }
private void addHiddenCapitalizedWord( private void addHiddenCapitalizedWord(
StringBuilder reuse, ByteSequencesWriter writer, String word, String afterSep) StringBuilder reuse, EntryAccumulator acc, String word, String afterSep) throws IOException {
throws IOException {
reuse.setLength(0); reuse.setLength(0);
reuse.append(Character.toUpperCase(word.charAt(0))); reuse.append(Character.toUpperCase(word.charAt(0)));
for (int i = 1; i < word.length(); i++) { for (int i = 1; i < word.length(); i++) {
@ -1064,7 +1067,7 @@ public class Dictionary {
reuse.append(FLAG_SEPARATOR); reuse.append(FLAG_SEPARATOR);
reuse.append(HIDDEN_FLAG); reuse.append(HIDDEN_FLAG);
reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length()); reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length());
writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8)); acc.addEntry(reuse.toString());
} }
String toLowerCase(String word) { String toLowerCase(String word) {
@ -1084,137 +1087,66 @@ public class Dictionary {
return new String(chars); return new String(chars);
} }
private String sortWordsOffline( private WordStorage readSortedDictionaries(FlagEnumerator flags, EntrySupplier sorted)
Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException { throws IOException {
OfflineSorter sorter =
new OfflineSorter(
tempDir,
tempFileNamePrefix,
new Comparator<>() {
final BytesRef scratch1 = new BytesRef();
final BytesRef scratch2 = new BytesRef();
private void initScratch(BytesRef o, BytesRef scratch) {
scratch.bytes = o.bytes;
scratch.offset = o.offset;
scratch.length = o.length;
for (int i = scratch.length - 1; i >= 0; i--) {
if (scratch.bytes[scratch.offset + i] == FLAG_SEPARATOR
|| scratch.bytes[scratch.offset + i] == MORPH_SEPARATOR) {
scratch.length = i;
break;
}
}
}
@Override
public int compare(BytesRef o1, BytesRef o2) {
initScratch(o1, scratch1);
initScratch(o2, scratch2);
int cmp = scratch1.compareTo(scratch2);
if (cmp == 0) {
// tie break on whole row
return o1.compareTo(o2);
} else {
return cmp;
}
}
});
String sorted;
boolean success = false;
try {
sorted = sorter.sort(unsorted.getName());
success = true;
} finally {
if (success) {
tempDir.deleteFile(unsorted.getName());
} else {
IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
}
}
return sorted;
}
private WordStorage readSortedDictionaries(
Directory tempDir, String sorted, FlagEnumerator flags, int wordCount) throws IOException {
boolean success = false;
Map<String, Integer> morphIndices = new HashMap<>(); Map<String, Integer> morphIndices = new HashMap<>();
WordStorage.Builder builder = WordStorage.Builder builder =
new WordStorage.Builder( new WordStorage.Builder(
wordCount, hashFactor(), hasCustomMorphData, flags, allNonSuggestibleFlags()); sorted.wordCount(), hashFactor(), hasCustomMorphData, flags, allNonSuggestibleFlags());
try (ByteSequencesReader reader = // TODO: the flags themselves can be double-chars (long) or also numeric
new ByteSequencesReader(tempDir.openChecksumInput(sorted), sorted)) { // either way the trick is to encode them as char... but they must be parsed differently
// TODO: the flags themselves can be double-chars (long) or also numeric while (true) {
// either way the trick is to encode them as char... but they must be parsed differently String line = sorted.next();
if (line == null) break;
while (true) { String entry;
BytesRef scratch = reader.next(); char[] wordForm;
if (scratch == null) { int end;
break;
}
String line = scratch.utf8ToString(); int flagSep = line.indexOf(FLAG_SEPARATOR);
String entry; if (flagSep == -1) {
char[] wordForm; wordForm = NOFLAGS;
int end; end = line.indexOf(MORPH_SEPARATOR);
entry = line.substring(0, end);
int flagSep = line.indexOf(FLAG_SEPARATOR);
if (flagSep == -1) {
wordForm = NOFLAGS;
end = line.indexOf(MORPH_SEPARATOR);
entry = line.substring(0, end);
} else {
end = line.indexOf(MORPH_SEPARATOR);
boolean hidden = line.charAt(flagSep + 1) == HIDDEN_FLAG;
String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end).strip();
if (aliasCount > 0 && !flagPart.isEmpty()) {
flagPart = getAliasValue(Integer.parseInt(flagPart));
}
wordForm = flagParsingStrategy.parseFlags(flagPart);
if (hidden) {
wordForm = ArrayUtil.growExact(wordForm, wordForm.length + 1);
wordForm[wordForm.length - 1] = HIDDEN_FLAG;
}
entry = line.substring(0, flagSep);
}
if (entry.isEmpty()) continue;
int morphDataID = 0;
if (end + 1 < line.length()) {
List<String> morphFields = readMorphFields(entry, line.substring(end + 1));
if (!morphFields.isEmpty()) {
morphFields.sort(Comparator.naturalOrder());
morphDataID = addMorphFields(morphIndices, String.join(" ", morphFields));
}
}
builder.add(entry, wordForm, morphDataID);
}
// finalize last entry
success = true;
return new WordStorage(builder) {
@Override
char caseFold(char c) {
return Dictionary.this.caseFold(c);
}
};
} finally {
if (success) {
tempDir.deleteFile(sorted);
} else { } else {
IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted); end = line.indexOf(MORPH_SEPARATOR);
boolean hidden = line.charAt(flagSep + 1) == HIDDEN_FLAG;
String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end).strip();
if (aliasCount > 0 && !flagPart.isEmpty()) {
flagPart = getAliasValue(Integer.parseInt(flagPart));
}
wordForm = flagParsingStrategy.parseFlags(flagPart);
if (hidden) {
wordForm = ArrayUtil.growExact(wordForm, wordForm.length + 1);
wordForm[wordForm.length - 1] = HIDDEN_FLAG;
}
entry = line.substring(0, flagSep);
} }
if (entry.isEmpty()) continue;
int morphDataID = 0;
if (end + 1 < line.length()) {
List<String> morphFields = readMorphFields(entry, line.substring(end + 1));
if (!morphFields.isEmpty()) {
morphFields.sort(Comparator.naturalOrder());
morphDataID = addMorphFields(morphIndices, String.join(" ", morphFields));
}
}
builder.add(entry, wordForm, morphDataID);
} }
return new WordStorage(builder) {
@Override
char caseFold(char c) {
return Dictionary.this.caseFold(c);
}
};
} }
/** /**

View File

@ -0,0 +1,181 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import java.io.Closeable;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefComparator;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
/**
 * The strategy defining how a Hunspell dictionary should be loaded, with different tradeoffs. The
 * entries should be sorted in a special way, and this can be done either in-memory (faster, but
 * temporarily allocating more memory) or using disk (slower, but not needing much memory).
 *
 * @see #offline(Directory, String)
 * @see #inMemory()
 */
public abstract class SortingStrategy {

  /**
   * Begins a new loading session.
   *
   * @return the accumulator that will collect the raw (unsorted) dictionary entries
   * @throws IOException if the strategy fails to allocate its backing storage
   */
  abstract EntryAccumulator start() throws IOException;

  /** Collects unsorted dictionary entries and hands them back in sorted order. */
  interface EntryAccumulator {

    /** Records one more entry line. */
    void addEntry(String entry) throws IOException;

    /**
     * Ends the accumulation phase and sorts everything added so far.
     *
     * @return a supplier iterating over the sorted entries; the caller is responsible for closing
     *     it
     */
    EntrySupplier finishAndSort() throws IOException;
  }

  /** A closeable, forward-only view over the sorted entries. */
  interface EntrySupplier extends Closeable {

    /** The number of entries that were added to the accumulator. */
    int wordCount();

    /** The next line or {@code null} if the end is reached */
    String next() throws IOException;
  }

  /**
   * An "offline" strategy that creates temporary files in the given directory and uses them for
   * sorting with {@link OfflineSorter}. It's slower than {@link #inMemory()}, but doesn't need to
   * load the entire dictionary into memory.
   *
   * <p>NOTE: the unsorted temporary file is created by {@code start()}; it is only cleaned up once
   * {@code finishAndSort()} is invoked, because the accumulator has no separate abort/close hook.
   *
   * @param tempDir the directory where the temporary files are created
   * @param tempFileNamePrefix the name prefix used for the temporary files
   */
  public static SortingStrategy offline(Directory tempDir, String tempFileNamePrefix) {
    return new SortingStrategy() {
      @Override
      EntryAccumulator start() throws IOException {
        IndexOutput output = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
        ByteSequencesWriter writer = new ByteSequencesWriter(output);
        return new EntryAccumulator() {
          int wordCount = 0;

          @Override
          public void addEntry(String entry) throws IOException {
            wordCount++;
            writer.write(entry.getBytes(StandardCharsets.UTF_8));
          }

          @Override
          public EntrySupplier finishAndSort() throws IOException {
            finishUnsortedFile();
            String sortedFile = sortWordsOffline();
            ByteSequencesReader reader = openSortedFile(sortedFile);
            return new EntrySupplier() {
              // becomes true only after the sorted file has been read to its end
              boolean success = false;

              @Override
              public int wordCount() {
                return wordCount;
              }

              @Override
              public String next() throws IOException {
                BytesRef scratch = reader.next();
                if (scratch == null) {
                  success = true;
                  return null;
                }
                return scratch.utf8ToString();
              }

              @Override
              public void close() throws IOException {
                boolean closed = false;
                try {
                  reader.close();
                  closed = true;
                } finally {
                  // The temporary file must go away even if closing the reader failed. Deletion
                  // errors are only reported when everything else went well, so that they never
                  // mask an earlier exception.
                  if (success && closed) {
                    tempDir.deleteFile(sortedFile);
                  } else {
                    IOUtils.deleteFilesIgnoringExceptions(tempDir, sortedFile);
                  }
                }
              }
            };
          }

          /**
           * Writes the footer and closes the writer (and thus the underlying output). The writer is
           * closed even if writing the footer fails; on any failure the unsorted file is removed.
           */
          private void finishUnsortedFile() throws IOException {
            boolean finished = false;
            try {
              try (writer) {
                CodecUtil.writeFooter(output);
              }
              finished = true;
            } finally {
              if (!finished) {
                IOUtils.deleteFilesIgnoringExceptions(tempDir, output.getName());
              }
            }
          }

          /** Opens the sorted file for reading, deleting it if it cannot be opened. */
          private ByteSequencesReader openSortedFile(String sortedFile) throws IOException {
            boolean opened = false;
            try {
              ByteSequencesReader reader =
                  new ByteSequencesReader(tempDir.openChecksumInput(sortedFile), sortedFile);
              opened = true;
              return reader;
            } finally {
              if (!opened) {
                IOUtils.deleteFilesIgnoringExceptions(tempDir, sortedFile);
              }
            }
          }

          /**
           * Sorts the unsorted file in the natural byte order and removes it afterwards.
           *
           * @return the name of the sorted temporary file
           */
          private String sortWordsOffline() throws IOException {
            var sorter = new OfflineSorter(tempDir, tempFileNamePrefix, BytesRefComparator.NATURAL);

            String sorted;
            boolean success = false;
            try {
              sorted = sorter.sort(output.getName());
              success = true;
            } finally {
              if (success) {
                tempDir.deleteFile(output.getName());
              } else {
                IOUtils.deleteFilesIgnoringExceptions(tempDir, output.getName());
              }
            }
            return sorted;
          }
        };
      }
    };
  }

  /**
   * The strategy that loads all entries as {@link String} objects and sorts them in memory. The
   * entries are then stored in a more compressed way, and the strings are gc-ed, but the loading
   * itself needs {@code O(dictionary_size)} memory.
   */
  public static SortingStrategy inMemory() {
    return new SortingStrategy() {
      @Override
      EntryAccumulator start() {
        List<String> entries = new ArrayList<>();
        return new EntryAccumulator() {
          @Override
          public void addEntry(String entry) {
            entries.add(entry);
          }

          @Override
          public EntrySupplier finishAndSort() {
            entries.sort(Comparator.naturalOrder());
            return new EntrySupplier() {
              // index of the next entry to return
              int i = 0;

              @Override
              public int wordCount() {
                return entries.size();
              }

              @Override
              public String next() {
                return i < entries.size() ? entries.get(i++) : null;
              }

              @Override
              public void close() {}
            };
          }
        };
      }
    };
  }
}

View File

@ -350,16 +350,19 @@ abstract class WordStorage {
currentOrds.clear(); currentOrds.clear();
boolean hasNonHidden = false; boolean hasNonHidden = false;
boolean isSuggestible = false;
for (char[] flags : group) { for (char[] flags : group) {
if (!hasFlag(flags, Dictionary.HIDDEN_FLAG)) { if (!hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
hasNonHidden = true; hasNonHidden = true;
break; }
if (!hasNoSuggestFlag(flags)) {
isSuggestible = true;
} }
} }
for (int i = 0; i < group.size(); i++) { for (int i = 0; i < group.size(); i++) {
char[] flags = group.get(i); char[] flags = group.get(i);
if (hasNonHidden && hasFlag(flags, Dictionary.HIDDEN_FLAG)) { if (hasNonHidden && group.size() > 1 && hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
continue; continue;
} }
@ -388,7 +391,7 @@ abstract class WordStorage {
int mask = int mask =
(prevCode == 0 ? 0 : COLLISION_MASK) (prevCode == 0 ? 0 : COLLISION_MASK)
| (group.stream().anyMatch(flags -> !hasNoSuggestFlag(flags)) ? SUGGESTIBLE_MASK : 0) | (isSuggestible ? SUGGESTIBLE_MASK : 0)
| Math.min(currentEntry.length(), MAX_STORED_LENGTH); | Math.min(currentEntry.length(), MAX_STORED_LENGTH);
hashTable[hash] = (mask << OFFSET_BITS) | pos; hashTable[hash] = (mask << OFFSET_BITS) | pos;

View File

@ -210,7 +210,8 @@ public final class StemmerOverrideFilter extends TokenFilter {
*/ */
public StemmerOverrideMap build() throws IOException { public StemmerOverrideMap build() throws IOException {
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs); FSTCompiler<BytesRef> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).build();
final int[] sort = hash.sort(); final int[] sort = hash.sort();
IntsRefBuilder intsSpare = new IntsRefBuilder(); IntsRefBuilder intsSpare = new IntsRefBuilder();
final int size = hash.size(); final int size = hash.size();

View File

@ -46,11 +46,11 @@ public class TruncateTokenFilterFactory extends TokenFilterFactory {
public static final String NAME = "truncate"; public static final String NAME = "truncate";
public static final String PREFIX_LENGTH_KEY = "prefixLength"; public static final String PREFIX_LENGTH_KEY = "prefixLength";
private final byte prefixLength; private final int prefixLength;
public TruncateTokenFilterFactory(Map<String, String> args) { public TruncateTokenFilterFactory(Map<String, String> args) {
super(args); super(args);
prefixLength = Byte.parseByte(get(args, PREFIX_LENGTH_KEY, "5")); prefixLength = Integer.parseInt(get(args, PREFIX_LENGTH_KEY, "5"));
if (prefixLength < 1) if (prefixLength < 1)
throw new IllegalArgumentException( throw new IllegalArgumentException(
PREFIX_LENGTH_KEY + " parameter must be a positive number: " + prefixLength); PREFIX_LENGTH_KEY + " parameter must be a positive number: " + prefixLength);

View File

@ -163,7 +163,6 @@ public final class WordDelimiterFilter extends TokenFilter {
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class); private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
;
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAttribute = private final PositionIncrementAttribute posIncAttribute =
addAttribute(PositionIncrementAttribute.class); addAttribute(PositionIncrementAttribute.class);

View File

@ -164,7 +164,6 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class); private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
;
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAttribute = private final PositionIncrementAttribute posIncAttribute =
addAttribute(PositionIncrementAttribute.class); addAttribute(PositionIncrementAttribute.class);

View File

@ -45,7 +45,7 @@ import org.apache.lucene.util.ResourceLoaderAware;
* preserveOriginal="0" splitOnNumerics="1" splitOnCaseChange="1" * preserveOriginal="0" splitOnNumerics="1" splitOnCaseChange="1"
* catenateWords="0" catenateNumbers="0" catenateAll="0" * catenateWords="0" catenateNumbers="0" catenateAll="0"
* generateWordParts="1" generateNumberParts="1" stemEnglishPossessive="1" * generateWordParts="1" generateNumberParts="1" stemEnglishPossessive="1"
* types="wdfftypes.txt" /&gt; * types="wdfftypes.txt" ignoreKeywords="0" /&gt;
* &lt;/analyzer&gt; * &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre> * &lt;/fieldType&gt;</pre>
* *
@ -100,6 +100,9 @@ public class WordDelimiterGraphFilterFactory extends TokenFilterFactory
if (getInt(args, "stemEnglishPossessive", 1) != 0) { if (getInt(args, "stemEnglishPossessive", 1) != 0) {
flags |= STEM_ENGLISH_POSSESSIVE; flags |= STEM_ENGLISH_POSSESSIVE;
} }
if (getInt(args, "ignoreKeywords", 0) != 0) {
flags |= IGNORE_KEYWORDS;
}
wordFiles = get(args, PROTECTED_TOKENS); wordFiles = get(args, PROTECTED_TOKENS);
types = get(args, TYPES); types = get(args, TYPES);
this.flags = flags; this.flags = flags;

View File

@ -216,7 +216,6 @@ public final class SynonymFilter extends TokenFilter {
count++; count++;
} }
} }
;
private final ByteArrayDataInput bytesReader = new ByteArrayDataInput(); private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();

View File

@ -222,7 +222,8 @@ public class SynonymMap {
public SynonymMap build() throws IOException { public SynonymMap build() throws IOException {
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
// TODO: are we using the best sharing options? // TODO: are we using the best sharing options?
FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs); FSTCompiler<BytesRef> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).build();
BytesRefBuilder scratch = new BytesRefBuilder(); BytesRefBuilder scratch = new BytesRefBuilder();
ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput(); ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

View File

@ -595,8 +595,7 @@ public class TestHTMLStripCharFilter extends BaseTokenStreamTestCase {
} }
} }
Reader reader = new HTMLStripCharFilter(new StringReader(text.toString())); Reader reader = new HTMLStripCharFilter(new StringReader(text.toString()));
while (reader.read() != -1) while (reader.read() != -1) {}
;
} }
public void testUTF16Surrogates() throws Exception { public void testUTF16Surrogates() throws Exception {

View File

@ -230,7 +230,6 @@ public class TestDuelingAnalyzers extends BaseTokenStreamTestCase {
assertEquals( assertEquals(
"wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset()); "wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
} }
;
assertFalse("wrong number of tokens for input: " + s, right.incrementToken()); assertFalse("wrong number of tokens for input: " + s, right.incrementToken());
left.end(); left.end();
right.end(); right.end();

View File

@ -41,7 +41,6 @@ import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Function; import java.util.function.Function;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import org.apache.lucene.tests.store.BaseDirectoryWrapper;
import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.LuceneTestCase.SuppressSysoutChecks; import org.apache.lucene.tests.util.LuceneTestCase.SuppressSysoutChecks;
import org.apache.lucene.tests.util.RamUsageTester; import org.apache.lucene.tests.util.RamUsageTester;
@ -72,9 +71,8 @@ public class TestAllDictionaries extends LuceneTestCase {
Path dic = Path.of(affPath.substring(0, affPath.length() - 4) + ".dic"); Path dic = Path.of(affPath.substring(0, affPath.length() - 4) + ".dic");
assert Files.exists(dic) : dic; assert Files.exists(dic) : dic;
try (InputStream dictionary = Files.newInputStream(dic); try (InputStream dictionary = Files.newInputStream(dic);
InputStream affix = Files.newInputStream(aff); InputStream affix = Files.newInputStream(aff)) {
BaseDirectoryWrapper tempDir = newDirectory()) { return new Dictionary(affix, List.of(dictionary), false, SortingStrategy.inMemory()) {
return new Dictionary(tempDir, "dictionary", affix, dictionary) {
@Override @Override
protected boolean tolerateAffixRuleCountMismatches() { protected boolean tolerateAffixRuleCountMismatches() {
return true; return true;

View File

@ -256,15 +256,22 @@ public class TestSpellChecking extends LuceneTestCase {
} }
static void checkSpellCheckerExpectations(Path basePath) throws IOException, ParseException { static void checkSpellCheckerExpectations(Path basePath) throws IOException, ParseException {
InputStream affixStream = Files.newInputStream(Path.of(basePath.toString() + ".aff")); checkSpellCheckerExpectations(
basePath, SortingStrategy.offline(new ByteBuffersDirectory(), "dictionary"));
checkSpellCheckerExpectations(basePath, SortingStrategy.inMemory());
}
private static void checkSpellCheckerExpectations(Path basePath, SortingStrategy strategy)
throws IOException, ParseException {
Path affFile = Path.of(basePath + ".aff");
Path dicFile = Path.of(basePath + ".dic"); Path dicFile = Path.of(basePath + ".dic");
InputStream affixStream = Files.newInputStream(affFile);
InputStream dictStream = Files.newInputStream(dicFile); InputStream dictStream = Files.newInputStream(dicFile);
Hunspell speller; Hunspell speller;
Map<String, Suggester> suggesters = new LinkedHashMap<>(); Map<String, Suggester> suggesters = new LinkedHashMap<>();
try { try {
Dictionary dictionary = Dictionary dictionary = new Dictionary(affixStream, List.of(dictStream), false, strategy);
new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream);
speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {}); speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
Suggester suggester = new Suggester(dictionary); Suggester suggester = new Suggester(dictionary);
suggesters.put("default", suggester); suggesters.put("default", suggester);

View File

@ -41,7 +41,6 @@ public class TestIndicNormalizer extends BaseTokenStreamTestCase {
private void check(String input, String output) throws IOException { private void check(String input, String output) throws IOException {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
;
tokenizer.setReader(new StringReader(input)); tokenizer.setReader(new StringReader(input));
TokenFilter tf = new IndicNormalizationFilter(tokenizer); TokenFilter tf = new IndicNormalizationFilter(tokenizer);
assertTokenStreamContents(tf, new String[] {output}); assertTokenStreamContents(tf, new String[] {output});

View File

@ -89,7 +89,6 @@ public class TestKeywordMarkerFilterFactory extends BaseTokenStreamFactoryTestCa
stream = stream =
tokenFilterFactory("KeywordMarker", "pattern", "Cats", "ignoreCase", "true").create(stream); tokenFilterFactory("KeywordMarker", "pattern", "Cats", "ignoreCase", "true").create(stream);
stream = tokenFilterFactory("PorterStem").create(stream); stream = tokenFilterFactory("PorterStem").create(stream);
;
assertTokenStreamContents(stream, new String[] {"dog", "cats", "Cats"}); assertTokenStreamContents(stream, new String[] {"dog", "cats", "Cats"});
} }

View File

@ -68,4 +68,23 @@ public class TestTruncateTokenFilterFactory extends BaseTokenStreamFactoryTestCa
TruncateTokenFilterFactory.PREFIX_LENGTH_KEY TruncateTokenFilterFactory.PREFIX_LENGTH_KEY
+ " parameter must be a positive number: -5")); + " parameter must be a positive number: -5"));
} }
/**
 * Checks that a prefix length larger than {@link Byte#MAX_VALUE} (128 here) is accepted by the
 * factory and that the first, over-long token is cut down to exactly that many characters.
 */
public void testLengthGreaterThanByteLimitArgument() throws Exception {
  Tokenizer whitespaceTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  whitespaceTokenizer.setReader(
      new StringReader(
          "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvw128characters From here"));

  TokenStream truncated =
      tokenFilterFactory("Truncate", TruncateTokenFilterFactory.PREFIX_LENGTH_KEY, "128")
          .create(whitespaceTokenizer);

  // the first token keeps only its leading 128 characters; the short ones pass through untouched
  String[] expectedTerms = {
    "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvw1",
    "From",
    "here"
  };
  assertTokenStreamContents(truncated, expectedTerms);
}
} }

View File

@ -69,7 +69,6 @@ public class TestEdgeNGramTokenizer extends BaseTokenStreamTestCase {
public void testOversizedNgrams() throws Exception { public void testOversizedNgrams() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(6, 6); EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(6, 6);
tokenizer.setReader(input); tokenizer.setReader(input);
;
assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */); assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
} }

View File

@ -156,7 +156,6 @@ public class TestCharArrayIterator extends LuceneTestCase {
private void consume(BreakIterator bi, CharacterIterator ci) { private void consume(BreakIterator bi, CharacterIterator ci) {
bi.setText(ci); bi.setText(ci);
while (bi.next() != BreakIterator.DONE) while (bi.next() != BreakIterator.DONE) {}
;
} }
} }

View File

@ -16,6 +16,8 @@
*/ */
package org.apache.lucene.analysis.ja.dict; package org.apache.lucene.analysis.ja.dict;
import static org.apache.lucene.util.fst.FST.readMetadata;
import java.io.BufferedInputStream; import java.io.BufferedInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
@ -103,7 +105,7 @@ public final class TokenInfoDictionary extends BinaryDictionary<TokenInfoMorphDa
FST<Long> fst; FST<Long> fst;
try (InputStream is = new BufferedInputStream(fstResource.get())) { try (InputStream is = new BufferedInputStream(fstResource.get())) {
DataInput in = new InputStreamDataInput(is); DataInput in = new InputStreamDataInput(is);
fst = new FST<>(in, in, PositiveIntOutputs.getSingleton()); fst = new FST<>(readMetadata(in, PositiveIntOutputs.getSingleton()), in);
} }
// TODO: some way to configure? // TODO: some way to configure?
this.fst = new TokenInfoFST(fst, true); this.fst = new TokenInfoFST(fst, true);

View File

@ -101,7 +101,8 @@ class TokenInfoDictionaryBuilder {
lines.sort(Comparator.comparing(entry -> entry[0])); lines.sort(Comparator.comparing(entry -> entry[0]));
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(); PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput); FSTCompiler<Long> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput).build();
IntsRefBuilder scratch = new IntsRefBuilder(); IntsRefBuilder scratch = new IntsRefBuilder();
long ord = -1; // first ord will be 0 long ord = -1; // first ord will be 0
String lastValue = null; String lastValue = null;

View File

@ -93,7 +93,8 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
List<int[]> segmentations = new ArrayList<>(featureEntries.size()); List<int[]> segmentations = new ArrayList<>(featureEntries.size());
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(); PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput); FSTCompiler<Long> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput).build();
IntsRefBuilder scratch = new IntsRefBuilder(); IntsRefBuilder scratch = new IntsRefBuilder();
long ord = 0; long ord = 0;

View File

@ -758,8 +758,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
for (int i = 0; i < numIterations; i++) { for (int i = 0; i < numIterations; i++) {
try (TokenStream ts = analyzer.tokenStream("ignored", line)) { try (TokenStream ts = analyzer.tokenStream("ignored", line)) {
ts.reset(); ts.reset();
while (ts.incrementToken()) while (ts.incrementToken()) {}
;
ts.end(); ts.end();
} }
} }
@ -775,8 +774,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
for (String sentence : sentences) { for (String sentence : sentences) {
try (TokenStream ts = analyzer.tokenStream("ignored", sentence)) { try (TokenStream ts = analyzer.tokenStream("ignored", sentence)) {
ts.reset(); ts.reset();
while (ts.incrementToken()) while (ts.incrementToken()) {}
;
ts.end(); ts.end();
} }
} }
@ -831,8 +829,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
new JapaneseTokenizer(newAttributeFactory(), readDict(), false, Mode.NORMAL); new JapaneseTokenizer(newAttributeFactory(), readDict(), false, Mode.NORMAL);
tokenizer.setReader(new StringReader(doc)); tokenizer.setReader(new StringReader(doc));
tokenizer.reset(); tokenizer.reset();
while (tokenizer.incrementToken()) while (tokenizer.incrementToken()) {}
;
} }
public void testPatchedSystemDict() throws Exception { public void testPatchedSystemDict() throws Exception {

View File

@ -16,6 +16,8 @@
*/ */
package org.apache.lucene.analysis.ko.dict; package org.apache.lucene.analysis.ko.dict;
import static org.apache.lucene.util.fst.FST.readMetadata;
import java.io.BufferedInputStream; import java.io.BufferedInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
@ -102,7 +104,7 @@ public final class TokenInfoDictionary extends BinaryDictionary<TokenInfoMorphDa
FST<Long> fst; FST<Long> fst;
try (InputStream is = new BufferedInputStream(fstResource.get())) { try (InputStream is = new BufferedInputStream(fstResource.get())) {
DataInput in = new InputStreamDataInput(is); DataInput in = new InputStreamDataInput(is);
fst = new FST<>(in, in, PositiveIntOutputs.getSingleton()); fst = new FST<>(readMetadata(in, PositiveIntOutputs.getSingleton()), in);
} }
this.fst = new TokenInfoFST(fst); this.fst = new TokenInfoFST(fst);
} }

View File

@ -94,7 +94,8 @@ class TokenInfoDictionaryBuilder {
lines.sort(Comparator.comparing(left -> left[0])); lines.sort(Comparator.comparing(left -> left[0]));
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(); PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput); FSTCompiler<Long> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput).build();
IntsRefBuilder scratch = new IntsRefBuilder(); IntsRefBuilder scratch = new IntsRefBuilder();
long ord = -1; // first ord will be 0 long ord = -1; // first ord will be 0
String lastValue = null; String lastValue = null;

View File

@ -75,7 +75,8 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
entries.sort(Comparator.comparing(e -> e.split("\\s+")[0])); entries.sort(Comparator.comparing(e -> e.split("\\s+")[0]));
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(); PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput); FSTCompiler<Long> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput).build();
IntsRefBuilder scratch = new IntsRefBuilder(); IntsRefBuilder scratch = new IntsRefBuilder();
String lastToken = null; String lastToken = null;

View File

@ -41,7 +41,6 @@ public class TestPolishAnalyzer extends BaseTokenStreamTestCase {
/** test use of exclusion set */ /** test use of exclusion set */
public void testExclude() throws IOException { public void testExclude() throws IOException {
CharArraySet exclusionSet = new CharArraySet(asSet("studenta"), false); CharArraySet exclusionSet = new CharArraySet(asSet("studenta"), false);
;
Analyzer a = new PolishAnalyzer(PolishAnalyzer.getDefaultStopSet(), exclusionSet); Analyzer a = new PolishAnalyzer(PolishAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTerm(a, "studenta", "studenta"); checkOneTerm(a, "studenta", "studenta");
checkOneTerm(a, "studenci", "student"); checkOneTerm(a, "studenci", "student");

View File

@ -16,6 +16,8 @@
*/ */
package org.apache.lucene.backward_codecs.lucene40.blocktree; package org.apache.lucene.backward_codecs.lucene40.blocktree;
import static org.apache.lucene.util.fst.FST.readMetadata;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexOptions;
@ -89,9 +91,17 @@ public final class FieldReader extends Terms {
final IndexInput clone = indexIn.clone(); final IndexInput clone = indexIn.clone();
clone.seek(indexStartFP); clone.seek(indexStartFP);
if (metaIn == indexIn) { // Only true before Lucene 8.6 if (metaIn == indexIn) { // Only true before Lucene 8.6
index = new FST<>(clone, clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore()); index =
new FST<>(
readMetadata(clone, ByteSequenceOutputs.getSingleton()),
clone,
new OffHeapFSTStore());
} else { } else {
index = new FST<>(metaIn, clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore()); index =
new FST<>(
readMetadata(metaIn, ByteSequenceOutputs.getSingleton()),
clone,
new OffHeapFSTStore());
} }
/* /*
if (false) { if (false) {

View File

@ -22,6 +22,7 @@ import java.nio.file.Path;
import java.nio.file.Paths; import java.nio.file.Paths;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.IntPoint; import org.apache.lucene.document.IntPoint;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
@ -70,7 +71,7 @@ public class TestManyPointsInOldIndex extends LuceneTestCase {
dir.setCheckIndexOnClose(false); dir.setCheckIndexOnClose(false);
// ... because we check ourselves here: // ... because we check ourselves here:
TestUtil.checkIndex(dir, false, true, true, null); TestUtil.checkIndex(dir, CheckIndex.Level.MIN_LEVEL_FOR_INTEGRITY_CHECKS, true, true, null);
dir.close(); dir.close();
} }
} }

View File

@ -23,6 +23,7 @@ description = 'Lucene JMH micro-benchmarking module'
dependencies { dependencies {
moduleImplementation project(':lucene:core') moduleImplementation project(':lucene:core')
moduleImplementation project(':lucene:expressions')
moduleImplementation "org.openjdk.jmh:jmh-core:1.37" moduleImplementation "org.openjdk.jmh:jmh-core:1.37"
annotationProcessor "org.openjdk.jmh:jmh-generator-annprocess:1.37" annotationProcessor "org.openjdk.jmh:jmh-generator-annprocess:1.37"
@ -42,7 +43,7 @@ tasks.matching { it.name == "forbiddenApisMain" }.configureEach {
tasks.matching { it.name in [ tasks.matching { it.name in [
// Turn off JMH dependency checksums and licensing (it's GPL w/ classpath exception // Turn off JMH dependency checksums and licensing (it's GPL w/ classpath exception
// but this seems fine for test/build only tools). // but this seems fine for test/build only tools).
"validateJarChecksums", "validateJarLicenses", "validateJarChecksums", "validateJarLicenses", "collectJarInfos",
// No special javadocs for JMH benchmarks. // No special javadocs for JMH benchmarks.
"renderSiteJavadoc", "renderSiteJavadoc",
"renderJavadoc", "renderJavadoc",

View File

@ -20,6 +20,7 @@ module org.apache.lucene.benchmark.jmh {
requires jmh.core; requires jmh.core;
requires jdk.unsupported; requires jdk.unsupported;
requires org.apache.lucene.core; requires org.apache.lucene.core;
requires org.apache.lucene.expressions;
exports org.apache.lucene.benchmark.jmh; exports org.apache.lucene.benchmark.jmh;
exports org.apache.lucene.benchmark.jmh.jmh_generated; exports org.apache.lucene.benchmark.jmh.jmh_generated;

View File

@ -0,0 +1,148 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.jmh;
import java.io.IOException;
import java.lang.invoke.MethodHandle;
import java.lang.invoke.MethodHandles;
import java.lang.invoke.MethodType;
import java.text.ParseException;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.expressions.Expression;
import org.apache.lucene.expressions.js.JavascriptCompiler;
import org.apache.lucene.search.DoubleValues;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
/**
 * JMH benchmark measuring how fast expressions compiled by {@link JavascriptCompiler} evaluate
 * over an array of random doubles, compared with a hand-written {@link Expression}.
 */
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
@Warmup(iterations = 5, time = 5)
@Measurement(iterations = 12, time = 8)
@Fork(value = 1)
public class ExpressionsBenchmark {

  /**
   * Some extra functions to bench "identity" in various variants, another one is named
   * "native_identity" (see below).
   */
  private static final Map<String, MethodHandle> FUNCTIONS = getFunctions();

  private static final String NATIVE_IDENTITY_NAME = "native_identity";

  /**
   * Builds the function table handed to the compiler: the default functions plus two identity
   * variants, one backed by the static {@link #ident(double)} method and one backed by {@link
   * MethodHandles#identity(Class)}.
   */
  private static Map<String, MethodHandle> getFunctions() {
    try {
      var lookup = MethodHandles.lookup();
      Map<String, MethodHandle> m = new HashMap<>(JavascriptCompiler.DEFAULT_FUNCTIONS);
      m.put(
          "func_identity",
          lookup.findStatic(
              lookup.lookupClass(), "ident", MethodType.methodType(double.class, double.class)));
      m.put("mh_identity", MethodHandles.identity(double.class));
      return m;
    } catch (ReflectiveOperationException e) {
      // ident(double) is declared in this very class, so a failed lookup is a programming error.
      throw new AssertionError(e);
    }
  }

  /** Target of the "func_identity" function; only reached through its method handle. */
  @SuppressWarnings("unused")
  private static double ident(double v) {
    return v;
  }

  /** A native implementation of an expression to compare performance */
  private static final Expression NATIVE_IDENTITY_EXPRESSION =
      new Expression(NATIVE_IDENTITY_NAME, new String[] {"x"}) {
        @Override
        public double evaluate(DoubleValues[] functionValues) throws IOException {
          return functionValues[0].doubleValue();
        }
      };

  /** Input values, regenerated in {@link #init()}. */
  private double[] randomData;

  /** The expression under test, selected or compiled in {@link #init()}. */
  private Expression expression;

  /**
   * Expression source to benchmark. "native_identity" selects the hand-written expression instead
   * of being compiled. The registered identity functions are written as calls ("mh_identity(x)",
   * "func_identity(x)"): a bare identifier would be compiled as a variable reference and would
   * merely repeat the "x" case without ever invoking the registered method handle.
   */
  @Param({
    "x",
    "func_identity(x)",
    "mh_identity(x)",
    "native_identity",
    "cos(x)",
    "cos(x) + sin(x)"
  })
  String js;

  /** Regenerates the random input and (re)builds the expression before every iteration. */
  @Setup(Level.Iteration)
  public void init() throws ParseException {
    ThreadLocalRandom random = ThreadLocalRandom.current();
    randomData = random.doubles().limit(1024).toArray();
    expression =
        Objects.equals(js, NATIVE_IDENTITY_NAME)
            ? NATIVE_IDENTITY_EXPRESSION
            : JavascriptCompiler.compile(js, FUNCTIONS);
  }

  /**
   * Evaluates the expression once per input value and returns the sum, so the evaluations cannot
   * be discarded as dead code.
   */
  @Benchmark
  public double expression() throws IOException {
    var it = new ValuesIterator(randomData);
    var values = it.getDoubleValues();
    double result = 0d;
    while (it.next()) {
      result += expression.evaluate(values);
    }
    return result;
  }

  /**
   * Cursor over a double array that exposes the current element through a single-element {@link
   * DoubleValues} array, the shape expected by {@link Expression#evaluate(DoubleValues[])}.
   */
  static final class ValuesIterator {
    final double[] data;
    final DoubleValues[] dv;
    // Index of the current element; -1 until next() has been called for the first time.
    int pos = -1;

    ValuesIterator(double[] data) {
      this.data = data;
      var dv =
          new DoubleValues() {
            @Override
            public double doubleValue() throws IOException {
              // Reads the element the enclosing iterator currently points at.
              return data[pos];
            }

            @Override
            public boolean advanceExact(int doc) throws IOException {
              // Positioning is driven by ValuesIterator.next(), never by doc id.
              throw new UnsupportedOperationException();
            }
          };
      this.dv = new DoubleValues[] {dv};
    }

    /** Moves to the next element; returns false once the array is exhausted. */
    boolean next() {
      pos++;
      return (pos < data.length);
    }

    /** Returns the (shared) one-element array whose value tracks the current position. */
    DoubleValues[] getDoubleValues() {
      return dv;
    }
  }
}

View File

@ -0,0 +1,176 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.jmh;
import java.io.IOException;
import java.nio.file.Files;
import java.util.Arrays;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.codecs.lucene99.GroupVIntReader;
import org.apache.lucene.codecs.lucene99.GroupVIntWriter;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.MMapDirectory;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;
/**
 * JMH benchmark comparing reads of plain vints with group-varint reads, both from inputs opened
 * on an {@link MMapDirectory} and from {@link ByteArrayDataInput}s.
 */
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
@State(Scope.Benchmark)
@Warmup(iterations = 3, time = 3)
@Measurement(iterations = 5, time = 5)
@Fork(
    value = 1,
    jvmArgsPrepend = {"--add-modules=jdk.unsupported"})
public class GroupVIntBenchmark {

  // Cumulative frequency for each number of bits per value used by doc deltas of tail postings on
  // wikibigall.
  private static final float[] CUMULATIVE_FREQUENCY_BY_BITS_REQUIRED =
      new float[] {
        0.0f,
        0.01026574f,
        0.021453038f,
        0.03342156f,
        0.046476692f,
        0.060890317f,
        0.07644147f,
        0.093718216f,
        0.11424741f,
        0.13989712f,
        0.17366524f,
        0.22071244f,
        0.2815692f,
        0.3537585f,
        0.43655503f,
        0.52308f,
        0.6104675f,
        0.7047371f,
        0.78155357f,
        0.8671179f,
        0.9740598f,
        1.0f
      };

  /** Number of values written to every input; upper bound for {@link #size}. */
  final int maxSize = 256;

  /** Destination buffer shared by all benchmark methods. */
  final long[] values = new long[maxSize];

  IndexInput byteBufferGVIntIn;
  IndexInput byteBufferVIntIn;

  ByteArrayDataInput byteArrayVIntIn;
  ByteArrayDataInput byteArrayGVIntIn;

  /** Number of values read per benchmark invocation. */
  // @Param({"16", "32", "64", "128", "248"})
  @Param({"64"})
  public int size;

  /** Encodes {@code docs} as vints and as group-varints into heap arrays and wraps them. */
  void initArrayInput(long[] docs) throws Exception {
    byte[] gVIntBytes = new byte[Integer.BYTES * maxSize * 2];
    byte[] vIntBytes = new byte[Integer.BYTES * maxSize * 2];
    ByteArrayDataOutput vIntOut = new ByteArrayDataOutput(vIntBytes);
    GroupVIntWriter w = new GroupVIntWriter();
    w.writeValues(new ByteArrayDataOutput(gVIntBytes), docs, docs.length);
    for (long v : docs) {
      vIntOut.writeVInt((int) v);
    }
    byteArrayVIntIn = new ByteArrayDataInput(vIntBytes);
    byteArrayGVIntIn = new ByteArrayDataInput(gVIntBytes);
  }

  /**
   * Encodes {@code docs} as vints and as group-varints into two files of a fresh temporary
   * directory and opens an input on each of them.
   */
  void initByteBufferInput(long[] docs) throws Exception {
    // The directory is not closed here: the inputs opened from it are kept in fields and are read
    // by the benchmark methods.
    Directory dir = MMapDirectory.open(Files.createTempDirectory("groupvintdata"));
    // try-with-resources so that both outputs are closed even if encoding fails half-way.
    try (IndexOutput vintOut = dir.createOutput("vint", IOContext.DEFAULT);
        IndexOutput gvintOut = dir.createOutput("gvint", IOContext.DEFAULT)) {
      GroupVIntWriter w = new GroupVIntWriter();
      w.writeValues(gvintOut, docs, docs.length);
      for (long v : docs) {
        vintOut.writeVInt((int) v);
      }
    }
    byteBufferGVIntIn = dir.openInput("gvint", IOContext.DEFAULT);
    byteBufferVIntIn = dir.openInput("vint", IOContext.DEFAULT);
  }

  /** Generates the test values with a fixed seed and prepares all four inputs. */
  @Setup(Level.Trial)
  public void init() throws Exception {
    long[] docs = new long[maxSize];
    Random r = new Random(0);
    for (int i = 0; i < maxSize; ++i) {
      float randomFloat = r.nextFloat();
      // Reproduce the distribution of the number of bits per values that we're observing for tail
      // postings on wikibigall.
      // binarySearch returns -(insertionPoint) - 1 when the key is absent, so after adding 1 and
      // negating, numBits is the insertion point; on an exact hit it is the index plus one.
      int numBits = 1 + Arrays.binarySearch(CUMULATIVE_FREQUENCY_BY_BITS_REQUIRED, randomFloat);
      if (numBits < 0) {
        numBits = -numBits;
      }
      // Pick a value that needs exactly numBits bits: [2^(numBits-1), 2^numBits).
      docs[i] = r.nextInt(1 << (numBits - 1), 1 << numBits);
    }
    initByteBufferInput(docs);
    initArrayInput(docs);
  }

  /** Reads {@link #size} vints, one at a time, from the directory-backed input. */
  @Benchmark
  public void byteBufferReadVInt(Blackhole bh) throws IOException {
    byteBufferVIntIn.seek(0);
    for (int i = 0; i < size; i++) {
      values[i] = byteBufferVIntIn.readVInt();
    }
    bh.consume(values);
  }

  /** Reads {@link #size} group-varint encoded values from the directory-backed input. */
  @Benchmark
  public void byteBufferReadGroupVInt(Blackhole bh) throws IOException {
    byteBufferGVIntIn.seek(0);
    GroupVIntReader.readValues(byteBufferGVIntIn, values, size);
    bh.consume(values);
  }

  /** Reads {@link #size} vints, one at a time, from the heap array input. */
  @Benchmark
  public void byteArrayReadVInt(Blackhole bh) {
    byteArrayVIntIn.rewind();
    for (int i = 0; i < size; i++) {
      values[i] = byteArrayVIntIn.readVInt();
    }
    bh.consume(values);
  }

  /** Reads {@link #size} group-varint encoded values from the heap array input. */
  @Benchmark
  public void byteArrayReadGroupVInt(Blackhole bh) throws IOException {
    byteArrayGVIntIn.rewind();
    GroupVIntReader.readValues(byteArrayGVIntIn, values, size);
    bh.consume(values);
  }
}

View File

@ -30,8 +30,8 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort; import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldCollector; import org.apache.lucene.search.TopFieldCollectorManager;
import org.apache.lucene.search.TopScoreDocCollector; import org.apache.lucene.search.TopScoreDocCollectorManager;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits; import org.apache.lucene.util.Bits;
@ -110,15 +110,17 @@ public abstract class ReadTask extends PerfTask {
// the IndexSearcher search methods that take // the IndexSearcher search methods that take
// Weight public again, we can go back to // Weight public again, we can go back to
// pulling the Weight ourselves: // pulling the Weight ourselves:
TopFieldCollector collector = int totalHitsThreshold = withTotalHits() ? Integer.MAX_VALUE : 1;
TopFieldCollector.create(sort, numHits, withTotalHits() ? Integer.MAX_VALUE : 1); TopFieldCollectorManager collectorManager =
searcher.search(q, collector); new TopFieldCollectorManager(
hits = collector.topDocs(); sort, numHits, null, totalHitsThreshold, searcher.getSlices().length > 1);
hits = searcher.search(q, collectorManager);
} else { } else {
hits = searcher.search(q, numHits); hits = searcher.search(q, numHits);
} }
} else { } else {
Collector collector = createCollector(); Collector collector = createCollector();
searcher.search(q, collector); searcher.search(q, collector);
// hits = collector.topDocs(); // hits = collector.topDocs();
} }
@ -183,7 +185,8 @@ public abstract class ReadTask extends PerfTask {
} }
protected Collector createCollector() throws Exception { protected Collector createCollector() throws Exception {
return TopScoreDocCollector.create(numHits(), withTotalHits() ? Integer.MAX_VALUE : 1); return new TopScoreDocCollectorManager(numHits(), withTotalHits() ? Integer.MAX_VALUE : 1)
.newCollector();
} }
protected Document retrieveDoc(StoredFields storedFields, int id) throws IOException { protected Document retrieveDoc(StoredFields storedFields, int id) throws IOException {

View File

@ -207,7 +207,8 @@ public class BooleanPerceptronClassifier implements Classifier<Boolean> {
private void updateFST(SortedMap<String, Double> weights) throws IOException { private void updateFST(SortedMap<String, Double> weights) throws IOException {
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); FSTCompiler<Long> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
BytesRefBuilder scratchBytes = new BytesRefBuilder(); BytesRefBuilder scratchBytes = new BytesRefBuilder();
IntsRefBuilder scratchInts = new IntsRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder();
for (Map.Entry<String, Double> entry : weights.entrySet()) { for (Map.Entry<String, Double> entry : weights.entrySet()) {

View File

@ -16,6 +16,8 @@
*/ */
package org.apache.lucene.codecs.blockterms; package org.apache.lucene.codecs.blockterms;
import static org.apache.lucene.util.fst.FST.readMetadata;
import java.io.IOException; import java.io.IOException;
import java.util.Collection; import java.util.Collection;
import java.util.Collections; import java.util.Collections;
@ -154,7 +156,7 @@ public class VariableGapTermsIndexReader extends TermsIndexReaderBase {
public FieldIndexData(IndexInput in, FieldInfo fieldInfo, long indexStart) throws IOException { public FieldIndexData(IndexInput in, FieldInfo fieldInfo, long indexStart) throws IOException {
IndexInput clone = in.clone(); IndexInput clone = in.clone();
clone.seek(indexStart); clone.seek(indexStart);
fst = new FST<>(clone, clone, fstOutputs); fst = new FST<>(readMetadata(clone, fstOutputs), clone);
clone.close(); clone.close();
/* /*

View File

@ -238,7 +238,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException { public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
this.fieldInfo = fieldInfo; this.fieldInfo = fieldInfo;
fstOutputs = PositiveIntOutputs.getSingleton(); fstOutputs = PositiveIntOutputs.getSingleton();
fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, fstOutputs); fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, fstOutputs).build();
indexStart = out.getFilePointer(); indexStart = out.getFilePointer();
//// System.out.println("VGW: field=" + fieldInfo.name); //// System.out.println("VGW: field=" + fieldInfo.name);

View File

@ -16,6 +16,8 @@
*/ */
package org.apache.lucene.codecs.blocktreeords; package org.apache.lucene.codecs.blocktreeords;
import static org.apache.lucene.util.fst.FST.readMetadata;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output; import org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output;
import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo;
@ -85,7 +87,7 @@ final class OrdsFieldReader extends Terms {
final IndexInput clone = indexIn.clone(); final IndexInput clone = indexIn.clone();
// System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name); // System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name);
clone.seek(indexStartFP); clone.seek(indexStartFP);
index = new FST<>(clone, clone, OrdsBlockTreeTermsWriter.FST_OUTPUTS); index = new FST<>(readMetadata(clone, OrdsBlockTreeTermsWriter.FST_OUTPUTS), clone);
/* /*
if (true) { if (true) {

View File

@ -194,7 +194,8 @@ public class FSTTermsReader extends FieldsProducer {
this.sumDocFreq = sumDocFreq; this.sumDocFreq = sumDocFreq;
this.docCount = docCount; this.docCount = docCount;
OffHeapFSTStore offHeapFSTStore = new OffHeapFSTStore(); OffHeapFSTStore offHeapFSTStore = new OffHeapFSTStore();
this.dict = new FST<>(in, in, new FSTTermOutputs(fieldInfo), offHeapFSTStore); FSTTermOutputs outputs = new FSTTermOutputs(fieldInfo);
this.dict = new FST<>(FST.readMetadata(in, outputs), in, offHeapFSTStore);
in.skipBytes(offHeapFSTStore.size()); in.skipBytes(offHeapFSTStore.size());
} }

View File

@ -251,12 +251,12 @@ public class FSTTermsWriter extends FieldsConsumer {
private final IntsRefBuilder scratchTerm = new IntsRefBuilder(); private final IntsRefBuilder scratchTerm = new IntsRefBuilder();
private final ByteBuffersDataOutput metaWriter = ByteBuffersDataOutput.newResettableInstance(); private final ByteBuffersDataOutput metaWriter = ByteBuffersDataOutput.newResettableInstance();
TermsWriter(FieldInfo fieldInfo) { TermsWriter(FieldInfo fieldInfo) throws IOException {
this.numTerms = 0; this.numTerms = 0;
this.fieldInfo = fieldInfo; this.fieldInfo = fieldInfo;
postingsWriter.setField(fieldInfo); postingsWriter.setField(fieldInfo);
this.outputs = new FSTTermOutputs(fieldInfo); this.outputs = new FSTTermOutputs(fieldInfo);
this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); this.fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
} }
public void finishTerm(BytesRef text, BlockTermState state) throws IOException { public void finishTerm(BytesRef text, BlockTermState state) throws IOException {

View File

@ -683,7 +683,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
final PairOutputs<Long, Long> outputsInner = new PairOutputs<>(posIntOutputs, posIntOutputs); final PairOutputs<Long, Long> outputsInner = new PairOutputs<>(posIntOutputs, posIntOutputs);
final PairOutputs<PairOutputs.Pair<Long, Long>, PairOutputs.Pair<Long, Long>> outputs = final PairOutputs<PairOutputs.Pair<Long, Long>, PairOutputs.Pair<Long, Long>> outputs =
new PairOutputs<>(outputsOuter, outputsInner); new PairOutputs<>(outputsOuter, outputsInner);
fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
IndexInput in = SimpleTextFieldsReader.this.in.clone(); IndexInput in = SimpleTextFieldsReader.this.in.clone();
in.seek(termsStart); in.seek(termsStart);
final BytesRefBuilder lastTerm = new BytesRefBuilder(); final BytesRefBuilder lastTerm = new BytesRefBuilder();

View File

@ -37,7 +37,6 @@ public class SimpleTextStoredFieldsFormat extends StoredFieldsFormat {
@Override @Override
public StoredFieldsReader fieldsReader( public StoredFieldsReader fieldsReader(
Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException { Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException {
;
return new SimpleTextStoredFieldsReader(directory, si, fn, context); return new SimpleTextStoredFieldsReader(directory, si, fn, context);
} }

View File

@ -89,10 +89,11 @@ public class FSTDictionary implements IndexDictionary {
isFSTOnHeap = true; isFSTOnHeap = true;
} }
PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton(); PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton();
FST.FSTMetadata<Long> metadata = FST.readMetadata(fstDataInput, fstOutputs);
FST<Long> fst = FST<Long> fst =
isFSTOnHeap isFSTOnHeap
? new FST<>(fstDataInput, fstDataInput, fstOutputs) ? new FST<>(metadata, fstDataInput)
: new FST<>(fstDataInput, fstDataInput, fstOutputs, new OffHeapFSTStore()); : new FST<>(metadata, fstDataInput, new OffHeapFSTStore());
return new FSTDictionary(fst); return new FSTDictionary(fst);
} }
@ -171,9 +172,9 @@ public class FSTDictionary implements IndexDictionary {
protected final FSTCompiler<Long> fstCompiler; protected final FSTCompiler<Long> fstCompiler;
protected final IntsRefBuilder scratchInts; protected final IntsRefBuilder scratchInts;
public Builder() { public Builder() throws IOException {
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
scratchInts = new IntsRefBuilder(); scratchInts = new IntsRefBuilder();
} }

View File

@ -100,5 +100,4 @@ public abstract class DelegatingAnalyzerWrapper extends AnalyzerWrapper {
} }
} }
} }
;
} }

View File

@ -70,7 +70,6 @@ public abstract class TermVectorsWriter implements Closeable, Accountable {
/** Called after a doc and all its fields have been added. */ /** Called after a doc and all its fields have been added. */
public void finishDocument() throws IOException {} public void finishDocument() throws IOException {}
;
/** /**
* Called before writing the terms of the field. {@link #startTerm(BytesRef, int)} will be called * Called before writing the terms of the field. {@link #startTerm(BytesRef, int)} will be called
@ -82,7 +81,6 @@ public abstract class TermVectorsWriter implements Closeable, Accountable {
/** Called after a field and all its terms have been added. */ /** Called after a field and all its terms have been added. */
public void finishField() throws IOException {} public void finishField() throws IOException {}
;
/** /**
* Adds a term and its term frequency <code>freq</code>. If this field has positions and/or * Adds a term and its term frequency <code>freq</code>. If this field has positions and/or

View File

@ -91,7 +91,11 @@ public final class FieldReader extends Terms {
// Initialize FST always off-heap. // Initialize FST always off-heap.
final IndexInput clone = indexIn.clone(); final IndexInput clone = indexIn.clone();
clone.seek(indexStartFP); clone.seek(indexStartFP);
index = new FST<>(metaIn, clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore()); index =
new FST<>(
FST.readMetadata(metaIn, ByteSequenceOutputs.getSingleton()),
clone,
new OffHeapFSTStore());
/* /*
if (false) { if (false) {
final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; final String dotFileName = segment + "_" + fieldInfo.name + ".dot";

View File

@ -30,9 +30,7 @@ import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.automaton.ByteRunnable; import org.apache.lucene.util.automaton.ByteRunnable;
import org.apache.lucene.util.automaton.Transition; import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.automaton.TransitionAccessor; import org.apache.lucene.util.automaton.TransitionAccessor;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
/** /**
* This is used to implement efficient {@link Terms#intersect} for block-tree. Note that it cannot * This is used to implement efficient {@link Terms#intersect} for block-tree. Note that it cannot
@ -46,7 +44,6 @@ final class IntersectTermsEnum extends BaseTermsEnum {
// static boolean DEBUG = BlockTreeTermsWriter.DEBUG; // static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
final IndexInput in; final IndexInput in;
static final Outputs<BytesRef> fstOutputs = ByteSequenceOutputs.getSingleton();
IntersectTermsEnumFrame[] stack; IntersectTermsEnumFrame[] stack;
@ -68,6 +65,9 @@ final class IntersectTermsEnum extends BaseTermsEnum {
private BytesRef savedStartTerm; private BytesRef savedStartTerm;
private final SegmentTermsEnum.OutputAccumulator outputAccumulator =
new SegmentTermsEnum.OutputAccumulator();
// TODO: in some cases we can filter by length? eg // TODO: in some cases we can filter by length? eg
// regexp foo*bar must be at least length 6 bytes // regexp foo*bar must be at least length 6 bytes
public IntersectTermsEnum( public IntersectTermsEnum(
@ -114,7 +114,6 @@ final class IntersectTermsEnum extends BaseTermsEnum {
f.prefix = 0; f.prefix = 0;
f.setState(0); f.setState(0);
f.arc = arc; f.arc = arc;
f.outputPrefix = arc.output();
f.load(fr.rootCode); f.load(fr.rootCode);
// for assert: // for assert:
@ -184,7 +183,9 @@ final class IntersectTermsEnum extends BaseTermsEnum {
FST.Arc<BytesRef> arc = currentFrame.arc; FST.Arc<BytesRef> arc = currentFrame.arc;
int idx = currentFrame.prefix; int idx = currentFrame.prefix;
assert currentFrame.suffix > 0; assert currentFrame.suffix > 0;
BytesRef output = currentFrame.outputPrefix;
outputAccumulator.reset();
outputAccumulator.push(arc.output());
while (idx < f.prefix) { while (idx < f.prefix) {
final int target = term.bytes[idx] & 0xff; final int target = term.bytes[idx] & 0xff;
// TODO: we could be more efficient for the next() // TODO: we could be more efficient for the next()
@ -192,14 +193,14 @@ final class IntersectTermsEnum extends BaseTermsEnum {
// passed to findTargetArc // passed to findTargetArc
arc = fr.index.findTargetArc(target, arc, getArc(1 + idx), fstReader); arc = fr.index.findTargetArc(target, arc, getArc(1 + idx), fstReader);
assert arc != null; assert arc != null;
output = fstOutputs.add(output, arc.output()); outputAccumulator.push(arc.output());
idx++; idx++;
} }
f.arc = arc; f.arc = arc;
f.outputPrefix = output;
assert arc.isFinal(); assert arc.isFinal();
f.load(fstOutputs.add(output, arc.nextFinalOutput())); outputAccumulator.push(arc.nextFinalOutput());
f.load(outputAccumulator);
return f; return f;
} }

View File

@ -55,7 +55,6 @@ final class IntersectTermsEnumFrame {
int statsSingletonRunLength = 0; int statsSingletonRunLength = 0;
final ByteArrayDataInput statsReader = new ByteArrayDataInput(); final ByteArrayDataInput statsReader = new ByteArrayDataInput();
byte[] floorData = new byte[32];
final ByteArrayDataInput floorDataReader = new ByteArrayDataInput(); final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();
// Length of prefix shared by all terms in this block // Length of prefix shared by all terms in this block
@ -90,9 +89,6 @@ final class IntersectTermsEnumFrame {
final ByteArrayDataInput bytesReader = new ByteArrayDataInput(); final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
// Cumulative output so far
BytesRef outputPrefix;
int startBytePos; int startBytePos;
int suffix; int suffix;
@ -120,7 +116,7 @@ final class IntersectTermsEnumFrame {
} }
} while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min); } while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min);
load(null); load((Long) null);
} }
public void setState(int state) { public void setState(int state) {
@ -142,12 +138,22 @@ final class IntersectTermsEnumFrame {
} }
void load(BytesRef frameIndexData) throws IOException { void load(BytesRef frameIndexData) throws IOException {
if (frameIndexData != null) { floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length); load(ite.fr.readVLongOutput(floorDataReader));
// Skip first long -- has redundant fp, hasTerms }
// flag, isFloor flag
final long code = ite.fr.readVLongOutput(floorDataReader); void load(SegmentTermsEnum.OutputAccumulator outputAccumulator) throws IOException {
if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) { outputAccumulator.prepareRead();
long code = ite.fr.readVLongOutput(outputAccumulator);
outputAccumulator.setFloorData(floorDataReader);
load(code);
}
void load(Long blockCode) throws IOException {
if (blockCode != null) {
// This block is the first one in a possible sequence of floor blocks corresponding to a
// single seek point from the FST terms index
if ((blockCode & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
// Floor frame // Floor frame
numFollowFloorBlocks = floorDataReader.readVInt(); numFollowFloorBlocks = floorDataReader.readVInt();
nextFloorLabel = floorDataReader.readByte() & 0xff; nextFloorLabel = floorDataReader.readByte() & 0xff;

View File

@ -16,6 +16,8 @@
*/ */
package org.apache.lucene.codecs.lucene90.blocktree; package org.apache.lucene.codecs.lucene90.blocktree;
import static org.apache.lucene.util.fst.FSTCompiler.getOnHeapReaderWriter;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
@ -525,7 +527,7 @@ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer {
// Disable suffixes sharing for block tree index because suffixes are mostly dropped // Disable suffixes sharing for block tree index because suffixes are mostly dropped
// from the FST index and left in the term blocks. // from the FST index and left in the term blocks.
.suffixRAMLimitMB(0d) .suffixRAMLimitMB(0d)
.bytesPageBits(pageBits) .dataOutput(getOnHeapReaderWriter(pageBits))
.build(); .build();
// if (DEBUG) { // if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix); // System.out.println(" compile index for prefix=" + prefix);

View File

@ -24,6 +24,7 @@ import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermState; import org.apache.lucene.index.TermState;
import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
@ -48,7 +49,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
// static boolean DEBUG = BlockTreeTermsWriter.DEBUG; // static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
private final ByteArrayDataInput scratchReader = new ByteArrayDataInput(); private final OutputAccumulator outputAccumulator = new OutputAccumulator();
// What prefix of the current term was present in the index; when we only next() through the // What prefix of the current term was present in the index; when we only next() through the
// index, this stays at 0. It's only set when // index, this stays at 0. It's only set when
@ -232,18 +233,24 @@ final class SegmentTermsEnum extends BaseTermsEnum {
return arcs[ord]; return arcs[ord];
} }
// Pushes a frame we seek'd to
SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length) SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length)
throws IOException { throws IOException {
scratchReader.reset(frameData.bytes, frameData.offset, frameData.length); outputAccumulator.reset();
final long code = fr.readVLongOutput(scratchReader); outputAccumulator.push(frameData);
return pushFrame(arc, length);
}
// Pushes a frame we seek'd to
SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, int length) throws IOException {
outputAccumulator.prepareRead();
final long code = fr.readVLongOutput(outputAccumulator);
final long fpSeek = code >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS; final long fpSeek = code >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord); final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord);
f.hasTerms = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0; f.hasTerms = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0;
f.hasTermsOrig = f.hasTerms; f.hasTermsOrig = f.hasTerms;
f.isFloor = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0; f.isFloor = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0;
if (f.isFloor) { if (f.isFloor) {
f.setFloorData(scratchReader, frameData); f.setFloorData(outputAccumulator);
} }
pushFrame(arc, fpSeek, length); pushFrame(arc, fpSeek, length);
@ -344,9 +351,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {
FST.Arc<BytesRef> arc; FST.Arc<BytesRef> arc;
int targetUpto; int targetUpto;
BytesRef output;
targetBeforeCurrentLength = currentFrame.ord; targetBeforeCurrentLength = currentFrame.ord;
outputAccumulator.reset();
if (currentFrame != staticFrame) { if (currentFrame != staticFrame) {
@ -363,7 +370,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
arc = arcs[0]; arc = arcs[0];
assert arc.isFinal(); assert arc.isFinal();
output = arc.output(); outputAccumulator.push(arc.output());
targetUpto = 0; targetUpto = 0;
SegmentTermsEnumFrame lastFrame = stack[0]; SegmentTermsEnumFrame lastFrame = stack[0];
@ -373,9 +380,6 @@ final class SegmentTermsEnum extends BaseTermsEnum {
int cmp = 0; int cmp = 0;
// TODO: reverse vLong byte order for better FST
// prefix output sharing
// First compare up to valid seek frames: // First compare up to valid seek frames:
while (targetUpto < targetLimit) { while (targetUpto < targetLimit) {
cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF); cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
@ -394,9 +398,8 @@ final class SegmentTermsEnum extends BaseTermsEnum {
+ (char) arc.label() + (char) arc.label()
+ " targetLabel=" + " targetLabel="
+ (char) (target.bytes[target.offset + targetUpto] & 0xFF); + (char) (target.bytes[target.offset + targetUpto] & 0xFF);
if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) { outputAccumulator.push(arc.output());
output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
}
if (arc.isFinal()) { if (arc.isFinal()) {
lastFrame = stack[1 + lastFrame.ord]; lastFrame = stack[1 + lastFrame.ord];
} }
@ -484,15 +487,15 @@ final class SegmentTermsEnum extends BaseTermsEnum {
// System.out.println(" no seek state; push root frame"); // System.out.println(" no seek state; push root frame");
// } // }
output = arc.output(); outputAccumulator.push(arc.output());
currentFrame = staticFrame; currentFrame = staticFrame;
// term.length = 0; // term.length = 0;
targetUpto = 0; targetUpto = 0;
currentFrame = outputAccumulator.push(arc.nextFinalOutput());
pushFrame( currentFrame = pushFrame(arc, 0);
arc, Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), 0); outputAccumulator.pop();
} }
// if (DEBUG) { // if (DEBUG) {
@ -554,9 +557,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
term.setByteAt(targetUpto, (byte) targetLabel); term.setByteAt(targetUpto, (byte) targetLabel);
// Aggregate output as we go: // Aggregate output as we go:
assert arc.output() != null; assert arc.output() != null;
if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) { outputAccumulator.push(arc.output());
output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
}
// if (DEBUG) { // if (DEBUG) {
// System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + // System.out.println(" index: follow label=" + toHex(target.bytes[target.offset +
@ -566,11 +567,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {
if (arc.isFinal()) { if (arc.isFinal()) {
// if (DEBUG) System.out.println(" arc is final!"); // if (DEBUG) System.out.println(" arc is final!");
currentFrame = outputAccumulator.push(arc.nextFinalOutput());
pushFrame( currentFrame = pushFrame(arc, targetUpto);
arc, outputAccumulator.pop();
Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()),
targetUpto);
// if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + // if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" +
// currentFrame.hasTerms); // currentFrame.hasTerms);
} }
@ -630,9 +629,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {
FST.Arc<BytesRef> arc; FST.Arc<BytesRef> arc;
int targetUpto; int targetUpto;
BytesRef output;
targetBeforeCurrentLength = currentFrame.ord; targetBeforeCurrentLength = currentFrame.ord;
outputAccumulator.reset();
if (currentFrame != staticFrame) { if (currentFrame != staticFrame) {
@ -649,7 +648,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
arc = arcs[0]; arc = arcs[0];
assert arc.isFinal(); assert arc.isFinal();
output = arc.output(); outputAccumulator.push(arc.output());
targetUpto = 0; targetUpto = 0;
SegmentTermsEnumFrame lastFrame = stack[0]; SegmentTermsEnumFrame lastFrame = stack[0];
@ -659,9 +658,6 @@ final class SegmentTermsEnum extends BaseTermsEnum {
int cmp = 0; int cmp = 0;
// TODO: we should write our vLong backwards (MSB
// first) to get better sharing from the FST
// First compare up to valid seek frames: // First compare up to valid seek frames:
while (targetUpto < targetLimit) { while (targetUpto < targetLimit) {
cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF); cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
@ -680,14 +676,8 @@ final class SegmentTermsEnum extends BaseTermsEnum {
+ (char) arc.label() + (char) arc.label()
+ " targetLabel=" + " targetLabel="
+ (char) (target.bytes[target.offset + targetUpto] & 0xFF); + (char) (target.bytes[target.offset + targetUpto] & 0xFF);
// TODO: we could save the outputs in local
// byte[][] instead of making new objs ever outputAccumulator.push(arc.output());
// seek; but, often the FST doesn't have any
// shared bytes (but this could change if we
// reverse vLong byte order)
if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
}
if (arc.isFinal()) { if (arc.isFinal()) {
lastFrame = stack[1 + lastFrame.ord]; lastFrame = stack[1 + lastFrame.ord];
} }
@ -769,15 +759,15 @@ final class SegmentTermsEnum extends BaseTermsEnum {
// System.out.println(" no seek state; push root frame"); // System.out.println(" no seek state; push root frame");
// } // }
output = arc.output(); outputAccumulator.push(arc.output());
currentFrame = staticFrame; currentFrame = staticFrame;
// term.length = 0; // term.length = 0;
targetUpto = 0; targetUpto = 0;
currentFrame = outputAccumulator.push(arc.nextFinalOutput());
pushFrame( currentFrame = pushFrame(arc, 0);
arc, Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), 0); outputAccumulator.pop();
} }
// if (DEBUG) { // if (DEBUG) {
@ -839,9 +829,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
arc = nextArc; arc = nextArc;
// Aggregate output as we go: // Aggregate output as we go:
assert arc.output() != null; assert arc.output() != null;
if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) { outputAccumulator.push(arc.output());
output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
}
// if (DEBUG) { // if (DEBUG) {
// System.out.println(" index: follow label=" + (target.bytes[target.offset + // System.out.println(" index: follow label=" + (target.bytes[target.offset +
@ -851,11 +839,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {
if (arc.isFinal()) { if (arc.isFinal()) {
// if (DEBUG) System.out.println(" arc is final!"); // if (DEBUG) System.out.println(" arc is final!");
currentFrame = outputAccumulator.push(arc.nextFinalOutput());
pushFrame( currentFrame = pushFrame(arc, targetUpto);
arc, outputAccumulator.pop();
Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()),
targetUpto);
// if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + // if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" +
// currentFrame.hasTerms); // currentFrame.hasTerms);
} }
@ -1190,4 +1176,68 @@ final class SegmentTermsEnum extends BaseTermsEnum {
public long ord() { public long ord() {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
/**
 * Collects the FST arc outputs gathered while walking the terms index and exposes their
 * concatenation as a {@link DataInput}, so the accumulated bytes can be decoded in place.
 *
 * <p>Expected call order: {@link #reset()}, one or more {@link #push(BytesRef)}, then {@link
 * #prepareRead()} before any read. Outputs equal to {@code NO_OUTPUT} are never stored.
 */
static class OutputAccumulator extends DataInput {

  /** Stored outputs; only the first {@link #num} entries are live. */
  BytesRef[] outputs = new BytesRef[16];

  /** Output currently being read. */
  BytesRef current;

  /** Number of live entries in {@link #outputs}. */
  int num;

  /** Position of {@link #current} within {@link #outputs}. */
  int outputIndex;

  /** Read position relative to the start of {@link #current}. */
  int index;

  /**
   * Whether the most recent {@link #push(BytesRef)} actually stored its argument. Lets {@link
   * #pop()} stay balanced with a push that was skipped.
   */
  private boolean lastPushStored;

  /** Appends {@code output}, unless it is the shared {@code NO_OUTPUT} instance. */
  void push(BytesRef output) {
    lastPushStored = output != Lucene90BlockTreeTermsReader.NO_OUTPUT;
    if (lastPushStored) {
      outputs = ArrayUtil.grow(outputs, num + 1);
      outputs[num++] = output;
    }
  }

  /**
   * Undoes the most recent {@link #push(BytesRef)}. When that push was skipped because its
   * argument was {@code NO_OUTPUT}, nothing was stored, so nothing is removed; decrementing
   * unconditionally would drop an earlier output that later reads still depend on.
   */
  void pop() {
    if (lastPushStored) {
      assert num > 0;
      num--;
    }
  }

  /** Discards all stored outputs. */
  void reset() {
    num = 0;
  }

  /** Rewinds the read position to the first byte of the first stored output. */
  void prepareRead() {
    index = 0;
    outputIndex = 0;
    current = outputs[0];
  }

  /**
   * Set the last arc as the source of the floorData. This won't change the reading position of
   * this {@link OutputAccumulator}
   */
  void setFloorData(ByteArrayDataInput floorData) {
    assert outputIndex == num - 1
        : "floor data should be stored in last arc, get outputIndex: "
            + outputIndex
            + ", num: "
            + num;
    BytesRef output = outputs[outputIndex];
    // Hand over the unread remainder of the output the reader is currently positioned in.
    floorData.reset(output.bytes, output.offset + index, output.length - index);
  }

  @Override
  public byte readByte() throws IOException {
    if (index >= current.length) {
      // Current output exhausted: continue with the next stored one.
      current = outputs[++outputIndex];
      index = 0;
    }
    return current.bytes[current.offset + index++];
  }

  /** Not supported; only {@link #readByte()} is implemented. */
  @Override
  public void readBytes(byte[] b, int offset, int len) throws IOException {
    throw new UnsupportedOperationException();
  }

  /** Not supported; only {@link #readByte()} is implemented. */
  @Override
  public void skipBytes(long numBytes) throws IOException {
    throw new UnsupportedOperationException();
  }
}
} }

View File

@ -55,7 +55,7 @@ final class SegmentTermsEnumFrame {
int statsSingletonRunLength = 0; int statsSingletonRunLength = 0;
final ByteArrayDataInput statsReader = new ByteArrayDataInput(); final ByteArrayDataInput statsReader = new ByteArrayDataInput();
byte[] floorData = new byte[32]; int rewindPos;
final ByteArrayDataInput floorDataReader = new ByteArrayDataInput(); final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();
// Length of prefix shared by all terms in this block // Length of prefix shared by all terms in this block
@ -104,13 +104,9 @@ final class SegmentTermsEnumFrame {
suffixLengthsReader = new ByteArrayDataInput(); suffixLengthsReader = new ByteArrayDataInput();
} }
public void setFloorData(ByteArrayDataInput in, BytesRef source) { public void setFloorData(SegmentTermsEnum.OutputAccumulator outputAccumulator) {
final int numBytes = source.length - (in.getPosition() - source.offset); outputAccumulator.setFloorData(floorDataReader);
if (numBytes > floorData.length) { rewindPos = floorDataReader.getPosition();
floorData = new byte[ArrayUtil.oversize(numBytes, 1)];
}
System.arraycopy(source.bytes, source.offset + in.getPosition(), floorData, 0, numBytes);
floorDataReader.reset(floorData, 0, numBytes);
numFollowFloorBlocks = floorDataReader.readVInt(); numFollowFloorBlocks = floorDataReader.readVInt();
nextFloorLabel = floorDataReader.readByte() & 0xff; nextFloorLabel = floorDataReader.readByte() & 0xff;
// if (DEBUG) { // if (DEBUG) {
@ -247,7 +243,7 @@ final class SegmentTermsEnumFrame {
nextEnt = -1; nextEnt = -1;
hasTerms = hasTermsOrig; hasTerms = hasTermsOrig;
if (isFloor) { if (isFloor) {
floorDataReader.rewind(); floorDataReader.setPosition(rewindPos);
numFollowFloorBlocks = floorDataReader.readVInt(); numFollowFloorBlocks = floorDataReader.readVInt();
assert numFollowFloorBlocks > 0; assert numFollowFloorBlocks > 0;
nextFloorLabel = floorDataReader.readByte() & 0xff; nextFloorLabel = floorDataReader.readByte() & 0xff;

View File

@ -0,0 +1,57 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
import java.io.IOException;
import org.apache.lucene.store.DataInput;
/**
 * Decoder for integers stored in group-varint form: values come in groups of four preceded by a
 * single flag byte, followed by a VInt-encoded tail for any leftover values.
 */
public class GroupVIntReader {

  /**
   * Reads {@code limit} values from {@code in} into {@code docs[0..limit)}.
   *
   * <p>As long as at least four values remain, one flag byte is read whose four 2-bit fields
   * (most significant pair first) give the byte length minus one of each of the next four values.
   * The remaining (fewer than four) values are read with {@link DataInput#readVInt()}.
   *
   * @param in source of the encoded bytes
   * @param docs destination array, must hold at least {@code limit} entries
   * @param limit number of values to decode
   * @throws IOException if reading from {@code in} fails
   */
  public static void readValues(DataInput in, long[] docs, int limit) throws IOException {
    final int lastGroupStart = limit - 4;
    int pos = 0;
    while (pos <= lastGroupStart) {
      final int header = in.readByte() & 0xFF;
      final int len1 = header >> 6;
      final int len2 = (header >> 4) & 0x03;
      final int len3 = (header >> 2) & 0x03;
      final int len4 = header & 0x03;
      docs[pos] = readLong(in, len1);
      docs[pos + 1] = readLong(in, len2);
      docs[pos + 2] = readLong(in, len3);
      docs[pos + 3] = readLong(in, len4);
      pos += 4;
    }
    while (pos < limit) {
      docs[pos] = in.readVInt();
      pos++;
    }
  }

  /**
   * Reads one unsigned value that occupies {@code numBytesMinus1 + 1} bytes; any argument other
   * than 0, 1 or 2 is treated as a four-byte value.
   */
  private static long readLong(DataInput in, int numBytesMinus1) throws IOException {
    if (numBytesMinus1 == 0) {
      return in.readByte() & 0xFFL;
    }
    if (numBytesMinus1 == 1) {
      return in.readShort() & 0xFFFFL;
    }
    if (numBytesMinus1 == 2) {
      // Three bytes: a short for the low 16 bits, then one byte for bits 16..23.
      final long low = in.readShort() & 0xFFFFL;
      final long high = in.readByte() & 0xFFL;
      return low | (high << 16);
    }
    return in.readInt() & 0xFFFFFFFFL;
  }
}

View File

@ -0,0 +1,63 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
import java.io.IOException;
import org.apache.lucene.store.DataOutput;
/**
 * Encode integers using group-varint. It uses VInt to encode tail values that are not enough for a
 * group.
 *
 * <p>Instances keep a scratch buffer and a write offset as fields, which {@link
 * #writeValues(DataOutput, long[], int)} mutates on every call.
 */
public class GroupVIntWriter {

  // the maximum size of one group is 4 integers + 1 byte flag.
  private final byte[] bytes = new byte[17];
  private int byteOffset = 0;

  public GroupVIntWriter() {}

  /**
   * Appends {@code v} to the scratch buffer, least significant byte first, dropping leading zero
   * bytes but always emitting at least one byte.
   *
   * @return the number of bytes written (1 to 4)
   */
  private int encodeValue(int v) {
    int lastOff = byteOffset;
    do {
      bytes[byteOffset++] = (byte) (v & 0xFF);
      v >>>= 8;
    } while (v != 0);
    return byteOffset - lastOff;
  }

  /**
   * Writes {@code values[0..limit)} to {@code out}. Values are written in groups of four, each
   * group preceded by a flag byte holding four 2-bit (byte length - 1) fields; the remaining
   * (fewer than four) values are written as VInts.
   *
   * @param out destination of the encoded bytes
   * @param values values to encode; every value must fit in an {@code int}
   * @param limit number of values to encode
   * @throws IOException if writing to {@code out} fails
   * @throws ArithmeticException if a value does not fit in an {@code int}; previously such values
   *     were silently truncated, producing data that decodes to different numbers
   */
  public void writeValues(DataOutput out, long[] values, int limit) throws IOException {
    int off = 0;

    // encode each group
    while ((limit - off) >= 4) {
      byte flag = 0;
      // slot 0 is reserved for the flag byte, which is only known once all four are encoded
      byteOffset = 1;
      flag |= (encodeValue(Math.toIntExact(values[off++])) - 1) << 6;
      flag |= (encodeValue(Math.toIntExact(values[off++])) - 1) << 4;
      flag |= (encodeValue(Math.toIntExact(values[off++])) - 1) << 2;
      flag |= (encodeValue(Math.toIntExact(values[off++])) - 1);
      bytes[0] = flag;
      out.writeBytes(bytes, byteOffset);
    }

    // tail vints
    for (; off < limit; off++) {
      out.writeVInt(Math.toIntExact(values[off]));
    }
  }
}

View File

@ -31,6 +31,7 @@ import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.KnnVectorsWriter; import org.apache.lucene.codecs.KnnVectorsWriter;
import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.search.TaskExecutor;
import org.apache.lucene.util.hnsw.HnswGraph; import org.apache.lucene.util.hnsw.HnswGraph;
/** /**
@ -60,7 +61,7 @@ public final class Lucene99HnswScalarQuantizedVectorsFormat extends KnnVectorsFo
private final FlatVectorsFormat flatVectorsFormat; private final FlatVectorsFormat flatVectorsFormat;
private final int numMergeWorkers; private final int numMergeWorkers;
private final ExecutorService mergeExec; private final TaskExecutor mergeExec;
/** Constructs a format using default graph construction parameters */ /** Constructs a format using default graph construction parameters */
public Lucene99HnswScalarQuantizedVectorsFormat() { public Lucene99HnswScalarQuantizedVectorsFormat() {
@ -84,8 +85,8 @@ public final class Lucene99HnswScalarQuantizedVectorsFormat extends KnnVectorsFo
* @param beamWidth the size of the queue maintained during graph construction. * @param beamWidth the size of the queue maintained during graph construction.
* @param numMergeWorkers number of workers (threads) that will be used when doing merge. If * @param numMergeWorkers number of workers (threads) that will be used when doing merge. If
* larger than 1, a non-null {@link ExecutorService} must be passed as mergeExec * larger than 1, a non-null {@link ExecutorService} must be passed as mergeExec
* @param configuredQuantile the quantile for scalar quantizing the vectors, when `null` it is * @param confidenceInterval the confidenceInterval for scalar quantizing the vectors, when `null`
* calculated based on the vector field dimensions. * it is calculated based on the vector field dimensions.
* @param mergeExec the {@link ExecutorService} that will be used by ALL vector writers that are * @param mergeExec the {@link ExecutorService} that will be used by ALL vector writers that are
* generated by this format to do the merge * generated by this format to do the merge
*/ */
@ -93,7 +94,7 @@ public final class Lucene99HnswScalarQuantizedVectorsFormat extends KnnVectorsFo
int maxConn, int maxConn,
int beamWidth, int beamWidth,
int numMergeWorkers, int numMergeWorkers,
Float configuredQuantile, Float confidenceInterval,
ExecutorService mergeExec) { ExecutorService mergeExec) {
super("Lucene99HnswScalarQuantizedVectorsFormat"); super("Lucene99HnswScalarQuantizedVectorsFormat");
if (maxConn <= 0 || maxConn > MAXIMUM_MAX_CONN) { if (maxConn <= 0 || maxConn > MAXIMUM_MAX_CONN) {
@ -121,8 +122,12 @@ public final class Lucene99HnswScalarQuantizedVectorsFormat extends KnnVectorsFo
"No executor service is needed as we'll use single thread to merge"); "No executor service is needed as we'll use single thread to merge");
} }
this.numMergeWorkers = numMergeWorkers; this.numMergeWorkers = numMergeWorkers;
this.mergeExec = mergeExec; if (mergeExec != null) {
this.flatVectorsFormat = new Lucene99ScalarQuantizedVectorsFormat(configuredQuantile); this.mergeExec = new TaskExecutor(mergeExec);
} else {
this.mergeExec = null;
}
this.flatVectorsFormat = new Lucene99ScalarQuantizedVectorsFormat(confidenceInterval);
} }
@Override @Override

View File

@ -27,6 +27,7 @@ import org.apache.lucene.codecs.lucene90.IndexedDISI;
import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.TaskExecutor;
import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.hnsw.HnswGraph; import org.apache.lucene.util.hnsw.HnswGraph;
@ -137,7 +138,7 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat {
private static final FlatVectorsFormat flatVectorsFormat = new Lucene99FlatVectorsFormat(); private static final FlatVectorsFormat flatVectorsFormat = new Lucene99FlatVectorsFormat();
private final int numMergeWorkers; private final int numMergeWorkers;
private final ExecutorService mergeExec; private final TaskExecutor mergeExec;
/** Constructs a format using default graph construction parameters */ /** Constructs a format using default graph construction parameters */
public Lucene99HnswVectorsFormat() { public Lucene99HnswVectorsFormat() {
@ -192,7 +193,11 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat {
"No executor service is needed as we'll use single thread to merge"); "No executor service is needed as we'll use single thread to merge");
} }
this.numMergeWorkers = numMergeWorkers; this.numMergeWorkers = numMergeWorkers;
this.mergeExec = mergeExec; if (mergeExec != null) {
this.mergeExec = new TaskExecutor(mergeExec);
} else {
this.mergeExec = null;
}
} }
@Override @Override

View File

@ -92,18 +92,8 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
} catch (Throwable exception) { } catch (Throwable exception) {
priorE = exception; priorE = exception;
} finally { } finally {
try { CodecUtil.checkFooter(meta, priorE);
CodecUtil.checkFooter(meta, priorE);
success = true;
} finally {
if (success == false) {
IOUtils.close(flatVectorsReader);
}
}
} }
}
success = false;
try {
vectorIndex = vectorIndex =
openDataInput( openDataInput(
state, state,
@ -237,12 +227,22 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
|| fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) { || fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
return; return;
} }
RandomVectorScorer scorer = flatVectorsReader.getRandomVectorScorer(field, target); final RandomVectorScorer scorer = flatVectorsReader.getRandomVectorScorer(field, target);
HnswGraphSearcher.search( final KnnCollector collector =
scorer, new OrdinalTranslatedKnnCollector(knnCollector, scorer::ordToDoc);
new OrdinalTranslatedKnnCollector(knnCollector, scorer::ordToDoc), final Bits acceptedOrds = scorer.getAcceptOrds(acceptDocs);
getGraph(fieldEntry), if (knnCollector.k() < scorer.maxOrd()) {
scorer.getAcceptOrds(acceptDocs)); HnswGraphSearcher.search(scorer, collector, getGraph(fieldEntry), acceptedOrds);
} else {
// if k is at least the number of vectors, we can just iterate over all vectors
// and collect them
for (int i = 0; i < scorer.maxOrd(); i++) {
if (acceptedOrds == null || acceptedOrds.get(i)) {
knnCollector.incVisitedCount(1);
knnCollector.collect(scorer.ordToDoc(i), scorer.score(i));
}
}
}
} }
@Override @Override
@ -255,12 +255,22 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
|| fieldEntry.vectorEncoding != VectorEncoding.BYTE) { || fieldEntry.vectorEncoding != VectorEncoding.BYTE) {
return; return;
} }
RandomVectorScorer scorer = flatVectorsReader.getRandomVectorScorer(field, target); final RandomVectorScorer scorer = flatVectorsReader.getRandomVectorScorer(field, target);
HnswGraphSearcher.search( final KnnCollector collector =
scorer, new OrdinalTranslatedKnnCollector(knnCollector, scorer::ordToDoc);
new OrdinalTranslatedKnnCollector(knnCollector, scorer::ordToDoc), final Bits acceptedOrds = scorer.getAcceptOrds(acceptDocs);
getGraph(fieldEntry), if (knnCollector.k() < scorer.maxOrd()) {
scorer.getAcceptOrds(acceptDocs)); HnswGraphSearcher.search(scorer, collector, getGraph(fieldEntry), acceptedOrds);
} else {
// if k is at least the number of vectors, we can just iterate over all vectors
// and collect them
for (int i = 0; i < scorer.maxOrd(); i++) {
if (acceptedOrds == null || acceptedOrds.get(i)) {
knnCollector.incVisitedCount(1);
knnCollector.collect(scorer.ordToDoc(i), scorer.score(i));
}
}
}
} }
@Override @Override

View File

@ -23,7 +23,6 @@ import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.concurrent.ExecutorService;
import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FlatVectorsWriter; import org.apache.lucene.codecs.FlatVectorsWriter;
import org.apache.lucene.codecs.KnnFieldVectorsWriter; import org.apache.lucene.codecs.KnnFieldVectorsWriter;
@ -35,6 +34,7 @@ import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Sorter; import org.apache.lucene.index.Sorter;
import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.TaskExecutor;
import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.InfoStream; import org.apache.lucene.util.InfoStream;
@ -67,7 +67,7 @@ public final class Lucene99HnswVectorsWriter extends KnnVectorsWriter {
private final int beamWidth; private final int beamWidth;
private final FlatVectorsWriter flatVectorWriter; private final FlatVectorsWriter flatVectorWriter;
private final int numMergeWorkers; private final int numMergeWorkers;
private final ExecutorService mergeExec; private final TaskExecutor mergeExec;
private final List<FieldWriter<?>> fields = new ArrayList<>(); private final List<FieldWriter<?>> fields = new ArrayList<>();
private boolean finished; private boolean finished;
@ -78,7 +78,7 @@ public final class Lucene99HnswVectorsWriter extends KnnVectorsWriter {
int beamWidth, int beamWidth,
FlatVectorsWriter flatVectorWriter, FlatVectorsWriter flatVectorWriter,
int numMergeWorkers, int numMergeWorkers,
ExecutorService mergeExec) TaskExecutor mergeExec)
throws IOException { throws IOException {
this.M = M; this.M = M;
this.flatVectorWriter = flatVectorWriter; this.flatVectorWriter = flatVectorWriter;

View File

@ -158,8 +158,8 @@ import org.apache.lucene.util.packed.PackedInts;
* <dd><b>Frequencies and Skip Data</b> * <dd><b>Frequencies and Skip Data</b>
* <p>The .doc file contains the lists of documents which contain each term, along with the * <p>The .doc file contains the lists of documents which contain each term, along with the
* frequency of the term in that document (except when frequencies are omitted: {@link * frequency of the term in that document (except when frequencies are omitted: {@link
* IndexOptions#DOCS}). It also saves skip data to the beginning of each packed or VInt block, * IndexOptions#DOCS}). Skip data is saved at the end of each term's postings. The skip data
* when the length of document list is larger than packed block size. * is saved once for the entire postings list.
* <ul> * <ul>
* <li>docFile(.doc) --&gt; Header, &lt;TermFreqs, SkipData?&gt;<sup>TermCount</sup>, Footer * <li>docFile(.doc) --&gt; Header, &lt;TermFreqs, SkipData?&gt;<sup>TermCount</sup>, Footer
* <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader} * <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}
@ -174,7 +174,8 @@ import org.apache.lucene.util.packed.PackedInts;
* <li>SkipDatum --&gt; DocSkip, DocFPSkip, &lt;PosFPSkip, PosBlockOffset, PayLength?, * <li>SkipDatum --&gt; DocSkip, DocFPSkip, &lt;PosFPSkip, PosBlockOffset, PayLength?,
* PayFPSkip?&gt;?, ImpactLength, &lt;CompetitiveFreqDelta, CompetitiveNormDelta?&gt; * PayFPSkip?&gt;?, ImpactLength, &lt;CompetitiveFreqDelta, CompetitiveNormDelta?&gt;
* <sup>ImpactCount</sup>, SkipChildLevelPointer? * <sup>ImpactCount</sup>, SkipChildLevelPointer?
* <li>PackedDocDeltaBlock, PackedFreqBlock --&gt; {@link PackedInts PackedInts} * <li>PackedFreqBlock --&gt; {@link PackedInts PackedInts}, uses patching
* <li>PackedDocDeltaBlock --&gt; {@link PackedInts PackedInts}, does not use patching
* <li>DocDelta, Freq, DocSkip, DocFPSkip, PosFPSkip, PosBlockOffset, PayByteUpto, * <li>DocDelta, Freq, DocSkip, DocFPSkip, PosFPSkip, PosBlockOffset, PayByteUpto,
* PayFPSkip, ImpactLength, CompetitiveFreqDelta --&gt; {@link DataOutput#writeVInt * PayFPSkip, ImpactLength, CompetitiveFreqDelta --&gt; {@link DataOutput#writeVInt
* VInt} * VInt}

View File

@ -142,21 +142,25 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
/** Read values that have been written using variable-length encoding instead of bit-packing. */ /** Read values that have been written using variable-length encoding instead of bit-packing. */
static void readVIntBlock( static void readVIntBlock(
IndexInput docIn, long[] docBuffer, long[] freqBuffer, int num, boolean indexHasFreq) IndexInput docIn,
long[] docBuffer,
long[] freqBuffer,
int num,
boolean indexHasFreq,
boolean decodeFreq)
throws IOException { throws IOException {
if (indexHasFreq) { GroupVIntReader.readValues(docIn, docBuffer, num);
for (int i = 0; i < num; i++) { if (indexHasFreq && decodeFreq) {
final int code = docIn.readVInt(); for (int i = 0; i < num; ++i) {
docBuffer[i] = code >>> 1; freqBuffer[i] = docBuffer[i] & 0x01;
if ((code & 1) != 0) { docBuffer[i] >>= 1;
freqBuffer[i] = 1; if (freqBuffer[i] == 0) {
} else {
freqBuffer[i] = docIn.readVInt(); freqBuffer[i] = docIn.readVInt();
} }
} }
} else { } else if (indexHasFreq) {
for (int i = 0; i < num; i++) { for (int i = 0; i < num; ++i) {
docBuffer[i] = docIn.readVInt(); docBuffer[i] >>= 1;
} }
} }
} }
@ -471,7 +475,7 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
blockUpto++; blockUpto++;
} else { } else {
// Read vInts: // Read vInts:
readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq); readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, needsFreq);
prefixSum(docBuffer, left, accum); prefixSum(docBuffer, left, accum);
docBuffer[left] = NO_MORE_DOCS; docBuffer[left] = NO_MORE_DOCS;
blockUpto += left; blockUpto += left;
@ -764,7 +768,7 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
docBuffer[1] = NO_MORE_DOCS; docBuffer[1] = NO_MORE_DOCS;
blockUpto++; blockUpto++;
} else { } else {
readVIntBlock(docIn, docBuffer, freqBuffer, left, true); readVIntBlock(docIn, docBuffer, freqBuffer, left, true, true);
prefixSum(docBuffer, left, accum); prefixSum(docBuffer, left, accum);
docBuffer[left] = NO_MORE_DOCS; docBuffer[left] = NO_MORE_DOCS;
blockUpto += left; blockUpto += left;
@ -1073,8 +1077,6 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
private int nextSkipDoc = -1; private int nextSkipDoc = -1;
private long seekTo = -1;
// as we read freqBuffer lazily, isFreqsRead shows if freqBuffer are read for the current block // as we read freqBuffer lazily, isFreqsRead shows if freqBuffer are read for the current block
// always true when we don't have freqBuffer (indexHasFreq=false) or don't need freqBuffer // always true when we don't have freqBuffer (indexHasFreq=false) or don't need freqBuffer
// (needsFreq=false) // (needsFreq=false)
@ -1153,7 +1155,7 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
} }
blockUpto += BLOCK_SIZE; blockUpto += BLOCK_SIZE;
} else { } else {
readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreqs); readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreqs, true);
prefixSum(docBuffer, left, accum); prefixSum(docBuffer, left, accum);
docBuffer[left] = NO_MORE_DOCS; docBuffer[left] = NO_MORE_DOCS;
blockUpto += left; blockUpto += left;
@ -1178,7 +1180,8 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
// Force to read next block // Force to read next block
docBufferUpto = BLOCK_SIZE; docBufferUpto = BLOCK_SIZE;
accum = skipper.getDoc(); accum = skipper.getDoc();
seekTo = skipper.getDocPointer(); // delay the seek docIn.seek(skipper.getDocPointer());
isFreqsRead = true;
} }
// next time we call advance, this is used to // next time we call advance, this is used to
// foresee whether skipper is necessary. // foresee whether skipper is necessary.
@ -1198,11 +1201,6 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
@Override @Override
public int nextDoc() throws IOException { public int nextDoc() throws IOException {
if (docBufferUpto == BLOCK_SIZE) { if (docBufferUpto == BLOCK_SIZE) {
if (seekTo >= 0) {
docIn.seek(seekTo);
isFreqsRead = true; // reset isFreqsRead
seekTo = -1;
}
refillDocs(); refillDocs();
} }
return this.doc = (int) docBuffer[docBufferUpto++]; return this.doc = (int) docBuffer[docBufferUpto++];
@ -1214,11 +1212,6 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
advanceShallow(target); advanceShallow(target);
} }
if (docBufferUpto == BLOCK_SIZE) { if (docBufferUpto == BLOCK_SIZE) {
if (seekTo >= 0) {
docIn.seek(seekTo);
isFreqsRead = true; // reset isFreqsRead
seekTo = -1;
}
refillDocs(); refillDocs();
} }
@ -1307,8 +1300,6 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
private int nextSkipDoc = -1; private int nextSkipDoc = -1;
private long seekTo = -1;
public BlockImpactsPostingsEnum(FieldInfo fieldInfo, IntBlockTermState termState) public BlockImpactsPostingsEnum(FieldInfo fieldInfo, IntBlockTermState termState)
throws IOException { throws IOException {
indexHasOffsets = indexHasOffsets =
@ -1372,7 +1363,7 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
forDeltaUtil.decodeAndPrefixSum(docIn, accum, docBuffer); forDeltaUtil.decodeAndPrefixSum(docIn, accum, docBuffer);
pforUtil.decode(docIn, freqBuffer); pforUtil.decode(docIn, freqBuffer);
} else { } else {
readVIntBlock(docIn, docBuffer, freqBuffer, left, true); readVIntBlock(docIn, docBuffer, freqBuffer, left, true, true);
prefixSum(docBuffer, left, accum); prefixSum(docBuffer, left, accum);
docBuffer[left] = NO_MORE_DOCS; docBuffer[left] = NO_MORE_DOCS;
} }
@ -1426,7 +1417,7 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
accum = skipper.getDoc(); accum = skipper.getDoc();
posPendingFP = skipper.getPosPointer(); posPendingFP = skipper.getPosPointer();
posPendingCount = skipper.getPosBufferUpto(); posPendingCount = skipper.getPosBufferUpto();
seekTo = skipper.getDocPointer(); // delay the seek docIn.seek(skipper.getDocPointer());
} }
// next time we call advance, this is used to // next time we call advance, this is used to
// foresee whether skipper is necessary. // foresee whether skipper is necessary.
@ -1452,10 +1443,6 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
advanceShallow(target); advanceShallow(target);
} }
if (docBufferUpto == BLOCK_SIZE) { if (docBufferUpto == BLOCK_SIZE) {
if (seekTo >= 0) {
docIn.seek(seekTo);
seekTo = -1;
}
refillDocs(); refillDocs();
} }
@ -1766,7 +1753,7 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
false; // freq block will be loaded lazily when necessary, we don't load it here false; // freq block will be loaded lazily when necessary, we don't load it here
} }
} else { } else {
readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq); readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, true);
prefixSum(docBuffer, left, accum); prefixSum(docBuffer, left, accum);
docBuffer[left] = NO_MORE_DOCS; docBuffer[left] = NO_MORE_DOCS;
} }

View File

@ -92,6 +92,7 @@ public final class Lucene99PostingsWriter extends PushPostingsWriterBase {
private final PForUtil pforUtil; private final PForUtil pforUtil;
private final ForDeltaUtil forDeltaUtil; private final ForDeltaUtil forDeltaUtil;
private final Lucene99SkipWriter skipWriter; private final Lucene99SkipWriter skipWriter;
private final GroupVIntWriter docGroupVIntWriter;
private boolean fieldHasNorms; private boolean fieldHasNorms;
private NumericDocValues norms; private NumericDocValues norms;
@ -172,6 +173,7 @@ public final class Lucene99PostingsWriter extends PushPostingsWriterBase {
skipWriter = skipWriter =
new Lucene99SkipWriter( new Lucene99SkipWriter(
MAX_SKIP_LEVELS, BLOCK_SIZE, state.segmentInfo.maxDoc(), docOut, posOut, payOut); MAX_SKIP_LEVELS, BLOCK_SIZE, state.segmentInfo.maxDoc(), docOut, posOut, payOut);
docGroupVIntWriter = new GroupVIntWriter();
} }
@Override @Override
@ -370,17 +372,19 @@ public final class Lucene99PostingsWriter extends PushPostingsWriterBase {
singletonDocID = (int) docDeltaBuffer[0]; singletonDocID = (int) docDeltaBuffer[0];
} else { } else {
singletonDocID = -1; singletonDocID = -1;
// vInt encode the remaining doc deltas and freqs: // Group vInt encode the remaining doc deltas and freqs:
for (int i = 0; i < docBufferUpto; i++) { if (writeFreqs) {
final int docDelta = (int) docDeltaBuffer[i]; for (int i = 0; i < docBufferUpto; i++) {
final int freq = (int) freqBuffer[i]; docDeltaBuffer[i] = (docDeltaBuffer[i] << 1) | (freqBuffer[i] == 1 ? 1 : 0);
if (!writeFreqs) { }
docOut.writeVInt(docDelta); }
} else if (freq == 1) { docGroupVIntWriter.writeValues(docOut, docDeltaBuffer, docBufferUpto);
docOut.writeVInt((docDelta << 1) | 1); if (writeFreqs) {
} else { for (int i = 0; i < docBufferUpto; i++) {
docOut.writeVInt(docDelta << 1); final int freq = (int) freqBuffer[i];
docOut.writeVInt(freq); if (freq != 1) {
docOut.writeVInt(freq);
}
} }
} }
} }

View File

@ -43,17 +43,17 @@ public final class Lucene99ScalarQuantizedVectorsFormat extends FlatVectorsForma
private static final FlatVectorsFormat rawVectorFormat = new Lucene99FlatVectorsFormat(); private static final FlatVectorsFormat rawVectorFormat = new Lucene99FlatVectorsFormat();
/** The minimum quantile */ /** The minimum confidence interval */
private static final float MINIMUM_QUANTILE = 0.9f; private static final float MINIMUM_CONFIDENCE_INTERVAL = 0.9f;
/** The maximum quantile */ /** The maximum confidence interval */
private static final float MAXIMUM_QUANTILE = 1f; private static final float MAXIMUM_CONFIDENCE_INTERVAL = 1f;
/** /**
* Controls the quantile used to scalar quantize the vectors; the default quantile is calculated as * Controls the confidence interval used to scalar quantize the vectors; the default value is
* `1-1/(vector_dimensions + 1)` * calculated as `1-1/(vector_dimensions + 1)`
*/ */
final Float quantile; final Float confidenceInterval;
/** Constructs a format using default graph construction parameters */ /** Constructs a format using default graph construction parameters */
public Lucene99ScalarQuantizedVectorsFormat() { public Lucene99ScalarQuantizedVectorsFormat() {
@ -63,24 +63,26 @@ public final class Lucene99ScalarQuantizedVectorsFormat extends FlatVectorsForma
/** /**
* Constructs a format using the given graph construction parameters. * Constructs a format using the given graph construction parameters.
* *
* @param quantile the quantile for scalar quantizing the vectors, when `null` it is calculated * @param confidenceInterval the confidenceInterval for scalar quantizing the vectors, when `null`
* based on the vector field dimensions. * it is calculated based on the vector field dimensions.
*/ */
public Lucene99ScalarQuantizedVectorsFormat(Float quantile) { public Lucene99ScalarQuantizedVectorsFormat(Float confidenceInterval) {
if (quantile != null && (quantile < MINIMUM_QUANTILE || quantile > MAXIMUM_QUANTILE)) { if (confidenceInterval != null
&& (confidenceInterval < MINIMUM_CONFIDENCE_INTERVAL
|| confidenceInterval > MAXIMUM_CONFIDENCE_INTERVAL)) {
throw new IllegalArgumentException( throw new IllegalArgumentException(
"quantile must be between " "confidenceInterval must be between "
+ MINIMUM_QUANTILE + MINIMUM_CONFIDENCE_INTERVAL
+ " and " + " and "
+ MAXIMUM_QUANTILE + MAXIMUM_CONFIDENCE_INTERVAL
+ "; quantile=" + "; confidenceInterval="
+ quantile); + confidenceInterval);
} }
this.quantile = quantile; this.confidenceInterval = confidenceInterval;
} }
static float calculateDefaultQuantile(int vectorDimension) { static float calculateDefaultConfidenceInterval(int vectorDimension) {
return Math.max(MINIMUM_QUANTILE, 1f - (1f / (vectorDimension + 1))); return Math.max(MINIMUM_CONFIDENCE_INTERVAL, 1f - (1f / (vectorDimension + 1)));
} }
@Override @Override
@ -88,8 +90,8 @@ public final class Lucene99ScalarQuantizedVectorsFormat extends FlatVectorsForma
return NAME return NAME
+ "(name=" + "(name="
+ NAME + NAME
+ ", quantile=" + ", confidenceInterval="
+ quantile + confidenceInterval
+ ", rawVectorFormat=" + ", rawVectorFormat="
+ rawVectorFormat + rawVectorFormat
+ ")"; + ")";
@ -98,7 +100,7 @@ public final class Lucene99ScalarQuantizedVectorsFormat extends FlatVectorsForma
@Override @Override
public FlatVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException { public FlatVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException {
return new Lucene99ScalarQuantizedVectorsWriter( return new Lucene99ScalarQuantizedVectorsWriter(
state, quantile, rawVectorFormat.fieldsWriter(state)); state, confidenceInterval, rawVectorFormat.fieldsWriter(state));
} }
@Override @Override

View File

@ -58,6 +58,7 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
Lucene99ScalarQuantizedVectorsReader(SegmentReadState state, FlatVectorsReader rawVectorsReader) Lucene99ScalarQuantizedVectorsReader(SegmentReadState state, FlatVectorsReader rawVectorsReader)
throws IOException { throws IOException {
this.rawVectorsReader = rawVectorsReader;
int versionMeta = -1; int versionMeta = -1;
String metaFileName = String metaFileName =
IndexFileNames.segmentFileName( IndexFileNames.segmentFileName(
@ -80,19 +81,8 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
} catch (Throwable exception) { } catch (Throwable exception) {
priorE = exception; priorE = exception;
} finally { } finally {
try { CodecUtil.checkFooter(meta, priorE);
CodecUtil.checkFooter(meta, priorE);
success = true;
} finally {
if (success == false) {
IOUtils.close(rawVectorsReader);
}
}
} }
}
success = false;
this.rawVectorsReader = rawVectorsReader;
try {
quantizedVectorData = quantizedVectorData =
openDataInput( openDataInput(
state, state,
@ -313,10 +303,10 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
dimension = input.readVInt(); dimension = input.readVInt();
size = input.readInt(); size = input.readInt();
if (size > 0) { if (size > 0) {
float configuredQuantile = Float.intBitsToFloat(input.readInt()); float confidenceInterval = Float.intBitsToFloat(input.readInt());
float minQuantile = Float.intBitsToFloat(input.readInt()); float minQuantile = Float.intBitsToFloat(input.readInt());
float maxQuantile = Float.intBitsToFloat(input.readInt()); float maxQuantile = Float.intBitsToFloat(input.readInt());
scalarQuantizer = new ScalarQuantizer(minQuantile, maxQuantile, configuredQuantile); scalarQuantizer = new ScalarQuantizer(minQuantile, maxQuantile, confidenceInterval);
} else { } else {
scalarQuantizer = null; scalarQuantizer = null;
} }

View File

@ -19,7 +19,7 @@ package org.apache.lucene.codecs.lucene99;
import static org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat.DIRECT_MONOTONIC_BLOCK_SHIFT; import static org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.QUANTIZED_VECTOR_COMPONENT; import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.QUANTIZED_VECTOR_COMPONENT;
import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.calculateDefaultQuantile; import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.calculateDefaultConfidenceInterval;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOfInstance; import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOfInstance;
@ -91,14 +91,14 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
private final List<FieldWriter> fields = new ArrayList<>(); private final List<FieldWriter> fields = new ArrayList<>();
private final IndexOutput meta, quantizedVectorData; private final IndexOutput meta, quantizedVectorData;
private final Float quantile; private final Float confidenceInterval;
private final FlatVectorsWriter rawVectorDelegate; private final FlatVectorsWriter rawVectorDelegate;
private boolean finished; private boolean finished;
Lucene99ScalarQuantizedVectorsWriter( Lucene99ScalarQuantizedVectorsWriter(
SegmentWriteState state, Float quantile, FlatVectorsWriter rawVectorDelegate) SegmentWriteState state, Float confidenceInterval, FlatVectorsWriter rawVectorDelegate)
throws IOException { throws IOException {
this.quantile = quantile; this.confidenceInterval = confidenceInterval;
segmentWriteState = state; segmentWriteState = state;
String metaFileName = String metaFileName =
IndexFileNames.segmentFileName( IndexFileNames.segmentFileName(
@ -142,12 +142,12 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
public FlatFieldVectorsWriter<?> addField( public FlatFieldVectorsWriter<?> addField(
FieldInfo fieldInfo, KnnFieldVectorsWriter<?> indexWriter) throws IOException { FieldInfo fieldInfo, KnnFieldVectorsWriter<?> indexWriter) throws IOException {
if (fieldInfo.getVectorEncoding().equals(VectorEncoding.FLOAT32)) { if (fieldInfo.getVectorEncoding().equals(VectorEncoding.FLOAT32)) {
float quantile = float confidenceInterval =
this.quantile == null this.confidenceInterval == null
? calculateDefaultQuantile(fieldInfo.getVectorDimension()) ? calculateDefaultConfidenceInterval(fieldInfo.getVectorDimension())
: this.quantile; : this.confidenceInterval;
FieldWriter quantizedWriter = FieldWriter quantizedWriter =
new FieldWriter(quantile, fieldInfo, segmentWriteState.infoStream, indexWriter); new FieldWriter(confidenceInterval, fieldInfo, segmentWriteState.infoStream, indexWriter);
fields.add(quantizedWriter); fields.add(quantizedWriter);
indexWriter = quantizedWriter; indexWriter = quantizedWriter;
} }
@ -169,16 +169,16 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
DocsWithFieldSet docsWithField = DocsWithFieldSet docsWithField =
writeQuantizedVectorData(quantizedVectorData, byteVectorValues); writeQuantizedVectorData(quantizedVectorData, byteVectorValues);
long vectorDataLength = quantizedVectorData.getFilePointer() - vectorDataOffset; long vectorDataLength = quantizedVectorData.getFilePointer() - vectorDataOffset;
float quantile = float confidenceInterval =
this.quantile == null this.confidenceInterval == null
? calculateDefaultQuantile(fieldInfo.getVectorDimension()) ? calculateDefaultConfidenceInterval(fieldInfo.getVectorDimension())
: this.quantile; : this.confidenceInterval;
writeMeta( writeMeta(
fieldInfo, fieldInfo,
segmentWriteState.segmentInfo.maxDoc(), segmentWriteState.segmentInfo.maxDoc(),
vectorDataOffset, vectorDataOffset,
vectorDataLength, vectorDataLength,
quantile, confidenceInterval,
mergedQuantizationState.getLowerQuantile(), mergedQuantizationState.getLowerQuantile(),
mergedQuantizationState.getUpperQuantile(), mergedQuantizationState.getUpperQuantile(),
docsWithField); docsWithField);
@ -251,7 +251,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
maxDoc, maxDoc,
vectorDataOffset, vectorDataOffset,
vectorDataLength, vectorDataLength,
quantile, confidenceInterval,
fieldData.minQuantile, fieldData.minQuantile,
fieldData.maxQuantile, fieldData.maxQuantile,
fieldData.docsWithField); fieldData.docsWithField);
@ -262,7 +262,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
int maxDoc, int maxDoc,
long vectorDataOffset, long vectorDataOffset,
long vectorDataLength, long vectorDataLength,
Float configuredQuantizationQuantile, Float confidenceInterval,
Float lowerQuantile, Float lowerQuantile,
Float upperQuantile, Float upperQuantile,
DocsWithFieldSet docsWithField) DocsWithFieldSet docsWithField)
@ -279,9 +279,9 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
assert Float.isFinite(lowerQuantile) && Float.isFinite(upperQuantile); assert Float.isFinite(lowerQuantile) && Float.isFinite(upperQuantile);
meta.writeInt( meta.writeInt(
Float.floatToIntBits( Float.floatToIntBits(
configuredQuantizationQuantile != null confidenceInterval != null
? configuredQuantizationQuantile ? confidenceInterval
: calculateDefaultQuantile(field.getVectorDimension()))); : calculateDefaultConfidenceInterval(field.getVectorDimension())));
meta.writeInt(Float.floatToIntBits(lowerQuantile)); meta.writeInt(Float.floatToIntBits(lowerQuantile));
meta.writeInt(Float.floatToIntBits(upperQuantile)); meta.writeInt(Float.floatToIntBits(upperQuantile));
} }
@ -344,7 +344,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
maxDoc, maxDoc,
vectorDataOffset, vectorDataOffset,
quantizedVectorLength, quantizedVectorLength,
quantile, confidenceInterval,
fieldData.minQuantile, fieldData.minQuantile,
fieldData.maxQuantile, fieldData.maxQuantile,
newDocsWithField); newDocsWithField);
@ -374,11 +374,11 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
private ScalarQuantizer mergeQuantiles(FieldInfo fieldInfo, MergeState mergeState) private ScalarQuantizer mergeQuantiles(FieldInfo fieldInfo, MergeState mergeState)
throws IOException { throws IOException {
assert fieldInfo.getVectorEncoding() == VectorEncoding.FLOAT32; assert fieldInfo.getVectorEncoding() == VectorEncoding.FLOAT32;
float quantile = float confidenceInterval =
this.quantile == null this.confidenceInterval == null
? calculateDefaultQuantile(fieldInfo.getVectorDimension()) ? calculateDefaultConfidenceInterval(fieldInfo.getVectorDimension())
: this.quantile; : this.confidenceInterval;
return mergeAndRecalculateQuantiles(mergeState, fieldInfo, quantile); return mergeAndRecalculateQuantiles(mergeState, fieldInfo, confidenceInterval);
} }
private ScalarQuantizedCloseableRandomVectorScorerSupplier mergeOneFieldToIndex( private ScalarQuantizedCloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
@ -408,16 +408,16 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
quantizationDataInput, quantizationDataInput.length() - CodecUtil.footerLength()); quantizationDataInput, quantizationDataInput.length() - CodecUtil.footerLength());
long vectorDataLength = quantizedVectorData.getFilePointer() - vectorDataOffset; long vectorDataLength = quantizedVectorData.getFilePointer() - vectorDataOffset;
CodecUtil.retrieveChecksum(quantizationDataInput); CodecUtil.retrieveChecksum(quantizationDataInput);
float quantile = float confidenceInterval =
this.quantile == null this.confidenceInterval == null
? calculateDefaultQuantile(fieldInfo.getVectorDimension()) ? calculateDefaultConfidenceInterval(fieldInfo.getVectorDimension())
: this.quantile; : this.confidenceInterval;
writeMeta( writeMeta(
fieldInfo, fieldInfo,
segmentWriteState.segmentInfo.maxDoc(), segmentWriteState.segmentInfo.maxDoc(),
vectorDataOffset, vectorDataOffset,
vectorDataLength, vectorDataLength,
quantile, confidenceInterval,
mergedQuantizationState.getLowerQuantile(), mergedQuantizationState.getLowerQuantile(),
mergedQuantizationState.getUpperQuantile(), mergedQuantizationState.getUpperQuantile(),
docsWithField); docsWithField);
@ -446,7 +446,9 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
} }
static ScalarQuantizer mergeQuantiles( static ScalarQuantizer mergeQuantiles(
List<ScalarQuantizer> quantizationStates, List<Integer> segmentSizes, float quantile) { List<ScalarQuantizer> quantizationStates,
List<Integer> segmentSizes,
float confidenceInterval) {
assert quantizationStates.size() == segmentSizes.size(); assert quantizationStates.size() == segmentSizes.size();
if (quantizationStates.isEmpty()) { if (quantizationStates.isEmpty()) {
return null; return null;
@ -464,7 +466,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
} }
lowerQuantile /= totalCount; lowerQuantile /= totalCount;
upperQuantile /= totalCount; upperQuantile /= totalCount;
return new ScalarQuantizer(lowerQuantile, upperQuantile, quantile); return new ScalarQuantizer(lowerQuantile, upperQuantile, confidenceInterval);
} }
/** /**
@ -521,7 +523,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
} }
static ScalarQuantizer mergeAndRecalculateQuantiles( static ScalarQuantizer mergeAndRecalculateQuantiles(
MergeState mergeState, FieldInfo fieldInfo, float quantile) throws IOException { MergeState mergeState, FieldInfo fieldInfo, float confidenceInterval) throws IOException {
List<ScalarQuantizer> quantizationStates = new ArrayList<>(mergeState.liveDocs.length); List<ScalarQuantizer> quantizationStates = new ArrayList<>(mergeState.liveDocs.length);
List<Integer> segmentSizes = new ArrayList<>(mergeState.liveDocs.length); List<Integer> segmentSizes = new ArrayList<>(mergeState.liveDocs.length);
for (int i = 0; i < mergeState.liveDocs.length; i++) { for (int i = 0; i < mergeState.liveDocs.length; i++) {
@ -536,7 +538,8 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
segmentSizes.add(fvv.size()); segmentSizes.add(fvv.size());
} }
} }
ScalarQuantizer mergedQuantiles = mergeQuantiles(quantizationStates, segmentSizes, quantile); ScalarQuantizer mergedQuantiles =
mergeQuantiles(quantizationStates, segmentSizes, confidenceInterval);
// Segments no providing quantization state indicates that their quantiles were never // Segments no providing quantization state indicates that their quantiles were never
// calculated. // calculated.
// To be safe, we should always recalculate given a sample set over all the float vectors in the // To be safe, we should always recalculate given a sample set over all the float vectors in the
@ -545,7 +548,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
if (mergedQuantiles == null || shouldRecomputeQuantiles(mergedQuantiles, quantizationStates)) { if (mergedQuantiles == null || shouldRecomputeQuantiles(mergedQuantiles, quantizationStates)) {
FloatVectorValues vectorValues = FloatVectorValues vectorValues =
KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState); KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState);
mergedQuantiles = ScalarQuantizer.fromVectors(vectorValues, quantile); mergedQuantiles = ScalarQuantizer.fromVectors(vectorValues, confidenceInterval);
} }
return mergedQuantiles; return mergedQuantiles;
} }
@ -599,7 +602,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
private static final long SHALLOW_SIZE = shallowSizeOfInstance(FieldWriter.class); private static final long SHALLOW_SIZE = shallowSizeOfInstance(FieldWriter.class);
private final List<float[]> floatVectors; private final List<float[]> floatVectors;
private final FieldInfo fieldInfo; private final FieldInfo fieldInfo;
private final float quantile; private final float confidenceInterval;
private final InfoStream infoStream; private final InfoStream infoStream;
private final boolean normalize; private final boolean normalize;
private float minQuantile = Float.POSITIVE_INFINITY; private float minQuantile = Float.POSITIVE_INFINITY;
@ -609,12 +612,12 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
FieldWriter( FieldWriter(
float quantile, float confidenceInterval,
FieldInfo fieldInfo, FieldInfo fieldInfo,
InfoStream infoStream, InfoStream infoStream,
KnnFieldVectorsWriter<?> indexWriter) { KnnFieldVectorsWriter<?> indexWriter) {
super((KnnFieldVectorsWriter<float[]>) indexWriter); super((KnnFieldVectorsWriter<float[]>) indexWriter);
this.quantile = quantile; this.confidenceInterval = confidenceInterval;
this.fieldInfo = fieldInfo; this.fieldInfo = fieldInfo;
this.normalize = fieldInfo.getVectorSimilarityFunction() == VectorSimilarityFunction.COSINE; this.normalize = fieldInfo.getVectorSimilarityFunction() == VectorSimilarityFunction.COSINE;
this.floatVectors = new ArrayList<>(); this.floatVectors = new ArrayList<>();
@ -635,15 +638,15 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
new FloatVectorWrapper( new FloatVectorWrapper(
floatVectors, floatVectors,
fieldInfo.getVectorSimilarityFunction() == VectorSimilarityFunction.COSINE), fieldInfo.getVectorSimilarityFunction() == VectorSimilarityFunction.COSINE),
quantile); confidenceInterval);
minQuantile = quantizer.getLowerQuantile(); minQuantile = quantizer.getLowerQuantile();
maxQuantile = quantizer.getUpperQuantile(); maxQuantile = quantizer.getUpperQuantile();
if (infoStream.isEnabled(QUANTIZED_VECTOR_COMPONENT)) { if (infoStream.isEnabled(QUANTIZED_VECTOR_COMPONENT)) {
infoStream.message( infoStream.message(
QUANTIZED_VECTOR_COMPONENT, QUANTIZED_VECTOR_COMPONENT,
"quantized field=" "quantized field="
+ " quantile=" + " confidenceInterval="
+ quantile + confidenceInterval
+ " minQuantile=" + " minQuantile="
+ minQuantile + minQuantile
+ " maxQuantile=" + " maxQuantile="
@ -654,7 +657,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
ScalarQuantizer createQuantizer() { ScalarQuantizer createQuantizer() {
assert finished; assert finished;
return new ScalarQuantizer(minQuantile, maxQuantile, quantile); return new ScalarQuantizer(minQuantile, maxQuantile, confidenceInterval);
} }
@Override @Override

View File

@ -119,7 +119,6 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
} }
} }
} }
;
static String getSuffix(String formatName, String suffix) { static String getSuffix(String formatName, String suffix) {
return formatName + "_" + suffix; return formatName + "_" + suffix;

View File

@ -272,7 +272,6 @@ public final class FeatureField extends Field {
return true; return true;
} }
} }
;
static final class LogFunction extends FeatureFunction { static final class LogFunction extends FeatureFunction {

View File

@ -16,6 +16,7 @@
*/ */
package org.apache.lucene.document; package org.apache.lucene.document;
import java.util.Collection;
import java.util.Objects; import java.util.Objects;
import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexOptions;
@ -171,7 +172,7 @@ public class KeywordField extends Field {
* @throws NullPointerException if {@code field} is null. * @throws NullPointerException if {@code field} is null.
* @return a query matching documents with this exact value * @return a query matching documents with this exact value
*/ */
public static Query newSetQuery(String field, BytesRef... values) { public static Query newSetQuery(String field, Collection<BytesRef> values) {
Objects.requireNonNull(field, "field must not be null"); Objects.requireNonNull(field, "field must not be null");
Objects.requireNonNull(values, "values must not be null"); Objects.requireNonNull(values, "values must not be null");
Query indexQuery = new TermInSetQuery(field, values); Query indexQuery = new TermInSetQuery(field, values);

View File

@ -16,6 +16,7 @@
*/ */
package org.apache.lucene.document; package org.apache.lucene.document;
import java.util.Collection;
import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.search.IndexOrDocValuesQuery; import org.apache.lucene.search.IndexOrDocValuesQuery;
import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.search.MultiTermQuery;
@ -99,7 +100,7 @@ public class SortedDocValuesField extends Field {
* in an {@link IndexOrDocValuesQuery}, alongside a set query that executes on postings, such as * in an {@link IndexOrDocValuesQuery}, alongside a set query that executes on postings, such as
* {@link TermInSetQuery}. * {@link TermInSetQuery}.
*/ */
public static Query newSlowSetQuery(String field, BytesRef... values) { public static Query newSlowSetQuery(String field, Collection<BytesRef> values) {
return new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, field, values); return new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, field, values);
} }
} }

View File

@ -16,6 +16,7 @@
*/ */
package org.apache.lucene.document; package org.apache.lucene.document;
import java.util.Collection;
import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.search.IndexOrDocValuesQuery; import org.apache.lucene.search.IndexOrDocValuesQuery;
import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.search.MultiTermQuery;
@ -103,7 +104,7 @@ public class SortedSetDocValuesField extends Field {
* in an {@link IndexOrDocValuesQuery}, alongside a set query that executes on postings, such as * in an {@link IndexOrDocValuesQuery}, alongside a set query that executes on postings, such as
* {@link TermInSetQuery}. * {@link TermInSetQuery}.
*/ */
public static Query newSlowSetQuery(String field, BytesRef... values) { public static Query newSlowSetQuery(String field, Collection<BytesRef> values) {
return new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, field, values); return new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, field, values);
} }
} }

View File

@ -694,7 +694,7 @@ abstract class SpatialQuery extends Query {
final SpatialVisitor spatialVisitor, QueryRelation queryRelation, final FixedBitSet result) { final SpatialVisitor spatialVisitor, QueryRelation queryRelation, final FixedBitSet result) {
final BiFunction<byte[], byte[], Relation> innerFunction = final BiFunction<byte[], byte[], Relation> innerFunction =
spatialVisitor.getInnerFunction(queryRelation); spatialVisitor.getInnerFunction(queryRelation);
;
return new IntersectVisitor() { return new IntersectVisitor() {
@Override @Override

View File

@ -1254,8 +1254,7 @@ public final class Tessellator {
++numMerges; ++numMerges;
// step 'insize' places along from p // step 'insize' places along from p
q = p; q = p;
for (i = 0, pSize = 0; i < inSize && q != null; ++i, ++pSize, q = q.nextZ) for (i = 0, pSize = 0; i < inSize && q != null; ++i, ++pSize, q = q.nextZ) {}
;
// if q hasn't fallen off end, we have two lists to merge // if q hasn't fallen off end, we have two lists to merge
qSize = inSize; qSize = inSize;

View File

@ -22,11 +22,11 @@ import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BitUtil; import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.ByteBlockPool; import org.apache.lucene.util.ByteBlockPool;
/* IndexInput that knows how to read the byte slices written /**
* by Posting and PostingVector. We read the bytes in * IndexInput that knows how to read the byte slices written by Posting and PostingVector. We read
* each slice until we hit the end of that slice at which * the bytes in each slice until we hit the end of that slice at which point we read the forwarding
* point we read the forwarding address of the next slice * address of the next slice and then jump to it.
* and then jump to it.*/ */
final class ByteSliceReader extends DataInput { final class ByteSliceReader extends DataInput {
ByteBlockPool pool; ByteBlockPool pool;
int bufferUpto; int bufferUpto;

View File

@ -28,7 +28,7 @@ import java.nio.file.Paths;
import java.text.NumberFormat; import java.text.NumberFormat;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections; import java.util.Comparator;
import java.util.HashMap; import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
@ -96,11 +96,11 @@ import org.apache.lucene.util.Version;
*/ */
public final class CheckIndex implements Closeable { public final class CheckIndex implements Closeable {
private final Directory dir;
private final Lock writeLock;
private final NumberFormat nf = NumberFormat.getInstance(Locale.ROOT);
private PrintStream infoStream; private PrintStream infoStream;
private Directory dir;
private Lock writeLock;
private volatile boolean closed; private volatile boolean closed;
private NumberFormat nf = NumberFormat.getInstance(Locale.ROOT);
/** /**
* Returned from {@link #checkIndex()} detailing the health and status of the index. * Returned from {@link #checkIndex()} detailing the health and status of the index.
@ -441,19 +441,20 @@ public final class CheckIndex implements Closeable {
IOUtils.close(writeLock); IOUtils.close(writeLock);
} }
private boolean doSlowChecks; private int level;
/** /**
* If true, additional slow checks are performed. This will likely drastically increase time it * Sets Level, the higher the value, the more additional checks are performed. This will likely
* takes to run CheckIndex! * drastically increase time it takes to run CheckIndex! See {@link Level}
*/ */
public void setDoSlowChecks(boolean v) { public void setLevel(int v) {
doSlowChecks = v; Level.checkIfLevelInBounds(v);
level = v;
} }
/** See {@link #setDoSlowChecks}. */ /** See {@link #setLevel}. */
public boolean doSlowChecks() { public int getLevel() {
return doSlowChecks; return level;
} }
private boolean failFast; private boolean failFast;
@ -473,21 +474,6 @@ public final class CheckIndex implements Closeable {
private boolean verbose; private boolean verbose;
/** See {@link #getChecksumsOnly}. */
public boolean getChecksumsOnly() {
return checksumsOnly;
}
/**
* If true, only validate physical integrity for all files. Note that the returned nested status
* objects (e.g. storedFieldStatus) will be null.
*/
public void setChecksumsOnly(boolean v) {
checksumsOnly = v;
}
private boolean checksumsOnly;
/** Set threadCount used for parallelizing index integrity checking. */ /** Set threadCount used for parallelizing index integrity checking. */
public void setThreadCount(int tc) { public void setThreadCount(int tc) {
if (tc <= 0) { if (tc <= 0) {
@ -586,7 +572,6 @@ public final class CheckIndex implements Closeable {
ensureOpen(); ensureOpen();
long startNS = System.nanoTime(); long startNS = System.nanoTime();
SegmentInfos sis = null;
Status result = new Status(); Status result = new Status();
result.dir = dir; result.dir = dir;
String[] files = dir.listAll(); String[] files = dir.listAll();
@ -595,43 +580,115 @@ public final class CheckIndex implements Closeable {
throw new IndexNotFoundException( throw new IndexNotFoundException(
"no segments* file found in " + dir + ": files: " + Arrays.toString(files)); "no segments* file found in " + dir + ": files: " + Arrays.toString(files));
} }
try {
// Do not use SegmentInfos.read(Directory) since the spooky // https://github.com/apache/lucene/issues/7820: also attempt to open any older commit
// retrying it does is not necessary here (we hold the write lock): // points (segments_N), which will catch certain corruption like missing _N.si files
sis = // for segments not also referenced by the newest commit point (which was already
SegmentInfos.readCommit( // loaded, successfully, above). Note that we do not do a deeper check of segments
dir, lastSegmentsFile, 0 /* always open old indices if codecs are around */); // referenced ONLY by these older commit points, because such corruption would not
} catch (Throwable t) { // prevent a new IndexWriter from opening on the newest commit point. but it is still
if (failFast) { // corruption, e.g. a reader opened on those old commit points can hit corruption
throw IOUtils.rethrowAlways(t); // exceptions which we (still) will not detect here. progress not perfection!
SegmentInfos lastCommit = null;
List<String> allSegmentsFiles = new ArrayList<>();
for (String fileName : files) {
if (fileName.startsWith(IndexFileNames.SEGMENTS)
&& fileName.equals(SegmentInfos.OLD_SEGMENTS_GEN) == false) {
allSegmentsFiles.add(fileName);
} }
}
// Sort descending by generation so that we always attempt to read the last commit first. This
// way if an index has a broken last commit AND a broken old commit, we report the last commit
// error first:
allSegmentsFiles.sort(
new Comparator<String>() {
@Override
public int compare(String a, String b) {
long genA = SegmentInfos.generationFromSegmentsFileName(a);
long genB = SegmentInfos.generationFromSegmentsFileName(b);
// reversed natural sort (largest generation first):
return -Long.compare(genA, genB);
}
});
for (String fileName : allSegmentsFiles) {
boolean isLastCommit = fileName.equals(lastSegmentsFile);
SegmentInfos infos;
try {
// Do not use SegmentInfos.read(Directory) since the spooky
// retrying it does is not necessary here (we hold the write lock):
// always open old indices if codecs are around
infos = SegmentInfos.readCommit(dir, fileName, 0);
} catch (Throwable t) {
if (failFast) {
throw IOUtils.rethrowAlways(t);
}
String message;
if (isLastCommit) {
message =
"ERROR: could not read latest commit point from segments file \""
+ fileName
+ "\" in directory";
} else {
message =
"ERROR: could not read old (not latest) commit point segments file \""
+ fileName
+ "\" in directory";
}
msg(infoStream, message);
result.missingSegments = true;
if (infoStream != null) {
t.printStackTrace(infoStream);
}
return result;
}
if (isLastCommit) {
// record the latest commit point: we will deeply check all segments referenced by it
lastCommit = infos;
}
}
// we know there is a lastSegmentsFileName, so we must've attempted to load it in the above for
// loop. if it failed to load, we threw the exception (fastFail == true) or we returned the
// failure (fastFail == false). so if we get here, we should // always have a valid lastCommit:
assert lastCommit != null;
if (lastCommit == null) {
msg(infoStream, "ERROR: could not read any segments file in directory"); msg(infoStream, "ERROR: could not read any segments file in directory");
result.missingSegments = true; result.missingSegments = true;
if (infoStream != null) t.printStackTrace(infoStream);
return result; return result;
} }
if (infoStream != null) { if (infoStream != null) {
int maxDoc = 0; int maxDoc = 0;
int delCount = 0; int delCount = 0;
for (SegmentCommitInfo info : sis) { for (SegmentCommitInfo info : lastCommit) {
maxDoc += info.info.maxDoc(); maxDoc += info.info.maxDoc();
delCount += info.getDelCount(); delCount += info.getDelCount();
} }
infoStream.println( infoStream.printf(
String.format( Locale.ROOT,
Locale.ROOT, "%.2f%% total deletions; %d documents; %d deletions%n",
"%.2f%% total deletions; %d documents; %d deletions", 100. * delCount / maxDoc,
100. * delCount / maxDoc, maxDoc,
maxDoc, delCount);
delCount));
} }
// find the oldest and newest segment versions // find the oldest and newest segment versions
Version oldest = null; Version oldest = null;
Version newest = null; Version newest = null;
String oldSegs = null; String oldSegs = null;
for (SegmentCommitInfo si : sis) { for (SegmentCommitInfo si : lastCommit) {
Version version = si.info.getVersion(); Version version = si.info.getVersion();
if (version == null) { if (version == null) {
// pre-3.1 segment // pre-3.1 segment
@ -646,14 +703,14 @@ public final class CheckIndex implements Closeable {
} }
} }
final int numSegments = sis.size(); final int numSegments = lastCommit.size();
final String segmentsFileName = sis.getSegmentsFileName(); final String segmentsFileName = lastCommit.getSegmentsFileName();
result.segmentsFileName = segmentsFileName; result.segmentsFileName = segmentsFileName;
result.numSegments = numSegments; result.numSegments = numSegments;
result.userData = sis.getUserData(); result.userData = lastCommit.getUserData();
String userDataString; String userDataString;
if (sis.getUserData().size() > 0) { if (lastCommit.getUserData().size() > 0) {
userDataString = " userData=" + sis.getUserData(); userDataString = " userData=" + lastCommit.getUserData();
} else { } else {
userDataString = ""; userDataString = "";
} }
@ -681,7 +738,7 @@ public final class CheckIndex implements Closeable {
+ " " + " "
+ versionString + versionString
+ " id=" + " id="
+ StringHelper.idToString(sis.getId()) + StringHelper.idToString(lastCommit.getId())
+ userDataString); + userDataString);
if (onlySegments != null) { if (onlySegments != null) {
@ -696,14 +753,14 @@ public final class CheckIndex implements Closeable {
msg(infoStream, ":"); msg(infoStream, ":");
} }
result.newSegments = sis.clone(); result.newSegments = lastCommit.clone();
result.newSegments.clear(); result.newSegments.clear();
result.maxSegmentName = -1; result.maxSegmentName = -1;
// checks segments sequentially // checks segments sequentially
if (executorService == null) { if (executorService == null) {
for (int i = 0; i < numSegments; i++) { for (int i = 0; i < numSegments; i++) {
final SegmentCommitInfo info = sis.info(i); final SegmentCommitInfo info = lastCommit.info(i);
updateMaxSegmentName(result, info); updateMaxSegmentName(result, info);
if (onlySegments != null && !onlySegments.contains(info.info.name)) { if (onlySegments != null && !onlySegments.contains(info.info.name)) {
continue; continue;
@ -718,7 +775,7 @@ public final class CheckIndex implements Closeable {
+ info.info.name + info.info.name
+ " maxDoc=" + " maxDoc="
+ info.info.maxDoc()); + info.info.maxDoc());
Status.SegmentInfoStatus segmentInfoStatus = testSegment(sis, info, infoStream); Status.SegmentInfoStatus segmentInfoStatus = testSegment(lastCommit, info, infoStream);
processSegmentInfoStatusResult(result, info, segmentInfoStatus); processSegmentInfoStatusResult(result, info, segmentInfoStatus);
} }
@ -729,14 +786,13 @@ public final class CheckIndex implements Closeable {
// checks segments concurrently // checks segments concurrently
List<SegmentCommitInfo> segmentCommitInfos = new ArrayList<>(); List<SegmentCommitInfo> segmentCommitInfos = new ArrayList<>();
for (SegmentCommitInfo sci : sis) { for (SegmentCommitInfo sci : lastCommit) {
segmentCommitInfos.add(sci); segmentCommitInfos.add(sci);
} }
// sort segmentCommitInfos by segment size, as smaller segment tends to finish faster, and // sort segmentCommitInfos by segment size, as smaller segment tends to finish faster, and
// hence its output can be printed out faster // hence its output can be printed out faster
Collections.sort( segmentCommitInfos.sort(
segmentCommitInfos,
(info1, info2) -> { (info1, info2) -> {
try { try {
return Long.compare(info1.sizeInBytes(), info2.sizeInBytes()); return Long.compare(info1.sizeInBytes(), info2.sizeInBytes());
@ -757,7 +813,7 @@ public final class CheckIndex implements Closeable {
continue; continue;
} }
SegmentInfos finalSis = sis; SegmentInfos finalSis = lastCommit;
ByteArrayOutputStream output = new ByteArrayOutputStream(); ByteArrayOutputStream output = new ByteArrayOutputStream();
PrintStream stream = new PrintStream(output, true, IOUtils.UTF_8); PrintStream stream = new PrintStream(output, true, IOUtils.UTF_8);
@ -813,7 +869,7 @@ public final class CheckIndex implements Closeable {
if (0 == result.numBadSegments) { if (0 == result.numBadSegments) {
result.clean = true; result.clean = true;
} else } else {
msg( msg(
infoStream, infoStream,
"WARNING: " "WARNING: "
@ -821,14 +877,16 @@ public final class CheckIndex implements Closeable {
+ " broken segments (containing " + " broken segments (containing "
+ result.totLoseDocCount + result.totLoseDocCount
+ " documents) detected"); + " documents) detected");
}
if (!(result.validCounter = (result.maxSegmentName < sis.counter))) { result.validCounter = result.maxSegmentName < lastCommit.counter;
if (result.validCounter == false) {
result.clean = false; result.clean = false;
result.newSegments.counter = result.maxSegmentName + 1; result.newSegments.counter = result.maxSegmentName + 1;
msg( msg(
infoStream, infoStream,
"ERROR: Next segment name counter " "ERROR: Next segment name counter "
+ sis.counter + lastCommit.counter
+ " is not greater than max segment name " + " is not greater than max segment name "
+ result.maxSegmentName); + result.maxSegmentName);
} }
@ -921,7 +979,7 @@ public final class CheckIndex implements Closeable {
msg(infoStream, " diagnostics = " + diagnostics); msg(infoStream, " diagnostics = " + diagnostics);
} }
if (!info.hasDeletions()) { if (info.hasDeletions() == false) {
msg(infoStream, " no deletions"); msg(infoStream, " no deletions");
segInfoStat.hasDeletions = false; segInfoStat.hasDeletions = false;
} else { } else {
@ -960,26 +1018,26 @@ public final class CheckIndex implements Closeable {
toLoseDocCount = numDocs; toLoseDocCount = numDocs;
if (reader.hasDeletions()) { if (reader.hasDeletions()) {
if (reader.numDocs() != info.info.maxDoc() - info.getDelCount()) { if (numDocs != info.info.maxDoc() - info.getDelCount()) {
throw new CheckIndexException( throw new CheckIndexException(
"delete count mismatch: info=" "delete count mismatch: info="
+ (info.info.maxDoc() - info.getDelCount()) + (info.info.maxDoc() - info.getDelCount())
+ " vs reader=" + " vs reader="
+ reader.numDocs()); + numDocs);
} }
if ((info.info.maxDoc() - reader.numDocs()) > reader.maxDoc()) { if ((info.info.maxDoc() - numDocs) > reader.maxDoc()) {
throw new CheckIndexException( throw new CheckIndexException(
"too many deleted docs: maxDoc()=" "too many deleted docs: maxDoc()="
+ reader.maxDoc() + reader.maxDoc()
+ " vs del count=" + " vs del count="
+ (info.info.maxDoc() - reader.numDocs())); + (info.info.maxDoc() - numDocs));
} }
if (info.info.maxDoc() - reader.numDocs() != info.getDelCount()) { if (info.info.maxDoc() - numDocs != info.getDelCount()) {
throw new CheckIndexException( throw new CheckIndexException(
"delete count mismatch: info=" "delete count mismatch: info="
+ info.getDelCount() + info.getDelCount()
+ " vs reader=" + " vs reader="
+ (info.info.maxDoc() - reader.numDocs())); + (info.info.maxDoc() - numDocs));
} }
} else { } else {
if (info.getDelCount() != 0) { if (info.getDelCount() != 0) {
@ -987,11 +1045,10 @@ public final class CheckIndex implements Closeable {
"delete count mismatch: info=" "delete count mismatch: info="
+ info.getDelCount() + info.getDelCount()
+ " vs reader=" + " vs reader="
+ (info.info.maxDoc() - reader.numDocs())); + (info.info.maxDoc() - numDocs));
} }
} }
if (level >= Level.MIN_LEVEL_FOR_INTEGRITY_CHECKS) {
if (checksumsOnly == false) {
// Test Livedocs // Test Livedocs
segInfoStat.liveDocStatus = testLiveDocs(reader, infoStream, failFast); segInfoStat.liveDocStatus = testLiveDocs(reader, infoStream, failFast);
@ -1002,15 +1059,14 @@ public final class CheckIndex implements Closeable {
segInfoStat.fieldNormStatus = testFieldNorms(reader, infoStream, failFast); segInfoStat.fieldNormStatus = testFieldNorms(reader, infoStream, failFast);
// Test the Term Index // Test the Term Index
segInfoStat.termIndexStatus = segInfoStat.termIndexStatus = testPostings(reader, infoStream, verbose, level, failFast);
testPostings(reader, infoStream, verbose, doSlowChecks, failFast);
// Test Stored Fields // Test Stored Fields
segInfoStat.storedFieldStatus = testStoredFields(reader, infoStream, failFast); segInfoStat.storedFieldStatus = testStoredFields(reader, infoStream, failFast);
// Test Term Vectors // Test Term Vectors
segInfoStat.termVectorStatus = segInfoStat.termVectorStatus =
testTermVectors(reader, infoStream, verbose, doSlowChecks, failFast); testTermVectors(reader, infoStream, verbose, level, failFast);
// Test Docvalues // Test Docvalues
segInfoStat.docValuesStatus = testDocValues(reader, infoStream, failFast); segInfoStat.docValuesStatus = testDocValues(reader, infoStream, failFast);
@ -1213,7 +1269,7 @@ public final class CheckIndex implements Closeable {
if (liveDocs != null) { if (liveDocs != null) {
// it's ok for it to be non-null here, as long as none are set right? // it's ok for it to be non-null here, as long as none are set right?
for (int j = 0; j < liveDocs.length(); j++) { for (int j = 0; j < liveDocs.length(); j++) {
if (!liveDocs.get(j)) { if (liveDocs.get(j) == false) {
throw new CheckIndexException( throw new CheckIndexException(
"liveDocs mismatch: info says no deletions but doc " + j + " is deleted."); "liveDocs mismatch: info says no deletions but doc " + j + " is deleted.");
} }
@ -1341,7 +1397,7 @@ public final class CheckIndex implements Closeable {
boolean isVectors, boolean isVectors,
PrintStream infoStream, PrintStream infoStream,
boolean verbose, boolean verbose,
boolean doSlowChecks) int level)
throws IOException { throws IOException {
// TODO: we should probably return our own stats thing...?! // TODO: we should probably return our own stats thing...?!
long startNS; long startNS;
@ -1450,7 +1506,7 @@ public final class CheckIndex implements Closeable {
+ hasFreqs); + hasFreqs);
} }
if (!isVectors) { if (isVectors == false) {
final boolean expectedHasPositions = final boolean expectedHasPositions =
fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
if (hasPositions != expectedHasPositions) { if (hasPositions != expectedHasPositions) {
@ -1810,7 +1866,7 @@ public final class CheckIndex implements Closeable {
// free-for-all before? // free-for-all before?
// but for offsets in the postings lists these checks are fine: they were always // but for offsets in the postings lists these checks are fine: they were always
// enforced by IndexWriter // enforced by IndexWriter
if (!isVectors) { if (isVectors == false) {
if (startOffset < 0) { if (startOffset < 0) {
throw new CheckIndexException( throw new CheckIndexException(
"term " "term "
@ -1924,14 +1980,13 @@ public final class CheckIndex implements Closeable {
} }
// Checking score blocks is heavy, we only do it on long postings lists, on every 1024th // Checking score blocks is heavy, we only do it on long postings lists, on every 1024th
// term // term or if slow checks are enabled.
// or if slow checks are enabled. if (level >= Level.MIN_LEVEL_FOR_SLOW_CHECKS
if (doSlowChecks
|| docFreq > 1024 || docFreq > 1024
|| (status.termCount + status.delTermCount) % 1024 == 0) { || (status.termCount + status.delTermCount) % 1024 == 0) {
// First check max scores and block uptos // First check max scores and block uptos
// But only if slok checks are enabled since we visit all docs // But only if slow checks are enabled since we visit all docs
if (doSlowChecks) { if (level >= Level.MIN_LEVEL_FOR_SLOW_CHECKS) {
int max = -1; int max = -1;
int maxFreq = 0; int maxFreq = 0;
ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS); ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS);
@ -1998,9 +2053,9 @@ public final class CheckIndex implements Closeable {
Impacts impacts = impactsEnum.getImpacts(); Impacts impacts = impactsEnum.getImpacts();
checkImpacts(impacts, doc); checkImpacts(impacts, doc);
maxFreq = Integer.MAX_VALUE; maxFreq = Integer.MAX_VALUE;
for (int level = 0; level < impacts.numLevels(); ++level) { for (int impactsLevel = 0; impactsLevel < impacts.numLevels(); ++impactsLevel) {
if (impacts.getDocIdUpTo(level) >= max) { if (impacts.getDocIdUpTo(impactsLevel) >= max) {
List<Impact> perLevelImpacts = impacts.getImpacts(level); List<Impact> perLevelImpacts = impacts.getImpacts(impactsLevel);
maxFreq = perLevelImpacts.get(perLevelImpacts.size() - 1).freq; maxFreq = perLevelImpacts.get(perLevelImpacts.size() - 1).freq;
break; break;
} }
@ -2040,9 +2095,9 @@ public final class CheckIndex implements Closeable {
Impacts impacts = impactsEnum.getImpacts(); Impacts impacts = impactsEnum.getImpacts();
checkImpacts(impacts, doc); checkImpacts(impacts, doc);
maxFreq = Integer.MAX_VALUE; maxFreq = Integer.MAX_VALUE;
for (int level = 0; level < impacts.numLevels(); ++level) { for (int impactsLevel = 0; impactsLevel < impacts.numLevels(); ++impactsLevel) {
if (impacts.getDocIdUpTo(level) >= max) { if (impacts.getDocIdUpTo(impactsLevel) >= max) {
List<Impact> perLevelImpacts = impacts.getImpacts(level); List<Impact> perLevelImpacts = impacts.getImpacts(impactsLevel);
maxFreq = perLevelImpacts.get(perLevelImpacts.size() - 1).freq; maxFreq = perLevelImpacts.get(perLevelImpacts.size() - 1).freq;
break; break;
} }
@ -2151,7 +2206,7 @@ public final class CheckIndex implements Closeable {
+ " doesn't have terms according to postings but has a norm value that is not zero: " + " doesn't have terms according to postings but has a norm value that is not zero: "
+ Long.toUnsignedString(norm)); + Long.toUnsignedString(norm));
} }
} else if (norm == 0 && visitedDocs.get(doc)) { } else if (visitedDocs.get(doc)) {
throw new CheckIndexException( throw new CheckIndexException(
"Document " "Document "
+ doc + doc
@ -2307,7 +2362,7 @@ public final class CheckIndex implements Closeable {
static void checkImpacts(Impacts impacts, int lastTarget) { static void checkImpacts(Impacts impacts, int lastTarget) {
final int numLevels = impacts.numLevels(); final int numLevels = impacts.numLevels();
if (numLevels < 1) { if (numLevels < 1) {
throw new CheckIndexException("The number of levels must be >= 1, got " + numLevels); throw new CheckIndexException("The number of impact levels must be >= 1, got " + numLevels);
} }
int docIdUpTo0 = impacts.getDocIdUpTo(0); int docIdUpTo0 = impacts.getDocIdUpTo(0);
@ -2319,17 +2374,17 @@ public final class CheckIndex implements Closeable {
+ lastTarget); + lastTarget);
} }
for (int level = 1; level < numLevels; ++level) { for (int impactsLevel = 1; impactsLevel < numLevels; ++impactsLevel) {
int docIdUpTo = impacts.getDocIdUpTo(level); int docIdUpTo = impacts.getDocIdUpTo(impactsLevel);
int previousDocIdUpTo = impacts.getDocIdUpTo(level - 1); int previousDocIdUpTo = impacts.getDocIdUpTo(impactsLevel - 1);
if (docIdUpTo < previousDocIdUpTo) { if (docIdUpTo < previousDocIdUpTo) {
throw new CheckIndexException( throw new CheckIndexException(
"Decreasing return for getDocIdUpTo: level " "Decreasing return for getDocIdUpTo: level "
+ (level - 1) + (impactsLevel - 1)
+ " returned " + " returned "
+ previousDocIdUpTo + previousDocIdUpTo
+ " but level " + " but level "
+ level + impactsLevel
+ " returned " + " returned "
+ docIdUpTo + docIdUpTo
+ " for target " + " for target "
@ -2337,10 +2392,10 @@ public final class CheckIndex implements Closeable {
} }
} }
for (int level = 0; level < numLevels; ++level) { for (int impactsLevel = 0; impactsLevel < numLevels; ++impactsLevel) {
List<Impact> perLevelImpacts = impacts.getImpacts(level); List<Impact> perLevelImpacts = impacts.getImpacts(impactsLevel);
if (perLevelImpacts.isEmpty()) { if (perLevelImpacts.isEmpty()) {
throw new CheckIndexException("Got empty list of impacts on level " + level); throw new CheckIndexException("Got empty list of impacts on level " + impactsLevel);
} }
Impact first = perLevelImpacts.get(0); Impact first = perLevelImpacts.get(0);
if (first.freq < 1) { if (first.freq < 1) {
@ -2358,9 +2413,9 @@ public final class CheckIndex implements Closeable {
"Impacts are not ordered or contain dups, got " + previous + " then " + impact); "Impacts are not ordered or contain dups, got " + previous + " then " + impact);
} }
} }
if (level > 0) { if (impactsLevel > 0) {
// Make sure that impacts at level N trigger better scores than at level N-1 // Make sure that impacts at level N trigger better scores than at level N-1
Iterator<Impact> previousIt = impacts.getImpacts(level - 1).iterator(); Iterator<Impact> previousIt = impacts.getImpacts(impactsLevel - 1).iterator();
previous = previousIt.next(); previous = previousIt.next();
Iterator<Impact> it = perLevelImpacts.iterator(); Iterator<Impact> it = perLevelImpacts.iterator();
Impact impact = it.next(); Impact impact = it.next();
@ -2376,9 +2431,9 @@ public final class CheckIndex implements Closeable {
"Found impact " "Found impact "
+ previous + previous
+ " on level " + " on level "
+ (level - 1) + (impactsLevel - 1)
+ " but no impact on level " + " but no impact on level "
+ level + impactsLevel
+ " triggers a better score: " + " triggers a better score: "
+ perLevelImpacts); + perLevelImpacts);
} }
@ -2395,7 +2450,7 @@ public final class CheckIndex implements Closeable {
*/ */
public static Status.TermIndexStatus testPostings(CodecReader reader, PrintStream infoStream) public static Status.TermIndexStatus testPostings(CodecReader reader, PrintStream infoStream)
throws IOException { throws IOException {
return testPostings(reader, infoStream, false, true, false); return testPostings(reader, infoStream, false, Level.MIN_LEVEL_FOR_SLOW_CHECKS, false);
} }
/** /**
@ -2404,15 +2459,11 @@ public final class CheckIndex implements Closeable {
* @lucene.experimental * @lucene.experimental
*/ */
public static Status.TermIndexStatus testPostings( public static Status.TermIndexStatus testPostings(
CodecReader reader, CodecReader reader, PrintStream infoStream, boolean verbose, int level, boolean failFast)
PrintStream infoStream,
boolean verbose,
boolean doSlowChecks,
boolean failFast)
throws IOException { throws IOException {
// TODO: we should go and verify term vectors match, if // TODO: we should go and verify term vectors match, if the Level is high enough to
// doSlowChecks is on... // include slow checks
Status.TermIndexStatus status; Status.TermIndexStatus status;
final int maxDoc = reader.maxDoc(); final int maxDoc = reader.maxDoc();
@ -2443,7 +2494,7 @@ public final class CheckIndex implements Closeable {
false, false,
infoStream, infoStream,
verbose, verbose,
doSlowChecks); level);
} catch (Throwable e) { } catch (Throwable e) {
if (failFast) { if (failFast) {
throw IOUtils.rethrowAlways(e); throw IOUtils.rethrowAlways(e);
@ -3132,7 +3183,7 @@ public final class CheckIndex implements Closeable {
for (FieldInfo fieldInfo : reader.getFieldInfos()) { for (FieldInfo fieldInfo : reader.getFieldInfos()) {
if (fieldInfo.getDocValuesType() != DocValuesType.NONE) { if (fieldInfo.getDocValuesType() != DocValuesType.NONE) {
status.totalValueFields++; status.totalValueFields++;
checkDocValues(fieldInfo, dvReader, reader.maxDoc(), infoStream, status); checkDocValues(fieldInfo, dvReader, status);
} }
} }
@ -3162,11 +3213,11 @@ public final class CheckIndex implements Closeable {
} }
@FunctionalInterface @FunctionalInterface
private static interface DocValuesIteratorSupplier { private interface DocValuesIteratorSupplier {
DocValuesIterator get(FieldInfo fi) throws IOException; DocValuesIterator get(FieldInfo fi) throws IOException;
} }
private static void checkDVIterator(FieldInfo fi, int maxDoc, DocValuesIteratorSupplier producer) private static void checkDVIterator(FieldInfo fi, DocValuesIteratorSupplier producer)
throws IOException { throws IOException {
String field = fi.name; String field = fi.name;
@ -3284,7 +3335,7 @@ public final class CheckIndex implements Closeable {
} }
private static void checkBinaryDocValues( private static void checkBinaryDocValues(
String fieldName, int maxDoc, BinaryDocValues bdv, BinaryDocValues bdv2) throws IOException { String fieldName, BinaryDocValues bdv, BinaryDocValues bdv2) throws IOException {
if (bdv.docID() != -1) { if (bdv.docID() != -1) {
throw new CheckIndexException( throw new CheckIndexException(
"binary dv iterator for field: " "binary dv iterator for field: "
@ -3309,7 +3360,7 @@ public final class CheckIndex implements Closeable {
} }
private static void checkSortedDocValues( private static void checkSortedDocValues(
String fieldName, int maxDoc, SortedDocValues dv, SortedDocValues dv2) throws IOException { String fieldName, SortedDocValues dv, SortedDocValues dv2) throws IOException {
if (dv.docID() != -1) { if (dv.docID() != -1) {
throw new CheckIndexException( throw new CheckIndexException(
"sorted dv iterator for field: " "sorted dv iterator for field: "
@ -3373,8 +3424,7 @@ public final class CheckIndex implements Closeable {
} }
private static void checkSortedSetDocValues( private static void checkSortedSetDocValues(
String fieldName, int maxDoc, SortedSetDocValues dv, SortedSetDocValues dv2) String fieldName, SortedSetDocValues dv, SortedSetDocValues dv2) throws IOException {
throws IOException {
final long maxOrd = dv.getValueCount() - 1; final long maxOrd = dv.getValueCount() - 1;
LongBitSet seenOrds = new LongBitSet(dv.getValueCount()); LongBitSet seenOrds = new LongBitSet(dv.getValueCount());
long maxOrd2 = -1; long maxOrd2 = -1;
@ -3470,7 +3520,7 @@ public final class CheckIndex implements Closeable {
} }
private static void checkSortedNumericDocValues( private static void checkSortedNumericDocValues(
String fieldName, int maxDoc, SortedNumericDocValues ndv, SortedNumericDocValues ndv2) String fieldName, SortedNumericDocValues ndv, SortedNumericDocValues ndv2)
throws IOException { throws IOException {
if (ndv.docID() != -1) { if (ndv.docID() != -1) {
throw new CheckIndexException( throw new CheckIndexException(
@ -3539,38 +3589,32 @@ public final class CheckIndex implements Closeable {
} }
private static void checkDocValues( private static void checkDocValues(
FieldInfo fi, FieldInfo fi, DocValuesProducer dvReader, DocValuesStatus status) throws Exception {
DocValuesProducer dvReader,
int maxDoc,
PrintStream infoStream,
DocValuesStatus status)
throws Exception {
switch (fi.getDocValuesType()) { switch (fi.getDocValuesType()) {
case SORTED: case SORTED:
status.totalSortedFields++; status.totalSortedFields++;
checkDVIterator(fi, maxDoc, dvReader::getSorted); checkDVIterator(fi, dvReader::getSorted);
checkSortedDocValues(fi.name, maxDoc, dvReader.getSorted(fi), dvReader.getSorted(fi)); checkSortedDocValues(fi.name, dvReader.getSorted(fi), dvReader.getSorted(fi));
break; break;
case SORTED_NUMERIC: case SORTED_NUMERIC:
status.totalSortedNumericFields++; status.totalSortedNumericFields++;
checkDVIterator(fi, maxDoc, dvReader::getSortedNumeric); checkDVIterator(fi, dvReader::getSortedNumeric);
checkSortedNumericDocValues( checkSortedNumericDocValues(
fi.name, maxDoc, dvReader.getSortedNumeric(fi), dvReader.getSortedNumeric(fi)); fi.name, dvReader.getSortedNumeric(fi), dvReader.getSortedNumeric(fi));
break; break;
case SORTED_SET: case SORTED_SET:
status.totalSortedSetFields++; status.totalSortedSetFields++;
checkDVIterator(fi, maxDoc, dvReader::getSortedSet); checkDVIterator(fi, dvReader::getSortedSet);
checkSortedSetDocValues( checkSortedSetDocValues(fi.name, dvReader.getSortedSet(fi), dvReader.getSortedSet(fi));
fi.name, maxDoc, dvReader.getSortedSet(fi), dvReader.getSortedSet(fi));
break; break;
case BINARY: case BINARY:
status.totalBinaryFields++; status.totalBinaryFields++;
checkDVIterator(fi, maxDoc, dvReader::getBinary); checkDVIterator(fi, dvReader::getBinary);
checkBinaryDocValues(fi.name, maxDoc, dvReader.getBinary(fi), dvReader.getBinary(fi)); checkBinaryDocValues(fi.name, dvReader.getBinary(fi), dvReader.getBinary(fi));
break; break;
case NUMERIC: case NUMERIC:
status.totalNumericFields++; status.totalNumericFields++;
checkDVIterator(fi, maxDoc, dvReader::getNumeric); checkDVIterator(fi, dvReader::getNumeric);
checkNumericDocValues(fi.name, dvReader.getNumeric(fi), dvReader.getNumeric(fi)); checkNumericDocValues(fi.name, dvReader.getNumeric(fi), dvReader.getNumeric(fi));
break; break;
case NONE: case NONE:
@ -3586,7 +3630,7 @@ public final class CheckIndex implements Closeable {
*/ */
public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream) public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream)
throws IOException { throws IOException {
return testTermVectors(reader, infoStream, false, false, false); return testTermVectors(reader, infoStream, false, Level.MIN_LEVEL_FOR_INTEGRITY_CHECKS, false);
} }
/** /**
@ -3595,11 +3639,7 @@ public final class CheckIndex implements Closeable {
* @lucene.experimental * @lucene.experimental
*/ */
public static Status.TermVectorStatus testTermVectors( public static Status.TermVectorStatus testTermVectors(
CodecReader reader, CodecReader reader, PrintStream infoStream, boolean verbose, int level, boolean failFast)
PrintStream infoStream,
boolean verbose,
boolean doSlowChecks,
boolean failFast)
throws IOException { throws IOException {
long startNS = System.nanoTime(); long startNS = System.nanoTime();
final Status.TermVectorStatus status = new Status.TermVectorStatus(); final Status.TermVectorStatus status = new Status.TermVectorStatus();
@ -3612,14 +3652,14 @@ public final class CheckIndex implements Closeable {
PostingsEnum postings = null; PostingsEnum postings = null;
// Only used if doSlowChecks is true: // Only used if the Level is high enough to include slow checks:
PostingsEnum postingsDocs = null; PostingsEnum postingsDocs = null;
final Bits liveDocs = reader.getLiveDocs(); final Bits liveDocs = reader.getLiveDocs();
FieldsProducer postingsFields; FieldsProducer postingsFields;
// TODO: testTermsIndex // TODO: testTermsIndex
if (doSlowChecks) { if (level >= Level.MIN_LEVEL_FOR_SLOW_CHECKS) {
postingsFields = reader.getPostingsReader(); postingsFields = reader.getPostingsReader();
if (postingsFields != null) { if (postingsFields != null) {
postingsFields = postingsFields.getMergeInstance(); postingsFields = postingsFields.getMergeInstance();
@ -3643,8 +3683,7 @@ public final class CheckIndex implements Closeable {
if (tfv != null) { if (tfv != null) {
// First run with no deletions: // First run with no deletions:
checkFields( checkFields(tfv, null, 1, fieldInfos, null, false, true, infoStream, verbose, level);
tfv, null, 1, fieldInfos, null, false, true, infoStream, verbose, doSlowChecks);
// Only agg stats if the doc is live: // Only agg stats if the doc is live:
final boolean doStats = liveDocs == null || liveDocs.get(j); final boolean doStats = liveDocs == null || liveDocs.get(j);
@ -3660,7 +3699,7 @@ public final class CheckIndex implements Closeable {
// Make sure FieldInfo thinks this field is vector'd: // Make sure FieldInfo thinks this field is vector'd:
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field); final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
if (!fieldInfo.hasVectors()) { if (fieldInfo.hasVectors() == false) {
throw new CheckIndexException( throw new CheckIndexException(
"docID=" "docID="
+ j + j
@ -3669,7 +3708,7 @@ public final class CheckIndex implements Closeable {
+ " but FieldInfo has storeTermVector=false"); + " but FieldInfo has storeTermVector=false");
} }
if (doSlowChecks) { if (level >= Level.MIN_LEVEL_FOR_SLOW_CHECKS) {
Terms terms = tfv.terms(field); Terms terms = tfv.terms(field);
TermsEnum termsEnum = terms.iterator(); TermsEnum termsEnum = terms.iterator();
final boolean postingsHasFreq = final boolean postingsHasFreq =
@ -3696,7 +3735,7 @@ public final class CheckIndex implements Closeable {
postings = termsEnum.postings(postings, PostingsEnum.ALL); postings = termsEnum.postings(postings, PostingsEnum.ALL);
assert postings != null; assert postings != null;
if (!postingsTermsEnum.seekExact(term)) { if (postingsTermsEnum.seekExact(term) == false) {
throw new CheckIndexException( throw new CheckIndexException(
"vector term=" "vector term="
+ term + term
@ -3852,7 +3891,7 @@ public final class CheckIndex implements Closeable {
+ " but postings does not."); + " but postings does not.");
} }
BytesRef postingsPayload = postingsDocs.getPayload(); BytesRef postingsPayload = postingsDocs.getPayload();
if (!payload.equals(postingsPayload)) { if (payload.equals(postingsPayload) == false) {
throw new CheckIndexException( throw new CheckIndexException(
"vector term=" "vector term="
+ term + term
@ -3972,9 +4011,8 @@ public final class CheckIndex implements Closeable {
/** Run-time configuration options for CheckIndex commands. */ /** Run-time configuration options for CheckIndex commands. */
public static class Options { public static class Options {
boolean doExorcise = false; boolean doExorcise = false;
boolean doSlowChecks = false;
boolean verbose = false; boolean verbose = false;
boolean doChecksumsOnly = false; int level = Level.DEFAULT_VALUE;
int threadCount; int threadCount;
List<String> onlySegments = new ArrayList<>(); List<String> onlySegments = new ArrayList<>();
String indexPath = null; String indexPath = null;
@ -4011,9 +4049,10 @@ public final class CheckIndex implements Closeable {
return 1; return 1;
} }
if (!assertsOn()) if (assertsOn() == false) {
System.out.println( System.out.println(
"\nNOTE: testing will be more thorough if you run java with '-ea:org.apache.lucene...', so assertions are enabled"); "\nNOTE: testing will be more thorough if you run java with '-ea:org.apache.lucene...', so assertions are enabled");
}
System.out.println("\nOpening index @ " + opts.indexPath + "\n"); System.out.println("\nOpening index @ " + opts.indexPath + "\n");
Directory directory = null; Directory directory = null;
@ -4037,6 +4076,42 @@ public final class CheckIndex implements Closeable {
} }
} }
/**
 * Holder for the constants that describe CheckIndex's -level parameter, together with a bounds
 * check for user-supplied values. The class only exposes static members and cannot be
 * instantiated.
 */
public static class Level {

  /** Lowest level that may be requested. */
  public static final int MIN_VALUE = 1;

  /** Highest level that may be requested. */
  public static final int MAX_VALUE = 3;

  /** Level that applies when the caller does not pick one. */
  public static final int DEFAULT_VALUE = MIN_VALUE;

  /** Checksum checks run at this level and above. */
  public static final int MIN_LEVEL_FOR_CHECKSUM_CHECKS = 1;

  /** Integrity checks run at this level and above. */
  public static final int MIN_LEVEL_FOR_INTEGRITY_CHECKS = 2;

  /** Slow checks run at this level and above. */
  public static final int MIN_LEVEL_FOR_SLOW_CHECKS = 3;

  private Level() {}

  /**
   * Verifies that {@code levelVal} lies between {@link #MIN_VALUE} and {@link #MAX_VALUE}, both
   * inclusive.
   *
   * @param levelVal the level to validate
   * @throws IllegalArgumentException if {@code levelVal} is below {@link #MIN_VALUE} or above
   *     {@link #MAX_VALUE}
   */
  public static void checkIfLevelInBounds(int levelVal) throws IllegalArgumentException {
    final boolean inBounds = MIN_VALUE <= levelVal && levelVal <= MAX_VALUE;
    if (inBounds) {
      return;
    }
    final String message =
        String.format(
            Locale.ROOT,
            "ERROR: given value: '%d' for -level option is out of bounds. Please use a value from '%d'->'%d'",
            levelVal,
            MIN_VALUE,
            MAX_VALUE);
    throw new IllegalArgumentException(message);
  }
}
/** /**
* Parse command line args into fields * Parse command line args into fields
* *
@ -4051,15 +4126,29 @@ public final class CheckIndex implements Closeable {
int i = 0; int i = 0;
while (i < args.length) { while (i < args.length) {
String arg = args[i]; String arg = args[i];
if ("-fast".equals(arg)) { if ("-level".equals(arg)) {
opts.doChecksumsOnly = true; if (i == args.length - 1) {
throw new IllegalArgumentException("ERROR: missing value for -level option");
}
i++;
int level = Integer.parseInt(args[i]);
Level.checkIfLevelInBounds(level);
opts.level = level;
} else if ("-fast".equals(arg)) {
// Deprecated. Remove in Lucene 11.
System.err.println(
"-fast is deprecated, use '-level 1' for explicitly verifying file checksums only. This is also now the default "
+ "behaviour!");
} else if ("-slow".equals(arg)) {
// Deprecated. Remove in Lucene 11.
System.err.println("-slow is deprecated, use '-level 3' instead for slow checks");
opts.level = Level.MIN_LEVEL_FOR_SLOW_CHECKS;
} else if ("-exorcise".equals(arg)) { } else if ("-exorcise".equals(arg)) {
opts.doExorcise = true; opts.doExorcise = true;
} else if ("-crossCheckTermVectors".equals(arg)) { } else if ("-crossCheckTermVectors".equals(arg)) {
System.err.println("-crossCheckTermVectors is deprecated, use -slow instead"); // Deprecated. Remove in Lucene 11.
opts.doSlowChecks = true; System.err.println("-crossCheckTermVectors is deprecated, use '-level 3' instead");
} else if ("-slow".equals(arg)) { opts.level = Level.MAX_VALUE;
opts.doSlowChecks = true;
} else if (arg.equals("-verbose")) { } else if (arg.equals("-verbose")) {
opts.verbose = true; opts.verbose = true;
} else if (arg.equals("-segment")) { } else if (arg.equals("-segment")) {
@ -4096,11 +4185,13 @@ public final class CheckIndex implements Closeable {
if (opts.indexPath == null) { if (opts.indexPath == null) {
throw new IllegalArgumentException( throw new IllegalArgumentException(
"\nERROR: index path not specified" "\nERROR: index path not specified"
+ "\nUsage: java org.apache.lucene.index.CheckIndex pathToIndex [-exorcise] [-slow] [-segment X] [-segment Y] [-threadCount X] [-dir-impl X]\n" + "\nUsage: java org.apache.lucene.index.CheckIndex pathToIndex [-exorcise] [-level X] [-segment X] [-segment Y] [-threadCount X] [-dir-impl X]\n"
+ "\n" + "\n"
+ " -exorcise: actually write a new segments_N file, removing any problematic segments\n" + " -exorcise: actually write a new segments_N file, removing any problematic segments\n"
+ " -fast: just verify file checksums, omitting logical integrity checks\n" + " -level X: sets the detail level of the check. The higher the value, the more checks are done.\n"
+ " -slow: do additional slow checks; THIS IS VERY SLOW!\n" + " 1 - (Default) Checksum checks only.\n"
+ " 2 - All level 1 checks + logical integrity checks.\n"
+ " 3 - All level 2 checks + slow checks.\n"
+ " -codec X: when exorcising, codec to write the new segments_N file with\n" + " -codec X: when exorcising, codec to write the new segments_N file with\n"
+ " -verbose: print additional details\n" + " -verbose: print additional details\n"
+ " -segment X: only check the specified segments. This can be specified multiple\n" + " -segment X: only check the specified segments. This can be specified multiple\n"
@ -4115,7 +4206,8 @@ public final class CheckIndex implements Closeable {
+ "If no package is specified the " + "If no package is specified the "
+ FSDirectory.class.getPackage().getName() + FSDirectory.class.getPackage().getName()
+ " package will be used.\n" + " package will be used.\n"
+ "\n" + "CheckIndex only verifies file checksums as default.\n"
+ "Use -level with value of '2' or higher if you also want to check segment file contents.\n\n"
+ "**WARNING**: -exorcise *LOSES DATA*. This should only be used on an emergency basis as it will cause\n" + "**WARNING**: -exorcise *LOSES DATA*. This should only be used on an emergency basis as it will cause\n"
+ "documents (perhaps many) to be permanently removed from the index. Always make\n" + "documents (perhaps many) to be permanently removed from the index. Always make\n"
+ "a backup copy of your index before running this! Do not run this tool on an index\n" + "a backup copy of your index before running this! Do not run this tool on an index\n"
@ -4137,10 +4229,6 @@ public final class CheckIndex implements Closeable {
throw new IllegalArgumentException("ERROR: cannot specify both -exorcise and -segment"); throw new IllegalArgumentException("ERROR: cannot specify both -exorcise and -segment");
} }
if (opts.doChecksumsOnly && opts.doSlowChecks) {
throw new IllegalArgumentException("ERROR: cannot specify both -fast and -slow");
}
return opts; return opts;
} }
@ -4151,8 +4239,7 @@ public final class CheckIndex implements Closeable {
* @return 0 iff the index is clean, 1 otherwise * @return 0 iff the index is clean, 1 otherwise
*/ */
public int doCheck(Options opts) throws IOException, InterruptedException { public int doCheck(Options opts) throws IOException, InterruptedException {
setDoSlowChecks(opts.doSlowChecks); setLevel(opts.level);
setChecksumsOnly(opts.doChecksumsOnly);
setInfoStream(opts.out, opts.verbose); setInfoStream(opts.out, opts.verbose);
// user provided thread count via command line argument, overriding the default with user // user provided thread count via command line argument, overriding the default with user
// provided value // provided value
@ -4166,8 +4253,8 @@ public final class CheckIndex implements Closeable {
return 1; return 1;
} }
if (!result.clean) { if (result.clean == false) {
if (!opts.doExorcise) { if (opts.doExorcise == false) {
opts.out.println( opts.out.println(
"WARNING: would write new segments file, and " "WARNING: would write new segments file, and "
+ result.totLoseDocCount + result.totLoseDocCount

View File

@ -270,7 +270,6 @@ final class FieldUpdatesBuffer {
static class BufferedUpdate { static class BufferedUpdate {
private BufferedUpdate() {} private BufferedUpdate() {}
;
/** the max document ID this update should be applied to */ /** the max document ID this update should be applied to */
int docUpTo; int docUpTo;

View File

@ -33,6 +33,7 @@ import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Locale; import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.Objects;
import java.util.Queue; import java.util.Queue;
import java.util.Set; import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.ConcurrentLinkedQueue;
@ -55,6 +56,8 @@ import org.apache.lucene.index.DocValuesUpdate.BinaryDocValuesUpdate;
import org.apache.lucene.index.DocValuesUpdate.NumericDocValuesUpdate; import org.apache.lucene.index.DocValuesUpdate.NumericDocValuesUpdate;
import org.apache.lucene.index.FieldInfos.FieldNumbers; import org.apache.lucene.index.FieldInfos.FieldNumbers;
import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.MergePolicy.MergeReader;
import org.apache.lucene.index.Sorter.DocMap;
import org.apache.lucene.internal.tests.IndexPackageAccess; import org.apache.lucene.internal.tests.IndexPackageAccess;
import org.apache.lucene.internal.tests.IndexWriterAccess; import org.apache.lucene.internal.tests.IndexWriterAccess;
import org.apache.lucene.internal.tests.TestSecrets; import org.apache.lucene.internal.tests.TestSecrets;
@ -3413,8 +3416,20 @@ public class IndexWriter
Collections.emptyMap(), Collections.emptyMap(),
config.getIndexSort()); config.getIndexSort());
List<CodecReader> readers = List<CodecReader> readers = new ArrayList<>();
merge.getMergeReader().stream().map(r -> r.codecReader).collect(Collectors.toList()); for (MergeReader mr : merge.getMergeReader()) {
CodecReader reader = merge.wrapForMerge(mr.codecReader);
readers.add(reader);
}
if (config.getIndexSort() == null && readers.isEmpty() == false) {
CodecReader mergedReader = SlowCompositeCodecReaderWrapper.wrap(readers);
DocMap docMap = merge.reorder(mergedReader, directory);
if (docMap != null) {
readers = Collections.singletonList(SortingCodecReader.wrap(mergedReader, docMap, null));
}
}
SegmentMerger merger = SegmentMerger merger =
new SegmentMerger(readers, segInfo, infoStream, trackingDir, globalFieldNumberMap, context); new SegmentMerger(readers, segInfo, infoStream, trackingDir, globalFieldNumberMap, context);
@ -3464,6 +3479,8 @@ public class IndexWriter
merge.getMergeInfo().info.setUseCompoundFile(true); merge.getMergeInfo().info.setUseCompoundFile(true);
} }
merge.setMergeInfo(merge.info);
// Have codec write SegmentInfo. Must do this after // Have codec write SegmentInfo. Must do this after
// creating CFS so that 1) .si isn't slurped into CFS, // creating CFS so that 1) .si isn't slurped into CFS,
// and 2) .si reflects useCompoundFile=true change // and 2) .si reflects useCompoundFile=true change
@ -3791,7 +3808,7 @@ public class IndexWriter
new OneMergeWrappingMergePolicy( new OneMergeWrappingMergePolicy(
config.getMergePolicy(), config.getMergePolicy(),
toWrap -> toWrap ->
new MergePolicy.OneMerge(toWrap.segments) { new MergePolicy.OneMerge(toWrap) {
SegmentCommitInfo origInfo; SegmentCommitInfo origInfo;
final AtomicBoolean onlyOnce = new AtomicBoolean(false); final AtomicBoolean onlyOnce = new AtomicBoolean(false);
@ -3890,6 +3907,18 @@ public class IndexWriter
public CodecReader wrapForMerge(CodecReader reader) throws IOException { public CodecReader wrapForMerge(CodecReader reader) throws IOException {
return toWrap.wrapForMerge(reader); // must delegate return toWrap.wrapForMerge(reader); // must delegate
} }
// Hands the reorder request to the wrapped merge and returns exactly what it produced.
@Override
public Sorter.DocMap reorder(CodecReader reader, Directory dir) throws IOException {
  // must delegate
  final Sorter.DocMap delegated = toWrap.reorder(reader, dir);
  return delegated;
}
// Passes the given segment info to the superclass implementation first, then to the wrapped
// merge, so both receive the same SegmentCommitInfo.
@Override
public void setMergeInfo(SegmentCommitInfo info) {
super.setMergeInfo(info);
// forward to the wrapped merge as well
toWrap.setMergeInfo(info);
}
}), }),
trigger, trigger,
UNBOUNDED_MAX_MERGE_SEGMENTS); UNBOUNDED_MAX_MERGE_SEGMENTS);
@ -4312,7 +4341,7 @@ public class IndexWriter
* merge.info). If no deletes were flushed, no new deletes file is saved. * merge.info). If no deletes were flushed, no new deletes file is saved.
*/ */
private synchronized ReadersAndUpdates commitMergedDeletesAndUpdates( private synchronized ReadersAndUpdates commitMergedDeletesAndUpdates(
MergePolicy.OneMerge merge, MergeState mergeState) throws IOException { MergePolicy.OneMerge merge, MergeState.DocMap[] docMaps) throws IOException {
mergeFinishedGen.incrementAndGet(); mergeFinishedGen.incrementAndGet();
@ -4336,7 +4365,7 @@ public class IndexWriter
boolean anyDVUpdates = false; boolean anyDVUpdates = false;
assert sourceSegments.size() == mergeState.docMaps.length; assert sourceSegments.size() == docMaps.length;
for (int i = 0; i < sourceSegments.size(); i++) { for (int i = 0; i < sourceSegments.size(); i++) {
SegmentCommitInfo info = sourceSegments.get(i); SegmentCommitInfo info = sourceSegments.get(i);
minGen = Math.min(info.getBufferedDeletesGen(), minGen); minGen = Math.min(info.getBufferedDeletesGen(), minGen);
@ -4346,12 +4375,11 @@ public class IndexWriter
// the pool: // the pool:
assert rld != null : "seg=" + info.info.name; assert rld != null : "seg=" + info.info.name;
MergeState.DocMap segDocMap = mergeState.docMaps[i]; MergeState.DocMap segDocMap = docMaps[i];
carryOverHardDeletes( carryOverHardDeletes(
mergedDeletesAndUpdates, mergedDeletesAndUpdates,
maxDoc, maxDoc,
mergeState.liveDocs[i],
merge.getMergeReader().get(i).hardLiveDocs, merge.getMergeReader().get(i).hardLiveDocs,
rld.getHardLiveDocs(), rld.getHardLiveDocs(),
segDocMap); segDocMap);
@ -4454,26 +4482,21 @@ public class IndexWriter
private static void carryOverHardDeletes( private static void carryOverHardDeletes(
ReadersAndUpdates mergedReadersAndUpdates, ReadersAndUpdates mergedReadersAndUpdates,
int maxDoc, int maxDoc,
Bits mergeLiveDocs, // the liveDocs used to build the segDocMaps
Bits prevHardLiveDocs, // the hard deletes when the merge reader was pulled Bits prevHardLiveDocs, // the hard deletes when the merge reader was pulled
Bits currentHardLiveDocs, // the current hard deletes Bits currentHardLiveDocs, // the current hard deletes
MergeState.DocMap segDocMap) MergeState.DocMap segDocMap)
throws IOException { throws IOException {
assert mergeLiveDocs == null || mergeLiveDocs.length() == maxDoc;
// if we mix soft and hard deletes we need to make sure that we only carry over deletes // if we mix soft and hard deletes we need to make sure that we only carry over deletes
// that were not deleted before. Otherwise the segDocMap doesn't contain a mapping. // that were not deleted before. Otherwise the segDocMap doesn't contain a mapping.
// yet this is also required if any MergePolicy modifies the liveDocs since this is // yet this is also required if any MergePolicy modifies the liveDocs since this is
// what the segDocMap is built on. // what the segDocMap is built on.
final IntPredicate carryOverDelete = final IntPredicate carryOverDelete =
mergeLiveDocs == null || mergeLiveDocs == prevHardLiveDocs docId -> segDocMap.get(docId) != -1 && currentHardLiveDocs.get(docId) == false;
? docId -> currentHardLiveDocs.get(docId) == false
: docId -> mergeLiveDocs.get(docId) && currentHardLiveDocs.get(docId) == false;
if (prevHardLiveDocs != null) { if (prevHardLiveDocs != null) {
// If we had deletions on starting the merge we must // If we had deletions on starting the merge we must
// still have deletions now: // still have deletions now:
assert currentHardLiveDocs != null; assert currentHardLiveDocs != null;
assert mergeLiveDocs != null;
assert prevHardLiveDocs.length() == maxDoc; assert prevHardLiveDocs.length() == maxDoc;
assert currentHardLiveDocs.length() == maxDoc; assert currentHardLiveDocs.length() == maxDoc;
@ -4516,7 +4539,7 @@ public class IndexWriter
} }
@SuppressWarnings("try") @SuppressWarnings("try")
private synchronized boolean commitMerge(MergePolicy.OneMerge merge, MergeState mergeState) private synchronized boolean commitMerge(MergePolicy.OneMerge merge, MergeState.DocMap[] docMaps)
throws IOException { throws IOException {
merge.onMergeComplete(); merge.onMergeComplete();
testPoint("startCommitMerge"); testPoint("startCommitMerge");
@ -4559,7 +4582,7 @@ public class IndexWriter
} }
final ReadersAndUpdates mergedUpdates = final ReadersAndUpdates mergedUpdates =
merge.info.info.maxDoc() == 0 ? null : commitMergedDeletesAndUpdates(merge, mergeState); merge.info.info.maxDoc() == 0 ? null : commitMergedDeletesAndUpdates(merge, docMaps);
// If the doc store we are using has been closed and // If the doc store we are using has been closed and
// is now in compound format (but wasn't when we // is now in compound format (but wasn't when we
@ -5163,12 +5186,57 @@ public class IndexWriter
} }
mergeReaders.add(wrappedReader); mergeReaders.add(wrappedReader);
} }
MergeState.DocMap[] reorderDocMaps = null;
if (config.getIndexSort() == null) {
// Create a merged view of the input segments. This effectively does the merge.
CodecReader mergedView = SlowCompositeCodecReaderWrapper.wrap(mergeReaders);
Sorter.DocMap docMap = merge.reorder(mergedView, directory);
if (docMap != null) {
reorderDocMaps = new MergeState.DocMap[mergeReaders.size()];
int docBase = 0;
int i = 0;
for (CodecReader reader : mergeReaders) {
final int currentDocBase = docBase;
reorderDocMaps[i] =
docID -> {
Objects.checkIndex(docID, reader.maxDoc());
return docMap.oldToNew(currentDocBase + docID);
};
i++;
docBase += reader.maxDoc();
}
// This makes merging more expensive as it disables some bulk merging optimizations, so
// only do this if a non-null DocMap is returned.
mergeReaders =
Collections.singletonList(SortingCodecReader.wrap(mergedView, docMap, null));
}
}
final SegmentMerger merger = final SegmentMerger merger =
new SegmentMerger( new SegmentMerger(
mergeReaders, merge.info.info, infoStream, dirWrapper, globalFieldNumberMap, context); mergeReaders, merge.info.info, infoStream, dirWrapper, globalFieldNumberMap, context);
merge.info.setSoftDelCount(Math.toIntExact(softDeleteCount.get())); merge.info.setSoftDelCount(Math.toIntExact(softDeleteCount.get()));
merge.checkAborted(); merge.checkAborted();
MergeState mergeState = merger.mergeState;
MergeState.DocMap[] docMaps;
if (reorderDocMaps == null) {
docMaps = mergeState.docMaps;
} else {
// Since the reader was reordered, we passed a merged view to MergeState and from its
// perspective there is a single input segment to the merge and the
// SlowCompositeCodecReaderWrapper is effectively doing the merge.
assert mergeState.docMaps.length == 1
: "Got " + mergeState.docMaps.length + " docMaps, but expected 1";
MergeState.DocMap compactionDocMap = mergeState.docMaps[0];
docMaps = new MergeState.DocMap[reorderDocMaps.length];
for (int i = 0; i < docMaps.length; ++i) {
MergeState.DocMap reorderDocMap = reorderDocMaps[i];
docMaps[i] = docID -> compactionDocMap.get(reorderDocMap.get(docID));
}
}
merge.mergeStartNS = System.nanoTime(); merge.mergeStartNS = System.nanoTime();
// This is where all the work happens: // This is where all the work happens:
@ -5176,7 +5244,6 @@ public class IndexWriter
merger.merge(); merger.merge();
} }
MergeState mergeState = merger.mergeState;
assert mergeState.segmentInfo == merge.info.info; assert mergeState.segmentInfo == merge.info.info;
merge.info.info.setFiles(new HashSet<>(dirWrapper.getCreatedFiles())); merge.info.info.setFiles(new HashSet<>(dirWrapper.getCreatedFiles()));
Codec codec = config.getCodec(); Codec codec = config.getCodec();
@ -5229,7 +5296,7 @@ public class IndexWriter
// Merge would produce a 0-doc segment, so we do nothing except commit the merge to remove // Merge would produce a 0-doc segment, so we do nothing except commit the merge to remove
// all the 0-doc segments that we "merged": // all the 0-doc segments that we "merged":
assert merge.info.info.maxDoc() == 0; assert merge.info.info.maxDoc() == 0;
success = commitMerge(merge, mergeState); success = commitMerge(merge, docMaps);
return 0; return 0;
} }
@ -5309,6 +5376,8 @@ public class IndexWriter
success = false; success = false;
} }
merge.setMergeInfo(merge.info);
// Have codec write SegmentInfo. Must do this after // Have codec write SegmentInfo. Must do this after
// creating CFS so that 1) .si isn't slurped into CFS, // creating CFS so that 1) .si isn't slurped into CFS,
// and 2) .si reflects useCompoundFile=true change // and 2) .si reflects useCompoundFile=true change
@ -5352,7 +5421,7 @@ public class IndexWriter
} }
} }
if (!commitMerge(merge, mergeState)) { if (!commitMerge(merge, docMaps)) {
// commitMerge will return false if this merge was // commitMerge will return false if this merge was
// aborted // aborted
return 0; return 0;

View File

@ -255,6 +255,15 @@ public abstract class MergePolicy {
usesPooledReaders = false; usesPooledReaders = false;
} }
/**
 * Wrapping constructor: creates a {@code OneMerge} that takes its segments, merge readers, total
 * max doc and pooled-readers flag from {@code oneMerge}.
 *
 * <p>The {@code segments} and {@code mergeReaders} references are reused as-is (no copy is made).
 * Merge progress is not taken from {@code oneMerge}; this instance gets its own new {@link
 * OneMergeProgress}.
 *
 * @param oneMerge the merge to wrap
 */
protected OneMerge(OneMerge oneMerge) {
  // State carried over from the wrapped merge.
  this.usesPooledReaders = oneMerge.usesPooledReaders;
  this.totalMaxDoc = oneMerge.totalMaxDoc;
  this.mergeReaders = oneMerge.mergeReaders;
  this.segments = oneMerge.segments;
  // State that is always fresh for the wrapper.
  this.mergeProgress = new OneMergeProgress();
}
/** /**
* Called by {@link IndexWriter} after the merge started and from the thread that will be * Called by {@link IndexWriter} after the merge started and from the thread that will be
* executing the merge. * executing the merge.
@ -288,11 +297,32 @@ public abstract class MergePolicy {
} }
} }
/** Wrap the reader in order to add/remove information to the merged segment. */ /**
* Wrap a reader prior to merging in order to add/remove fields or documents.
*
* <p><b>NOTE:</b> It is illegal to reorder doc IDs here, use {@link
* #reorder(CodecReader,Directory)} instead.
*/
public CodecReader wrapForMerge(CodecReader reader) throws IOException { public CodecReader wrapForMerge(CodecReader reader) throws IOException {
return reader; return reader;
} }
/**
* Extend this method if you wish to renumber doc IDs. This method will be called when index
* sorting is disabled on a merged view of the {@link OneMerge}. A {@code null} return value
* indicates that doc IDs should not be reordered.
*
* <p>The default implementation always returns {@code null}, i.e. it requests no reordering.
*
* <p><b>NOTE:</b> Returning a non-null value here disables several optimizations and increases
* the merging overhead.
*
* @param reader The reader to reorder.
* @param dir The {@link Directory} of the index, which may be used to create temporary files.
* @return a {@link Sorter.DocMap} describing the new doc ID order, or {@code null} to leave doc
*     IDs in their current order
* @throws IOException declared so that overriding implementations may perform I/O; the default
*     implementation does not throw
* @lucene.experimental
*/
public Sorter.DocMap reorder(CodecReader reader, Directory dir) throws IOException {
return null;
}
/** /**
* Expert: Sets the {@link SegmentCommitInfo} of the merged segment. Allows sub-classes to e.g. * Expert: Sets the {@link SegmentCommitInfo} of the merged segment. Allows sub-classes to e.g.
* {@link SegmentInfo#addDiagnostics(Map) add diagnostic} properties. * {@link SegmentInfo#addDiagnostics(Map) add diagnostic} properties.
@ -355,11 +385,7 @@ public abstract class MergePolicy {
* not indicate the number of documents after the merge. * not indicate the number of documents after the merge.
*/ */
public int totalNumDocs() { public int totalNumDocs() {
int total = 0; return totalMaxDoc;
for (SegmentCommitInfo info : segments) {
total += info.info.maxDoc();
}
return total;
} }
/** Return {@link MergeInfo} describing this merge. */ /** Return {@link MergeInfo} describing this merge. */

View File

@ -177,16 +177,13 @@ public class MergeState {
final int docBase = totalDocs; final int docBase = totalDocs;
docMaps[i] = docMaps[i] =
new DocMap() { docID -> {
@Override if (liveDocs == null) {
public int get(int docID) { return docBase + docID;
if (liveDocs == null) { } else if (liveDocs.get(docID)) {
return docBase + docID; return docBase + (int) delDocMap.get(docID);
} else if (liveDocs.get(docID)) { } else {
return docBase + (int) delDocMap.get(docID); return -1;
} else {
return -1;
}
} }
}; };
totalDocs += reader.numDocs(); totalDocs += reader.numDocs();
@ -242,13 +239,10 @@ public class MergeState {
} }
/** A map of doc IDs. */ /** A map of doc IDs. */
public abstract static class DocMap { @FunctionalInterface
/** Sole constructor. (For invocation by subclass constructors, typically implicit.) */ public interface DocMap {
// Explicitly declared so that we have non-empty javadoc
protected DocMap() {}
/** Return the mapped docID or -1 if the given doc is not mapped. */ /** Return the mapped docID or -1 if the given doc is not mapped. */
public abstract int get(int docID); int get(int docID);
} }
static PackedLongValues removeDeletes(final int maxDoc, final Bits liveDocs) { static PackedLongValues removeDeletes(final int maxDoc, final Bits liveDocs) {

View File

@ -122,14 +122,11 @@ final class MultiSorter {
final PackedLongValues remapped = builders[i].build(); final PackedLongValues remapped = builders[i].build();
final Bits liveDocs = readers.get(i).getLiveDocs(); final Bits liveDocs = readers.get(i).getLiveDocs();
docMaps[i] = docMaps[i] =
new MergeState.DocMap() { docID -> {
@Override if (liveDocs == null || liveDocs.get(docID)) {
public int get(int docID) { return (int) remapped.get(docID);
if (liveDocs == null || liveDocs.get(docID)) { } else {
return (int) remapped.get(docID); return -1;
} else {
return -1;
}
} }
}; };
} }

View File

@ -325,7 +325,6 @@ public abstract class PointValues {
/** Notifies the caller that this many documents are about to be visited */ /** Notifies the caller that this many documents are about to be visited */
default void grow(int count) {} default void grow(int count) {}
;
} }
/** /**

View File

@ -526,7 +526,6 @@ final class ReadersAndUpdates {
return docIDOut; return docIDOut;
} }
} }
;
private synchronized Set<String> writeFieldInfosGen( private synchronized Set<String> writeFieldInfosGen(
FieldInfos fieldInfos, Directory dir, FieldInfosFormat infosFormat) throws IOException { FieldInfos fieldInfos, Directory dir, FieldInfosFormat infosFormat) throws IOException {

View File

@ -122,7 +122,7 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
static final int VERSION_CURRENT = VERSION_86; static final int VERSION_CURRENT = VERSION_86;
/** Name of the generation reference file name */ /** Name of the generation reference file name */
private static final String OLD_SEGMENTS_GEN = "segments.gen"; static final String OLD_SEGMENTS_GEN = "segments.gen";
/** Used to name new segments. */ /** Used to name new segments. */
public long counter; public long counter;
@ -146,7 +146,7 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
* *
* @see #setInfoStream * @see #setInfoStream
*/ */
private static PrintStream infoStream = null; private static PrintStream infoStream;
/** Id for this commit; only written starting with Lucene 5.0 */ /** Id for this commit; only written starting with Lucene 5.0 */
private byte[] id; private byte[] id;
@ -1010,6 +1010,7 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
void replace(SegmentInfos other) { void replace(SegmentInfos other) {
rollbackSegmentInfos(other.asList()); rollbackSegmentInfos(other.asList());
lastGeneration = other.lastGeneration; lastGeneration = other.lastGeneration;
userData = other.userData;
} }
/** Returns sum of all segment's maxDocs. Note that this does not include deletions */ /** Returns sum of all segment's maxDocs. Note that this does not include deletions */

File diff suppressed because it is too large Load Diff

View File

@ -24,6 +24,7 @@ import java.util.Arrays;
import java.util.HashMap; import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
import java.util.Objects;
import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.KnnVectorsReader;
@ -77,7 +78,7 @@ public final class SortingCodecReader extends FilterCodecReader {
private final Sorter.DocMap docMap; private final Sorter.DocMap docMap;
SortingPointValues(final PointValues in, Sorter.DocMap docMap) { SortingPointValues(final PointValues in, Sorter.DocMap docMap) {
this.in = in; this.in = Objects.requireNonNull(in);
this.docMap = docMap; this.docMap = docMap;
} }
@ -472,6 +473,10 @@ public final class SortingCodecReader extends FilterCodecReader {
@Override @Override
public PointValues getValues(String field) throws IOException { public PointValues getValues(String field) throws IOException {
var values = delegate.getValues(field);
if (values == null) {
return null;
}
return new SortingPointValues(delegate.getValues(field), docMap); return new SortingPointValues(delegate.getValues(field), docMap);
} }

View File

@ -85,7 +85,11 @@ public final class IndexOrDocValuesQuery extends Query {
@Override @Override
public String toString(String field) { public String toString(String field) {
return indexQuery.toString(field); return "IndexOrDocValuesQuery(indexQuery="
+ indexQuery.toString(field)
+ ", dvQuery="
+ dvQuery.toString(field)
+ ")";
} }
@Override @Override

View File

@ -19,7 +19,6 @@ package org.apache.lucene.search;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collection;
import java.util.Collections; import java.util.Collections;
import java.util.Comparator; import java.util.Comparator;
import java.util.List; import java.util.List;
@ -62,9 +61,9 @@ import org.apache.lucene.util.automaton.ByteRunAutomaton;
* match lots of documents, counting the number of hits may take much longer than computing the top * match lots of documents, counting the number of hits may take much longer than computing the top
* hits so this trade-off allows to get some minimal information about the hit count without slowing * hits so this trade-off allows to get some minimal information about the hit count without slowing
* down search too much. The {@link TopDocs#scoreDocs} array is always accurate however. If this * down search too much. The {@link TopDocs#scoreDocs} array is always accurate however. If this
* behavior doesn't suit your needs, you should create collectors manually with either {@link * behavior doesn't suit your needs, you should create collectorManagers manually with either {@link
* TopScoreDocCollector#create} or {@link TopFieldCollector#create} and call {@link #search(Query, * TopScoreDocCollectorManager} or {@link TopFieldCollectorManager} and call {@link #search(Query,
* Collector)}. * CollectorManager)}.
* *
* <p><a id="thread-safety"></a> * <p><a id="thread-safety"></a>
* *
@ -455,35 +454,10 @@ public class IndexSearcher {
} }
final int cappedNumHits = Math.min(numHits, limit); final int cappedNumHits = Math.min(numHits, limit);
final boolean supportsConcurrency = getSlices().length > 1;
final LeafSlice[] leafSlices = getSlices(); CollectorManager<TopScoreDocCollector, TopDocs> manager =
final CollectorManager<TopScoreDocCollector, TopDocs> manager = new TopScoreDocCollectorManager(
new CollectorManager<TopScoreDocCollector, TopDocs>() { cappedNumHits, after, TOTAL_HITS_THRESHOLD, supportsConcurrency);
private final HitsThresholdChecker hitsThresholdChecker =
leafSlices.length <= 1
? HitsThresholdChecker.create(Math.max(TOTAL_HITS_THRESHOLD, numHits))
: HitsThresholdChecker.createShared(Math.max(TOTAL_HITS_THRESHOLD, numHits));
private final MaxScoreAccumulator minScoreAcc =
leafSlices.length <= 1 ? null : new MaxScoreAccumulator();
@Override
public TopScoreDocCollector newCollector() throws IOException {
return TopScoreDocCollector.create(
cappedNumHits, after, hitsThresholdChecker, minScoreAcc);
}
@Override
public TopDocs reduce(Collection<TopScoreDocCollector> collectors) throws IOException {
final TopDocs[] topDocs = new TopDocs[collectors.size()];
int i = 0;
for (TopScoreDocCollector collector : collectors) {
topDocs[i++] = collector.topDocs();
}
return TopDocs.merge(0, cappedNumHits, topDocs);
}
};
return search(query, manager); return search(query, manager);
} }
@ -510,7 +484,10 @@ public class IndexSearcher {
* *
* @throws TooManyClauses If a query would exceed {@link IndexSearcher#getMaxClauseCount()} * @throws TooManyClauses If a query would exceed {@link IndexSearcher#getMaxClauseCount()}
* clauses. * clauses.
* @deprecated This method is being deprecated in favor of {@link IndexSearcher#search(Query,
* CollectorManager)} due to its support for concurrency in IndexSearcher
*/ */
@Deprecated
public void search(Query query, Collector results) throws IOException { public void search(Query query, Collector results) throws IOException {
query = rewrite(query, results.scoreMode().needsScores()); query = rewrite(query, results.scoreMode().needsScores());
search(leafContexts, createWeight(query, results.scoreMode(), 1), results); search(leafContexts, createWeight(query, results.scoreMode(), 1), results);
@ -602,34 +579,10 @@ public class IndexSearcher {
final Sort rewrittenSort = sort.rewrite(this); final Sort rewrittenSort = sort.rewrite(this);
final LeafSlice[] leafSlices = getSlices(); final LeafSlice[] leafSlices = getSlices();
final boolean supportsConcurrency = leafSlices.length > 1;
final CollectorManager<TopFieldCollector, TopFieldDocs> manager = final CollectorManager<TopFieldCollector, TopFieldDocs> manager =
new CollectorManager<>() { new TopFieldCollectorManager(
rewrittenSort, cappedNumHits, after, TOTAL_HITS_THRESHOLD, supportsConcurrency);
private final HitsThresholdChecker hitsThresholdChecker =
leafSlices.length <= 1
? HitsThresholdChecker.create(Math.max(TOTAL_HITS_THRESHOLD, numHits))
: HitsThresholdChecker.createShared(Math.max(TOTAL_HITS_THRESHOLD, numHits));
private final MaxScoreAccumulator minScoreAcc =
leafSlices.length <= 1 ? null : new MaxScoreAccumulator();
@Override
public TopFieldCollector newCollector() throws IOException {
// TODO: don't pay the price for accurate hit counts by default
return TopFieldCollector.create(
rewrittenSort, cappedNumHits, after, hitsThresholdChecker, minScoreAcc);
}
@Override
public TopFieldDocs reduce(Collection<TopFieldCollector> collectors) throws IOException {
final TopFieldDocs[] topDocs = new TopFieldDocs[collectors.size()];
int i = 0;
for (TopFieldCollector collector : collectors) {
topDocs[i++] = collector.topDocs();
}
return TopDocs.merge(rewrittenSort, 0, cappedNumHits, topDocs);
}
};
TopFieldDocs topDocs = search(query, manager); TopFieldDocs topDocs = search(query, manager);
if (doDocScores) { if (doDocScores) {

View File

@ -69,7 +69,6 @@ public abstract class PointInSetQuery extends Query implements Accountable {
@Override @Override
public abstract BytesRef next(); public abstract BytesRef next();
} }
;
/** The {@code packedPoints} iterator must be in sorted order. */ /** The {@code packedPoints} iterator must be in sorted order. */
protected PointInSetQuery(String field, int numDims, int bytesPerDim, Stream packedPoints) { protected PointInSetQuery(String field, int numDims, int bytesPerDim, Stream packedPoints) {

Some files were not shown because too many files have changed in this diff Show More