mirror of https://github.com/apache/lucene.git
Merge branch 'main' into java_21
Commit 40c03b0e6c

@@ -117,6 +117,9 @@ apply from: file('buildSrc/scriptDepVersions.gradle')

apply from: file('gradle/generation/local-settings.gradle')

// Make sure the build environment is consistent.
apply from: file('gradle/validation/check-environment.gradle')

// IDE support, settings and specials.
apply from: file('gradle/ide/intellij-idea.gradle')
apply from: file('gradle/ide/eclipse.gradle')

@@ -38,3 +38,9 @@ dependencies {
  implementation "commons-codec:commons-codec:${scriptDepVersions['commons-codec']}"
}

if (!rootProject.hasJavaFlightRecorder) {
  logger.warn('Module jdk.jfr is not available; skipping compilation of Java Flight Recorder support.')
  tasks.named('compileJava').configure {
    exclude('**/ProfileResults.java')
  }
}

@@ -24,7 +24,7 @@ ext {
    "apache-rat": "0.14",
    "asm": "9.6",
    "commons-codec": "1.13",
    "ecj": "3.36.0-SNAPSHOT",
    "ecj": "3.36.0",
    "flexmark": "0.61.24",
    "javacc": "7.0.12",
    "jflex": "1.8.2",

@ -15,20 +15,18 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.gradle.ProfileResults;
|
||||
|
||||
def recordings = files()
|
||||
|
||||
allprojects {
|
||||
plugins.withType(JavaPlugin) {
|
||||
ext {
|
||||
testOptions += [
|
||||
[propName: 'tests.profile', value: false, description: "Enable java flight recorder profiling."]
|
||||
[propName: 'tests.profile', value: false, description: "Enable Java Flight Recorder profiling."]
|
||||
]
|
||||
}
|
||||
|
||||
if (resolvedTestOption("tests.profile").toBoolean()) {
|
||||
allprojects {
|
||||
if (rootProject.hasJavaFlightRecorder) {
|
||||
tasks.withType(Test) {
|
||||
jvmArgs("-XX:StartFlightRecording=dumponexit=true,maxsize=250M,settings=" + rootProject.file("gradle/testing/profiling.jfc"),
|
||||
"-XX:+UnlockDiagnosticVMOptions",
|
||||
|
@ -41,6 +39,8 @@ allprojects {
|
|||
recordings = recordings.plus fileTree(dir: workingDir, include: '*.jfr')
|
||||
}
|
||||
}
|
||||
} else {
|
||||
throw new GradleException('Module jdk.jfr is not available; Java Flight Recorder profiles cannot be enabled.')
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -48,10 +48,11 @@ allprojects {
|
|||
|
||||
gradle.buildFinished {
|
||||
if (!recordings.isEmpty()) {
|
||||
ProfileResults.printReport(recordings.getFiles().collect { it.toString() },
|
||||
propertyOrDefault(ProfileResults.MODE_KEY, ProfileResults.MODE_DEFAULT) as String,
|
||||
Integer.parseInt(propertyOrDefault(ProfileResults.STACKSIZE_KEY, ProfileResults.STACKSIZE_DEFAULT)),
|
||||
Integer.parseInt(propertyOrDefault(ProfileResults.COUNT_KEY, ProfileResults.COUNT_DEFAULT)),
|
||||
Boolean.parseBoolean(propertyOrDefault(ProfileResults.LINENUMBERS_KEY, ProfileResults.LINENUMBERS_DEFAULT)))
|
||||
def pr = org.apache.lucene.gradle.ProfileResults;
|
||||
pr.printReport(recordings.getFiles().collect { it.toString() },
|
||||
propertyOrDefault(pr.MODE_KEY, pr.MODE_DEFAULT) as String,
|
||||
Integer.parseInt(propertyOrDefault(pr.STACKSIZE_KEY, pr.STACKSIZE_DEFAULT)),
|
||||
Integer.parseInt(propertyOrDefault(pr.COUNT_KEY, pr.COUNT_DEFAULT)),
|
||||
Boolean.parseBoolean(propertyOrDefault(pr.LINENUMBERS_KEY, pr.LINENUMBERS_DEFAULT)))
|
||||
}
|
||||
}
|
||||
|
|
|
@ -23,8 +23,6 @@ grant {
|
|||
// jetty-specific:
|
||||
permission java.lang.RuntimePermission "getenv.JETTY_AVAILABLE_PROCESSORS";
|
||||
permission java.lang.RuntimePermission "getenv.JETTY_WORKER_INSTANCE";
|
||||
// servlet stuff
|
||||
permission java.lang.RuntimePermission "setContextClassLoader";
|
||||
// allow TestNRTReplication fork its jvm
|
||||
permission java.io.FilePermission "${java.home}${/}-", "read,execute";
|
||||
// read/write access to all system properties (required by jetty in these tests)
|
||||
|
|
|
@ -50,14 +50,11 @@ grant {
|
|||
permission java.lang.RuntimePermission "getStackTrace";
|
||||
// needed for mock filesystems in tests
|
||||
permission java.lang.RuntimePermission "fileSystemProvider";
|
||||
// analyzers/uima: needed by lucene expressions' JavascriptCompiler
|
||||
permission java.lang.RuntimePermission "createClassLoader";
|
||||
// needed to test unmap hack on platforms that support it
|
||||
permission java.lang.RuntimePermission "accessClassInPackage.sun.misc";
|
||||
permission java.lang.reflect.ReflectPermission "suppressAccessChecks";
|
||||
// needed by cyberneko usage by benchmarks on J9
|
||||
permission java.lang.RuntimePermission "accessClassInPackage.org.apache.xerces.util";
|
||||
permission java.lang.RuntimePermission "getClassLoader";
|
||||
|
||||
// Needed for loading native library (lucene:misc:native) in lucene:misc
|
||||
permission java.lang.RuntimePermission "getFileStoreAttributes";
|
||||
|
@ -111,6 +108,8 @@ grant {
|
|||
permission java.lang.RuntimePermission "shutdownHooks";
|
||||
// needed by jacoco to instrument classes
|
||||
permission java.lang.RuntimePermission "defineClass";
|
||||
// needed by jacoco for God knows what.
|
||||
permission java.lang.RuntimePermission "createClassLoader";
|
||||
};
|
||||
|
||||
// Grant all permissions to Gradle test runner classes.
|
||||
|
|
|
@@ -23,6 +23,7 @@ import org.gradle.util.GradleVersion
configure(rootProject) {
  ext {
    expectedGradleVersion = '8.4'
    hasJavaFlightRecorder = ModuleLayer.boot().findModule('jdk.jfr').map(this.class.module::canRead).orElse(false)
  }

  wrapper {

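The `hasJavaFlightRecorder` property above probes the boot module layer for the `jdk.jfr` module. A minimal stand-alone Java sketch of the same check (the `JfrProbe` class name is illustrative):

```java
// Sketch of the jdk.jfr availability probe used above (plain Java, outside Gradle).
public class JfrProbe {
  public static void main(String[] args) {
    boolean hasJavaFlightRecorder =
        ModuleLayer.boot()
            .findModule("jdk.jfr")
            // can our own module read jdk.jfr?
            .map(m -> JfrProbe.class.getModule().canRead(m))
            .orElse(false);
    System.out.println("jdk.jfr available: " + hasJavaFlightRecorder);
  }
}
```
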
@ -17,8 +17,8 @@
|
|||
|
||||
def skipReason
|
||||
|
||||
if (rootProject.usesAltJvm && rootProject.runtimeJavaVersion > JavaVersion.VERSION_15) {
|
||||
skipReason = "won't work with JDK ${rootProject.runtimeJavaVersion} if used as alternative java toolchain"
|
||||
if (rootProject.usesAltJvm) {
|
||||
skipReason = "won't work with alternative java toolchain"
|
||||
}
|
||||
|
||||
if (!propertyOrDefault("validation.errorprone", isCIBuild).asBoolean()) {
|
||||
|
@ -37,7 +37,7 @@ if (skipReason) {
|
|||
|
||||
allprojects { prj ->
|
||||
plugins.withType(JavaPlugin) {
|
||||
// LUCENE-9650: Errorprone on master/gradle does not work with JDK-16+ when running as plugin
|
||||
// LUCENE-9650: Errorprone on master/gradle does not work when running as plugin
|
||||
// inside a forked Javac process. Javac running inside Gradle works, because we have
|
||||
// additional module system opens in place.
|
||||
// This is a hack to keep the dependency (so that palantir's version check doesn't complain)
|
||||
|
|
|
@ -59,6 +59,9 @@ allprojects {
|
|||
}
|
||||
|
||||
subprojects {
|
||||
// initialize empty, because no checks for benchmark-jmh module.
|
||||
ext.jarInfos = []
|
||||
|
||||
// Configure jarValidation configuration for all projects. Any dependency
|
||||
// declared on this configuration (or any configuration it extends from) will
|
||||
// be verified.
|
||||
|
|
|
@ -61,6 +61,7 @@ Otherwise you are stuck wrestling down full dependencies of OpenJDK (metal etc)
|
|||
Also you must run benchmarks as root to use dtrace, but it works.
|
||||
|
||||
$ git clone --depth 1 https://github.com/openjdk/jdk/
|
||||
$ curl -f https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz | tar -zxf -
|
||||
$ curl -fo jdk/src/utils/hsdis/binutils/Makefile https://raw.githubusercontent.com/openjdk/jdk/3c7ae1225f0d5575fd927a9b76fb40dc30e208cd/src/utils/hsdis/Makefile
|
||||
$ vi jdk/src/utils/hsdis/binutils/Makefile, change SOURCE = hsdis.c to SOURCE = hsdis-binutils.c
|
||||
$ vi jdk/src/utils/hsdis/binutils/hsdis-binutils.c, change #include "hsdis.h" to #include "../hsdis.h"
|
||||
|
|
|
@ -7,7 +7,6 @@ http://s.apache.org/luceneversions
|
|||
|
||||
API Changes
|
||||
---------------------
|
||||
|
||||
* LUCENE-12092: Remove deprecated UTF8TaxonomyWriterCache. Please use LruTaxonomyWriterCache
|
||||
instead. (Vigya Sharma)
|
||||
|
||||
|
@ -62,10 +61,21 @@ API Changes
|
|||
|
||||
* GITHUB#12599: Add RandomAccessInput#readBytes method to the RandomAccessInput interface. (Ignacio Vera)
|
||||
|
||||
* GITHUB#12709: Consolidate FSTStore and BytesStore in FST. Created FSTReader which contains the common methods
|
||||
of the two (Anh Dung Bui)
|
||||
* GITHUB#11023: Adding -level param to CheckIndex, making the old -fast param the default behaviour. (Jakub Slowinski)
|
||||
|
||||
* GITHUB#12735: Remove FSTCompiler#getTermCount() and FSTCompiler.UnCompiledNode#inputCount (Anh Dung Bui)
|
||||
* GITHUB#12873: Expressions module now uses MethodHandles to define custom functions. Support for
|
||||
custom classloaders was removed. (Uwe Schindler)
|
||||
|
||||
* GITHUB#12243: Remove TermInSetQuery ctors taking varargs param. SortedSetDocValuesField#newSlowSetQuery,
|
||||
SortedDocValuesField#newSlowSetQuery, KeywordField#newSetQuery, KeywordField#newSetQuery now take a collection. (Jakub Slowinski)
|
||||
|
||||
* GITHUB#12881: Performance improvements to MatchHighlighter and MatchRegionRetriever. MatchRegionRetriever can be
|
||||
configured to not load matches (or content) of certain fields and to force-load other fields so that stored fields
|
||||
of a document are accessed once. A configurable limit of field matches placed in the priority queue was added
|
||||
(allows handling long fields with lots of hits more gracefully). MatchRegionRetriever utilizes IndexSearcher's
|
||||
executor to extract hit offsets concurrently. (Dawid Weiss)
|
||||
|
||||
* GITHUB#12855: Remove deprecated DrillSideways#createDrillDownFacetsCollector extension method. (Greg Miller)
|
||||
|
||||
New Features
|
||||
---------------------
|
||||
|
@ -89,18 +99,17 @@ Improvements
|
|||
|
||||
* GITHUB#12447: Hunspell: speed up the dictionary enumeration on suggestion (Peter Gromov)
|
||||
|
||||
* GITHUB#12542: FSTCompiler can now approximately limit how much RAM it uses to share
  suffixes during FST construction using the suffixRAMLimitMB method. Larger values
  result in a more minimal FST (more common suffixes are shared). Pass
  Double.POSITIVE_INFINITY to use as much RAM as is needed to create a purely
  minimal FST. Inspired by this Rust FST implementation:
  https://blog.burntsushi.net/transducers (Mike McCandless)
|
||||
* GITHUB#12873: Expressions module now uses JEP 371 "Hidden Classes" with JEP 309
|
||||
"Dynamic Class-File Constants" to implement Javascript expressions. (Uwe Schindler)
|
||||
|
||||
Optimizations
|
||||
---------------------
|
||||
|
||||
* GITHUB#11857, GITHUB#11859, GITHUB#11893, GITHUB#11909: Hunspell: improved suggestion performance (Peter Gromov)
|
||||
|
||||
* GITHUB#12825, GITHUB#12834: Hunspell: improved dictionary loading performance, allowed in-memory entry sorting.
|
||||
(Peter Gromov)
|
||||
|
||||
* GITHUB#12372: Reduce allocation during HNSW construction (Jonathan Ellis)
|
||||
|
||||
* GITHUB#12408: Lazy initialization improvements for Facets implementations when there are segments with no hits
|
||||
|
@ -116,6 +125,9 @@ Bug Fixes
|
|||
|
||||
* GITHUB#12220: Hunspell: disallow hidden title-case entries from compound middle/end
|
||||
|
||||
* GITHUB#12878: Fix the declared Exceptions of Expression#evaluate() to match those
|
||||
of DoubleValues#doubleValue(). (Uwe Schindler)
|
||||
|
||||
Other
|
||||
---------------------
|
||||
|
||||
|
@ -142,6 +154,48 @@ Other
|
|||
|
||||
* GITHUB#12239: Hunspell: reduced suggestion set dependency on the hash table order (Peter Gromov)
|
||||
|
||||
* GITHUB#9049: Fixing bug in UnescapedCharSequence#toStringEscaped() (Jakub Slowinski)
|
||||
|
||||
======================== Lucene 9.10.0 =======================
|
||||
|
||||
API Changes
|
||||
---------------------
|
||||
* GITHUB#12243: Mark TermInSetQuery ctors with varargs terms as @Deprecated. SortedSetDocValuesField#newSlowSetQuery,
|
||||
SortedDocValuesField#newSlowSetQuery, KeywordField#newSetQuery now take a collection of terms as a param. (Jakub Slowinski)
|
||||
|
||||
* GITHUB#11041: Deprecate IndexSearch#search(Query, Collector) in favor of
|
||||
IndexSearcher#search(Query, CollectorManager) for TopFieldCollectorManager
|
||||
and TopScoreDocCollectorManager. (Zach Chen, Adrien Grand, Michael McCandless, Greg Miller, Luca Cavanna)
|
||||
|
||||
* GITHUB#12854: Mark DrillSideways#createDrillDownFacetsCollector as @Deprecated. (Greg Miller)
|
||||
|
||||
New Features
|
||||
---------------------
|
||||
(No changes)
|
||||
|
||||
Improvements
|
||||
---------------------
|
||||
|
||||
* GITHUB#12870: Tighten synchronized loop in DirectoryTaxonomyReader#getOrdinal. (Stefan Vodita)
|
||||
|
||||
* GITHUB#12812: Avoid overflows and false negatives in int slice buffer filled-with-zeros assertion. (Stefan Vodita)
|
||||
|
||||
Optimizations
|
||||
---------------------
|
||||
(No changes)
|
||||
|
||||
Bug Fixes
|
||||
---------------------
|
||||
* GITHUB#12866: Prevent extra similarity computation for single-level HNSW graphs. (Kaival Parikh)
|
||||
|
||||
* GITHUB#12558: Ensure #finish is called on all drill-sideways FacetsCollectors even when no hits are scored.
|
||||
(Greg Miller)
|
||||
|
||||
Other
|
||||
---------------------
|
||||
|
||||
* GITHUB#11023: Removing some dead code in CheckIndex. (Jakub Slowinski)
|
||||
|
||||
======================== Lucene 9.9.0 =======================
|
||||
|
||||
API Changes
|
||||
|
@ -157,9 +211,6 @@ API Changes
|
|||
* GITHUB#12592: Add RandomAccessInput#length method to the RandomAccessInput interface. In addition deprecate
|
||||
ByteBuffersDataInput#size in favour of this new method. (Ignacio Vera)
|
||||
|
||||
* GITHUB#12646, GITHUB#12690: Move FST#addNode to FSTCompiler to avoid a circular dependency
|
||||
between FST and FSTCompiler (Anh Dung Bui)
|
||||
|
||||
* GITHUB#12718: Make IndexSearcher#getSlices final as it is not expected to be overridden (Luca Cavanna)
|
||||
|
||||
* GITHUB#12427: Automata#makeStringUnion #makeBinaryStringUnion now accept Iterable<BytesRef> instead of
|
||||
|
@ -169,6 +220,25 @@ API Changes
|
|||
* GITHUB#12180: Add TaxonomyReader#getBulkOrdinals method to more efficiently retrieve facet ordinals for multiple
|
||||
FacetLabel at once. (Egor Potemkin)
|
||||
|
||||
* GITHUB#12816: Add HumanReadableQuery which takes a description parameter for debugging purposes. (Jakub Slowinski)
|
||||
|
||||
* GITHUB#12646, GITHUB#12690: Move FST#addNode to FSTCompiler to avoid a circular dependency
|
||||
between FST and FSTCompiler (Anh Dung Bui)
|
||||
|
||||
* GITHUB#12709: Consolidate FSTStore and BytesStore in FST. Created FSTReader which contains the common methods
|
||||
of the two (Anh Dung Bui)
|
||||
|
||||
* GITHUB#12735: Remove FSTCompiler#getTermCount() and FSTCompiler.UnCompiledNode#inputCount (Anh Dung Bui)
|
||||
|
||||
* GITHUB#12695: Remove public constructor of FSTCompiler. Please use FSTCompiler.Builder
  instead. (Juan M. Caicedo)
|
||||
|
||||
* GITHUB#12799: Make TaskExecutor constructor public and use TaskExecutor for concurrent
|
||||
HNSW graph build. (Shubham Chaudhary)
|
||||
|
||||
* GITHUB#12758, GITHUB#12803: Remove FST constructor with DataInput for metadata. Please
|
||||
use the constructor with FSTMetadata instead. (Anh Dung Bui)
|
||||
|
||||
New Features
|
||||
---------------------
|
||||
|
||||
|
@ -180,7 +250,7 @@ New Features
|
|||
|
||||
* GITHUB#12582: Add int8 scalar quantization to the HNSW vector format. This optionally allows for more compact lossy
|
||||
storage for the vectors, requiring about 75% memory for fast HNSW search. (Ben Trent)
|
||||
|
||||
|
||||
* GITHUB#12660: HNSW graphs can now be merged with multiple threads. Configurable in Lucene99HnswVectorsFormat.
  (Patrick Zhai)
|
||||
|
||||
|
@ -225,6 +295,22 @@ Improvements
|
|||
* GITHUB#12754: Refactor lookup of Hotspot VM options and do not initialize constants with NULL
|
||||
if SecurityManager prevents access. (Uwe Schindler)
|
||||
|
||||
* GITHUB#12801: Remove possible contention on a ReentrantReadWriteLock in
|
||||
Monitor which could result in searches waiting for commits. (Davis Cook)
|
||||
|
||||
* GITHUB#11277, LUCENE-10241: Upgrade to OpenNLP to 1.9.4. (Jeff Zemerick)
|
||||
|
||||
* GITHUB#12542: FSTCompiler can now approximately limit how much RAM it uses to share
  suffixes during FST construction using the suffixRAMLimitMB method. Larger values
  result in a more minimal FST (more common suffixes are shared). Pass
  Double.POSITIVE_INFINITY to use as much RAM as is needed to create a purely
  minimal FST. Inspired by this Rust FST implementation:
  https://blog.burntsushi.net/transducers (Mike McCandless)
|
||||
|
||||
* GITHUB#12738: NodeHash now stores the FST nodes data instead of just node addresses (Anh Dung Bui)
|
||||
|
||||
* GITHUB#12847: Test2BFST now reports the time it took to build the FST and the real FST size (Anh Dung Bui)
|
||||
|
||||
Optimizations
|
||||
---------------------
|
||||
* GITHUB#12183: Make TermStates#build concurrent. (Shubham Chaudhary)
|
||||
|
@ -276,10 +362,14 @@ Optimizations
|
|||
|
||||
* GITHUB#12381: Skip docs with DocValues in NumericLeafComparator. (Lu Xugang, Adrien Grand)
|
||||
|
||||
* GITHUB#12748: Specialize arc store for continuous label in FST. (Guo Feng, Chao Zhang)
|
||||
|
||||
* GITHUB#12784: Cache buckets to speed up BytesRefHash#sort. (Guo Feng)
|
||||
|
||||
* GITHUB#12806: Utilize exact kNN search when gathering k >= numVectors in a segment (Ben Trent)
|
||||
|
||||
* GITHUB#12782: Use group-varint encoding for the tail of postings. (Adrien Grand, Zhang Chao)
|
||||
|
||||
* GITHUB#12748: Specialize arc store for continuous label in FST. (Guo Feng, Zhang Chao)
|
||||
|
||||
Changes in runtime behavior
|
||||
---------------------
|
||||
|
||||
|
@ -311,22 +401,33 @@ Bug Fixes
|
|||
|
||||
* GITHUB#12770: Stop exploring HNSW graph if scores are not getting better. (Ben Trent)
|
||||
|
||||
* GITHUB#12640: Ensure #finish is called on all drill-sideways collectors even if one throws a
|
||||
CollectionTerminatedException (Greg Miller)
|
||||
|
||||
* GITHUB#12626: Fix segmentInfos replace to set userData (Shibi Balamurugan, Uwe Schindler, Marcus Eagan, Michael Froh)
|
||||
|
||||
Build
|
||||
---------------------
|
||||
|
||||
* GITHUB#12752: tests.multiplier could be omitted in test failure reproduce lines (esp. in
|
||||
nightly mode). (Dawid Weiss)
|
||||
|
||||
* GITHUB#12742: JavaCompile tasks may be in up-to-date state when modular dependencies have changed
  leading to odd runtime errors (Chris Hostetter, Dawid Weiss)
|
||||
|
||||
* GITHUB#12612: Upgrade forbiddenapis to version 3.6 and ASM for APIJAR extraction to 9.6. (Uwe Schindler)
|
||||
|
||||
* GITHUB#12655: Upgrade to Gradle 8.4 (Kevin Risden)
|
||||
|
||||
* GITHUB#12845: Only enable support for tests.profile if the jdk.jfr module is available
  in the Gradle runtime. (Uwe Schindler)
|
||||
|
||||
Other
|
||||
---------------------
|
||||
|
||||
* GITHUB#12817: Add demo for faceting with StringValueFacetCounts over KeywordField and SortedDocValuesField.
|
||||
(Stefan Vodita)
|
||||
|
||||
* GITHUB#12657: Internal refactor of HNSW graph merging (Ben Trent).
|
||||
|
||||
* GITHUB#12625: Refactor ByteBlockPool so it is just a "shift/mask big array". (Ignacio Vera)
|
||||
|
@ -336,6 +437,8 @@ Other
|
|||
overflows and slices that are too large. Some bits of code are simplified. Documentation is updated and expanded.
|
||||
(Stefan Vodita)
|
||||
|
||||
* GITHUB#12762: Refactor BKD HeapPointWriter to hide the internal data structure. (Ignacio Vera)
|
||||
|
||||
======================== Lucene 9.8.0 =======================
|
||||
|
||||
API Changes
|
||||
|
@ -364,6 +467,8 @@ New Features
|
|||
* GITHUB#12479: Add new Maximum Inner Product vector similarity function for non-normalized dot-product
|
||||
vector search. (Jack Mazanec, Ben Trent)
|
||||
|
||||
* GITHUB#12525: `WordDelimiterGraphFilterFactory` now supports the `ignoreKeywords` flag (Thomas De Craemer)
|
||||
|
||||
* GITHUB#12489: Add support for recursive graph bisection, also called
|
||||
bipartite graph partitioning, and often abbreviated BP, an algorithm for
|
||||
reordering doc IDs that results in more compact postings and faster queries,
|
||||
|
@ -386,7 +491,7 @@ Improvements
|
|||
Optimizations
|
||||
---------------------
|
||||
|
||||
* GITHUB#12377: Avoid redundant loop for compute min value in DirectMonotonicWriter. (Chao Zhang)
|
||||
* GITHUB#12377: Avoid redundant loop for compute min value in DirectMonotonicWriter. (Zhang Chao)
|
||||
|
||||
* GITHUB#12361: Faster top-level disjunctions sorted by descending score.
|
||||
(Adrien Grand)
|
||||
|
@ -401,7 +506,7 @@ Optimizations
|
|||
|
||||
* GITHUB#12385: Restore parallel knn query rewrite across segments rather than slices (Luca Cavanna)
|
||||
|
||||
* GITHUB#12381: Speed up NumericDocValuesWriter with index sorting. (Chao Zhang)
|
||||
* GITHUB#12381: Speed up NumericDocValuesWriter with index sorting. (Zhang Chao)
|
||||
|
||||
* GITHUB#12453: Faster bulk numeric reads from BufferedIndexInput (Armin Braun)
|
||||
|
||||
|
@ -468,7 +573,7 @@ Other
|
|||
* GITHUB#12428: Replace consecutive close() calls and close() calls with null checks with IOUtils.close().
|
||||
(Shubham Chaudhary)
|
||||
|
||||
* GITHUB#12512: Remove unused variable in BKDWriter. (Chao Zhang)
|
||||
* GITHUB#12512: Remove unused variable in BKDWriter. (Zhang Chao)
|
||||
|
||||
======================== Lucene 9.7.0 =======================
|
||||
|
||||
|
|
|
@@ -19,6 +19,11 @@

## Migration from Lucene 9.x to Lucene 10.0

### Minor API changes in MatchHighlighter and MatchRegionRetriever (GITHUB#12881)

The API of the interfaces for accepting highlights has changed to allow performance improvements. Look at the issue and the PR diff to get
a sense of what's changed (the changes are minor).

### Removed deprecated IndexSearcher.doc, IndexReader.document, IndexReader.getTermVectors (GITHUB#11998)

The deprecated stored fields and term vectors APIs relied upon thread-local storage and have been removed.
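A minimal sketch of the replacement `StoredFields`/`TermVectors` accessors, assuming an already-open `IndexReader` and a valid docID (the class and method names of the wrapper below are illustrative):

```java
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.StoredFields;
import org.apache.lucene.index.TermVectors;

// Sketch: fetch stored fields and term vectors without the removed thread-local APIs.
final class StoredFieldsAccess {
  static Document load(IndexReader reader, int docID) throws IOException {
    StoredFields storedFields = reader.storedFields(); // replacement for IndexReader#document(int)
    Document doc = storedFields.document(docID);
    TermVectors termVectors = reader.termVectors(); // replacement for IndexReader#getTermVectors(int)
    Fields vectors = termVectors.get(docID); // may be null if no term vectors were indexed
    return doc;
  }
}
```
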

@@ -101,6 +106,34 @@ The deprecated getter for the `Executor` that was optionally provided to the `In
has been removed. Users that want to execute concurrent tasks should rely instead on the `TaskExecutor`
that the searcher holds, retrieved via `IndexSearcher#getTaskExecutor`.
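A minimal sketch, assuming `TaskExecutor#invokeAll` over a collection of callables (the tasks shown are placeholders):

```java
import java.io.IOException;
import java.util.List;
import java.util.concurrent.Callable;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TaskExecutor;

// Sketch: run concurrent work through the searcher's TaskExecutor instead of a raw Executor.
final class ConcurrentTasks {
  static List<Integer> run(IndexSearcher searcher) throws IOException {
    TaskExecutor taskExecutor = searcher.getTaskExecutor();
    // placeholder tasks; real code would do per-slice or per-segment work
    List<Callable<Integer>> tasks = List.of(() -> 1, () -> 2);
    return taskExecutor.invokeAll(tasks);
  }
}
```
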

### CheckIndex params -slow and -fast are deprecated, replaced by -level X (GITHUB#11023)

The former `-fast` behaviour of `CheckIndex` (performing checksum checks only) is now the default.
A new parameter, `-level X`, sets the detail level of the index check; the higher the value, the more checks are performed.
Sample `-level` values: `1` (default) - checksum checks only, `2` - all level 1 checks plus logical integrity checks, `3` - all
level 2 checks plus slow checks.
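A hedged sketch of running the checker with the new flag from Java; the index path is a placeholder, and `CheckIndex.main` is assumed to take the same arguments as the command-line tool:

```java
import org.apache.lucene.index.CheckIndex;

// Sketch: equivalent of `java org.apache.lucene.index.CheckIndex /path/to/index -level 2`.
final class RunCheckIndex {
  public static void main(String[] args) throws Exception {
    // note: CheckIndex.main may terminate the JVM with an exit code
    CheckIndex.main(new String[] {"/path/to/index", "-level", "2"});
  }
}
```
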

### Expressions module now uses `MethodHandle` and hidden classes (GITHUB#12873)

Custom functions in the expressions module must now be passed in a `Map` using `MethodHandle` as values.
To convert legacy code using maps of reflective `java.lang.reflect.Method`, use the converter method
`JavascriptCompiler#convertLegacyFunctions`. This should make the mapping mostly compatible.
The use of `MethodHandle` and [Dynamic Class-File Constants (JEP 309)](https://openjdk.org/jeps/309)
now also allows passing private methods or methods from different classloaders. It is also possible
to adapt guards or filters using the `MethodHandles` class.

The new implementation of the Javascript expressions compiler no longer supports the use of a custom
`ClassLoader`, because it uses the new JDK 15 feature [hidden classes (JEP 371)](https://openjdk.org/jeps/371).
Due to the use of `MethodHandle`, classloader isolation is no longer needed, because JS code can only call
method handles that were resolved by the application before using the expressions module.
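A minimal sketch of building such a map for one custom function; the class and function names are illustrative, and the exact `JavascriptCompiler.compile` overload that accepts the map is not shown here (see the module javadocs):

```java
import java.lang.invoke.MethodHandle;
import java.lang.invoke.MethodHandles;
import java.lang.invoke.MethodType;
import java.util.Map;

// Sketch: expose a custom function to the expressions module as a MethodHandle.
final class CustomFunctions {
  // the function made available to compiled expressions
  public static double cube(double x) {
    return x * x * x;
  }

  static Map<String, MethodHandle> functions() throws ReflectiveOperationException {
    MethodHandle cube =
        MethodHandles.lookup()
            .findStatic(
                CustomFunctions.class, "cube", MethodType.methodType(double.class, double.class));
    // Pass this map to the JavascriptCompiler.compile overload that accepts MethodHandle values;
    // legacy Method-based maps can be converted with JavascriptCompiler#convertLegacyFunctions.
    return Map.of("cube", cube);
  }
}
```
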

### `Expression#evaluate()` declares to throw IOException (GITHUB#12878)

The expressions module has changed the `Expression#evaluate()` method signature:
it now declares that it may throw `IOException`. This was an oversight because
compiled expressions call `DoubleValues#doubleValue` behind the scenes, which
may throw `IOException` on index problems, bubbling up unexpectedly to the caller.

## Migration from Lucene 9.0 to Lucene 9.1

### Test framework package migration and module (LUCENE-10301)

@ -105,7 +105,8 @@ public class NormalizeCharMap {
|
|||
final FST<CharsRef> map;
|
||||
try {
|
||||
final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
|
||||
final FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
|
||||
final FSTCompiler<CharsRef> fstCompiler =
|
||||
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, outputs).build();
|
||||
final IntsRefBuilder scratch = new IntsRefBuilder();
|
||||
for (Map.Entry<String, String> ent : pendingPairs.entrySet()) {
|
||||
fstCompiler.add(Util.toUTF16(ent.getKey(), scratch), new CharsRef(ent.getValue()));
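The hunk above shows the migration applied throughout this commit: the removed `FSTCompiler` constructor becomes `new FSTCompiler.Builder<>(...).build()`. A self-contained sketch of the new construction path; the `suffixRAMLimitMB` call is an optional tuning knob named in the GITHUB#12542 changelog entry above and should be treated as an assumption:

```java
import java.io.IOException;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;

// Sketch: FSTCompiler is now obtained through its Builder instead of a public constructor.
final class FstBuilderExample {
  static FSTCompiler<CharsRef> newCompiler() {
    Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
    return new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, outputs)
        // optional: cap RAM used for suffix sharing (method named in GITHUB#12542 above)
        .suffixRAMLimitMB(16)
        .build();
  }

  static void addPair(FSTCompiler<CharsRef> compiler, String key, String value) throws IOException {
    // keys must be added in sorted order, as in the calling code above
    IntsRefBuilder scratch = new IntsRefBuilder();
    compiler.add(Util.toUTF16(key, scratch), new CharsRef(value));
  }
}
```
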
|
||||
|
|
|
@ -777,7 +777,6 @@ class KStemmer {
|
|||
private int stemLength() {
|
||||
return j + 1;
|
||||
}
|
||||
;
|
||||
|
||||
private boolean endsIn(char[] s) {
|
||||
if (s.length > k) return false;
|
||||
|
|
|
@ -40,7 +40,8 @@ class ConvTable {
|
|||
|
||||
try {
|
||||
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
|
||||
FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
|
||||
FSTCompiler<CharsRef> fstCompiler =
|
||||
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, outputs).build();
|
||||
IntsRefBuilder scratchInts = new IntsRefBuilder();
|
||||
for (Map.Entry<String, String> entry : mappings.entrySet()) {
|
||||
String key = entry.getKey();
|
||||
|
|
|
@ -50,18 +50,12 @@ import java.util.Set;
|
|||
import java.util.TreeMap;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.analysis.hunspell.SortingStrategy.EntryAccumulator;
|
||||
import org.apache.lucene.analysis.hunspell.SortingStrategy.EntrySupplier;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.IntsRefBuilder;
|
||||
import org.apache.lucene.util.OfflineSorter;
|
||||
import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
|
||||
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.fst.FSTCompiler;
|
||||
import org.apache.lucene.util.fst.IntSequenceOutputs;
|
||||
|
@ -215,6 +209,25 @@ public class Dictionary {
|
|||
List<InputStream> dictionaries,
|
||||
boolean ignoreCase)
|
||||
throws IOException, ParseException {
|
||||
this(affix, dictionaries, ignoreCase, SortingStrategy.offline(tempDir, tempFileNamePrefix));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new Dictionary containing the information read from the provided InputStreams to
|
||||
* hunspell affix and dictionary files. You have to close the provided InputStreams yourself.
|
||||
*
|
||||
* @param affix InputStream for reading the hunspell affix file (won't be closed).
|
||||
* @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed).
|
||||
* @param sortingStrategy the entry strategy for the dictionary loading
|
||||
* @throws IOException Can be thrown while reading from the InputStreams
|
||||
* @throws ParseException Can be thrown if the content of the files does not meet expected formats
|
||||
*/
|
||||
public Dictionary(
|
||||
InputStream affix,
|
||||
List<InputStream> dictionaries,
|
||||
boolean ignoreCase,
|
||||
SortingStrategy sortingStrategy)
|
||||
throws IOException, ParseException {
|
||||
this.ignoreCase = ignoreCase;
|
||||
|
||||
try (BufferedInputStream affixStream =
|
||||
|
@ -250,10 +263,11 @@ public class Dictionary {
|
|||
readAffixFile(affixStream, decoder, flagEnumerator);
|
||||
|
||||
// read dictionary entries
|
||||
IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
|
||||
int wordCount = mergeDictionaries(dictionaries, decoder, unsorted);
|
||||
String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted);
|
||||
words = readSortedDictionaries(tempDir, sortedFile, flagEnumerator, wordCount);
|
||||
EntryAccumulator acc = sortingStrategy.start();
|
||||
mergeDictionaries(dictionaries, decoder, acc);
|
||||
try (EntrySupplier sorted = acc.finishAndSort()) {
|
||||
words = readSortedDictionaries(flagEnumerator, sorted);
|
||||
}
|
||||
flagLookup = flagEnumerator.finish();
|
||||
aliases = null; // no longer needed
|
||||
morphAliases = null; // no longer needed
|
||||
|
@ -631,7 +645,8 @@ public class Dictionary {
|
|||
|
||||
private FST<IntsRef> affixFST(TreeMap<String, List<Integer>> affixes) throws IOException {
|
||||
IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
|
||||
FSTCompiler<IntsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
|
||||
FSTCompiler<IntsRef> fstCompiler =
|
||||
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).build();
|
||||
IntsRefBuilder scratch = new IntsRefBuilder();
|
||||
for (Map.Entry<String, List<Integer>> entry : affixes.entrySet()) {
|
||||
Util.toUTF32(entry.getKey(), scratch);
|
||||
|
@ -984,52 +999,43 @@ public class Dictionary {
|
|||
}
|
||||
}
|
||||
|
||||
private int mergeDictionaries(
|
||||
List<InputStream> dictionaries, CharsetDecoder decoder, IndexOutput output)
|
||||
private void mergeDictionaries(
|
||||
List<InputStream> dictionaries, CharsetDecoder decoder, EntryAccumulator acc)
|
||||
throws IOException {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
int wordCount = 0;
|
||||
try (ByteSequencesWriter writer = new ByteSequencesWriter(output)) {
|
||||
for (InputStream dictionary : dictionaries) {
|
||||
BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
|
||||
lines.readLine(); // first line is number of entries (approximately, sometimes)
|
||||
for (InputStream dictionary : dictionaries) {
|
||||
BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
|
||||
lines.readLine(); // first line is number of entries (approximately, sometimes)
|
||||
|
||||
String line;
|
||||
while ((line = lines.readLine()) != null) {
|
||||
// wild and unpredictable code comment rules
|
||||
if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '\t') {
|
||||
continue;
|
||||
}
|
||||
line = unescapeEntry(line);
|
||||
// if we haven't seen any custom morphological data, try to parse one
|
||||
if (!hasCustomMorphData) {
|
||||
int morphStart = line.indexOf(MORPH_SEPARATOR);
|
||||
if (morphStart >= 0) {
|
||||
String data = line.substring(morphStart + 1);
|
||||
hasCustomMorphData =
|
||||
splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
|
||||
}
|
||||
}
|
||||
|
||||
wordCount += writeNormalizedWordEntry(sb, writer, line);
|
||||
String line;
|
||||
while ((line = lines.readLine()) != null) {
|
||||
// wild and unpredictable code comment rules
|
||||
if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '\t') {
|
||||
continue;
|
||||
}
|
||||
line = unescapeEntry(line);
|
||||
// if we haven't seen any custom morphological data, try to parse one
|
||||
if (!hasCustomMorphData) {
|
||||
int morphStart = line.indexOf(MORPH_SEPARATOR);
|
||||
if (morphStart >= 0) {
|
||||
String data = line.substring(morphStart + 1);
|
||||
hasCustomMorphData = splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
|
||||
}
|
||||
}
|
||||
|
||||
writeNormalizedWordEntry(sb, line, acc);
|
||||
}
|
||||
CodecUtil.writeFooter(output);
|
||||
}
|
||||
return wordCount;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the number of word entries written
|
||||
*/
|
||||
private int writeNormalizedWordEntry(StringBuilder reuse, ByteSequencesWriter writer, String line)
|
||||
private void writeNormalizedWordEntry(StringBuilder reuse, String line, EntryAccumulator acc)
|
||||
throws IOException {
|
||||
int flagSep = line.indexOf(FLAG_SEPARATOR);
|
||||
int morphSep = line.indexOf(MORPH_SEPARATOR);
|
||||
assert morphSep > 0;
|
||||
assert morphSep > flagSep;
|
||||
int sep = flagSep < 0 ? morphSep : flagSep;
|
||||
if (sep == 0) return 0;
|
||||
if (sep == 0) return;
|
||||
|
||||
CharSequence toWrite;
|
||||
String beforeSep = line.substring(0, sep);
|
||||
|
@ -1043,19 +1049,16 @@ public class Dictionary {
|
|||
|
||||
String written = toWrite.toString();
|
||||
sep = written.length() - (line.length() - sep);
|
||||
writer.write(written.getBytes(StandardCharsets.UTF_8));
|
||||
acc.addEntry(written);
|
||||
|
||||
WordCase wordCase = WordCase.caseOf(written, sep);
|
||||
if (wordCase == WordCase.MIXED || wordCase == WordCase.UPPER && flagSep > 0) {
|
||||
addHiddenCapitalizedWord(reuse, writer, written.substring(0, sep), written.substring(sep));
|
||||
return 2;
|
||||
addHiddenCapitalizedWord(reuse, acc, written.substring(0, sep), written.substring(sep));
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
private void addHiddenCapitalizedWord(
|
||||
StringBuilder reuse, ByteSequencesWriter writer, String word, String afterSep)
|
||||
throws IOException {
|
||||
StringBuilder reuse, EntryAccumulator acc, String word, String afterSep) throws IOException {
|
||||
reuse.setLength(0);
|
||||
reuse.append(Character.toUpperCase(word.charAt(0)));
|
||||
for (int i = 1; i < word.length(); i++) {
|
||||
|
@ -1064,7 +1067,7 @@ public class Dictionary {
|
|||
reuse.append(FLAG_SEPARATOR);
|
||||
reuse.append(HIDDEN_FLAG);
|
||||
reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length());
|
||||
writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
|
||||
acc.addEntry(reuse.toString());
|
||||
}
|
||||
|
||||
String toLowerCase(String word) {
|
||||
|
@ -1084,137 +1087,66 @@ public class Dictionary {
|
|||
return new String(chars);
|
||||
}
|
||||
|
||||
private String sortWordsOffline(
|
||||
Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException {
|
||||
OfflineSorter sorter =
|
||||
new OfflineSorter(
|
||||
tempDir,
|
||||
tempFileNamePrefix,
|
||||
new Comparator<>() {
|
||||
final BytesRef scratch1 = new BytesRef();
|
||||
final BytesRef scratch2 = new BytesRef();
|
||||
|
||||
private void initScratch(BytesRef o, BytesRef scratch) {
|
||||
scratch.bytes = o.bytes;
|
||||
scratch.offset = o.offset;
|
||||
scratch.length = o.length;
|
||||
|
||||
for (int i = scratch.length - 1; i >= 0; i--) {
|
||||
if (scratch.bytes[scratch.offset + i] == FLAG_SEPARATOR
|
||||
|| scratch.bytes[scratch.offset + i] == MORPH_SEPARATOR) {
|
||||
scratch.length = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compare(BytesRef o1, BytesRef o2) {
|
||||
initScratch(o1, scratch1);
|
||||
initScratch(o2, scratch2);
|
||||
|
||||
int cmp = scratch1.compareTo(scratch2);
|
||||
if (cmp == 0) {
|
||||
// tie break on whole row
|
||||
return o1.compareTo(o2);
|
||||
} else {
|
||||
return cmp;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
String sorted;
|
||||
boolean success = false;
|
||||
try {
|
||||
sorted = sorter.sort(unsorted.getName());
|
||||
success = true;
|
||||
} finally {
|
||||
if (success) {
|
||||
tempDir.deleteFile(unsorted.getName());
|
||||
} else {
|
||||
IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
|
||||
}
|
||||
}
|
||||
return sorted;
|
||||
}
|
||||
|
||||
private WordStorage readSortedDictionaries(
|
||||
Directory tempDir, String sorted, FlagEnumerator flags, int wordCount) throws IOException {
|
||||
boolean success = false;
|
||||
|
||||
private WordStorage readSortedDictionaries(FlagEnumerator flags, EntrySupplier sorted)
|
||||
throws IOException {
|
||||
Map<String, Integer> morphIndices = new HashMap<>();
|
||||
|
||||
WordStorage.Builder builder =
|
||||
new WordStorage.Builder(
|
||||
wordCount, hashFactor(), hasCustomMorphData, flags, allNonSuggestibleFlags());
|
||||
sorted.wordCount(), hashFactor(), hasCustomMorphData, flags, allNonSuggestibleFlags());
|
||||
|
||||
try (ByteSequencesReader reader =
|
||||
new ByteSequencesReader(tempDir.openChecksumInput(sorted), sorted)) {
|
||||
// TODO: the flags themselves can be double-chars (long) or also numeric
|
||||
// either way the trick is to encode them as char... but they must be parsed differently
|
||||
|
||||
// TODO: the flags themselves can be double-chars (long) or also numeric
|
||||
// either way the trick is to encode them as char... but they must be parsed differently
|
||||
while (true) {
|
||||
String line = sorted.next();
|
||||
if (line == null) break;
|
||||
|
||||
while (true) {
|
||||
BytesRef scratch = reader.next();
|
||||
if (scratch == null) {
|
||||
break;
|
||||
}
|
||||
String entry;
|
||||
char[] wordForm;
|
||||
int end;
|
||||
|
||||
String line = scratch.utf8ToString();
|
||||
String entry;
|
||||
char[] wordForm;
|
||||
int end;
|
||||
|
||||
int flagSep = line.indexOf(FLAG_SEPARATOR);
|
||||
if (flagSep == -1) {
|
||||
wordForm = NOFLAGS;
|
||||
end = line.indexOf(MORPH_SEPARATOR);
|
||||
entry = line.substring(0, end);
|
||||
} else {
|
||||
end = line.indexOf(MORPH_SEPARATOR);
|
||||
boolean hidden = line.charAt(flagSep + 1) == HIDDEN_FLAG;
|
||||
String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end).strip();
|
||||
if (aliasCount > 0 && !flagPart.isEmpty()) {
|
||||
flagPart = getAliasValue(Integer.parseInt(flagPart));
|
||||
}
|
||||
|
||||
wordForm = flagParsingStrategy.parseFlags(flagPart);
|
||||
if (hidden) {
|
||||
wordForm = ArrayUtil.growExact(wordForm, wordForm.length + 1);
|
||||
wordForm[wordForm.length - 1] = HIDDEN_FLAG;
|
||||
}
|
||||
entry = line.substring(0, flagSep);
|
||||
}
|
||||
|
||||
if (entry.isEmpty()) continue;
|
||||
|
||||
int morphDataID = 0;
|
||||
if (end + 1 < line.length()) {
|
||||
List<String> morphFields = readMorphFields(entry, line.substring(end + 1));
|
||||
if (!morphFields.isEmpty()) {
|
||||
morphFields.sort(Comparator.naturalOrder());
|
||||
morphDataID = addMorphFields(morphIndices, String.join(" ", morphFields));
|
||||
}
|
||||
}
|
||||
|
||||
builder.add(entry, wordForm, morphDataID);
|
||||
}
|
||||
|
||||
// finalize last entry
|
||||
success = true;
|
||||
return new WordStorage(builder) {
|
||||
@Override
|
||||
char caseFold(char c) {
|
||||
return Dictionary.this.caseFold(c);
|
||||
}
|
||||
};
|
||||
} finally {
|
||||
if (success) {
|
||||
tempDir.deleteFile(sorted);
|
||||
int flagSep = line.indexOf(FLAG_SEPARATOR);
|
||||
if (flagSep == -1) {
|
||||
wordForm = NOFLAGS;
|
||||
end = line.indexOf(MORPH_SEPARATOR);
|
||||
entry = line.substring(0, end);
|
||||
} else {
|
||||
IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
|
||||
end = line.indexOf(MORPH_SEPARATOR);
|
||||
boolean hidden = line.charAt(flagSep + 1) == HIDDEN_FLAG;
|
||||
String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end).strip();
|
||||
if (aliasCount > 0 && !flagPart.isEmpty()) {
|
||||
flagPart = getAliasValue(Integer.parseInt(flagPart));
|
||||
}
|
||||
|
||||
wordForm = flagParsingStrategy.parseFlags(flagPart);
|
||||
if (hidden) {
|
||||
wordForm = ArrayUtil.growExact(wordForm, wordForm.length + 1);
|
||||
wordForm[wordForm.length - 1] = HIDDEN_FLAG;
|
||||
}
|
||||
entry = line.substring(0, flagSep);
|
||||
}
|
||||
|
||||
if (entry.isEmpty()) continue;
|
||||
|
||||
int morphDataID = 0;
|
||||
if (end + 1 < line.length()) {
|
||||
List<String> morphFields = readMorphFields(entry, line.substring(end + 1));
|
||||
if (!morphFields.isEmpty()) {
|
||||
morphFields.sort(Comparator.naturalOrder());
|
||||
morphDataID = addMorphFields(morphIndices, String.join(" ", morphFields));
|
||||
}
|
||||
}
|
||||
|
||||
builder.add(entry, wordForm, morphDataID);
|
||||
}
|
||||
|
||||
return new WordStorage(builder) {
|
||||
@Override
|
||||
char caseFold(char c) {
|
||||
return Dictionary.this.caseFold(c);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -0,0 +1,181 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.hunspell;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefComparator;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.OfflineSorter;
|
||||
import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
|
||||
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
|
||||
|
||||
/**
|
||||
* The strategy defining how a Hunspell dictionary should be loaded, with different tradeoffs. The
|
||||
* entries should be sorted in a special way, and this can be done either in-memory (faster, but
|
||||
* temporarily allocating more memory) or using disk (slower, but not needing much memory).
|
||||
*
|
||||
* @see #offline(Directory, String)
|
||||
* @see #inMemory()
|
||||
*/
|
||||
public abstract class SortingStrategy {
|
||||
|
||||
abstract EntryAccumulator start() throws IOException;
|
||||
|
||||
interface EntryAccumulator {
|
||||
|
||||
void addEntry(String entry) throws IOException;
|
||||
|
||||
EntrySupplier finishAndSort() throws IOException;
|
||||
}
|
||||
|
||||
interface EntrySupplier extends Closeable {
|
||||
int wordCount();
|
||||
|
||||
/** The next line or {@code null} if the end is reached */
|
||||
String next() throws IOException;
|
||||
}
|
||||
|
||||
/**
|
||||
* An "offline" strategy that creates temporary files in the given directory and uses them for
|
||||
* sorting with {@link OfflineSorter}. It's slower than {@link #inMemory()}, but doesn't need to
|
||||
* load the entire dictionary into memory.
|
||||
*/
|
||||
public static SortingStrategy offline(Directory tempDir, String tempFileNamePrefix) {
|
||||
return new SortingStrategy() {
|
||||
@Override
|
||||
EntryAccumulator start() throws IOException {
|
||||
IndexOutput output = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
|
||||
ByteSequencesWriter writer = new ByteSequencesWriter(output);
|
||||
return new EntryAccumulator() {
|
||||
int wordCount = 0;
|
||||
|
||||
@Override
|
||||
public void addEntry(String entry) throws IOException {
|
||||
wordCount++;
|
||||
writer.write(entry.getBytes(StandardCharsets.UTF_8));
|
||||
}
|
||||
|
||||
@Override
|
||||
public EntrySupplier finishAndSort() throws IOException {
|
||||
CodecUtil.writeFooter(output);
|
||||
writer.close();
|
||||
String sortedFile = sortWordsOffline();
|
||||
ByteSequencesReader reader =
|
||||
new ByteSequencesReader(tempDir.openChecksumInput(sortedFile), sortedFile);
|
||||
return new EntrySupplier() {
|
||||
boolean success = false;
|
||||
|
||||
@Override
|
||||
public int wordCount() {
|
||||
return wordCount;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() throws IOException {
|
||||
BytesRef scratch = reader.next();
|
||||
if (scratch == null) {
|
||||
success = true;
|
||||
return null;
|
||||
}
|
||||
return scratch.utf8ToString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
reader.close();
|
||||
if (success) {
|
||||
tempDir.deleteFile(sortedFile);
|
||||
} else {
|
||||
IOUtils.deleteFilesIgnoringExceptions(tempDir, sortedFile);
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private String sortWordsOffline() throws IOException {
|
||||
var sorter = new OfflineSorter(tempDir, tempFileNamePrefix, BytesRefComparator.NATURAL);
|
||||
|
||||
String sorted;
|
||||
boolean success = false;
|
||||
try {
|
||||
sorted = sorter.sort(output.getName());
|
||||
success = true;
|
||||
} finally {
|
||||
if (success) {
|
||||
tempDir.deleteFile(output.getName());
|
||||
} else {
|
||||
IOUtils.deleteFilesIgnoringExceptions(tempDir, output.getName());
|
||||
}
|
||||
}
|
||||
return sorted;
|
||||
}
|
||||
};
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* The strategy that loads all entries as {@link String} objects and sorts them in memory. The
|
||||
* entries are then stored in a more compressed way, and the strings are gc-ed, but the loading
|
||||
* itself needs {@code O(dictionary_size)} memory.
|
||||
*/
|
||||
public static SortingStrategy inMemory() {
|
||||
return new SortingStrategy() {
|
||||
@Override
|
||||
EntryAccumulator start() {
|
||||
List<String> entries = new ArrayList<>();
|
||||
return new EntryAccumulator() {
|
||||
@Override
|
||||
public void addEntry(String entry) {
|
||||
entries.add(entry);
|
||||
}
|
||||
|
||||
@Override
|
||||
public EntrySupplier finishAndSort() {
|
||||
entries.sort(Comparator.naturalOrder());
|
||||
return new EntrySupplier() {
|
||||
int i = 0;
|
||||
|
||||
@Override
|
||||
public int wordCount() {
|
||||
return entries.size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
return i < entries.size() ? entries.get(i++) : null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {}
|
||||
};
|
||||
}
|
||||
};
|
||||
}
|
||||
};
|
||||
}
|
||||
}
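For context, a minimal sketch of choosing one of these strategies through the new `Dictionary` constructor shown earlier in this diff; the file paths are placeholders:

```java
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.text.ParseException;
import java.util.List;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.SortingStrategy;

// Sketch: load a Hunspell dictionary, sorting entries in memory rather than via temp files.
final class LoadDictionary {
  static Dictionary load() throws IOException, ParseException {
    try (InputStream affix = Files.newInputStream(Path.of("en_US.aff"));
        InputStream dic = Files.newInputStream(Path.of("en_US.dic"))) {
      // SortingStrategy.offline(directory, prefix) trades speed for lower memory use
      return new Dictionary(affix, List.of(dic), false, SortingStrategy.inMemory());
    }
  }
}
```
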
|
|
@ -350,16 +350,19 @@ abstract class WordStorage {
|
|||
|
||||
currentOrds.clear();
|
||||
boolean hasNonHidden = false;
|
||||
boolean isSuggestible = false;
|
||||
for (char[] flags : group) {
|
||||
if (!hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
|
||||
hasNonHidden = true;
|
||||
break;
|
||||
}
|
||||
if (!hasNoSuggestFlag(flags)) {
|
||||
isSuggestible = true;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < group.size(); i++) {
|
||||
char[] flags = group.get(i);
|
||||
if (hasNonHidden && hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
|
||||
if (hasNonHidden && group.size() > 1 && hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -388,7 +391,7 @@ abstract class WordStorage {
|
|||
|
||||
int mask =
|
||||
(prevCode == 0 ? 0 : COLLISION_MASK)
|
||||
| (group.stream().anyMatch(flags -> !hasNoSuggestFlag(flags)) ? SUGGESTIBLE_MASK : 0)
|
||||
| (isSuggestible ? SUGGESTIBLE_MASK : 0)
|
||||
| Math.min(currentEntry.length(), MAX_STORED_LENGTH);
|
||||
hashTable[hash] = (mask << OFFSET_BITS) | pos;
|
||||
|
||||
|
|
|
@ -210,7 +210,8 @@ public final class StemmerOverrideFilter extends TokenFilter {
|
|||
*/
|
||||
public StemmerOverrideMap build() throws IOException {
|
||||
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
||||
FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
|
||||
FSTCompiler<BytesRef> fstCompiler =
|
||||
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).build();
|
||||
final int[] sort = hash.sort();
|
||||
IntsRefBuilder intsSpare = new IntsRefBuilder();
|
||||
final int size = hash.size();
|
||||
|
|
|
@ -46,11 +46,11 @@ public class TruncateTokenFilterFactory extends TokenFilterFactory {
|
|||
public static final String NAME = "truncate";
|
||||
|
||||
public static final String PREFIX_LENGTH_KEY = "prefixLength";
|
||||
private final byte prefixLength;
|
||||
private final int prefixLength;
|
||||
|
||||
public TruncateTokenFilterFactory(Map<String, String> args) {
|
||||
super(args);
|
||||
prefixLength = Byte.parseByte(get(args, PREFIX_LENGTH_KEY, "5"));
|
||||
prefixLength = Integer.parseInt(get(args, PREFIX_LENGTH_KEY, "5"));
|
||||
if (prefixLength < 1)
|
||||
throw new IllegalArgumentException(
|
||||
PREFIX_LENGTH_KEY + " parameter must be a positive number: " + prefixLength);
|
||||
|
|
|
@ -163,7 +163,6 @@ public final class WordDelimiterFilter extends TokenFilter {
|
|||
|
||||
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
|
||||
private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
|
||||
;
|
||||
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
|
||||
private final PositionIncrementAttribute posIncAttribute =
|
||||
addAttribute(PositionIncrementAttribute.class);
|
||||
|
|
|
@ -164,7 +164,6 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
|
|||
|
||||
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
|
||||
private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
|
||||
;
|
||||
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
|
||||
private final PositionIncrementAttribute posIncAttribute =
|
||||
addAttribute(PositionIncrementAttribute.class);
|
||||
|
|
|
@ -45,7 +45,7 @@ import org.apache.lucene.util.ResourceLoaderAware;
|
|||
* preserveOriginal="0" splitOnNumerics="1" splitOnCaseChange="1"
|
||||
* catenateWords="0" catenateNumbers="0" catenateAll="0"
|
||||
* generateWordParts="1" generateNumberParts="1" stemEnglishPossessive="1"
|
||||
* types="wdfftypes.txt" />
|
||||
* types="wdfftypes.txt" ignoreKeywords="0" />
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*
|
||||
|
@ -100,6 +100,9 @@ public class WordDelimiterGraphFilterFactory extends TokenFilterFactory
|
|||
if (getInt(args, "stemEnglishPossessive", 1) != 0) {
|
||||
flags |= STEM_ENGLISH_POSSESSIVE;
|
||||
}
|
||||
if (getInt(args, "ignoreKeywords", 0) != 0) {
|
||||
flags |= IGNORE_KEYWORDS;
|
||||
}
|
||||
wordFiles = get(args, PROTECTED_TOKENS);
|
||||
types = get(args, TYPES);
|
||||
this.flags = flags;
|
||||
|
|
|
@ -216,7 +216,6 @@ public final class SynonymFilter extends TokenFilter {
|
|||
count++;
|
||||
}
|
||||
}
|
||||
;
|
||||
|
||||
private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
|
||||
|
||||
|
|
|
@ -222,7 +222,8 @@ public class SynonymMap {
|
|||
public SynonymMap build() throws IOException {
|
||||
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
||||
// TODO: are we using the best sharing options?
|
||||
FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
|
||||
FSTCompiler<BytesRef> fstCompiler =
|
||||
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).build();
|
||||
|
||||
BytesRefBuilder scratch = new BytesRefBuilder();
|
||||
ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
|
||||
|
|
|
@ -595,8 +595,7 @@ public class TestHTMLStripCharFilter extends BaseTokenStreamTestCase {
|
|||
}
|
||||
}
|
||||
Reader reader = new HTMLStripCharFilter(new StringReader(text.toString()));
|
||||
while (reader.read() != -1)
|
||||
;
|
||||
while (reader.read() != -1) {}
|
||||
}
|
||||
|
||||
public void testUTF16Surrogates() throws Exception {
|
||||
|
|
|
@ -230,7 +230,6 @@ public class TestDuelingAnalyzers extends BaseTokenStreamTestCase {
|
|||
assertEquals(
|
||||
"wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
|
||||
}
|
||||
;
|
||||
assertFalse("wrong number of tokens for input: " + s, right.incrementToken());
|
||||
left.end();
|
||||
right.end();
|
||||
|
|
|
@ -41,7 +41,6 @@ import java.util.concurrent.atomic.AtomicLong;
|
|||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
import org.apache.lucene.tests.store.BaseDirectoryWrapper;
|
||||
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||
import org.apache.lucene.tests.util.LuceneTestCase.SuppressSysoutChecks;
|
||||
import org.apache.lucene.tests.util.RamUsageTester;
|
||||
|
@ -72,9 +71,8 @@ public class TestAllDictionaries extends LuceneTestCase {
|
|||
Path dic = Path.of(affPath.substring(0, affPath.length() - 4) + ".dic");
|
||||
assert Files.exists(dic) : dic;
|
||||
try (InputStream dictionary = Files.newInputStream(dic);
|
||||
InputStream affix = Files.newInputStream(aff);
|
||||
BaseDirectoryWrapper tempDir = newDirectory()) {
|
||||
return new Dictionary(tempDir, "dictionary", affix, dictionary) {
|
||||
InputStream affix = Files.newInputStream(aff)) {
|
||||
return new Dictionary(affix, List.of(dictionary), false, SortingStrategy.inMemory()) {
|
||||
@Override
|
||||
protected boolean tolerateAffixRuleCountMismatches() {
|
||||
return true;
|
||||
|
|
|
@ -256,15 +256,22 @@ public class TestSpellChecking extends LuceneTestCase {
|
|||
}
|
||||
|
||||
static void checkSpellCheckerExpectations(Path basePath) throws IOException, ParseException {
|
||||
InputStream affixStream = Files.newInputStream(Path.of(basePath.toString() + ".aff"));
|
||||
checkSpellCheckerExpectations(
|
||||
basePath, SortingStrategy.offline(new ByteBuffersDirectory(), "dictionary"));
|
||||
checkSpellCheckerExpectations(basePath, SortingStrategy.inMemory());
|
||||
}
|
||||
|
||||
private static void checkSpellCheckerExpectations(Path basePath, SortingStrategy strategy)
|
||||
throws IOException, ParseException {
|
||||
Path affFile = Path.of(basePath + ".aff");
|
||||
Path dicFile = Path.of(basePath + ".dic");
|
||||
InputStream affixStream = Files.newInputStream(affFile);
|
||||
InputStream dictStream = Files.newInputStream(dicFile);
|
||||
|
||||
Hunspell speller;
|
||||
Map<String, Suggester> suggesters = new LinkedHashMap<>();
|
||||
try {
|
||||
Dictionary dictionary =
|
||||
new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream);
|
||||
Dictionary dictionary = new Dictionary(affixStream, List.of(dictStream), false, strategy);
|
||||
speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
|
||||
Suggester suggester = new Suggester(dictionary);
|
||||
suggesters.put("default", suggester);
|
||||
|
|
|
@ -41,7 +41,6 @@ public class TestIndicNormalizer extends BaseTokenStreamTestCase {
|
|||
|
||||
private void check(String input, String output) throws IOException {
|
||||
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
;
|
||||
tokenizer.setReader(new StringReader(input));
|
||||
TokenFilter tf = new IndicNormalizationFilter(tokenizer);
|
||||
assertTokenStreamContents(tf, new String[] {output});
|
||||
|
|
|
@ -89,7 +89,6 @@ public class TestKeywordMarkerFilterFactory extends BaseTokenStreamFactoryTestCa
|
|||
stream =
|
||||
tokenFilterFactory("KeywordMarker", "pattern", "Cats", "ignoreCase", "true").create(stream);
|
||||
stream = tokenFilterFactory("PorterStem").create(stream);
|
||||
;
|
||||
assertTokenStreamContents(stream, new String[] {"dog", "cats", "Cats"});
|
||||
}
|
||||
|
||||
|
|
|
@ -68,4 +68,23 @@ public class TestTruncateTokenFilterFactory extends BaseTokenStreamFactoryTestCa
|
|||
TruncateTokenFilterFactory.PREFIX_LENGTH_KEY
|
||||
+ " parameter must be a positive number: -5"));
|
||||
}
|
||||
|
||||
/** Test that a prefix length greater than the byte limit (127) is accepted */
|
||||
public void testLengthGreaterThanByteLimitArgument() throws Exception {
|
||||
Reader reader =
|
||||
new StringReader(
|
||||
"abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvw128characters From here");
|
||||
TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
((Tokenizer) stream).setReader(reader);
|
||||
stream =
|
||||
tokenFilterFactory("Truncate", TruncateTokenFilterFactory.PREFIX_LENGTH_KEY, "128")
|
||||
.create(stream);
|
||||
assertTokenStreamContents(
|
||||
stream,
|
||||
new String[] {
|
||||
"abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvw1",
|
||||
"From",
|
||||
"here"
|
||||
});
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -69,7 +69,6 @@ public class TestEdgeNGramTokenizer extends BaseTokenStreamTestCase {
public void testOversizedNgrams() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(6, 6);
tokenizer.setReader(input);
;
assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
}
@@ -156,7 +156,6 @@ public class TestCharArrayIterator extends LuceneTestCase {

private void consume(BreakIterator bi, CharacterIterator ci) {
bi.setText(ci);
while (bi.next() != BreakIterator.DONE)
;
while (bi.next() != BreakIterator.DONE) {}
}
}
@@ -16,6 +16,8 @@
*/
package org.apache.lucene.analysis.ja.dict;

import static org.apache.lucene.util.fst.FST.readMetadata;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
@@ -103,7 +105,7 @@ public final class TokenInfoDictionary extends BinaryDictionary<TokenInfoMorphDa
FST<Long> fst;
try (InputStream is = new BufferedInputStream(fstResource.get())) {
DataInput in = new InputStreamDataInput(is);
fst = new FST<>(in, in, PositiveIntOutputs.getSingleton());
fst = new FST<>(readMetadata(in, PositiveIntOutputs.getSingleton()), in);
}
// TODO: some way to configure?
this.fst = new TokenInfoFST(fst, true);
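Most of the FST-loading hunks in this commit follow the same two-step pattern: read the FST metadata explicitly, then pass it to the FST constructor together with the remaining data. A hedged sketch of that pattern, assuming the stream contains an FST<Long> serialized with PositiveIntOutputs:

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;

public final class FstLoadSketch {
  // Replaces the old one-shot "new FST<>(in, in, outputs)" call sites shown above.
  static FST<Long> load(InputStream resource) throws IOException {
    try (InputStream is = new BufferedInputStream(resource)) {
      DataInput in = new InputStreamDataInput(is);
      // Metadata (header, input type, start node, ...) is read first, then the FST body.
      FST.FSTMetadata<Long> meta = FST.readMetadata(in, PositiveIntOutputs.getSingleton());
      return new FST<>(meta, in);
    }
  }
}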
@@ -101,7 +101,8 @@ class TokenInfoDictionaryBuilder {
lines.sort(Comparator.comparing(entry -> entry[0]));

PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
FSTCompiler<Long> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput).build();
IntsRefBuilder scratch = new IntsRefBuilder();
long ord = -1; // first ord will be 0
String lastValue = null;
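The builder hunks above (and the similar ones later in this commit) swap direct FSTCompiler construction for FSTCompiler.Builder. A minimal sketch of compiling a small term-to-ordinal FST with that builder; the input data is made up, and compile() is assumed to return the finished FST in this version:

import java.io.IOException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

public final class FstBuildSketch {
  static FST<Long> build() throws IOException {
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    FSTCompiler<Long> compiler =
        new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
    IntsRefBuilder scratch = new IntsRefBuilder();
    String[] sortedTerms = {"bar", "baz", "foo"}; // terms must be added in sorted order
    for (int ord = 0; ord < sortedTerms.length; ord++) {
      compiler.add(Util.toIntsRef(new BytesRef(sortedTerms[ord]), scratch), (long) ord);
    }
    return compiler.compile();
  }
}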
@@ -93,7 +93,8 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
List<int[]> segmentations = new ArrayList<>(featureEntries.size());

PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
FSTCompiler<Long> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput).build();
IntsRefBuilder scratch = new IntsRefBuilder();
long ord = 0;
@ -758,8 +758,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
|
|||
for (int i = 0; i < numIterations; i++) {
|
||||
try (TokenStream ts = analyzer.tokenStream("ignored", line)) {
|
||||
ts.reset();
|
||||
while (ts.incrementToken())
|
||||
;
|
||||
while (ts.incrementToken()) {}
|
||||
ts.end();
|
||||
}
|
||||
}
|
||||
|
@ -775,8 +774,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
|
|||
for (String sentence : sentences) {
|
||||
try (TokenStream ts = analyzer.tokenStream("ignored", sentence)) {
|
||||
ts.reset();
|
||||
while (ts.incrementToken())
|
||||
;
|
||||
while (ts.incrementToken()) {}
|
||||
ts.end();
|
||||
}
|
||||
}
|
||||
|
@ -831,8 +829,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
|
|||
new JapaneseTokenizer(newAttributeFactory(), readDict(), false, Mode.NORMAL);
|
||||
tokenizer.setReader(new StringReader(doc));
|
||||
tokenizer.reset();
|
||||
while (tokenizer.incrementToken())
|
||||
;
|
||||
while (tokenizer.incrementToken()) {}
|
||||
}
|
||||
|
||||
public void testPatchedSystemDict() throws Exception {
|
||||
|
|
|
@ -16,6 +16,8 @@
|
|||
*/
|
||||
package org.apache.lucene.analysis.ko.dict;
|
||||
|
||||
import static org.apache.lucene.util.fst.FST.readMetadata;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
@ -102,7 +104,7 @@ public final class TokenInfoDictionary extends BinaryDictionary<TokenInfoMorphDa
|
|||
FST<Long> fst;
|
||||
try (InputStream is = new BufferedInputStream(fstResource.get())) {
|
||||
DataInput in = new InputStreamDataInput(is);
|
||||
fst = new FST<>(in, in, PositiveIntOutputs.getSingleton());
|
||||
fst = new FST<>(readMetadata(in, PositiveIntOutputs.getSingleton()), in);
|
||||
}
|
||||
this.fst = new TokenInfoFST(fst);
|
||||
}
|
||||
|
|
|
@ -94,7 +94,8 @@ class TokenInfoDictionaryBuilder {
|
|||
lines.sort(Comparator.comparing(left -> left[0]));
|
||||
|
||||
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
|
||||
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
|
||||
FSTCompiler<Long> fstCompiler =
|
||||
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput).build();
|
||||
IntsRefBuilder scratch = new IntsRefBuilder();
|
||||
long ord = -1; // first ord will be 0
|
||||
String lastValue = null;
|
||||
|
|
|
@ -75,7 +75,8 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
|
|||
entries.sort(Comparator.comparing(e -> e.split("\\s+")[0]));
|
||||
|
||||
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
|
||||
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
|
||||
FSTCompiler<Long> fstCompiler =
|
||||
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput).build();
|
||||
IntsRefBuilder scratch = new IntsRefBuilder();
|
||||
|
||||
String lastToken = null;
|
||||
|
|
|
@@ -41,7 +41,6 @@ public class TestPolishAnalyzer extends BaseTokenStreamTestCase {
/** test use of exclusion set */
public void testExclude() throws IOException {
CharArraySet exclusionSet = new CharArraySet(asSet("studenta"), false);
;
Analyzer a = new PolishAnalyzer(PolishAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTerm(a, "studenta", "studenta");
checkOneTerm(a, "studenci", "student");
@ -16,6 +16,8 @@
|
|||
*/
|
||||
package org.apache.lucene.backward_codecs.lucene40.blocktree;
|
||||
|
||||
import static org.apache.lucene.util.fst.FST.readMetadata;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
|
@ -89,9 +91,17 @@ public final class FieldReader extends Terms {
|
|||
final IndexInput clone = indexIn.clone();
|
||||
clone.seek(indexStartFP);
|
||||
if (metaIn == indexIn) { // Only true before Lucene 8.6
|
||||
index = new FST<>(clone, clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore());
|
||||
index =
|
||||
new FST<>(
|
||||
readMetadata(clone, ByteSequenceOutputs.getSingleton()),
|
||||
clone,
|
||||
new OffHeapFSTStore());
|
||||
} else {
|
||||
index = new FST<>(metaIn, clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore());
|
||||
index =
|
||||
new FST<>(
|
||||
readMetadata(metaIn, ByteSequenceOutputs.getSingleton()),
|
||||
clone,
|
||||
new OffHeapFSTStore());
|
||||
}
|
||||
/*
|
||||
if (false) {
|
||||
|
|
|
@ -22,6 +22,7 @@ import java.nio.file.Path;
|
|||
import java.nio.file.Paths;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.IntPoint;
|
||||
import org.apache.lucene.index.CheckIndex;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.store.Directory;
|
||||
|
@ -70,7 +71,7 @@ public class TestManyPointsInOldIndex extends LuceneTestCase {
|
|||
dir.setCheckIndexOnClose(false);
|
||||
|
||||
// ... because we check ourselves here:
|
||||
TestUtil.checkIndex(dir, false, true, true, null);
|
||||
TestUtil.checkIndex(dir, CheckIndex.Level.MIN_LEVEL_FOR_INTEGRITY_CHECKS, true, true, null);
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@ description = 'Lucene JMH micro-benchmarking module'
|
|||
|
||||
dependencies {
|
||||
moduleImplementation project(':lucene:core')
|
||||
moduleImplementation project(':lucene:expressions')
|
||||
|
||||
moduleImplementation "org.openjdk.jmh:jmh-core:1.37"
|
||||
annotationProcessor "org.openjdk.jmh:jmh-generator-annprocess:1.37"
|
||||
|
@ -42,7 +43,7 @@ tasks.matching { it.name == "forbiddenApisMain" }.configureEach {
|
|||
tasks.matching { it.name in [
|
||||
// Turn off JMH dependency checksums and licensing (it's GPL w/ classpath exception
|
||||
// but this seems fine for test/build only tools).
|
||||
"validateJarChecksums", "validateJarLicenses",
|
||||
"validateJarChecksums", "validateJarLicenses", "collectJarInfos",
|
||||
// No special javadocs for JMH benchmarks.
|
||||
"renderSiteJavadoc",
|
||||
"renderJavadoc",
|
||||
|
|
|
@ -20,6 +20,7 @@ module org.apache.lucene.benchmark.jmh {
|
|||
requires jmh.core;
|
||||
requires jdk.unsupported;
|
||||
requires org.apache.lucene.core;
|
||||
requires org.apache.lucene.expressions;
|
||||
|
||||
exports org.apache.lucene.benchmark.jmh;
|
||||
exports org.apache.lucene.benchmark.jmh.jmh_generated;
|
||||
|
|
|
@ -0,0 +1,148 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.benchmark.jmh;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.invoke.MethodHandle;
|
||||
import java.lang.invoke.MethodHandles;
|
||||
import java.lang.invoke.MethodType;
|
||||
import java.text.ParseException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.concurrent.ThreadLocalRandom;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.apache.lucene.expressions.Expression;
|
||||
import org.apache.lucene.expressions.js.JavascriptCompiler;
|
||||
import org.apache.lucene.search.DoubleValues;
|
||||
import org.openjdk.jmh.annotations.Benchmark;
|
||||
import org.openjdk.jmh.annotations.BenchmarkMode;
|
||||
import org.openjdk.jmh.annotations.Fork;
|
||||
import org.openjdk.jmh.annotations.Level;
|
||||
import org.openjdk.jmh.annotations.Measurement;
|
||||
import org.openjdk.jmh.annotations.Mode;
|
||||
import org.openjdk.jmh.annotations.OutputTimeUnit;
|
||||
import org.openjdk.jmh.annotations.Param;
|
||||
import org.openjdk.jmh.annotations.Scope;
|
||||
import org.openjdk.jmh.annotations.Setup;
|
||||
import org.openjdk.jmh.annotations.State;
|
||||
import org.openjdk.jmh.annotations.Warmup;
|
||||
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
@OutputTimeUnit(TimeUnit.MILLISECONDS)
|
||||
@State(Scope.Benchmark)
|
||||
@Warmup(iterations = 5, time = 5)
|
||||
@Measurement(iterations = 12, time = 8)
|
||||
@Fork(value = 1)
|
||||
public class ExpressionsBenchmark {
|
||||
|
||||
/**
|
||||
* Some extra functions to bench "identity" in various variants, another one is named
|
||||
* "native_identity" (see below).
|
||||
*/
|
||||
private static final Map<String, MethodHandle> FUNCTIONS = getFunctions();
|
||||
|
||||
private static final String NATIVE_IDENTITY_NAME = "native_identity";
|
||||
|
||||
private static Map<String, MethodHandle> getFunctions() {
|
||||
try {
|
||||
var lookup = MethodHandles.lookup();
|
||||
Map<String, MethodHandle> m = new HashMap<>(JavascriptCompiler.DEFAULT_FUNCTIONS);
|
||||
m.put(
|
||||
"func_identity",
|
||||
lookup.findStatic(
|
||||
lookup.lookupClass(), "ident", MethodType.methodType(double.class, double.class)));
|
||||
m.put("mh_identity", MethodHandles.identity(double.class));
|
||||
return m;
|
||||
} catch (ReflectiveOperationException e) {
|
||||
throw new AssertionError(e);
|
||||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
private static double ident(double v) {
|
||||
return v;
|
||||
}
|
||||
|
||||
/** A native implementation of an expression to compare performance */
|
||||
private static final Expression NATIVE_IDENTITY_EXPRESSION =
|
||||
new Expression(NATIVE_IDENTITY_NAME, new String[] {"x"}) {
|
||||
@Override
|
||||
public double evaluate(DoubleValues[] functionValues) throws IOException {
|
||||
return functionValues[0].doubleValue();
|
||||
}
|
||||
};
|
||||
|
||||
private double[] randomData;
|
||||
private Expression expression;
|
||||
|
||||
@Param({"x", "func_identity(x)", "mh_identity", "native_identity", "cos(x)", "cos(x) + sin(x)"})
|
||||
String js;
|
||||
|
||||
@Setup(Level.Iteration)
|
||||
public void init() throws ParseException {
|
||||
ThreadLocalRandom random = ThreadLocalRandom.current();
|
||||
randomData = random.doubles().limit(1024).toArray();
|
||||
expression =
|
||||
Objects.equals(js, NATIVE_IDENTITY_NAME)
|
||||
? NATIVE_IDENTITY_EXPRESSION
|
||||
: JavascriptCompiler.compile(js, FUNCTIONS);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public double expression() throws IOException {
|
||||
var it = new ValuesIterator(randomData);
|
||||
var values = it.getDoubleValues();
|
||||
double result = 0d;
|
||||
while (it.next()) {
|
||||
result += expression.evaluate(values);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static final class ValuesIterator {
|
||||
final double[] data;
|
||||
final DoubleValues[] dv;
|
||||
int pos = -1;
|
||||
|
||||
ValuesIterator(double[] data) {
|
||||
this.data = data;
|
||||
var dv =
|
||||
new DoubleValues() {
|
||||
@Override
|
||||
public double doubleValue() throws IOException {
|
||||
return data[pos];
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean advanceExact(int doc) throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
};
|
||||
this.dv = new DoubleValues[] {dv};
|
||||
}
|
||||
|
||||
boolean next() {
|
||||
pos++;
|
||||
return (pos < data.length);
|
||||
}
|
||||
|
||||
DoubleValues[] getDoubleValues() {
|
||||
return dv;
|
||||
}
|
||||
}
|
||||
}
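For context on what this new benchmark measures, here is a hedged stand-alone sketch of compiling one expression and evaluating it against a single bound variable; the constant value for "x" is arbitrary:

import java.io.IOException;
import java.text.ParseException;
import org.apache.lucene.expressions.Expression;
import org.apache.lucene.expressions.js.JavascriptCompiler;
import org.apache.lucene.search.DoubleValues;

public final class ExpressionSketch {
  public static void main(String[] args) throws ParseException, IOException {
    Expression expr = JavascriptCompiler.compile("cos(x) + sin(x)");
    DoubleValues x =
        new DoubleValues() {
          @Override
          public double doubleValue() {
            return 0.5; // current value of "x"
          }

          @Override
          public boolean advanceExact(int doc) {
            return true;
          }
        };
    // The array is indexed by variable order; this expression only binds "x".
    System.out.println(expr.evaluate(new DoubleValues[] {x}));
  }
}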
|
|
@ -0,0 +1,176 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.benchmark.jmh;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.util.Arrays;
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.apache.lucene.codecs.lucene99.GroupVIntReader;
|
||||
import org.apache.lucene.codecs.lucene99.GroupVIntWriter;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.store.MMapDirectory;
|
||||
import org.openjdk.jmh.annotations.Benchmark;
|
||||
import org.openjdk.jmh.annotations.BenchmarkMode;
|
||||
import org.openjdk.jmh.annotations.Fork;
|
||||
import org.openjdk.jmh.annotations.Level;
|
||||
import org.openjdk.jmh.annotations.Measurement;
|
||||
import org.openjdk.jmh.annotations.Mode;
|
||||
import org.openjdk.jmh.annotations.OutputTimeUnit;
|
||||
import org.openjdk.jmh.annotations.Param;
|
||||
import org.openjdk.jmh.annotations.Scope;
|
||||
import org.openjdk.jmh.annotations.Setup;
|
||||
import org.openjdk.jmh.annotations.State;
|
||||
import org.openjdk.jmh.annotations.Warmup;
|
||||
import org.openjdk.jmh.infra.Blackhole;
|
||||
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
@OutputTimeUnit(TimeUnit.MICROSECONDS)
|
||||
@State(Scope.Benchmark)
|
||||
@Warmup(iterations = 3, time = 3)
|
||||
@Measurement(iterations = 5, time = 5)
|
||||
@Fork(
|
||||
value = 1,
|
||||
jvmArgsPrepend = {"--add-modules=jdk.unsupported"})
|
||||
public class GroupVIntBenchmark {
|
||||
|
||||
// Cumulative frequency for each number of bits per value used by doc deltas of tail postings on
|
||||
// wikibigall.
|
||||
private static final float[] CUMULATIVE_FREQUENCY_BY_BITS_REQUIRED =
|
||||
new float[] {
|
||||
0.0f,
|
||||
0.01026574f,
|
||||
0.021453038f,
|
||||
0.03342156f,
|
||||
0.046476692f,
|
||||
0.060890317f,
|
||||
0.07644147f,
|
||||
0.093718216f,
|
||||
0.11424741f,
|
||||
0.13989712f,
|
||||
0.17366524f,
|
||||
0.22071244f,
|
||||
0.2815692f,
|
||||
0.3537585f,
|
||||
0.43655503f,
|
||||
0.52308f,
|
||||
0.6104675f,
|
||||
0.7047371f,
|
||||
0.78155357f,
|
||||
0.8671179f,
|
||||
0.9740598f,
|
||||
1.0f
|
||||
};
|
||||
|
||||
final int maxSize = 256;
|
||||
final long[] values = new long[maxSize];
|
||||
|
||||
IndexInput byteBufferGVIntIn;
|
||||
IndexInput byteBufferVIntIn;
|
||||
|
||||
ByteArrayDataInput byteArrayVIntIn;
|
||||
ByteArrayDataInput byteArrayGVIntIn;
|
||||
|
||||
// @Param({"16", "32", "64", "128", "248"})
|
||||
@Param({"64"})
|
||||
public int size;
|
||||
|
||||
void initArrayInput(long[] docs) throws Exception {
|
||||
byte[] gVIntBytes = new byte[Integer.BYTES * maxSize * 2];
|
||||
byte[] vIntBytes = new byte[Integer.BYTES * maxSize * 2];
|
||||
ByteArrayDataOutput vIntOut = new ByteArrayDataOutput(vIntBytes);
|
||||
GroupVIntWriter w = new GroupVIntWriter();
|
||||
w.writeValues(new ByteArrayDataOutput(gVIntBytes), docs, docs.length);
|
||||
for (long v : docs) {
|
||||
vIntOut.writeVInt((int) v);
|
||||
}
|
||||
byteArrayVIntIn = new ByteArrayDataInput(vIntBytes);
|
||||
byteArrayGVIntIn = new ByteArrayDataInput(gVIntBytes);
|
||||
}
|
||||
|
||||
void initByteBufferInput(long[] docs) throws Exception {
|
||||
Directory dir = MMapDirectory.open(Files.createTempDirectory("groupvintdata"));
|
||||
IndexOutput vintOut = dir.createOutput("vint", IOContext.DEFAULT);
|
||||
IndexOutput gvintOut = dir.createOutput("gvint", IOContext.DEFAULT);
|
||||
|
||||
GroupVIntWriter w = new GroupVIntWriter();
|
||||
w.writeValues(gvintOut, docs, docs.length);
|
||||
for (long v : docs) {
|
||||
vintOut.writeVInt((int) v);
|
||||
}
|
||||
vintOut.close();
|
||||
gvintOut.close();
|
||||
byteBufferGVIntIn = dir.openInput("gvint", IOContext.DEFAULT);
|
||||
byteBufferVIntIn = dir.openInput("vint", IOContext.DEFAULT);
|
||||
}
|
||||
|
||||
@Setup(Level.Trial)
|
||||
public void init() throws Exception {
|
||||
long[] docs = new long[maxSize];
|
||||
Random r = new Random(0);
|
||||
for (int i = 0; i < maxSize; ++i) {
|
||||
float randomFloat = r.nextFloat();
|
||||
// Reproduce the distribution of the number of bits per values that we're observing for tail
|
||||
// postings on wikibigall.
|
||||
int numBits = 1 + Arrays.binarySearch(CUMULATIVE_FREQUENCY_BY_BITS_REQUIRED, randomFloat);
|
||||
if (numBits < 0) {
|
||||
numBits = -numBits;
|
||||
}
|
||||
docs[i] = r.nextInt(1 << (numBits - 1), 1 << numBits);
|
||||
}
|
||||
initByteBufferInput(docs);
|
||||
initArrayInput(docs);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void byteBufferReadVInt(Blackhole bh) throws IOException {
|
||||
byteBufferVIntIn.seek(0);
|
||||
for (int i = 0; i < size; i++) {
|
||||
values[i] = byteBufferVIntIn.readVInt();
|
||||
}
|
||||
bh.consume(values);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void byteBufferReadGroupVInt(Blackhole bh) throws IOException {
|
||||
byteBufferGVIntIn.seek(0);
|
||||
GroupVIntReader.readValues(byteBufferGVIntIn, values, size);
|
||||
bh.consume(values);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void byteArrayReadVInt(Blackhole bh) {
|
||||
byteArrayVIntIn.rewind();
|
||||
for (int i = 0; i < size; i++) {
|
||||
values[i] = byteArrayVIntIn.readVInt();
|
||||
}
|
||||
bh.consume(values);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void byteArrayReadGroupVInt(Blackhole bh) throws IOException {
|
||||
byteArrayGVIntIn.rewind();
|
||||
GroupVIntReader.readValues(byteArrayGVIntIn, values, size);
|
||||
bh.consume(values);
|
||||
}
|
||||
}
|
|
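A hypothetical way to launch the benchmarks in this new module programmatically through the jmh-core Runner API; the build may expose its own task for this, so treat the class below purely as a sketch:

import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;

public final class RunGroupVIntBenchmark {
  public static void main(String[] args) throws RunnerException {
    Options opt =
        new OptionsBuilder()
            .include("GroupVIntBenchmark") // regex over benchmark class/method names
            .forks(1)
            .build();
    new Runner(opt).run();
  }
}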
@ -30,8 +30,8 @@ import org.apache.lucene.search.Query;
|
|||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.search.TopFieldCollector;
|
||||
import org.apache.lucene.search.TopScoreDocCollector;
|
||||
import org.apache.lucene.search.TopFieldCollectorManager;
|
||||
import org.apache.lucene.search.TopScoreDocCollectorManager;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.Bits;
|
||||
|
||||
|
@ -110,15 +110,17 @@ public abstract class ReadTask extends PerfTask {
|
|||
// the IndexSearcher search methods that take
|
||||
// Weight public again, we can go back to
|
||||
// pulling the Weight ourselves:
|
||||
TopFieldCollector collector =
|
||||
TopFieldCollector.create(sort, numHits, withTotalHits() ? Integer.MAX_VALUE : 1);
|
||||
searcher.search(q, collector);
|
||||
hits = collector.topDocs();
|
||||
int totalHitsThreshold = withTotalHits() ? Integer.MAX_VALUE : 1;
|
||||
TopFieldCollectorManager collectorManager =
|
||||
new TopFieldCollectorManager(
|
||||
sort, numHits, null, totalHitsThreshold, searcher.getSlices().length > 1);
|
||||
hits = searcher.search(q, collectorManager);
|
||||
} else {
|
||||
hits = searcher.search(q, numHits);
|
||||
}
|
||||
} else {
|
||||
Collector collector = createCollector();
|
||||
|
||||
searcher.search(q, collector);
|
||||
// hits = collector.topDocs();
|
||||
}
|
||||
|
@ -183,7 +185,8 @@ public abstract class ReadTask extends PerfTask {
|
|||
}
|
||||
|
||||
protected Collector createCollector() throws Exception {
|
||||
return TopScoreDocCollector.create(numHits(), withTotalHits() ? Integer.MAX_VALUE : 1);
|
||||
return new TopScoreDocCollectorManager(numHits(), withTotalHits() ? Integer.MAX_VALUE : 1)
|
||||
.newCollector();
|
||||
}
|
||||
|
||||
protected Document retrieveDoc(StoredFields storedFields, int id) throws IOException {
|
||||
|
|
|
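The ReadTask hunk above moves from creating collectors directly to the collector-manager entry points. A minimal sketch of that style of search; the reader and query below are placeholders:

import java.io.IOException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollectorManager;

public final class CollectorManagerSketch {
  static TopDocs topTen(DirectoryReader reader) throws IOException {
    IndexSearcher searcher = new IndexSearcher(reader);
    // numHits = 10; totalHitsThreshold = Integer.MAX_VALUE counts all hits exactly.
    TopScoreDocCollectorManager manager =
        new TopScoreDocCollectorManager(10, Integer.MAX_VALUE);
    return searcher.search(new MatchAllDocsQuery(), manager);
  }
}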
@@ -207,7 +207,8 @@ public class BooleanPerceptronClassifier implements Classifier<Boolean> {

private void updateFST(SortedMap<String, Double> weights) throws IOException {
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
FSTCompiler<Long> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
BytesRefBuilder scratchBytes = new BytesRefBuilder();
IntsRefBuilder scratchInts = new IntsRefBuilder();
for (Map.Entry<String, Double> entry : weights.entrySet()) {
@ -16,6 +16,8 @@
|
|||
*/
|
||||
package org.apache.lucene.codecs.blockterms;
|
||||
|
||||
import static org.apache.lucene.util.fst.FST.readMetadata;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
|
@ -154,7 +156,7 @@ public class VariableGapTermsIndexReader extends TermsIndexReaderBase {
|
|||
public FieldIndexData(IndexInput in, FieldInfo fieldInfo, long indexStart) throws IOException {
|
||||
IndexInput clone = in.clone();
|
||||
clone.seek(indexStart);
|
||||
fst = new FST<>(clone, clone, fstOutputs);
|
||||
fst = new FST<>(readMetadata(clone, fstOutputs), clone);
|
||||
clone.close();
|
||||
|
||||
/*
|
||||
|
|
|
@ -238,7 +238,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
|||
public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
|
||||
this.fieldInfo = fieldInfo;
|
||||
fstOutputs = PositiveIntOutputs.getSingleton();
|
||||
fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, fstOutputs);
|
||||
fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, fstOutputs).build();
|
||||
indexStart = out.getFilePointer();
|
||||
//// System.out.println("VGW: field=" + fieldInfo.name);
|
||||
|
||||
|
|
|
@ -16,6 +16,8 @@
|
|||
*/
|
||||
package org.apache.lucene.codecs.blocktreeords;
|
||||
|
||||
import static org.apache.lucene.util.fst.FST.readMetadata;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
|
@ -85,7 +87,7 @@ final class OrdsFieldReader extends Terms {
|
|||
final IndexInput clone = indexIn.clone();
|
||||
// System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name);
|
||||
clone.seek(indexStartFP);
|
||||
index = new FST<>(clone, clone, OrdsBlockTreeTermsWriter.FST_OUTPUTS);
|
||||
index = new FST<>(readMetadata(clone, OrdsBlockTreeTermsWriter.FST_OUTPUTS), clone);
|
||||
|
||||
/*
|
||||
if (true) {
|
||||
|
|
|
@@ -194,7 +194,8 @@ public class FSTTermsReader extends FieldsProducer {
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
OffHeapFSTStore offHeapFSTStore = new OffHeapFSTStore();
this.dict = new FST<>(in, in, new FSTTermOutputs(fieldInfo), offHeapFSTStore);
FSTTermOutputs outputs = new FSTTermOutputs(fieldInfo);
this.dict = new FST<>(FST.readMetadata(in, outputs), in, offHeapFSTStore);
in.skipBytes(offHeapFSTStore.size());
}
@ -251,12 +251,12 @@ public class FSTTermsWriter extends FieldsConsumer {
|
|||
private final IntsRefBuilder scratchTerm = new IntsRefBuilder();
|
||||
private final ByteBuffersDataOutput metaWriter = ByteBuffersDataOutput.newResettableInstance();
|
||||
|
||||
TermsWriter(FieldInfo fieldInfo) {
|
||||
TermsWriter(FieldInfo fieldInfo) throws IOException {
|
||||
this.numTerms = 0;
|
||||
this.fieldInfo = fieldInfo;
|
||||
postingsWriter.setField(fieldInfo);
|
||||
this.outputs = new FSTTermOutputs(fieldInfo);
|
||||
this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||
this.fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
|
||||
}
|
||||
|
||||
public void finishTerm(BytesRef text, BlockTermState state) throws IOException {
|
||||
|
|
|
@ -683,7 +683,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
final PairOutputs<Long, Long> outputsInner = new PairOutputs<>(posIntOutputs, posIntOutputs);
|
||||
final PairOutputs<PairOutputs.Pair<Long, Long>, PairOutputs.Pair<Long, Long>> outputs =
|
||||
new PairOutputs<>(outputsOuter, outputsInner);
|
||||
fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||
fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
|
||||
IndexInput in = SimpleTextFieldsReader.this.in.clone();
|
||||
in.seek(termsStart);
|
||||
final BytesRefBuilder lastTerm = new BytesRefBuilder();
|
||||
|
|
|
@@ -37,7 +37,6 @@ public class SimpleTextStoredFieldsFormat extends StoredFieldsFormat {
@Override
public StoredFieldsReader fieldsReader(
Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException {
;
return new SimpleTextStoredFieldsReader(directory, si, fn, context);
}
@ -89,10 +89,11 @@ public class FSTDictionary implements IndexDictionary {
|
|||
isFSTOnHeap = true;
|
||||
}
|
||||
PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton();
|
||||
FST.FSTMetadata<Long> metadata = FST.readMetadata(fstDataInput, fstOutputs);
|
||||
FST<Long> fst =
|
||||
isFSTOnHeap
|
||||
? new FST<>(fstDataInput, fstDataInput, fstOutputs)
|
||||
: new FST<>(fstDataInput, fstDataInput, fstOutputs, new OffHeapFSTStore());
|
||||
? new FST<>(metadata, fstDataInput)
|
||||
: new FST<>(metadata, fstDataInput, new OffHeapFSTStore());
|
||||
return new FSTDictionary(fst);
|
||||
}
|
||||
|
||||
|
@ -171,9 +172,9 @@ public class FSTDictionary implements IndexDictionary {
|
|||
protected final FSTCompiler<Long> fstCompiler;
|
||||
protected final IntsRefBuilder scratchInts;
|
||||
|
||||
public Builder() {
|
||||
public Builder() throws IOException {
|
||||
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||
fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||
fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
|
||||
scratchInts = new IntsRefBuilder();
|
||||
}
|
||||
|
||||
|
|
|
@@ -100,5 +100,4 @@ public abstract class DelegatingAnalyzerWrapper extends AnalyzerWrapper {
}
}
}
;
}
@@ -70,7 +70,6 @@ public abstract class TermVectorsWriter implements Closeable, Accountable {

/** Called after a doc and all its fields have been added. */
public void finishDocument() throws IOException {}
;

/**
* Called before writing the terms of the field. {@link #startTerm(BytesRef, int)} will be called
@@ -82,7 +81,6 @@ public abstract class TermVectorsWriter implements Closeable, Accountable {

/** Called after a field and all its terms have been added. */
public void finishField() throws IOException {}
;

/**
* Adds a term and its term frequency <code>freq</code>. If this field has positions and/or
@ -91,7 +91,11 @@ public final class FieldReader extends Terms {
|
|||
// Initialize FST always off-heap.
|
||||
final IndexInput clone = indexIn.clone();
|
||||
clone.seek(indexStartFP);
|
||||
index = new FST<>(metaIn, clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore());
|
||||
index =
|
||||
new FST<>(
|
||||
FST.readMetadata(metaIn, ByteSequenceOutputs.getSingleton()),
|
||||
clone,
|
||||
new OffHeapFSTStore());
|
||||
/*
|
||||
if (false) {
|
||||
final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
|
||||
|
|
|
@ -30,9 +30,7 @@ import org.apache.lucene.util.StringHelper;
|
|||
import org.apache.lucene.util.automaton.ByteRunnable;
|
||||
import org.apache.lucene.util.automaton.Transition;
|
||||
import org.apache.lucene.util.automaton.TransitionAccessor;
|
||||
import org.apache.lucene.util.fst.ByteSequenceOutputs;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.fst.Outputs;
|
||||
|
||||
/**
|
||||
* This is used to implement efficient {@link Terms#intersect} for block-tree. Note that it cannot
|
||||
|
@ -46,7 +44,6 @@ final class IntersectTermsEnum extends BaseTermsEnum {
|
|||
// static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
|
||||
|
||||
final IndexInput in;
|
||||
static final Outputs<BytesRef> fstOutputs = ByteSequenceOutputs.getSingleton();
|
||||
|
||||
IntersectTermsEnumFrame[] stack;
|
||||
|
||||
|
@ -68,6 +65,9 @@ final class IntersectTermsEnum extends BaseTermsEnum {
|
|||
|
||||
private BytesRef savedStartTerm;
|
||||
|
||||
private final SegmentTermsEnum.OutputAccumulator outputAccumulator =
|
||||
new SegmentTermsEnum.OutputAccumulator();
|
||||
|
||||
// TODO: in some cases we can filter by length? eg
|
||||
// regexp foo*bar must be at least length 6 bytes
|
||||
public IntersectTermsEnum(
|
||||
|
@ -114,7 +114,6 @@ final class IntersectTermsEnum extends BaseTermsEnum {
|
|||
f.prefix = 0;
|
||||
f.setState(0);
|
||||
f.arc = arc;
|
||||
f.outputPrefix = arc.output();
|
||||
f.load(fr.rootCode);
|
||||
|
||||
// for assert:
|
||||
|
@ -184,7 +183,9 @@ final class IntersectTermsEnum extends BaseTermsEnum {
|
|||
FST.Arc<BytesRef> arc = currentFrame.arc;
|
||||
int idx = currentFrame.prefix;
|
||||
assert currentFrame.suffix > 0;
|
||||
BytesRef output = currentFrame.outputPrefix;
|
||||
|
||||
outputAccumulator.reset();
|
||||
outputAccumulator.push(arc.output());
|
||||
while (idx < f.prefix) {
|
||||
final int target = term.bytes[idx] & 0xff;
|
||||
// TODO: we could be more efficient for the next()
|
||||
|
@ -192,14 +193,14 @@ final class IntersectTermsEnum extends BaseTermsEnum {
|
|||
// passed to findTargetArc
|
||||
arc = fr.index.findTargetArc(target, arc, getArc(1 + idx), fstReader);
|
||||
assert arc != null;
|
||||
output = fstOutputs.add(output, arc.output());
|
||||
outputAccumulator.push(arc.output());
|
||||
idx++;
|
||||
}
|
||||
|
||||
f.arc = arc;
|
||||
f.outputPrefix = output;
|
||||
assert arc.isFinal();
|
||||
f.load(fstOutputs.add(output, arc.nextFinalOutput()));
|
||||
outputAccumulator.push(arc.nextFinalOutput());
|
||||
f.load(outputAccumulator);
|
||||
return f;
|
||||
}
|
||||
|
||||
|
|
|
@ -55,7 +55,6 @@ final class IntersectTermsEnumFrame {
|
|||
int statsSingletonRunLength = 0;
|
||||
final ByteArrayDataInput statsReader = new ByteArrayDataInput();
|
||||
|
||||
byte[] floorData = new byte[32];
|
||||
final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();
|
||||
|
||||
// Length of prefix shared by all terms in this block
|
||||
|
@ -90,9 +89,6 @@ final class IntersectTermsEnumFrame {
|
|||
|
||||
final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
|
||||
|
||||
// Cumulative output so far
|
||||
BytesRef outputPrefix;
|
||||
|
||||
int startBytePos;
|
||||
int suffix;
|
||||
|
||||
|
@ -120,7 +116,7 @@ final class IntersectTermsEnumFrame {
|
|||
}
|
||||
} while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min);
|
||||
|
||||
load(null);
|
||||
load((Long) null);
|
||||
}
|
||||
|
||||
public void setState(int state) {
|
||||
|
@ -142,12 +138,22 @@ final class IntersectTermsEnumFrame {
|
|||
}
|
||||
|
||||
void load(BytesRef frameIndexData) throws IOException {
|
||||
if (frameIndexData != null) {
|
||||
floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
|
||||
// Skip first long -- has redundant fp, hasTerms
|
||||
// flag, isFloor flag
|
||||
final long code = ite.fr.readVLongOutput(floorDataReader);
|
||||
if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
|
||||
floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
|
||||
load(ite.fr.readVLongOutput(floorDataReader));
|
||||
}
|
||||
|
||||
void load(SegmentTermsEnum.OutputAccumulator outputAccumulator) throws IOException {
|
||||
outputAccumulator.prepareRead();
|
||||
long code = ite.fr.readVLongOutput(outputAccumulator);
|
||||
outputAccumulator.setFloorData(floorDataReader);
|
||||
load(code);
|
||||
}
|
||||
|
||||
void load(Long blockCode) throws IOException {
|
||||
if (blockCode != null) {
|
||||
// This block is the first one in a possible sequence of floor blocks corresponding to a
|
||||
// single seek point from the FST terms index
|
||||
if ((blockCode & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
|
||||
// Floor frame
|
||||
numFollowFloorBlocks = floorDataReader.readVInt();
|
||||
nextFloorLabel = floorDataReader.readByte() & 0xff;
|
||||
|
|
|
@ -16,6 +16,8 @@
|
|||
*/
|
||||
package org.apache.lucene.codecs.lucene90.blocktree;
|
||||
|
||||
import static org.apache.lucene.util.fst.FSTCompiler.getOnHeapReaderWriter;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
@ -525,7 +527,7 @@ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer {
|
|||
// Disable suffixes sharing for block tree index because suffixes are mostly dropped
|
||||
// from the FST index and left in the term blocks.
|
||||
.suffixRAMLimitMB(0d)
|
||||
.bytesPageBits(pageBits)
|
||||
.dataOutput(getOnHeapReaderWriter(pageBits))
|
||||
.build();
|
||||
// if (DEBUG) {
|
||||
// System.out.println(" compile index for prefix=" + prefix);
|
||||
|
|
|
@ -24,6 +24,7 @@ import org.apache.lucene.index.ImpactsEnum;
|
|||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
@ -48,7 +49,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
|
||||
// static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
|
||||
|
||||
private final ByteArrayDataInput scratchReader = new ByteArrayDataInput();
|
||||
private final OutputAccumulator outputAccumulator = new OutputAccumulator();
|
||||
|
||||
// What prefix of the current term was present in the index; when we only next() through the
|
||||
// index, this stays at 0. It's only set when
|
||||
|
@ -232,18 +233,24 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
return arcs[ord];
|
||||
}
|
||||
|
||||
// Pushes a frame we seek'd to
|
||||
SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length)
|
||||
throws IOException {
|
||||
scratchReader.reset(frameData.bytes, frameData.offset, frameData.length);
|
||||
final long code = fr.readVLongOutput(scratchReader);
|
||||
outputAccumulator.reset();
|
||||
outputAccumulator.push(frameData);
|
||||
return pushFrame(arc, length);
|
||||
}
|
||||
|
||||
// Pushes a frame we seek'd to
|
||||
SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, int length) throws IOException {
|
||||
outputAccumulator.prepareRead();
|
||||
final long code = fr.readVLongOutput(outputAccumulator);
|
||||
final long fpSeek = code >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
|
||||
final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord);
|
||||
f.hasTerms = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0;
|
||||
f.hasTermsOrig = f.hasTerms;
|
||||
f.isFloor = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0;
|
||||
if (f.isFloor) {
|
||||
f.setFloorData(scratchReader, frameData);
|
||||
f.setFloorData(outputAccumulator);
|
||||
}
|
||||
pushFrame(arc, fpSeek, length);
|
||||
|
||||
|
@ -344,9 +351,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
|
||||
FST.Arc<BytesRef> arc;
|
||||
int targetUpto;
|
||||
BytesRef output;
|
||||
|
||||
targetBeforeCurrentLength = currentFrame.ord;
|
||||
outputAccumulator.reset();
|
||||
|
||||
if (currentFrame != staticFrame) {
|
||||
|
||||
|
@ -363,7 +370,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
|
||||
arc = arcs[0];
|
||||
assert arc.isFinal();
|
||||
output = arc.output();
|
||||
outputAccumulator.push(arc.output());
|
||||
targetUpto = 0;
|
||||
|
||||
SegmentTermsEnumFrame lastFrame = stack[0];
|
||||
|
@ -373,9 +380,6 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
|
||||
int cmp = 0;
|
||||
|
||||
// TODO: reverse vLong byte order for better FST
|
||||
// prefix output sharing
|
||||
|
||||
// First compare up to valid seek frames:
|
||||
while (targetUpto < targetLimit) {
|
||||
cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
|
||||
|
@ -394,9 +398,8 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
+ (char) arc.label()
|
||||
+ " targetLabel="
|
||||
+ (char) (target.bytes[target.offset + targetUpto] & 0xFF);
|
||||
if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
|
||||
output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
|
||||
}
|
||||
outputAccumulator.push(arc.output());
|
||||
|
||||
if (arc.isFinal()) {
|
||||
lastFrame = stack[1 + lastFrame.ord];
|
||||
}
|
||||
|
@ -484,15 +487,15 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
// System.out.println(" no seek state; push root frame");
|
||||
// }
|
||||
|
||||
output = arc.output();
|
||||
outputAccumulator.push(arc.output());
|
||||
|
||||
currentFrame = staticFrame;
|
||||
|
||||
// term.length = 0;
|
||||
targetUpto = 0;
|
||||
currentFrame =
|
||||
pushFrame(
|
||||
arc, Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), 0);
|
||||
outputAccumulator.push(arc.nextFinalOutput());
|
||||
currentFrame = pushFrame(arc, 0);
|
||||
outputAccumulator.pop();
|
||||
}
|
||||
|
||||
// if (DEBUG) {
|
||||
|
@ -554,9 +557,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
term.setByteAt(targetUpto, (byte) targetLabel);
|
||||
// Aggregate output as we go:
|
||||
assert arc.output() != null;
|
||||
if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
|
||||
output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
|
||||
}
|
||||
outputAccumulator.push(arc.output());
|
||||
|
||||
// if (DEBUG) {
|
||||
// System.out.println(" index: follow label=" + toHex(target.bytes[target.offset +
|
||||
|
@ -566,11 +567,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
|
||||
if (arc.isFinal()) {
|
||||
// if (DEBUG) System.out.println(" arc is final!");
|
||||
currentFrame =
|
||||
pushFrame(
|
||||
arc,
|
||||
Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()),
|
||||
targetUpto);
|
||||
outputAccumulator.push(arc.nextFinalOutput());
|
||||
currentFrame = pushFrame(arc, targetUpto);
|
||||
outputAccumulator.pop();
|
||||
// if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" +
|
||||
// currentFrame.hasTerms);
|
||||
}
|
||||
|
@ -630,9 +629,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
|
||||
FST.Arc<BytesRef> arc;
|
||||
int targetUpto;
|
||||
BytesRef output;
|
||||
|
||||
targetBeforeCurrentLength = currentFrame.ord;
|
||||
outputAccumulator.reset();
|
||||
|
||||
if (currentFrame != staticFrame) {
|
||||
|
||||
|
@ -649,7 +648,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
|
||||
arc = arcs[0];
|
||||
assert arc.isFinal();
|
||||
output = arc.output();
|
||||
outputAccumulator.push(arc.output());
|
||||
targetUpto = 0;
|
||||
|
||||
SegmentTermsEnumFrame lastFrame = stack[0];
|
||||
|
@ -659,9 +658,6 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
|
||||
int cmp = 0;
|
||||
|
||||
// TODO: we should write our vLong backwards (MSB
|
||||
// first) to get better sharing from the FST
|
||||
|
||||
// First compare up to valid seek frames:
|
||||
while (targetUpto < targetLimit) {
|
||||
cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
|
||||
|
@ -680,14 +676,8 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
+ (char) arc.label()
|
||||
+ " targetLabel="
|
||||
+ (char) (target.bytes[target.offset + targetUpto] & 0xFF);
|
||||
// TODO: we could save the outputs in local
|
||||
// byte[][] instead of making new objs ever
|
||||
// seek; but, often the FST doesn't have any
|
||||
// shared bytes (but this could change if we
|
||||
// reverse vLong byte order)
|
||||
if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
|
||||
output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
|
||||
}
|
||||
|
||||
outputAccumulator.push(arc.output());
|
||||
if (arc.isFinal()) {
|
||||
lastFrame = stack[1 + lastFrame.ord];
|
||||
}
|
||||
|
@ -769,15 +759,15 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
// System.out.println(" no seek state; push root frame");
|
||||
// }
|
||||
|
||||
output = arc.output();
|
||||
outputAccumulator.push(arc.output());
|
||||
|
||||
currentFrame = staticFrame;
|
||||
|
||||
// term.length = 0;
|
||||
targetUpto = 0;
|
||||
currentFrame =
|
||||
pushFrame(
|
||||
arc, Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), 0);
|
||||
outputAccumulator.push(arc.nextFinalOutput());
|
||||
currentFrame = pushFrame(arc, 0);
|
||||
outputAccumulator.pop();
|
||||
}
|
||||
|
||||
// if (DEBUG) {
|
||||
|
@ -839,9 +829,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
arc = nextArc;
|
||||
// Aggregate output as we go:
|
||||
assert arc.output() != null;
|
||||
if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
|
||||
output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
|
||||
}
|
||||
outputAccumulator.push(arc.output());
|
||||
|
||||
// if (DEBUG) {
|
||||
// System.out.println(" index: follow label=" + (target.bytes[target.offset +
|
||||
|
@ -851,11 +839,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
|
||||
if (arc.isFinal()) {
|
||||
// if (DEBUG) System.out.println(" arc is final!");
|
||||
currentFrame =
|
||||
pushFrame(
|
||||
arc,
|
||||
Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()),
|
||||
targetUpto);
|
||||
outputAccumulator.push(arc.nextFinalOutput());
|
||||
currentFrame = pushFrame(arc, targetUpto);
|
||||
outputAccumulator.pop();
|
||||
// if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" +
|
||||
// currentFrame.hasTerms);
|
||||
}
|
||||
|
@ -1190,4 +1176,68 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
public long ord() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
static class OutputAccumulator extends DataInput {
|
||||
|
||||
BytesRef[] outputs = new BytesRef[16];
|
||||
BytesRef current;
|
||||
int num;
|
||||
int outputIndex;
|
||||
int index;
|
||||
|
||||
void push(BytesRef output) {
|
||||
if (output != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
|
||||
outputs = ArrayUtil.grow(outputs, num + 1);
|
||||
outputs[num++] = output;
|
||||
}
|
||||
}
|
||||
|
||||
void pop() {
|
||||
assert num > 0;
|
||||
num--;
|
||||
}
|
||||
|
||||
void reset() {
|
||||
num = 0;
|
||||
}
|
||||
|
||||
void prepareRead() {
|
||||
index = 0;
|
||||
outputIndex = 0;
|
||||
current = outputs[0];
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the last arc as the source of the floorData. This won't change the reading position of
|
||||
* this {@link OutputAccumulator}
|
||||
*/
|
||||
void setFloorData(ByteArrayDataInput floorData) {
|
||||
assert outputIndex == num - 1
|
||||
: "floor data should be stored in last arc, get outputIndex: "
|
||||
+ outputIndex
|
||||
+ ", num: "
|
||||
+ num;
|
||||
BytesRef output = outputs[outputIndex];
|
||||
floorData.reset(output.bytes, output.offset + index, output.length - index);
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte readByte() throws IOException {
|
||||
if (index >= current.length) {
|
||||
current = outputs[++outputIndex];
|
||||
index = 0;
|
||||
}
|
||||
return current.bytes[current.offset + index++];
|
||||
}
|
||||
|
||||
@Override
|
||||
public void readBytes(byte[] b, int offset, int len) throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void skipBytes(long numBytes) throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
||||
}
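The new OutputAccumulator lets the block-tree reader decode a vLong that spans several FST arc outputs without concatenating them first. The stand-alone class below only illustrates that idea; it is not Lucene API, and it uses the plain LSB-first vLong layout of DataInput rather than the codec's own encoding:

import java.util.ArrayList;
import java.util.List;

final class ChunkedVLongReader {
  private final List<byte[]> chunks = new ArrayList<>();
  private int chunkIndex;
  private int pos;

  void push(byte[] chunk) {
    if (chunk.length > 0) { // mirror OutputAccumulator: empty outputs are skipped
      chunks.add(chunk);
    }
  }

  byte readByte() {
    while (pos >= chunks.get(chunkIndex).length) {
      chunkIndex++; // hop to the next accumulated chunk, like readByte() above
      pos = 0;
    }
    return chunks.get(chunkIndex)[pos++];
  }

  // 7 bits per byte, high bit set on all but the last byte (DataInput#readVLong layout).
  long readVLong() {
    byte b = readByte();
    long value = b & 0x7FL;
    for (int shift = 7; (b & 0x80) != 0; shift += 7) {
      b = readByte();
      value |= (b & 0x7FL) << shift;
    }
    return value;
  }
}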
|
||||
|
|
|
@ -55,7 +55,7 @@ final class SegmentTermsEnumFrame {
|
|||
int statsSingletonRunLength = 0;
|
||||
final ByteArrayDataInput statsReader = new ByteArrayDataInput();
|
||||
|
||||
byte[] floorData = new byte[32];
|
||||
int rewindPos;
|
||||
final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();
|
||||
|
||||
// Length of prefix shared by all terms in this block
|
||||
|
@ -104,13 +104,9 @@ final class SegmentTermsEnumFrame {
|
|||
suffixLengthsReader = new ByteArrayDataInput();
|
||||
}
|
||||
|
||||
public void setFloorData(ByteArrayDataInput in, BytesRef source) {
|
||||
final int numBytes = source.length - (in.getPosition() - source.offset);
|
||||
if (numBytes > floorData.length) {
|
||||
floorData = new byte[ArrayUtil.oversize(numBytes, 1)];
|
||||
}
|
||||
System.arraycopy(source.bytes, source.offset + in.getPosition(), floorData, 0, numBytes);
|
||||
floorDataReader.reset(floorData, 0, numBytes);
|
||||
public void setFloorData(SegmentTermsEnum.OutputAccumulator outputAccumulator) {
|
||||
outputAccumulator.setFloorData(floorDataReader);
|
||||
rewindPos = floorDataReader.getPosition();
|
||||
numFollowFloorBlocks = floorDataReader.readVInt();
|
||||
nextFloorLabel = floorDataReader.readByte() & 0xff;
|
||||
// if (DEBUG) {
|
||||
|
@ -247,7 +243,7 @@ final class SegmentTermsEnumFrame {
|
|||
nextEnt = -1;
|
||||
hasTerms = hasTermsOrig;
|
||||
if (isFloor) {
|
||||
floorDataReader.rewind();
|
||||
floorDataReader.setPosition(rewindPos);
|
||||
numFollowFloorBlocks = floorDataReader.readVInt();
|
||||
assert numFollowFloorBlocks > 0;
|
||||
nextFloorLabel = floorDataReader.readByte() & 0xff;
|
||||
|
|
|
@ -0,0 +1,57 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
|
||||
/** Decode integers using group-varint. */
|
||||
public class GroupVIntReader {
|
||||
|
||||
public static void readValues(DataInput in, long[] docs, int limit) throws IOException {
|
||||
int i;
|
||||
for (i = 0; i <= limit - 4; i += 4) {
|
||||
final int flag = in.readByte() & 0xFF;
|
||||
|
||||
final int n1Minus1 = flag >> 6;
|
||||
final int n2Minus1 = (flag >> 4) & 0x03;
|
||||
final int n3Minus1 = (flag >> 2) & 0x03;
|
||||
final int n4Minus1 = flag & 0x03;
|
||||
|
||||
docs[i] = readLong(in, n1Minus1);
|
||||
docs[i + 1] = readLong(in, n2Minus1);
|
||||
docs[i + 2] = readLong(in, n3Minus1);
|
||||
docs[i + 3] = readLong(in, n4Minus1);
|
||||
}
|
||||
for (; i < limit; ++i) {
|
||||
docs[i] = in.readVInt();
|
||||
}
|
||||
}
|
||||
|
||||
private static long readLong(DataInput in, int numBytesMinus1) throws IOException {
|
||||
switch (numBytesMinus1) {
|
||||
case 0:
|
||||
return in.readByte() & 0xFFL;
|
||||
case 1:
|
||||
return in.readShort() & 0xFFFFL;
|
||||
case 2:
|
||||
return (in.readShort() & 0xFFFFL) | ((in.readByte() & 0xFFL) << 16);
|
||||
default:
|
||||
return in.readInt() & 0xFFFFFFFFL;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,63 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.lucene99;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
|
||||
/**
|
||||
* Encode integers using group-varint. It uses VInt to encode tail values that are not enough for a
|
||||
* group
|
||||
*/
|
||||
public class GroupVIntWriter {
|
||||
|
||||
// the maximum size of one group is 4 integers + 1 byte flag.
|
||||
private byte[] bytes = new byte[17];
|
||||
private int byteOffset = 0;
|
||||
|
||||
public GroupVIntWriter() {}
|
||||
|
||||
private int encodeValue(int v) {
|
||||
int lastOff = byteOffset;
|
||||
do {
|
||||
bytes[byteOffset++] = (byte) (v & 0xFF);
|
||||
v >>>= 8;
|
||||
} while (v != 0);
|
||||
return byteOffset - lastOff;
|
||||
}
|
||||
|
||||
public void writeValues(DataOutput out, long[] values, int limit) throws IOException {
|
||||
int off = 0;
|
||||
|
||||
// encode each group
|
||||
while ((limit - off) >= 4) {
|
||||
byte flag = 0;
|
||||
byteOffset = 1;
|
||||
flag |= (encodeValue((int) values[off++]) - 1) << 6;
|
||||
flag |= (encodeValue((int) values[off++]) - 1) << 4;
|
||||
flag |= (encodeValue((int) values[off++]) - 1) << 2;
|
||||
flag |= (encodeValue((int) values[off++]) - 1);
|
||||
bytes[0] = flag;
|
||||
out.writeBytes(bytes, byteOffset);
|
||||
}
|
||||
|
||||
// tail vints
|
||||
for (; off < limit; off++) {
|
||||
out.writeVInt((int) values[off]);
|
||||
}
|
||||
}
|
||||
}
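A small round-trip sketch for the two new classes, mirroring what GroupVIntBenchmark#initArrayInput does: encode a block of values with GroupVIntWriter, then decode it back with GroupVIntReader. The sample values are arbitrary:

import java.io.IOException;
import org.apache.lucene.codecs.lucene99.GroupVIntReader;
import org.apache.lucene.codecs.lucene99.GroupVIntWriter;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;

public final class GroupVIntRoundTrip {
  public static void main(String[] args) throws IOException {
    long[] values = {3, 290, 70000, 5, 17}; // first 4 form one group, the 5th is a tail VInt
    byte[] buffer = new byte[Integer.BYTES * values.length * 2]; // sized as in the benchmark

    GroupVIntWriter writer = new GroupVIntWriter();
    writer.writeValues(new ByteArrayDataOutput(buffer), values, values.length);

    long[] decoded = new long[values.length];
    GroupVIntReader.readValues(new ByteArrayDataInput(buffer), decoded, values.length);
    // decoded now holds the same values that were written
  }
}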
|
|
@ -31,6 +31,7 @@ import org.apache.lucene.codecs.KnnVectorsReader;
|
|||
import org.apache.lucene.codecs.KnnVectorsWriter;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.search.TaskExecutor;
|
||||
import org.apache.lucene.util.hnsw.HnswGraph;
|
||||
|
||||
/**
|
||||
|
@ -60,7 +61,7 @@ public final class Lucene99HnswScalarQuantizedVectorsFormat extends KnnVectorsFo
|
|||
private final FlatVectorsFormat flatVectorsFormat;
|
||||
|
||||
private final int numMergeWorkers;
|
||||
private final ExecutorService mergeExec;
|
||||
private final TaskExecutor mergeExec;
|
||||
|
||||
/** Constructs a format using default graph construction parameters */
|
||||
public Lucene99HnswScalarQuantizedVectorsFormat() {
|
||||
|
@ -84,8 +85,8 @@ public final class Lucene99HnswScalarQuantizedVectorsFormat extends KnnVectorsFo
|
|||
* @param beamWidth the size of the queue maintained during graph construction.
|
||||
* @param numMergeWorkers number of workers (threads) that will be used when doing merge. If
|
||||
* larger than 1, a non-null {@link ExecutorService} must be passed as mergeExec
|
||||
* @param configuredQuantile the quantile for scalar quantizing the vectors, when `null` it is
|
||||
* calculated based on the vector field dimensions.
|
||||
* @param confidenceInterval the confidenceInterval for scalar quantizing the vectors, when `null`
|
||||
* it is calculated based on the vector field dimensions.
|
||||
* @param mergeExec the {@link ExecutorService} that will be used by ALL vector writers that are
|
||||
* generated by this format to do the merge
|
||||
*/
|
||||
|
@ -93,7 +94,7 @@ public final class Lucene99HnswScalarQuantizedVectorsFormat extends KnnVectorsFo
|
|||
int maxConn,
|
||||
int beamWidth,
|
||||
int numMergeWorkers,
|
||||
Float configuredQuantile,
|
||||
Float confidenceInterval,
|
||||
ExecutorService mergeExec) {
|
||||
super("Lucene99HnswScalarQuantizedVectorsFormat");
|
||||
if (maxConn <= 0 || maxConn > MAXIMUM_MAX_CONN) {
|
||||
|
@ -121,8 +122,12 @@ public final class Lucene99HnswScalarQuantizedVectorsFormat extends KnnVectorsFo
|
|||
"No executor service is needed as we'll use single thread to merge");
|
||||
}
|
||||
this.numMergeWorkers = numMergeWorkers;
|
||||
this.mergeExec = mergeExec;
|
||||
this.flatVectorsFormat = new Lucene99ScalarQuantizedVectorsFormat(configuredQuantile);
|
||||
if (mergeExec != null) {
|
||||
this.mergeExec = new TaskExecutor(mergeExec);
|
||||
} else {
|
||||
this.mergeExec = null;
|
||||
}
|
||||
this.flatVectorsFormat = new Lucene99ScalarQuantizedVectorsFormat(confidenceInterval);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -27,6 +27,7 @@ import org.apache.lucene.codecs.lucene90.IndexedDISI;
|
|||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.TaskExecutor;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.hnsw.HnswGraph;
|
||||
|
||||
|
@ -137,7 +138,7 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat {
|
|||
private static final FlatVectorsFormat flatVectorsFormat = new Lucene99FlatVectorsFormat();
|
||||
|
||||
private final int numMergeWorkers;
|
||||
private final ExecutorService mergeExec;
|
||||
private final TaskExecutor mergeExec;
|
||||
|
||||
/** Constructs a format using default graph construction parameters */
|
||||
public Lucene99HnswVectorsFormat() {
|
||||
|
@ -192,7 +193,11 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat {
|
|||
"No executor service is needed as we'll use single thread to merge");
|
||||
}
|
||||
this.numMergeWorkers = numMergeWorkers;
|
||||
this.mergeExec = mergeExec;
|
||||
if (mergeExec != null) {
|
||||
this.mergeExec = new TaskExecutor(mergeExec);
|
||||
} else {
|
||||
this.mergeExec = null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -92,18 +92,8 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
|
|||
} catch (Throwable exception) {
|
||||
priorE = exception;
|
||||
} finally {
|
||||
try {
|
||||
CodecUtil.checkFooter(meta, priorE);
|
||||
success = true;
|
||||
} finally {
|
||||
if (success == false) {
|
||||
IOUtils.close(flatVectorsReader);
|
||||
}
|
||||
}
|
||||
CodecUtil.checkFooter(meta, priorE);
|
||||
}
|
||||
}
|
||||
success = false;
|
||||
try {
|
||||
vectorIndex =
|
||||
openDataInput(
|
||||
state,
|
||||
|
@ -237,12 +227,22 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
|
|||
|| fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
|
||||
return;
|
||||
}
|
||||
RandomVectorScorer scorer = flatVectorsReader.getRandomVectorScorer(field, target);
|
||||
HnswGraphSearcher.search(
|
||||
scorer,
|
||||
new OrdinalTranslatedKnnCollector(knnCollector, scorer::ordToDoc),
|
||||
getGraph(fieldEntry),
|
||||
scorer.getAcceptOrds(acceptDocs));
|
||||
final RandomVectorScorer scorer = flatVectorsReader.getRandomVectorScorer(field, target);
|
||||
final KnnCollector collector =
|
||||
new OrdinalTranslatedKnnCollector(knnCollector, scorer::ordToDoc);
|
||||
final Bits acceptedOrds = scorer.getAcceptOrds(acceptDocs);
|
||||
if (knnCollector.k() < scorer.maxOrd()) {
|
||||
HnswGraphSearcher.search(scorer, collector, getGraph(fieldEntry), acceptedOrds);
|
||||
} else {
|
||||
// if k is larger than the number of vectors, we can just iterate over all vectors
|
||||
// and collect them
|
||||
for (int i = 0; i < scorer.maxOrd(); i++) {
|
||||
if (acceptedOrds == null || acceptedOrds.get(i)) {
|
||||
knnCollector.incVisitedCount(1);
|
||||
knnCollector.collect(scorer.ordToDoc(i), scorer.score(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -255,12 +255,22 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
|
|||
|| fieldEntry.vectorEncoding != VectorEncoding.BYTE) {
|
||||
return;
|
||||
}
|
||||
RandomVectorScorer scorer = flatVectorsReader.getRandomVectorScorer(field, target);
|
||||
HnswGraphSearcher.search(
|
||||
scorer,
|
||||
new OrdinalTranslatedKnnCollector(knnCollector, scorer::ordToDoc),
|
||||
getGraph(fieldEntry),
|
||||
scorer.getAcceptOrds(acceptDocs));
|
||||
final RandomVectorScorer scorer = flatVectorsReader.getRandomVectorScorer(field, target);
|
||||
final KnnCollector collector =
|
||||
new OrdinalTranslatedKnnCollector(knnCollector, scorer::ordToDoc);
|
||||
final Bits acceptedOrds = scorer.getAcceptOrds(acceptDocs);
|
||||
if (knnCollector.k() < scorer.maxOrd()) {
|
||||
HnswGraphSearcher.search(scorer, collector, getGraph(fieldEntry), acceptedOrds);
|
||||
} else {
|
||||
// if k is larger than the number of vectors, we can just iterate over all vectors
|
||||
// and collect them
|
||||
for (int i = 0; i < scorer.maxOrd(); i++) {
|
||||
if (acceptedOrds == null || acceptedOrds.get(i)) {
|
||||
knnCollector.incVisitedCount(1);
|
||||
knnCollector.collect(scorer.ordToDoc(i), scorer.score(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
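
The two hunks above add the same fallback for float and byte vectors: run the HNSW graph search only when k is smaller than the number of indexed vectors, otherwise score every ordinal directly. A self-contained sketch of that fallback, with a simplified interface standing in for RandomVectorScorer (the interface and class names here are illustrative, not the real API):

  import java.io.IOException;
  import org.apache.lucene.search.KnnCollector;
  import org.apache.lucene.util.Bits;

  final class ExhaustiveFallback {
    // Simplified stand-in for the scorer used in the hunks above.
    interface OrdScorer {
      int maxOrd();
      int ordToDoc(int ord);
      float score(int ord) throws IOException;
    }

    // When k >= maxOrd, visiting every ordinal is exact and cheaper than graph traversal.
    static void collectAll(OrdScorer scorer, KnnCollector collector, Bits acceptedOrds)
        throws IOException {
      for (int ord = 0; ord < scorer.maxOrd(); ord++) {
        if (acceptedOrds == null || acceptedOrds.get(ord)) {
          collector.incVisitedCount(1);
          collector.collect(scorer.ordToDoc(ord), scorer.score(ord));
        }
      }
    }
  }
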
@Override
|
||||
|
|
|
@ -23,7 +23,6 @@ import java.io.IOException;
|
|||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.FlatVectorsWriter;
|
||||
import org.apache.lucene.codecs.KnnFieldVectorsWriter;
|
||||
|
@ -35,6 +34,7 @@ import org.apache.lucene.index.MergeState;
|
|||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.index.Sorter;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.TaskExecutor;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.InfoStream;
|
||||
|
@ -67,7 +67,7 @@ public final class Lucene99HnswVectorsWriter extends KnnVectorsWriter {
|
|||
private final int beamWidth;
|
||||
private final FlatVectorsWriter flatVectorWriter;
|
||||
private final int numMergeWorkers;
|
||||
private final ExecutorService mergeExec;
|
||||
private final TaskExecutor mergeExec;
|
||||
|
||||
private final List<FieldWriter<?>> fields = new ArrayList<>();
|
||||
private boolean finished;
|
||||
|
@ -78,7 +78,7 @@ public final class Lucene99HnswVectorsWriter extends KnnVectorsWriter {
|
|||
int beamWidth,
|
||||
FlatVectorsWriter flatVectorWriter,
|
||||
int numMergeWorkers,
|
||||
ExecutorService mergeExec)
|
||||
TaskExecutor mergeExec)
|
||||
throws IOException {
|
||||
this.M = M;
|
||||
this.flatVectorWriter = flatVectorWriter;
|
||||
|
|
|
@ -158,8 +158,8 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* <dd><b>Frequencies and Skip Data</b>
|
||||
* <p>The .doc file contains the lists of documents which contain each term, along with the
|
||||
* frequency of the term in that document (except when frequencies are omitted: {@link
|
||||
* IndexOptions#DOCS}). It also saves skip data to the beginning of each packed or VInt block,
|
||||
* when the length of document list is larger than packed block size.
|
||||
* IndexOptions#DOCS}). Skip data is saved at the end of each term's postings. The skip data
|
||||
* is saved once for the entire postings list.
|
||||
* <ul>
|
||||
* <li>docFile(.doc) --> Header, <TermFreqs, SkipData?><sup>TermCount</sup>, Footer
|
||||
* <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
|
||||
|
@ -174,7 +174,8 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* <li>SkipDatum --> DocSkip, DocFPSkip, <PosFPSkip, PosBlockOffset, PayLength?,
|
||||
* PayFPSkip?>?, ImpactLength, <CompetitiveFreqDelta, CompetitiveNormDelta?>
|
||||
* <sup>ImpactCount</sup>, SkipChildLevelPointer?
|
||||
* <li>PackedDocDeltaBlock, PackedFreqBlock --> {@link PackedInts PackedInts}
|
||||
* <li>PackedFreqBlock --> {@link PackedInts PackedInts}, uses patching
|
||||
* <li>PackedDocDeltaBlock --> {@link PackedInts PackedInts}, does not use patching
|
||||
* <li>DocDelta, Freq, DocSkip, DocFPSkip, PosFPSkip, PosBlockOffset, PayByteUpto,
|
||||
* PayFPSkip, ImpactLength, CompetitiveFreqDelta --> {@link DataOutput#writeVInt
|
||||
* VInt}
|
||||
|
|
|
@ -142,21 +142,25 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
|||
|
||||
/** Read values that have been written using variable-length encoding instead of bit-packing. */
|
||||
static void readVIntBlock(
|
||||
IndexInput docIn, long[] docBuffer, long[] freqBuffer, int num, boolean indexHasFreq)
|
||||
IndexInput docIn,
|
||||
long[] docBuffer,
|
||||
long[] freqBuffer,
|
||||
int num,
|
||||
boolean indexHasFreq,
|
||||
boolean decodeFreq)
|
||||
throws IOException {
|
||||
if (indexHasFreq) {
|
||||
for (int i = 0; i < num; i++) {
|
||||
final int code = docIn.readVInt();
|
||||
docBuffer[i] = code >>> 1;
|
||||
if ((code & 1) != 0) {
|
||||
freqBuffer[i] = 1;
|
||||
} else {
|
||||
GroupVIntReader.readValues(docIn, docBuffer, num);
|
||||
if (indexHasFreq && decodeFreq) {
|
||||
for (int i = 0; i < num; ++i) {
|
||||
freqBuffer[i] = docBuffer[i] & 0x01;
|
||||
docBuffer[i] >>= 1;
|
||||
if (freqBuffer[i] == 0) {
|
||||
freqBuffer[i] = docIn.readVInt();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < num; i++) {
|
||||
docBuffer[i] = docIn.readVInt();
|
||||
} else if (indexHasFreq) {
|
||||
for (int i = 0; i < num; ++i) {
|
||||
docBuffer[i] >>= 1;
|
||||
}
|
||||
}
|
||||
}
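
A short worked example of the packing that readVIntBlock now undoes (the numbers are invented): the low bit of each group-varint value records whether the frequency is exactly 1.

  // packed value 15 = (7 << 1) | 1  -> doc delta 7, freq 1, nothing more to read
  // packed value 14 = (7 << 1) | 0  -> doc delta 7, freq follows as a separate vInt
  long packed = 15;
  long docDelta = packed >> 1;           // 7
  boolean freqIsOne = (packed & 1) != 0; // true, so no extra vInt for this entry
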
|
||||
|
@ -471,7 +475,7 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
|||
blockUpto++;
|
||||
} else {
|
||||
// Read vInts:
|
||||
readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq);
|
||||
readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, needsFreq);
|
||||
prefixSum(docBuffer, left, accum);
|
||||
docBuffer[left] = NO_MORE_DOCS;
|
||||
blockUpto += left;
|
||||
|
@ -764,7 +768,7 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
|||
docBuffer[1] = NO_MORE_DOCS;
|
||||
blockUpto++;
|
||||
} else {
|
||||
readVIntBlock(docIn, docBuffer, freqBuffer, left, true);
|
||||
readVIntBlock(docIn, docBuffer, freqBuffer, left, true, true);
|
||||
prefixSum(docBuffer, left, accum);
|
||||
docBuffer[left] = NO_MORE_DOCS;
|
||||
blockUpto += left;
|
||||
|
@ -1073,8 +1077,6 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
|||
|
||||
private int nextSkipDoc = -1;
|
||||
|
||||
private long seekTo = -1;
|
||||
|
||||
// as we read freqBuffer lazily, isFreqsRead shows if freqBuffer are read for the current block
|
||||
// always true when we don't have freqBuffer (indexHasFreq=false) or don't need freqBuffer
|
||||
// (needsFreq=false)
|
||||
|
@ -1153,7 +1155,7 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
blockUpto += BLOCK_SIZE;
|
||||
} else {
|
||||
readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreqs);
|
||||
readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreqs, true);
|
||||
prefixSum(docBuffer, left, accum);
|
||||
docBuffer[left] = NO_MORE_DOCS;
|
||||
blockUpto += left;
|
||||
|
@ -1178,7 +1180,8 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
|||
// Force to read next block
|
||||
docBufferUpto = BLOCK_SIZE;
|
||||
accum = skipper.getDoc();
|
||||
seekTo = skipper.getDocPointer(); // delay the seek
|
||||
docIn.seek(skipper.getDocPointer());
|
||||
isFreqsRead = true;
|
||||
}
|
||||
// next time we call advance, this is used to
|
||||
// foresee whether skipper is necessary.
|
||||
|
@ -1198,11 +1201,6 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
|||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
if (docBufferUpto == BLOCK_SIZE) {
|
||||
if (seekTo >= 0) {
|
||||
docIn.seek(seekTo);
|
||||
isFreqsRead = true; // reset isFreqsRead
|
||||
seekTo = -1;
|
||||
}
|
||||
refillDocs();
|
||||
}
|
||||
return this.doc = (int) docBuffer[docBufferUpto++];
|
||||
|
@ -1214,11 +1212,6 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
|||
advanceShallow(target);
|
||||
}
|
||||
if (docBufferUpto == BLOCK_SIZE) {
|
||||
if (seekTo >= 0) {
|
||||
docIn.seek(seekTo);
|
||||
isFreqsRead = true; // reset isFreqsRead
|
||||
seekTo = -1;
|
||||
}
|
||||
refillDocs();
|
||||
}
|
||||
|
||||
|
@ -1307,8 +1300,6 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
|||
|
||||
private int nextSkipDoc = -1;
|
||||
|
||||
private long seekTo = -1;
|
||||
|
||||
public BlockImpactsPostingsEnum(FieldInfo fieldInfo, IntBlockTermState termState)
|
||||
throws IOException {
|
||||
indexHasOffsets =
|
||||
|
@ -1372,7 +1363,7 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
|||
forDeltaUtil.decodeAndPrefixSum(docIn, accum, docBuffer);
|
||||
pforUtil.decode(docIn, freqBuffer);
|
||||
} else {
|
||||
readVIntBlock(docIn, docBuffer, freqBuffer, left, true);
|
||||
readVIntBlock(docIn, docBuffer, freqBuffer, left, true, true);
|
||||
prefixSum(docBuffer, left, accum);
|
||||
docBuffer[left] = NO_MORE_DOCS;
|
||||
}
|
||||
|
@ -1426,7 +1417,7 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
|||
accum = skipper.getDoc();
|
||||
posPendingFP = skipper.getPosPointer();
|
||||
posPendingCount = skipper.getPosBufferUpto();
|
||||
seekTo = skipper.getDocPointer(); // delay the seek
|
||||
docIn.seek(skipper.getDocPointer());
|
||||
}
|
||||
// next time we call advance, this is used to
|
||||
// foresee whether skipper is necessary.
|
||||
|
@ -1452,10 +1443,6 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
|||
advanceShallow(target);
|
||||
}
|
||||
if (docBufferUpto == BLOCK_SIZE) {
|
||||
if (seekTo >= 0) {
|
||||
docIn.seek(seekTo);
|
||||
seekTo = -1;
|
||||
}
|
||||
refillDocs();
|
||||
}
|
||||
|
||||
|
@ -1766,7 +1753,7 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
|
|||
false; // freq block will be loaded lazily when necessary, we don't load it here
|
||||
}
|
||||
} else {
|
||||
readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq);
|
||||
readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, true);
|
||||
prefixSum(docBuffer, left, accum);
|
||||
docBuffer[left] = NO_MORE_DOCS;
|
||||
}
|
||||
|
|
|
@ -92,6 +92,7 @@ public final class Lucene99PostingsWriter extends PushPostingsWriterBase {
|
|||
private final PForUtil pforUtil;
|
||||
private final ForDeltaUtil forDeltaUtil;
|
||||
private final Lucene99SkipWriter skipWriter;
|
||||
private final GroupVIntWriter docGroupVIntWriter;
|
||||
|
||||
private boolean fieldHasNorms;
|
||||
private NumericDocValues norms;
|
||||
|
@ -172,6 +173,7 @@ public final class Lucene99PostingsWriter extends PushPostingsWriterBase {
|
|||
skipWriter =
|
||||
new Lucene99SkipWriter(
|
||||
MAX_SKIP_LEVELS, BLOCK_SIZE, state.segmentInfo.maxDoc(), docOut, posOut, payOut);
|
||||
docGroupVIntWriter = new GroupVIntWriter();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -370,17 +372,19 @@ public final class Lucene99PostingsWriter extends PushPostingsWriterBase {
|
|||
singletonDocID = (int) docDeltaBuffer[0];
|
||||
} else {
|
||||
singletonDocID = -1;
|
||||
// vInt encode the remaining doc deltas and freqs:
|
||||
for (int i = 0; i < docBufferUpto; i++) {
|
||||
final int docDelta = (int) docDeltaBuffer[i];
|
||||
final int freq = (int) freqBuffer[i];
|
||||
if (!writeFreqs) {
|
||||
docOut.writeVInt(docDelta);
|
||||
} else if (freq == 1) {
|
||||
docOut.writeVInt((docDelta << 1) | 1);
|
||||
} else {
|
||||
docOut.writeVInt(docDelta << 1);
|
||||
docOut.writeVInt(freq);
|
||||
// Group vInt encode the remaining doc deltas and freqs:
|
||||
if (writeFreqs) {
|
||||
for (int i = 0; i < docBufferUpto; i++) {
|
||||
docDeltaBuffer[i] = (docDeltaBuffer[i] << 1) | (freqBuffer[i] == 1 ? 1 : 0);
|
||||
}
|
||||
}
|
||||
docGroupVIntWriter.writeValues(docOut, docDeltaBuffer, docBufferUpto);
|
||||
if (writeFreqs) {
|
||||
for (int i = 0; i < docBufferUpto; i++) {
|
||||
final int freq = (int) freqBuffer[i];
|
||||
if (freq != 1) {
|
||||
docOut.writeVInt(freq);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
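
A minimal sketch of the tail-block packing introduced above, on invented buffers: frequencies equal to 1 are folded into the low bit of the doc delta before the group-varint write, and only the remaining frequencies are appended as explicit vInts afterwards.

  long[] docDeltaBuffer = {3, 7, 2};
  long[] freqBuffer = {1, 3, 1};
  for (int i = 0; i < docDeltaBuffer.length; i++) {
    // low bit set == "freq is exactly 1, nothing more to write for this doc"
    docDeltaBuffer[i] = (docDeltaBuffer[i] << 1) | (freqBuffer[i] == 1 ? 1 : 0);
  }
  // docDeltaBuffer is now {7, 14, 5}; after the group-varint block only the
  // frequency 3 (for the second doc) is written as a separate vInt.
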
@ -43,17 +43,17 @@ public final class Lucene99ScalarQuantizedVectorsFormat extends FlatVectorsForma
|
|||
|
||||
private static final FlatVectorsFormat rawVectorFormat = new Lucene99FlatVectorsFormat();
|
||||
|
||||
/** The minimum quantile */
|
||||
private static final float MINIMUM_QUANTILE = 0.9f;
|
||||
/** The minimum confidence interval */
|
||||
private static final float MINIMUM_CONFIDENCE_INTERVAL = 0.9f;
|
||||
|
||||
/** The maximum quantile */
|
||||
private static final float MAXIMUM_QUANTILE = 1f;
|
||||
/** The maximum confidence interval */
|
||||
private static final float MAXIMUM_CONFIDENCE_INTERVAL = 1f;
|
||||
|
||||
/**
|
||||
* Controls the quantile used to scalar quantize the vectors the default quantile is calculated as
|
||||
* `1-1/(vector_dimensions + 1)`
|
||||
* Controls the confidence interval used to scalar quantize the vectors the default value is
|
||||
* calculated as `1-1/(vector_dimensions + 1)`
|
||||
*/
|
||||
final Float quantile;
|
||||
final Float confidenceInterval;
|
||||
|
||||
/** Constructs a format using default graph construction parameters */
|
||||
public Lucene99ScalarQuantizedVectorsFormat() {
|
||||
|
@ -63,24 +63,26 @@ public final class Lucene99ScalarQuantizedVectorsFormat extends FlatVectorsForma
|
|||
/**
|
||||
* Constructs a format using the given graph construction parameters.
|
||||
*
|
||||
* @param quantile the quantile for scalar quantizing the vectors, when `null` it is calculated
|
||||
* based on the vector field dimensions.
|
||||
* @param confidenceInterval the confidenceInterval for scalar quantizing the vectors, when `null`
|
||||
* it is calculated based on the vector field dimensions.
|
||||
*/
|
||||
public Lucene99ScalarQuantizedVectorsFormat(Float quantile) {
|
||||
if (quantile != null && (quantile < MINIMUM_QUANTILE || quantile > MAXIMUM_QUANTILE)) {
|
||||
public Lucene99ScalarQuantizedVectorsFormat(Float confidenceInterval) {
|
||||
if (confidenceInterval != null
|
||||
&& (confidenceInterval < MINIMUM_CONFIDENCE_INTERVAL
|
||||
|| confidenceInterval > MAXIMUM_CONFIDENCE_INTERVAL)) {
|
||||
throw new IllegalArgumentException(
|
||||
"quantile must be between "
|
||||
+ MINIMUM_QUANTILE
|
||||
"confidenceInterval must be between "
|
||||
+ MINIMUM_CONFIDENCE_INTERVAL
|
||||
+ " and "
|
||||
+ MAXIMUM_QUANTILE
|
||||
+ "; quantile="
|
||||
+ quantile);
|
||||
+ MAXIMUM_CONFIDENCE_INTERVAL
|
||||
+ "; confidenceInterval="
|
||||
+ confidenceInterval);
|
||||
}
|
||||
this.quantile = quantile;
|
||||
this.confidenceInterval = confidenceInterval;
|
||||
}
|
||||
|
||||
static float calculateDefaultQuantile(int vectorDimension) {
|
||||
return Math.max(MINIMUM_QUANTILE, 1f - (1f / (vectorDimension + 1)));
|
||||
static float calculateDefaultConfidenceInterval(int vectorDimension) {
|
||||
return Math.max(MINIMUM_CONFIDENCE_INTERVAL, 1f - (1f / (vectorDimension + 1)));
|
||||
}
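
Two worked values for calculateDefaultConfidenceInterval above, since the 0.9 floor only matters for very small vectors:

  // dimension 4:   1 - 1/5   = 0.8    -> clamped up to the 0.9 minimum
  // dimension 768: 1 - 1/769 ~ 0.9987 -> used as-is
  float small = Math.max(0.9f, 1f - (1f / (4 + 1)));    // 0.9f
  float large = Math.max(0.9f, 1f - (1f / (768 + 1)));  // ~0.9987f
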
|
||||
|
||||
@Override
|
||||
|
@ -88,8 +90,8 @@ public final class Lucene99ScalarQuantizedVectorsFormat extends FlatVectorsForma
|
|||
return NAME
|
||||
+ "(name="
|
||||
+ NAME
|
||||
+ ", quantile="
|
||||
+ quantile
|
||||
+ ", confidenceInterval="
|
||||
+ confidenceInterval
|
||||
+ ", rawVectorFormat="
|
||||
+ rawVectorFormat
|
||||
+ ")";
|
||||
|
@ -98,7 +100,7 @@ public final class Lucene99ScalarQuantizedVectorsFormat extends FlatVectorsForma
|
|||
@Override
|
||||
public FlatVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException {
|
||||
return new Lucene99ScalarQuantizedVectorsWriter(
|
||||
state, quantile, rawVectorFormat.fieldsWriter(state));
|
||||
state, confidenceInterval, rawVectorFormat.fieldsWriter(state));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -58,6 +58,7 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
|
|||
|
||||
Lucene99ScalarQuantizedVectorsReader(SegmentReadState state, FlatVectorsReader rawVectorsReader)
|
||||
throws IOException {
|
||||
this.rawVectorsReader = rawVectorsReader;
|
||||
int versionMeta = -1;
|
||||
String metaFileName =
|
||||
IndexFileNames.segmentFileName(
|
||||
|
@ -80,19 +81,8 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
|
|||
} catch (Throwable exception) {
|
||||
priorE = exception;
|
||||
} finally {
|
||||
try {
|
||||
CodecUtil.checkFooter(meta, priorE);
|
||||
success = true;
|
||||
} finally {
|
||||
if (success == false) {
|
||||
IOUtils.close(rawVectorsReader);
|
||||
}
|
||||
}
|
||||
CodecUtil.checkFooter(meta, priorE);
|
||||
}
|
||||
}
|
||||
success = false;
|
||||
this.rawVectorsReader = rawVectorsReader;
|
||||
try {
|
||||
quantizedVectorData =
|
||||
openDataInput(
|
||||
state,
|
||||
|
@ -313,10 +303,10 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
|
|||
dimension = input.readVInt();
|
||||
size = input.readInt();
|
||||
if (size > 0) {
|
||||
float configuredQuantile = Float.intBitsToFloat(input.readInt());
|
||||
float confidenceInterval = Float.intBitsToFloat(input.readInt());
|
||||
float minQuantile = Float.intBitsToFloat(input.readInt());
|
||||
float maxQuantile = Float.intBitsToFloat(input.readInt());
|
||||
scalarQuantizer = new ScalarQuantizer(minQuantile, maxQuantile, configuredQuantile);
|
||||
scalarQuantizer = new ScalarQuantizer(minQuantile, maxQuantile, confidenceInterval);
|
||||
} else {
|
||||
scalarQuantizer = null;
|
||||
}
|
||||
|
|
|
@ -19,7 +19,7 @@ package org.apache.lucene.codecs.lucene99;
|
|||
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.QUANTIZED_VECTOR_COMPONENT;
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.calculateDefaultQuantile;
|
||||
import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.calculateDefaultConfidenceInterval;
|
||||
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
|
||||
import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOfInstance;
|
||||
|
||||
|
@ -91,14 +91,14 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
|||
|
||||
private final List<FieldWriter> fields = new ArrayList<>();
|
||||
private final IndexOutput meta, quantizedVectorData;
|
||||
private final Float quantile;
|
||||
private final Float confidenceInterval;
|
||||
private final FlatVectorsWriter rawVectorDelegate;
|
||||
private boolean finished;
|
||||
|
||||
Lucene99ScalarQuantizedVectorsWriter(
|
||||
SegmentWriteState state, Float quantile, FlatVectorsWriter rawVectorDelegate)
|
||||
SegmentWriteState state, Float confidenceInterval, FlatVectorsWriter rawVectorDelegate)
|
||||
throws IOException {
|
||||
this.quantile = quantile;
|
||||
this.confidenceInterval = confidenceInterval;
|
||||
segmentWriteState = state;
|
||||
String metaFileName =
|
||||
IndexFileNames.segmentFileName(
|
||||
|
@ -142,12 +142,12 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
|||
public FlatFieldVectorsWriter<?> addField(
|
||||
FieldInfo fieldInfo, KnnFieldVectorsWriter<?> indexWriter) throws IOException {
|
||||
if (fieldInfo.getVectorEncoding().equals(VectorEncoding.FLOAT32)) {
|
||||
float quantile =
|
||||
this.quantile == null
|
||||
? calculateDefaultQuantile(fieldInfo.getVectorDimension())
|
||||
: this.quantile;
|
||||
float confidenceInterval =
|
||||
this.confidenceInterval == null
|
||||
? calculateDefaultConfidenceInterval(fieldInfo.getVectorDimension())
|
||||
: this.confidenceInterval;
|
||||
FieldWriter quantizedWriter =
|
||||
new FieldWriter(quantile, fieldInfo, segmentWriteState.infoStream, indexWriter);
|
||||
new FieldWriter(confidenceInterval, fieldInfo, segmentWriteState.infoStream, indexWriter);
|
||||
fields.add(quantizedWriter);
|
||||
indexWriter = quantizedWriter;
|
||||
}
|
||||
|
@ -169,16 +169,16 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
|||
DocsWithFieldSet docsWithField =
|
||||
writeQuantizedVectorData(quantizedVectorData, byteVectorValues);
|
||||
long vectorDataLength = quantizedVectorData.getFilePointer() - vectorDataOffset;
|
||||
float quantile =
|
||||
this.quantile == null
|
||||
? calculateDefaultQuantile(fieldInfo.getVectorDimension())
|
||||
: this.quantile;
|
||||
float confidenceInterval =
|
||||
this.confidenceInterval == null
|
||||
? calculateDefaultConfidenceInterval(fieldInfo.getVectorDimension())
|
||||
: this.confidenceInterval;
|
||||
writeMeta(
|
||||
fieldInfo,
|
||||
segmentWriteState.segmentInfo.maxDoc(),
|
||||
vectorDataOffset,
|
||||
vectorDataLength,
|
||||
quantile,
|
||||
confidenceInterval,
|
||||
mergedQuantizationState.getLowerQuantile(),
|
||||
mergedQuantizationState.getUpperQuantile(),
|
||||
docsWithField);
|
||||
|
@ -251,7 +251,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
|||
maxDoc,
|
||||
vectorDataOffset,
|
||||
vectorDataLength,
|
||||
quantile,
|
||||
confidenceInterval,
|
||||
fieldData.minQuantile,
|
||||
fieldData.maxQuantile,
|
||||
fieldData.docsWithField);
|
||||
|
@ -262,7 +262,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
|||
int maxDoc,
|
||||
long vectorDataOffset,
|
||||
long vectorDataLength,
|
||||
Float configuredQuantizationQuantile,
|
||||
Float confidenceInterval,
|
||||
Float lowerQuantile,
|
||||
Float upperQuantile,
|
||||
DocsWithFieldSet docsWithField)
|
||||
|
@ -279,9 +279,9 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
|||
assert Float.isFinite(lowerQuantile) && Float.isFinite(upperQuantile);
|
||||
meta.writeInt(
|
||||
Float.floatToIntBits(
|
||||
configuredQuantizationQuantile != null
|
||||
? configuredQuantizationQuantile
|
||||
: calculateDefaultQuantile(field.getVectorDimension())));
|
||||
confidenceInterval != null
|
||||
? confidenceInterval
|
||||
: calculateDefaultConfidenceInterval(field.getVectorDimension())));
|
||||
meta.writeInt(Float.floatToIntBits(lowerQuantile));
|
||||
meta.writeInt(Float.floatToIntBits(upperQuantile));
|
||||
}
|
||||
|
@ -344,7 +344,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
|||
maxDoc,
|
||||
vectorDataOffset,
|
||||
quantizedVectorLength,
|
||||
quantile,
|
||||
confidenceInterval,
|
||||
fieldData.minQuantile,
|
||||
fieldData.maxQuantile,
|
||||
newDocsWithField);
|
||||
|
@ -374,11 +374,11 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
|||
private ScalarQuantizer mergeQuantiles(FieldInfo fieldInfo, MergeState mergeState)
|
||||
throws IOException {
|
||||
assert fieldInfo.getVectorEncoding() == VectorEncoding.FLOAT32;
|
||||
float quantile =
|
||||
this.quantile == null
|
||||
? calculateDefaultQuantile(fieldInfo.getVectorDimension())
|
||||
: this.quantile;
|
||||
return mergeAndRecalculateQuantiles(mergeState, fieldInfo, quantile);
|
||||
float confidenceInterval =
|
||||
this.confidenceInterval == null
|
||||
? calculateDefaultConfidenceInterval(fieldInfo.getVectorDimension())
|
||||
: this.confidenceInterval;
|
||||
return mergeAndRecalculateQuantiles(mergeState, fieldInfo, confidenceInterval);
|
||||
}
|
||||
|
||||
private ScalarQuantizedCloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
|
||||
|
@ -408,16 +408,16 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
|||
quantizationDataInput, quantizationDataInput.length() - CodecUtil.footerLength());
|
||||
long vectorDataLength = quantizedVectorData.getFilePointer() - vectorDataOffset;
|
||||
CodecUtil.retrieveChecksum(quantizationDataInput);
|
||||
float quantile =
|
||||
this.quantile == null
|
||||
? calculateDefaultQuantile(fieldInfo.getVectorDimension())
|
||||
: this.quantile;
|
||||
float confidenceInterval =
|
||||
this.confidenceInterval == null
|
||||
? calculateDefaultConfidenceInterval(fieldInfo.getVectorDimension())
|
||||
: this.confidenceInterval;
|
||||
writeMeta(
|
||||
fieldInfo,
|
||||
segmentWriteState.segmentInfo.maxDoc(),
|
||||
vectorDataOffset,
|
||||
vectorDataLength,
|
||||
quantile,
|
||||
confidenceInterval,
|
||||
mergedQuantizationState.getLowerQuantile(),
|
||||
mergedQuantizationState.getUpperQuantile(),
|
||||
docsWithField);
|
||||
|
@ -446,7 +446,9 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
|||
}
|
||||
|
||||
static ScalarQuantizer mergeQuantiles(
|
||||
List<ScalarQuantizer> quantizationStates, List<Integer> segmentSizes, float quantile) {
|
||||
List<ScalarQuantizer> quantizationStates,
|
||||
List<Integer> segmentSizes,
|
||||
float confidenceInterval) {
|
||||
assert quantizationStates.size() == segmentSizes.size();
|
||||
if (quantizationStates.isEmpty()) {
|
||||
return null;
|
||||
|
@ -464,7 +466,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
|||
}
|
||||
lowerQuantile /= totalCount;
|
||||
upperQuantile /= totalCount;
|
||||
return new ScalarQuantizer(lowerQuantile, upperQuantile, quantile);
|
||||
return new ScalarQuantizer(lowerQuantile, upperQuantile, confidenceInterval);
|
||||
}
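
A worked example of the merge above, assuming (as the division by totalCount suggests) that each segment's lower and upper quantiles are accumulated weighted by its vector count; the numbers are invented.

  // segment A: 1000 vectors, lower -0.50, upper 0.80
  // segment B: 3000 vectors, lower -0.30, upper 0.60
  // merged lower = (1000 * -0.50 + 3000 * -0.30) / 4000 = -0.35
  // merged upper = (1000 *  0.80 + 3000 *  0.60) / 4000 =  0.65
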
|
||||
|
||||
/**
|
||||
|
@ -521,7 +523,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
|||
}
|
||||
|
||||
static ScalarQuantizer mergeAndRecalculateQuantiles(
|
||||
MergeState mergeState, FieldInfo fieldInfo, float quantile) throws IOException {
|
||||
MergeState mergeState, FieldInfo fieldInfo, float confidenceInterval) throws IOException {
|
||||
List<ScalarQuantizer> quantizationStates = new ArrayList<>(mergeState.liveDocs.length);
|
||||
List<Integer> segmentSizes = new ArrayList<>(mergeState.liveDocs.length);
|
||||
for (int i = 0; i < mergeState.liveDocs.length; i++) {
|
||||
|
@ -536,7 +538,8 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
|||
segmentSizes.add(fvv.size());
|
||||
}
|
||||
}
|
||||
ScalarQuantizer mergedQuantiles = mergeQuantiles(quantizationStates, segmentSizes, quantile);
|
||||
ScalarQuantizer mergedQuantiles =
|
||||
mergeQuantiles(quantizationStates, segmentSizes, confidenceInterval);
|
||||
// Segments no providing quantization state indicates that their quantiles were never
|
||||
// calculated.
|
||||
// To be safe, we should always recalculate given a sample set over all the float vectors in the
|
||||
|
@ -545,7 +548,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
|||
if (mergedQuantiles == null || shouldRecomputeQuantiles(mergedQuantiles, quantizationStates)) {
|
||||
FloatVectorValues vectorValues =
|
||||
KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState);
|
||||
mergedQuantiles = ScalarQuantizer.fromVectors(vectorValues, quantile);
|
||||
mergedQuantiles = ScalarQuantizer.fromVectors(vectorValues, confidenceInterval);
|
||||
}
|
||||
return mergedQuantiles;
|
||||
}
|
||||
|
@ -599,7 +602,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
|||
private static final long SHALLOW_SIZE = shallowSizeOfInstance(FieldWriter.class);
|
||||
private final List<float[]> floatVectors;
|
||||
private final FieldInfo fieldInfo;
|
||||
private final float quantile;
|
||||
private final float confidenceInterval;
|
||||
private final InfoStream infoStream;
|
||||
private final boolean normalize;
|
||||
private float minQuantile = Float.POSITIVE_INFINITY;
|
||||
|
@ -609,12 +612,12 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
|||
|
||||
@SuppressWarnings("unchecked")
|
||||
FieldWriter(
|
||||
float quantile,
|
||||
float confidenceInterval,
|
||||
FieldInfo fieldInfo,
|
||||
InfoStream infoStream,
|
||||
KnnFieldVectorsWriter<?> indexWriter) {
|
||||
super((KnnFieldVectorsWriter<float[]>) indexWriter);
|
||||
this.quantile = quantile;
|
||||
this.confidenceInterval = confidenceInterval;
|
||||
this.fieldInfo = fieldInfo;
|
||||
this.normalize = fieldInfo.getVectorSimilarityFunction() == VectorSimilarityFunction.COSINE;
|
||||
this.floatVectors = new ArrayList<>();
|
||||
|
@ -635,15 +638,15 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
|||
new FloatVectorWrapper(
|
||||
floatVectors,
|
||||
fieldInfo.getVectorSimilarityFunction() == VectorSimilarityFunction.COSINE),
|
||||
quantile);
|
||||
confidenceInterval);
|
||||
minQuantile = quantizer.getLowerQuantile();
|
||||
maxQuantile = quantizer.getUpperQuantile();
|
||||
if (infoStream.isEnabled(QUANTIZED_VECTOR_COMPONENT)) {
|
||||
infoStream.message(
|
||||
QUANTIZED_VECTOR_COMPONENT,
|
||||
"quantized field="
|
||||
+ " quantile="
|
||||
+ quantile
|
||||
+ " confidenceInterval="
|
||||
+ confidenceInterval
|
||||
+ " minQuantile="
|
||||
+ minQuantile
|
||||
+ " maxQuantile="
|
||||
|
@ -654,7 +657,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
|
|||
|
||||
ScalarQuantizer createQuantizer() {
|
||||
assert finished;
|
||||
return new ScalarQuantizer(minQuantile, maxQuantile, quantile);
|
||||
return new ScalarQuantizer(minQuantile, maxQuantile, confidenceInterval);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -119,7 +119,6 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
|
|||
}
|
||||
}
|
||||
}
|
||||
;
|
||||
|
||||
static String getSuffix(String formatName, String suffix) {
|
||||
return formatName + "_" + suffix;
|
||||
|
|
|
@ -272,7 +272,6 @@ public final class FeatureField extends Field {
|
|||
return true;
|
||||
}
|
||||
}
|
||||
;
|
||||
|
||||
static final class LogFunction extends FeatureFunction {
|
||||
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
*/
|
||||
package org.apache.lucene.document;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Objects;
|
||||
import org.apache.lucene.index.DocValuesType;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
|
@ -171,7 +172,7 @@ public class KeywordField extends Field {
|
|||
* @throws NullPointerException if {@code field} is null.
|
||||
* @return a query matching documents with this exact value
|
||||
*/
|
||||
public static Query newSetQuery(String field, BytesRef... values) {
|
||||
public static Query newSetQuery(String field, Collection<BytesRef> values) {
|
||||
Objects.requireNonNull(field, "field must not be null");
|
||||
Objects.requireNonNull(values, "values must not be null");
|
||||
Query indexQuery = new TermInSetQuery(field, values);
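
A usage sketch for the Collection-based overload above; the field name and values are illustrative, and the usual org.apache.lucene.util.BytesRef and java.util.List imports are assumed.

  List<BytesRef> colors = List.of(new BytesRef("red"), new BytesRef("green"), new BytesRef("blue"));
  Query query = KeywordField.newSetQuery("color", colors);
  // Callers that previously built a BytesRef[] only to satisfy the varargs
  // signature can now pass their collection directly.
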
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
*/
|
||||
package org.apache.lucene.document;
|
||||
|
||||
import java.util.Collection;
|
||||
import org.apache.lucene.index.DocValuesType;
|
||||
import org.apache.lucene.search.IndexOrDocValuesQuery;
|
||||
import org.apache.lucene.search.MultiTermQuery;
|
||||
|
@ -99,7 +100,7 @@ public class SortedDocValuesField extends Field {
|
|||
* in an {@link IndexOrDocValuesQuery}, alongside a set query that executes on postings, such as
|
||||
* {@link TermInSetQuery}.
|
||||
*/
|
||||
public static Query newSlowSetQuery(String field, BytesRef... values) {
|
||||
public static Query newSlowSetQuery(String field, Collection<BytesRef> values) {
|
||||
return new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, field, values);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
*/
|
||||
package org.apache.lucene.document;
|
||||
|
||||
import java.util.Collection;
|
||||
import org.apache.lucene.index.DocValuesType;
|
||||
import org.apache.lucene.search.IndexOrDocValuesQuery;
|
||||
import org.apache.lucene.search.MultiTermQuery;
|
||||
|
@ -103,7 +104,7 @@ public class SortedSetDocValuesField extends Field {
|
|||
* in an {@link IndexOrDocValuesQuery}, alongside a set query that executes on postings, such as
|
||||
* {@link TermInSetQuery}.
|
||||
*/
|
||||
public static Query newSlowSetQuery(String field, BytesRef... values) {
|
||||
public static Query newSlowSetQuery(String field, Collection<BytesRef> values) {
|
||||
return new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, field, values);
|
||||
}
|
||||
}
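
A combined usage sketch for the doc-values overloads above, following the javadoc's suggestion to pair them with a postings-based TermInSetQuery inside an IndexOrDocValuesQuery; the field name and terms are invented.

  Collection<BytesRef> brands = List.of(new BytesRef("acme"), new BytesRef("globex"));
  Query onPostings = new TermInSetQuery("brand", brands);
  Query onDocValues = SortedSetDocValuesField.newSlowSetQuery("brand", brands);
  // IndexOrDocValuesQuery lets the planner pick whichever side is cheaper at search time.
  Query query = new IndexOrDocValuesQuery(onPostings, onDocValues);
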
|
||||
|
|
|
@ -694,7 +694,7 @@ abstract class SpatialQuery extends Query {
|
|||
final SpatialVisitor spatialVisitor, QueryRelation queryRelation, final FixedBitSet result) {
|
||||
final BiFunction<byte[], byte[], Relation> innerFunction =
|
||||
spatialVisitor.getInnerFunction(queryRelation);
|
||||
;
|
||||
|
||||
return new IntersectVisitor() {
|
||||
|
||||
@Override
|
||||
|
|
|
@ -1254,8 +1254,7 @@ public final class Tessellator {
|
|||
++numMerges;
|
||||
// step 'insize' places along from p
|
||||
q = p;
|
||||
for (i = 0, pSize = 0; i < inSize && q != null; ++i, ++pSize, q = q.nextZ)
|
||||
;
|
||||
for (i = 0, pSize = 0; i < inSize && q != null; ++i, ++pSize, q = q.nextZ) {}
|
||||
// if q hasn't fallen off end, we have two lists to merge
|
||||
qSize = inSize;
|
||||
|
||||
|
|
|
@ -22,11 +22,11 @@ import org.apache.lucene.store.DataOutput;
|
|||
import org.apache.lucene.util.BitUtil;
|
||||
import org.apache.lucene.util.ByteBlockPool;
|
||||
|
||||
/* IndexInput that knows how to read the byte slices written
|
||||
* by Posting and PostingVector. We read the bytes in
|
||||
* each slice until we hit the end of that slice at which
|
||||
* point we read the forwarding address of the next slice
|
||||
* and then jump to it.*/
|
||||
/**
|
||||
* IndexInput that knows how to read the byte slices written by Posting and PostingVector. We read
|
||||
* the bytes in each slice until we hit the end of that slice at which point we read the forwarding
|
||||
* address of the next slice and then jump to it.
|
||||
*/
|
||||
final class ByteSliceReader extends DataInput {
|
||||
ByteBlockPool pool;
|
||||
int bufferUpto;
|
||||
|
|
|
@ -28,7 +28,7 @@ import java.nio.file.Paths;
|
|||
import java.text.NumberFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
@ -96,11 +96,11 @@ import org.apache.lucene.util.Version;
|
|||
*/
|
||||
public final class CheckIndex implements Closeable {
|
||||
|
||||
private final Directory dir;
|
||||
private final Lock writeLock;
|
||||
private final NumberFormat nf = NumberFormat.getInstance(Locale.ROOT);
|
||||
private PrintStream infoStream;
|
||||
private Directory dir;
|
||||
private Lock writeLock;
|
||||
private volatile boolean closed;
|
||||
private NumberFormat nf = NumberFormat.getInstance(Locale.ROOT);
|
||||
|
||||
/**
|
||||
* Returned from {@link #checkIndex()} detailing the health and status of the index.
|
||||
|
@ -441,19 +441,20 @@ public final class CheckIndex implements Closeable {
|
|||
IOUtils.close(writeLock);
|
||||
}
|
||||
|
||||
private boolean doSlowChecks;
|
||||
private int level;
|
||||
|
||||
/**
|
||||
* If true, additional slow checks are performed. This will likely drastically increase time it
|
||||
* takes to run CheckIndex!
|
||||
* Sets Level, the higher the value, the more additional checks are performed. This will likely
|
||||
* drastically increase time it takes to run CheckIndex! See {@link Level}
|
||||
*/
|
||||
public void setDoSlowChecks(boolean v) {
|
||||
doSlowChecks = v;
|
||||
public void setLevel(int v) {
|
||||
Level.checkIfLevelInBounds(v);
|
||||
level = v;
|
||||
}
|
||||
|
||||
/** See {@link #setDoSlowChecks}. */
|
||||
public boolean doSlowChecks() {
|
||||
return doSlowChecks;
|
||||
/** See {@link #setLevel}. */
|
||||
public int getLevel() {
|
||||
return level;
|
||||
}
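
A usage sketch for the new level-based API above. It assumes the CheckIndex(Directory) constructor and the public Level constants referenced later in this change; treat the exact names as illustrative rather than authoritative.

  try (CheckIndex checker = new CheckIndex(dir)) {
    checker.setInfoStream(System.out);
    // opt in to the expensive checks that used to be enabled via setDoSlowChecks(true)
    checker.setLevel(CheckIndex.Level.MIN_LEVEL_FOR_SLOW_CHECKS);
    CheckIndex.Status status = checker.checkIndex();
    if (status.clean == false) {
      // a repair tool could act on the broken segments reported in status here
    }
  }
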
|
||||
|
||||
private boolean failFast;
|
||||
|
@ -473,21 +474,6 @@ public final class CheckIndex implements Closeable {
|
|||
|
||||
private boolean verbose;
|
||||
|
||||
/** See {@link #getChecksumsOnly}. */
|
||||
public boolean getChecksumsOnly() {
|
||||
return checksumsOnly;
|
||||
}
|
||||
|
||||
/**
|
||||
* If true, only validate physical integrity for all files. Note that the returned nested status
|
||||
* objects (e.g. storedFieldStatus) will be null.
|
||||
*/
|
||||
public void setChecksumsOnly(boolean v) {
|
||||
checksumsOnly = v;
|
||||
}
|
||||
|
||||
private boolean checksumsOnly;
|
||||
|
||||
/** Set threadCount used for parallelizing index integrity checking. */
|
||||
public void setThreadCount(int tc) {
|
||||
if (tc <= 0) {
|
||||
|
@ -586,7 +572,6 @@ public final class CheckIndex implements Closeable {
|
|||
ensureOpen();
|
||||
long startNS = System.nanoTime();
|
||||
|
||||
SegmentInfos sis = null;
|
||||
Status result = new Status();
|
||||
result.dir = dir;
|
||||
String[] files = dir.listAll();
|
||||
|
@ -595,43 +580,115 @@ public final class CheckIndex implements Closeable {
|
|||
throw new IndexNotFoundException(
|
||||
"no segments* file found in " + dir + ": files: " + Arrays.toString(files));
|
||||
}
|
||||
try {
|
||||
// Do not use SegmentInfos.read(Directory) since the spooky
|
||||
// retrying it does is not necessary here (we hold the write lock):
|
||||
sis =
|
||||
SegmentInfos.readCommit(
|
||||
dir, lastSegmentsFile, 0 /* always open old indices if codecs are around */);
|
||||
} catch (Throwable t) {
|
||||
if (failFast) {
|
||||
throw IOUtils.rethrowAlways(t);
|
||||
|
||||
// https://github.com/apache/lucene/issues/7820: also attempt to open any older commit
|
||||
// points (segments_N), which will catch certain corruption like missing _N.si files
|
||||
// for segments not also referenced by the newest commit point (which was already
|
||||
// loaded, successfully, above). Note that we do not do a deeper check of segments
|
||||
// referenced ONLY by these older commit points, because such corruption would not
|
||||
// prevent a new IndexWriter from opening on the newest commit point. but it is still
|
||||
// corruption, e.g. a reader opened on those old commit points can hit corruption
|
||||
// exceptions which we (still) will not detect here. progress not perfection!
|
||||
|
||||
SegmentInfos lastCommit = null;
|
||||
|
||||
List<String> allSegmentsFiles = new ArrayList<>();
|
||||
for (String fileName : files) {
|
||||
if (fileName.startsWith(IndexFileNames.SEGMENTS)
|
||||
&& fileName.equals(SegmentInfos.OLD_SEGMENTS_GEN) == false) {
|
||||
allSegmentsFiles.add(fileName);
|
||||
}
|
||||
}
|
||||
|
||||
// Sort descending by generation so that we always attempt to read the last commit first. This
|
||||
// way if an index has a broken last commit AND a broken old commit, we report the last commit
|
||||
// error first:
|
||||
allSegmentsFiles.sort(
|
||||
new Comparator<String>() {
|
||||
@Override
|
||||
public int compare(String a, String b) {
|
||||
long genA = SegmentInfos.generationFromSegmentsFileName(a);
|
||||
long genB = SegmentInfos.generationFromSegmentsFileName(b);
|
||||
|
||||
// reversed natural sort (largest generation first):
|
||||
return -Long.compare(genA, genB);
|
||||
}
|
||||
});
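
The anonymous Comparator above orders segments files newest generation first; the same ordering could be written with a method reference, shown here purely as a reading aid.

  allSegmentsFiles.sort(
      Comparator.comparingLong(SegmentInfos::generationFromSegmentsFileName).reversed());
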
|
||||
|
||||
for (String fileName : allSegmentsFiles) {
|
||||
|
||||
boolean isLastCommit = fileName.equals(lastSegmentsFile);
|
||||
|
||||
SegmentInfos infos;
|
||||
|
||||
try {
|
||||
// Do not use SegmentInfos.read(Directory) since the spooky
|
||||
// retrying it does is not necessary here (we hold the write lock):
|
||||
// always open old indices if codecs are around
|
||||
infos = SegmentInfos.readCommit(dir, fileName, 0);
|
||||
} catch (Throwable t) {
|
||||
if (failFast) {
|
||||
throw IOUtils.rethrowAlways(t);
|
||||
}
|
||||
|
||||
String message;
|
||||
|
||||
if (isLastCommit) {
|
||||
message =
|
||||
"ERROR: could not read latest commit point from segments file \""
|
||||
+ fileName
|
||||
+ "\" in directory";
|
||||
} else {
|
||||
message =
|
||||
"ERROR: could not read old (not latest) commit point segments file \""
|
||||
+ fileName
|
||||
+ "\" in directory";
|
||||
}
|
||||
msg(infoStream, message);
|
||||
result.missingSegments = true;
|
||||
if (infoStream != null) {
|
||||
t.printStackTrace(infoStream);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
if (isLastCommit) {
|
||||
// record the latest commit point: we will deeply check all segments referenced by it
|
||||
lastCommit = infos;
|
||||
}
|
||||
}
|
||||
|
||||
    // we know there is a lastSegmentsFileName, so we must've attempted to load it in the above
    // for loop. if it failed to load, we threw the exception (failFast == true) or we returned
    // the failure (failFast == false). so if we get here, we should always have a valid lastCommit:
    assert lastCommit != null;
|
||||
|
||||
if (lastCommit == null) {
|
||||
msg(infoStream, "ERROR: could not read any segments file in directory");
|
||||
result.missingSegments = true;
|
||||
if (infoStream != null) t.printStackTrace(infoStream);
|
||||
return result;
|
||||
}
|
||||
|
||||
if (infoStream != null) {
|
||||
int maxDoc = 0;
|
||||
int delCount = 0;
|
||||
for (SegmentCommitInfo info : sis) {
|
||||
for (SegmentCommitInfo info : lastCommit) {
|
||||
maxDoc += info.info.maxDoc();
|
||||
delCount += info.getDelCount();
|
||||
}
|
||||
infoStream.println(
|
||||
String.format(
|
||||
Locale.ROOT,
|
||||
"%.2f%% total deletions; %d documents; %d deletions",
|
||||
100. * delCount / maxDoc,
|
||||
maxDoc,
|
||||
delCount));
|
||||
infoStream.printf(
|
||||
Locale.ROOT,
|
||||
"%.2f%% total deletions; %d documents; %d deletions%n",
|
||||
100. * delCount / maxDoc,
|
||||
maxDoc,
|
||||
delCount);
|
||||
}
|
||||
|
||||
// find the oldest and newest segment versions
|
||||
Version oldest = null;
|
||||
Version newest = null;
|
||||
String oldSegs = null;
|
||||
for (SegmentCommitInfo si : sis) {
|
||||
for (SegmentCommitInfo si : lastCommit) {
|
||||
Version version = si.info.getVersion();
|
||||
if (version == null) {
|
||||
// pre-3.1 segment
|
||||
|
@ -646,14 +703,14 @@ public final class CheckIndex implements Closeable {
|
|||
}
|
||||
}
|
||||
|
||||
final int numSegments = sis.size();
|
||||
final String segmentsFileName = sis.getSegmentsFileName();
|
||||
final int numSegments = lastCommit.size();
|
||||
final String segmentsFileName = lastCommit.getSegmentsFileName();
|
||||
result.segmentsFileName = segmentsFileName;
|
||||
result.numSegments = numSegments;
|
||||
result.userData = sis.getUserData();
|
||||
result.userData = lastCommit.getUserData();
|
||||
String userDataString;
|
||||
if (sis.getUserData().size() > 0) {
|
||||
userDataString = " userData=" + sis.getUserData();
|
||||
if (lastCommit.getUserData().size() > 0) {
|
||||
userDataString = " userData=" + lastCommit.getUserData();
|
||||
} else {
|
||||
userDataString = "";
|
||||
}
|
||||
|
@ -681,7 +738,7 @@ public final class CheckIndex implements Closeable {
|
|||
+ " "
|
||||
+ versionString
|
||||
+ " id="
|
||||
+ StringHelper.idToString(sis.getId())
|
||||
+ StringHelper.idToString(lastCommit.getId())
|
||||
+ userDataString);
|
||||
|
||||
if (onlySegments != null) {
|
||||
|
@ -696,14 +753,14 @@ public final class CheckIndex implements Closeable {
|
|||
msg(infoStream, ":");
|
||||
}
|
||||
|
||||
result.newSegments = sis.clone();
|
||||
result.newSegments = lastCommit.clone();
|
||||
result.newSegments.clear();
|
||||
result.maxSegmentName = -1;
|
||||
|
||||
// checks segments sequentially
|
||||
if (executorService == null) {
|
||||
for (int i = 0; i < numSegments; i++) {
|
||||
final SegmentCommitInfo info = sis.info(i);
|
||||
final SegmentCommitInfo info = lastCommit.info(i);
|
||||
updateMaxSegmentName(result, info);
|
||||
if (onlySegments != null && !onlySegments.contains(info.info.name)) {
|
||||
continue;
|
||||
|
@ -718,7 +775,7 @@ public final class CheckIndex implements Closeable {
|
|||
+ info.info.name
|
||||
+ " maxDoc="
|
||||
+ info.info.maxDoc());
|
||||
Status.SegmentInfoStatus segmentInfoStatus = testSegment(sis, info, infoStream);
|
||||
Status.SegmentInfoStatus segmentInfoStatus = testSegment(lastCommit, info, infoStream);
|
||||
|
||||
processSegmentInfoStatusResult(result, info, segmentInfoStatus);
|
||||
}
|
||||
|
@ -729,14 +786,13 @@ public final class CheckIndex implements Closeable {
|
|||
|
||||
// checks segments concurrently
|
||||
List<SegmentCommitInfo> segmentCommitInfos = new ArrayList<>();
|
||||
for (SegmentCommitInfo sci : sis) {
|
||||
for (SegmentCommitInfo sci : lastCommit) {
|
||||
segmentCommitInfos.add(sci);
|
||||
}
|
||||
|
||||
// sort segmentCommitInfos by segment size, as smaller segment tends to finish faster, and
|
||||
// hence its output can be printed out faster
|
||||
Collections.sort(
|
||||
segmentCommitInfos,
|
||||
segmentCommitInfos.sort(
|
||||
(info1, info2) -> {
|
||||
try {
|
||||
return Long.compare(info1.sizeInBytes(), info2.sizeInBytes());
|
||||
|
@ -757,7 +813,7 @@ public final class CheckIndex implements Closeable {
|
|||
continue;
|
||||
}
|
||||
|
||||
SegmentInfos finalSis = sis;
|
||||
SegmentInfos finalSis = lastCommit;
|
||||
|
||||
ByteArrayOutputStream output = new ByteArrayOutputStream();
|
||||
PrintStream stream = new PrintStream(output, true, IOUtils.UTF_8);
|
||||
|
@ -813,7 +869,7 @@ public final class CheckIndex implements Closeable {
|
|||
|
||||
if (0 == result.numBadSegments) {
|
||||
result.clean = true;
|
||||
} else
|
||||
} else {
|
||||
msg(
|
||||
infoStream,
|
||||
"WARNING: "
|
||||
|
@ -821,14 +877,16 @@ public final class CheckIndex implements Closeable {
|
|||
+ " broken segments (containing "
|
||||
+ result.totLoseDocCount
|
||||
+ " documents) detected");
|
||||
}
|
||||
|
||||
if (!(result.validCounter = (result.maxSegmentName < sis.counter))) {
|
||||
result.validCounter = result.maxSegmentName < lastCommit.counter;
|
||||
if (result.validCounter == false) {
|
||||
result.clean = false;
|
||||
result.newSegments.counter = result.maxSegmentName + 1;
|
||||
msg(
|
||||
infoStream,
|
||||
"ERROR: Next segment name counter "
|
||||
+ sis.counter
|
||||
+ lastCommit.counter
|
||||
+ " is not greater than max segment name "
|
||||
+ result.maxSegmentName);
|
||||
}
|
||||
|
@ -921,7 +979,7 @@ public final class CheckIndex implements Closeable {
|
|||
msg(infoStream, " diagnostics = " + diagnostics);
|
||||
}
|
||||
|
||||
if (!info.hasDeletions()) {
|
||||
if (info.hasDeletions() == false) {
|
||||
msg(infoStream, " no deletions");
|
||||
segInfoStat.hasDeletions = false;
|
||||
} else {
|
||||
|
@ -960,26 +1018,26 @@ public final class CheckIndex implements Closeable {
|
|||
toLoseDocCount = numDocs;
|
||||
|
||||
if (reader.hasDeletions()) {
|
||||
if (reader.numDocs() != info.info.maxDoc() - info.getDelCount()) {
|
||||
if (numDocs != info.info.maxDoc() - info.getDelCount()) {
|
||||
throw new CheckIndexException(
|
||||
"delete count mismatch: info="
|
||||
+ (info.info.maxDoc() - info.getDelCount())
|
||||
+ " vs reader="
|
||||
+ reader.numDocs());
|
||||
+ numDocs);
|
||||
}
|
||||
if ((info.info.maxDoc() - reader.numDocs()) > reader.maxDoc()) {
|
||||
if ((info.info.maxDoc() - numDocs) > reader.maxDoc()) {
|
||||
throw new CheckIndexException(
|
||||
"too many deleted docs: maxDoc()="
|
||||
+ reader.maxDoc()
|
||||
+ " vs del count="
|
||||
+ (info.info.maxDoc() - reader.numDocs()));
|
||||
+ (info.info.maxDoc() - numDocs));
|
||||
}
|
||||
if (info.info.maxDoc() - reader.numDocs() != info.getDelCount()) {
|
||||
if (info.info.maxDoc() - numDocs != info.getDelCount()) {
|
||||
throw new CheckIndexException(
|
||||
"delete count mismatch: info="
|
||||
+ info.getDelCount()
|
||||
+ " vs reader="
|
||||
+ (info.info.maxDoc() - reader.numDocs()));
|
||||
+ (info.info.maxDoc() - numDocs));
|
||||
}
|
||||
} else {
|
||||
if (info.getDelCount() != 0) {
|
||||
|
@ -987,11 +1045,10 @@ public final class CheckIndex implements Closeable {
|
|||
"delete count mismatch: info="
|
||||
+ info.getDelCount()
|
||||
+ " vs reader="
|
||||
+ (info.info.maxDoc() - reader.numDocs()));
|
||||
+ (info.info.maxDoc() - numDocs));
|
||||
}
|
||||
}
|
||||
|
||||
if (checksumsOnly == false) {
|
||||
if (level >= Level.MIN_LEVEL_FOR_INTEGRITY_CHECKS) {
|
||||
// Test Livedocs
|
||||
segInfoStat.liveDocStatus = testLiveDocs(reader, infoStream, failFast);
|
||||
|
||||
|
@ -1002,15 +1059,14 @@ public final class CheckIndex implements Closeable {
|
|||
segInfoStat.fieldNormStatus = testFieldNorms(reader, infoStream, failFast);
|
||||
|
||||
// Test the Term Index
|
||||
segInfoStat.termIndexStatus =
|
||||
testPostings(reader, infoStream, verbose, doSlowChecks, failFast);
|
||||
segInfoStat.termIndexStatus = testPostings(reader, infoStream, verbose, level, failFast);
|
||||
|
||||
// Test Stored Fields
|
||||
segInfoStat.storedFieldStatus = testStoredFields(reader, infoStream, failFast);
|
||||
|
||||
// Test Term Vectors
|
||||
segInfoStat.termVectorStatus =
|
||||
testTermVectors(reader, infoStream, verbose, doSlowChecks, failFast);
|
||||
testTermVectors(reader, infoStream, verbose, level, failFast);
|
||||
|
||||
// Test Docvalues
|
||||
segInfoStat.docValuesStatus = testDocValues(reader, infoStream, failFast);
|
||||
|
@ -1213,7 +1269,7 @@ public final class CheckIndex implements Closeable {
|
|||
if (liveDocs != null) {
|
||||
// it's ok for it to be non-null here, as long as none are set right?
|
||||
for (int j = 0; j < liveDocs.length(); j++) {
|
||||
if (!liveDocs.get(j)) {
|
||||
if (liveDocs.get(j) == false) {
|
||||
throw new CheckIndexException(
|
||||
"liveDocs mismatch: info says no deletions but doc " + j + " is deleted.");
|
||||
}
|
||||
|
@ -1341,7 +1397,7 @@ public final class CheckIndex implements Closeable {
|
|||
boolean isVectors,
|
||||
PrintStream infoStream,
|
||||
boolean verbose,
|
||||
boolean doSlowChecks)
|
||||
int level)
|
||||
throws IOException {
|
||||
// TODO: we should probably return our own stats thing...?!
|
||||
long startNS;
|
||||
|
@ -1450,7 +1506,7 @@ public final class CheckIndex implements Closeable {
|
|||
+ hasFreqs);
|
||||
}
|
||||
|
||||
if (!isVectors) {
|
||||
if (isVectors == false) {
|
||||
final boolean expectedHasPositions =
|
||||
fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||
if (hasPositions != expectedHasPositions) {
|
||||
|
@ -1810,7 +1866,7 @@ public final class CheckIndex implements Closeable {
|
|||
// free-for-all before?
|
||||
// but for offsets in the postings lists these checks are fine: they were always
|
||||
// enforced by IndexWriter
|
||||
if (!isVectors) {
|
||||
if (isVectors == false) {
|
||||
if (startOffset < 0) {
|
||||
throw new CheckIndexException(
|
||||
"term "
|
||||
|
@ -1924,14 +1980,13 @@ public final class CheckIndex implements Closeable {
|
|||
}
|
||||
|
||||
// Checking score blocks is heavy, we only do it on long postings lists, on every 1024th
|
||||
// term
|
||||
// or if slow checks are enabled.
|
||||
if (doSlowChecks
|
||||
// term or if slow checks are enabled.
|
||||
if (level >= Level.MIN_LEVEL_FOR_SLOW_CHECKS
|
||||
|| docFreq > 1024
|
||||
|| (status.termCount + status.delTermCount) % 1024 == 0) {
|
||||
// First check max scores and block uptos
|
||||
// But only if slok checks are enabled since we visit all docs
|
||||
if (doSlowChecks) {
|
||||
// But only if slow checks are enabled since we visit all docs
|
||||
if (level >= Level.MIN_LEVEL_FOR_SLOW_CHECKS) {
|
||||
int max = -1;
|
||||
int maxFreq = 0;
|
||||
ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS);
|
||||
|
@ -1998,9 +2053,9 @@ public final class CheckIndex implements Closeable {
|
|||
Impacts impacts = impactsEnum.getImpacts();
|
||||
checkImpacts(impacts, doc);
|
||||
maxFreq = Integer.MAX_VALUE;
|
||||
for (int level = 0; level < impacts.numLevels(); ++level) {
|
||||
if (impacts.getDocIdUpTo(level) >= max) {
|
||||
List<Impact> perLevelImpacts = impacts.getImpacts(level);
|
||||
for (int impactsLevel = 0; impactsLevel < impacts.numLevels(); ++impactsLevel) {
|
||||
if (impacts.getDocIdUpTo(impactsLevel) >= max) {
|
||||
List<Impact> perLevelImpacts = impacts.getImpacts(impactsLevel);
|
||||
maxFreq = perLevelImpacts.get(perLevelImpacts.size() - 1).freq;
|
||||
break;
|
||||
}
|
||||
|
@ -2040,9 +2095,9 @@ public final class CheckIndex implements Closeable {
|
|||
Impacts impacts = impactsEnum.getImpacts();
|
||||
checkImpacts(impacts, doc);
|
||||
maxFreq = Integer.MAX_VALUE;
|
||||
for (int level = 0; level < impacts.numLevels(); ++level) {
|
||||
if (impacts.getDocIdUpTo(level) >= max) {
|
||||
List<Impact> perLevelImpacts = impacts.getImpacts(level);
|
||||
for (int impactsLevel = 0; impactsLevel < impacts.numLevels(); ++impactsLevel) {
|
||||
if (impacts.getDocIdUpTo(impactsLevel) >= max) {
|
||||
List<Impact> perLevelImpacts = impacts.getImpacts(impactsLevel);
|
||||
maxFreq = perLevelImpacts.get(perLevelImpacts.size() - 1).freq;
|
||||
break;
|
||||
}
|
||||
|
@ -2151,7 +2206,7 @@ public final class CheckIndex implements Closeable {
|
|||
+ " doesn't have terms according to postings but has a norm value that is not zero: "
|
||||
+ Long.toUnsignedString(norm));
|
||||
}
|
||||
} else if (norm == 0 && visitedDocs.get(doc)) {
|
||||
} else if (visitedDocs.get(doc)) {
|
||||
throw new CheckIndexException(
|
||||
"Document "
|
||||
+ doc
|
||||
|
@ -2307,7 +2362,7 @@ public final class CheckIndex implements Closeable {
|
|||
static void checkImpacts(Impacts impacts, int lastTarget) {
|
||||
final int numLevels = impacts.numLevels();
|
||||
if (numLevels < 1) {
|
||||
throw new CheckIndexException("The number of levels must be >= 1, got " + numLevels);
|
||||
throw new CheckIndexException("The number of impact levels must be >= 1, got " + numLevels);
|
||||
}
|
||||
|
||||
int docIdUpTo0 = impacts.getDocIdUpTo(0);
|
||||
|
@ -2319,17 +2374,17 @@ public final class CheckIndex implements Closeable {
|
|||
+ lastTarget);
|
||||
}
|
||||
|
||||
for (int level = 1; level < numLevels; ++level) {
|
||||
int docIdUpTo = impacts.getDocIdUpTo(level);
|
||||
int previousDocIdUpTo = impacts.getDocIdUpTo(level - 1);
|
||||
for (int impactsLevel = 1; impactsLevel < numLevels; ++impactsLevel) {
|
||||
int docIdUpTo = impacts.getDocIdUpTo(impactsLevel);
|
||||
int previousDocIdUpTo = impacts.getDocIdUpTo(impactsLevel - 1);
|
||||
if (docIdUpTo < previousDocIdUpTo) {
|
||||
throw new CheckIndexException(
|
||||
"Decreasing return for getDocIdUpTo: level "
|
||||
+ (level - 1)
|
||||
+ (impactsLevel - 1)
|
||||
+ " returned "
|
||||
+ previousDocIdUpTo
|
||||
+ " but level "
|
||||
+ level
|
||||
+ impactsLevel
|
||||
+ " returned "
|
||||
+ docIdUpTo
|
||||
+ " for target "
|
||||
|
@ -2337,10 +2392,10 @@ public final class CheckIndex implements Closeable {
|
|||
}
|
||||
}
|
||||
|
||||
for (int level = 0; level < numLevels; ++level) {
|
||||
List<Impact> perLevelImpacts = impacts.getImpacts(level);
|
||||
for (int impactsLevel = 0; impactsLevel < numLevels; ++impactsLevel) {
|
||||
List<Impact> perLevelImpacts = impacts.getImpacts(impactsLevel);
|
||||
if (perLevelImpacts.isEmpty()) {
|
||||
throw new CheckIndexException("Got empty list of impacts on level " + level);
|
||||
throw new CheckIndexException("Got empty list of impacts on level " + impactsLevel);
|
||||
}
|
||||
Impact first = perLevelImpacts.get(0);
|
||||
if (first.freq < 1) {
|
||||
|
@ -2358,9 +2413,9 @@ public final class CheckIndex implements Closeable {
|
|||
"Impacts are not ordered or contain dups, got " + previous + " then " + impact);
|
||||
}
|
||||
}
|
||||
if (level > 0) {
|
||||
// Make sure that impacts at level N trigger better scores than an level N-1
|
||||
Iterator<Impact> previousIt = impacts.getImpacts(level - 1).iterator();
|
||||
if (impactsLevel > 0) {
|
||||
// Make sure that impacts at level N trigger better scores than an impactsLevel N-1
|
||||
Iterator<Impact> previousIt = impacts.getImpacts(impactsLevel - 1).iterator();
|
||||
previous = previousIt.next();
|
||||
Iterator<Impact> it = perLevelImpacts.iterator();
|
||||
Impact impact = it.next();
|
||||
|
@ -2376,9 +2431,9 @@ public final class CheckIndex implements Closeable {
|
|||
"Found impact "
|
||||
+ previous
|
||||
+ " on level "
|
||||
+ (level - 1)
|
||||
+ (impactsLevel - 1)
|
||||
+ " but no impact on level "
|
||||
+ level
|
||||
+ impactsLevel
|
||||
+ " triggers a better score: "
|
||||
+ perLevelImpacts);
|
||||
}
|
||||
|
@ -2395,7 +2450,7 @@ public final class CheckIndex implements Closeable {
|
|||
*/
|
||||
public static Status.TermIndexStatus testPostings(CodecReader reader, PrintStream infoStream)
|
||||
throws IOException {
|
||||
return testPostings(reader, infoStream, false, true, false);
|
||||
return testPostings(reader, infoStream, false, Level.MIN_LEVEL_FOR_SLOW_CHECKS, false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -2404,15 +2459,11 @@ public final class CheckIndex implements Closeable {
|
|||
* @lucene.experimental
|
||||
*/
|
||||
public static Status.TermIndexStatus testPostings(
|
||||
CodecReader reader,
|
||||
PrintStream infoStream,
|
||||
boolean verbose,
|
||||
boolean doSlowChecks,
|
||||
boolean failFast)
|
||||
CodecReader reader, PrintStream infoStream, boolean verbose, int level, boolean failFast)
|
||||
throws IOException {
|
||||
|
||||
// TODO: we should go and verify term vectors match, if
|
||||
// doSlowChecks is on...
|
||||
// TODO: we should go and verify term vectors match, if the Level is high enough to
|
||||
// include slow checks
|
||||
Status.TermIndexStatus status;
|
||||
final int maxDoc = reader.maxDoc();
|
||||
|
||||
|
@ -2443,7 +2494,7 @@ public final class CheckIndex implements Closeable {
|
|||
false,
|
||||
infoStream,
|
||||
verbose,
|
||||
doSlowChecks);
|
||||
level);
|
||||
} catch (Throwable e) {
|
||||
if (failFast) {
|
||||
throw IOUtils.rethrowAlways(e);
|
||||
|
@ -3132,7 +3183,7 @@ public final class CheckIndex implements Closeable {
|
|||
for (FieldInfo fieldInfo : reader.getFieldInfos()) {
|
||||
if (fieldInfo.getDocValuesType() != DocValuesType.NONE) {
|
||||
status.totalValueFields++;
|
||||
checkDocValues(fieldInfo, dvReader, reader.maxDoc(), infoStream, status);
|
||||
checkDocValues(fieldInfo, dvReader, status);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -3162,11 +3213,11 @@ public final class CheckIndex implements Closeable {
|
|||
}
|
||||
|
||||
@FunctionalInterface
|
||||
private static interface DocValuesIteratorSupplier {
|
||||
private interface DocValuesIteratorSupplier {
|
||||
DocValuesIterator get(FieldInfo fi) throws IOException;
|
||||
}
|
||||
|
||||
private static void checkDVIterator(FieldInfo fi, int maxDoc, DocValuesIteratorSupplier producer)
|
||||
private static void checkDVIterator(FieldInfo fi, DocValuesIteratorSupplier producer)
|
||||
throws IOException {
|
||||
String field = fi.name;
|
||||
|
||||
|
@ -3284,7 +3335,7 @@ public final class CheckIndex implements Closeable {
|
|||
}
|
||||
|
||||
private static void checkBinaryDocValues(
|
||||
String fieldName, int maxDoc, BinaryDocValues bdv, BinaryDocValues bdv2) throws IOException {
|
||||
String fieldName, BinaryDocValues bdv, BinaryDocValues bdv2) throws IOException {
|
||||
if (bdv.docID() != -1) {
|
||||
throw new CheckIndexException(
|
||||
"binary dv iterator for field: "
|
||||
|
@ -3309,7 +3360,7 @@ public final class CheckIndex implements Closeable {
|
|||
}
|
||||
|
||||
private static void checkSortedDocValues(
|
||||
String fieldName, int maxDoc, SortedDocValues dv, SortedDocValues dv2) throws IOException {
|
||||
String fieldName, SortedDocValues dv, SortedDocValues dv2) throws IOException {
|
||||
if (dv.docID() != -1) {
|
||||
throw new CheckIndexException(
|
||||
"sorted dv iterator for field: "
|
||||
|
@ -3373,8 +3424,7 @@ public final class CheckIndex implements Closeable {
|
|||
}
|
||||
|
||||
private static void checkSortedSetDocValues(
|
||||
String fieldName, int maxDoc, SortedSetDocValues dv, SortedSetDocValues dv2)
|
||||
throws IOException {
|
||||
String fieldName, SortedSetDocValues dv, SortedSetDocValues dv2) throws IOException {
|
||||
final long maxOrd = dv.getValueCount() - 1;
|
||||
LongBitSet seenOrds = new LongBitSet(dv.getValueCount());
|
||||
long maxOrd2 = -1;
|
||||
|
@ -3470,7 +3520,7 @@ public final class CheckIndex implements Closeable {
|
|||
}
|
||||
|
||||
private static void checkSortedNumericDocValues(
|
||||
String fieldName, int maxDoc, SortedNumericDocValues ndv, SortedNumericDocValues ndv2)
|
||||
String fieldName, SortedNumericDocValues ndv, SortedNumericDocValues ndv2)
|
||||
throws IOException {
|
||||
if (ndv.docID() != -1) {
|
||||
throw new CheckIndexException(
|
||||
|
@ -3539,38 +3589,32 @@ public final class CheckIndex implements Closeable {
|
|||
}
|
||||
|
||||
private static void checkDocValues(
|
||||
FieldInfo fi,
|
||||
DocValuesProducer dvReader,
|
||||
int maxDoc,
|
||||
PrintStream infoStream,
|
||||
DocValuesStatus status)
|
||||
throws Exception {
|
||||
FieldInfo fi, DocValuesProducer dvReader, DocValuesStatus status) throws Exception {
|
||||
switch (fi.getDocValuesType()) {
|
||||
case SORTED:
|
||||
status.totalSortedFields++;
|
||||
checkDVIterator(fi, maxDoc, dvReader::getSorted);
|
||||
checkSortedDocValues(fi.name, maxDoc, dvReader.getSorted(fi), dvReader.getSorted(fi));
|
||||
checkDVIterator(fi, dvReader::getSorted);
|
||||
checkSortedDocValues(fi.name, dvReader.getSorted(fi), dvReader.getSorted(fi));
|
||||
break;
|
||||
case SORTED_NUMERIC:
|
||||
status.totalSortedNumericFields++;
|
||||
checkDVIterator(fi, maxDoc, dvReader::getSortedNumeric);
|
||||
checkDVIterator(fi, dvReader::getSortedNumeric);
|
||||
checkSortedNumericDocValues(
|
||||
fi.name, maxDoc, dvReader.getSortedNumeric(fi), dvReader.getSortedNumeric(fi));
|
||||
fi.name, dvReader.getSortedNumeric(fi), dvReader.getSortedNumeric(fi));
|
||||
break;
|
||||
case SORTED_SET:
|
||||
status.totalSortedSetFields++;
|
||||
checkDVIterator(fi, maxDoc, dvReader::getSortedSet);
|
||||
checkSortedSetDocValues(
|
||||
fi.name, maxDoc, dvReader.getSortedSet(fi), dvReader.getSortedSet(fi));
|
||||
checkDVIterator(fi, dvReader::getSortedSet);
|
||||
checkSortedSetDocValues(fi.name, dvReader.getSortedSet(fi), dvReader.getSortedSet(fi));
|
||||
break;
|
||||
case BINARY:
|
||||
status.totalBinaryFields++;
|
||||
checkDVIterator(fi, maxDoc, dvReader::getBinary);
|
||||
checkBinaryDocValues(fi.name, maxDoc, dvReader.getBinary(fi), dvReader.getBinary(fi));
|
||||
checkDVIterator(fi, dvReader::getBinary);
|
||||
checkBinaryDocValues(fi.name, dvReader.getBinary(fi), dvReader.getBinary(fi));
|
||||
break;
|
||||
case NUMERIC:
|
||||
status.totalNumericFields++;
|
||||
checkDVIterator(fi, maxDoc, dvReader::getNumeric);
|
||||
checkDVIterator(fi, dvReader::getNumeric);
|
||||
checkNumericDocValues(fi.name, dvReader.getNumeric(fi), dvReader.getNumeric(fi));
|
||||
break;
|
||||
case NONE:
|
||||
|
@ -3586,7 +3630,7 @@ public final class CheckIndex implements Closeable {
|
|||
*/
|
||||
public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream)
|
||||
throws IOException {
|
||||
return testTermVectors(reader, infoStream, false, false, false);
|
||||
return testTermVectors(reader, infoStream, false, Level.MIN_LEVEL_FOR_INTEGRITY_CHECKS, false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -3595,11 +3639,7 @@ public final class CheckIndex implements Closeable {
|
|||
* @lucene.experimental
|
||||
*/
|
||||
public static Status.TermVectorStatus testTermVectors(
|
||||
CodecReader reader,
|
||||
PrintStream infoStream,
|
||||
boolean verbose,
|
||||
boolean doSlowChecks,
|
||||
boolean failFast)
|
||||
CodecReader reader, PrintStream infoStream, boolean verbose, int level, boolean failFast)
|
||||
throws IOException {
|
||||
long startNS = System.nanoTime();
|
||||
final Status.TermVectorStatus status = new Status.TermVectorStatus();
|
||||
|
@ -3612,14 +3652,14 @@ public final class CheckIndex implements Closeable {
|
|||
|
||||
PostingsEnum postings = null;
|
||||
|
||||
// Only used if doSlowChecks is true:
|
||||
// Only used if the Level is high enough to include slow checks:
|
||||
PostingsEnum postingsDocs = null;
|
||||
|
||||
final Bits liveDocs = reader.getLiveDocs();
|
||||
|
||||
FieldsProducer postingsFields;
|
||||
// TODO: testTermsIndex
|
||||
if (doSlowChecks) {
|
||||
if (level >= Level.MIN_LEVEL_FOR_SLOW_CHECKS) {
|
||||
postingsFields = reader.getPostingsReader();
|
||||
if (postingsFields != null) {
|
||||
postingsFields = postingsFields.getMergeInstance();
|
||||
|
@ -3643,8 +3683,7 @@ public final class CheckIndex implements Closeable {
|
|||
|
||||
if (tfv != null) {
|
||||
// First run with no deletions:
|
||||
checkFields(
|
||||
tfv, null, 1, fieldInfos, null, false, true, infoStream, verbose, doSlowChecks);
|
||||
checkFields(tfv, null, 1, fieldInfos, null, false, true, infoStream, verbose, level);
|
||||
|
||||
// Only agg stats if the doc is live:
|
||||
final boolean doStats = liveDocs == null || liveDocs.get(j);
|
||||
|
@ -3660,7 +3699,7 @@ public final class CheckIndex implements Closeable {
|
|||
|
||||
// Make sure FieldInfo thinks this field is vector'd:
|
||||
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
|
||||
if (!fieldInfo.hasVectors()) {
|
||||
if (fieldInfo.hasVectors() == false) {
|
||||
throw new CheckIndexException(
|
||||
"docID="
|
||||
+ j
|
||||
|
@ -3669,7 +3708,7 @@ public final class CheckIndex implements Closeable {
|
|||
+ " but FieldInfo has storeTermVector=false");
|
||||
}
|
||||
|
||||
if (doSlowChecks) {
|
||||
if (level >= Level.MIN_LEVEL_FOR_SLOW_CHECKS) {
|
||||
Terms terms = tfv.terms(field);
|
||||
TermsEnum termsEnum = terms.iterator();
|
||||
final boolean postingsHasFreq =
|
||||
|
@ -3696,7 +3735,7 @@ public final class CheckIndex implements Closeable {
|
|||
postings = termsEnum.postings(postings, PostingsEnum.ALL);
|
||||
assert postings != null;
|
||||
|
||||
if (!postingsTermsEnum.seekExact(term)) {
|
||||
if (postingsTermsEnum.seekExact(term) == false) {
|
||||
throw new CheckIndexException(
|
||||
"vector term="
|
||||
+ term
|
||||
|
@ -3852,7 +3891,7 @@ public final class CheckIndex implements Closeable {
|
|||
+ " but postings does not.");
|
||||
}
|
||||
BytesRef postingsPayload = postingsDocs.getPayload();
|
||||
if (!payload.equals(postingsPayload)) {
|
||||
if (payload.equals(postingsPayload) == false) {
|
||||
throw new CheckIndexException(
|
||||
"vector term="
|
||||
+ term
|
||||
|
@ -3972,9 +4011,8 @@ public final class CheckIndex implements Closeable {
|
|||
/** Run-time configuration options for CheckIndex commands. */
|
||||
public static class Options {
|
||||
boolean doExorcise = false;
|
||||
boolean doSlowChecks = false;
|
||||
boolean verbose = false;
|
||||
boolean doChecksumsOnly = false;
|
||||
int level = Level.DEFAULT_VALUE;
|
||||
int threadCount;
|
||||
List<String> onlySegments = new ArrayList<>();
|
||||
String indexPath = null;
|
||||
|
@ -4011,9 +4049,10 @@ public final class CheckIndex implements Closeable {
|
|||
return 1;
|
||||
}
|
||||
|
||||
if (!assertsOn())
|
||||
if (assertsOn() == false) {
|
||||
System.out.println(
|
||||
"\nNOTE: testing will be more thorough if you run java with '-ea:org.apache.lucene...', so assertions are enabled");
|
||||
}
|
||||
|
||||
System.out.println("\nOpening index @ " + opts.indexPath + "\n");
|
||||
Directory directory = null;
|
||||
|
@@ -4037,6 +4076,42 @@ public final class CheckIndex implements Closeable {
 }
 }

+/** Class with static variables with information about CheckIndex's -level parameter. */
+public static class Level {
+private Level() {}
+
+/** Minimum valid level. */
+public static final int MIN_VALUE = 1;
+
+/** Maximum valid level. */
+public static final int MAX_VALUE = 3;
+
+/** The default level if none is specified. */
+public static final int DEFAULT_VALUE = MIN_VALUE;
+
+/** Minimum level required to run checksum checks. */
+public static final int MIN_LEVEL_FOR_CHECKSUM_CHECKS = 1;
+
+/** Minimum level required to run integrity checks. */
+public static final int MIN_LEVEL_FOR_INTEGRITY_CHECKS = 2;
+
+/** Minimum level required to run slow checks. */
+public static final int MIN_LEVEL_FOR_SLOW_CHECKS = 3;
+
+/** Checks if given level value is within the allowed bounds else it raises an Exception. */
+public static void checkIfLevelInBounds(int levelVal) throws IllegalArgumentException {
+if (levelVal < Level.MIN_VALUE || levelVal > Level.MAX_VALUE) {
+throw new IllegalArgumentException(
+String.format(
+Locale.ROOT,
+"ERROR: given value: '%d' for -level option is out of bounds. Please use a value from '%d'->'%d'",
+levelVal,
+Level.MIN_VALUE,
+Level.MAX_VALUE));
+}
+}
+}
+
 /**
 * Parse command line args into fields
 *
|
@@ -4051,15 +4126,29 @@ public final class CheckIndex implements Closeable {
 int i = 0;
 while (i < args.length) {
 String arg = args[i];
-if ("-fast".equals(arg)) {
-opts.doChecksumsOnly = true;
+if ("-level".equals(arg)) {
+if (i == args.length - 1) {
+throw new IllegalArgumentException("ERROR: missing value for -level option");
+}
+i++;
+int level = Integer.parseInt(args[i]);
+Level.checkIfLevelInBounds(level);
+opts.level = level;
+} else if ("-fast".equals(arg)) {
+// Deprecated. Remove in Lucene 11.
+System.err.println(
+"-fast is deprecated, use '-level 1' for explicitly verifying file checksums only. This is also now the default "
++ "behaviour!");
+} else if ("-slow".equals(arg)) {
+// Deprecated. Remove in Lucene 11.
+System.err.println("-slow is deprecated, use '-level 3' instead for slow checks");
+opts.level = Level.MIN_LEVEL_FOR_SLOW_CHECKS;
 } else if ("-exorcise".equals(arg)) {
 opts.doExorcise = true;
 } else if ("-crossCheckTermVectors".equals(arg)) {
-System.err.println("-crossCheckTermVectors is deprecated, use -slow instead");
-opts.doSlowChecks = true;
-} else if ("-slow".equals(arg)) {
-opts.doSlowChecks = true;
+// Deprecated. Remove in Lucene 11.
+System.err.println("-crossCheckTermVectors is deprecated, use '-level 3' instead");
+opts.level = Level.MAX_VALUE;
 } else if (arg.equals("-verbose")) {
 opts.verbose = true;
 } else if (arg.equals("-segment")) {
|
@@ -4096,11 +4185,13 @@ public final class CheckIndex implements Closeable {
 if (opts.indexPath == null) {
 throw new IllegalArgumentException(
 "\nERROR: index path not specified"
-+ "\nUsage: java org.apache.lucene.index.CheckIndex pathToIndex [-exorcise] [-slow] [-segment X] [-segment Y] [-threadCount X] [-dir-impl X]\n"
++ "\nUsage: java org.apache.lucene.index.CheckIndex pathToIndex [-exorcise] [-level X] [-segment X] [-segment Y] [-threadCount X] [-dir-impl X]\n"
 + "\n"
 + " -exorcise: actually write a new segments_N file, removing any problematic segments\n"
-+ " -fast: just verify file checksums, omitting logical integrity checks\n"
-+ " -slow: do additional slow checks; THIS IS VERY SLOW!\n"
++ " -level X: sets the detail level of the check. The higher the value, the more checks are done.\n"
++ " 1 - (Default) Checksum checks only.\n"
++ " 2 - All level 1 checks + logical integrity checks.\n"
++ " 3 - All level 2 checks + slow checks.\n"
 + " -codec X: when exorcising, codec to write the new segments_N file with\n"
 + " -verbose: print additional details\n"
 + " -segment X: only check the specified segments. This can be specified multiple\n"
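
The help text above maps the old flags onto the new levels: -fast becomes level 1 (now the default), logical integrity checking is level 2, and -slow becomes level 3. A rough sketch of driving the same levels programmatically is shown below; the setLevel call, the Level constants, and the Status.clean field are the ones visible in this patch, while the index path and the surrounding wiring are assumed.

// Hedged sketch only: running CheckIndex with the new -level semantics from Java.
// The CLI equivalent would be roughly:
//   java org.apache.lucene.index.CheckIndex /path/to/index -level 2
import java.nio.file.Paths;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class CheckIndexLevelExample {
  public static void main(String[] args) throws Exception {
    try (Directory dir = FSDirectory.open(Paths.get("/path/to/index"));
        CheckIndex checker = new CheckIndex(dir)) {
      // Level 2 = checksum checks plus logical integrity checks; level 3 would add the slow checks.
      checker.setLevel(CheckIndex.Level.MIN_LEVEL_FOR_INTEGRITY_CHECKS);
      checker.setInfoStream(System.out, false);
      CheckIndex.Status status = checker.checkIndex();
      System.out.println(status.clean ? "Index is clean" : "Index has problems");
    }
  }
}
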
|
@ -4115,7 +4206,8 @@ public final class CheckIndex implements Closeable {
|
|||
+ "If no package is specified the "
|
||||
+ FSDirectory.class.getPackage().getName()
|
||||
+ " package will be used.\n"
|
||||
+ "\n"
|
||||
+ "CheckIndex only verifies file checksums as default.\n"
|
||||
+ "Use -level with value of '2' or higher if you also want to check segment file contents.\n\n"
|
||||
+ "**WARNING**: -exorcise *LOSES DATA*. This should only be used on an emergency basis as it will cause\n"
|
||||
+ "documents (perhaps many) to be permanently removed from the index. Always make\n"
|
||||
+ "a backup copy of your index before running this! Do not run this tool on an index\n"
|
||||
|
@ -4137,10 +4229,6 @@ public final class CheckIndex implements Closeable {
|
|||
throw new IllegalArgumentException("ERROR: cannot specify both -exorcise and -segment");
|
||||
}
|
||||
|
||||
if (opts.doChecksumsOnly && opts.doSlowChecks) {
|
||||
throw new IllegalArgumentException("ERROR: cannot specify both -fast and -slow");
|
||||
}
|
||||
|
||||
return opts;
|
||||
}
|
||||
|
||||
|
@ -4151,8 +4239,7 @@ public final class CheckIndex implements Closeable {
|
|||
* @return 0 iff the index is clean, 1 otherwise
|
||||
*/
|
||||
public int doCheck(Options opts) throws IOException, InterruptedException {
|
||||
setDoSlowChecks(opts.doSlowChecks);
|
||||
setChecksumsOnly(opts.doChecksumsOnly);
|
||||
setLevel(opts.level);
|
||||
setInfoStream(opts.out, opts.verbose);
|
||||
// user provided thread count via command line argument, overriding the default with user
|
||||
// provided value
|
||||
|
@ -4166,8 +4253,8 @@ public final class CheckIndex implements Closeable {
|
|||
return 1;
|
||||
}
|
||||
|
||||
if (!result.clean) {
|
||||
if (!opts.doExorcise) {
|
||||
if (result.clean == false) {
|
||||
if (opts.doExorcise == false) {
|
||||
opts.out.println(
|
||||
"WARNING: would write new segments file, and "
|
||||
+ result.totLoseDocCount
|
||||
|
|
|
@ -270,7 +270,6 @@ final class FieldUpdatesBuffer {
|
|||
static class BufferedUpdate {
|
||||
|
||||
private BufferedUpdate() {}
|
||||
;
|
||||
|
||||
/** the max document ID this update should be applied to */
|
||||
int docUpTo;
|
||||
|
|
|
@ -33,6 +33,7 @@ import java.util.HashSet;
|
|||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Queue;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ConcurrentLinkedQueue;
|
||||
|
@ -55,6 +56,8 @@ import org.apache.lucene.index.DocValuesUpdate.BinaryDocValuesUpdate;
|
|||
import org.apache.lucene.index.DocValuesUpdate.NumericDocValuesUpdate;
|
||||
import org.apache.lucene.index.FieldInfos.FieldNumbers;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.index.MergePolicy.MergeReader;
|
||||
import org.apache.lucene.index.Sorter.DocMap;
|
||||
import org.apache.lucene.internal.tests.IndexPackageAccess;
|
||||
import org.apache.lucene.internal.tests.IndexWriterAccess;
|
||||
import org.apache.lucene.internal.tests.TestSecrets;
|
||||
|
@ -3413,8 +3416,20 @@ public class IndexWriter
|
|||
Collections.emptyMap(),
|
||||
config.getIndexSort());
|
||||
|
||||
List<CodecReader> readers =
|
||||
merge.getMergeReader().stream().map(r -> r.codecReader).collect(Collectors.toList());
|
||||
List<CodecReader> readers = new ArrayList<>();
|
||||
for (MergeReader mr : merge.getMergeReader()) {
|
||||
CodecReader reader = merge.wrapForMerge(mr.codecReader);
|
||||
readers.add(reader);
|
||||
}
|
||||
|
||||
if (config.getIndexSort() == null && readers.isEmpty() == false) {
|
||||
CodecReader mergedReader = SlowCompositeCodecReaderWrapper.wrap(readers);
|
||||
DocMap docMap = merge.reorder(mergedReader, directory);
|
||||
if (docMap != null) {
|
||||
readers = Collections.singletonList(SortingCodecReader.wrap(mergedReader, docMap, null));
|
||||
}
|
||||
}
|
||||
|
||||
SegmentMerger merger =
|
||||
new SegmentMerger(readers, segInfo, infoStream, trackingDir, globalFieldNumberMap, context);
|
||||
|
||||
|
@ -3464,6 +3479,8 @@ public class IndexWriter
|
|||
merge.getMergeInfo().info.setUseCompoundFile(true);
|
||||
}
|
||||
|
||||
merge.setMergeInfo(merge.info);
|
||||
|
||||
// Have codec write SegmentInfo. Must do this after
|
||||
// creating CFS so that 1) .si isn't slurped into CFS,
|
||||
// and 2) .si reflects useCompoundFile=true change
|
||||
|
@ -3791,7 +3808,7 @@ public class IndexWriter
|
|||
new OneMergeWrappingMergePolicy(
|
||||
config.getMergePolicy(),
|
||||
toWrap ->
|
||||
new MergePolicy.OneMerge(toWrap.segments) {
|
||||
new MergePolicy.OneMerge(toWrap) {
|
||||
SegmentCommitInfo origInfo;
|
||||
final AtomicBoolean onlyOnce = new AtomicBoolean(false);
|
||||
|
||||
|
@ -3890,6 +3907,18 @@ public class IndexWriter
|
|||
public CodecReader wrapForMerge(CodecReader reader) throws IOException {
|
||||
return toWrap.wrapForMerge(reader); // must delegate
|
||||
}
|
||||
|
||||
@Override
|
||||
public Sorter.DocMap reorder(CodecReader reader, Directory dir)
|
||||
throws IOException {
|
||||
return toWrap.reorder(reader, dir); // must delegate
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setMergeInfo(SegmentCommitInfo info) {
|
||||
super.setMergeInfo(info);
|
||||
toWrap.setMergeInfo(info);
|
||||
}
|
||||
}),
|
||||
trigger,
|
||||
UNBOUNDED_MAX_MERGE_SEGMENTS);
|
||||
|
@ -4312,7 +4341,7 @@ public class IndexWriter
|
|||
* merge.info). If no deletes were flushed, no new deletes file is saved.
|
||||
*/
|
||||
private synchronized ReadersAndUpdates commitMergedDeletesAndUpdates(
|
||||
MergePolicy.OneMerge merge, MergeState mergeState) throws IOException {
|
||||
MergePolicy.OneMerge merge, MergeState.DocMap[] docMaps) throws IOException {
|
||||
|
||||
mergeFinishedGen.incrementAndGet();
|
||||
|
||||
|
@ -4336,7 +4365,7 @@ public class IndexWriter
|
|||
|
||||
boolean anyDVUpdates = false;
|
||||
|
||||
assert sourceSegments.size() == mergeState.docMaps.length;
|
||||
assert sourceSegments.size() == docMaps.length;
|
||||
for (int i = 0; i < sourceSegments.size(); i++) {
|
||||
SegmentCommitInfo info = sourceSegments.get(i);
|
||||
minGen = Math.min(info.getBufferedDeletesGen(), minGen);
|
||||
|
@ -4346,12 +4375,11 @@ public class IndexWriter
|
|||
// the pool:
|
||||
assert rld != null : "seg=" + info.info.name;
|
||||
|
||||
MergeState.DocMap segDocMap = mergeState.docMaps[i];
|
||||
MergeState.DocMap segDocMap = docMaps[i];
|
||||
|
||||
carryOverHardDeletes(
|
||||
mergedDeletesAndUpdates,
|
||||
maxDoc,
|
||||
mergeState.liveDocs[i],
|
||||
merge.getMergeReader().get(i).hardLiveDocs,
|
||||
rld.getHardLiveDocs(),
|
||||
segDocMap);
|
||||
|
@ -4454,26 +4482,21 @@ public class IndexWriter
|
|||
private static void carryOverHardDeletes(
|
||||
ReadersAndUpdates mergedReadersAndUpdates,
|
||||
int maxDoc,
|
||||
Bits mergeLiveDocs, // the liveDocs used to build the segDocMaps
|
||||
Bits prevHardLiveDocs, // the hard deletes when the merge reader was pulled
|
||||
Bits currentHardLiveDocs, // the current hard deletes
|
||||
MergeState.DocMap segDocMap)
|
||||
throws IOException {
|
||||
|
||||
assert mergeLiveDocs == null || mergeLiveDocs.length() == maxDoc;
|
||||
// if we mix soft and hard deletes we need to make sure that we only carry over deletes
|
||||
// that were not deleted before. Otherwise the segDocMap doesn't contain a mapping.
|
||||
// yet this is also required if any MergePolicy modifies the liveDocs since this is
|
||||
// what the segDocMap is build on.
|
||||
final IntPredicate carryOverDelete =
|
||||
mergeLiveDocs == null || mergeLiveDocs == prevHardLiveDocs
|
||||
? docId -> currentHardLiveDocs.get(docId) == false
|
||||
: docId -> mergeLiveDocs.get(docId) && currentHardLiveDocs.get(docId) == false;
|
||||
docId -> segDocMap.get(docId) != -1 && currentHardLiveDocs.get(docId) == false;
|
||||
if (prevHardLiveDocs != null) {
|
||||
// If we had deletions on starting the merge we must
|
||||
// still have deletions now:
|
||||
assert currentHardLiveDocs != null;
|
||||
assert mergeLiveDocs != null;
|
||||
assert prevHardLiveDocs.length() == maxDoc;
|
||||
assert currentHardLiveDocs.length() == maxDoc;
|
||||
|
||||
|
@ -4516,7 +4539,7 @@ public class IndexWriter
|
|||
}
|
||||
|
||||
@SuppressWarnings("try")
|
||||
private synchronized boolean commitMerge(MergePolicy.OneMerge merge, MergeState mergeState)
|
||||
private synchronized boolean commitMerge(MergePolicy.OneMerge merge, MergeState.DocMap[] docMaps)
|
||||
throws IOException {
|
||||
merge.onMergeComplete();
|
||||
testPoint("startCommitMerge");
|
||||
|
@ -4559,7 +4582,7 @@ public class IndexWriter
|
|||
}
|
||||
|
||||
final ReadersAndUpdates mergedUpdates =
|
||||
merge.info.info.maxDoc() == 0 ? null : commitMergedDeletesAndUpdates(merge, mergeState);
|
||||
merge.info.info.maxDoc() == 0 ? null : commitMergedDeletesAndUpdates(merge, docMaps);
|
||||
|
||||
// If the doc store we are using has been closed and
|
||||
// is in now compound format (but wasn't when we
|
||||
|
@ -5163,12 +5186,57 @@ public class IndexWriter
|
|||
}
|
||||
mergeReaders.add(wrappedReader);
|
||||
}
|
||||
|
||||
MergeState.DocMap[] reorderDocMaps = null;
|
||||
if (config.getIndexSort() == null) {
|
||||
// Create a merged view of the input segments. This effectively does the merge.
|
||||
CodecReader mergedView = SlowCompositeCodecReaderWrapper.wrap(mergeReaders);
|
||||
Sorter.DocMap docMap = merge.reorder(mergedView, directory);
|
||||
if (docMap != null) {
|
||||
reorderDocMaps = new MergeState.DocMap[mergeReaders.size()];
|
||||
int docBase = 0;
|
||||
int i = 0;
|
||||
for (CodecReader reader : mergeReaders) {
|
||||
final int currentDocBase = docBase;
|
||||
reorderDocMaps[i] =
|
||||
docID -> {
|
||||
Objects.checkIndex(docID, reader.maxDoc());
|
||||
return docMap.oldToNew(currentDocBase + docID);
|
||||
};
|
||||
i++;
|
||||
docBase += reader.maxDoc();
|
||||
}
|
||||
// This makes merging more expensive as it disables some bulk merging optimizations, so
|
||||
// only do this if a non-null DocMap is returned.
|
||||
mergeReaders =
|
||||
Collections.singletonList(SortingCodecReader.wrap(mergedView, docMap, null));
|
||||
}
|
||||
}
|
||||
|
||||
final SegmentMerger merger =
|
||||
new SegmentMerger(
|
||||
mergeReaders, merge.info.info, infoStream, dirWrapper, globalFieldNumberMap, context);
|
||||
merge.info.setSoftDelCount(Math.toIntExact(softDeleteCount.get()));
|
||||
merge.checkAborted();
|
||||
|
||||
MergeState mergeState = merger.mergeState;
|
||||
MergeState.DocMap[] docMaps;
|
||||
if (reorderDocMaps == null) {
|
||||
docMaps = mergeState.docMaps;
|
||||
} else {
|
||||
// Since the reader was reordered, we passed a merged view to MergeState and from its
|
||||
// perspective there is a single input segment to the merge and the
|
||||
// SlowCompositeCodecReaderWrapper is effectively doing the merge.
|
||||
assert mergeState.docMaps.length == 1
|
||||
: "Got " + mergeState.docMaps.length + " docMaps, but expected 1";
|
||||
MergeState.DocMap compactionDocMap = mergeState.docMaps[0];
|
||||
docMaps = new MergeState.DocMap[reorderDocMaps.length];
|
||||
for (int i = 0; i < docMaps.length; ++i) {
|
||||
MergeState.DocMap reorderDocMap = reorderDocMaps[i];
|
||||
docMaps[i] = docID -> compactionDocMap.get(reorderDocMap.get(docID));
|
||||
}
|
||||
}
|
||||
|
||||
merge.mergeStartNS = System.nanoTime();
|
||||
|
||||
// This is where all the work happens:
|
||||
|
@ -5176,7 +5244,6 @@ public class IndexWriter
|
|||
merger.merge();
|
||||
}
|
||||
|
||||
MergeState mergeState = merger.mergeState;
|
||||
assert mergeState.segmentInfo == merge.info.info;
|
||||
merge.info.info.setFiles(new HashSet<>(dirWrapper.getCreatedFiles()));
|
||||
Codec codec = config.getCodec();
|
||||
|
@ -5229,7 +5296,7 @@ public class IndexWriter
|
|||
// Merge would produce a 0-doc segment, so we do nothing except commit the merge to remove
|
||||
// all the 0-doc segments that we "merged":
|
||||
assert merge.info.info.maxDoc() == 0;
|
||||
success = commitMerge(merge, mergeState);
|
||||
success = commitMerge(merge, docMaps);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -5309,6 +5376,8 @@ public class IndexWriter
|
|||
success = false;
|
||||
}
|
||||
|
||||
merge.setMergeInfo(merge.info);
|
||||
|
||||
// Have codec write SegmentInfo. Must do this after
|
||||
// creating CFS so that 1) .si isn't slurped into CFS,
|
||||
// and 2) .si reflects useCompoundFile=true change
|
||||
|
@ -5352,7 +5421,7 @@ public class IndexWriter
|
|||
}
|
||||
}
|
||||
|
||||
if (!commitMerge(merge, mergeState)) {
|
||||
if (!commitMerge(merge, docMaps)) {
|
||||
// commitMerge will return false if this merge was
|
||||
// aborted
|
||||
return 0;
|
||||
|
|
|
@@ -255,6 +255,15 @@ public abstract class MergePolicy {
 usesPooledReaders = false;
 }

+/** Constructor for wrapping. */
+protected OneMerge(OneMerge oneMerge) {
+this.segments = oneMerge.segments;
+this.mergeReaders = oneMerge.mergeReaders;
+this.totalMaxDoc = oneMerge.totalMaxDoc;
+this.mergeProgress = new OneMergeProgress();
+this.usesPooledReaders = oneMerge.usesPooledReaders;
+}
+
 /**
 * Called by {@link IndexWriter} after the merge started and from the thread that will be
 * executing the merge.
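
The new copy constructor exists so that a OneMerge can be wrapped while sharing the same segments and merge readers; the OneMergeWrappingMergePolicy usage in the IndexWriter hunks earlier in this diff relies on it. A hypothetical wrapper along those lines is sketched below; the class name and the logging line are invented, and the delegation pattern mirrors what the patch itself does.

// Hypothetical sketch built on the new protected OneMerge(OneMerge) copy constructor.
import java.io.IOException;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.SegmentCommitInfo;

final class DelegatingOneMerge extends MergePolicy.OneMerge {
  private final MergePolicy.OneMerge toWrap;

  DelegatingOneMerge(MergePolicy.OneMerge toWrap) {
    super(toWrap); // copies segments, mergeReaders, totalMaxDoc and usesPooledReaders; fresh progress
    this.toWrap = toWrap;
  }

  @Override
  public CodecReader wrapForMerge(CodecReader reader) throws IOException {
    System.out.println("merging " + reader.maxDoc() + " docs from one source");
    return toWrap.wrapForMerge(reader); // must delegate, as the patch notes
  }

  @Override
  public void setMergeInfo(SegmentCommitInfo info) {
    super.setMergeInfo(info);
    toWrap.setMergeInfo(info); // keep the wrapped merge informed as well
  }
}
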
|
@@ -288,11 +297,32 @@ public abstract class MergePolicy {
 }
 }

-/** Wrap the reader in order to add/remove information to the merged segment. */
+/**
+* Wrap a reader prior to merging in order to add/remove fields or documents.
+*
+* <p><b>NOTE:</b> It is illegal to reorder doc IDs here, use {@link
+* #reorder(CodecReader,Directory)} instead.
+*/
 public CodecReader wrapForMerge(CodecReader reader) throws IOException {
 return reader;
 }

+/**
+* Extend this method if you wish to renumber doc IDs. This method will be called when index
+* sorting is disabled on a merged view of the {@link OneMerge}. A {@code null} return value
+* indicates that doc IDs should not be reordered.
+*
+* <p><b>NOTE:</b> Returning a non-null value here disables several optimizations and increases
+* the merging overhead.
+*
+* @param reader The reader to reorder.
+* @param dir The {@link Directory} of the index, which may be used to create temporary files.
+* @lucene.experimental
+*/
+public Sorter.DocMap reorder(CodecReader reader, Directory dir) throws IOException {
+return null;
+}
+
 /**
 * Expert: Sets the {@link SegmentCommitInfo} of the merged segment. Allows sub-classes to e.g.
 * {@link SegmentInfo#addDiagnostics(Map) add diagnostic} properties.
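
reorder is the new extension point for renumbering doc IDs at merge time when no index sort is configured. As a toy illustration only, and assuming Sorter.DocMap (with size/oldToNew/newToOld) can be subclassed from a MergePolicy outside org.apache.lucene.index, an override that reverses the doc order of each merged segment might look like this; none of it is code from the patch.

// Toy sketch: reverse the doc order of every merged segment.
// Assumption: Sorter.DocMap exposes size()/oldToNew()/newToOld() and is accessible here.
import java.io.IOException;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.Sorter;
import org.apache.lucene.store.Directory;

final class ReversingOneMerge extends MergePolicy.OneMerge {
  ReversingOneMerge(MergePolicy.OneMerge toWrap) {
    super(toWrap);
  }

  @Override
  public Sorter.DocMap reorder(CodecReader reader, Directory dir) throws IOException {
    final int maxDoc = reader.maxDoc();
    return new Sorter.DocMap() {
      @Override
      public int size() {
        return maxDoc;
      }

      @Override
      public int oldToNew(int docID) {
        return maxDoc - 1 - docID; // reverse mapping
      }

      @Override
      public int newToOld(int docID) {
        return maxDoc - 1 - docID; // reversing is its own inverse
      }
    };
  }
}
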
|
@ -355,11 +385,7 @@ public abstract class MergePolicy {
|
|||
* not indicate the number of documents after the merge.
|
||||
*/
|
||||
public int totalNumDocs() {
|
||||
int total = 0;
|
||||
for (SegmentCommitInfo info : segments) {
|
||||
total += info.info.maxDoc();
|
||||
}
|
||||
return total;
|
||||
return totalMaxDoc;
|
||||
}
|
||||
|
||||
/** Return {@link MergeInfo} describing this merge. */
|
||||
|
|
|
@ -177,16 +177,13 @@ public class MergeState {
|
|||
|
||||
final int docBase = totalDocs;
|
||||
docMaps[i] =
|
||||
new DocMap() {
|
||||
@Override
|
||||
public int get(int docID) {
|
||||
if (liveDocs == null) {
|
||||
return docBase + docID;
|
||||
} else if (liveDocs.get(docID)) {
|
||||
return docBase + (int) delDocMap.get(docID);
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
docID -> {
|
||||
if (liveDocs == null) {
|
||||
return docBase + docID;
|
||||
} else if (liveDocs.get(docID)) {
|
||||
return docBase + (int) delDocMap.get(docID);
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
};
|
||||
totalDocs += reader.numDocs();
|
||||
|
@@ -242,13 +239,10 @@ public class MergeState {
 }

 /** A map of doc IDs. */
-public abstract static class DocMap {
-/** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
-// Explicitly declared so that we have non-empty javadoc
-protected DocMap() {}
-
+@FunctionalInterface
+public interface DocMap {
 /** Return the mapped docID or -1 if the given doc is not mapped. */
-public abstract int get(int docID);
+int get(int docID);
 }

 static PackedLongValues removeDeletes(final int maxDoc, final Bits liveDocs) {
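
Because DocMap is now a @FunctionalInterface with a single get method, doc ID maps can be written and composed as lambdas, much like the IndexWriter hunk earlier in this diff chains a reorder map with the compaction map. The two maps below are invented for illustration; the -1 convention for unmapped documents follows the javadoc above.

// Illustrative only: build and compose MergeState.DocMap instances with lambdas.
import org.apache.lucene.index.MergeState;

public class DocMapComposition {
  public static void main(String[] args) {
    int docBase = 100;
    MergeState.DocMap shiftByBase = docID -> docBase + docID; // e.g. a per-segment base offset
    MergeState.DocMap dropOdd = docID -> (docID % 2 == 0) ? docID / 2 : -1; // -1 means "not mapped"

    // Composition must propagate the "not mapped" marker instead of remapping -1.
    MergeState.DocMap composed =
        docID -> {
          int mapped = dropOdd.get(docID);
          return mapped == -1 ? -1 : shiftByBase.get(mapped);
        };

    System.out.println(composed.get(4)); // 102
    System.out.println(composed.get(5)); // -1
  }
}
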
|
|
|
@ -122,14 +122,11 @@ final class MultiSorter {
|
|||
final PackedLongValues remapped = builders[i].build();
|
||||
final Bits liveDocs = readers.get(i).getLiveDocs();
|
||||
docMaps[i] =
|
||||
new MergeState.DocMap() {
|
||||
@Override
|
||||
public int get(int docID) {
|
||||
if (liveDocs == null || liveDocs.get(docID)) {
|
||||
return (int) remapped.get(docID);
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
docID -> {
|
||||
if (liveDocs == null || liveDocs.get(docID)) {
|
||||
return (int) remapped.get(docID);
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
|
@ -325,7 +325,6 @@ public abstract class PointValues {
|
|||
|
||||
/** Notifies the caller that this many documents are about to be visited */
|
||||
default void grow(int count) {}
|
||||
;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -526,7 +526,6 @@ final class ReadersAndUpdates {
|
|||
return docIDOut;
|
||||
}
|
||||
}
|
||||
;
|
||||
|
||||
private synchronized Set<String> writeFieldInfosGen(
|
||||
FieldInfos fieldInfos, Directory dir, FieldInfosFormat infosFormat) throws IOException {
|
||||
|
|
|
@ -122,7 +122,7 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
|
|||
static final int VERSION_CURRENT = VERSION_86;
|
||||
|
||||
/** Name of the generation reference file name */
|
||||
private static final String OLD_SEGMENTS_GEN = "segments.gen";
|
||||
static final String OLD_SEGMENTS_GEN = "segments.gen";
|
||||
|
||||
/** Used to name new segments. */
|
||||
public long counter;
|
||||
|
@ -146,7 +146,7 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
|
|||
*
|
||||
* @see #setInfoStream
|
||||
*/
|
||||
private static PrintStream infoStream = null;
|
||||
private static PrintStream infoStream;
|
||||
|
||||
/** Id for this commit; only written starting with Lucene 5.0 */
|
||||
private byte[] id;
|
||||
|
@ -1010,6 +1010,7 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
|
|||
void replace(SegmentInfos other) {
|
||||
rollbackSegmentInfos(other.asList());
|
||||
lastGeneration = other.lastGeneration;
|
||||
userData = other.userData;
|
||||
}
|
||||
|
||||
/** Returns sum of all segment's maxDocs. Note that this does not include deletions */
|
||||
|
|
File diff suppressed because it is too large
|
@ -24,6 +24,7 @@ import java.util.Arrays;
|
|||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import org.apache.lucene.codecs.DocValuesProducer;
|
||||
import org.apache.lucene.codecs.FieldsProducer;
|
||||
import org.apache.lucene.codecs.KnnVectorsReader;
|
||||
|
@ -77,7 +78,7 @@ public final class SortingCodecReader extends FilterCodecReader {
|
|||
private final Sorter.DocMap docMap;
|
||||
|
||||
SortingPointValues(final PointValues in, Sorter.DocMap docMap) {
|
||||
this.in = in;
|
||||
this.in = Objects.requireNonNull(in);
|
||||
this.docMap = docMap;
|
||||
}
|
||||
|
||||
|
@ -472,6 +473,10 @@ public final class SortingCodecReader extends FilterCodecReader {
|
|||
|
||||
@Override
|
||||
public PointValues getValues(String field) throws IOException {
|
||||
var values = delegate.getValues(field);
|
||||
if (values == null) {
|
||||
return null;
|
||||
}
|
||||
return new SortingPointValues(delegate.getValues(field), docMap);
|
||||
}
|
||||
|
||||
|
|
|
@ -85,7 +85,11 @@ public final class IndexOrDocValuesQuery extends Query {
|
|||
|
||||
@Override
|
||||
public String toString(String field) {
|
||||
return indexQuery.toString(field);
|
||||
return "IndexOrDocValuesQuery(indexQuery="
|
||||
+ indexQuery.toString(field)
|
||||
+ ", dvQuery="
|
||||
+ dvQuery.toString(field)
|
||||
+ ")";
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.search;
|
|||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
@@ -62,9 +61,9 @@ import org.apache.lucene.util.automaton.ByteRunAutomaton;
 * match lots of documents, counting the number of hits may take much longer than computing the top
 * hits so this trade-off allows to get some minimal information about the hit count without slowing
 * down search too much. The {@link TopDocs#scoreDocs} array is always accurate however. If this
-* behavior doesn't suit your needs, you should create collectors manually with either {@link
-* TopScoreDocCollector#create} or {@link TopFieldCollector#create} and call {@link #search(Query,
-* Collector)}.
+* behavior doesn't suit your needs, you should create collectorManagers manually with either {@link
+* TopScoreDocCollectorManager} or {@link TopFieldCollectorManager} and call {@link #search(Query,
+* CollectorManager)}.
 *
 * <p><a id="thread-safety"></a>
 *
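
For callers, the updated javadoc means building a CollectorManager instead of a Collector. A sketch of that path is below; the four TopScoreDocCollectorManager constructor arguments follow the call added later in this diff, while the index path, field name, and query are placeholders.

// Sketch of the CollectorManager-based search path the updated javadoc points to.
import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.CollectorManager;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.TopScoreDocCollectorManager;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class CollectorManagerSearch {
  public static void main(String[] args) throws Exception {
    try (Directory dir = FSDirectory.open(Paths.get("/path/to/index"));
        DirectoryReader reader = DirectoryReader.open(dir)) {
      IndexSearcher searcher = new IndexSearcher(reader);
      // numHits, after, totalHitsThreshold, supportsConcurrency (as used later in this diff)
      CollectorManager<TopScoreDocCollector, TopDocs> manager =
          new TopScoreDocCollectorManager(10, null, 1000, searcher.getSlices().length > 1);
      TopDocs hits = searcher.search(new TermQuery(new Term("body", "lucene")), manager);
      System.out.println("total hits: " + hits.totalHits);
    }
  }
}
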
|
@ -455,35 +454,10 @@ public class IndexSearcher {
|
|||
}
|
||||
|
||||
final int cappedNumHits = Math.min(numHits, limit);
|
||||
|
||||
final LeafSlice[] leafSlices = getSlices();
|
||||
final CollectorManager<TopScoreDocCollector, TopDocs> manager =
|
||||
new CollectorManager<TopScoreDocCollector, TopDocs>() {
|
||||
|
||||
private final HitsThresholdChecker hitsThresholdChecker =
|
||||
leafSlices.length <= 1
|
||||
? HitsThresholdChecker.create(Math.max(TOTAL_HITS_THRESHOLD, numHits))
|
||||
: HitsThresholdChecker.createShared(Math.max(TOTAL_HITS_THRESHOLD, numHits));
|
||||
|
||||
private final MaxScoreAccumulator minScoreAcc =
|
||||
leafSlices.length <= 1 ? null : new MaxScoreAccumulator();
|
||||
|
||||
@Override
|
||||
public TopScoreDocCollector newCollector() throws IOException {
|
||||
return TopScoreDocCollector.create(
|
||||
cappedNumHits, after, hitsThresholdChecker, minScoreAcc);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TopDocs reduce(Collection<TopScoreDocCollector> collectors) throws IOException {
|
||||
final TopDocs[] topDocs = new TopDocs[collectors.size()];
|
||||
int i = 0;
|
||||
for (TopScoreDocCollector collector : collectors) {
|
||||
topDocs[i++] = collector.topDocs();
|
||||
}
|
||||
return TopDocs.merge(0, cappedNumHits, topDocs);
|
||||
}
|
||||
};
|
||||
final boolean supportsConcurrency = getSlices().length > 1;
|
||||
CollectorManager<TopScoreDocCollector, TopDocs> manager =
|
||||
new TopScoreDocCollectorManager(
|
||||
cappedNumHits, after, TOTAL_HITS_THRESHOLD, supportsConcurrency);
|
||||
|
||||
return search(query, manager);
|
||||
}
|
||||
|
@ -510,7 +484,10 @@ public class IndexSearcher {
|
|||
*
|
||||
* @throws TooManyClauses If a query would exceed {@link IndexSearcher#getMaxClauseCount()}
|
||||
* clauses.
|
||||
* @deprecated This method is being deprecated in favor of {@link IndexSearcher#search(Query,
|
||||
* CollectorManager)} due to its support for concurrency in IndexSearcher
|
||||
*/
|
||||
@Deprecated
|
||||
public void search(Query query, Collector results) throws IOException {
|
||||
query = rewrite(query, results.scoreMode().needsScores());
|
||||
search(leafContexts, createWeight(query, results.scoreMode(), 1), results);
|
||||
|
@ -602,34 +579,10 @@ public class IndexSearcher {
|
|||
final Sort rewrittenSort = sort.rewrite(this);
|
||||
final LeafSlice[] leafSlices = getSlices();
|
||||
|
||||
final boolean supportsConcurrency = leafSlices.length > 1;
|
||||
final CollectorManager<TopFieldCollector, TopFieldDocs> manager =
|
||||
new CollectorManager<>() {
|
||||
|
||||
private final HitsThresholdChecker hitsThresholdChecker =
|
||||
leafSlices.length <= 1
|
||||
? HitsThresholdChecker.create(Math.max(TOTAL_HITS_THRESHOLD, numHits))
|
||||
: HitsThresholdChecker.createShared(Math.max(TOTAL_HITS_THRESHOLD, numHits));
|
||||
|
||||
private final MaxScoreAccumulator minScoreAcc =
|
||||
leafSlices.length <= 1 ? null : new MaxScoreAccumulator();
|
||||
|
||||
@Override
|
||||
public TopFieldCollector newCollector() throws IOException {
|
||||
// TODO: don't pay the price for accurate hit counts by default
|
||||
return TopFieldCollector.create(
|
||||
rewrittenSort, cappedNumHits, after, hitsThresholdChecker, minScoreAcc);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TopFieldDocs reduce(Collection<TopFieldCollector> collectors) throws IOException {
|
||||
final TopFieldDocs[] topDocs = new TopFieldDocs[collectors.size()];
|
||||
int i = 0;
|
||||
for (TopFieldCollector collector : collectors) {
|
||||
topDocs[i++] = collector.topDocs();
|
||||
}
|
||||
return TopDocs.merge(rewrittenSort, 0, cappedNumHits, topDocs);
|
||||
}
|
||||
};
|
||||
new TopFieldCollectorManager(
|
||||
rewrittenSort, cappedNumHits, after, TOTAL_HITS_THRESHOLD, supportsConcurrency);
|
||||
|
||||
TopFieldDocs topDocs = search(query, manager);
|
||||
if (doDocScores) {
|
||||
|
|
|
@ -69,7 +69,6 @@ public abstract class PointInSetQuery extends Query implements Accountable {
|
|||
@Override
|
||||
public abstract BytesRef next();
|
||||
}
|
||||
;
|
||||
|
||||
/** The {@code packedPoints} iterator must be in sorted order. */
|
||||
protected PointInSetQuery(String field, int numDims, int bytesPerDim, Stream packedPoints) {
|
||||
|
|
Some files were not shown because too many files have changed in this diff.