Merge branch 'main' into java_21

This commit is contained in:
ChrisHegarty 2023-12-11 14:18:04 +00:00
commit 40c03b0e6c
300 changed files with 8039 additions and 4021 deletions

View File

@ -117,6 +117,9 @@ apply from: file('buildSrc/scriptDepVersions.gradle')
apply from: file('gradle/generation/local-settings.gradle')
// Make sure the build environment is consistent.
apply from: file('gradle/validation/check-environment.gradle')
// IDE support, settings and specials.
apply from: file('gradle/ide/intellij-idea.gradle')
apply from: file('gradle/ide/eclipse.gradle')

View File

@ -38,3 +38,9 @@ dependencies {
implementation "commons-codec:commons-codec:${scriptDepVersions['commons-codec']}"
}
if (!rootProject.hasJavaFlightRecorder) {
logger.warn('Module jdk.jfr is not available; skipping compilation of Java Flight Recorder support.')
tasks.named('compileJava').configure {
exclude('**/ProfileResults.java')
}
}

View File

@ -24,7 +24,7 @@ ext {
"apache-rat": "0.14",
"asm": "9.6",
"commons-codec": "1.13",
"ecj": "3.36.0-SNAPSHOT",
"ecj": "3.36.0",
"flexmark": "0.61.24",
"javacc": "7.0.12",
"jflex": "1.8.2",

View File

@ -15,20 +15,18 @@
* limitations under the License.
*/
import org.apache.lucene.gradle.ProfileResults;
def recordings = files()
allprojects {
plugins.withType(JavaPlugin) {
ext {
testOptions += [
[propName: 'tests.profile', value: false, description: "Enable java flight recorder profiling."]
[propName: 'tests.profile', value: false, description: "Enable Java Flight Recorder profiling."]
]
}
if (resolvedTestOption("tests.profile").toBoolean()) {
allprojects {
if (rootProject.hasJavaFlightRecorder) {
tasks.withType(Test) {
jvmArgs("-XX:StartFlightRecording=dumponexit=true,maxsize=250M,settings=" + rootProject.file("gradle/testing/profiling.jfc"),
"-XX:+UnlockDiagnosticVMOptions",
@ -41,6 +39,8 @@ allprojects {
recordings = recordings.plus fileTree(dir: workingDir, include: '*.jfr')
}
}
} else {
throw new GradleException('Module jdk.jfr is not available; Java Flight Recorder profiles cannot be enabled.')
}
}
}
@ -48,10 +48,11 @@ allprojects {
gradle.buildFinished {
if (!recordings.isEmpty()) {
ProfileResults.printReport(recordings.getFiles().collect { it.toString() },
propertyOrDefault(ProfileResults.MODE_KEY, ProfileResults.MODE_DEFAULT) as String,
Integer.parseInt(propertyOrDefault(ProfileResults.STACKSIZE_KEY, ProfileResults.STACKSIZE_DEFAULT)),
Integer.parseInt(propertyOrDefault(ProfileResults.COUNT_KEY, ProfileResults.COUNT_DEFAULT)),
Boolean.parseBoolean(propertyOrDefault(ProfileResults.LINENUMBERS_KEY, ProfileResults.LINENUMBERS_DEFAULT)))
def pr = org.apache.lucene.gradle.ProfileResults;
pr.printReport(recordings.getFiles().collect { it.toString() },
propertyOrDefault(pr.MODE_KEY, pr.MODE_DEFAULT) as String,
Integer.parseInt(propertyOrDefault(pr.STACKSIZE_KEY, pr.STACKSIZE_DEFAULT)),
Integer.parseInt(propertyOrDefault(pr.COUNT_KEY, pr.COUNT_DEFAULT)),
Boolean.parseBoolean(propertyOrDefault(pr.LINENUMBERS_KEY, pr.LINENUMBERS_DEFAULT)))
}
}

View File

@ -23,8 +23,6 @@ grant {
// jetty-specific:
permission java.lang.RuntimePermission "getenv.JETTY_AVAILABLE_PROCESSORS";
permission java.lang.RuntimePermission "getenv.JETTY_WORKER_INSTANCE";
// servlet stuff
permission java.lang.RuntimePermission "setContextClassLoader";
// allow TestNRTReplication fork its jvm
permission java.io.FilePermission "${java.home}${/}-", "read,execute";
// read/write access to all system properties (required by jetty in these tests)

View File

@ -50,14 +50,11 @@ grant {
permission java.lang.RuntimePermission "getStackTrace";
// needed for mock filesystems in tests
permission java.lang.RuntimePermission "fileSystemProvider";
// analyzers/uima: needed by lucene expressions' JavascriptCompiler
permission java.lang.RuntimePermission "createClassLoader";
// needed to test unmap hack on platforms that support it
permission java.lang.RuntimePermission "accessClassInPackage.sun.misc";
permission java.lang.reflect.ReflectPermission "suppressAccessChecks";
// needed by cyberneko usage by benchmarks on J9
permission java.lang.RuntimePermission "accessClassInPackage.org.apache.xerces.util";
permission java.lang.RuntimePermission "getClassLoader";
// Needed for loading native library (lucene:misc:native) in lucene:misc
permission java.lang.RuntimePermission "getFileStoreAttributes";
@ -111,6 +108,8 @@ grant {
permission java.lang.RuntimePermission "shutdownHooks";
// needed by jacoco to instrument classes
permission java.lang.RuntimePermission "defineClass";
// needed by jacoco (the exact reason is not known).
permission java.lang.RuntimePermission "createClassLoader";
};
// Grant all permissions to Gradle test runner classes.

View File

@ -23,6 +23,7 @@ import org.gradle.util.GradleVersion
configure(rootProject) {
ext {
expectedGradleVersion = '8.4'
hasJavaFlightRecorder = ModuleLayer.boot().findModule('jdk.jfr').map(this.class.module::canRead).orElse(false)
}
wrapper {

View File

@ -17,8 +17,8 @@
def skipReason
if (rootProject.usesAltJvm && rootProject.runtimeJavaVersion > JavaVersion.VERSION_15) {
skipReason = "won't work with JDK ${rootProject.runtimeJavaVersion} if used as alternative java toolchain"
if (rootProject.usesAltJvm) {
skipReason = "won't work with alternative java toolchain"
}
if (!propertyOrDefault("validation.errorprone", isCIBuild).asBoolean()) {
@ -37,7 +37,7 @@ if (skipReason) {
allprojects { prj ->
plugins.withType(JavaPlugin) {
// LUCENE-9650: Errorprone on master/gradle does not work with JDK-16+ when running as plugin
// LUCENE-9650: Errorprone on master/gradle does not work when running as plugin
// inside a forked Javac process. Javac running inside Gradle works, because we have
// additional module system opens in place.
// This is a hack to keep the dependency (so that palantir's version check doesn't complain)

View File

@ -59,6 +59,9 @@ allprojects {
}
subprojects {
// initialize empty, because no checks for benchmark-jmh module.
ext.jarInfos = []
// Configure jarValidation configuration for all projects. Any dependency
// declared on this configuration (or any configuration it extends from) will
// be verified.

View File

@ -61,6 +61,7 @@ Otherwise you are stuck wrestling down full dependencies of OpenJDK (metal etc)
Also you must run benchmarks as root to use dtrace, but it works.
$ git clone --depth 1 https://github.com/openjdk/jdk/
$ curl -f https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz | tar -zxf -
$ curl -fo jdk/src/utils/hsdis/binutils/Makefile https://raw.githubusercontent.com/openjdk/jdk/3c7ae1225f0d5575fd927a9b76fb40dc30e208cd/src/utils/hsdis/Makefile
$ vi jdk/src/utils/hsdis/binutils/Makefile, change SOURCE = hsdis.c to SOURCE = hsdis-binutils.c
$ vi jdk/src/utils/hsdis/binutils/hsdis-binutils.c, change #include "hsdis.h" to #include "../hsdis.h"

View File

@ -7,7 +7,6 @@ http://s.apache.org/luceneversions
API Changes
---------------------
* LUCENE-12092: Remove deprecated UTF8TaxonomyWriterCache. Please use LruTaxonomyWriterCache
instead. (Vigya Sharma)
@ -62,10 +61,21 @@ API Changes
* GITHUB#12599: Add RandomAccessInput#readBytes method to the RandomAccessInput interface. (Ignacio Vera)
* GITHUB#12709: Consolidate FSTStore and BytesStore in FST. Created FSTReader which contains the common methods
of the two (Anh Dung Bui)
* GITHUB#11023: Adding -level param to CheckIndex, making the old -fast param the default behaviour. (Jakub Slowinski)
* GITHUB#12735: Remove FSTCompiler#getTermCount() and FSTCompiler.UnCompiledNode#inputCount (Anh Dung Bui)
* GITHUB#12873: Expressions module now uses MethodHandles to define custom functions. Support for
custom classloaders was removed. (Uwe Schindler)
* GITHUB#12243: Remove TermInSetQuery ctors taking varargs param. SortedSetDocValuesField#newSlowSetQuery,
SortedDocValuesField#newSlowSetQuery, KeywordField#newSetQuery now take a collection. (Jakub Slowinski)
* GITHUB#12881: Performance improvements to MatchHighlighter and MatchRegionRetriever. MatchRegionRetriever can be
configured to not load matches (or content) of certain fields and to force-load other fields so that stored fields
of a document are accessed once. A configurable limit of field matches placed in the priority queue was added
(allows handling long fields with lots of hits more gracefully). MatchRegionRetriever utilizes IndexSearcher's
executor to extract hit offsets concurrently. (Dawid Weiss)
* GITHUB#12855: Remove deprecated DrillSideways#createDrillDownFacetsCollector extension method. (Greg Miller)
New Features
---------------------
@ -89,18 +99,17 @@ Improvements
* GITHUB#12447: Hunspell: speed up the dictionary enumeration on suggestion (Peter Gromov)
* GITHUB#12542: FSTCompiler can now approximately limit how much RAM it uses to share
suffixes during FST construction using the suffixRAMLimitMB method. Larger values
result in a more minimal FST (more common suffixes are shared). Pass
Double.POSITIVE_INFINITY to use as much RAM as is needed to create a purely
minimal FST. Inspired by this Rust FST implementation:
https://blog.burntsushi.net/transducers (Mike McCandless)
* GITHUB#12873: Expressions module now uses JEP 371 "Hidden Classes" with JEP 309
"Dynamic Class-File Constants" to implement Javascript expressions. (Uwe Schindler)
Optimizations
---------------------
* GITHUB#11857, GITHUB#11859, GITHUB#11893, GITHUB#11909: Hunspell: improved suggestion performance (Peter Gromov)
* GITHUB#12825, GITHUB#12834: Hunspell: improved dictionary loading performance, allowed in-memory entry sorting.
(Peter Gromov)
* GITHUB#12372: Reduce allocation during HNSW construction (Jonathan Ellis)
* GITHUB#12408: Lazy initialization improvements for Facets implementations when there are segments with no hits
@ -116,6 +125,9 @@ Bug Fixes
* GITHUB#12220: Hunspell: disallow hidden title-case entries from compound middle/end
* GITHUB#12878: Fix the declared Exceptions of Expression#evaluate() to match those
of DoubleValues#doubleValue(). (Uwe Schindler)
Other
---------------------
@ -142,6 +154,48 @@ Other
* GITHUB#12239: Hunspell: reduced suggestion set dependency on the hash table order (Peter Gromov)
* GITHUB#9049: Fixing bug in UnescapedCharSequence#toStringEscaped() (Jakub Slowinski)
======================== Lucene 9.10.0 =======================
API Changes
---------------------
* GITHUB#12243: Mark TermInSetQuery ctors with varargs terms as @Deprecated. SortedSetDocValuesField#newSlowSetQuery,
SortedDocValuesField#newSlowSetQuery, KeywordField#newSetQuery now take a collection of terms as a param. (Jakub Slowinski)
* GITHUB#11041: Deprecate IndexSearcher#search(Query, Collector) in favor of
IndexSearcher#search(Query, CollectorManager) for TopFieldCollectorManager
and TopScoreDocCollectorManager. (Zach Chen, Adrien Grand, Michael McCandless, Greg Miller, Luca Cavanna)
* GITHUB#12854: Mark DrillSideways#createDrillDownFacetsCollector as @Deprecated. (Greg Miller)
New Features
---------------------
(No changes)
Improvements
---------------------
* GITHUB#12870: Tighten synchronized loop in DirectoryTaxonomyReader#getOrdinal. (Stefan Vodita)
* GITHUB#12812: Avoid overflows and false negatives in int slice buffer filled-with-zeros assertion. (Stefan Vodita)
Optimizations
---------------------
(No changes)
Bug Fixes
---------------------
* GITHUB#12866: Prevent extra similarity computation for single-level HNSW graphs. (Kaival Parikh)
* GITHUB#12558: Ensure #finish is called on all drill-sideways FacetsCollectors even when no hits are scored.
(Greg Miller)
Other
---------------------
* GITHUB#11023: Removing some dead code in CheckIndex. (Jakub Slowinski)
======================== Lucene 9.9.0 =======================
API Changes
@ -157,9 +211,6 @@ API Changes
* GITHUB#12592: Add RandomAccessInput#length method to the RandomAccessInput interface. In addition deprecate
ByteBuffersDataInput#size in favour of this new method. (Ignacio Vera)
* GITHUB#12646, GITHUB#12690: Move FST#addNode to FSTCompiler to avoid a circular dependency
between FST and FSTCompiler (Anh Dung Bui)
* GITHUB#12718: Make IndexSearcher#getSlices final as it is not expected to be overridden (Luca Cavanna)
* GITHUB#12427: Automata#makeStringUnion #makeBinaryStringUnion now accept Iterable<BytesRef> instead of
@ -169,6 +220,25 @@ API Changes
* GITHUB#12180: Add TaxonomyReader#getBulkOrdinals method to more efficiently retrieve facet ordinals for multiple
FacetLabel at once. (Egor Potemkin)
* GITHUB#12816: Add HumanReadableQuery which takes a description parameter for debugging purposes. (Jakub Slowinski)
* GITHUB#12646, GITHUB#12690: Move FST#addNode to FSTCompiler to avoid a circular dependency
between FST and FSTCompiler (Anh Dung Bui)
* GITHUB#12709: Consolidate FSTStore and BytesStore in FST. Created FSTReader which contains the common methods
of the two (Anh Dung Bui)
* GITHUB#12735: Remove FSTCompiler#getTermCount() and FSTCompiler.UnCompiledNode#inputCount (Anh Dung Bui)
* GITHUB#12695: Remove public constructor of FSTCompiler. Please use FSTCompiler.Builder
instead. (Juan M. Caicedo)
* GITHUB#12799: Make TaskExecutor constructor public and use TaskExecutor for concurrent
HNSW graph build. (Shubham Chaudhary)
* GITHUB#12758, GITHUB#12803: Remove FST constructor with DataInput for metadata. Please
use the constructor with FSTMetadata instead. (Anh Dung Bui)
New Features
---------------------
@ -225,6 +295,22 @@ Improvements
* GITHUB#12754: Refactor lookup of Hotspot VM options and do not initialize constants with NULL
if SecurityManager prevents access. (Uwe Schindler)
* GITHUB#12801: Remove possible contention on a ReentrantReadWriteLock in
Monitor which could result in searches waiting for commits. (Davis Cook)
* GITHUB#11277, LUCENE-10241: Upgrade OpenNLP to 1.9.4. (Jeff Zemerick)
* GITHUB#12542: FSTCompiler can now approximately limit how much RAM it uses to share
suffixes during FST construction using the suffixRAMLimitMB method. Larger values
result in a more minimal FST (more common suffixes are shared). Pass
Double.POSITIVE_INFINITY to use as much RAM as is needed to create a purely
minimal FST. Inspired by this Rust FST implementation:
https://blog.burntsushi.net/transducers (Mike McCandless)
* GITHUB#12738: NodeHash now stores the FST nodes data instead of just node addresses (Anh Dung Bui)
* GITHUB#12847: Test2BFST now reports the time it took to build the FST and the real FST size (Anh Dung Bui)
Optimizations
---------------------
* GITHUB#12183: Make TermStates#build concurrent. (Shubham Chaudhary)
@ -276,10 +362,14 @@ Optimizations
* GITHUB#12381: Skip docs with DocValues in NumericLeafComparator. (Lu Xugang, Adrien Grand)
* GITHUB#12748: Specialize arc store for continuous label in FST. (Guo Feng, Chao Zhang)
* GITHUB#12784: Cache buckets to speed up BytesRefHash#sort. (Guo Feng)
* GITHUB#12806: Utilize exact kNN search when gathering k >= numVectors in a segment (Ben Trent)
* GITHUB#12782: Use group-varint encoding for the tail of postings. (Adrien Grand, Zhang Chao)
* GITHUB#12748: Specialize arc store for continuous label in FST. (Guo Feng, Zhang Chao)
Changes in runtime behavior
---------------------
@ -311,6 +401,11 @@ Bug Fixes
* GITHUB#12770: Stop exploring HNSW graph if scores are not getting better. (Ben Trent)
* GITHUB#12640: Ensure #finish is called on all drill-sideways collectors even if one throws a
CollectionTerminatedException (Greg Miller)
* GITHUB#12626: Fix segmentInfos replace to set userData (Shibi Balamurugan, Uwe Schindler, Marcus Eagan, Michael Froh)
Build
---------------------
@ -324,9 +419,15 @@ Build
* GITHUB#12655: Upgrade to Gradle 8.4 (Kevin Risden)
* GITHUB#12845: Only enable support for tests.profile if jdk.jfr module is available
in Gradle runtime. (Uwe Schindler)
Other
---------------------
* GITHUB#12817: Add demo for faceting with StringValueFacetCounts over KeywordField and SortedDocValuesField.
(Stefan Vodita)
* GITHUB#12657: Internal refactor of HNSW graph merging (Ben Trent).
* GITHUB#12625: Refactor ByteBlockPool so it is just a "shift/mask big array". (Ignacio Vera)
@ -336,6 +437,8 @@ Other
overflows and slices that are too large. Some bits of code are simplified. Documentation is updated and expanded.
(Stefan Vodita)
* GITHUB#12762: Refactor BKD HeapPointWriter to hide the internal data structure. (Ignacio Vera)
======================== Lucene 9.8.0 =======================
API Changes
@ -364,6 +467,8 @@ New Features
* GITHUB#12479: Add new Maximum Inner Product vector similarity function for non-normalized dot-product
vector search. (Jack Mazanec, Ben Trent)
* GITHUB#12525: `WordDelimiterGraphFilterFactory` now supports the `ignoreKeywords` flag (Thomas De Craemer)
* GITHUB#12489: Add support for recursive graph bisection, also called
bipartite graph partitioning, and often abbreviated BP, an algorithm for
reordering doc IDs that results in more compact postings and faster queries,
@ -386,7 +491,7 @@ Improvements
Optimizations
---------------------
* GITHUB#12377: Avoid redundant loop for compute min value in DirectMonotonicWriter. (Chao Zhang)
* GITHUB#12377: Avoid redundant loop for compute min value in DirectMonotonicWriter. (Zhang Chao)
* GITHUB#12361: Faster top-level disjunctions sorted by descending score.
(Adrien Grand)
@ -401,7 +506,7 @@ Optimizations
* GITHUB#12385: Restore parallel knn query rewrite across segments rather than slices (Luca Cavanna)
* GITHUB#12381: Speed up NumericDocValuesWriter with index sorting. (Chao Zhang)
* GITHUB#12381: Speed up NumericDocValuesWriter with index sorting. (Zhang Chao)
* GITHUB#12453: Faster bulk numeric reads from BufferedIndexInput (Armin Braun)
@ -468,7 +573,7 @@ Other
* GITHUB#12428: Replace consecutive close() calls and close() calls with null checks with IOUtils.close().
(Shubham Chaudhary)
* GITHUB#12512: Remove unused variable in BKDWriter. (Chao Zhang)
* GITHUB#12512: Remove unused variable in BKDWriter. (Zhang Chao)
======================== Lucene 9.7.0 =======================

View File

@ -19,6 +19,11 @@
## Migration from Lucene 9.x to Lucene 10.0
### Minor API changes in MatchHighlighter and MatchRegionRetriever. (GITHUB#12881)
The API of interfaces for accepting highlights has changed to allow performance improvements. Look at the issue and the PR diff to get
a sense of what's changed (changes are minor).
### Removed deprecated IndexSearcher.doc, IndexReader.document, IndexReader.getTermVectors (GITHUB#11998)
The deprecated Stored Fields and Term Vectors apis relied upon threadlocal storage and have been removed.
@ -101,6 +106,34 @@ The deprecated getter for the `Executor` that was optionally provided to the `In
has been removed. Users that want to execute concurrent tasks should rely instead on the `TaskExecutor`
that the searcher holds, retrieved via `IndexSearcher#getTaskExecutor`.
### CheckIndex params -slow and -fast are deprecated, replaced by -level X (GITHUB#11023)
The former `-fast` behaviour of `CheckIndex`, which performs checksum checks only, is now the default.
Added a new parameter: `-level X`, to set the detail level of the index check. The higher the value, the more checks are performed.
Sample `-level` usage: `1` (Default) - Checksum checks only, `2` - all level 1 checks as well as logical integrity checks, `3` - all
level 2 checks as well as slow checks.
### Expressions module now uses `MethodHandle` and hidden classes (GITHUB#12873)
Custom functions in the expressions module must now be passed in a `Map` using `MethodHandle` as values.
To convert legacy code using maps of reflective `java.lang.reflect.Method`, use the converter method
`JavascriptCompiler#convertLegacyFunctions`. This should make the mapping mostly compatible.
The use of `MethodHandle` and [Dynamic Class-File Constants (JEP 309)](https://openjdk.org/jeps/309)
now also allows passing private methods or methods from different classloaders. It is also possible
to adapt guards or filters using the `MethodHandles` class.
The new implementation of the Javascript expressions compiler no longer supports use of custom
`ClassLoader`, because it uses the new JDK 15 feature [hidden classes (JEP 371)](https://openjdk.org/jeps/371).
Due to the use of `MethodHandle`, classloader isolation is no longer needed, because JS code can only call
MHs that were resolved by the application before using the expressions module.
### `Expression#evaluate()` declares to throw IOException (GITHUB#12878)
The expressions module has changed the `Expression#evaluate()` method signature:
It now declares that it may throw `IOException`. This was an oversight because
compiled expressions call `DoubleValues#doubleValue` behind the scenes, which
may throw `IOException` on index problems, bubbling up unexpectedly to the caller.
## Migration from Lucene 9.0 to Lucene 9.1
### Test framework package migration and module (LUCENE-10301)

View File

@ -105,7 +105,8 @@ public class NormalizeCharMap {
final FST<CharsRef> map;
try {
final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
final FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
final FSTCompiler<CharsRef> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, outputs).build();
final IntsRefBuilder scratch = new IntsRefBuilder();
for (Map.Entry<String, String> ent : pendingPairs.entrySet()) {
fstCompiler.add(Util.toUTF16(ent.getKey(), scratch), new CharsRef(ent.getValue()));

View File

@ -777,7 +777,6 @@ class KStemmer {
/**
 * Returns {@code j + 1}.
 *
 * <p>NOTE(review): presumably {@code j} is the index of the last character of the current stem,
 * which would make this the stem's length in characters — confirm against the field's definition.
 */
private int stemLength() {
return j + 1;
}
;
private boolean endsIn(char[] s) {
if (s.length > k) return false;

View File

@ -40,7 +40,8 @@ class ConvTable {
try {
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
FSTCompiler<CharsRef> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, outputs).build();
IntsRefBuilder scratchInts = new IntsRefBuilder();
for (Map.Entry<String, String> entry : mappings.entrySet()) {
String key = entry.getKey();

View File

@ -50,18 +50,12 @@ import java.util.Set;
import java.util.TreeMap;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.analysis.hunspell.SortingStrategy.EntryAccumulator;
import org.apache.lucene.analysis.hunspell.SortingStrategy.EntrySupplier;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.IntSequenceOutputs;
@ -215,6 +209,25 @@ public class Dictionary {
List<InputStream> dictionaries,
boolean ignoreCase)
throws IOException, ParseException {
this(affix, dictionaries, ignoreCase, SortingStrategy.offline(tempDir, tempFileNamePrefix));
}
/**
* Creates a new Dictionary containing the information read from the provided InputStreams to
* hunspell affix and dictionary files. You have to close the provided InputStreams yourself.
*
* @param affix InputStream for reading the hunspell affix file (won't be closed).
* @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed).
* @param sortingStrategy the entry strategy for the dictionary loading
* @throws IOException Can be thrown while reading from the InputStreams
* @throws ParseException Can be thrown if the content of the files does not meet expected formats
*/
public Dictionary(
InputStream affix,
List<InputStream> dictionaries,
boolean ignoreCase,
SortingStrategy sortingStrategy)
throws IOException, ParseException {
this.ignoreCase = ignoreCase;
try (BufferedInputStream affixStream =
@ -250,10 +263,11 @@ public class Dictionary {
readAffixFile(affixStream, decoder, flagEnumerator);
// read dictionary entries
IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
int wordCount = mergeDictionaries(dictionaries, decoder, unsorted);
String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted);
words = readSortedDictionaries(tempDir, sortedFile, flagEnumerator, wordCount);
EntryAccumulator acc = sortingStrategy.start();
mergeDictionaries(dictionaries, decoder, acc);
try (EntrySupplier sorted = acc.finishAndSort()) {
words = readSortedDictionaries(flagEnumerator, sorted);
}
flagLookup = flagEnumerator.finish();
aliases = null; // no longer needed
morphAliases = null; // no longer needed
@ -631,7 +645,8 @@ public class Dictionary {
private FST<IntsRef> affixFST(TreeMap<String, List<Integer>> affixes) throws IOException {
IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
FSTCompiler<IntsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
FSTCompiler<IntsRef> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).build();
IntsRefBuilder scratch = new IntsRefBuilder();
for (Map.Entry<String, List<Integer>> entry : affixes.entrySet()) {
Util.toUTF32(entry.getKey(), scratch);
@ -984,12 +999,10 @@ public class Dictionary {
}
}
private int mergeDictionaries(
List<InputStream> dictionaries, CharsetDecoder decoder, IndexOutput output)
private void mergeDictionaries(
List<InputStream> dictionaries, CharsetDecoder decoder, EntryAccumulator acc)
throws IOException {
StringBuilder sb = new StringBuilder();
int wordCount = 0;
try (ByteSequencesWriter writer = new ByteSequencesWriter(output)) {
for (InputStream dictionary : dictionaries) {
BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
lines.readLine(); // first line is number of entries (approximately, sometimes)
@ -1006,30 +1019,23 @@ public class Dictionary {
int morphStart = line.indexOf(MORPH_SEPARATOR);
if (morphStart >= 0) {
String data = line.substring(morphStart + 1);
hasCustomMorphData =
splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
hasCustomMorphData = splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
}
}
wordCount += writeNormalizedWordEntry(sb, writer, line);
writeNormalizedWordEntry(sb, line, acc);
}
}
CodecUtil.writeFooter(output);
}
return wordCount;
}
/**
* @return the number of word entries written
*/
private int writeNormalizedWordEntry(StringBuilder reuse, ByteSequencesWriter writer, String line)
private void writeNormalizedWordEntry(StringBuilder reuse, String line, EntryAccumulator acc)
throws IOException {
int flagSep = line.indexOf(FLAG_SEPARATOR);
int morphSep = line.indexOf(MORPH_SEPARATOR);
assert morphSep > 0;
assert morphSep > flagSep;
int sep = flagSep < 0 ? morphSep : flagSep;
if (sep == 0) return 0;
if (sep == 0) return;
CharSequence toWrite;
String beforeSep = line.substring(0, sep);
@ -1043,19 +1049,16 @@ public class Dictionary {
String written = toWrite.toString();
sep = written.length() - (line.length() - sep);
writer.write(written.getBytes(StandardCharsets.UTF_8));
acc.addEntry(written);
WordCase wordCase = WordCase.caseOf(written, sep);
if (wordCase == WordCase.MIXED || wordCase == WordCase.UPPER && flagSep > 0) {
addHiddenCapitalizedWord(reuse, writer, written.substring(0, sep), written.substring(sep));
return 2;
addHiddenCapitalizedWord(reuse, acc, written.substring(0, sep), written.substring(sep));
}
return 1;
}
private void addHiddenCapitalizedWord(
StringBuilder reuse, ByteSequencesWriter writer, String word, String afterSep)
throws IOException {
StringBuilder reuse, EntryAccumulator acc, String word, String afterSep) throws IOException {
reuse.setLength(0);
reuse.append(Character.toUpperCase(word.charAt(0)));
for (int i = 1; i < word.length(); i++) {
@ -1064,7 +1067,7 @@ public class Dictionary {
reuse.append(FLAG_SEPARATOR);
reuse.append(HIDDEN_FLAG);
reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length());
writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
acc.addEntry(reuse.toString());
}
String toLowerCase(String word) {
@ -1084,83 +1087,21 @@ public class Dictionary {
return new String(chars);
}
private String sortWordsOffline(
Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException {
OfflineSorter sorter =
new OfflineSorter(
tempDir,
tempFileNamePrefix,
new Comparator<>() {
final BytesRef scratch1 = new BytesRef();
final BytesRef scratch2 = new BytesRef();
private void initScratch(BytesRef o, BytesRef scratch) {
scratch.bytes = o.bytes;
scratch.offset = o.offset;
scratch.length = o.length;
for (int i = scratch.length - 1; i >= 0; i--) {
if (scratch.bytes[scratch.offset + i] == FLAG_SEPARATOR
|| scratch.bytes[scratch.offset + i] == MORPH_SEPARATOR) {
scratch.length = i;
break;
}
}
}
@Override
public int compare(BytesRef o1, BytesRef o2) {
initScratch(o1, scratch1);
initScratch(o2, scratch2);
int cmp = scratch1.compareTo(scratch2);
if (cmp == 0) {
// tie break on whole row
return o1.compareTo(o2);
} else {
return cmp;
}
}
});
String sorted;
boolean success = false;
try {
sorted = sorter.sort(unsorted.getName());
success = true;
} finally {
if (success) {
tempDir.deleteFile(unsorted.getName());
} else {
IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
}
}
return sorted;
}
private WordStorage readSortedDictionaries(
Directory tempDir, String sorted, FlagEnumerator flags, int wordCount) throws IOException {
boolean success = false;
private WordStorage readSortedDictionaries(FlagEnumerator flags, EntrySupplier sorted)
throws IOException {
Map<String, Integer> morphIndices = new HashMap<>();
WordStorage.Builder builder =
new WordStorage.Builder(
wordCount, hashFactor(), hasCustomMorphData, flags, allNonSuggestibleFlags());
try (ByteSequencesReader reader =
new ByteSequencesReader(tempDir.openChecksumInput(sorted), sorted)) {
sorted.wordCount(), hashFactor(), hasCustomMorphData, flags, allNonSuggestibleFlags());
// TODO: the flags themselves can be double-chars (long) or also numeric
// either way the trick is to encode them as char... but they must be parsed differently
while (true) {
BytesRef scratch = reader.next();
if (scratch == null) {
break;
}
String line = sorted.next();
if (line == null) break;
String line = scratch.utf8ToString();
String entry;
char[] wordForm;
int end;
@ -1200,21 +1141,12 @@ public class Dictionary {
builder.add(entry, wordForm, morphDataID);
}
// finalize last entry
success = true;
return new WordStorage(builder) {
@Override
char caseFold(char c) {
return Dictionary.this.caseFold(c);
}
};
} finally {
if (success) {
tempDir.deleteFile(sorted);
} else {
IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
}
}
}
/**

View File

@ -0,0 +1,181 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import java.io.Closeable;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefComparator;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
/**
* The strategy defining how a Hunspell dictionary should be loaded, with different tradeoffs. The
* entries should be sorted in a special way, and this can be done either in-memory (faster, but
* temporarily allocating more memory) or using disk (slower, but not needing much memory).
*
* @see #offline(Directory, String)
* @see #inMemory()
*/
public abstract class SortingStrategy {

  /**
   * Starts a new sorting session.
   *
   * @return an accumulator that receives the entries to be sorted
   * @throws IOException if the strategy fails to prepare its storage
   */
  abstract EntryAccumulator start() throws IOException;

  /** Receives the unsorted entries one by one and then hands out their sorted sequence. */
  interface EntryAccumulator {
    /** Registers one more entry to be sorted. */
    void addEntry(String entry) throws IOException;

    /** Sorts everything added so far and returns a supplier iterating over the result. */
    EntrySupplier finishAndSort() throws IOException;
  }

  /** A closeable, forward-only view of the sorted entries. */
  interface EntrySupplier extends Closeable {
    /** The number of entries that were added to the accumulator. */
    int wordCount();

    /** The next line or {@code null} if the end is reached */
    String next() throws IOException;
  }

  /**
   * An "offline" strategy that creates temporary files in the given directory and uses them for
   * sorting with {@link OfflineSorter}. It's slower than {@link #inMemory()}, but doesn't need to
   * load the entire dictionary into memory.
   *
   * @param tempDir the directory where the temporary files are created
   * @param tempFileNamePrefix the name prefix passed on for every temporary file
   */
  public static SortingStrategy offline(Directory tempDir, String tempFileNamePrefix) {
    return new SortingStrategy() {
      @Override
      EntryAccumulator start() throws IOException {
        IndexOutput output = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
        ByteSequencesWriter writer = new ByteSequencesWriter(output);
        return new EntryAccumulator() {
          int wordCount = 0;

          @Override
          public void addEntry(String entry) throws IOException {
            wordCount++;
            writer.write(entry.getBytes(StandardCharsets.UTF_8));
          }

          @Override
          public EntrySupplier finishAndSort() throws IOException {
            finishUnsortedFile();
            String sortedFile = sortWordsOffline();
            ByteSequencesReader reader = openSorted(sortedFile);
            return new EntrySupplier() {
              // becomes true only once the reader has been consumed up to its end
              boolean success = false;

              @Override
              public int wordCount() {
                return wordCount;
              }

              @Override
              public String next() throws IOException {
                BytesRef scratch = reader.next();
                if (scratch == null) {
                  success = true;
                  return null;
                }
                return scratch.utf8ToString();
              }

              @Override
              public void close() throws IOException {
                try {
                  reader.close();
                } finally {
                  // the temporary file has to go even when closing the reader fails
                  if (success) {
                    tempDir.deleteFile(sortedFile);
                  } else {
                    IOUtils.deleteFilesIgnoringExceptions(tempDir, sortedFile);
                  }
                }
              }
            };
          }

          /**
           * Writes the footer and closes the writer. The writer is closed even if writing the
           * footer fails, and in any failure case the unsorted file is removed on a best-effort
           * basis.
           */
          private void finishUnsortedFile() throws IOException {
            boolean written = false;
            try {
              try (writer) {
                CodecUtil.writeFooter(output);
              }
              written = true;
            } finally {
              if (!written) {
                IOUtils.deleteFilesIgnoringExceptions(tempDir, output.getName());
              }
            }
          }

          /** Opens the sorted file for reading, removing it if it cannot be opened. */
          private ByteSequencesReader openSorted(String sortedFile) throws IOException {
            boolean opened = false;
            try {
              ByteSequencesReader reader =
                  new ByteSequencesReader(tempDir.openChecksumInput(sortedFile), sortedFile);
              opened = true;
              return reader;
            } finally {
              if (!opened) {
                IOUtils.deleteFilesIgnoringExceptions(tempDir, sortedFile);
              }
            }
          }

          /**
           * Sorts the unsorted file with {@link OfflineSorter} and deletes it afterwards, whether
           * or not sorting succeeded.
           *
           * @return the name of the file holding the sorted entries
           */
          private String sortWordsOffline() throws IOException {
            var sorter = new OfflineSorter(tempDir, tempFileNamePrefix, BytesRefComparator.NATURAL);

            String sorted;
            boolean success = false;
            try {
              sorted = sorter.sort(output.getName());
              success = true;
            } finally {
              if (success) {
                tempDir.deleteFile(output.getName());
              } else {
                IOUtils.deleteFilesIgnoringExceptions(tempDir, output.getName());
              }
            }
            return sorted;
          }
        };
      }
    };
  }

  /**
   * The strategy that loads all entries as {@link String} objects and sorts them in memory. The
   * entries are then stored in a more compressed way, and the strings are gc-ed, but the loading
   * itself needs {@code O(dictionary_size)} memory.
   */
  public static SortingStrategy inMemory() {
    return new SortingStrategy() {
      @Override
      EntryAccumulator start() {
        List<String> entries = new ArrayList<>();
        return new EntryAccumulator() {
          @Override
          public void addEntry(String entry) {
            entries.add(entry);
          }

          @Override
          public EntrySupplier finishAndSort() {
            entries.sort(Comparator.naturalOrder());
            return new EntrySupplier() {
              // index of the entry returned by the following next() call
              int i = 0;

              @Override
              public int wordCount() {
                return entries.size();
              }

              @Override
              public String next() {
                return i < entries.size() ? entries.get(i++) : null;
              }

              @Override
              public void close() {}
            };
          }
        };
      }
    };
  }
}

View File

@ -350,16 +350,19 @@ abstract class WordStorage {
currentOrds.clear();
boolean hasNonHidden = false;
boolean isSuggestible = false;
for (char[] flags : group) {
if (!hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
hasNonHidden = true;
break;
}
if (!hasNoSuggestFlag(flags)) {
isSuggestible = true;
}
}
for (int i = 0; i < group.size(); i++) {
char[] flags = group.get(i);
if (hasNonHidden && hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
if (hasNonHidden && group.size() > 1 && hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
continue;
}
@ -388,7 +391,7 @@ abstract class WordStorage {
int mask =
(prevCode == 0 ? 0 : COLLISION_MASK)
| (group.stream().anyMatch(flags -> !hasNoSuggestFlag(flags)) ? SUGGESTIBLE_MASK : 0)
| (isSuggestible ? SUGGESTIBLE_MASK : 0)
| Math.min(currentEntry.length(), MAX_STORED_LENGTH);
hashTable[hash] = (mask << OFFSET_BITS) | pos;

View File

@ -210,7 +210,8 @@ public final class StemmerOverrideFilter extends TokenFilter {
*/
public StemmerOverrideMap build() throws IOException {
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
FSTCompiler<BytesRef> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).build();
final int[] sort = hash.sort();
IntsRefBuilder intsSpare = new IntsRefBuilder();
final int size = hash.size();

View File

@ -46,11 +46,11 @@ public class TruncateTokenFilterFactory extends TokenFilterFactory {
public static final String NAME = "truncate";
public static final String PREFIX_LENGTH_KEY = "prefixLength";
private final byte prefixLength;
private final int prefixLength;
public TruncateTokenFilterFactory(Map<String, String> args) {
super(args);
prefixLength = Byte.parseByte(get(args, PREFIX_LENGTH_KEY, "5"));
prefixLength = Integer.parseInt(get(args, PREFIX_LENGTH_KEY, "5"));
if (prefixLength < 1)
throw new IllegalArgumentException(
PREFIX_LENGTH_KEY + " parameter must be a positive number: " + prefixLength);

View File

@ -163,7 +163,6 @@ public final class WordDelimiterFilter extends TokenFilter {
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
;
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAttribute =
addAttribute(PositionIncrementAttribute.class);

View File

@ -164,7 +164,6 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
;
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAttribute =
addAttribute(PositionIncrementAttribute.class);

View File

@ -45,7 +45,7 @@ import org.apache.lucene.util.ResourceLoaderAware;
* preserveOriginal="0" splitOnNumerics="1" splitOnCaseChange="1"
* catenateWords="0" catenateNumbers="0" catenateAll="0"
* generateWordParts="1" generateNumberParts="1" stemEnglishPossessive="1"
* types="wdfftypes.txt" /&gt;
* types="wdfftypes.txt" ignoreKeywords="0" /&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
@ -100,6 +100,9 @@ public class WordDelimiterGraphFilterFactory extends TokenFilterFactory
if (getInt(args, "stemEnglishPossessive", 1) != 0) {
flags |= STEM_ENGLISH_POSSESSIVE;
}
if (getInt(args, "ignoreKeywords", 0) != 0) {
flags |= IGNORE_KEYWORDS;
}
wordFiles = get(args, PROTECTED_TOKENS);
types = get(args, TYPES);
this.flags = flags;

View File

@ -216,7 +216,6 @@ public final class SynonymFilter extends TokenFilter {
count++;
}
}
;
private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();

View File

@ -222,7 +222,8 @@ public class SynonymMap {
public SynonymMap build() throws IOException {
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
// TODO: are we using the best sharing options?
FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
FSTCompiler<BytesRef> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).build();
BytesRefBuilder scratch = new BytesRefBuilder();
ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

View File

@ -595,8 +595,7 @@ public class TestHTMLStripCharFilter extends BaseTokenStreamTestCase {
}
}
Reader reader = new HTMLStripCharFilter(new StringReader(text.toString()));
while (reader.read() != -1)
;
while (reader.read() != -1) {}
}
public void testUTF16Surrogates() throws Exception {

View File

@ -230,7 +230,6 @@ public class TestDuelingAnalyzers extends BaseTokenStreamTestCase {
assertEquals(
"wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
}
;
assertFalse("wrong number of tokens for input: " + s, right.incrementToken());
left.end();
right.end();

View File

@ -41,7 +41,6 @@ import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.tests.store.BaseDirectoryWrapper;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.LuceneTestCase.SuppressSysoutChecks;
import org.apache.lucene.tests.util.RamUsageTester;
@ -72,9 +71,8 @@ public class TestAllDictionaries extends LuceneTestCase {
Path dic = Path.of(affPath.substring(0, affPath.length() - 4) + ".dic");
assert Files.exists(dic) : dic;
try (InputStream dictionary = Files.newInputStream(dic);
InputStream affix = Files.newInputStream(aff);
BaseDirectoryWrapper tempDir = newDirectory()) {
return new Dictionary(tempDir, "dictionary", affix, dictionary) {
InputStream affix = Files.newInputStream(aff)) {
return new Dictionary(affix, List.of(dictionary), false, SortingStrategy.inMemory()) {
@Override
protected boolean tolerateAffixRuleCountMismatches() {
return true;

View File

@ -256,15 +256,22 @@ public class TestSpellChecking extends LuceneTestCase {
}
static void checkSpellCheckerExpectations(Path basePath) throws IOException, ParseException {
InputStream affixStream = Files.newInputStream(Path.of(basePath.toString() + ".aff"));
checkSpellCheckerExpectations(
basePath, SortingStrategy.offline(new ByteBuffersDirectory(), "dictionary"));
checkSpellCheckerExpectations(basePath, SortingStrategy.inMemory());
}
private static void checkSpellCheckerExpectations(Path basePath, SortingStrategy strategy)
throws IOException, ParseException {
Path affFile = Path.of(basePath + ".aff");
Path dicFile = Path.of(basePath + ".dic");
InputStream affixStream = Files.newInputStream(affFile);
InputStream dictStream = Files.newInputStream(dicFile);
Hunspell speller;
Map<String, Suggester> suggesters = new LinkedHashMap<>();
try {
Dictionary dictionary =
new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream);
Dictionary dictionary = new Dictionary(affixStream, List.of(dictStream), false, strategy);
speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
Suggester suggester = new Suggester(dictionary);
suggesters.put("default", suggester);

View File

@ -41,7 +41,6 @@ public class TestIndicNormalizer extends BaseTokenStreamTestCase {
private void check(String input, String output) throws IOException {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
;
tokenizer.setReader(new StringReader(input));
TokenFilter tf = new IndicNormalizationFilter(tokenizer);
assertTokenStreamContents(tf, new String[] {output});

View File

@ -89,7 +89,6 @@ public class TestKeywordMarkerFilterFactory extends BaseTokenStreamFactoryTestCa
stream =
tokenFilterFactory("KeywordMarker", "pattern", "Cats", "ignoreCase", "true").create(stream);
stream = tokenFilterFactory("PorterStem").create(stream);
;
assertTokenStreamContents(stream, new String[] {"dog", "cats", "Cats"});
}

View File

@ -68,4 +68,23 @@ public class TestTruncateTokenFilterFactory extends BaseTokenStreamFactoryTestCa
TruncateTokenFilterFactory.PREFIX_LENGTH_KEY
+ " parameter must be a positive number: -5"));
}
/**
 * Test that a {@code prefixLength} argument larger than {@link Byte#MAX_VALUE} (here: 128) is
 * accepted by the factory and applied to the tokens.
 */
public void testLengthGreaterThanByteLimitArgument() throws Exception {
// The first whitespace-separated token is longer than 128 characters; the other two are short.
Reader reader =
new StringReader(
"abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvw128characters From here");
TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
((Tokenizer) stream).setReader(reader);
stream =
tokenFilterFactory("Truncate", TruncateTokenFilterFactory.PREFIX_LENGTH_KEY, "128")
.create(stream);
// Expect the long token cut down to its first 128 characters and the short ones unchanged.
assertTokenStreamContents(
stream,
new String[] {
"abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvw1",
"From",
"here"
});
}
}

View File

@ -69,7 +69,6 @@ public class TestEdgeNGramTokenizer extends BaseTokenStreamTestCase {
public void testOversizedNgrams() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(6, 6);
tokenizer.setReader(input);
;
assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
}

View File

@ -156,7 +156,6 @@ public class TestCharArrayIterator extends LuceneTestCase {
private void consume(BreakIterator bi, CharacterIterator ci) {
bi.setText(ci);
while (bi.next() != BreakIterator.DONE)
;
while (bi.next() != BreakIterator.DONE) {}
}
}

View File

@ -16,6 +16,8 @@
*/
package org.apache.lucene.analysis.ja.dict;
import static org.apache.lucene.util.fst.FST.readMetadata;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
@ -103,7 +105,7 @@ public final class TokenInfoDictionary extends BinaryDictionary<TokenInfoMorphDa
FST<Long> fst;
try (InputStream is = new BufferedInputStream(fstResource.get())) {
DataInput in = new InputStreamDataInput(is);
fst = new FST<>(in, in, PositiveIntOutputs.getSingleton());
fst = new FST<>(readMetadata(in, PositiveIntOutputs.getSingleton()), in);
}
// TODO: some way to configure?
this.fst = new TokenInfoFST(fst, true);

View File

@ -101,7 +101,8 @@ class TokenInfoDictionaryBuilder {
lines.sort(Comparator.comparing(entry -> entry[0]));
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
FSTCompiler<Long> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput).build();
IntsRefBuilder scratch = new IntsRefBuilder();
long ord = -1; // first ord will be 0
String lastValue = null;

View File

@ -93,7 +93,8 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
List<int[]> segmentations = new ArrayList<>(featureEntries.size());
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
FSTCompiler<Long> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput).build();
IntsRefBuilder scratch = new IntsRefBuilder();
long ord = 0;

View File

@ -758,8 +758,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
for (int i = 0; i < numIterations; i++) {
try (TokenStream ts = analyzer.tokenStream("ignored", line)) {
ts.reset();
while (ts.incrementToken())
;
while (ts.incrementToken()) {}
ts.end();
}
}
@ -775,8 +774,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
for (String sentence : sentences) {
try (TokenStream ts = analyzer.tokenStream("ignored", sentence)) {
ts.reset();
while (ts.incrementToken())
;
while (ts.incrementToken()) {}
ts.end();
}
}
@ -831,8 +829,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
new JapaneseTokenizer(newAttributeFactory(), readDict(), false, Mode.NORMAL);
tokenizer.setReader(new StringReader(doc));
tokenizer.reset();
while (tokenizer.incrementToken())
;
while (tokenizer.incrementToken()) {}
}
public void testPatchedSystemDict() throws Exception {

View File

@ -16,6 +16,8 @@
*/
package org.apache.lucene.analysis.ko.dict;
import static org.apache.lucene.util.fst.FST.readMetadata;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
@ -102,7 +104,7 @@ public final class TokenInfoDictionary extends BinaryDictionary<TokenInfoMorphDa
FST<Long> fst;
try (InputStream is = new BufferedInputStream(fstResource.get())) {
DataInput in = new InputStreamDataInput(is);
fst = new FST<>(in, in, PositiveIntOutputs.getSingleton());
fst = new FST<>(readMetadata(in, PositiveIntOutputs.getSingleton()), in);
}
this.fst = new TokenInfoFST(fst);
}

View File

@ -94,7 +94,8 @@ class TokenInfoDictionaryBuilder {
lines.sort(Comparator.comparing(left -> left[0]));
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
FSTCompiler<Long> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput).build();
IntsRefBuilder scratch = new IntsRefBuilder();
long ord = -1; // first ord will be 0
String lastValue = null;

View File

@ -75,7 +75,8 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
entries.sort(Comparator.comparing(e -> e.split("\\s+")[0]));
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
FSTCompiler<Long> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput).build();
IntsRefBuilder scratch = new IntsRefBuilder();
String lastToken = null;

View File

@ -41,7 +41,6 @@ public class TestPolishAnalyzer extends BaseTokenStreamTestCase {
/** test use of exclusion set */
public void testExclude() throws IOException {
CharArraySet exclusionSet = new CharArraySet(asSet("studenta"), false);
;
Analyzer a = new PolishAnalyzer(PolishAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTerm(a, "studenta", "studenta");
checkOneTerm(a, "studenci", "student");

View File

@ -16,6 +16,8 @@
*/
package org.apache.lucene.backward_codecs.lucene40.blocktree;
import static org.apache.lucene.util.fst.FST.readMetadata;
import java.io.IOException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
@ -89,9 +91,17 @@ public final class FieldReader extends Terms {
final IndexInput clone = indexIn.clone();
clone.seek(indexStartFP);
if (metaIn == indexIn) { // Only true before Lucene 8.6
index = new FST<>(clone, clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore());
index =
new FST<>(
readMetadata(clone, ByteSequenceOutputs.getSingleton()),
clone,
new OffHeapFSTStore());
} else {
index = new FST<>(metaIn, clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore());
index =
new FST<>(
readMetadata(metaIn, ByteSequenceOutputs.getSingleton()),
clone,
new OffHeapFSTStore());
}
/*
if (false) {

View File

@ -22,6 +22,7 @@ import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
@ -70,7 +71,7 @@ public class TestManyPointsInOldIndex extends LuceneTestCase {
dir.setCheckIndexOnClose(false);
// ... because we check ourselves here:
TestUtil.checkIndex(dir, false, true, true, null);
TestUtil.checkIndex(dir, CheckIndex.Level.MIN_LEVEL_FOR_INTEGRITY_CHECKS, true, true, null);
dir.close();
}
}

View File

@ -23,6 +23,7 @@ description = 'Lucene JMH micro-benchmarking module'
dependencies {
moduleImplementation project(':lucene:core')
moduleImplementation project(':lucene:expressions')
moduleImplementation "org.openjdk.jmh:jmh-core:1.37"
annotationProcessor "org.openjdk.jmh:jmh-generator-annprocess:1.37"
@ -42,7 +43,7 @@ tasks.matching { it.name == "forbiddenApisMain" }.configureEach {
tasks.matching { it.name in [
// Turn off JMH dependency checksums and licensing (it's GPL w/ classpath exception
// but this seems fine for test/build only tools).
"validateJarChecksums", "validateJarLicenses",
"validateJarChecksums", "validateJarLicenses", "collectJarInfos",
// No special javadocs for JMH benchmarks.
"renderSiteJavadoc",
"renderJavadoc",

View File

@ -20,6 +20,7 @@ module org.apache.lucene.benchmark.jmh {
requires jmh.core;
requires jdk.unsupported;
requires org.apache.lucene.core;
requires org.apache.lucene.expressions;
exports org.apache.lucene.benchmark.jmh;
exports org.apache.lucene.benchmark.jmh.jmh_generated;

View File

@ -0,0 +1,148 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.jmh;
import java.io.IOException;
import java.lang.invoke.MethodHandle;
import java.lang.invoke.MethodHandles;
import java.lang.invoke.MethodType;
import java.text.ParseException;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.expressions.Expression;
import org.apache.lucene.expressions.js.JavascriptCompiler;
import org.apache.lucene.search.DoubleValues;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
@Warmup(iterations = 5, time = 5)
@Measurement(iterations = 12, time = 8)
@Fork(value = 1)
public class ExpressionsBenchmark {

  /**
   * Some extra functions to bench "identity" in various variants, another one is named
   * "native_identity" (see below).
   */
  private static final Map<String, MethodHandle> FUNCTIONS = getFunctions();

  private static final String NATIVE_IDENTITY_NAME = "native_identity";

  /**
   * Builds the function table handed to the compiler: the default functions plus
   * {@code func_identity} (bound to {@link #ident(double)}) and {@code mh_identity} (bound to
   * {@link MethodHandles#identity(Class)} for {@code double}).
   */
  private static Map<String, MethodHandle> getFunctions() {
    try {
      var lookup = MethodHandles.lookup();
      Map<String, MethodHandle> m = new HashMap<>(JavascriptCompiler.DEFAULT_FUNCTIONS);
      m.put(
          "func_identity",
          lookup.findStatic(
              lookup.lookupClass(), "ident", MethodType.methodType(double.class, double.class)));
      m.put("mh_identity", MethodHandles.identity(double.class));
      return m;
    } catch (ReflectiveOperationException e) {
      throw new AssertionError(e);
    }
  }

  /** Target of {@code func_identity}; only looked up reflectively in {@link #getFunctions()}. */
  @SuppressWarnings("unused")
  private static double ident(double v) {
    return v;
  }

  /** A native implementation of an expression to compare performance */
  private static final Expression NATIVE_IDENTITY_EXPRESSION =
      new Expression(NATIVE_IDENTITY_NAME, new String[] {"x"}) {
        @Override
        public double evaluate(DoubleValues[] functionValues) throws IOException {
          return functionValues[0].doubleValue();
        }
      };

  private double[] randomData;
  private Expression expression;

  /**
   * The expression under test. Every function variant is written as a call taking {@code x}, so
   * that the function registered under that name is actually invoked; {@code native_identity}
   * selects the hand-written {@link #NATIVE_IDENTITY_EXPRESSION} instead of compiling anything.
   */
  @Param({
    "x",
    "func_identity(x)",
    "mh_identity(x)",
    "native_identity",
    "cos(x)",
    "cos(x) + sin(x)"
  })
  String js;

  /** Creates 1024 fresh random inputs and (re)builds the expression before each iteration. */
  @Setup(Level.Iteration)
  public void init() throws ParseException {
    ThreadLocalRandom random = ThreadLocalRandom.current();
    randomData = random.doubles().limit(1024).toArray();
    expression =
        Objects.equals(js, NATIVE_IDENTITY_NAME)
            ? NATIVE_IDENTITY_EXPRESSION
            : JavascriptCompiler.compile(js, FUNCTIONS);
  }

  /**
   * Evaluates the expression once per input value.
   *
   * @return the sum of all results, returned so that the work cannot be optimized away
   */
  @Benchmark
  public double expression() throws IOException {
    var it = new ValuesIterator(randomData);
    var values = it.getDoubleValues();
    double result = 0d;
    while (it.next()) {
      result += expression.evaluate(values);
    }
    return result;
  }

  /** Walks over a {@code double[]} and exposes the current element as a {@link DoubleValues}. */
  static final class ValuesIterator {
    final double[] data;
    final DoubleValues[] dv;
    // index of the current element; -1 until next() is called for the first time
    int pos = -1;

    ValuesIterator(double[] data) {
      this.data = data;
      var dv =
          new DoubleValues() {
            @Override
            public double doubleValue() throws IOException {
              return data[pos];
            }

            @Override
            public boolean advanceExact(int doc) throws IOException {
              throw new UnsupportedOperationException();
            }
          };
      this.dv = new DoubleValues[] {dv};
    }

    /** Moves to the next element; returns {@code false} once the array is exhausted. */
    boolean next() {
      pos++;
      return (pos < data.length);
    }

    /** The single-element array whose value follows the position of this iterator. */
    DoubleValues[] getDoubleValues() {
      return dv;
    }
  }
}

View File

@ -0,0 +1,176 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.jmh;
import java.io.IOException;
import java.nio.file.Files;
import java.util.Arrays;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.codecs.lucene99.GroupVIntReader;
import org.apache.lucene.codecs.lucene99.GroupVIntWriter;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.MMapDirectory;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
@State(Scope.Benchmark)
@Warmup(iterations = 3, time = 3)
@Measurement(iterations = 5, time = 5)
@Fork(
    value = 1,
    jvmArgsPrepend = {"--add-modules=jdk.unsupported"})
public class GroupVIntBenchmark {

  // Cumulative frequency for each number of bits per value used by doc deltas of tail postings on
  // wikibigall.
  private static final float[] CUMULATIVE_FREQUENCY_BY_BITS_REQUIRED = {
    0.0f,
    0.01026574f,
    0.021453038f,
    0.03342156f,
    0.046476692f,
    0.060890317f,
    0.07644147f,
    0.093718216f,
    0.11424741f,
    0.13989712f,
    0.17366524f,
    0.22071244f,
    0.2815692f,
    0.3537585f,
    0.43655503f,
    0.52308f,
    0.6104675f,
    0.7047371f,
    0.78155357f,
    0.8671179f,
    0.9740598f,
    1.0f
  };

  /** Number of values generated and encoded during setup. */
  final int maxSize = 256;

  /** Destination buffer shared by all benchmark methods. */
  final long[] values = new long[maxSize];

  IndexInput byteBufferGVIntIn;
  IndexInput byteBufferVIntIn;

  ByteArrayDataInput byteArrayVIntIn;
  ByteArrayDataInput byteArrayGVIntIn;

  /** Number of values decoded per benchmark invocation. */
  // @Param({"16", "32", "64", "128", "248"})
  @Param({"64"})
  public int size;

  /** Encodes {@code docs} into two byte arrays (group-vint and plain vint) and wraps them. */
  void initArrayInput(long[] docs) throws Exception {
    final int capacity = Integer.BYTES * maxSize * 2;
    byte[] groupEncoded = new byte[capacity];
    byte[] plainEncoded = new byte[capacity];

    new GroupVIntWriter().writeValues(new ByteArrayDataOutput(groupEncoded), docs, docs.length);

    ByteArrayDataOutput plainOut = new ByteArrayDataOutput(plainEncoded);
    for (long doc : docs) {
      plainOut.writeVInt((int) doc);
    }

    byteArrayVIntIn = new ByteArrayDataInput(plainEncoded);
    byteArrayGVIntIn = new ByteArrayDataInput(groupEncoded);
  }

  /**
   * Writes {@code docs} to the files "vint" and "gvint" in a fresh temporary directory and opens
   * both files for reading.
   */
  void initByteBufferInput(long[] docs) throws Exception {
    Directory directory = MMapDirectory.open(Files.createTempDirectory("groupvintdata"));
    IndexOutput plainOut = directory.createOutput("vint", IOContext.DEFAULT);
    IndexOutput groupOut = directory.createOutput("gvint", IOContext.DEFAULT);

    new GroupVIntWriter().writeValues(groupOut, docs, docs.length);
    for (long doc : docs) {
      plainOut.writeVInt((int) doc);
    }
    plainOut.close();
    groupOut.close();

    byteBufferGVIntIn = directory.openInput("gvint", IOContext.DEFAULT);
    byteBufferVIntIn = directory.openInput("vint", IOContext.DEFAULT);
  }

  /** Generates the values with a fixed seed and prepares all four inputs. */
  @Setup(Level.Trial)
  public void init() throws Exception {
    final long[] docs = new long[maxSize];
    final Random random = new Random(0);
    for (int slot = 0; slot < maxSize; ++slot) {
      float sample = random.nextFloat();
      // Reproduce the distribution of the number of bits per values that we're observing for tail
      // postings on wikibigall.
      int found = Arrays.binarySearch(CUMULATIVE_FREQUENCY_BY_BITS_REQUIRED, sample);
      // exact hit: index + 1; otherwise binarySearch returned -(insertionPoint) - 1
      int numBits = found >= 0 ? found + 1 : -(found + 1);
      docs[slot] = random.nextInt(1 << (numBits - 1), 1 << numBits);
    }
    initByteBufferInput(docs);
    initArrayInput(docs);
  }

  /** Reads {@link #size} plain vints from the directory-backed input. */
  @Benchmark
  public void byteBufferReadVInt(Blackhole bh) throws IOException {
    byteBufferVIntIn.seek(0);
    for (int idx = 0; idx < size; idx++) {
      values[idx] = byteBufferVIntIn.readVInt();
    }
    bh.consume(values);
  }

  /** Reads {@link #size} group-vint values from the directory-backed input. */
  @Benchmark
  public void byteBufferReadGroupVInt(Blackhole bh) throws IOException {
    byteBufferGVIntIn.seek(0);
    GroupVIntReader.readValues(byteBufferGVIntIn, values, size);
    bh.consume(values);
  }

  /** Reads {@link #size} plain vints from the byte-array input. */
  @Benchmark
  public void byteArrayReadVInt(Blackhole bh) {
    byteArrayVIntIn.rewind();
    for (int idx = 0; idx < size; idx++) {
      values[idx] = byteArrayVIntIn.readVInt();
    }
    bh.consume(values);
  }

  /** Reads {@link #size} group-vint values from the byte-array input. */
  @Benchmark
  public void byteArrayReadGroupVInt(Blackhole bh) throws IOException {
    byteArrayGVIntIn.rewind();
    GroupVIntReader.readValues(byteArrayGVIntIn, values, size);
    bh.consume(values);
  }
}

View File

@ -30,8 +30,8 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.TopFieldCollectorManager;
import org.apache.lucene.search.TopScoreDocCollectorManager;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;
@ -110,15 +110,17 @@ public abstract class ReadTask extends PerfTask {
// the IndexSearcher search methods that take
// Weight public again, we can go back to
// pulling the Weight ourselves:
TopFieldCollector collector =
TopFieldCollector.create(sort, numHits, withTotalHits() ? Integer.MAX_VALUE : 1);
searcher.search(q, collector);
hits = collector.topDocs();
int totalHitsThreshold = withTotalHits() ? Integer.MAX_VALUE : 1;
TopFieldCollectorManager collectorManager =
new TopFieldCollectorManager(
sort, numHits, null, totalHitsThreshold, searcher.getSlices().length > 1);
hits = searcher.search(q, collectorManager);
} else {
hits = searcher.search(q, numHits);
}
} else {
Collector collector = createCollector();
searcher.search(q, collector);
// hits = collector.topDocs();
}
@ -183,7 +185,8 @@ public abstract class ReadTask extends PerfTask {
}
protected Collector createCollector() throws Exception {
return TopScoreDocCollector.create(numHits(), withTotalHits() ? Integer.MAX_VALUE : 1);
return new TopScoreDocCollectorManager(numHits(), withTotalHits() ? Integer.MAX_VALUE : 1)
.newCollector();
}
protected Document retrieveDoc(StoredFields storedFields, int id) throws IOException {

View File

@ -207,7 +207,8 @@ public class BooleanPerceptronClassifier implements Classifier<Boolean> {
private void updateFST(SortedMap<String, Double> weights) throws IOException {
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
FSTCompiler<Long> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
BytesRefBuilder scratchBytes = new BytesRefBuilder();
IntsRefBuilder scratchInts = new IntsRefBuilder();
for (Map.Entry<String, Double> entry : weights.entrySet()) {

View File

@ -16,6 +16,8 @@
*/
package org.apache.lucene.codecs.blockterms;
import static org.apache.lucene.util.fst.FST.readMetadata;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
@ -154,7 +156,7 @@ public class VariableGapTermsIndexReader extends TermsIndexReaderBase {
public FieldIndexData(IndexInput in, FieldInfo fieldInfo, long indexStart) throws IOException {
IndexInput clone = in.clone();
clone.seek(indexStart);
fst = new FST<>(clone, clone, fstOutputs);
fst = new FST<>(readMetadata(clone, fstOutputs), clone);
clone.close();
/*

View File

@ -238,7 +238,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
this.fieldInfo = fieldInfo;
fstOutputs = PositiveIntOutputs.getSingleton();
fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, fstOutputs);
fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, fstOutputs).build();
indexStart = out.getFilePointer();
//// System.out.println("VGW: field=" + fieldInfo.name);

View File

@ -16,6 +16,8 @@
*/
package org.apache.lucene.codecs.blocktreeords;
import static org.apache.lucene.util.fst.FST.readMetadata;
import java.io.IOException;
import org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output;
import org.apache.lucene.index.FieldInfo;
@ -85,7 +87,7 @@ final class OrdsFieldReader extends Terms {
final IndexInput clone = indexIn.clone();
// System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name);
clone.seek(indexStartFP);
index = new FST<>(clone, clone, OrdsBlockTreeTermsWriter.FST_OUTPUTS);
index = new FST<>(readMetadata(clone, OrdsBlockTreeTermsWriter.FST_OUTPUTS), clone);
/*
if (true) {

View File

@ -194,7 +194,8 @@ public class FSTTermsReader extends FieldsProducer {
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
OffHeapFSTStore offHeapFSTStore = new OffHeapFSTStore();
this.dict = new FST<>(in, in, new FSTTermOutputs(fieldInfo), offHeapFSTStore);
FSTTermOutputs outputs = new FSTTermOutputs(fieldInfo);
this.dict = new FST<>(FST.readMetadata(in, outputs), in, offHeapFSTStore);
in.skipBytes(offHeapFSTStore.size());
}

View File

@ -251,12 +251,12 @@ public class FSTTermsWriter extends FieldsConsumer {
private final IntsRefBuilder scratchTerm = new IntsRefBuilder();
private final ByteBuffersDataOutput metaWriter = ByteBuffersDataOutput.newResettableInstance();
TermsWriter(FieldInfo fieldInfo) {
TermsWriter(FieldInfo fieldInfo) throws IOException {
this.numTerms = 0;
this.fieldInfo = fieldInfo;
postingsWriter.setField(fieldInfo);
this.outputs = new FSTTermOutputs(fieldInfo);
this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
this.fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
}
public void finishTerm(BytesRef text, BlockTermState state) throws IOException {

View File

@ -683,7 +683,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
final PairOutputs<Long, Long> outputsInner = new PairOutputs<>(posIntOutputs, posIntOutputs);
final PairOutputs<PairOutputs.Pair<Long, Long>, PairOutputs.Pair<Long, Long>> outputs =
new PairOutputs<>(outputsOuter, outputsInner);
fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
IndexInput in = SimpleTextFieldsReader.this.in.clone();
in.seek(termsStart);
final BytesRefBuilder lastTerm = new BytesRefBuilder();

View File

@ -37,7 +37,6 @@ public class SimpleTextStoredFieldsFormat extends StoredFieldsFormat {
@Override
public StoredFieldsReader fieldsReader(
Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException {
;
return new SimpleTextStoredFieldsReader(directory, si, fn, context);
}

View File

@ -89,10 +89,11 @@ public class FSTDictionary implements IndexDictionary {
isFSTOnHeap = true;
}
PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton();
FST.FSTMetadata<Long> metadata = FST.readMetadata(fstDataInput, fstOutputs);
FST<Long> fst =
isFSTOnHeap
? new FST<>(fstDataInput, fstDataInput, fstOutputs)
: new FST<>(fstDataInput, fstDataInput, fstOutputs, new OffHeapFSTStore());
? new FST<>(metadata, fstDataInput)
: new FST<>(metadata, fstDataInput, new OffHeapFSTStore());
return new FSTDictionary(fst);
}
@ -171,9 +172,9 @@ public class FSTDictionary implements IndexDictionary {
protected final FSTCompiler<Long> fstCompiler;
protected final IntsRefBuilder scratchInts;
public Builder() {
public Builder() throws IOException {
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
scratchInts = new IntsRefBuilder();
}

View File

@ -100,5 +100,4 @@ public abstract class DelegatingAnalyzerWrapper extends AnalyzerWrapper {
}
}
}
;
}

View File

@ -70,7 +70,6 @@ public abstract class TermVectorsWriter implements Closeable, Accountable {
/** Called after a doc and all its fields have been added. */
public void finishDocument() throws IOException {}
;
/**
* Called before writing the terms of the field. {@link #startTerm(BytesRef, int)} will be called
@ -82,7 +81,6 @@ public abstract class TermVectorsWriter implements Closeable, Accountable {
/** Called after a field and all its terms have been added. */
public void finishField() throws IOException {}
;
/**
* Adds a term and its term frequency <code>freq</code>. If this field has positions and/or

View File

@ -91,7 +91,11 @@ public final class FieldReader extends Terms {
// Initialize FST always off-heap.
final IndexInput clone = indexIn.clone();
clone.seek(indexStartFP);
index = new FST<>(metaIn, clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore());
index =
new FST<>(
FST.readMetadata(metaIn, ByteSequenceOutputs.getSingleton()),
clone,
new OffHeapFSTStore());
/*
if (false) {
final String dotFileName = segment + "_" + fieldInfo.name + ".dot";

View File

@ -30,9 +30,7 @@ import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.automaton.ByteRunnable;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.automaton.TransitionAccessor;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
/**
* This is used to implement efficient {@link Terms#intersect} for block-tree. Note that it cannot
@ -46,7 +44,6 @@ final class IntersectTermsEnum extends BaseTermsEnum {
// static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
final IndexInput in;
static final Outputs<BytesRef> fstOutputs = ByteSequenceOutputs.getSingleton();
IntersectTermsEnumFrame[] stack;
@ -68,6 +65,9 @@ final class IntersectTermsEnum extends BaseTermsEnum {
private BytesRef savedStartTerm;
private final SegmentTermsEnum.OutputAccumulator outputAccumulator =
new SegmentTermsEnum.OutputAccumulator();
// TODO: in some cases we can filter by length? eg
// regexp foo*bar must be at least length 6 bytes
public IntersectTermsEnum(
@ -114,7 +114,6 @@ final class IntersectTermsEnum extends BaseTermsEnum {
f.prefix = 0;
f.setState(0);
f.arc = arc;
f.outputPrefix = arc.output();
f.load(fr.rootCode);
// for assert:
@ -184,7 +183,9 @@ final class IntersectTermsEnum extends BaseTermsEnum {
FST.Arc<BytesRef> arc = currentFrame.arc;
int idx = currentFrame.prefix;
assert currentFrame.suffix > 0;
BytesRef output = currentFrame.outputPrefix;
outputAccumulator.reset();
outputAccumulator.push(arc.output());
while (idx < f.prefix) {
final int target = term.bytes[idx] & 0xff;
// TODO: we could be more efficient for the next()
@ -192,14 +193,14 @@ final class IntersectTermsEnum extends BaseTermsEnum {
// passed to findTargetArc
arc = fr.index.findTargetArc(target, arc, getArc(1 + idx), fstReader);
assert arc != null;
output = fstOutputs.add(output, arc.output());
outputAccumulator.push(arc.output());
idx++;
}
f.arc = arc;
f.outputPrefix = output;
assert arc.isFinal();
f.load(fstOutputs.add(output, arc.nextFinalOutput()));
outputAccumulator.push(arc.nextFinalOutput());
f.load(outputAccumulator);
return f;
}

View File

@ -55,7 +55,6 @@ final class IntersectTermsEnumFrame {
int statsSingletonRunLength = 0;
final ByteArrayDataInput statsReader = new ByteArrayDataInput();
byte[] floorData = new byte[32];
final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();
// Length of prefix shared by all terms in this block
@ -90,9 +89,6 @@ final class IntersectTermsEnumFrame {
final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
// Cumulative output so far
BytesRef outputPrefix;
int startBytePos;
int suffix;
@ -120,7 +116,7 @@ final class IntersectTermsEnumFrame {
}
} while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min);
load(null);
load((Long) null);
}
public void setState(int state) {
@ -142,12 +138,22 @@ final class IntersectTermsEnumFrame {
}
void load(BytesRef frameIndexData) throws IOException {
if (frameIndexData != null) {
floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
// Skip first long -- has redundant fp, hasTerms
// flag, isFloor flag
final long code = ite.fr.readVLongOutput(floorDataReader);
if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
load(ite.fr.readVLongOutput(floorDataReader));
}
void load(SegmentTermsEnum.OutputAccumulator outputAccumulator) throws IOException {
outputAccumulator.prepareRead();
long code = ite.fr.readVLongOutput(outputAccumulator);
outputAccumulator.setFloorData(floorDataReader);
load(code);
}
void load(Long blockCode) throws IOException {
if (blockCode != null) {
// This block is the first one in a possible sequence of floor blocks corresponding to a
// single seek point from the FST terms index
if ((blockCode & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
// Floor frame
numFollowFloorBlocks = floorDataReader.readVInt();
nextFloorLabel = floorDataReader.readByte() & 0xff;

View File

@ -16,6 +16,8 @@
*/
package org.apache.lucene.codecs.lucene90.blocktree;
import static org.apache.lucene.util.fst.FSTCompiler.getOnHeapReaderWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
@ -525,7 +527,7 @@ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer {
// Disable suffixes sharing for block tree index because suffixes are mostly dropped
// from the FST index and left in the term blocks.
.suffixRAMLimitMB(0d)
.bytesPageBits(pageBits)
.dataOutput(getOnHeapReaderWriter(pageBits))
.build();
// if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix);

View File

@ -24,6 +24,7 @@ import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermState;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
@ -48,7 +49,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
// static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
private final ByteArrayDataInput scratchReader = new ByteArrayDataInput();
private final OutputAccumulator outputAccumulator = new OutputAccumulator();
// What prefix of the current term was present in the index; when we only next() through the
// index, this stays at 0. It's only set when
@ -232,18 +233,24 @@ final class SegmentTermsEnum extends BaseTermsEnum {
return arcs[ord];
}
// Pushes a frame we seek'd to
SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length)
throws IOException {
scratchReader.reset(frameData.bytes, frameData.offset, frameData.length);
final long code = fr.readVLongOutput(scratchReader);
outputAccumulator.reset();
outputAccumulator.push(frameData);
return pushFrame(arc, length);
}
// Pushes a frame we seek'd to
SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, int length) throws IOException {
outputAccumulator.prepareRead();
final long code = fr.readVLongOutput(outputAccumulator);
final long fpSeek = code >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord);
f.hasTerms = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0;
f.hasTermsOrig = f.hasTerms;
f.isFloor = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0;
if (f.isFloor) {
f.setFloorData(scratchReader, frameData);
f.setFloorData(outputAccumulator);
}
pushFrame(arc, fpSeek, length);
@ -344,9 +351,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {
FST.Arc<BytesRef> arc;
int targetUpto;
BytesRef output;
targetBeforeCurrentLength = currentFrame.ord;
outputAccumulator.reset();
if (currentFrame != staticFrame) {
@ -363,7 +370,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
arc = arcs[0];
assert arc.isFinal();
output = arc.output();
outputAccumulator.push(arc.output());
targetUpto = 0;
SegmentTermsEnumFrame lastFrame = stack[0];
@ -373,9 +380,6 @@ final class SegmentTermsEnum extends BaseTermsEnum {
int cmp = 0;
// TODO: reverse vLong byte order for better FST
// prefix output sharing
// First compare up to valid seek frames:
while (targetUpto < targetLimit) {
cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
@ -394,9 +398,8 @@ final class SegmentTermsEnum extends BaseTermsEnum {
+ (char) arc.label()
+ " targetLabel="
+ (char) (target.bytes[target.offset + targetUpto] & 0xFF);
if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
}
outputAccumulator.push(arc.output());
if (arc.isFinal()) {
lastFrame = stack[1 + lastFrame.ord];
}
@ -484,15 +487,15 @@ final class SegmentTermsEnum extends BaseTermsEnum {
// System.out.println(" no seek state; push root frame");
// }
output = arc.output();
outputAccumulator.push(arc.output());
currentFrame = staticFrame;
// term.length = 0;
targetUpto = 0;
currentFrame =
pushFrame(
arc, Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), 0);
outputAccumulator.push(arc.nextFinalOutput());
currentFrame = pushFrame(arc, 0);
outputAccumulator.pop();
}
// if (DEBUG) {
@ -554,9 +557,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
term.setByteAt(targetUpto, (byte) targetLabel);
// Aggregate output as we go:
assert arc.output() != null;
if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
}
outputAccumulator.push(arc.output());
// if (DEBUG) {
// System.out.println(" index: follow label=" + toHex(target.bytes[target.offset +
@ -566,11 +567,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {
if (arc.isFinal()) {
// if (DEBUG) System.out.println(" arc is final!");
currentFrame =
pushFrame(
arc,
Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()),
targetUpto);
outputAccumulator.push(arc.nextFinalOutput());
currentFrame = pushFrame(arc, targetUpto);
outputAccumulator.pop();
// if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" +
// currentFrame.hasTerms);
}
@ -630,9 +629,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {
FST.Arc<BytesRef> arc;
int targetUpto;
BytesRef output;
targetBeforeCurrentLength = currentFrame.ord;
outputAccumulator.reset();
if (currentFrame != staticFrame) {
@ -649,7 +648,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
arc = arcs[0];
assert arc.isFinal();
output = arc.output();
outputAccumulator.push(arc.output());
targetUpto = 0;
SegmentTermsEnumFrame lastFrame = stack[0];
@ -659,9 +658,6 @@ final class SegmentTermsEnum extends BaseTermsEnum {
int cmp = 0;
// TODO: we should write our vLong backwards (MSB
// first) to get better sharing from the FST
// First compare up to valid seek frames:
while (targetUpto < targetLimit) {
cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
@ -680,14 +676,8 @@ final class SegmentTermsEnum extends BaseTermsEnum {
+ (char) arc.label()
+ " targetLabel="
+ (char) (target.bytes[target.offset + targetUpto] & 0xFF);
// TODO: we could save the outputs in local
// byte[][] instead of making new objs ever
// seek; but, often the FST doesn't have any
// shared bytes (but this could change if we
// reverse vLong byte order)
if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
}
outputAccumulator.push(arc.output());
if (arc.isFinal()) {
lastFrame = stack[1 + lastFrame.ord];
}
@ -769,15 +759,15 @@ final class SegmentTermsEnum extends BaseTermsEnum {
// System.out.println(" no seek state; push root frame");
// }
output = arc.output();
outputAccumulator.push(arc.output());
currentFrame = staticFrame;
// term.length = 0;
targetUpto = 0;
currentFrame =
pushFrame(
arc, Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), 0);
outputAccumulator.push(arc.nextFinalOutput());
currentFrame = pushFrame(arc, 0);
outputAccumulator.pop();
}
// if (DEBUG) {
@ -839,9 +829,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
arc = nextArc;
// Aggregate output as we go:
assert arc.output() != null;
if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
}
outputAccumulator.push(arc.output());
// if (DEBUG) {
// System.out.println(" index: follow label=" + (target.bytes[target.offset +
@ -851,11 +839,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {
if (arc.isFinal()) {
// if (DEBUG) System.out.println(" arc is final!");
currentFrame =
pushFrame(
arc,
Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()),
targetUpto);
outputAccumulator.push(arc.nextFinalOutput());
currentFrame = pushFrame(arc, targetUpto);
outputAccumulator.pop();
// if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" +
// currentFrame.hasTerms);
}
@ -1190,4 +1176,68 @@ final class SegmentTermsEnum extends BaseTermsEnum {
public long ord() {
throw new UnsupportedOperationException();
}
/**
 * Keeps the arc outputs pushed during a seek in a growable array and exposes them, in push order,
 * as a single byte stream through {@link #readByte()}.
 *
 * <p>Typical use, as visible in this file: {@link #reset()}, one {@link #push} per followed arc,
 * then {@link #prepareRead()} before decoding.
 *
 * <p>NOTE(review): {@link #push} silently ignores {@code NO_OUTPUT} whereas {@link #pop()} always
 * removes the last stored entry, so a push/pop pair around a {@code NO_OUTPUT} value drops an
 * earlier output instead -- confirm callers cannot hit this.
 */
static class OutputAccumulator extends DataInput {

  /** Stored outputs; only the first {@link #num} slots are live. */
  BytesRef[] outputs = new BytesRef[16];

  /** Output currently being read. */
  BytesRef current;

  /** Number of live entries in {@link #outputs}. */
  int num;

  /** Position of {@link #current} within {@link #outputs}. */
  int outputIndex;

  /** Read position inside {@link #current}, relative to its offset. */
  int index;

  /** Appends {@code output} unless it is the shared {@code NO_OUTPUT} instance. */
  void push(BytesRef output) {
    if (output == Lucene90BlockTreeTermsReader.NO_OUTPUT) {
      return;
    }
    outputs = ArrayUtil.grow(outputs, num + 1);
    outputs[num] = output;
    num++;
  }

  /** Discards the most recently stored output. */
  void pop() {
    assert num > 0;
    num--;
  }

  /** Forgets all stored outputs; the backing array is kept for reuse. */
  void reset() {
    num = 0;
  }

  /** Positions the read cursor on the first byte of the first stored output. */
  void prepareRead() {
    outputIndex = 0;
    index = 0;
    current = outputs[0];
  }

  /**
   * Set the last arc as the source of the floorData. This won't change the reading position of
   * this {@link OutputAccumulator}
   */
  void setFloorData(ByteArrayDataInput floorData) {
    assert outputIndex == num - 1
        : "floor data should be stored in last arc, get outputIndex: "
            + outputIndex
            + ", num: "
            + num;
    final BytesRef last = outputs[outputIndex];
    // Everything after the current read position of the last output is handed over as-is.
    floorData.reset(last.bytes, last.offset + index, last.length - index);
  }

  /** Returns the next byte, moving on to the following stored output once one is exhausted. */
  @Override
  public byte readByte() throws IOException {
    if (index >= current.length) {
      outputIndex++;
      current = outputs[outputIndex];
      index = 0;
    }
    final int position = current.offset + index;
    index++;
    return current.bytes[position];
  }

  /** Bulk reads are not supported by this accumulator. */
  @Override
  public void readBytes(byte[] b, int offset, int len) throws IOException {
    throw new UnsupportedOperationException();
  }

  /** Skipping is not supported by this accumulator. */
  @Override
  public void skipBytes(long numBytes) throws IOException {
    throw new UnsupportedOperationException();
  }
}
}

View File

@ -55,7 +55,7 @@ final class SegmentTermsEnumFrame {
int statsSingletonRunLength = 0;
final ByteArrayDataInput statsReader = new ByteArrayDataInput();
byte[] floorData = new byte[32];
int rewindPos;
final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();
// Length of prefix shared by all terms in this block
@ -104,13 +104,9 @@ final class SegmentTermsEnumFrame {
suffixLengthsReader = new ByteArrayDataInput();
}
public void setFloorData(ByteArrayDataInput in, BytesRef source) {
final int numBytes = source.length - (in.getPosition() - source.offset);
if (numBytes > floorData.length) {
floorData = new byte[ArrayUtil.oversize(numBytes, 1)];
}
System.arraycopy(source.bytes, source.offset + in.getPosition(), floorData, 0, numBytes);
floorDataReader.reset(floorData, 0, numBytes);
public void setFloorData(SegmentTermsEnum.OutputAccumulator outputAccumulator) {
outputAccumulator.setFloorData(floorDataReader);
rewindPos = floorDataReader.getPosition();
numFollowFloorBlocks = floorDataReader.readVInt();
nextFloorLabel = floorDataReader.readByte() & 0xff;
// if (DEBUG) {
@ -247,7 +243,7 @@ final class SegmentTermsEnumFrame {
nextEnt = -1;
hasTerms = hasTermsOrig;
if (isFloor) {
floorDataReader.rewind();
floorDataReader.setPosition(rewindPos);
numFollowFloorBlocks = floorDataReader.readVInt();
assert numFollowFloorBlocks > 0;
nextFloorLabel = floorDataReader.readByte() & 0xff;

View File

@ -0,0 +1,57 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
import java.io.IOException;
import org.apache.lucene.store.DataInput;
/**
 * Decode integers using group-varint.
 *
 * <p>Values are read four at a time: one flag byte holds four 2-bit fields (first value in the two
 * highest bits), each giving the byte length minus one of the corresponding value. Values that do
 * not fill a complete group of four are read as VInts.
 */
public class GroupVIntReader {

  /**
   * Reads {@code limit} values from {@code in} into {@code docs[0..limit)}.
   *
   * <p>Every decoded value is an unsigned 32-bit quantity, regardless of whether it was stored in
   * a group or in the VInt tail.
   *
   * @param in the input positioned at the first flag byte (or first VInt if {@code limit < 4})
   * @param docs destination array; must have room for at least {@code limit} entries
   * @param limit number of values to decode
   * @throws IOException if reading from {@code in} fails
   */
  public static void readValues(DataInput in, long[] docs, int limit) throws IOException {
    int i;
    for (i = 0; i <= limit - 4; i += 4) {
      final int flag = in.readByte() & 0xFF;

      final int n1Minus1 = flag >> 6;
      final int n2Minus1 = (flag >> 4) & 0x03;
      final int n3Minus1 = (flag >> 2) & 0x03;
      final int n4Minus1 = flag & 0x03;

      docs[i] = readLong(in, n1Minus1);
      docs[i + 1] = readLong(in, n2Minus1);
      docs[i + 2] = readLong(in, n3Minus1);
      docs[i + 3] = readLong(in, n4Minus1);
    }
    for (; i < limit; ++i) {
      // readVInt() returns an int; mask it so tail values are widened as unsigned, exactly like
      // the 4-byte case of grouped values, instead of being sign-extended.
      docs[i] = in.readVInt() & 0xFFFFFFFFL;
    }
  }

  /**
   * Reads one grouped value occupying {@code numBytesMinus1 + 1} bytes and returns it as a
   * non-negative long.
   */
  private static long readLong(DataInput in, int numBytesMinus1) throws IOException {
    switch (numBytesMinus1) {
      case 0:
        return in.readByte() & 0xFFL;
      case 1:
        return in.readShort() & 0xFFFFL;
      case 2:
        // 3 bytes: the short supplies bits 0..15, the following byte bits 16..23.
        return (in.readShort() & 0xFFFFL) | ((in.readByte() & 0xFFL) << 16);
      default:
        return in.readInt() & 0xFFFFFFFFL;
    }
  }
}

View File

@ -0,0 +1,63 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;
import java.io.IOException;
import org.apache.lucene.store.DataOutput;
/**
 * Encode integers using group-varint. Values are written in groups of four, each group preceded
 * by one flag byte; trailing values that do not fill a whole group are written as VInts.
 *
 * <p>Instances keep a scratch buffer between calls; whether an instance may be shared across
 * threads is not established here -- the mutable fields suggest it must not be.
 */
public class GroupVIntWriter {

  // the maximum size of one group is 4 integers + 1 byte flag.
  private final byte[] bytes = new byte[17];
  private int byteOffset = 0;

  public GroupVIntWriter() {}

  /**
   * Appends {@code v} to the scratch buffer, low-order byte first, using as few bytes as possible
   * (at least one).
   *
   * @return the number of bytes written, in {@code [1, 4]}
   */
  private int encodeValue(int v) {
    int lastOff = byteOffset;
    do {
      bytes[byteOffset++] = (byte) (v & 0xFF);
      v >>>= 8;
    } while (v != 0);
    return byteOffset - lastOff;
  }

  /**
   * Narrows {@code value} to its 32-bit representation, rejecting anything that does not fit in
   * an unsigned 32-bit integer instead of silently dropping the high bits.
   */
  private static int toInt(long value) {
    if (Long.compareUnsigned(value, 0xFFFFFFFFL) > 0) {
      throw new ArithmeticException("value does not fit in 32 bits: " + value);
    }
    return (int) value;
  }

  /**
   * Writes {@code values[0..limit)} to {@code out}.
   *
   * @param out destination of the encoded bytes
   * @param values values to encode; each must fit in an unsigned 32-bit integer
   * @param limit number of leading entries of {@code values} to write
   * @throws IOException if writing to {@code out} fails
   * @throws ArithmeticException if a value needs more than 32 bits
   */
  public void writeValues(DataOutput out, long[] values, int limit) throws IOException {
    int off = 0;

    // encode each group
    while ((limit - off) >= 4) {
      byte flag = 0;
      // slot 0 is reserved for the flag byte, which is only known once all four are encoded
      byteOffset = 1;
      flag |= (encodeValue(toInt(values[off++])) - 1) << 6;
      flag |= (encodeValue(toInt(values[off++])) - 1) << 4;
      flag |= (encodeValue(toInt(values[off++])) - 1) << 2;
      flag |= (encodeValue(toInt(values[off++])) - 1);
      bytes[0] = flag;
      out.writeBytes(bytes, byteOffset);
    }

    // tail vints
    for (; off < limit; off++) {
      out.writeVInt(toInt(values[off]));
    }
  }
}

View File

@ -31,6 +31,7 @@ import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.KnnVectorsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.search.TaskExecutor;
import org.apache.lucene.util.hnsw.HnswGraph;
/**
@ -60,7 +61,7 @@ public final class Lucene99HnswScalarQuantizedVectorsFormat extends KnnVectorsFo
private final FlatVectorsFormat flatVectorsFormat;
private final int numMergeWorkers;
private final ExecutorService mergeExec;
private final TaskExecutor mergeExec;
/** Constructs a format using default graph construction parameters */
public Lucene99HnswScalarQuantizedVectorsFormat() {
@ -84,8 +85,8 @@ public final class Lucene99HnswScalarQuantizedVectorsFormat extends KnnVectorsFo
* @param beamWidth the size of the queue maintained during graph construction.
* @param numMergeWorkers number of workers (threads) that will be used when doing merge. If
* larger than 1, a non-null {@link ExecutorService} must be passed as mergeExec
* @param configuredQuantile the quantile for scalar quantizing the vectors, when `null` it is
* calculated based on the vector field dimensions.
* @param confidenceInterval the confidenceInterval for scalar quantizing the vectors, when `null`
* it is calculated based on the vector field dimensions.
* @param mergeExec the {@link ExecutorService} that will be used by ALL vector writers that are
* generated by this format to do the merge
*/
@ -93,7 +94,7 @@ public final class Lucene99HnswScalarQuantizedVectorsFormat extends KnnVectorsFo
int maxConn,
int beamWidth,
int numMergeWorkers,
Float configuredQuantile,
Float confidenceInterval,
ExecutorService mergeExec) {
super("Lucene99HnswScalarQuantizedVectorsFormat");
if (maxConn <= 0 || maxConn > MAXIMUM_MAX_CONN) {
@ -121,8 +122,12 @@ public final class Lucene99HnswScalarQuantizedVectorsFormat extends KnnVectorsFo
"No executor service is needed as we'll use single thread to merge");
}
this.numMergeWorkers = numMergeWorkers;
this.mergeExec = mergeExec;
this.flatVectorsFormat = new Lucene99ScalarQuantizedVectorsFormat(configuredQuantile);
if (mergeExec != null) {
this.mergeExec = new TaskExecutor(mergeExec);
} else {
this.mergeExec = null;
}
this.flatVectorsFormat = new Lucene99ScalarQuantizedVectorsFormat(confidenceInterval);
}
@Override

View File

@ -27,6 +27,7 @@ import org.apache.lucene.codecs.lucene90.IndexedDISI;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.TaskExecutor;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.hnsw.HnswGraph;
@ -137,7 +138,7 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat {
private static final FlatVectorsFormat flatVectorsFormat = new Lucene99FlatVectorsFormat();
private final int numMergeWorkers;
private final ExecutorService mergeExec;
private final TaskExecutor mergeExec;
/** Constructs a format using default graph construction parameters */
public Lucene99HnswVectorsFormat() {
@ -192,7 +193,11 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat {
"No executor service is needed as we'll use single thread to merge");
}
this.numMergeWorkers = numMergeWorkers;
this.mergeExec = mergeExec;
if (mergeExec != null) {
this.mergeExec = new TaskExecutor(mergeExec);
} else {
this.mergeExec = null;
}
}
@Override

View File

@ -92,18 +92,8 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
} catch (Throwable exception) {
priorE = exception;
} finally {
try {
CodecUtil.checkFooter(meta, priorE);
success = true;
} finally {
if (success == false) {
IOUtils.close(flatVectorsReader);
}
}
}
}
success = false;
try {
vectorIndex =
openDataInput(
state,
@ -237,12 +227,22 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
|| fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
return;
}
RandomVectorScorer scorer = flatVectorsReader.getRandomVectorScorer(field, target);
HnswGraphSearcher.search(
scorer,
new OrdinalTranslatedKnnCollector(knnCollector, scorer::ordToDoc),
getGraph(fieldEntry),
scorer.getAcceptOrds(acceptDocs));
final RandomVectorScorer scorer = flatVectorsReader.getRandomVectorScorer(field, target);
final KnnCollector collector =
new OrdinalTranslatedKnnCollector(knnCollector, scorer::ordToDoc);
final Bits acceptedOrds = scorer.getAcceptOrds(acceptDocs);
if (knnCollector.k() < scorer.maxOrd()) {
HnswGraphSearcher.search(scorer, collector, getGraph(fieldEntry), acceptedOrds);
} else {
// if k is larger than the number of vectors, we can just iterate over all vectors
// and collect them
for (int i = 0; i < scorer.maxOrd(); i++) {
if (acceptedOrds == null || acceptedOrds.get(i)) {
knnCollector.incVisitedCount(1);
knnCollector.collect(scorer.ordToDoc(i), scorer.score(i));
}
}
}
}
@Override
@ -255,12 +255,22 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
|| fieldEntry.vectorEncoding != VectorEncoding.BYTE) {
return;
}
RandomVectorScorer scorer = flatVectorsReader.getRandomVectorScorer(field, target);
HnswGraphSearcher.search(
scorer,
new OrdinalTranslatedKnnCollector(knnCollector, scorer::ordToDoc),
getGraph(fieldEntry),
scorer.getAcceptOrds(acceptDocs));
final RandomVectorScorer scorer = flatVectorsReader.getRandomVectorScorer(field, target);
final KnnCollector collector =
new OrdinalTranslatedKnnCollector(knnCollector, scorer::ordToDoc);
final Bits acceptedOrds = scorer.getAcceptOrds(acceptDocs);
if (knnCollector.k() < scorer.maxOrd()) {
HnswGraphSearcher.search(scorer, collector, getGraph(fieldEntry), acceptedOrds);
} else {
// if k is larger than the number of vectors, we can just iterate over all vectors
// and collect them
for (int i = 0; i < scorer.maxOrd(); i++) {
if (acceptedOrds == null || acceptedOrds.get(i)) {
knnCollector.incVisitedCount(1);
knnCollector.collect(scorer.ordToDoc(i), scorer.score(i));
}
}
}
}
@Override

View File

@ -23,7 +23,6 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ExecutorService;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FlatVectorsWriter;
import org.apache.lucene.codecs.KnnFieldVectorsWriter;
@ -35,6 +34,7 @@ import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Sorter;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.TaskExecutor;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.InfoStream;
@ -67,7 +67,7 @@ public final class Lucene99HnswVectorsWriter extends KnnVectorsWriter {
private final int beamWidth;
private final FlatVectorsWriter flatVectorWriter;
private final int numMergeWorkers;
private final ExecutorService mergeExec;
private final TaskExecutor mergeExec;
private final List<FieldWriter<?>> fields = new ArrayList<>();
private boolean finished;
@ -78,7 +78,7 @@ public final class Lucene99HnswVectorsWriter extends KnnVectorsWriter {
int beamWidth,
FlatVectorsWriter flatVectorWriter,
int numMergeWorkers,
ExecutorService mergeExec)
TaskExecutor mergeExec)
throws IOException {
this.M = M;
this.flatVectorWriter = flatVectorWriter;

View File

@ -158,8 +158,8 @@ import org.apache.lucene.util.packed.PackedInts;
* <dd><b>Frequencies and Skip Data</b>
* <p>The .doc file contains the lists of documents which contain each term, along with the
* frequency of the term in that document (except when frequencies are omitted: {@link
* IndexOptions#DOCS}). It also saves skip data to the beginning of each packed or VInt block,
* when the length of document list is larger than packed block size.
* IndexOptions#DOCS}). Skip data is saved at the end of each term's postings. The skip data
* is saved once for the entire postings list.
* <ul>
* <li>docFile(.doc) --&gt; Header, &lt;TermFreqs, SkipData?&gt;<sup>TermCount</sup>, Footer
* <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}
@ -174,7 +174,8 @@ import org.apache.lucene.util.packed.PackedInts;
* <li>SkipDatum --&gt; DocSkip, DocFPSkip, &lt;PosFPSkip, PosBlockOffset, PayLength?,
* PayFPSkip?&gt;?, ImpactLength, &lt;CompetitiveFreqDelta, CompetitiveNormDelta?&gt;
* <sup>ImpactCount</sup>, SkipChildLevelPointer?
* <li>PackedDocDeltaBlock, PackedFreqBlock --&gt; {@link PackedInts PackedInts}
* <li>PackedFreqBlock --&gt; {@link PackedInts PackedInts}, uses patching
* <li>PackedDocDeltaBlock --&gt; {@link PackedInts PackedInts}, does not use patching
* <li>DocDelta, Freq, DocSkip, DocFPSkip, PosFPSkip, PosBlockOffset, PayByteUpto,
* PayFPSkip, ImpactLength, CompetitiveFreqDelta --&gt; {@link DataOutput#writeVInt
* VInt}

View File

@ -142,21 +142,25 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
/** Read values that have been written using variable-length encoding instead of bit-packing. */
static void readVIntBlock(
IndexInput docIn, long[] docBuffer, long[] freqBuffer, int num, boolean indexHasFreq)
IndexInput docIn,
long[] docBuffer,
long[] freqBuffer,
int num,
boolean indexHasFreq,
boolean decodeFreq)
throws IOException {
if (indexHasFreq) {
for (int i = 0; i < num; i++) {
final int code = docIn.readVInt();
docBuffer[i] = code >>> 1;
if ((code & 1) != 0) {
freqBuffer[i] = 1;
} else {
GroupVIntReader.readValues(docIn, docBuffer, num);
if (indexHasFreq && decodeFreq) {
for (int i = 0; i < num; ++i) {
freqBuffer[i] = docBuffer[i] & 0x01;
docBuffer[i] >>= 1;
if (freqBuffer[i] == 0) {
freqBuffer[i] = docIn.readVInt();
}
}
} else {
for (int i = 0; i < num; i++) {
docBuffer[i] = docIn.readVInt();
} else if (indexHasFreq) {
for (int i = 0; i < num; ++i) {
docBuffer[i] >>= 1;
}
}
}
@ -471,7 +475,7 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
blockUpto++;
} else {
// Read vInts:
readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq);
readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, needsFreq);
prefixSum(docBuffer, left, accum);
docBuffer[left] = NO_MORE_DOCS;
blockUpto += left;
@ -764,7 +768,7 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
docBuffer[1] = NO_MORE_DOCS;
blockUpto++;
} else {
readVIntBlock(docIn, docBuffer, freqBuffer, left, true);
readVIntBlock(docIn, docBuffer, freqBuffer, left, true, true);
prefixSum(docBuffer, left, accum);
docBuffer[left] = NO_MORE_DOCS;
blockUpto += left;
@ -1073,8 +1077,6 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
private int nextSkipDoc = -1;
private long seekTo = -1;
// as we read freqBuffer lazily, isFreqsRead shows if freqBuffer are read for the current block
// always true when we don't have freqBuffer (indexHasFreq=false) or don't need freqBuffer
// (needsFreq=false)
@ -1153,7 +1155,7 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
}
blockUpto += BLOCK_SIZE;
} else {
readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreqs);
readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreqs, true);
prefixSum(docBuffer, left, accum);
docBuffer[left] = NO_MORE_DOCS;
blockUpto += left;
@ -1178,7 +1180,8 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
// Force to read next block
docBufferUpto = BLOCK_SIZE;
accum = skipper.getDoc();
seekTo = skipper.getDocPointer(); // delay the seek
docIn.seek(skipper.getDocPointer());
isFreqsRead = true;
}
// next time we call advance, this is used to
// foresee whether skipper is necessary.
@ -1198,11 +1201,6 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
@Override
public int nextDoc() throws IOException {
if (docBufferUpto == BLOCK_SIZE) {
if (seekTo >= 0) {
docIn.seek(seekTo);
isFreqsRead = true; // reset isFreqsRead
seekTo = -1;
}
refillDocs();
}
return this.doc = (int) docBuffer[docBufferUpto++];
@ -1214,11 +1212,6 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
advanceShallow(target);
}
if (docBufferUpto == BLOCK_SIZE) {
if (seekTo >= 0) {
docIn.seek(seekTo);
isFreqsRead = true; // reset isFreqsRead
seekTo = -1;
}
refillDocs();
}
@ -1307,8 +1300,6 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
private int nextSkipDoc = -1;
private long seekTo = -1;
public BlockImpactsPostingsEnum(FieldInfo fieldInfo, IntBlockTermState termState)
throws IOException {
indexHasOffsets =
@ -1372,7 +1363,7 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
forDeltaUtil.decodeAndPrefixSum(docIn, accum, docBuffer);
pforUtil.decode(docIn, freqBuffer);
} else {
readVIntBlock(docIn, docBuffer, freqBuffer, left, true);
readVIntBlock(docIn, docBuffer, freqBuffer, left, true, true);
prefixSum(docBuffer, left, accum);
docBuffer[left] = NO_MORE_DOCS;
}
@ -1426,7 +1417,7 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
accum = skipper.getDoc();
posPendingFP = skipper.getPosPointer();
posPendingCount = skipper.getPosBufferUpto();
seekTo = skipper.getDocPointer(); // delay the seek
docIn.seek(skipper.getDocPointer());
}
// next time we call advance, this is used to
// foresee whether skipper is necessary.
@ -1452,10 +1443,6 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
advanceShallow(target);
}
if (docBufferUpto == BLOCK_SIZE) {
if (seekTo >= 0) {
docIn.seek(seekTo);
seekTo = -1;
}
refillDocs();
}
@ -1766,7 +1753,7 @@ public final class Lucene99PostingsReader extends PostingsReaderBase {
false; // freq block will be loaded lazily when necessary, we don't load it here
}
} else {
readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq);
readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, true);
prefixSum(docBuffer, left, accum);
docBuffer[left] = NO_MORE_DOCS;
}

View File

@ -92,6 +92,7 @@ public final class Lucene99PostingsWriter extends PushPostingsWriterBase {
private final PForUtil pforUtil;
private final ForDeltaUtil forDeltaUtil;
private final Lucene99SkipWriter skipWriter;
private final GroupVIntWriter docGroupVIntWriter;
private boolean fieldHasNorms;
private NumericDocValues norms;
@ -172,6 +173,7 @@ public final class Lucene99PostingsWriter extends PushPostingsWriterBase {
skipWriter =
new Lucene99SkipWriter(
MAX_SKIP_LEVELS, BLOCK_SIZE, state.segmentInfo.maxDoc(), docOut, posOut, payOut);
docGroupVIntWriter = new GroupVIntWriter();
}
@Override
@ -370,20 +372,22 @@ public final class Lucene99PostingsWriter extends PushPostingsWriterBase {
singletonDocID = (int) docDeltaBuffer[0];
} else {
singletonDocID = -1;
// vInt encode the remaining doc deltas and freqs:
// Group vInt encode the remaining doc deltas and freqs:
if (writeFreqs) {
for (int i = 0; i < docBufferUpto; i++) {
docDeltaBuffer[i] = (docDeltaBuffer[i] << 1) | (freqBuffer[i] == 1 ? 1 : 0);
}
}
docGroupVIntWriter.writeValues(docOut, docDeltaBuffer, docBufferUpto);
if (writeFreqs) {
for (int i = 0; i < docBufferUpto; i++) {
final int docDelta = (int) docDeltaBuffer[i];
final int freq = (int) freqBuffer[i];
if (!writeFreqs) {
docOut.writeVInt(docDelta);
} else if (freq == 1) {
docOut.writeVInt((docDelta << 1) | 1);
} else {
docOut.writeVInt(docDelta << 1);
if (freq != 1) {
docOut.writeVInt(freq);
}
}
}
}
final long lastPosBlockOffset;

View File

@ -43,17 +43,17 @@ public final class Lucene99ScalarQuantizedVectorsFormat extends FlatVectorsForma
private static final FlatVectorsFormat rawVectorFormat = new Lucene99FlatVectorsFormat();
/** The minimum quantile */
private static final float MINIMUM_QUANTILE = 0.9f;
/** The minimum confidence interval */
private static final float MINIMUM_CONFIDENCE_INTERVAL = 0.9f;
/** The maximum quantile */
private static final float MAXIMUM_QUANTILE = 1f;
/** The maximum confidence interval */
private static final float MAXIMUM_CONFIDENCE_INTERVAL = 1f;
/**
* Controls the quantile used to scalar quantize the vectors the default quantile is calculated as
* `1-1/(vector_dimensions + 1)`
* Controls the confidence interval used to scalar quantize the vectors. The default value is
* calculated as `1-1/(vector_dimensions + 1)`.
*/
final Float quantile;
final Float confidenceInterval;
/** Constructs a format using default graph construction parameters */
public Lucene99ScalarQuantizedVectorsFormat() {
@ -63,24 +63,26 @@ public final class Lucene99ScalarQuantizedVectorsFormat extends FlatVectorsForma
/**
* Constructs a format using the given graph construction parameters.
*
* @param quantile the quantile for scalar quantizing the vectors, when `null` it is calculated
* based on the vector field dimensions.
* @param confidenceInterval the confidence interval for scalar quantizing the vectors; when
* `null`, it is calculated based on the vector field dimensions.
*/
public Lucene99ScalarQuantizedVectorsFormat(Float quantile) {
if (quantile != null && (quantile < MINIMUM_QUANTILE || quantile > MAXIMUM_QUANTILE)) {
public Lucene99ScalarQuantizedVectorsFormat(Float confidenceInterval) {
if (confidenceInterval != null
&& (confidenceInterval < MINIMUM_CONFIDENCE_INTERVAL
|| confidenceInterval > MAXIMUM_CONFIDENCE_INTERVAL)) {
throw new IllegalArgumentException(
"quantile must be between "
+ MINIMUM_QUANTILE
"confidenceInterval must be between "
+ MINIMUM_CONFIDENCE_INTERVAL
+ " and "
+ MAXIMUM_QUANTILE
+ "; quantile="
+ quantile);
+ MAXIMUM_CONFIDENCE_INTERVAL
+ "; confidenceInterval="
+ confidenceInterval);
}
this.quantile = quantile;
this.confidenceInterval = confidenceInterval;
}
static float calculateDefaultQuantile(int vectorDimension) {
return Math.max(MINIMUM_QUANTILE, 1f - (1f / (vectorDimension + 1)));
static float calculateDefaultConfidenceInterval(int vectorDimension) {
return Math.max(MINIMUM_CONFIDENCE_INTERVAL, 1f - (1f / (vectorDimension + 1)));
}
@Override
@ -88,8 +90,8 @@ public final class Lucene99ScalarQuantizedVectorsFormat extends FlatVectorsForma
return NAME
+ "(name="
+ NAME
+ ", quantile="
+ quantile
+ ", confidenceInterval="
+ confidenceInterval
+ ", rawVectorFormat="
+ rawVectorFormat
+ ")";
@ -98,7 +100,7 @@ public final class Lucene99ScalarQuantizedVectorsFormat extends FlatVectorsForma
@Override
public FlatVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException {
return new Lucene99ScalarQuantizedVectorsWriter(
state, quantile, rawVectorFormat.fieldsWriter(state));
state, confidenceInterval, rawVectorFormat.fieldsWriter(state));
}
@Override

View File

@ -58,6 +58,7 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
Lucene99ScalarQuantizedVectorsReader(SegmentReadState state, FlatVectorsReader rawVectorsReader)
throws IOException {
this.rawVectorsReader = rawVectorsReader;
int versionMeta = -1;
String metaFileName =
IndexFileNames.segmentFileName(
@ -80,19 +81,8 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
} catch (Throwable exception) {
priorE = exception;
} finally {
try {
CodecUtil.checkFooter(meta, priorE);
success = true;
} finally {
if (success == false) {
IOUtils.close(rawVectorsReader);
}
}
}
}
success = false;
this.rawVectorsReader = rawVectorsReader;
try {
quantizedVectorData =
openDataInput(
state,
@ -313,10 +303,10 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
dimension = input.readVInt();
size = input.readInt();
if (size > 0) {
float configuredQuantile = Float.intBitsToFloat(input.readInt());
float confidenceInterval = Float.intBitsToFloat(input.readInt());
float minQuantile = Float.intBitsToFloat(input.readInt());
float maxQuantile = Float.intBitsToFloat(input.readInt());
scalarQuantizer = new ScalarQuantizer(minQuantile, maxQuantile, configuredQuantile);
scalarQuantizer = new ScalarQuantizer(minQuantile, maxQuantile, confidenceInterval);
} else {
scalarQuantizer = null;
}

View File

@ -19,7 +19,7 @@ package org.apache.lucene.codecs.lucene99;
import static org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.QUANTIZED_VECTOR_COMPONENT;
import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.calculateDefaultQuantile;
import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.calculateDefaultConfidenceInterval;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOfInstance;
@ -91,14 +91,14 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
private final List<FieldWriter> fields = new ArrayList<>();
private final IndexOutput meta, quantizedVectorData;
private final Float quantile;
private final Float confidenceInterval;
private final FlatVectorsWriter rawVectorDelegate;
private boolean finished;
Lucene99ScalarQuantizedVectorsWriter(
SegmentWriteState state, Float quantile, FlatVectorsWriter rawVectorDelegate)
SegmentWriteState state, Float confidenceInterval, FlatVectorsWriter rawVectorDelegate)
throws IOException {
this.quantile = quantile;
this.confidenceInterval = confidenceInterval;
segmentWriteState = state;
String metaFileName =
IndexFileNames.segmentFileName(
@ -142,12 +142,12 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
public FlatFieldVectorsWriter<?> addField(
FieldInfo fieldInfo, KnnFieldVectorsWriter<?> indexWriter) throws IOException {
if (fieldInfo.getVectorEncoding().equals(VectorEncoding.FLOAT32)) {
float quantile =
this.quantile == null
? calculateDefaultQuantile(fieldInfo.getVectorDimension())
: this.quantile;
float confidenceInterval =
this.confidenceInterval == null
? calculateDefaultConfidenceInterval(fieldInfo.getVectorDimension())
: this.confidenceInterval;
FieldWriter quantizedWriter =
new FieldWriter(quantile, fieldInfo, segmentWriteState.infoStream, indexWriter);
new FieldWriter(confidenceInterval, fieldInfo, segmentWriteState.infoStream, indexWriter);
fields.add(quantizedWriter);
indexWriter = quantizedWriter;
}
@ -169,16 +169,16 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
DocsWithFieldSet docsWithField =
writeQuantizedVectorData(quantizedVectorData, byteVectorValues);
long vectorDataLength = quantizedVectorData.getFilePointer() - vectorDataOffset;
float quantile =
this.quantile == null
? calculateDefaultQuantile(fieldInfo.getVectorDimension())
: this.quantile;
float confidenceInterval =
this.confidenceInterval == null
? calculateDefaultConfidenceInterval(fieldInfo.getVectorDimension())
: this.confidenceInterval;
writeMeta(
fieldInfo,
segmentWriteState.segmentInfo.maxDoc(),
vectorDataOffset,
vectorDataLength,
quantile,
confidenceInterval,
mergedQuantizationState.getLowerQuantile(),
mergedQuantizationState.getUpperQuantile(),
docsWithField);
@ -251,7 +251,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
maxDoc,
vectorDataOffset,
vectorDataLength,
quantile,
confidenceInterval,
fieldData.minQuantile,
fieldData.maxQuantile,
fieldData.docsWithField);
@ -262,7 +262,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
int maxDoc,
long vectorDataOffset,
long vectorDataLength,
Float configuredQuantizationQuantile,
Float confidenceInterval,
Float lowerQuantile,
Float upperQuantile,
DocsWithFieldSet docsWithField)
@ -279,9 +279,9 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
assert Float.isFinite(lowerQuantile) && Float.isFinite(upperQuantile);
meta.writeInt(
Float.floatToIntBits(
configuredQuantizationQuantile != null
? configuredQuantizationQuantile
: calculateDefaultQuantile(field.getVectorDimension())));
confidenceInterval != null
? confidenceInterval
: calculateDefaultConfidenceInterval(field.getVectorDimension())));
meta.writeInt(Float.floatToIntBits(lowerQuantile));
meta.writeInt(Float.floatToIntBits(upperQuantile));
}
@ -344,7 +344,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
maxDoc,
vectorDataOffset,
quantizedVectorLength,
quantile,
confidenceInterval,
fieldData.minQuantile,
fieldData.maxQuantile,
newDocsWithField);
@ -374,11 +374,11 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
private ScalarQuantizer mergeQuantiles(FieldInfo fieldInfo, MergeState mergeState)
throws IOException {
assert fieldInfo.getVectorEncoding() == VectorEncoding.FLOAT32;
float quantile =
this.quantile == null
? calculateDefaultQuantile(fieldInfo.getVectorDimension())
: this.quantile;
return mergeAndRecalculateQuantiles(mergeState, fieldInfo, quantile);
float confidenceInterval =
this.confidenceInterval == null
? calculateDefaultConfidenceInterval(fieldInfo.getVectorDimension())
: this.confidenceInterval;
return mergeAndRecalculateQuantiles(mergeState, fieldInfo, confidenceInterval);
}
private ScalarQuantizedCloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
@ -408,16 +408,16 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
quantizationDataInput, quantizationDataInput.length() - CodecUtil.footerLength());
long vectorDataLength = quantizedVectorData.getFilePointer() - vectorDataOffset;
CodecUtil.retrieveChecksum(quantizationDataInput);
float quantile =
this.quantile == null
? calculateDefaultQuantile(fieldInfo.getVectorDimension())
: this.quantile;
float confidenceInterval =
this.confidenceInterval == null
? calculateDefaultConfidenceInterval(fieldInfo.getVectorDimension())
: this.confidenceInterval;
writeMeta(
fieldInfo,
segmentWriteState.segmentInfo.maxDoc(),
vectorDataOffset,
vectorDataLength,
quantile,
confidenceInterval,
mergedQuantizationState.getLowerQuantile(),
mergedQuantizationState.getUpperQuantile(),
docsWithField);
@ -446,7 +446,9 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
}
static ScalarQuantizer mergeQuantiles(
List<ScalarQuantizer> quantizationStates, List<Integer> segmentSizes, float quantile) {
List<ScalarQuantizer> quantizationStates,
List<Integer> segmentSizes,
float confidenceInterval) {
assert quantizationStates.size() == segmentSizes.size();
if (quantizationStates.isEmpty()) {
return null;
@ -464,7 +466,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
}
lowerQuantile /= totalCount;
upperQuantile /= totalCount;
return new ScalarQuantizer(lowerQuantile, upperQuantile, quantile);
return new ScalarQuantizer(lowerQuantile, upperQuantile, confidenceInterval);
}
/**
@ -521,7 +523,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
}
static ScalarQuantizer mergeAndRecalculateQuantiles(
MergeState mergeState, FieldInfo fieldInfo, float quantile) throws IOException {
MergeState mergeState, FieldInfo fieldInfo, float confidenceInterval) throws IOException {
List<ScalarQuantizer> quantizationStates = new ArrayList<>(mergeState.liveDocs.length);
List<Integer> segmentSizes = new ArrayList<>(mergeState.liveDocs.length);
for (int i = 0; i < mergeState.liveDocs.length; i++) {
@ -536,7 +538,8 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
segmentSizes.add(fvv.size());
}
}
ScalarQuantizer mergedQuantiles = mergeQuantiles(quantizationStates, segmentSizes, quantile);
ScalarQuantizer mergedQuantiles =
mergeQuantiles(quantizationStates, segmentSizes, confidenceInterval);
// Segments not providing quantization state indicates that their quantiles were never
// calculated.
// To be safe, we should always recalculate given a sample set over all the float vectors in the
@ -545,7 +548,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
if (mergedQuantiles == null || shouldRecomputeQuantiles(mergedQuantiles, quantizationStates)) {
FloatVectorValues vectorValues =
KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState);
mergedQuantiles = ScalarQuantizer.fromVectors(vectorValues, quantile);
mergedQuantiles = ScalarQuantizer.fromVectors(vectorValues, confidenceInterval);
}
return mergedQuantiles;
}
@ -599,7 +602,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
private static final long SHALLOW_SIZE = shallowSizeOfInstance(FieldWriter.class);
private final List<float[]> floatVectors;
private final FieldInfo fieldInfo;
private final float quantile;
private final float confidenceInterval;
private final InfoStream infoStream;
private final boolean normalize;
private float minQuantile = Float.POSITIVE_INFINITY;
@ -609,12 +612,12 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
@SuppressWarnings("unchecked")
FieldWriter(
float quantile,
float confidenceInterval,
FieldInfo fieldInfo,
InfoStream infoStream,
KnnFieldVectorsWriter<?> indexWriter) {
super((KnnFieldVectorsWriter<float[]>) indexWriter);
this.quantile = quantile;
this.confidenceInterval = confidenceInterval;
this.fieldInfo = fieldInfo;
this.normalize = fieldInfo.getVectorSimilarityFunction() == VectorSimilarityFunction.COSINE;
this.floatVectors = new ArrayList<>();
@ -635,15 +638,15 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
new FloatVectorWrapper(
floatVectors,
fieldInfo.getVectorSimilarityFunction() == VectorSimilarityFunction.COSINE),
quantile);
confidenceInterval);
minQuantile = quantizer.getLowerQuantile();
maxQuantile = quantizer.getUpperQuantile();
if (infoStream.isEnabled(QUANTIZED_VECTOR_COMPONENT)) {
infoStream.message(
QUANTIZED_VECTOR_COMPONENT,
"quantized field="
+ " quantile="
+ quantile
+ " confidenceInterval="
+ confidenceInterval
+ " minQuantile="
+ minQuantile
+ " maxQuantile="
@ -654,7 +657,7 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite
ScalarQuantizer createQuantizer() {
assert finished;
return new ScalarQuantizer(minQuantile, maxQuantile, quantile);
return new ScalarQuantizer(minQuantile, maxQuantile, confidenceInterval);
}
@Override

View File

@ -119,7 +119,6 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
}
}
}
;
static String getSuffix(String formatName, String suffix) {
return formatName + "_" + suffix;

View File

@ -272,7 +272,6 @@ public final class FeatureField extends Field {
return true;
}
}
;
static final class LogFunction extends FeatureFunction {

View File

@ -16,6 +16,7 @@
*/
package org.apache.lucene.document;
import java.util.Collection;
import java.util.Objects;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
@ -171,7 +172,7 @@ public class KeywordField extends Field {
* @throws NullPointerException if {@code field} is null.
* @return a query matching documents with this exact value
*/
public static Query newSetQuery(String field, BytesRef... values) {
public static Query newSetQuery(String field, Collection<BytesRef> values) {
Objects.requireNonNull(field, "field must not be null");
Objects.requireNonNull(values, "values must not be null");
Query indexQuery = new TermInSetQuery(field, values);

View File

@ -16,6 +16,7 @@
*/
package org.apache.lucene.document;
import java.util.Collection;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.search.IndexOrDocValuesQuery;
import org.apache.lucene.search.MultiTermQuery;
@ -99,7 +100,7 @@ public class SortedDocValuesField extends Field {
* in an {@link IndexOrDocValuesQuery}, alongside a set query that executes on postings, such as
* {@link TermInSetQuery}.
*/
public static Query newSlowSetQuery(String field, BytesRef... values) {
public static Query newSlowSetQuery(String field, Collection<BytesRef> values) {
return new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, field, values);
}
}

View File

@ -16,6 +16,7 @@
*/
package org.apache.lucene.document;
import java.util.Collection;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.search.IndexOrDocValuesQuery;
import org.apache.lucene.search.MultiTermQuery;
@ -103,7 +104,7 @@ public class SortedSetDocValuesField extends Field {
* in an {@link IndexOrDocValuesQuery}, alongside a set query that executes on postings, such as
* {@link TermInSetQuery}.
*/
public static Query newSlowSetQuery(String field, BytesRef... values) {
public static Query newSlowSetQuery(String field, Collection<BytesRef> values) {
return new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, field, values);
}
}

View File

@ -694,7 +694,7 @@ abstract class SpatialQuery extends Query {
final SpatialVisitor spatialVisitor, QueryRelation queryRelation, final FixedBitSet result) {
final BiFunction<byte[], byte[], Relation> innerFunction =
spatialVisitor.getInnerFunction(queryRelation);
;
return new IntersectVisitor() {
@Override

View File

@ -1254,8 +1254,7 @@ public final class Tessellator {
++numMerges;
// step 'insize' places along from p
q = p;
for (i = 0, pSize = 0; i < inSize && q != null; ++i, ++pSize, q = q.nextZ)
;
for (i = 0, pSize = 0; i < inSize && q != null; ++i, ++pSize, q = q.nextZ) {}
// if q hasn't fallen off end, we have two lists to merge
qSize = inSize;

View File

@ -22,11 +22,11 @@ import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.ByteBlockPool;
/* IndexInput that knows how to read the byte slices written
* by Posting and PostingVector. We read the bytes in
* each slice until we hit the end of that slice at which
* point we read the forwarding address of the next slice
* and then jump to it.*/
/**
* IndexInput that knows how to read the byte slices written by Posting and PostingVector. We read
* the bytes in each slice until we hit the end of that slice at which point we read the forwarding
* address of the next slice and then jump to it.
*/
final class ByteSliceReader extends DataInput {
ByteBlockPool pool;
int bufferUpto;

View File

@ -28,7 +28,7 @@ import java.nio.file.Paths;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
@ -96,11 +96,11 @@ import org.apache.lucene.util.Version;
*/
public final class CheckIndex implements Closeable {
private final Directory dir;
private final Lock writeLock;
private final NumberFormat nf = NumberFormat.getInstance(Locale.ROOT);
private PrintStream infoStream;
private Directory dir;
private Lock writeLock;
private volatile boolean closed;
private NumberFormat nf = NumberFormat.getInstance(Locale.ROOT);
/**
* Returned from {@link #checkIndex()} detailing the health and status of the index.
@ -441,19 +441,20 @@ public final class CheckIndex implements Closeable {
IOUtils.close(writeLock);
}
private boolean doSlowChecks;
private int level;
/**
* If true, additional slow checks are performed. This will likely drastically increase time it
* takes to run CheckIndex!
* Sets the check level: the higher the value, the more additional checks are performed. This
* will likely drastically increase the time it takes to run CheckIndex! See {@link Level}.
*/
public void setDoSlowChecks(boolean v) {
doSlowChecks = v;
public void setLevel(int v) {
Level.checkIfLevelInBounds(v);
level = v;
}
/** See {@link #setDoSlowChecks}. */
public boolean doSlowChecks() {
return doSlowChecks;
/** See {@link #setLevel}. */
public int getLevel() {
return level;
}
private boolean failFast;
@ -473,21 +474,6 @@ public final class CheckIndex implements Closeable {
private boolean verbose;
/** See {@link #getChecksumsOnly}. */
public boolean getChecksumsOnly() {
return checksumsOnly;
}
/**
* If true, only validate physical integrity for all files. Note that the returned nested status
* objects (e.g. storedFieldStatus) will be null.
*/
public void setChecksumsOnly(boolean v) {
checksumsOnly = v;
}
private boolean checksumsOnly;
/** Set threadCount used for parallelizing index integrity checking. */
public void setThreadCount(int tc) {
if (tc <= 0) {
@ -586,7 +572,6 @@ public final class CheckIndex implements Closeable {
ensureOpen();
long startNS = System.nanoTime();
SegmentInfos sis = null;
Status result = new Status();
result.dir = dir;
String[] files = dir.listAll();
@ -595,43 +580,115 @@ public final class CheckIndex implements Closeable {
throw new IndexNotFoundException(
"no segments* file found in " + dir + ": files: " + Arrays.toString(files));
}
// https://github.com/apache/lucene/issues/7820: also attempt to open any older commit
// points (segments_N), which will catch certain corruption like missing _N.si files
// for segments not also referenced by the newest commit point (which was already
// loaded, successfully, above). Note that we do not do a deeper check of segments
// referenced ONLY by these older commit points, because such corruption would not
// prevent a new IndexWriter from opening on the newest commit point. But it is still
// corruption, e.g. a reader opened on those old commit points can hit corruption
// exceptions which we (still) will not detect here. Progress, not perfection!
SegmentInfos lastCommit = null;
List<String> allSegmentsFiles = new ArrayList<>();
for (String fileName : files) {
if (fileName.startsWith(IndexFileNames.SEGMENTS)
&& fileName.equals(SegmentInfos.OLD_SEGMENTS_GEN) == false) {
allSegmentsFiles.add(fileName);
}
}
// Sort descending by generation so that we always attempt to read the last commit first. This
// way if an index has a broken last commit AND a broken old commit, we report the last commit
// error first:
allSegmentsFiles.sort(
new Comparator<String>() {
@Override
public int compare(String a, String b) {
long genA = SegmentInfos.generationFromSegmentsFileName(a);
long genB = SegmentInfos.generationFromSegmentsFileName(b);
// reversed natural sort (largest generation first):
return -Long.compare(genA, genB);
}
});
for (String fileName : allSegmentsFiles) {
boolean isLastCommit = fileName.equals(lastSegmentsFile);
SegmentInfos infos;
try {
// Do not use SegmentInfos.read(Directory) since the spooky
// retrying it does is not necessary here (we hold the write lock):
sis =
SegmentInfos.readCommit(
dir, lastSegmentsFile, 0 /* always open old indices if codecs are around */);
// always open old indices if codecs are around
infos = SegmentInfos.readCommit(dir, fileName, 0);
} catch (Throwable t) {
if (failFast) {
throw IOUtils.rethrowAlways(t);
}
String message;
if (isLastCommit) {
message =
"ERROR: could not read latest commit point from segments file \""
+ fileName
+ "\" in directory";
} else {
message =
"ERROR: could not read old (not latest) commit point segments file \""
+ fileName
+ "\" in directory";
}
msg(infoStream, message);
result.missingSegments = true;
if (infoStream != null) {
t.printStackTrace(infoStream);
}
return result;
}
if (isLastCommit) {
// record the latest commit point: we will deeply check all segments referenced by it
lastCommit = infos;
}
}
// we know there is a lastSegmentsFile, so we must've attempted to load it in the above for
// loop. if it failed to load, we threw the exception (failFast == true) or we returned the
// failure (failFast == false). so if we get here, we should always have a valid lastCommit:
assert lastCommit != null;
if (lastCommit == null) {
msg(infoStream, "ERROR: could not read any segments file in directory");
result.missingSegments = true;
if (infoStream != null) t.printStackTrace(infoStream);
return result;
}
if (infoStream != null) {
int maxDoc = 0;
int delCount = 0;
for (SegmentCommitInfo info : sis) {
for (SegmentCommitInfo info : lastCommit) {
maxDoc += info.info.maxDoc();
delCount += info.getDelCount();
}
infoStream.println(
String.format(
infoStream.printf(
Locale.ROOT,
"%.2f%% total deletions; %d documents; %d deletions",
"%.2f%% total deletions; %d documents; %d deletions%n",
100. * delCount / maxDoc,
maxDoc,
delCount));
delCount);
}
// find the oldest and newest segment versions
Version oldest = null;
Version newest = null;
String oldSegs = null;
for (SegmentCommitInfo si : sis) {
for (SegmentCommitInfo si : lastCommit) {
Version version = si.info.getVersion();
if (version == null) {
// pre-3.1 segment
@ -646,14 +703,14 @@ public final class CheckIndex implements Closeable {
}
}
final int numSegments = sis.size();
final String segmentsFileName = sis.getSegmentsFileName();
final int numSegments = lastCommit.size();
final String segmentsFileName = lastCommit.getSegmentsFileName();
result.segmentsFileName = segmentsFileName;
result.numSegments = numSegments;
result.userData = sis.getUserData();
result.userData = lastCommit.getUserData();
String userDataString;
if (sis.getUserData().size() > 0) {
userDataString = " userData=" + sis.getUserData();
if (lastCommit.getUserData().size() > 0) {
userDataString = " userData=" + lastCommit.getUserData();
} else {
userDataString = "";
}
@ -681,7 +738,7 @@ public final class CheckIndex implements Closeable {
+ " "
+ versionString
+ " id="
+ StringHelper.idToString(sis.getId())
+ StringHelper.idToString(lastCommit.getId())
+ userDataString);
if (onlySegments != null) {
@ -696,14 +753,14 @@ public final class CheckIndex implements Closeable {
msg(infoStream, ":");
}
result.newSegments = sis.clone();
result.newSegments = lastCommit.clone();
result.newSegments.clear();
result.maxSegmentName = -1;
// checks segments sequentially
if (executorService == null) {
for (int i = 0; i < numSegments; i++) {
final SegmentCommitInfo info = sis.info(i);
final SegmentCommitInfo info = lastCommit.info(i);
updateMaxSegmentName(result, info);
if (onlySegments != null && !onlySegments.contains(info.info.name)) {
continue;
@ -718,7 +775,7 @@ public final class CheckIndex implements Closeable {
+ info.info.name
+ " maxDoc="
+ info.info.maxDoc());
Status.SegmentInfoStatus segmentInfoStatus = testSegment(sis, info, infoStream);
Status.SegmentInfoStatus segmentInfoStatus = testSegment(lastCommit, info, infoStream);
processSegmentInfoStatusResult(result, info, segmentInfoStatus);
}
@ -729,14 +786,13 @@ public final class CheckIndex implements Closeable {
// checks segments concurrently
List<SegmentCommitInfo> segmentCommitInfos = new ArrayList<>();
for (SegmentCommitInfo sci : sis) {
for (SegmentCommitInfo sci : lastCommit) {
segmentCommitInfos.add(sci);
}
// sort segmentCommitInfos by segment size, as smaller segment tends to finish faster, and
// hence its output can be printed out faster
Collections.sort(
segmentCommitInfos,
segmentCommitInfos.sort(
(info1, info2) -> {
try {
return Long.compare(info1.sizeInBytes(), info2.sizeInBytes());
@ -757,7 +813,7 @@ public final class CheckIndex implements Closeable {
continue;
}
SegmentInfos finalSis = sis;
SegmentInfos finalSis = lastCommit;
ByteArrayOutputStream output = new ByteArrayOutputStream();
PrintStream stream = new PrintStream(output, true, IOUtils.UTF_8);
@ -813,7 +869,7 @@ public final class CheckIndex implements Closeable {
if (0 == result.numBadSegments) {
result.clean = true;
} else
} else {
msg(
infoStream,
"WARNING: "
@ -821,14 +877,16 @@ public final class CheckIndex implements Closeable {
+ " broken segments (containing "
+ result.totLoseDocCount
+ " documents) detected");
}
if (!(result.validCounter = (result.maxSegmentName < sis.counter))) {
result.validCounter = result.maxSegmentName < lastCommit.counter;
if (result.validCounter == false) {
result.clean = false;
result.newSegments.counter = result.maxSegmentName + 1;
msg(
infoStream,
"ERROR: Next segment name counter "
+ sis.counter
+ lastCommit.counter
+ " is not greater than max segment name "
+ result.maxSegmentName);
}
@ -921,7 +979,7 @@ public final class CheckIndex implements Closeable {
msg(infoStream, " diagnostics = " + diagnostics);
}
if (!info.hasDeletions()) {
if (info.hasDeletions() == false) {
msg(infoStream, " no deletions");
segInfoStat.hasDeletions = false;
} else {
@ -960,26 +1018,26 @@ public final class CheckIndex implements Closeable {
toLoseDocCount = numDocs;
if (reader.hasDeletions()) {
if (reader.numDocs() != info.info.maxDoc() - info.getDelCount()) {
if (numDocs != info.info.maxDoc() - info.getDelCount()) {
throw new CheckIndexException(
"delete count mismatch: info="
+ (info.info.maxDoc() - info.getDelCount())
+ " vs reader="
+ reader.numDocs());
+ numDocs);
}
if ((info.info.maxDoc() - reader.numDocs()) > reader.maxDoc()) {
if ((info.info.maxDoc() - numDocs) > reader.maxDoc()) {
throw new CheckIndexException(
"too many deleted docs: maxDoc()="
+ reader.maxDoc()
+ " vs del count="
+ (info.info.maxDoc() - reader.numDocs()));
+ (info.info.maxDoc() - numDocs));
}
if (info.info.maxDoc() - reader.numDocs() != info.getDelCount()) {
if (info.info.maxDoc() - numDocs != info.getDelCount()) {
throw new CheckIndexException(
"delete count mismatch: info="
+ info.getDelCount()
+ " vs reader="
+ (info.info.maxDoc() - reader.numDocs()));
+ (info.info.maxDoc() - numDocs));
}
} else {
if (info.getDelCount() != 0) {
@ -987,11 +1045,10 @@ public final class CheckIndex implements Closeable {
"delete count mismatch: info="
+ info.getDelCount()
+ " vs reader="
+ (info.info.maxDoc() - reader.numDocs()));
+ (info.info.maxDoc() - numDocs));
}
}
if (checksumsOnly == false) {
if (level >= Level.MIN_LEVEL_FOR_INTEGRITY_CHECKS) {
// Test Livedocs
segInfoStat.liveDocStatus = testLiveDocs(reader, infoStream, failFast);
@ -1002,15 +1059,14 @@ public final class CheckIndex implements Closeable {
segInfoStat.fieldNormStatus = testFieldNorms(reader, infoStream, failFast);
// Test the Term Index
segInfoStat.termIndexStatus =
testPostings(reader, infoStream, verbose, doSlowChecks, failFast);
segInfoStat.termIndexStatus = testPostings(reader, infoStream, verbose, level, failFast);
// Test Stored Fields
segInfoStat.storedFieldStatus = testStoredFields(reader, infoStream, failFast);
// Test Term Vectors
segInfoStat.termVectorStatus =
testTermVectors(reader, infoStream, verbose, doSlowChecks, failFast);
testTermVectors(reader, infoStream, verbose, level, failFast);
// Test Docvalues
segInfoStat.docValuesStatus = testDocValues(reader, infoStream, failFast);
@ -1213,7 +1269,7 @@ public final class CheckIndex implements Closeable {
if (liveDocs != null) {
// it's ok for it to be non-null here, as long as none are set right?
for (int j = 0; j < liveDocs.length(); j++) {
if (!liveDocs.get(j)) {
if (liveDocs.get(j) == false) {
throw new CheckIndexException(
"liveDocs mismatch: info says no deletions but doc " + j + " is deleted.");
}
@ -1341,7 +1397,7 @@ public final class CheckIndex implements Closeable {
boolean isVectors,
PrintStream infoStream,
boolean verbose,
boolean doSlowChecks)
int level)
throws IOException {
// TODO: we should probably return our own stats thing...?!
long startNS;
@ -1450,7 +1506,7 @@ public final class CheckIndex implements Closeable {
+ hasFreqs);
}
if (!isVectors) {
if (isVectors == false) {
final boolean expectedHasPositions =
fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
if (hasPositions != expectedHasPositions) {
@ -1810,7 +1866,7 @@ public final class CheckIndex implements Closeable {
// free-for-all before?
// but for offsets in the postings lists these checks are fine: they were always
// enforced by IndexWriter
if (!isVectors) {
if (isVectors == false) {
if (startOffset < 0) {
throw new CheckIndexException(
"term "
@ -1924,14 +1980,13 @@ public final class CheckIndex implements Closeable {
}
// Checking score blocks is heavy, we only do it on long postings lists, on every 1024th
// term
// or if slow checks are enabled.
if (doSlowChecks
// term or if slow checks are enabled.
if (level >= Level.MIN_LEVEL_FOR_SLOW_CHECKS
|| docFreq > 1024
|| (status.termCount + status.delTermCount) % 1024 == 0) {
// First check max scores and block uptos
// But only if slok checks are enabled since we visit all docs
if (doSlowChecks) {
// But only if slow checks are enabled since we visit all docs
if (level >= Level.MIN_LEVEL_FOR_SLOW_CHECKS) {
int max = -1;
int maxFreq = 0;
ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS);
@ -1998,9 +2053,9 @@ public final class CheckIndex implements Closeable {
Impacts impacts = impactsEnum.getImpacts();
checkImpacts(impacts, doc);
maxFreq = Integer.MAX_VALUE;
for (int level = 0; level < impacts.numLevels(); ++level) {
if (impacts.getDocIdUpTo(level) >= max) {
List<Impact> perLevelImpacts = impacts.getImpacts(level);
for (int impactsLevel = 0; impactsLevel < impacts.numLevels(); ++impactsLevel) {
if (impacts.getDocIdUpTo(impactsLevel) >= max) {
List<Impact> perLevelImpacts = impacts.getImpacts(impactsLevel);
maxFreq = perLevelImpacts.get(perLevelImpacts.size() - 1).freq;
break;
}
@ -2040,9 +2095,9 @@ public final class CheckIndex implements Closeable {
Impacts impacts = impactsEnum.getImpacts();
checkImpacts(impacts, doc);
maxFreq = Integer.MAX_VALUE;
for (int level = 0; level < impacts.numLevels(); ++level) {
if (impacts.getDocIdUpTo(level) >= max) {
List<Impact> perLevelImpacts = impacts.getImpacts(level);
for (int impactsLevel = 0; impactsLevel < impacts.numLevels(); ++impactsLevel) {
if (impacts.getDocIdUpTo(impactsLevel) >= max) {
List<Impact> perLevelImpacts = impacts.getImpacts(impactsLevel);
maxFreq = perLevelImpacts.get(perLevelImpacts.size() - 1).freq;
break;
}
@ -2151,7 +2206,7 @@ public final class CheckIndex implements Closeable {
+ " doesn't have terms according to postings but has a norm value that is not zero: "
+ Long.toUnsignedString(norm));
}
} else if (norm == 0 && visitedDocs.get(doc)) {
} else if (visitedDocs.get(doc)) {
throw new CheckIndexException(
"Document "
+ doc
@ -2307,7 +2362,7 @@ public final class CheckIndex implements Closeable {
static void checkImpacts(Impacts impacts, int lastTarget) {
final int numLevels = impacts.numLevels();
if (numLevels < 1) {
throw new CheckIndexException("The number of levels must be >= 1, got " + numLevels);
throw new CheckIndexException("The number of impact levels must be >= 1, got " + numLevels);
}
int docIdUpTo0 = impacts.getDocIdUpTo(0);
@ -2319,17 +2374,17 @@ public final class CheckIndex implements Closeable {
+ lastTarget);
}
for (int level = 1; level < numLevels; ++level) {
int docIdUpTo = impacts.getDocIdUpTo(level);
int previousDocIdUpTo = impacts.getDocIdUpTo(level - 1);
for (int impactsLevel = 1; impactsLevel < numLevels; ++impactsLevel) {
int docIdUpTo = impacts.getDocIdUpTo(impactsLevel);
int previousDocIdUpTo = impacts.getDocIdUpTo(impactsLevel - 1);
if (docIdUpTo < previousDocIdUpTo) {
throw new CheckIndexException(
"Decreasing return for getDocIdUpTo: level "
+ (level - 1)
+ (impactsLevel - 1)
+ " returned "
+ previousDocIdUpTo
+ " but level "
+ level
+ impactsLevel
+ " returned "
+ docIdUpTo
+ " for target "
@ -2337,10 +2392,10 @@ public final class CheckIndex implements Closeable {
}
}
for (int level = 0; level < numLevels; ++level) {
List<Impact> perLevelImpacts = impacts.getImpacts(level);
for (int impactsLevel = 0; impactsLevel < numLevels; ++impactsLevel) {
List<Impact> perLevelImpacts = impacts.getImpacts(impactsLevel);
if (perLevelImpacts.isEmpty()) {
throw new CheckIndexException("Got empty list of impacts on level " + level);
throw new CheckIndexException("Got empty list of impacts on level " + impactsLevel);
}
Impact first = perLevelImpacts.get(0);
if (first.freq < 1) {
@ -2358,9 +2413,9 @@ public final class CheckIndex implements Closeable {
"Impacts are not ordered or contain dups, got " + previous + " then " + impact);
}
}
if (level > 0) {
// Make sure that impacts at level N trigger better scores than an level N-1
Iterator<Impact> previousIt = impacts.getImpacts(level - 1).iterator();
if (impactsLevel > 0) {
// Make sure that impacts at level N trigger better scores than at level N-1
Iterator<Impact> previousIt = impacts.getImpacts(impactsLevel - 1).iterator();
previous = previousIt.next();
Iterator<Impact> it = perLevelImpacts.iterator();
Impact impact = it.next();
@ -2376,9 +2431,9 @@ public final class CheckIndex implements Closeable {
"Found impact "
+ previous
+ " on level "
+ (level - 1)
+ (impactsLevel - 1)
+ " but no impact on level "
+ level
+ impactsLevel
+ " triggers a better score: "
+ perLevelImpacts);
}
@ -2395,7 +2450,7 @@ public final class CheckIndex implements Closeable {
*/
public static Status.TermIndexStatus testPostings(CodecReader reader, PrintStream infoStream)
throws IOException {
return testPostings(reader, infoStream, false, true, false);
return testPostings(reader, infoStream, false, Level.MIN_LEVEL_FOR_SLOW_CHECKS, false);
}
/**
@ -2404,15 +2459,11 @@ public final class CheckIndex implements Closeable {
* @lucene.experimental
*/
public static Status.TermIndexStatus testPostings(
CodecReader reader,
PrintStream infoStream,
boolean verbose,
boolean doSlowChecks,
boolean failFast)
CodecReader reader, PrintStream infoStream, boolean verbose, int level, boolean failFast)
throws IOException {
// TODO: we should go and verify term vectors match, if
// doSlowChecks is on...
// TODO: we should go and verify term vectors match, if the Level is high enough to
// include slow checks
Status.TermIndexStatus status;
final int maxDoc = reader.maxDoc();
@ -2443,7 +2494,7 @@ public final class CheckIndex implements Closeable {
false,
infoStream,
verbose,
doSlowChecks);
level);
} catch (Throwable e) {
if (failFast) {
throw IOUtils.rethrowAlways(e);
@ -3132,7 +3183,7 @@ public final class CheckIndex implements Closeable {
for (FieldInfo fieldInfo : reader.getFieldInfos()) {
if (fieldInfo.getDocValuesType() != DocValuesType.NONE) {
status.totalValueFields++;
checkDocValues(fieldInfo, dvReader, reader.maxDoc(), infoStream, status);
checkDocValues(fieldInfo, dvReader, status);
}
}
@ -3162,11 +3213,11 @@ public final class CheckIndex implements Closeable {
}
@FunctionalInterface
private static interface DocValuesIteratorSupplier {
private interface DocValuesIteratorSupplier {
DocValuesIterator get(FieldInfo fi) throws IOException;
}
private static void checkDVIterator(FieldInfo fi, int maxDoc, DocValuesIteratorSupplier producer)
private static void checkDVIterator(FieldInfo fi, DocValuesIteratorSupplier producer)
throws IOException {
String field = fi.name;
@ -3284,7 +3335,7 @@ public final class CheckIndex implements Closeable {
}
private static void checkBinaryDocValues(
String fieldName, int maxDoc, BinaryDocValues bdv, BinaryDocValues bdv2) throws IOException {
String fieldName, BinaryDocValues bdv, BinaryDocValues bdv2) throws IOException {
if (bdv.docID() != -1) {
throw new CheckIndexException(
"binary dv iterator for field: "
@ -3309,7 +3360,7 @@ public final class CheckIndex implements Closeable {
}
private static void checkSortedDocValues(
String fieldName, int maxDoc, SortedDocValues dv, SortedDocValues dv2) throws IOException {
String fieldName, SortedDocValues dv, SortedDocValues dv2) throws IOException {
if (dv.docID() != -1) {
throw new CheckIndexException(
"sorted dv iterator for field: "
@ -3373,8 +3424,7 @@ public final class CheckIndex implements Closeable {
}
private static void checkSortedSetDocValues(
String fieldName, int maxDoc, SortedSetDocValues dv, SortedSetDocValues dv2)
throws IOException {
String fieldName, SortedSetDocValues dv, SortedSetDocValues dv2) throws IOException {
final long maxOrd = dv.getValueCount() - 1;
LongBitSet seenOrds = new LongBitSet(dv.getValueCount());
long maxOrd2 = -1;
@ -3470,7 +3520,7 @@ public final class CheckIndex implements Closeable {
}
private static void checkSortedNumericDocValues(
String fieldName, int maxDoc, SortedNumericDocValues ndv, SortedNumericDocValues ndv2)
String fieldName, SortedNumericDocValues ndv, SortedNumericDocValues ndv2)
throws IOException {
if (ndv.docID() != -1) {
throw new CheckIndexException(
@ -3539,38 +3589,32 @@ public final class CheckIndex implements Closeable {
}
private static void checkDocValues(
FieldInfo fi,
DocValuesProducer dvReader,
int maxDoc,
PrintStream infoStream,
DocValuesStatus status)
throws Exception {
FieldInfo fi, DocValuesProducer dvReader, DocValuesStatus status) throws Exception {
switch (fi.getDocValuesType()) {
case SORTED:
status.totalSortedFields++;
checkDVIterator(fi, maxDoc, dvReader::getSorted);
checkSortedDocValues(fi.name, maxDoc, dvReader.getSorted(fi), dvReader.getSorted(fi));
checkDVIterator(fi, dvReader::getSorted);
checkSortedDocValues(fi.name, dvReader.getSorted(fi), dvReader.getSorted(fi));
break;
case SORTED_NUMERIC:
status.totalSortedNumericFields++;
checkDVIterator(fi, maxDoc, dvReader::getSortedNumeric);
checkDVIterator(fi, dvReader::getSortedNumeric);
checkSortedNumericDocValues(
fi.name, maxDoc, dvReader.getSortedNumeric(fi), dvReader.getSortedNumeric(fi));
fi.name, dvReader.getSortedNumeric(fi), dvReader.getSortedNumeric(fi));
break;
case SORTED_SET:
status.totalSortedSetFields++;
checkDVIterator(fi, maxDoc, dvReader::getSortedSet);
checkSortedSetDocValues(
fi.name, maxDoc, dvReader.getSortedSet(fi), dvReader.getSortedSet(fi));
checkDVIterator(fi, dvReader::getSortedSet);
checkSortedSetDocValues(fi.name, dvReader.getSortedSet(fi), dvReader.getSortedSet(fi));
break;
case BINARY:
status.totalBinaryFields++;
checkDVIterator(fi, maxDoc, dvReader::getBinary);
checkBinaryDocValues(fi.name, maxDoc, dvReader.getBinary(fi), dvReader.getBinary(fi));
checkDVIterator(fi, dvReader::getBinary);
checkBinaryDocValues(fi.name, dvReader.getBinary(fi), dvReader.getBinary(fi));
break;
case NUMERIC:
status.totalNumericFields++;
checkDVIterator(fi, maxDoc, dvReader::getNumeric);
checkDVIterator(fi, dvReader::getNumeric);
checkNumericDocValues(fi.name, dvReader.getNumeric(fi), dvReader.getNumeric(fi));
break;
case NONE:
@ -3586,7 +3630,7 @@ public final class CheckIndex implements Closeable {
*/
public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream)
throws IOException {
return testTermVectors(reader, infoStream, false, false, false);
return testTermVectors(reader, infoStream, false, Level.MIN_LEVEL_FOR_INTEGRITY_CHECKS, false);
}
/**
@ -3595,11 +3639,7 @@ public final class CheckIndex implements Closeable {
* @lucene.experimental
*/
public static Status.TermVectorStatus testTermVectors(
CodecReader reader,
PrintStream infoStream,
boolean verbose,
boolean doSlowChecks,
boolean failFast)
CodecReader reader, PrintStream infoStream, boolean verbose, int level, boolean failFast)
throws IOException {
long startNS = System.nanoTime();
final Status.TermVectorStatus status = new Status.TermVectorStatus();
@ -3612,14 +3652,14 @@ public final class CheckIndex implements Closeable {
PostingsEnum postings = null;
// Only used if doSlowChecks is true:
// Only used if the Level is high enough to include slow checks:
PostingsEnum postingsDocs = null;
final Bits liveDocs = reader.getLiveDocs();
FieldsProducer postingsFields;
// TODO: testTermsIndex
if (doSlowChecks) {
if (level >= Level.MIN_LEVEL_FOR_SLOW_CHECKS) {
postingsFields = reader.getPostingsReader();
if (postingsFields != null) {
postingsFields = postingsFields.getMergeInstance();
@ -3643,8 +3683,7 @@ public final class CheckIndex implements Closeable {
if (tfv != null) {
// First run with no deletions:
checkFields(
tfv, null, 1, fieldInfos, null, false, true, infoStream, verbose, doSlowChecks);
checkFields(tfv, null, 1, fieldInfos, null, false, true, infoStream, verbose, level);
// Only agg stats if the doc is live:
final boolean doStats = liveDocs == null || liveDocs.get(j);
@ -3660,7 +3699,7 @@ public final class CheckIndex implements Closeable {
// Make sure FieldInfo thinks this field is vector'd:
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
if (!fieldInfo.hasVectors()) {
if (fieldInfo.hasVectors() == false) {
throw new CheckIndexException(
"docID="
+ j
@ -3669,7 +3708,7 @@ public final class CheckIndex implements Closeable {
+ " but FieldInfo has storeTermVector=false");
}
if (doSlowChecks) {
if (level >= Level.MIN_LEVEL_FOR_SLOW_CHECKS) {
Terms terms = tfv.terms(field);
TermsEnum termsEnum = terms.iterator();
final boolean postingsHasFreq =
@ -3696,7 +3735,7 @@ public final class CheckIndex implements Closeable {
postings = termsEnum.postings(postings, PostingsEnum.ALL);
assert postings != null;
if (!postingsTermsEnum.seekExact(term)) {
if (postingsTermsEnum.seekExact(term) == false) {
throw new CheckIndexException(
"vector term="
+ term
@ -3852,7 +3891,7 @@ public final class CheckIndex implements Closeable {
+ " but postings does not.");
}
BytesRef postingsPayload = postingsDocs.getPayload();
if (!payload.equals(postingsPayload)) {
if (payload.equals(postingsPayload) == false) {
throw new CheckIndexException(
"vector term="
+ term
@ -3972,9 +4011,8 @@ public final class CheckIndex implements Closeable {
/** Run-time configuration options for CheckIndex commands. */
public static class Options {
boolean doExorcise = false;
boolean doSlowChecks = false;
boolean verbose = false;
boolean doChecksumsOnly = false;
int level = Level.DEFAULT_VALUE;
int threadCount;
List<String> onlySegments = new ArrayList<>();
String indexPath = null;
@ -4011,9 +4049,10 @@ public final class CheckIndex implements Closeable {
return 1;
}
if (!assertsOn())
if (assertsOn() == false) {
System.out.println(
"\nNOTE: testing will be more thorough if you run java with '-ea:org.apache.lucene...', so assertions are enabled");
}
System.out.println("\nOpening index @ " + opts.indexPath + "\n");
Directory directory = null;
@ -4037,6 +4076,42 @@ public final class CheckIndex implements Closeable {
}
}
/**
 * Constants describing the values accepted by CheckIndex's -level parameter, along with the
 * thresholds at which each group of checks becomes active. The constructor is private; only the
 * static members are meant to be used.
 */
public static class Level {
  /** Lowest level that is accepted. */
  public static final int MIN_VALUE = 1;

  /** Highest level that is accepted. */
  public static final int MAX_VALUE = 3;

  /** Level that applies when none is given explicitly. */
  public static final int DEFAULT_VALUE = MIN_VALUE;

  /** Checksum checks run at this level and above. */
  public static final int MIN_LEVEL_FOR_CHECKSUM_CHECKS = 1;

  /** Integrity checks run at this level and above. */
  public static final int MIN_LEVEL_FOR_INTEGRITY_CHECKS = 2;

  /** Slow checks run at this level and above. */
  public static final int MIN_LEVEL_FOR_SLOW_CHECKS = 3;

  private Level() {}

  /**
   * Validates that {@code levelVal} lies within [{@link #MIN_VALUE}, {@link #MAX_VALUE}].
   *
   * @param levelVal the level to validate
   * @throws IllegalArgumentException if {@code levelVal} falls outside that range
   */
  public static void checkIfLevelInBounds(int levelVal) throws IllegalArgumentException {
    final boolean inBounds = MIN_VALUE <= levelVal && levelVal <= MAX_VALUE;
    if (inBounds) {
      return;
    }
    final String message =
        String.format(
            Locale.ROOT,
            "ERROR: given value: '%d' for -level option is out of bounds. Please use a value from '%d'->'%d'",
            levelVal,
            MIN_VALUE,
            MAX_VALUE);
    throw new IllegalArgumentException(message);
  }
}
/**
* Parse command line args into fields
*
@ -4051,15 +4126,29 @@ public final class CheckIndex implements Closeable {
int i = 0;
while (i < args.length) {
String arg = args[i];
if ("-fast".equals(arg)) {
opts.doChecksumsOnly = true;
if ("-level".equals(arg)) {
if (i == args.length - 1) {
throw new IllegalArgumentException("ERROR: missing value for -level option");
}
i++;
int level = Integer.parseInt(args[i]);
Level.checkIfLevelInBounds(level);
opts.level = level;
} else if ("-fast".equals(arg)) {
// Deprecated. Remove in Lucene 11.
System.err.println(
"-fast is deprecated, use '-level 1' for explicitly verifying file checksums only. This is also now the default "
+ "behaviour!");
} else if ("-slow".equals(arg)) {
// Deprecated. Remove in Lucene 11.
System.err.println("-slow is deprecated, use '-level 3' instead for slow checks");
opts.level = Level.MIN_LEVEL_FOR_SLOW_CHECKS;
} else if ("-exorcise".equals(arg)) {
opts.doExorcise = true;
} else if ("-crossCheckTermVectors".equals(arg)) {
System.err.println("-crossCheckTermVectors is deprecated, use -slow instead");
opts.doSlowChecks = true;
} else if ("-slow".equals(arg)) {
opts.doSlowChecks = true;
// Deprecated. Remove in Lucene 11.
System.err.println("-crossCheckTermVectors is deprecated, use '-level 3' instead");
opts.level = Level.MAX_VALUE;
} else if (arg.equals("-verbose")) {
opts.verbose = true;
} else if (arg.equals("-segment")) {
@ -4096,11 +4185,13 @@ public final class CheckIndex implements Closeable {
if (opts.indexPath == null) {
throw new IllegalArgumentException(
"\nERROR: index path not specified"
+ "\nUsage: java org.apache.lucene.index.CheckIndex pathToIndex [-exorcise] [-slow] [-segment X] [-segment Y] [-threadCount X] [-dir-impl X]\n"
+ "\nUsage: java org.apache.lucene.index.CheckIndex pathToIndex [-exorcise] [-level X] [-segment X] [-segment Y] [-threadCount X] [-dir-impl X]\n"
+ "\n"
+ " -exorcise: actually write a new segments_N file, removing any problematic segments\n"
+ " -fast: just verify file checksums, omitting logical integrity checks\n"
+ " -slow: do additional slow checks; THIS IS VERY SLOW!\n"
+ " -level X: sets the detail level of the check. The higher the value, the more checks are done.\n"
+ " 1 - (Default) Checksum checks only.\n"
+ " 2 - All level 1 checks + logical integrity checks.\n"
+ " 3 - All level 2 checks + slow checks.\n"
+ " -codec X: when exorcising, codec to write the new segments_N file with\n"
+ " -verbose: print additional details\n"
+ " -segment X: only check the specified segments. This can be specified multiple\n"
@ -4115,7 +4206,8 @@ public final class CheckIndex implements Closeable {
+ "If no package is specified the "
+ FSDirectory.class.getPackage().getName()
+ " package will be used.\n"
+ "\n"
+ "CheckIndex only verifies file checksums as default.\n"
+ "Use -level with value of '2' or higher if you also want to check segment file contents.\n\n"
+ "**WARNING**: -exorcise *LOSES DATA*. This should only be used on an emergency basis as it will cause\n"
+ "documents (perhaps many) to be permanently removed from the index. Always make\n"
+ "a backup copy of your index before running this! Do not run this tool on an index\n"
@ -4137,10 +4229,6 @@ public final class CheckIndex implements Closeable {
throw new IllegalArgumentException("ERROR: cannot specify both -exorcise and -segment");
}
if (opts.doChecksumsOnly && opts.doSlowChecks) {
throw new IllegalArgumentException("ERROR: cannot specify both -fast and -slow");
}
return opts;
}
@ -4151,8 +4239,7 @@ public final class CheckIndex implements Closeable {
* @return 0 iff the index is clean, 1 otherwise
*/
public int doCheck(Options opts) throws IOException, InterruptedException {
setDoSlowChecks(opts.doSlowChecks);
setChecksumsOnly(opts.doChecksumsOnly);
setLevel(opts.level);
setInfoStream(opts.out, opts.verbose);
// user provided thread count via command line argument, overriding the default with user
// provided value
@ -4166,8 +4253,8 @@ public final class CheckIndex implements Closeable {
return 1;
}
if (!result.clean) {
if (!opts.doExorcise) {
if (result.clean == false) {
if (opts.doExorcise == false) {
opts.out.println(
"WARNING: would write new segments file, and "
+ result.totLoseDocCount

View File

@ -270,7 +270,6 @@ final class FieldUpdatesBuffer {
static class BufferedUpdate {
private BufferedUpdate() {}
;
/** the max document ID this update should be applied to */
int docUpTo;

View File

@ -33,6 +33,7 @@ import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;
@ -55,6 +56,8 @@ import org.apache.lucene.index.DocValuesUpdate.BinaryDocValuesUpdate;
import org.apache.lucene.index.DocValuesUpdate.NumericDocValuesUpdate;
import org.apache.lucene.index.FieldInfos.FieldNumbers;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.MergePolicy.MergeReader;
import org.apache.lucene.index.Sorter.DocMap;
import org.apache.lucene.internal.tests.IndexPackageAccess;
import org.apache.lucene.internal.tests.IndexWriterAccess;
import org.apache.lucene.internal.tests.TestSecrets;
@ -3413,8 +3416,20 @@ public class IndexWriter
Collections.emptyMap(),
config.getIndexSort());
List<CodecReader> readers =
merge.getMergeReader().stream().map(r -> r.codecReader).collect(Collectors.toList());
List<CodecReader> readers = new ArrayList<>();
for (MergeReader mr : merge.getMergeReader()) {
CodecReader reader = merge.wrapForMerge(mr.codecReader);
readers.add(reader);
}
if (config.getIndexSort() == null && readers.isEmpty() == false) {
CodecReader mergedReader = SlowCompositeCodecReaderWrapper.wrap(readers);
DocMap docMap = merge.reorder(mergedReader, directory);
if (docMap != null) {
readers = Collections.singletonList(SortingCodecReader.wrap(mergedReader, docMap, null));
}
}
SegmentMerger merger =
new SegmentMerger(readers, segInfo, infoStream, trackingDir, globalFieldNumberMap, context);
@ -3464,6 +3479,8 @@ public class IndexWriter
merge.getMergeInfo().info.setUseCompoundFile(true);
}
merge.setMergeInfo(merge.info);
// Have codec write SegmentInfo. Must do this after
// creating CFS so that 1) .si isn't slurped into CFS,
// and 2) .si reflects useCompoundFile=true change
@ -3791,7 +3808,7 @@ public class IndexWriter
new OneMergeWrappingMergePolicy(
config.getMergePolicy(),
toWrap ->
new MergePolicy.OneMerge(toWrap.segments) {
new MergePolicy.OneMerge(toWrap) {
SegmentCommitInfo origInfo;
final AtomicBoolean onlyOnce = new AtomicBoolean(false);
@ -3890,6 +3907,18 @@ public class IndexWriter
public CodecReader wrapForMerge(CodecReader reader) throws IOException {
return toWrap.wrapForMerge(reader); // must delegate
}
// Hands the reorder request straight to the wrapped merge and returns whatever it returns,
// so the wrapper adds no reordering behavior of its own.
@Override
public Sorter.DocMap reorder(CodecReader reader, Directory dir)
throws IOException {
return toWrap.reorder(reader, dir); // must delegate
}
// Records the merge info on this wrapper via the superclass, then passes the same info on to
// the wrapped merge so both objects see it.
@Override
public void setMergeInfo(SegmentCommitInfo info) {
super.setMergeInfo(info);
toWrap.setMergeInfo(info);
}
}),
trigger,
UNBOUNDED_MAX_MERGE_SEGMENTS);
@ -4312,7 +4341,7 @@ public class IndexWriter
* merge.info). If no deletes were flushed, no new deletes file is saved.
*/
private synchronized ReadersAndUpdates commitMergedDeletesAndUpdates(
MergePolicy.OneMerge merge, MergeState mergeState) throws IOException {
MergePolicy.OneMerge merge, MergeState.DocMap[] docMaps) throws IOException {
mergeFinishedGen.incrementAndGet();
@ -4336,7 +4365,7 @@ public class IndexWriter
boolean anyDVUpdates = false;
assert sourceSegments.size() == mergeState.docMaps.length;
assert sourceSegments.size() == docMaps.length;
for (int i = 0; i < sourceSegments.size(); i++) {
SegmentCommitInfo info = sourceSegments.get(i);
minGen = Math.min(info.getBufferedDeletesGen(), minGen);
@ -4346,12 +4375,11 @@ public class IndexWriter
// the pool:
assert rld != null : "seg=" + info.info.name;
MergeState.DocMap segDocMap = mergeState.docMaps[i];
MergeState.DocMap segDocMap = docMaps[i];
carryOverHardDeletes(
mergedDeletesAndUpdates,
maxDoc,
mergeState.liveDocs[i],
merge.getMergeReader().get(i).hardLiveDocs,
rld.getHardLiveDocs(),
segDocMap);
@ -4454,26 +4482,21 @@ public class IndexWriter
private static void carryOverHardDeletes(
ReadersAndUpdates mergedReadersAndUpdates,
int maxDoc,
Bits mergeLiveDocs, // the liveDocs used to build the segDocMaps
Bits prevHardLiveDocs, // the hard deletes when the merge reader was pulled
Bits currentHardLiveDocs, // the current hard deletes
MergeState.DocMap segDocMap)
throws IOException {
assert mergeLiveDocs == null || mergeLiveDocs.length() == maxDoc;
// if we mix soft and hard deletes we need to make sure that we only carry over deletes
// that were not deleted before. Otherwise the segDocMap doesn't contain a mapping.
// yet this is also required if any MergePolicy modifies the liveDocs since this is
// what the segDocMap is build on.
final IntPredicate carryOverDelete =
mergeLiveDocs == null || mergeLiveDocs == prevHardLiveDocs
? docId -> currentHardLiveDocs.get(docId) == false
: docId -> mergeLiveDocs.get(docId) && currentHardLiveDocs.get(docId) == false;
docId -> segDocMap.get(docId) != -1 && currentHardLiveDocs.get(docId) == false;
if (prevHardLiveDocs != null) {
// If we had deletions on starting the merge we must
// still have deletions now:
assert currentHardLiveDocs != null;
assert mergeLiveDocs != null;
assert prevHardLiveDocs.length() == maxDoc;
assert currentHardLiveDocs.length() == maxDoc;
@ -4516,7 +4539,7 @@ public class IndexWriter
}
@SuppressWarnings("try")
private synchronized boolean commitMerge(MergePolicy.OneMerge merge, MergeState mergeState)
private synchronized boolean commitMerge(MergePolicy.OneMerge merge, MergeState.DocMap[] docMaps)
throws IOException {
merge.onMergeComplete();
testPoint("startCommitMerge");
@ -4559,7 +4582,7 @@ public class IndexWriter
}
final ReadersAndUpdates mergedUpdates =
merge.info.info.maxDoc() == 0 ? null : commitMergedDeletesAndUpdates(merge, mergeState);
merge.info.info.maxDoc() == 0 ? null : commitMergedDeletesAndUpdates(merge, docMaps);
// If the doc store we are using has been closed and
// is in now compound format (but wasn't when we
@ -5163,12 +5186,57 @@ public class IndexWriter
}
mergeReaders.add(wrappedReader);
}
MergeState.DocMap[] reorderDocMaps = null;
if (config.getIndexSort() == null) {
// Create a merged view of the input segments. This effectively does the merge.
CodecReader mergedView = SlowCompositeCodecReaderWrapper.wrap(mergeReaders);
Sorter.DocMap docMap = merge.reorder(mergedView, directory);
if (docMap != null) {
reorderDocMaps = new MergeState.DocMap[mergeReaders.size()];
int docBase = 0;
int i = 0;
for (CodecReader reader : mergeReaders) {
final int currentDocBase = docBase;
reorderDocMaps[i] =
docID -> {
Objects.checkIndex(docID, reader.maxDoc());
return docMap.oldToNew(currentDocBase + docID);
};
i++;
docBase += reader.maxDoc();
}
// This makes merging more expensive as it disables some bulk merging optimizations, so
// only do this if a non-null DocMap is returned.
mergeReaders =
Collections.singletonList(SortingCodecReader.wrap(mergedView, docMap, null));
}
}
final SegmentMerger merger =
new SegmentMerger(
mergeReaders, merge.info.info, infoStream, dirWrapper, globalFieldNumberMap, context);
merge.info.setSoftDelCount(Math.toIntExact(softDeleteCount.get()));
merge.checkAborted();
MergeState mergeState = merger.mergeState;
MergeState.DocMap[] docMaps;
if (reorderDocMaps == null) {
docMaps = mergeState.docMaps;
} else {
// Since the reader was reordered, we passed a merged view to MergeState and from its
// perspective there is a single input segment to the merge and the
// SlowCompositeCodecReaderWrapper is effectively doing the merge.
assert mergeState.docMaps.length == 1
: "Got " + mergeState.docMaps.length + " docMaps, but expected 1";
MergeState.DocMap compactionDocMap = mergeState.docMaps[0];
docMaps = new MergeState.DocMap[reorderDocMaps.length];
for (int i = 0; i < docMaps.length; ++i) {
MergeState.DocMap reorderDocMap = reorderDocMaps[i];
docMaps[i] = docID -> compactionDocMap.get(reorderDocMap.get(docID));
}
}
merge.mergeStartNS = System.nanoTime();
// This is where all the work happens:
@ -5176,7 +5244,6 @@ public class IndexWriter
merger.merge();
}
MergeState mergeState = merger.mergeState;
assert mergeState.segmentInfo == merge.info.info;
merge.info.info.setFiles(new HashSet<>(dirWrapper.getCreatedFiles()));
Codec codec = config.getCodec();
@ -5229,7 +5296,7 @@ public class IndexWriter
// Merge would produce a 0-doc segment, so we do nothing except commit the merge to remove
// all the 0-doc segments that we "merged":
assert merge.info.info.maxDoc() == 0;
success = commitMerge(merge, mergeState);
success = commitMerge(merge, docMaps);
return 0;
}
@ -5309,6 +5376,8 @@ public class IndexWriter
success = false;
}
merge.setMergeInfo(merge.info);
// Have codec write SegmentInfo. Must do this after
// creating CFS so that 1) .si isn't slurped into CFS,
// and 2) .si reflects useCompoundFile=true change
@ -5352,7 +5421,7 @@ public class IndexWriter
}
}
if (!commitMerge(merge, mergeState)) {
if (!commitMerge(merge, docMaps)) {
// commitMerge will return false if this merge was
// aborted
return 0;

View File

@ -255,6 +255,15 @@ public abstract class MergePolicy {
usesPooledReaders = false;
}
/**
 * Constructor for wrapping.
 *
 * <p>The segments, merge readers, total max doc count and pooled-readers flag are taken from
 * {@code oneMerge}; a new {@link OneMergeProgress} is created for this instance.
 *
 * @param oneMerge the merge to wrap
 */
protected OneMerge(OneMerge oneMerge) {
  // A fresh progress object is created here instead of reusing the wrapped merge's one.
  this.mergeProgress = new OneMergeProgress();
  this.usesPooledReaders = oneMerge.usesPooledReaders;
  this.totalMaxDoc = oneMerge.totalMaxDoc;
  this.mergeReaders = oneMerge.mergeReaders;
  this.segments = oneMerge.segments;
}
/**
* Called by {@link IndexWriter} after the merge started and from the thread that will be
* executing the merge.
@ -288,11 +297,32 @@ public abstract class MergePolicy {
}
}
/** Wrap the reader in order to add/remove information to the merged segment. */
/**
* Wrap a reader prior to merging in order to add/remove fields or documents.
*
* <p>The default implementation returns {@code reader} unchanged.
*
* <p><b>NOTE:</b> It is illegal to reorder doc IDs here, use {@link
* #reorder(CodecReader,Directory)} instead.
*
* @param reader the reader to wrap prior to merging
* @return the wrapped reader; this default implementation returns {@code reader} itself
*/
public CodecReader wrapForMerge(CodecReader reader) throws IOException {
return reader;
}
/**
* Override this method if you wish to renumber doc IDs. This method will be called when index
* sorting is disabled on a merged view of the {@link OneMerge}. A {@code null} return value
* indicates that doc IDs should not be reordered.
*
* <p>The default implementation returns {@code null}.
*
* <p><b>NOTE:</b> Returning a non-null value here disables several optimizations and increases
* the merging overhead.
*
* @param reader The reader to reorder.
* @param dir The {@link Directory} of the index, which may be used to create temporary files.
* @return the doc ID mapping to apply, or {@code null} if doc IDs should not be reordered
* @lucene.experimental
*/
public Sorter.DocMap reorder(CodecReader reader, Directory dir) throws IOException {
return null;
}
/**
* Expert: Sets the {@link SegmentCommitInfo} of the merged segment. Allows sub-classes to e.g.
* {@link SegmentInfo#addDiagnostics(Map) add diagnostic} properties.
@ -355,11 +385,7 @@ public abstract class MergePolicy {
* not indicate the number of documents after the merge.
*/
public int totalNumDocs() {
int total = 0;
for (SegmentCommitInfo info : segments) {
total += info.info.maxDoc();
}
return total;
return totalMaxDoc;
}
/** Return {@link MergeInfo} describing this merge. */

View File

@ -177,9 +177,7 @@ public class MergeState {
final int docBase = totalDocs;
docMaps[i] =
new DocMap() {
@Override
public int get(int docID) {
docID -> {
if (liveDocs == null) {
return docBase + docID;
} else if (liveDocs.get(docID)) {
@ -187,7 +185,6 @@ public class MergeState {
} else {
return -1;
}
}
};
totalDocs += reader.numDocs();
}
@ -242,13 +239,10 @@ public class MergeState {
}
/** A map of doc IDs. */
public abstract static class DocMap {
/** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
// Explicitly declared so that we have non-empty javadoc
protected DocMap() {}
@FunctionalInterface
public interface DocMap {
/** Return the mapped docID or -1 if the given doc is not mapped. */
public abstract int get(int docID);
int get(int docID);
}
static PackedLongValues removeDeletes(final int maxDoc, final Bits liveDocs) {

View File

@ -122,15 +122,12 @@ final class MultiSorter {
final PackedLongValues remapped = builders[i].build();
final Bits liveDocs = readers.get(i).getLiveDocs();
docMaps[i] =
new MergeState.DocMap() {
@Override
public int get(int docID) {
docID -> {
if (liveDocs == null || liveDocs.get(docID)) {
return (int) remapped.get(docID);
} else {
return -1;
}
}
};
}

View File

@ -325,7 +325,6 @@ public abstract class PointValues {
/** Notifies the caller that this many documents are about to be visited */
default void grow(int count) {}
;
}
/**

View File

@ -526,7 +526,6 @@ final class ReadersAndUpdates {
return docIDOut;
}
}
;
private synchronized Set<String> writeFieldInfosGen(
FieldInfos fieldInfos, Directory dir, FieldInfosFormat infosFormat) throws IOException {

View File

@ -122,7 +122,7 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
static final int VERSION_CURRENT = VERSION_86;
/** Name of the generation reference file name */
private static final String OLD_SEGMENTS_GEN = "segments.gen";
static final String OLD_SEGMENTS_GEN = "segments.gen";
/** Used to name new segments. */
public long counter;
@ -146,7 +146,7 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
*
* @see #setInfoStream
*/
private static PrintStream infoStream = null;
private static PrintStream infoStream;
/** Id for this commit; only written starting with Lucene 5.0 */
private byte[] id;
@ -1010,6 +1010,7 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
void replace(SegmentInfos other) {
// Hand the segments of other to rollbackSegmentInfos, then take over the remaining state below.
rollbackSegmentInfos(other.asList());
lastGeneration = other.lastGeneration;
// userData is taken from other as well; the reference is assigned as-is, no copy is made here.
userData = other.userData;
}
/** Returns sum of all segment's maxDocs. Note that this does not include deletions */

File diff suppressed because it is too large Load Diff

View File

@ -24,6 +24,7 @@ import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Objects;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.KnnVectorsReader;
@ -77,7 +78,7 @@ public final class SortingCodecReader extends FilterCodecReader {
private final Sorter.DocMap docMap;
SortingPointValues(final PointValues in, Sorter.DocMap docMap) {
this.in = in;
this.in = Objects.requireNonNull(in);
this.docMap = docMap;
}
@ -472,6 +473,10 @@ public final class SortingCodecReader extends FilterCodecReader {
@Override
public PointValues getValues(String field) throws IOException {
  // Ask the delegate exactly once; a null result is passed through to the caller as null.
  var values = delegate.getValues(field);
  if (values == null) {
    return null;
  }
  // Wrap the instance that was just null-checked instead of fetching it from the delegate a
  // second time: this avoids the redundant lookup and guarantees that the object handed to
  // SortingPointValues (which rejects null) is the one that was checked.
  return new SortingPointValues(values, docMap);
}

View File

@ -85,7 +85,11 @@ public final class IndexOrDocValuesQuery extends Query {
@Override
public String toString(String field) {
return indexQuery.toString(field);
return "IndexOrDocValuesQuery(indexQuery="
+ indexQuery.toString(field)
+ ", dvQuery="
+ dvQuery.toString(field)
+ ")";
}
@Override

View File

@ -19,7 +19,6 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
@ -62,9 +61,9 @@ import org.apache.lucene.util.automaton.ByteRunAutomaton;
* match lots of documents, counting the number of hits may take much longer than computing the top
* hits so this trade-off allows to get some minimal information about the hit count without slowing
* down search too much. The {@link TopDocs#scoreDocs} array is always accurate however. If this
* behavior doesn't suit your needs, you should create collectors manually with either {@link
* TopScoreDocCollector#create} or {@link TopFieldCollector#create} and call {@link #search(Query,
* Collector)}.
* behavior doesn't suit your needs, you should create collectorManagers manually with either {@link
* TopScoreDocCollectorManager} or {@link TopFieldCollectorManager} and call {@link #search(Query,
* CollectorManager)}.
*
* <p><a id="thread-safety"></a>
*
@ -455,35 +454,10 @@ public class IndexSearcher {
}
final int cappedNumHits = Math.min(numHits, limit);
final LeafSlice[] leafSlices = getSlices();
final CollectorManager<TopScoreDocCollector, TopDocs> manager =
new CollectorManager<TopScoreDocCollector, TopDocs>() {
private final HitsThresholdChecker hitsThresholdChecker =
leafSlices.length <= 1
? HitsThresholdChecker.create(Math.max(TOTAL_HITS_THRESHOLD, numHits))
: HitsThresholdChecker.createShared(Math.max(TOTAL_HITS_THRESHOLD, numHits));
private final MaxScoreAccumulator minScoreAcc =
leafSlices.length <= 1 ? null : new MaxScoreAccumulator();
@Override
public TopScoreDocCollector newCollector() throws IOException {
return TopScoreDocCollector.create(
cappedNumHits, after, hitsThresholdChecker, minScoreAcc);
}
@Override
public TopDocs reduce(Collection<TopScoreDocCollector> collectors) throws IOException {
final TopDocs[] topDocs = new TopDocs[collectors.size()];
int i = 0;
for (TopScoreDocCollector collector : collectors) {
topDocs[i++] = collector.topDocs();
}
return TopDocs.merge(0, cappedNumHits, topDocs);
}
};
final boolean supportsConcurrency = getSlices().length > 1;
CollectorManager<TopScoreDocCollector, TopDocs> manager =
new TopScoreDocCollectorManager(
cappedNumHits, after, TOTAL_HITS_THRESHOLD, supportsConcurrency);
return search(query, manager);
}
@ -510,7 +484,10 @@ public class IndexSearcher {
*
* @throws TooManyClauses If a query would exceed {@link IndexSearcher#getMaxClauseCount()}
* clauses.
* @deprecated This method is being deprecated in favor of {@link IndexSearcher#search(Query,
* CollectorManager)} due to its support for concurrency in IndexSearcher
*/
@Deprecated
public void search(Query query, Collector results) throws IOException {
query = rewrite(query, results.scoreMode().needsScores());
search(leafContexts, createWeight(query, results.scoreMode(), 1), results);
@ -602,34 +579,10 @@ public class IndexSearcher {
final Sort rewrittenSort = sort.rewrite(this);
final LeafSlice[] leafSlices = getSlices();
final boolean supportsConcurrency = leafSlices.length > 1;
final CollectorManager<TopFieldCollector, TopFieldDocs> manager =
new CollectorManager<>() {
private final HitsThresholdChecker hitsThresholdChecker =
leafSlices.length <= 1
? HitsThresholdChecker.create(Math.max(TOTAL_HITS_THRESHOLD, numHits))
: HitsThresholdChecker.createShared(Math.max(TOTAL_HITS_THRESHOLD, numHits));
private final MaxScoreAccumulator minScoreAcc =
leafSlices.length <= 1 ? null : new MaxScoreAccumulator();
@Override
public TopFieldCollector newCollector() throws IOException {
// TODO: don't pay the price for accurate hit counts by default
return TopFieldCollector.create(
rewrittenSort, cappedNumHits, after, hitsThresholdChecker, minScoreAcc);
}
@Override
public TopFieldDocs reduce(Collection<TopFieldCollector> collectors) throws IOException {
final TopFieldDocs[] topDocs = new TopFieldDocs[collectors.size()];
int i = 0;
for (TopFieldCollector collector : collectors) {
topDocs[i++] = collector.topDocs();
}
return TopDocs.merge(rewrittenSort, 0, cappedNumHits, topDocs);
}
};
new TopFieldCollectorManager(
rewrittenSort, cappedNumHits, after, TOTAL_HITS_THRESHOLD, supportsConcurrency);
TopFieldDocs topDocs = search(query, manager);
if (doDocScores) {

View File

@ -69,7 +69,6 @@ public abstract class PointInSetQuery extends Query implements Accountable {
@Override
public abstract BytesRef next();
}
;
/** The {@code packedPoints} iterator must be in sorted order. */
protected PointInSetQuery(String field, int numDims, int bytesPerDim, Stream packedPoints) {

Some files were not shown because too many files have changed in this diff Show More