diff --git a/build.gradle b/build.gradle index 9c64ab1c3ab..14beabb4250 100644 --- a/build.gradle +++ b/build.gradle @@ -117,6 +117,9 @@ apply from: file('buildSrc/scriptDepVersions.gradle') apply from: file('gradle/generation/local-settings.gradle') +// Make sure the build environment is consistent. +apply from: file('gradle/validation/check-environment.gradle') + // IDE support, settings and specials. apply from: file('gradle/ide/intellij-idea.gradle') apply from: file('gradle/ide/eclipse.gradle') diff --git a/buildSrc/build.gradle b/buildSrc/build.gradle index b946105bae0..9879caa8e18 100644 --- a/buildSrc/build.gradle +++ b/buildSrc/build.gradle @@ -38,3 +38,9 @@ dependencies { implementation "commons-codec:commons-codec:${scriptDepVersions['commons-codec']}" } +if (!rootProject.hasJavaFlightRecorder) { + logger.warn('Module jdk.jfr is not available; skipping compilation of Java Flight Recorder support.') + tasks.named('compileJava').configure { + exclude('**/ProfileResults.java') + } +} diff --git a/buildSrc/scriptDepVersions.gradle b/buildSrc/scriptDepVersions.gradle index 795ce08366c..82341e2c38c 100644 --- a/buildSrc/scriptDepVersions.gradle +++ b/buildSrc/scriptDepVersions.gradle @@ -24,7 +24,7 @@ ext { "apache-rat": "0.14", "asm": "9.6", "commons-codec": "1.13", - "ecj": "3.36.0-SNAPSHOT", + "ecj": "3.36.0", "flexmark": "0.61.24", "javacc": "7.0.12", "jflex": "1.8.2", diff --git a/gradle/testing/profiling.gradle b/gradle/testing/profiling.gradle index 34b3efe59fa..6c71b3f827a 100644 --- a/gradle/testing/profiling.gradle +++ b/gradle/testing/profiling.gradle @@ -15,20 +15,18 @@ * limitations under the License. */ -import org.apache.lucene.gradle.ProfileResults; - def recordings = files() allprojects { plugins.withType(JavaPlugin) { ext { testOptions += [ - [propName: 'tests.profile', value: false, description: "Enable java flight recorder profiling."] + [propName: 'tests.profile', value: false, description: "Enable Java Flight Recorder profiling."] ] } if (resolvedTestOption("tests.profile").toBoolean()) { - allprojects { + if (rootProject.hasJavaFlightRecorder) { tasks.withType(Test) { jvmArgs("-XX:StartFlightRecording=dumponexit=true,maxsize=250M,settings=" + rootProject.file("gradle/testing/profiling.jfc"), "-XX:+UnlockDiagnosticVMOptions", @@ -41,6 +39,8 @@ allprojects { recordings = recordings.plus fileTree(dir: workingDir, include: '*.jfr') } } + } else { + throw new GradleException('Module jdk.jfr is not available; Java Flight Recorder profiles cannot be enabled.') } } } @@ -48,10 +48,11 @@ allprojects { gradle.buildFinished { if (!recordings.isEmpty()) { - ProfileResults.printReport(recordings.getFiles().collect { it.toString() }, - propertyOrDefault(ProfileResults.MODE_KEY, ProfileResults.MODE_DEFAULT) as String, - Integer.parseInt(propertyOrDefault(ProfileResults.STACKSIZE_KEY, ProfileResults.STACKSIZE_DEFAULT)), - Integer.parseInt(propertyOrDefault(ProfileResults.COUNT_KEY, ProfileResults.COUNT_DEFAULT)), - Boolean.parseBoolean(propertyOrDefault(ProfileResults.LINENUMBERS_KEY, ProfileResults.LINENUMBERS_DEFAULT))) + def pr = org.apache.lucene.gradle.ProfileResults; + pr.printReport(recordings.getFiles().collect { it.toString() }, + propertyOrDefault(pr.MODE_KEY, pr.MODE_DEFAULT) as String, + Integer.parseInt(propertyOrDefault(pr.STACKSIZE_KEY, pr.STACKSIZE_DEFAULT)), + Integer.parseInt(propertyOrDefault(pr.COUNT_KEY, pr.COUNT_DEFAULT)), + Boolean.parseBoolean(propertyOrDefault(pr.LINENUMBERS_KEY, pr.LINENUMBERS_DEFAULT))) } } diff --git 
a/gradle/testing/randomization/policies/replicator-tests.policy b/gradle/testing/randomization/policies/replicator-tests.policy index 7deff688915..4e78f4650e5 100644 --- a/gradle/testing/randomization/policies/replicator-tests.policy +++ b/gradle/testing/randomization/policies/replicator-tests.policy @@ -23,8 +23,6 @@ grant { // jetty-specific: permission java.lang.RuntimePermission "getenv.JETTY_AVAILABLE_PROCESSORS"; permission java.lang.RuntimePermission "getenv.JETTY_WORKER_INSTANCE"; - // servlet stuff - permission java.lang.RuntimePermission "setContextClassLoader"; // allow TestNRTReplication fork its jvm permission java.io.FilePermission "${java.home}${/}-", "read,execute"; // read/write access to all system properties (required by jetty in these tests) diff --git a/gradle/testing/randomization/policies/tests.policy b/gradle/testing/randomization/policies/tests.policy index c32ae6cedd2..150fa970b97 100644 --- a/gradle/testing/randomization/policies/tests.policy +++ b/gradle/testing/randomization/policies/tests.policy @@ -50,14 +50,11 @@ grant { permission java.lang.RuntimePermission "getStackTrace"; // needed for mock filesystems in tests permission java.lang.RuntimePermission "fileSystemProvider"; - // analyzers/uima: needed by lucene expressions' JavascriptCompiler - permission java.lang.RuntimePermission "createClassLoader"; // needed to test unmap hack on platforms that support it permission java.lang.RuntimePermission "accessClassInPackage.sun.misc"; permission java.lang.reflect.ReflectPermission "suppressAccessChecks"; // needed by cyberneko usage by benchmarks on J9 permission java.lang.RuntimePermission "accessClassInPackage.org.apache.xerces.util"; - permission java.lang.RuntimePermission "getClassLoader"; // Needed for loading native library (lucene:misc:native) in lucene:misc permission java.lang.RuntimePermission "getFileStoreAttributes"; @@ -111,6 +108,8 @@ grant { permission java.lang.RuntimePermission "shutdownHooks"; // needed by jacoco to instrument classes permission java.lang.RuntimePermission "defineClass"; + // needed by jacoco for God knows what. + permission java.lang.RuntimePermission "createClassLoader"; }; // Grant all permissions to Gradle test runner classes. 
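Note on the `hasJavaFlightRecorder` flag introduced in the check-environment.gradle change below: it probes whether the `jdk.jfr` module is present and readable from the build script's module, and the buildSrc and profiling changes above key off it. A minimal standalone Java sketch of the same probe (the class name and printed message are illustrative, not part of the patch):

    // Probe the boot module layer for jdk.jfr, mirroring the hasJavaFlightRecorder
    // flag defined in gradle/validation/check-environment.gradle.
    public class JfrProbe {
      public static void main(String[] args) {
        boolean hasJfr = ModuleLayer.boot()
            .findModule("jdk.jfr")
            .map(JfrProbe.class.getModule()::canRead)
            .orElse(false);
        System.out.println("jdk.jfr readable: " + hasJfr);
      }
    }
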
diff --git a/gradle/validation/check-environment.gradle b/gradle/validation/check-environment.gradle index 514c191ead2..f62e387faf9 100644 --- a/gradle/validation/check-environment.gradle +++ b/gradle/validation/check-environment.gradle @@ -23,6 +23,7 @@ import org.gradle.util.GradleVersion configure(rootProject) { ext { expectedGradleVersion = '8.4' + hasJavaFlightRecorder = ModuleLayer.boot().findModule('jdk.jfr').map(this.class.module::canRead).orElse(false) } wrapper { diff --git a/gradle/validation/error-prone.gradle b/gradle/validation/error-prone.gradle index 4c9a68f9924..8af0358e02a 100644 --- a/gradle/validation/error-prone.gradle +++ b/gradle/validation/error-prone.gradle @@ -17,8 +17,8 @@ def skipReason -if (rootProject.usesAltJvm && rootProject.runtimeJavaVersion > JavaVersion.VERSION_15) { - skipReason = "won't work with JDK ${rootProject.runtimeJavaVersion} if used as alternative java toolchain" +if (rootProject.usesAltJvm) { + skipReason = "won't work with alternative java toolchain" } if (!propertyOrDefault("validation.errorprone", isCIBuild).asBoolean()) { @@ -37,7 +37,7 @@ if (skipReason) { allprojects { prj -> plugins.withType(JavaPlugin) { - // LUCENE-9650: Errorprone on master/gradle does not work with JDK-16+ when running as plugin + // LUCENE-9650: Errorprone on master/gradle does not work when running as plugin // inside a forked Javac process. Javac running inside Gradle works, because we have // additional module system opens in place. // This is a hack to keep the dependency (so that palantir's version check doesn't complain) diff --git a/gradle/validation/jar-checks.gradle b/gradle/validation/jar-checks.gradle index f0fff470f35..7fa6cd3b487 100644 --- a/gradle/validation/jar-checks.gradle +++ b/gradle/validation/jar-checks.gradle @@ -59,6 +59,9 @@ allprojects { } subprojects { + // initialize empty, because no checks for benchmark-jmh module. + ext.jarInfos = [] + // Configure jarValidation configuration for all projects. Any dependency // declared on this configuration (or any configuration it extends from) will // be verified. diff --git a/help/jmh.txt b/help/jmh.txt index 58c32e43ae4..f9b9d7f4994 100644 --- a/help/jmh.txt +++ b/help/jmh.txt @@ -61,6 +61,7 @@ Otherwise you are stuck wrestling down full dependencies of OpenJDK (metal etc) Also you must run benchmarks as root to use dtrace, but it works. $ git clone --depth 1 https://github.com/openjdk/jdk/ + $ curl -f https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz | tar -zxf - $ curl -fo jdk/src/utils/hsdis/binutils/Makefile https://raw.githubusercontent.com/openjdk/jdk/3c7ae1225f0d5575fd927a9b76fb40dc30e208cd/src/utils/hsdis/Makefile $ vi jdk/src/utils/hsdis/binutils/Makefile, change SOURCE = hsdis.c to SOURCE = hsdis-binutils.c $ vi jdk/src/utils/hsdis/binutils/hsdis-binutils.c, change #include "hsdis.h" to #include "../hsdis.h" diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index a3a144db849..616606f23c5 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -7,7 +7,6 @@ http://s.apache.org/luceneversions API Changes --------------------- - * LUCENE-12092: Remove deprecated UTF8TaxonomyWriterCache. Please use LruTaxonomyWriterCache instead. (Vigya Sharma) @@ -62,10 +61,21 @@ API Changes * GITHUB#12599: Add RandomAccessInput#readBytes method to the RandomAccessInput interface. (Ignacio Vera) -* GITHUB#12709: Consolidate FSTStore and BytesStore in FST. 
Created FSTReader which contains the common methods - of the two (Anh Dung Bui) +* GITHUB#11023: Adding -level param to CheckIndex, making the old -fast param the default behaviour. (Jakub Slowinski) -* GITHUB#12735: Remove FSTCompiler#getTermCount() and FSTCompiler.UnCompiledNode#inputCount (Anh Dung Bui) +* GITHUB#12873: Expressions module now uses MethodHandles to define custom functions. Support for + custom classloaders was removed. (Uwe Schindler) + +* GITHUB#12243: Remove TermInSetQuery ctors taking varargs param. SortedSetDocValuesField#newSlowSetQuery, + SortedDocValuesField#newSlowSetQuery, KeywordField#newSetQuery, KeywordField#newSetQuery now take a collection. (Jakub Slowinski) + +* GITHUB#12881: Performance improvements to MatchHighlighter and MatchRegionRetriever. MatchRegionRetriever can be + configured to not load matches (or content) of certain fields and to force-load other fields so that stored fields + of a document are accessed once. A configurable limit of field matches placed in the priority queue was added + (allows handling long fields with lots of hits more gracefully). MatchRegionRetriever utilizes IndexSearcher's + executor to extract hit offsets concurrently. (Dawid Weiss) + +* GITHUB#12855: Remove deprecated DrillSideways#createDrillDownFacetsCollector extension method. (Greg Miller) New Features --------------------- @@ -89,18 +99,17 @@ Improvements * GITHUB#12447: Hunspell: speed up the dictionary enumeration on suggestion (Peter Gromov) -* GITHUB#12542: FSTCompiler can now approximately limit how much RAM it uses to share - suffixes during FST construction using the suffixRAMLimitMB method. Larger values - result in a more minimal FST (more common suffixes are shard). Pass - Double.POSITIVE_INFINITY to use as much RAM as is needed to create a purely - minimal FST. Inspired by this Rust FST implemention: - https://blog.burntsushi.net/transducers (Mike McCandless) +* GITHUB#12873: Expressions module now uses JEP 371 "Hidden Classes" with JEP 309 + "Dynamic Class-File Constants" to implement Javascript expressions. (Uwe Schindler) Optimizations --------------------- * GITHUB#11857, GITHUB#11859, GITHUB#11893, GITHUB#11909: Hunspell: improved suggestion performance (Peter Gromov) +* GITHUB#12825, GITHUB#12834: Hunspell: improved dictionary loading performance, allowed in-memory entry sorting. + (Peter Gromov) + * GITHUB#12372: Reduce allocation during HNSW construction (Jonathan Ellis) * GITHUB#12408: Lazy initialization improvements for Facets implementations when there are segments with no hits @@ -116,6 +125,9 @@ Bug Fixes * GITHUB#12220: Hunspell: disallow hidden title-case entries from compound middle/end +* GITHUB#12878: Fix the declared Exceptions of Expression#evaluate() to match those + of DoubleValues#doubleValue(). (Uwe Schindler) + Other --------------------- @@ -142,6 +154,48 @@ Other * GITHUB#12239: Hunspell: reduced suggestion set dependency on the hash table order (Peter Gromov) +* GITHUB#9049: Fixing bug in UnescapedCharSequence#toStringEscaped() (Jakub Slowinski) + +======================== Lucene 9.10.0 ======================= + +API Changes +--------------------- +* GITHUB#12243: Mark TermInSetQuery ctors with varargs terms as @Deprecated. SortedSetDocValuesField#newSlowSetQuery, + SortedDocValuesField#newSlowSetQuery, KeywordField#newSetQuery now take a collection of terms as a param. 
(Jakub Slowinski) + +* GITHUB#11041: Deprecate IndexSearch#search(Query, Collector) in favor of + IndexSearcher#search(Query, CollectorManager) for TopFieldCollectorManager + and TopScoreDocCollectorManager. (Zach Chen, Adrien Grand, Michael McCandless, Greg Miller, Luca Cavanna) + +* GITHUB#12854: Mark DrillSideways#createDrillDownFacetsCollector as @Deprecated. (Greg Miller) + +New Features +--------------------- +(No changes) + +Improvements +--------------------- + +* GITHUB#12870: Tighten synchronized loop in DirectoryTaxonomyReader#getOrdinal. (Stefan Vodita) + +* GITHUB#12812: Avoid overflows and false negatives in int slice buffer filled-with-zeros assertion. (Stefan Vodita) + +Optimizations +--------------------- +(No changes) + +Bug Fixes +--------------------- +* GITHUB#12866: Prevent extra similarity computation for single-level HNSW graphs. (Kaival Parikh) + +* GITHUB#12558: Ensure #finish is called on all drill-sideways FacetsCollectors even when no hits are scored. + (Greg Miller) + +Other +--------------------- + +* GITHUB#11023: Removing some dead code in CheckIndex. (Jakub Slowinski) + ======================== Lucene 9.9.0 ======================= API Changes @@ -157,9 +211,6 @@ API Changes * GITHUB#12592: Add RandomAccessInput#length method to the RandomAccessInput interface. In addition deprecate ByteBuffersDataInput#size in favour of this new method. (Ignacio Vera) -* GITHUB#12646, GITHUB#12690: Move FST#addNode to FSTCompiler to avoid a circular dependency - between FST and FSTCompiler (Anh Dung Bui) - * GITHUB#12718: Make IndexSearcher#getSlices final as it is not expected to be overridden (Luca Cavanna) * GITHUB#12427: Automata#makeStringUnion #makeBinaryStringUnion now accept Iterable instead of @@ -169,6 +220,25 @@ API Changes * GITHUB#12180: Add TaxonomyReader#getBulkOrdinals method to more efficiently retrieve facet ordinals for multiple FacetLabel at once. (Egor Potemkin) +* GITHUB#12816: Add HumanReadableQuery which takes a description parameter for debugging purposes. (Jakub Slowinski) + +* GITHUB#12646, GITHUB#12690: Move FST#addNode to FSTCompiler to avoid a circular dependency + between FST and FSTCompiler (Anh Dung Bui) + +* GITHUB#12709: Consolidate FSTStore and BytesStore in FST. Created FSTReader which contains the common methods + of the two (Anh Dung Bui) + +* GITHUB#12735: Remove FSTCompiler#getTermCount() and FSTCompiler.UnCompiledNode#inputCount (Anh Dung Bui) + +* GITHUB-12695: Remove public constructor of FSTCompiler. Please use FSTCompiler.Builder + instead. (Juan M. Caicedo) + +* GITHUB#12799: Make TaskExecutor constructor public and use TaskExecutor for concurrent + HNSW graph build. (Shubham Chaudhary) + +* GITHUB#12758, GITHUB#12803: Remove FST constructor with DataInput for metadata. Please + use the constructor with FSTMetadata instead. (Anh Dung Bui) + New Features --------------------- @@ -180,7 +250,7 @@ New Features * GITHUB#12582: Add int8 scalar quantization to the HNSW vector format. This optionally allows for more compact lossy storage for the vectors, requiring about 75% memory for fast HNSW search. (Ben Trent) - + * GITHUB#12660: HNSW graph now can be merged with multiple thread. Configurable in Lucene99HnswVectorsFormat. (Patrick Zhai) @@ -225,6 +295,22 @@ Improvements * GITHUB#12754: Refactor lookup of Hotspot VM options and do not initialize constants with NULL if SecurityManager prevents access. 
(Uwe Schindler) +* GITHUB#12801: Remove possible contention on a ReentrantReadWriteLock in + Monitor which could result in searches waiting for commits. (Davis Cook) + +* GITHUB#11277, LUCENE-10241: Upgrade to OpenNLP to 1.9.4. (Jeff Zemerick) + +* GITHUB#12542: FSTCompiler can now approximately limit how much RAM it uses to share + suffixes during FST construction using the suffixRAMLimitMB method. Larger values + result in a more minimal FST (more common suffixes are shard). Pass + Double.POSITIVE_INFINITY to use as much RAM as is needed to create a purely + minimal FST. Inspired by this Rust FST implemention: + https://blog.burntsushi.net/transducers (Mike McCandless) + +* GITHUB#12738: NodeHash now stores the FST nodes data instead of just node addresses (Anh Dung Bui) + +* GITHUB#12847: Test2BFST now reports the time it took to build the FST and the real FST size (Anh Dung Bui) + Optimizations --------------------- * GITHUB#12183: Make TermStates#build concurrent. (Shubham Chaudhary) @@ -276,10 +362,14 @@ Optimizations * GITHUB#12381: Skip docs with DocValues in NumericLeafComparator. (Lu Xugang, Adrien Grand) -* GITHUB#12748: Specialize arc store for continuous label in FST. (Guo Feng, Chao Zhang) - * GITHUB#12784: Cache buckets to speed up BytesRefHash#sort. (Guo Feng) +* GITHUB#12806: Utilize exact kNN search when gathering k >= numVectors in a segment (Ben Trent) + +* GITHUB#12782: Use group-varint encoding for the tail of postings. (Adrien Grand, Zhang Chao) + +* GITHUB#12748: Specialize arc store for continuous label in FST. (Guo Feng, Zhang Chao) + Changes in runtime behavior --------------------- @@ -311,22 +401,33 @@ Bug Fixes * GITHUB#12770: Stop exploring HNSW graph if scores are not getting better. (Ben Trent) +* GITHUB#12640: Ensure #finish is called on all drill-sideways collectors even if one throws a + CollectionTerminatedException (Greg Miller) + +* GITHUB#12626: Fix segmentInfos replace to set userData (Shibi Balamurugan, Uwe Schindler, Marcus Eagan, Michael Froh) + Build --------------------- * GITHUB#12752: tests.multiplier could be omitted in test failure reproduce lines (esp. in nightly mode). (Dawid Weiss) -* GITHUB#12742: JavaCompile tasks may be in up-to-date state when modular dependencies have changed +* GITHUB#12742: JavaCompile tasks may be in up-to-date state when modular dependencies have changed leading to odd runtime errors (Chris Hostetter, Dawid Weiss) * GITHUB#12612: Upgrade forbiddenapis to version 3.6 and ASM for APIJAR extraction to 9.6. (Uwe Schindler) * GITHUB#12655: Upgrade to Gradle 8.4 (Kevin Risden) +* GITHUB#12845: Only enable support for tests.profile if jdk.jfr module is available + in Gradle runtime. (Uwe Schindler) + Other --------------------- +* GITHUB#12817: Add demo for faceting with StringValueFacetCounts over KeywordField and SortedDocValuesField. + (Stefan Vodita) + * GITHUB#12657: Internal refactor of HNSW graph merging (Ben Trent). * GITHUB#12625: Refactor ByteBlockPool so it is just a "shift/mask big array". (Ignacio Vera) @@ -336,6 +437,8 @@ Other overflows and slices that are too large. Some bits of code are simplified. Documentation is updated and expanded. (Stefan Vodita) +* GITHUB#12762: Refactor BKD HeapPointWriter to hide the internal data structure. (Ignacio Vera) + ======================== Lucene 9.8.0 ======================= API Changes @@ -364,6 +467,8 @@ New Features * GITHUB#12479: Add new Maximum Inner Product vector similarity function for non-normalized dot-product vector search. 
(Jack Mazanec, Ben Trent) +* GITHUB#12525: `WordDelimiterGraphFilterFactory` now supports the `ignoreKeywords` flag (Thomas De Craemer) + * GITHUB#12489: Add support for recursive graph bisection, also called bipartite graph partitioning, and often abbreviated BP, an algorithm for reordering doc IDs that results in more compact postings and faster queries, @@ -386,7 +491,7 @@ Improvements Optimizations --------------------- -* GITHUB#12377: Avoid redundant loop for compute min value in DirectMonotonicWriter. (Chao Zhang) +* GITHUB#12377: Avoid redundant loop for compute min value in DirectMonotonicWriter. (Zhang Chao) * GITHUB#12361: Faster top-level disjunctions sorted by descending score. (Adrien Grand) @@ -401,7 +506,7 @@ Optimizations * GITHUB#12385: Restore parallel knn query rewrite across segments rather than slices (Luca Cavanna) -* GITHUB#12381: Speed up NumericDocValuesWriter with index sorting. (Chao Zhang) +* GITHUB#12381: Speed up NumericDocValuesWriter with index sorting. (Zhang Chao) * GITHUB#12453: Faster bulk numeric reads from BufferedIndexInput (Armin Braun) @@ -468,7 +573,7 @@ Other * GITHUB#12428: Replace consecutive close() calls and close() calls with null checks with IOUtils.close(). (Shubham Chaudhary) -* GITHUB#12512: Remove unused variable in BKDWriter. (Chao Zhang) +* GITHUB#12512: Remove unused variable in BKDWriter. (Zhang Chao) ======================== Lucene 9.7.0 ======================= diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md index 158ce93c9a7..348a9d2e84e 100644 --- a/lucene/MIGRATE.md +++ b/lucene/MIGRATE.md @@ -19,6 +19,11 @@ ## Migration from Lucene 9.x to Lucene 10.0 +### Minor API changes in MatchHighlighter and MatchRegionRetriever. (GITHUB#12881) + +The API of interfaces for accepting highlights has changed to allow performance improvements. Look at the issue and the PR diff to get +a sense of what's changed (changes are minor). + ### Removed deprecated IndexSearcher.doc, IndexReader.document, IndexReader.getTermVectors (GITHUB#11998) The deprecated Stored Fields and Term Vectors apis relied upon threadlocal storage and have been removed. @@ -101,6 +106,34 @@ The deprecated getter for the `Executor` that was optionally provided to the `In has been removed. Users that want to execute concurrent tasks should rely instead on the `TaskExecutor` that the searcher holds, retrieved via `IndexSearcher#getTaskExecutor`. +### CheckIndex params -slow and -fast are deprecated, replaced by -level X (GITHUB#11023) + +The `CheckIndex` former `-fast` behaviour of performing checksum checks only, is now the default. +Added a new parameter: `-level X`, to set the detail level of the index check. The higher the value, the more checks are performed. +Sample `-level` usage: `1` (Default) - Checksum checks only, `2` - all level 1 checks as well as logical integrity checks, `3` - all +level 2 checks as well as slow checks. + +### Expressions module now uses `MethodHandle` and hidden classes (GITHUB#12873) + +Custom functions in the expressions module must now be passed in a `Map` using `MethodHandle` as values. +To convert legacy code using maps of reflective `java.lang.reflect.Method`, use the converter method +`JavascriptCompiler#convertLegacyFunctions`. This should make the mapping mostly compatible. +The use of `MethodHandle` and [Dynamic Class-File Constants (JEP 309)](https://openjdk.org/jeps/309) +now also allows to pass private methods or methods from different classloaders. 
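As a minimal sketch of the custom-function registration described here: a static method is resolved as a `MethodHandle` and placed in the functions map that the compiler consumes. Only the `MethodHandles` lookup is standard JDK API; the `cube` function and the commented-out `JavascriptCompiler.compile` call are assumptions for illustration.

    import java.lang.invoke.MethodHandle;
    import java.lang.invoke.MethodHandles;
    import java.lang.invoke.MethodType;
    import java.util.HashMap;
    import java.util.Map;

    public class CustomFunctions {
      // Custom function to expose to Javascript expressions (illustrative).
      public static double cube(double x) {
        return x * x * x;
      }

      public static void main(String[] args) throws ReflectiveOperationException {
        // Resolve the method as a MethodHandle up front; the expressions module can then
        // invoke it without any custom ClassLoader.
        MethodHandle cube = MethodHandles.lookup().findStatic(
            CustomFunctions.class, "cube",
            MethodType.methodType(double.class, double.class));

        Map<String, MethodHandle> functions = new HashMap<>();
        functions.put("cube", cube);
        // Assumed usage, following the API change described above:
        // Expression expr = JavascriptCompiler.compile("cube(popularity)", functions);
      }
    }
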
It is also possible +to adapt guards or filters using the `MethodHandles` class. + +The new implementation of the Javascript expressions compiler no longer supports use of custom +`ClassLoader`, because it uses the new JDK 15 feature [hidden classes (JEP 371)](https://openjdk.org/jeps/371). +Due to the use of `MethodHandle`, classloader isolation is no longer needed, because JS code can only call +MHs that were resolved by the application before using the expressions module. + +### `Expression#evaluate()` declares to throw IOException (GITHUB#12878) + +The expressions module has changed the `Expression#evaluate()` method signature: +It now declares that it may throw `IOException`. This was an oversight because +compiled expressions call `DoubleValues#doubleValue` behind the scenes, which +may throw `IOException` on index problems, bubbling up unexpectedly to the caller. + ## Migration from Lucene 9.0 to Lucene 9.1 ### Test framework package migration and module (LUCENE-10301) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java index 2d6ccd0ebb8..01baea12b01 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java @@ -105,7 +105,8 @@ public class NormalizeCharMap { final FST map; try { final Outputs outputs = CharSequenceOutputs.getSingleton(); - final FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs); + final FSTCompiler fstCompiler = + new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, outputs).build(); final IntsRefBuilder scratch = new IntsRefBuilder(); for (Map.Entry ent : pendingPairs.entrySet()) { fstCompiler.add(Util.toUTF16(ent.getKey(), scratch), new CharsRef(ent.getValue())); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/KStemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/KStemmer.java index 17d50c239b1..c571a3635b5 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/KStemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/KStemmer.java @@ -777,7 +777,6 @@ class KStemmer { private int stemLength() { return j + 1; } - ; private boolean endsIn(char[] s) { if (s.length > k) return false; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ConvTable.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ConvTable.java index 6a87167e7e1..f22bee1db0d 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ConvTable.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ConvTable.java @@ -40,7 +40,8 @@ class ConvTable { try { Outputs outputs = CharSequenceOutputs.getSingleton(); - FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs); + FSTCompiler fstCompiler = + new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, outputs).build(); IntsRefBuilder scratchInts = new IntsRefBuilder(); for (Map.Entry entry : mappings.entrySet()) { String key = entry.getKey(); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index 1811c3b9bc8..4768cced9ab 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java 
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -50,18 +50,12 @@ import java.util.Set; import java.util.TreeMap; import java.util.stream.Collectors; import java.util.stream.Stream; -import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.analysis.hunspell.SortingStrategy.EntryAccumulator; +import org.apache.lucene.analysis.hunspell.SortingStrategy.EntrySupplier; import org.apache.lucene.store.Directory; -import org.apache.lucene.store.IOContext; -import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRefBuilder; -import org.apache.lucene.util.OfflineSorter; -import org.apache.lucene.util.OfflineSorter.ByteSequencesReader; -import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.IntSequenceOutputs; @@ -215,6 +209,25 @@ public class Dictionary { List dictionaries, boolean ignoreCase) throws IOException, ParseException { + this(affix, dictionaries, ignoreCase, SortingStrategy.offline(tempDir, tempFileNamePrefix)); + } + + /** + * Creates a new Dictionary containing the information read from the provided InputStreams to + * hunspell affix and dictionary files. You have to close the provided InputStreams yourself. + * + * @param affix InputStream for reading the hunspell affix file (won't be closed). + * @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed). + * @param sortingStrategy the entry strategy for the dictionary loading + * @throws IOException Can be thrown while reading from the InputStreams + * @throws ParseException Can be thrown if the content of the files does not meet expected formats + */ + public Dictionary( + InputStream affix, + List dictionaries, + boolean ignoreCase, + SortingStrategy sortingStrategy) + throws IOException, ParseException { this.ignoreCase = ignoreCase; try (BufferedInputStream affixStream = @@ -250,10 +263,11 @@ public class Dictionary { readAffixFile(affixStream, decoder, flagEnumerator); // read dictionary entries - IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT); - int wordCount = mergeDictionaries(dictionaries, decoder, unsorted); - String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted); - words = readSortedDictionaries(tempDir, sortedFile, flagEnumerator, wordCount); + EntryAccumulator acc = sortingStrategy.start(); + mergeDictionaries(dictionaries, decoder, acc); + try (EntrySupplier sorted = acc.finishAndSort()) { + words = readSortedDictionaries(flagEnumerator, sorted); + } flagLookup = flagEnumerator.finish(); aliases = null; // no longer needed morphAliases = null; // no longer needed @@ -631,7 +645,8 @@ public class Dictionary { private FST affixFST(TreeMap> affixes) throws IOException { IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); - FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs); + FSTCompiler fstCompiler = + new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).build(); IntsRefBuilder scratch = new IntsRefBuilder(); for (Map.Entry> entry : affixes.entrySet()) { Util.toUTF32(entry.getKey(), scratch); @@ -984,52 +999,43 @@ public class Dictionary { } } - private int mergeDictionaries( - List dictionaries, CharsetDecoder decoder, 
IndexOutput output) + private void mergeDictionaries( + List dictionaries, CharsetDecoder decoder, EntryAccumulator acc) throws IOException { StringBuilder sb = new StringBuilder(); - int wordCount = 0; - try (ByteSequencesWriter writer = new ByteSequencesWriter(output)) { - for (InputStream dictionary : dictionaries) { - BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder)); - lines.readLine(); // first line is number of entries (approximately, sometimes) + for (InputStream dictionary : dictionaries) { + BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder)); + lines.readLine(); // first line is number of entries (approximately, sometimes) - String line; - while ((line = lines.readLine()) != null) { - // wild and unpredictable code comment rules - if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '\t') { - continue; - } - line = unescapeEntry(line); - // if we haven't seen any custom morphological data, try to parse one - if (!hasCustomMorphData) { - int morphStart = line.indexOf(MORPH_SEPARATOR); - if (morphStart >= 0) { - String data = line.substring(morphStart + 1); - hasCustomMorphData = - splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:")); - } - } - - wordCount += writeNormalizedWordEntry(sb, writer, line); + String line; + while ((line = lines.readLine()) != null) { + // wild and unpredictable code comment rules + if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '\t') { + continue; } + line = unescapeEntry(line); + // if we haven't seen any custom morphological data, try to parse one + if (!hasCustomMorphData) { + int morphStart = line.indexOf(MORPH_SEPARATOR); + if (morphStart >= 0) { + String data = line.substring(morphStart + 1); + hasCustomMorphData = splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:")); + } + } + + writeNormalizedWordEntry(sb, line, acc); } - CodecUtil.writeFooter(output); } - return wordCount; } - /** - * @return the number of word entries written - */ - private int writeNormalizedWordEntry(StringBuilder reuse, ByteSequencesWriter writer, String line) + private void writeNormalizedWordEntry(StringBuilder reuse, String line, EntryAccumulator acc) throws IOException { int flagSep = line.indexOf(FLAG_SEPARATOR); int morphSep = line.indexOf(MORPH_SEPARATOR); assert morphSep > 0; assert morphSep > flagSep; int sep = flagSep < 0 ? 
morphSep : flagSep; - if (sep == 0) return 0; + if (sep == 0) return; CharSequence toWrite; String beforeSep = line.substring(0, sep); @@ -1043,19 +1049,16 @@ public class Dictionary { String written = toWrite.toString(); sep = written.length() - (line.length() - sep); - writer.write(written.getBytes(StandardCharsets.UTF_8)); + acc.addEntry(written); WordCase wordCase = WordCase.caseOf(written, sep); if (wordCase == WordCase.MIXED || wordCase == WordCase.UPPER && flagSep > 0) { - addHiddenCapitalizedWord(reuse, writer, written.substring(0, sep), written.substring(sep)); - return 2; + addHiddenCapitalizedWord(reuse, acc, written.substring(0, sep), written.substring(sep)); } - return 1; } private void addHiddenCapitalizedWord( - StringBuilder reuse, ByteSequencesWriter writer, String word, String afterSep) - throws IOException { + StringBuilder reuse, EntryAccumulator acc, String word, String afterSep) throws IOException { reuse.setLength(0); reuse.append(Character.toUpperCase(word.charAt(0))); for (int i = 1; i < word.length(); i++) { @@ -1064,7 +1067,7 @@ public class Dictionary { reuse.append(FLAG_SEPARATOR); reuse.append(HIDDEN_FLAG); reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length()); - writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8)); + acc.addEntry(reuse.toString()); } String toLowerCase(String word) { @@ -1084,137 +1087,66 @@ public class Dictionary { return new String(chars); } - private String sortWordsOffline( - Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException { - OfflineSorter sorter = - new OfflineSorter( - tempDir, - tempFileNamePrefix, - new Comparator<>() { - final BytesRef scratch1 = new BytesRef(); - final BytesRef scratch2 = new BytesRef(); - - private void initScratch(BytesRef o, BytesRef scratch) { - scratch.bytes = o.bytes; - scratch.offset = o.offset; - scratch.length = o.length; - - for (int i = scratch.length - 1; i >= 0; i--) { - if (scratch.bytes[scratch.offset + i] == FLAG_SEPARATOR - || scratch.bytes[scratch.offset + i] == MORPH_SEPARATOR) { - scratch.length = i; - break; - } - } - } - - @Override - public int compare(BytesRef o1, BytesRef o2) { - initScratch(o1, scratch1); - initScratch(o2, scratch2); - - int cmp = scratch1.compareTo(scratch2); - if (cmp == 0) { - // tie break on whole row - return o1.compareTo(o2); - } else { - return cmp; - } - } - }); - - String sorted; - boolean success = false; - try { - sorted = sorter.sort(unsorted.getName()); - success = true; - } finally { - if (success) { - tempDir.deleteFile(unsorted.getName()); - } else { - IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName()); - } - } - return sorted; - } - - private WordStorage readSortedDictionaries( - Directory tempDir, String sorted, FlagEnumerator flags, int wordCount) throws IOException { - boolean success = false; - + private WordStorage readSortedDictionaries(FlagEnumerator flags, EntrySupplier sorted) + throws IOException { Map morphIndices = new HashMap<>(); WordStorage.Builder builder = new WordStorage.Builder( - wordCount, hashFactor(), hasCustomMorphData, flags, allNonSuggestibleFlags()); + sorted.wordCount(), hashFactor(), hasCustomMorphData, flags, allNonSuggestibleFlags()); - try (ByteSequencesReader reader = - new ByteSequencesReader(tempDir.openChecksumInput(sorted), sorted)) { + // TODO: the flags themselves can be double-chars (long) or also numeric + // either way the trick is to encode them as char... 
but they must be parsed differently - // TODO: the flags themselves can be double-chars (long) or also numeric - // either way the trick is to encode them as char... but they must be parsed differently + while (true) { + String line = sorted.next(); + if (line == null) break; - while (true) { - BytesRef scratch = reader.next(); - if (scratch == null) { - break; - } + String entry; + char[] wordForm; + int end; - String line = scratch.utf8ToString(); - String entry; - char[] wordForm; - int end; - - int flagSep = line.indexOf(FLAG_SEPARATOR); - if (flagSep == -1) { - wordForm = NOFLAGS; - end = line.indexOf(MORPH_SEPARATOR); - entry = line.substring(0, end); - } else { - end = line.indexOf(MORPH_SEPARATOR); - boolean hidden = line.charAt(flagSep + 1) == HIDDEN_FLAG; - String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end).strip(); - if (aliasCount > 0 && !flagPart.isEmpty()) { - flagPart = getAliasValue(Integer.parseInt(flagPart)); - } - - wordForm = flagParsingStrategy.parseFlags(flagPart); - if (hidden) { - wordForm = ArrayUtil.growExact(wordForm, wordForm.length + 1); - wordForm[wordForm.length - 1] = HIDDEN_FLAG; - } - entry = line.substring(0, flagSep); - } - - if (entry.isEmpty()) continue; - - int morphDataID = 0; - if (end + 1 < line.length()) { - List morphFields = readMorphFields(entry, line.substring(end + 1)); - if (!morphFields.isEmpty()) { - morphFields.sort(Comparator.naturalOrder()); - morphDataID = addMorphFields(morphIndices, String.join(" ", morphFields)); - } - } - - builder.add(entry, wordForm, morphDataID); - } - - // finalize last entry - success = true; - return new WordStorage(builder) { - @Override - char caseFold(char c) { - return Dictionary.this.caseFold(c); - } - }; - } finally { - if (success) { - tempDir.deleteFile(sorted); + int flagSep = line.indexOf(FLAG_SEPARATOR); + if (flagSep == -1) { + wordForm = NOFLAGS; + end = line.indexOf(MORPH_SEPARATOR); + entry = line.substring(0, end); } else { - IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted); + end = line.indexOf(MORPH_SEPARATOR); + boolean hidden = line.charAt(flagSep + 1) == HIDDEN_FLAG; + String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end).strip(); + if (aliasCount > 0 && !flagPart.isEmpty()) { + flagPart = getAliasValue(Integer.parseInt(flagPart)); + } + + wordForm = flagParsingStrategy.parseFlags(flagPart); + if (hidden) { + wordForm = ArrayUtil.growExact(wordForm, wordForm.length + 1); + wordForm[wordForm.length - 1] = HIDDEN_FLAG; + } + entry = line.substring(0, flagSep); } + + if (entry.isEmpty()) continue; + + int morphDataID = 0; + if (end + 1 < line.length()) { + List morphFields = readMorphFields(entry, line.substring(end + 1)); + if (!morphFields.isEmpty()) { + morphFields.sort(Comparator.naturalOrder()); + morphDataID = addMorphFields(morphIndices, String.join(" ", morphFields)); + } + } + + builder.add(entry, wordForm, morphDataID); } + + return new WordStorage(builder) { + @Override + char caseFold(char c) { + return Dictionary.this.caseFold(c); + } + }; } /** diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SortingStrategy.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SortingStrategy.java new file mode 100644 index 00000000000..e1c59020d44 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SortingStrategy.java @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.hunspell; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefComparator; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.OfflineSorter; +import org.apache.lucene.util.OfflineSorter.ByteSequencesReader; +import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter; + +/** + * The strategy defining how a Hunspell dictionary should be loaded, with different tradeoffs. The + * entries should be sorted in a special way, and this can be done either in-memory (faster, but + * temporarily allocating more memory) or using disk (slower, but not needing much memory). + * + * @see #offline(Directory, String) + * @see #inMemory() + */ +public abstract class SortingStrategy { + + abstract EntryAccumulator start() throws IOException; + + interface EntryAccumulator { + + void addEntry(String entry) throws IOException; + + EntrySupplier finishAndSort() throws IOException; + } + + interface EntrySupplier extends Closeable { + int wordCount(); + + /** The next line or {@code null} if the end is reached */ + String next() throws IOException; + } + + /** + * An "offline" strategy that creates temporary files in the given directory and uses them for + * sorting with {@link OfflineSorter}. It's slower than {@link #inMemory()}, but doesn't need to + * load the entire dictionary into memory. 
+ */ + public static SortingStrategy offline(Directory tempDir, String tempFileNamePrefix) { + return new SortingStrategy() { + @Override + EntryAccumulator start() throws IOException { + IndexOutput output = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT); + ByteSequencesWriter writer = new ByteSequencesWriter(output); + return new EntryAccumulator() { + int wordCount = 0; + + @Override + public void addEntry(String entry) throws IOException { + wordCount++; + writer.write(entry.getBytes(StandardCharsets.UTF_8)); + } + + @Override + public EntrySupplier finishAndSort() throws IOException { + CodecUtil.writeFooter(output); + writer.close(); + String sortedFile = sortWordsOffline(); + ByteSequencesReader reader = + new ByteSequencesReader(tempDir.openChecksumInput(sortedFile), sortedFile); + return new EntrySupplier() { + boolean success = false; + + @Override + public int wordCount() { + return wordCount; + } + + @Override + public String next() throws IOException { + BytesRef scratch = reader.next(); + if (scratch == null) { + success = true; + return null; + } + return scratch.utf8ToString(); + } + + @Override + public void close() throws IOException { + reader.close(); + if (success) { + tempDir.deleteFile(sortedFile); + } else { + IOUtils.deleteFilesIgnoringExceptions(tempDir, sortedFile); + } + } + }; + } + + private String sortWordsOffline() throws IOException { + var sorter = new OfflineSorter(tempDir, tempFileNamePrefix, BytesRefComparator.NATURAL); + + String sorted; + boolean success = false; + try { + sorted = sorter.sort(output.getName()); + success = true; + } finally { + if (success) { + tempDir.deleteFile(output.getName()); + } else { + IOUtils.deleteFilesIgnoringExceptions(tempDir, output.getName()); + } + } + return sorted; + } + }; + } + }; + } + + /** + * The strategy that loads all entries as {@link String} objects and sorts them in memory. The + * entries are then stored in a more compressed way, and the strings are gc-ed, but the loading + * itself needs {@code O(dictionary_size)} memory. + */ + public static SortingStrategy inMemory() { + return new SortingStrategy() { + @Override + EntryAccumulator start() { + List entries = new ArrayList<>(); + return new EntryAccumulator() { + @Override + public void addEntry(String entry) { + entries.add(entry); + } + + @Override + public EntrySupplier finishAndSort() { + entries.sort(Comparator.naturalOrder()); + return new EntrySupplier() { + int i = 0; + + @Override + public int wordCount() { + return entries.size(); + } + + @Override + public String next() { + return i < entries.size() ? 
entries.get(i++) : null; + } + + @Override + public void close() {} + }; + } + }; + } + }; + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java index 0b5e4b86978..1e739f03d47 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java @@ -350,16 +350,19 @@ abstract class WordStorage { currentOrds.clear(); boolean hasNonHidden = false; + boolean isSuggestible = false; for (char[] flags : group) { if (!hasFlag(flags, Dictionary.HIDDEN_FLAG)) { hasNonHidden = true; - break; + } + if (!hasNoSuggestFlag(flags)) { + isSuggestible = true; } } for (int i = 0; i < group.size(); i++) { char[] flags = group.get(i); - if (hasNonHidden && hasFlag(flags, Dictionary.HIDDEN_FLAG)) { + if (hasNonHidden && group.size() > 1 && hasFlag(flags, Dictionary.HIDDEN_FLAG)) { continue; } @@ -388,7 +391,7 @@ abstract class WordStorage { int mask = (prevCode == 0 ? 0 : COLLISION_MASK) - | (group.stream().anyMatch(flags -> !hasNoSuggestFlag(flags)) ? SUGGESTIBLE_MASK : 0) + | (isSuggestible ? SUGGESTIBLE_MASK : 0) | Math.min(currentEntry.length(), MAX_STORED_LENGTH); hashTable[hash] = (mask << OFFSET_BITS) | pos; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java index ad47d702f64..b46f8f8ff02 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java @@ -210,7 +210,8 @@ public final class StemmerOverrideFilter extends TokenFilter { */ public StemmerOverrideMap build() throws IOException { ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); - FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs); + FSTCompiler fstCompiler = + new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).build(); final int[] sort = hash.sort(); IntsRefBuilder intsSpare = new IntsRefBuilder(); final int size = hash.size(); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilterFactory.java index 3c508b94322..ba57a675b6f 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilterFactory.java @@ -46,11 +46,11 @@ public class TruncateTokenFilterFactory extends TokenFilterFactory { public static final String NAME = "truncate"; public static final String PREFIX_LENGTH_KEY = "prefixLength"; - private final byte prefixLength; + private final int prefixLength; public TruncateTokenFilterFactory(Map args) { super(args); - prefixLength = Byte.parseByte(get(args, PREFIX_LENGTH_KEY, "5")); + prefixLength = Integer.parseInt(get(args, PREFIX_LENGTH_KEY, "5")); if (prefixLength < 1) throw new IllegalArgumentException( PREFIX_LENGTH_KEY + " parameter must be a positive number: " + prefixLength); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java 
b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java index d19a97f06d5..24dc38ef664 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java @@ -163,7 +163,6 @@ public final class WordDelimiterFilter extends TokenFilter { private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class); - ; private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java index 8b871d3f2e9..e9a563fecf1 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java @@ -164,7 +164,6 @@ public final class WordDelimiterGraphFilter extends TokenFilter { private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class); - ; private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilterFactory.java index 09ec073bd9b..63634687e2b 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilterFactory.java @@ -45,7 +45,7 @@ import org.apache.lucene.util.ResourceLoaderAware; * preserveOriginal="0" splitOnNumerics="1" splitOnCaseChange="1" * catenateWords="0" catenateNumbers="0" catenateAll="0" * generateWordParts="1" generateNumberParts="1" stemEnglishPossessive="1" - * types="wdfftypes.txt" /> + * types="wdfftypes.txt" ignoreKeywords="0" /> * </analyzer> * </fieldType> * @@ -100,6 +100,9 @@ public class WordDelimiterGraphFilterFactory extends TokenFilterFactory if (getInt(args, "stemEnglishPossessive", 1) != 0) { flags |= STEM_ENGLISH_POSSESSIVE; } + if (getInt(args, "ignoreKeywords", 0) != 0) { + flags |= IGNORE_KEYWORDS; + } wordFiles = get(args, PROTECTED_TOKENS); types = get(args, TYPES); this.flags = flags; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java index d3eb6165f74..51298c42a49 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java @@ -216,7 +216,6 @@ public final class SynonymFilter extends TokenFilter { count++; } } - ; private final ByteArrayDataInput bytesReader = new ByteArrayDataInput(); diff --git 
a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java index b92b9228860..22ba92ff555 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java @@ -222,7 +222,8 @@ public class SynonymMap { public SynonymMap build() throws IOException { ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); // TODO: are we using the best sharing options? - FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs); + FSTCompiler fstCompiler = + new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).build(); BytesRefBuilder scratch = new BytesRefBuilder(); ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput(); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestHTMLStripCharFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestHTMLStripCharFilter.java index 82b3d25dd1d..2b5340dd3d6 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestHTMLStripCharFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestHTMLStripCharFilter.java @@ -595,8 +595,7 @@ public class TestHTMLStripCharFilter extends BaseTokenStreamTestCase { } } Reader reader = new HTMLStripCharFilter(new StringReader(text.toString())); - while (reader.read() != -1) - ; + while (reader.read() != -1) {} } public void testUTF16Surrogates() throws Exception { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDuelingAnalyzers.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDuelingAnalyzers.java index 8f336c269da..0359400c669 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDuelingAnalyzers.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDuelingAnalyzers.java @@ -230,7 +230,6 @@ public class TestDuelingAnalyzers extends BaseTokenStreamTestCase { assertEquals( "wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset()); } - ; assertFalse("wrong number of tokens for input: " + s, right.incrementToken()); left.end(); right.end(); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java index 708a52b6ed6..68f6922c758 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java @@ -41,7 +41,6 @@ import java.util.concurrent.atomic.AtomicLong; import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.Stream; -import org.apache.lucene.tests.store.BaseDirectoryWrapper; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.LuceneTestCase.SuppressSysoutChecks; import org.apache.lucene.tests.util.RamUsageTester; @@ -72,9 +71,8 @@ public class TestAllDictionaries extends LuceneTestCase { Path dic = Path.of(affPath.substring(0, affPath.length() - 4) + ".dic"); assert Files.exists(dic) : dic; try (InputStream dictionary = Files.newInputStream(dic); - InputStream affix = Files.newInputStream(aff); - BaseDirectoryWrapper tempDir = newDirectory()) { - return new 
Dictionary(tempDir, "dictionary", affix, dictionary) { + InputStream affix = Files.newInputStream(aff)) { + return new Dictionary(affix, List.of(dictionary), false, SortingStrategy.inMemory()) { @Override protected boolean tolerateAffixRuleCountMismatches() { return true; diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java index cbb6f21f0cf..4ca262799c6 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java @@ -256,15 +256,22 @@ public class TestSpellChecking extends LuceneTestCase { } static void checkSpellCheckerExpectations(Path basePath) throws IOException, ParseException { - InputStream affixStream = Files.newInputStream(Path.of(basePath.toString() + ".aff")); + checkSpellCheckerExpectations( + basePath, SortingStrategy.offline(new ByteBuffersDirectory(), "dictionary")); + checkSpellCheckerExpectations(basePath, SortingStrategy.inMemory()); + } + + private static void checkSpellCheckerExpectations(Path basePath, SortingStrategy strategy) + throws IOException, ParseException { + Path affFile = Path.of(basePath + ".aff"); Path dicFile = Path.of(basePath + ".dic"); + InputStream affixStream = Files.newInputStream(affFile); InputStream dictStream = Files.newInputStream(dicFile); Hunspell speller; Map suggesters = new LinkedHashMap<>(); try { - Dictionary dictionary = - new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream); + Dictionary dictionary = new Dictionary(affixStream, List.of(dictStream), false, strategy); speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {}); Suggester suggester = new Suggester(dictionary); suggesters.put("default", suggester); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/in/TestIndicNormalizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/in/TestIndicNormalizer.java index e214620c61e..1dbc0528a70 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/in/TestIndicNormalizer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/in/TestIndicNormalizer.java @@ -41,7 +41,6 @@ public class TestIndicNormalizer extends BaseTokenStreamTestCase { private void check(String input, String output) throws IOException { Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); - ; tokenizer.setReader(new StringReader(input)); TokenFilter tf = new IndicNormalizationFilter(tokenizer); assertTokenStreamContents(tf, new String[] {output}); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilterFactory.java index 166b4b7b1ef..a8372181f39 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilterFactory.java @@ -89,7 +89,6 @@ public class TestKeywordMarkerFilterFactory extends BaseTokenStreamFactoryTestCa stream = tokenFilterFactory("KeywordMarker", "pattern", "Cats", "ignoreCase", "true").create(stream); stream = tokenFilterFactory("PorterStem").create(stream); - ; assertTokenStreamContents(stream, new String[] {"dog", "cats", 
"Cats"}); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTruncateTokenFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTruncateTokenFilterFactory.java index 2fed6c2ae21..f537b93a156 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTruncateTokenFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTruncateTokenFilterFactory.java @@ -68,4 +68,23 @@ public class TestTruncateTokenFilterFactory extends BaseTokenStreamFactoryTestCa TruncateTokenFilterFactory.PREFIX_LENGTH_KEY + " parameter must be a positive number: -5")); } + + /** Test that takes length greater than byte limit accepts it */ + public void testLengthGreaterThanByteLimitArgument() throws Exception { + Reader reader = + new StringReader( + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvw128characters From here"); + TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false); + ((Tokenizer) stream).setReader(reader); + stream = + tokenFilterFactory("Truncate", TruncateTokenFilterFactory.PREFIX_LENGTH_KEY, "128") + .create(stream); + assertTokenStreamContents( + stream, + new String[] { + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvw1", + "From", + "here" + }); + } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestEdgeNGramTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestEdgeNGramTokenizer.java index 434a552eaf7..0cc41a51876 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestEdgeNGramTokenizer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestEdgeNGramTokenizer.java @@ -69,7 +69,6 @@ public class TestEdgeNGramTokenizer extends BaseTokenStreamTestCase { public void testOversizedNgrams() throws Exception { EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(6, 6); tokenizer.setReader(input); - ; assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArrayIterator.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArrayIterator.java index 13a02f0369e..c1abf9e3615 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArrayIterator.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArrayIterator.java @@ -156,7 +156,6 @@ public class TestCharArrayIterator extends LuceneTestCase { private void consume(BreakIterator bi, CharacterIterator ci) { bi.setText(ci); - while (bi.next() != BreakIterator.DONE) - ; + while (bi.next() != BreakIterator.DONE) {} } } diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java index ed7b49fb138..7fd5a9244c1 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java @@ -16,6 +16,8 @@ */ package org.apache.lucene.analysis.ja.dict; +import static org.apache.lucene.util.fst.FST.readMetadata; + import java.io.BufferedInputStream; 
import java.io.IOException; import java.io.InputStream; @@ -103,7 +105,7 @@ public final class TokenInfoDictionary extends BinaryDictionary fst; try (InputStream is = new BufferedInputStream(fstResource.get())) { DataInput in = new InputStreamDataInput(is); - fst = new FST<>(in, in, PositiveIntOutputs.getSingleton()); + fst = new FST<>(readMetadata(in, PositiveIntOutputs.getSingleton()), in); } // TODO: some way to configure? this.fst = new TokenInfoFST(fst, true); diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryBuilder.java index 5a16db673ce..8afddb9ca96 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryBuilder.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryBuilder.java @@ -101,7 +101,8 @@ class TokenInfoDictionaryBuilder { lines.sort(Comparator.comparing(entry -> entry[0])); PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(); - FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput); + FSTCompiler fstCompiler = + new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput).build(); IntsRefBuilder scratch = new IntsRefBuilder(); long ord = -1; // first ord will be 0 String lastValue = null; diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java index de69c726ee2..42807eed278 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java @@ -93,7 +93,8 @@ public final class UserDictionary implements Dictionary { List segmentations = new ArrayList<>(featureEntries.size()); PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(); - FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput); + FSTCompiler fstCompiler = + new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput).build(); IntsRefBuilder scratch = new IntsRefBuilder(); long ord = 0; diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java index c910c11842d..e2eed814892 100644 --- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java +++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java @@ -758,8 +758,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase { for (int i = 0; i < numIterations; i++) { try (TokenStream ts = analyzer.tokenStream("ignored", line)) { ts.reset(); - while (ts.incrementToken()) - ; + while (ts.incrementToken()) {} ts.end(); } } @@ -775,8 +774,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase { for (String sentence : sentences) { try (TokenStream ts = analyzer.tokenStream("ignored", sentence)) { ts.reset(); - while (ts.incrementToken()) - ; + while (ts.incrementToken()) {} ts.end(); } } @@ -831,8 +829,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase { new JapaneseTokenizer(newAttributeFactory(), readDict(), false, Mode.NORMAL); tokenizer.setReader(new StringReader(doc)); tokenizer.reset(); - while (tokenizer.incrementToken()) - ; + while 
(tokenizer.incrementToken()) {} } public void testPatchedSystemDict() throws Exception { diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java index 317123d5b88..07540075da0 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java @@ -16,6 +16,8 @@ */ package org.apache.lucene.analysis.ko.dict; +import static org.apache.lucene.util.fst.FST.readMetadata; + import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; @@ -102,7 +104,7 @@ public final class TokenInfoDictionary extends BinaryDictionary fst; try (InputStream is = new BufferedInputStream(fstResource.get())) { DataInput in = new InputStreamDataInput(is); - fst = new FST<>(in, in, PositiveIntOutputs.getSingleton()); + fst = new FST<>(readMetadata(in, PositiveIntOutputs.getSingleton()), in); } this.fst = new TokenInfoFST(fst); } diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryBuilder.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryBuilder.java index e3db26b08b8..f66abba8b3a 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryBuilder.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryBuilder.java @@ -94,7 +94,8 @@ class TokenInfoDictionaryBuilder { lines.sort(Comparator.comparing(left -> left[0])); PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(); - FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput); + FSTCompiler fstCompiler = + new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput).build(); IntsRefBuilder scratch = new IntsRefBuilder(); long ord = -1; // first ord will be 0 String lastValue = null; diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java index 58a233112aa..4632edc8a21 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java @@ -75,7 +75,8 @@ public final class UserDictionary implements Dictionary { entries.sort(Comparator.comparing(e -> e.split("\\s+")[0])); PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(); - FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput); + FSTCompiler fstCompiler = + new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput).build(); IntsRefBuilder scratch = new IntsRefBuilder(); String lastToken = null; diff --git a/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java b/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java index c1687cc119d..c4f808b52ca 100644 --- a/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java +++ b/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java @@ -41,7 +41,6 @@ public class TestPolishAnalyzer extends BaseTokenStreamTestCase { /** test use of exclusion set */ public void testExclude() throws IOException { CharArraySet exclusionSet = new CharArraySet(asSet("studenta"), false); - ; Analyzer a = new 
PolishAnalyzer(PolishAnalyzer.getDefaultStopSet(), exclusionSet); checkOneTerm(a, "studenta", "studenta"); checkOneTerm(a, "studenci", "student"); diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene40/blocktree/FieldReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene40/blocktree/FieldReader.java index b30e6ef2fd7..06f18d3bfd9 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene40/blocktree/FieldReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene40/blocktree/FieldReader.java @@ -16,6 +16,8 @@ */ package org.apache.lucene.backward_codecs.lucene40.blocktree; +import static org.apache.lucene.util.fst.FST.readMetadata; + import java.io.IOException; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexOptions; @@ -89,9 +91,17 @@ public final class FieldReader extends Terms { final IndexInput clone = indexIn.clone(); clone.seek(indexStartFP); if (metaIn == indexIn) { // Only true before Lucene 8.6 - index = new FST<>(clone, clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore()); + index = + new FST<>( + readMetadata(clone, ByteSequenceOutputs.getSingleton()), + clone, + new OffHeapFSTStore()); } else { - index = new FST<>(metaIn, clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore()); + index = + new FST<>( + readMetadata(metaIn, ByteSequenceOutputs.getSingleton()), + clone, + new OffHeapFSTStore()); } /* if (false) { diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestManyPointsInOldIndex.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestManyPointsInOldIndex.java index 0a7571bcc2f..7cfe57ddba6 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestManyPointsInOldIndex.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestManyPointsInOldIndex.java @@ -22,6 +22,7 @@ import java.nio.file.Path; import java.nio.file.Paths; import org.apache.lucene.document.Document; import org.apache.lucene.document.IntPoint; +import org.apache.lucene.index.CheckIndex; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; @@ -70,7 +71,7 @@ public class TestManyPointsInOldIndex extends LuceneTestCase { dir.setCheckIndexOnClose(false); // ... because we check ourselves here: - TestUtil.checkIndex(dir, false, true, true, null); + TestUtil.checkIndex(dir, CheckIndex.Level.MIN_LEVEL_FOR_INTEGRITY_CHECKS, true, true, null); dir.close(); } } diff --git a/lucene/benchmark-jmh/build.gradle b/lucene/benchmark-jmh/build.gradle index 4a35d998dc9..2a0e0bffd01 100644 --- a/lucene/benchmark-jmh/build.gradle +++ b/lucene/benchmark-jmh/build.gradle @@ -23,6 +23,7 @@ description = 'Lucene JMH micro-benchmarking module' dependencies { moduleImplementation project(':lucene:core') + moduleImplementation project(':lucene:expressions') moduleImplementation "org.openjdk.jmh:jmh-core:1.37" annotationProcessor "org.openjdk.jmh:jmh-generator-annprocess:1.37" @@ -42,7 +43,7 @@ tasks.matching { it.name == "forbiddenApisMain" }.configureEach { tasks.matching { it.name in [ // Turn off JMH dependency checksums and licensing (it's GPL w/ classpath exception // but this seems fine for test/build only tools). - "validateJarChecksums", "validateJarLicenses", + "validateJarChecksums", "validateJarLicenses", "collectJarInfos", // No special javadocs for JMH benchmarks. 
"renderSiteJavadoc", "renderJavadoc", diff --git a/lucene/benchmark-jmh/src/java/module-info.java b/lucene/benchmark-jmh/src/java/module-info.java index c212a42fb7e..d92164cfae1 100644 --- a/lucene/benchmark-jmh/src/java/module-info.java +++ b/lucene/benchmark-jmh/src/java/module-info.java @@ -20,6 +20,7 @@ module org.apache.lucene.benchmark.jmh { requires jmh.core; requires jdk.unsupported; requires org.apache.lucene.core; + requires org.apache.lucene.expressions; exports org.apache.lucene.benchmark.jmh; exports org.apache.lucene.benchmark.jmh.jmh_generated; diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/ExpressionsBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/ExpressionsBenchmark.java new file mode 100644 index 00000000000..0c65305dccd --- /dev/null +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/ExpressionsBenchmark.java @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.benchmark.jmh; + +import java.io.IOException; +import java.lang.invoke.MethodHandle; +import java.lang.invoke.MethodHandles; +import java.lang.invoke.MethodType; +import java.text.ParseException; +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; +import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.TimeUnit; +import org.apache.lucene.expressions.Expression; +import org.apache.lucene.expressions.js.JavascriptCompiler; +import org.apache.lucene.search.DoubleValues; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; + +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@State(Scope.Benchmark) +@Warmup(iterations = 5, time = 5) +@Measurement(iterations = 12, time = 8) +@Fork(value = 1) +public class ExpressionsBenchmark { + + /** + * Some extra functions to bench "identity" in various variants, another one is named + * "native_identity" (see below). 
+ */ + private static final Map FUNCTIONS = getFunctions(); + + private static final String NATIVE_IDENTITY_NAME = "native_identity"; + + private static Map getFunctions() { + try { + var lookup = MethodHandles.lookup(); + Map m = new HashMap<>(JavascriptCompiler.DEFAULT_FUNCTIONS); + m.put( + "func_identity", + lookup.findStatic( + lookup.lookupClass(), "ident", MethodType.methodType(double.class, double.class))); + m.put("mh_identity", MethodHandles.identity(double.class)); + return m; + } catch (ReflectiveOperationException e) { + throw new AssertionError(e); + } + } + + @SuppressWarnings("unused") + private static double ident(double v) { + return v; + } + + /** A native implementation of an expression to compare performance */ + private static final Expression NATIVE_IDENTITY_EXPRESSION = + new Expression(NATIVE_IDENTITY_NAME, new String[] {"x"}) { + @Override + public double evaluate(DoubleValues[] functionValues) throws IOException { + return functionValues[0].doubleValue(); + } + }; + + private double[] randomData; + private Expression expression; + + @Param({"x", "func_identity(x)", "mh_identity", "native_identity", "cos(x)", "cos(x) + sin(x)"}) + String js; + + @Setup(Level.Iteration) + public void init() throws ParseException { + ThreadLocalRandom random = ThreadLocalRandom.current(); + randomData = random.doubles().limit(1024).toArray(); + expression = + Objects.equals(js, NATIVE_IDENTITY_NAME) + ? NATIVE_IDENTITY_EXPRESSION + : JavascriptCompiler.compile(js, FUNCTIONS); + } + + @Benchmark + public double expression() throws IOException { + var it = new ValuesIterator(randomData); + var values = it.getDoubleValues(); + double result = 0d; + while (it.next()) { + result += expression.evaluate(values); + } + return result; + } + + static final class ValuesIterator { + final double[] data; + final DoubleValues[] dv; + int pos = -1; + + ValuesIterator(double[] data) { + this.data = data; + var dv = + new DoubleValues() { + @Override + public double doubleValue() throws IOException { + return data[pos]; + } + + @Override + public boolean advanceExact(int doc) throws IOException { + throw new UnsupportedOperationException(); + } + }; + this.dv = new DoubleValues[] {dv}; + } + + boolean next() { + pos++; + return (pos < data.length); + } + + DoubleValues[] getDoubleValues() { + return dv; + } + } +} diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/GroupVIntBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/GroupVIntBenchmark.java new file mode 100644 index 00000000000..0a1a20843ef --- /dev/null +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/GroupVIntBenchmark.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.benchmark.jmh; + +import java.io.IOException; +import java.nio.file.Files; +import java.util.Arrays; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.lucene.codecs.lucene99.GroupVIntReader; +import org.apache.lucene.codecs.lucene99.GroupVIntWriter; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.MMapDirectory; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.MICROSECONDS) +@State(Scope.Benchmark) +@Warmup(iterations = 3, time = 3) +@Measurement(iterations = 5, time = 5) +@Fork( + value = 1, + jvmArgsPrepend = {"--add-modules=jdk.unsupported"}) +public class GroupVIntBenchmark { + + // Cumulative frequency for each number of bits per value used by doc deltas of tail postings on + // wikibigall. + private static final float[] CUMULATIVE_FREQUENCY_BY_BITS_REQUIRED = + new float[] { + 0.0f, + 0.01026574f, + 0.021453038f, + 0.03342156f, + 0.046476692f, + 0.060890317f, + 0.07644147f, + 0.093718216f, + 0.11424741f, + 0.13989712f, + 0.17366524f, + 0.22071244f, + 0.2815692f, + 0.3537585f, + 0.43655503f, + 0.52308f, + 0.6104675f, + 0.7047371f, + 0.78155357f, + 0.8671179f, + 0.9740598f, + 1.0f + }; + + final int maxSize = 256; + final long[] values = new long[maxSize]; + + IndexInput byteBufferGVIntIn; + IndexInput byteBufferVIntIn; + + ByteArrayDataInput byteArrayVIntIn; + ByteArrayDataInput byteArrayGVIntIn; + + // @Param({"16", "32", "64", "128", "248"}) + @Param({"64"}) + public int size; + + void initArrayInput(long[] docs) throws Exception { + byte[] gVIntBytes = new byte[Integer.BYTES * maxSize * 2]; + byte[] vIntBytes = new byte[Integer.BYTES * maxSize * 2]; + ByteArrayDataOutput vIntOut = new ByteArrayDataOutput(vIntBytes); + GroupVIntWriter w = new GroupVIntWriter(); + w.writeValues(new ByteArrayDataOutput(gVIntBytes), docs, docs.length); + for (long v : docs) { + vIntOut.writeVInt((int) v); + } + byteArrayVIntIn = new ByteArrayDataInput(vIntBytes); + byteArrayGVIntIn = new ByteArrayDataInput(gVIntBytes); + } + + void initByteBufferInput(long[] docs) throws Exception { + Directory dir = MMapDirectory.open(Files.createTempDirectory("groupvintdata")); + IndexOutput vintOut = dir.createOutput("vint", IOContext.DEFAULT); + IndexOutput gvintOut = dir.createOutput("gvint", IOContext.DEFAULT); + + GroupVIntWriter w = new GroupVIntWriter(); + w.writeValues(gvintOut, docs, docs.length); + for (long v : docs) { + vintOut.writeVInt((int) v); + } + vintOut.close(); + gvintOut.close(); + byteBufferGVIntIn = dir.openInput("gvint", IOContext.DEFAULT); + byteBufferVIntIn = dir.openInput("vint", IOContext.DEFAULT); + } + + @Setup(Level.Trial) + public void init() throws Exception { + long[] docs = 
new long[maxSize]; + Random r = new Random(0); + for (int i = 0; i < maxSize; ++i) { + float randomFloat = r.nextFloat(); + // Reproduce the distribution of the number of bits per values that we're observing for tail + // postings on wikibigall. + int numBits = 1 + Arrays.binarySearch(CUMULATIVE_FREQUENCY_BY_BITS_REQUIRED, randomFloat); + if (numBits < 0) { + numBits = -numBits; + } + docs[i] = r.nextInt(1 << (numBits - 1), 1 << numBits); + } + initByteBufferInput(docs); + initArrayInput(docs); + } + + @Benchmark + public void byteBufferReadVInt(Blackhole bh) throws IOException { + byteBufferVIntIn.seek(0); + for (int i = 0; i < size; i++) { + values[i] = byteBufferVIntIn.readVInt(); + } + bh.consume(values); + } + + @Benchmark + public void byteBufferReadGroupVInt(Blackhole bh) throws IOException { + byteBufferGVIntIn.seek(0); + GroupVIntReader.readValues(byteBufferGVIntIn, values, size); + bh.consume(values); + } + + @Benchmark + public void byteArrayReadVInt(Blackhole bh) { + byteArrayVIntIn.rewind(); + for (int i = 0; i < size; i++) { + values[i] = byteArrayVIntIn.readVInt(); + } + bh.consume(values); + } + + @Benchmark + public void byteArrayReadGroupVInt(Blackhole bh) throws IOException { + byteArrayGVIntIn.rewind(); + GroupVIntReader.readValues(byteArrayGVIntIn, values, size); + bh.consume(values); + } +} diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java index 405305d50a1..58cf8e79efa 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java @@ -30,8 +30,8 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Sort; import org.apache.lucene.search.TopDocs; -import org.apache.lucene.search.TopFieldCollector; -import org.apache.lucene.search.TopScoreDocCollector; +import org.apache.lucene.search.TopFieldCollectorManager; +import org.apache.lucene.search.TopScoreDocCollectorManager; import org.apache.lucene.store.Directory; import org.apache.lucene.util.Bits; @@ -110,15 +110,17 @@ public abstract class ReadTask extends PerfTask { // the IndexSearcher search methods that take // Weight public again, we can go back to // pulling the Weight ourselves: - TopFieldCollector collector = - TopFieldCollector.create(sort, numHits, withTotalHits() ? Integer.MAX_VALUE : 1); - searcher.search(q, collector); - hits = collector.topDocs(); + int totalHitsThreshold = withTotalHits() ? Integer.MAX_VALUE : 1; + TopFieldCollectorManager collectorManager = + new TopFieldCollectorManager( + sort, numHits, null, totalHitsThreshold, searcher.getSlices().length > 1); + hits = searcher.search(q, collectorManager); } else { hits = searcher.search(q, numHits); } } else { Collector collector = createCollector(); + searcher.search(q, collector); // hits = collector.topDocs(); } @@ -183,7 +185,8 @@ public abstract class ReadTask extends PerfTask { } protected Collector createCollector() throws Exception { - return TopScoreDocCollector.create(numHits(), withTotalHits() ? Integer.MAX_VALUE : 1); + return new TopScoreDocCollectorManager(numHits(), withTotalHits() ? 
Integer.MAX_VALUE : 1) + .newCollector(); } protected Document retrieveDoc(StoredFields storedFields, int id) throws IOException { diff --git a/lucene/classification/src/java/org/apache/lucene/classification/BooleanPerceptronClassifier.java b/lucene/classification/src/java/org/apache/lucene/classification/BooleanPerceptronClassifier.java index 4a8482c4fc7..280e82cc2c0 100644 --- a/lucene/classification/src/java/org/apache/lucene/classification/BooleanPerceptronClassifier.java +++ b/lucene/classification/src/java/org/apache/lucene/classification/BooleanPerceptronClassifier.java @@ -207,7 +207,8 @@ public class BooleanPerceptronClassifier implements Classifier { private void updateFST(SortedMap weights) throws IOException { PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); - FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); + FSTCompiler fstCompiler = + new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build(); BytesRefBuilder scratchBytes = new BytesRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder(); for (Map.Entry entry : weights.entrySet()) { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexReader.java index 5ebaa9e5d8e..154e0e22066 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexReader.java @@ -16,6 +16,8 @@ */ package org.apache.lucene.codecs.blockterms; +import static org.apache.lucene.util.fst.FST.readMetadata; + import java.io.IOException; import java.util.Collection; import java.util.Collections; @@ -154,7 +156,7 @@ public class VariableGapTermsIndexReader extends TermsIndexReaderBase { public FieldIndexData(IndexInput in, FieldInfo fieldInfo, long indexStart) throws IOException { IndexInput clone = in.clone(); clone.seek(indexStart); - fst = new FST<>(clone, clone, fstOutputs); + fst = new FST<>(readMetadata(clone, fstOutputs), clone); clone.close(); /* diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java index f15650670a1..04e3e80c71b 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java @@ -238,7 +238,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase { public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException { this.fieldInfo = fieldInfo; fstOutputs = PositiveIntOutputs.getSingleton(); - fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, fstOutputs); + fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, fstOutputs).build(); indexStart = out.getFilePointer(); //// System.out.println("VGW: field=" + fieldInfo.name); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsFieldReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsFieldReader.java index 486724a0098..c3e2b0362f4 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsFieldReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsFieldReader.java @@ -16,6 +16,8 @@ */ package org.apache.lucene.codecs.blocktreeords; +import static 
org.apache.lucene.util.fst.FST.readMetadata; + import java.io.IOException; import org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output; import org.apache.lucene.index.FieldInfo; @@ -85,7 +87,7 @@ final class OrdsFieldReader extends Terms { final IndexInput clone = indexIn.clone(); // System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name); clone.seek(indexStartFP); - index = new FST<>(clone, clone, OrdsBlockTreeTermsWriter.FST_OUTPUTS); + index = new FST<>(readMetadata(clone, OrdsBlockTreeTermsWriter.FST_OUTPUTS), clone); /* if (true) { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java index d4d59d278ff..e63c85a3050 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java @@ -194,7 +194,8 @@ public class FSTTermsReader extends FieldsProducer { this.sumDocFreq = sumDocFreq; this.docCount = docCount; OffHeapFSTStore offHeapFSTStore = new OffHeapFSTStore(); - this.dict = new FST<>(in, in, new FSTTermOutputs(fieldInfo), offHeapFSTStore); + FSTTermOutputs outputs = new FSTTermOutputs(fieldInfo); + this.dict = new FST<>(FST.readMetadata(in, outputs), in, offHeapFSTStore); in.skipBytes(offHeapFSTStore.size()); } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java index b5722ace3d7..fa46f6451da 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java @@ -251,12 +251,12 @@ public class FSTTermsWriter extends FieldsConsumer { private final IntsRefBuilder scratchTerm = new IntsRefBuilder(); private final ByteBuffersDataOutput metaWriter = ByteBuffersDataOutput.newResettableInstance(); - TermsWriter(FieldInfo fieldInfo) { + TermsWriter(FieldInfo fieldInfo) throws IOException { this.numTerms = 0; this.fieldInfo = fieldInfo; postingsWriter.setField(fieldInfo); this.outputs = new FSTTermOutputs(fieldInfo); - this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); + this.fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build(); } public void finishTerm(BytesRef text, BlockTermState state) throws IOException { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java index c5718753462..abfbdd25a02 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java @@ -683,7 +683,7 @@ class SimpleTextFieldsReader extends FieldsProducer { final PairOutputs outputsInner = new PairOutputs<>(posIntOutputs, posIntOutputs); final PairOutputs, PairOutputs.Pair> outputs = new PairOutputs<>(outputsOuter, outputsInner); - fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); + fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build(); IndexInput in = SimpleTextFieldsReader.this.in.clone(); in.seek(termsStart); final BytesRefBuilder lastTerm = new BytesRefBuilder(); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextStoredFieldsFormat.java 
b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextStoredFieldsFormat.java index 34b24d8fce0..50ce5987eba 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextStoredFieldsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextStoredFieldsFormat.java @@ -37,7 +37,6 @@ public class SimpleTextStoredFieldsFormat extends StoredFieldsFormat { @Override public StoredFieldsReader fieldsReader( Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException { - ; return new SimpleTextStoredFieldsReader(directory, si, fn, context); } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java index 7778a1e1816..0a6c9010143 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java @@ -89,10 +89,11 @@ public class FSTDictionary implements IndexDictionary { isFSTOnHeap = true; } PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton(); + FST.FSTMetadata metadata = FST.readMetadata(fstDataInput, fstOutputs); FST fst = isFSTOnHeap - ? new FST<>(fstDataInput, fstDataInput, fstOutputs) - : new FST<>(fstDataInput, fstDataInput, fstOutputs, new OffHeapFSTStore()); + ? new FST<>(metadata, fstDataInput) + : new FST<>(metadata, fstDataInput, new OffHeapFSTStore()); return new FSTDictionary(fst); } @@ -171,9 +172,9 @@ public class FSTDictionary implements IndexDictionary { protected final FSTCompiler fstCompiler; protected final IntsRefBuilder scratchInts; - public Builder() { + public Builder() throws IOException { PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); - fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); + fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build(); scratchInts = new IntsRefBuilder(); } diff --git a/lucene/core/src/java/org/apache/lucene/analysis/DelegatingAnalyzerWrapper.java b/lucene/core/src/java/org/apache/lucene/analysis/DelegatingAnalyzerWrapper.java index 9fc24af29a0..6a77078a5cc 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/DelegatingAnalyzerWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/DelegatingAnalyzerWrapper.java @@ -100,5 +100,4 @@ public abstract class DelegatingAnalyzerWrapper extends AnalyzerWrapper { } } } - ; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java index 5a5a4e672be..f6d672aa41b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java @@ -70,7 +70,6 @@ public abstract class TermVectorsWriter implements Closeable, Accountable { /** Called after a doc and all its fields have been added. */ public void finishDocument() throws IOException {} - ; /** * Called before writing the terms of the field. {@link #startTerm(BytesRef, int)} will be called @@ -82,7 +81,6 @@ public abstract class TermVectorsWriter implements Closeable, Accountable { /** Called after a field and all its terms have been added. */ public void finishField() throws IOException {} - ; /** * Adds a term and its term frequency freq. 
If this field has positions and/or diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java index 2bbcc9541ed..2b25863d661 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java @@ -91,7 +91,11 @@ public final class FieldReader extends Terms { // Initialize FST always off-heap. final IndexInput clone = indexIn.clone(); clone.seek(indexStartFP); - index = new FST<>(metaIn, clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore()); + index = + new FST<>( + FST.readMetadata(metaIn, ByteSequenceOutputs.getSingleton()), + clone, + new OffHeapFSTStore()); /* if (false) { final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnum.java index 5773e4a5f0f..9475c0de5f8 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnum.java @@ -30,9 +30,7 @@ import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.automaton.ByteRunnable; import org.apache.lucene.util.automaton.Transition; import org.apache.lucene.util.automaton.TransitionAccessor; -import org.apache.lucene.util.fst.ByteSequenceOutputs; import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.Outputs; /** * This is used to implement efficient {@link Terms#intersect} for block-tree. Note that it cannot @@ -46,7 +44,6 @@ final class IntersectTermsEnum extends BaseTermsEnum { // static boolean DEBUG = BlockTreeTermsWriter.DEBUG; final IndexInput in; - static final Outputs fstOutputs = ByteSequenceOutputs.getSingleton(); IntersectTermsEnumFrame[] stack; @@ -68,6 +65,9 @@ final class IntersectTermsEnum extends BaseTermsEnum { private BytesRef savedStartTerm; + private final SegmentTermsEnum.OutputAccumulator outputAccumulator = + new SegmentTermsEnum.OutputAccumulator(); + // TODO: in some cases we can filter by length? 
eg // regexp foo*bar must be at least length 6 bytes public IntersectTermsEnum( @@ -114,7 +114,6 @@ final class IntersectTermsEnum extends BaseTermsEnum { f.prefix = 0; f.setState(0); f.arc = arc; - f.outputPrefix = arc.output(); f.load(fr.rootCode); // for assert: @@ -184,7 +183,9 @@ final class IntersectTermsEnum extends BaseTermsEnum { FST.Arc arc = currentFrame.arc; int idx = currentFrame.prefix; assert currentFrame.suffix > 0; - BytesRef output = currentFrame.outputPrefix; + + outputAccumulator.reset(); + outputAccumulator.push(arc.output()); while (idx < f.prefix) { final int target = term.bytes[idx] & 0xff; // TODO: we could be more efficient for the next() @@ -192,14 +193,14 @@ final class IntersectTermsEnum extends BaseTermsEnum { // passed to findTargetArc arc = fr.index.findTargetArc(target, arc, getArc(1 + idx), fstReader); assert arc != null; - output = fstOutputs.add(output, arc.output()); + outputAccumulator.push(arc.output()); idx++; } f.arc = arc; - f.outputPrefix = output; assert arc.isFinal(); - f.load(fstOutputs.add(output, arc.nextFinalOutput())); + outputAccumulator.push(arc.nextFinalOutput()); + f.load(outputAccumulator); return f; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java index d9ca7a9bbd8..2b0e05a0b09 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java @@ -55,7 +55,6 @@ final class IntersectTermsEnumFrame { int statsSingletonRunLength = 0; final ByteArrayDataInput statsReader = new ByteArrayDataInput(); - byte[] floorData = new byte[32]; final ByteArrayDataInput floorDataReader = new ByteArrayDataInput(); // Length of prefix shared by all terms in this block @@ -90,9 +89,6 @@ final class IntersectTermsEnumFrame { final ByteArrayDataInput bytesReader = new ByteArrayDataInput(); - // Cumulative output so far - BytesRef outputPrefix; - int startBytePos; int suffix; @@ -120,7 +116,7 @@ final class IntersectTermsEnumFrame { } } while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min); - load(null); + load((Long) null); } public void setState(int state) { @@ -142,12 +138,22 @@ final class IntersectTermsEnumFrame { } void load(BytesRef frameIndexData) throws IOException { - if (frameIndexData != null) { - floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length); - // Skip first long -- has redundant fp, hasTerms - // flag, isFloor flag - final long code = ite.fr.readVLongOutput(floorDataReader); - if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) { + floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length); + load(ite.fr.readVLongOutput(floorDataReader)); + } + + void load(SegmentTermsEnum.OutputAccumulator outputAccumulator) throws IOException { + outputAccumulator.prepareRead(); + long code = ite.fr.readVLongOutput(outputAccumulator); + outputAccumulator.setFloorData(floorDataReader); + load(code); + } + + void load(Long blockCode) throws IOException { + if (blockCode != null) { + // This block is the first one in a possible sequence of floor blocks corresponding to a + // single seek point from the FST terms index + if ((blockCode & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) { // Floor frame numFollowFloorBlocks = floorDataReader.readVInt(); 
nextFloorLabel = floorDataReader.readByte() & 0xff; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java index acd75092022..463532218cd 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java @@ -16,6 +16,8 @@ */ package org.apache.lucene.codecs.lucene90.blocktree; +import static org.apache.lucene.util.fst.FSTCompiler.getOnHeapReaderWriter; + import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; @@ -525,7 +527,7 @@ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer { // Disable suffixes sharing for block tree index because suffixes are mostly dropped // from the FST index and left in the term blocks. .suffixRAMLimitMB(0d) - .bytesPageBits(pageBits) + .dataOutput(getOnHeapReaderWriter(pageBits)) .build(); // if (DEBUG) { // System.out.println(" compile index for prefix=" + prefix); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java index cb5577d8d6c..30a4529c5da 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java @@ -24,6 +24,7 @@ import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.TermState; import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.DataInput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; @@ -48,7 +49,7 @@ final class SegmentTermsEnum extends BaseTermsEnum { // static boolean DEBUG = BlockTreeTermsWriter.DEBUG; - private final ByteArrayDataInput scratchReader = new ByteArrayDataInput(); + private final OutputAccumulator outputAccumulator = new OutputAccumulator(); // What prefix of the current term was present in the index; when we only next() through the // index, this stays at 0. 
It's only set when @@ -232,18 +233,24 @@ final class SegmentTermsEnum extends BaseTermsEnum { return arcs[ord]; } - // Pushes a frame we seek'd to SegmentTermsEnumFrame pushFrame(FST.Arc arc, BytesRef frameData, int length) throws IOException { - scratchReader.reset(frameData.bytes, frameData.offset, frameData.length); - final long code = fr.readVLongOutput(scratchReader); + outputAccumulator.reset(); + outputAccumulator.push(frameData); + return pushFrame(arc, length); + } + + // Pushes a frame we seek'd to + SegmentTermsEnumFrame pushFrame(FST.Arc arc, int length) throws IOException { + outputAccumulator.prepareRead(); + final long code = fr.readVLongOutput(outputAccumulator); final long fpSeek = code >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS; final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord); f.hasTerms = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0; f.hasTermsOrig = f.hasTerms; f.isFloor = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0; if (f.isFloor) { - f.setFloorData(scratchReader, frameData); + f.setFloorData(outputAccumulator); } pushFrame(arc, fpSeek, length); @@ -344,9 +351,9 @@ final class SegmentTermsEnum extends BaseTermsEnum { FST.Arc arc; int targetUpto; - BytesRef output; targetBeforeCurrentLength = currentFrame.ord; + outputAccumulator.reset(); if (currentFrame != staticFrame) { @@ -363,7 +370,7 @@ final class SegmentTermsEnum extends BaseTermsEnum { arc = arcs[0]; assert arc.isFinal(); - output = arc.output(); + outputAccumulator.push(arc.output()); targetUpto = 0; SegmentTermsEnumFrame lastFrame = stack[0]; @@ -373,9 +380,6 @@ final class SegmentTermsEnum extends BaseTermsEnum { int cmp = 0; - // TODO: reverse vLong byte order for better FST - // prefix output sharing - // First compare up to valid seek frames: while (targetUpto < targetLimit) { cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF); @@ -394,9 +398,8 @@ final class SegmentTermsEnum extends BaseTermsEnum { + (char) arc.label() + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF); - if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) { - output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output()); - } + outputAccumulator.push(arc.output()); + if (arc.isFinal()) { lastFrame = stack[1 + lastFrame.ord]; } @@ -484,15 +487,15 @@ final class SegmentTermsEnum extends BaseTermsEnum { // System.out.println(" no seek state; push root frame"); // } - output = arc.output(); + outputAccumulator.push(arc.output()); currentFrame = staticFrame; // term.length = 0; targetUpto = 0; - currentFrame = - pushFrame( - arc, Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), 0); + outputAccumulator.push(arc.nextFinalOutput()); + currentFrame = pushFrame(arc, 0); + outputAccumulator.pop(); } // if (DEBUG) { @@ -554,9 +557,7 @@ final class SegmentTermsEnum extends BaseTermsEnum { term.setByteAt(targetUpto, (byte) targetLabel); // Aggregate output as we go: assert arc.output() != null; - if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) { - output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output()); - } + outputAccumulator.push(arc.output()); // if (DEBUG) { // System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + @@ -566,11 +567,9 @@ final class SegmentTermsEnum extends BaseTermsEnum { if (arc.isFinal()) { // if (DEBUG) System.out.println(" arc is final!"); - currentFrame = - pushFrame( - arc, - 
Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), - targetUpto); + outputAccumulator.push(arc.nextFinalOutput()); + currentFrame = pushFrame(arc, targetUpto); + outputAccumulator.pop(); // if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + // currentFrame.hasTerms); } @@ -630,9 +629,9 @@ final class SegmentTermsEnum extends BaseTermsEnum { FST.Arc arc; int targetUpto; - BytesRef output; targetBeforeCurrentLength = currentFrame.ord; + outputAccumulator.reset(); if (currentFrame != staticFrame) { @@ -649,7 +648,7 @@ final class SegmentTermsEnum extends BaseTermsEnum { arc = arcs[0]; assert arc.isFinal(); - output = arc.output(); + outputAccumulator.push(arc.output()); targetUpto = 0; SegmentTermsEnumFrame lastFrame = stack[0]; @@ -659,9 +658,6 @@ final class SegmentTermsEnum extends BaseTermsEnum { int cmp = 0; - // TODO: we should write our vLong backwards (MSB - // first) to get better sharing from the FST - // First compare up to valid seek frames: while (targetUpto < targetLimit) { cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF); @@ -680,14 +676,8 @@ final class SegmentTermsEnum extends BaseTermsEnum { + (char) arc.label() + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF); - // TODO: we could save the outputs in local - // byte[][] instead of making new objs ever - // seek; but, often the FST doesn't have any - // shared bytes (but this could change if we - // reverse vLong byte order) - if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) { - output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output()); - } + + outputAccumulator.push(arc.output()); if (arc.isFinal()) { lastFrame = stack[1 + lastFrame.ord]; } @@ -769,15 +759,15 @@ final class SegmentTermsEnum extends BaseTermsEnum { // System.out.println(" no seek state; push root frame"); // } - output = arc.output(); + outputAccumulator.push(arc.output()); currentFrame = staticFrame; // term.length = 0; targetUpto = 0; - currentFrame = - pushFrame( - arc, Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), 0); + outputAccumulator.push(arc.nextFinalOutput()); + currentFrame = pushFrame(arc, 0); + outputAccumulator.pop(); } // if (DEBUG) { @@ -839,9 +829,7 @@ final class SegmentTermsEnum extends BaseTermsEnum { arc = nextArc; // Aggregate output as we go: assert arc.output() != null; - if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) { - output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output()); - } + outputAccumulator.push(arc.output()); // if (DEBUG) { // System.out.println(" index: follow label=" + (target.bytes[target.offset + @@ -851,11 +839,9 @@ final class SegmentTermsEnum extends BaseTermsEnum { if (arc.isFinal()) { // if (DEBUG) System.out.println(" arc is final!"); - currentFrame = - pushFrame( - arc, - Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), - targetUpto); + outputAccumulator.push(arc.nextFinalOutput()); + currentFrame = pushFrame(arc, targetUpto); + outputAccumulator.pop(); // if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + // currentFrame.hasTerms); } @@ -1190,4 +1176,68 @@ final class SegmentTermsEnum extends BaseTermsEnum { public long ord() { throw new UnsupportedOperationException(); } + + static class OutputAccumulator extends DataInput { + + BytesRef[] outputs = new BytesRef[16]; + BytesRef current; + int num; + int outputIndex; + int index; + + 
void push(BytesRef output) { + if (output != Lucene90BlockTreeTermsReader.NO_OUTPUT) { + outputs = ArrayUtil.grow(outputs, num + 1); + outputs[num++] = output; + } + } + + void pop() { + assert num > 0; + num--; + } + + void reset() { + num = 0; + } + + void prepareRead() { + index = 0; + outputIndex = 0; + current = outputs[0]; + } + + /** + * Set the last arc as the source of the floorData. This won't change the reading position of + * this {@link OutputAccumulator} + */ + void setFloorData(ByteArrayDataInput floorData) { + assert outputIndex == num - 1 + : "floor data should be stored in last arc, get outputIndex: " + + outputIndex + + ", num: " + + num; + BytesRef output = outputs[outputIndex]; + floorData.reset(output.bytes, output.offset + index, output.length - index); + } + + @Override + public byte readByte() throws IOException { + if (index >= current.length) { + current = outputs[++outputIndex]; + index = 0; + } + return current.bytes[current.offset + index++]; + } + + @Override + public void readBytes(byte[] b, int offset, int len) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void skipBytes(long numBytes) throws IOException { + throw new UnsupportedOperationException(); + } + } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java index 48c4fd0a6d4..4016b5c784d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java @@ -55,7 +55,7 @@ final class SegmentTermsEnumFrame { int statsSingletonRunLength = 0; final ByteArrayDataInput statsReader = new ByteArrayDataInput(); - byte[] floorData = new byte[32]; + int rewindPos; final ByteArrayDataInput floorDataReader = new ByteArrayDataInput(); // Length of prefix shared by all terms in this block @@ -104,13 +104,9 @@ final class SegmentTermsEnumFrame { suffixLengthsReader = new ByteArrayDataInput(); } - public void setFloorData(ByteArrayDataInput in, BytesRef source) { - final int numBytes = source.length - (in.getPosition() - source.offset); - if (numBytes > floorData.length) { - floorData = new byte[ArrayUtil.oversize(numBytes, 1)]; - } - System.arraycopy(source.bytes, source.offset + in.getPosition(), floorData, 0, numBytes); - floorDataReader.reset(floorData, 0, numBytes); + public void setFloorData(SegmentTermsEnum.OutputAccumulator outputAccumulator) { + outputAccumulator.setFloorData(floorDataReader); + rewindPos = floorDataReader.getPosition(); numFollowFloorBlocks = floorDataReader.readVInt(); nextFloorLabel = floorDataReader.readByte() & 0xff; // if (DEBUG) { @@ -247,7 +243,7 @@ final class SegmentTermsEnumFrame { nextEnt = -1; hasTerms = hasTermsOrig; if (isFloor) { - floorDataReader.rewind(); + floorDataReader.setPosition(rewindPos); numFollowFloorBlocks = floorDataReader.readVInt(); assert numFollowFloorBlocks > 0; nextFloorLabel = floorDataReader.readByte() & 0xff; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/GroupVIntReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/GroupVIntReader.java new file mode 100644 index 00000000000..5fbd2069701 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/GroupVIntReader.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license 
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene99; + +import java.io.IOException; +import org.apache.lucene.store.DataInput; + +/** Decode integers using group-varint. */ +public class GroupVIntReader { + + public static void readValues(DataInput in, long[] docs, int limit) throws IOException { + int i; + for (i = 0; i <= limit - 4; i += 4) { + final int flag = in.readByte() & 0xFF; + + final int n1Minus1 = flag >> 6; + final int n2Minus1 = (flag >> 4) & 0x03; + final int n3Minus1 = (flag >> 2) & 0x03; + final int n4Minus1 = flag & 0x03; + + docs[i] = readLong(in, n1Minus1); + docs[i + 1] = readLong(in, n2Minus1); + docs[i + 2] = readLong(in, n3Minus1); + docs[i + 3] = readLong(in, n4Minus1); + } + for (; i < limit; ++i) { + docs[i] = in.readVInt(); + } + } + + private static long readLong(DataInput in, int numBytesMinus1) throws IOException { + switch (numBytesMinus1) { + case 0: + return in.readByte() & 0xFFL; + case 1: + return in.readShort() & 0xFFFFL; + case 2: + return (in.readShort() & 0xFFFFL) | ((in.readByte() & 0xFFL) << 16); + default: + return in.readInt() & 0xFFFFFFFFL; + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/GroupVIntWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/GroupVIntWriter.java new file mode 100644 index 00000000000..905cab22b2b --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/GroupVIntWriter.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene99; + +import java.io.IOException; +import org.apache.lucene.store.DataOutput; + +/** + * Encode integers using group-varint. It uses VInt to encode tail values that are not enough for a + * group + */ +public class GroupVIntWriter { + + // the maximum size of one group is 4 integers + 1 byte flag. 
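+ // Layout of one group: bytes[0] is the flag byte; from the high bits down it stores (numBytes - 1)
+ // for each of the four values (2 bits per value), followed by the value bytes, least significant byte first.
+ // Illustrative example (values not from this patch): {3, 290, 70000, 16909320} need 1/2/3/4 bytes,
+ // so the flag is 0b00_01_10_11 and the whole group takes 1 + 1 + 2 + 3 + 4 = 11 bytes.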
+ private byte[] bytes = new byte[17]; + private int byteOffset = 0; + + public GroupVIntWriter() {} + + private int encodeValue(int v) { + int lastOff = byteOffset; + do { + bytes[byteOffset++] = (byte) (v & 0xFF); + v >>>= 8; + } while (v != 0); + return byteOffset - lastOff; + } + + public void writeValues(DataOutput out, long[] values, int limit) throws IOException { + int off = 0; + + // encode each group + while ((limit - off) >= 4) { + byte flag = 0; + byteOffset = 1; + flag |= (encodeValue((int) values[off++]) - 1) << 6; + flag |= (encodeValue((int) values[off++]) - 1) << 4; + flag |= (encodeValue((int) values[off++]) - 1) << 2; + flag |= (encodeValue((int) values[off++]) - 1); + bytes[0] = flag; + out.writeBytes(bytes, byteOffset); + } + + // tail values that do not fill a complete group are written as plain VInts + for (; off < limit; off++) { + out.writeVInt((int) values[off]); + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswScalarQuantizedVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswScalarQuantizedVectorsFormat.java index 23d607a1c77..6023777ea94 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswScalarQuantizedVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswScalarQuantizedVectorsFormat.java @@ -31,6 +31,7 @@ import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.KnnVectorsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.search.TaskExecutor; import org.apache.lucene.util.hnsw.HnswGraph; /** @@ -60,7 +61,7 @@ public final class Lucene99HnswScalarQuantizedVectorsFormat extends KnnVectorsFo private final FlatVectorsFormat flatVectorsFormat; private final int numMergeWorkers; - private final ExecutorService mergeExec; + private final TaskExecutor mergeExec; /** Constructs a format using default graph construction parameters */ public Lucene99HnswScalarQuantizedVectorsFormat() { @@ -84,8 +85,8 @@ public final class Lucene99HnswScalarQuantizedVectorsFo * @param beamWidth the size of the queue maintained during graph construction. * @param numMergeWorkers number of workers (threads) that will be used when doing merge. If * larger than 1, a non-null {@link ExecutorService} must be passed as mergeExec - * @param configuredQuantile the quantile for scalar quantizing the vectors, when `null` it is - * calculated based on the vector field dimensions. + * @param confidenceInterval the confidence interval for scalar quantizing the vectors; when `null`, + * it is calculated based on the vector field dimensions.
* @param mergeExec the {@link ExecutorService} that will be used by ALL vector writers that are * generated by this format to do the merge */ @@ -93,7 +94,7 @@ public final class Lucene99HnswScalarQuantizedVectorsFormat extends KnnVectorsFo int maxConn, int beamWidth, int numMergeWorkers, - Float configuredQuantile, + Float confidenceInterval, ExecutorService mergeExec) { super("Lucene99HnswScalarQuantizedVectorsFormat"); if (maxConn <= 0 || maxConn > MAXIMUM_MAX_CONN) { @@ -121,8 +122,12 @@ public final class Lucene99HnswScalarQuantizedVectorsFormat extends KnnVectorsFo "No executor service is needed as we'll use single thread to merge"); } this.numMergeWorkers = numMergeWorkers; - this.mergeExec = mergeExec; - this.flatVectorsFormat = new Lucene99ScalarQuantizedVectorsFormat(configuredQuantile); + if (mergeExec != null) { + this.mergeExec = new TaskExecutor(mergeExec); + } else { + this.mergeExec = null; + } + this.flatVectorsFormat = new Lucene99ScalarQuantizedVectorsFormat(confidenceInterval); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java index 85d65df55b9..e2e154a6c51 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java @@ -27,6 +27,7 @@ import org.apache.lucene.codecs.lucene90.IndexedDISI; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.TaskExecutor; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.hnsw.HnswGraph; @@ -137,7 +138,7 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat { private static final FlatVectorsFormat flatVectorsFormat = new Lucene99FlatVectorsFormat(); private final int numMergeWorkers; - private final ExecutorService mergeExec; + private final TaskExecutor mergeExec; /** Constructs a format using default graph construction parameters */ public Lucene99HnswVectorsFormat() { @@ -192,7 +193,11 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat { "No executor service is needed as we'll use single thread to merge"); } this.numMergeWorkers = numMergeWorkers; - this.mergeExec = mergeExec; + if (mergeExec != null) { + this.mergeExec = new TaskExecutor(mergeExec); + } else { + this.mergeExec = null; + } } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsReader.java index 47f1b726527..140477cf749 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsReader.java @@ -92,18 +92,8 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader } catch (Throwable exception) { priorE = exception; } finally { - try { - CodecUtil.checkFooter(meta, priorE); - success = true; - } finally { - if (success == false) { - IOUtils.close(flatVectorsReader); - } - } + CodecUtil.checkFooter(meta, priorE); } - } - success = false; - try { vectorIndex = openDataInput( state, @@ -237,12 +227,22 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader || fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) { return; } - 
RandomVectorScorer scorer = flatVectorsReader.getRandomVectorScorer(field, target); - HnswGraphSearcher.search( - scorer, - new OrdinalTranslatedKnnCollector(knnCollector, scorer::ordToDoc), - getGraph(fieldEntry), - scorer.getAcceptOrds(acceptDocs)); + final RandomVectorScorer scorer = flatVectorsReader.getRandomVectorScorer(field, target); + final KnnCollector collector = + new OrdinalTranslatedKnnCollector(knnCollector, scorer::ordToDoc); + final Bits acceptedOrds = scorer.getAcceptOrds(acceptDocs); + if (knnCollector.k() < scorer.maxOrd()) { + HnswGraphSearcher.search(scorer, collector, getGraph(fieldEntry), acceptedOrds); + } else { + // if k is larger than the number of vectors, we can just iterate over all vectors + // and collect them + for (int i = 0; i < scorer.maxOrd(); i++) { + if (acceptedOrds == null || acceptedOrds.get(i)) { + knnCollector.incVisitedCount(1); + knnCollector.collect(scorer.ordToDoc(i), scorer.score(i)); + } + } + } } @Override @@ -255,12 +255,22 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader || fieldEntry.vectorEncoding != VectorEncoding.BYTE) { return; } - RandomVectorScorer scorer = flatVectorsReader.getRandomVectorScorer(field, target); - HnswGraphSearcher.search( - scorer, - new OrdinalTranslatedKnnCollector(knnCollector, scorer::ordToDoc), - getGraph(fieldEntry), - scorer.getAcceptOrds(acceptDocs)); + final RandomVectorScorer scorer = flatVectorsReader.getRandomVectorScorer(field, target); + final KnnCollector collector = + new OrdinalTranslatedKnnCollector(knnCollector, scorer::ordToDoc); + final Bits acceptedOrds = scorer.getAcceptOrds(acceptDocs); + if (knnCollector.k() < scorer.maxOrd()) { + HnswGraphSearcher.search(scorer, collector, getGraph(fieldEntry), acceptedOrds); + } else { + // if k is larger than the number of vectors, we can just iterate over all vectors + // and collect them + for (int i = 0; i < scorer.maxOrd(); i++) { + if (acceptedOrds == null || acceptedOrds.get(i)) { + knnCollector.incVisitedCount(1); + knnCollector.collect(scorer.ordToDoc(i), scorer.score(i)); + } + } + } } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java index ec9909e9698..3cc1d631f60 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java @@ -23,7 +23,6 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import java.util.concurrent.ExecutorService; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.FlatVectorsWriter; import org.apache.lucene.codecs.KnnFieldVectorsWriter; @@ -35,6 +34,7 @@ import org.apache.lucene.index.MergeState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Sorter; import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.TaskExecutor; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.InfoStream; @@ -67,7 +67,7 @@ public final class Lucene99HnswVectorsWriter extends KnnVectorsWriter { private final int beamWidth; private final FlatVectorsWriter flatVectorWriter; private final int numMergeWorkers; - private final ExecutorService mergeExec; + private final TaskExecutor mergeExec; private final List> fields = new ArrayList<>(); private boolean 
finished; @@ -78,7 +78,7 @@ public final class Lucene99HnswVectorsWriter extends KnnVectorsWriter { int beamWidth, FlatVectorsWriter flatVectorWriter, int numMergeWorkers, - ExecutorService mergeExec) + TaskExecutor mergeExec) throws IOException { this.M = M; this.flatVectorWriter = flatVectorWriter; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99PostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99PostingsFormat.java index f233276c6c5..877746641b4 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99PostingsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99PostingsFormat.java @@ -158,8 +158,8 @@ import org.apache.lucene.util.packed.PackedInts; *
Frequencies and Skip Data *
The .doc file contains the lists of documents which contain each term, along with the * frequency of the term in that document (except when frequencies are omitted: {@link - IndexOptions#DOCS}). It also saves skip data to the beginning of each packed or VInt block, - when the length of document list is larger than packed block size. + IndexOptions#DOCS}). Skip data is saved once for the entire postings list, at the end of each + term's postings. *
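As a sanity check on the group-varint classes added above, here is a minimal round-trip sketch. It assumes GroupVIntWriter/GroupVIntReader exactly as introduced in this patch; ByteBuffersDataOutput, ByteArrayDataInput, the class name, and the sample values are only for illustration.

import java.io.IOException;
import org.apache.lucene.codecs.lucene99.GroupVIntReader;
import org.apache.lucene.codecs.lucene99.GroupVIntWriter;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteBuffersDataOutput;

class GroupVIntRoundTrip {
  static long[] roundTrip(long[] values) throws IOException {
    // Encode: the first four values form one group; any remaining values are written as plain VInts.
    ByteBuffersDataOutput out = new ByteBuffersDataOutput();
    new GroupVIntWriter().writeValues(out, values, values.length);

    // Decode the same number of values back from the encoded bytes.
    long[] decoded = new long[values.length];
    GroupVIntReader.readValues(new ByteArrayDataInput(out.toArrayCopy()), decoded, decoded.length);
    return decoded; // expected to equal the input, e.g. for {3, 290, 70000, 16909320, 5, 2}
  }
}

Note that the writer casts each value to int before encoding and the reader masks with 0xFFFFFFFFL, so this encoding is only meant for values that fit in an unsigned 32-bit range, such as doc ids and deltas.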
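Similarly, a small usage sketch for the merge-executor change: the five-argument constructor is the one shown in this patch, and a caller-supplied ExecutorService is now wrapped in a TaskExecutor internally. The pool size, graph parameters, and class name below are arbitrary, and shutting the pool down remains the caller's responsibility.

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;

class QuantizedHnswFormatExample {
  static KnnVectorsFormat newFormat(ExecutorService mergeExec) {
    // maxConn=16, beamWidth=100, four merge workers; a null confidenceInterval lets the
    // format derive the interval from the vector dimensions.
    return new Lucene99HnswScalarQuantizedVectorsFormat(16, 100, 4, null, mergeExec);
  }

  static KnnVectorsFormat newFormatWithPool() {
    return newFormat(Executors.newFixedThreadPool(4));
  }
}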