Merge branch 'main' into optimize_prefix_query

zhouhui 2024-09-20 17:17:21 +08:00
commit 73b4ced245
718 changed files with 20858 additions and 7946 deletions

View File

@ -30,7 +30,7 @@ jobs:
strategy:
matrix:
os: [ ubuntu-latest ]
java-version: [ '22' ]
java-version: [ '23-ea' ]
uses-alt-java: [ true, false ]
runs-on: ${{ matrix.os }}
@ -61,7 +61,16 @@ jobs:
# https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#setting-an-environment-variable
echo "RUNTIME_JAVA_HOME=${{ env.ALT_JAVA_DIR }}" >> "$GITHUB_ENV"
- run: ./gradlew -p lucene/core check -x test
- name: ./gradlew tidy
run: |
./gradlew tidy
if [ ! -z "$(git status --porcelain)" ]; then
echo ":warning: **tidy left local checkout in modified state**" >> $GITHUB_STEP_SUMMARY
echo '```' >> $GITHUB_STEP_SUMMARY
git status --porcelain >> $GITHUB_STEP_SUMMARY
echo '```' >> $GITHUB_STEP_SUMMARY
git reset --hard && git clean -xfd .
fi
- name: ./gradlew regenerate
run: |
@ -69,7 +78,7 @@ jobs:
sudo apt-get install libwww-perl
./gradlew regenerate -x generateUAX29URLEmailTokenizerInternal --rerun-tasks
if [ ! -z "$(git status --porcelain)" ]; then
echo ":warning: **regenerateleft local checkout in modified state**" >> $GITHUB_STEP_SUMMARY
echo ":warning: **regenerate left local checkout in modified state**" >> $GITHUB_STEP_SUMMARY
echo '```' >> $GITHUB_STEP_SUMMARY
git status --porcelain >> $GITHUB_STEP_SUMMARY
echo '```' >> $GITHUB_STEP_SUMMARY
@ -79,8 +88,7 @@ jobs:
- run: ./gradlew testOpts
- run: ./gradlew helpWorkflow
- run: ./gradlew licenses updateLicenses
- run: ./gradlew tidy
- run: ./gradlew check -x test
- run: ./gradlew check -x test -Pvalidation.git.failOnModified=false
- run: ./gradlew assembleRelease mavenToLocal
# Conserve resources: only run these in non-alt-java mode.

View File

@ -18,7 +18,7 @@ jobs:
strategy:
matrix:
os: [ ubuntu-latest ]
java-version: [ '21', '22' ]
java-version: [ '21', '22', '23-ea' ]
runs-on: ${{ matrix.os }}
@ -72,3 +72,4 @@ jobs:
name: smoke-tester-logs-jdk-${{ matrix.java-version }}
path: |
${{ env.TMP_DIR }}/**/*.log
/tmp/release.log

View File

@ -41,7 +41,7 @@ import jdk.jfr.consumer.RecordingFile;
*/
public class ProfileResults {
/** Formats a frame to a formatted line. This is deduplicated on! */
static String frameToString(RecordedFrame frame, boolean lineNumbers) {
static String frameToString(RecordedFrame frame, boolean lineNumbers, boolean frameTypes) {
StringBuilder builder = new StringBuilder();
RecordedMethod method = frame.getMethod();
RecordedClass clazz = method.getType();
@ -55,13 +55,14 @@ public class ProfileResults {
builder.append("#");
builder.append(method.getName());
builder.append("()");
if (lineNumbers) {
if (lineNumbers && frame.getLineNumber() != -1) {
builder.append(":");
if (frame.getLineNumber() == -1) {
builder.append("(" + frame.getType() + " code)");
} else {
builder.append(frame.getLineNumber());
}
builder.append(frame.getLineNumber());
}
if (clazz != null && frameTypes) {
builder.append(" [");
builder.append(frame.getType());
builder.append(" code]");
}
return builder.toString();
}
@ -77,6 +78,8 @@ public class ProfileResults {
public static final String COUNT_DEFAULT = "10";
public static final String LINENUMBERS_KEY = "tests.profile.linenumbers";
public static final String LINENUMBERS_DEFAULT = "false";
public static final String FRAMETYPES_KEY = "tests.profile.frametypes";
public static final String FRAMETYPES_DEFAULT = "true";
/**
* Driver method, for testing standalone.
@ -92,7 +95,8 @@ public class ProfileResults {
System.getProperty(MODE_KEY, MODE_DEFAULT),
Integer.parseInt(System.getProperty(STACKSIZE_KEY, STACKSIZE_DEFAULT)),
Integer.parseInt(System.getProperty(COUNT_KEY, COUNT_DEFAULT)),
Boolean.parseBoolean(System.getProperty(LINENUMBERS_KEY, LINENUMBERS_DEFAULT)));
Boolean.parseBoolean(System.getProperty(LINENUMBERS_KEY, LINENUMBERS_DEFAULT)),
Boolean.parseBoolean(System.getProperty(FRAMETYPES_KEY, FRAMETYPES_DEFAULT)));
}
/** true if we care about this event */
@ -152,7 +156,12 @@ public class ProfileResults {
/** Process all the JFR files passed in args and print a merged summary. */
public static void printReport(
List<String> files, String mode, int stacksize, int count, boolean lineNumbers)
List<String> files,
String mode,
int stacksize,
int count,
boolean lineNumbers,
boolean frameTypes)
throws IOException {
if (!"cpu".equals(mode) && !"heap".equals(mode)) {
throw new IllegalArgumentException("tests.profile.mode must be one of (cpu,heap)");
@ -181,7 +190,7 @@ public class ProfileResults {
if (stack.length() > 0) {
stack.append("\n").append(framePadding).append(" at ");
}
stack.append(frameToString(trace.getFrames().get(i), lineNumbers));
stack.append(frameToString(trace.getFrames().get(i), lineNumbers, frameTypes));
}
String line = stack.toString();
SimpleEntry<String, Long> entry =

View File

@ -60,8 +60,8 @@ public class WrapperDownloader {
public static void checkVersion() {
int major = Runtime.version().feature();
if (major != 21 && major != 22) {
throw new IllegalStateException("java version must be 21 or 22, your version: " + major);
if (major != 21 && major != 22 && major != 23) {
throw new IllegalStateException("java version must be 21, 22 or 23, your version: " + major);
}
}

View File

@ -231,8 +231,8 @@ public class MissingDoclet extends StandardDoclet {
case PACKAGE:
checkComment(element);
break;
// class-like elements, check them, then recursively check their children (fields and
// methods)
// class-like elements, check them, then recursively check their children (fields and
// methods)
case CLASS:
case INTERFACE:
case ENUM:
@ -257,7 +257,7 @@ public class MissingDoclet extends StandardDoclet {
}
}
break;
// method-like elements, check them if we are configured to do so
// method-like elements, check them if we are configured to do so
case METHOD:
case CONSTRUCTOR:
case FIELD:

View File

@ -80,6 +80,9 @@ ext {
// Minimum Java version required to compile and run Lucene.
minJavaVersion = JavaVersion.toVersion(deps.versions.minJava.get())
// also change this in extractor tool: ExtractForeignAPI
vectorIncubatorJavaVersions = [ JavaVersion.VERSION_21, JavaVersion.VERSION_22, JavaVersion.VERSION_23 ] as Set
// snapshot build marker used in scripts.
snapshotBuild = version.contains("SNAPSHOT")
@ -117,10 +120,6 @@ apply from: file('gradle/generation/local-settings.gradle')
// Make sure the build environment is consistent.
apply from: file('gradle/validation/check-environment.gradle')
// IDE support, settings and specials.
apply from: file('gradle/ide/intellij-idea.gradle')
apply from: file('gradle/ide/eclipse.gradle')
// Set up defaults and configure aspects for certain modules or functionality
// (java, tests)
apply from: file('gradle/java/folder-layout.gradle')
@ -133,6 +132,10 @@ apply from: file('gradle/testing/alternative-jdk-support.gradle')
apply from: file('gradle/java/jar-manifest.gradle')
apply from: file('gradle/java/modules.gradle')
// IDE support, settings and specials.
apply from: file('gradle/ide/intellij-idea.gradle')
apply from: file('gradle/ide/eclipse.gradle')
// Maven artifact publishing.
apply from: file('gradle/maven/publications.gradle')

View File

@ -112,8 +112,8 @@ def prepare(root, version, pause_before_sign, gpg_key_id, gpg_password, gpg_home
checkDOAPfiles(version)
if not dev_mode:
print(' ./gradlew --no-daemon clean check')
run('./gradlew --no-daemon clean check')
print(' ./gradlew --stacktrace --no-daemon clean check')
run('./gradlew --stacktrace --no-daemon clean check')
else:
print(' skipping precommit check due to dev-mode')
@ -121,7 +121,7 @@ def prepare(root, version, pause_before_sign, gpg_key_id, gpg_password, gpg_home
input("Tests complete! Please press ENTER to proceed to assembleRelease: ")
print(' prepare-release')
cmd = './gradlew --no-daemon assembleRelease' \
cmd = './gradlew --stacktrace --no-daemon assembleRelease' \
' -Dversion.release=%s' % version
if dev_mode:
cmd += ' -Pvalidation.git.failOnModified=false'

View File

@ -32,7 +32,7 @@ allprojects {
missingdoclet "org.apache.lucene.tools:missing-doclet"
}
ext {
project.ext {
relativeDocPath = project.path.replaceFirst(/:\w+:/, "").replace(':', '/')
}

View File

@ -17,13 +17,6 @@
def resources = scriptResources(buildscript)
configure(rootProject) {
ext {
// also change this in extractor tool: ExtractForeignAPI
vectorIncubatorJavaVersions = [ JavaVersion.VERSION_21, JavaVersion.VERSION_22 ] as Set
}
}
configure(project(":lucene:core")) {
ext {
apijars = layout.projectDirectory.dir("src/generated/jdk")

View File

@ -43,6 +43,31 @@ configure(project(":lucene:core")) {
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
mustRunBefore: [ "compileJava" ]
])
task generateForDeltaUtilInternal() {
description "Regenerate gen_ForDeltaUtil.py"
group "generation"
def genDir = file("src/java/org/apache/lucene/codecs/lucene912")
def genScript = file("${genDir}/gen_ForDeltaUtil.py")
def genOutput = file("${genDir}/ForDeltaUtil.java")
inputs.file genScript
outputs.file genOutput
doLast {
quietExec {
workingDir genDir
executable project.externalTool("python3")
args = [ '-B', genScript ]
}
}
}
regenerate.dependsOn wrapWithPersistentChecksums(generateForDeltaUtilInternal, [
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
mustRunBefore: [ "compileJava" ]
])
}
configure(project(":lucene:backward-codecs")) {

View File

@ -65,10 +65,8 @@ configure(project(":lucene:analysis:icu")) {
icupkg = file("${icuBinDir}/icupkg")
}
// Resolve version lazily (can't resolve at configuration time).
def icu4jVersionProvider = project.provider { getVersion('com.ibm.icu', 'icu4j') }
// lazy gstring with ICU version.
def icu4jVersion = "${-> icu4jVersionProvider.get()}"
def icu4jVersion = deps.icu4j.get().version
def icuCompileTask = Os.isFamily(Os.FAMILY_WINDOWS) ? "compileIcuWindows" : "compileIcuLinux"

View File

@ -33,7 +33,7 @@ configure(project(":lucene:analysis:kuromoji")) {
apply plugin: deps.plugins.undercouch.download.get().pluginId
plugins.withType(JavaPlugin) {
ext {
project.ext {
targetDir = file("src/resources")
}

View File

@ -33,7 +33,7 @@ configure(project(":lucene:analysis:nori")) {
apply plugin: deps.plugins.undercouch.download.get().pluginId
plugins.withType(JavaPlugin) {
ext {
project.ext {
targetDir = file("src/resources")
}

View File

@ -22,10 +22,11 @@ import org.gradle.plugins.ide.eclipse.model.ClasspathEntry
def resources = scriptResources(buildscript)
configure(rootProject) {
plugins.withType(JavaPlugin) {
apply plugin: "eclipse"
if (gradle.startParameter.taskNames.contains("eclipse")) {
project.pluginManager.apply("java-base")
project.pluginManager.apply("eclipse")
def eclipseJavaVersion = propertyOrDefault("eclipse.javaVersion", rootProject.minJavaVersion)
def eclipseJavaVersion = propertyOrDefault("eclipse.javaVersion", deps.versions.minJava.get())
def relativize = { other -> rootProject.rootDir.relativePath(other).toString() }
eclipse {
@ -105,9 +106,9 @@ configure(rootProject) {
}
}
eclipseJdt {
eclipseJdt {
enabled = false
dependsOn 'luceneEclipse'
dependsOn 'luceneEclipseJdt'
}
eclipseClasspath {

View File

@ -27,7 +27,7 @@ def beastingMode = gradle.startParameter.taskNames.any{ name -> name == 'beast'
allprojects {
plugins.withType(JavaPlugin) {
ext {
project.ext {
testOptions += [
[propName: 'tests.dups', value: 0, description: "Reiterate runs of entire test suites ('beast' task)."]
]

View File

@ -19,7 +19,7 @@ def recordings = files()
allprojects {
plugins.withType(JavaPlugin) {
ext {
project.ext {
testOptions += [
[propName: 'tests.profile', value: false, description: "Enable Java Flight Recorder profiling."]
]

View File

@ -62,7 +62,7 @@ allprojects {
// Configure test property defaults and their descriptions.
allprojects {
plugins.withType(JavaPlugin) {
ext {
project.ext {
String randomVectorSize = RandomPicks.randomFrom(new Random(projectSeedLong), ["default", "128", "256", "512"])
testOptions += [
// seed, repetition and amplification.
@ -135,14 +135,14 @@ allprojects {
}
afterEvaluate {
ext.testOptionsResolved = testOptions.findAll { opt ->
project.ext.testOptionsResolved = testOptions.findAll { opt ->
propertyOrDefault(opt.propName, opt.value) != null
}.collectEntries { opt ->
[(opt.propName): Objects.toString(resolvedTestOption(opt.propName))]
}
// Compute the "reproduce with" string.
ext.testOptionsForReproduceLine = testOptions.findAll { opt ->
project.ext.testOptionsForReproduceLine = testOptions.findAll { opt ->
if (opt["includeInReproLine"] == false) {
return false
}

View File

@ -22,7 +22,7 @@ def allSuites = []
allprojects {
plugins.withType(JavaPlugin) {
ext {
project.ext {
testOptions += [
[propName: 'tests.slowestTests', value: true, description: "Print the summary of the slowest tests."],
[propName: 'tests.slowestSuites', value: true, description: "Print the summary of the slowest suites."]

View File

@ -75,6 +75,18 @@ configure(rootProject) {
it.dependsOn(":versionCatalogFormatDeps")
}
// correct crlf/ default encoding after version catalog formatting finishes.
tasks.matching {
it.path in [
":versionCatalogFormatDeps"
]
}.configureEach {
it.doLast {
ant.fixcrlf(file: it.catalogFile.get().asFile,
eol: "lf", fixlast: "true", encoding: "UTF-8")
}
}
tasks.matching {
it.path in [
":versionCatalogUpdateDeps"

View File

@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
@defaultMessage Spawns threads with vague names; use a custom thread factory (Lucene's NamedThreadFactory, Solr's SolrNamedThreadFactory) and name threads so that you can tell (by its name) which executor it is associated with
@defaultMessage Spawns threads with vague names; use a custom thread factory (Lucene's NamedThreadFactory) and name threads so that you can tell (by its name) which executor it is associated with
java.util.concurrent.Executors#newFixedThreadPool(int)
java.util.concurrent.Executors#newSingleThreadExecutor()
java.util.concurrent.Executors#newCachedThreadPool()
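As a hedged illustration of the pattern this message asks for (the two-argument `Executors` overload with Lucene's `NamedThreadFactory`, so threads carry a recognizable name prefix; the pool name here is just an example):

```java
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.lucene.util.NamedThreadFactory;

class NamedExecutorExample {
  // Threads created by this pool carry the "search-worker" prefix in their names,
  // so a thread dump reveals which executor they belong to.
  static ExecutorService newSearchPool(int threads) {
    return Executors.newFixedThreadPool(threads, new NamedThreadFactory("search-worker"));
  }
}
```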

View File

@ -74,21 +74,6 @@ configure(rootProject) {
logger.warn("WARNING: Directory is not a valid git checkout (won't check dirty files): ${rootProject.projectDir}")
}
} else {
// git ignores any folders which are empty (this includes folders with recursively empty sub-folders).
def untrackedNonEmptyFolders = status.untrackedFolders.findAll { path ->
File location = file("${rootProject.projectDir}/${path}")
boolean hasFiles = false
Files.walkFileTree(location.toPath(), new SimpleFileVisitor<Path>() {
@Override
FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
hasFiles = true
// Terminate early.
return FileVisitResult.TERMINATE
}
})
return hasFiles
}
def offenders = [
// Exclude staged changes. These are fine in precommit.
// "(added)": status.added,
@ -97,8 +82,7 @@ configure(rootProject) {
"(conflicting)": status.conflicting,
"(missing)": status.missing,
"(modified)": status.modified,
"(untracked)": status.untracked,
"(untracked non-empty dir)": untrackedNonEmptyFolders
"(untracked)": status.untracked
].collectMany { fileStatus, files ->
files.collect {file -> " - ${file} ${fileStatus}" }
}.sort()

View File

@ -20,6 +20,10 @@
// 2) notice file
// 3) checksum validation/ generation.
// WARNING: The tasks in this file share internal state between tasks without using files.
// Because of this all tasks here must always execute together, so they cannot define task outputs.
// TODO: Rewrite the internal state to use state files containing the ext.jarInfos and its referencedFiles
// This should be false only for debugging.
def failOnError = true
@ -194,13 +198,6 @@ subprojects {
description = "Validate license and notice files of dependencies"
dependsOn collectJarInfos
def outputFileName = 'validateJarLicenses'
inputs.dir(file(project.rootDir.path + '/lucene/licenses'))
.withPropertyName('licenses')
.withPathSensitivity(PathSensitivity.RELATIVE)
outputs.file(layout.buildDirectory.file(outputFileName))
.withPropertyName('validateJarLicensesResult')
doLast {
def errors = []
jarInfos.each { dep ->
@ -246,9 +243,7 @@ subprojects {
}
}
}
// Required to take advantage of incremental building and the build cache
def f = new File(project.buildDir.path + "/" + outputFileName)
f.write(errors.toString(), "UTF-8")
if (errors) {
def msg = "Certain license/ notice files are missing:\n - " + errors.join("\n - ")
if (failOnError) {

View File

@ -1 +1 @@
cb0da6751c2b753a16ac168bb354870ebb1e162e9083f116729cec9c781156b8
2db75c40782f5e8ba1fc278a5574bab070adccb2d21ca5a6e5ed840888448046

View File

@ -1 +1 @@
8.8.0
8.10.0

View File

@ -1,6 +1,6 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-8.8-bin.zip
distributionUrl=https\://services.gradle.org/distributions/gradle-8.10-bin.zip
networkTimeout=10000
validateDistributionUrl=true
zipStoreBase=GRADLE_USER_HOME

View File

@ -112,6 +112,16 @@ API Changes
* GITHUB#13632: CandidateMatcher public matching functions (Bryan Jacobowitz)
* GITHUB#13708: Move Operations.sameLanguage/subsetOf to test-framework. (Robert Muir)
* GITHUB#13733: Move FacetsCollector#search utility methods to `FacetsCollectorManager`, replace the `Collector`
argument with a `FacetsCollectorManager` and update the return type to include both `TopDocs` results as well as
facets results. (Luca Cavanna)
* GITHUB#13328: Convert many basic Lucene classes to record classes, including CollectionStatistics, TermStatistics and LeafMetadata. (Shubham Chaudhary)
* GITHUB#13780: Remove `IndexSearcher#search(List<LeafReaderContext>, Weight, Collector)` in favour of the newly
introduced `IndexSearcher#search(LeafReaderContextPartition[], Weight, Collector)`
New Features
---------------------
@ -141,6 +151,19 @@ New Features
* GITHUB#13604: Add Kmeans clustering on vectors (Mayya Sharipova, Jim Ferenczi, Tom Veasey)
* GITHUB#13592: Take advantage of the doc value skipper when it is primary sort in SortedNumericDocValuesRangeQuery
and SortedSetDocValuesRangeQuery. (Ignacio Vera)
* GITHUB#13542: Add initial support for intra-segment concurrency. IndexSearcher now supports searching across leaf
reader partitions concurrently. This is useful to max out available resource usage especially with force merged
indices or big segments. There is still a performance penalty for queries that require segment-level computation
ahead of time, such as points/range queries. This is an implementation limitation that we expect to improve in
future releases, and that's why intra-segment slicing is not enabled by default, but leveraged in tests when the
searcher is created via LuceneTestCase#newSearcher. Users may override IndexSearcher#slices(List) to optionally
create slices that target segment partitions. (Luca Cavanna)
* GITHUB#13741: Implement Accountable for NFARunAutomaton, fix hashCode implementation of CompiledAutomaton. (Patrick Zhai)
Improvements
---------------------
@ -164,6 +187,8 @@ Improvements
* GITHUB#12172: Update Romanian stopwords list to include the modern unicode forms. (Trey Jones)
* GITHUB#13707: Improve Operations.isTotal() to work with non-minimal automata. (Dawid Weiss, Robert Muir)
Optimizations
---------------------
@ -176,6 +201,8 @@ Optimizations
* GITHUB#12552: Make FSTPostingsFormat load FSTs off-heap. (Tony X)
* GITHUB#13672: Leverage doc value skip lists in DocValuesRewriteMethod if indexed. (Greg Miller)
Bug Fixes
---------------------
@ -213,6 +240,9 @@ Changes in Backwards Compatibility Policy
* GITHUB#13230: Remove the Kp and Lovins snowball algorithms which are not supported
or intended for general use. (Robert Muir)
* GITHUB#13602: SearchWithCollectorTask no longer supports the `collector.class` config parameter to load a custom
collector implementation. `collector.manager.class` allows users to load a collector manager instead. (Luca Cavanna)
Other
---------------------
@ -253,6 +283,13 @@ Other
* GITHUB#13499: Remove usage of TopScoreDocCollector + TopFieldCollector deprecated methods (#create, #createSharedManager) (Jakub Slowinski)
Build
---------------------
* GITHUB#13649: Fix eclipse ide settings generation #13649 (Uwe Schindler, Dawid Weiss)
* GITHUB#13698: Upgrade to gradle 8.10 (Dawid Weiss)
======================== Lucene 9.12.0 =======================
API Changes
@ -268,6 +305,12 @@ API Changes
* GITHUB#13559: Add BitSet#nextSetBit(int, int) to get the index of the first set bit in range. (Egor Potemkin)
* GITHUB#13568: Add DoubleValuesSource#toSortableLongDoubleValuesSource and
MultiDoubleValuesSource#toSortableMultiLongValuesSource methods. (Shradha Shankar)
* GITHUB#13568, GITHUB#13750: Add DrillSideways#search method that supports any CollectorManagers for drill-sideways dimensions
or drill-down. (Egor Potemkin)
New Features
---------------------
@ -280,8 +323,21 @@ New Features
and LogByteSizeMergePolicy via a new #setTargetConcurrency setter.
(Adrien Grand)
* GITHUB#13568: Add sandbox facets module to compute facets while collecting. (Egor Potemkin, Shradha Shankar)
* GITHUB#13678: Add support for JDK 23 to the Panama Vectorization Provider. (Chris Hegarty)
* GITHUB#13689: Add a new faceting feature, dynamic range facets, which automatically picks a balanced set of numeric
ranges based on the distribution of values that occur across all hits. For use cases that have a highly variable
numeric doc values field, such as "price" in an e-commerce application, this facet method is powerful as it allows the
presented ranges to adapt depending on what hits the query actually matches. This is in contrast to existing range
faceting that requires the application to provide the specific fixed ranges up front. (Yuting Gan, Greg Miller,
Stefan Vodita)
Improvements
---------------------
* GITHUB#13475: Re-enable intra-merge parallelism except for terms, norms, and doc values.
Related to GITHUB#13478. (Ben Trent)
* GITHUB#13548: Refactor and javadoc update for KNN vector writer classes. (Patrick Zhai)
@ -299,6 +355,11 @@ Improvements
* GITHUB#13201: Better cost estimation on MultiTermQuery over few terms. (Michael Froh)
* GITHUB#13735: Migrate monitor package usage of deprecated IndexSearcher#search(Query, Collector)
to IndexSearcher#search(Query, CollectorManager). (Greg Miller)
* GITHUB#13746: Introduce ProfilerCollectorManager to parallelize search when using ProfilerCollector. (Luca Cavanna)
Optimizations
---------------------
@ -330,8 +391,14 @@ Optimizations
Closing many individual index files can potentially lead to a degradation in execution performance.
Index files are mmapped one-to-one with the JDK's foreign shared Arena. The JVM deoptimizes the top
few frames of all threads when closing a shared Arena (see JDK-8335480). We mitigate this situation
by 1) using a confined Arena where appropriate, and 2) grouping files from the same segment to a
single shared Arena. (Chris Hegarty, Michael Gibney, Uwe Schindler)
when running with JDK 21 and greater, by 1) using a confined Arena where appropriate, and 2) grouping
files from the same segment to a single shared Arena.
A system property has been added that allows controlling the total maximum number of mmapped files
that may be associated with a single shared Arena. For example, to set the max number of permits to
256, pass the following on the command line
-Dorg.apache.lucene.store.MMapDirectory.sharedArenaMaxPermits=256. Setting a value of 1 associates
a single file to a single shared arena.
(Chris Hegarty, Michael Gibney, Uwe Schindler)
* GITHUB#13585: Lucene912PostingsFormat, the new default postings format, now
only has 2 levels of skip data, which are inlined into postings instead of
@ -341,6 +408,22 @@ Optimizations
* GITHUB#13581: OnHeapHnswGraph no longer allocates a lock for every graph node (Mike Sokolov)
* GITHUB#13636, GITHUB#13658: Optimizations to the decoding logic of blocks of
postings. (Adrien Grand, Uwe Schindler, Greg Miller)
* GITHUB#13644: Improve NumericComparator competitive iterator logic by comparing the missing value with the top
value even after the hit queue is full (Pan Guixin)
* GITHUB#13587: Use Max WAND optimizations with ToParentBlockJoinQuery when using ScoreMode.Max (Mike Pellegrini)
* GITHUB#13742: Reorder checks in LRUQueryCache#count (Shubham Chaudhary)
* GITHUB#13686: Replace Map<String,Object> with IntObjectHashMap for DV producer (Pan Guixin)
* GITHUB#13697: Add a bulk scorer to ToParentBlockJoinQuery, which delegates to the bulk scorer of the child query.
This should speed up query evaluation when the child query has a specialized bulk scorer, such as disjunctive queries.
(Mike Pellegrini)
Changes in runtime behavior
---------------------
@ -366,9 +449,38 @@ Bug Fixes
* GITHUB#13627: Fix race condition on flush for DWPT seqNo generation. (Ben Trent, Ao Li)
* GITHUB#13691: Fix incorrect exponent value in explain of SigmoidFunction. (Owais Kazi)
* GITHUB#13703: Fix bug in LatLonPoint queries where narrow polygons close to latitude 90 don't
match any points due to an Integer overflow. (Ignacio Vera)
* GITHUB#13641: Unify how KnnFormats handle missing fields and correctly handle missing vector fields when
merging segments. (Ben Trent)
* GITHUB#13519: 8 bit scalar vector quantization is no longer
supported: it was buggy starting in 9.11 (GITHUB#13197). 4 and 7
bit quantization are still supported. Existing (9.x) Lucene indices
that previously used 8 bit quantization can still be read/searched
but the results from `KNN*VectorQuery` are silently buggy. Further
8 bit quantized vector indexing into such (9.11) indices is not
permitted, so your path forward if you wish to continue using the
same 9.11 index is to index additional vectors into the same field
with either 4 or 7 bit quantization (or no quantization), and ensure
all older (9.11 written) segments are rewritten either via
`IndexWriter.forceMerge` or
`IndexWriter.addIndexes(CodecReader...)`, or reindexing entirely.
Build
---------------------
* GITHUB#13695, GITHUB#13696: Fix Gradle build sometimes gives spurious "unreferenced license file" warnings.
(Uwe Schindler)
Other
--------------------
(No changes)
* GITHUB#13720: Add float comparison based on unit of least precision and use it to stop test failures caused by float
summation not being associative in IEEE 754. (Alex Herbert, Stefan Vodita)
======================== Lucene 9.11.1 =======================

View File

@ -80,9 +80,22 @@ behaviour as 9.x, clone `PersianAnalyzer` in 9.x or create custom analyzer by us
### AutomatonQuery/CompiledAutomaton/RunAutomaton/RegExp no longer determinize (LUCENE-10010)
These classes no longer take a `determinizeWorkLimit` and no longer determinize
behind the scenes. It is the responsibility of the caller to to call
behind the scenes. It is the responsibility of the caller to call
`Operations.determinize()` for DFA execution.
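For example, a minimal sketch (class names as in `org.apache.lucene.util.automaton`; the work limit constant is assumed to be the default one Lucene already exposes):

```java
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;

class DeterminizeExample {
  static Automaton toDfa(String pattern) {
    // The automaton built from the RegExp is no longer determinized behind the scenes.
    Automaton nfa = new RegExp(pattern).toAutomaton();
    // Callers now determinize explicitly before DFA execution.
    return Operations.determinize(nfa, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
  }
}
```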
### RegExp optional complement syntax has been deprecated
Support for the optional complement syntax (`~`) has been deprecated.
The `COMPLEMENT` syntax flag has been removed and replaced by the
`DEPRECATED_COMPLEMENT` flag. Users wanting to enable the deprecated
complement support can do so by explicitly passing syntax flags that
include `DEPRECATED_COMPLEMENT` when creating a `RegExp`. For example:
`new RegExp("~(foo)", RegExp.DEPRECATED_COMPLEMENT)`.
Alternatively, and quite commonly, a simpler _complement bracket expression_,
`[^...]`, may be a suitable replacement. For example, `[^fo]` matches any
character that is not an `f` or an `o`.
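A short sketch contrasting the two options (the deprecated flag name is taken from the text above):

```java
import org.apache.lucene.util.automaton.RegExp;

class ComplementExamples {
  // Deprecated complement operator, enabled explicitly via the syntax flag.
  RegExp deprecated = new RegExp("~(foo)", RegExp.DEPRECATED_COMPLEMENT);

  // Often a bracket expression is a simpler replacement: this one matches
  // any single character that is not 'f' or 'o'.
  RegExp bracket = new RegExp("[^fo]");
}
```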
### DocValuesFieldExistsQuery, NormsFieldExistsQuery and KnnVectorFieldExistsQuery removed in favor of FieldExistsQuery (LUCENE-10436)
These classes have been removed and consolidated into `FieldExistsQuery`. To migrate, caller simply replace those classes
@ -180,6 +193,9 @@ access the members using method calls instead of field accesses. Affected classe
- `IOContext`, `MergeInfo`, and `FlushInfo` (GITHUB#13205)
- `BooleanClause` (GITHUB#13261)
- `TotalHits` (GITHUB#13762)
- `TermAndVector` (GITHUB#13772)
- Many basic Lucene classes, including `CollectionStatistics`, `TermStatistics` and `LeafMetadata` (GITHUB#13328)
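As a small sketch of what this means in practice, using `TotalHits` from the list above (a field access becomes a method call):

```java
import org.apache.lucene.search.TopDocs;

class TotalHitsAccess {
  static long hitCount(TopDocs topDocs) {
    // Before: topDocs.totalHits.value (public field access)
    // After the record conversion, the member is read via its accessor method:
    return topDocs.totalHits.value();
  }
}
```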
### Boolean flags on IOContext replaced with a new ReadAdvice enum.
@ -248,6 +264,11 @@ ConcurrentMergeScheduler now disables auto I/O throttling by default. There is s
happening at the CPU level, since ConcurrentMergeScheduler has a maximum number of threads it can
use, which is only a fraction of the total number of threads of the host by default.
### FieldInfos#hasVectors and FieldInfo#hasVectors renamed to hasTermVectors
To reduce confusion between term vectors and numeric vectors, `hasVectors` has been renamed to
`hasTermVectors`.
## Migration from Lucene 9.0 to Lucene 9.1
### Test framework package migration and module (LUCENE-10301)
@ -793,3 +814,77 @@ Specifically, the method `FunctionValues#getScorer(Weight weight, LeafReaderCont
Callers must now keep track of the Weight instance that created the Scorer if they need it, instead of relying on
Scorer.
### `FacetsCollector#search` utility methods moved and updated
The static `search` methods exposed by `FacetsCollector` have been moved to `FacetsCollectorManager`.
Furthermore, they take a `FacetsCollectorManager` as their last argument in place of a `Collector` so that they support
intra-query concurrency. The return type has also been updated to `FacetsCollectorManager.FacetsResult`, which includes
both `TopDocs` as well as facets results included in a reduced `FacetsCollector` instance.
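A sketch of the migrated call; the exact method and accessor names below are inferred from the description above, so double-check them against `FacetsCollectorManager`'s javadocs:

```java
import java.io.IOException;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsCollectorManager;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;

class FacetSearchExample {
  static void searchWithFacets(IndexSearcher searcher, Query query) throws IOException {
    // Before: FacetsCollector.search(searcher, query, 10, new FacetsCollector())
    // After: the utility lives on FacetsCollectorManager and takes a manager,
    // which allows facets collection to run concurrently across slices.
    FacetsCollectorManager fcm = new FacetsCollectorManager();
    FacetsCollectorManager.FacetsResult result =
        FacetsCollectorManager.search(searcher, query, 10, fcm);
    TopDocs topDocs = result.topDocs(); // ranked hits
    FacetsCollector facets = result.facetsCollector(); // reduced facets state
  }
}
```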
### `SearchWithCollectorTask` no longer supports the `collector.class` config parameter
`collector.class` used to allow users to load a custom collector implementation. `collector.manager.class`
replaces it by allowing users to load a custom collector manager instead.
### BulkScorer#score(LeafCollector collector, Bits acceptDocs) removed
Use `BulkScorer#score(LeafCollector collector, Bits acceptDocs, int min, int max)` instead. In order to score the
entire leaf, provide `0` as min and `DocIdSetIterator.NO_MORE_DOCS` as max. `BulkScorer` subclasses that override
this method need to instead override the variant that takes the range of doc ids as additional arguments.
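For example, scoring an entire leaf with the remaining overload:

```java
import java.io.IOException;
import org.apache.lucene.search.BulkScorer;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.util.Bits;

class ScoreWholeLeaf {
  static void scoreAll(BulkScorer bulkScorer, LeafCollector collector, Bits acceptDocs)
      throws IOException {
    // min = 0 and max = NO_MORE_DOCS cover the whole doc id range of the leaf.
    bulkScorer.score(collector, acceptDocs, 0, DocIdSetIterator.NO_MORE_DOCS);
  }
}
```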
### CollectorManager#newCollector and Collector#getLeafCollector contract
With the introduction of intra-segment query concurrency support, multiple `LeafCollector`s may be requested for the
same `LeafReaderContext` via `Collector#getLeafCollector(LeafReaderContext)` across the different `Collector` instances
returned by multiple `CollectorManager#newCollector` calls. Any logic or computation that needs to happen
once per segment requires specific handling in the collector manager implementation. See `TotalHitCountCollectorManager`
as an example. Individual collectors don't need to be adapted as a specific `Collector` instance will still see a given
`LeafReaderContext` once, given that it is not possible to add more than one partition of the same segment to the same
leaf slice.
### Weight#scorer, Weight#bulkScorer and Weight#scorerSupplier contract
With the introduction of intra-segment query concurrency support, multiple `Scorer`s, `ScorerSupplier`s or `BulkScorer`s
may be requested for the same `LeafReaderContext` instance as part of a single search call. That may happen concurrently
from separate threads each searching a specific doc id range of the segment. `Weight` implementations that rely on the
assumption that a scorer, bulk scorer or scorer supplier for a given `LeafReaderContext` is requested once per search
need updating.
### Signature of IndexSearcher#searchLeaf changed
With the introduction of intra-segment query concurrency support, the `IndexSearcher#searchLeaf(LeafReaderContext ctx, Weight weight, Collector collector)`
method now accepts two additional int arguments to identify the min/max range of doc ids that will be searched in this
leaf partition: `IndexSearcher#searchLeaf(LeafReaderContext ctx, int minDocId, int maxDocId, Weight weight, Collector collector)`.
Subclasses of `IndexSearcher` that call or override the `searchLeaf` method need to be updated accordingly.
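A sketch of what an override now looks like; the parameter list follows the text above, while the `protected` visibility and checked exception are assumed to match the previous method:

```java
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Weight;

class AuditingIndexSearcher extends IndexSearcher {
  AuditingIndexSearcher(IndexReader reader) {
    super(reader);
  }

  @Override
  protected void searchLeaf(
      LeafReaderContext ctx, int minDocId, int maxDocId, Weight weight, Collector collector)
      throws IOException {
    // Per-partition bookkeeping would go here; then delegate to the default implementation.
    super.searchLeaf(ctx, minDocId, maxDocId, weight, collector);
  }
}
```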
### Signature of static IndexSearcher#slices method changed
The static `IndexSearcher#slices(List<LeafReaderContext> leaves, int maxDocsPerSlice, int maxSegmentsPerSlice)`
method now supports an additional 4th and last argument to optionally enable creating segment partitions:
`IndexSearcher#slices(List<LeafReaderContext> leaves, int maxDocsPerSlice, int maxSegmentsPerSlice, boolean allowSegmentPartitions)`
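For example (the first three arguments are unchanged and the numbers are only illustrative; the trailing boolean opts in to segment partitions):

```java
import java.util.List;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.IndexSearcher;

class SlicesExample {
  static IndexSearcher.LeafSlice[] partitionedSlices(List<LeafReaderContext> leaves) {
    return IndexSearcher.slices(leaves, 250_000, 5, true);
  }
}
```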
### TotalHitCountCollectorManager constructor
`TotalHitCountCollectorManager` now requires that an array of `LeafSlice`s, retrieved via `IndexSearcher#getSlices`,
is provided to its constructor. Depending on whether segment partitions are present among slices, the manager can
optimize the type of collectors it creates and exposes via `newCollector`.
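For example, a minimal sketch of counting hits with the updated constructor:

```java
import java.io.IOException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TotalHitCountCollectorManager;

class CountExample {
  static int countHits(IndexSearcher searcher, Query query) throws IOException {
    // The manager now needs the searcher's slices so it can adapt when segment partitions are used.
    TotalHitCountCollectorManager manager =
        new TotalHitCountCollectorManager(searcher.getSlices());
    return searcher.search(query, manager);
  }
}
```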
### `IndexSearcher#search(List<LeafReaderContext>, Weight, Collector)` removed
The protected `IndexSearcher#search(List<LeafReaderContext> leaves, Weight weight, Collector collector)` method has been
removed in favour of the newly introduced `search(LeafReaderContextPartition[] partitions, Weight weight, Collector collector)`.
`IndexSearcher` subclasses that override this method need to instead override the new method.
### Indexing vectors with 8 bit scalar quantization is no longer supported but 7 and 4 bit quantization still work (GITHUB#13519)
8 bit scalar vector quantization is no longer supported: it was buggy
starting in 9.11 (GITHUB#13197). 4 and 7 bit quantization are still
supported. Existing (9.11) Lucene indices that previously used 8 bit
quantization can still be read/searched but the results from
`KNN*VectorQuery` are silently buggy. Further 8 bit quantized vector
indexing into such (9.11) indices is not permitted, so your path
forward if you wish to continue using the same 9.11 index is to index
additional vectors into the same field with either 4 or 7 bit
quantization (or no quantization), and ensure all older (9.x written)
segments are rewritten either via `IndexWriter.forceMerge` or
`IndexWriter.addIndexes(CodecReader...)`, or reindexing entirely.

View File

@ -1,5 +1,5 @@
{
"gradle/generation/jflex/skeleton.default.txt": "58944f66c9113a940dfaf6a17210ec8219024390",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.java": "1f7a446f3483326385eef257cea8366c27da0850",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.java": "e62dcd8c25219d8f5d783823b228ffe38d2bacde",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.jflex": "f52109bb7d5701979fde90aeeeda726246a8d5fd"
}

View File

@ -1,5 +1,5 @@
{
"gradle/generation/jflex/skeleton.default.txt": "58944f66c9113a940dfaf6a17210ec8219024390",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java": "ac298e08bc5b96202efca0c01f9f0376fda976bd",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java": "2b5df5ff35543a6380c82f298225eb5fa06e4453",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex": "0b8c7774b98e8237702013e82c352d4711509bd0"
}

View File

@ -37,23 +37,23 @@ class BengaliNormalizer {
for (int i = 0; i < len; i++) {
switch (s[i]) {
// delete Chandrabindu
// delete Chandrabindu
case '\u0981':
len = delete(s, i, len);
i--;
break;
// DirghoI kar -> RosshoI kar
// DirghoI kar -> RosshoI kar
case '\u09C0':
s[i] = '\u09BF';
break;
// DirghoU kar -> RosshoU kar
// DirghoU kar -> RosshoU kar
case '\u09C2':
s[i] = '\u09C1';
break;
// Khio (Ka + Hoshonto + Murdorno Sh)
// Khio (Ka + Hoshonto + Murdorno Sh)
case '\u0995':
if (i + 2 < len && s[i + 1] == '\u09CD' && s[i + 2] == '\u09BF') {
if (i == 0) {
@ -67,12 +67,12 @@ class BengaliNormalizer {
}
break;
// Nga to Anusvara
// Nga to Anusvara
case '\u0999':
s[i] = '\u0982';
break;
// Ja Phala
// Ja Phala
case '\u09AF':
if (i - 2 == 0 && s[i - 1] == '\u09CD') {
s[i - 1] = '\u09C7';
@ -89,7 +89,7 @@ class BengaliNormalizer {
}
break;
// Ba Phalaa
// Ba Phalaa
case '\u09AC':
if ((i >= 1 && s[i - 1] != '\u09CD') || i == 0) {
break;
@ -109,7 +109,7 @@ class BengaliNormalizer {
}
break;
// Visarga
// Visarga
case '\u0983':
if (i == len - 1) {
if (len <= 3) {
@ -122,18 +122,18 @@ class BengaliNormalizer {
}
break;
// All sh
// All sh
case '\u09B6':
case '\u09B7':
s[i] = '\u09B8';
break;
// check na
// check na
case '\u09A3':
s[i] = '\u09A8';
break;
// check ra
// check ra
case '\u09DC':
case '\u09DD':
s[i] = '\u09B0';

View File

@ -747,70 +747,70 @@ class ClassicTokenizerImpl {
/* Break so we don't hit fall-through warning: */
break; /* ignore */
}
// fall through
// fall through
case 11:
break;
case 2:
{
return ALPHANUM;
}
// fall through
// fall through
case 12:
break;
case 3:
{
return CJ;
}
// fall through
// fall through
case 13:
break;
case 4:
{
return NUM;
}
// fall through
// fall through
case 14:
break;
case 5:
{
return HOST;
}
// fall through
// fall through
case 15:
break;
case 6:
{
return COMPANY;
}
// fall through
// fall through
case 16:
break;
case 7:
{
return APOSTROPHE;
}
// fall through
// fall through
case 17:
break;
case 8:
{
return ACRONYM_DEP;
}
// fall through
// fall through
case 18:
break;
case 9:
{
return ACRONYM;
}
// fall through
// fall through
case 19:
break;
case 10:
{
return EMAIL;
}
// fall through
// fall through
case 20:
break;
default:

View File

@ -53,18 +53,18 @@ public final class GreekLowerCaseFilter extends TokenFilter {
private int lowerCase(int codepoint) {
switch (codepoint) {
/* There are two lowercase forms of sigma:
* U+03C2: small final sigma (end of word)
* U+03C3: small sigma (otherwise)
*
* Standardize both to U+03C3
*/
/* There are two lowercase forms of sigma:
* U+03C2: small final sigma (end of word)
* U+03C3: small sigma (otherwise)
*
* Standardize both to U+03C3
*/
case '\u03C2': /* small final sigma */
return '\u03C3'; /* small sigma */
/* Some greek characters contain diacritics.
* This filter removes these, converting to the lowercase base form.
*/
/* Some greek characters contain diacritics.
* This filter removes these, converting to the lowercase base form.
*/
case '\u0386': /* capital alpha with tonos */
case '\u03AC': /* small alpha with tonos */
@ -100,9 +100,9 @@ public final class GreekLowerCaseFilter extends TokenFilter {
case '\u03CE': /* small omega with tonos */
return '\u03C9'; /* small omega */
/* The previous implementation did the conversion below.
* Only implemented for backwards compatibility with old indexes.
*/
/* The previous implementation did the conversion below.
* Only implemented for backwards compatibility with old indexes.
*/
case '\u03A2': /* reserved */
return '\u03C2'; /* small final sigma */

View File

@ -456,7 +456,7 @@ class PorterStemmer {
/* j >= 0 fixes Bug 2 */
if (ends("ou")) break;
return;
/* takes care of -ous */
/* takes care of -ous */
case 's':
if (ends("ism")) break;
return;

View File

@ -67,7 +67,7 @@ public final class IrishLowerCaseFilter extends TokenFilter {
case 'I':
case 'O':
case 'U':
// vowels with acute accent (fada)
// vowels with acute accent (fada)
case '\u00c1':
case '\u00c9':
case '\u00cd':

View File

@ -47,18 +47,18 @@ class HindiNormalizer {
for (int i = 0; i < len; i++) {
switch (s[i]) {
// dead n -> bindu
// dead n -> bindu
case '\u0928':
if (i + 1 < len && s[i + 1] == '\u094D') {
s[i] = '\u0902';
len = delete(s, i + 1, len);
}
break;
// candrabindu -> bindu
// candrabindu -> bindu
case '\u0901':
s[i] = '\u0902';
break;
// nukta deletions
// nukta deletions
case '\u093C':
len = delete(s, i, len);
i--;
@ -96,18 +96,18 @@ class HindiNormalizer {
case '\u095F':
s[i] = '\u092F';
break;
// zwj/zwnj -> delete
// zwj/zwnj -> delete
case '\u200D':
case '\u200C':
len = delete(s, i, len);
i--;
break;
// virama -> delete
// virama -> delete
case '\u094D':
len = delete(s, i, len);
i--;
break;
// chandra/short -> replace
// chandra/short -> replace
case '\u0945':
case '\u0946':
s[i] = '\u0947';
@ -127,7 +127,7 @@ class HindiNormalizer {
case '\u0972':
s[i] = '\u0905';
break;
// long -> short ind. vowels
// long -> short ind. vowels
case '\u0906':
s[i] = '\u0905';
break;
@ -149,7 +149,7 @@ class HindiNormalizer {
case '\u0914':
s[i] = '\u0913';
break;
// long -> short dep. vowels
// long -> short dep. vowels
case '\u0940':
s[i] = '\u093F';
break;

View File

@ -52,7 +52,7 @@ class CheckCompoundPattern {
boolean prohibitsCompounding(CharsRef word, int breakPos, Root<?> rootBefore, Root<?> rootAfter) {
if (isNonAffixedPattern(endChars)) {
if (!charsMatch(word, breakPos - rootBefore.word.length(), rootBefore.word)) {
if (!charsMatch(word, breakPos - rootBefore.word().length(), rootBefore.word())) {
return false;
}
} else if (!charsMatch(word, breakPos - endChars.length(), endChars)) {
@ -60,7 +60,7 @@ class CheckCompoundPattern {
}
if (isNonAffixedPattern(beginChars)) {
if (!charsMatch(word, breakPos, rootAfter.word)) {
if (!charsMatch(word, breakPos, rootAfter.word())) {
return false;
}
} else if (!charsMatch(word, breakPos, beginChars)) {
@ -84,7 +84,7 @@ class CheckCompoundPattern {
private boolean hasAllFlags(Root<?> root, char[] flags) {
for (char flag : flags) {
if (!dictionary.hasFlag(root.entryId, flag)) {
if (!dictionary.hasFlag(root.entryId(), flag)) {
return false;
}
}

View File

@ -24,7 +24,6 @@ import java.util.ArrayList;
import java.util.Comparator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Objects;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.TreeSet;
@ -62,8 +61,7 @@ class GeneratingSuggester {
private List<Weighted<Root<String>>> findSimilarDictionaryEntries(
String word, WordCase originalCase) {
Comparator<Weighted<Root<String>>> natural = Comparator.naturalOrder();
PriorityQueue<Weighted<Root<String>>> roots = new PriorityQueue<>(natural.reversed());
PriorityQueue<Weighted<Root<String>>> roots = new PriorityQueue<>(Comparator.reverseOrder());
char[] excludeFlags = dictionary.allNonSuggestibleFlags();
FlagEnumerator.Lookup flagLookup = dictionary.flagLookup;
@ -111,7 +109,7 @@ class GeneratingSuggester {
private static boolean isWorseThan(int score, CharsRef candidate, Weighted<Root<String>> root) {
return score < root.score
|| score == root.score && CharSequence.compare(candidate, root.word.word) > 0;
|| score == root.score && CharSequence.compare(candidate, root.word.word()) > 0;
}
private void processSuggestibleWords(
@ -162,11 +160,11 @@ class GeneratingSuggester {
List<char[]> crossProducts = new ArrayList<>();
Set<String> result = new LinkedHashSet<>();
if (!dictionary.hasFlag(root.entryId, dictionary.needaffix)) {
result.add(root.word);
if (!dictionary.hasFlag(root.entryId(), dictionary.needaffix)) {
result.add(root.word());
}
char[] wordChars = root.word.toCharArray();
char[] wordChars = root.word().toCharArray();
// suffixes
processAffixes(
@ -180,7 +178,7 @@ class GeneratingSuggester {
}
String suffix = misspelled.substring(misspelled.length() - suffixLength);
String withSuffix = root.word.substring(0, root.word.length() - stripLength) + suffix;
String withSuffix = root.word().substring(0, root.word().length() - stripLength) + suffix;
result.add(withSuffix);
if (dictionary.isCrossProduct(suffixId)) {
crossProducts.add(withSuffix.toCharArray());
@ -192,7 +190,7 @@ class GeneratingSuggester {
true,
misspelled,
(prefixLength, prefixId) -> {
if (!dictionary.hasFlag(root.entryId, dictionary.affixData(prefixId, AFFIX_FLAG))
if (!dictionary.hasFlag(root.entryId(), dictionary.affixData(prefixId, AFFIX_FLAG))
|| !dictionary.isCrossProduct(prefixId)) {
return;
}
@ -217,7 +215,7 @@ class GeneratingSuggester {
if (hasCompatibleFlags(root, prefixId)
&& checkAffixCondition(prefixId, wordChars, stripLength, stemLength)) {
String prefix = misspelled.substring(0, prefixLength);
result.add(prefix + root.word.substring(stripLength));
result.add(prefix + root.word().substring(stripLength));
}
});
@ -263,7 +261,7 @@ class GeneratingSuggester {
}
private boolean hasCompatibleFlags(Root<?> root, int affixId) {
if (!dictionary.hasFlag(root.entryId, dictionary.affixData(affixId, AFFIX_FLAG))) {
if (!dictionary.hasFlag(root.entryId(), dictionary.affixData(affixId, AFFIX_FLAG))) {
return false;
}
@ -447,28 +445,8 @@ class GeneratingSuggester {
return commonScore;
}
private static class Weighted<T extends Comparable<T>> implements Comparable<Weighted<T>> {
final T word;
final int score;
Weighted(T word, int score) {
this.word = word;
this.score = score;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (!(o instanceof Weighted)) return false;
@SuppressWarnings("unchecked")
Weighted<T> that = (Weighted<T>) o;
return score == that.score && word.equals(that.word);
}
@Override
public int hashCode() {
return Objects.hash(word, score);
}
private record Weighted<T extends Comparable<T>>(T word, int score)
implements Comparable<Weighted<T>> {
@Override
public String toString() {

View File

@ -132,7 +132,7 @@ public class Hunspell {
Boolean checkSimpleWord(char[] wordChars, int length, WordCase originalCase) {
Root<CharsRef> entry = findStem(wordChars, 0, length, originalCase, SIMPLE_WORD);
if (entry != null) {
return !dictionary.hasFlag(entry.entryId, dictionary.forbiddenword);
return !dictionary.hasFlag(entry.entryId(), dictionary.forbiddenword);
}
return null;
@ -229,7 +229,7 @@ public class Hunspell {
stem = findStem(word.chars, word.offset, breakPos + 1, originalCase, context);
}
if (stem != null
&& !dictionary.hasFlag(stem.entryId, dictionary.forbiddenword)
&& !dictionary.hasFlag(stem.entryId(), dictionary.forbiddenword)
&& (prev == null || prev.mayCompound(stem, breakPos, originalCase))) {
CompoundPart part = new CompoundPart(prev, word, breakPos, stem, null);
if (checkCompoundsAfter(originalCase, part)) {
@ -274,7 +274,7 @@ public class Hunspell {
Root<CharsRef> lastRoot =
findStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
if (lastRoot != null
&& !dictionary.hasFlag(lastRoot.entryId, dictionary.forbiddenword)
&& !dictionary.hasFlag(lastRoot.entryId(), dictionary.forbiddenword)
&& !(dictionary.checkCompoundDup && prev.root.equals(lastRoot))
&& !hasForceUCaseProblem(lastRoot, originalCase, word.chars)
&& prev.mayCompound(lastRoot, remainingLength, originalCase)) {
@ -288,7 +288,7 @@ public class Hunspell {
private boolean hasForceUCaseProblem(Root<?> root, WordCase originalCase, char[] wordChars) {
if (originalCase == WordCase.TITLE || originalCase == WordCase.UPPER) return false;
if (originalCase == null && Character.isUpperCase(wordChars[0])) return false;
return dictionary.hasFlag(root.entryId, dictionary.forceUCase);
return dictionary.hasFlag(root.entryId(), dictionary.forceUCase);
}
/**

View File

@ -17,8 +17,6 @@
package org.apache.lucene.analysis.hunspell;
import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@ -117,7 +115,16 @@ public final class HunspellStemFilter extends TokenFilter {
}
if (longestOnly && buffer.size() > 1) {
Collections.sort(buffer, lengthComparator);
buffer.sort(
(o1, o2) -> {
int cmp = Integer.compare(o2.length, o1.length);
if (cmp == 0) {
// tie break on text
return o2.compareTo(o1);
} else {
return cmp;
}
});
}
CharsRef stem = buffer.remove(0);
@ -139,18 +146,4 @@ public final class HunspellStemFilter extends TokenFilter {
super.reset();
buffer = null;
}
static final Comparator<CharsRef> lengthComparator =
new Comparator<CharsRef>() {
@Override
public int compare(CharsRef o1, CharsRef o2) {
int cmp = Integer.compare(o2.length, o1.length);
if (cmp == 0) {
// tie break on text
return o2.compareTo(o1);
} else {
return cmp;
}
}
};
}

View File

@ -16,36 +16,13 @@
*/
package org.apache.lucene.analysis.hunspell;
import java.util.Objects;
class Root<T extends CharSequence> implements Comparable<Root<T>> {
final T word;
final int entryId;
Root(T word, int entryId) {
this.word = word;
this.entryId = entryId;
}
record Root<T extends CharSequence>(T word, int entryId) implements Comparable<Root<T>> {
@Override
public String toString() {
return word.toString();
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (!(o instanceof Root)) return false;
@SuppressWarnings("unchecked")
Root<T> root = (Root<T>) o;
return entryId == root.entryId && word.equals(root.word);
}
@Override
public int hashCode() {
return Objects.hash(word, entryId);
}
@Override
public int compareTo(Root<T> o) {
return CharSequence.compare(word, o.word);

View File

@ -18,7 +18,6 @@ package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@ -147,26 +146,23 @@ public class FingerprintFilter extends TokenFilter {
Arrays.sort(
items,
new Comparator<Object>() {
@Override
public int compare(Object o1, Object o2) {
char[] v1 = (char[]) o1;
char[] v2 = (char[]) o2;
int len1 = v1.length;
int len2 = v2.length;
int lim = Math.min(len1, len2);
(o1, o2) -> {
char[] v1 = (char[]) o1;
char[] v2 = (char[]) o2;
int len1 = v1.length;
int len2 = v2.length;
int lim = Math.min(len1, len2);
int k = 0;
while (k < lim) {
char c1 = v1[k];
char c2 = v2[k];
if (c1 != c2) {
return c1 - c2;
}
k++;
int k = 0;
while (k < lim) {
char c1 = v1[k];
char c2 = v2[k];
if (c1 != c2) {
return c1 - c2;
}
return len1 - len2;
k++;
}
return len1 - len2;
});
// TODO lets append directly to termAttribute?

View File

@ -194,7 +194,7 @@ public final class WordDelimiterIterator {
int type = charType(text[current]);
switch (type) {
// return ALPHA word type for both lower and upper
// return ALPHA word type for both lower and upper
case LOWER:
case UPPER:
return ALPHA;
@ -332,27 +332,27 @@ public final class WordDelimiterIterator {
case Character.OTHER_NUMBER:
return DIGIT;
// case Character.SPACE_SEPARATOR:
// case Character.LINE_SEPARATOR:
// case Character.PARAGRAPH_SEPARATOR:
// case Character.CONTROL:
// case Character.FORMAT:
// case Character.PRIVATE_USE:
// case Character.SPACE_SEPARATOR:
// case Character.LINE_SEPARATOR:
// case Character.PARAGRAPH_SEPARATOR:
// case Character.CONTROL:
// case Character.FORMAT:
// case Character.PRIVATE_USE:
case Character.SURROGATE: // prevent splitting
return ALPHA | DIGIT;
// case Character.DASH_PUNCTUATION:
// case Character.START_PUNCTUATION:
// case Character.END_PUNCTUATION:
// case Character.CONNECTOR_PUNCTUATION:
// case Character.OTHER_PUNCTUATION:
// case Character.MATH_SYMBOL:
// case Character.CURRENCY_SYMBOL:
// case Character.MODIFIER_SYMBOL:
// case Character.OTHER_SYMBOL:
// case Character.INITIAL_QUOTE_PUNCTUATION:
// case Character.FINAL_QUOTE_PUNCTUATION:
// case Character.DASH_PUNCTUATION:
// case Character.START_PUNCTUATION:
// case Character.END_PUNCTUATION:
// case Character.CONNECTOR_PUNCTUATION:
// case Character.OTHER_PUNCTUATION:
// case Character.MATH_SYMBOL:
// case Character.CURRENCY_SYMBOL:
// case Character.MODIFIER_SYMBOL:
// case Character.OTHER_SYMBOL:
// case Character.INITIAL_QUOTE_PUNCTUATION:
// case Character.FINAL_QUOTE_PUNCTUATION:
default:
return SUBWORD_DELIM;

View File

@ -59,12 +59,12 @@ public class PatternTypingFilter extends TokenFilter {
public final boolean incrementToken() throws IOException {
if (input.incrementToken()) {
for (PatternTypingRule rule : replacementAndFlagByPattern) {
Matcher matcher = rule.getPattern().matcher(termAtt);
Matcher matcher = rule.pattern().matcher(termAtt);
if (matcher.find()) {
// allow 2nd reset() and find() that occurs inside replaceFirst to avoid excess string
// creation
typeAtt.setType(matcher.replaceFirst(rule.getTypeTemplate()));
flagAtt.setFlags(rule.getFlags());
typeAtt.setType(matcher.replaceFirst(rule.typeTemplate()));
flagAtt.setFlags(rule.flags());
return true;
}
}
@ -74,27 +74,5 @@ public class PatternTypingFilter extends TokenFilter {
}
/** Value holding class for pattern typing rules. */
public static class PatternTypingRule {
private final Pattern pattern;
private final int flags;
private final String typeTemplate;
public PatternTypingRule(Pattern pattern, int flags, String typeTemplate) {
this.pattern = pattern;
this.flags = flags;
this.typeTemplate = typeTemplate;
}
public Pattern getPattern() {
return pattern;
}
public int getFlags() {
return flags;
}
public String getTypeTemplate() {
return typeTemplate;
}
}
public record PatternTypingRule(Pattern pattern, int flags, String typeTemplate) {}
}

View File

@ -142,22 +142,10 @@ public final class SynonymGraphFilter extends TokenFilter {
}
}
static class BufferedOutputToken {
final String term;
// Non-null if this was an incoming token:
final State state;
final int startNode;
final int endNode;
public BufferedOutputToken(State state, String term, int startNode, int endNode) {
this.state = state;
this.term = term;
this.startNode = startNode;
this.endNode = endNode;
}
}
/**
* @param state Non-null if this was an incoming token:
*/
record BufferedOutputToken(State state, String term, int startNode, int endNode) {}
/**
* Apply previously built synonyms to incoming tokens.

View File

@ -18,17 +18,15 @@ package org.apache.lucene.analysis.synonym.word2vec;
import org.apache.lucene.util.BytesRef;
/** Wraps a term and boost */
public class TermAndBoost {
/** the term */
public final BytesRef term;
/** the boost */
public final float boost;
/**
* Wraps a term and boost
*
* @param term the term
* @param boost the boost
*/
public record TermAndBoost(BytesRef term, float boost) {
/** Creates a new TermAndBoost */
public TermAndBoost(BytesRef term, float boost) {
this.term = BytesRef.deepCopyOf(term);
this.boost = boost;
public TermAndBoost {
term = BytesRef.deepCopyOf(term);
}
}

View File

@ -56,25 +56,25 @@ public class Word2VecModel implements RandomAccessVectorValues.Floats {
}
public void addTermAndVector(TermAndVector modelEntry) {
modelEntry.normalizeVector();
modelEntry = modelEntry.normalizeVector();
this.termsAndVectors[loadedCount++] = modelEntry;
this.word2Vec.add(modelEntry.getTerm());
this.word2Vec.add(modelEntry.term());
}
@Override
public float[] vectorValue(int targetOrd) {
return termsAndVectors[targetOrd].getVector();
return termsAndVectors[targetOrd].vector();
}
public float[] vectorValue(BytesRef term) {
int termOrd = this.word2Vec.find(term);
if (termOrd < 0) return null;
TermAndVector entry = this.termsAndVectors[termOrd];
return (entry == null) ? null : entry.getVector();
return (entry == null) ? null : entry.vector();
}
public BytesRef termValue(int targetOrd) {
return termsAndVectors[targetOrd].getTerm();
return termsAndVectors[targetOrd].term();
}
@Override

View File

@ -80,7 +80,7 @@ public final class Word2VecSynonymFilter extends TokenFilter {
clearAttributes();
restoreState(this.lastState);
termAtt.setEmpty();
termAtt.append(synonym.term.utf8ToString());
termAtt.append(synonym.term().utf8ToString());
typeAtt.setType(SynonymGraphFilter.TYPE_SYNONYM);
posLenAtt.setPositionLength(1);
posIncrementAtt.setPositionIncrement(0);

View File

@ -38,25 +38,25 @@ class TeluguNormalizer {
for (int i = 0; i < len; i++) {
switch (s[i]) {
// candrabindu ( and ) -> bindu ()
// candrabindu ( and ) -> bindu ()
case '\u0C00': //
case '\u0C01': //
s[i] = '\u0C02'; //
break;
// delete visarga ()
// delete visarga ()
case '\u0C03':
len = delete(s, i, len);
i--;
break;
// zwj/zwnj -> delete
// zwj/zwnj -> delete
case '\u200D':
case '\u200C':
len = delete(s, i, len);
i--;
break;
// long -> short vowels
// long -> short vowels
case '\u0C14': //
s[i] = '\u0C13'; //
break;
@ -73,7 +73,7 @@ class TeluguNormalizer {
s[i] = '\u0C09'; //
break;
// long -> short vowels matras
// long -> short vowels matras
case '\u0C40': //
s[i] = '\u0C3F'; // ి
break;
@ -86,14 +86,14 @@ class TeluguNormalizer {
case '\u0C4B': //
s[i] = '\u0C4A'; //
break;
// decomposed dipthong ( + ) -> precomposed diphthong vowel sign ()
// decomposed dipthong ( + ) -> precomposed diphthong vowel sign ()
case '\u0C46':
if (i + 1 < len && s[i + 1] == '\u0C56') {
s[i] = '\u0C48';
len = delete(s, i + 1, len);
}
break;
// composed oo or au -> oo or au
// composed oo or au -> oo or au
case '\u0C12':
if (i + 1 < len && s[i + 1] == '\u0C55') {
// ( + ) -> oo ()

View File

@ -61,12 +61,12 @@ public final class TurkishLowerCaseFilter extends TokenFilter {
if (iOrAfter) { // all the special I turkish handling happens here.
switch (ch) {
// remove COMBINING_DOT_ABOVE to mimic composed lowercase
// remove COMBINING_DOT_ABOVE to mimic composed lowercase
case COMBINING_DOT_ABOVE:
length = delete(buffer, i, length);
continue;
// i itself, it depends if it is followed by COMBINING_DOT_ABOVE
// if it is, we will make it small i and later remove the dot
// i itself, it depends if it is followed by COMBINING_DOT_ABOVE
// if it is, we will make it small i and later remove the dot
case LATIN_CAPITAL_LETTER_I:
if (isBeforeDot(buffer, i + 1, length)) {
buffer[i] = LATIN_SMALL_LETTER_I;

View File

@ -901,7 +901,7 @@ class WikipediaTokenizerImpl {
positionInc = 1; /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 47:
break;
case 2:
@ -909,7 +909,7 @@ class WikipediaTokenizerImpl {
positionInc = 1;
return ALPHANUM;
}
// fall through
// fall through
case 48:
break;
case 3:
@ -920,7 +920,7 @@ class WikipediaTokenizerImpl {
yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 49:
break;
case 4:
@ -928,7 +928,7 @@ class WikipediaTokenizerImpl {
positionInc = 1;
return CJ;
}
// fall through
// fall through
case 50:
break;
case 5:
@ -936,7 +936,7 @@ class WikipediaTokenizerImpl {
positionInc = 1; /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 51:
break;
case 6:
@ -945,7 +945,7 @@ class WikipediaTokenizerImpl {
numWikiTokensSeen++;
return currentTokType;
}
// fall through
// fall through
case 52:
break;
case 7:
@ -954,7 +954,7 @@ class WikipediaTokenizerImpl {
numWikiTokensSeen++;
return currentTokType;
}
// fall through
// fall through
case 53:
break;
case 8:
@ -962,7 +962,7 @@ class WikipediaTokenizerImpl {
/* Break so we don't hit fall-through warning: */
break; /* ignore */
}
// fall through
// fall through
case 54:
break;
case 9:
@ -978,7 +978,7 @@ class WikipediaTokenizerImpl {
numLinkToks++;
return currentTokType;
}
// fall through
// fall through
case 55:
break;
case 10:
@ -988,7 +988,7 @@ class WikipediaTokenizerImpl {
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 56:
break;
case 11:
@ -997,7 +997,7 @@ class WikipediaTokenizerImpl {
yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 57:
break;
case 12:
@ -1007,7 +1007,7 @@ class WikipediaTokenizerImpl {
yybegin(STRING);
return currentTokType; /*italics*/
}
// fall through
// fall through
case 58:
break;
case 13:
@ -1017,7 +1017,7 @@ class WikipediaTokenizerImpl {
yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 59:
break;
case 14:
@ -1026,7 +1026,7 @@ class WikipediaTokenizerImpl {
numWikiTokensSeen++;
return currentTokType;
}
// fall through
// fall through
case 60:
break;
case 15:
@ -1036,7 +1036,7 @@ class WikipediaTokenizerImpl {
numWikiTokensSeen++;
return currentTokType;
}
// fall through
// fall through
case 61:
break;
case 16:
@ -1046,7 +1046,7 @@ class WikipediaTokenizerImpl {
yybegin(STRING); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 62:
break;
case 17:
@ -1055,7 +1055,7 @@ class WikipediaTokenizerImpl {
numWikiTokensSeen = 0;
return currentTokType;
}
// fall through
// fall through
case 63:
break;
case 18:
@ -1063,7 +1063,7 @@ class WikipediaTokenizerImpl {
/* Break so we don't hit fall-through warning: */
break; /* ignore STRING */
}
// fall through
// fall through
case 64:
break;
case 19:
@ -1072,7 +1072,7 @@ class WikipediaTokenizerImpl {
numWikiTokensSeen++;
return currentTokType; /* STRING ALPHANUM*/
}
// fall through
// fall through
case 65:
break;
case 20:
@ -1083,7 +1083,7 @@ class WikipediaTokenizerImpl {
yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 66:
break;
case 21:
@ -1091,7 +1091,7 @@ class WikipediaTokenizerImpl {
yybegin(STRING);
return currentTokType; /*pipe*/
}
// fall through
// fall through
case 67:
break;
case 22:
@ -1106,7 +1106,7 @@ class WikipediaTokenizerImpl {
} /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 68:
break;
case 23:
@ -1116,7 +1116,7 @@ class WikipediaTokenizerImpl {
yybegin(DOUBLE_EQUALS_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 69:
break;
case 24:
@ -1127,7 +1127,7 @@ class WikipediaTokenizerImpl {
yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 70:
break;
case 25:
@ -1138,7 +1138,7 @@ class WikipediaTokenizerImpl {
yybegin(DOUBLE_BRACE_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 71:
break;
case 26:
@ -1146,7 +1146,7 @@ class WikipediaTokenizerImpl {
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 72:
break;
case 27:
@ -1155,7 +1155,7 @@ class WikipediaTokenizerImpl {
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 73:
break;
case 28:
@ -1165,7 +1165,7 @@ class WikipediaTokenizerImpl {
yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 74:
break;
case 29:
@ -1175,7 +1175,7 @@ class WikipediaTokenizerImpl {
yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 75:
break;
case 30:
@ -1183,7 +1183,7 @@ class WikipediaTokenizerImpl {
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 76:
break;
case 31:
@ -1193,7 +1193,7 @@ class WikipediaTokenizerImpl {
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
break; /*end italics*/
}
// fall through
// fall through
case 77:
break;
case 32:
@ -1204,7 +1204,7 @@ class WikipediaTokenizerImpl {
yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 78:
break;
case 33:
@ -1212,7 +1212,7 @@ class WikipediaTokenizerImpl {
positionInc = 1;
return NUM;
}
// fall through
// fall through
case 79:
break;
case 34:
@ -1220,7 +1220,7 @@ class WikipediaTokenizerImpl {
positionInc = 1;
return COMPANY;
}
// fall through
// fall through
case 80:
break;
case 35:
@ -1228,7 +1228,7 @@ class WikipediaTokenizerImpl {
positionInc = 1;
return APOSTROPHE;
}
// fall through
// fall through
case 81:
break;
case 36:
@ -1236,7 +1236,7 @@ class WikipediaTokenizerImpl {
positionInc = 1;
return HOST;
}
// fall through
// fall through
case 82:
break;
case 37:
@ -1245,7 +1245,7 @@ class WikipediaTokenizerImpl {
yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 83:
break;
case 38:
@ -1255,7 +1255,7 @@ class WikipediaTokenizerImpl {
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
break; /*end bold*/
}
// fall through
// fall through
case 84:
break;
case 39:
@ -1265,7 +1265,7 @@ class WikipediaTokenizerImpl {
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
break; /*end sub header*/
}
// fall through
// fall through
case 85:
break;
case 40:
@ -1273,7 +1273,7 @@ class WikipediaTokenizerImpl {
positionInc = 1;
return ACRONYM;
}
// fall through
// fall through
case 86:
break;
case 41:
@ -1281,7 +1281,7 @@ class WikipediaTokenizerImpl {
positionInc = 1;
return EMAIL;
}
// fall through
// fall through
case 87:
break;
case 42:
@ -1291,7 +1291,7 @@ class WikipediaTokenizerImpl {
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
break; /*end bold italics*/
}
// fall through
// fall through
case 88:
break;
case 43:
@ -1301,7 +1301,7 @@ class WikipediaTokenizerImpl {
yybegin(EXTERNAL_LINK_STATE);
return currentTokType;
}
// fall through
// fall through
case 89:
break;
case 44:
@ -1312,7 +1312,7 @@ class WikipediaTokenizerImpl {
yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 90:
break;
case 45:
@ -1322,7 +1322,7 @@ class WikipediaTokenizerImpl {
yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 91:
break;
case 46:
@ -1333,7 +1333,7 @@ class WikipediaTokenizerImpl {
yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */
break;
}
// fall through
// fall through
case 92:
break;
default:

View File

@ -1490,7 +1490,7 @@ public class TestSynonymGraphFilter extends BaseTokenStreamTestCase {
}
assertTrue(approxEquals(actual, expected));
assertTrue(Operations.sameLanguage(actual, expected));
assertTrue(AutomatonTestUtil.sameLanguage(actual, expected));
}
a.close();

View File

@ -64,7 +64,7 @@ public class TestWord2VecSynonymProvider extends LuceneTestCase {
assertEquals(4, actualSynonymsResults.size());
for (int i = 0; i < expectedSynonyms.length; i++) {
assertEquals(new BytesRef(expectedSynonyms[i]), actualSynonymsResults.get(i).term);
assertEquals(new BytesRef(expectedSynonyms[i]), actualSynonymsResults.get(i).term());
}
}
@ -83,8 +83,8 @@ public class TestWord2VecSynonymProvider extends LuceneTestCase {
BytesRef expectedFirstSynonymTerm = new BytesRef("b");
double expectedFirstSynonymBoost = 1.0;
assertEquals(expectedFirstSynonymTerm, actualSynonymsResults.get(0).term);
assertEquals(expectedFirstSynonymBoost, actualSynonymsResults.get(0).boost, 0.001f);
assertEquals(expectedFirstSynonymTerm, actualSynonymsResults.get(0).term());
assertEquals(expectedFirstSynonymBoost, actualSynonymsResults.get(0).boost(), 0.001f);
}
@Test
@ -120,8 +120,8 @@ public class TestWord2VecSynonymProvider extends LuceneTestCase {
@Test
public void normalizedVector_shouldReturnModule1() {
TermAndVector synonymTerm = new TermAndVector(new BytesRef("a"), new float[] {10, 10});
synonymTerm.normalizeVector();
float[] vector = synonymTerm.getVector();
synonymTerm = synonymTerm.normalizeVector();
float[] vector = synonymTerm.vector();
float len = 0;
for (int i = 0; i < vector.length; i++) {
len += vector[i] * vector[i];

View File

@ -139,19 +139,7 @@ public final class JapaneseCompletionFilter extends TokenFilter {
}
}
private static class CompletionToken {
final String term;
final boolean isFirst;
final int startOffset;
final int endOffset;
CompletionToken(String term, boolean isFirst, int startOffset, int endOffset) {
this.term = term;
this.isFirst = isFirst;
this.startOffset = startOffset;
this.endOffset = endOffset;
}
}
private record CompletionToken(String term, boolean isFirst, int startOffset, int endOffset) {}
private static class CompletionTokenGenerator implements Iterator<CompletionToken> {

View File

@ -180,13 +180,5 @@ public class KatakanaRomanizer {
return null;
}
private static class MatchedKeystroke {
final int keystrokeLen;
final int keystrokeIndex;
MatchedKeystroke(int keystrokeLen, int keystrokeIndex) {
this.keystrokeLen = keystrokeLen;
this.keystrokeIndex = keystrokeIndex;
}
}
private record MatchedKeystroke(int keystrokeLen, int keystrokeIndex) {}
}

View File

@ -20,8 +20,6 @@ import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.morph.Dictionary;
@ -83,14 +81,7 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
// TODO: should we allow multiple segmentations per input 'phrase'?
// the old treemap didn't support this either, and i'm not sure if it's needed/useful?
Collections.sort(
featureEntries,
new Comparator<String[]>() {
@Override
public int compare(String[] left, String[] right) {
return left[0].compareTo(right[0]);
}
});
featureEntries.sort((left, right) -> left[0].compareTo(right[0]));
List<String> data = new ArrayList<>(featureEntries.size());
List<int[]> segmentations = new ArrayList<>(featureEntries.size());
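
The anonymous Comparator is replaced by List.sort with a lambda; an equivalent formulation uses Comparator.comparing on the first column. A self-contained sketch (the String[] rows are hypothetical sample data):

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

public class FeatureEntrySortSketch {
  public static void main(String[] args) {
    List<String[]> featureEntries = new ArrayList<>();
    featureEntries.add(new String[] {"b-phrase", "features..."});
    featureEntries.add(new String[] {"a-phrase", "features..."});
    // Same ordering as featureEntries.sort((left, right) -> left[0].compareTo(right[0])):
    featureEntries.sort(Comparator.comparing(e -> e[0]));
    System.out.println(featureEntries.get(0)[0]); // prints a-phrase
  }
}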

View File

@ -268,19 +268,19 @@ final class Viterbi
final KoMorphData.Morpheme morpheme = morphemes[i];
final Token compoundToken;
if (token.getPOSType() == POS.Type.COMPOUND) {
assert endOffset - morpheme.surfaceForm.length() >= 0;
assert endOffset - morpheme.surfaceForm().length() >= 0;
compoundToken =
new DecompoundToken(
morpheme.posTag,
morpheme.surfaceForm,
endOffset - morpheme.surfaceForm.length(),
morpheme.posTag(),
morpheme.surfaceForm(),
endOffset - morpheme.surfaceForm().length(),
endOffset,
backType);
} else {
compoundToken =
new DecompoundToken(
morpheme.posTag,
morpheme.surfaceForm,
morpheme.posTag(),
morpheme.surfaceForm(),
token.getStartOffset(),
token.getEndOffset(),
backType);
@ -289,7 +289,7 @@ final class Viterbi
compoundToken.setPositionIncrement(0);
}
++posLen;
endOffset -= morpheme.surfaceForm.length();
endOffset -= morpheme.surfaceForm().length();
pending.add(compoundToken);
if (VERBOSE) {
System.out.println(" add token=" + pending.get(pending.size() - 1));

View File

@ -22,15 +22,7 @@ import org.apache.lucene.analysis.morph.MorphData;
/** Represents Korean morphological information. */
public interface KoMorphData extends MorphData {
/** A morpheme extracted from a compound token. */
class Morpheme {
public final POS.Tag posTag;
public final String surfaceForm;
public Morpheme(POS.Tag posTag, String surfaceForm) {
this.posTag = posTag;
this.surfaceForm = surfaceForm;
}
}
record Morpheme(POS.Tag posTag, String surfaceForm) {}
/**
* Get the {@link org.apache.lucene.analysis.ko.POS.Type} of specified word (morpheme, compound,

View File

@ -150,13 +150,13 @@ class TokenInfoDictionaryEntryWriter extends DictionaryEntryWriter {
int compoundOffset = 0;
for (KoMorphData.Morpheme morpheme : morphemes) {
if (hasSinglePOS == false) {
buffer.put((byte) morpheme.posTag.ordinal());
buffer.put((byte) morpheme.posTag().ordinal());
}
if (posType != POS.Type.INFLECT) {
buffer.put((byte) morpheme.surfaceForm.length());
compoundOffset += morpheme.surfaceForm.length();
buffer.put((byte) morpheme.surfaceForm().length());
compoundOffset += morpheme.surfaceForm().length();
} else {
writeString(morpheme.surfaceForm);
writeString(morpheme.surfaceForm());
}
assert compoundOffset <= entry[0].length() : Arrays.toString(entry);
}

View File

@ -86,11 +86,11 @@ public class PartOfSpeechAttributeImpl extends AttributeImpl implements PartOfSp
builder.append("+");
}
builder
.append(morpheme.surfaceForm)
.append(morpheme.surfaceForm())
.append('/')
.append(morpheme.posTag.name())
.append(morpheme.posTag().name())
.append('(')
.append(morpheme.posTag.description())
.append(morpheme.posTag().description())
.append(')');
}
return builder.toString();

View File

@ -170,14 +170,14 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
if (decompound != null) {
int offset = 0;
for (KoMorphData.Morpheme morph : decompound) {
assertTrue(UnicodeUtil.validUTF16String(morph.surfaceForm));
assertFalse(morph.surfaceForm.isEmpty());
assertEquals(morph.surfaceForm.trim(), morph.surfaceForm);
assertTrue(UnicodeUtil.validUTF16String(morph.surfaceForm()));
assertFalse(morph.surfaceForm().isEmpty());
assertEquals(morph.surfaceForm().trim(), morph.surfaceForm());
if (type != POS.Type.INFLECT) {
assertEquals(
morph.surfaceForm,
surfaceForm.substring(offset, offset + morph.surfaceForm.length()));
offset += morph.surfaceForm.length();
morph.surfaceForm(),
surfaceForm.substring(offset, offset + morph.surfaceForm().length()));
offset += morph.surfaceForm().length();
}
}
assertTrue(offset <= surfaceForm.length());

View File

@ -43,10 +43,10 @@ public class TestUserDictionary extends LuceneTestCase {
dictionary.getMorphAttributes().getMorphemes(wordIds.get(1), sArray, 0, s.length());
assertNotNull(decompound);
assertEquals(2, decompound.length);
assertEquals(decompound[0].posTag, POS.Tag.NNG);
assertEquals(decompound[0].surfaceForm, "세종");
assertEquals(decompound[1].posTag, POS.Tag.NNG);
assertEquals(decompound[1].surfaceForm, "시");
assertEquals(decompound[0].posTag(), POS.Tag.NNG);
assertEquals(decompound[0].surfaceForm(), "세종");
assertEquals(decompound[1].posTag(), POS.Tag.NNG);
assertEquals(decompound[1].surfaceForm(), "시");
s = "c++";
sArray = s.toCharArray();

View File

@ -245,7 +245,7 @@ public class Diff {
deletes++;
x--;
break;
// delete
// delete
case Y:
if (deletes != base) {
result.append('D').append(deletes);
@ -258,7 +258,7 @@ public class Diff {
result.append('I');
result.append(b.charAt(--y));
break;
// insert
// insert
case R:
if (deletes != base) {
result.append('D').append(deletes);
@ -272,7 +272,7 @@ public class Diff {
result.append(b.charAt(--y));
x--;
break;
// replace
// replace
case D:
if (deletes != base) {
result.append('D').append(deletes);

View File

@ -103,19 +103,20 @@ final class ForUtil {
for (int bpv = 1; bpv <= 32; ++bpv) {
final FormatAndBits formatAndBits =
PackedInts.fastestFormatAndBits(BLOCK_SIZE, bpv, acceptableOverheadRatio);
assert formatAndBits.format.isSupported(formatAndBits.bitsPerValue);
assert formatAndBits.bitsPerValue <= 32;
assert formatAndBits.format().isSupported(formatAndBits.bitsPerValue());
assert formatAndBits.bitsPerValue() <= 32;
encodedSizes[bpv] =
encodedSize(formatAndBits.format, PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue);
encodedSize(
formatAndBits.format(), PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue());
encoders[bpv] =
PackedInts.getEncoder(
formatAndBits.format, PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue);
formatAndBits.format(), PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue());
decoders[bpv] =
PackedInts.getDecoder(
formatAndBits.format, PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue);
formatAndBits.format(), PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue());
iterations[bpv] = computeIterations(decoders[bpv]);
out.writeVInt(formatAndBits.format.getId() << 5 | (formatAndBits.bitsPerValue - 1));
out.writeVInt(formatAndBits.format().getId() << 5 | (formatAndBits.bitsPerValue() - 1));
}
}

View File

@ -24,6 +24,7 @@ import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocValuesSkipIndexType;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
@ -209,7 +210,7 @@ public final class Lucene60FieldInfosFormat extends FieldInfosFormat {
storePayloads,
indexOptions,
docValuesType,
false,
DocValuesSkipIndexType.NONE,
dvGen,
attributes,
pointDataDimensionCount,
@ -347,7 +348,7 @@ public final class Lucene60FieldInfosFormat extends FieldInfosFormat {
output.writeVInt(fi.number);
byte bits = 0x0;
if (fi.hasVectors()) bits |= STORE_TERMVECTOR;
if (fi.hasTermVectors()) bits |= STORE_TERMVECTOR;
if (fi.omitsNorms()) bits |= OMIT_NORMS;
if (fi.hasPayloads()) bits |= STORE_PAYLOADS;
if (fi.isSoftDeletesField()) bits |= SOFT_DELETES_FIELD;

View File

@ -17,8 +17,6 @@
package org.apache.lucene.backward_codecs.lucene80;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.backward_codecs.packed.LegacyDirectMonotonicReader;
import org.apache.lucene.backward_codecs.packed.LegacyDirectReader;
import org.apache.lucene.backward_codecs.store.EndiannessReverserUtil;
@ -41,6 +39,7 @@ import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.internal.hppc.IntObjectHashMap;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput;
@ -53,11 +52,11 @@ import org.apache.lucene.util.compress.LZ4;
/** reader for {@link Lucene80DocValuesFormat} */
final class Lucene80DocValuesProducer extends DocValuesProducer {
private final Map<String, NumericEntry> numerics = new HashMap<>();
private final Map<String, BinaryEntry> binaries = new HashMap<>();
private final Map<String, SortedEntry> sorted = new HashMap<>();
private final Map<String, SortedSetEntry> sortedSets = new HashMap<>();
private final Map<String, SortedNumericEntry> sortedNumerics = new HashMap<>();
private final IntObjectHashMap<NumericEntry> numerics = new IntObjectHashMap<>();
private final IntObjectHashMap<BinaryEntry> binaries = new IntObjectHashMap<>();
private final IntObjectHashMap<SortedEntry> sorted = new IntObjectHashMap<>();
private final IntObjectHashMap<SortedSetEntry> sortedSets = new IntObjectHashMap<>();
private final IntObjectHashMap<SortedNumericEntry> sortedNumerics = new IntObjectHashMap<>();
private final IndexInput data;
private final int maxDoc;
private int version = -1;
@ -139,7 +138,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
}
byte type = meta.readByte();
if (type == Lucene80DocValuesFormat.NUMERIC) {
numerics.put(info.name, readNumeric(meta));
numerics.put(info.number, readNumeric(meta));
} else if (type == Lucene80DocValuesFormat.BINARY) {
final boolean compressed;
if (version >= Lucene80DocValuesFormat.VERSION_CONFIGURABLE_COMPRESSION) {
@ -158,13 +157,13 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
} else {
compressed = version >= Lucene80DocValuesFormat.VERSION_BIN_COMPRESSED;
}
binaries.put(info.name, readBinary(meta, compressed));
binaries.put(info.number, readBinary(meta, compressed));
} else if (type == Lucene80DocValuesFormat.SORTED) {
sorted.put(info.name, readSorted(meta));
sorted.put(info.number, readSorted(meta));
} else if (type == Lucene80DocValuesFormat.SORTED_SET) {
sortedSets.put(info.name, readSortedSet(meta));
sortedSets.put(info.number, readSortedSet(meta));
} else if (type == Lucene80DocValuesFormat.SORTED_NUMERIC) {
sortedNumerics.put(info.name, readSortedNumeric(meta));
sortedNumerics.put(info.number, readSortedNumeric(meta));
} else {
throw new CorruptIndexException("invalid type: " + type, meta);
}
@ -426,7 +425,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
@Override
public NumericDocValues getNumeric(FieldInfo field) throws IOException {
NumericEntry entry = numerics.get(field.name);
NumericEntry entry = numerics.get(field.number);
return getNumeric(entry);
}
@ -915,7 +914,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
@Override
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
BinaryEntry entry = binaries.get(field.name);
BinaryEntry entry = binaries.get(field.number);
if (entry.compressed) {
return getCompressedBinary(entry);
} else {
@ -973,7 +972,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
@Override
public SortedDocValues getSorted(FieldInfo field) throws IOException {
SortedEntry entry = sorted.get(field.name);
SortedEntry entry = sorted.get(field.number);
return getSorted(entry);
}
@ -1407,7 +1406,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
@Override
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
SortedNumericEntry entry = sortedNumerics.get(field.name);
SortedNumericEntry entry = sortedNumerics.get(field.number);
if (entry.numValues == entry.numDocsWithField) {
return DocValues.singleton(getNumeric(entry));
}
@ -1543,7 +1542,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
@Override
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
SortedSetEntry entry = sortedSets.get(field.name);
SortedSetEntry entry = sortedSets.get(field.number);
if (entry.singleValueEntry != null) {
return DocValues.singleton(getSorted(entry.singleValueEntry));
}
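
The per-field lookup switches from name-keyed HashMaps to IntObjectHashMap keyed by FieldInfo.number, which avoids String hashing and Integer boxing on every DocValues lookup. A small sketch of the primitive-keyed API used above, assuming the org.apache.lucene.internal.hppc class imported in this hunk:

import org.apache.lucene.internal.hppc.IntObjectHashMap;

public class IntKeyedLookupSketch {
  public static void main(String[] args) {
    IntObjectHashMap<String> entries = new IntObjectHashMap<>();
    entries.put(3, "entry for field number 3"); // keyed by int field number, not field name
    System.out.println(entries.get(3));         // no boxing on the lookup path
    System.out.println(entries.get(7));         // absent keys come back as null, like HashMap
  }
}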

View File

@ -23,6 +23,7 @@ import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocValuesSkipIndexType;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
@ -186,7 +187,7 @@ public final class Lucene90FieldInfosFormat extends FieldInfosFormat {
storePayloads,
indexOptions,
docValuesType,
false,
DocValuesSkipIndexType.NONE,
dvGen,
attributes,
pointDataDimensionCount,
@ -333,7 +334,7 @@ public final class Lucene90FieldInfosFormat extends FieldInfosFormat {
output.writeVInt(fi.number);
byte bits = 0x0;
if (fi.hasVectors()) bits |= STORE_TERMVECTOR;
if (fi.hasTermVectors()) bits |= STORE_TERMVECTOR;
if (fi.omitsNorms()) bits |= OMIT_NORMS;
if (fi.hasPayloads()) bits |= STORE_PAYLOADS;
if (fi.isSoftDeletesField()) bits |= SOFT_DELETES_FIELD;

View File

@ -224,6 +224,9 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
@Override
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
FieldEntry fieldEntry = fields.get(field);
if (fieldEntry == null) {
throw new IllegalArgumentException("field=\"" + field + "\" not found");
}
return getOffHeapVectorValues(fieldEntry);
}

View File

@ -218,6 +218,9 @@ public final class Lucene91HnswVectorsReader extends KnnVectorsReader {
@Override
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
FieldEntry fieldEntry = fields.get(field);
if (fieldEntry == null) {
throw new IllegalArgumentException("field=\"" + field + "\" not found");
}
return getOffHeapVectorValues(fieldEntry);
}

View File

@ -215,6 +215,9 @@ public final class Lucene92HnswVectorsReader extends KnnVectorsReader {
@Override
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
FieldEntry fieldEntry = fields.get(field);
if (fieldEntry == null) {
throw new IllegalArgumentException("field=\"" + field + "\" not found");
}
return OffHeapFloatVectorValues.load(fieldEntry, vectorData);
}

View File

@ -233,6 +233,9 @@ public final class Lucene94HnswVectorsReader extends KnnVectorsReader {
@Override
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
FieldEntry fieldEntry = fields.get(field);
if (fieldEntry == null) {
throw new IllegalArgumentException("field=\"" + field + "\" not found");
}
if (fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
throw new IllegalArgumentException(
"field=\""
@ -248,6 +251,9 @@ public final class Lucene94HnswVectorsReader extends KnnVectorsReader {
@Override
public ByteVectorValues getByteVectorValues(String field) throws IOException {
FieldEntry fieldEntry = fields.get(field);
if (fieldEntry == null) {
throw new IllegalArgumentException("field=\"" + field + "\" not found");
}
if (fieldEntry.vectorEncoding != VectorEncoding.BYTE) {
throw new IllegalArgumentException(
"field=\""

View File

@ -241,6 +241,9 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
@Override
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
FieldEntry fieldEntry = fields.get(field);
if (fieldEntry == null) {
throw new IllegalArgumentException("field=\"" + field + "\" not found");
}
if (fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
throw new IllegalArgumentException(
"field=\""
@ -264,6 +267,9 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
@Override
public ByteVectorValues getByteVectorValues(String field) throws IOException {
FieldEntry fieldEntry = fields.get(field);
if (fieldEntry == null) {
throw new IllegalArgumentException("field=\"" + field + "\" not found");
}
if (fieldEntry.vectorEncoding != VectorEncoding.BYTE) {
throw new IllegalArgumentException(
"field=\""

View File

@ -46,10 +46,10 @@ import org.apache.lucene.store.IndexOutput;
* uptos(position, payload). 4. start offset.
*/
public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
private int[] lastSkipDoc;
private long[] lastSkipDocPointer;
private long[] lastSkipPosPointer;
private long[] lastSkipPayPointer;
private final int[] lastSkipDoc;
private final long[] lastSkipDocPointer;
private final long[] lastSkipPosPointer;
private final long[] lastSkipPayPointer;
private final IndexOutput docOut;
private final IndexOutput posOut;
@ -61,7 +61,7 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
private long curPayPointer;
private int curPosBufferUpto;
private int curPayloadByteUpto;
private CompetitiveImpactAccumulator[] curCompetitiveFreqNorms;
private final CompetitiveImpactAccumulator[] curCompetitiveFreqNorms;
private boolean fieldHasPositions;
private boolean fieldHasOffsets;
private boolean fieldHasPayloads;
@ -85,7 +85,12 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
lastSkipPosPointer = new long[maxSkipLevels];
if (payOut != null) {
lastSkipPayPointer = new long[maxSkipLevels];
} else {
lastSkipPayPointer = null;
}
} else {
lastSkipPosPointer = null;
lastSkipPayPointer = null;
}
curCompetitiveFreqNorms = new CompetitiveImpactAccumulator[maxSkipLevels];
for (int i = 0; i < maxSkipLevels; ++i) {
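
Making the skip-pointer arrays final means every constructor path must assign them exactly once, hence the explicit else branches that assign null when positions or payloads are absent. A compact sketch of that definite-assignment pattern (field names borrowed from the hunk; the class itself is illustrative):

public class FinalSkipPointersSketch {
  private final long[] lastSkipPosPointer;
  private final long[] lastSkipPayPointer;

  FinalSkipPointersSketch(boolean hasPositions, boolean hasPayloads, int maxSkipLevels) {
    if (hasPositions) {
      lastSkipPosPointer = new long[maxSkipLevels];
      // a final field must be assigned on every path, so the "absent" case is an explicit null
      lastSkipPayPointer = hasPayloads ? new long[maxSkipLevels] : null;
    } else {
      lastSkipPosPointer = null;
      lastSkipPayPointer = null;
    }
  }

  public static void main(String[] args) {
    System.out.println(new FinalSkipPointersSketch(true, false, 10).lastSkipPosPointer.length); // 10
  }
}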

View File

@ -642,13 +642,13 @@ public class BKDWriter60 implements Closeable {
throws IOException {
assert docMaps == null || readers.size() == docMaps.size();
BKDMergeQueue queue = new BKDMergeQueue(config.bytesPerDim, readers.size());
BKDMergeQueue queue = new BKDMergeQueue(config.bytesPerDim(), readers.size());
for (int i = 0; i < readers.size(); i++) {
PointValues pointValues = readers.get(i);
assert pointValues.getNumDimensions() == config.numDims
&& pointValues.getBytesPerDimension() == config.bytesPerDim
&& pointValues.getNumIndexDimensions() == config.numIndexDims;
assert pointValues.getNumDimensions() == config.numDims()
&& pointValues.getBytesPerDimension() == config.bytesPerDim()
&& pointValues.getNumIndexDimensions() == config.numIndexDims();
MergeState.DocMap docMap;
if (docMaps == null) {
docMap = null;
@ -1931,7 +1931,7 @@ public class BKDWriter60 implements Closeable {
private void computePackedValueBounds(
BKDRadixSelector.PathSlice slice, byte[] minPackedValue, byte[] maxPackedValue)
throws IOException {
try (PointReader reader = slice.writer.getReader(slice.start, slice.count)) {
try (PointReader reader = slice.writer().getReader(slice.start(), slice.count())) {
if (reader.next() == false) {
return;
}
@ -1995,16 +1995,16 @@ public class BKDWriter60 implements Closeable {
// least number of unique bytes at commonPrefixLengths[dim], which makes compression more
// efficient
HeapPointWriter heapSource;
if (points.writer instanceof HeapPointWriter == false) {
if (points.writer() instanceof HeapPointWriter == false) {
// Adversarial cases can cause this, e.g. merging big segments with most of the points
// deleted
heapSource = switchToHeap(points.writer);
heapSource = switchToHeap(points.writer());
} else {
heapSource = (HeapPointWriter) points.writer;
heapSource = (HeapPointWriter) points.writer();
}
int from = Math.toIntExact(points.start);
int to = Math.toIntExact(points.start + points.count);
int from = Math.toIntExact(points.start());
int to = Math.toIntExact(points.start() + points.count());
// we store common prefix on scratch1
computeCommonPrefixLength(heapSource, scratch1, from, to);
@ -2107,8 +2107,8 @@ public class BKDWriter60 implements Closeable {
: "nodeID=" + nodeID + " splitValues.length=" + splitPackedValues.length;
// How many points will be in the left tree:
long rightCount = points.count / 2;
long leftCount = points.count - rightCount;
long rightCount = points.count() / 2;
long leftCount = points.count() - rightCount;
BKDRadixSelector.PathSlice[] slices = new BKDRadixSelector.PathSlice[2];
@ -2128,9 +2128,9 @@ public class BKDWriter60 implements Closeable {
radixSelector.select(
points,
slices,
points.start,
points.start + points.count,
points.start + leftCount,
points.start(),
points.start() + points.count(),
points.start() + leftCount,
splitDim,
commonPrefixLen);

View File

@ -78,4 +78,9 @@ public class TestLucene90HnswVectorsFormat extends BaseKnnVectorsFormatTestCase
public void testEmptyByteVectorData() {
// unimplemented
}
@Override
public void testMergingWithDifferentByteKnnFields() {
// unimplemented
}
}

View File

@ -77,4 +77,9 @@ public class TestLucene91HnswVectorsFormat extends BaseKnnVectorsFormatTestCase
public void testEmptyByteVectorData() {
// unimplemented
}
@Override
public void testMergingWithDifferentByteKnnFields() {
// unimplemented
}
}

View File

@ -67,4 +67,9 @@ public class TestLucene92HnswVectorsFormat extends BaseKnnVectorsFormatTestCase
public void testEmptyByteVectorData() {
// unimplemented
}
@Override
public void testMergingWithDifferentByteKnnFields() {
// unimplemented
}
}

View File

@ -388,10 +388,14 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
// write the vector data to a temporary file
DocsWithFieldSet docsWithField =
switch (fieldInfo.getVectorEncoding()) {
case BYTE -> writeByteVectorData(
tempVectorData, MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
case FLOAT32 -> writeVectorData(
tempVectorData, MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
case BYTE ->
writeByteVectorData(
tempVectorData,
MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
case FLOAT32 ->
writeVectorData(
tempVectorData,
MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
};
CodecUtil.writeFooter(tempVectorData);
IOUtils.close(tempVectorData);
@ -638,18 +642,20 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
throws IOException {
int dim = fieldInfo.getVectorDimension();
return switch (fieldInfo.getVectorEncoding()) {
case BYTE -> new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
@Override
public byte[] copyValue(byte[] value) {
return ArrayUtil.copyOfSubArray(value, 0, dim);
}
};
case FLOAT32 -> new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
@Override
public float[] copyValue(float[] value) {
return ArrayUtil.copyOfSubArray(value, 0, dim);
}
};
case BYTE ->
new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
@Override
public byte[] copyValue(byte[] value) {
return ArrayUtil.copyOfSubArray(value, 0, dim);
}
};
case FLOAT32 ->
new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
@Override
public float[] copyValue(float[] value) {
return ArrayUtil.copyOfSubArray(value, 0, dim);
}
};
};
}
@ -663,12 +669,14 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer();
RandomVectorScorerSupplier scorerSupplier =
switch (fieldInfo.getVectorEncoding()) {
case BYTE -> defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
case FLOAT32 -> defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
case BYTE ->
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
case FLOAT32 ->
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
};
hnswGraphBuilder =
HnswGraphBuilder.create(scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed);
@ -693,9 +701,9 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
lastDocID = docID;
}
OnHeapHnswGraph getGraph() {
OnHeapHnswGraph getGraph() throws IOException {
if (vectors.size() > 0) {
return hnswGraphBuilder.getGraph();
return hnswGraphBuilder.getCompletedGraph();
} else {
return null;
}

View File

@ -18,6 +18,7 @@
package org.apache.lucene.backward_codecs.lucene95;
import static org.apache.lucene.backward_codecs.lucene95.Lucene95HnswVectorsFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
import static org.apache.lucene.codecs.KnnVectorsWriter.MergedVectorValues.hasVectorValues;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import java.io.IOException;
@ -414,10 +415,14 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
// write the vector data to a temporary file
DocsWithFieldSet docsWithField =
switch (fieldInfo.getVectorEncoding()) {
case BYTE -> writeByteVectorData(
tempVectorData, MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
case FLOAT32 -> writeVectorData(
tempVectorData, MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
case BYTE ->
writeByteVectorData(
tempVectorData,
MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
case FLOAT32 ->
writeVectorData(
tempVectorData,
MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
};
CodecUtil.writeFooter(tempVectorData);
IOUtils.close(tempVectorData);
@ -472,15 +477,19 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
IncrementalHnswGraphMerger merger =
new IncrementalHnswGraphMerger(fieldInfo, scorerSupplier, M, beamWidth);
for (int i = 0; i < mergeState.liveDocs.length; i++) {
merger.addReader(
mergeState.knnVectorsReaders[i], mergeState.docMaps[i], mergeState.liveDocs[i]);
if (hasVectorValues(mergeState.fieldInfos[i], fieldInfo.name)) {
merger.addReader(
mergeState.knnVectorsReaders[i], mergeState.docMaps[i], mergeState.liveDocs[i]);
}
}
DocIdSetIterator mergedVectorIterator = null;
switch (fieldInfo.getVectorEncoding()) {
case BYTE -> mergedVectorIterator =
KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState);
case FLOAT32 -> mergedVectorIterator =
KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState);
case BYTE ->
mergedVectorIterator =
KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState);
case FLOAT32 ->
mergedVectorIterator =
KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState);
}
graph =
merger.merge(
@ -680,18 +689,20 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
throws IOException {
int dim = fieldInfo.getVectorDimension();
return switch (fieldInfo.getVectorEncoding()) {
case BYTE -> new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
@Override
public byte[] copyValue(byte[] value) {
return ArrayUtil.copyOfSubArray(value, 0, dim);
}
};
case FLOAT32 -> new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
@Override
public float[] copyValue(float[] value) {
return ArrayUtil.copyOfSubArray(value, 0, dim);
}
};
case BYTE ->
new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
@Override
public byte[] copyValue(byte[] value) {
return ArrayUtil.copyOfSubArray(value, 0, dim);
}
};
case FLOAT32 ->
new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
@Override
public float[] copyValue(float[] value) {
return ArrayUtil.copyOfSubArray(value, 0, dim);
}
};
};
}
@ -704,12 +715,14 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
vectors = new ArrayList<>();
RandomVectorScorerSupplier scorerSupplier =
switch (fieldInfo.getVectorEncoding()) {
case BYTE -> defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
case FLOAT32 -> defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
case BYTE ->
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
case FLOAT32 ->
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
};
hnswGraphBuilder =
HnswGraphBuilder.create(scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed);
@ -732,9 +745,9 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
lastDocID = docID;
}
OnHeapHnswGraph getGraph() {
OnHeapHnswGraph getGraph() throws IOException {
if (vectors.size() > 0) {
return hnswGraphBuilder.getGraph();
return hnswGraphBuilder.getCompletedGraph();
} else {
return null;
}

View File

@ -72,7 +72,7 @@ public class TestIndexSortBackwardsCompatibility extends BackwardsCompatibilityT
final Sort sort;
try (DirectoryReader reader = DirectoryReader.open(directory)) {
assertEquals(1, reader.leaves().size());
sort = reader.leaves().get(0).reader().getMetaData().getSort();
sort = reader.leaves().get(0).reader().getMetaData().sort();
assertNotNull(sort);
searchExampleIndex(reader);
}
@ -125,8 +125,8 @@ public class TestIndexSortBackwardsCompatibility extends BackwardsCompatibilityT
.add(new TermQuery(new Term("bid", "" + i)), BooleanClause.Occur.MUST)
.build(),
2);
assertEquals(2, children.totalHits.value);
assertEquals(1, parents.totalHits.value);
assertEquals(2, children.totalHits.value());
assertEquals(1, parents.totalHits.value());
// make sure it's sorted
assertEquals(children.scoreDocs[0].doc + 1, children.scoreDocs[1].doc);
assertEquals(children.scoreDocs[1].doc + 1, parents.scoreDocs[0].doc);
@ -140,7 +140,7 @@ public class TestIndexSortBackwardsCompatibility extends BackwardsCompatibilityT
public void testSortedIndex() throws Exception {
try (DirectoryReader reader = DirectoryReader.open(directory)) {
assertEquals(1, reader.leaves().size());
Sort sort = reader.leaves().get(0).reader().getMetaData().getSort();
Sort sort = reader.leaves().get(0).reader().getMetaData().sort();
assertNotNull(sort);
assertEquals("<long: \"dateDV\">!", sort.toString());
// This will confirm the docs are really sorted
@ -195,28 +195,28 @@ public class TestIndexSortBackwardsCompatibility extends BackwardsCompatibilityT
IndexSearcher searcher = newSearcher(reader);
TopDocs topDocs = searcher.search(new FieldExistsQuery("titleTokenized"), 10);
assertEquals(50, topDocs.totalHits.value);
assertEquals(50, topDocs.totalHits.value());
topDocs = searcher.search(new FieldExistsQuery("titleDV"), 10);
assertEquals(50, topDocs.totalHits.value);
assertEquals(50, topDocs.totalHits.value());
topDocs =
searcher.search(
IntPoint.newRangeQuery("docid_int", 42, 44),
10,
new Sort(new SortField("docid_intDV", SortField.Type.INT)));
assertEquals(3, topDocs.totalHits.value);
assertEquals(3, topDocs.totalHits.value());
assertEquals(3, topDocs.scoreDocs.length);
assertEquals(42, ((FieldDoc) topDocs.scoreDocs[0]).fields[0]);
assertEquals(43, ((FieldDoc) topDocs.scoreDocs[1]).fields[0]);
assertEquals(44, ((FieldDoc) topDocs.scoreDocs[2]).fields[0]);
topDocs = searcher.search(new TermQuery(new Term("body", "the")), 5);
assertTrue(topDocs.totalHits.value > 0);
assertTrue(topDocs.totalHits.value() > 0);
topDocs =
searcher.search(
new MatchAllDocsQuery(), 5, new Sort(new SortField("dateDV", SortField.Type.LONG)));
assertEquals(50, topDocs.totalHits.value);
assertEquals(50, topDocs.totalHits.value());
assertEquals(5, topDocs.scoreDocs.length);
long firstDate = (Long) ((FieldDoc) topDocs.scoreDocs[0]).fields[0];
long lastDate = (Long) ((FieldDoc) topDocs.scoreDocs[4]).fields[0];
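
These assertions change because TopDocs.totalHits (and the leaf metadata) are now records, so the old public fields become accessor calls such as value(), relation(), and sort(). A self-contained sketch against an in-memory index, assuming a Lucene version where TotalHits is a record:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.store.ByteBuffersDirectory;

public class TotalHitsSketch {
  public static void main(String[] args) throws Exception {
    try (ByteBuffersDirectory dir = new ByteBuffersDirectory()) {
      try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
        Document doc = new Document();
        doc.add(new TextField("body", "hello world", Field.Store.NO));
        writer.addDocument(doc);
      }
      try (DirectoryReader reader = DirectoryReader.open(dir)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 10);
        // Accessor calls replace the old public fields:
        System.out.println(topDocs.totalHits.value());                                   // 1
        System.out.println(topDocs.totalHits.relation() == TotalHits.Relation.EQUAL_TO); // true
      }
    }
  }
}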

View File

@ -0,0 +1,108 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.jmh;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.codecs.lucene912.ForDeltaUtil;
import org.apache.lucene.codecs.lucene912.ForUtil;
import org.apache.lucene.codecs.lucene912.PostingIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.IOUtils;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
@State(Scope.Benchmark)
@Warmup(iterations = 5, time = 1)
@Measurement(iterations = 5, time = 1)
@Fork(
value = 3,
jvmArgsAppend = {"-Xmx1g", "-Xms1g", "-XX:+AlwaysPreTouch"})
public class PostingIndexInputBenchmark {
private Path path;
private Directory dir;
private IndexInput in;
private PostingIndexInput postingIn;
private final ForUtil forUtil = new ForUtil();
private final ForDeltaUtil forDeltaUtil = new ForDeltaUtil();
private final long[] values = new long[128];
@Param({"2", "3", "4", "5", "6", "7", "8", "9", "10"})
public int bpv;
@Setup(Level.Trial)
public void setup() throws Exception {
path = Files.createTempDirectory("forUtil");
dir = MMapDirectory.open(path);
try (IndexOutput out = dir.createOutput("docs", IOContext.DEFAULT)) {
Random r = new Random(0);
// Write enough random data to not reach EOF while decoding
for (int i = 0; i < 100; ++i) {
out.writeLong(r.nextLong());
}
}
in = dir.openInput("docs", IOContext.DEFAULT);
postingIn = new PostingIndexInput(in, forUtil, forDeltaUtil);
}
@TearDown(Level.Trial)
public void tearDown() throws Exception {
if (dir != null) {
dir.deleteFile("docs");
}
IOUtils.close(in, dir);
in = null;
dir = null;
Files.deleteIfExists(path);
}
@Benchmark
public void decode(Blackhole bh) throws IOException {
in.seek(3); // random unaligned offset
postingIn.decode(bpv, values);
bh.consume(values);
}
@Benchmark
public void decodeAndPrefixSum(Blackhole bh) throws IOException {
in.seek(3); // random unaligned offset
postingIn.decodeAndPrefixSum(bpv, 100, values);
bh.consume(values);
}
}
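
For reference, a hedged sketch of launching just this benchmark programmatically through the standard JMH runner API; it assumes the launcher class sits in the same package as the benchmark and that JMH is on the classpath (the project's JMH Gradle setup can be used instead):

import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;

public class RunPostingIndexInputBenchmark {
  public static void main(String[] args) throws RunnerException {
    Options opt =
        new OptionsBuilder()
            .include(PostingIndexInputBenchmark.class.getSimpleName())
            .param("bpv", "4", "8") // restrict the sweep to a couple of bit widths
            .build();
    new Runner(opt).run();
  }
}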

View File

@ -17,11 +17,10 @@
# -------------------------------------------------------------------------------------
# multi val params are iterated by NewRound's, added to reports, start with column name.
# collector.class can be:
# Fully Qualified Class Name of a Collector with a empty constructor
# topScoreDocOrdered - Creates a TopScoreDocCollector that requires in order docs
# topScoreDocUnordered - Like above, but allows out of order
collector.class=coll:topScoreDoc
# collector.manager.class can be:
# Fully Qualified Class Name of a CollectorManager with an empty constructor
# topScoreDoc - Creates a TopScoreDocCollectorManager
collector.manager.class=coll:topScoreDoc
analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer
directory=FSDirectory

View File

@ -17,11 +17,10 @@
# -------------------------------------------------------------------------------------
# multi val params are iterated by NewRound's, added to reports, start with column name.
# collector.class can be:
# Fully Qualified Class Name of a Collector with a empty constructor
# topScoreDocOrdered - Creates a TopScoreDocCollector that requires in order docs
# topScoreDocUnordered - Like above, but allows out of order
collector.class=coll:topScoreDoc
# collector.manager.class can be:
# Fully Qualified Class Name of a CollectorManager with an empty constructor
# topScoreDoc - Creates a TopScoreDocCollectorManager
collector.manager.class=coll:topScoreDoc
analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer
directory=FSDirectory

View File

@ -238,7 +238,7 @@ public class EnwikiContentSource extends ContentSource {
time = null;
id = null;
break;
// intentional fall-through.
// intentional fall-through.
case BODY:
case DATE:
case TITLE:

View File

@ -99,7 +99,7 @@ public class SpatialDocMaker extends DocMaker {
return makeRPTStrategy(SPATIAL_FIELD, config, configMap, ctx);
case "composite":
return makeCompositeStrategy(config, configMap, ctx);
// TODO add more as-needed
// TODO add more as-needed
default:
throw new IllegalStateException("Unknown spatial.strategy: " + strategyName);
}

View File

@ -24,7 +24,7 @@ import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiBits;
import org.apache.lucene.index.StoredFields;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.CollectorManager;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
@ -119,9 +119,7 @@ public abstract class ReadTask extends PerfTask {
hits = searcher.search(q, numHits);
}
} else {
Collector collector = createCollector();
searcher.search(q, collector);
searcher.search(q, createCollectorManager());
// hits = collector.topDocs();
}
@ -184,9 +182,8 @@ public abstract class ReadTask extends PerfTask {
return res;
}
protected Collector createCollector() throws Exception {
return new TopScoreDocCollectorManager(numHits(), withTotalHits() ? Integer.MAX_VALUE : 1)
.newCollector();
protected CollectorManager<?, ?> createCollectorManager() throws Exception {
return new TopScoreDocCollectorManager(numHits(), withTotalHits() ? Integer.MAX_VALUE : 1);
}
protected Document retrieveDoc(StoredFields storedFields, int id) throws IOException {
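
ReadTask now hands IndexSearcher a CollectorManager instead of a bare Collector, letting the searcher create one collector per leaf slice and reduce the partial results. A hedged sketch of the call pattern, using the same TopScoreDocCollectorManager arguments as the hunk above:

import java.io.IOException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollectorManager;

public final class CollectorManagerSketch {
  private CollectorManagerSketch() {}

  // numHits and the total-hit threshold mirror the arguments used in the hunk above.
  public static TopDocs topHits(IndexSearcher searcher, Query query, int numHits) throws IOException {
    TopScoreDocCollectorManager manager = new TopScoreDocCollectorManager(numHits, Integer.MAX_VALUE);
    // The searcher instantiates one collector per slice and reduces them into a single TopDocs.
    return searcher.search(query, manager);
  }
}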

View File

@ -19,7 +19,7 @@ package org.apache.lucene.benchmark.byTask.tasks;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.CollectorManager;
import org.apache.lucene.search.TopScoreDocCollectorManager;
/** Does search w/ a custom collector */
@ -37,7 +37,11 @@ public class SearchWithCollectorTask extends SearchTask {
// check to make sure either the doc is being stored
PerfRunData runData = getRunData();
Config config = runData.getConfig();
clnName = config.get("collector.class", "");
if (config.get("collector.class", null) != null) {
throw new IllegalArgumentException(
"collector.class is no longer supported as a config parameter, use collector.manager.class instead to provide a CollectorManager class name");
}
clnName = config.get("collector.manager.class", "");
}
@Override
@ -46,18 +50,17 @@ public class SearchWithCollectorTask extends SearchTask {
}
@Override
protected Collector createCollector() throws Exception {
Collector collector = null;
protected CollectorManager<?, ?> createCollectorManager() throws Exception {
CollectorManager<?, ?> collectorManager;
if (clnName.equalsIgnoreCase("topScoreDoc") == true) {
collector =
new TopScoreDocCollectorManager(numHits(), null, Integer.MAX_VALUE, false).newCollector();
collectorManager = new TopScoreDocCollectorManager(numHits(), Integer.MAX_VALUE);
} else if (clnName.length() > 0) {
collector = Class.forName(clnName).asSubclass(Collector.class).getConstructor().newInstance();
collectorManager =
Class.forName(clnName).asSubclass(CollectorManager.class).getConstructor().newInstance();
} else {
collector = super.createCollector();
collectorManager = super.createCollectorManager();
}
return collector;
return collectorManager;
}
@Override

View File

@ -91,7 +91,7 @@ public class TestDocMaker extends BenchmarkTestCase {
IndexReader reader = DirectoryReader.open(runData.getDirectory());
IndexSearcher searcher = newSearcher(reader);
TopDocs td = searcher.search(new TermQuery(new Term("key", "value")), 10);
assertEquals(numExpectedResults, td.totalHits.value);
assertEquals(numExpectedResults, td.totalHits.value());
reader.close();
}

View File

@ -160,7 +160,7 @@ public class TestLineDocSource extends BenchmarkTestCase {
reader = DirectoryReader.open(runData.getDirectory());
searcher = newSearcher(reader);
TopDocs td = searcher.search(new TermQuery(new Term("body", "body")), 10);
assertEquals(numAdds, td.totalHits.value);
assertEquals(numAdds, td.totalHits.value());
assertNotNull(td.scoreDocs[0]);
if (storedField == null) {

View File

@ -151,13 +151,13 @@ public class BM25NBClassifier implements Classifier<BytesRef> {
if (!assignedClasses.isEmpty()) {
Collections.sort(assignedClasses);
// this is a negative number closest to 0 = a
double smax = assignedClasses.get(0).getScore();
double smax = assignedClasses.get(0).score();
double sumLog = 0;
// log(sum(exp(x_n-a)))
for (ClassificationResult<BytesRef> cr : assignedClasses) {
// getScore-smax <=0 (both negative, smax is the smallest abs()
sumLog += Math.exp(cr.getScore() - smax);
sumLog += Math.exp(cr.score() - smax);
}
// loga=a+log(sum(exp(x_n-a))) = log(sum(exp(x_n)))
double loga = smax;
@ -165,8 +165,8 @@ public class BM25NBClassifier implements Classifier<BytesRef> {
// 1/sum*x = exp(log(x))*1/sum = exp(log(x)-log(sum))
for (ClassificationResult<BytesRef> cr : assignedClasses) {
double scoreDiff = cr.getScore() - loga;
returnList.add(new ClassificationResult<>(cr.getAssignedClass(), Math.exp(scoreDiff)));
double scoreDiff = cr.score() - loga;
returnList.add(new ClassificationResult<>(cr.assignedClass(), Math.exp(scoreDiff)));
}
}
return returnList;
@ -216,7 +216,7 @@ public class BM25NBClassifier implements Classifier<BytesRef> {
builder.add(query, BooleanClause.Occur.MUST);
}
TopDocs search = indexSearcher.search(builder.build(), 1);
return search.totalHits.value > 0 ? search.scoreDocs[0].score : 1;
return search.totalHits.value() > 0 ? search.scoreDocs[0].score : 1;
}
private double calculateLogPrior(Term term) throws IOException {
@ -227,6 +227,6 @@ public class BM25NBClassifier implements Classifier<BytesRef> {
bq.add(query, BooleanClause.Occur.MUST);
}
TopDocs topDocs = indexSearcher.search(bq.build(), 1);
return topDocs.totalHits.value > 0 ? Math.log(topDocs.scoreDocs[0].score) : 0;
return topDocs.totalHits.value() > 0 ? Math.log(topDocs.scoreDocs[0].score) : 0;
}
}
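
The normalization in this classifier is the standard log-sum-exp trick: with a = max of the scores x_n, log(sum_n exp(x_n)) = a + log(sum_n exp(x_n - a)), which keeps the exponentials from underflowing when the log-likelihood scores are large negative numbers. A small self-contained sketch of the same computation:

public class LogSumExpSketch {
  // Turns unnormalized log-scores into probabilities that sum to 1, without underflow.
  static double[] normalize(double[] logScores) {
    double max = Double.NEGATIVE_INFINITY;
    for (double x : logScores) {
      max = Math.max(max, x);
    }
    double sum = 0;
    for (double x : logScores) {
      sum += Math.exp(x - max); // each term is <= 1, so nothing flushes to 0 prematurely
    }
    double logTotal = max + Math.log(sum); // log of the sum of exp(logScores)
    double[] probs = new double[logScores.length];
    for (int i = 0; i < logScores.length; i++) {
      probs[i] = Math.exp(logScores[i] - logTotal);
    }
    return probs;
  }

  public static void main(String[] args) {
    double[] probs = normalize(new double[] {-1050.0, -1051.0, -1052.0});
    System.out.println(probs[0] + " " + probs[1] + " " + probs[2]); // ~0.665, ~0.245, ~0.090
  }
}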

View File

@ -148,7 +148,7 @@ public class BooleanPerceptronClassifier implements Classifier<Boolean> {
if (textField != null && classField != null) {
// assign class to the doc
ClassificationResult<Boolean> classificationResult = assignClass(textField.stringValue());
Boolean assignedClass = classificationResult.getAssignedClass();
Boolean assignedClass = classificationResult.assignedClass();
Boolean correctClass = Boolean.valueOf(classField.stringValue());
double modifier = Math.signum(correctClass.compareTo(assignedClass));

View File

@ -126,7 +126,7 @@ public class CachingNaiveBayesClassifier extends SimpleNaiveBayesClassifier {
int removeIdx = -1;
int i = 0;
for (ClassificationResult<BytesRef> cr : ret) {
if (cr.getAssignedClass().equals(cclass)) {
if (cr.assignedClass().equals(cclass)) {
removeIdx = i;
break;
}
@ -137,7 +137,7 @@ public class CachingNaiveBayesClassifier extends SimpleNaiveBayesClassifier {
ClassificationResult<BytesRef> toRemove = ret.get(removeIdx);
ret.add(
new ClassificationResult<>(
toRemove.getAssignedClass(), toRemove.getScore() + Math.log(wordProbability)));
toRemove.assignedClass(), toRemove.score() + Math.log(wordProbability)));
ret.remove(removeIdx);
}
}

View File

@ -20,44 +20,15 @@ package org.apache.lucene.classification;
* The result of a call to {@link Classifier#assignClass(String)} holding an assigned class of type
* <code>T</code> and a score.
*
* @param assignedClass the class <code>T</code> assigned by a {@link Classifier}
* @param score score the score for the assignedClass as a <code>double</code>
* @lucene.experimental
*/
public class ClassificationResult<T> implements Comparable<ClassificationResult<T>> {
private final T assignedClass;
private final double score;
/**
* Constructor
*
* @param assignedClass the class <code>T</code> assigned by a {@link Classifier}
* @param score the score for the assignedClass as a <code>double</code>
*/
public ClassificationResult(T assignedClass, double score) {
this.assignedClass = assignedClass;
this.score = score;
}
/**
* retrieve the result class
*
* @return a <code>T</code> representing an assigned class
*/
public T getAssignedClass() {
return assignedClass;
}
/**
* retrieve the result score
*
* @return a <code>double</code> representing a result score
*/
public double getScore() {
return score;
}
public record ClassificationResult<T>(T assignedClass, double score)
implements Comparable<ClassificationResult<T>> {
@Override
public int compareTo(ClassificationResult<T> o) {
return Double.compare(o.getScore(), this.getScore());
return Double.compare(o.score(), this.score());
}
}
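All of the getAssignedClass()/getScore() call-site renames in this diff follow from the record conversion above: the record generates assignedClass() and score() accessors, and compareTo() still orders results by descending score. A small usage sketch, with class and variable names that are illustrative only:

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.classification.ClassificationResult;
import org.apache.lucene.util.BytesRef;

class ClassificationResultRecordSketch {
  static BytesRef bestClass() {
    // Construction is unchanged; only the accessor names differ from the old getters.
    ClassificationResult<BytesRef> tech = new ClassificationResult<>(new BytesRef("technology"), 0.87);
    ClassificationResult<BytesRef> sports = new ClassificationResult<>(new BytesRef("sports"), 0.13);
    List<ClassificationResult<BytesRef>> results = new ArrayList<>(List.of(tech, sports));
    Collections.sort(results); // compareTo sorts by descending score()
    double topScore = results.get(0).score(); // was getScore()
    assert topScore >= results.get(1).score();
    return results.get(0).assignedClass(); // was getAssignedClass()
  }
}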

View File

@ -108,9 +108,9 @@ public class KNearestFuzzyClassifier implements Classifier<BytesRef> {
ClassificationResult<BytesRef> assignedClass = null;
double maxscore = -Double.MAX_VALUE;
for (ClassificationResult<BytesRef> cl : assignedClasses) {
if (cl.getScore() > maxscore) {
if (cl.score() > maxscore) {
assignedClass = cl;
maxscore = cl.getScore();
maxscore = cl.score();
}
}
return assignedClass;
@ -159,7 +159,7 @@ public class KNearestFuzzyClassifier implements Classifier<BytesRef> {
Map<BytesRef, Integer> classCounts = new HashMap<>();
Map<BytesRef, Double> classBoosts =
new HashMap<>(); // this is a boost based on class ranking positions in topDocs
float maxScore = topDocs.totalHits.value == 0 ? Float.NaN : topDocs.scoreDocs[0].score;
float maxScore = topDocs.totalHits.value() == 0 ? Float.NaN : topDocs.scoreDocs[0].score;
StoredFields storedFields = indexSearcher.storedFields();
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
IndexableField storableField = storedFields.document(scoreDoc.doc).getField(classFieldName);
@ -193,7 +193,7 @@ public class KNearestFuzzyClassifier implements Classifier<BytesRef> {
if (sumdoc < k) {
for (ClassificationResult<BytesRef> cr : temporaryList) {
returnList.add(
new ClassificationResult<>(cr.getAssignedClass(), cr.getScore() * k / (double) sumdoc));
new ClassificationResult<>(cr.assignedClass(), cr.score() * k / (double) sumdoc));
}
} else {
returnList = temporaryList;

View File

@ -129,9 +129,9 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
ClassificationResult<BytesRef> assignedClass = null;
double maxscore = -Double.MAX_VALUE;
for (ClassificationResult<BytesRef> cl : assignedClasses) {
if (cl.getScore() > maxscore) {
if (cl.score() > maxscore) {
assignedClass = cl;
maxscore = cl.getScore();
maxscore = cl.score();
}
}
return assignedClass;
@ -192,7 +192,7 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
Map<BytesRef, Integer> classCounts = new HashMap<>();
Map<BytesRef, Double> classBoosts =
new HashMap<>(); // this is a boost based on class ranking positions in topDocs
float maxScore = topDocs.totalHits.value == 0 ? Float.NaN : topDocs.scoreDocs[0].score;
float maxScore = topDocs.totalHits.value() == 0 ? Float.NaN : topDocs.scoreDocs[0].score;
StoredFields storedFields = indexSearcher.storedFields();
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
IndexableField[] storableFields =
@ -229,7 +229,7 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
if (sumdoc < k) {
for (ClassificationResult<BytesRef> cr : temporaryList) {
returnList.add(
new ClassificationResult<>(cr.getAssignedClass(), cr.getScore() * k / (double) sumdoc));
new ClassificationResult<>(cr.assignedClass(), cr.score() * k / (double) sumdoc));
}
} else {
returnList = temporaryList;

View File

@ -105,9 +105,9 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
ClassificationResult<BytesRef> assignedClass = null;
double maxscore = -Double.MAX_VALUE;
for (ClassificationResult<BytesRef> c : assignedClasses) {
if (c.getScore() > maxscore) {
if (c.score() > maxscore) {
assignedClass = c;
maxscore = c.getScore();
maxscore = c.score();
}
}
return assignedClass;
@ -297,13 +297,13 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
if (!assignedClasses.isEmpty()) {
Collections.sort(assignedClasses);
// this is a negative number closest to 0, which we call a
double smax = assignedClasses.get(0).getScore();
double smax = assignedClasses.get(0).score();
double sumLog = 0;
// log(sum(exp(x_n-a)))
for (ClassificationResult<BytesRef> cr : assignedClasses) {
// getScore - smax <= 0 (both negative, smax has the smallest abs())
sumLog += Math.exp(cr.getScore() - smax);
sumLog += Math.exp(cr.score() - smax);
}
// loga=a+log(sum(exp(x_n-a))) = log(sum(exp(x_n)))
double loga = smax;
@ -311,8 +311,8 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
// 1/sum*x = exp(log(x))*1/sum = exp(log(x)-log(sum))
for (ClassificationResult<BytesRef> cr : assignedClasses) {
double scoreDiff = cr.getScore() - loga;
returnList.add(new ClassificationResult<>(cr.getAssignedClass(), Math.exp(scoreDiff)));
double scoreDiff = cr.score() - loga;
returnList.add(new ClassificationResult<>(cr.assignedClass(), Math.exp(scoreDiff)));
}
}
return returnList;
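The arithmetic in this hunk is the standard log-sum-exp normalization: with log scores x_n and a = max x_n, log(sum(exp(x_n))) = a + log(sum(exp(x_n - a))), and each class probability is exp(x_n - logSum). A standalone sketch of that normalization (class and method names are illustrative, not taken from this diff):

class LogSumExpSketch {
  // Turns raw log-scores into probabilities that sum to 1, without
  // underflowing Math.exp() on large negative log values.
  static double[] normalizeLogScores(double[] logScores) {
    double max = Double.NEGATIVE_INFINITY;
    for (double x : logScores) {
      max = Math.max(max, x);
    }
    double sum = 0;
    for (double x : logScores) {
      sum += Math.exp(x - max); // x - max <= 0, so exp() stays in range
    }
    double logSum = max + Math.log(sum); // log(sum(exp(x_n)))
    double[] probs = new double[logScores.length];
    for (int i = 0; i < logScores.length; i++) {
      probs[i] = Math.exp(logScores[i] - logSum);
    }
    return probs;
  }
}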

View File

@ -80,9 +80,9 @@ public class SimpleNaiveBayesDocumentClassifier extends SimpleNaiveBayesClassifi
ClassificationResult<BytesRef> assignedClass = null;
double maxscore = -Double.MAX_VALUE;
for (ClassificationResult<BytesRef> c : assignedClasses) {
if (c.getScore() > maxscore) {
if (c.score() > maxscore) {
assignedClass = c;
maxscore = c.getScore();
maxscore = c.score();
}
}
return assignedClass;

View File

@ -107,7 +107,7 @@ public class ConfusionMatrixGenerator {
time += end - start;
if (result != null) {
T assignedClass = result.getAssignedClass();
T assignedClass = result.assignedClass();
if (assignedClass != null) {
counter++;
String classified =

View File

@ -138,13 +138,13 @@ public class DatasetSplitter {
// iterate over existing documents
StoredFields storedFields = originalIndex.storedFields();
for (GroupDocs<Object> group : topGroups.groups) {
assert group.totalHits.relation == TotalHits.Relation.EQUAL_TO;
long totalHits = group.totalHits.value;
assert group.totalHits().relation() == TotalHits.Relation.EQUAL_TO;
long totalHits = group.totalHits().value();
double testSize = totalHits * testRatio;
int tc = 0;
double cvSize = totalHits * crossValidationRatio;
int cvc = 0;
for (ScoreDoc scoreDoc : group.scoreDocs) {
for (ScoreDoc scoreDoc : group.scoreDocs()) {
// create a new document for indexing
Document doc = createNewDoc(storedFields, ft, scoreDoc, fieldNames);

View File

@ -91,7 +91,7 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {
Classifier<T> classifier, String inputDoc, T expectedResult) throws Exception {
ClassificationResult<T> classificationResult = classifier.assignClass(inputDoc);
assertNotNull(classificationResult);
T assignedClass = classificationResult.getAssignedClass();
T assignedClass = classificationResult.assignedClass();
assertNotNull(assignedClass);
assertEquals(
"got an assigned class of " + assignedClass,
@ -101,7 +101,7 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {
assignedClass instanceof BytesRef
? ((BytesRef) assignedClass).utf8ToString()
: assignedClass);
double score = classificationResult.getScore();
double score = classificationResult.score();
assertTrue("score should be between 0 and 1, got:" + score, score <= 1 && score >= 0);
return classificationResult;
}
@ -130,18 +130,17 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {
getSampleIndex(analyzer);
ClassificationResult<T> classificationResult = classifier.assignClass(inputDoc);
assertNotNull(classificationResult.getAssignedClass());
assertNotNull(classificationResult.assignedClass());
assertEquals(
"got an assigned class of " + classificationResult.getAssignedClass(),
"got an assigned class of " + classificationResult.assignedClass(),
expectedResult,
classificationResult.getAssignedClass());
double score = classificationResult.getScore();
classificationResult.assignedClass());
double score = classificationResult.score();
assertTrue("score should be between 0 and 1, got: " + score, score <= 1 && score >= 0);
updateSampleIndex();
ClassificationResult<T> secondClassificationResult = classifier.assignClass(inputDoc);
assertEquals(
classificationResult.getAssignedClass(), secondClassificationResult.getAssignedClass());
assertEquals(Double.valueOf(score), Double.valueOf(secondClassificationResult.getScore()));
assertEquals(classificationResult.assignedClass(), secondClassificationResult.assignedClass());
assertEquals(Double.valueOf(score), Double.valueOf(secondClassificationResult.score()));
}
protected LeafReader getSampleIndex(Analyzer analyzer) throws IOException {

View File

@ -88,7 +88,7 @@ public class TestKNearestNeighborClassifier extends ClassificationTestBase<Bytes
textFieldName),
TECHNOLOGY_INPUT,
TECHNOLOGY_RESULT);
assertTrue(resultDS.getScore() != resultLMS.getScore());
assertTrue(resultDS.score() != resultLMS.score());
} finally {
IOUtils.close(leafReader);
}
@ -113,7 +113,7 @@ public class TestKNearestNeighborClassifier extends ClassificationTestBase<Bytes
leafReader, null, analyzer, null, 6, 1, 1, categoryFieldName, textFieldName);
List<ClassificationResult<BytesRef>> classes =
knnClassifier.getClasses(STRONG_TECHNOLOGY_INPUT);
assertTrue(classes.get(0).getScore() > classes.get(1).getScore());
assertTrue(classes.get(0).score() > classes.get(1).score());
checkCorrectClassification(knnClassifier, STRONG_TECHNOLOGY_INPUT, TECHNOLOGY_RESULT);
} finally {
IOUtils.close(leafReader);
@ -139,7 +139,7 @@ public class TestKNearestNeighborClassifier extends ClassificationTestBase<Bytes
leafReader, null, analyzer, null, 3, 1, 1, categoryFieldName, textFieldName);
List<ClassificationResult<BytesRef>> classes =
knnClassifier.getClasses(SUPER_STRONG_TECHNOLOGY_INPUT);
assertTrue(classes.get(0).getScore() > classes.get(1).getScore());
assertTrue(classes.get(0).score() > classes.get(1).score());
checkCorrectClassification(knnClassifier, SUPER_STRONG_TECHNOLOGY_INPUT, TECHNOLOGY_RESULT);
} finally {
IOUtils.close(leafReader);

Some files were not shown because too many files have changed in this diff.