mirror of https://github.com/apache/lucene.git
Merge branch 'main' into optimize_prefix_query
This commit is contained in commit 73b4ced245.
@ -30,7 +30,7 @@ jobs:
strategy:
matrix:
os: [ ubuntu-latest ]
-java-version: [ '22' ]
+java-version: [ '23-ea' ]
uses-alt-java: [ true, false ]

runs-on: ${{ matrix.os }}

@ -61,7 +61,16 @@ jobs:
# https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#setting-an-environment-variable
echo "RUNTIME_JAVA_HOME=${{ env.ALT_JAVA_DIR }}" >> "$GITHUB_ENV"

- run: ./gradlew -p lucene/core check -x test
- name: ./gradlew tidy
run: |
./gradlew tidy
if [ ! -z "$(git status --porcelain)" ]; then
echo ":warning: **tidy left local checkout in modified state**" >> $GITHUB_STEP_SUMMARY
echo '```' >> $GITHUB_STEP_SUMMARY
git status --porcelain >> $GITHUB_STEP_SUMMARY
echo '```' >> $GITHUB_STEP_SUMMARY
git reset --hard && git clean -xfd .
fi
- name: ./gradlew regenerate
run: |

@ -69,7 +78,7 @@ jobs:
sudo apt-get install libwww-perl
./gradlew regenerate -x generateUAX29URLEmailTokenizerInternal --rerun-tasks
if [ ! -z "$(git status --porcelain)" ]; then
-echo ":warning: **regenerateleft local checkout in modified state**" >> $GITHUB_STEP_SUMMARY
+echo ":warning: **regenerate left local checkout in modified state**" >> $GITHUB_STEP_SUMMARY
echo '```' >> $GITHUB_STEP_SUMMARY
git status --porcelain >> $GITHUB_STEP_SUMMARY
echo '```' >> $GITHUB_STEP_SUMMARY

@ -79,8 +88,7 @@ jobs:
- run: ./gradlew testOpts
- run: ./gradlew helpWorkflow
- run: ./gradlew licenses updateLicenses
- run: ./gradlew tidy
-- run: ./gradlew check -x test
+- run: ./gradlew check -x test -Pvalidation.git.failOnModified=false
- run: ./gradlew assembleRelease mavenToLocal

# Conserve resources: only run these in non-alt-java mode.
@ -18,7 +18,7 @@ jobs:
strategy:
matrix:
os: [ ubuntu-latest ]
-java-version: [ '21', '22' ]
+java-version: [ '21', '22', '23-ea' ]

runs-on: ${{ matrix.os }}

@ -72,3 +72,4 @@ jobs:
name: smoke-tester-logs-jdk-${{ matrix.java-version }}
path: |
${{ env.TMP_DIR }}/**/*.log
+/tmp/release.log
|
@ -41,7 +41,7 @@ import jdk.jfr.consumer.RecordingFile;
|
|||
*/
|
||||
public class ProfileResults {
|
||||
/** Formats a frame to a formatted line. This is deduplicated on! */
|
||||
static String frameToString(RecordedFrame frame, boolean lineNumbers) {
|
||||
static String frameToString(RecordedFrame frame, boolean lineNumbers, boolean frameTypes) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
RecordedMethod method = frame.getMethod();
|
||||
RecordedClass clazz = method.getType();
|
||||
|
@ -55,13 +55,14 @@ public class ProfileResults {
|
|||
builder.append("#");
|
||||
builder.append(method.getName());
|
||||
builder.append("()");
|
||||
if (lineNumbers) {
|
||||
if (lineNumbers && frame.getLineNumber() != -1) {
|
||||
builder.append(":");
|
||||
if (frame.getLineNumber() == -1) {
|
||||
builder.append("(" + frame.getType() + " code)");
|
||||
} else {
|
||||
builder.append(frame.getLineNumber());
|
||||
}
|
||||
builder.append(frame.getLineNumber());
|
||||
}
|
||||
if (clazz != null && frameTypes) {
|
||||
builder.append(" [");
|
||||
builder.append(frame.getType());
|
||||
builder.append(" code]");
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
|
@ -77,6 +78,8 @@ public class ProfileResults {
|
|||
public static final String COUNT_DEFAULT = "10";
|
||||
public static final String LINENUMBERS_KEY = "tests.profile.linenumbers";
|
||||
public static final String LINENUMBERS_DEFAULT = "false";
|
||||
public static final String FRAMETYPES_KEY = "tests.profile.frametypes";
|
||||
public static final String FRAMETYPES_DEFAULT = "true";
|
||||
|
||||
/**
|
||||
* Driver method, for testing standalone.
|
||||
|
@ -92,7 +95,8 @@ public class ProfileResults {
|
|||
System.getProperty(MODE_KEY, MODE_DEFAULT),
|
||||
Integer.parseInt(System.getProperty(STACKSIZE_KEY, STACKSIZE_DEFAULT)),
|
||||
Integer.parseInt(System.getProperty(COUNT_KEY, COUNT_DEFAULT)),
|
||||
Boolean.parseBoolean(System.getProperty(LINENUMBERS_KEY, LINENUMBERS_DEFAULT)));
|
||||
Boolean.parseBoolean(System.getProperty(LINENUMBERS_KEY, LINENUMBERS_DEFAULT)),
|
||||
Boolean.parseBoolean(System.getProperty(FRAMETYPES_KEY, FRAMETYPES_DEFAULT)));
|
||||
}
|
||||
|
||||
/** true if we care about this event */
|
||||
|
@ -152,7 +156,12 @@ public class ProfileResults {
|
|||
|
||||
/** Process all the JFR files passed in args and print a merged summary. */
|
||||
public static void printReport(
|
||||
List<String> files, String mode, int stacksize, int count, boolean lineNumbers)
|
||||
List<String> files,
|
||||
String mode,
|
||||
int stacksize,
|
||||
int count,
|
||||
boolean lineNumbers,
|
||||
boolean frameTypes)
|
||||
throws IOException {
|
||||
if (!"cpu".equals(mode) && !"heap".equals(mode)) {
|
||||
throw new IllegalArgumentException("tests.profile.mode must be one of (cpu,heap)");
|
||||
|
@ -181,7 +190,7 @@ public class ProfileResults {
|
|||
if (stack.length() > 0) {
|
||||
stack.append("\n").append(framePadding).append(" at ");
|
||||
}
|
||||
stack.append(frameToString(trace.getFrames().get(i), lineNumbers));
|
||||
stack.append(frameToString(trace.getFrames().get(i), lineNumbers, frameTypes));
|
||||
}
|
||||
String line = stack.toString();
|
||||
SimpleEntry<String, Long> entry =
|
||||
|
|
|
@ -60,8 +60,8 @@ public class WrapperDownloader {

public static void checkVersion() {
int major = Runtime.version().feature();
-if (major != 21 && major != 22) {
-throw new IllegalStateException("java version must be 21 or 22, your version: " + major);
+if (major != 21 && major != 22 && major != 23) {
+throw new IllegalStateException("java version must be 21, 22 or 23, your version: " + major);
}
}
|
|
@ -231,8 +231,8 @@ public class MissingDoclet extends StandardDoclet {
|
|||
case PACKAGE:
|
||||
checkComment(element);
|
||||
break;
|
||||
// class-like elements, check them, then recursively check their children (fields and
|
||||
// methods)
|
||||
// class-like elements, check them, then recursively check their children (fields and
|
||||
// methods)
|
||||
case CLASS:
|
||||
case INTERFACE:
|
||||
case ENUM:
|
||||
|
@ -257,7 +257,7 @@ public class MissingDoclet extends StandardDoclet {
|
|||
}
|
||||
}
|
||||
break;
|
||||
// method-like elements, check them if we are configured to do so
|
||||
// method-like elements, check them if we are configured to do so
|
||||
case METHOD:
|
||||
case CONSTRUCTOR:
|
||||
case FIELD:
|
||||
|
|
build.gradle
@ -80,6 +80,9 @@ ext {
// Minimum Java version required to compile and run Lucene.
minJavaVersion = JavaVersion.toVersion(deps.versions.minJava.get())

+// also change this in extractor tool: ExtractForeignAPI
+vectorIncubatorJavaVersions = [ JavaVersion.VERSION_21, JavaVersion.VERSION_22, JavaVersion.VERSION_23 ] as Set

// snapshot build marker used in scripts.
snapshotBuild = version.contains("SNAPSHOT")

@ -117,10 +120,6 @@ apply from: file('gradle/generation/local-settings.gradle')
// Make sure the build environment is consistent.
apply from: file('gradle/validation/check-environment.gradle')

-// IDE support, settings and specials.
-apply from: file('gradle/ide/intellij-idea.gradle')
-apply from: file('gradle/ide/eclipse.gradle')

// Set up defaults and configure aspects for certain modules or functionality
// (java, tests)
apply from: file('gradle/java/folder-layout.gradle')

@ -133,6 +132,10 @@ apply from: file('gradle/testing/alternative-jdk-support.gradle')
apply from: file('gradle/java/jar-manifest.gradle')
apply from: file('gradle/java/modules.gradle')

+// IDE support, settings and specials.
+apply from: file('gradle/ide/intellij-idea.gradle')
+apply from: file('gradle/ide/eclipse.gradle')

// Maven artifact publishing.
apply from: file('gradle/maven/publications.gradle')

@ -112,8 +112,8 @@ def prepare(root, version, pause_before_sign, gpg_key_id, gpg_password, gpg_home
checkDOAPfiles(version)

if not dev_mode:
-print(' ./gradlew --no-daemon clean check')
-run('./gradlew --no-daemon clean check')
+print(' ./gradlew --stacktrace --no-daemon clean check')
+run('./gradlew --stacktrace --no-daemon clean check')
else:
print(' skipping precommit check due to dev-mode')

@ -121,7 +121,7 @@ def prepare(root, version, pause_before_sign, gpg_key_id, gpg_password, gpg_home
input("Tests complete! Please press ENTER to proceed to assembleRelease: ")

print(' prepare-release')
-cmd = './gradlew --no-daemon assembleRelease' \
+cmd = './gradlew --stacktrace --no-daemon assembleRelease' \
' -Dversion.release=%s' % version
if dev_mode:
cmd += ' -Pvalidation.git.failOnModified=false'
|
|
@ -32,7 +32,7 @@ allprojects {
|
|||
missingdoclet "org.apache.lucene.tools:missing-doclet"
|
||||
}
|
||||
|
||||
ext {
|
||||
project.ext {
|
||||
relativeDocPath = project.path.replaceFirst(/:\w+:/, "").replace(':', '/')
|
||||
}
|
||||
|
||||
|
|
|
@ -17,13 +17,6 @@
|
|||
|
||||
def resources = scriptResources(buildscript)
|
||||
|
||||
configure(rootProject) {
|
||||
ext {
|
||||
// also change this in extractor tool: ExtractForeignAPI
|
||||
vectorIncubatorJavaVersions = [ JavaVersion.VERSION_21, JavaVersion.VERSION_22 ] as Set
|
||||
}
|
||||
}
|
||||
|
||||
configure(project(":lucene:core")) {
|
||||
ext {
|
||||
apijars = layout.projectDirectory.dir("src/generated/jdk")
|
||||
|
|
|
@ -43,6 +43,31 @@ configure(project(":lucene:core")) {
|
|||
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
|
||||
mustRunBefore: [ "compileJava" ]
|
||||
])
|
||||
|
||||
task generateForDeltaUtilInternal() {
|
||||
description "Regenerate gen_ForDeltaUtil.py"
|
||||
group "generation"
|
||||
|
||||
def genDir = file("src/java/org/apache/lucene/codecs/lucene912")
|
||||
def genScript = file("${genDir}/gen_ForDeltaUtil.py")
|
||||
def genOutput = file("${genDir}/ForDeltaUtil.java")
|
||||
|
||||
inputs.file genScript
|
||||
outputs.file genOutput
|
||||
|
||||
doLast {
|
||||
quietExec {
|
||||
workingDir genDir
|
||||
executable project.externalTool("python3")
|
||||
args = [ '-B', genScript ]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
regenerate.dependsOn wrapWithPersistentChecksums(generateForDeltaUtilInternal, [
|
||||
andThenTasks: ["spotlessJava", "spotlessJavaApply"],
|
||||
mustRunBefore: [ "compileJava" ]
|
||||
])
|
||||
}
|
||||
|
||||
configure(project(":lucene:backward-codecs")) {
|
||||
|
|
|
@ -65,10 +65,8 @@ configure(project(":lucene:analysis:icu")) {
|
|||
icupkg = file("${icuBinDir}/icupkg")
|
||||
}
|
||||
|
||||
// Resolve version lazily (can't resolve at configuration time).
|
||||
def icu4jVersionProvider = project.provider { getVersion('com.ibm.icu', 'icu4j') }
|
||||
// lazy gstring with ICU version.
|
||||
def icu4jVersion = "${-> icu4jVersionProvider.get()}"
|
||||
def icu4jVersion = deps.icu4j.get().version
|
||||
|
||||
def icuCompileTask = Os.isFamily(Os.FAMILY_WINDOWS) ? "compileIcuWindows" : "compileIcuLinux"
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@ configure(project(":lucene:analysis:kuromoji")) {
|
|||
apply plugin: deps.plugins.undercouch.download.get().pluginId
|
||||
|
||||
plugins.withType(JavaPlugin) {
|
||||
ext {
|
||||
project.ext {
|
||||
targetDir = file("src/resources")
|
||||
}
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@ configure(project(":lucene:analysis:nori")) {
|
|||
apply plugin: deps.plugins.undercouch.download.get().pluginId
|
||||
|
||||
plugins.withType(JavaPlugin) {
|
||||
ext {
|
||||
project.ext {
|
||||
targetDir = file("src/resources")
|
||||
}
|
||||
|
||||
|
|
|
@ -22,10 +22,11 @@ import org.gradle.plugins.ide.eclipse.model.ClasspathEntry
|
|||
def resources = scriptResources(buildscript)
|
||||
|
||||
configure(rootProject) {
|
||||
plugins.withType(JavaPlugin) {
|
||||
apply plugin: "eclipse"
|
||||
if (gradle.startParameter.taskNames.contains("eclipse")) {
|
||||
project.pluginManager.apply("java-base")
|
||||
project.pluginManager.apply("eclipse")
|
||||
|
||||
def eclipseJavaVersion = propertyOrDefault("eclipse.javaVersion", rootProject.minJavaVersion)
|
||||
def eclipseJavaVersion = propertyOrDefault("eclipse.javaVersion", deps.versions.minJava.get())
|
||||
def relativize = { other -> rootProject.rootDir.relativePath(other).toString() }
|
||||
|
||||
eclipse {
|
||||
|
@ -105,9 +106,9 @@ configure(rootProject) {
|
|||
}
|
||||
}
|
||||
|
||||
eclipseJdt {
|
||||
eclipseJdt {
|
||||
enabled = false
|
||||
dependsOn 'luceneEclipse'
|
||||
dependsOn 'luceneEclipseJdt'
|
||||
}
|
||||
|
||||
eclipseClasspath {
|
||||
|
|
|
@ -27,7 +27,7 @@ def beastingMode = gradle.startParameter.taskNames.any{ name -> name == 'beast'
|
|||
|
||||
allprojects {
|
||||
plugins.withType(JavaPlugin) {
|
||||
ext {
|
||||
project.ext {
|
||||
testOptions += [
|
||||
[propName: 'tests.dups', value: 0, description: "Reiterate runs of entire test suites ('beast' task)."]
|
||||
]
|
||||
|
|
|
@ -19,7 +19,7 @@ def recordings = files()
|
|||
|
||||
allprojects {
|
||||
plugins.withType(JavaPlugin) {
|
||||
ext {
|
||||
project.ext {
|
||||
testOptions += [
|
||||
[propName: 'tests.profile', value: false, description: "Enable Java Flight Recorder profiling."]
|
||||
]
|
||||
|
|
|
@ -62,7 +62,7 @@ allprojects {
|
|||
// Configure test property defaults and their descriptions.
|
||||
allprojects {
|
||||
plugins.withType(JavaPlugin) {
|
||||
ext {
|
||||
project.ext {
|
||||
String randomVectorSize = RandomPicks.randomFrom(new Random(projectSeedLong), ["default", "128", "256", "512"])
|
||||
testOptions += [
|
||||
// seed, repetition and amplification.
|
||||
|
@ -135,14 +135,14 @@ allprojects {
|
|||
}
|
||||
|
||||
afterEvaluate {
|
||||
ext.testOptionsResolved = testOptions.findAll { opt ->
|
||||
project.ext.testOptionsResolved = testOptions.findAll { opt ->
|
||||
propertyOrDefault(opt.propName, opt.value) != null
|
||||
}.collectEntries { opt ->
|
||||
[(opt.propName): Objects.toString(resolvedTestOption(opt.propName))]
|
||||
}
|
||||
|
||||
// Compute the "reproduce with" string.
|
||||
ext.testOptionsForReproduceLine = testOptions.findAll { opt ->
|
||||
project.ext.testOptionsForReproduceLine = testOptions.findAll { opt ->
|
||||
if (opt["includeInReproLine"] == false) {
|
||||
return false
|
||||
}
|
||||
|
|
|
@ -22,7 +22,7 @@ def allSuites = []
|
|||
|
||||
allprojects {
|
||||
plugins.withType(JavaPlugin) {
|
||||
ext {
|
||||
project.ext {
|
||||
testOptions += [
|
||||
[propName: 'tests.slowestTests', value: true, description: "Print the summary of the slowest tests."],
|
||||
[propName: 'tests.slowestSuites', value: true, description: "Print the summary of the slowest suites."]
|
||||
|
|
|
@ -75,6 +75,18 @@ configure(rootProject) {
|
|||
it.dependsOn(":versionCatalogFormatDeps")
|
||||
}
|
||||
|
||||
// correct crlf/ default encoding after version catalog formatting finishes.
|
||||
tasks.matching {
|
||||
it.path in [
|
||||
":versionCatalogFormatDeps"
|
||||
]
|
||||
}.configureEach {
|
||||
it.doLast {
|
||||
ant.fixcrlf(file: it.catalogFile.get().asFile,
|
||||
eol: "lf", fixlast: "true", encoding: "UTF-8")
|
||||
}
|
||||
}
|
||||
|
||||
tasks.matching {
|
||||
it.path in [
|
||||
":versionCatalogUpdateDeps"
|
||||
|
|
|
@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-@defaultMessage Spawns threads with vague names; use a custom thread factory (Lucene's NamedThreadFactory, Solr's SolrNamedThreadFactory) and name threads so that you can tell (by its name) which executor it is associated with
+@defaultMessage Spawns threads with vague names; use a custom thread factory (Lucene's NamedThreadFactory) and name threads so that you can tell (by its name) which executor it is associated with
java.util.concurrent.Executors#newFixedThreadPool(int)
java.util.concurrent.Executors#newSingleThreadExecutor()
java.util.concurrent.Executors#newCachedThreadPool()
|
|
|
@ -74,21 +74,6 @@ configure(rootProject) {
|
|||
logger.warn("WARNING: Directory is not a valid git checkout (won't check dirty files): ${rootProject.projectDir}")
|
||||
}
|
||||
} else {
|
||||
// git ignores any folders which are empty (this includes folders with recursively empty sub-folders).
|
||||
def untrackedNonEmptyFolders = status.untrackedFolders.findAll { path ->
|
||||
File location = file("${rootProject.projectDir}/${path}")
|
||||
boolean hasFiles = false
|
||||
Files.walkFileTree(location.toPath(), new SimpleFileVisitor<Path>() {
|
||||
@Override
|
||||
FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
|
||||
hasFiles = true
|
||||
// Terminate early.
|
||||
return FileVisitResult.TERMINATE
|
||||
}
|
||||
})
|
||||
return hasFiles
|
||||
}
|
||||
|
||||
def offenders = [
|
||||
// Exclude staged changes. These are fine in precommit.
|
||||
// "(added)": status.added,
|
||||
|
@ -97,8 +82,7 @@ configure(rootProject) {
|
|||
"(conflicting)": status.conflicting,
|
||||
"(missing)": status.missing,
|
||||
"(modified)": status.modified,
|
||||
"(untracked)": status.untracked,
|
||||
"(untracked non-empty dir)": untrackedNonEmptyFolders
|
||||
"(untracked)": status.untracked
|
||||
].collectMany { fileStatus, files ->
|
||||
files.collect {file -> " - ${file} ${fileStatus}" }
|
||||
}.sort()
|
||||
|
|
|
@ -20,6 +20,10 @@
|
|||
// 2) notice file
|
||||
// 3) checksum validation/ generation.
|
||||
|
||||
// WARNING: The tasks in this file share internal state between tasks without using files.
|
||||
// Because of this all tasks here must always execute together, so they cannot define task outputs.
|
||||
// TODO: Rewrite the internal state to use state files containing the ext.jarInfos and its referencedFiles
|
||||
|
||||
// This should be false only for debugging.
|
||||
def failOnError = true
|
||||
|
||||
|
@ -194,13 +198,6 @@ subprojects {
|
|||
description = "Validate license and notice files of dependencies"
|
||||
dependsOn collectJarInfos
|
||||
|
||||
def outputFileName = 'validateJarLicenses'
|
||||
inputs.dir(file(project.rootDir.path + '/lucene/licenses'))
|
||||
.withPropertyName('licenses')
|
||||
.withPathSensitivity(PathSensitivity.RELATIVE)
|
||||
outputs.file(layout.buildDirectory.file(outputFileName))
|
||||
.withPropertyName('validateJarLicensesResult')
|
||||
|
||||
doLast {
|
||||
def errors = []
|
||||
jarInfos.each { dep ->
|
||||
|
@ -246,9 +243,7 @@ subprojects {
|
|||
}
|
||||
}
|
||||
}
|
||||
// Required to take advantage of incremental building and the build cache
|
||||
def f = new File(project.buildDir.path + "/" + outputFileName)
|
||||
f.write(errors.toString(), "UTF-8")
|
||||
|
||||
if (errors) {
|
||||
def msg = "Certain license/ notice files are missing:\n - " + errors.join("\n - ")
|
||||
if (failOnError) {
|
||||
|
|
|
@ -1 +1 @@
-cb0da6751c2b753a16ac168bb354870ebb1e162e9083f116729cec9c781156b8
+2db75c40782f5e8ba1fc278a5574bab070adccb2d21ca5a6e5ed840888448046

@ -1 +1 @@
-8.8.0
+8.10.0

@ -1,6 +1,6 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-8.8-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-8.10-bin.zip
networkTimeout=10000
validateDistributionUrl=true
zipStoreBase=GRADLE_USER_HOME
|
|
|
@ -112,6 +112,16 @@ API Changes
|
|||
|
||||
* GITHUB#13632: CandidateMatcher public matching functions (Bryan Jacobowitz)
|
||||
|
||||
* GITHUB#13708: Move Operations.sameLanguage/subsetOf to test-framework. (Robert Muir)
|
||||
|
||||
* GITHUB#13733: Move FacetsCollector#search utility methods to `FacetsCollectorManager`, replace the `Collector`
|
||||
argument with a `FacetsCollectorManager` and update the return type to include both `TopDocs` results as well as
|
||||
facets results. (Luca Cavanna)
|
||||
|
||||
* GITHUB#13328: Convert many basic Lucene classes to record classes, including CollectionStatistics, TermStatistics and LeafMetadata. (Shubham Chaudhary)
|
||||
|
||||
* GITHUB#13780: Remove `IndexSearcher#search(List<LeafReaderContext>, Weight, Collector)` in favour of the newly
|
||||
introduced `IndexSearcher#search(LeafReaderContextPartition[], Weight, Collector)`
|
||||
|
||||
New Features
|
||||
---------------------
|
||||
|
@ -141,6 +151,19 @@ New Features
|
|||
|
||||
* GITHUB#13604: Add Kmeans clustering on vectors (Mayya Sharipova, Jim Ferenczi, Tom Veasey)
|
||||
|
||||
* GITHUB#13592: Take advantage of the doc value skipper when it is primary sort in SortedNumericDocValuesRangeQuery
|
||||
and SortedSetDocValuesRangeQuery. (Ignacio Vera)
|
||||
|
||||
* GITHUB#13542: Add initial support for intra-segment concurrency. IndexSearcher now supports searching across leaf
reader partitions concurrently. This is useful to max out available resource usage especially with force merged
indices or big segments. There is still a performance penalty for queries that require segment-level computation
ahead of time, such as points/range queries. This is an implementation limitation that we expect to improve in
future releases, and that's why intra-segment slicing is not enabled by default, but leveraged in tests when the
searcher is created via LuceneTestCase#newSearcher. Users may override IndexSearcher#slices(List) to optionally
create slices that target segment partitions. (Luca Cavanna)
|
||||
|
||||
* GITHUB#13741: Implement Accountable for NFARunAutomaton, fix hashCode implementation of CompiledAutomaton. (Patrick Zhai)
|
||||
|
||||
Improvements
|
||||
---------------------
|
||||
|
||||
|
@ -164,6 +187,8 @@ Improvements
|
|||
|
||||
* GITHUB#12172: Update Romanian stopwords list to include the modern unicode forms. (Trey Jones)
|
||||
|
||||
* GITHUB#13707: Improve Operations.isTotal() to work with non-minimal automata. (Dawid Weiss, Robert Muir)
|
||||
|
||||
Optimizations
|
||||
---------------------
|
||||
|
||||
|
@ -176,6 +201,8 @@ Optimizations
|
|||
|
||||
* GITHUB#12552: Make FSTPostingsFormat load FSTs off-heap. (Tony X)
|
||||
|
||||
* GITHUB#13672: Leverage doc value skip lists in DocValuesRewriteMethod if indexed. (Greg Miller)
|
||||
|
||||
Bug Fixes
|
||||
---------------------
|
||||
|
||||
|
@ -213,6 +240,9 @@ Changes in Backwards Compatibility Policy
|
|||
* GITHUB#13230: Remove the Kp and Lovins snowball algorithms which are not supported
|
||||
or intended for general use. (Robert Muir)
|
||||
|
||||
* GITHUB#13602: SearchWithCollectorTask no longer supports the `collector.class` config parameter to load a custom
|
||||
collector implementation. `collector.manager.class` allows users to load a collector manager instead. (Luca Cavanna)
|
||||
|
||||
Other
|
||||
---------------------
|
||||
|
||||
|
@ -253,6 +283,13 @@ Other
|
|||
|
||||
* GITHUB#13499: Remove usage of TopScoreDocCollector + TopFieldCollector deprecated methods (#create, #createSharedManager) (Jakub Slowinski)
|
||||
|
||||
Build
|
||||
---------------------
|
||||
|
||||
* GITHUB#13649: Fix eclipse ide settings generation #13649 (Uwe Schindler, Dawid Weiss)
|
||||
|
||||
* GITHUB#13698: Upgrade to gradle 8.10 (Dawid Weiss)
|
||||
|
||||
======================== Lucene 9.12.0 =======================
|
||||
|
||||
API Changes
|
||||
|
@ -268,6 +305,12 @@ API Changes
|
|||
|
||||
* GITHUB#13559: Add BitSet#nextSetBit(int, int) to get the index of the first set bit in range. (Egor Potemkin)
|
||||
|
||||
* GITHUB#13568: Add DoubleValuesSource#toSortableLongDoubleValuesSource and
|
||||
MultiDoubleValuesSource#toSortableMultiLongValuesSource methods. (Shradha Shankar)
|
||||
|
||||
* GITHUB#13568, GITHUB#13750: Add DrillSideways#search method that supports any CollectorManagers for drill-sideways dimensions
|
||||
or drill-down. (Egor Potemkin)
|
||||
|
||||
New Features
|
||||
---------------------
|
||||
|
||||
|
@ -280,8 +323,21 @@ New Features
|
|||
and LogByteSizeMergePolicy via a new #setTargetConcurrency setter.
|
||||
(Adrien Grand)
|
||||
|
||||
* GITHUB#13568: Add sandbox facets module to compute facets while collecting. (Egor Potemkin, Shradha Shankar)
|
||||
|
||||
* GITHUB#13678: Add support JDK 23 to the Panama Vectorization Provider. (Chris Hegarty)
|
||||
|
||||
* GITHUB#13689: Add a new faceting feature, dynamic range facets, which automatically picks a balanced set of numeric
|
||||
ranges based on the distribution of values that occur across all hits. For use cases that have a highly variable
|
||||
numeric doc values field, such as "price" in an e-commerce application, this facet method is powerful as it allows the
|
||||
presented ranges to adapt depending on what hits the query actually matches. This is in contrast to existing range
|
||||
faceting that requires the application to provide the specific fixed ranges up front. (Yuting Gan, Greg Miller,
|
||||
Stefan Vodita)
|
||||
|
||||
Improvements
|
||||
---------------------
|
||||
* GITHUB#13475: Re-enable intra-merge parallelism except for terms, norms, and doc values.
|
||||
Related to GITHUB#13478. (Ben Trent)
|
||||
|
||||
* GITHUB#13548: Refactor and javadoc update for KNN vector writer classes. (Patrick Zhai)
|
||||
|
||||
|
@ -299,6 +355,11 @@ Improvements
|
|||
|
||||
* GITHUB#13201: Better cost estimation on MultiTermQuery over few terms. (Michael Froh)
|
||||
|
||||
* GITHUB#13735: Migrate monitor package usage of deprecated IndexSearcher#search(Query, Collector)
|
||||
to IndexSearcher#search(Query, CollectorManager). (Greg Miller)
|
||||
|
||||
* GITHUB#13746: Introduce ProfilerCollectorManager to parallelize search when using ProfilerCollector. (Luca Cavanna)
|
||||
|
||||
Optimizations
|
||||
---------------------
|
||||
|
||||
|
@ -330,8 +391,14 @@ Optimizations
|
|||
Closing many individual index files can potentially lead to a degradation in execution performance.
|
||||
Index files are mmapped one-to-one with the JDK's foreign shared Arena. The JVM deoptimizes the top
|
||||
few frames of all threads when closing a shared Arena (see JDK-8335480). We mitigate this situation
|
||||
by 1) using a confined Arena where appropriate, and 2) grouping files from the same segment to a
|
||||
single shared Arena. (Chris Hegarty, Michael Gibney, Uwe Schindler)
|
||||
when running with JDK 21 and greater, by 1) using a confined Arena where appropriate, and 2) grouping
|
||||
files from the same segment to a single shared Arena.
|
||||
A system property has been added that allows to control the total maximum number of mmapped files
|
||||
that may be associated with a single shared Arena. For example, to set the max number of permits to
|
||||
256, pass the following on the command line
|
||||
-Dorg.apache.lucene.store.MMapDirectory.sharedArenaMaxPermits=256. Setting a value of 1 associates
|
||||
a single file to a single shared arena.
|
||||
(Chris Hegarty, Michael Gibney, Uwe Schindler)
|
||||
|
||||
* GITHUB#13585: Lucene912PostingsFormat, the new default postings format, now
|
||||
only has 2 levels of skip data, which are inlined into postings instead of
|
||||
|
@ -341,6 +408,22 @@ Optimizations
|
|||
|
||||
* GITHUB#13581: OnHeapHnswGraph no longer allocates a lock for every graph node (Mike Sokolov)
|
||||
|
||||
* GITHUB#13636, GITHUB#13658: Optimizations to the decoding logic of blocks of
|
||||
postings. (Adrien Grand, Uwe Schindler, Greg Miller)
|
||||
|
||||
* GITHUB##13644: Improve NumericComparator competitive iterator logic by comparing the missing value with the top
|
||||
value even after the hit queue is full (Pan Guixin)
|
||||
|
||||
* GITHUB#13587: Use Max WAND optimizations with ToParentBlockJoinQuery when using ScoreMode.Max (Mike Pellegrini)
|
||||
|
||||
* GITHUB#13742: Reorder checks in LRUQueryCache#count (Shubham Chaudhary)
|
||||
|
||||
* GITHUB#13686: Replace Map<String,Object> with IntObjectHashMap for DV producer (Pan Guixin)
|
||||
|
||||
* GITHUB#13697: Add a bulk scorer to ToParentBlockJoinQuery, which delegates to the bulk scorer of the child query.
|
||||
This should speed up query evaluation when the child query has a specialized bulk scorer, such as disjunctive queries.
|
||||
(Mike Pellegrini)
|
||||
|
||||
Changes in runtime behavior
|
||||
---------------------
|
||||
|
||||
|
@ -366,9 +449,38 @@ Bug Fixes
|
|||
|
||||
* GITHUB#13627: Fix race condition on flush for DWPT seqNo generation. (Ben Trent, Ao Li)
|
||||
|
||||
* GITHUB#13691: Fix incorrect exponent value in explain of SigmoidFunction. (Owais Kazi)
|
||||
|
||||
* GITHUB#13703: Fix bug in LatLonPoint queries where narrow polygons close to latitude 90 don't
|
||||
match any points due to an Integer overflow. (Ignacio Vera)
|
||||
|
||||
* GITHUB#13641: Unify how KnnFormats handle missing fields and correctly handle missing vector fields when
|
||||
merging segments. (Ben Trent)
|
||||
|
||||
* GITHUB#13519: 8 bit scalar vector quantization is no longer
|
||||
supported: it was buggy starting in 9.11 (GITHUB#13197). 4 and 7
|
||||
bit quantization are still supported. Existing (9.x) Lucene indices
|
||||
that previously used 8 bit quantization can still be read/searched
|
||||
but the results from `KNN*VectorQuery` are silently buggy. Further
|
||||
8 bit quantized vector indexing into such (9.11) indices is not
|
||||
permitted, so your path forward if you wish to continue using the
|
||||
same 9.11 index is to index additional vectors into the same field
|
||||
with either 4 or 7 bit quantization (or no quantization), and ensure
|
||||
all older (9.11 written) segments are rewritten either via
|
||||
`IndexWriter.forceMerge` or
|
||||
`IndexWriter.addIndexes(CodecReader...)`, or reindexing entirely.
|
||||
|
||||
Build
|
||||
---------------------
|
||||
|
||||
* GITHUB#13695, GITHUB#13696: Fix Gradle build sometimes gives spurious "unreferenced license file" warnings.
|
||||
(Uwe Schindler)
|
||||
|
||||
Other
|
||||
--------------------
|
||||
(No changes)
|
||||
|
||||
* GITHUB#13720: Add float comparison based on unit of least precision and use it to stop test failures caused by float
|
||||
summation not being associative in IEEE 754. (Alex Herbert, Stefan Vodita)
|
||||
|
||||
======================== Lucene 9.11.1 =======================
|
||||
|
||||
|
|
|
@ -80,9 +80,22 @@ behaviour as 9.x, clone `PersianAnalyzer` in 9.x or create custom analyzer by us
### AutomatonQuery/CompiledAutomaton/RunAutomaton/RegExp no longer determinize (LUCENE-10010)

These classes no longer take a `determinizeWorkLimit` and no longer determinize
-behind the scenes. It is the responsibility of the caller to to call
+behind the scenes. It is the responsibility of the caller to call
`Operations.determinize()` for DFA execution.

### RegExp optional complement syntax has been deprecated

Support for the optional complement syntax (`~`) has been deprecated.
The `COMPLEMENT` syntax flag has been removed and replaced by the
`DEPRECATED_COMPLEMENT` flag. Users wanting to enable the deprecated
complement support can do so by explicitly passing syntax flags that
include `DEPRECATED_COMPLEMENT` when creating a `RegExp`. For example:
`new RegExp("~(foo)", RegExp.DEPRECATED_COMPLEMENT)`.

Alternatively, and quite commonly, a simpler _complement bracket expression_,
`[^...]`, may be a suitable replacement. For example, `[^fo]` matches any
character that is not an `f` or `o`.

### DocValuesFieldExistsQuery, NormsFieldExistsQuery and KnnVectorFieldExistsQuery removed in favor of FieldExistsQuery (LUCENE-10436)
|
||||
|
||||
These classes have been removed and consolidated into `FieldExistsQuery`. To migrate, caller simply replace those classes
|
||||
|
@ -180,6 +193,9 @@ access the members using method calls instead of field accesses. Affected classe
|
|||
|
||||
- `IOContext`, `MergeInfo`, and `FlushInfo` (GITHUB#13205)
|
||||
- `BooleanClause` (GITHUB#13261)
|
||||
- `TotalHits` (GITHUB#13762)
|
||||
- `TermAndVector` (GITHUB#13772)
|
||||
- Many basic Lucene classes, including `CollectionStatistics`, `TermStatistics` and `LeafMetadata` (GITHUB#13328)
|
||||
|
||||
### Boolean flags on IOContext replaced with a new ReadAdvice enum.
|
||||
|
||||
|
@ -248,6 +264,11 @@ ConcurrentMergeScheduler now disables auto I/O throttling by default. There is s
|
|||
happening at the CPU level, since ConcurrentMergeScheduler has a maximum number of threads it can
|
||||
use, which is only a fraction of the total number of threads of the host by default.
|
||||
|
||||
### FieldInfos#hasVectors and FieldInfo#hasVectors renamed to hasTermVectors
|
||||
|
||||
To reduce confusion between term vectors and numeric vectors, `hasVectors` has been renamed to
|
||||
`hasTermVectors`.
|
||||
|
||||
## Migration from Lucene 9.0 to Lucene 9.1
|
||||
|
||||
### Test framework package migration and module (LUCENE-10301)
|
||||
|
@ -793,3 +814,77 @@ Specifically, the method `FunctionValues#getScorer(Weight weight, LeafReaderCont
|
|||
Callers must now keep track of the Weight instance that created the Scorer if they need it, instead of relying on
|
||||
Scorer.
|
||||
|
||||
### `FacetsCollector#search` utility methods moved and updated

The static `search` methods exposed by `FacetsCollector` have been moved to `FacetsCollectorManager`.
Furthermore, they take a `FacetsCollectorManager` last argument in place of a `Collector` so that they support
intra-query concurrency. The return type has also been updated to `FacetsCollectorManager.FacetsResult`, which includes
both `TopDocs` as well as facets results included in a reduced `FacetsCollector` instance.

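A minimal migration sketch follows; it is not taken from the Lucene sources. The helper name, the assumption that the relocated static `search` keeps the `(searcher, query, n, manager)` argument order, and the accessor names on `FacetsResult` are all assumptions based on the description above:

```java
import java.io.IOException;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsCollectorManager;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;

static TopDocs searchWithFacets(IndexSearcher searcher, Query query) throws IOException {
  // 9.x style (removed): FacetsCollector fc = new FacetsCollector();
  //                      TopDocs topDocs = FacetsCollector.search(searcher, query, 10, fc);
  FacetsCollectorManager fcm = new FacetsCollectorManager();
  FacetsCollectorManager.FacetsResult result =
      FacetsCollectorManager.search(searcher, query, 10, fcm); // argument order assumed
  FacetsCollector reduced = result.facetsCollector(); // accessor names assumed
  // ... build Facets implementations from `reduced` as before ...
  return result.topDocs();
}
```
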
### `SearchWithCollectorTask` no longer supports the `collector.class` config parameter

`collector.class` used to allow users to load a custom collector implementation. `collector.manager.class`
replaces it by allowing users to load a custom collector manager instead.

### BulkScorer#score(LeafCollector collector, Bits acceptDocs) removed

Use `BulkScorer#score(LeafCollector collector, Bits acceptDocs, int min, int max)` instead. In order to score the
entire leaf, provide `0` as min and `DocIdSetIterator.NO_MORE_DOCS` as max. `BulkScorer` subclasses that override
the removed method need to instead override the variant that also takes the range of doc ids as arguments.

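As a quick illustration (not from the Lucene sources; the helper wrapper is hypothetical and the scorer, collector and live-docs bits are assumed to have been obtained elsewhere), scoring a whole leaf now spells out the doc id range explicitly:

```java
import java.io.IOException;
import org.apache.lucene.search.BulkScorer;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.util.Bits;

static void scoreWholeLeaf(BulkScorer bulkScorer, LeafCollector leafCollector, Bits liveDocs)
    throws IOException {
  // Before (9.x) this was: bulkScorer.score(leafCollector, liveDocs);
  // Now the full doc id range must be passed to score the entire leaf.
  bulkScorer.score(leafCollector, liveDocs, 0, DocIdSetIterator.NO_MORE_DOCS);
}
```
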
### CollectorManager#newCollector and Collector#getLeafCollector contract
|
||||
|
||||
With the introduction of intra-segment query concurrency support, multiple `LeafCollector`s may be requested for the
|
||||
same `LeafReaderContext` via `Collector#getLeafCollector(LeafReaderContext)` across the different `Collector` instances
|
||||
returned by multiple `CollectorManager#newCollector` calls. Any logic or computation that needs to happen
|
||||
once per segment requires specific handling in the collector manager implementation. See `TotalHitCountCollectorManager`
|
||||
as an example. Individual collectors don't need to be adapted as a specific `Collector` instance will still see a given
|
||||
`LeafReaderContext` once, given that it is not possible to add more than one partition of the same segment to the same
|
||||
leaf slice.
|
||||
|
||||
### Weight#scorer, Weight#bulkScorer and Weight#scorerSupplier contract
|
||||
|
||||
With the introduction of intra-segment query concurrency support, multiple `Scorer`s, `ScorerSupplier`s or `BulkScorer`s
|
||||
may be requested for the same `LeafReaderContext` instance as part of a single search call. That may happen concurrently
|
||||
from separate threads each searching a specific doc id range of the segment. `Weight` implementations that rely on the
|
||||
assumption that a scorer, bulk scorer or scorer supplier for a given `LeafReaderContext` is requested once per search
|
||||
need updating.
|
||||
|
||||
### Signature of IndexSearcher#searchLeaf changed

With the introduction of intra-segment query concurrency support, the `IndexSearcher#searchLeaf(LeafReaderContext ctx, Weight weight, Collector collector)`
method now accepts two additional int arguments to identify the min/max range of doc ids that will be searched in this
leaf partition: `IndexSearcher#searchLeaf(LeafReaderContext ctx, int minDocId, int maxDocId, Weight weight, Collector collector)`.
Subclasses of `IndexSearcher` that call or override the `searchLeaf` method need to be updated accordingly.

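A hypothetical subclass (not part of Lucene; the class name and the timing logic are illustrative only, and the `protected` modifier is assumed to match the existing `searchLeaf` declaration) would move its per-leaf logic to the widened signature:

```java
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Weight;

public class TimingIndexSearcher extends IndexSearcher {
  public TimingIndexSearcher(IndexReader reader) {
    super(reader);
  }

  @Override
  protected void searchLeaf(
      LeafReaderContext ctx, int minDocId, int maxDocId, Weight weight, Collector collector)
      throws IOException {
    long start = System.nanoTime();
    // Delegate to the default per-partition search logic.
    super.searchLeaf(ctx, minDocId, maxDocId, weight, collector);
    System.out.println("leaf " + ctx.ord + " [" + minDocId + "," + maxDocId + ") took "
        + (System.nanoTime() - start) + " ns");
  }
}
```
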
### Signature of static IndexSearch#slices method changed
|
||||
|
||||
The static `IndexSearcher#slices(List<LeafReaderContext> leaves, int maxDocsPerSlice, int maxSegmentsPerSlice)`
|
||||
method now supports an additional 4th and last argument to optionally enable creating segment partitions:
|
||||
`IndexSearcher#slices(List<LeafReaderContext> leaves, int maxDocsPerSlice, int maxSegmentsPerSlice, boolean allowSegmentPartitions)`
|
||||
|
||||
### TotalHitCountCollectorManager constructor

`TotalHitCountCollectorManager` now requires that an array of `LeafSlice`s, retrieved via `IndexSearcher#getSlices`,
is provided to its constructor. Depending on whether segment partitions are present among slices, the manager can
optimize the type of collectors it creates and exposes via `newCollector`.

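A minimal sketch of the updated construction (not from the Lucene sources; the helper method is hypothetical and the searcher and query are assumed to exist already):

```java
import java.io.IOException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TotalHitCountCollectorManager;

static int countHits(IndexSearcher searcher, Query query) throws IOException {
  // The manager now needs the searcher's slices so it can choose the cheapest
  // collector type depending on whether segment partitions are present.
  TotalHitCountCollectorManager manager =
      new TotalHitCountCollectorManager(searcher.getSlices());
  return searcher.search(query, manager);
}
```
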
### `IndexSearcher#search(List<LeafReaderContext>, Weight, Collector)` removed
|
||||
|
||||
The protected `IndexSearcher#search(List<LeafReaderContext> leaves, Weight weight, Collector collector)` method has been
|
||||
removed in favour of the newly introduced `search(LeafReaderContextPartition[] partitions, Weight weight, Collector collector)`.
|
||||
`IndexSearcher` subclasses that override this method need to instead override the new method.
|
||||
|
||||
### Indexing vectors with 8 bit scalar quantization is no longer supported but 7 and 4 bit quantization still work (GITHUB#13519)
|
||||
|
||||
8 bit scalar vector quantization is no longer supported: it was buggy
|
||||
starting in 9.11 (GITHUB#13197). 4 and 7 bit quantization are still
|
||||
supported. Existing (9.11) Lucene indices that previously used 8 bit
|
||||
quantization can still be read/searched but the results from
|
||||
`KNN*VectorQuery` are silently buggy. Further 8 bit quantized vector
|
||||
indexing into such (9.11) indices is not permitted, so your path
|
||||
forward if you wish to continue using the same 9.11 index is to index
|
||||
additional vectors into the same field with either 4 or 7 bit
|
||||
quantization (or no quantization), and ensure all older (9.x written)
|
||||
segments are rewritten either via `IndexWriter.forceMerge` or
|
||||
`IndexWriter.addIndexes(CodecReader...)`, or reindexing entirely.
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
{
|
||||
"gradle/generation/jflex/skeleton.default.txt": "58944f66c9113a940dfaf6a17210ec8219024390",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.java": "1f7a446f3483326385eef257cea8366c27da0850",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.java": "e62dcd8c25219d8f5d783823b228ffe38d2bacde",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.jflex": "f52109bb7d5701979fde90aeeeda726246a8d5fd"
|
||||
}
|
|
@ -1,5 +1,5 @@
|
|||
{
|
||||
"gradle/generation/jflex/skeleton.default.txt": "58944f66c9113a940dfaf6a17210ec8219024390",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java": "ac298e08bc5b96202efca0c01f9f0376fda976bd",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java": "2b5df5ff35543a6380c82f298225eb5fa06e4453",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex": "0b8c7774b98e8237702013e82c352d4711509bd0"
|
||||
}
|
|
@ -37,23 +37,23 @@ class BengaliNormalizer {
|
|||
|
||||
for (int i = 0; i < len; i++) {
|
||||
switch (s[i]) {
|
||||
// delete Chandrabindu
|
||||
// delete Chandrabindu
|
||||
case '\u0981':
|
||||
len = delete(s, i, len);
|
||||
i--;
|
||||
break;
|
||||
|
||||
// DirghoI kar -> RosshoI kar
|
||||
// DirghoI kar -> RosshoI kar
|
||||
case '\u09C0':
|
||||
s[i] = '\u09BF';
|
||||
break;
|
||||
|
||||
// DirghoU kar -> RosshoU kar
|
||||
// DirghoU kar -> RosshoU kar
|
||||
case '\u09C2':
|
||||
s[i] = '\u09C1';
|
||||
break;
|
||||
|
||||
// Khio (Ka + Hoshonto + Murdorno Sh)
|
||||
// Khio (Ka + Hoshonto + Murdorno Sh)
|
||||
case '\u0995':
|
||||
if (i + 2 < len && s[i + 1] == '\u09CD' && s[i + 2] == '\u09BF') {
|
||||
if (i == 0) {
|
||||
|
@ -67,12 +67,12 @@ class BengaliNormalizer {
|
|||
}
|
||||
break;
|
||||
|
||||
// Nga to Anusvara
|
||||
// Nga to Anusvara
|
||||
case '\u0999':
|
||||
s[i] = '\u0982';
|
||||
break;
|
||||
|
||||
// Ja Phala
|
||||
// Ja Phala
|
||||
case '\u09AF':
|
||||
if (i - 2 == 0 && s[i - 1] == '\u09CD') {
|
||||
s[i - 1] = '\u09C7';
|
||||
|
@ -89,7 +89,7 @@ class BengaliNormalizer {
|
|||
}
|
||||
break;
|
||||
|
||||
// Ba Phalaa
|
||||
// Ba Phalaa
|
||||
case '\u09AC':
|
||||
if ((i >= 1 && s[i - 1] != '\u09CD') || i == 0) {
|
||||
break;
|
||||
|
@ -109,7 +109,7 @@ class BengaliNormalizer {
|
|||
}
|
||||
break;
|
||||
|
||||
// Visarga
|
||||
// Visarga
|
||||
case '\u0983':
|
||||
if (i == len - 1) {
|
||||
if (len <= 3) {
|
||||
|
@ -122,18 +122,18 @@ class BengaliNormalizer {
|
|||
}
|
||||
break;
|
||||
|
||||
// All sh
|
||||
// All sh
|
||||
case '\u09B6':
|
||||
case '\u09B7':
|
||||
s[i] = '\u09B8';
|
||||
break;
|
||||
|
||||
// check na
|
||||
// check na
|
||||
case '\u09A3':
|
||||
s[i] = '\u09A8';
|
||||
break;
|
||||
|
||||
// check ra
|
||||
// check ra
|
||||
case '\u09DC':
|
||||
case '\u09DD':
|
||||
s[i] = '\u09B0';
|
||||
|
|
|
@ -747,70 +747,70 @@ class ClassicTokenizerImpl {
|
|||
/* Break so we don't hit fall-through warning: */
|
||||
break; /* ignore */
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 11:
|
||||
break;
|
||||
case 2:
|
||||
{
|
||||
return ALPHANUM;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 12:
|
||||
break;
|
||||
case 3:
|
||||
{
|
||||
return CJ;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 13:
|
||||
break;
|
||||
case 4:
|
||||
{
|
||||
return NUM;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 14:
|
||||
break;
|
||||
case 5:
|
||||
{
|
||||
return HOST;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 15:
|
||||
break;
|
||||
case 6:
|
||||
{
|
||||
return COMPANY;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 16:
|
||||
break;
|
||||
case 7:
|
||||
{
|
||||
return APOSTROPHE;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 17:
|
||||
break;
|
||||
case 8:
|
||||
{
|
||||
return ACRONYM_DEP;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 18:
|
||||
break;
|
||||
case 9:
|
||||
{
|
||||
return ACRONYM;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 19:
|
||||
break;
|
||||
case 10:
|
||||
{
|
||||
return EMAIL;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 20:
|
||||
break;
|
||||
default:
|
||||
|
|
|
@ -53,18 +53,18 @@ public final class GreekLowerCaseFilter extends TokenFilter {
|
|||
|
||||
private int lowerCase(int codepoint) {
|
||||
switch (codepoint) {
|
||||
/* There are two lowercase forms of sigma:
|
||||
* U+03C2: small final sigma (end of word)
|
||||
* U+03C3: small sigma (otherwise)
|
||||
*
|
||||
* Standardize both to U+03C3
|
||||
*/
|
||||
/* There are two lowercase forms of sigma:
|
||||
* U+03C2: small final sigma (end of word)
|
||||
* U+03C3: small sigma (otherwise)
|
||||
*
|
||||
* Standardize both to U+03C3
|
||||
*/
|
||||
case '\u03C2': /* small final sigma */
|
||||
return '\u03C3'; /* small sigma */
|
||||
|
||||
/* Some greek characters contain diacritics.
|
||||
* This filter removes these, converting to the lowercase base form.
|
||||
*/
|
||||
/* Some greek characters contain diacritics.
|
||||
* This filter removes these, converting to the lowercase base form.
|
||||
*/
|
||||
|
||||
case '\u0386': /* capital alpha with tonos */
|
||||
case '\u03AC': /* small alpha with tonos */
|
||||
|
@ -100,9 +100,9 @@ public final class GreekLowerCaseFilter extends TokenFilter {
|
|||
case '\u03CE': /* small omega with tonos */
|
||||
return '\u03C9'; /* small omega */
|
||||
|
||||
/* The previous implementation did the conversion below.
|
||||
* Only implemented for backwards compatibility with old indexes.
|
||||
*/
|
||||
/* The previous implementation did the conversion below.
|
||||
* Only implemented for backwards compatibility with old indexes.
|
||||
*/
|
||||
|
||||
case '\u03A2': /* reserved */
|
||||
return '\u03C2'; /* small final sigma */
|
||||
|
|
|
@ -456,7 +456,7 @@ class PorterStemmer {
|
|||
/* j >= 0 fixes Bug 2 */
|
||||
if (ends("ou")) break;
|
||||
return;
|
||||
/* takes care of -ous */
|
||||
/* takes care of -ous */
|
||||
case 's':
|
||||
if (ends("ism")) break;
|
||||
return;
|
||||
|
|
|
@ -67,7 +67,7 @@ public final class IrishLowerCaseFilter extends TokenFilter {
|
|||
case 'I':
|
||||
case 'O':
|
||||
case 'U':
|
||||
// vowels with acute accent (fada)
|
||||
// vowels with acute accent (fada)
|
||||
case '\u00c1':
|
||||
case '\u00c9':
|
||||
case '\u00cd':
|
||||
|
|
|
@ -47,18 +47,18 @@ class HindiNormalizer {
|
|||
|
||||
for (int i = 0; i < len; i++) {
|
||||
switch (s[i]) {
|
||||
// dead n -> bindu
|
||||
// dead n -> bindu
|
||||
case '\u0928':
|
||||
if (i + 1 < len && s[i + 1] == '\u094D') {
|
||||
s[i] = '\u0902';
|
||||
len = delete(s, i + 1, len);
|
||||
}
|
||||
break;
|
||||
// candrabindu -> bindu
|
||||
// candrabindu -> bindu
|
||||
case '\u0901':
|
||||
s[i] = '\u0902';
|
||||
break;
|
||||
// nukta deletions
|
||||
// nukta deletions
|
||||
case '\u093C':
|
||||
len = delete(s, i, len);
|
||||
i--;
|
||||
|
@ -96,18 +96,18 @@ class HindiNormalizer {
|
|||
case '\u095F':
|
||||
s[i] = '\u092F';
|
||||
break;
|
||||
// zwj/zwnj -> delete
|
||||
// zwj/zwnj -> delete
|
||||
case '\u200D':
|
||||
case '\u200C':
|
||||
len = delete(s, i, len);
|
||||
i--;
|
||||
break;
|
||||
// virama -> delete
|
||||
// virama -> delete
|
||||
case '\u094D':
|
||||
len = delete(s, i, len);
|
||||
i--;
|
||||
break;
|
||||
// chandra/short -> replace
|
||||
// chandra/short -> replace
|
||||
case '\u0945':
|
||||
case '\u0946':
|
||||
s[i] = '\u0947';
|
||||
|
@ -127,7 +127,7 @@ class HindiNormalizer {
|
|||
case '\u0972':
|
||||
s[i] = '\u0905';
|
||||
break;
|
||||
// long -> short ind. vowels
|
||||
// long -> short ind. vowels
|
||||
case '\u0906':
|
||||
s[i] = '\u0905';
|
||||
break;
|
||||
|
@ -149,7 +149,7 @@ class HindiNormalizer {
|
|||
case '\u0914':
|
||||
s[i] = '\u0913';
|
||||
break;
|
||||
// long -> short dep. vowels
|
||||
// long -> short dep. vowels
|
||||
case '\u0940':
|
||||
s[i] = '\u093F';
|
||||
break;
|
||||
|
|
|
@ -52,7 +52,7 @@ class CheckCompoundPattern {
|
|||
|
||||
boolean prohibitsCompounding(CharsRef word, int breakPos, Root<?> rootBefore, Root<?> rootAfter) {
|
||||
if (isNonAffixedPattern(endChars)) {
|
||||
if (!charsMatch(word, breakPos - rootBefore.word.length(), rootBefore.word)) {
|
||||
if (!charsMatch(word, breakPos - rootBefore.word().length(), rootBefore.word())) {
|
||||
return false;
|
||||
}
|
||||
} else if (!charsMatch(word, breakPos - endChars.length(), endChars)) {
|
||||
|
@ -60,7 +60,7 @@ class CheckCompoundPattern {
|
|||
}
|
||||
|
||||
if (isNonAffixedPattern(beginChars)) {
|
||||
if (!charsMatch(word, breakPos, rootAfter.word)) {
|
||||
if (!charsMatch(word, breakPos, rootAfter.word())) {
|
||||
return false;
|
||||
}
|
||||
} else if (!charsMatch(word, breakPos, beginChars)) {
|
||||
|
@ -84,7 +84,7 @@ class CheckCompoundPattern {
|
|||
|
||||
private boolean hasAllFlags(Root<?> root, char[] flags) {
|
||||
for (char flag : flags) {
|
||||
if (!dictionary.hasFlag(root.entryId, flag)) {
|
||||
if (!dictionary.hasFlag(root.entryId(), flag)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -24,7 +24,6 @@ import java.util.ArrayList;
|
|||
import java.util.Comparator;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.PriorityQueue;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
@ -62,8 +61,7 @@ class GeneratingSuggester {
|
|||
|
||||
private List<Weighted<Root<String>>> findSimilarDictionaryEntries(
|
||||
String word, WordCase originalCase) {
|
||||
Comparator<Weighted<Root<String>>> natural = Comparator.naturalOrder();
|
||||
PriorityQueue<Weighted<Root<String>>> roots = new PriorityQueue<>(natural.reversed());
|
||||
PriorityQueue<Weighted<Root<String>>> roots = new PriorityQueue<>(Comparator.reverseOrder());
|
||||
|
||||
char[] excludeFlags = dictionary.allNonSuggestibleFlags();
|
||||
FlagEnumerator.Lookup flagLookup = dictionary.flagLookup;
|
||||
|
@ -111,7 +109,7 @@ class GeneratingSuggester {
|
|||
|
||||
private static boolean isWorseThan(int score, CharsRef candidate, Weighted<Root<String>> root) {
|
||||
return score < root.score
|
||||
|| score == root.score && CharSequence.compare(candidate, root.word.word) > 0;
|
||||
|| score == root.score && CharSequence.compare(candidate, root.word.word()) > 0;
|
||||
}
|
||||
|
||||
private void processSuggestibleWords(
|
||||
|
@ -162,11 +160,11 @@ class GeneratingSuggester {
|
|||
List<char[]> crossProducts = new ArrayList<>();
|
||||
Set<String> result = new LinkedHashSet<>();
|
||||
|
||||
if (!dictionary.hasFlag(root.entryId, dictionary.needaffix)) {
|
||||
result.add(root.word);
|
||||
if (!dictionary.hasFlag(root.entryId(), dictionary.needaffix)) {
|
||||
result.add(root.word());
|
||||
}
|
||||
|
||||
char[] wordChars = root.word.toCharArray();
|
||||
char[] wordChars = root.word().toCharArray();
|
||||
|
||||
// suffixes
|
||||
processAffixes(
|
||||
|
@ -180,7 +178,7 @@ class GeneratingSuggester {
|
|||
}
|
||||
|
||||
String suffix = misspelled.substring(misspelled.length() - suffixLength);
|
||||
String withSuffix = root.word.substring(0, root.word.length() - stripLength) + suffix;
|
||||
String withSuffix = root.word().substring(0, root.word().length() - stripLength) + suffix;
|
||||
result.add(withSuffix);
|
||||
if (dictionary.isCrossProduct(suffixId)) {
|
||||
crossProducts.add(withSuffix.toCharArray());
|
||||
|
@ -192,7 +190,7 @@ class GeneratingSuggester {
|
|||
true,
|
||||
misspelled,
|
||||
(prefixLength, prefixId) -> {
|
||||
if (!dictionary.hasFlag(root.entryId, dictionary.affixData(prefixId, AFFIX_FLAG))
|
||||
if (!dictionary.hasFlag(root.entryId(), dictionary.affixData(prefixId, AFFIX_FLAG))
|
||||
|| !dictionary.isCrossProduct(prefixId)) {
|
||||
return;
|
||||
}
|
||||
|
@ -217,7 +215,7 @@ class GeneratingSuggester {
|
|||
if (hasCompatibleFlags(root, prefixId)
|
||||
&& checkAffixCondition(prefixId, wordChars, stripLength, stemLength)) {
|
||||
String prefix = misspelled.substring(0, prefixLength);
|
||||
result.add(prefix + root.word.substring(stripLength));
|
||||
result.add(prefix + root.word().substring(stripLength));
|
||||
}
|
||||
});
|
||||
|
||||
|
@ -263,7 +261,7 @@ class GeneratingSuggester {
|
|||
}
|
||||
|
||||
private boolean hasCompatibleFlags(Root<?> root, int affixId) {
|
||||
if (!dictionary.hasFlag(root.entryId, dictionary.affixData(affixId, AFFIX_FLAG))) {
|
||||
if (!dictionary.hasFlag(root.entryId(), dictionary.affixData(affixId, AFFIX_FLAG))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -447,28 +445,8 @@ class GeneratingSuggester {
|
|||
return commonScore;
|
||||
}
|
||||
|
||||
private static class Weighted<T extends Comparable<T>> implements Comparable<Weighted<T>> {
|
||||
final T word;
|
||||
final int score;
|
||||
|
||||
Weighted(T word, int score) {
|
||||
this.word = word;
|
||||
this.score = score;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (!(o instanceof Weighted)) return false;
|
||||
@SuppressWarnings("unchecked")
|
||||
Weighted<T> that = (Weighted<T>) o;
|
||||
return score == that.score && word.equals(that.word);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(word, score);
|
||||
}
|
||||
private record Weighted<T extends Comparable<T>>(T word, int score)
|
||||
implements Comparable<Weighted<T>> {
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
|
|
@ -132,7 +132,7 @@ public class Hunspell {
|
|||
Boolean checkSimpleWord(char[] wordChars, int length, WordCase originalCase) {
|
||||
Root<CharsRef> entry = findStem(wordChars, 0, length, originalCase, SIMPLE_WORD);
|
||||
if (entry != null) {
|
||||
return !dictionary.hasFlag(entry.entryId, dictionary.forbiddenword);
|
||||
return !dictionary.hasFlag(entry.entryId(), dictionary.forbiddenword);
|
||||
}
|
||||
|
||||
return null;
|
||||
|
@ -229,7 +229,7 @@ public class Hunspell {
|
|||
stem = findStem(word.chars, word.offset, breakPos + 1, originalCase, context);
|
||||
}
|
||||
if (stem != null
|
||||
&& !dictionary.hasFlag(stem.entryId, dictionary.forbiddenword)
|
||||
&& !dictionary.hasFlag(stem.entryId(), dictionary.forbiddenword)
|
||||
&& (prev == null || prev.mayCompound(stem, breakPos, originalCase))) {
|
||||
CompoundPart part = new CompoundPart(prev, word, breakPos, stem, null);
|
||||
if (checkCompoundsAfter(originalCase, part)) {
|
||||
|
@ -274,7 +274,7 @@ public class Hunspell {
|
|||
Root<CharsRef> lastRoot =
|
||||
findStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
|
||||
if (lastRoot != null
|
||||
&& !dictionary.hasFlag(lastRoot.entryId, dictionary.forbiddenword)
|
||||
&& !dictionary.hasFlag(lastRoot.entryId(), dictionary.forbiddenword)
|
||||
&& !(dictionary.checkCompoundDup && prev.root.equals(lastRoot))
|
||||
&& !hasForceUCaseProblem(lastRoot, originalCase, word.chars)
|
||||
&& prev.mayCompound(lastRoot, remainingLength, originalCase)) {
|
||||
|
@ -288,7 +288,7 @@ public class Hunspell {
|
|||
private boolean hasForceUCaseProblem(Root<?> root, WordCase originalCase, char[] wordChars) {
|
||||
if (originalCase == WordCase.TITLE || originalCase == WordCase.UPPER) return false;
|
||||
if (originalCase == null && Character.isUpperCase(wordChars[0])) return false;
|
||||
return dictionary.hasFlag(root.entryId, dictionary.forceUCase);
|
||||
return dictionary.hasFlag(root.entryId(), dictionary.forceUCase);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@@ -17,8 +17,6 @@
package org.apache.lucene.analysis.hunspell;

import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

@@ -117,7 +115,16 @@ public final class HunspellStemFilter extends TokenFilter {
    }

    if (longestOnly && buffer.size() > 1) {
      Collections.sort(buffer, lengthComparator);
      buffer.sort(
          (o1, o2) -> {
            int cmp = Integer.compare(o2.length, o1.length);
            if (cmp == 0) {
              // tie break on text
              return o2.compareTo(o1);
            } else {
              return cmp;
            }
          });
    }

    CharsRef stem = buffer.remove(0);

@@ -139,18 +146,4 @@ public final class HunspellStemFilter extends TokenFilter {
    super.reset();
    buffer = null;
  }

  static final Comparator<CharsRef> lengthComparator =
      new Comparator<CharsRef>() {
        @Override
        public int compare(CharsRef o1, CharsRef o2) {
          int cmp = Integer.compare(o2.length, o1.length);
          if (cmp == 0) {
            // tie break on text
            return o2.compareTo(o1);
          } else {
            return cmp;
          }
        }
      };
}
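The inlined sort keeps the same rule the removed lengthComparator field encoded: longest stem first, ties broken by reversed text order. A rough equivalent built from java.util.Comparator combinators is sketched below; it assumes, as the diff suggests, that CharsRef exposes a public length field and implements Comparable<CharsRef>.

import java.util.Comparator;
import java.util.List;
import org.apache.lucene.util.CharsRef;

class LongestFirstSortSketch {
  // Longest stem first; ties fall back to reversed natural (text) order,
  // mirroring the inline (o1, o2) lambda in the hunk above.
  static final Comparator<CharsRef> LONGEST_FIRST =
      Comparator.comparingInt((CharsRef c) -> c.length)
          .reversed()
          .thenComparing(Comparator.<CharsRef>naturalOrder().reversed());

  static void sortLongestFirst(List<CharsRef> buffer) {
    buffer.sort(LONGEST_FIRST);
  }
}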
@@ -16,36 +16,13 @@
 */
package org.apache.lucene.analysis.hunspell;

import java.util.Objects;

class Root<T extends CharSequence> implements Comparable<Root<T>> {
  final T word;
  final int entryId;

  Root(T word, int entryId) {
    this.word = word;
    this.entryId = entryId;
  }
record Root<T extends CharSequence>(T word, int entryId) implements Comparable<Root<T>> {

  @Override
  public String toString() {
    return word.toString();
  }

  @Override
  public boolean equals(Object o) {
    if (this == o) return true;
    if (!(o instanceof Root)) return false;
    @SuppressWarnings("unchecked")
    Root<T> root = (Root<T>) o;
    return entryId == root.entryId && word.equals(root.word);
  }

  @Override
  public int hashCode() {
    return Objects.hash(word, entryId);
  }

  @Override
  public int compareTo(Root<T> o) {
    return CharSequence.compare(word, o.word);
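The hand-written value class collapses into a record, so equals, hashCode and the component accessors are generated, and callers elsewhere in this change switch from reading the entryId field to calling the generated entryId() accessor. A standalone sketch of that pattern (the Stem name is illustrative, not the Lucene class):

// Illustrative only; a simplified stand-in for the record conversion shown above.
record Stem<T extends CharSequence>(T word, int entryId) implements Comparable<Stem<T>> {
  @Override
  public int compareTo(Stem<T> o) {
    return CharSequence.compare(word, o.word); // order by the word component
  }

  public static void main(String[] args) {
    Stem<String> a = new Stem<>("walk", 42);
    Stem<String> b = new Stem<>("walk", 42);
    System.out.println(a.equals(b));  // true: equals/hashCode come from the record
    System.out.println(a.entryId());  // 42: accessor method instead of a field read
  }
}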
@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.miscellaneous;

import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

@@ -147,26 +146,23 @@ public class FingerprintFilter extends TokenFilter {

      Arrays.sort(
          items,
          new Comparator<Object>() {
            @Override
            public int compare(Object o1, Object o2) {
              char[] v1 = (char[]) o1;
              char[] v2 = (char[]) o2;
              int len1 = v1.length;
              int len2 = v2.length;
              int lim = Math.min(len1, len2);
          (o1, o2) -> {
            char[] v1 = (char[]) o1;
            char[] v2 = (char[]) o2;
            int len1 = v1.length;
            int len2 = v2.length;
            int lim = Math.min(len1, len2);

              int k = 0;
              while (k < lim) {
                char c1 = v1[k];
                char c2 = v2[k];
                if (c1 != c2) {
                  return c1 - c2;
                }
                k++;
            int k = 0;
            while (k < lim) {
              char c1 = v1[k];
              char c2 = v2[k];
              if (c1 != c2) {
                return c1 - c2;
              }
              return len1 - len2;
              k++;
            }
            return len1 - len2;
          });

      // TODO lets append directly to termAttribute?
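The loop retained inside the lambda is a plain lexicographic char[] comparison: walk the common prefix, then fall back to length. As an aside (not part of this change), the same ordering could be expressed with Arrays.compare, available since Java 9:

import java.util.Arrays;

class CharArraySortSketch {
  // Lexicographic char[] ordering equivalent to the hand-rolled loop in the hunk above.
  static void sortItems(Object[] items) {
    Arrays.sort(items, (o1, o2) -> Arrays.compare((char[]) o1, (char[]) o2));
  }
}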
@ -194,7 +194,7 @@ public final class WordDelimiterIterator {
|
|||
|
||||
int type = charType(text[current]);
|
||||
switch (type) {
|
||||
// return ALPHA word type for both lower and upper
|
||||
// return ALPHA word type for both lower and upper
|
||||
case LOWER:
|
||||
case UPPER:
|
||||
return ALPHA;
|
||||
|
@ -332,27 +332,27 @@ public final class WordDelimiterIterator {
|
|||
case Character.OTHER_NUMBER:
|
||||
return DIGIT;
|
||||
|
||||
// case Character.SPACE_SEPARATOR:
|
||||
// case Character.LINE_SEPARATOR:
|
||||
// case Character.PARAGRAPH_SEPARATOR:
|
||||
// case Character.CONTROL:
|
||||
// case Character.FORMAT:
|
||||
// case Character.PRIVATE_USE:
|
||||
// case Character.SPACE_SEPARATOR:
|
||||
// case Character.LINE_SEPARATOR:
|
||||
// case Character.PARAGRAPH_SEPARATOR:
|
||||
// case Character.CONTROL:
|
||||
// case Character.FORMAT:
|
||||
// case Character.PRIVATE_USE:
|
||||
|
||||
case Character.SURROGATE: // prevent splitting
|
||||
return ALPHA | DIGIT;
|
||||
|
||||
// case Character.DASH_PUNCTUATION:
|
||||
// case Character.START_PUNCTUATION:
|
||||
// case Character.END_PUNCTUATION:
|
||||
// case Character.CONNECTOR_PUNCTUATION:
|
||||
// case Character.OTHER_PUNCTUATION:
|
||||
// case Character.MATH_SYMBOL:
|
||||
// case Character.CURRENCY_SYMBOL:
|
||||
// case Character.MODIFIER_SYMBOL:
|
||||
// case Character.OTHER_SYMBOL:
|
||||
// case Character.INITIAL_QUOTE_PUNCTUATION:
|
||||
// case Character.FINAL_QUOTE_PUNCTUATION:
|
||||
// case Character.DASH_PUNCTUATION:
|
||||
// case Character.START_PUNCTUATION:
|
||||
// case Character.END_PUNCTUATION:
|
||||
// case Character.CONNECTOR_PUNCTUATION:
|
||||
// case Character.OTHER_PUNCTUATION:
|
||||
// case Character.MATH_SYMBOL:
|
||||
// case Character.CURRENCY_SYMBOL:
|
||||
// case Character.MODIFIER_SYMBOL:
|
||||
// case Character.OTHER_SYMBOL:
|
||||
// case Character.INITIAL_QUOTE_PUNCTUATION:
|
||||
// case Character.FINAL_QUOTE_PUNCTUATION:
|
||||
|
||||
default:
|
||||
return SUBWORD_DELIM;
|
||||
|
|
|
@@ -59,12 +59,12 @@ public class PatternTypingFilter extends TokenFilter {
  public final boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      for (PatternTypingRule rule : replacementAndFlagByPattern) {
        Matcher matcher = rule.getPattern().matcher(termAtt);
        Matcher matcher = rule.pattern().matcher(termAtt);
        if (matcher.find()) {
          // allow 2nd reset() and find() that occurs inside replaceFirst to avoid excess string
          // creation
          typeAtt.setType(matcher.replaceFirst(rule.getTypeTemplate()));
          flagAtt.setFlags(rule.getFlags());
          typeAtt.setType(matcher.replaceFirst(rule.typeTemplate()));
          flagAtt.setFlags(rule.flags());
          return true;
        }
      }

@@ -74,27 +74,5 @@ public class PatternTypingFilter extends TokenFilter {
  }

  /** Value holding class for pattern typing rules. */
  public static class PatternTypingRule {
    private final Pattern pattern;
    private final int flags;
    private final String typeTemplate;

    public PatternTypingRule(Pattern pattern, int flags, String typeTemplate) {
      this.pattern = pattern;
      this.flags = flags;
      this.typeTemplate = typeTemplate;
    }

    public Pattern getPattern() {
      return pattern;
    }

    public int getFlags() {
      return flags;
    }

    public String getTypeTemplate() {
      return typeTemplate;
    }
  }
  public record PatternTypingRule(Pattern pattern, int flags, String typeTemplate) {}
}
@@ -142,22 +142,10 @@ public final class SynonymGraphFilter extends TokenFilter {
    }
  }

  static class BufferedOutputToken {
    final String term;

    // Non-null if this was an incoming token:
    final State state;

    final int startNode;
    final int endNode;

    public BufferedOutputToken(State state, String term, int startNode, int endNode) {
      this.state = state;
      this.term = term;
      this.startNode = startNode;
      this.endNode = endNode;
    }
  }
  /**
   * @param state Non-null if this was an incoming token:
   */
  record BufferedOutputToken(State state, String term, int startNode, int endNode) {}

  /**
   * Apply previously built synonyms to incoming tokens.
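When the field comment moves onto the record, it becomes a @param tag in the record's javadoc, which is how per-component documentation is attached to record components. A tiny illustrative sketch (names are made up, not the Lucene types):

/**
 * A buffered output token.
 *
 * @param state non-null only if this wraps an incoming token
 * @param term the surface form to emit
 */
record OutputTokenSketch(Object state, String term) {}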
@@ -18,17 +18,15 @@ package org.apache.lucene.analysis.synonym.word2vec;

import org.apache.lucene.util.BytesRef;

/** Wraps a term and boost */
public class TermAndBoost {
  /** the term */
  public final BytesRef term;

  /** the boost */
  public final float boost;

/**
 * Wraps a term and boost
 *
 * @param term the term
 * @param boost the boost
 */
public record TermAndBoost(BytesRef term, float boost) {
  /** Creates a new TermAndBoost */
  public TermAndBoost(BytesRef term, float boost) {
    this.term = BytesRef.deepCopyOf(term);
    this.boost = boost;
  public TermAndBoost {
    term = BytesRef.deepCopyOf(term);
  }
}
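The defensive deep copy survives the record conversion by moving into the compact canonical constructor, which may reassign a parameter before the implicit field assignments run. A minimal sketch of the idiom with a plain byte[] payload (illustrative, not the Lucene type):

import java.util.Arrays;

// The compact constructor body runs before the record's fields are assigned,
// so reassigning the parameter gives the record its own private copy.
record PayloadSketch(byte[] data, float boost) {
  PayloadSketch {
    data = Arrays.copyOf(data, data.length);
  }
}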
@@ -56,25 +56,25 @@ public class Word2VecModel implements RandomAccessVectorValues.Floats {
  }

  public void addTermAndVector(TermAndVector modelEntry) {
    modelEntry.normalizeVector();
    modelEntry = modelEntry.normalizeVector();
    this.termsAndVectors[loadedCount++] = modelEntry;
    this.word2Vec.add(modelEntry.getTerm());
    this.word2Vec.add(modelEntry.term());
  }

  @Override
  public float[] vectorValue(int targetOrd) {
    return termsAndVectors[targetOrd].getVector();
    return termsAndVectors[targetOrd].vector();
  }

  public float[] vectorValue(BytesRef term) {
    int termOrd = this.word2Vec.find(term);
    if (termOrd < 0) return null;
    TermAndVector entry = this.termsAndVectors[termOrd];
    return (entry == null) ? null : entry.getVector();
    return (entry == null) ? null : entry.vector();
  }

  public BytesRef termValue(int targetOrd) {
    return termsAndVectors[targetOrd].getTerm();
    return termsAndVectors[targetOrd].term();
  }

  @Override
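normalizeVector no longer mutates the entry in place; it returns a new value that the caller must reassign, which is why addTermAndVector now reads modelEntry = modelEntry.normalizeVector(). A rough sketch of that immutable-update style (the VecSketch record is illustrative, not Lucene's TermAndVector):

record VecSketch(float[] values) {
  // Returns a fresh unit-length copy; the receiver is never modified.
  VecSketch normalize() {
    double norm = 0;
    for (float v : values) {
      norm += v * v;
    }
    norm = Math.sqrt(norm);
    if (norm == 0) {
      return this; // nothing to scale
    }
    float[] out = new float[values.length];
    for (int i = 0; i < values.length; i++) {
      out[i] = (float) (values[i] / norm);
    }
    return new VecSketch(out);
  }
}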
@@ -80,7 +80,7 @@ public final class Word2VecSynonymFilter extends TokenFilter {
      clearAttributes();
      restoreState(this.lastState);
      termAtt.setEmpty();
      termAtt.append(synonym.term.utf8ToString());
      termAtt.append(synonym.term().utf8ToString());
      typeAtt.setType(SynonymGraphFilter.TYPE_SYNONYM);
      posLenAtt.setPositionLength(1);
      posIncrementAtt.setPositionIncrement(0);
@ -38,25 +38,25 @@ class TeluguNormalizer {
|
|||
|
||||
for (int i = 0; i < len; i++) {
|
||||
switch (s[i]) {
|
||||
// candrabindu (ఀ and ఁ) -> bindu (ం)
|
||||
// candrabindu (ఀ and ఁ) -> bindu (ం)
|
||||
case '\u0C00': // ఀ
|
||||
case '\u0C01': // ఁ
|
||||
s[i] = '\u0C02'; // ం
|
||||
break;
|
||||
// delete visarga (ః)
|
||||
// delete visarga (ః)
|
||||
case '\u0C03':
|
||||
len = delete(s, i, len);
|
||||
i--;
|
||||
break;
|
||||
|
||||
// zwj/zwnj -> delete
|
||||
// zwj/zwnj -> delete
|
||||
case '\u200D':
|
||||
case '\u200C':
|
||||
len = delete(s, i, len);
|
||||
i--;
|
||||
break;
|
||||
|
||||
// long -> short vowels
|
||||
// long -> short vowels
|
||||
case '\u0C14': // ఔ
|
||||
s[i] = '\u0C13'; // ఓ
|
||||
break;
|
||||
|
@ -73,7 +73,7 @@ class TeluguNormalizer {
|
|||
s[i] = '\u0C09'; // ఉ
|
||||
break;
|
||||
|
||||
// long -> short vowels matras
|
||||
// long -> short vowels matras
|
||||
case '\u0C40': // ీ
|
||||
s[i] = '\u0C3F'; // ి
|
||||
break;
|
||||
|
@ -86,14 +86,14 @@ class TeluguNormalizer {
|
|||
case '\u0C4B': // ో
|
||||
s[i] = '\u0C4A'; // ొ
|
||||
break;
|
||||
// decomposed dipthong (ె + ౖ) -> precomposed diphthong vowel sign (ై)
|
||||
// decomposed dipthong (ె + ౖ) -> precomposed diphthong vowel sign (ై)
|
||||
case '\u0C46':
|
||||
if (i + 1 < len && s[i + 1] == '\u0C56') {
|
||||
s[i] = '\u0C48';
|
||||
len = delete(s, i + 1, len);
|
||||
}
|
||||
break;
|
||||
// composed oo or au -> oo or au
|
||||
// composed oo or au -> oo or au
|
||||
case '\u0C12':
|
||||
if (i + 1 < len && s[i + 1] == '\u0C55') {
|
||||
// (ఒ + ౕ) -> oo (ఓ)
|
||||
|
|
|
@ -61,12 +61,12 @@ public final class TurkishLowerCaseFilter extends TokenFilter {
|
|||
|
||||
if (iOrAfter) { // all the special I turkish handling happens here.
|
||||
switch (ch) {
|
||||
// remove COMBINING_DOT_ABOVE to mimic composed lowercase
|
||||
// remove COMBINING_DOT_ABOVE to mimic composed lowercase
|
||||
case COMBINING_DOT_ABOVE:
|
||||
length = delete(buffer, i, length);
|
||||
continue;
|
||||
// i itself, it depends if it is followed by COMBINING_DOT_ABOVE
|
||||
// if it is, we will make it small i and later remove the dot
|
||||
// i itself, it depends if it is followed by COMBINING_DOT_ABOVE
|
||||
// if it is, we will make it small i and later remove the dot
|
||||
case LATIN_CAPITAL_LETTER_I:
|
||||
if (isBeforeDot(buffer, i + 1, length)) {
|
||||
buffer[i] = LATIN_SMALL_LETTER_I;
|
||||
|
|
|
@ -901,7 +901,7 @@ class WikipediaTokenizerImpl {
|
|||
positionInc = 1; /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 47:
|
||||
break;
|
||||
case 2:
|
||||
|
@ -909,7 +909,7 @@ class WikipediaTokenizerImpl {
|
|||
positionInc = 1;
|
||||
return ALPHANUM;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 48:
|
||||
break;
|
||||
case 3:
|
||||
|
@ -920,7 +920,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 49:
|
||||
break;
|
||||
case 4:
|
||||
|
@ -928,7 +928,7 @@ class WikipediaTokenizerImpl {
|
|||
positionInc = 1;
|
||||
return CJ;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 50:
|
||||
break;
|
||||
case 5:
|
||||
|
@ -936,7 +936,7 @@ class WikipediaTokenizerImpl {
|
|||
positionInc = 1; /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 51:
|
||||
break;
|
||||
case 6:
|
||||
|
@ -945,7 +945,7 @@ class WikipediaTokenizerImpl {
|
|||
numWikiTokensSeen++;
|
||||
return currentTokType;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 52:
|
||||
break;
|
||||
case 7:
|
||||
|
@ -954,7 +954,7 @@ class WikipediaTokenizerImpl {
|
|||
numWikiTokensSeen++;
|
||||
return currentTokType;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 53:
|
||||
break;
|
||||
case 8:
|
||||
|
@ -962,7 +962,7 @@ class WikipediaTokenizerImpl {
|
|||
/* Break so we don't hit fall-through warning: */
|
||||
break; /* ignore */
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 54:
|
||||
break;
|
||||
case 9:
|
||||
|
@ -978,7 +978,7 @@ class WikipediaTokenizerImpl {
|
|||
numLinkToks++;
|
||||
return currentTokType;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 55:
|
||||
break;
|
||||
case 10:
|
||||
|
@ -988,7 +988,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 56:
|
||||
break;
|
||||
case 11:
|
||||
|
@ -997,7 +997,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 57:
|
||||
break;
|
||||
case 12:
|
||||
|
@ -1007,7 +1007,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(STRING);
|
||||
return currentTokType; /*italics*/
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 58:
|
||||
break;
|
||||
case 13:
|
||||
|
@ -1017,7 +1017,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 59:
|
||||
break;
|
||||
case 14:
|
||||
|
@ -1026,7 +1026,7 @@ class WikipediaTokenizerImpl {
|
|||
numWikiTokensSeen++;
|
||||
return currentTokType;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 60:
|
||||
break;
|
||||
case 15:
|
||||
|
@ -1036,7 +1036,7 @@ class WikipediaTokenizerImpl {
|
|||
numWikiTokensSeen++;
|
||||
return currentTokType;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 61:
|
||||
break;
|
||||
case 16:
|
||||
|
@ -1046,7 +1046,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(STRING); /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 62:
|
||||
break;
|
||||
case 17:
|
||||
|
@ -1055,7 +1055,7 @@ class WikipediaTokenizerImpl {
|
|||
numWikiTokensSeen = 0;
|
||||
return currentTokType;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 63:
|
||||
break;
|
||||
case 18:
|
||||
|
@ -1063,7 +1063,7 @@ class WikipediaTokenizerImpl {
|
|||
/* Break so we don't hit fall-through warning: */
|
||||
break; /* ignore STRING */
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 64:
|
||||
break;
|
||||
case 19:
|
||||
|
@ -1072,7 +1072,7 @@ class WikipediaTokenizerImpl {
|
|||
numWikiTokensSeen++;
|
||||
return currentTokType; /* STRING ALPHANUM*/
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 65:
|
||||
break;
|
||||
case 20:
|
||||
|
@ -1083,7 +1083,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 66:
|
||||
break;
|
||||
case 21:
|
||||
|
@ -1091,7 +1091,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(STRING);
|
||||
return currentTokType; /*pipe*/
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 67:
|
||||
break;
|
||||
case 22:
|
||||
|
@ -1106,7 +1106,7 @@ class WikipediaTokenizerImpl {
|
|||
} /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 68:
|
||||
break;
|
||||
case 23:
|
||||
|
@ -1116,7 +1116,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(DOUBLE_EQUALS_STATE); /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 69:
|
||||
break;
|
||||
case 24:
|
||||
|
@ -1127,7 +1127,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 70:
|
||||
break;
|
||||
case 25:
|
||||
|
@ -1138,7 +1138,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(DOUBLE_BRACE_STATE); /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 71:
|
||||
break;
|
||||
case 26:
|
||||
|
@ -1146,7 +1146,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 72:
|
||||
break;
|
||||
case 27:
|
||||
|
@ -1155,7 +1155,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 73:
|
||||
break;
|
||||
case 28:
|
||||
|
@ -1165,7 +1165,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 74:
|
||||
break;
|
||||
case 29:
|
||||
|
@ -1175,7 +1175,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 75:
|
||||
break;
|
||||
case 30:
|
||||
|
@ -1183,7 +1183,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 76:
|
||||
break;
|
||||
case 31:
|
||||
|
@ -1193,7 +1193,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
|
||||
break; /*end italics*/
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 77:
|
||||
break;
|
||||
case 32:
|
||||
|
@ -1204,7 +1204,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 78:
|
||||
break;
|
||||
case 33:
|
||||
|
@ -1212,7 +1212,7 @@ class WikipediaTokenizerImpl {
|
|||
positionInc = 1;
|
||||
return NUM;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 79:
|
||||
break;
|
||||
case 34:
|
||||
|
@ -1220,7 +1220,7 @@ class WikipediaTokenizerImpl {
|
|||
positionInc = 1;
|
||||
return COMPANY;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 80:
|
||||
break;
|
||||
case 35:
|
||||
|
@ -1228,7 +1228,7 @@ class WikipediaTokenizerImpl {
|
|||
positionInc = 1;
|
||||
return APOSTROPHE;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 81:
|
||||
break;
|
||||
case 36:
|
||||
|
@ -1236,7 +1236,7 @@ class WikipediaTokenizerImpl {
|
|||
positionInc = 1;
|
||||
return HOST;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 82:
|
||||
break;
|
||||
case 37:
|
||||
|
@ -1245,7 +1245,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 83:
|
||||
break;
|
||||
case 38:
|
||||
|
@ -1255,7 +1255,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
|
||||
break; /*end bold*/
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 84:
|
||||
break;
|
||||
case 39:
|
||||
|
@ -1265,7 +1265,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
|
||||
break; /*end sub header*/
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 85:
|
||||
break;
|
||||
case 40:
|
||||
|
@ -1273,7 +1273,7 @@ class WikipediaTokenizerImpl {
|
|||
positionInc = 1;
|
||||
return ACRONYM;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 86:
|
||||
break;
|
||||
case 41:
|
||||
|
@ -1281,7 +1281,7 @@ class WikipediaTokenizerImpl {
|
|||
positionInc = 1;
|
||||
return EMAIL;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 87:
|
||||
break;
|
||||
case 42:
|
||||
|
@ -1291,7 +1291,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */
|
||||
break; /*end bold italics*/
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 88:
|
||||
break;
|
||||
case 43:
|
||||
|
@ -1301,7 +1301,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(EXTERNAL_LINK_STATE);
|
||||
return currentTokType;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 89:
|
||||
break;
|
||||
case 44:
|
||||
|
@ -1312,7 +1312,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 90:
|
||||
break;
|
||||
case 45:
|
||||
|
@ -1322,7 +1322,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 91:
|
||||
break;
|
||||
case 46:
|
||||
|
@ -1333,7 +1333,7 @@ class WikipediaTokenizerImpl {
|
|||
yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */
|
||||
break;
|
||||
}
|
||||
// fall through
|
||||
// fall through
|
||||
case 92:
|
||||
break;
|
||||
default:
|
||||
|
|
|
@ -1490,7 +1490,7 @@ public class TestSynonymGraphFilter extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
assertTrue(approxEquals(actual, expected));
|
||||
assertTrue(Operations.sameLanguage(actual, expected));
|
||||
assertTrue(AutomatonTestUtil.sameLanguage(actual, expected));
|
||||
}
|
||||
|
||||
a.close();
|
||||
|
|
|
@ -64,7 +64,7 @@ public class TestWord2VecSynonymProvider extends LuceneTestCase {
|
|||
|
||||
assertEquals(4, actualSynonymsResults.size());
|
||||
for (int i = 0; i < expectedSynonyms.length; i++) {
|
||||
assertEquals(new BytesRef(expectedSynonyms[i]), actualSynonymsResults.get(i).term);
|
||||
assertEquals(new BytesRef(expectedSynonyms[i]), actualSynonymsResults.get(i).term());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -83,8 +83,8 @@ public class TestWord2VecSynonymProvider extends LuceneTestCase {
|
|||
|
||||
BytesRef expectedFirstSynonymTerm = new BytesRef("b");
|
||||
double expectedFirstSynonymBoost = 1.0;
|
||||
assertEquals(expectedFirstSynonymTerm, actualSynonymsResults.get(0).term);
|
||||
assertEquals(expectedFirstSynonymBoost, actualSynonymsResults.get(0).boost, 0.001f);
|
||||
assertEquals(expectedFirstSynonymTerm, actualSynonymsResults.get(0).term());
|
||||
assertEquals(expectedFirstSynonymBoost, actualSynonymsResults.get(0).boost(), 0.001f);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -120,8 +120,8 @@ public class TestWord2VecSynonymProvider extends LuceneTestCase {
|
|||
@Test
|
||||
public void normalizedVector_shouldReturnModule1() {
|
||||
TermAndVector synonymTerm = new TermAndVector(new BytesRef("a"), new float[] {10, 10});
|
||||
synonymTerm.normalizeVector();
|
||||
float[] vector = synonymTerm.getVector();
|
||||
synonymTerm = synonymTerm.normalizeVector();
|
||||
float[] vector = synonymTerm.vector();
|
||||
float len = 0;
|
||||
for (int i = 0; i < vector.length; i++) {
|
||||
len += vector[i] * vector[i];
|
||||
|
|
|
@ -139,19 +139,7 @@ public final class JapaneseCompletionFilter extends TokenFilter {
|
|||
}
|
||||
}
|
||||
|
||||
private static class CompletionToken {
|
||||
final String term;
|
||||
final boolean isFirst;
|
||||
final int startOffset;
|
||||
final int endOffset;
|
||||
|
||||
CompletionToken(String term, boolean isFirst, int startOffset, int endOffset) {
|
||||
this.term = term;
|
||||
this.isFirst = isFirst;
|
||||
this.startOffset = startOffset;
|
||||
this.endOffset = endOffset;
|
||||
}
|
||||
}
|
||||
private record CompletionToken(String term, boolean isFirst, int startOffset, int endOffset) {}
|
||||
|
||||
private static class CompletionTokenGenerator implements Iterator<CompletionToken> {
|
||||
|
||||
|
|
|
@ -180,13 +180,5 @@ public class KatakanaRomanizer {
|
|||
return null;
|
||||
}
|
||||
|
||||
private static class MatchedKeystroke {
|
||||
final int keystrokeLen;
|
||||
final int keystrokeIndex;
|
||||
|
||||
MatchedKeystroke(int keystrokeLen, int keystrokeIndex) {
|
||||
this.keystrokeLen = keystrokeLen;
|
||||
this.keystrokeIndex = keystrokeIndex;
|
||||
}
|
||||
}
|
||||
private record MatchedKeystroke(int keystrokeLen, int keystrokeIndex) {}
|
||||
}
|
||||
|
|
|
@@ -20,8 +20,6 @@ import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.morph.Dictionary;

@@ -83,14 +81,7 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
    // TODO: should we allow multiple segmentations per input 'phrase'?
    // the old treemap didn't support this either, and i'm not sure if it's needed/useful?

    Collections.sort(
        featureEntries,
        new Comparator<String[]>() {
          @Override
          public int compare(String[] left, String[] right) {
            return left[0].compareTo(right[0]);
          }
        });
    featureEntries.sort((left, right) -> left[0].compareTo(right[0]));

    List<String> data = new ArrayList<>(featureEntries.size());
    List<int[]> segmentations = new ArrayList<>(featureEntries.size());
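The replacement lambda sorts the entries by their first column (the surface form). The same ordering can also be written with a key-extractor comparator; this is only an illustration of an equivalent form, not part of the change:

import java.util.Comparator;
import java.util.List;

class FeatureEntrySortSketch {
  // Sort String[] rows by their first element, matching the lambda in the hunk above.
  static void sortBySurfaceForm(List<String[]> featureEntries) {
    featureEntries.sort(Comparator.comparing((String[] entry) -> entry[0]));
  }
}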
@ -268,19 +268,19 @@ final class Viterbi
|
|||
final KoMorphData.Morpheme morpheme = morphemes[i];
|
||||
final Token compoundToken;
|
||||
if (token.getPOSType() == POS.Type.COMPOUND) {
|
||||
assert endOffset - morpheme.surfaceForm.length() >= 0;
|
||||
assert endOffset - morpheme.surfaceForm().length() >= 0;
|
||||
compoundToken =
|
||||
new DecompoundToken(
|
||||
morpheme.posTag,
|
||||
morpheme.surfaceForm,
|
||||
endOffset - morpheme.surfaceForm.length(),
|
||||
morpheme.posTag(),
|
||||
morpheme.surfaceForm(),
|
||||
endOffset - morpheme.surfaceForm().length(),
|
||||
endOffset,
|
||||
backType);
|
||||
} else {
|
||||
compoundToken =
|
||||
new DecompoundToken(
|
||||
morpheme.posTag,
|
||||
morpheme.surfaceForm,
|
||||
morpheme.posTag(),
|
||||
morpheme.surfaceForm(),
|
||||
token.getStartOffset(),
|
||||
token.getEndOffset(),
|
||||
backType);
|
||||
|
@ -289,7 +289,7 @@ final class Viterbi
|
|||
compoundToken.setPositionIncrement(0);
|
||||
}
|
||||
++posLen;
|
||||
endOffset -= morpheme.surfaceForm.length();
|
||||
endOffset -= morpheme.surfaceForm().length();
|
||||
pending.add(compoundToken);
|
||||
if (VERBOSE) {
|
||||
System.out.println(" add token=" + pending.get(pending.size() - 1));
|
||||
|
|
|
@@ -22,15 +22,7 @@ import org.apache.lucene.analysis.morph.MorphData;
/** Represents Korean morphological information. */
public interface KoMorphData extends MorphData {
  /** A morpheme extracted from a compound token. */
  class Morpheme {
    public final POS.Tag posTag;
    public final String surfaceForm;

    public Morpheme(POS.Tag posTag, String surfaceForm) {
      this.posTag = posTag;
      this.surfaceForm = surfaceForm;
    }
  }
  record Morpheme(POS.Tag posTag, String surfaceForm) {}

  /**
   * Get the {@link org.apache.lucene.analysis.ko.POS.Type} of specified word (morpheme, compound,
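Because the record is declared as a member of an interface, it is implicitly public, static and final, so the whole nested value class shrinks to one line, and callers in the Viterbi and dictionary hunks above switch to the generated posTag() and surfaceForm() accessors. A tiny illustrative sketch, not the Lucene interface:

interface MorphSketch {
  // A record nested in an interface needs no modifiers: it is implicitly public and static.
  record Morpheme(String posTag, String surfaceForm) {}

  static String describe(Morpheme m) {
    return m.surfaceForm() + "/" + m.posTag();
  }
}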
@ -150,13 +150,13 @@ class TokenInfoDictionaryEntryWriter extends DictionaryEntryWriter {
|
|||
int compoundOffset = 0;
|
||||
for (KoMorphData.Morpheme morpheme : morphemes) {
|
||||
if (hasSinglePOS == false) {
|
||||
buffer.put((byte) morpheme.posTag.ordinal());
|
||||
buffer.put((byte) morpheme.posTag().ordinal());
|
||||
}
|
||||
if (posType != POS.Type.INFLECT) {
|
||||
buffer.put((byte) morpheme.surfaceForm.length());
|
||||
compoundOffset += morpheme.surfaceForm.length();
|
||||
buffer.put((byte) morpheme.surfaceForm().length());
|
||||
compoundOffset += morpheme.surfaceForm().length();
|
||||
} else {
|
||||
writeString(morpheme.surfaceForm);
|
||||
writeString(morpheme.surfaceForm());
|
||||
}
|
||||
assert compoundOffset <= entry[0].length() : Arrays.toString(entry);
|
||||
}
|
||||
|
|
|
@ -86,11 +86,11 @@ public class PartOfSpeechAttributeImpl extends AttributeImpl implements PartOfSp
|
|||
builder.append("+");
|
||||
}
|
||||
builder
|
||||
.append(morpheme.surfaceForm)
|
||||
.append(morpheme.surfaceForm())
|
||||
.append('/')
|
||||
.append(morpheme.posTag.name())
|
||||
.append(morpheme.posTag().name())
|
||||
.append('(')
|
||||
.append(morpheme.posTag.description())
|
||||
.append(morpheme.posTag().description())
|
||||
.append(')');
|
||||
}
|
||||
return builder.toString();
|
||||
|
|
|
@ -170,14 +170,14 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
|
|||
if (decompound != null) {
|
||||
int offset = 0;
|
||||
for (KoMorphData.Morpheme morph : decompound) {
|
||||
assertTrue(UnicodeUtil.validUTF16String(morph.surfaceForm));
|
||||
assertFalse(morph.surfaceForm.isEmpty());
|
||||
assertEquals(morph.surfaceForm.trim(), morph.surfaceForm);
|
||||
assertTrue(UnicodeUtil.validUTF16String(morph.surfaceForm()));
|
||||
assertFalse(morph.surfaceForm().isEmpty());
|
||||
assertEquals(morph.surfaceForm().trim(), morph.surfaceForm());
|
||||
if (type != POS.Type.INFLECT) {
|
||||
assertEquals(
|
||||
morph.surfaceForm,
|
||||
surfaceForm.substring(offset, offset + morph.surfaceForm.length()));
|
||||
offset += morph.surfaceForm.length();
|
||||
morph.surfaceForm(),
|
||||
surfaceForm.substring(offset, offset + morph.surfaceForm().length()));
|
||||
offset += morph.surfaceForm().length();
|
||||
}
|
||||
}
|
||||
assertTrue(offset <= surfaceForm.length());
|
||||
|
|
|
@ -43,10 +43,10 @@ public class TestUserDictionary extends LuceneTestCase {
|
|||
dictionary.getMorphAttributes().getMorphemes(wordIds.get(1), sArray, 0, s.length());
|
||||
assertNotNull(decompound);
|
||||
assertEquals(2, decompound.length);
|
||||
assertEquals(decompound[0].posTag, POS.Tag.NNG);
|
||||
assertEquals(decompound[0].surfaceForm, "세종");
|
||||
assertEquals(decompound[1].posTag, POS.Tag.NNG);
|
||||
assertEquals(decompound[1].surfaceForm, "시");
|
||||
assertEquals(decompound[0].posTag(), POS.Tag.NNG);
|
||||
assertEquals(decompound[0].surfaceForm(), "세종");
|
||||
assertEquals(decompound[1].posTag(), POS.Tag.NNG);
|
||||
assertEquals(decompound[1].surfaceForm(), "시");
|
||||
|
||||
s = "c++";
|
||||
sArray = s.toCharArray();
|
||||
|
|
|
@ -245,7 +245,7 @@ public class Diff {
|
|||
deletes++;
|
||||
x--;
|
||||
break;
|
||||
// delete
|
||||
// delete
|
||||
case Y:
|
||||
if (deletes != base) {
|
||||
result.append('D').append(deletes);
|
||||
|
@ -258,7 +258,7 @@ public class Diff {
|
|||
result.append('I');
|
||||
result.append(b.charAt(--y));
|
||||
break;
|
||||
// insert
|
||||
// insert
|
||||
case R:
|
||||
if (deletes != base) {
|
||||
result.append('D').append(deletes);
|
||||
|
@ -272,7 +272,7 @@ public class Diff {
|
|||
result.append(b.charAt(--y));
|
||||
x--;
|
||||
break;
|
||||
// replace
|
||||
// replace
|
||||
case D:
|
||||
if (deletes != base) {
|
||||
result.append('D').append(deletes);
|
||||
|
|
|
@ -103,19 +103,20 @@ final class ForUtil {
|
|||
for (int bpv = 1; bpv <= 32; ++bpv) {
|
||||
final FormatAndBits formatAndBits =
|
||||
PackedInts.fastestFormatAndBits(BLOCK_SIZE, bpv, acceptableOverheadRatio);
|
||||
assert formatAndBits.format.isSupported(formatAndBits.bitsPerValue);
|
||||
assert formatAndBits.bitsPerValue <= 32;
|
||||
assert formatAndBits.format().isSupported(formatAndBits.bitsPerValue());
|
||||
assert formatAndBits.bitsPerValue() <= 32;
|
||||
encodedSizes[bpv] =
|
||||
encodedSize(formatAndBits.format, PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue);
|
||||
encodedSize(
|
||||
formatAndBits.format(), PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue());
|
||||
encoders[bpv] =
|
||||
PackedInts.getEncoder(
|
||||
formatAndBits.format, PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue);
|
||||
formatAndBits.format(), PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue());
|
||||
decoders[bpv] =
|
||||
PackedInts.getDecoder(
|
||||
formatAndBits.format, PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue);
|
||||
formatAndBits.format(), PackedInts.VERSION_CURRENT, formatAndBits.bitsPerValue());
|
||||
iterations[bpv] = computeIterations(decoders[bpv]);
|
||||
|
||||
out.writeVInt(formatAndBits.format.getId() << 5 | (formatAndBits.bitsPerValue - 1));
|
||||
out.writeVInt(formatAndBits.format().getId() << 5 | (formatAndBits.bitsPerValue() - 1));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -24,6 +24,7 @@ import org.apache.lucene.codecs.CodecUtil;
|
|||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.codecs.FieldInfosFormat;
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
import org.apache.lucene.index.DocValuesSkipIndexType;
|
||||
import org.apache.lucene.index.DocValuesType;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldInfos;
|
||||
|
@ -209,7 +210,7 @@ public final class Lucene60FieldInfosFormat extends FieldInfosFormat {
|
|||
storePayloads,
|
||||
indexOptions,
|
||||
docValuesType,
|
||||
false,
|
||||
DocValuesSkipIndexType.NONE,
|
||||
dvGen,
|
||||
attributes,
|
||||
pointDataDimensionCount,
|
||||
|
@ -347,7 +348,7 @@ public final class Lucene60FieldInfosFormat extends FieldInfosFormat {
|
|||
output.writeVInt(fi.number);
|
||||
|
||||
byte bits = 0x0;
|
||||
if (fi.hasVectors()) bits |= STORE_TERMVECTOR;
|
||||
if (fi.hasTermVectors()) bits |= STORE_TERMVECTOR;
|
||||
if (fi.omitsNorms()) bits |= OMIT_NORMS;
|
||||
if (fi.hasPayloads()) bits |= STORE_PAYLOADS;
|
||||
if (fi.isSoftDeletesField()) bits |= SOFT_DELETES_FIELD;
|
||||
|
|
|
@ -17,8 +17,6 @@
|
|||
package org.apache.lucene.backward_codecs.lucene80;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import org.apache.lucene.backward_codecs.packed.LegacyDirectMonotonicReader;
|
||||
import org.apache.lucene.backward_codecs.packed.LegacyDirectReader;
|
||||
import org.apache.lucene.backward_codecs.store.EndiannessReverserUtil;
|
||||
|
@ -41,6 +39,7 @@ import org.apache.lucene.index.SortedNumericDocValues;
|
|||
import org.apache.lucene.index.SortedSetDocValues;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.index.TermsEnum.SeekStatus;
|
||||
import org.apache.lucene.internal.hppc.IntObjectHashMap;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.ChecksumIndexInput;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
|
@ -53,11 +52,11 @@ import org.apache.lucene.util.compress.LZ4;
|
|||
|
||||
/** reader for {@link Lucene80DocValuesFormat} */
|
||||
final class Lucene80DocValuesProducer extends DocValuesProducer {
|
||||
private final Map<String, NumericEntry> numerics = new HashMap<>();
|
||||
private final Map<String, BinaryEntry> binaries = new HashMap<>();
|
||||
private final Map<String, SortedEntry> sorted = new HashMap<>();
|
||||
private final Map<String, SortedSetEntry> sortedSets = new HashMap<>();
|
||||
private final Map<String, SortedNumericEntry> sortedNumerics = new HashMap<>();
|
||||
private final IntObjectHashMap<NumericEntry> numerics = new IntObjectHashMap<>();
|
||||
private final IntObjectHashMap<BinaryEntry> binaries = new IntObjectHashMap<>();
|
||||
private final IntObjectHashMap<SortedEntry> sorted = new IntObjectHashMap<>();
|
||||
private final IntObjectHashMap<SortedSetEntry> sortedSets = new IntObjectHashMap<>();
|
||||
private final IntObjectHashMap<SortedNumericEntry> sortedNumerics = new IntObjectHashMap<>();
|
||||
private final IndexInput data;
|
||||
private final int maxDoc;
|
||||
private int version = -1;
|
||||
|
@ -139,7 +138,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
|
|||
}
|
||||
byte type = meta.readByte();
|
||||
if (type == Lucene80DocValuesFormat.NUMERIC) {
|
||||
numerics.put(info.name, readNumeric(meta));
|
||||
numerics.put(info.number, readNumeric(meta));
|
||||
} else if (type == Lucene80DocValuesFormat.BINARY) {
|
||||
final boolean compressed;
|
||||
if (version >= Lucene80DocValuesFormat.VERSION_CONFIGURABLE_COMPRESSION) {
|
||||
|
@ -158,13 +157,13 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
|
|||
} else {
|
||||
compressed = version >= Lucene80DocValuesFormat.VERSION_BIN_COMPRESSED;
|
||||
}
|
||||
binaries.put(info.name, readBinary(meta, compressed));
|
||||
binaries.put(info.number, readBinary(meta, compressed));
|
||||
} else if (type == Lucene80DocValuesFormat.SORTED) {
|
||||
sorted.put(info.name, readSorted(meta));
|
||||
sorted.put(info.number, readSorted(meta));
|
||||
} else if (type == Lucene80DocValuesFormat.SORTED_SET) {
|
||||
sortedSets.put(info.name, readSortedSet(meta));
|
||||
sortedSets.put(info.number, readSortedSet(meta));
|
||||
} else if (type == Lucene80DocValuesFormat.SORTED_NUMERIC) {
|
||||
sortedNumerics.put(info.name, readSortedNumeric(meta));
|
||||
sortedNumerics.put(info.number, readSortedNumeric(meta));
|
||||
} else {
|
||||
throw new CorruptIndexException("invalid type: " + type, meta);
|
||||
}
|
||||
|
@ -426,7 +425,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
|
|||
|
||||
@Override
|
||||
public NumericDocValues getNumeric(FieldInfo field) throws IOException {
|
||||
NumericEntry entry = numerics.get(field.name);
|
||||
NumericEntry entry = numerics.get(field.number);
|
||||
return getNumeric(entry);
|
||||
}
|
||||
|
||||
|
@ -915,7 +914,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
|
|||
|
||||
@Override
|
||||
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
|
||||
BinaryEntry entry = binaries.get(field.name);
|
||||
BinaryEntry entry = binaries.get(field.number);
|
||||
if (entry.compressed) {
|
||||
return getCompressedBinary(entry);
|
||||
} else {
|
||||
|
@ -973,7 +972,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
|
|||
|
||||
@Override
|
||||
public SortedDocValues getSorted(FieldInfo field) throws IOException {
|
||||
SortedEntry entry = sorted.get(field.name);
|
||||
SortedEntry entry = sorted.get(field.number);
|
||||
return getSorted(entry);
|
||||
}
|
||||
|
||||
|
@ -1407,7 +1406,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
|
|||
|
||||
@Override
|
||||
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
|
||||
SortedNumericEntry entry = sortedNumerics.get(field.name);
|
||||
SortedNumericEntry entry = sortedNumerics.get(field.number);
|
||||
if (entry.numValues == entry.numDocsWithField) {
|
||||
return DocValues.singleton(getNumeric(entry));
|
||||
}
|
||||
|
@ -1543,7 +1542,7 @@ final class Lucene80DocValuesProducer extends DocValuesProducer {
|
|||
|
||||
@Override
|
||||
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
|
||||
SortedSetEntry entry = sortedSets.get(field.name);
|
||||
SortedSetEntry entry = sortedSets.get(field.number);
|
||||
if (entry.singleValueEntry != null) {
|
||||
return DocValues.singleton(getSorted(entry.singleValueEntry));
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@ import org.apache.lucene.codecs.CodecUtil;
|
|||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.codecs.FieldInfosFormat;
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
import org.apache.lucene.index.DocValuesSkipIndexType;
|
||||
import org.apache.lucene.index.DocValuesType;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldInfos;
|
||||
|
@ -186,7 +187,7 @@ public final class Lucene90FieldInfosFormat extends FieldInfosFormat {
|
|||
storePayloads,
|
||||
indexOptions,
|
||||
docValuesType,
|
||||
false,
|
||||
DocValuesSkipIndexType.NONE,
|
||||
dvGen,
|
||||
attributes,
|
||||
pointDataDimensionCount,
|
||||
|
@ -333,7 +334,7 @@ public final class Lucene90FieldInfosFormat extends FieldInfosFormat {
|
|||
output.writeVInt(fi.number);
|
||||
|
||||
byte bits = 0x0;
|
||||
if (fi.hasVectors()) bits |= STORE_TERMVECTOR;
|
||||
if (fi.hasTermVectors()) bits |= STORE_TERMVECTOR;
|
||||
if (fi.omitsNorms()) bits |= OMIT_NORMS;
|
||||
if (fi.hasPayloads()) bits |= STORE_PAYLOADS;
|
||||
if (fi.isSoftDeletesField()) bits |= SOFT_DELETES_FIELD;
|
||||
|
|
|
@ -224,6 +224,9 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
|
|||
@Override
|
||||
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
|
||||
FieldEntry fieldEntry = fields.get(field);
|
||||
if (fieldEntry == null) {
|
||||
throw new IllegalArgumentException("field=\"" + field + "\" not found");
|
||||
}
|
||||
return getOffHeapVectorValues(fieldEntry);
|
||||
}
|
||||
|
||||
|
|
|
@ -218,6 +218,9 @@ public final class Lucene91HnswVectorsReader extends KnnVectorsReader {
|
|||
@Override
|
||||
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
|
||||
FieldEntry fieldEntry = fields.get(field);
|
||||
if (fieldEntry == null) {
|
||||
throw new IllegalArgumentException("field=\"" + field + "\" not found");
|
||||
}
|
||||
return getOffHeapVectorValues(fieldEntry);
|
||||
}
|
||||
|
||||
|
|
|
@ -215,6 +215,9 @@ public final class Lucene92HnswVectorsReader extends KnnVectorsReader {
|
|||
@Override
|
||||
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
|
||||
FieldEntry fieldEntry = fields.get(field);
|
||||
if (fieldEntry == null) {
|
||||
throw new IllegalArgumentException("field=\"" + field + "\" not found");
|
||||
}
|
||||
return OffHeapFloatVectorValues.load(fieldEntry, vectorData);
|
||||
}
|
||||
|
||||
|
|
|
@ -233,6 +233,9 @@ public final class Lucene94HnswVectorsReader extends KnnVectorsReader {
|
|||
@Override
|
||||
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
|
||||
FieldEntry fieldEntry = fields.get(field);
|
||||
if (fieldEntry == null) {
|
||||
throw new IllegalArgumentException("field=\"" + field + "\" not found");
|
||||
}
|
||||
if (fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
|
||||
throw new IllegalArgumentException(
|
||||
"field=\""
|
||||
|
@ -248,6 +251,9 @@ public final class Lucene94HnswVectorsReader extends KnnVectorsReader {
|
|||
@Override
|
||||
public ByteVectorValues getByteVectorValues(String field) throws IOException {
|
||||
FieldEntry fieldEntry = fields.get(field);
|
||||
if (fieldEntry == null) {
|
||||
throw new IllegalArgumentException("field=\"" + field + "\" not found");
|
||||
}
|
||||
if (fieldEntry.vectorEncoding != VectorEncoding.BYTE) {
|
||||
throw new IllegalArgumentException(
|
||||
"field=\""
|
||||
|
|
|
@ -241,6 +241,9 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
|
|||
@Override
|
||||
public FloatVectorValues getFloatVectorValues(String field) throws IOException {
|
||||
FieldEntry fieldEntry = fields.get(field);
|
||||
if (fieldEntry == null) {
|
||||
throw new IllegalArgumentException("field=\"" + field + "\" not found");
|
||||
}
|
||||
if (fieldEntry.vectorEncoding != VectorEncoding.FLOAT32) {
|
||||
throw new IllegalArgumentException(
|
||||
"field=\""
|
||||
|
@ -264,6 +267,9 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader implements
|
|||
@Override
|
||||
public ByteVectorValues getByteVectorValues(String field) throws IOException {
|
||||
FieldEntry fieldEntry = fields.get(field);
|
||||
if (fieldEntry == null) {
|
||||
throw new IllegalArgumentException("field=\"" + field + "\" not found");
|
||||
}
|
||||
if (fieldEntry.vectorEncoding != VectorEncoding.BYTE) {
|
||||
throw new IllegalArgumentException(
|
||||
"field=\""
|
||||
|
|
|
@ -46,10 +46,10 @@ import org.apache.lucene.store.IndexOutput;
|
|||
* uptos(position, payload). 4. start offset.
|
||||
*/
|
||||
public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
|
||||
private int[] lastSkipDoc;
|
||||
private long[] lastSkipDocPointer;
|
||||
private long[] lastSkipPosPointer;
|
||||
private long[] lastSkipPayPointer;
|
||||
private final int[] lastSkipDoc;
|
||||
private final long[] lastSkipDocPointer;
|
||||
private final long[] lastSkipPosPointer;
|
||||
private final long[] lastSkipPayPointer;
|
||||
|
||||
private final IndexOutput docOut;
|
||||
private final IndexOutput posOut;
|
||||
|
@ -61,7 +61,7 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
|
|||
private long curPayPointer;
|
||||
private int curPosBufferUpto;
|
||||
private int curPayloadByteUpto;
|
||||
private CompetitiveImpactAccumulator[] curCompetitiveFreqNorms;
|
||||
private final CompetitiveImpactAccumulator[] curCompetitiveFreqNorms;
|
||||
private boolean fieldHasPositions;
|
||||
private boolean fieldHasOffsets;
|
||||
private boolean fieldHasPayloads;
|
||||
|
@ -85,7 +85,12 @@ public final class Lucene99SkipWriter extends MultiLevelSkipListWriter {
|
|||
lastSkipPosPointer = new long[maxSkipLevels];
|
||||
if (payOut != null) {
|
||||
lastSkipPayPointer = new long[maxSkipLevels];
|
||||
} else {
|
||||
lastSkipPayPointer = null;
|
||||
}
|
||||
} else {
|
||||
lastSkipPosPointer = null;
|
||||
lastSkipPayPointer = null;
|
||||
}
|
||||
curCompetitiveFreqNorms = new CompetitiveImpactAccumulator[maxSkipLevels];
|
||||
for (int i = 0; i < maxSkipLevels; ++i) {
|
||||
|
|
|
@ -642,13 +642,13 @@ public class BKDWriter60 implements Closeable {
|
|||
throws IOException {
|
||||
assert docMaps == null || readers.size() == docMaps.size();
|
||||
|
||||
BKDMergeQueue queue = new BKDMergeQueue(config.bytesPerDim, readers.size());
|
||||
BKDMergeQueue queue = new BKDMergeQueue(config.bytesPerDim(), readers.size());
|
||||
|
||||
for (int i = 0; i < readers.size(); i++) {
|
||||
PointValues pointValues = readers.get(i);
|
||||
assert pointValues.getNumDimensions() == config.numDims
|
||||
&& pointValues.getBytesPerDimension() == config.bytesPerDim
|
||||
&& pointValues.getNumIndexDimensions() == config.numIndexDims;
|
||||
assert pointValues.getNumDimensions() == config.numDims()
|
||||
&& pointValues.getBytesPerDimension() == config.bytesPerDim()
|
||||
&& pointValues.getNumIndexDimensions() == config.numIndexDims();
|
||||
MergeState.DocMap docMap;
|
||||
if (docMaps == null) {
|
||||
docMap = null;
|
||||
|
@ -1931,7 +1931,7 @@ public class BKDWriter60 implements Closeable {
|
|||
private void computePackedValueBounds(
|
||||
BKDRadixSelector.PathSlice slice, byte[] minPackedValue, byte[] maxPackedValue)
|
||||
throws IOException {
|
||||
try (PointReader reader = slice.writer.getReader(slice.start, slice.count)) {
|
||||
try (PointReader reader = slice.writer().getReader(slice.start(), slice.count())) {
|
||||
if (reader.next() == false) {
|
||||
return;
|
||||
}
|
||||
|
@ -1995,16 +1995,16 @@ public class BKDWriter60 implements Closeable {
|
|||
// least number of unique bytes at commonPrefixLengths[dim], which makes compression more
|
||||
// efficient
|
||||
HeapPointWriter heapSource;
|
||||
if (points.writer instanceof HeapPointWriter == false) {
|
||||
if (points.writer() instanceof HeapPointWriter == false) {
|
||||
// Adversarial cases can cause this, e.g. merging big segments with most of the points
|
||||
// deleted
|
||||
heapSource = switchToHeap(points.writer);
|
||||
heapSource = switchToHeap(points.writer());
|
||||
} else {
|
||||
heapSource = (HeapPointWriter) points.writer;
|
||||
heapSource = (HeapPointWriter) points.writer();
|
||||
}
|
||||
|
||||
int from = Math.toIntExact(points.start);
|
||||
int to = Math.toIntExact(points.start + points.count);
|
||||
int from = Math.toIntExact(points.start());
|
||||
int to = Math.toIntExact(points.start() + points.count());
|
||||
// we store common prefix on scratch1
|
||||
computeCommonPrefixLength(heapSource, scratch1, from, to);
|
||||
|
||||
|
@ -2107,8 +2107,8 @@ public class BKDWriter60 implements Closeable {
|
|||
: "nodeID=" + nodeID + " splitValues.length=" + splitPackedValues.length;
|
||||
|
||||
// How many points will be in the left tree:
|
||||
long rightCount = points.count / 2;
|
||||
long leftCount = points.count - rightCount;
|
||||
long rightCount = points.count() / 2;
|
||||
long leftCount = points.count() - rightCount;
|
||||
|
||||
BKDRadixSelector.PathSlice[] slices = new BKDRadixSelector.PathSlice[2];
|
||||
|
||||
|
@ -2128,9 +2128,9 @@ public class BKDWriter60 implements Closeable {
|
|||
radixSelector.select(
|
||||
points,
|
||||
slices,
|
||||
points.start,
|
||||
points.start + points.count,
|
||||
points.start + leftCount,
|
||||
points.start(),
|
||||
points.start() + points.count(),
|
||||
points.start() + leftCount,
|
||||
splitDim,
|
||||
commonPrefixLen);
|
||||
|
||||
|
|
|
@ -78,4 +78,9 @@ public class TestLucene90HnswVectorsFormat extends BaseKnnVectorsFormatTestCase
|
|||
public void testEmptyByteVectorData() {
|
||||
// unimplemented
|
||||
}
|
||||
|
||||
@Override
|
||||
public void testMergingWithDifferentByteKnnFields() {
|
||||
// unimplemented
|
||||
}
|
||||
}
|
||||
|
|
|
@ -77,4 +77,9 @@ public class TestLucene91HnswVectorsFormat extends BaseKnnVectorsFormatTestCase
|
|||
public void testEmptyByteVectorData() {
|
||||
// unimplemented
|
||||
}
|
||||
|
||||
@Override
|
||||
public void testMergingWithDifferentByteKnnFields() {
|
||||
// unimplemented
|
||||
}
|
||||
}
|
||||
|
|
|
@ -67,4 +67,9 @@ public class TestLucene92HnswVectorsFormat extends BaseKnnVectorsFormatTestCase
|
|||
public void testEmptyByteVectorData() {
|
||||
// unimplemented
|
||||
}
|
||||
|
||||
@Override
|
||||
public void testMergingWithDifferentByteKnnFields() {
|
||||
// unimplemented
|
||||
}
|
||||
}
|
||||
|
|
|
@ -388,10 +388,14 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
|
|||
// write the vector data to a temporary file
|
||||
DocsWithFieldSet docsWithField =
|
||||
switch (fieldInfo.getVectorEncoding()) {
|
||||
case BYTE -> writeByteVectorData(
|
||||
tempVectorData, MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
|
||||
case FLOAT32 -> writeVectorData(
|
||||
tempVectorData, MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
|
||||
case BYTE ->
|
||||
writeByteVectorData(
|
||||
tempVectorData,
|
||||
MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
|
||||
case FLOAT32 ->
|
||||
writeVectorData(
|
||||
tempVectorData,
|
||||
MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
|
||||
};
|
||||
CodecUtil.writeFooter(tempVectorData);
|
||||
IOUtils.close(tempVectorData);
|
||||
|
@ -638,18 +642,20 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
|
|||
throws IOException {
|
||||
int dim = fieldInfo.getVectorDimension();
|
||||
return switch (fieldInfo.getVectorEncoding()) {
|
||||
case BYTE -> new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
|
||||
@Override
|
||||
public byte[] copyValue(byte[] value) {
|
||||
return ArrayUtil.copyOfSubArray(value, 0, dim);
|
||||
}
|
||||
};
|
||||
case FLOAT32 -> new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
|
||||
@Override
|
||||
public float[] copyValue(float[] value) {
|
||||
return ArrayUtil.copyOfSubArray(value, 0, dim);
|
||||
}
|
||||
};
|
||||
case BYTE ->
|
||||
new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
|
||||
@Override
|
||||
public byte[] copyValue(byte[] value) {
|
||||
return ArrayUtil.copyOfSubArray(value, 0, dim);
|
||||
}
|
||||
};
|
||||
case FLOAT32 ->
|
||||
new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
|
||||
@Override
|
||||
public float[] copyValue(float[] value) {
|
||||
return ArrayUtil.copyOfSubArray(value, 0, dim);
|
||||
}
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -663,12 +669,14 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
|
|||
DefaultFlatVectorScorer defaultFlatVectorScorer = new DefaultFlatVectorScorer();
|
||||
RandomVectorScorerSupplier scorerSupplier =
|
||||
switch (fieldInfo.getVectorEncoding()) {
|
||||
case BYTE -> defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
||||
fieldInfo.getVectorSimilarityFunction(),
|
||||
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
|
||||
case FLOAT32 -> defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
||||
fieldInfo.getVectorSimilarityFunction(),
|
||||
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
|
||||
case BYTE ->
|
||||
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
||||
fieldInfo.getVectorSimilarityFunction(),
|
||||
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
|
||||
case FLOAT32 ->
|
||||
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
|
||||
fieldInfo.getVectorSimilarityFunction(),
|
||||
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
|
||||
};
|
||||
hnswGraphBuilder =
|
||||
HnswGraphBuilder.create(scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed);
|
||||
|
@ -693,9 +701,9 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
|
|||
lastDocID = docID;
|
||||
}
|
||||
|
||||
OnHeapHnswGraph getGraph() {
|
||||
OnHeapHnswGraph getGraph() throws IOException {
|
||||
if (vectors.size() > 0) {
|
||||
return hnswGraphBuilder.getGraph();
|
||||
return hnswGraphBuilder.getCompletedGraph();
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
package org.apache.lucene.backward_codecs.lucene95;
|
||||
|
||||
import static org.apache.lucene.backward_codecs.lucene95.Lucene95HnswVectorsFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
|
||||
import static org.apache.lucene.codecs.KnnVectorsWriter.MergedVectorValues.hasVectorValues;
|
||||
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
|
||||
|
||||
import java.io.IOException;
|
||||
|
@ -414,10 +415,14 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
|
|||
// write the vector data to a temporary file
|
||||
DocsWithFieldSet docsWithField =
|
||||
switch (fieldInfo.getVectorEncoding()) {
|
||||
case BYTE -> writeByteVectorData(
|
||||
tempVectorData, MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
|
||||
case FLOAT32 -> writeVectorData(
|
||||
tempVectorData, MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
|
||||
case BYTE ->
|
||||
writeByteVectorData(
|
||||
tempVectorData,
|
||||
MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState));
|
||||
case FLOAT32 ->
|
||||
writeVectorData(
|
||||
tempVectorData,
|
||||
MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState));
|
||||
};
|
||||
CodecUtil.writeFooter(tempVectorData);
|
||||
IOUtils.close(tempVectorData);
|
||||
|
@@ -472,15 +477,19 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
IncrementalHnswGraphMerger merger =
new IncrementalHnswGraphMerger(fieldInfo, scorerSupplier, M, beamWidth);
for (int i = 0; i < mergeState.liveDocs.length; i++) {
merger.addReader(
mergeState.knnVectorsReaders[i], mergeState.docMaps[i], mergeState.liveDocs[i]);
if (hasVectorValues(mergeState.fieldInfos[i], fieldInfo.name)) {
merger.addReader(
mergeState.knnVectorsReaders[i], mergeState.docMaps[i], mergeState.liveDocs[i]);
}
}
DocIdSetIterator mergedVectorIterator = null;
switch (fieldInfo.getVectorEncoding()) {
case BYTE -> mergedVectorIterator =
KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState);
case FLOAT32 -> mergedVectorIterator =
KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState);
case BYTE ->
mergedVectorIterator =
KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues(fieldInfo, mergeState);
case FLOAT32 ->
mergedVectorIterator =
KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState);
}
graph =
merger.merge(
@@ -680,18 +689,20 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
throws IOException {
int dim = fieldInfo.getVectorDimension();
return switch (fieldInfo.getVectorEncoding()) {
case BYTE -> new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
@Override
public byte[] copyValue(byte[] value) {
return ArrayUtil.copyOfSubArray(value, 0, dim);
}
};
case FLOAT32 -> new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
@Override
public float[] copyValue(float[] value) {
return ArrayUtil.copyOfSubArray(value, 0, dim);
}
};
case BYTE ->
new FieldWriter<byte[]>(fieldInfo, M, beamWidth, infoStream) {
@Override
public byte[] copyValue(byte[] value) {
return ArrayUtil.copyOfSubArray(value, 0, dim);
}
};
case FLOAT32 ->
new FieldWriter<float[]>(fieldInfo, M, beamWidth, infoStream) {
@Override
public float[] copyValue(float[] value) {
return ArrayUtil.copyOfSubArray(value, 0, dim);
}
};
};
}

@@ -704,12 +715,14 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
vectors = new ArrayList<>();
RandomVectorScorerSupplier scorerSupplier =
switch (fieldInfo.getVectorEncoding()) {
case BYTE -> defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
case FLOAT32 -> defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
case BYTE ->
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromBytes((List<byte[]>) vectors, dim));
case FLOAT32 ->
defaultFlatVectorScorer.getRandomVectorScorerSupplier(
fieldInfo.getVectorSimilarityFunction(),
RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
};
hnswGraphBuilder =
HnswGraphBuilder.create(scorerSupplier, M, beamWidth, HnswGraphBuilder.randSeed);
@@ -732,9 +745,9 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter {
lastDocID = docID;
}

OnHeapHnswGraph getGraph() {
OnHeapHnswGraph getGraph() throws IOException {
if (vectors.size() > 0) {
return hnswGraphBuilder.getGraph();
return hnswGraphBuilder.getCompletedGraph();
} else {
return null;
}
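Note: the writer hunks above are behavioral as well as cosmetic: at flush time the builder is now asked for a finished graph. A rough sketch of the pattern, pieced together only from calls visible in this diff (vectors, dim, M and beamWidth are the surrounding fields, not new API):

    // Score supplier over the buffered raw vectors, then build the HNSW graph.
    RandomVectorScorerSupplier supplier =
        defaultFlatVectorScorer.getRandomVectorScorerSupplier(
            fieldInfo.getVectorSimilarityFunction(),
            RandomAccessVectorValues.fromFloats((List<float[]>) vectors, dim));
    HnswGraphBuilder builder =
        HnswGraphBuilder.create(supplier, M, beamWidth, HnswGraphBuilder.randSeed);
    // ... nodes are added as documents arrive ...
    // getCompletedGraph() replaces getGraph(): it finalizes the graph and may throw IOException.
    OnHeapHnswGraph graph = vectors.isEmpty() ? null : builder.getCompletedGraph();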
@@ -72,7 +72,7 @@ public class TestIndexSortBackwardsCompatibility extends BackwardsCompatibilityT
final Sort sort;
try (DirectoryReader reader = DirectoryReader.open(directory)) {
assertEquals(1, reader.leaves().size());
sort = reader.leaves().get(0).reader().getMetaData().getSort();
sort = reader.leaves().get(0).reader().getMetaData().sort();
assertNotNull(sort);
searchExampleIndex(reader);
}
@@ -125,8 +125,8 @@ public class TestIndexSortBackwardsCompatibility extends BackwardsCompatibilityT
.add(new TermQuery(new Term("bid", "" + i)), BooleanClause.Occur.MUST)
.build(),
2);
assertEquals(2, children.totalHits.value);
assertEquals(1, parents.totalHits.value);
assertEquals(2, children.totalHits.value());
assertEquals(1, parents.totalHits.value());
// make sure it's sorted
assertEquals(children.scoreDocs[0].doc + 1, children.scoreDocs[1].doc);
assertEquals(children.scoreDocs[1].doc + 1, parents.scoreDocs[0].doc);
@@ -140,7 +140,7 @@ public class TestIndexSortBackwardsCompatibility extends BackwardsCompatibilityT
public void testSortedIndex() throws Exception {
try (DirectoryReader reader = DirectoryReader.open(directory)) {
assertEquals(1, reader.leaves().size());
Sort sort = reader.leaves().get(0).reader().getMetaData().getSort();
Sort sort = reader.leaves().get(0).reader().getMetaData().sort();
assertNotNull(sort);
assertEquals("<long: \"dateDV\">!", sort.toString());
// This will confirm the docs are really sorted
@@ -195,28 +195,28 @@ public class TestIndexSortBackwardsCompatibility extends BackwardsCompatibilityT
IndexSearcher searcher = newSearcher(reader);

TopDocs topDocs = searcher.search(new FieldExistsQuery("titleTokenized"), 10);
assertEquals(50, topDocs.totalHits.value);
assertEquals(50, topDocs.totalHits.value());

topDocs = searcher.search(new FieldExistsQuery("titleDV"), 10);
assertEquals(50, topDocs.totalHits.value);
assertEquals(50, topDocs.totalHits.value());

topDocs =
searcher.search(
IntPoint.newRangeQuery("docid_int", 42, 44),
10,
new Sort(new SortField("docid_intDV", SortField.Type.INT)));
assertEquals(3, topDocs.totalHits.value);
assertEquals(3, topDocs.totalHits.value());
assertEquals(3, topDocs.scoreDocs.length);
assertEquals(42, ((FieldDoc) topDocs.scoreDocs[0]).fields[0]);
assertEquals(43, ((FieldDoc) topDocs.scoreDocs[1]).fields[0]);
assertEquals(44, ((FieldDoc) topDocs.scoreDocs[2]).fields[0]);

topDocs = searcher.search(new TermQuery(new Term("body", "the")), 5);
assertTrue(topDocs.totalHits.value > 0);
assertTrue(topDocs.totalHits.value() > 0);
topDocs =
searcher.search(
new MatchAllDocsQuery(), 5, new Sort(new SortField("dateDV", SortField.Type.LONG)));
assertEquals(50, topDocs.totalHits.value);
assertEquals(50, topDocs.totalHits.value());
assertEquals(5, topDocs.scoreDocs.length);
long firstDate = (Long) ((FieldDoc) topDocs.scoreDocs[0]).fields[0];
long lastDate = (Long) ((FieldDoc) topDocs.scoreDocs[4]).fields[0];
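Note: the assertion churn in this hunk is the TotalHits migration: the hit count and relation are now read through accessors instead of public fields. A minimal before/after sketch (searcher and query are placeholders, not part of the patch):

    TopDocs topDocs = searcher.search(query, 10);
    long count = topDocs.totalHits.value();                                      // was: topDocs.totalHits.value
    boolean exact = topDocs.totalHits.relation() == TotalHits.Relation.EQUAL_TO; // was: .relation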
@@ -0,0 +1,108 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.jmh;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.codecs.lucene912.ForDeltaUtil;
import org.apache.lucene.codecs.lucene912.ForUtil;
import org.apache.lucene.codecs.lucene912.PostingIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.IOUtils;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;

@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
@State(Scope.Benchmark)
@Warmup(iterations = 5, time = 1)
@Measurement(iterations = 5, time = 1)
@Fork(
value = 3,
jvmArgsAppend = {"-Xmx1g", "-Xms1g", "-XX:+AlwaysPreTouch"})
public class PostingIndexInputBenchmark {

private Path path;
private Directory dir;
private IndexInput in;
private PostingIndexInput postingIn;
private final ForUtil forUtil = new ForUtil();
private final ForDeltaUtil forDeltaUtil = new ForDeltaUtil();
private final long[] values = new long[128];

@Param({"2", "3", "4", "5", "6", "7", "8", "9", "10"})
public int bpv;

@Setup(Level.Trial)
public void setup() throws Exception {
path = Files.createTempDirectory("forUtil");
dir = MMapDirectory.open(path);
try (IndexOutput out = dir.createOutput("docs", IOContext.DEFAULT)) {
Random r = new Random(0);
// Write enough random data to not reach EOF while decoding
for (int i = 0; i < 100; ++i) {
out.writeLong(r.nextLong());
}
}
in = dir.openInput("docs", IOContext.DEFAULT);
postingIn = new PostingIndexInput(in, forUtil, forDeltaUtil);
}

@TearDown(Level.Trial)
public void tearDown() throws Exception {
if (dir != null) {
dir.deleteFile("docs");
}
IOUtils.close(in, dir);
in = null;
dir = null;
Files.deleteIfExists(path);
}

@Benchmark
public void decode(Blackhole bh) throws IOException {
in.seek(3); // random unaligned offset
postingIn.decode(bpv, values);
bh.consume(values);
}

@Benchmark
public void decodeAndPrefixSum(Blackhole bh) throws IOException {
in.seek(3); // random unaligned offset
postingIn.decodeAndPrefixSum(bpv, 100, values);
bh.consume(values);
}
}
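Note: the file added above is a standard JMH microbenchmark. A purely illustrative way to launch it through the plain JMH runner API (the project normally drives JMH through its gradle benchmark tooling; the class and parameter names below come from the file itself):

    // Assumed imports: org.openjdk.jmh.runner.Runner, org.openjdk.jmh.runner.options.Options, OptionsBuilder
    Options opt =
        new OptionsBuilder()
            .include(PostingIndexInputBenchmark.class.getSimpleName())
            .param("bpv", "4") // one of the @Param values declared above
            .build();
    new Runner(opt).run();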
@@ -17,11 +17,10 @@
# -------------------------------------------------------------------------------------
# multi val params are iterated by NewRound's, added to reports, start with column name.

# collector.class can be:
# Fully Qualified Class Name of a Collector with a empty constructor
# topScoreDocOrdered - Creates a TopScoreDocCollector that requires in order docs
# topScoreDocUnordered - Like above, but allows out of order
collector.class=coll:topScoreDoc
# collector.manager.class can be:
# Fully Qualified Class Name of a CollectorManager with a empty constructor
# topScoreDoc - Creates a TopScoreDocCollectorManager
collector.manager.class=coll:topScoreDoc

analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer
directory=FSDirectory
@@ -17,11 +17,10 @@
# -------------------------------------------------------------------------------------
# multi val params are iterated by NewRound's, added to reports, start with column name.

# collector.class can be:
# Fully Qualified Class Name of a Collector with a empty constructor
# topScoreDocOrdered - Creates a TopScoreDocCollector that requires in order docs
# topScoreDocUnordered - Like above, but allows out of order
collector.class=coll:topScoreDoc
# collector.manager.class can be:
# Fully Qualified Class Name of a CollectorManager with a empty constructor
# topScoreDoc - Creates a TopScoreDocCollectorManager
collector.manager.class=coll:topScoreDoc

analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer
directory=FSDirectory
@@ -238,7 +238,7 @@ public class EnwikiContentSource extends ContentSource {
time = null;
id = null;
break;
// intentional fall-through.
// intentional fall-through.
case BODY:
case DATE:
case TITLE:
@@ -99,7 +99,7 @@ public class SpatialDocMaker extends DocMaker {
return makeRPTStrategy(SPATIAL_FIELD, config, configMap, ctx);
case "composite":
return makeCompositeStrategy(config, configMap, ctx);
// TODO add more as-needed
// TODO add more as-needed
default:
throw new IllegalStateException("Unknown spatial.strategy: " + strategyName);
}
@@ -24,7 +24,7 @@ import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiBits;
import org.apache.lucene.index.StoredFields;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.CollectorManager;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
@@ -119,9 +119,7 @@ public abstract class ReadTask extends PerfTask {
hits = searcher.search(q, numHits);
}
} else {
Collector collector = createCollector();

searcher.search(q, collector);
searcher.search(q, createCollectorManager());
// hits = collector.topDocs();
}

@@ -184,9 +182,8 @@ public abstract class ReadTask extends PerfTask {
return res;
}

protected Collector createCollector() throws Exception {
return new TopScoreDocCollectorManager(numHits(), withTotalHits() ? Integer.MAX_VALUE : 1)
.newCollector();
protected CollectorManager<?, ?> createCollectorManager() throws Exception {
return new TopScoreDocCollectorManager(numHits(), withTotalHits() ? Integer.MAX_VALUE : 1);
}

protected Document retrieveDoc(StoredFields storedFields, int id) throws IOException {
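Note: ReadTask now searches through a CollectorManager rather than a single Collector, so IndexSearcher can create, fill and reduce one collector per leaf (and potentially run them concurrently). A minimal sketch of the call pattern this hunk switches to (searcher, query and numHits are placeholders):

    TopScoreDocCollectorManager manager =
        new TopScoreDocCollectorManager(numHits, Integer.MAX_VALUE); // second argument: total-hit-count threshold
    TopDocs hits = searcher.search(query, manager);                  // reduced result of the per-leaf collectors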
@@ -19,7 +19,7 @@ package org.apache.lucene.benchmark.byTask.tasks;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.CollectorManager;
import org.apache.lucene.search.TopScoreDocCollectorManager;

/** Does search w/ a custom collector */
@@ -37,7 +37,11 @@ public class SearchWithCollectorTask extends SearchTask {
// check to make sure either the doc is being stored
PerfRunData runData = getRunData();
Config config = runData.getConfig();
clnName = config.get("collector.class", "");
if (config.get("collector.class", null) != null) {
throw new IllegalArgumentException(
"collector.class is no longer supported as a config parameter, use collector.manager.class instead to provide a CollectorManager class name");
}
clnName = config.get("collector.manager.class", "");
}

@Override
@@ -46,18 +50,17 @@ public class SearchWithCollectorTask extends SearchTask {
}

@Override
protected Collector createCollector() throws Exception {
Collector collector = null;
protected CollectorManager<?, ?> createCollectorManager() throws Exception {
CollectorManager<?, ?> collectorManager;
if (clnName.equalsIgnoreCase("topScoreDoc") == true) {
collector =
new TopScoreDocCollectorManager(numHits(), null, Integer.MAX_VALUE, false).newCollector();
collectorManager = new TopScoreDocCollectorManager(numHits(), Integer.MAX_VALUE);
} else if (clnName.length() > 0) {
collector = Class.forName(clnName).asSubclass(Collector.class).getConstructor().newInstance();

collectorManager =
Class.forName(clnName).asSubclass(CollectorManager.class).getConstructor().newInstance();
} else {
collector = super.createCollector();
collectorManager = super.createCollectorManager();
}
return collector;
return collectorManager;
}

@Override
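Note: with collector.class rejected outright, a custom class named by collector.manager.class is loaded reflectively as shown above, so it has to implement CollectorManager and provide a public no-arg constructor. Sketch of that path with a hypothetical class name:

    CollectorManager<?, ?> manager =
        Class.forName("org.example.MyCollectorManager") // hypothetical user-supplied class
            .asSubclass(CollectorManager.class)
            .getConstructor()
            .newInstance();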
@@ -91,7 +91,7 @@ public class TestDocMaker extends BenchmarkTestCase {
IndexReader reader = DirectoryReader.open(runData.getDirectory());
IndexSearcher searcher = newSearcher(reader);
TopDocs td = searcher.search(new TermQuery(new Term("key", "value")), 10);
assertEquals(numExpectedResults, td.totalHits.value);
assertEquals(numExpectedResults, td.totalHits.value());
reader.close();
}

@@ -160,7 +160,7 @@ public class TestLineDocSource extends BenchmarkTestCase {
reader = DirectoryReader.open(runData.getDirectory());
searcher = newSearcher(reader);
TopDocs td = searcher.search(new TermQuery(new Term("body", "body")), 10);
assertEquals(numAdds, td.totalHits.value);
assertEquals(numAdds, td.totalHits.value());
assertNotNull(td.scoreDocs[0]);

if (storedField == null) {
@@ -151,13 +151,13 @@ public class BM25NBClassifier implements Classifier<BytesRef> {
if (!assignedClasses.isEmpty()) {
Collections.sort(assignedClasses);
// this is a negative number closest to 0 = a
double smax = assignedClasses.get(0).getScore();
double smax = assignedClasses.get(0).score();

double sumLog = 0;
// log(sum(exp(x_n-a)))
for (ClassificationResult<BytesRef> cr : assignedClasses) {
// getScore-smax <=0 (both negative, smax is the smallest abs()
sumLog += Math.exp(cr.getScore() - smax);
sumLog += Math.exp(cr.score() - smax);
}
// loga=a+log(sum(exp(x_n-a))) = log(sum(exp(x_n)))
double loga = smax;
@@ -165,8 +165,8 @@ public class BM25NBClassifier implements Classifier<BytesRef> {

// 1/sum*x = exp(log(x))*1/sum = exp(log(x)-log(sum))
for (ClassificationResult<BytesRef> cr : assignedClasses) {
double scoreDiff = cr.getScore() - loga;
returnList.add(new ClassificationResult<>(cr.getAssignedClass(), Math.exp(scoreDiff)));
double scoreDiff = cr.score() - loga;
returnList.add(new ClassificationResult<>(cr.assignedClass(), Math.exp(scoreDiff)));
}
}
return returnList;
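Note: the two hunks above only rename accessors; the computation they touch is the usual log-sum-exp normalization spelled out in the inline comments. With raw log scores x_n and a = max_n x_n (the negative value closest to zero), log(sum_n exp(x_n)) = a + log(sum_n exp(x_n - a)), and each normalized score is exp(x_n - log(sum_n exp(x_n))); subtracting a first keeps the exponentials from underflowing.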
@@ -216,7 +216,7 @@ public class BM25NBClassifier implements Classifier<BytesRef> {
|
|||
builder.add(query, BooleanClause.Occur.MUST);
|
||||
}
|
||||
TopDocs search = indexSearcher.search(builder.build(), 1);
|
||||
return search.totalHits.value > 0 ? search.scoreDocs[0].score : 1;
|
||||
return search.totalHits.value() > 0 ? search.scoreDocs[0].score : 1;
|
||||
}
|
||||
|
||||
private double calculateLogPrior(Term term) throws IOException {
|
||||
|
@@ -227,6 +227,6 @@ public class BM25NBClassifier implements Classifier<BytesRef> {
|
|||
bq.add(query, BooleanClause.Occur.MUST);
|
||||
}
|
||||
TopDocs topDocs = indexSearcher.search(bq.build(), 1);
|
||||
return topDocs.totalHits.value > 0 ? Math.log(topDocs.scoreDocs[0].score) : 0;
|
||||
return topDocs.totalHits.value() > 0 ? Math.log(topDocs.scoreDocs[0].score) : 0;
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -148,7 +148,7 @@ public class BooleanPerceptronClassifier implements Classifier<Boolean> {
|
|||
if (textField != null && classField != null) {
|
||||
// assign class to the doc
|
||||
ClassificationResult<Boolean> classificationResult = assignClass(textField.stringValue());
|
||||
Boolean assignedClass = classificationResult.getAssignedClass();
|
||||
Boolean assignedClass = classificationResult.assignedClass();
|
||||
|
||||
Boolean correctClass = Boolean.valueOf(classField.stringValue());
|
||||
double modifier = Math.signum(correctClass.compareTo(assignedClass));
|
||||
|
|
|
@@ -126,7 +126,7 @@ public class CachingNaiveBayesClassifier extends SimpleNaiveBayesClassifier {
|
|||
int removeIdx = -1;
|
||||
int i = 0;
|
||||
for (ClassificationResult<BytesRef> cr : ret) {
|
||||
if (cr.getAssignedClass().equals(cclass)) {
|
||||
if (cr.assignedClass().equals(cclass)) {
|
||||
removeIdx = i;
|
||||
break;
|
||||
}
|
||||
|
@@ -137,7 +137,7 @@ public class CachingNaiveBayesClassifier extends SimpleNaiveBayesClassifier {
|
|||
ClassificationResult<BytesRef> toRemove = ret.get(removeIdx);
|
||||
ret.add(
|
||||
new ClassificationResult<>(
|
||||
toRemove.getAssignedClass(), toRemove.getScore() + Math.log(wordProbability)));
|
||||
toRemove.assignedClass(), toRemove.score() + Math.log(wordProbability)));
|
||||
ret.remove(removeIdx);
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -20,44 +20,15 @@ package org.apache.lucene.classification;
* The result of a call to {@link Classifier#assignClass(String)} holding an assigned class of type
* <code>T</code> and a score.
*
* @param assignedClass the class <code>T</code> assigned by a {@link Classifier}
* @param score score the score for the assignedClass as a <code>double</code>
* @lucene.experimental
*/
public class ClassificationResult<T> implements Comparable<ClassificationResult<T>> {

private final T assignedClass;
private final double score;

/**
* Constructor
*
* @param assignedClass the class <code>T</code> assigned by a {@link Classifier}
* @param score the score for the assignedClass as a <code>double</code>
*/
public ClassificationResult(T assignedClass, double score) {
this.assignedClass = assignedClass;
this.score = score;
}

/**
* retrieve the result class
*
* @return a <code>T</code> representing an assigned class
*/
public T getAssignedClass() {
return assignedClass;
}

/**
* retrieve the result score
*
* @return a <code>double</code> representing a result score
*/
public double getScore() {
return score;
}
public record ClassificationResult<T>(T assignedClass, double score)
implements Comparable<ClassificationResult<T>> {

@Override
public int compareTo(ClassificationResult<T> o) {
return Double.compare(o.getScore(), this.getScore());
return Double.compare(o.score(), this.score());
}
}
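Note: ClassificationResult is now a record, so the compiler-generated accessors replace the hand-written getters, and the remaining classification hunks in this diff are mechanical call-site updates. Caller-side sketch (classifier and text are placeholders):

    ClassificationResult<BytesRef> result = classifier.assignClass(text);
    BytesRef assigned = result.assignedClass(); // was: result.getAssignedClass()
    double score = result.score();              // was: result.getScore()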
@@ -108,9 +108,9 @@ public class KNearestFuzzyClassifier implements Classifier<BytesRef> {
|
|||
ClassificationResult<BytesRef> assignedClass = null;
|
||||
double maxscore = -Double.MAX_VALUE;
|
||||
for (ClassificationResult<BytesRef> cl : assignedClasses) {
|
||||
if (cl.getScore() > maxscore) {
|
||||
if (cl.score() > maxscore) {
|
||||
assignedClass = cl;
|
||||
maxscore = cl.getScore();
|
||||
maxscore = cl.score();
|
||||
}
|
||||
}
|
||||
return assignedClass;
|
||||
|
@@ -159,7 +159,7 @@ public class KNearestFuzzyClassifier implements Classifier<BytesRef> {
|
|||
Map<BytesRef, Integer> classCounts = new HashMap<>();
|
||||
Map<BytesRef, Double> classBoosts =
|
||||
new HashMap<>(); // this is a boost based on class ranking positions in topDocs
|
||||
float maxScore = topDocs.totalHits.value == 0 ? Float.NaN : topDocs.scoreDocs[0].score;
|
||||
float maxScore = topDocs.totalHits.value() == 0 ? Float.NaN : topDocs.scoreDocs[0].score;
|
||||
StoredFields storedFields = indexSearcher.storedFields();
|
||||
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
|
||||
IndexableField storableField = storedFields.document(scoreDoc.doc).getField(classFieldName);
|
||||
|
@@ -193,7 +193,7 @@ public class KNearestFuzzyClassifier implements Classifier<BytesRef> {
|
|||
if (sumdoc < k) {
|
||||
for (ClassificationResult<BytesRef> cr : temporaryList) {
|
||||
returnList.add(
|
||||
new ClassificationResult<>(cr.getAssignedClass(), cr.getScore() * k / (double) sumdoc));
|
||||
new ClassificationResult<>(cr.assignedClass(), cr.score() * k / (double) sumdoc));
|
||||
}
|
||||
} else {
|
||||
returnList = temporaryList;
|
||||
|
|
|
@@ -129,9 +129,9 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
|
|||
ClassificationResult<BytesRef> assignedClass = null;
|
||||
double maxscore = -Double.MAX_VALUE;
|
||||
for (ClassificationResult<BytesRef> cl : assignedClasses) {
|
||||
if (cl.getScore() > maxscore) {
|
||||
if (cl.score() > maxscore) {
|
||||
assignedClass = cl;
|
||||
maxscore = cl.getScore();
|
||||
maxscore = cl.score();
|
||||
}
|
||||
}
|
||||
return assignedClass;
|
||||
|
@@ -192,7 +192,7 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
|
|||
Map<BytesRef, Integer> classCounts = new HashMap<>();
|
||||
Map<BytesRef, Double> classBoosts =
|
||||
new HashMap<>(); // this is a boost based on class ranking positions in topDocs
|
||||
float maxScore = topDocs.totalHits.value == 0 ? Float.NaN : topDocs.scoreDocs[0].score;
|
||||
float maxScore = topDocs.totalHits.value() == 0 ? Float.NaN : topDocs.scoreDocs[0].score;
|
||||
StoredFields storedFields = indexSearcher.storedFields();
|
||||
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
|
||||
IndexableField[] storableFields =
|
||||
|
@@ -229,7 +229,7 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
|
|||
if (sumdoc < k) {
|
||||
for (ClassificationResult<BytesRef> cr : temporaryList) {
|
||||
returnList.add(
|
||||
new ClassificationResult<>(cr.getAssignedClass(), cr.getScore() * k / (double) sumdoc));
|
||||
new ClassificationResult<>(cr.assignedClass(), cr.score() * k / (double) sumdoc));
|
||||
}
|
||||
} else {
|
||||
returnList = temporaryList;
|
||||
|
|
|
@@ -105,9 +105,9 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
|
|||
ClassificationResult<BytesRef> assignedClass = null;
|
||||
double maxscore = -Double.MAX_VALUE;
|
||||
for (ClassificationResult<BytesRef> c : assignedClasses) {
|
||||
if (c.getScore() > maxscore) {
|
||||
if (c.score() > maxscore) {
|
||||
assignedClass = c;
|
||||
maxscore = c.getScore();
|
||||
maxscore = c.score();
|
||||
}
|
||||
}
|
||||
return assignedClass;
|
||||
|
@@ -297,13 +297,13 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
|
|||
if (!assignedClasses.isEmpty()) {
|
||||
Collections.sort(assignedClasses);
|
||||
// this is a negative number closest to 0 = a
|
||||
double smax = assignedClasses.get(0).getScore();
|
||||
double smax = assignedClasses.get(0).score();
|
||||
|
||||
double sumLog = 0;
|
||||
// log(sum(exp(x_n-a)))
|
||||
for (ClassificationResult<BytesRef> cr : assignedClasses) {
|
||||
// getScore-smax <=0 (both negative, smax is the smallest abs()
|
||||
sumLog += Math.exp(cr.getScore() - smax);
|
||||
sumLog += Math.exp(cr.score() - smax);
|
||||
}
|
||||
// loga=a+log(sum(exp(x_n-a))) = log(sum(exp(x_n)))
|
||||
double loga = smax;
|
||||
|
@@ -311,8 +311,8 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
|
|||
|
||||
// 1/sum*x = exp(log(x))*1/sum = exp(log(x)-log(sum))
|
||||
for (ClassificationResult<BytesRef> cr : assignedClasses) {
|
||||
double scoreDiff = cr.getScore() - loga;
|
||||
returnList.add(new ClassificationResult<>(cr.getAssignedClass(), Math.exp(scoreDiff)));
|
||||
double scoreDiff = cr.score() - loga;
|
||||
returnList.add(new ClassificationResult<>(cr.assignedClass(), Math.exp(scoreDiff)));
|
||||
}
|
||||
}
|
||||
return returnList;
|
||||
|
|
|
@@ -80,9 +80,9 @@ public class SimpleNaiveBayesDocumentClassifier extends SimpleNaiveBayesClassifi
|
|||
ClassificationResult<BytesRef> assignedClass = null;
|
||||
double maxscore = -Double.MAX_VALUE;
|
||||
for (ClassificationResult<BytesRef> c : assignedClasses) {
|
||||
if (c.getScore() > maxscore) {
|
||||
if (c.score() > maxscore) {
|
||||
assignedClass = c;
|
||||
maxscore = c.getScore();
|
||||
maxscore = c.score();
|
||||
}
|
||||
}
|
||||
return assignedClass;
|
||||
|
|
|
@@ -107,7 +107,7 @@ public class ConfusionMatrixGenerator {
|
|||
time += end - start;
|
||||
|
||||
if (result != null) {
|
||||
T assignedClass = result.getAssignedClass();
|
||||
T assignedClass = result.assignedClass();
|
||||
if (assignedClass != null) {
|
||||
counter++;
|
||||
String classified =
|
||||
|
|
|
@@ -138,13 +138,13 @@ public class DatasetSplitter {
|
|||
// iterate over existing documents
|
||||
StoredFields storedFields = originalIndex.storedFields();
|
||||
for (GroupDocs<Object> group : topGroups.groups) {
|
||||
assert group.totalHits.relation == TotalHits.Relation.EQUAL_TO;
|
||||
long totalHits = group.totalHits.value;
|
||||
assert group.totalHits().relation() == TotalHits.Relation.EQUAL_TO;
|
||||
long totalHits = group.totalHits().value();
|
||||
double testSize = totalHits * testRatio;
|
||||
int tc = 0;
|
||||
double cvSize = totalHits * crossValidationRatio;
|
||||
int cvc = 0;
|
||||
for (ScoreDoc scoreDoc : group.scoreDocs) {
|
||||
for (ScoreDoc scoreDoc : group.scoreDocs()) {
|
||||
|
||||
// create a new document for indexing
|
||||
Document doc = createNewDoc(storedFields, ft, scoreDoc, fieldNames);
|
||||
|
|
|
@@ -91,7 +91,7 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {
|
|||
Classifier<T> classifier, String inputDoc, T expectedResult) throws Exception {
|
||||
ClassificationResult<T> classificationResult = classifier.assignClass(inputDoc);
|
||||
assertNotNull(classificationResult);
|
||||
T assignedClass = classificationResult.getAssignedClass();
|
||||
T assignedClass = classificationResult.assignedClass();
|
||||
assertNotNull(assignedClass);
|
||||
assertEquals(
|
||||
"got an assigned class of " + assignedClass,
|
||||
|
@@ -101,7 +101,7 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {
|
|||
assignedClass instanceof BytesRef
|
||||
? ((BytesRef) assignedClass).utf8ToString()
|
||||
: assignedClass);
|
||||
double score = classificationResult.getScore();
|
||||
double score = classificationResult.score();
|
||||
assertTrue("score should be between 0 and 1, got:" + score, score <= 1 && score >= 0);
|
||||
return classificationResult;
|
||||
}
|
||||
|
@@ -130,18 +130,17 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {
|
|||
getSampleIndex(analyzer);
|
||||
|
||||
ClassificationResult<T> classificationResult = classifier.assignClass(inputDoc);
|
||||
assertNotNull(classificationResult.getAssignedClass());
|
||||
assertNotNull(classificationResult.assignedClass());
|
||||
assertEquals(
|
||||
"got an assigned class of " + classificationResult.getAssignedClass(),
|
||||
"got an assigned class of " + classificationResult.assignedClass(),
|
||||
expectedResult,
|
||||
classificationResult.getAssignedClass());
|
||||
double score = classificationResult.getScore();
|
||||
classificationResult.assignedClass());
|
||||
double score = classificationResult.score();
|
||||
assertTrue("score should be between 0 and 1, got: " + score, score <= 1 && score >= 0);
|
||||
updateSampleIndex();
|
||||
ClassificationResult<T> secondClassificationResult = classifier.assignClass(inputDoc);
|
||||
assertEquals(
|
||||
classificationResult.getAssignedClass(), secondClassificationResult.getAssignedClass());
|
||||
assertEquals(Double.valueOf(score), Double.valueOf(secondClassificationResult.getScore()));
|
||||
assertEquals(classificationResult.assignedClass(), secondClassificationResult.assignedClass());
|
||||
assertEquals(Double.valueOf(score), Double.valueOf(secondClassificationResult.score()));
|
||||
}
|
||||
|
||||
protected LeafReader getSampleIndex(Analyzer analyzer) throws IOException {
|
||||
|
|
|
@@ -88,7 +88,7 @@ public class TestKNearestNeighborClassifier extends ClassificationTestBase<Bytes
|
|||
textFieldName),
|
||||
TECHNOLOGY_INPUT,
|
||||
TECHNOLOGY_RESULT);
|
||||
assertTrue(resultDS.getScore() != resultLMS.getScore());
|
||||
assertTrue(resultDS.score() != resultLMS.score());
|
||||
} finally {
|
||||
IOUtils.close(leafReader);
|
||||
}
|
||||
|
@@ -113,7 +113,7 @@ public class TestKNearestNeighborClassifier extends ClassificationTestBase<Bytes
|
|||
leafReader, null, analyzer, null, 6, 1, 1, categoryFieldName, textFieldName);
|
||||
List<ClassificationResult<BytesRef>> classes =
|
||||
knnClassifier.getClasses(STRONG_TECHNOLOGY_INPUT);
|
||||
assertTrue(classes.get(0).getScore() > classes.get(1).getScore());
|
||||
assertTrue(classes.get(0).score() > classes.get(1).score());
|
||||
checkCorrectClassification(knnClassifier, STRONG_TECHNOLOGY_INPUT, TECHNOLOGY_RESULT);
|
||||
} finally {
|
||||
IOUtils.close(leafReader);
|
||||
|
@@ -139,7 +139,7 @@ public class TestKNearestNeighborClassifier extends ClassificationTestBase<Bytes
|
|||
leafReader, null, analyzer, null, 3, 1, 1, categoryFieldName, textFieldName);
|
||||
List<ClassificationResult<BytesRef>> classes =
|
||||
knnClassifier.getClasses(SUPER_STRONG_TECHNOLOGY_INPUT);
|
||||
assertTrue(classes.get(0).getScore() > classes.get(1).getScore());
|
||||
assertTrue(classes.get(0).score() > classes.get(1).score());
|
||||
checkCorrectClassification(knnClassifier, SUPER_STRONG_TECHNOLOGY_INPUT, TECHNOLOGY_RESULT);
|
||||
} finally {
|
||||
IOUtils.close(leafReader);
|
||||
|
|
Some files were not shown because too many files have changed in this diff.