From ea1441c81cccd4ba1910c0bd2a59088fd50e2720 Mon Sep 17 00:00:00 2001 From: Dawid Weiss Date: Fri, 30 Aug 2024 12:36:56 +0200 Subject: [PATCH 01/18] Upgrade to gradle 8.10 (#13700) --- .../workflows/run-checks-gradle-upgrade.yml | 18 +++++++++++++----- .../lucene/gradle/WrapperDownloader.java | 4 ++-- gradle/documentation/render-javadoc.gradle | 2 +- gradle/generation/kuromoji.gradle | 2 +- gradle/generation/nori.gradle | 2 +- gradle/testing/beasting.gradle | 2 +- gradle/testing/profiling.gradle | 2 +- gradle/testing/randomization.gradle | 6 +++--- gradle/testing/slowest-tests-at-end.gradle | 2 +- gradle/wrapper/gradle-wrapper.jar.sha256 | 2 +- gradle/wrapper/gradle-wrapper.jar.version | 2 +- gradle/wrapper/gradle-wrapper.properties | 2 +- lucene/CHANGES.txt | 2 ++ .../demo/facet/SandboxFacetsExample.java | 7 ++++--- lucene/distribution/binary-release.gradle | 4 ++-- versions.toml | 6 +++--- 16 files changed, 38 insertions(+), 27 deletions(-) diff --git a/.github/workflows/run-checks-gradle-upgrade.yml b/.github/workflows/run-checks-gradle-upgrade.yml index 751c3471b51..4e0ef65f03a 100644 --- a/.github/workflows/run-checks-gradle-upgrade.yml +++ b/.github/workflows/run-checks-gradle-upgrade.yml @@ -30,7 +30,7 @@ jobs: strategy: matrix: os: [ ubuntu-latest ] - java-version: [ '22' ] + java-version: [ '23-ea' ] uses-alt-java: [ true, false ] runs-on: ${{ matrix.os }} @@ -61,7 +61,16 @@ jobs: # https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#setting-an-environment-variable echo "RUNTIME_JAVA_HOME=${{ env.ALT_JAVA_DIR }}" >> "$GITHUB_ENV" - - run: ./gradlew -p lucene/core check -x test + - name: ./gradlew tidy + run: | + ./gradlew tidy + if [ ! 
-z "$(git status --porcelain)" ]; then + echo ":warning: **tidy left local checkout in modified state**" >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + git status --porcelain >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + git reset --hard && git clean -xfd . + fi - name: ./gradlew regenerate run: | @@ -69,7 +78,7 @@ jobs: sudo apt-get install libwww-perl ./gradlew regenerate -x generateUAX29URLEmailTokenizerInternal --rerun-tasks if [ ! -z "$(git status --porcelain)" ]; then - echo ":warning: **regenerateleft local checkout in modified state**" >> $GITHUB_STEP_SUMMARY + echo ":warning: **regenerate left local checkout in modified state**" >> $GITHUB_STEP_SUMMARY echo '```' >> $GITHUB_STEP_SUMMARY git status --porcelain >> $GITHUB_STEP_SUMMARY echo '```' >> $GITHUB_STEP_SUMMARY @@ -79,8 +88,7 @@ jobs: - run: ./gradlew testOpts - run: ./gradlew helpWorkflow - run: ./gradlew licenses updateLicenses - - run: ./gradlew tidy - - run: ./gradlew check -x test + - run: ./gradlew check -x test -Pvalidation.git.failOnModified=false - run: ./gradlew assembleRelease mavenToLocal # Conserve resources: only run these in non-alt-java mode. 
diff --git a/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/WrapperDownloader.java b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/WrapperDownloader.java index aaf7059bb5f..d084243b2a5 100644 --- a/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/WrapperDownloader.java +++ b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/WrapperDownloader.java @@ -60,8 +60,8 @@ public class WrapperDownloader { public static void checkVersion() { int major = Runtime.version().feature(); - if (major != 21 && major != 22) { - throw new IllegalStateException("java version must be 21 or 22, your version: " + major); + if (major != 21 && major != 22 && major != 23) { + throw new IllegalStateException("java version must be 21, 22 or 23, your version: " + major); } } diff --git a/gradle/documentation/render-javadoc.gradle b/gradle/documentation/render-javadoc.gradle index ddfb68a1b1c..a161fbc24dd 100644 --- a/gradle/documentation/render-javadoc.gradle +++ b/gradle/documentation/render-javadoc.gradle @@ -32,7 +32,7 @@ allprojects { missingdoclet "org.apache.lucene.tools:missing-doclet" } - ext { + project.ext { relativeDocPath = project.path.replaceFirst(/:\w+:/, "").replace(':', '/') } diff --git a/gradle/generation/kuromoji.gradle b/gradle/generation/kuromoji.gradle index cfe2cd559ce..a5aa5fbd604 100644 --- a/gradle/generation/kuromoji.gradle +++ b/gradle/generation/kuromoji.gradle @@ -33,7 +33,7 @@ configure(project(":lucene:analysis:kuromoji")) { apply plugin: deps.plugins.undercouch.download.get().pluginId plugins.withType(JavaPlugin) { - ext { + project.ext { targetDir = file("src/resources") } diff --git a/gradle/generation/nori.gradle b/gradle/generation/nori.gradle index db05babdf03..3a558325964 100644 --- a/gradle/generation/nori.gradle +++ b/gradle/generation/nori.gradle @@ -33,7 +33,7 @@ configure(project(":lucene:analysis:nori")) { apply plugin: deps.plugins.undercouch.download.get().pluginId 
plugins.withType(JavaPlugin) { - ext { + project.ext { targetDir = file("src/resources") } diff --git a/gradle/testing/beasting.gradle b/gradle/testing/beasting.gradle index 8934100ec10..67c20140ba8 100644 --- a/gradle/testing/beasting.gradle +++ b/gradle/testing/beasting.gradle @@ -27,7 +27,7 @@ def beastingMode = gradle.startParameter.taskNames.any{ name -> name == 'beast' allprojects { plugins.withType(JavaPlugin) { - ext { + project.ext { testOptions += [ [propName: 'tests.dups', value: 0, description: "Reiterate runs of entire test suites ('beast' task)."] ] diff --git a/gradle/testing/profiling.gradle b/gradle/testing/profiling.gradle index 6c71b3f827a..88284fb5454 100644 --- a/gradle/testing/profiling.gradle +++ b/gradle/testing/profiling.gradle @@ -19,7 +19,7 @@ def recordings = files() allprojects { plugins.withType(JavaPlugin) { - ext { + project.ext { testOptions += [ [propName: 'tests.profile', value: false, description: "Enable Java Flight Recorder profiling."] ] diff --git a/gradle/testing/randomization.gradle b/gradle/testing/randomization.gradle index 9ca8625b0ee..670f8ef2689 100644 --- a/gradle/testing/randomization.gradle +++ b/gradle/testing/randomization.gradle @@ -62,7 +62,7 @@ allprojects { // Configure test property defaults and their descriptions. allprojects { plugins.withType(JavaPlugin) { - ext { + project.ext { String randomVectorSize = RandomPicks.randomFrom(new Random(projectSeedLong), ["default", "128", "256", "512"]) testOptions += [ // seed, repetition and amplification. @@ -135,14 +135,14 @@ allprojects { } afterEvaluate { - ext.testOptionsResolved = testOptions.findAll { opt -> + project.ext.testOptionsResolved = testOptions.findAll { opt -> propertyOrDefault(opt.propName, opt.value) != null }.collectEntries { opt -> [(opt.propName): Objects.toString(resolvedTestOption(opt.propName))] } // Compute the "reproduce with" string. 
- ext.testOptionsForReproduceLine = testOptions.findAll { opt -> + project.ext.testOptionsForReproduceLine = testOptions.findAll { opt -> if (opt["includeInReproLine"] == false) { return false } diff --git a/gradle/testing/slowest-tests-at-end.gradle b/gradle/testing/slowest-tests-at-end.gradle index eaf9cd1a2f1..d24e523394d 100644 --- a/gradle/testing/slowest-tests-at-end.gradle +++ b/gradle/testing/slowest-tests-at-end.gradle @@ -22,7 +22,7 @@ def allSuites = [] allprojects { plugins.withType(JavaPlugin) { - ext { + project.ext { testOptions += [ [propName: 'tests.slowestTests', value: true, description: "Print the summary of the slowest tests."], [propName: 'tests.slowestSuites', value: true, description: "Print the summary of the slowest suites."] diff --git a/gradle/wrapper/gradle-wrapper.jar.sha256 b/gradle/wrapper/gradle-wrapper.jar.sha256 index 2a6b8668ac7..6d5fdcd3f30 100644 --- a/gradle/wrapper/gradle-wrapper.jar.sha256 +++ b/gradle/wrapper/gradle-wrapper.jar.sha256 @@ -1 +1 @@ -cb0da6751c2b753a16ac168bb354870ebb1e162e9083f116729cec9c781156b8 +2db75c40782f5e8ba1fc278a5574bab070adccb2d21ca5a6e5ed840888448046 \ No newline at end of file diff --git a/gradle/wrapper/gradle-wrapper.jar.version b/gradle/wrapper/gradle-wrapper.jar.version index 3b6825376ad..7f6758ef97b 100644 --- a/gradle/wrapper/gradle-wrapper.jar.version +++ b/gradle/wrapper/gradle-wrapper.jar.version @@ -1 +1 @@ -8.8.0 +8.10.0 diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index a4413138c96..9355b415575 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,6 +1,6 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-8.8-bin.zip +distributionUrl=https\://services.gradle.org/distributions/gradle-8.10-bin.zip networkTimeout=10000 validateDistributionUrl=true zipStoreBase=GRADLE_USER_HOME diff --git 
a/lucene/CHANGES.txt b/lucene/CHANGES.txt index e956c99289f..ae15247ac23 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -266,6 +266,8 @@ Build * GITHUB#13649: Fix eclipse ide settings generation #13649 (Uwe Schindler, Dawid Weiss) +* GITHUB#13698: Upgrade to gradle 8.10 (Dawid Weiss) + ======================== Lucene 9.12.0 ======================= API Changes diff --git a/lucene/demo/src/java/org/apache/lucene/demo/facet/SandboxFacetsExample.java b/lucene/demo/src/java/org/apache/lucene/demo/facet/SandboxFacetsExample.java index b6759da115b..0655abac131 100644 --- a/lucene/demo/src/java/org/apache/lucene/demo/facet/SandboxFacetsExample.java +++ b/lucene/demo/src/java/org/apache/lucene/demo/facet/SandboxFacetsExample.java @@ -148,9 +148,10 @@ public class SandboxFacetsExample { FacetFieldCollectorManager collectorManager = new FacetFieldCollectorManager<>(defaultTaxoCutter, defaultRecorder); - //// (2.1) if we need to collect data using multiple different collectors, e.g. taxonomy and - //// ranges, or even two taxonomy facets that use different Category List Field, we can - //// use MultiCollectorManager, e.g.: + // (2.1) if we need to collect data using multiple different collectors, e.g. taxonomy and + // ranges, or even two taxonomy facets that use different Category List Field, we can + // use MultiCollectorManager, e.g.: + // // TODO: add a demo for it. // TaxonomyFacetsCutter publishDateCutter = new // TaxonomyFacetsCutter(config.getDimConfig("Publish Date"), taxoReader); diff --git a/lucene/distribution/binary-release.gradle b/lucene/distribution/binary-release.gradle index 5c03ac3a82a..c3365e961ab 100644 --- a/lucene/distribution/binary-release.gradle +++ b/lucene/distribution/binary-release.gradle @@ -37,9 +37,9 @@ configure(project(":lucene:distribution")) { // Maven-published submodule JARs are part of the binary distribution. // We don't copy their transitive dependencies. 
- def binaryModules = rootProject.ext.mavenProjects.findAll { p -> !(p in [ + def binaryModules = rootProject.ext.mavenProjects.findAll { p -> !(p.path in [ // Placed in a separate folder (module layer conflicts). - project(":lucene:test-framework"), + ":lucene:test-framework", ]) } for (Project module : binaryModules) { jars(module, { diff --git a/versions.toml b/versions.toml index 1a2c52c1f38..c93d1e405c4 100644 --- a/versions.toml +++ b/versions.toml @@ -9,7 +9,7 @@ errorprone = "2.18.0" flexmark = "0.61.24" # @keep This is GJF version for spotless/ tidy. googleJavaFormat = "1.23.0" -groovy = "3.0.21" +groovy = "4.0.22" hamcrest = "2.2" icu4j = "74.2" javacc = "7.0.12" @@ -19,7 +19,7 @@ jmh = "1.37" jts = "1.17.0" junit = "4.13.1" # @keep Minimum gradle version to run the build -minGradle = "8.8" +minGradle = "8.10" # @keep This is the minimum required Java version. minJava = "21" morfologik = "2.1.9" @@ -49,7 +49,7 @@ flexmark-ext-abbreviation = { module = "com.vladsch.flexmark:flexmark-ext-abbrev flexmark-ext-attributes = { module = "com.vladsch.flexmark:flexmark-ext-attributes", version.ref = "flexmark" } flexmark-ext-autolink = { module = "com.vladsch.flexmark:flexmark-ext-autolink", version.ref = "flexmark" } flexmark-ext-tables = { module = "com.vladsch.flexmark:flexmark-ext-tables", version.ref = "flexmark" } -groovy = { module = "org.codehaus.groovy:groovy-all", version.ref = "groovy" } +groovy = { module = "org.apache.groovy:groovy-all", version.ref = "groovy" } hamcrest = { module = "org.hamcrest:hamcrest", version.ref = "hamcrest" } icu4j = { module = "com.ibm.icu:icu4j", version.ref = "icu4j" } javacc = { module = "net.java.dev.javacc:javacc", version.ref = "javacc" } From 6d373dbb4e563b427207da38ca5629a595c786e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=BCscher?= Date: Fri, 30 Aug 2024 15:00:32 +0200 Subject: [PATCH 02/18] Remove IOException from MultiTermQuery#getTermsCount (#13701) MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit Neither this method nor any of the two overrides can throw an IOException. This change removes the throws clause from this method in order to simplify callers, which no longer need to handle it. --- .../core/src/java/org/apache/lucene/search/MultiTermQuery.java | 2 +- .../core/src/java/org/apache/lucene/search/TermInSetQuery.java | 2 +- .../join/src/java/org/apache/lucene/search/join/TermsQuery.java | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java b/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java index 5a56286ebed..8192f612165 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java @@ -313,7 +313,7 @@ public abstract class MultiTermQuery extends Query { * Return the number of unique terms contained in this query, if known up-front. If not known, -1 * will be returned. 
*/ - public long getTermsCount() throws IOException { + public long getTermsCount() { return -1; } diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java index 4e1abab09cc..da01b24f0bd 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java @@ -137,7 +137,7 @@ public class TermInSetQuery extends MultiTermQuery implements Accountable { } @Override - public long getTermsCount() throws IOException { + public long getTermsCount() { return termData.size(); } diff --git a/lucene/join/src/java/org/apache/lucene/search/join/TermsQuery.java b/lucene/join/src/java/org/apache/lucene/search/join/TermsQuery.java index 729feb19b9c..f781fc6a29e 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/TermsQuery.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/TermsQuery.java @@ -95,7 +95,7 @@ class TermsQuery extends MultiTermQuery implements Accountable { } @Override - public long getTermsCount() throws IOException { + public long getTermsCount() { return terms.size(); } From ce4f56e74ad1087df8bdc019fc09d1b40438e8b3 Mon Sep 17 00:00:00 2001 From: Dawid Weiss Date: Fri, 30 Aug 2024 18:19:47 +0200 Subject: [PATCH 03/18] Add jdk 23-ea to nightly smoke tester run. 
#13700 --- .github/workflows/run-nightly-smoketester.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-nightly-smoketester.yml b/.github/workflows/run-nightly-smoketester.yml index c0987afc8aa..18a826e4a36 100644 --- a/.github/workflows/run-nightly-smoketester.yml +++ b/.github/workflows/run-nightly-smoketester.yml @@ -18,7 +18,7 @@ jobs: strategy: matrix: os: [ ubuntu-latest ] - java-version: [ '21', '22' ] + java-version: [ '21', '22', '23-ea' ] runs-on: ${{ matrix.os }} From 68cc8734ca28a9db800e4192a636d3b490cfd41a Mon Sep 17 00:00:00 2001 From: Ignacio Vera Date: Mon, 2 Sep 2024 12:13:53 +0200 Subject: [PATCH 04/18] Fix integer overflow in GeoEncodingUtils#Grid implementations (#13704) * Fix integer overflow in GeoEncodingUtils#Grid implementations * Add entry in CHANGES.txt --- lucene/CHANGES.txt | 3 ++ .../apache/lucene/geo/GeoEncodingUtils.java | 4 +- .../tests/geo/BaseGeoPointTestCase.java | 38 +++++++++++++++++++ 3 files changed, 43 insertions(+), 2 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index ae15247ac23..63390dd8f9f 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -410,6 +410,9 @@ Bug Fixes * GITHUB#13691: Fix incorrect exponent value in explain of SigmoidFunction. (Owais Kazi) +* GITHUB#13703: Fix bug in LatLonPoint queries where narrow polygons close to latitude 90 don't + match any points due to an Integer overflow. 
(Ignacio Vera) + Build --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/geo/GeoEncodingUtils.java b/lucene/core/src/java/org/apache/lucene/geo/GeoEncodingUtils.java index d7ff62ba3be..818639bee8f 100644 --- a/lucene/core/src/java/org/apache/lucene/geo/GeoEncodingUtils.java +++ b/lucene/core/src/java/org/apache/lucene/geo/GeoEncodingUtils.java @@ -363,7 +363,7 @@ public final class GeoEncodingUtils { */ public boolean test(int lat, int lon) { final int lat2 = ((lat - Integer.MIN_VALUE) >>> latShift); - if (lat2 < latBase || lat2 >= latBase + maxLatDelta) { + if (lat2 < latBase || lat2 - latBase >= maxLatDelta) { return false; } int lon2 = ((lon - Integer.MIN_VALUE) >>> lonShift); @@ -411,7 +411,7 @@ public final class GeoEncodingUtils { */ public boolean test(int lat, int lon) { final int lat2 = ((lat - Integer.MIN_VALUE) >>> latShift); - if (lat2 < latBase || lat2 >= latBase + maxLatDelta) { + if (lat2 < latBase || lat2 - latBase >= maxLatDelta) { return false; } int lon2 = ((lon - Integer.MIN_VALUE) >>> lonShift); diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/geo/BaseGeoPointTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/geo/BaseGeoPointTestCase.java index d371292324d..5c0c134fef3 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/geo/BaseGeoPointTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/geo/BaseGeoPointTestCase.java @@ -39,6 +39,7 @@ import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; import org.apache.lucene.geo.Circle; import org.apache.lucene.geo.Component2D; +import org.apache.lucene.geo.GeoEncodingUtils; import org.apache.lucene.geo.GeoUtils; import org.apache.lucene.geo.LatLonGeometry; import org.apache.lucene.geo.Polygon; @@ -1751,4 +1752,41 @@ public abstract class BaseGeoPointTestCase extends LuceneTestCase { newDistanceQuery("point", 32.94823588839368, -179.9538113027811, 120000), 20); 
assertEquals(3, td.totalHits.value); } + + public void testNarrowPolygonCloseToNorthPole() throws Exception { + IndexWriterConfig iwc = newIndexWriterConfig(); + iwc.setMergeScheduler(new SerialMergeScheduler()); + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, iwc); + + // index point closes to Lat 90 + Document doc = new Document(); + final int base = Integer.MAX_VALUE; + addPointToDoc( + FIELD_NAME, + doc, + GeoEncodingUtils.decodeLatitude(base - 2), + GeoEncodingUtils.decodeLongitude(base - 2)); + w.addDocument(doc); + w.flush(); + + // query testing + final IndexReader reader = DirectoryReader.open(w); + final IndexSearcher s = newSearcher(reader); + + double minLat = GeoEncodingUtils.decodeLatitude(base - 3); + double maxLat = GeoEncodingUtils.decodeLatitude(base); + double minLon = GeoEncodingUtils.decodeLongitude(base - 3); + double maxLon = GeoEncodingUtils.decodeLongitude(base); + + Query query = + newPolygonQuery( + FIELD_NAME, + new Polygon( + new double[] {minLat, minLat, maxLat, maxLat, minLat}, + new double[] {minLon, maxLon, maxLon, minLon, minLon})); + + assertEquals(1, s.count(query)); + IOUtils.close(w, reader, dir); + } } From 10420c6c47888f369fb8ca12bcecd7c6890f4178 Mon Sep 17 00:00:00 2001 From: Luca Cavanna Date: Tue, 3 Sep 2024 09:49:18 +0200 Subject: [PATCH 05/18] Added missing migrate entry for #13602 --- lucene/MIGRATE.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md index 64de353b153..cebc479aa24 100644 --- a/lucene/MIGRATE.md +++ b/lucene/MIGRATE.md @@ -793,3 +793,7 @@ Specifically, the method `FunctionValues#getScorer(Weight weight, LeafReaderCont Callers must now keep track of the Weight instance that created the Scorer if they need it, instead of relying on Scorer. +### `SearchWithCollectorTask` no longer supports the `collector.class` config parameter + +`collector.class` used to allow users to load a custom collector implementation. 
`collector.manager.class` +replaces it by allowing users to load a custom collector manager instead. (Luca Cavanna) \ No newline at end of file From c21bc5405be258bcebf6b93ceb13f9554c214b2c Mon Sep 17 00:00:00 2001 From: Egor Potemkin Date: Wed, 4 Sep 2024 01:34:50 +0100 Subject: [PATCH 06/18] Remove CollectorOwner class (#13702) --- lucene/CHANGES.txt | 5 +- .../apache/lucene/search/CollectorOwner.java | 78 -------- .../apache/lucene/search/IndexSearcher.java | 47 ----- .../demo/facet/SandboxFacetsExample.java | 13 +- .../apache/lucene/facet/DrillSideways.java | 187 ++++++++---------- .../lucene/facet/DrillSidewaysQuery.java | 76 +++---- .../lucene/facet/DrillSidewaysScorer.java | 19 -- .../lucene/facet/TestDrillSideways.java | 3 - .../lucene/sandbox/facet/TestRangeFacet.java | 27 +-- 9 files changed, 141 insertions(+), 314 deletions(-) delete mode 100644 lucene/core/src/java/org/apache/lucene/search/CollectorOwner.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 63390dd8f9f..80ca30c4fc7 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -286,10 +286,7 @@ API Changes * GITHUB#13568: Add DoubleValuesSource#toSortableLongDoubleValuesSource and MultiDoubleValuesSource#toSortableMultiLongValuesSource methods. (Shradha Shankar) -* GITHUB#13568: Add CollectorOwner class that wraps CollectorManager, and handles list of Collectors and results. - Add IndexSearcher#search method that takes CollectorOwner. (Egor Potemkin) - -* GITHUB#13568: Add DrillSideways#search method that supports any collector types for any drill-sideways dimensions +* GITHUB#13568: Add DrillSideways#search method that supports any CollectorManagers for drill-sideways dimensions or drill-down. 
(Egor Potemkin) New Features diff --git a/lucene/core/src/java/org/apache/lucene/search/CollectorOwner.java b/lucene/core/src/java/org/apache/lucene/search/CollectorOwner.java deleted file mode 100644 index 4c5c2f1eea1..00000000000 --- a/lucene/core/src/java/org/apache/lucene/search/CollectorOwner.java +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.search; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.List; - -/** - * This class wraps {@link CollectorManager} and owns the collectors the manager creates. It is - * convenient that clients of the class don't have to worry about keeping the list of collectors, as - * well as about making the collector's type (C) compatible when reduce is called. Instances of this - * class cache results of {@link CollectorManager#reduce(Collection)}. - * - *

Note that instance of this class ignores any {@link Collector} created by {@link - * CollectorManager#newCollector()} directly, not through {@link #newCollector()} - * - * @lucene.experimental - */ -public final class CollectorOwner { - - private final CollectorManager manager; - - private T result; - private boolean reduced; - - // TODO: For IndexSearcher, the list doesn't have to be synchronized - // because we create new collectors sequentially. Drill sideways creates new collectors in - // DrillSidewaysQuery#Weight#bulkScorer which is already called concurrently. - // I think making the list synchronized here is not a huge concern, at the same time, do we want - // to do something about it? - // e.g. have boolean property in constructor that makes it threads friendly when set? - private final List collectors = Collections.synchronizedList(new ArrayList<>()); - - public CollectorOwner(CollectorManager manager) { - this.manager = manager; - } - - /** Return a new {@link Collector}. This must return a different instance on each call. */ - public C newCollector() throws IOException { - C collector = manager.newCollector(); - collectors.add(collector); - return collector; - } - - public C getCollector(int i) { - return collectors.get(i); - } - - /** - * Returns result of {@link CollectorManager#reduce(Collection)}. The result is cached. - * - *

This method is NOT threadsafe. - */ - public T getResult() throws IOException { - if (reduced == false) { - result = manager.reduce(collectors); - reduced = true; - } - return result; - } -} diff --git a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java index a8f8d14c24f..77d6edf34a0 100644 --- a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java @@ -671,53 +671,6 @@ public class IndexSearcher { } } - /** - * Lower-level search API. Search all leaves using the given {@link CollectorOwner}, without - * calling {@link CollectorOwner#getResult()} so that clients can reduce and read results - * themselves. - * - *

Note that this method doesn't return anything - users can access results by calling {@link - * CollectorOwner#getResult()} - * - * @lucene.experimental - */ - public void search(Query query, CollectorOwner collectorOwner) - throws IOException { - final C firstCollector = collectorOwner.newCollector(); - query = rewrite(query, firstCollector.scoreMode().needsScores()); - final Weight weight = createWeight(query, firstCollector.scoreMode(), 1); - search(weight, collectorOwner, firstCollector); - } - - private void search( - Weight weight, CollectorOwner collectorOwner, C firstCollector) throws IOException { - final LeafSlice[] leafSlices = getSlices(); - if (leafSlices.length == 0) { - // there are no segments, nothing to offload to the executor - assert leafContexts.isEmpty(); - } else { - final ScoreMode scoreMode = firstCollector.scoreMode(); - for (int i = 1; i < leafSlices.length; ++i) { - final C collector = collectorOwner.newCollector(); - if (scoreMode != collector.scoreMode()) { - throw new IllegalStateException( - "CollectorManager does not always produce collectors with the same score mode"); - } - } - final List> listTasks = new ArrayList<>(leafSlices.length); - for (int i = 0; i < leafSlices.length; ++i) { - final LeafReaderContext[] leaves = leafSlices[i].leaves; - final C collector = collectorOwner.getCollector(i); - listTasks.add( - () -> { - search(Arrays.asList(leaves), weight, collector); - return collector; - }); - } - taskExecutor.invokeAll(listTasks); - } - } - /** * Lower-level search API. 
* diff --git a/lucene/demo/src/java/org/apache/lucene/demo/facet/SandboxFacetsExample.java b/lucene/demo/src/java/org/apache/lucene/demo/facet/SandboxFacetsExample.java index 0655abac131..5a838954992 100644 --- a/lucene/demo/src/java/org/apache/lucene/demo/facet/SandboxFacetsExample.java +++ b/lucene/demo/src/java/org/apache/lucene/demo/facet/SandboxFacetsExample.java @@ -21,7 +21,6 @@ import static org.apache.lucene.sandbox.facet.ComparableUtils.byAggregatedValue; import java.io.IOException; import java.util.ArrayList; -import java.util.Collections; import java.util.List; import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.document.Document; @@ -58,7 +57,7 @@ import org.apache.lucene.sandbox.facet.recorders.CountFacetRecorder; import org.apache.lucene.sandbox.facet.recorders.LongAggregationsFacetRecorder; import org.apache.lucene.sandbox.facet.recorders.MultiFacetsRecorder; import org.apache.lucene.sandbox.facet.recorders.Reducer; -import org.apache.lucene.search.CollectorOwner; +import org.apache.lucene.search.CollectorManager; import org.apache.lucene.search.DoubleValuesSource; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.LongValuesSource; @@ -564,17 +563,13 @@ public class SandboxFacetsExample { // FacetFieldCollectorManager anyway, and leaf cutter are not merged or anything like that. FacetFieldCollectorManager publishDayDimensionCollectorManager = new FacetFieldCollectorManager<>(defaultTaxoCutter, publishDayDimensionRecorder); - List> drillSidewaysOwners = - List.of(new CollectorOwner<>(publishDayDimensionCollectorManager)); + List> drillSidewaysManagers = + List.of(publishDayDimensionCollectorManager); //// (3) search // Right now we return the same Recorder we created - so we can ignore results DrillSideways ds = new DrillSideways(searcher, config, taxoReader); - // We must wrap list of drill sideways owner with unmodifiableList to make generics work. 
- ds.search( - q, - new CollectorOwner<>(drillDownCollectorManager), - Collections.unmodifiableList(drillSidewaysOwners)); + ds.search(q, drillDownCollectorManager, drillSidewaysManagers); //// (4) Get top 10 results by count for Author List facetResults = new ArrayList<>(2); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/DrillSideways.java b/lucene/facet/src/java/org/apache/lucene/facet/DrillSideways.java index 2db660e1c5b..951a0997cea 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/DrillSideways.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/DrillSideways.java @@ -18,7 +18,6 @@ package org.apache.lucene.facet; import java.io.IOException; import java.util.ArrayList; -import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -31,8 +30,8 @@ import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField; import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState; import org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts; import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.search.Collector; import org.apache.lucene.search.CollectorManager; -import org.apache.lucene.search.CollectorOwner; import org.apache.lucene.search.FieldDoc; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; @@ -302,25 +301,13 @@ public class DrillSideways { } } - private static class CallableCollector implements Callable { - private final IndexSearcher searcher; - private final Query query; - private final CollectorOwner collectorOwner; - - private CallableCollector( - IndexSearcher searcher, Query query, CollectorOwner collectorOwner) { - this.searcher = searcher; - this.query = query; - this.collectorOwner = collectorOwner; - } + private record CallableCollector( + IndexSearcher searcher, Query query, CollectorManager collectorManager) + implements Callable { @Override - public Void call() throws Exception { - 
searcher.search(query, collectorOwner); - // Call getResult to trigger reduce, we don't need to return results because users can access - // them directly from collectorOwner - collectorOwner.getResult(); - return null; + public R call() throws Exception { + return searcher.search(query, collectorManager); } } @@ -344,31 +331,30 @@ public class DrillSideways { // Main query FacetsCollectorManager drillDownFacetsCollectorManager = createDrillDownFacetsCollectorManager(); - final CollectorOwner mainCollectorOwner; + final CollectorManager mainCollectorManager; if (drillDownFacetsCollectorManager != null) { // Make sure we populate a facet collector corresponding to the base query if desired: - mainCollectorOwner = - new CollectorOwner<>( - new MultiCollectorManager(drillDownFacetsCollectorManager, hitCollectorManager)); + mainCollectorManager = + new MultiCollectorManager(drillDownFacetsCollectorManager, hitCollectorManager); } else { - mainCollectorOwner = new CollectorOwner<>(hitCollectorManager); + mainCollectorManager = hitCollectorManager; } // Drill sideways dimensions - final List> drillSidewaysCollectorOwners; + final List> drillSidewaysCollectorManagers; if (query.getDims().isEmpty() == false) { - drillSidewaysCollectorOwners = new ArrayList<>(query.getDims().size()); + drillSidewaysCollectorManagers = new ArrayList<>(query.getDims().size()); for (int i = 0; i < query.getDims().size(); i++) { - drillSidewaysCollectorOwners.add( - new CollectorOwner<>(createDrillSidewaysFacetsCollectorManager())); + drillSidewaysCollectorManagers.add(createDrillSidewaysFacetsCollectorManager()); } } else { - drillSidewaysCollectorOwners = null; + drillSidewaysCollectorManagers = null; } // Execute query + final Result result; if (executor != null) { - searchConcurrently(query, mainCollectorOwner, drillSidewaysCollectorOwners); + result = searchConcurrently(query, mainCollectorManager, drillSidewaysCollectorManagers); } else { - searchSequentially(query, mainCollectorOwner, 
drillSidewaysCollectorOwners); + result = searchSequentially(query, mainCollectorManager, drillSidewaysCollectorManagers); } // Collect results @@ -377,12 +363,12 @@ public class DrillSideways { if (drillDownFacetsCollectorManager != null) { // drill down collected using MultiCollector // Extract the results: - Object[] drillDownResult = (Object[]) mainCollectorOwner.getResult(); + Object[] drillDownResult = (Object[]) result.drillDownResult; facetsCollectorResult = (FacetsCollector) drillDownResult[0]; hitCollectorResult = (R) drillDownResult[1]; } else { facetsCollectorResult = null; - hitCollectorResult = (R) mainCollectorOwner.getResult(); + hitCollectorResult = (R) result.drillDownResult; } // Getting results for drill sideways dimensions (if any) @@ -391,12 +377,11 @@ public class DrillSideways { if (query.getDims().isEmpty() == false) { drillSidewaysDims = query.getDims().keySet().toArray(new String[0]); int numDims = query.getDims().size(); - assert drillSidewaysCollectorOwners != null; - assert drillSidewaysCollectorOwners.size() == numDims; + assert drillSidewaysCollectorManagers != null; + assert drillSidewaysCollectorManagers.size() == numDims; drillSidewaysCollectors = new FacetsCollector[numDims]; for (int dim = 0; dim < numDims; dim++) { - drillSidewaysCollectors[dim] = - (FacetsCollector) drillSidewaysCollectorOwners.get(dim).getResult(); + drillSidewaysCollectors[dim] = result.drillSidewaysResults.get(dim); } } else { drillSidewaysDims = null; @@ -414,52 +399,51 @@ public class DrillSideways { /** * Search using DrillDownQuery with custom collectors. This method can be used with any {@link - * CollectorOwner}s. It doesn't return anything because it is expected that you read results from - * provided {@link CollectorOwner}s. + * CollectorManager}s. * - *

To read the results, run {@link CollectorOwner#getResult()} for drill down and all drill - * sideways dimensions. - * - *

Note: use {@link Collections#unmodifiableList(List)} to wrap {@code - * drillSidewaysCollectorOwners} to convince compiler that it is safe to use List here. - * - *

Use {@link MultiCollectorManager} wrapped by {@link CollectorOwner} to collect both hits and - * facets for the entire query and/or for drill-sideways dimensions. - * - *

TODO: Class CollectorOwner was created so that we can ignore CollectorManager type C, - * because we want each dimensions to be able to use their own types. Alternatively, we can use - * typesafe heterogeneous container and provide CollectorManager type for each dimension to this - * method? I do like CollectorOwner approach as it seems more intuitive? + *

Note: Use {@link MultiCollectorManager} to collect both hits and facets for the entire query + * and/or for drill-sideways dimensions. You can also use it to wrap different types of {@link + * CollectorManager} for drill-sideways dimensions. */ - public void search( - final DrillDownQuery query, - CollectorOwner drillDownCollectorOwner, - List> drillSidewaysCollectorOwners) + public Result search( + DrillDownQuery query, + CollectorManager drillDownCollectorManager, + List> drillSidewaysCollectorManagers) throws IOException { - if (drillDownCollectorOwner == null) { + if (drillDownCollectorManager == null) { throw new IllegalArgumentException( "This search method requires client to provide drill down collector manager"); } - if (drillSidewaysCollectorOwners == null) { + if (drillSidewaysCollectorManagers == null) { if (query.getDims().isEmpty() == false) { throw new IllegalArgumentException( - "The query requires not null drillSidewaysCollectorOwners"); + "The query requires not null drillSidewaysCollectorManagers"); } - } else if (drillSidewaysCollectorOwners.size() != query.getDims().size()) { + } else if (drillSidewaysCollectorManagers.size() != query.getDims().size()) { throw new IllegalArgumentException( - "drillSidewaysCollectorOwners size must be equal to number of dimensions in the query."); + "drillSidewaysCollectorManagers size must be equal to number of dimensions in the query."); } if (executor != null) { - searchConcurrently(query, drillDownCollectorOwner, drillSidewaysCollectorOwners); + return searchConcurrently(query, drillDownCollectorManager, drillSidewaysCollectorManagers); } else { - searchSequentially(query, drillDownCollectorOwner, drillSidewaysCollectorOwners); + return searchSequentially(query, drillDownCollectorManager, drillSidewaysCollectorManagers); } } - private void searchSequentially( + /** + * {@link #search(DrillDownQuery, CollectorManager, List)} result. 
It doesn't depend on {@link + * Facets} to allow users to use any type of {@link CollectorManager} for drill-down or + * drill-sideways dimension. + * + * @param drillDownResult result from drill down (main) {@link CollectorManager} + * @param drillSidewaysResults results from drill sideways {@link CollectorManager}s + */ + public record Result(T drillDownResult, List drillSidewaysResults) {} + + private Result searchSequentially( final DrillDownQuery query, - final CollectorOwner drillDownCollectorOwner, - final List> drillSidewaysCollectorOwners) + final CollectorManager drillDownCollectorManager, + final List> drillSidewaysCollectorManagers) throws IOException { Map drillDownDims = query.getDims(); @@ -467,9 +451,7 @@ public class DrillSideways { if (drillDownDims.isEmpty()) { // There are no drill-down dims, so there is no // drill-sideways to compute: - searcher.search(query, drillDownCollectorOwner); - drillDownCollectorOwner.getResult(); - return; + return new Result<>(searcher.search(query, drillDownCollectorManager), null); } Query baseQuery = query.getBaseQuery(); @@ -480,59 +462,60 @@ public class DrillSideways { } Query[] drillDownQueries = query.getDrillDownQueries(); - DrillSidewaysQuery dsq = - new DrillSidewaysQuery( - baseQuery, - // drillDownCollectorOwner, - // Don't pass drill down collector because drill down is collected by IndexSearcher - // itself. - // TODO: deprecate drillDown collection in DrillSidewaysQuery? - null, - drillSidewaysCollectorOwners, - drillDownQueries, - scoreSubDocsAtOnce()); + DrillSidewaysQuery dsq = + new DrillSidewaysQuery<>( + baseQuery, drillSidewaysCollectorManagers, drillDownQueries, scoreSubDocsAtOnce()); - searcher.search(dsq, drillDownCollectorOwner); - // This method doesn't return results as each dimension might have its own result type. - // But we call getResult to trigger results reducing, so that users don't have to worry about - // it. 
- drillDownCollectorOwner.getResult(); - if (drillSidewaysCollectorOwners != null) { - for (CollectorOwner sidewaysOwner : drillSidewaysCollectorOwners) { - sidewaysOwner.getResult(); + T collectorResult = searcher.search(dsq, drillDownCollectorManager); + List drillSidewaysResults = new ArrayList<>(drillDownDims.size()); + assert drillSidewaysCollectorManagers != null + : "Case without drill sideways dimensions is handled above"; + int numSlices = dsq.managedDrillSidewaysCollectors.size(); + for (int dim = 0; dim < drillDownDims.size(); dim++) { + List collectorsForDim = new ArrayList<>(numSlices); + for (int slice = 0; slice < numSlices; slice++) { + collectorsForDim.add(dsq.managedDrillSidewaysCollectors.get(slice).get(dim)); } + drillSidewaysResults.add( + dim, drillSidewaysCollectorManagers.get(dim).reduce(collectorsForDim)); } + return new Result<>(collectorResult, drillSidewaysResults); } - private void searchConcurrently( + private Result searchConcurrently( final DrillDownQuery query, - final CollectorOwner drillDownCollectorOwner, - final List> drillSidewaysCollectorOwners) - throws IOException { + final CollectorManager drillDownCollectorManager, + final List> drillSidewaysCollectorManagers) { final Map drillDownDims = query.getDims(); - final List callableCollectors = new ArrayList<>(drillDownDims.size() + 1); + final CallableCollector drillDownCallableCollector = + new CallableCollector<>(searcher, query, drillDownCollectorManager); + final List> drillSidewaysCallableCollectors = + new ArrayList<>(drillDownDims.size()); - callableCollectors.add(new CallableCollector(searcher, query, drillDownCollectorOwner)); int i = 0; final Query[] filters = query.getDrillDownQueries(); for (String dim : drillDownDims.keySet()) { - callableCollectors.add( - new CallableCollector( + drillSidewaysCallableCollectors.add( + new CallableCollector<>( searcher, getDrillDownQuery(query, filters, dim), - drillSidewaysCollectorOwners.get(i))); + 
drillSidewaysCollectorManagers.get(i))); i++; } try { - // Run the query pool - final List> futures = executor.invokeAll(callableCollectors); + final Future drillDownFuture = executor.submit(drillDownCallableCollector); + final List> drillSidewaysFutures = + executor.invokeAll(drillSidewaysCallableCollectors); - // Wait for results. We don't read the results as they are collected by CollectorOwners - for (i = 0; i < futures.size(); i++) { - futures.get(i).get(); + T collectorResult = drillDownFuture.get(); + List drillSidewaysResults = new ArrayList<>(drillDownDims.size()); + + for (i = 0; i < drillSidewaysFutures.size(); i++) { + drillSidewaysResults.add(i, drillSidewaysFutures.get(i).get()); } + return new Result<>(collectorResult, drillSidewaysResults); } catch (InterruptedException e) { throw new ThreadInterruptedException(e); } catch (ExecutionException e) { diff --git a/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysQuery.java b/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysQuery.java index aa670914d1a..0e61b1a0643 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysQuery.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysQuery.java @@ -17,19 +17,20 @@ package org.apache.lucene.facet; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.Objects; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.BulkScorer; import org.apache.lucene.search.Collector; -import org.apache.lucene.search.CollectorOwner; +import org.apache.lucene.search.CollectorManager; import org.apache.lucene.search.ConstantScoreScorer; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.LeafCollector; import org.apache.lucene.search.Query; import 
org.apache.lucene.search.QueryVisitor; import org.apache.lucene.search.ScoreMode; @@ -41,12 +42,12 @@ import org.apache.lucene.search.Weight; // TODO change the way DrillSidewaysScorer is used, this query does not work // with filter caching -class DrillSidewaysQuery extends Query { +class DrillSidewaysQuery extends Query { final Query baseQuery; - final CollectorOwner drillDownCollectorOwner; - final List> drillSidewaysCollectorOwners; + final List> drillSidewaysCollectorManagers; + final List> managedDrillSidewaysCollectors; final Query[] drillDownQueries; @@ -58,15 +59,36 @@ class DrillSidewaysQuery extends Query { */ DrillSidewaysQuery( Query baseQuery, - CollectorOwner drillDownCollectorOwner, - List> drillSidewaysCollectorOwners, + List> drillSidewaysCollectorManagers, + Query[] drillDownQueries, + boolean scoreSubDocsAtOnce) { + // Note that the "managed" collector lists are synchronized here since bulkScorer() + // can be invoked concurrently and needs to remain thread-safe. We're OK with synchronizing + // on the whole list as contention is expected to remain very low: + this( + baseQuery, + drillSidewaysCollectorManagers, + Collections.synchronizedList(new ArrayList<>()), + drillDownQueries, + scoreSubDocsAtOnce); + } + + /** + * Needed for {@link Query#rewrite(IndexSearcher)}. Ensures the same "managed" lists get used + * since {@link DrillSideways} accesses references to these through the original {@code + * DrillSidewaysQuery}. 
+ */ + private DrillSidewaysQuery( + Query baseQuery, + List> drillSidewaysCollectorManagers, + List> managedDrillSidewaysCollectors, Query[] drillDownQueries, boolean scoreSubDocsAtOnce) { this.baseQuery = Objects.requireNonNull(baseQuery); - this.drillDownCollectorOwner = drillDownCollectorOwner; - this.drillSidewaysCollectorOwners = drillSidewaysCollectorOwners; + this.drillSidewaysCollectorManagers = drillSidewaysCollectorManagers; this.drillDownQueries = drillDownQueries; this.scoreSubDocsAtOnce = scoreSubDocsAtOnce; + this.managedDrillSidewaysCollectors = managedDrillSidewaysCollectors; } @Override @@ -87,10 +109,10 @@ class DrillSidewaysQuery extends Query { if (newQuery == baseQuery) { return super.rewrite(indexSearcher); } else { - return new DrillSidewaysQuery( + return new DrillSidewaysQuery<>( newQuery, - drillDownCollectorOwner, - drillSidewaysCollectorOwners, + drillSidewaysCollectorManagers, + managedDrillSidewaysCollectors, drillDownQueries, scoreSubDocsAtOnce); } @@ -124,14 +146,8 @@ class DrillSidewaysQuery extends Query { int drillDownCount = drillDowns.length; - Collector drillDownCollector; - final LeafCollector drillDownLeafCollector; - if (drillDownCollectorOwner != null) { - drillDownCollector = drillDownCollectorOwner.newCollector(); - drillDownLeafCollector = drillDownCollector.getLeafCollector(context); - } else { - drillDownLeafCollector = null; - } + List sidewaysCollectors = new ArrayList<>(drillDownCount); + managedDrillSidewaysCollectors.add(sidewaysCollectors); DrillSidewaysScorer.DocsAndCost[] dims = new DrillSidewaysScorer.DocsAndCost[drillDownCount]; @@ -144,7 +160,8 @@ class DrillSidewaysQuery extends Query { scorer = new ConstantScoreScorer(0f, scoreMode, DocIdSetIterator.empty()); } - Collector sidewaysCollector = drillSidewaysCollectorOwners.get(dim).newCollector(); + K sidewaysCollector = drillSidewaysCollectorManagers.get(dim).newCollector(); + sidewaysCollectors.add(dim, sidewaysCollector); dims[dim] = new 
DrillSidewaysScorer.DocsAndCost( @@ -155,9 +172,6 @@ class DrillSidewaysQuery extends Query { // a null scorer in this case, but we need to make sure #finish gets called on all facet // collectors since IndexSearcher won't handle this for us: if (baseScorerSupplier == null || nullCount > 1) { - if (drillDownLeafCollector != null) { - drillDownLeafCollector.finish(); - } for (DrillSidewaysScorer.DocsAndCost dim : dims) { dim.sidewaysLeafCollector.finish(); } @@ -177,11 +191,7 @@ class DrillSidewaysQuery extends Query { @Override public BulkScorer bulkScorer() throws IOException { return new DrillSidewaysScorer( - context, - baseScorerSupplier.get(Long.MAX_VALUE), - drillDownLeafCollector, - dims, - scoreSubDocsAtOnce); + context, baseScorerSupplier.get(Long.MAX_VALUE), dims, scoreSubDocsAtOnce); } @Override @@ -212,9 +222,8 @@ class DrillSidewaysQuery extends Query { final int prime = 31; int result = classHash(); result = prime * result + Objects.hashCode(baseQuery); - result = prime * result + Objects.hashCode(drillDownCollectorOwner); result = prime * result + Arrays.hashCode(drillDownQueries); - result = prime * result + Objects.hashCode(drillSidewaysCollectorOwners); + result = prime * result + Objects.hashCode(drillSidewaysCollectorManagers); return result; } @@ -223,10 +232,9 @@ class DrillSidewaysQuery extends Query { return sameClassAs(other) && equalsTo(getClass().cast(other)); } - private boolean equalsTo(DrillSidewaysQuery other) { + private boolean equalsTo(DrillSidewaysQuery other) { return Objects.equals(baseQuery, other.baseQuery) - && Objects.equals(drillDownCollectorOwner, other.drillDownCollectorOwner) && Arrays.equals(drillDownQueries, other.drillDownQueries) - && Objects.equals(drillSidewaysCollectorOwners, other.drillSidewaysCollectorOwners); + && Objects.equals(drillSidewaysCollectorManagers, other.drillSidewaysCollectorManagers); } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java 
b/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java index 5ff9c6420ad..3a5bfc3c464 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java @@ -45,8 +45,6 @@ class DrillSidewaysScorer extends BulkScorer { // private static boolean DEBUG = false; - private final LeafCollector drillDownLeafCollector; - private final DocsAndCost[] dims; // DrillDown DocsEnums: @@ -68,7 +66,6 @@ class DrillSidewaysScorer extends BulkScorer { DrillSidewaysScorer( LeafReaderContext context, Scorer baseScorer, - LeafCollector drillDownLeafCollector, DocsAndCost[] dims, boolean scoreSubDocsAtOnce) { this.dims = dims; @@ -81,7 +78,6 @@ class DrillSidewaysScorer extends BulkScorer { } else { this.baseApproximation = baseIterator; } - this.drillDownLeafCollector = drillDownLeafCollector; this.scoreSubDocsAtOnce = scoreSubDocsAtOnce; } @@ -709,9 +705,6 @@ class DrillSidewaysScorer extends BulkScorer { // } collector.collect(collectDocID); - if (drillDownLeafCollector != null) { - drillDownLeafCollector.collect(collectDocID); - } // TODO: we could "fix" faceting of the sideways counts // to do this "union" (of the drill down hits) in the @@ -725,9 +718,6 @@ class DrillSidewaysScorer extends BulkScorer { private void collectHit(LeafCollector collector, DocsAndCost dim) throws IOException { collector.collect(collectDocID); - if (drillDownLeafCollector != null) { - drillDownLeafCollector.collect(collectDocID); - } // Tally sideways count: dim.sidewaysLeafCollector.collect(collectDocID); @@ -735,9 +725,6 @@ class DrillSidewaysScorer extends BulkScorer { private void collectHit(LeafCollector collector, List dims) throws IOException { collector.collect(collectDocID); - if (drillDownLeafCollector != null) { - drillDownLeafCollector.collect(collectDocID); - } // Tally sideways counts: for (DocsAndCost dim : dims) { @@ -756,9 +743,6 @@ class DrillSidewaysScorer extends BulkScorer 
{ // Note: We _only_ call #finish on the facets collectors we're managing here, but not the // "main" collector. This is because IndexSearcher handles calling #finish on the main // collector. - if (drillDownLeafCollector != null) { - drillDownLeafCollector.finish(); - } for (DocsAndCost dim : dims) { dim.sidewaysLeafCollector.finish(); } @@ -766,9 +750,6 @@ class DrillSidewaysScorer extends BulkScorer { private void setScorer(LeafCollector mainCollector, Scorable scorer) throws IOException { mainCollector.setScorer(scorer); - if (drillDownLeafCollector != null) { - drillDownLeafCollector.setScorer(scorer); - } for (DocsAndCost dim : dims) { dim.sidewaysLeafCollector.setScorer(scorer); } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java b/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java index 5645f191549..fa82e921eb8 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java @@ -284,7 +284,6 @@ public class TestDrillSideways extends FacetTestCase { Weight dimWeight = searcher.createWeight(dimQ, ScoreMode.COMPLETE_NO_SCORES, 1f); Scorer dimScorer = dimWeight.scorer(ctx); - FacetsCollector baseFC = new FacetsCollector(); FacetsCollector dimFC = new FacetsCollector(); DrillSidewaysScorer.DocsAndCost docsAndCost = new DrillSidewaysScorer.DocsAndCost(dimScorer, dimFC.getLeafCollector(ctx)); @@ -311,7 +310,6 @@ public class TestDrillSideways extends FacetTestCase { new DrillSidewaysScorer( ctx, baseScorer, - baseFC.getLeafCollector(ctx), new DrillSidewaysScorer.DocsAndCost[] {docsAndCost}, scoreSubDocsAtOnce); expectThrows(CollectionTerminatedException.class, () -> scorer.score(baseCollector, null)); @@ -321,7 +319,6 @@ public class TestDrillSideways extends FacetTestCase { // both our base and sideways dim facets collectors. 
What we really want to test here is // that the matching docs are still correctly present and populated after an early // termination occurs (i.e., #finish is properly called in that scenario): - assertEquals(1, baseFC.getMatchingDocs().size()); assertEquals(1, dimFC.getMatchingDocs().size()); } } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/TestRangeFacet.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/TestRangeFacet.java index 57c323e18a9..c61eaae33d0 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/TestRangeFacet.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/TestRangeFacet.java @@ -50,7 +50,6 @@ import org.apache.lucene.sandbox.facet.cutters.ranges.LongRangeFacetCutter; import org.apache.lucene.sandbox.facet.labels.OrdToLabel; import org.apache.lucene.sandbox.facet.labels.RangeOrdToLabel; import org.apache.lucene.sandbox.facet.recorders.CountFacetRecorder; -import org.apache.lucene.search.CollectorOwner; import org.apache.lucene.search.DoubleValues; import org.apache.lucene.search.DoubleValuesSource; import org.apache.lucene.search.Explanation; @@ -538,7 +537,7 @@ public class TestRangeFacet extends SandboxFacetTestCase { ////// First search, no drill-downs: DrillDownQuery ddq = new DrillDownQuery(config); - ds.search(ddq, new CollectorOwner<>(collectorManager), List.of()); + ds.search(ddq, collectorManager, List.of()); // assertEquals(100, dsr.hits.totalHits.value); assertEquals( @@ -556,10 +555,7 @@ public class TestRangeFacet extends SandboxFacetTestCase { dimCollectorManager = new FacetFieldCollectorManager<>(dimCutter, dimCountRecorder); ddq = new DrillDownQuery(config); ddq.add("dim", "b"); - ds.search( - ddq, - new CollectorOwner<>(fieldCollectorManager), - List.of(new CollectorOwner<>(dimCollectorManager))); + ds.search(ddq, fieldCollectorManager, List.of(dimCollectorManager)); // assertEquals(75, dsr.hits.totalHits.value); assertEquals( @@ -577,10 +573,7 @@ public class 
TestRangeFacet extends SandboxFacetTestCase { dimCollectorManager = new FacetFieldCollectorManager<>(dimCutter, dimCountRecorder); ddq = new DrillDownQuery(config); ddq.add("field", LongPoint.newRangeQuery("field", 0L, 10L)); - ds.search( - ddq, - new CollectorOwner<>(dimCollectorManager), - List.of(new CollectorOwner<>(fieldCollectorManager))); + ds.search(ddq, dimCollectorManager, List.of(fieldCollectorManager)); // assertEquals(11, dsr.hits.totalHits.value); assertEquals( @@ -1629,14 +1622,12 @@ public class TestRangeFacet extends SandboxFacetTestCase { countRecorder = new CountFacetRecorder(); - CollectorOwner totalHitsCollectorOwner = - new CollectorOwner<>(DummyTotalHitCountCollector.createManager()); - CollectorOwner drillSidewaysCollectorOwner = - new CollectorOwner<>( - new FacetFieldCollectorManager<>(doubleRangeFacetCutter, countRecorder)); - ds.search(ddq, totalHitsCollectorOwner, List.of(drillSidewaysCollectorOwner)); - assertEquals(1, totalHitsCollectorOwner.getResult().intValue()); - drillSidewaysCollectorOwner.getResult(); + DrillSideways.Result result = + ds.search( + ddq, + DummyTotalHitCountCollector.createManager(), + List.of(new FacetFieldCollectorManager<>(doubleRangeFacetCutter, countRecorder))); + assertEquals(1, result.drillDownResult().intValue()); assertEquals( "dim=field path=[] value=-2147483648 childCount=6\n < 1 (0)\n < 2 (1)\n < 5 (3)\n < 10 (3)\n < 20 (3)\n < 50 (3)\n", getAllSortByOrd(getRangeOrdinals(ranges), countRecorder, "field", ordToLabel).toString()); From b91b4136aff239f45e074fd6603c06ec7c296f43 Mon Sep 17 00:00:00 2001 From: Luca Cavanna Date: Wed, 4 Sep 2024 16:06:19 +0200 Subject: [PATCH 07/18] Reduce size of queue in TestBoolean2#testRandomQueries (#13713) The queue is created as 1000 * mulFactor, but the test succeeds without the 1000 factor. That causes out of memories as the mulFactor can be as high as 4096 in some cases, and then each slice will get a collector with a queue of size 1000 * mulFactor. 
Only the allocation of such queue makes the test go OOM. Closes #11754 --- .../src/test/org/apache/lucene/search/TestBoolean2.java | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java b/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java index b0634e56da4..1b27683a251 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java @@ -403,12 +403,8 @@ public class TestBoolean2 extends LuceneTestCase { bigSearcher.count(q3.build())); // test diff (randomized) scorers produce the same results on bigSearcher as well - hits1 = - bigSearcher.search(q1, new TopFieldCollectorManager(sort, 1000 * mulFactor, 1)) - .scoreDocs; - hits2 = - bigSearcher.search(q1, new TopFieldCollectorManager(sort, 1000 * mulFactor, 1)) - .scoreDocs; + hits1 = bigSearcher.search(q1, new TopFieldCollectorManager(sort, mulFactor, 1)).scoreDocs; + hits2 = bigSearcher.search(q1, new TopFieldCollectorManager(sort, mulFactor, 1)).scoreDocs; CheckHits.checkEqual(q1, hits1, hits2); } From 87bc8270ffa2fc7d1438418e965b19bcb863a52a Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 5 Sep 2024 06:59:55 -0400 Subject: [PATCH 08/18] move Operations.sameLanguage/subsetOf to AutomatonTestUtil in test-framework (#13708) This code is suitable for tests only and may throw unexpected Exceptions or AssertionErrors for some input. 
--- lucene/CHANGES.txt | 2 + .../synonym/TestSynonymGraphFilter.java | 2 +- .../lucene/util/automaton/Operations.java | 79 ------------------ .../lucene/util/automaton/StatePair.java | 11 ++- .../lucene/analysis/TestGraphTokenizers.java | 2 +- .../apache/lucene/index/TestTermsEnum2.java | 2 +- .../lucene/util/automaton/TestAutomaton.java | 19 ++--- .../util/automaton/TestDeterminism.java | 10 +-- .../automaton/TestLevenshteinAutomata.java | 20 ++--- .../lucene/util/automaton/TestMinimize.java | 4 +- .../lucene/util/automaton/TestOperations.java | 8 +- .../util/automaton/TestRegExpParsing.java | 3 +- .../automaton/TestStringsToAutomaton.java | 3 +- .../util/automaton/AutomatonTestUtil.java | 80 +++++++++++++++++++ 14 files changed, 129 insertions(+), 116 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 80ca30c4fc7..b41144b2d9c 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -112,6 +112,8 @@ API Changes * GITHUB#13632: CandidateMatcher public matching functions (Bryan Jacobowitz) +* GITHUB#13708: Move Operations.sameLanguage/subsetOf to test-framework. 
(Robert Muir) + New Features --------------------- diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java index 1eb80ea5081..974edc8e9f6 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java @@ -1490,7 +1490,7 @@ public class TestSynonymGraphFilter extends BaseTokenStreamTestCase { } assertTrue(approxEquals(actual, expected)); - assertTrue(Operations.sameLanguage(actual, expected)); + assertTrue(AutomatonTestUtil.sameLanguage(actual, expected)); } a.close(); diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java index 2052b1c50bf..8fd43dbe1ff 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java @@ -35,7 +35,6 @@ import java.util.Arrays; import java.util.BitSet; import java.util.Collection; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; @@ -374,17 +373,6 @@ public final class Operations { return removeDeadStates(c); } - /** - * Returns true if these two automata accept exactly the same language. This is a costly - * computation! Both automata must be determinized and have no dead states! - */ - public static boolean sameLanguage(Automaton a1, Automaton a2) { - if (a1 == a2) { - return true; - } - return subsetOf(a2, a1) && subsetOf(a1, a2); - } - // TODO: move to test-framework? 
/** * Returns true if this automaton has any states that cannot be reached from the initial state or @@ -417,73 +405,6 @@ public final class Operations { return reachableFromAccept.isEmpty() == false; } - /** - * Returns true if the language of a1 is a subset of the language of a2. - * Both automata must be determinized and must have no dead states. - * - *

Complexity: quadratic in number of states. - */ - public static boolean subsetOf(Automaton a1, Automaton a2) { - if (a1.isDeterministic() == false) { - throw new IllegalArgumentException("a1 must be deterministic"); - } - if (a2.isDeterministic() == false) { - throw new IllegalArgumentException("a2 must be deterministic"); - } - assert hasDeadStatesFromInitial(a1) == false; - assert hasDeadStatesFromInitial(a2) == false; - if (a1.getNumStates() == 0) { - // Empty language is alwyas a subset of any other language - return true; - } else if (a2.getNumStates() == 0) { - return isEmpty(a1); - } - - // TODO: cutover to iterators instead - Transition[][] transitions1 = a1.getSortedTransitions(); - Transition[][] transitions2 = a2.getSortedTransitions(); - ArrayDeque worklist = new ArrayDeque<>(); - HashSet visited = new HashSet<>(); - StatePair p = new StatePair(0, 0); - worklist.add(p); - visited.add(p); - while (worklist.size() > 0) { - p = worklist.removeFirst(); - if (a1.isAccept(p.s1) && a2.isAccept(p.s2) == false) { - return false; - } - Transition[] t1 = transitions1[p.s1]; - Transition[] t2 = transitions2[p.s2]; - for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) { - while (b2 < t2.length && t2[b2].max < t1[n1].min) { - b2++; - } - int min1 = t1[n1].min, max1 = t1[n1].max; - - for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) { - if (t2[n2].min > min1) { - return false; - } - if (t2[n2].max < Character.MAX_CODE_POINT) { - min1 = t2[n2].max + 1; - } else { - min1 = Character.MAX_CODE_POINT; - max1 = Character.MIN_CODE_POINT; - } - StatePair q = new StatePair(t1[n1].dest, t2[n2].dest); - if (!visited.contains(q)) { - worklist.add(q); - visited.add(q); - } - } - if (min1 <= max1) { - return false; - } - } - } - return true; - } - /** * Returns an automaton that accepts the union of the languages of the given automata. 
* diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java index ad1be724f9c..bd003507f2d 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java @@ -35,9 +35,14 @@ package org.apache.lucene.util.automaton; * @lucene.experimental */ public class StatePair { + // only mike knows what it does (do not expose) int s; - int s1; - int s2; + + /** first state */ + public final int s1; + + /** second state */ + public final int s2; StatePair(int s, int s1, int s2) { this.s = s; @@ -81,7 +86,7 @@ public class StatePair { @Override public int hashCode() { // Don't use s1 ^ s2 since it's vulnerable to the case where s1 == s2 always --> hashCode = 0, - // e.g. if you call Operations.sameLanguage, + // e.g. if you call AutomatonTestUtil.sameLanguage, // passing the same automaton against itself: return s1 * 31 + s2; } diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java b/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java index 6e51a6bbb0c..4737694702f 100644 --- a/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java +++ b/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java @@ -625,7 +625,7 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase { Operations.removeDeadStates(expected), DEFAULT_DETERMINIZE_WORK_LIMIT); Automaton actualDet = Operations.determinize(Operations.removeDeadStates(actual), DEFAULT_DETERMINIZE_WORK_LIMIT); - if (Operations.sameLanguage(expectedDet, actualDet) == false) { + if (AutomatonTestUtil.sameLanguage(expectedDet, actualDet) == false) { Set expectedPaths = toPathStrings(expectedDet); Set actualPaths = toPathStrings(actualDet); StringBuilder b = new StringBuilder(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java 
b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java index ddca01a2387..105579b25d6 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum2.java @@ -183,7 +183,7 @@ public class TestTermsEnum2 extends LuceneTestCase { Automaton actual = Operations.determinize(Automata.makeStringUnion(found), DEFAULT_DETERMINIZE_WORK_LIMIT); - assertTrue(Operations.sameLanguage(expected, actual)); + assertTrue(AutomatonTestUtil.sameLanguage(expected, actual)); } } } diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java index e4dd739ef78..3c7d6eea198 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java @@ -87,7 +87,7 @@ public class TestAutomaton extends LuceneTestCase { Automaton a2 = Operations.removeDeadStates( Operations.concatenate(Automata.makeString("foo"), Automata.makeString("bar"))); - assertTrue(Operations.sameLanguage(a1, a2)); + assertTrue(AutomatonTestUtil.sameLanguage(a1, a2)); } public void testCommonPrefixString() throws Exception { @@ -257,7 +257,7 @@ public class TestAutomaton extends LuceneTestCase { Automaton a = Automata.makeString("foobar"); Automaton aMin = MinimizationOperations.minimize(a, DEFAULT_DETERMINIZE_WORK_LIMIT); - assertTrue(Operations.sameLanguage(a, aMin)); + assertTrue(AutomatonTestUtil.sameLanguage(a, aMin)); } public void testMinimize2() throws Exception { @@ -266,7 +266,7 @@ public class TestAutomaton extends LuceneTestCase { Arrays.asList(Automata.makeString("foobar"), Automata.makeString("boobar"))); Automaton aMin = MinimizationOperations.minimize(a, DEFAULT_DETERMINIZE_WORK_LIMIT); assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( Operations.determinize(Operations.removeDeadStates(a), 
DEFAULT_DETERMINIZE_WORK_LIMIT), aMin)); } @@ -276,7 +276,7 @@ public class TestAutomaton extends LuceneTestCase { Automaton ra = Operations.reverse(a); Automaton a2 = Operations.determinize(Operations.reverse(ra), DEFAULT_DETERMINIZE_WORK_LIMIT); - assertTrue(Operations.sameLanguage(a, a2)); + assertTrue(AutomatonTestUtil.sameLanguage(a, a2)); } public void testOptional() throws Exception { @@ -401,7 +401,7 @@ public class TestAutomaton extends LuceneTestCase { Automaton ra = Operations.reverse(a); Automaton rra = Operations.reverse(ra); assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( Operations.determinize(Operations.removeDeadStates(a), Integer.MAX_VALUE), Operations.determinize(Operations.removeDeadStates(rra), Integer.MAX_VALUE))); } @@ -502,7 +502,7 @@ public class TestAutomaton extends LuceneTestCase { } assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( Operations.determinize(Operations.removeDeadStates(a), Integer.MAX_VALUE), Operations.determinize( Operations.removeDeadStates(builder.finish()), Integer.MAX_VALUE))); @@ -735,7 +735,8 @@ public class TestAutomaton extends LuceneTestCase { a2.addTransition(0, state, 'a'); a2.finishState(); assertTrue( - Operations.sameLanguage(Operations.removeDeadStates(a), Operations.removeDeadStates(a2))); + AutomatonTestUtil.sameLanguage( + Operations.removeDeadStates(a), Operations.removeDeadStates(a2))); } private Automaton randomNoOp(Automaton a) { @@ -1288,7 +1289,7 @@ public class TestAutomaton extends LuceneTestCase { Automaton a2 = Operations.removeDeadStates(Operations.determinize(unionTerms(terms), Integer.MAX_VALUE)); assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( a2, Operations.removeDeadStates(Operations.determinize(a, Integer.MAX_VALUE)))); // Do same check, in UTF8 space @@ -1613,7 +1614,7 @@ public class TestAutomaton extends LuceneTestCase { public void testAcceptAllEmptyStringMin() throws Exception { Automaton a = 
Automata.makeBinaryInterval(newBytesRef(), true, null, true); - assertTrue(Operations.sameLanguage(Automata.makeAnyBinary(), a)); + assertTrue(AutomatonTestUtil.sameLanguage(Automata.makeAnyBinary(), a)); } private static IntsRef toIntsRef(String s) { diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestDeterminism.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestDeterminism.java index 65616fa55b9..e69568d3873 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestDeterminism.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestDeterminism.java @@ -41,7 +41,7 @@ public class TestDeterminism extends LuceneTestCase { a = AutomatonTestUtil.determinizeSimple(a); Automaton b = Operations.determinize(a, Integer.MAX_VALUE); // TODO: more verifications possible? - assertTrue(Operations.sameLanguage(a, b)); + assertTrue(AutomatonTestUtil.sameLanguage(a, b)); } } @@ -53,20 +53,20 @@ public class TestDeterminism extends LuceneTestCase { Operations.complement( Operations.complement(a, DEFAULT_DETERMINIZE_WORK_LIMIT), DEFAULT_DETERMINIZE_WORK_LIMIT); - assertTrue(Operations.sameLanguage(a, equivalent)); + assertTrue(AutomatonTestUtil.sameLanguage(a, equivalent)); // a union a = a equivalent = Operations.determinize( Operations.removeDeadStates(Operations.union(a, a)), DEFAULT_DETERMINIZE_WORK_LIMIT); - assertTrue(Operations.sameLanguage(a, equivalent)); + assertTrue(AutomatonTestUtil.sameLanguage(a, equivalent)); // a intersect a = a equivalent = Operations.determinize( Operations.removeDeadStates(Operations.intersection(a, a)), DEFAULT_DETERMINIZE_WORK_LIMIT); - assertTrue(Operations.sameLanguage(a, equivalent)); + assertTrue(AutomatonTestUtil.sameLanguage(a, equivalent)); // a minus a = empty Automaton empty = Operations.minus(a, a, DEFAULT_DETERMINIZE_WORK_LIMIT); @@ -81,7 +81,7 @@ public class TestDeterminism extends LuceneTestCase { equivalent = Operations.minus(optional, Automata.makeEmptyString(), 
DEFAULT_DETERMINIZE_WORK_LIMIT); // System.out.println("equiv " + equivalent); - assertTrue(Operations.sameLanguage(a, equivalent)); + assertTrue(AutomatonTestUtil.sameLanguage(a, equivalent)); } } } diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java index c8adb8751b9..bc6d268c15e 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java @@ -81,44 +81,46 @@ public class TestLevenshteinAutomata extends LuceneTestCase { // check that the dfa for n-1 accepts a subset of the dfa for n if (n > 0) { assertTrue( - Operations.subsetOf( + AutomatonTestUtil.subsetOf( Operations.removeDeadStates(automata[n - 1]), Operations.removeDeadStates(automata[n]))); assertTrue( - Operations.subsetOf( + AutomatonTestUtil.subsetOf( Operations.removeDeadStates(automata[n - 1]), Operations.removeDeadStates(tautomata[n]))); assertTrue( - Operations.subsetOf( + AutomatonTestUtil.subsetOf( Operations.removeDeadStates(tautomata[n - 1]), Operations.removeDeadStates(automata[n]))); assertTrue( - Operations.subsetOf( + AutomatonTestUtil.subsetOf( Operations.removeDeadStates(tautomata[n - 1]), Operations.removeDeadStates(tautomata[n]))); assertNotSame(automata[n - 1], automata[n]); } // check that Lev(N) is a subset of LevT(N) assertTrue( - Operations.subsetOf( + AutomatonTestUtil.subsetOf( Operations.removeDeadStates(automata[n]), Operations.removeDeadStates(tautomata[n]))); // special checks for specific n switch (n) { case 0: // easy, matches the string itself assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( Automata.makeString(s), Operations.removeDeadStates(automata[0]))); assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( Automata.makeString(s), Operations.removeDeadStates(tautomata[0]))); 
break; case 1: // generate a lev1 naively, and check the accepted lang is the same. assertTrue( - Operations.sameLanguage(naiveLev1(s), Operations.removeDeadStates(automata[1]))); + AutomatonTestUtil.sameLanguage( + naiveLev1(s), Operations.removeDeadStates(automata[1]))); assertTrue( - Operations.sameLanguage(naiveLev1T(s), Operations.removeDeadStates(tautomata[1]))); + AutomatonTestUtil.sameLanguage( + naiveLev1T(s), Operations.removeDeadStates(tautomata[1]))); break; default: assertBruteForce(s, automata[n], n); diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestMinimize.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestMinimize.java index a43cc8ae8b1..92be1e4d569 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestMinimize.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestMinimize.java @@ -28,7 +28,7 @@ public class TestMinimize extends LuceneTestCase { Automaton a = AutomatonTestUtil.randomAutomaton(random()); Automaton la = Operations.determinize(Operations.removeDeadStates(a), Integer.MAX_VALUE); Automaton lb = MinimizationOperations.minimize(a, Integer.MAX_VALUE); - assertTrue(Operations.sameLanguage(la, lb)); + assertTrue(AutomatonTestUtil.sameLanguage(la, lb)); } } @@ -42,7 +42,7 @@ public class TestMinimize extends LuceneTestCase { Automaton a = AutomatonTestUtil.randomAutomaton(random()); a = AutomatonTestUtil.minimizeSimple(a); Automaton b = MinimizationOperations.minimize(a, Integer.MAX_VALUE); - assertTrue(Operations.sameLanguage(a, b)); + assertTrue(AutomatonTestUtil.sameLanguage(a, b)); assertEquals(a.getNumStates(), b.getNumStates()); int numStates = a.getNumStates(); diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java index ec38eafe0ce..c6ccf403fc8 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java +++ 
b/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java @@ -50,7 +50,7 @@ public class TestOperations extends LuceneTestCase { assertTrue(naiveUnion.isDeterministic()); assertFalse(Operations.hasDeadStatesFromInitial(naiveUnion)); - assertTrue(Operations.sameLanguage(union, naiveUnion)); + assertTrue(AutomatonTestUtil.sameLanguage(union, naiveUnion)); } private static Automaton naiveUnion(List strings) { @@ -116,13 +116,13 @@ public class TestOperations extends LuceneTestCase { Automaton concat2 = Operations.concatenate(singleton, nfa); assertFalse(concat2.isDeterministic()); assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( Operations.determinize(concat1, 100), Operations.determinize(concat2, 100))); assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( Operations.determinize(nfa, 100), Operations.determinize(concat1, 100))); assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( Operations.determinize(nfa, 100), Operations.determinize(concat2, 100))); } diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExpParsing.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExpParsing.java index 7d0f062f36b..74fb08cb718 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExpParsing.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExpParsing.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.Map; import java.util.Set; import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.automaton.AutomatonTestUtil; /** * Simple unit tests for RegExp parsing. 
@@ -698,7 +699,7 @@ public class TestRegExpParsing extends LuceneTestCase { private void assertSameLanguage(Automaton expected, Automaton actual) { expected = Operations.determinize(expected, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); actual = Operations.determinize(actual, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); - boolean result = Operations.sameLanguage(expected, actual); + boolean result = AutomatonTestUtil.sameLanguage(expected, actual); if (result == false) { System.out.println(expected.toDot()); System.out.println(actual.toDot()); diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java index 0e5a3f9fc30..efaa451258b 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java @@ -28,6 +28,7 @@ import java.util.List; import java.util.Set; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.tests.util.automaton.AutomatonTestUtil; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; @@ -158,7 +159,7 @@ public class TestStringsToAutomaton extends LuceneTestCase { private static void assertSameAutomaton(Automaton a, Automaton b) { assertEquals(a.getNumStates(), b.getNumStates()); assertEquals(a.getNumTransitions(), b.getNumTransitions()); - assertTrue(Operations.sameLanguage(a, b)); + assertTrue(AutomatonTestUtil.sameLanguage(a, b)); } private List basicTerms() { diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/automaton/AutomatonTestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/automaton/AutomatonTestUtil.java index 38819479cfc..df2731f3cb4 100644 --- 
a/lucene/test-framework/src/java/org/apache/lucene/tests/util/automaton/AutomatonTestUtil.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/automaton/AutomatonTestUtil.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.tests.util.automaton; +import java.util.ArrayDeque; import java.util.ArrayList; import java.util.BitSet; import java.util.HashMap; @@ -33,6 +34,7 @@ import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.RegExp; +import org.apache.lucene.util.automaton.StatePair; import org.apache.lucene.util.automaton.TooComplexToDeterminizeException; import org.apache.lucene.util.automaton.Transition; @@ -533,4 +535,82 @@ public class AutomatonTestUtil { assert a.isDeterministic() == true; return true; } + + /** + * Returns true if these two automata accept exactly the same language. This is a costly + * computation! Both automata must be determinized and have no dead states! + */ + public static boolean sameLanguage(Automaton a1, Automaton a2) { + if (a1 == a2) { + return true; + } + return subsetOf(a2, a1) && subsetOf(a1, a2); + } + + /** + * Returns true if the language of a1 is a subset of the language of a2. + * Both automata must be determinized and must have no dead states. + * + *

Complexity: quadratic in number of states. + */ + public static boolean subsetOf(Automaton a1, Automaton a2) { + if (a1.isDeterministic() == false) { + throw new IllegalArgumentException("a1 must be deterministic"); + } + if (a2.isDeterministic() == false) { + throw new IllegalArgumentException("a2 must be deterministic"); + } + assert Operations.hasDeadStatesFromInitial(a1) == false; + assert Operations.hasDeadStatesFromInitial(a2) == false; + if (a1.getNumStates() == 0) { + // Empty language is always a subset of any other language + return true; + } else if (a2.getNumStates() == 0) { + return Operations.isEmpty(a1); + } + + // TODO: cutover to iterators instead + Transition[][] transitions1 = a1.getSortedTransitions(); + Transition[][] transitions2 = a2.getSortedTransitions(); + ArrayDeque<StatePair> worklist = new ArrayDeque<>(); + HashSet<StatePair> visited = new HashSet<>(); + StatePair p = new StatePair(0, 0); + worklist.add(p); + visited.add(p); + while (worklist.size() > 0) { + p = worklist.removeFirst(); + if (a1.isAccept(p.s1) && a2.isAccept(p.s2) == false) { + return false; + } + Transition[] t1 = transitions1[p.s1]; + Transition[] t2 = transitions2[p.s2]; + for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) { + while (b2 < t2.length && t2[b2].max < t1[n1].min) { + b2++; + } + int min1 = t1[n1].min, max1 = t1[n1].max; + + for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) { + if (t2[n2].min > min1) { + return false; + } + if (t2[n2].max < Character.MAX_CODE_POINT) { + min1 = t2[n2].max + 1; + } else { + min1 = Character.MAX_CODE_POINT; + max1 = Character.MIN_CODE_POINT; + } + StatePair q = new StatePair(t1[n1].dest, t2[n2].dest); + if (!visited.contains(q)) { + worklist.add(q); + visited.add(q); + } + } + if (min1 <= max1) { + return false; + } + } + } + return true; + } } From ea3a9b89278e2bbaebdd54c91e692f200f79e37b Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 5 Sep 2024 08:36:10 -0400 Subject: [PATCH 09/18] Relax Operations.isTotal() to work with a
deterministic automaton (#13707) Improve Operations.isTotal() to work with non-minimal automata. --- lucene/CHANGES.txt | 2 + .../lucene/util/automaton/Operations.java | 40 +++++++++++++++---- .../lucene/util/automaton/TestOperations.java | 36 +++++++++++++++++ 3 files changed, 71 insertions(+), 7 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index b41144b2d9c..42ab9c6fc63 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -169,6 +169,8 @@ Improvements * GITHUB#12172: Update Romanian stopwords list to include the modern unicode forms. (Trey Jones) +* GITHUB#13707: Improve Operations.isTotal() to work with non-minimal automata. (Dawid Weiss, Robert Muir) + Optimizations --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java index 8fd43dbe1ff..9cd0b3934aa 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java @@ -778,22 +778,48 @@ public final class Operations { return true; } - /** Returns true if the given automaton accepts all strings. The automaton must be minimized. */ + /** + * Returns true if the given automaton accepts all strings. + * + *

The automaton must be deterministic, or this method may return false. + * + *

Complexity: linear in number of states and transitions. + */ public static boolean isTotal(Automaton a) { return isTotal(a, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT); } /** * Returns true if the given automaton accepts all strings for the specified min/max range of the - * alphabet. The automaton must be minimized. + * alphabet. + * + *

The automaton must be deterministic, or this method may return false. + * + *

Complexity: linear in number of states and transitions. */ public static boolean isTotal(Automaton a, int minAlphabet, int maxAlphabet) { - if (a.isAccept(0) && a.getNumTransitions(0) == 1) { - Transition t = new Transition(); - a.getTransition(0, 0, t); - return t.dest == 0 && t.min == minAlphabet && t.max == maxAlphabet; + BitSet states = getLiveStates(a); + Transition spare = new Transition(); + int seenStates = 0; + for (int state = states.nextSetBit(0); state >= 0; state = states.nextSetBit(state + 1)) { + // all reachable states must be accept states + if (a.isAccept(state) == false) return false; + // all reachable states must contain transitions covering minAlphabet-maxAlphabet + int previousLabel = minAlphabet - 1; + for (int transition = 0; transition < a.getNumTransitions(state); transition++) { + a.getTransition(state, transition, spare); + // no gaps are allowed + if (spare.min > previousLabel + 1) return false; + previousLabel = spare.max; + } + if (previousLabel < maxAlphabet) return false; + if (state == Integer.MAX_VALUE) { + break; // or (state+1) would overflow + } + seenStates++; } - return false; + // we've checked all the states, automaton is either total or empty + return seenStates > 0; } /** diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java index c6ccf403fc8..849f615e726 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java @@ -173,6 +173,42 @@ public class TestOperations extends LuceneTestCase { assertTrue(exc.getMessage().contains("input automaton is too large")); } + public void testIsTotal() { + // minimal + assertFalse(Operations.isTotal(Automata.makeEmpty())); + assertFalse(Operations.isTotal(Automata.makeEmptyString())); + assertTrue(Operations.isTotal(Automata.makeAnyString())); + 
assertTrue(Operations.isTotal(Automata.makeAnyBinary(), 0, 255)); + assertFalse(Operations.isTotal(Automata.makeNonEmptyBinary(), 0, 255)); + // deterministic, but not minimal + assertTrue(Operations.isTotal(Operations.repeat(Automata.makeAnyChar()))); + Automaton tricky = + Operations.repeat( + Operations.union( + Automata.makeCharRange(Character.MIN_CODE_POINT, 100), + Automata.makeCharRange(101, Character.MAX_CODE_POINT))); + assertTrue(Operations.isTotal(tricky)); + // not total, but close + Automaton tricky2 = + Operations.repeat( + Operations.union( + Automata.makeCharRange(Character.MIN_CODE_POINT + 1, 100), + Automata.makeCharRange(101, Character.MAX_CODE_POINT))); + assertFalse(Operations.isTotal(tricky2)); + Automaton tricky3 = + Operations.repeat( + Operations.union( + Automata.makeCharRange(Character.MIN_CODE_POINT, 99), + Automata.makeCharRange(101, Character.MAX_CODE_POINT))); + assertFalse(Operations.isTotal(tricky3)); + Automaton tricky4 = + Operations.repeat( + Operations.union( + Automata.makeCharRange(Character.MIN_CODE_POINT, 100), + Automata.makeCharRange(101, Character.MAX_CODE_POINT - 1))); + assertFalse(Operations.isTotal(tricky4)); + } + /** * Returns the set of all accepted strings. 
* From 67ca244ef1b0cb8d0d36fa05ff2b4f3392895d44 Mon Sep 17 00:00:00 2001 From: Uwe Schindler Date: Thu, 5 Sep 2024 14:37:04 +0200 Subject: [PATCH 10/18] Clean up forbiddenapis: No need to serialize and deserialize the package private class --- .../taxonomy/writercache/CharBlockArray.java | 36 ------------------- .../writercache/TestCharBlockArray.java | 17 --------- 2 files changed, 53 deletions(-) diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/CharBlockArray.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/CharBlockArray.java index f358ed41836..f0d2a58adb6 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/CharBlockArray.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/CharBlockArray.java @@ -16,15 +16,9 @@ */ package org.apache.lucene.facet.taxonomy.writercache; -import java.io.IOException; -import java.io.InputStream; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.io.OutputStream; import java.io.Serializable; import java.util.ArrayList; import java.util.List; -import org.apache.lucene.util.SuppressForbidden; /** * Similar to {@link StringBuilder}, but with a more efficient growing strategy. 
This class uses @@ -185,34 +179,4 @@ class CharBlockArray implements Appendable, Serializable, CharSequence { } return sb.toString(); } - - @SuppressForbidden( - reason = "TODO: don't use java serialization here, inefficient and unnecessary") - void flush(OutputStream out) throws IOException { - ObjectOutputStream oos = null; - try { - oos = new ObjectOutputStream(out); - oos.writeObject(this); - oos.flush(); - } finally { - if (oos != null) { - oos.close(); - } - } - } - - @SuppressForbidden( - reason = "TODO: don't use java serialization here, inefficient and unnecessary") - public static CharBlockArray open(InputStream in) throws IOException, ClassNotFoundException { - ObjectInputStream ois = null; - try { - ois = new ObjectInputStream(in); - CharBlockArray a = (CharBlockArray) ois.readObject(); - return a; - } finally { - if (ois != null) { - ois.close(); - } - } - } } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/TestCharBlockArray.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/TestCharBlockArray.java index f918ed5e150..3c3ae4938a0 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/TestCharBlockArray.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/TestCharBlockArray.java @@ -16,14 +16,10 @@ */ package org.apache.lucene.facet.taxonomy.writercache; -import java.io.BufferedInputStream; -import java.io.BufferedOutputStream; import java.nio.ByteBuffer; import java.nio.charset.CharsetDecoder; import java.nio.charset.CodingErrorAction; import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; import org.apache.lucene.facet.FacetTestCase; public class TestCharBlockArray extends FacetTestCase { @@ -89,19 +85,6 @@ public class TestCharBlockArray extends FacetTestCase { } assertEqualsInternal("GrowingCharArray<->StringBuilder mismatch.", builder, array); - - Path tempDir = createTempDir("growingchararray"); - 
Path f = tempDir.resolve("GrowingCharArrayTest.tmp"); - BufferedOutputStream out = new BufferedOutputStream(Files.newOutputStream(f)); - array.flush(out); - out.flush(); - out.close(); - - BufferedInputStream in = new BufferedInputStream(Files.newInputStream(f)); - array = CharBlockArray.open(in); - assertEqualsInternal( - "GrowingCharArray<->StringBuilder mismatch after flush/load.", builder, array); - in.close(); } private static void assertEqualsInternal( From e8440956b969413d84b1d565826a24be1eba2093 Mon Sep 17 00:00:00 2001 From: Uwe Schindler Date: Thu, 5 Sep 2024 14:55:12 +0200 Subject: [PATCH 11/18] Remove useless Serializable interface --- .../facet/taxonomy/writercache/CharBlockArray.java | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/CharBlockArray.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/CharBlockArray.java index f0d2a58adb6..9a514486bad 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/CharBlockArray.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/CharBlockArray.java @@ -16,7 +16,6 @@ */ package org.apache.lucene.facet.taxonomy.writercache; -import java.io.Serializable; import java.util.ArrayList; import java.util.List; @@ -26,15 +25,11 @@ import java.util.List; * * @lucene.experimental */ -class CharBlockArray implements Appendable, Serializable, CharSequence { - - private static final long serialVersionUID = 1L; +class CharBlockArray implements Appendable, CharSequence { private static final int DefaultBlockSize = 32 * 1024; // 32 KB default size - static final class Block implements Serializable, Cloneable { - private static final long serialVersionUID = 1L; - + static final class Block implements Cloneable { final char[] chars; int length; From 11d7566229c602989514604d1a7d8bda365c8a29 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Thu, 5 Sep 2024 15:08:26 +0200 
Subject: [PATCH 12/18] Make Operations#repeat create simpler automata. (#13714) `Operations#repeat` currently creates an automaton that has one more final state, and 2x more transitions for each of the final states. With this change, the returned automaton has a single final state and only 2x more transitions for state 0. --- .../lucene/util/automaton/Operations.java | 51 ++++++-- .../lucene/util/automaton/TestOperations.java | 122 ++++++++++++++++++ 2 files changed, 165 insertions(+), 8 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java index 9cd0b3934aa..a29b8b42f97 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java @@ -181,30 +181,65 @@ public final class Operations { // Repeating the empty automata will still only accept the empty automata. return a; } + + if (a.isAccept(0) && a.getAcceptStates().cardinality() == 1) { + // If state 0 is the only accept state, then this automaton already repeats itself. + return a; + } + Automaton.Builder builder = new Automaton.Builder(); + // Create the initial state, which is accepted builder.createState(); builder.setAccept(0, true); - builder.copy(a); - Transition t = new Transition(); + + int[] stateMap = new int[a.getNumStates()]; + for (int state = 0; state < a.getNumStates(); ++state) { + if (a.isAccept(state) == false) { + stateMap[state] = builder.createState(); + } else if (a.getNumTransitions(state) == 0) { + // Accept states that have no transitions get merged into state 0. + stateMap[state] = 0; + } else { + int newState = builder.createState(); + stateMap[state] = newState; + builder.setAccept(newState, true); + } + } + + // Now copy the automaton while renumbering states. 
+ for (int state = 0; state < a.getNumStates(); ++state) { + int src = stateMap[state]; + int count = a.initTransition(state, t); + for (int i = 0; i < count; i++) { + a.getNextTransition(t); + int dest = stateMap[t.dest]; + builder.addTransition(src, dest, t.min, t.max); + } + } + + // Now copy transitions of the initial state to our new initial state. int count = a.initTransition(0, t); for (int i = 0; i < count; i++) { a.getNextTransition(t); - builder.addTransition(0, t.dest + 1, t.min, t.max); + builder.addTransition(0, stateMap[t.dest], t.min, t.max); } - int numStates = a.getNumStates(); - for (int s = 0; s < numStates; s++) { - if (a.isAccept(s)) { + // Now copy transitions of the initial state to final states to make the automaton repeat + // itself. + for (int s = a.getAcceptStates().nextSetBit(0); + s != -1; + s = a.getAcceptStates().nextSetBit(s + 1)) { + if (stateMap[s] != 0) { count = a.initTransition(0, t); for (int i = 0; i < count; i++) { a.getNextTransition(t); - builder.addTransition(s + 1, t.dest + 1, t.min, t.max); + builder.addTransition(stateMap[s], stateMap[t.dest], t.min, t.max); } } } - return builder.finish(); + return removeDeadStates(builder.finish()); } /** diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java index 849f615e726..40a6c7161e4 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java @@ -290,4 +290,126 @@ public class TestOperations extends LuceneTestCase { a.finishState(); return a; } + + public void testRepeat() { + Automaton emptyLanguage = Automata.makeEmpty(); + assertSame(emptyLanguage, Operations.repeat(emptyLanguage)); + + Automaton emptyString = Automata.makeEmptyString(); + assertSame(emptyString, Operations.repeat(emptyString)); + + Automaton a = Automata.makeChar('a'); + Automaton as = new 
Automaton(); + as.createState(); + as.setAccept(0, true); + as.addTransition(0, 0, 'a'); + as.finishState(); + assertTrue(Operations.sameLanguage(as, Operations.repeat(a))); + assertSame(as, Operations.repeat(as)); + + Automaton aOrEmpty = new Automaton(); + aOrEmpty.createState(); + aOrEmpty.setAccept(0, true); + aOrEmpty.createState(); + aOrEmpty.setAccept(1, true); + aOrEmpty.addTransition(0, 1, 'a'); + assertTrue(Operations.sameLanguage(as, Operations.repeat(aOrEmpty))); + + Automaton ab = Automata.makeString("ab"); + Automaton abs = new Automaton(); + abs.createState(); + abs.createState(); + abs.setAccept(0, true); + abs.addTransition(0, 1, 'a'); + abs.finishState(); + abs.addTransition(1, 0, 'b'); + abs.finishState(); + assertTrue(Operations.sameLanguage(abs, Operations.repeat(ab))); + assertSame(abs, Operations.repeat(abs)); + + Automaton absThenC = Operations.concatenate(abs, Automata.makeChar('c')); + Automaton absThenCs = new Automaton(); + absThenCs.createState(); + absThenCs.createState(); + absThenCs.createState(); + absThenCs.setAccept(0, true); + absThenCs.addTransition(0, 1, 'a'); + absThenCs.addTransition(0, 0, 'c'); + absThenCs.finishState(); + absThenCs.addTransition(1, 2, 'b'); + absThenCs.finishState(); + absThenCs.addTransition(2, 1, 'a'); + absThenCs.addTransition(2, 0, 'c'); + absThenCs.finishState(); + assertTrue(Operations.sameLanguage(absThenCs, Operations.repeat(absThenC))); + assertSame(absThenCs, Operations.repeat(absThenCs)); + + Automaton aOrAb = new Automaton(); + aOrAb.createState(); + aOrAb.createState(); + aOrAb.createState(); + aOrAb.setAccept(1, true); + aOrAb.setAccept(2, true); + aOrAb.addTransition(0, 1, 'a'); + aOrAb.finishState(); + aOrAb.addTransition(1, 2, 'b'); + aOrAb.finishState(); + Automaton aOrAbs = new Automaton(); + aOrAbs.createState(); + aOrAbs.createState(); + aOrAbs.setAccept(0, true); + aOrAbs.addTransition(0, 0, 'a'); + aOrAbs.addTransition(0, 1, 'a'); + aOrAbs.finishState(); + aOrAbs.addTransition(1, 0, 
'b'); + aOrAbs.finishState(); + assertTrue( + Operations.sameLanguage( + Operations.determinize(aOrAbs, Integer.MAX_VALUE), + Operations.determinize(Operations.repeat(aOrAb), Integer.MAX_VALUE))); + } + + public void testDuelRepeat() { + final int iters = atLeast(1_000); + for (int iter = 0; iter < iters; ++iter) { + Automaton a = AutomatonTestUtil.randomAutomaton(random()); + Automaton repeat1 = Operations.determinize(Operations.repeat(a), Integer.MAX_VALUE); + Automaton repeat2 = Operations.determinize(naiveRepeat(a), Integer.MAX_VALUE); + assertTrue(Operations.sameLanguage(repeat1, repeat2)); + } + } + + // This is the original implementation of Operations#repeat, before we improved it to generate + // simpler automata in some common cases. + private static Automaton naiveRepeat(Automaton a) { + if (a.getNumStates() == 0) { + return a; + } + + Automaton.Builder builder = new Automaton.Builder(); + // Create the initial state, which is accepted + builder.createState(); + builder.setAccept(0, true); + builder.copy(a); + + Transition t = new Transition(); + int count = a.initTransition(0, t); + for (int i = 0; i < count; i++) { + a.getNextTransition(t); + builder.addTransition(0, t.dest + 1, t.min, t.max); + } + + int numStates = a.getNumStates(); + for (int s = 0; s < numStates; s++) { + if (a.isAccept(s)) { + count = a.initTransition(0, t); + for (int i = 0; i < count; i++) { + a.getNextTransition(t); + builder.addTransition(s + 1, t.dest + 1, t.min, t.max); + } + } + } + + return builder.finish(); + } } From 67c0f8e847fc2802d71ff9685d4b43ad095115d2 Mon Sep 17 00:00:00 2001 From: Chris Hegarty <62058229+ChrisHegarty@users.noreply.github.com> Date: Thu, 5 Sep 2024 14:15:24 +0100 Subject: [PATCH 13/18] Determinize automata used by IntervalsSource.regex (#13718) This commit determinizes internal automata used in the construction of the IntervalsSource created by the regexp factory. 
--- .../lucene/queries/intervals/Intervals.java | 1 + .../queries/intervals/TestIntervalQuery.java | 20 ++++++++++++++++ .../queries/intervals/TestIntervals.java | 23 +++++++++++++++++++ 3 files changed, 44 insertions(+) diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java index 7b9c933c167..57d5674b254 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java @@ -238,6 +238,7 @@ public final class Intervals { */ public static IntervalsSource regexp(BytesRef regexp, int maxExpansions) { Automaton automaton = new RegExp(new Term("", regexp).text()).toAutomaton(); + automaton = Operations.determinize(automaton, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); CompiledAutomaton ca = new CompiledAutomaton(automaton, false, true, false); return new MultiTermIntervalsSource(ca, maxExpansions, regexp.utf8ToString()); } diff --git a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalQuery.java b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalQuery.java index d9141882c94..71228e4178f 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalQuery.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalQuery.java @@ -447,4 +447,24 @@ public class TestIntervalQuery extends LuceneTestCase { field, or(term("XXX"), containing(extend(term("message"), 0, 10), term("intend")))); checkHits(q, new int[] {}); } + + public void testEquality() { + assertEquals( + new IntervalQuery("f", Intervals.regexp(new BytesRef(".*foo"))), + new IntervalQuery("f", Intervals.regexp(new BytesRef(".*foo")))); + assertEquals( + new IntervalQuery("f", Intervals.prefix(new BytesRef("p"), 1)), + new IntervalQuery("f", Intervals.prefix(new BytesRef("p"), 1))); + assertEquals( + new 
IntervalQuery("f", Intervals.fuzzyTerm("kot", 1)), + new IntervalQuery("f", Intervals.fuzzyTerm("kot", 1))); + assertEquals( + new IntervalQuery("f", Intervals.wildcard(new BytesRef("*.txt"))), + new IntervalQuery("f", Intervals.wildcard(new BytesRef("*.txt")))); + assertEquals( + new IntervalQuery( + "f", Intervals.range(new BytesRef("cold"), new BytesRef("hot"), true, true)), + new IntervalQuery( + "f", Intervals.range(new BytesRef("cold"), new BytesRef("hot"), true, true))); + } } diff --git a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java index 944530937ce..43fd8fb7fb8 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java @@ -1187,4 +1187,27 @@ public class TestIntervals extends LuceneTestCase { checkVisits(source, 1); } + + // basic test for equality and inequality of instances created by the factories + public void testEquality() { + assertEquals(Intervals.term("wibble"), Intervals.term("wibble")); + assertEquals(Intervals.prefix(new BytesRef("p"), 1), Intervals.prefix(new BytesRef("p"), 1)); + assertEquals(Intervals.fuzzyTerm("kot", 1), Intervals.fuzzyTerm("kot", 1)); + assertEquals(Intervals.regexp(new BytesRef(".*ot")), Intervals.regexp(new BytesRef(".*ot"))); + assertEquals( + Intervals.wildcard(new BytesRef("*.txt")), Intervals.wildcard(new BytesRef("*.txt"))); + assertEquals( + Intervals.range(new BytesRef("cold"), new BytesRef("hot"), true, true), + Intervals.range(new BytesRef("cold"), new BytesRef("hot"), true, true)); + + assertNotEquals(Intervals.term("wibble"), Intervals.term("wobble")); + assertNotEquals(Intervals.prefix(new BytesRef("p"), 1), Intervals.prefix(new BytesRef("b"), 1)); + assertNotEquals(Intervals.fuzzyTerm("kot", 1), Intervals.fuzzyTerm("kof", 1)); + assertNotEquals(Intervals.regexp(new 
BytesRef(".*ot")), Intervals.regexp(new BytesRef(".*at"))); + assertNotEquals( + Intervals.wildcard(new BytesRef("*.txt")), Intervals.wildcard(new BytesRef("*.tat"))); + assertNotEquals( + Intervals.range(new BytesRef("warm"), new BytesRef("hot"), true, true), + Intervals.range(new BytesRef("cold"), new BytesRef("hot"), true, true)); + } } From a414a96eee54eebf7dda367e15f80d079228c7ec Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Thu, 5 Sep 2024 16:03:29 +0200 Subject: [PATCH 14/18] Operations#sameLanguage moved to AutomatonTestUtil. --- .../apache/lucene/util/automaton/TestOperations.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java index 40a6c7161e4..c158dd9f3bb 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java @@ -304,7 +304,7 @@ public class TestOperations extends LuceneTestCase { as.setAccept(0, true); as.addTransition(0, 0, 'a'); as.finishState(); - assertTrue(Operations.sameLanguage(as, Operations.repeat(a))); + assertTrue(AutomatonTestUtil.sameLanguage(as, Operations.repeat(a))); assertSame(as, Operations.repeat(as)); Automaton aOrEmpty = new Automaton(); @@ -313,7 +313,7 @@ public class TestOperations extends LuceneTestCase { aOrEmpty.createState(); aOrEmpty.setAccept(1, true); aOrEmpty.addTransition(0, 1, 'a'); - assertTrue(Operations.sameLanguage(as, Operations.repeat(aOrEmpty))); + assertTrue(AutomatonTestUtil.sameLanguage(as, Operations.repeat(aOrEmpty))); Automaton ab = Automata.makeString("ab"); Automaton abs = new Automaton(); @@ -324,7 +324,7 @@ public class TestOperations extends LuceneTestCase { abs.finishState(); abs.addTransition(1, 0, 'b'); abs.finishState(); - assertTrue(Operations.sameLanguage(abs, Operations.repeat(ab))); + 
assertTrue(AutomatonTestUtil.sameLanguage(abs, Operations.repeat(ab))); assertSame(abs, Operations.repeat(abs)); Automaton absThenC = Operations.concatenate(abs, Automata.makeChar('c')); @@ -341,7 +341,7 @@ public class TestOperations extends LuceneTestCase { absThenCs.addTransition(2, 1, 'a'); absThenCs.addTransition(2, 0, 'c'); absThenCs.finishState(); - assertTrue(Operations.sameLanguage(absThenCs, Operations.repeat(absThenC))); + assertTrue(AutomatonTestUtil.sameLanguage(absThenCs, Operations.repeat(absThenC))); assertSame(absThenCs, Operations.repeat(absThenCs)); Automaton aOrAb = new Automaton(); @@ -364,7 +364,7 @@ public class TestOperations extends LuceneTestCase { aOrAbs.addTransition(1, 0, 'b'); aOrAbs.finishState(); assertTrue( - Operations.sameLanguage( + AutomatonTestUtil.sameLanguage( Operations.determinize(aOrAbs, Integer.MAX_VALUE), Operations.determinize(Operations.repeat(aOrAb), Integer.MAX_VALUE))); } @@ -375,7 +375,7 @@ public class TestOperations extends LuceneTestCase { Automaton a = AutomatonTestUtil.randomAutomaton(random()); Automaton repeat1 = Operations.determinize(Operations.repeat(a), Integer.MAX_VALUE); Automaton repeat2 = Operations.determinize(naiveRepeat(a), Integer.MAX_VALUE); - assertTrue(Operations.sameLanguage(repeat1, repeat2)); + assertTrue(AutomatonTestUtil.sameLanguage(repeat1, repeat2)); } } From 4c27bccac544550bbc48a0a77a2245d02c2bf7d4 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Thu, 5 Sep 2024 16:50:31 +0200 Subject: [PATCH 15/18] Early exit from Operations#removeDeadStates when an automaton doesn't have dead states. 
(#13721) --- .../src/java/org/apache/lucene/util/automaton/Automaton.java | 1 + .../apache/lucene/util/automaton/FiniteStringsIterator.java | 2 +- .../src/java/org/apache/lucene/util/automaton/Operations.java | 3 +++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java index 7ad9eedda6f..34a18446191 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java @@ -339,6 +339,7 @@ public class Automaton implements Accountable, TransitionAccessor { @Override public int getNumTransitions(int state) { assert state >= 0; + assert state < getNumStates(); int count = states[2 * state + 1]; if (count == -1) { return 0; diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/FiniteStringsIterator.java b/lucene/core/src/java/org/apache/lucene/util/automaton/FiniteStringsIterator.java index b9030c19cf0..e141b4b95cf 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/FiniteStringsIterator.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/FiniteStringsIterator.java @@ -86,7 +86,7 @@ public class FiniteStringsIterator { this.emitEmptyString = a.isAccept(0); // Start iteration with node startState. 
- if (a.getNumTransitions(startState) > 0) { + if (a.getNumStates() > startState && a.getNumTransitions(startState) > 0) { pathStates.set(startState); nodes[0].resetState(a, startState); string.append(startState); diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java index a29b8b42f97..3cb02c77a82 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java @@ -986,6 +986,9 @@ public final class Operations { public static Automaton removeDeadStates(Automaton a) { int numStates = a.getNumStates(); BitSet liveSet = getLiveStates(a); + if (liveSet.cardinality() == numStates) { + return a; + } int[] map = new int[numStates]; From 5f242b3b268b5aba187070bf813e02d563c07af4 Mon Sep 17 00:00:00 2001 From: Luca Cavanna Date: Thu, 5 Sep 2024 20:05:46 +0200 Subject: [PATCH 16/18] Dry up TestScorerPerf (#13712) Some of the test methods were commented out when this test class was added. They got later removed but the removal left unused method behind. I also adjusted visibility of all the internal methods that were public and should have been private, which led me to further clean up: `MatchingHitCollector` was not needed and can be removed. 
--- .../apache/lucene/search/TestScorerPerf.java | 207 +++--------------- 1 file changed, 25 insertions(+), 182 deletions(-) diff --git a/lucene/core/src/test/org/apache/lucene/search/TestScorerPerf.java b/lucene/core/src/test/org/apache/lucene/search/TestScorerPerf.java index 25404eacb30..fa20d124b04 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestScorerPerf.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestScorerPerf.java @@ -17,15 +17,11 @@ package org.apache.lucene.search; import java.io.IOException; -import java.util.BitSet; import java.util.Collection; import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.util.LuceneTestCase; @@ -35,34 +31,7 @@ import org.apache.lucene.util.FixedBitSet; public class TestScorerPerf extends LuceneTestCase { private final boolean validate = true; // set to false when doing performance testing - public void createRandomTerms(int nDocs, int nTerms, double power, Directory dir) - throws Exception { - int[] freq = new int[nTerms]; - Term[] terms = new Term[nTerms]; - for (int i = 0; i < nTerms; i++) { - int f = (nTerms + 1) - i; // make first terms less frequent - freq[i] = (int) Math.ceil(Math.pow(f, power)); - terms[i] = new Term("f", Character.toString((char) ('A' + i))); - } - - IndexWriter iw = - new IndexWriter( - dir, newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.CREATE)); - for (int i = 0; i < nDocs; i++) { - Document d = new Document(); - for (int j = 0; j < nTerms; j++) { - if (random().nextInt(freq[j]) == 0) { - d.add(newStringField("f", terms[j].text(), Field.Store.NO)); - // 
System.out.println(d); - } - } - iw.addDocument(d); - } - iw.forceMerge(1); - iw.close(); - } - - public FixedBitSet randBitSet(int sz, int numBitsToSet) { + private static FixedBitSet randBitSet(int sz, int numBitsToSet) { FixedBitSet set = new FixedBitSet(sz); for (int i = 0; i < numBitsToSet; i++) { set.set(random().nextInt(sz)); @@ -70,7 +39,7 @@ public class TestScorerPerf extends LuceneTestCase { return set; } - public FixedBitSet[] randBitSets(int numSets, int setSize) { + private static FixedBitSet[] randBitSets(int numSets, int setSize) { FixedBitSet[] sets = new FixedBitSet[numSets]; for (int i = 0; i < sets.length; i++) { sets[i] = randBitSet(setSize, random().nextInt(setSize)); @@ -81,22 +50,13 @@ public class TestScorerPerf extends LuceneTestCase { private static final class CountingHitCollectorManager implements CollectorManager { - private final boolean validate; - private final FixedBitSet result; - - CountingHitCollectorManager(boolean validate, FixedBitSet result) { - this.validate = validate; - this.result = result; - } - @Override public CountingHitCollector newCollector() { - return validate ? 
new MatchingHitCollector(result) : new CountingHitCollector(); + return new CountingHitCollector(); } @Override - public CountingHitCollector reduce(Collection collectors) - throws IOException { + public CountingHitCollector reduce(Collection collectors) { CountingHitCollector result = new CountingHitCollector(); for (CountingHitCollector collector : collectors) { result.count += collector.count; @@ -106,7 +66,7 @@ public class TestScorerPerf extends LuceneTestCase { } } - public static class CountingHitCollector extends SimpleCollector { + private static class CountingHitCollector extends SimpleCollector { int count = 0; int sum = 0; protected int docBase = 0; @@ -121,12 +81,8 @@ public class TestScorerPerf extends LuceneTestCase { return count; } - public int getSum() { - return sum; - } - @Override - protected void doSetNextReader(LeafReaderContext context) throws IOException { + protected void doSetNextReader(LeafReaderContext context) { docBase = context.docBase; } @@ -136,24 +92,6 @@ public class TestScorerPerf extends LuceneTestCase { } } - public static class MatchingHitCollector extends CountingHitCollector { - FixedBitSet answer; - int pos = -1; - - public MatchingHitCollector(FixedBitSet answer) { - this.answer = answer; - } - - public void collect(int doc, float score) { - - pos = answer.nextSetBit(pos + 1); - if (pos != doc + docBase) { - throw new RuntimeException("Expected doc " + pos + " but got " + (doc + docBase)); - } - super.collect(doc); - } - } - private static class BitSetQuery extends Query { private final FixedBitSet docs; @@ -163,11 +101,10 @@ public class TestScorerPerf extends LuceneTestCase { } @Override - public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) - throws IOException { + public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) { return new ConstantScoreWeight(this, boost) { @Override - public ScorerSupplier scorerSupplier(LeafReaderContext context) throws 
IOException { + public ScorerSupplier scorerSupplier(LeafReaderContext context) { final var scorer = new ConstantScoreScorer( score(), scoreMode, new BitSetIterator(docs, docs.approximateCardinality())); @@ -200,20 +137,22 @@ public class TestScorerPerf extends LuceneTestCase { } } - FixedBitSet addClause(FixedBitSet[] sets, BooleanQuery.Builder bq, FixedBitSet result) { + private FixedBitSet addClause(FixedBitSet[] sets, BooleanQuery.Builder bq, FixedBitSet result) { final FixedBitSet rnd = sets[random().nextInt(sets.length)]; Query q = new BitSetQuery(rnd); bq.add(q, BooleanClause.Occur.MUST); if (validate) { - if (result == null) result = rnd.clone(); - else result.and(rnd); + if (result == null) { + result = rnd.clone(); + } else { + result.and(rnd); + } } return result; } - public int doConjunctions(IndexSearcher s, FixedBitSet[] sets, int iter, int maxClauses) + private void doConjunctions(IndexSearcher s, FixedBitSet[] sets, int iter, int maxClauses) throws IOException { - int ret = 0; for (int i = 0; i < iter; i++) { int nClauses = random().nextInt(maxClauses - 1) + 2; // min 2 clauses @@ -222,21 +161,17 @@ public class TestScorerPerf extends LuceneTestCase { for (int j = 0; j < nClauses; j++) { result = addClause(sets, bq, result); } - CountingHitCollector hc = - s.search(bq.build(), new CountingHitCollectorManager(validate, result)); - ret += hc.getSum(); + CountingHitCollector hc = s.search(bq.build(), new CountingHitCollectorManager()); - if (validate) assertEquals(result.cardinality(), hc.getCount()); - // System.out.println(hc.getCount()); + if (validate) { + assertEquals(result.cardinality(), hc.getCount()); + } } - - return ret; } - public int doNestedConjunctions( + private void doNestedConjunctions( IndexSearcher s, FixedBitSet[] sets, int iter, int maxOuterClauses, int maxClauses) throws IOException { - int ret = 0; long nMatches = 0; for (int i = 0; i < iter; i++) { @@ -255,107 +190,15 @@ public class TestScorerPerf extends LuceneTestCase { 
oq.add(bq.build(), BooleanClause.Occur.MUST); } // outer - CountingHitCollector hc = - s.search(oq.build(), new CountingHitCollectorManager(validate, result)); + CountingHitCollector hc = s.search(oq.build(), new CountingHitCollectorManager()); nMatches += hc.getCount(); - ret += hc.getSum(); - if (validate) assertEquals(result.cardinality(), hc.getCount()); - // System.out.println(hc.getCount()); - } - if (VERBOSE) System.out.println("Average number of matches=" + (nMatches / iter)); - return ret; - } - - public int doTermConjunctions( - Term[] terms, IndexSearcher s, int termsInIndex, int maxClauses, int iter) - throws IOException { - int ret = 0; - - long nMatches = 0; - for (int i = 0; i < iter; i++) { - int nClauses = random().nextInt(maxClauses - 1) + 2; // min 2 clauses - BooleanQuery.Builder bq = new BooleanQuery.Builder(); - BitSet termflag = new BitSet(termsInIndex); - for (int j = 0; j < nClauses; j++) { - int tnum; - // don't pick same clause twice - tnum = random().nextInt(termsInIndex); - if (termflag.get(tnum)) tnum = termflag.nextClearBit(tnum); - if (tnum < 0 || tnum >= termsInIndex) tnum = termflag.nextClearBit(0); - termflag.set(tnum); - Query tq = new TermQuery(terms[tnum]); - bq.add(tq, BooleanClause.Occur.MUST); + if (validate) { + assertEquals(result.cardinality(), hc.getCount()); } - - CountingHitCollector hc = s.search(bq.build(), new CountingHitCollectorManager(false, null)); - nMatches += hc.getCount(); - ret += hc.getSum(); } - if (VERBOSE) System.out.println("Average number of matches=" + (nMatches / iter)); - - return ret; - } - - public int doNestedTermConjunctions( - IndexSearcher s, - Term[] terms, - int termsInIndex, - int maxOuterClauses, - int maxClauses, - int iter) - throws IOException { - int ret = 0; - long nMatches = 0; - for (int i = 0; i < iter; i++) { - int oClauses = random().nextInt(maxOuterClauses - 1) + 2; - BooleanQuery.Builder oq = new BooleanQuery.Builder(); - for (int o = 0; o < oClauses; o++) { - - int nClauses = 
random().nextInt(maxClauses - 1) + 2; // min 2 clauses - BooleanQuery.Builder bq = new BooleanQuery.Builder(); - BitSet termflag = new BitSet(termsInIndex); - for (int j = 0; j < nClauses; j++) { - int tnum; - // don't pick same clause twice - tnum = random().nextInt(termsInIndex); - if (termflag.get(tnum)) tnum = termflag.nextClearBit(tnum); - if (tnum < 0 || tnum >= 25) tnum = termflag.nextClearBit(0); - termflag.set(tnum); - Query tq = new TermQuery(terms[tnum]); - bq.add(tq, BooleanClause.Occur.MUST); - } // inner - - oq.add(bq.build(), BooleanClause.Occur.MUST); - } // outer - - CountingHitCollector hc = s.search(oq.build(), new CountingHitCollectorManager(false, null)); - nMatches += hc.getCount(); - ret += hc.getSum(); + if (VERBOSE) { + System.out.println("Average number of matches=" + (nMatches / iter)); } - if (VERBOSE) System.out.println("Average number of matches=" + (nMatches / iter)); - return ret; - } - - public int doSloppyPhrase(IndexSearcher s, int termsInIndex, int maxClauses, int iter) - throws IOException { - int ret = 0; - - for (int i = 0; i < iter; i++) { - int nClauses = random().nextInt(maxClauses - 1) + 2; // min 2 clauses - PhraseQuery.Builder builder = new PhraseQuery.Builder(); - for (int j = 0; j < nClauses; j++) { - int tnum = random().nextInt(termsInIndex); - builder.add(new Term("f", Character.toString((char) (tnum + 'A')))); - } - // slop could be random too - builder.setSlop(termsInIndex); - PhraseQuery q = builder.build(); - - CountingHitCollector hc = s.search(q, new CountingHitCollectorManager(false, null)); - ret += hc.getSum(); - } - - return ret; } public void testConjunctions() throws Exception { From d7dc57dd0d624c1a69a69a787dad1b5223d66ba4 Mon Sep 17 00:00:00 2001 From: Dawid Weiss Date: Fri, 6 Sep 2024 09:01:15 +0200 Subject: [PATCH 17/18] jgit/ clean status check should ignore any 'untracked folders' (#13728) * Ignore any 'untracked folders' #13719 * Upgrade jgit to 6.10.0.202406032230-r. 
--- gradle/validation/git-status.gradle | 18 +----------------- versions.toml | 2 +- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/gradle/validation/git-status.gradle b/gradle/validation/git-status.gradle index 31b80641683..fa976589836 100644 --- a/gradle/validation/git-status.gradle +++ b/gradle/validation/git-status.gradle @@ -74,21 +74,6 @@ configure(rootProject) { logger.warn("WARNING: Directory is not a valid git checkout (won't check dirty files): ${rootProject.projectDir}") } } else { - // git ignores any folders which are empty (this includes folders with recursively empty sub-folders). - def untrackedNonEmptyFolders = status.untrackedFolders.findAll { path -> - File location = file("${rootProject.projectDir}/${path}") - boolean hasFiles = false - Files.walkFileTree(location.toPath(), new SimpleFileVisitor() { - @Override - FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { - hasFiles = true - // Terminate early. - return FileVisitResult.TERMINATE - } - }) - return hasFiles - } - def offenders = [ // Exclude staged changes. These are fine in precommit. 
// "(added)": status.added, @@ -97,8 +82,7 @@ configure(rootProject) { "(conflicting)": status.conflicting, "(missing)": status.missing, "(modified)": status.modified, - "(untracked)": status.untracked, - "(untracked non-empty dir)": untrackedNonEmptyFolders + "(untracked)": status.untracked ].collectMany { fileStatus, files -> files.collect {file -> " - ${file} ${fileStatus}" } }.sort() diff --git a/versions.toml b/versions.toml index c93d1e405c4..96dfc797080 100644 --- a/versions.toml +++ b/versions.toml @@ -14,7 +14,7 @@ hamcrest = "2.2" icu4j = "74.2" javacc = "7.0.12" jflex = "1.8.2" -jgit = "5.13.1.202206130422-r" +jgit = "6.10.0.202406032230-r" jmh = "1.37" jts = "1.17.0" junit = "4.13.1" From 0b8ae220bedaa1ff1b81a0f6ea8427ed9861f94b Mon Sep 17 00:00:00 2001 From: Luca Cavanna Date: Fri, 6 Sep 2024 11:57:52 +0200 Subject: [PATCH 18/18] Remove leftover search(Query, Collector) usages in TestTaxonomyFacetAssociations (#13726) --- .../facet/taxonomy/TestTaxonomyFacetAssociations.java | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetAssociations.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetAssociations.java index c452a01292e..bd2d091a2fd 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetAssociations.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetAssociations.java @@ -242,10 +242,9 @@ public class TestTaxonomyFacetAssociations extends FacetTestCase { public void testIntAssociationRandom() throws Exception { - FacetsCollector fc = new FacetsCollector(); - IndexSearcher searcher = newSearcher(reader); - searcher.search(new TermQuery(new Term("match", "yes")), fc); + FacetsCollector fc = + searcher.search(new TermQuery(new Term("match", "yes")), new FacetsCollectorManager()); Map expected; Facets facets; @@ -332,10 +331,9 @@ public class 
TestTaxonomyFacetAssociations extends FacetTestCase { public void testFloatAssociationRandom() throws Exception { - FacetsCollector fc = new FacetsCollector(); - IndexSearcher searcher = newSearcher(reader); - searcher.search(new TermQuery(new Term("match", "yes")), fc); + FacetsCollector fc = + searcher.search(new TermQuery(new Term("match", "yes")), new FacetsCollectorManager()); Map expected; Facets facets;