From cf016f8987e804bcd858a2a414eacdf1b3c54cf5 Mon Sep 17 00:00:00 2001 From: javanna Date: Wed, 14 Nov 2018 10:51:48 +0100 Subject: [PATCH] LUCENE-8563: Remove k1+1 constant factor from BM25 formula numerator. Signed-off-by: Adrien Grand --- lucene/CHANGES.txt | 4 + lucene/MIGRATE.txt | 8 ++ .../search/similarities/BM25Similarity.java | 4 +- .../similarity/LegacyBM25Similarity.java | 96 ++++++++++++++ .../lucene/search/similarity/package.html | 22 ++++ .../similarity/TestLegacyBM25Similarity.java | 122 ++++++++++++++++++ .../similarities/BM25SimilarityFactory.java | 8 +- .../similarities/SchemaSimilarityFactory.java | 6 +- .../solr/rest/schema/TestBulkSchemaAPI.java | 49 ++++--- .../TestBM25SimilarityFactory.java | 8 +- .../TestNonDefinedSimilarityFactory.java | 4 +- .../similarities/TestPerFieldSimilarity.java | 8 +- 12 files changed, 293 insertions(+), 46 deletions(-) create mode 100644 lucene/misc/src/java/org/apache/lucene/search/similarity/LegacyBM25Similarity.java create mode 100644 lucene/misc/src/java/org/apache/lucene/search/similarity/package.html create mode 100644 lucene/misc/src/test/org/apache/lucene/search/similarity/TestLegacyBM25Similarity.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index f5b056e5207..bcecbc9957d 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -138,6 +138,10 @@ Changes in Runtime Behavior were collapsed and renamed to just getTermPostingsEnum and moved to MultiTerms. (David Smiley) +* LUCENE-8563: BM25 scores don't include the (k1+1) factor in their numerator + anymore. This doesn't affect ordering as this is a constant factor which is + the same for every document. (Luca Cavanna via Adrien Grand) + New Features * LUCENE-8340: LongPoint#newDistanceQuery may be used to boost scores based on diff --git a/lucene/MIGRATE.txt b/lucene/MIGRATE.txt index 53187aa2d87..0b78d3c345a 100644 --- a/lucene/MIGRATE.txt +++ b/lucene/MIGRATE.txt @@ -150,3 +150,11 @@ in order to support ToParent/ToChildBlockJoinQuery. Normalization is now type-safe, with CharFilterFactory#normalize() returning a Reader and TokenFilterFactory#normalize() returning a TokenFilter. + +## k1+1 constant factor removed from BM25 similarity numerator (LUCENE-8563) ## + +Scores computed by the BM25 similarity are lower than previously as the k1+1 +constant factor was removed from the numerator of the scoring formula. +Ordering of results is preserved unless scores are computed from multiple +fields using different similarities. The previous behaviour is now exposed +by the LegacyBM25Similarity class which can be found in the lucene-misc jar. diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java index e1d42421d3c..626e7d0bf68 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java @@ -216,7 +216,7 @@ public class BM25Similarity extends Similarity { this.k1 = k1; this.b = b; this.cache = cache; - this.weight = (k1 + 1) * boost * idf.getValue().floatValue(); + this.weight = boost * idf.getValue().floatValue(); } @Override @@ -254,8 +254,6 @@ public class BM25Similarity extends Similarity { private List explainConstantFactors() { List subs = new ArrayList<>(); - // scale factor - subs.add(Explanation.match(k1 + 1, "scaling factor, k1 + 1")); // query boost if (boost != 1.0f) { subs.add(Explanation.match(boost, "boost")); diff --git a/lucene/misc/src/java/org/apache/lucene/search/similarity/LegacyBM25Similarity.java b/lucene/misc/src/java/org/apache/lucene/search/similarity/LegacyBM25Similarity.java new file mode 100644 index 00000000000..58091a7c972 --- /dev/null +++ b/lucene/misc/src/java/org/apache/lucene/search/similarity/LegacyBM25Similarity.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.similarity; + +import org.apache.lucene.index.FieldInvertState; +import org.apache.lucene.search.CollectionStatistics; +import org.apache.lucene.search.TermStatistics; +import org.apache.lucene.search.similarities.BM25Similarity; +import org.apache.lucene.search.similarities.Similarity; + +/** + * Similarity that behaves like {@link BM25Similarity} while also applying + * the k1+1 factor to the numerator of the scoring formula + * + * @see BM25Similarity + * + * @deprecated {@link BM25Similarity} should be used instead + */ +@Deprecated +public final class LegacyBM25Similarity extends Similarity { + + private final BM25Similarity bm25Similarity; + + /** BM25 with these default values: + *
    + *
  • {@code k1 = 1.2}
  • + *
  • {@code b = 0.75}
  • + *
+ */ + public LegacyBM25Similarity() { + this.bm25Similarity = new BM25Similarity(); + } + + /** + * BM25 with the supplied parameter values. + * @param k1 Controls non-linear term frequency normalization (saturation). + * @param b Controls to what degree document length normalizes tf values. + * @throws IllegalArgumentException if {@code k1} is infinite or negative, or if {@code b} is + * not within the range {@code [0..1]} + */ + public LegacyBM25Similarity(float k1, float b) { + this.bm25Similarity = new BM25Similarity(k1, b); + } + + @Override + public long computeNorm(FieldInvertState state) { + return bm25Similarity.computeNorm(state); + } + + @Override + public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { + return bm25Similarity.scorer(boost * (1 + bm25Similarity.getK1()), collectionStats, termStats); + } + + /** + * Returns the k1 parameter + * @see #LegacyBM25Similarity(float, float) + */ + public final float getK1() { + return bm25Similarity.getK1(); + } + + /** + * Returns the b parameter + * @see #LegacyBM25Similarity(float, float) + */ + public final float getB() { + return bm25Similarity.getB(); + } + + /** Sets whether overlap tokens (Tokens with 0 position increment) are + * ignored when computing norm. By default this is true, meaning overlap + * tokens do not count when computing norms. */ + public void setDiscountOverlaps(boolean v) { + bm25Similarity.setDiscountOverlaps(v); + } + + @Override + public String toString() { + return bm25Similarity.toString(); + } +} diff --git a/lucene/misc/src/java/org/apache/lucene/search/similarity/package.html b/lucene/misc/src/java/org/apache/lucene/search/similarity/package.html new file mode 100644 index 00000000000..7f624d41d54 --- /dev/null +++ b/lucene/misc/src/java/org/apache/lucene/search/similarity/package.html @@ -0,0 +1,22 @@ + + + + +Misc similarity implementations. + + \ No newline at end of file diff --git a/lucene/misc/src/test/org/apache/lucene/search/similarity/TestLegacyBM25Similarity.java b/lucene/misc/src/test/org/apache/lucene/search/similarity/TestLegacyBM25Similarity.java new file mode 100644 index 00000000000..b3a0cd24ca1 --- /dev/null +++ b/lucene/misc/src/test/org/apache/lucene/search/similarity/TestLegacyBM25Similarity.java @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.similarity; + +import java.util.Random; + +import org.apache.lucene.search.similarities.BM25Similarity; +import org.apache.lucene.search.similarities.BaseSimilarityTestCase; +import org.apache.lucene.search.similarities.Similarity; + +public class TestLegacyBM25Similarity extends BaseSimilarityTestCase { + + public void testIllegalK1() { + IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> { + new LegacyBM25Similarity(Float.POSITIVE_INFINITY, 0.75f); + }); + assertTrue(expected.getMessage().contains("illegal k1 value")); + + expected = expectThrows(IllegalArgumentException.class, () -> { + new LegacyBM25Similarity(-1, 0.75f); + }); + assertTrue(expected.getMessage().contains("illegal k1 value")); + + expected = expectThrows(IllegalArgumentException.class, () -> { + new LegacyBM25Similarity(Float.NaN, 0.75f); + }); + assertTrue(expected.getMessage().contains("illegal k1 value")); + } + + public void testIllegalB() { + IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> { + new LegacyBM25Similarity(1.2f, 2f); + }); + assertTrue(expected.getMessage().contains("illegal b value")); + + expected = expectThrows(IllegalArgumentException.class, () -> { + new LegacyBM25Similarity(1.2f, -1f); + }); + assertTrue(expected.getMessage().contains("illegal b value")); + + expected = expectThrows(IllegalArgumentException.class, () -> { + new LegacyBM25Similarity(1.2f, Float.POSITIVE_INFINITY); + }); + assertTrue(expected.getMessage().contains("illegal b value")); + + expected = expectThrows(IllegalArgumentException.class, () -> { + new LegacyBM25Similarity(1.2f, Float.NaN); + }); + assertTrue(expected.getMessage().contains("illegal b value")); + } + + public void testDefaults() { + LegacyBM25Similarity legacyBM25Similarity = new LegacyBM25Similarity(); + BM25Similarity bm25Similarity = new BM25Similarity(); + assertEquals(bm25Similarity.getB(), legacyBM25Similarity.getB(), 0f); + assertEquals(bm25Similarity.getK1(), legacyBM25Similarity.getK1(), 0f); + } + + public void testToString() { + LegacyBM25Similarity legacyBM25Similarity = new LegacyBM25Similarity(); + BM25Similarity bm25Similarity = new BM25Similarity(); + assertEquals(bm25Similarity.toString(), legacyBM25Similarity.toString()); + } + + @Override + protected Similarity getSimilarity(Random random) { + return new LegacyBM25Similarity(randomK1(random), randomB(random)); + } + + private static float randomK1(Random random) { + // term frequency normalization parameter k1 + switch (random.nextInt(4)) { + case 0: + // minimum value + return 0; + case 1: + // tiny value + return Float.MIN_VALUE; + case 2: + // maximum value + // upper bounds on individual term's score is 43.262806 * (k1 + 1) * boost + // we just limit the test to "reasonable" k1 values but don't enforce this anywhere. + return Integer.MAX_VALUE; + default: + // random value + return Integer.MAX_VALUE * random.nextFloat(); + } + } + + private static float randomB(Random random) { + // length normalization parameter b [0 .. 1] + switch (random.nextInt(4)) { + case 0: + // minimum value + return 0; + case 1: + // tiny value + return Float.MIN_VALUE; + case 2: + // maximum value + return 1; + default: + // random value + return random.nextFloat(); + } + } +} diff --git a/solr/core/src/java/org/apache/solr/search/similarities/BM25SimilarityFactory.java b/solr/core/src/java/org/apache/solr/search/similarities/BM25SimilarityFactory.java index ef8ffbd2654..fd8a48cb7d6 100644 --- a/solr/core/src/java/org/apache/solr/search/similarities/BM25SimilarityFactory.java +++ b/solr/core/src/java/org/apache/solr/search/similarities/BM25SimilarityFactory.java @@ -16,13 +16,13 @@ */ package org.apache.solr.search.similarities; -import org.apache.lucene.search.similarities.BM25Similarity; import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarity.LegacyBM25Similarity; import org.apache.solr.common.params.SolrParams; import org.apache.solr.schema.SimilarityFactory; /** - * Factory for {@link BM25Similarity} + * Factory for {@link LegacyBM25Similarity} *

* Parameters: *

    @@ -35,7 +35,7 @@ import org.apache.solr.schema.SimilarityFactory; * Optional settings: *
      *
    • discountOverlaps (bool): Sets - * {@link BM25Similarity#setDiscountOverlaps(boolean)}
    • + * {@link LegacyBM25Similarity#setDiscountOverlaps(boolean)} *
    * @lucene.experimental */ @@ -54,7 +54,7 @@ public class BM25SimilarityFactory extends SimilarityFactory { @Override public Similarity getSimilarity() { - BM25Similarity sim = new BM25Similarity(k1, b); + LegacyBM25Similarity sim = new LegacyBM25Similarity(k1, b); sim.setDiscountOverlaps(discountOverlaps); return sim; } diff --git a/solr/core/src/java/org/apache/solr/search/similarities/SchemaSimilarityFactory.java b/solr/core/src/java/org/apache/solr/search/similarities/SchemaSimilarityFactory.java index 378197c8e4a..6c3dedfb398 100644 --- a/solr/core/src/java/org/apache/solr/search/similarities/SchemaSimilarityFactory.java +++ b/solr/core/src/java/org/apache/solr/search/similarities/SchemaSimilarityFactory.java @@ -16,10 +16,10 @@ */ package org.apache.solr.search.similarities; -import org.apache.lucene.search.similarities.BM25Similarity; import org.apache.lucene.search.similarities.ClassicSimilarity; import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper; import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarity.LegacyBM25Similarity; import org.apache.lucene.util.Version; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; @@ -40,7 +40,7 @@ import org.apache.solr.util.plugin.SolrCoreAware; *

    *
      *
    • luceneMatchVersion < 6.0 = {@link ClassicSimilarity}
    • - *
    • luceneMatchVersion >= 6.0 = {@link BM25Similarity}
    • + *
    • luceneMatchVersion >= 6.0 = {@link LegacyBM25Similarity}
    • *
    *

    * The defaultSimFromFieldType option accepts the name of any fieldtype, and uses @@ -109,7 +109,7 @@ public class SchemaSimilarityFactory extends SimilarityFactory implements SolrCo Similarity defaultSim = null; if (null == defaultSimFromFieldType) { // nothing configured, choose a sensible implicit default... - defaultSim = new BM25Similarity(); + defaultSim = new LegacyBM25Similarity(); } else { FieldType defSimFT = core.getLatestSchema().getFieldTypeByName(defaultSimFromFieldType); if (null == defSimFT) { diff --git a/solr/core/src/test/org/apache/solr/rest/schema/TestBulkSchemaAPI.java b/solr/core/src/test/org/apache/solr/rest/schema/TestBulkSchemaAPI.java index 5d1dab1982b..9a72043913e 100644 --- a/solr/core/src/test/org/apache/solr/rest/schema/TestBulkSchemaAPI.java +++ b/solr/core/src/test/org/apache/solr/rest/schema/TestBulkSchemaAPI.java @@ -16,30 +16,6 @@ */ package org.apache.solr.rest.schema; -import org.apache.commons.io.FileUtils; - -import org.apache.lucene.search.similarities.DFISimilarity; -import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper; -import org.apache.lucene.search.similarities.BM25Similarity; -import org.apache.lucene.misc.SweetSpotSimilarity; -import org.apache.lucene.search.similarities.Similarity; - -import org.apache.solr.common.SolrDocumentList; -import org.apache.solr.core.SolrCore; -import org.apache.solr.core.CoreContainer; -import org.apache.solr.schema.SimilarityFactory; -import org.apache.solr.search.similarities.SchemaSimilarityFactory; -import org.apache.solr.util.RESTfulServerProvider; -import org.apache.solr.util.RestTestBase; -import org.apache.solr.util.RestTestHarness; - -import org.junit.After; -import org.junit.Before; -import org.noggit.JSONParser; -import org.noggit.ObjectBuilder; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import java.io.File; import java.io.StringReader; import java.lang.invoke.MethodHandles; @@ -51,6 +27,27 @@ import java.util.Map; import java.util.Set; import java.util.function.Consumer; +import org.apache.commons.io.FileUtils; +import org.apache.lucene.misc.SweetSpotSimilarity; +import org.apache.lucene.search.similarities.DFISimilarity; +import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarity.LegacyBM25Similarity; +import org.apache.solr.common.SolrDocumentList; +import org.apache.solr.core.CoreContainer; +import org.apache.solr.core.SolrCore; +import org.apache.solr.schema.SimilarityFactory; +import org.apache.solr.search.similarities.SchemaSimilarityFactory; +import org.apache.solr.util.RESTfulServerProvider; +import org.apache.solr.util.RestTestBase; +import org.apache.solr.util.RestTestHarness; +import org.junit.After; +import org.junit.Before; +import org.noggit.JSONParser; +import org.noggit.ObjectBuilder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + public class TestBulkSchemaAPI extends RestTestBase { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); @@ -591,7 +588,7 @@ public class TestBulkSchemaAPI extends RestTestBase { assertNotNull("field a5 not created", m); assertEquals("myWhitespaceTxtField", m.get("type")); assertNull(m.get("uninvertible")); // inherited, but API shouldn't return w/o explicit showDefaults - assertFieldSimilarity("a5", BM25Similarity.class); // unspecified, expect default + assertFieldSimilarity("a5", LegacyBM25Similarity.class); // unspecified, expect default m = getObj(harness, "wdf_nocase", "fields"); assertNull("field 'wdf_nocase' not deleted", m); @@ -933,7 +930,7 @@ public class TestBulkSchemaAPI extends RestTestBase { Map fields = getObj(harness, fieldName, "fields"); assertNotNull("field " + fieldName + " not created", fields); - assertFieldSimilarity(fieldName, BM25Similarity.class, + assertFieldSimilarity(fieldName, LegacyBM25Similarity.class, sim -> assertEquals("Unexpected k1", k1, sim.getK1(), .001), sim -> assertEquals("Unexpected b", b, sim.getB(), .001)); diff --git a/solr/core/src/test/org/apache/solr/search/similarities/TestBM25SimilarityFactory.java b/solr/core/src/test/org/apache/solr/search/similarities/TestBM25SimilarityFactory.java index 3f6deacec42..6445b3469f0 100644 --- a/solr/core/src/test/org/apache/solr/search/similarities/TestBM25SimilarityFactory.java +++ b/solr/core/src/test/org/apache/solr/search/similarities/TestBM25SimilarityFactory.java @@ -16,8 +16,8 @@ */ package org.apache.solr.search.similarities; -import org.apache.lucene.search.similarities.BM25Similarity; import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarity.LegacyBM25Similarity; import org.junit.BeforeClass; /** @@ -31,14 +31,14 @@ public class TestBM25SimilarityFactory extends BaseSimilarityTestCase { /** bm25 with default parameters */ public void test() throws Exception { - assertEquals(BM25Similarity.class, getSimilarity("text").getClass()); + assertEquals(LegacyBM25Similarity.class, getSimilarity("text").getClass()); } /** bm25 with parameters */ public void testParameters() throws Exception { Similarity sim = getSimilarity("text_params"); - assertEquals(BM25Similarity.class, sim.getClass()); - BM25Similarity bm25 = (BM25Similarity) sim; + assertEquals(LegacyBM25Similarity.class, sim.getClass()); + LegacyBM25Similarity bm25 = (LegacyBM25Similarity) sim; assertEquals(1.2f, bm25.getK1(), 0.01f); assertEquals(0.76f, bm25.getB(), 0.01f); } diff --git a/solr/core/src/test/org/apache/solr/search/similarities/TestNonDefinedSimilarityFactory.java b/solr/core/src/test/org/apache/solr/search/similarities/TestNonDefinedSimilarityFactory.java index 7460652366b..9fe33b7638e 100644 --- a/solr/core/src/test/org/apache/solr/search/similarities/TestNonDefinedSimilarityFactory.java +++ b/solr/core/src/test/org/apache/solr/search/similarities/TestNonDefinedSimilarityFactory.java @@ -16,7 +16,7 @@ */ package org.apache.solr.search.similarities; -import org.apache.lucene.search.similarities.BM25Similarity; +import org.apache.lucene.search.similarity.LegacyBM25Similarity; import org.junit.After; /** @@ -36,7 +36,7 @@ public class TestNonDefinedSimilarityFactory extends BaseSimilarityTestCase { public void testCurrentBM25() throws Exception { // no sys prop set, rely on LATEST initCore("solrconfig-basic.xml","schema-tiny.xml"); - BM25Similarity sim = getSimilarity("text", BM25Similarity.class); + LegacyBM25Similarity sim = getSimilarity("text", LegacyBM25Similarity.class); assertEquals(0.75F, sim.getB(), 0.0F); } } diff --git a/solr/core/src/test/org/apache/solr/search/similarities/TestPerFieldSimilarity.java b/solr/core/src/test/org/apache/solr/search/similarities/TestPerFieldSimilarity.java index 58fe6eff2f0..a27837bb8a3 100644 --- a/solr/core/src/test/org/apache/solr/search/similarities/TestPerFieldSimilarity.java +++ b/solr/core/src/test/org/apache/solr/search/similarities/TestPerFieldSimilarity.java @@ -17,8 +17,8 @@ package org.apache.solr.search.similarities; import org.apache.lucene.misc.SweetSpotSimilarity; -import org.apache.lucene.search.similarities.BM25Similarity; import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarity.LegacyBM25Similarity; import org.junit.BeforeClass; /** @@ -58,18 +58,18 @@ public class TestPerFieldSimilarity extends BaseSimilarityTestCase { /** test a field where no similarity is specified */ public void testDefaults() throws Exception { Similarity sim = getSimilarity("sim3text"); - assertEquals(BM25Similarity.class, sim.getClass());; + assertEquals(LegacyBM25Similarity.class, sim.getClass());; } /** ... and for a dynamic field */ public void testDefaultsDynamic() throws Exception { Similarity sim = getSimilarity("text_sim3"); - assertEquals(BM25Similarity.class, sim.getClass()); + assertEquals(LegacyBM25Similarity.class, sim.getClass()); } /** test a field that does not exist */ public void testNonexistent() throws Exception { Similarity sim = getSimilarity("sdfdsfdsfdswr5fsdfdsfdsfs"); - assertEquals(BM25Similarity.class, sim.getClass()); + assertEquals(LegacyBM25Similarity.class, sim.getClass()); } }