LUCENE-8563: Remove k1+1 constant factor from BM25 formula numerator.

Signed-off-by: Adrien Grand <jpountz@gmail.com>
This commit is contained in:
javanna 2018-11-14 10:51:48 +01:00 committed by Adrien Grand
parent c2ab04775d
commit cf016f8987
12 changed files with 293 additions and 46 deletions

View File

@ -138,6 +138,10 @@ Changes in Runtime Behavior
were collapsed and renamed to just getTermPostingsEnum and moved to MultiTerms.
(David Smiley)
* LUCENE-8563: BM25 scores don't include the (k1+1) factor in their numerator
anymore. This doesn't affect ordering as this is a constant factor which is
the same for every document. (Luca Cavanna via Adrien Grand)
New Features
* LUCENE-8340: LongPoint#newDistanceQuery may be used to boost scores based on

View File

@ -150,3 +150,11 @@ in order to support ToParent/ToChildBlockJoinQuery.
Normalization is now type-safe, with CharFilterFactory#normalize() returning a Reader and
TokenFilterFactory#normalize() returning a TokenFilter.
## k1+1 constant factor removed from BM25 similarity numerator (LUCENE-8563) ##
Scores computed by the BM25 similarity are lower than previously as the k1+1
constant factor was removed from the numerator of the scoring formula.
Ordering of results is preserved unless scores are computed from multiple
fields using different similarities. The previous behaviour is now exposed
by the LegacyBM25Similarity class which can be found in the lucene-misc jar.

View File

@ -216,7 +216,7 @@ public class BM25Similarity extends Similarity {
this.k1 = k1;
this.b = b;
this.cache = cache;
this.weight = (k1 + 1) * boost * idf.getValue().floatValue();
this.weight = boost * idf.getValue().floatValue();
}
@Override
@ -254,8 +254,6 @@ public class BM25Similarity extends Similarity {
private List<Explanation> explainConstantFactors() {
List<Explanation> subs = new ArrayList<>();
// scale factor
subs.add(Explanation.match(k1 + 1, "scaling factor, k1 + 1"));
// query boost
if (boost != 1.0f) {
subs.add(Explanation.match(boost, "boost"));

View File

@ -0,0 +1,96 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarity;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity;
/**
* Similarity that behaves like {@link BM25Similarity} while also applying
* the k1+1 factor to the numerator of the scoring formula
*
* @see BM25Similarity
*
* @deprecated {@link BM25Similarity} should be used instead
*/
@Deprecated
public final class LegacyBM25Similarity extends Similarity {
private final BM25Similarity bm25Similarity;
/** BM25 with these default values:
* <ul>
* <li>{@code k1 = 1.2}</li>
* <li>{@code b = 0.75}</li>
* </ul>
*/
public LegacyBM25Similarity() {
this.bm25Similarity = new BM25Similarity();
}
/**
* BM25 with the supplied parameter values.
* @param k1 Controls non-linear term frequency normalization (saturation).
* @param b Controls to what degree document length normalizes tf values.
* @throws IllegalArgumentException if {@code k1} is infinite or negative, or if {@code b} is
* not within the range {@code [0..1]}
*/
public LegacyBM25Similarity(float k1, float b) {
this.bm25Similarity = new BM25Similarity(k1, b);
}
@Override
public long computeNorm(FieldInvertState state) {
return bm25Similarity.computeNorm(state);
}
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
return bm25Similarity.scorer(boost * (1 + bm25Similarity.getK1()), collectionStats, termStats);
}
/**
* Returns the <code>k1</code> parameter
* @see #LegacyBM25Similarity(float, float)
*/
public final float getK1() {
return bm25Similarity.getK1();
}
/**
* Returns the <code>b</code> parameter
* @see #LegacyBM25Similarity(float, float)
*/
public final float getB() {
return bm25Similarity.getB();
}
/** Sets whether overlap tokens (Tokens with 0 position increment) are
* ignored when computing norm. By default this is true, meaning overlap
* tokens do not count when computing norms. */
public void setDiscountOverlaps(boolean v) {
bm25Similarity.setDiscountOverlaps(v);
}
@Override
public String toString() {
return bm25Similarity.toString();
}
}

View File

@ -0,0 +1,22 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- not a package-info.java, because we already defined this package in core/ -->
<html>
<body>
Misc similarity implementations.
</body>
</html>

View File

@ -0,0 +1,122 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarity;
import java.util.Random;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.BaseSimilarityTestCase;
import org.apache.lucene.search.similarities.Similarity;
public class TestLegacyBM25Similarity extends BaseSimilarityTestCase {
public void testIllegalK1() {
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
new LegacyBM25Similarity(Float.POSITIVE_INFINITY, 0.75f);
});
assertTrue(expected.getMessage().contains("illegal k1 value"));
expected = expectThrows(IllegalArgumentException.class, () -> {
new LegacyBM25Similarity(-1, 0.75f);
});
assertTrue(expected.getMessage().contains("illegal k1 value"));
expected = expectThrows(IllegalArgumentException.class, () -> {
new LegacyBM25Similarity(Float.NaN, 0.75f);
});
assertTrue(expected.getMessage().contains("illegal k1 value"));
}
public void testIllegalB() {
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
new LegacyBM25Similarity(1.2f, 2f);
});
assertTrue(expected.getMessage().contains("illegal b value"));
expected = expectThrows(IllegalArgumentException.class, () -> {
new LegacyBM25Similarity(1.2f, -1f);
});
assertTrue(expected.getMessage().contains("illegal b value"));
expected = expectThrows(IllegalArgumentException.class, () -> {
new LegacyBM25Similarity(1.2f, Float.POSITIVE_INFINITY);
});
assertTrue(expected.getMessage().contains("illegal b value"));
expected = expectThrows(IllegalArgumentException.class, () -> {
new LegacyBM25Similarity(1.2f, Float.NaN);
});
assertTrue(expected.getMessage().contains("illegal b value"));
}
public void testDefaults() {
LegacyBM25Similarity legacyBM25Similarity = new LegacyBM25Similarity();
BM25Similarity bm25Similarity = new BM25Similarity();
assertEquals(bm25Similarity.getB(), legacyBM25Similarity.getB(), 0f);
assertEquals(bm25Similarity.getK1(), legacyBM25Similarity.getK1(), 0f);
}
public void testToString() {
LegacyBM25Similarity legacyBM25Similarity = new LegacyBM25Similarity();
BM25Similarity bm25Similarity = new BM25Similarity();
assertEquals(bm25Similarity.toString(), legacyBM25Similarity.toString());
}
@Override
protected Similarity getSimilarity(Random random) {
return new LegacyBM25Similarity(randomK1(random), randomB(random));
}
private static float randomK1(Random random) {
// term frequency normalization parameter k1
switch (random.nextInt(4)) {
case 0:
// minimum value
return 0;
case 1:
// tiny value
return Float.MIN_VALUE;
case 2:
// maximum value
// upper bounds on individual term's score is 43.262806 * (k1 + 1) * boost
// we just limit the test to "reasonable" k1 values but don't enforce this anywhere.
return Integer.MAX_VALUE;
default:
// random value
return Integer.MAX_VALUE * random.nextFloat();
}
}
private static float randomB(Random random) {
// length normalization parameter b [0 .. 1]
switch (random.nextInt(4)) {
case 0:
// minimum value
return 0;
case 1:
// tiny value
return Float.MIN_VALUE;
case 2:
// maximum value
return 1;
default:
// random value
return random.nextFloat();
}
}
}

View File

@ -16,13 +16,13 @@
*/
package org.apache.solr.search.similarities;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarity.LegacyBM25Similarity;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.schema.SimilarityFactory;
/**
* Factory for {@link BM25Similarity}
* Factory for {@link LegacyBM25Similarity}
* <p>
* Parameters:
* <ul>
@ -35,7 +35,7 @@ import org.apache.solr.schema.SimilarityFactory;
* Optional settings:
* <ul>
* <li>discountOverlaps (bool): Sets
* {@link BM25Similarity#setDiscountOverlaps(boolean)}</li>
* {@link LegacyBM25Similarity#setDiscountOverlaps(boolean)}</li>
* </ul>
* @lucene.experimental
*/
@ -54,7 +54,7 @@ public class BM25SimilarityFactory extends SimilarityFactory {
@Override
public Similarity getSimilarity() {
BM25Similarity sim = new BM25Similarity(k1, b);
LegacyBM25Similarity sim = new LegacyBM25Similarity(k1, b);
sim.setDiscountOverlaps(discountOverlaps);
return sim;
}

View File

@ -16,10 +16,10 @@
*/
package org.apache.solr.search.similarities;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarity.LegacyBM25Similarity;
import org.apache.lucene.util.Version;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
@ -40,7 +40,7 @@ import org.apache.solr.util.plugin.SolrCoreAware;
* </p>
* <ul>
* <li><code>luceneMatchVersion &lt; 6.0</code> = {@link ClassicSimilarity}</li>
* <li><code>luceneMatchVersion &gt;= 6.0</code> = {@link BM25Similarity}</li>
* <li><code>luceneMatchVersion &gt;= 6.0</code> = {@link LegacyBM25Similarity}</li>
* </ul>
* <p>
* The <code>defaultSimFromFieldType</code> option accepts the name of any fieldtype, and uses
@ -109,7 +109,7 @@ public class SchemaSimilarityFactory extends SimilarityFactory implements SolrCo
Similarity defaultSim = null;
if (null == defaultSimFromFieldType) {
// nothing configured, choose a sensible implicit default...
defaultSim = new BM25Similarity();
defaultSim = new LegacyBM25Similarity();
} else {
FieldType defSimFT = core.getLatestSchema().getFieldTypeByName(defaultSimFromFieldType);
if (null == defSimFT) {

View File

@ -16,30 +16,6 @@
*/
package org.apache.solr.rest.schema;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.search.similarities.DFISimilarity;
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.misc.SweetSpotSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.schema.SimilarityFactory;
import org.apache.solr.search.similarities.SchemaSimilarityFactory;
import org.apache.solr.util.RESTfulServerProvider;
import org.apache.solr.util.RestTestBase;
import org.apache.solr.util.RestTestHarness;
import org.junit.After;
import org.junit.Before;
import org.noggit.JSONParser;
import org.noggit.ObjectBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.StringReader;
import java.lang.invoke.MethodHandles;
@ -51,6 +27,27 @@ import java.util.Map;
import java.util.Set;
import java.util.function.Consumer;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.misc.SweetSpotSimilarity;
import org.apache.lucene.search.similarities.DFISimilarity;
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarity.LegacyBM25Similarity;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.SimilarityFactory;
import org.apache.solr.search.similarities.SchemaSimilarityFactory;
import org.apache.solr.util.RESTfulServerProvider;
import org.apache.solr.util.RestTestBase;
import org.apache.solr.util.RestTestHarness;
import org.junit.After;
import org.junit.Before;
import org.noggit.JSONParser;
import org.noggit.ObjectBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class TestBulkSchemaAPI extends RestTestBase {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@ -591,7 +588,7 @@ public class TestBulkSchemaAPI extends RestTestBase {
assertNotNull("field a5 not created", m);
assertEquals("myWhitespaceTxtField", m.get("type"));
assertNull(m.get("uninvertible")); // inherited, but API shouldn't return w/o explicit showDefaults
assertFieldSimilarity("a5", BM25Similarity.class); // unspecified, expect default
assertFieldSimilarity("a5", LegacyBM25Similarity.class); // unspecified, expect default
m = getObj(harness, "wdf_nocase", "fields");
assertNull("field 'wdf_nocase' not deleted", m);
@ -933,7 +930,7 @@ public class TestBulkSchemaAPI extends RestTestBase {
Map fields = getObj(harness, fieldName, "fields");
assertNotNull("field " + fieldName + " not created", fields);
assertFieldSimilarity(fieldName, BM25Similarity.class,
assertFieldSimilarity(fieldName, LegacyBM25Similarity.class,
sim -> assertEquals("Unexpected k1", k1, sim.getK1(), .001),
sim -> assertEquals("Unexpected b", b, sim.getB(), .001));

View File

@ -16,8 +16,8 @@
*/
package org.apache.solr.search.similarities;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarity.LegacyBM25Similarity;
import org.junit.BeforeClass;
/**
@ -31,14 +31,14 @@ public class TestBM25SimilarityFactory extends BaseSimilarityTestCase {
/** bm25 with default parameters */
public void test() throws Exception {
assertEquals(BM25Similarity.class, getSimilarity("text").getClass());
assertEquals(LegacyBM25Similarity.class, getSimilarity("text").getClass());
}
/** bm25 with parameters */
public void testParameters() throws Exception {
Similarity sim = getSimilarity("text_params");
assertEquals(BM25Similarity.class, sim.getClass());
BM25Similarity bm25 = (BM25Similarity) sim;
assertEquals(LegacyBM25Similarity.class, sim.getClass());
LegacyBM25Similarity bm25 = (LegacyBM25Similarity) sim;
assertEquals(1.2f, bm25.getK1(), 0.01f);
assertEquals(0.76f, bm25.getB(), 0.01f);
}

View File

@ -16,7 +16,7 @@
*/
package org.apache.solr.search.similarities;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarity.LegacyBM25Similarity;
import org.junit.After;
/**
@ -36,7 +36,7 @@ public class TestNonDefinedSimilarityFactory extends BaseSimilarityTestCase {
public void testCurrentBM25() throws Exception {
// no sys prop set, rely on LATEST
initCore("solrconfig-basic.xml","schema-tiny.xml");
BM25Similarity sim = getSimilarity("text", BM25Similarity.class);
LegacyBM25Similarity sim = getSimilarity("text", LegacyBM25Similarity.class);
assertEquals(0.75F, sim.getB(), 0.0F);
}
}

View File

@ -17,8 +17,8 @@
package org.apache.solr.search.similarities;
import org.apache.lucene.misc.SweetSpotSimilarity;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarity.LegacyBM25Similarity;
import org.junit.BeforeClass;
/**
@ -58,18 +58,18 @@ public class TestPerFieldSimilarity extends BaseSimilarityTestCase {
/** test a field where no similarity is specified */
public void testDefaults() throws Exception {
Similarity sim = getSimilarity("sim3text");
assertEquals(BM25Similarity.class, sim.getClass());;
assertEquals(LegacyBM25Similarity.class, sim.getClass());;
}
/** ... and for a dynamic field */
public void testDefaultsDynamic() throws Exception {
Similarity sim = getSimilarity("text_sim3");
assertEquals(BM25Similarity.class, sim.getClass());
assertEquals(LegacyBM25Similarity.class, sim.getClass());
}
/** test a field that does not exist */
public void testNonexistent() throws Exception {
Similarity sim = getSimilarity("sdfdsfdsfdswr5fsdfdsfdsfs");
assertEquals(BM25Similarity.class, sim.getClass());
assertEquals(LegacyBM25Similarity.class, sim.getClass());
}
}