SOLR-13025: SchemaSimilarityFactory fallback to LegacyBM25Similarity for luceneMatchVersion < 8.0

This commit is contained in:
Jan Høydahl 2018-12-12 09:40:57 +01:00
parent bcdc6dadbb
commit 5affe7421f
15 changed files with 206 additions and 32 deletions

View File

@ -59,6 +59,13 @@ Upgrade Notes
* SOLR-12754: The UnifiedHighlighter hl.weightMatches now defaults to true. If there are unforeseen highlight problems,
this may be the culprit.
* If you explicitly use BM25SimilarityFactory in your schema, the absolute scoring will be lower due to SOLR-13025.
But ordering of documents will not change in the normal case. Use LegacyBM25SimilarityFactory if you need to force
the old 6.x/7.x scoring. Note that if you have not specified any similarity in schema or use the default
SchemaSimilarityFactory, then LegacyBM25Similarity is automatically selected for 'luceneMatchVersion' < 8.0.0.
See also explanation in Reference Guide chapter "Other Schema Elements".
New Features
----------------------
@ -94,6 +101,10 @@ Optimizations
* SOLR-12725: ParseDateFieldUpdateProcessorFactory should reuse ParsePosition. (ab)
* SOLR-13025: Due to LUCENE-8563, the BM25Similarity formula no longer includes the (k1+1) factor in the numerator.
This gives a lower absolute score but doesn't affect ordering, as this is a constant factor which is the same
for every document. Use LegacyBM25SimilarityFactory if you need the old 6.x/7.x scoring. See also upgrade notes (janhoy)
Other Changes
----------------------

View File

@ -70,7 +70,7 @@ public class TestExternalFeatures extends TestRerankBase {
query.add("rq", "{!ltr reRankDocs=10 model=externalmodel efi.user_query=w3 efi.userTitlePhrase1=w4 efi.userTitlePhrase2=w5}");
assertJQ("/query" + query.toQueryString(), "/response/docs/[0]/id=='3'");
assertJQ("/query" + query.toQueryString(), "/response/docs/[0]/score==0.7693934");
assertJQ("/query" + query.toQueryString(), "/response/docs/[0]/score==0.34972426");
assertJQ("/query" + query.toQueryString(), "/response/docs/[1]/score==0.0");
assertJQ("/query" + query.toQueryString(), "/response/docs/[2]/score==0.0");
@ -80,7 +80,7 @@ public class TestExternalFeatures extends TestRerankBase {
query.add("fl", "*,score,[fv efi.user_query=w2 efi.userTitlePhrase1=w4 efi.userTitlePhrase2=w5]");
assertJQ("/query" + query.toQueryString(), "/response/docs/[0]/id=='3'");
assertJQ("/query" + query.toQueryString(), "/response/docs/[0]/score==0.7693934");
assertJQ("/query" + query.toQueryString(), "/response/docs/[0]/score==0.34972426");
assertJQ("/query" + query.toQueryString(), "/response/docs/[1]/score==0.0");
assertJQ("/query" + query.toQueryString(), "/response/docs/[2]/score==0.0");
}

View File

@ -16,13 +16,15 @@
*/
package org.apache.solr.search.similarities;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarity.LegacyBM25Similarity;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.schema.SimilarityFactory;
/**
* Factory for {@link LegacyBM25Similarity}
* Factory for BM25Similarity. This is the default similarity since 8.x.
* If you need the exact same formula as in 6.x and 7.x you should instead look at
* {@link LegacyBM25SimilarityFactory}
* <p>
* Parameters:
* <ul>
@ -35,9 +37,10 @@ import org.apache.solr.schema.SimilarityFactory;
* Optional settings:
* <ul>
* <li>discountOverlaps (bool): Sets
* {@link LegacyBM25Similarity#setDiscountOverlaps(boolean)}</li>
* {@link BM25Similarity#setDiscountOverlaps(boolean)}</li>
* </ul>
* @lucene.experimental
* @since 8.0.0
*/
public class BM25SimilarityFactory extends SimilarityFactory {
private boolean discountOverlaps;
@ -54,7 +57,7 @@ public class BM25SimilarityFactory extends SimilarityFactory {
@Override
public Similarity getSimilarity() {
LegacyBM25Similarity sim = new LegacyBM25Similarity(k1, b);
BM25Similarity sim = new BM25Similarity(k1, b);
sim.setDiscountOverlaps(discountOverlaps);
return sim;
}

View File

@ -0,0 +1,64 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search.similarities;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarity.LegacyBM25Similarity;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.schema.SimilarityFactory;
/**
 * {@link SimilarityFactory} that produces {@link LegacyBM25Similarity} instances.
 * <p>
 * Configure this factory explicitly when you need the BM25 formula that BM25Similarity used
 * before Solr/Lucene 8.0.0. Note that {@link SchemaSimilarityFactory} will automatically create
 * an instance of LegacyBM25Similarity if luceneMatchVersion is &lt; 8.0.0.
 * <p>
 * Parameters:
 * <ul>
 *   <li>k1 (float): Controls non-linear term frequency normalization (saturation).
 *       The default is <code>1.2</code></li>
 *   <li>b (float): Controls to what degree document length normalizes tf values.
 *       The default is <code>0.75</code></li>
 * </ul>
 * <p>
 * Optional settings:
 * <ul>
 *   <li>discountOverlaps (bool): Sets
 *       {@link LegacyBM25Similarity#setDiscountOverlaps(boolean)}.
 *       The default is <code>true</code></li>
 * </ul>
 * @lucene.experimental
 * @since 8.0.0
 */
public class LegacyBM25SimilarityFactory extends SimilarityFactory {
  /** Value passed to {@link LegacyBM25Similarity#setDiscountOverlaps(boolean)}. */
  private boolean discountOverlaps;
  /** First constructor argument of every created {@link LegacyBM25Similarity}. */
  private float k1;
  /** Second constructor argument of every created {@link LegacyBM25Similarity}. */
  private float b;

  /**
   * Reads the <code>discountOverlaps</code>, <code>k1</code> and <code>b</code> settings,
   * falling back to <code>true</code>, <code>1.2</code> and <code>0.75</code> respectively
   * when a setting is absent.
   *
   * @param params the configuration supplied for this factory
   */
  @Override
  public void init(SolrParams params) {
    super.init(params);
    this.discountOverlaps = params.getBool("discountOverlaps", true);
    this.k1 = params.getFloat("k1", 1.2f);
    this.b = params.getFloat("b", 0.75f);
  }

  /**
   * Builds a new {@link LegacyBM25Similarity} from the settings captured in
   * {@link #init(SolrParams)}.
   *
   * @return a freshly constructed similarity with k1, b and discountOverlaps applied
   */
  @Override
  public Similarity getSimilarity() {
    final LegacyBM25Similarity legacyBm25 = new LegacyBM25Similarity(this.k1, this.b);
    legacyBm25.setDiscountOverlaps(this.discountOverlaps);
    return legacyBm25;
  }
}

View File

@ -16,7 +16,7 @@
*/
package org.apache.solr.search.similarities;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarity.LegacyBM25Similarity;
@ -39,8 +39,8 @@ import org.apache.solr.util.plugin.SolrCoreAware;
* matching configured:
* </p>
* <ul>
* <li><code>luceneMatchVersion &lt; 6.0</code> = {@link ClassicSimilarity}</li>
* <li><code>luceneMatchVersion &gt;= 6.0</code> = {@link LegacyBM25Similarity}</li>
* <li><code>luceneMatchVersion &lt; 8.0</code> = {@link LegacyBM25Similarity}</li>
* <li><code>luceneMatchVersion &gt;= 8.0</code> = {@link BM25Similarity}</li>
* </ul>
* <p>
* The <code>defaultSimFromFieldType</code> option accepts the name of any fieldtype, and uses
@ -85,10 +85,12 @@ public class SchemaSimilarityFactory extends SimilarityFactory implements SolrCo
private volatile SolrCore core; // set by inform(SolrCore)
private volatile Similarity similarity; // lazy instantiated
private Version coreVersion = Version.LATEST;
@Override
public void inform(SolrCore core) {
this.core = core;
this.coreVersion = this.core.getSolrConfig().luceneMatchVersion;
}
@Override
@ -109,7 +111,9 @@ public class SchemaSimilarityFactory extends SimilarityFactory implements SolrCo
Similarity defaultSim = null;
if (null == defaultSimFromFieldType) {
// nothing configured, choose a sensible implicit default...
defaultSim = new LegacyBM25Similarity();
defaultSim = coreVersion.onOrAfter(Version.LUCENE_8_0_0) ?
new BM25Similarity() :
new LegacyBM25Similarity();
} else {
FieldType defSimFT = core.getLatestSchema().getFieldTypeByName(defaultSimFromFieldType);
if (null == defSimFT) {

View File

@ -36,10 +36,27 @@
</similarity>
</fieldType>
<!-- legacybm25 with default parameters -->
<fieldType name="legacy_text" class="solr.TextField">
<analyzer class="org.apache.lucene.analysis.standard.StandardAnalyzer"/>
<similarity class="solr.LegacyBM25SimilarityFactory"/>
</fieldType>
<!-- legacybm25 with parameters -->
<fieldType name="legacy_text_params" class="solr.TextField">
<analyzer class="org.apache.lucene.analysis.standard.StandardAnalyzer"/>
<similarity class="solr.LegacyBM25SimilarityFactory">
<float name="k1">1.2</float>
<float name="b">0.76</float>
</similarity>
</fieldType>
<field name="id" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="text" type="text" indexed="true" stored="false"/>
<field name="text_params" type="text_params" indexed="true" stored="false"/>
<field name="legacy_text" type="legacy_text" indexed="true" stored="false"/>
<field name="legacy_text_params" type="legacy_text_params" indexed="true" stored="false"/>
<uniqueKey>id</uniqueKey>

View File

@ -29,10 +29,10 @@ import java.util.function.Consumer;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.misc.SweetSpotSimilarity;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.DFISimilarity;
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarity.LegacyBM25Similarity;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.SolrCore;
@ -588,7 +588,7 @@ public class TestBulkSchemaAPI extends RestTestBase {
assertNotNull("field a5 not created", m);
assertEquals("myWhitespaceTxtField", m.get("type"));
assertNull(m.get("uninvertible")); // inherited, but API shouldn't return w/o explicit showDefaults
assertFieldSimilarity("a5", LegacyBM25Similarity.class); // unspecified, expect default
assertFieldSimilarity("a5", BM25Similarity.class); // unspecified, expect default
m = getObj(harness, "wdf_nocase", "fields");
assertNull("field 'wdf_nocase' not deleted", m);
@ -930,7 +930,7 @@ public class TestBulkSchemaAPI extends RestTestBase {
Map fields = getObj(harness, fieldName, "fields");
assertNotNull("field " + fieldName + " not created", fields);
assertFieldSimilarity(fieldName, LegacyBM25Similarity.class,
assertFieldSimilarity(fieldName, BM25Similarity.class,
sim -> assertEquals("Unexpected k1", k1, sim.getK1(), .001),
sim -> assertEquals("Unexpected b", b, sim.getB(), .001));

View File

@ -57,6 +57,6 @@ public class TestPayloadScoreQParserPlugin extends SolrTestCaseJ4 {
// TODO: fix this includeSpanScore test to be less brittle - score result is score of "A" (via BM25) multiplied by 1.0 (payload value)
assertQ(req("fl","*,score", "q", "{!payload_score f=vals_dpf v=A func=min}"), "//float[@name='score']='1.0'");
assertQ(req("fl","*,score", "q", "{!payload_score f=vals_dpf v=A func=min includeSpanScore=true}"), "//float[@name='score']='0.2876821'");
assertQ(req("fl","*,score", "q", "{!payload_score f=vals_dpf v=A func=min includeSpanScore=true}"), "//float[@name='score']='0.13076457'");
}
}

View File

@ -389,16 +389,16 @@ public class TestFunctionQuery extends SolrTestCaseJ4 {
// superman has a higher df (thus lower idf) in one segment, but reversed in the complete index
String q ="{!func}query($qq)";
String fq="id:120";
assertQ(req("fl","*,score","q", q, "qq","text:batman", "fq",fq), "//float[@name='score']<'1.0'");
assertQ(req("fl","*,score","q", q, "qq","text:superman", "fq",fq), "//float[@name='score']>'1.0'");
assertQ(req("fl","*,score","q", q, "qq","text:batman", "fq",fq), "//float[@name='score']<'0.6'");
assertQ(req("fl","*,score","q", q, "qq","text:superman", "fq",fq), "//float[@name='score']>'0.6'");
// test weighting through a function range query
assertQ(req("fl","*,score", "fq",fq, "q", "{!frange l=1 u=10}query($qq)", "qq","text:superman"), "//*[@numFound='1']");
assertQ(req("fl","*,score", "fq",fq, "q", "{!frange l=0.6 u=10}query($qq)", "qq","text:superman"), "//*[@numFound='1']");
// test weighting through a complex function
q ="{!func}sub(div(sum(0.0,product(1,query($qq))),1),0)";
assertQ(req("fl","*,score","q", q, "qq","text:batman", "fq",fq), "//float[@name='score']<'1.0'");
assertQ(req("fl","*,score","q", q, "qq","text:superman", "fq",fq), "//float[@name='score']>'1.0'");
assertQ(req("fl","*,score","q", q, "qq","text:batman", "fq",fq), "//float[@name='score']<'0.6'");
assertQ(req("fl","*,score","q", q, "qq","text:superman", "fq",fq), "//float[@name='score']>'0.6'");
// test full param dereferencing

View File

@ -16,8 +16,8 @@
*/
package org.apache.solr.search.similarities;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarity.LegacyBM25Similarity;
import org.junit.BeforeClass;
/**
@ -31,14 +31,14 @@ public class TestBM25SimilarityFactory extends BaseSimilarityTestCase {
/** bm25 with default parameters */
public void test() throws Exception {
assertEquals(LegacyBM25Similarity.class, getSimilarity("text").getClass());
assertEquals(BM25Similarity.class, getSimilarity("text").getClass());
}
/** bm25 with parameters */
public void testParameters() throws Exception {
Similarity sim = getSimilarity("text_params");
assertEquals(LegacyBM25Similarity.class, sim.getClass());
LegacyBM25Similarity bm25 = (LegacyBM25Similarity) sim;
assertEquals(BM25Similarity.class, sim.getClass());
BM25Similarity bm25 = (BM25Similarity) sim;
assertEquals(1.2f, bm25.getK1(), 0.01f);
assertEquals(0.76f, bm25.getB(), 0.01f);
}

View File

@ -0,0 +1,45 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search.similarities;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarity.LegacyBM25Similarity;
import org.junit.BeforeClass;
/**
 * Tests {@link LegacyBM25SimilarityFactory}: the fields <code>legacy_text</code> and
 * <code>legacy_text_params</code> must resolve to a {@link LegacyBM25Similarity}.
 */
public class TestLegacyBM25SimilarityFactory extends BaseSimilarityTestCase {

  /** Starts one core, shared by all tests in this class, using the BM25 test schema. */
  @BeforeClass
  public static void beforeClass() throws Exception {
    initCore("solrconfig-basic.xml", "schema-bm25.xml");
  }

  /** legacy bm25 with default parameters */
  public void test() throws Exception {
    final Similarity defaultSim = getSimilarity("legacy_text");
    assertEquals(LegacyBM25Similarity.class, defaultSim.getClass());
  }

  /** legacy bm25 with parameters: the k1 and b values must be visible on the created similarity */
  public void testParameters() throws Exception {
    final Similarity configuredSim = getSimilarity("legacy_text_params");
    assertEquals(LegacyBM25Similarity.class, configuredSim.getClass());

    final LegacyBM25Similarity legacyBm25 = (LegacyBM25Similarity) configuredSim;
    assertEquals(1.2f, legacyBm25.getK1(), 0.01f);
    assertEquals(0.76f, legacyBm25.getB(), 0.01f);
  }
}

View File

@ -16,15 +16,19 @@
*/
package org.apache.solr.search.similarities;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarity.LegacyBM25Similarity;
import org.apache.lucene.util.Version;
import org.junit.After;
/**
* Verifies that the default behavior of the implicit {@link ClassicSimilarityFactory}
* Verifies that the default behavior of the implicit {@link BM25Similarity}
* (ie: no similarity configured in schema.xml at all) is consistent with
* expectations based on the luceneMatchVersion
* @see <a href="https://issues.apache.org/jira/browse/SOLR-5561">SOLR-5561</a>
* @see <a href="https://issues.apache.org/jira/browse/SOLR-8057">SOLR-8057</a>
* @see <a href="https://issues.apache.org/jira/browse/SOLR-13025">SOLR-13025</a>
* @see <a href="https://issues.apache.org/jira/browse/LUCENE-8563">LUCENE-8563</a>
*/
public class TestNonDefinedSimilarityFactory extends BaseSimilarityTestCase {
@ -33,10 +37,30 @@ public class TestNonDefinedSimilarityFactory extends BaseSimilarityTestCase {
deleteCore();
}
public void testCurrentBM25() throws Exception {
public void testCurrentBM25FromV8() throws Exception {
// no sys prop set, rely on LATEST
initCore("solrconfig-basic.xml","schema-tiny.xml");
LegacyBM25Similarity sim = getSimilarity("text", LegacyBM25Similarity.class);
BM25Similarity sim = getSimilarity("text", BM25Similarity.class);
assertEquals(0.75F, sim.getB(), 0.0F);
}
/**
 * Verifies that a luceneMatchVersion before 8.0 (checked with 7.0.0, 5.0.0 and 6.0.0) makes the
 * implicit default similarity a {@link LegacyBM25Similarity}.
 */
public void testLegacyBM25BeforeV8() throws Exception {
  LegacyBM25Similarity sim = legacySimilarityForMatchVersion(Version.LUCENE_7_0_0.toString());
  assertEquals(0.75F, sim.getB(), 0.0F);
  deleteCore();

  legacySimilarityForMatchVersion("5.0.0");
  deleteCore();

  // The last core is intentionally not deleted here.
  // NOTE(review): presumably the @After hook of this class removes it - confirm.
  legacySimilarityForMatchVersion("6.0.0");
}

/**
 * Starts a core with the <code>tests.luceneMatchVersion</code> system property set to the given
 * version and returns the similarity of the "text" field, which must be a
 * {@link LegacyBM25Similarity}.
 * <p>
 * The system property is cleared in a <code>finally</code> block so that a failing core
 * initialization cannot leak the version override into tests that run afterwards.
 *
 * @param matchVersion the luceneMatchVersion string to start the core with
 * @return the similarity resolved for the "text" field
 */
private LegacyBM25Similarity legacySimilarityForMatchVersion(String matchVersion) throws Exception {
  System.setProperty("tests.luceneMatchVersion", matchVersion);
  try {
    initCore("solrconfig-basic.xml", "schema-tiny.xml");
  } finally {
    System.clearProperty("tests.luceneMatchVersion");
  }
  return getSimilarity("text", LegacyBM25Similarity.class);
}
}

View File

@ -17,8 +17,8 @@
package org.apache.solr.search.similarities;
import org.apache.lucene.misc.SweetSpotSimilarity;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarity.LegacyBM25Similarity;
import org.junit.BeforeClass;
/**
@ -58,18 +58,18 @@ public class TestPerFieldSimilarity extends BaseSimilarityTestCase {
/** test a field where no similarity is specified */
public void testDefaults() throws Exception {
Similarity sim = getSimilarity("sim3text");
assertEquals(LegacyBM25Similarity.class, sim.getClass());;
assertEquals(BM25Similarity.class, sim.getClass());;
}
/** ... and for a dynamic field */
public void testDefaultsDynamic() throws Exception {
Similarity sim = getSimilarity("text_sim3");
assertEquals(LegacyBM25Similarity.class, sim.getClass());
assertEquals(BM25Similarity.class, sim.getClass());
}
/** test a field that does not exist */
public void testNonexistent() throws Exception {
Similarity sim = getSimilarity("sdfdsfdsfdswr5fsdfdsfdsfs");
assertEquals(LegacyBM25Similarity.class, sim.getClass());
assertEquals(BM25Similarity.class, sim.getClass());
}
}

View File

@ -23,3 +23,9 @@
// *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
* Due to LIR redesign in SOLR-11702, all users must be on Solr 7.3 or higher to upgrade to Solr 8
* If you explicitly use BM25SimilarityFactory in your schema, the absolute scoring will be lower; see SOLR-13025.
But ordering of documents will not change in the normal case. Use LegacyBM25SimilarityFactory if you need to force
the old 6.x/7.x scoring. Note that if you have not specified any similarity in schema or use the default
SchemaSimilarityFactory, then LegacyBM25Similarity is automatically selected for 'luceneMatchVersion' < 8.0.0.
See also explanation in Reference Guide chapter "Other Schema Elements".

View File

@ -90,6 +90,6 @@ In most cases, specifying global level similarity like this will cause an error
In the example above `IBSimilarityFactory` (using the Information-Based model) will be used for any fields of type `text_ib`, while `DFRSimilarityFactory` (divergence from random) will be used for any fields of type `text_dfr`, as well as any fields using a type that does not explicitly specify a `<similarity/>`.
If `SchemaSimilarityFactory` is explicitly declared without configuring a `defaultSimFromFieldType`, then `BM25Similarity` is implicitly used as the default.
If `SchemaSimilarityFactory` is explicitly declared without configuring a `defaultSimFromFieldType`, then `BM25Similarity` is implicitly used as the default for `luceneMatchVersion >= 8.0.0`; otherwise `LegacyBM25Similarity` is used, to mimic the BM25 formula that was the default in those earlier versions.
In addition to the various factories mentioned on this page, there are several other similarity implementations that can be used such as the `SweetSpotSimilarityFactory`, `ClassicSimilarityFactory`, etc. For details, see the Solr Javadocs for the {solr-javadocs}/solr-core/org/apache/solr/schema/SimilarityFactory.html[similarity factories].
In addition to the various factories mentioned on this page, there are several other similarity implementations that can be used such as the `SweetSpotSimilarityFactory`, `ClassicSimilarityFactory`, `LegacyBM25SimilarityFactory`, etc. For details, see the Solr Javadocs for the {solr-javadocs}/solr-core/org/apache/solr/schema/SimilarityFactory.html[similarity factories].