Adds boolean similarity to Elasticsearch (#23637)
This commit adds the boolean similarity scoring from Lucene to Elasticsearch. The boolean similarity provides a means to specify that a field should not be scored with typical full-text ranking algorithms, but rather only by whether the query terms match the document or not. Boolean similarity gives each query term a score equal to its query boost only. Boolean similarity is available as a default similarity option, so a field can be given boolean similarity by declaring `"similarity": "boolean"` in its mapping. Closes #6731
This commit is contained in:
parent
b3dca364ce
commit
8359dd05c9
|
@ -0,0 +1,48 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.similarity;
|
||||
|
||||
import org.apache.lucene.search.similarities.BooleanSimilarity;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
|
||||
/**
|
||||
* {@link SimilarityProvider} for the {@link BooleanSimilarity},
|
||||
* which is a simple similarity that gives terms a score equal
|
||||
* to their query boost only. This is useful in situations where
|
||||
* a field does not need to be scored by a full-text ranking
|
||||
* algorithm, but rather all that matters is whether the query
|
||||
* terms matched or not.
|
||||
*/
|
||||
public class BooleanSimilarityProvider extends AbstractSimilarityProvider {
|
||||
|
||||
private final BooleanSimilarity similarity = new BooleanSimilarity();
|
||||
|
||||
public BooleanSimilarityProvider(String name, Settings settings, Settings indexSettings) {
|
||||
super(name);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public BooleanSimilarity get() {
|
||||
return similarity;
|
||||
}
|
||||
}
|
|
@ -47,6 +47,7 @@ public final class SimilarityService extends AbstractIndexComponent {
|
|||
Map<String, TriFunction<String, Settings, Settings, SimilarityProvider>> buildIn = new HashMap<>();
|
||||
defaults.put("classic", ClassicSimilarityProvider::new);
|
||||
defaults.put("BM25", BM25SimilarityProvider::new);
|
||||
defaults.put("boolean", BooleanSimilarityProvider::new);
|
||||
buildIn.put("classic", ClassicSimilarityProvider::new);
|
||||
buildIn.put("BM25", BM25SimilarityProvider::new);
|
||||
buildIn.put("DFR", DFRSimilarityProvider::new);
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
|
||||
package org.elasticsearch.index.similarity;
|
||||
|
||||
import org.apache.lucene.search.similarities.BooleanSimilarity;
|
||||
import org.apache.lucene.search.similarities.ClassicSimilarity;
|
||||
import org.apache.lucene.search.similarities.DFISimilarity;
|
||||
import org.apache.lucene.search.similarities.AfterEffectL;
|
||||
|
@ -64,6 +65,7 @@ public class SimilarityTests extends ESSingleNodeTestCase {
|
|||
SimilarityService similarityService = createIndex("foo").similarityService();
|
||||
assertThat(similarityService.getSimilarity("classic").get(), instanceOf(ClassicSimilarity.class));
|
||||
assertThat(similarityService.getSimilarity("BM25").get(), instanceOf(BM25Similarity.class));
|
||||
assertThat(similarityService.getSimilarity("boolean").get(), instanceOf(BooleanSimilarity.class));
|
||||
assertThat(similarityService.getSimilarity("default"), equalTo(null));
|
||||
}
|
||||
|
||||
|
@ -109,6 +111,21 @@ public class SimilarityTests extends ESSingleNodeTestCase {
|
|||
assertThat(similarity.getDiscountOverlaps(), equalTo(false));
|
||||
}
|
||||
|
||||
public void testResolveSimilaritiesFromMapping_boolean() throws IOException {
|
||||
String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
|
||||
.startObject("properties")
|
||||
.startObject("field1").field("type", "text").field("similarity", "boolean").endObject()
|
||||
.endObject()
|
||||
.endObject().endObject().string();
|
||||
|
||||
IndexService indexService = createIndex("foo", Settings.EMPTY);
|
||||
DocumentMapper documentMapper = indexService.mapperService()
|
||||
.documentMapperParser()
|
||||
.parse("type", new CompressedXContent(mapping));
|
||||
assertThat(documentMapper.mappers().getMapper("field1").fieldType().similarity(),
|
||||
instanceOf(BooleanSimilarityProvider.class));
|
||||
}
|
||||
|
||||
public void testResolveSimilaritiesFromMapping_DFR() throws IOException {
|
||||
String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
|
||||
.startObject("properties")
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
|
||||
Elasticsearch allows you to configure a scoring algorithm or _similarity_ per
|
||||
field. The `similarity` setting provides a simple way of choosing a similarity
|
||||
algorithm other than the default TF/IDF, such as `BM25`.
|
||||
algorithm other than the default `BM25`, such as `TF/IDF`.
|
||||
|
||||
Similarities are mostly useful for <<text,`text`>> fields, but can also apply
|
||||
to other field types.
|
||||
|
@ -25,6 +25,11 @@ configuration are:
|
|||
Lucene. See {defguide}/practical-scoring-function.html[Lucene’s Practical Scoring Function]
|
||||
for more information.
|
||||
|
||||
`boolean`::
|
||||
A simple boolean similarity, which is used when full-text ranking is not needed
|
||||
and the score should only be based on whether the query terms match or not.
|
||||
Boolean similarity gives terms a score equal to their query boost.
|
||||
|
||||
|
||||
The `similarity` can be set on the field level when a field is first created,
|
||||
as follows:
|
||||
|
@ -42,6 +47,10 @@ PUT my_index
|
|||
"classic_field": {
|
||||
"type": "text",
|
||||
"similarity": "classic" <2>
|
||||
},
|
||||
"boolean_sim_field": {
|
||||
"type": "text",
|
||||
"similarity": "boolean" <3>
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -51,3 +60,4 @@ PUT my_index
|
|||
// CONSOLE
|
||||
<1> The `default_field` uses the `BM25` similarity.
|
||||
<2> The `classic_field` uses the `classic` similarity (i.e. TF/IDF).
|
||||
<3> The `boolean_sim_field` uses the `boolean` similarity.
|
||||
|
|
Loading…
Reference in New Issue