Adds boolean similarity to Elasticsearch (#23637)

This commit adds the boolean similarity scoring from Lucene to Elasticsearch. The boolean similarity provides a means to specify that a field should not be scored with typical full-text ranking algorithms, but only on whether the query terms match the document or not. Boolean similarity gives each query term a score equal to its query boost only. Boolean similarity is available as a default similarity option, so a field can be given boolean similarity by declaring "similarity": "boolean" in its mapping. Closes #6731
2017-03-28 10:17:23 -04:00 · 2017-03-28 10:17:23 -04:00 · 8359dd05c9
parent b3dca364ce
commit 8359dd05c9
4 changed files with 77 additions and 1 deletion
--- a/core/src/main/java/org/elasticsearch/index/similarity/BooleanSimilarityProvider.java
+++ b/core/src/main/java/org/elasticsearch/index/similarity/BooleanSimilarityProvider.java
@ -0,0 +1,48 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.similarity;
+
+import org.apache.lucene.search.similarities.BooleanSimilarity;
+import org.elasticsearch.common.settings.Settings;
+
+/**
+ * {@link SimilarityProvider} for the {@link BooleanSimilarity},
+ * which is a simple similarity that gives terms a score equal
+ * to their query boost only.  This is useful in situations where
+ * a field does not need to be scored by a full-text ranking
+ * algorithm, but rather all that matters is whether the query
+ * terms matched or not.
+ */
+public class BooleanSimilarityProvider extends AbstractSimilarityProvider {
+
+    private final BooleanSimilarity similarity = new BooleanSimilarity(); // created once; returned by get()
+
+    public BooleanSimilarityProvider(String name, Settings settings, Settings indexSettings) {
+        super(name); // settings and indexSettings are accepted but never read here
+    }
+
+    /**
+     * {@inheritDoc} Always returns the instance created together with this provider.
+     */
+    @Override
+    public BooleanSimilarity get() {
+        return similarity;
+    }
+}
--- a/core/src/main/java/org/elasticsearch/index/similarity/SimilarityService.java
+++ b/core/src/main/java/org/elasticsearch/index/similarity/SimilarityService.java
@ -47,6 +47,7 @@ public final class SimilarityService extends AbstractIndexComponent {
        Map<String, TriFunction<String, Settings, Settings, SimilarityProvider>> buildIn = new HashMap<>();
        defaults.put("classic", ClassicSimilarityProvider::new);
        defaults.put("BM25", BM25SimilarityProvider::new);
+        defaults.put("boolean", BooleanSimilarityProvider::new);
        buildIn.put("classic", ClassicSimilarityProvider::new);
        buildIn.put("BM25", BM25SimilarityProvider::new);
        buildIn.put("DFR", DFRSimilarityProvider::new);
--- a/core/src/test/java/org/elasticsearch/index/similarity/SimilarityTests.java
+++ b/core/src/test/java/org/elasticsearch/index/similarity/SimilarityTests.java
@ -19,6 +19,7 @@

 package org.elasticsearch.index.similarity;

+import org.apache.lucene.search.similarities.BooleanSimilarity;
 import org.apache.lucene.search.similarities.ClassicSimilarity;
 import org.apache.lucene.search.similarities.DFISimilarity;
 import org.apache.lucene.search.similarities.AfterEffectL;
@ -64,6 +65,7 @@ public class SimilarityTests extends ESSingleNodeTestCase {
        SimilarityService similarityService = createIndex("foo").similarityService();
        assertThat(similarityService.getSimilarity("classic").get(), instanceOf(ClassicSimilarity.class));
        assertThat(similarityService.getSimilarity("BM25").get(), instanceOf(BM25Similarity.class));
+        assertThat(similarityService.getSimilarity("boolean").get(), instanceOf(BooleanSimilarity.class));
        assertThat(similarityService.getSimilarity("default"), equalTo(null));
    }

@ -109,6 +111,21 @@ public class SimilarityTests extends ESSingleNodeTestCase {
        assertThat(similarity.getDiscountOverlaps(), equalTo(false));
    }

+    public void testResolveSimilaritiesFromMapping_boolean() throws IOException {
+        String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
+            .startObject("properties")
+            .startObject("field1").field("type", "text").field("similarity", "boolean").endObject() // text field, "boolean"
+            .endObject()
+            .endObject().endObject().string();
+        // Parse the mapping on an index created with empty settings, then check which provider field1 got.
+        IndexService indexService = createIndex("foo", Settings.EMPTY);
+        DocumentMapper documentMapper = indexService.mapperService()
+            .documentMapperParser()
+            .parse("type", new CompressedXContent(mapping));
+        assertThat(documentMapper.mappers().getMapper("field1").fieldType().similarity(),
+            instanceOf(BooleanSimilarityProvider.class)); // the provider type is asserted, not the Lucene similarity
+    }
+
    public void testResolveSimilaritiesFromMapping_DFR() throws IOException {
        String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
            .startObject("properties")
--- a/docs/reference/mapping/params/similarity.asciidoc
+++ b/docs/reference/mapping/params/similarity.asciidoc
@ -3,7 +3,7 @@

 Elasticsearch allows you to configure a scoring algorithm or _similarity_ per
 field. The `similarity` setting provides a simple way of choosing a similarity
-algorithm other than the default TF/IDF, such as `BM25`.
+algorithm other than the default `BM25`, such as `TF/IDF`.

 Similarities are mostly useful for <<text,`text`>> fields, but can also apply
 to other field types.
@ -25,6 +25,11 @@ configuration are:
        Lucene. See {defguide}/practical-scoring-function.html[Lucene’s Practical Scoring Function]
        for more information.

+`boolean`::
+        A simple boolean similarity, which is used when full-text ranking is not needed
+        and the score should only be based on whether the query terms match or not.
+        Boolean similarity gives terms a score equal to their query boost.
+

 The `similarity` can be set on the field level when a field is first created,
 as follows:
@ -42,6 +47,10 @@ PUT my_index
        "classic_field": {
          "type": "text",
          "similarity": "classic" <2>
+        },
+        "boolean_sim_field": {
+          "type": "text",
+          "similarity": "boolean" <3>
        }
      }
    }
@ -51,3 +60,4 @@ PUT my_index
 // CONSOLE
 <1> The `default_field` uses the `BM25` similarity.
 <2> The `classic_field` uses the `classic` similarity (ie TF/IDF).
+<3> The `boolean_sim_field` uses the `boolean` similarity.