Merge pull request #18948 from jimferenczi/bm25

Change default similarity to BM25
This commit is contained in:
Jim Ferenczi 2016-06-21 11:30:30 +02:00 committed by GitHub
commit cc91014dee
4 changed files with 37 additions and 23 deletions

View File

@ -36,7 +36,7 @@ import java.util.function.BiFunction;
public final class SimilarityService extends AbstractIndexComponent { public final class SimilarityService extends AbstractIndexComponent {
public final static String DEFAULT_SIMILARITY = "classic"; public final static String DEFAULT_SIMILARITY = "BM25";
private final Similarity defaultSimilarity; private final Similarity defaultSimilarity;
private final Similarity baseSimilarity; private final Similarity baseSimilarity;
private final Map<String, SimilarityProvider> similarities; private final Map<String, SimilarityProvider> similarities;
@ -121,8 +121,8 @@ public final class SimilarityService extends AbstractIndexComponent {
return similarities.get(name); return similarities.get(name);
} }
public SimilarityProvider getDefaultSimilarity() { Similarity getDefaultSimilarity() {
return similarities.get("default"); return defaultSimilarity;
} }
static class PerFieldSimilarity extends PerFieldSimilarityWrapper { static class PerFieldSimilarity extends PerFieldSimilarityWrapper {

View File

@ -18,6 +18,8 @@
*/ */
package org.elasticsearch.index.similarity; package org.elasticsearch.index.similarity;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.elasticsearch.Version; import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.Settings;
@ -27,7 +29,15 @@ import org.elasticsearch.test.IndexSettingsModule;
import java.util.Collections; import java.util.Collections;
import static org.hamcrest.Matchers.instanceOf;
public class SimilarityServiceTests extends ESTestCase { public class SimilarityServiceTests extends ESTestCase {
public void testDefaultSimilarity() {
Settings settings = Settings.builder().build();
IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("test", settings);
SimilarityService service = new SimilarityService(indexSettings, Collections.emptyMap());
assertThat(service.getDefaultSimilarity(), instanceOf(BM25Similarity.class));
}
// Tests #16594 // Tests #16594
public void testOverrideBuiltInSimilarity() { public void testOverrideBuiltInSimilarity() {
@ -53,10 +63,10 @@ public class SimilarityServiceTests extends ESTestCase {
} }
// Tests #16594 // Tests #16594
public void testDefaultSimilarity() { public void testOverrideDefaultSimilarity() {
Settings settings = Settings.builder().put("index.similarity.default.type", "BM25").build(); Settings settings = Settings.builder().put("index.similarity.default.type", "classic").build();
IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("test", settings); IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("test", settings);
SimilarityService service = new SimilarityService(indexSettings, Collections.emptyMap()); SimilarityService service = new SimilarityService(indexSettings, Collections.emptyMap());
assertTrue(service.getDefaultSimilarity() instanceof BM25SimilarityProvider); assertTrue(service.getDefaultSimilarity() instanceof ClassicSimilarity);
} }
} }

View File

@ -47,25 +47,11 @@ Here we configure the DFRSimilarity so it can be referenced as
[float] [float]
=== Available similarities === Available similarities
[float]
[[classic-similarity]]
==== Classic similarity
The classic similarity that is based on the TF/IDF model. This
similarity has the following option:
`discount_overlaps`::
Determines whether overlap tokens (Tokens with
0 position increment) are ignored when computing norm. By default this
is true, meaning overlap tokens do not count when computing norms.
Type name: `classic`
[float] [float]
[[bm25]] [[bm25]]
==== BM25 similarity ==== BM25 similarity (*default*)
Another TF/IDF based similarity that has built-in tf normalization and TF/IDF based similarity that has built-in tf normalization and
is supposed to work better for short fields (like names). See is supposed to work better for short fields (like names). See
http://en.wikipedia.org/wiki/Okapi_BM25[Okapi_BM25] for more details. http://en.wikipedia.org/wiki/Okapi_BM25[Okapi_BM25] for more details.
This similarity has the following options: This similarity has the following options:
@ -86,6 +72,20 @@ This similarity has the following options:
Type name: `BM25` Type name: `BM25`
[float]
[[classic-similarity]]
==== Classic similarity
The classic similarity that is based on the TF/IDF model. This
similarity has the following option:
`discount_overlaps`::
Determines whether overlap tokens (tokens with
0 position increment) are ignored when computing norms. By default this
is true, meaning overlap tokens do not count when computing norms.
Type name: `classic`
[float] [float]
[[drf]] [[drf]]
==== DFR similarity ==== DFR similarity
@ -178,5 +178,5 @@ You can change the default similarity for all fields by putting the following se
[source,js] [source,js]
-------------------------------------------------- --------------------------------------------------
index.similarity.default.type: BM25 index.similarity.default.type: classic
-------------------------------------------------- --------------------------------------------------

View File

@ -196,3 +196,7 @@ The <<search-request-preference,search preference>> `_prefer_node` has
been superseded by `_prefer_nodes`. By specifying a single node, been superseded by `_prefer_nodes`. By specifying a single node,
`_prefer_nodes` provides the same functionality as `_prefer_node` but `_prefer_nodes` provides the same functionality as `_prefer_node` but
also supports specifying multiple nodes. also supports specifying multiple nodes.
==== Default similarity
The default similarity has been changed from `classic` to `BM25`.