Merge pull request #15446 from jimferenczi/classic_similarity

Renames `default` similarity into `classic`
This commit is contained in:
Jim Ferenczi 2015-12-30 08:42:20 -08:00
commit 992ffac509
14 changed files with 154 additions and 86 deletions

View File

@ -54,7 +54,7 @@ import java.util.Objects;
* While aggregating the total term frequency is trivial since it
* can be summed up not every {@link org.apache.lucene.search.similarities.Similarity}
* makes use of this statistic. The document frequency which is used in the
* {@link org.apache.lucene.search.similarities.DefaultSimilarity}
* {@link org.apache.lucene.search.similarities.ClassicSimilarity}
* can only be estimated as an lower-bound since it is a document based statistic. For
* the document frequency the maximum frequency across all fields per term is used
* which is the minimum number of documents the terms occurs in.

View File

@ -24,7 +24,7 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.Nullable;
@ -67,7 +67,7 @@ public class TermVectorsFilter {
this.dfs = dfs;
this.scoreTerms = new HashMap<>();
this.similarity = new DefaultSimilarity();
this.similarity = new ClassicSimilarity();
}
public void setSettings(TermVectorsRequest.FilterSettings settings) {

View File

@ -30,7 +30,7 @@ import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.util.BytesRef;
@ -138,7 +138,7 @@ public class MoreLikeThisQuery extends Query {
if (rewritten != this) {
return rewritten;
}
XMoreLikeThis mlt = new XMoreLikeThis(reader, similarity == null ? new DefaultSimilarity() : similarity);
XMoreLikeThis mlt = new XMoreLikeThis(reader, similarity == null ? new ClassicSimilarity() : similarity);
mlt.setFieldNames(moreLikeFields);
mlt.setAnalyzer(analyzer);

View File

@ -52,7 +52,7 @@ import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
@ -304,7 +304,7 @@ public final class XMoreLikeThis {
/**
* For idf() calculations.
*/
private TFIDFSimilarity similarity;// = new DefaultSimilarity();
private TFIDFSimilarity similarity;// = new ClassicSimilarity();
/**
* IndexReader to use
@ -346,7 +346,7 @@ public final class XMoreLikeThis {
* Constructor requiring an IndexReader.
*/
public XMoreLikeThis(IndexReader ir) {
this(ir, new DefaultSimilarity());
this(ir, new ClassicSimilarity());
}
public XMoreLikeThis(IndexReader ir, TFIDFSimilarity sim) {

View File

@ -35,6 +35,8 @@ import org.elasticsearch.index.mapper.MappedFieldType.Loading;
import org.elasticsearch.index.mapper.Mapper;
import org.elasticsearch.index.mapper.MapperParsingException;
import org.elasticsearch.index.mapper.object.ObjectMapper;
import org.elasticsearch.index.similarity.SimilarityProvider;
import org.elasticsearch.index.similarity.SimilarityService;
import java.util.Collections;
import java.util.Iterator;
@ -79,7 +81,8 @@ public class TypeParsers {
builder.omitNorms(nodeBooleanValue(propNode));
iterator.remove();
} else if (propName.equals("similarity")) {
builder.similarity(parserContext.getSimilarity(propNode.toString()));
SimilarityProvider similarityProvider = resolveSimilarity(parserContext, name, propNode.toString());
builder.similarity(similarityProvider);
iterator.remove();
} else if (parseMultiField(builder, name, parserContext, propName, propNode)) {
iterator.remove();
@ -210,7 +213,8 @@ public class TypeParsers {
// ignore for old indexes
iterator.remove();
} else if (propName.equals("similarity")) {
builder.similarity(parserContext.getSimilarity(propNode.toString()));
SimilarityProvider similarityProvider = resolveSimilarity(parserContext, name, propNode.toString());
builder.similarity(similarityProvider);
iterator.remove();
} else if (propName.equals("fielddata")) {
final Settings settings = Settings.builder().put(SettingsLoader.Helper.loadNestedFromMap(nodeMapValue(propNode, "fielddata"))).build();
@ -369,4 +373,15 @@ public class TypeParsers {
builder.copyTo(copyToBuilder.build());
}
private static SimilarityProvider resolveSimilarity(Mapper.TypeParser.ParserContext parserContext, String name, String value) {
if (parserContext.indexVersionCreated().before(Version.V_3_0_0) && "default".equals(value)) {
// "default" similarity has been renamed into "classic" in 3.x.
value = SimilarityService.DEFAULT_SIMILARITY;
}
SimilarityProvider similarityProvider = parserContext.getSimilarity(value);
if (similarityProvider == null) {
throw new MapperParsingException("Unknown Similarity type [" + value + "] for [" + name + "]");
}
return similarityProvider;
}
}

View File

@ -19,23 +19,23 @@
package org.elasticsearch.index.similarity;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.elasticsearch.common.settings.Settings;
/**
* {@link SimilarityProvider} for {@link DefaultSimilarity}.
* {@link SimilarityProvider} for {@link ClassicSimilarity}.
* <p>
* Configuration options available:
* <ul>
* <li>discount_overlaps</li>
* </ul>
* @see DefaultSimilarity For more information about configuration
* @see ClassicSimilarity For more information about configuration
*/
public class DefaultSimilarityProvider extends AbstractSimilarityProvider {
public class ClassicSimilarityProvider extends AbstractSimilarityProvider {
private final DefaultSimilarity similarity = new DefaultSimilarity();
private final ClassicSimilarity similarity = new ClassicSimilarity();
public DefaultSimilarityProvider(String name, Settings settings) {
public ClassicSimilarityProvider(String name, Settings settings) {
super(name);
boolean discountOverlaps = settings.getAsBoolean("discount_overlaps", true);
this.similarity.setDiscountOverlaps(discountOverlaps);
@ -45,7 +45,7 @@ public class DefaultSimilarityProvider extends AbstractSimilarityProvider {
* {@inheritDoc}
*/
@Override
public DefaultSimilarity get() {
public ClassicSimilarity get() {
return similarity;
}
}

View File

@ -35,7 +35,7 @@ import java.util.function.BiFunction;
public final class SimilarityService extends AbstractIndexComponent {
public final static String DEFAULT_SIMILARITY = "default";
public final static String DEFAULT_SIMILARITY = "classic";
private final Similarity defaultSimilarity;
private final Similarity baseSimilarity;
private final Map<String, SimilarityProvider> similarities;
@ -44,9 +44,9 @@ public final class SimilarityService extends AbstractIndexComponent {
static {
Map<String, BiFunction<String, Settings, SimilarityProvider>> defaults = new HashMap<>();
Map<String, BiFunction<String, Settings, SimilarityProvider>> buildIn = new HashMap<>();
defaults.put("default", DefaultSimilarityProvider::new);
defaults.put("classic", ClassicSimilarityProvider::new);
defaults.put("BM25", BM25SimilarityProvider::new);
buildIn.put("default", DefaultSimilarityProvider::new);
buildIn.put("classic", ClassicSimilarityProvider::new);
buildIn.put("BM25", BM25SimilarityProvider::new);
buildIn.put("DFR", DFRSimilarityProvider::new);
buildIn.put("IB", IBSimilarityProvider::new);

View File

@ -37,7 +37,7 @@ import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.TestUtil;
@ -214,7 +214,7 @@ public class BlendedTermQueryTests extends ESTestCase {
}
public IndexSearcher setSimilarity(IndexSearcher searcher) {
Similarity similarity = random().nextBoolean() ? new BM25Similarity() : new DefaultSimilarity();
Similarity similarity = random().nextBoolean() ? new BM25Similarity() : new ClassicSimilarity();
searcher.setSimilarity(similarity);
return searcher;
}

View File

@ -19,11 +19,11 @@
package org.elasticsearch.index.similarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.AfterEffectL;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.BasicModelG;
import org.apache.lucene.search.similarities.DFRSimilarity;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.DistributionSPL;
import org.apache.lucene.search.similarities.IBSimilarity;
import org.apache.lucene.search.similarities.LMDirichletSimilarity;
@ -31,11 +31,16 @@ import org.apache.lucene.search.similarities.LMJelinekMercerSimilarity;
import org.apache.lucene.search.similarities.LambdaTTF;
import org.apache.lucene.search.similarities.NormalizationH2;
import org.elasticsearch.common.compress.CompressedXContent;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.IndexService;
import org.elasticsearch.index.mapper.DocumentMapper;
import org.elasticsearch.index.mapper.DocumentMapperParser;
import org.elasticsearch.index.mapper.MapperParsingException;
import org.elasticsearch.test.ESSingleNodeTestCase;
import org.elasticsearch.test.VersionUtils;
import java.io.IOException;
@ -45,42 +50,43 @@ import static org.hamcrest.CoreMatchers.instanceOf;
public class SimilarityTests extends ESSingleNodeTestCase {
public void testResolveDefaultSimilarities() {
SimilarityService similarityService = createIndex("foo").similarityService();
assertThat(similarityService.getSimilarity("default").get(), instanceOf(DefaultSimilarity.class));
assertThat(similarityService.getSimilarity("classic").get(), instanceOf(ClassicSimilarity.class));
assertThat(similarityService.getSimilarity("BM25").get(), instanceOf(BM25Similarity.class));
assertThat(similarityService.getSimilarity("default"), equalTo(null));
}
public void testResolveSimilaritiesFromMapping_default() throws IOException {
public void testResolveSimilaritiesFromMapping_classic() throws IOException {
String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties")
.startObject("field1").field("type", "string").field("similarity", "my_similarity").endObject()
.endObject()
.endObject().endObject().string();
.startObject("properties")
.startObject("field1").field("type", "string").field("similarity", "my_similarity").endObject()
.endObject()
.endObject().endObject().string();
Settings indexSettings = Settings.settingsBuilder()
.put("index.similarity.my_similarity.type", "default")
.put("index.similarity.my_similarity.discount_overlaps", false)
.build();
.put("index.similarity.my_similarity.type", "classic")
.put("index.similarity.my_similarity.discount_overlaps", false)
.build();
IndexService indexService = createIndex("foo", indexSettings);
DocumentMapper documentMapper = indexService.mapperService().documentMapperParser().parse("type", new CompressedXContent(mapping));
assertThat(documentMapper.mappers().getMapper("field1").fieldType().similarity(), instanceOf(DefaultSimilarityProvider.class));
assertThat(documentMapper.mappers().getMapper("field1").fieldType().similarity(), instanceOf(ClassicSimilarityProvider.class));
DefaultSimilarity similarity = (DefaultSimilarity) documentMapper.mappers().getMapper("field1").fieldType().similarity().get();
ClassicSimilarity similarity = (ClassicSimilarity) documentMapper.mappers().getMapper("field1").fieldType().similarity().get();
assertThat(similarity.getDiscountOverlaps(), equalTo(false));
}
public void testResolveSimilaritiesFromMapping_bm25() throws IOException {
String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties")
.startObject("field1").field("type", "string").field("similarity", "my_similarity").endObject()
.endObject()
.endObject().endObject().string();
.startObject("properties")
.startObject("field1").field("type", "string").field("similarity", "my_similarity").endObject()
.endObject()
.endObject().endObject().string();
Settings indexSettings = Settings.settingsBuilder()
.put("index.similarity.my_similarity.type", "BM25")
.put("index.similarity.my_similarity.k1", 2.0f)
.put("index.similarity.my_similarity.b", 1.5f)
.put("index.similarity.my_similarity.discount_overlaps", false)
.build();
.put("index.similarity.my_similarity.type", "BM25")
.put("index.similarity.my_similarity.k1", 2.0f)
.put("index.similarity.my_similarity.b", 1.5f)
.put("index.similarity.my_similarity.discount_overlaps", false)
.build();
IndexService indexService = createIndex("foo", indexSettings);
DocumentMapper documentMapper = indexService.mapperService().documentMapperParser().parse("type", new CompressedXContent(mapping));
assertThat(documentMapper.mappers().getMapper("field1").fieldType().similarity(), instanceOf(BM25SimilarityProvider.class));
@ -93,18 +99,18 @@ public class SimilarityTests extends ESSingleNodeTestCase {
public void testResolveSimilaritiesFromMapping_DFR() throws IOException {
String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties")
.startObject("field1").field("type", "string").field("similarity", "my_similarity").endObject()
.endObject()
.endObject().endObject().string();
.startObject("properties")
.startObject("field1").field("type", "string").field("similarity", "my_similarity").endObject()
.endObject()
.endObject().endObject().string();
Settings indexSettings = Settings.settingsBuilder()
.put("index.similarity.my_similarity.type", "DFR")
.put("index.similarity.my_similarity.basic_model", "g")
.put("index.similarity.my_similarity.after_effect", "l")
.put("index.similarity.my_similarity.normalization", "h2")
.put("index.similarity.my_similarity.normalization.h2.c", 3f)
.build();
.put("index.similarity.my_similarity.type", "DFR")
.put("index.similarity.my_similarity.basic_model", "g")
.put("index.similarity.my_similarity.after_effect", "l")
.put("index.similarity.my_similarity.normalization", "h2")
.put("index.similarity.my_similarity.normalization.h2.c", 3f)
.build();
IndexService indexService = createIndex("foo", indexSettings);
DocumentMapper documentMapper = indexService.mapperService().documentMapperParser().parse("type", new CompressedXContent(mapping));
assertThat(documentMapper.mappers().getMapper("field1").fieldType().similarity(), instanceOf(DFRSimilarityProvider.class));
@ -118,18 +124,18 @@ public class SimilarityTests extends ESSingleNodeTestCase {
public void testResolveSimilaritiesFromMapping_IB() throws IOException {
String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties")
.startObject("field1").field("type", "string").field("similarity", "my_similarity").endObject()
.endObject()
.endObject().endObject().string();
.startObject("properties")
.startObject("field1").field("type", "string").field("similarity", "my_similarity").endObject()
.endObject()
.endObject().endObject().string();
Settings indexSettings = Settings.settingsBuilder()
.put("index.similarity.my_similarity.type", "IB")
.put("index.similarity.my_similarity.distribution", "spl")
.put("index.similarity.my_similarity.lambda", "ttf")
.put("index.similarity.my_similarity.normalization", "h2")
.put("index.similarity.my_similarity.normalization.h2.c", 3f)
.build();
.put("index.similarity.my_similarity.type", "IB")
.put("index.similarity.my_similarity.distribution", "spl")
.put("index.similarity.my_similarity.lambda", "ttf")
.put("index.similarity.my_similarity.normalization", "h2")
.put("index.similarity.my_similarity.normalization.h2.c", 3f)
.build();
IndexService indexService = createIndex("foo", indexSettings);
DocumentMapper documentMapper = indexService.mapperService().documentMapperParser().parse("type", new CompressedXContent(mapping));
assertThat(documentMapper.mappers().getMapper("field1").fieldType().similarity(), instanceOf(IBSimilarityProvider.class));
@ -143,15 +149,15 @@ public class SimilarityTests extends ESSingleNodeTestCase {
public void testResolveSimilaritiesFromMapping_LMDirichlet() throws IOException {
String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties")
.startObject("field1").field("type", "string").field("similarity", "my_similarity").endObject()
.endObject()
.endObject().endObject().string();
.startObject("properties")
.startObject("field1").field("type", "string").field("similarity", "my_similarity").endObject()
.endObject()
.endObject().endObject().string();
Settings indexSettings = Settings.settingsBuilder()
.put("index.similarity.my_similarity.type", "LMDirichlet")
.put("index.similarity.my_similarity.mu", 3000f)
.build();
.put("index.similarity.my_similarity.type", "LMDirichlet")
.put("index.similarity.my_similarity.mu", 3000f)
.build();
IndexService indexService = createIndex("foo", indexSettings);
DocumentMapper documentMapper = indexService.mapperService().documentMapperParser().parse("type", new CompressedXContent(mapping));
assertThat(documentMapper.mappers().getMapper("field1").fieldType().similarity(), instanceOf(LMDirichletSimilarityProvider.class));
@ -162,15 +168,15 @@ public class SimilarityTests extends ESSingleNodeTestCase {
public void testResolveSimilaritiesFromMapping_LMJelinekMercer() throws IOException {
String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties")
.startObject("field1").field("type", "string").field("similarity", "my_similarity").endObject()
.endObject()
.endObject().endObject().string();
.startObject("properties")
.startObject("field1").field("type", "string").field("similarity", "my_similarity").endObject()
.endObject()
.endObject().endObject().string();
Settings indexSettings = Settings.settingsBuilder()
.put("index.similarity.my_similarity.type", "LMJelinekMercer")
.put("index.similarity.my_similarity.lambda", 0.7f)
.build();
.put("index.similarity.my_similarity.type", "LMJelinekMercer")
.put("index.similarity.my_similarity.lambda", 0.7f)
.build();
IndexService indexService = createIndex("foo", indexSettings);
DocumentMapper documentMapper = indexService.mapperService().documentMapperParser().parse("type", new CompressedXContent(mapping));
assertThat(documentMapper.mappers().getMapper("field1").fieldType().similarity(), instanceOf(LMJelinekMercerSimilarityProvider.class));
@ -178,4 +184,47 @@ public class SimilarityTests extends ESSingleNodeTestCase {
LMJelinekMercerSimilarity similarity = (LMJelinekMercerSimilarity) documentMapper.mappers().getMapper("field1").fieldType().similarity().get();
assertThat(similarity.getLambda(), equalTo(0.7f));
}
public void testResolveSimilaritiesFromMapping_Unknown() throws IOException {
String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties")
.startObject("field1").field("type", "string").field("similarity", "unknown_similarity").endObject()
.endObject()
.endObject().endObject().string();
IndexService indexService = createIndex("foo");
try {
indexService.mapperService().documentMapperParser().parse("type", new CompressedXContent(mapping));
fail("Expected MappingParsingException");
} catch (MapperParsingException e) {
assertThat(e.getMessage(), equalTo("Unknown Similarity type [unknown_similarity] for [field1]"));
}
}
public void testSimilarityDefaultBackCompat() throws IOException {
String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties")
.startObject("field1")
.field("similarity", "default")
.field("type", "string")
.endObject()
.endObject()
.endObject().string();
Settings settings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), Version.V_2_0_0, Version.V_2_2_0))
.build();
DocumentMapperParser parser = createIndex("test_v2.x", settings).mapperService().documentMapperParser();
DocumentMapper documentMapper = parser.parse("type", new CompressedXContent(mapping));
assertThat(documentMapper.mappers().getMapper("field1").fieldType().similarity(), instanceOf(ClassicSimilarityProvider.class));
assertThat(documentMapper.mappers().getMapper("field1").fieldType().similarity().name(), equalTo("classic"));
parser = createIndex("test_v3.x").mapperService().documentMapperParser();
try {
parser.parse("type", new CompressedXContent(mapping));
fail("Expected MappingParsingException");
} catch (MapperParsingException e) {
assertThat(e.getMessage(), equalTo("Unknown Similarity type [default] for [field1]"));
}
}
}

View File

@ -28,7 +28,7 @@ import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.not;
public class SimilarityIT extends ESIntegTestCase {
public class SimilarityIT extends ESIntegTestCase {
public void testCustomBM25Similarity() throws Exception {
try {
client().admin().indices().prepareDelete("test").execute().actionGet();
@ -45,7 +45,7 @@ public class SimilarityIT extends ESIntegTestCase {
.field("type", "string")
.endObject()
.startObject("field2")
.field("similarity", "default")
.field("similarity", "classic")
.field("type", "string")
.endObject()
.endObject()

View File

@ -48,10 +48,10 @@ Here we configure the DFRSimilarity so it can be referenced as
=== Available similarities
[float]
[[default-similarity]]
==== Default similarity
[[classic-similarity]]
==== Classic similarity
The default similarity that is based on the TF/IDF model. This
The classic similarity that is based on the TF/IDF model. This
similarity has the following option:
`discount_overlaps`::
@ -59,7 +59,7 @@ similarity has the following option:
0 position increment) are ignored when computing norm. By default this
is true, meaning overlap tokens do not count when computing norms.
Type name: `default`
Type name: `classic`
[float]
[[bm25]]

View File

@ -15,7 +15,7 @@ similarities. For more details about this expert options, see the
The only similarities which can be used out of the box, without any further
configuration are:
`default`::
`classic`::
The Default TF/IDF algorithm used by Elasticsearch and
Lucene. See {defguide}/practical-scoring-function.html[Lucenes Practical Scoring Function]
for more information.
@ -49,6 +49,6 @@ PUT my_index
}
--------------------------------------------------
// AUTOSENSE
<1> The `default_field` uses the `default` similarity (ie TF/IDF).
<1> The `default_field` uses the `classic` similarity (ie TF/IDF).
<2> The `bm25_field` uses the `BM25` similarity.

View File

@ -166,7 +166,7 @@ Defaults depend on the <<mapping-index,`index`>> setting:
<<similarity,`similarity`>>::
Which scoring algorithm or _similarity_ should be used. Defaults
to `default`, which uses TF/IDF.
to `classic`, which uses TF/IDF.
<<term-vector,`term_vector`>>::

View File

@ -224,6 +224,10 @@ Allocation settings deprecated in 1.x have been removed:
Please change the setting in your configuration files or in the clusterstate to use the new settings instead.
==== Similarity settings
The 'default' similarity has been renamed to 'classic'.
[[breaking_30_mapping_changes]]
=== Mapping changes