From ecab74fe6ca8b135ecf756ffd977322e5df960ed Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 7 Apr 2014 12:20:46 +1000 Subject: [PATCH] add lucene language model similarities (Dirichlet & JelinekMercer) --- .../index-modules/similarity.asciidoc | 25 +++++++++ .../LMDirichletSimilarityProvider.java | 55 +++++++++++++++++++ .../LMJelinekMercerSimilarityProvider.java | 55 +++++++++++++++++++ .../index/similarity/SimilarityTests.java | 41 ++++++++++++++ 4 files changed, 176 insertions(+) create mode 100644 src/main/java/org/elasticsearch/index/similarity/LMDirichletSimilarityProvider.java create mode 100644 src/main/java/org/elasticsearch/index/similarity/LMJelinekMercerSimilarityProvider.java diff --git a/docs/reference/index-modules/similarity.asciidoc b/docs/reference/index-modules/similarity.asciidoc index ae9f368f3f1..0c95709a860 100644 --- a/docs/reference/index-modules/similarity.asciidoc +++ b/docs/reference/index-modules/similarity.asciidoc @@ -121,6 +121,31 @@ based model] . This similarity has the following options: Type name: `IB` +[float] +[[lm_dirichlet]] +==== LM Dirichlet similarity. + +http://lucene.apache.org/core/4_7_1/core/org/apache/lucene/search/similarities/LMDirichletSimilarity.html[LM +Dirichlet similarity] . This similarity has the following options: + +[horizontal] +`mu`:: Default to `2000`. + +Type name: `LMDirichlet` + +[float] +[[lm_jelinek_mercer]] +==== LM Jelinek Mercer similarity. + +http://lucene.apache.org/core/4_7_1/core/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.html[LM +Jelinek Mercer similarity] . This similarity has the following options: + +[horizontal] +`lambda`:: The optimal value depends on both the collection and the query. The optimal value is around `0.1` +for title queries and `0.7` for long queries. Default to `0.1`. 
+ +Type name: `LMJelinekMercer` + [float] [[default-base]] ==== Default and Base Similarities diff --git a/src/main/java/org/elasticsearch/index/similarity/LMDirichletSimilarityProvider.java b/src/main/java/org/elasticsearch/index/similarity/LMDirichletSimilarityProvider.java new file mode 100644 index 00000000000..797ce6417e8 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/similarity/LMDirichletSimilarityProvider.java @@ -0,0 +1,55 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.similarity; + +import org.apache.lucene.search.similarities.LMDirichletSimilarity; +import org.apache.lucene.search.similarities.Similarity; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; + +/** + * {@link SimilarityProvider} for {@link LMDirichletSimilarity}. + *
<p/>
+ * Configuration options available:
+ * <ul>
+ *     <li>mu</li>
+ * </ul>
+ * @see LMDirichletSimilarity For more information about configuration + */ +public class LMDirichletSimilarityProvider extends AbstractSimilarityProvider { + + private final LMDirichletSimilarity similarity; + + @Inject + public LMDirichletSimilarityProvider(@Assisted String name, @Assisted Settings settings) { + super(name); + float mu = settings.getAsFloat("mu", 2000f); + this.similarity = new LMDirichletSimilarity(mu); + } + + /** + * {@inheritDoc} + */ + @Override + public Similarity get() { + return similarity; + } +} diff --git a/src/main/java/org/elasticsearch/index/similarity/LMJelinekMercerSimilarityProvider.java b/src/main/java/org/elasticsearch/index/similarity/LMJelinekMercerSimilarityProvider.java new file mode 100644 index 00000000000..9be02366b63 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/similarity/LMJelinekMercerSimilarityProvider.java @@ -0,0 +1,55 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.similarity; + +import org.apache.lucene.search.similarities.LMJelinekMercerSimilarity; +import org.apache.lucene.search.similarities.Similarity; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; + +/** + * {@link SimilarityProvider} for {@link LMJelinekMercerSimilarity}. + *
<p/>
+ * Configuration options available:
+ * <ul>
+ *     <li>lambda</li>
+ * </ul>
+ * @see LMJelinekMercerSimilarity For more information about configuration + */ +public class LMJelinekMercerSimilarityProvider extends AbstractSimilarityProvider { + + private final LMJelinekMercerSimilarity similarity; + + @Inject + public LMJelinekMercerSimilarityProvider(@Assisted String name, @Assisted Settings settings) { + super(name); + float lambda = settings.getAsFloat("lambda", 0.1f); + this.similarity = new LMJelinekMercerSimilarity(lambda); + } + + /** + * {@inheritDoc} + */ + @Override + public Similarity get() { + return similarity; + } +} diff --git a/src/test/java/org/elasticsearch/index/similarity/SimilarityTests.java b/src/test/java/org/elasticsearch/index/similarity/SimilarityTests.java index 8bf8586a6a8..a3f9753f4b5 100644 --- a/src/test/java/org/elasticsearch/index/similarity/SimilarityTests.java +++ b/src/test/java/org/elasticsearch/index/similarity/SimilarityTests.java @@ -151,6 +151,47 @@ public class SimilarityTests extends ElasticsearchTestCase { assertThat(((NormalizationH2) similarity.getNormalization()).getC(), equalTo(3f)); } + @Test + public void testResolveSimilaritiesFromMapping_LMDirichlet() throws IOException { + String mapping = XContentFactory.jsonBuilder().startObject().startObject("type") + .startObject("properties") + .startObject("field1").field("type", "string").field("similarity", "my_similarity").endObject() + .endObject() + .endObject().endObject().string(); + + Settings indexSettings = ImmutableSettings.settingsBuilder() + .put("index.similarity.my_similarity.type", "LMDirichlet") + .put("index.similarity.my_similarity.mu", 3000f) + .build(); + SimilarityService similarityService = similarityService(indexSettings); + DocumentMapper documentMapper = similarityService.mapperService().documentMapperParser().parse(mapping); + assertThat(documentMapper.mappers().name("field1").mapper().similarity(), instanceOf(LMDirichletSimilarityProvider.class)); + + LMDirichletSimilarity similarity = (LMDirichletSimilarity) 
documentMapper.mappers().name("field1").mapper().similarity().get(); + assertThat(similarity.getMu(), equalTo(3000f)); + } + + @Test + public void testResolveSimilaritiesFromMapping_LMJelinekMercer() throws IOException { + String mapping = XContentFactory.jsonBuilder().startObject().startObject("type") + .startObject("properties") + .startObject("field1").field("type", "string").field("similarity", "my_similarity").endObject() + .endObject() + .endObject().endObject().string(); + + Settings indexSettings = ImmutableSettings.settingsBuilder() + .put("index.similarity.my_similarity.type", "LMJelinekMercer") + .put("index.similarity.my_similarity.lambda", 0.7f) + .build(); + SimilarityService similarityService = similarityService(indexSettings); + DocumentMapper documentMapper = similarityService.mapperService().documentMapperParser().parse(mapping); + assertThat(documentMapper.mappers().name("field1").mapper().similarity(), instanceOf(LMJelinekMercerSimilarityProvider.class)); + + LMJelinekMercerSimilarity similarity = (LMJelinekMercerSimilarity) documentMapper.mappers().name("field1").mapper().similarity().get(); + assertThat(similarity.getLambda(), equalTo(0.7f)); + } + + private static SimilarityService similarityService() { return similarityService(ImmutableSettings.Builder.EMPTY_SETTINGS); }