add lucene language model similarities (Dirichlet & JelinekMercer)

This commit is contained in:
Kevin Wang 2014-04-07 12:20:46 +10:00 committed by Luca Cavanna
parent 9df655adb2
commit ecab74fe6c
4 changed files with 176 additions and 0 deletions

View File

@ -121,6 +121,31 @@ based model] . This similarity has the following options:
Type name: `IB`
[float]
[[lm_dirichlet]]
==== LM Dirichlet similarity.
http://lucene.apache.org/core/4_7_1/core/org/apache/lucene/search/similarities/LMDirichletSimilarity.html[LM
Dirichlet similarity] . This similarity has the following options:
[horizontal]
`mu`:: Defaults to `2000`.
Type name: `LMDirichlet`
[float]
[[lm_jelinek_mercer]]
==== LM Jelinek Mercer similarity.
http://lucene.apache.org/core/4_7_1/core/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.html[LM
Jelinek Mercer similarity] . This similarity has the following options:
[horizontal]
`lambda`:: The optimal value depends on both the collection and the query; it is around `0.1`
for title queries and `0.7` for long queries. Defaults to `0.1`.
Type name: `LMJelinekMercer`
[float]
[[default-base]]
==== Default and Base Similarities

View File

@ -0,0 +1,55 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.similarity;
import org.apache.lucene.search.similarities.LMDirichletSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
/**
 * {@link SimilarityProvider} that hands out a single, pre-built {@link LMDirichletSimilarity}.
 * <p>
 * Supported settings:
 * <ul>
 *     <li>{@code mu} - float handed to the {@link LMDirichletSimilarity} constructor;
 *     {@code 2000} is used when the setting is absent</li>
 * </ul>
 *
 * @see LMDirichletSimilarity For more information about configuration
 */
public class LMDirichletSimilarityProvider extends AbstractSimilarityProvider {

    /** Settings key under which the {@code mu} value is looked up. */
    private static final String MU_KEY = "mu";

    /** Value used for {@code mu} when the settings do not define it. */
    private static final float DEFAULT_MU = 2000f;

    /** Built once in the constructor and returned from every {@link #get()} call. */
    private final LMDirichletSimilarity similarity;

    /**
     * Reads {@code mu} from the given settings and builds the similarity.
     *
     * @param name     name of this similarity, forwarded to the superclass
     * @param settings settings from which {@code mu} is read
     */
    @Inject
    public LMDirichletSimilarityProvider(@Assisted String name, @Assisted Settings settings) {
        super(name);
        // Unbox into a primitive first so the float-taking constructor is selected explicitly.
        final float configuredMu = settings.getAsFloat(MU_KEY, DEFAULT_MU);
        this.similarity = new LMDirichletSimilarity(configuredMu);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public Similarity get() {
        return this.similarity;
    }
}

View File

@ -0,0 +1,55 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.similarity;
import org.apache.lucene.search.similarities.LMJelinekMercerSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
/**
 * {@link SimilarityProvider} that hands out a single, pre-built {@link LMJelinekMercerSimilarity}.
 * <p>
 * Supported settings:
 * <ul>
 *     <li>{@code lambda} - float handed to the {@link LMJelinekMercerSimilarity} constructor;
 *     {@code 0.1} is used when the setting is absent</li>
 * </ul>
 *
 * @see LMJelinekMercerSimilarity For more information about configuration
 */
public class LMJelinekMercerSimilarityProvider extends AbstractSimilarityProvider {

    /** Settings key under which the {@code lambda} value is looked up. */
    private static final String LAMBDA_KEY = "lambda";

    /** Value used for {@code lambda} when the settings do not define it. */
    private static final float DEFAULT_LAMBDA = 0.1f;

    /** Built once in the constructor and returned from every {@link #get()} call. */
    private final LMJelinekMercerSimilarity similarity;

    /**
     * Reads {@code lambda} from the given settings and builds the similarity.
     *
     * @param name     name of this similarity, forwarded to the superclass
     * @param settings settings from which {@code lambda} is read
     */
    @Inject
    public LMJelinekMercerSimilarityProvider(@Assisted String name, @Assisted Settings settings) {
        super(name);
        // Unbox into a primitive first so the float-taking constructor is selected explicitly.
        final float configuredLambda = settings.getAsFloat(LAMBDA_KEY, DEFAULT_LAMBDA);
        this.similarity = new LMJelinekMercerSimilarity(configuredLambda);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public Similarity get() {
        return this.similarity;
    }
}

View File

@ -151,6 +151,47 @@ public class SimilarityTests extends ElasticsearchTestCase {
assertThat(((NormalizationH2) similarity.getNormalization()).getC(), equalTo(3f));
}
/**
 * Checks that a similarity registered with type {@code LMDirichlet} in the index settings is
 * resolved for a mapped field and exposes the {@code mu} value that was configured.
 */
@Test
public void testResolveSimilaritiesFromMapping_LMDirichlet() throws IOException {
    // Register "my_similarity" as an LMDirichlet similarity with an explicit mu.
    Settings settings = ImmutableSettings.settingsBuilder()
            .put("index.similarity.my_similarity.type", "LMDirichlet")
            .put("index.similarity.my_similarity.mu", 3000f)
            .build();
    SimilarityService service = similarityService(settings);

    // Mapping with one string field that refers to the similarity by name.
    String mappingJson = XContentFactory.jsonBuilder()
            .startObject()
                .startObject("type")
                    .startObject("properties")
                        .startObject("field1")
                            .field("type", "string")
                            .field("similarity", "my_similarity")
                        .endObject()
                    .endObject()
                .endObject()
            .endObject()
            .string();
    DocumentMapper parsed = service.mapperService().documentMapperParser().parse(mappingJson);

    // The field must be wired to the LMDirichlet provider ...
    assertThat(parsed.mappers().name("field1").mapper().similarity(), instanceOf(LMDirichletSimilarityProvider.class));

    // ... and the similarity it supplies must carry the configured mu.
    Object resolved = parsed.mappers().name("field1").mapper().similarity().get();
    LMDirichletSimilarity dirichlet = (LMDirichletSimilarity) resolved;
    assertThat(dirichlet.getMu(), equalTo(3000f));
}
/**
 * Checks that a similarity registered with type {@code LMJelinekMercer} in the index settings is
 * resolved for a mapped field and exposes the {@code lambda} value that was configured.
 */
@Test
public void testResolveSimilaritiesFromMapping_LMJelinekMercer() throws IOException {
    // Register "my_similarity" as an LMJelinekMercer similarity with an explicit lambda.
    Settings settings = ImmutableSettings.settingsBuilder()
            .put("index.similarity.my_similarity.type", "LMJelinekMercer")
            .put("index.similarity.my_similarity.lambda", 0.7f)
            .build();
    SimilarityService service = similarityService(settings);

    // Mapping with one string field that refers to the similarity by name.
    String mappingJson = XContentFactory.jsonBuilder()
            .startObject()
                .startObject("type")
                    .startObject("properties")
                        .startObject("field1")
                            .field("type", "string")
                            .field("similarity", "my_similarity")
                        .endObject()
                    .endObject()
                .endObject()
            .endObject()
            .string();
    DocumentMapper parsed = service.mapperService().documentMapperParser().parse(mappingJson);

    // The field must be wired to the LMJelinekMercer provider ...
    assertThat(parsed.mappers().name("field1").mapper().similarity(), instanceOf(LMJelinekMercerSimilarityProvider.class));

    // ... and the similarity it supplies must carry the configured lambda.
    Object resolved = parsed.mappers().name("field1").mapper().similarity().get();
    LMJelinekMercerSimilarity jelinekMercer = (LMJelinekMercerSimilarity) resolved;
    assertThat(jelinekMercer.getLambda(), equalTo(0.7f));
}
/**
 * No-argument convenience variant: delegates to the {@code similarityService} overload that
 * accepts settings, passing {@code ImmutableSettings.Builder.EMPTY_SETTINGS}.
 */
private static SimilarityService similarityService() {
    Settings noSettings = ImmutableSettings.Builder.EMPTY_SETTINGS;
    return similarityService(noSettings);
}