mirror of https://github.com/apache/lucene.git
LUCENE-6986: add more DFI measures
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1726205 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
dabf5e8ce8
commit
79572ada76
|
@ -126,8 +126,8 @@ New Features
|
|||
as alternative to their SPI name. This enables compile-time safety when
|
||||
defining analyzer's components. (Uwe Schindler, Shai Erera)
|
||||
|
||||
* LUCENE-6818: Add DFISimilarity implementing the divergence from independence
|
||||
model. (Ahmet Arslan via Robert Muir)
|
||||
* LUCENE-6818, LUCENE-6986: Add DFISimilarity implementing the divergence
|
||||
from independence model. (Ahmet Arslan via Robert Muir)
|
||||
|
||||
* SOLR-4619: Added removeAllAttributes() to AttributeSource, which removes
|
||||
all previously added attributes.
|
||||
|
|
|
@ -32,16 +32,21 @@ package org.apache.lucene.search.similarities;
|
|||
* For more information see: <a href="http://dx.doi.org/10.1007/s10791-013-9225-4">A nonparametric term weighting method for information retrieval based on measuring the divergence from independence</a>
|
||||
*
|
||||
* @lucene.experimental
|
||||
* @see org.apache.lucene.search.similarities.DFRSimilarity
|
||||
* @see org.apache.lucene.search.similarities.IndependenceStandardized
|
||||
* @see org.apache.lucene.search.similarities.IndependenceSaturated
|
||||
* @see org.apache.lucene.search.similarities.IndependenceChiSquared
|
||||
*/
|
||||
|
||||
|
||||
public class DFISimilarity extends SimilarityBase {
|
||||
|
||||
private final Independence independence;
|
||||
|
||||
/**
|
||||
* Sole constructor: DFI is parameter-free.
|
||||
* Create DFI with the specified divergence from independence measure
|
||||
* @param independenceMeasure measure of divergence from independence
|
||||
*/
|
||||
public DFISimilarity() {
|
||||
public DFISimilarity(Independence independenceMeasure) {
|
||||
this.independence = independenceMeasure;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -52,14 +57,21 @@ public class DFISimilarity extends SimilarityBase {
|
|||
// if the observed frequency is less than or equal to the expected value, then return zero.
|
||||
if (freq <= expected) return 0;
|
||||
|
||||
final float chiSquare = (freq - expected) * (freq - expected) / expected;
|
||||
final float measure = independence.score(freq, expected);
|
||||
|
||||
return stats.getBoost() * (float) log2(chiSquare + 1);
|
||||
return stats.getBoost() * (float) log2(measure + 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the measure of independence
|
||||
*/
|
||||
public Independence getIndependence() {
|
||||
return independence;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "DFI";
|
||||
return "DFI(" + independence + ")";
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,46 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Computes the measure of divergence from independence for DFI
|
||||
* scoring functions.
|
||||
* <p>
|
||||
* See http://trec.nist.gov/pubs/trec21/papers/irra.web.nb.pdf for more information
|
||||
* on different methods.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class Independence {
|
||||
|
||||
/**
|
||||
* Sole constructor. (For invocation by subclass
|
||||
* constructors, typically implicit.)
|
||||
*/
|
||||
public Independence() {}
|
||||
|
||||
/**
|
||||
* Computes distance from independence
|
||||
* @param freq actual term frequency
|
||||
* @param expected expected term frequency
|
||||
*/
|
||||
public abstract float score(float freq, float expected);
|
||||
|
||||
// subclasses must provide a name
|
||||
@Override
|
||||
public abstract String toString();
|
||||
}
|
|
@ -0,0 +1,44 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Normalized chi-squared measure of distance from independence
|
||||
* <p>
|
||||
* Described as:
|
||||
* "can be used for tasks that require high precision, against both
|
||||
* short and long queries."
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class IndependenceChiSquared extends Independence {
|
||||
|
||||
/**
|
||||
* Sole constructor.
|
||||
*/
|
||||
public IndependenceChiSquared() {}
|
||||
|
||||
@Override
|
||||
public float score(float freq, float expected) {
|
||||
return (freq - expected) * (freq - expected) / expected;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "ChiSquared";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,43 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Saturated measure of distance from independence
|
||||
* <p>
|
||||
* Described as:
|
||||
* "for tasks that require high recall against long queries"
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class IndependenceSaturated extends Independence {
|
||||
|
||||
/**
|
||||
* Sole constructor.
|
||||
*/
|
||||
public IndependenceSaturated() {}
|
||||
|
||||
@Override
|
||||
public float score(float freq, float expected) {
|
||||
return (freq - expected) / expected;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Saturated";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,45 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Standardized measure of distance from independence
|
||||
* <p>
|
||||
* Described as:
|
||||
* "good at tasks that require high recall and high precision, especially
|
||||
* against short queries composed of a few words as in the case of Internet
|
||||
* searches"
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class IndependenceStandardized extends Independence {
|
||||
|
||||
/**
|
||||
* Sole constructor.
|
||||
*/
|
||||
public IndependenceStandardized() {}
|
||||
|
||||
@Override
|
||||
public float score(float freq, float expected) {
|
||||
return (freq - expected) / (float) Math.sqrt(expected);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Standardized";
|
||||
}
|
||||
}
|
|
@ -53,6 +53,9 @@
|
|||
* for IR;</li>
|
||||
* <li>The implementation of two {@linkplain org.apache.lucene.search.similarities.LMSimilarity language models} from
|
||||
* Zhai and Lafferty's paper.</li>
|
||||
* <li>{@linkplain org.apache.lucene.search.similarities.DFISimilarity Divergence from independence} models as described
|
||||
* in "IRRA at TREC 2012" (Dinçer).
|
||||
* </li>
|
||||
* </ul>
|
||||
*
|
||||
* Since {@link org.apache.lucene.search.similarities.SimilarityBase} is not
|
||||
|
|
|
@ -72,7 +72,9 @@ public class TestSimilarity2 extends LuceneTestCase {
|
|||
sims.add(new LMDirichletSimilarity());
|
||||
sims.add(new LMJelinekMercerSimilarity(0.1f));
|
||||
sims.add(new LMJelinekMercerSimilarity(0.7f));
|
||||
sims.add(new DFISimilarity());
|
||||
for (Independence independence : TestSimilarityBase.INDEPENDENCE_MEASURES) {
|
||||
sims.add(new DFISimilarity(independence));
|
||||
}
|
||||
}
|
||||
|
||||
/** because of stupid things like querynorm, it's possible we computeStats on a field that doesn't exist at all
|
||||
|
|
|
@ -95,6 +95,10 @@ public class TestSimilarityBase extends LuceneTestCase {
|
|||
static Lambda[] LAMBDAS = {
|
||||
new LambdaDF(), new LambdaTTF()
|
||||
};
|
||||
/** Independence measures for DFI */
|
||||
static Independence[] INDEPENDENCE_MEASURES = {
|
||||
new IndependenceStandardized(), new IndependenceSaturated(), new IndependenceChiSquared()
|
||||
};
|
||||
|
||||
private IndexSearcher searcher;
|
||||
private Directory dir;
|
||||
|
@ -140,7 +144,9 @@ public class TestSimilarityBase extends LuceneTestCase {
|
|||
sims.add(new LMDirichletSimilarity());
|
||||
sims.add(new LMJelinekMercerSimilarity(0.1f));
|
||||
sims.add(new LMJelinekMercerSimilarity(0.7f));
|
||||
sims.add(new DFISimilarity());
|
||||
for (Independence independence : INDEPENDENCE_MEASURES) {
|
||||
sims.add(new DFISimilarity(independence));
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------------------- Unit tests --------------------------------
|
||||
|
|
|
@ -105,6 +105,10 @@ public class RandomSimilarity extends PerFieldSimilarityWrapper {
|
|||
static Lambda[] LAMBDAS = {
|
||||
new LambdaDF(), new LambdaTTF()
|
||||
};
|
||||
/** Independence measures for DFI */
|
||||
static Independence[] INDEPENDENCE_MEASURES = {
|
||||
new IndependenceStandardized(), new IndependenceSaturated(), new IndependenceChiSquared()
|
||||
};
|
||||
static List<Similarity> allSims;
|
||||
static {
|
||||
allSims = new ArrayList<>();
|
||||
|
@ -128,7 +132,9 @@ public class RandomSimilarity extends PerFieldSimilarityWrapper {
|
|||
allSims.add(new LMDirichletSimilarity()); */
|
||||
allSims.add(new LMJelinekMercerSimilarity(0.1f));
|
||||
allSims.add(new LMJelinekMercerSimilarity(0.7f));
|
||||
allSims.add(new DFISimilarity());
|
||||
for (Independence independence : INDEPENDENCE_MEASURES) {
|
||||
allSims.add(new DFISimilarity(independence));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -18,6 +18,10 @@ package org.apache.solr.search.similarities;
|
|||
*/
|
||||
|
||||
import org.apache.lucene.search.similarities.DFISimilarity;
|
||||
import org.apache.lucene.search.similarities.Independence;
|
||||
import org.apache.lucene.search.similarities.IndependenceChiSquared;
|
||||
import org.apache.lucene.search.similarities.IndependenceSaturated;
|
||||
import org.apache.lucene.search.similarities.IndependenceStandardized;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.apache.solr.schema.SimilarityFactory;
|
||||
|
@ -25,6 +29,12 @@ import org.apache.solr.schema.SimilarityFactory;
|
|||
/**
|
||||
* Factory for {@link DFISimilarity}
|
||||
* <p>
|
||||
* You must specify the measure of divergence from independence ("independenceMeasure")
|
||||
* <ul>
|
||||
* <li>"Standardized": {@link IndependenceStandardized}</li>
|
||||
* <li>"Saturated": {@link IndependenceSaturated}</li>
|
||||
* <li>"ChiSquared": {@link IndependenceChiSquared}</li>
|
||||
* </ul>
|
||||
* Optional settings:
|
||||
* <ul>
|
||||
* <li>discountOverlaps (bool): Sets {@link org.apache.lucene.search.similarities.SimilarityBase#setDiscountOverlaps(boolean)}</li>
|
||||
|
@ -35,18 +45,32 @@ import org.apache.solr.schema.SimilarityFactory;
|
|||
public class DFISimilarityFactory extends SimilarityFactory {
|
||||
|
||||
private boolean discountOverlaps;
|
||||
private Independence independenceMeasure;
|
||||
|
||||
@Override
|
||||
public void init(SolrParams params) {
|
||||
super.init(params);
|
||||
discountOverlaps = params.getBool(ClassicSimilarityFactory.DISCOUNT_OVERLAPS, true);
|
||||
independenceMeasure = parseIndependenceMeasure(params.get("independenceMeasure"));
|
||||
}
|
||||
|
||||
@Override
|
||||
public Similarity getSimilarity() {
|
||||
DFISimilarity sim = new DFISimilarity();
|
||||
DFISimilarity sim = new DFISimilarity(independenceMeasure);
|
||||
sim.setDiscountOverlaps(discountOverlaps);
|
||||
return sim;
|
||||
}
|
||||
|
||||
private Independence parseIndependenceMeasure(String expr) {
|
||||
if ("ChiSquared".equals(expr)) {
|
||||
return new IndependenceChiSquared();
|
||||
} else if ("Standardized".equals(expr)) {
|
||||
return new IndependenceStandardized();
|
||||
} else if ("Saturated".equals(expr)) {
|
||||
return new IndependenceSaturated();
|
||||
} else {
|
||||
throw new RuntimeException("Invalid independence measure: " + expr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -25,13 +25,16 @@
|
|||
<!-- default parameters -->
|
||||
<fieldType name="text" class="solr.TextField">
|
||||
<analyzer class="org.apache.lucene.analysis.standard.StandardAnalyzer"/>
|
||||
<similarity class="solr.DFISimilarityFactory"/>
|
||||
<similarity class="solr.DFISimilarityFactory">
|
||||
<str name="independenceMeasure">ChiSquared</str>
|
||||
</similarity>
|
||||
</fieldType>
|
||||
|
||||
<!-- with discountOverlaps parameter -->
|
||||
<fieldType name="text_params" class="solr.TextField">
|
||||
<analyzer class="org.apache.lucene.analysis.standard.StandardAnalyzer"/>
|
||||
<similarity class="solr.DFISimilarityFactory">
|
||||
<str name="independenceMeasure">ChiSquared</str>
|
||||
<bool name="discountOverlaps">false</bool>
|
||||
</similarity>
|
||||
</fieldType>
|
||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.solr.search.similarities;
|
|||
*/
|
||||
|
||||
import org.apache.lucene.search.similarities.DFISimilarity;
|
||||
import org.apache.lucene.search.similarities.IndependenceChiSquared;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.junit.BeforeClass;
|
||||
|
||||
|
@ -39,6 +40,7 @@ public class TestDFISimilarityFactory extends BaseSimilarityTestCase {
|
|||
assertEquals(DFISimilarity.class, sim.getClass());
|
||||
DFISimilarity dfi = (DFISimilarity) sim;
|
||||
assertTrue(dfi.getDiscountOverlaps());
|
||||
assertTrue(dfi.getIndependence() instanceof IndependenceChiSquared);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
Loading…
Reference in New Issue