LUCENE-6986: add more DFI measures

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1726205 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2016-01-22 13:30:12 +00:00
parent dabf5e8ce8
commit 79572ada76
13 changed files with 250 additions and 14 deletions

View File

@ -126,8 +126,8 @@ New Features
as alternative to their SPI name. This enables compile-time safety when
defining analyzer's components. (Uwe Schindler, Shai Erera)
* LUCENE-6818: Add DFISimilarity implementing the divergence from independence
model. (Ahmet Arslan via Robert Muir)
* LUCENE-6818, LUCENE-6986: Add DFISimilarity implementing the divergence
from independence model. (Ahmet Arslan via Robert Muir)
* SOLR-4619: Added removeAllAttributes() to AttributeSource, which removes
all previously added attributes.

View File

@ -32,16 +32,21 @@ package org.apache.lucene.search.similarities;
* For more information see: <a href="http://dx.doi.org/10.1007/s10791-013-9225-4">A nonparametric term weighting method for information retrieval based on measuring the divergence from independence</a>
*
* @lucene.experimental
* @see org.apache.lucene.search.similarities.DFRSimilarity
* @see org.apache.lucene.search.similarities.IndependenceStandardized
* @see org.apache.lucene.search.similarities.IndependenceSaturated
* @see org.apache.lucene.search.similarities.IndependenceChiSquared
*/
public class DFISimilarity extends SimilarityBase {
private final Independence independence;
/**
* Sole constructor: DFI is parameter-free.
* Create DFI with the specified divergence from independence measure
* @param independenceMeasure measure of divergence from independence
*/
public DFISimilarity() {
public DFISimilarity(Independence independenceMeasure) {
this.independence = independenceMeasure;
}
@Override
@ -52,14 +57,21 @@ public class DFISimilarity extends SimilarityBase {
// if the observed frequency is less than or equal to the expected value, then return zero.
if (freq <= expected) return 0;
final float chiSquare = (freq - expected) * (freq - expected) / expected;
final float measure = independence.score(freq, expected);
return stats.getBoost() * (float) log2(chiSquare + 1);
return stats.getBoost() * (float) log2(measure + 1);
}
/**
 * Exposes the divergence from independence measure used by this similarity.
 *
 * @return the measure of independence held by this instance
 */
public Independence getIndependence() {
  return this.independence;
}
@Override
public String toString() {
return "DFI";
return "DFI(" + independence + ")";
}
}

View File

@ -0,0 +1,46 @@
package org.apache.lucene.search.similarities;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Base class for the measures of divergence from independence
 * used by DFI scoring functions.
 * <p>
 * The different methods are discussed in
 * http://trec.nist.gov/pubs/trec21/papers/irra.web.nb.pdf
 *
 * @lucene.experimental
 */
public abstract class Independence {

  /**
   * Sole constructor. (For invocation by subclass
   * constructors, typically implicit.)
   */
  public Independence() {}

  /**
   * Computes the distance from independence.
   *
   * @param freq actual term frequency
   * @param expected expected term frequency
   * @return the distance computed by the concrete measure
   */
  public abstract float score(float freq, float expected);

  /**
   * Declared abstract so that every subclass has to provide a name.
   */
  @Override
  public abstract String toString();
}

View File

@ -0,0 +1,44 @@
package org.apache.lucene.search.similarities;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Normalized chi-squared measure of distance from independence.
 * <p>
 * Described as:
 * "can be used for tasks that require high precision, against both
 * short and long queries."
 *
 * @lucene.experimental
 */
public class IndependenceChiSquared extends Independence {

  /**
   * Sole constructor.
   */
  public IndependenceChiSquared() {}

  /**
   * Returns the squared difference between the actual and the expected
   * term frequency, divided by the expected term frequency.
   */
  @Override
  public float score(float freq, float expected) {
    final float deviation = freq - expected;
    return deviation * deviation / expected;
  }

  /** Returns the name of this measure. */
  @Override
  public String toString() {
    return "ChiSquared";
  }
}

View File

@ -0,0 +1,43 @@
package org.apache.lucene.search.similarities;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Saturated measure of distance from independence.
 * <p>
 * Described as:
 * "for tasks that require high recall against long queries"
 *
 * @lucene.experimental
 */
public class IndependenceSaturated extends Independence {

  /**
   * Sole constructor.
   */
  public IndependenceSaturated() {}

  /**
   * Returns the difference between the actual and the expected term
   * frequency, divided by the expected term frequency.
   */
  @Override
  public float score(float freq, float expected) {
    final float excess = freq - expected;
    return excess / expected;
  }

  /** Returns the name of this measure. */
  @Override
  public String toString() {
    return "Saturated";
  }
}

View File

@ -0,0 +1,45 @@
package org.apache.lucene.search.similarities;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Standardized measure of distance from independence.
 * <p>
 * Described as:
 * "good at tasks that require high recall and high precision, especially
 * against short queries composed of a few words as in the case of Internet
 * searches"
 *
 * @lucene.experimental
 */
public class IndependenceStandardized extends Independence {

  /**
   * Sole constructor.
   */
  public IndependenceStandardized() {}

  /**
   * Returns the difference between the actual and the expected term
   * frequency, divided by the square root of the expected term frequency.
   */
  @Override
  public float score(float freq, float expected) {
    final float root = (float) Math.sqrt(expected);
    return (freq - expected) / root;
  }

  /** Returns the name of this measure. */
  @Override
  public String toString() {
    return "Standardized";
  }
}

View File

@ -53,6 +53,9 @@
* for IR;</li>
* <li>The implementation of two {@linkplain org.apache.lucene.search.similarities.LMSimilarity language models} from
* Zhai and Lafferty's paper.</li>
* <li>{@linkplain org.apache.lucene.search.similarities.DFISimilarity Divergence from independence} models as described
* in "IRRA at TREC 2012" (Dinçer).
* </li>
* </ul>
*
* Since {@link org.apache.lucene.search.similarities.SimilarityBase} is not

View File

@ -72,7 +72,9 @@ public class TestSimilarity2 extends LuceneTestCase {
sims.add(new LMDirichletSimilarity());
sims.add(new LMJelinekMercerSimilarity(0.1f));
sims.add(new LMJelinekMercerSimilarity(0.7f));
sims.add(new DFISimilarity());
for (Independence independence : TestSimilarityBase.INDEPENDENCE_MEASURES) {
sims.add(new DFISimilarity(independence));
}
}
/** because of stupid things like querynorm, it's possible we computeStats on a field that doesn't exist at all

View File

@ -95,6 +95,10 @@ public class TestSimilarityBase extends LuceneTestCase {
static Lambda[] LAMBDAS = {
new LambdaDF(), new LambdaTTF()
};
/** Independence measures for DFI */
static Independence[] INDEPENDENCE_MEASURES = {
new IndependenceStandardized(), new IndependenceSaturated(), new IndependenceChiSquared()
};
private IndexSearcher searcher;
private Directory dir;
@ -140,7 +144,9 @@ public class TestSimilarityBase extends LuceneTestCase {
sims.add(new LMDirichletSimilarity());
sims.add(new LMJelinekMercerSimilarity(0.1f));
sims.add(new LMJelinekMercerSimilarity(0.7f));
sims.add(new DFISimilarity());
for (Independence independence : INDEPENDENCE_MEASURES) {
sims.add(new DFISimilarity(independence));
}
}
// ------------------------------- Unit tests --------------------------------

View File

@ -105,6 +105,10 @@ public class RandomSimilarity extends PerFieldSimilarityWrapper {
static Lambda[] LAMBDAS = {
new LambdaDF(), new LambdaTTF()
};
/** Independence measures for DFI */
static Independence[] INDEPENDENCE_MEASURES = {
new IndependenceStandardized(), new IndependenceSaturated(), new IndependenceChiSquared()
};
static List<Similarity> allSims;
static {
allSims = new ArrayList<>();
@ -128,7 +132,9 @@ public class RandomSimilarity extends PerFieldSimilarityWrapper {
allSims.add(new LMDirichletSimilarity()); */
allSims.add(new LMJelinekMercerSimilarity(0.1f));
allSims.add(new LMJelinekMercerSimilarity(0.7f));
allSims.add(new DFISimilarity());
for (Independence independence : INDEPENDENCE_MEASURES) {
allSims.add(new DFISimilarity(independence));
}
}
@Override

View File

@ -18,6 +18,10 @@ package org.apache.solr.search.similarities;
*/
import org.apache.lucene.search.similarities.DFISimilarity;
import org.apache.lucene.search.similarities.Independence;
import org.apache.lucene.search.similarities.IndependenceChiSquared;
import org.apache.lucene.search.similarities.IndependenceSaturated;
import org.apache.lucene.search.similarities.IndependenceStandardized;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.schema.SimilarityFactory;
@ -25,6 +29,12 @@ import org.apache.solr.schema.SimilarityFactory;
/**
* Factory for {@link DFISimilarity}
* <p>
* You must specify the measure of divergence from independence ("independenceMeasure")
* <ul>
* <li>"Standardized": {@link IndependenceStandardized}</li>
* <li>"Saturated": {@link IndependenceSaturated}</li>
* <li>"ChiSquared": {@link IndependenceChiSquared}</li>
* </ul>
* Optional settings:
* <ul>
* <li>discountOverlaps (bool): Sets {@link org.apache.lucene.search.similarities.SimilarityBase#setDiscountOverlaps(boolean)}</li>
@ -35,18 +45,32 @@ import org.apache.solr.schema.SimilarityFactory;
public class DFISimilarityFactory extends SimilarityFactory {
private boolean discountOverlaps;
private Independence independenceMeasure;
/**
 * Reads the factory settings from the given parameters: the discountOverlaps
 * flag (requested with {@code true} as the fallback value) and the
 * "independenceMeasure" name.
 */
@Override
public void init(SolrParams params) {
  super.init(params);
  this.discountOverlaps = params.getBool(ClassicSimilarityFactory.DISCOUNT_OVERLAPS, true);
  // parseIndependenceMeasure throws when the name is not a recognized measure
  final String measureName = params.get("independenceMeasure");
  this.independenceMeasure = parseIndependenceMeasure(measureName);
}
@Override
public Similarity getSimilarity() {
DFISimilarity sim = new DFISimilarity();
DFISimilarity sim = new DFISimilarity(independenceMeasure);
sim.setDiscountOverlaps(discountOverlaps);
return sim;
}
/**
 * Maps the name of an independence measure to a new instance of it.
 *
 * @param expr name of the measure: "ChiSquared", "Standardized" or "Saturated"
 * @return a new instance of the matching {@link Independence} implementation
 * @throws RuntimeException if expr is none of the names above
 */
private Independence parseIndependenceMeasure(String expr) {
  // literal-first equals: a null expr matches nothing and falls through to the throw
  if ("ChiSquared".equals(expr)) {
    return new IndependenceChiSquared();
  }
  if ("Standardized".equals(expr)) {
    return new IndependenceStandardized();
  }
  if ("Saturated".equals(expr)) {
    return new IndependenceSaturated();
  }
  throw new RuntimeException("Invalid independence measure: " + expr);
}
}

View File

@ -25,13 +25,16 @@
<!-- default parameters -->
<fieldType name="text" class="solr.TextField">
<analyzer class="org.apache.lucene.analysis.standard.StandardAnalyzer"/>
<similarity class="solr.DFISimilarityFactory"/>
<similarity class="solr.DFISimilarityFactory">
<str name="independenceMeasure">ChiSquared</str>
</similarity>
</fieldType>
<!-- with discountOverlaps parameter -->
<fieldType name="text_params" class="solr.TextField">
<analyzer class="org.apache.lucene.analysis.standard.StandardAnalyzer"/>
<similarity class="solr.DFISimilarityFactory">
<str name="independenceMeasure">ChiSquared</str>
<bool name="discountOverlaps">false</bool>
</similarity>
</fieldType>

View File

@ -18,6 +18,7 @@ package org.apache.solr.search.similarities;
*/
import org.apache.lucene.search.similarities.DFISimilarity;
import org.apache.lucene.search.similarities.IndependenceChiSquared;
import org.apache.lucene.search.similarities.Similarity;
import org.junit.BeforeClass;
@ -39,6 +40,7 @@ public class TestDFISimilarityFactory extends BaseSimilarityTestCase {
assertEquals(DFISimilarity.class, sim.getClass());
DFISimilarity dfi = (DFISimilarity) sim;
assertTrue(dfi.getDiscountOverlaps());
assertTrue(dfi.getIndependence() instanceof IndependenceChiSquared);
}
/**