diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 7bc177447db..775f5f4eac9 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -126,8 +126,8 @@ New Features as alternative to their SPI name. This enables compile-time safety when defining analyzer's components. (Uwe Schindler, Shai Erera) -* LUCENE-6818: Add DFISimilarity implementing the divergence from independence - model. (Ahmet Arslan via Robert Muir) +* LUCENE-6818, LUCENE-6986: Add DFISimilarity implementing the divergence + from independence model. (Ahmet Arslan via Robert Muir) * SOLR-4619: Added removeAllAttributes() to AttributeSource, which removes all previously added attributes. diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java index a407701f5b6..33da280a7f5 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java @@ -32,16 +32,21 @@ package org.apache.lucene.search.similarities; * For more information see: A nonparametric term weighting method for information retrieval based on measuring the divergence from independence * * @lucene.experimental - * @see org.apache.lucene.search.similarities.DFRSimilarity + * @see org.apache.lucene.search.similarities.IndependenceStandardized + * @see org.apache.lucene.search.similarities.IndependenceSaturated + * @see org.apache.lucene.search.similarities.IndependenceChiSquared */ public class DFISimilarity extends SimilarityBase { - + private final Independence independence; + /** - * Sole constructor: DFI is parameter-free. + * Create DFI with the specified divergence from independence measure + * @param independenceMeasure measure of divergence from independence */ - public DFISimilarity() { + public DFISimilarity(Independence independenceMeasure) { + this.independence = independenceMeasure; } @Override @@ -52,14 +57,21 @@ public class DFISimilarity extends SimilarityBase { // if the observed frequency is less than or equal to the expected value, then return zero. if (freq <= expected) return 0; - final float chiSquare = (freq - expected) * (freq - expected) / expected; + final float measure = independence.score(freq, expected); - return stats.getBoost() * (float) log2(chiSquare + 1); + return stats.getBoost() * (float) log2(measure + 1); + } + + /** + * Returns the measure of independence + */ + public Independence getIndependence() { + return independence; } @Override public String toString() { - return "DFI"; + return "DFI(" + independence + ")"; } } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/Independence.java b/lucene/core/src/java/org/apache/lucene/search/similarities/Independence.java new file mode 100644 index 00000000000..1ce820d6dd6 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/Independence.java @@ -0,0 +1,46 @@ +package org.apache.lucene.search.similarities; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Computes the measure of divergence from independence for DFI + * scoring functions. + *

+ * See http://trec.nist.gov/pubs/trec21/papers/irra.web.nb.pdf for more information + * on different methods. + * @lucene.experimental + */ +public abstract class Independence { + + /** + * Sole constructor. (For invocation by subclass + * constructors, typically implicit.) + */ + public Independence() {} + + /** + * Computes distance from independence + * @param freq actual term frequency + * @param expected expected term frequency + */ + public abstract float score(float freq, float expected); + + // subclasses must provide a name + @Override + public abstract String toString(); +} diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceChiSquared.java b/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceChiSquared.java new file mode 100644 index 00000000000..e703449dce2 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceChiSquared.java @@ -0,0 +1,44 @@ +package org.apache.lucene.search.similarities; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Normalized chi-squared measure of distance from independence + *

+ * Described as: + * "can be used for tasks that require high precision, against both + * short and long queries." + * @lucene.experimental + */ +public class IndependenceChiSquared extends Independence { + + /** + * Sole constructor. + */ + public IndependenceChiSquared() {} + + @Override + public float score(float freq, float expected) { + return (freq - expected) * (freq - expected) / expected; + } + + @Override + public String toString() { + return "ChiSquared"; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceSaturated.java b/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceSaturated.java new file mode 100644 index 00000000000..dadd6f8cbeb --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceSaturated.java @@ -0,0 +1,43 @@ +package org.apache.lucene.search.similarities; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Saturated measure of distance from independence + *

+ * Described as: + * "for tasks that require high recall against long queries" + * @lucene.experimental + */ +public class IndependenceSaturated extends Independence { + + /** + * Sole constructor. + */ + public IndependenceSaturated() {} + + @Override + public float score(float freq, float expected) { + return (freq - expected) / expected; + } + + @Override + public String toString() { + return "Saturated"; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceStandardized.java b/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceStandardized.java new file mode 100644 index 00000000000..8d36507de79 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceStandardized.java @@ -0,0 +1,45 @@ +package org.apache.lucene.search.similarities; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Standardized measure of distance from independence + *

+ * Described as: + * "good at tasks that require high recall and high precision, especially + * against short queries composed of a few words as in the case of Internet + * searches" + * @lucene.experimental + */ +public class IndependenceStandardized extends Independence { + + /** + * Sole constructor. + */ + public IndependenceStandardized() {} + + @Override + public float score(float freq, float expected) { + return (freq - expected) / (float) Math.sqrt(expected); + } + + @Override + public String toString() { + return "Standardized"; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/package-info.java b/lucene/core/src/java/org/apache/lucene/search/similarities/package-info.java index d6dcdb7689f..a3544d71442 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/package-info.java @@ -53,6 +53,9 @@ * for IR; *

  • The implementation of two {@linkplain org.apache.lucene.search.similarities.LMSimilarity language models} from * Zhai and Lafferty's paper.
  • + *
  • {@linkplain org.apache.lucene.search.similarities.DFISimilarity Divergence from independence} models as described + * in "IRRA at TREC 2012" (Dinçer). + *
  • * * * Since {@link org.apache.lucene.search.similarities.SimilarityBase} is not diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java index 1be7c7c05aa..fbd4359cf5d 100644 --- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java @@ -72,7 +72,9 @@ public class TestSimilarity2 extends LuceneTestCase { sims.add(new LMDirichletSimilarity()); sims.add(new LMJelinekMercerSimilarity(0.1f)); sims.add(new LMJelinekMercerSimilarity(0.7f)); - sims.add(new DFISimilarity()); + for (Independence independence : TestSimilarityBase.INDEPENDENCE_MEASURES) { + sims.add(new DFISimilarity(independence)); + } } /** because of stupid things like querynorm, it's possible we computeStats on a field that doesnt exist at all diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java index 22ce8f3f996..d62096ba372 100644 --- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java @@ -95,6 +95,10 @@ public class TestSimilarityBase extends LuceneTestCase { static Lambda[] LAMBDAS = { new LambdaDF(), new LambdaTTF() }; + /** Independence measures for DFI */ + static Independence[] INDEPENDENCE_MEASURES = { + new IndependenceStandardized(), new IndependenceSaturated(), new IndependenceChiSquared() + }; private IndexSearcher searcher; private Directory dir; @@ -140,7 +144,9 @@ public class TestSimilarityBase extends LuceneTestCase { sims.add(new LMDirichletSimilarity()); sims.add(new LMJelinekMercerSimilarity(0.1f)); sims.add(new LMJelinekMercerSimilarity(0.7f)); - sims.add(new DFISimilarity()); + for (Independence independence : INDEPENDENCE_MEASURES) { + sims.add(new DFISimilarity(independence)); + } } // ------------------------------- Unit tests -------------------------------- diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java index bc6095717d8..1893f0e9822 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java +++ b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java @@ -105,6 +105,10 @@ public class RandomSimilarity extends PerFieldSimilarityWrapper { static Lambda[] LAMBDAS = { new LambdaDF(), new LambdaTTF() }; + /** Independence measures for DFI */ + static Independence[] INDEPENDENCE_MEASURES = { + new IndependenceStandardized(), new IndependenceSaturated(), new IndependenceChiSquared() + }; static List allSims; static { allSims = new ArrayList<>(); @@ -128,7 +132,9 @@ public class RandomSimilarity extends PerFieldSimilarityWrapper { allSims.add(new LMDirichletSimilarity()); */ allSims.add(new LMJelinekMercerSimilarity(0.1f)); allSims.add(new LMJelinekMercerSimilarity(0.7f)); - allSims.add(new DFISimilarity()); + for (Independence independence : INDEPENDENCE_MEASURES) { + allSims.add(new DFISimilarity(independence)); + } } @Override diff --git a/solr/core/src/java/org/apache/solr/search/similarities/DFISimilarityFactory.java b/solr/core/src/java/org/apache/solr/search/similarities/DFISimilarityFactory.java index 9f0f65685c4..ca1386dd0f5 100644 --- a/solr/core/src/java/org/apache/solr/search/similarities/DFISimilarityFactory.java +++ b/solr/core/src/java/org/apache/solr/search/similarities/DFISimilarityFactory.java @@ -18,6 +18,10 @@ package org.apache.solr.search.similarities; */ import org.apache.lucene.search.similarities.DFISimilarity; +import org.apache.lucene.search.similarities.Independence; +import org.apache.lucene.search.similarities.IndependenceChiSquared; +import org.apache.lucene.search.similarities.IndependenceSaturated; +import org.apache.lucene.search.similarities.IndependenceStandardized; import org.apache.lucene.search.similarities.Similarity; import org.apache.solr.common.params.SolrParams; import org.apache.solr.schema.SimilarityFactory; @@ -25,6 +29,12 @@ import org.apache.solr.schema.SimilarityFactory; /** * Factory for {@link DFISimilarity} *

    + * You must specify the measure of divergence from independence ("independenceMeasure") + *

    * Optional settings: *