diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 7bc177447db..775f5f4eac9 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -126,8 +126,8 @@ New Features as alternative to their SPI name. This enables compile-time safety when defining analyzer's components. (Uwe Schindler, Shai Erera) -* LUCENE-6818: Add DFISimilarity implementing the divergence from independence - model. (Ahmet Arslan via Robert Muir) +* LUCENE-6818, LUCENE-6986: Add DFISimilarity implementing the divergence + from independence model. (Ahmet Arslan via Robert Muir) * SOLR-4619: Added removeAllAttributes() to AttributeSource, which removes all previously added attributes. diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java index a407701f5b6..33da280a7f5 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java @@ -32,16 +32,21 @@ package org.apache.lucene.search.similarities; * For more information see: A nonparametric term weighting method for information retrieval based on measuring the divergence from independence * * @lucene.experimental - * @see org.apache.lucene.search.similarities.DFRSimilarity + * @see org.apache.lucene.search.similarities.IndependenceStandardized + * @see org.apache.lucene.search.similarities.IndependenceSaturated + * @see org.apache.lucene.search.similarities.IndependenceChiSquared */ public class DFISimilarity extends SimilarityBase { - + private final Independence independence; + /** - * Sole constructor: DFI is parameter-free. 
+ * Create DFI with the specified divergence from independence measure + * @param independenceMeasure measure of divergence from independence */ - public DFISimilarity() { + public DFISimilarity(Independence independenceMeasure) { + this.independence = independenceMeasure; } @Override @@ -52,14 +57,21 @@ public class DFISimilarity extends SimilarityBase { // if the observed frequency is less than or equal to the expected value, then return zero. if (freq <= expected) return 0; - final float chiSquare = (freq - expected) * (freq - expected) / expected; + final float measure = independence.score(freq, expected); - return stats.getBoost() * (float) log2(chiSquare + 1); + return stats.getBoost() * (float) log2(measure + 1); + } + + /** + * Returns the measure of independence + */ + public Independence getIndependence() { + return independence; } @Override public String toString() { - return "DFI"; + return "DFI(" + independence + ")"; } } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/Independence.java b/lucene/core/src/java/org/apache/lucene/search/similarities/Independence.java new file mode 100644 index 00000000000..1ce820d6dd6 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/Independence.java @@ -0,0 +1,46 @@ +package org.apache.lucene.search.similarities; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Computes the measure of divergence from independence for DFI + * scoring functions. + * <p>
+ * See http://trec.nist.gov/pubs/trec21/papers/irra.web.nb.pdf for more information + * on different methods. + * @lucene.experimental + */ +public abstract class Independence { + + /** + * Sole constructor. (For invocation by subclass + * constructors, typically implicit.) + */ + public Independence() {} + + /** + * Computes distance from independence + * @param freq actual term frequency + * @param expected expected term frequency + */ + public abstract float score(float freq, float expected); + + // subclasses must provide a name + @Override + public abstract String toString(); +} diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceChiSquared.java b/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceChiSquared.java new file mode 100644 index 00000000000..e703449dce2 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceChiSquared.java @@ -0,0 +1,44 @@ +package org.apache.lucene.search.similarities; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Normalized chi-squared measure of distance from independence + * <p>
+ * Described as: + * "can be used for tasks that require high precision, against both + * short and long queries." + * @lucene.experimental + */ +public class IndependenceChiSquared extends Independence { + + /** + * Sole constructor. + */ + public IndependenceChiSquared() {} + + @Override + public float score(float freq, float expected) { + return (freq - expected) * (freq - expected) / expected; + } + + @Override + public String toString() { + return "ChiSquared"; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceSaturated.java b/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceSaturated.java new file mode 100644 index 00000000000..dadd6f8cbeb --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceSaturated.java @@ -0,0 +1,43 @@ +package org.apache.lucene.search.similarities; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Saturated measure of distance from independence + * <p>
+ * Described as: + * "for tasks that require high recall against long queries" + * @lucene.experimental + */ +public class IndependenceSaturated extends Independence { + + /** + * Sole constructor. + */ + public IndependenceSaturated() {} + + @Override + public float score(float freq, float expected) { + return (freq - expected) / expected; + } + + @Override + public String toString() { + return "Saturated"; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceStandardized.java b/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceStandardized.java new file mode 100644 index 00000000000..8d36507de79 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceStandardized.java @@ -0,0 +1,45 @@ +package org.apache.lucene.search.similarities; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Standardized measure of distance from independence + * <p>
+ * Described as: + * "good at tasks that require high recall and high precision, especially + * against short queries composed of a few words as in the case of Internet + * searches" + * @lucene.experimental + */ +public class IndependenceStandardized extends Independence { + + /** + * Sole constructor. + */ + public IndependenceStandardized() {} + + @Override + public float score(float freq, float expected) { + return (freq - expected) / (float) Math.sqrt(expected); + } + + @Override + public String toString() { + return "Standardized"; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/package-info.java b/lucene/core/src/java/org/apache/lucene/search/similarities/package-info.java index d6dcdb7689f..a3544d71442 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/package-info.java @@ -53,6 +53,9 @@ * for IR; *
+ * You must specify the measure of divergence from independence ("independenceMeasure") + *