From 63b63c573487fe6b054afb6073c057a88a15288f Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Wed, 6 Dec 2017 18:19:57 +0100 Subject: [PATCH] LUCENE-8015: Fixed DFR similarities' scores to not decrease when tfn increases. --- .../search/similarities/AfterEffect.java | 27 +--- .../search/similarities/AfterEffectB.java | 6 +- .../search/similarities/AfterEffectL.java | 6 +- .../search/similarities/BasicModel.java | 10 +- .../search/similarities/BasicModelBE.java | 55 ------- .../search/similarities/BasicModelD.java | 56 ------- .../search/similarities/BasicModelG.java | 12 +- .../search/similarities/BasicModelIF.java | 11 +- .../search/similarities/BasicModelIn.java | 15 +- .../search/similarities/BasicModelIne.java | 11 +- .../search/similarities/BasicModelP.java | 49 ------ .../search/similarities/DFRSimilarity.java | 22 +-- .../similarities/BasicModelTestCase.java | 5 +- .../search/similarities/TestBasicModelBE.java | 30 ---- .../search/similarities/TestBasicModelD.java | 30 ---- .../search/similarities/TestBasicModelP.java | 30 ---- .../similarities/TestSimilarityBase.java | 62 +------ .../similarities/BaseSimilarityTestCase.java | 153 +++++++++++++----- .../search/similarities/RandomSimilarity.java | 6 +- .../similarities/DFRSimilarityFactory.java | 18 +-- .../solr/collection1/conf/schema-dfr.xml | 2 +- .../TestDFRSimilarityFactory.java | 4 +- 22 files changed, 193 insertions(+), 427 deletions(-) delete mode 100644 lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelBE.java delete mode 100644 lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelD.java delete mode 100644 lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelP.java delete mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelBE.java delete mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelD.java delete mode 100644 
lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelP.java diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffect.java b/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffect.java index e62513e53ea..cbcd7898273 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffect.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffect.java @@ -37,33 +37,12 @@ public abstract class AfterEffect { */ public AfterEffect() {} - /** Returns the aftereffect score. */ - public abstract double score(BasicStats stats, double tfn); + /** Returns the product of the after effect with {@code 1+tfn}. + * This may not depend on the value of {@code tfn}. */ + public abstract double scoreTimes1pTfn(BasicStats stats); /** Returns an explanation for the score. */ public abstract Explanation explain(BasicStats stats, double tfn); - - /** Implementation used when there is no aftereffect. */ - public static final class NoAfterEffect extends AfterEffect { - - /** Sole constructor: parameter-free */ - public NoAfterEffect() {} - - @Override - public double score(BasicStats stats, double tfn) { - return 1.0; - } - - @Override - public Explanation explain(BasicStats stats, double tfn) { - return Explanation.match(1, "no aftereffect"); - } - - @Override - public String toString() { - return ""; - } - } /** * Subclasses must override this method to return the code of the diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffectB.java b/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffectB.java index b1bff96c5a5..6678cd9d721 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffectB.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffectB.java @@ -29,16 +29,16 @@ public class AfterEffectB extends AfterEffect { public AfterEffectB() {} @Override - public final double score(BasicStats stats, 
double tfn) { + public final double scoreTimes1pTfn(BasicStats stats) { long F = stats.getTotalTermFreq()+1; long n = stats.getDocFreq()+1; - return (F + 1) / (n * (tfn + 1)); + return (F + 1.0) / n; } @Override public final Explanation explain(BasicStats stats, double tfn) { return Explanation.match( - (float) score(stats, tfn), + (float) (scoreTimes1pTfn(stats) / (1 + tfn)), getClass().getSimpleName() + ", computed from: ", Explanation.match((float) tfn, "tfn"), Explanation.match(stats.getTotalTermFreq(), "totalTermFreq"), diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffectL.java b/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffectL.java index a8ee53d79e4..60a1b1dc9c7 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffectL.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffectL.java @@ -29,14 +29,14 @@ public class AfterEffectL extends AfterEffect { public AfterEffectL() {} @Override - public final double score(BasicStats stats, double tfn) { - return 1 / (tfn + 1); + public final double scoreTimes1pTfn(BasicStats stats) { + return 1.0; } @Override public final Explanation explain(BasicStats stats, double tfn) { return Explanation.match( - (float) score(stats, tfn), + (float) (scoreTimes1pTfn(stats) / (1 + tfn)), getClass().getSimpleName() + ", computed from: ", Explanation.match((float) tfn, "tfn")); } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModel.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModel.java index 20dee40c6b9..51d4571e6a7 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModel.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModel.java @@ -36,8 +36,10 @@ public abstract class BasicModel { */ public BasicModel() {} - /** Returns the informative content score. 
*/ - public abstract double score(BasicStats stats, double tfn); + /** Returns the informative content score combined with the after effect, more specifically + * {@code informationContentScore * aeTimes1pTfn / (1 + tfn)}. This function must be + * non-decreasing with {@code tfn}. */ + public abstract double score(BasicStats stats, double tfn, double aeTimes1pTfn); /** * Returns an explanation for the score. @@ -46,9 +48,9 @@ public abstract class BasicModel { * explanation for such models. Subclasses that use other statistics must * override this method.

*/ - public Explanation explain(BasicStats stats, double tfn) { + public Explanation explain(BasicStats stats, double tfn, double aeTimes1pTfn) { return Explanation.match( - (float) score(stats, tfn), + (float) (score(stats, tfn, aeTimes1pTfn) * (1 + tfn) / aeTimes1pTfn), getClass().getSimpleName() + ", computed from: ", Explanation.match(stats.getNumberOfDocuments(), "numberOfDocuments"), Explanation.match(stats.getTotalTermFreq(), "totalTermFreq")); diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelBE.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelBE.java deleted file mode 100644 index 0ba5686fc5f..00000000000 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelBE.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.search.similarities; - - -import static org.apache.lucene.search.similarities.SimilarityBase.log2; - -/** - * Limiting form of the Bose-Einstein model. 
The formula used in Lucene differs - * slightly from the one in the original paper: {@code F} is increased by {@code tfn+1} - * and {@code N} is increased by {@code F} - * @lucene.experimental - * NOTE: in some corner cases this model may give poor performance or infinite scores with - * Normalizations that return large or small values for {@code tfn} such as NormalizationH3. - * Consider using the geometric approximation ({@link BasicModelG}) instead, which provides - * the same relevance but with less practical problems. - */ -public class BasicModelBE extends BasicModel { - - /** Sole constructor: parameter-free */ - public BasicModelBE() {} - - @Override - public final double score(BasicStats stats, double tfn) { - double F = stats.getTotalTermFreq() + 1 + tfn; - // approximation only holds true when F << N, so we use N += F - double N = F + stats.getNumberOfDocuments(); - return (-log2((N - 1) * Math.E) - + f(N + F - 1, N + F - tfn - 2) - f(F, F - tfn)); - } - - /** The f helper function defined for BE. */ - private final double f(double n, double m) { - return (m + 0.5) * log2(n / m) + (n - m) * log2(n); - } - - @Override - public String toString() { - return "Be"; - } -} diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelD.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelD.java deleted file mode 100644 index 70b004b29a5..00000000000 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelD.java +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.search.similarities; - - -import static org.apache.lucene.search.similarities.SimilarityBase.log2; - -/** - * Implements the approximation of the binomial model with the divergence - * for DFR. The formula used in Lucene differs slightly from the one in the - * original paper: to avoid underflow for small values of {@code N} and - * {@code F}, {@code N} is increased by {@code 1} and - * {@code F} is always increased by {@code tfn+1}. - *

- * WARNING: for terms that do not meet the expected random distribution - * (e.g. stopwords), this model may give poor performance, such as - * abnormally high or NaN scores for low tf values. - * @lucene.experimental - */ -public class BasicModelD extends BasicModel { - - /** Sole constructor: parameter-free */ - public BasicModelD() {} - - @Override - public final double score(BasicStats stats, double tfn) { - // we have to ensure phi is always < 1 for tiny TTF values, otherwise nphi can go negative, - // resulting in NaN. cleanest way is to unconditionally always add tfn to totalTermFreq - // to create a 'normalized' F. - double F = stats.getTotalTermFreq() + 1 + tfn; - double phi = tfn / F; - double nphi = 1 - phi; - double p = 1.0 / (stats.getNumberOfDocuments() + 1); - double D = phi * log2(phi / p) + nphi * log2(nphi / (1 - p)); - return D * F + 0.5 * log2(1 + 2 * Math.PI * tfn * nphi); - } - - @Override - public String toString() { - return "D"; - } -} diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelG.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelG.java index 2f8cb4368e0..ce871967f9b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelG.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelG.java @@ -31,13 +31,21 @@ public class BasicModelG extends BasicModel { public BasicModelG() {} @Override - public final double score(BasicStats stats, double tfn) { + public final double score(BasicStats stats, double tfn, double aeTimes1pTfn) { // just like in BE, approximation only holds true when F << N, so we use lambda = F / (N + F) double F = stats.getTotalTermFreq() + 1; double N = stats.getNumberOfDocuments(); double lambda = F / (N + F); // -log(1 / (lambda + 1)) -> log(lambda + 1) - return log2(lambda + 1) + tfn * log2((1 + lambda) / lambda); + double A = log2(lambda + 1); + double B = log2((1 + lambda) / lambda); + + // basic model G should 
return (A + B * tfn) + // which we rewrite to B * (1 + tfn) - (B - A) + // so that it can be combined with the after effect while still guaranteeing + // that the result is non-decreasing with tfn since B >= A + + return (B - (B - A) / (1 + tfn)) * aeTimes1pTfn; } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIF.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIF.java index 5b7350bbd14..16781cdf496 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIF.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIF.java @@ -29,10 +29,17 @@ public class BasicModelIF extends BasicModel { public BasicModelIF() {} @Override - public final double score(BasicStats stats, double tfn) { + public final double score(BasicStats stats, double tfn, double aeTimes1pTfn) { long N = stats.getNumberOfDocuments(); long F = stats.getTotalTermFreq(); - return tfn * log2(1 + (N + 1) / (F + 0.5)); + double A = log2(1 + (N + 1) / (F + 0.5)); + + // basic model IF should return A * tfn + // which we rewrite to A * (1 + tfn) - A + // so that it can be combined with the after effect while still guaranteeing + // that the result is non-decreasing with tfn + + return A * aeTimes1pTfn * (1 - 1 / (1 + tfn)); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIn.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIn.java index a09eedb0d8f..5f1e1814c81 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIn.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIn.java @@ -30,16 +30,23 @@ public class BasicModelIn extends BasicModel { public BasicModelIn() {} @Override - public final double score(BasicStats stats, double tfn) { + public final double score(BasicStats stats, double tfn, double aeTimes1pTfn) { long N = stats.getNumberOfDocuments(); long n = 
stats.getDocFreq(); - return tfn * log2((N + 1) / (n + 0.5)); + double A = log2((N + 1) / (n + 0.5)); + + // basic model I(n) should return A * tfn + // which we rewrite to A * (1 + tfn) - A + // so that it can be combined with the after effect while still guaranteeing + // that the result is non-decreasing with tfn + + return A * aeTimes1pTfn * (1 - 1 / (1 + tfn)); } @Override - public final Explanation explain(BasicStats stats, double tfn) { + public final Explanation explain(BasicStats stats, double tfn, double aeTimes1pTfn) { return Explanation.match( - (float) score(stats, tfn), + (float) (score(stats, tfn, aeTimes1pTfn) * (1 + tfn) / aeTimes1pTfn), getClass().getSimpleName() + ", computed from: ", Explanation.match(stats.getNumberOfDocuments(), "numberOfDocuments"), Explanation.match(stats.getDocFreq(), "docFreq")); diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIne.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIne.java index b4e830d166f..fb755fa50dd 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIne.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIne.java @@ -30,11 +30,18 @@ public class BasicModelIne extends BasicModel { public BasicModelIne() {} @Override - public final double score(BasicStats stats, double tfn) { + public final double score(BasicStats stats, double tfn, double aeTimes1pTfn) { long N = stats.getNumberOfDocuments(); long F = stats.getTotalTermFreq(); double ne = N * (1 - Math.pow((N - 1) / (double)N, F)); - return tfn * log2((N + 1) / (ne + 0.5)); + double A = log2((N + 1) / (ne + 0.5)); + + // basic model I(ne) should return A * tfn + // which we rewrite to A * (1 + tfn) - A + // so that it can be combined with the after effect while still guaranteeing + // that the result is non-decreasing with tfn + + return A * aeTimes1pTfn * (1 - 1 / (1 + tfn)); } @Override diff --git 
a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelP.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelP.java deleted file mode 100644 index f66e3d004bd..00000000000 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelP.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.search.similarities; - - -import static org.apache.lucene.search.similarities.SimilarityBase.log2; - -/** - * Implements the Poisson approximation for the binomial model for DFR. - * @lucene.experimental - *

- * WARNING: for terms that do not meet the expected random distribution - * (e.g. stopwords), this model may give poor performance, such as - * abnormally high scores for low tf values. - */ -public class BasicModelP extends BasicModel { - /** {@code log2(Math.E)}, precomputed. */ - protected static double LOG2_E = log2(Math.E); - - /** Sole constructor: parameter-free */ - public BasicModelP() {} - - @Override - public final double score(BasicStats stats, double tfn) { - double lambda = (stats.getTotalTermFreq()+1) / (double) (stats.getNumberOfDocuments()+1); - return tfn * log2(tfn / lambda) - + (lambda + 1 / (12 * tfn) - tfn) * LOG2_E - + 0.5 * log2(2 * Math.PI * tfn); - } - - @Override - public String toString() { - return "P"; - } -} diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java index aacd2460d7a..d793d947f3a 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java @@ -20,7 +20,6 @@ package org.apache.lucene.search.similarities; import java.util.List; import org.apache.lucene.search.Explanation; -import org.apache.lucene.search.similarities.AfterEffect.NoAfterEffect; import org.apache.lucene.search.similarities.Normalization.NoNormalization; /** @@ -40,10 +39,7 @@ import org.apache.lucene.search.similarities.Normalization.NoNormalization; *

    *
  1. {@link BasicModel}: Basic model of information content: *
      - *
    • {@link BasicModelBE}: Limiting form of Bose-Einstein *
    • {@link BasicModelG}: Geometric approximation of Bose-Einstein - *
    • {@link BasicModelP}: Poisson approximation of the Binomial - *
    • {@link BasicModelD}: Divergence approximation of the Binomial *
    • {@link BasicModelIn}: Inverse document frequency *
    • {@link BasicModelIne}: Inverse expected document * frequency [mixture of Poisson and IDF] @@ -55,7 +51,6 @@ import org.apache.lucene.search.similarities.Normalization.NoNormalization; *
        *
      • {@link AfterEffectL}: Laplace's law of succession *
      • {@link AfterEffectB}: Ratio of two Bernoulli processes - *
      • {@link NoAfterEffect}: no first normalization *
      *
    • {@link Normalization}: Second (length) normalization: *
        @@ -72,6 +67,10 @@ import org.apache.lucene.search.similarities.Normalization.NoNormalization; *
*

Note that qtf, the multiplicity of term-occurrence in the query, * is not handled by this implementation.

+ *

Note that basic models BE (Limiting form of Bose-Einstein), P (Poisson + * approximation of the Binomial) and D (Divergence approximation of the + * Binomial) are not implemented because their formulas couldn't be written in + * a way that makes scores non-decreasing with the normalized term frequency. * @see BasicModel * @see AfterEffect * @see Normalization @@ -89,8 +88,8 @@ public class DFRSimilarity extends SimilarityBase { * Creates DFRSimilarity from the three components. *

* Note that null values are not allowed: - * if you want no normalization or after-effect, instead pass - * {@link NoNormalization} or {@link NoAfterEffect} respectively. + * if you want no normalization, instead pass + * {@link NoNormalization}. * @param basicModel Basic model of information content * @param afterEffect First normalization of information gain * @param normalization Second (length) normalization @@ -109,8 +108,8 @@ public class DFRSimilarity extends SimilarityBase { @Override protected double score(BasicStats stats, double freq, double docLen) { double tfn = normalization.tfn(stats, freq, docLen); - return stats.getBoost() * - basicModel.score(stats, tfn) * afterEffect.score(stats, tfn); + double aeTimes1pTfn = afterEffect.scoreTimes1pTfn(stats); + return stats.getBoost() * basicModel.score(stats, tfn, aeTimes1pTfn); } @Override @@ -121,9 +120,10 @@ public class DFRSimilarity extends SimilarityBase { } Explanation normExpl = normalization.explain(stats, freq, docLen); - float tfn = normExpl.getValue(); + double tfn = normalization.tfn(stats, freq, docLen); + double aeTimes1pTfn = afterEffect.scoreTimes1pTfn(stats); subs.add(normExpl); - subs.add(basicModel.explain(stats, tfn)); + subs.add(basicModel.explain(stats, tfn, aeTimes1pTfn)); subs.add(afterEffect.explain(stats, tfn)); } diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/BasicModelTestCase.java b/lucene/core/src/test/org/apache/lucene/search/similarities/BasicModelTestCase.java index 66236669704..3cb83e2da94 100644 --- a/lucene/core/src/test/org/apache/lucene/search/similarities/BasicModelTestCase.java +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/BasicModelTestCase.java @@ -23,11 +23,8 @@ public abstract class BasicModelTestCase extends BaseSimilarityTestCase { @Override protected final Similarity getSimilarity(Random random) { final AfterEffect afterEffect; - switch(random.nextInt(3)) { + switch(random.nextInt(2)) { case 0: - afterEffect = new 
AfterEffect.NoAfterEffect(); - break; - case 1: afterEffect = new AfterEffectL(); break; default: diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelBE.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelBE.java deleted file mode 100644 index 2dc956f7da0..00000000000 --- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelBE.java +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.search.similarities; - -import org.apache.lucene.util.LuceneTestCase.AwaitsFix; - -// returns negative scores at least, but it warns it has problems -@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010") -public class TestBasicModelBE extends BasicModelTestCase { - - @Override - protected BasicModel getBasicModel() { - return new BasicModelBE(); - } - -} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelD.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelD.java deleted file mode 100644 index 7eee359b3d4..00000000000 --- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelD.java +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.search.similarities; - -import org.apache.lucene.util.LuceneTestCase.AwaitsFix; - -// scores go backwards with respect to TF, but it warns it has problems -@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010") -public class TestBasicModelD extends BasicModelTestCase { - - @Override - protected BasicModel getBasicModel() { - return new BasicModelD(); - } - -} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelP.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelP.java deleted file mode 100644 index 2788ff8edb8..00000000000 --- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelP.java +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.search.similarities; - -import org.apache.lucene.util.LuceneTestCase.AwaitsFix; - -//scores go backwards with respect to TF, but it warns it has problems -@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010") -public class TestBasicModelP extends BasicModelTestCase { - - @Override - protected BasicModel getBasicModel() { - return new BasicModelP(); - } - -} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java index 8a6227c9552..be85801a471 100644 --- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java @@ -76,13 +76,12 @@ public class TestSimilarityBase extends LuceneTestCase { private static float FLOAT_EPSILON = 1e-5f; /** The DFR basic models to test. */ static BasicModel[] BASIC_MODELS = { - new BasicModelBE(), new BasicModelD(), new BasicModelG(), - new BasicModelIF(), new BasicModelIn(), new BasicModelIne(), - new BasicModelP() + new BasicModelG(), new BasicModelIF(), new BasicModelIn(), + new BasicModelIne() }; /** The DFR aftereffects to test. */ static AfterEffect[] AFTER_EFFECTS = { - new AfterEffectB(), new AfterEffectL(), new AfterEffect.NoAfterEffect() + new AfterEffectB(), new AfterEffectL() }; /** The DFR normalizations to test. */ static Normalization[] NORMALIZATIONS = { @@ -445,21 +444,6 @@ public class TestSimilarityBase extends LuceneTestCase { new IBSimilarity(new DistributionSPL(), new LambdaTTF(), new Normalization.NoNormalization()); correctnessTestCore(sim, 2.2387237548828125f); } - - /** Correctness test for the PL2 DFR model. 
*/ - public void testPL2() throws IOException { - SimilarityBase sim = new DFRSimilarity( - new BasicModelP(), new AfterEffectL(), new NormalizationH2()); - float tfn = (float)(FREQ * SimilarityBase.log2( - 1 + AVG_FIELD_LENGTH / DOC_LEN)); // 8.1894750101 - float l = 1.0f / (tfn + 1.0f); // 0.108820144666 - float lambda = (1.0f + TOTAL_TERM_FREQ) / (1f + NUMBER_OF_DOCUMENTS); // 0.7029703 - float p = (float)(tfn * SimilarityBase.log2(tfn / lambda) + - (lambda + 1 / (12 * tfn) - tfn) * SimilarityBase.log2(Math.E) + - 0.5 * SimilarityBase.log2(2 * Math.PI * tfn)); // 21.065619 - float gold = l * p; // 2.2923636 - correctnessTestCore(sim, gold); - } /** Correctness test for the IneB2 DFR model. */ public void testIneB2() throws IOException { @@ -475,50 +459,14 @@ public class TestSimilarityBase extends LuceneTestCase { correctnessTestCore(sim, 1.6390540599822998f); } - /** Correctness test for the BEB1 DFR model. */ - public void testBEB1() throws IOException { - SimilarityBase sim = new DFRSimilarity( - new BasicModelBE(), new AfterEffectB(), new NormalizationH1()); - float tfn = FREQ * AVG_FIELD_LENGTH / DOC_LEN; // 8.75 - float b = (TOTAL_TERM_FREQ + 1 + 1) / ((DOC_FREQ + 1) * (tfn + 1)); // 0.67132866 - double f = TOTAL_TERM_FREQ + 1 + tfn; - double n = f + NUMBER_OF_DOCUMENTS; - double n1 = n + f - 1; // 258.5 - double m1 = n + f - tfn - 2; // 248.75 - double n2 = f; // 79.75 - double m2 = f - tfn; // 71.0 - float be = (float)(-SimilarityBase.log2(n - 1) - - SimilarityBase.log2(Math.E) + // -8.924494472554715 - ((m1 + 0.5f) * SimilarityBase.log2(n1 / m1) + - (n1 - m1) * SimilarityBase.log2(n1)) - // 91.9620374903885 - ((m2 + 0.5f) * SimilarityBase.log2(n2 / m2) + - (n2 - m2) * SimilarityBase.log2(n2))); // 67.26544321004599 - // 15.7720995 - float gold = b * be; // 10.588263 - correctnessTestCore(sim, gold); - } - - /** Correctness test for the D DFR model (basic model only). 
*/ - public void testD() throws IOException { - SimilarityBase sim = new DFRSimilarity(new BasicModelD(), new AfterEffect.NoAfterEffect(), new Normalization.NoNormalization()); - double totalTermFreqNorm = TOTAL_TERM_FREQ + FREQ + 1; - double p = 1.0 / (NUMBER_OF_DOCUMENTS + 1); // 0.009900990099009901 - double phi = FREQ / totalTermFreqNorm; // 0.08974358974358974 - double D = phi * SimilarityBase.log2(phi / p) + // 0.17498542370019005 - (1 - phi) * SimilarityBase.log2((1 - phi) / (1 - p)); - float gold = (float)(totalTermFreqNorm * D + 0.5 * SimilarityBase.log2( - 1 + 2 * Math.PI * FREQ * (1 - phi))); // 16.328257 - correctnessTestCore(sim, gold); - } - /** Correctness test for the In2 DFR model with no aftereffect. */ public void testIn2() throws IOException { SimilarityBase sim = new DFRSimilarity( - new BasicModelIn(), new AfterEffect.NoAfterEffect(), new NormalizationH2()); + new BasicModelIn(), new AfterEffectL(), new NormalizationH2()); float tfn = (float)(FREQ * SimilarityBase.log2( // 8.1894750101 1 + AVG_FIELD_LENGTH / DOC_LEN)); float gold = (float)(tfn * SimilarityBase.log2( // 26.7459577898 - (NUMBER_OF_DOCUMENTS + 1) / (DOC_FREQ + 0.5))); + (NUMBER_OF_DOCUMENTS + 1) / (DOC_FREQ + 0.5)) / (1 + tfn)); correctnessTestCore(sim, gold); } diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/similarities/BaseSimilarityTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/BaseSimilarityTestCase.java index a0f2ece9a43..85a3d6c2c34 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/search/similarities/BaseSimilarityTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/BaseSimilarityTestCase.java @@ -193,20 +193,46 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase { lowerBound = SmallFloat.byte4ToInt((byte) norm); } final long maxDoc; - if (random.nextBoolean()) { - // small collection - maxDoc = TestUtil.nextLong(random, 1, 100000); - } else { - // 
yuge collection - maxDoc = TestUtil.nextLong(random, 1, MAXDOC_FORTESTING); + switch (random.nextInt(6)) { + case 0: + // 1 doc collection + maxDoc = 1; + break; + case 1: + // 2 doc collection + maxDoc = 2; + break; + case 2: + // tiny collection + maxDoc = TestUtil.nextLong(random, 3, 16); + break; + case 3: + // small collection + maxDoc = TestUtil.nextLong(random, 16, 100000); + break; + case 4: + // big collection + maxDoc = TestUtil.nextLong(random, 100000, MAXDOC_FORTESTING); + break; + default: + // yuge collection + maxDoc = MAXDOC_FORTESTING; + break; } final long docCount; - if (random.nextBoolean()) { - // sparse field - docCount = TestUtil.nextLong(random, 1, maxDoc); - } else { - // fully populated - docCount = maxDoc; + switch (random.nextInt(3)) { + case 0: + // sparsest field + docCount = 1; + break; + case 1: + // sparse field + docCount = TestUtil.nextLong(random, 1, maxDoc); + break; + default: + // fully populated + docCount = maxDoc; + break; } // random docsize: but can't require docs to have > 2B tokens long upperBound; @@ -216,15 +242,22 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase { upperBound = MAXTOKENS_FORTESTING; } final long sumDocFreq; - if (random.nextBoolean()) { - // shortest possible docs - sumDocFreq = docCount; - } else { - // random docsize - sumDocFreq = TestUtil.nextLong(random, docCount, upperBound + 1 - lowerBound); + switch (random.nextInt(3)) { + case 0: + // shortest possible docs + sumDocFreq = docCount; + break; + case 1: + // biggest possible docs + sumDocFreq = upperBound + 1 - lowerBound; + break; + default: + // random docsize + sumDocFreq = TestUtil.nextLong(random, docCount, upperBound + 1 - lowerBound); + break; } final long sumTotalTermFreq; - switch (random.nextInt(3)) { + switch (random.nextInt(4)) { case 0: // term frequencies were omitted sumTotalTermFreq = sumDocFreq; @@ -233,6 +266,10 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase { // no repetition of 
terms (except to satisfy this norm) sumTotalTermFreq = sumDocFreq - 1 + lowerBound; break; + case 2: + // maximum repetition of terms + sumTotalTermFreq = upperBound; + break; default: // random repetition assert sumDocFreq - 1 + lowerBound <= upperBound; @@ -249,29 +286,46 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase { */ static TermStatistics newTerm(Random random, CollectionStatistics corpus) { final long docFreq; - if (random.nextBoolean()) { - // rare term - docFreq = 1; - } else { - // random specificity - docFreq = TestUtil.nextLong(random, 1, corpus.docCount()); + switch (random.nextInt(3)) { + case 0: + // rare term + docFreq = 1; + break; + case 1: + // common term + docFreq = corpus.docCount(); + break; + default: + // random specificity + docFreq = TestUtil.nextLong(random, 1, corpus.docCount()); + break; } final long totalTermFreq; + // can't require docs to have > 2B tokens + long upperBound; + try { + upperBound = Math.min(corpus.sumTotalTermFreq(), Math.multiplyExact(docFreq, Integer.MAX_VALUE)); + } catch (ArithmeticException overflow) { + upperBound = corpus.sumTotalTermFreq(); + } if (corpus.sumTotalTermFreq() == corpus.sumDocFreq()) { // omitTF totalTermFreq = docFreq; - } else if (random.nextBoolean()) { - // no repetition - totalTermFreq = docFreq; } else { - // random repetition: but can't require docs to have > 2B tokens - long upperBound; - try { - upperBound = Math.min(corpus.sumTotalTermFreq(), Math.multiplyExact(docFreq, Integer.MAX_VALUE)); - } catch (ArithmeticException overflow) { - upperBound = corpus.sumTotalTermFreq(); + switch (random.nextInt(3)) { + case 0: + // no repetition + totalTermFreq = docFreq; + break; + case 1: + // maximum repetition + totalTermFreq = upperBound; + break; + default: + // random repetition + totalTermFreq = TestUtil.nextLong(random, docFreq, upperBound); + break; } - totalTermFreq = TestUtil.nextLong(random, docFreq, upperBound); } return new TermStatistics(TERM, docFreq, 
totalTermFreq); } @@ -315,9 +369,34 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase { // there is at least one other document, and those must have at least 1 instance each. int upperBound = Math.toIntExact(Math.min(term.totalTermFreq() - term.docFreq() + 1, Integer.MAX_VALUE)); if (random.nextBoolean()) { - freq = TestUtil.nextInt(random, 1, upperBound); + // integer freq + switch (random.nextInt(3)) { + case 0: + // smallest freq + freq = 1; + break; + case 1: + // largest freq + freq = upperBound; + break; + default: + // random freq + freq = TestUtil.nextInt(random, 1, upperBound); + break; + } } else { - float freqCandidate = upperBound * random.nextFloat(); + // float freq + float freqCandidate; + switch (random.nextInt(2)) { + case 0: + // smallest freq + freqCandidate = Float.MIN_VALUE; + break; + default: + // random freq + freqCandidate = upperBound * random.nextFloat(); + break; + } // we need to be 2nd float value at a minimum, the pairwise test will check MIN_VALUE in this case. // this avoids testing frequencies of 0 which seem wrong to allow (we should enforce computeSlopFactor etc) if (freqCandidate <= Float.MIN_VALUE) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java index f880935339e..444e8ef0e45 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java +++ b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java @@ -58,13 +58,11 @@ public class RandomSimilarity extends PerFieldSimilarityWrapper { // all the similarities that we rotate through /** The DFR basic models to test. 
*/ static BasicModel[] BASIC_MODELS = { - /* TODO: enable new BasicModelBE(), */ /* TODO: enable new BasicModelD(), */ new BasicModelG(), - new BasicModelIF(), new BasicModelIn(), new BasicModelIne(), - /* TODO: enable new BasicModelP() */ + new BasicModelG(), new BasicModelIF(), new BasicModelIn(), new BasicModelIne(), }; /** The DFR aftereffects to test. */ static AfterEffect[] AFTER_EFFECTS = { - new AfterEffectB(), new AfterEffectL(), new AfterEffect.NoAfterEffect() + new AfterEffectB(), new AfterEffectL() }; /** The DFR normalizations to test. */ static Normalization[] NORMALIZATIONS = { diff --git a/solr/core/src/java/org/apache/solr/search/similarities/DFRSimilarityFactory.java b/solr/core/src/java/org/apache/solr/search/similarities/DFRSimilarityFactory.java index 18fde0dfc88..572b32da053 100644 --- a/solr/core/src/java/org/apache/solr/search/similarities/DFRSimilarityFactory.java +++ b/solr/core/src/java/org/apache/solr/search/similarities/DFRSimilarityFactory.java @@ -17,17 +17,13 @@ package org.apache.solr.search.similarities; import org.apache.lucene.search.similarities.AfterEffect; -import org.apache.lucene.search.similarities.AfterEffect.NoAfterEffect; // javadoc import org.apache.lucene.search.similarities.AfterEffectB; import org.apache.lucene.search.similarities.AfterEffectL; import org.apache.lucene.search.similarities.BasicModel; -import org.apache.lucene.search.similarities.BasicModelBE; -import org.apache.lucene.search.similarities.BasicModelD; import org.apache.lucene.search.similarities.BasicModelG; import org.apache.lucene.search.similarities.BasicModelIF; import org.apache.lucene.search.similarities.BasicModelIn; import org.apache.lucene.search.similarities.BasicModelIne; -import org.apache.lucene.search.similarities.BasicModelP; import org.apache.lucene.search.similarities.DFRSimilarity; import org.apache.lucene.search.similarities.Normalization; import org.apache.lucene.search.similarities.Normalization.NoNormalization; // javadoc @@ 
-48,10 +44,7 @@ import org.apache.solr.schema.SimilarityFactory; *

    *
  1. {@link BasicModel basicModel}: Basic model of information content: *
      - *
    • {@link BasicModelBE Be}: Limiting form of Bose-Einstein *
    • {@link BasicModelG G}: Geometric approximation of Bose-Einstein - *
    • {@link BasicModelP P}: Poisson approximation of the Binomial - *
    • {@link BasicModelD D}: Divergence approximation of the Binomial *
    • {@link BasicModelIn I(n)}: Inverse document frequency *
    • {@link BasicModelIne I(ne)}: Inverse expected document * frequency [mixture of Poisson and IDF] @@ -63,7 +56,6 @@ import org.apache.solr.schema.SimilarityFactory; *
        *
      • {@link AfterEffectL L}: Laplace's law of succession *
      • {@link AfterEffectB B}: Ratio of two Bernoulli processes - *
      • {@link NoAfterEffect none}: no first normalization *
      *
    • {@link Normalization normalization}: Second (length) normalization: *
        @@ -122,11 +114,7 @@ public class DFRSimilarityFactory extends SimilarityFactory { } private BasicModel parseBasicModel(String expr) { - if ("Be".equals(expr)) { - return new BasicModelBE(); - } else if ("D".equals(expr)) { - return new BasicModelD(); - } else if ("G".equals(expr)) { + if ("G".equals(expr)) { return new BasicModelG(); } else if ("I(F)".equals(expr)) { return new BasicModelIF(); @@ -134,8 +122,6 @@ public class DFRSimilarityFactory extends SimilarityFactory { return new BasicModelIn(); } else if ("I(ne)".equals(expr)) { return new BasicModelIne(); - } else if ("P".equals(expr)) { - return new BasicModelP(); } else { throw new RuntimeException("Invalid basicModel: " + expr); } @@ -146,8 +132,6 @@ public class DFRSimilarityFactory extends SimilarityFactory { return new AfterEffectB(); } else if ("L".equals(expr)) { return new AfterEffectL(); - } else if ("none".equals(expr)) { - return new AfterEffect.NoAfterEffect(); } else { throw new RuntimeException("Invalid afterEffect: " + expr); } diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-dfr.xml b/solr/core/src/test-files/solr/collection1/conf/schema-dfr.xml index 30835105570..78c3b7ff958 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema-dfr.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema-dfr.xml @@ -46,7 +46,7 @@ - P + G L H2 7 diff --git a/solr/core/src/test/org/apache/solr/search/similarities/TestDFRSimilarityFactory.java b/solr/core/src/test/org/apache/solr/search/similarities/TestDFRSimilarityFactory.java index 2159f1abf6a..f3b05b3c39d 100644 --- a/solr/core/src/test/org/apache/solr/search/similarities/TestDFRSimilarityFactory.java +++ b/solr/core/src/test/org/apache/solr/search/similarities/TestDFRSimilarityFactory.java @@ -18,8 +18,8 @@ package org.apache.solr.search.similarities; import org.apache.lucene.search.similarities.AfterEffectB; import org.apache.lucene.search.similarities.AfterEffectL; +import 
org.apache.lucene.search.similarities.BasicModelG; import org.apache.lucene.search.similarities.BasicModelIF; -import org.apache.lucene.search.similarities.BasicModelP; import org.apache.lucene.search.similarities.DFRSimilarity; import org.apache.lucene.search.similarities.NormalizationH2; import org.apache.lucene.search.similarities.NormalizationH3; @@ -62,7 +62,7 @@ public class TestDFRSimilarityFactory extends BaseSimilarityTestCase { Similarity sim = getSimilarity("text_paramc"); assertEquals(DFRSimilarity.class, sim.getClass()); DFRSimilarity dfr = (DFRSimilarity) sim; - assertEquals(BasicModelP.class, dfr.getBasicModel().getClass()); + assertEquals(BasicModelG.class, dfr.getBasicModel().getClass()); assertEquals(AfterEffectL.class, dfr.getAfterEffect().getClass()); assertEquals(NormalizationH2.class, dfr.getNormalization().getClass()); NormalizationH2 norm = (NormalizationH2) dfr.getNormalization();