LUCENE-8015: Fixed DFR similarities' scores to not decrease when tfn increases.

2017-12-06 18:19:57 +01:00 · 2017-12-06 18:19:57 +01:00 · 63b63c5734
parent 70b36666d4
commit 63b63c5734
22 changed files with 193 additions and 427 deletions
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffect.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffect.java
@ -37,34 +37,13 @@ public abstract class AfterEffect {
   */
  public AfterEffect() {}

-  /** Returns the aftereffect score. */
-  public abstract double score(BasicStats stats, double tfn);
+  /** Returns the product of the after effect with {@code 1+tfn}.
+   *  The returned value must not depend on the value of {@code tfn}. */
+  public abstract double scoreTimes1pTfn(BasicStats stats);
  
  /** Returns an explanation for the score. */
  public abstract Explanation explain(BasicStats stats, double tfn);
  
-  /** Implementation used when there is no aftereffect. */
-  public static final class NoAfterEffect extends AfterEffect {
-    
-    /** Sole constructor: parameter-free */
-    public NoAfterEffect() {}
-    
-    @Override
-    public double score(BasicStats stats, double tfn) {
-      return 1.0;
-    }
-
-    @Override
-    public Explanation explain(BasicStats stats, double tfn) {
-      return Explanation.match(1, "no aftereffect");
-    }
-    
-    @Override
-    public String toString() {
-      return "";
-    }
-  }
-  
  /**
   * Subclasses must override this method to return the code of the
   * after effect formula. Refer to the original paper for the list. 
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffectB.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffectB.java
@ -29,16 +29,16 @@ public class AfterEffectB extends AfterEffect {
  public AfterEffectB() {}

  @Override
-  public final double score(BasicStats stats, double tfn) {
+  public final double scoreTimes1pTfn(BasicStats stats) {
    long F = stats.getTotalTermFreq()+1;
    long n = stats.getDocFreq()+1;
-    return (F + 1) / (n * (tfn + 1));
+    return (F + 1.0) / n;
  }
  
  @Override
  public final Explanation explain(BasicStats stats, double tfn) {
    return Explanation.match(
-        (float) score(stats, tfn),
+        (float) (scoreTimes1pTfn(stats) / (1 + tfn)),
        getClass().getSimpleName() + ", computed from: ",
        Explanation.match((float) tfn, "tfn"),
        Explanation.match(stats.getTotalTermFreq(), "totalTermFreq"),
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffectL.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffectL.java
@ -29,14 +29,14 @@ public class AfterEffectL extends AfterEffect {
  public AfterEffectL() {}

  @Override
-  public final double score(BasicStats stats, double tfn) {
-    return 1 / (tfn + 1);
+  public final double scoreTimes1pTfn(BasicStats stats) {
+    return 1.0;
  }
  
  @Override
  public final Explanation explain(BasicStats stats, double tfn) {
    return Explanation.match(
-        (float) score(stats, tfn),
+        (float) (scoreTimes1pTfn(stats) / (1 + tfn)),
        getClass().getSimpleName() + ", computed from: ",
        Explanation.match((float) tfn, "tfn"));
  }
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModel.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModel.java
@ -36,8 +36,10 @@ public abstract class BasicModel {
   */
  public BasicModel() {}

-  /** Returns the informative content score. */
-  public abstract double score(BasicStats stats, double tfn);
+  /** Returns the informative content score combined with the after effect, more specifically
+   * {@code informationContentScore * aeTimes1pTfn / (1 + tfn)}. This function must be
+   * non-decreasing with {@code tfn}. */
+  public abstract double score(BasicStats stats, double tfn, double aeTimes1pTfn);
  
  /**
   * Returns an explanation for the score.
@ -46,9 +48,9 @@ public abstract class BasicModel {
   * explanation for such models. Subclasses that use other statistics must
   * override this method.</p>
   */
-  public Explanation explain(BasicStats stats, double tfn) {
+  public Explanation explain(BasicStats stats, double tfn, double aeTimes1pTfn) {
    return Explanation.match(
-        (float) score(stats, tfn),
+        (float) (score(stats, tfn, aeTimes1pTfn) * (1 + tfn) / aeTimes1pTfn),
        getClass().getSimpleName() + ", computed from: ",
        Explanation.match(stats.getNumberOfDocuments(), "numberOfDocuments"),
        Explanation.match(stats.getTotalTermFreq(), "totalTermFreq"));
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelBE.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelBE.java
@ -1,55 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.search.similarities;
-
-
-import static org.apache.lucene.search.similarities.SimilarityBase.log2;
-
-/**
- * Limiting form of the Bose-Einstein model. The formula used in Lucene differs
- * slightly from the one in the original paper: {@code F} is increased by {@code tfn+1}
- * and {@code N} is increased by {@code F} 
- * @lucene.experimental
- * NOTE: in some corner cases this model may give poor performance or infinite scores with 
- * Normalizations that return large or small values for {@code tfn} such as NormalizationH3. 
- * Consider using the geometric approximation ({@link BasicModelG}) instead, which provides 
- * the same relevance but with less practical problems. 
- */
-public class BasicModelBE extends BasicModel {
-  
-  /** Sole constructor: parameter-free */
-  public BasicModelBE() {}
-
-  @Override
-  public final double score(BasicStats stats, double tfn) {
-    double F = stats.getTotalTermFreq() + 1 + tfn;
-    // approximation only holds true when F << N, so we use N += F
-    double N = F + stats.getNumberOfDocuments();
-    return (-log2((N - 1) * Math.E)
-        + f(N + F - 1, N + F - tfn - 2) - f(F, F - tfn));
-  }
-  
-  /** The <em>f</em> helper function defined for <em>B<sub>E</sub></em>. */
-  private final double f(double n, double m) {
-    return (m + 0.5) * log2(n / m) + (n - m) * log2(n);
-  }
-  
-  @Override
-  public String toString() {
-    return "Be";
-  }
-}
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelD.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelD.java
@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.search.similarities;
-
-
-import static org.apache.lucene.search.similarities.SimilarityBase.log2;
-
-/**
- * Implements the approximation of the binomial model with the divergence
- * for DFR. The formula used in Lucene differs slightly from the one in the
- * original paper: to avoid underflow for small values of {@code N} and
- * {@code F}, {@code N} is increased by {@code 1} and
- * {@code F} is always increased by {@code tfn+1}.
- * <p>
- * WARNING: for terms that do not meet the expected random distribution
- * (e.g. stopwords), this model may give poor performance, such as
- * abnormally high or NaN scores for low tf values.
- * @lucene.experimental
- */
-public class BasicModelD extends BasicModel {
-  
-  /** Sole constructor: parameter-free */
-  public BasicModelD() {}
-  
-  @Override
-  public final double score(BasicStats stats, double tfn) {
-    // we have to ensure phi is always < 1 for tiny TTF values, otherwise nphi can go negative,
-    // resulting in NaN. cleanest way is to unconditionally always add tfn to totalTermFreq
-    // to create a 'normalized' F.
-    double F = stats.getTotalTermFreq() + 1 + tfn;
-    double phi = tfn / F;
-    double nphi = 1 - phi;
-    double p = 1.0 / (stats.getNumberOfDocuments() + 1);
-    double D = phi * log2(phi / p) + nphi * log2(nphi / (1 - p));
-    return D * F + 0.5 * log2(1 + 2 * Math.PI * tfn * nphi);
-  }
-  
-  @Override
-  public String toString() {
-    return "D";
-  }
-}
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelG.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelG.java
@ -31,13 +31,21 @@ public class BasicModelG extends BasicModel {
  public BasicModelG() {}

  @Override
-  public final double score(BasicStats stats, double tfn) {
+  public final double score(BasicStats stats, double tfn, double aeTimes1pTfn) {
    // just like in BE, approximation only holds true when F << N, so we use lambda = F / (N + F)
    double F = stats.getTotalTermFreq() + 1;
    double N = stats.getNumberOfDocuments();
    double lambda = F / (N + F);
    // -log(1 / (lambda + 1)) -> log(lambda + 1)
-    return log2(lambda + 1) + tfn * log2((1 + lambda) / lambda);
+    double A = log2(lambda + 1);
+    double B = log2((1 + lambda) / lambda);
+    
+    // basic model G should return (A + B * tfn)
+    // which we rewrite to B * (1 + tfn) - (B - A)
+    // so that it can be combined with the after effect while still guaranteeing
+    // that the result is non-decreasing with tfn since B >= A
+    
+    return (B - (B - A) / (1 + tfn)) * aeTimes1pTfn;
  }

  @Override
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIF.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIF.java
@ -29,10 +29,17 @@ public class BasicModelIF extends BasicModel {
  public BasicModelIF() {}

  @Override
-  public final double score(BasicStats stats, double tfn) {
+  public final double score(BasicStats stats, double tfn, double aeTimes1pTfn) {
    long N = stats.getNumberOfDocuments();
    long F = stats.getTotalTermFreq();
-    return tfn * log2(1 + (N + 1) / (F + 0.5));
+    double A = log2(1 + (N + 1) / (F + 0.5));
+    
+    // basic model IF should return A * tfn
+    // which we rewrite to A * (1 + tfn) - A
+    // so that it can be combined with the after effect while still guaranteeing
+    // that the result is non-decreasing with tfn
+    
+    return A * aeTimes1pTfn * (1 - 1 / (1 + tfn));
  }

  @Override
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIn.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIn.java
@ -30,16 +30,23 @@ public class BasicModelIn extends BasicModel {
  public BasicModelIn() {}

  @Override
-  public final double score(BasicStats stats, double tfn) {
+  public final double score(BasicStats stats, double tfn, double aeTimes1pTfn) {
    long N = stats.getNumberOfDocuments();
    long n = stats.getDocFreq();
-    return tfn * log2((N + 1) / (n + 0.5));
+    double A = log2((N + 1) / (n + 0.5));
+
+    // basic model I(n) should return A * tfn
+    // which we rewrite to A * (1 + tfn) - A
+    // so that it can be combined with the after effect while still guaranteeing
+    // that the result is non-decreasing with tfn
+
+    return A * aeTimes1pTfn * (1 - 1 / (1 + tfn));
  }
  
  @Override
-  public final Explanation explain(BasicStats stats, double tfn) {
+  public final Explanation explain(BasicStats stats, double tfn, double aeTimes1pTfn) {
    return Explanation.match(
-        (float) score(stats, tfn),
+        (float) (score(stats, tfn, aeTimes1pTfn) * (1 + tfn) / aeTimes1pTfn),
        getClass().getSimpleName() + ", computed from: ",
        Explanation.match(stats.getNumberOfDocuments(), "numberOfDocuments"),
        Explanation.match(stats.getDocFreq(), "docFreq"));
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIne.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIne.java
@ -30,11 +30,18 @@ public class BasicModelIne extends BasicModel {
  public BasicModelIne() {}

  @Override
-  public final double score(BasicStats stats, double tfn) {
+  public final double score(BasicStats stats, double tfn, double aeTimes1pTfn) {
    long N = stats.getNumberOfDocuments();
    long F = stats.getTotalTermFreq();
    double ne = N * (1 - Math.pow((N - 1) / (double)N, F));
-    return tfn * log2((N + 1) / (ne + 0.5));
+    double A = log2((N + 1) / (ne + 0.5));
+
+    // basic model I(ne) should return A * tfn
+    // which we rewrite to A * (1 + tfn) - A
+    // so that it can be combined with the after effect while still guaranteeing
+    // that the result is non-decreasing with tfn
+
+    return A * aeTimes1pTfn * (1 - 1 / (1 + tfn));
  }

  @Override
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelP.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelP.java
@ -1,49 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.search.similarities;
-
-
-import static org.apache.lucene.search.similarities.SimilarityBase.log2;
-
-/**
- * Implements the Poisson approximation for the binomial model for DFR.
- * @lucene.experimental
- * <p>
- * WARNING: for terms that do not meet the expected random distribution
- * (e.g. stopwords), this model may give poor performance, such as
- * abnormally high scores for low tf values.
- */
-public class BasicModelP extends BasicModel {
-  /** {@code log2(Math.E)}, precomputed. */
-  protected static double LOG2_E = log2(Math.E);
-  
-  /** Sole constructor: parameter-free */
-  public BasicModelP() {}
-  
-  @Override
-  public final double score(BasicStats stats, double tfn) {
-    double lambda = (stats.getTotalTermFreq()+1) / (double) (stats.getNumberOfDocuments()+1);
-    return tfn * log2(tfn / lambda)
-        + (lambda + 1 / (12 * tfn) - tfn) * LOG2_E
-        + 0.5 * log2(2 * Math.PI * tfn);
-  }
-
-  @Override
-  public String toString() {
-    return "P";
-  }
-}
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java
@ -20,7 +20,6 @@ package org.apache.lucene.search.similarities;
 import java.util.List;

 import org.apache.lucene.search.Explanation;
-import org.apache.lucene.search.similarities.AfterEffect.NoAfterEffect;
 import org.apache.lucene.search.similarities.Normalization.NoNormalization;

 /**
@ -40,10 +39,7 @@ import org.apache.lucene.search.similarities.Normalization.NoNormalization;
 * <ol>
 *    <li>{@link BasicModel}: Basic model of information content:
 *        <ul>
- *           <li>{@link BasicModelBE}: Limiting form of Bose-Einstein
 *           <li>{@link BasicModelG}: Geometric approximation of Bose-Einstein
- *           <li>{@link BasicModelP}: Poisson approximation of the Binomial
- *           <li>{@link BasicModelD}: Divergence approximation of the Binomial 
 *           <li>{@link BasicModelIn}: Inverse document frequency
 *           <li>{@link BasicModelIne}: Inverse expected document
 *               frequency [mixture of Poisson and IDF]
@ -55,7 +51,6 @@ import org.apache.lucene.search.similarities.Normalization.NoNormalization;
 *        <ul>
 *           <li>{@link AfterEffectL}: Laplace's law of succession
 *           <li>{@link AfterEffectB}: Ratio of two Bernoulli processes
- *           <li>{@link NoAfterEffect}: no first normalization
 *        </ul>
 *    <li>{@link Normalization}: Second (length) normalization:
 *        <ul>
@ -72,6 +67,10 @@ import org.apache.lucene.search.similarities.Normalization.NoNormalization;
 * </ol>
 * <p>Note that <em>qtf</em>, the multiplicity of term-occurrence in the query,
 * is not handled by this implementation.</p>
+ * <p>Note that basic models BE (Limiting form of Bose-Einstein), P (Poisson
+ * approximation of the Binomial) and D (Divergence approximation of the
+ * Binomial) are not implemented because their formulas could not be written in
+ * a way that makes scores non-decreasing with the normalized term frequency.</p>
 * @see BasicModel
 * @see AfterEffect
 * @see Normalization
@ -89,8 +88,8 @@ public class DFRSimilarity extends SimilarityBase {
   * Creates DFRSimilarity from the three components.
   * <p>
   * Note that <code>null</code> values are not allowed:
-   * if you want no normalization or after-effect, instead pass 
-   * {@link NoNormalization} or {@link NoAfterEffect} respectively.
+   * if you want no normalization, instead pass
+   * {@link NoNormalization}.
   * @param basicModel Basic model of information content
   * @param afterEffect First normalization of information gain
   * @param normalization Second (length) normalization
@ -109,8 +108,8 @@ public class DFRSimilarity extends SimilarityBase {
  @Override
  protected double score(BasicStats stats, double freq, double docLen) {
    double tfn = normalization.tfn(stats, freq, docLen);
-    return stats.getBoost() *
-        basicModel.score(stats, tfn) * afterEffect.score(stats, tfn);
+    double aeTimes1pTfn = afterEffect.scoreTimes1pTfn(stats);
+    return stats.getBoost() * basicModel.score(stats, tfn, aeTimes1pTfn);
  }
  
  @Override
@ -121,9 +120,10 @@ public class DFRSimilarity extends SimilarityBase {
    }
    
    Explanation normExpl = normalization.explain(stats, freq, docLen);
-    float tfn = normExpl.getValue();
+    double tfn = normalization.tfn(stats, freq, docLen);
+    double aeTimes1pTfn = afterEffect.scoreTimes1pTfn(stats);
    subs.add(normExpl);
-    subs.add(basicModel.explain(stats, tfn));
+    subs.add(basicModel.explain(stats, tfn, aeTimes1pTfn));
    subs.add(afterEffect.explain(stats, tfn));
  }

--- a/lucene/core/src/test/org/apache/lucene/search/similarities/BasicModelTestCase.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/BasicModelTestCase.java
@ -23,11 +23,8 @@ public abstract class BasicModelTestCase extends BaseSimilarityTestCase {
  @Override
  protected final Similarity getSimilarity(Random random) {
    final AfterEffect afterEffect;
-    switch(random.nextInt(3)) {
+    switch(random.nextInt(2)) {
      case 0: 
-        afterEffect = new AfterEffect.NoAfterEffect();
-        break;
-      case 1: 
        afterEffect = new AfterEffectL();
        break;
      default: 
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelBE.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelBE.java
@ -1,30 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.search.similarities;
-
-import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
-
-// returns negative scores at least, but it warns it has problems
-@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
-public class TestBasicModelBE extends BasicModelTestCase {
-
-  @Override
-  protected BasicModel getBasicModel() {
-    return new BasicModelBE();
-  }
-
-}
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelD.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelD.java
@ -1,30 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.search.similarities;
-
-import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
-
-// scores go backwards with respect to TF, but it warns it has problems
-@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
-public class TestBasicModelD extends BasicModelTestCase {
-
-  @Override
-  protected BasicModel getBasicModel() {
-    return new BasicModelD();
-  }
-
-}
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelP.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelP.java
@ -1,30 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.search.similarities;
-
-import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
-
-//scores go backwards with respect to TF, but it warns it has problems
-@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
-public class TestBasicModelP extends BasicModelTestCase {
-
-  @Override
-  protected BasicModel getBasicModel() {
-    return new BasicModelP();
-  }
-
-}
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
@ -76,13 +76,12 @@ public class TestSimilarityBase extends LuceneTestCase {
  private static float FLOAT_EPSILON = 1e-5f;
  /** The DFR basic models to test. */
  static BasicModel[] BASIC_MODELS = {
-    new BasicModelBE(), new BasicModelD(), new BasicModelG(),
-    new BasicModelIF(), new BasicModelIn(), new BasicModelIne(),
-    new BasicModelP()
+    new BasicModelG(), new BasicModelIF(), new BasicModelIn(),
+    new BasicModelIne()
  };
  /** The DFR aftereffects to test. */
  static AfterEffect[] AFTER_EFFECTS = {
-    new AfterEffectB(), new AfterEffectL(), new AfterEffect.NoAfterEffect()
+    new AfterEffectB(), new AfterEffectL()
  };
  /** The DFR normalizations to test. */
  static Normalization[] NORMALIZATIONS = {
@ -446,21 +445,6 @@ public class TestSimilarityBase extends LuceneTestCase {
    correctnessTestCore(sim, 2.2387237548828125f);
  }

-  /** Correctness test for the PL2 DFR model. */
-  public void testPL2() throws IOException {
-    SimilarityBase sim = new DFRSimilarity(
-        new BasicModelP(), new AfterEffectL(), new NormalizationH2());
-    float tfn = (float)(FREQ * SimilarityBase.log2(
-        1 + AVG_FIELD_LENGTH / DOC_LEN));  // 8.1894750101
-    float l = 1.0f / (tfn + 1.0f);         // 0.108820144666
-    float lambda = (1.0f + TOTAL_TERM_FREQ) / (1f + NUMBER_OF_DOCUMENTS);  // 0.7029703
-    float p = (float)(tfn * SimilarityBase.log2(tfn / lambda) +
-              (lambda + 1 / (12 * tfn) - tfn) * SimilarityBase.log2(Math.E) +
-              0.5 * SimilarityBase.log2(2 * Math.PI * tfn)); // 21.065619
-    float gold = l * p;                    // 2.2923636
-    correctnessTestCore(sim, gold);
-  }
-
  /** Correctness test for the IneB2 DFR model. */
  public void testIneB2() throws IOException {
    SimilarityBase sim = new DFRSimilarity(
@ -475,50 +459,14 @@ public class TestSimilarityBase extends LuceneTestCase {
    correctnessTestCore(sim, 1.6390540599822998f);
  }
  
-  /** Correctness test for the BEB1 DFR model. */
-  public void testBEB1() throws IOException {
-    SimilarityBase sim = new DFRSimilarity(
-        new BasicModelBE(), new AfterEffectB(), new NormalizationH1());
-    float tfn = FREQ * AVG_FIELD_LENGTH / DOC_LEN;  // 8.75
-    float b = (TOTAL_TERM_FREQ + 1 + 1) / ((DOC_FREQ + 1) * (tfn + 1));  // 0.67132866
-    double f = TOTAL_TERM_FREQ + 1 + tfn;
-    double n = f + NUMBER_OF_DOCUMENTS;
-    double n1 = n + f - 1;        // 258.5
-    double m1 = n + f - tfn - 2;  // 248.75
-    double n2 = f;                                      // 79.75
-    double m2 = f - tfn;                                // 71.0
-    float be = (float)(-SimilarityBase.log2(n - 1) -
-               SimilarityBase.log2(Math.E) +                   // -8.924494472554715
-               ((m1 + 0.5f) * SimilarityBase.log2(n1 / m1) +
-                (n1 - m1) * SimilarityBase.log2(n1)) -         // 91.9620374903885
-               ((m2 + 0.5f) * SimilarityBase.log2(n2 / m2) +
-                (n2 - m2) * SimilarityBase.log2(n2)));         // 67.26544321004599
-               // 15.7720995
-    float gold = b * be;                                       // 10.588263
-    correctnessTestCore(sim, gold);
-  }
-
-  /** Correctness test for the D DFR model (basic model only). */
-  public void testD() throws IOException {
-    SimilarityBase sim = new DFRSimilarity(new BasicModelD(), new AfterEffect.NoAfterEffect(), new Normalization.NoNormalization());
-    double totalTermFreqNorm = TOTAL_TERM_FREQ + FREQ + 1;
-    double p = 1.0 / (NUMBER_OF_DOCUMENTS + 1);                // 0.009900990099009901
-    double phi = FREQ / totalTermFreqNorm;                       // 0.08974358974358974
-    double D = phi * SimilarityBase.log2(phi / p) +            // 0.17498542370019005
-              (1 - phi) * SimilarityBase.log2((1 - phi) / (1 - p));
-    float gold = (float)(totalTermFreqNorm * D + 0.5 * SimilarityBase.log2(
-                 1 + 2 * Math.PI * FREQ * (1 - phi)));         // 16.328257
-    correctnessTestCore(sim, gold);
-  }
-  
  /** Correctness test for the In2 DFR model with the Laplace (L) aftereffect. */
  public void testIn2() throws IOException {
    SimilarityBase sim = new DFRSimilarity(
-        new BasicModelIn(), new AfterEffect.NoAfterEffect(), new NormalizationH2());
+        new BasicModelIn(), new AfterEffectL(), new NormalizationH2());
    float tfn = (float)(FREQ * SimilarityBase.log2(            // 8.1894750101
                1 + AVG_FIELD_LENGTH / DOC_LEN));
    float gold = (float)(tfn * SimilarityBase.log2(            // 26.7459577898
-                 (NUMBER_OF_DOCUMENTS + 1) / (DOC_FREQ + 0.5)));
+                 (NUMBER_OF_DOCUMENTS + 1) / (DOC_FREQ + 0.5)) / (1 + tfn));
    correctnessTestCore(sim, gold);
  }
  
--- a/lucene/test-framework/src/java/org/apache/lucene/search/similarities/BaseSimilarityTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/BaseSimilarityTestCase.java
@ -193,20 +193,46 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
      lowerBound = SmallFloat.byte4ToInt((byte) norm);
    }
    final long maxDoc;
-    if (random.nextBoolean()) {
+    switch (random.nextInt(6)) {
+      case 0:
+        // 1 doc collection
+        maxDoc = 1;
+        break;
+      case 1:
+        // 2 doc collection
+        maxDoc = 2;
+        break;
+      case 2:
+        // tiny collection
+        maxDoc = TestUtil.nextLong(random, 3, 16);
+        break;
+      case 3:
        // small collection
-      maxDoc = TestUtil.nextLong(random, 1, 100000);
-    } else {
+        maxDoc = TestUtil.nextLong(random, 16, 100000);
+        break;
+      case 4:
+        // big collection
+        maxDoc = TestUtil.nextLong(random, 100000, MAXDOC_FORTESTING);
+        break;
+      default:
        // yuge collection
-      maxDoc = TestUtil.nextLong(random, 1, MAXDOC_FORTESTING);
+        maxDoc = MAXDOC_FORTESTING;
+        break;
    }
    final long docCount;
-    if (random.nextBoolean()) {
+    switch (random.nextInt(3)) {
+      case 0:
+        // sparsest field
+        docCount = 1;
+        break;
+      case 1:
        // sparse field
        docCount = TestUtil.nextLong(random, 1, maxDoc);
-    } else {
+        break;
+      default:
        // fully populated
        docCount = maxDoc;
+        break;
    }
    // random docsize: but can't require docs to have > 2B tokens
    long upperBound;
@ -216,15 +242,22 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
      upperBound = MAXTOKENS_FORTESTING;
    }
    final long sumDocFreq;
-    if (random.nextBoolean()) {
+    switch (random.nextInt(3)) {
+      case 0:
        // shortest possible docs
        sumDocFreq = docCount;
-    } else {
+        break;
+      case 1:
+        // biggest possible docs
+        sumDocFreq = upperBound + 1 - lowerBound;
+        break;
+      default:
        // random docsize
        sumDocFreq = TestUtil.nextLong(random, docCount, upperBound + 1 - lowerBound);
+        break;
    }
    final long sumTotalTermFreq;
-    switch (random.nextInt(3)) {
+    switch (random.nextInt(4)) {
      case 0:
        // term frequencies were omitted
        sumTotalTermFreq = sumDocFreq;
@ -233,6 +266,10 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
        // no repetition of terms (except to satisfy this norm)
        sumTotalTermFreq = sumDocFreq - 1 + lowerBound;
        break;
+      case 2:
+        // maximum repetition of terms
+        sumTotalTermFreq = upperBound;
+        break;
      default:
        // random repetition
        assert sumDocFreq - 1 + lowerBound <= upperBound;
@ -249,29 +286,46 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
   */
  static TermStatistics newTerm(Random random, CollectionStatistics corpus) {
    final long docFreq;
-    if (random.nextBoolean()) {
+    switch (random.nextInt(3)) {
+      case 0:
        // rare term
        docFreq = 1;
-    } else {
+        break;
+      case 1:
+        // common term
+        docFreq = corpus.docCount();
+        break;
+      default:
        // random specificity
        docFreq = TestUtil.nextLong(random, 1, corpus.docCount());
+        break;
    }
    final long totalTermFreq;
-    if (corpus.sumTotalTermFreq() == corpus.sumDocFreq()) {
-      // omitTF
-      totalTermFreq = docFreq;
-    } else if (random.nextBoolean()) {
-      // no repetition
-      totalTermFreq = docFreq;
-    } else {
-      // random repetition: but can't require docs to have > 2B tokens
+    // upperBound caps totalTermFreq: can't require docs to have > 2B tokens (Integer.MAX_VALUE per doc)
    long upperBound;
    try {
      upperBound = Math.min(corpus.sumTotalTermFreq(), Math.multiplyExact(docFreq, Integer.MAX_VALUE));
    } catch (ArithmeticException overflow) {
      upperBound = corpus.sumTotalTermFreq();
    }
+    if (corpus.sumTotalTermFreq() == corpus.sumDocFreq()) {
+      // omitTF
+      totalTermFreq = docFreq;
+    } else {
+      switch (random.nextInt(3)) {
+        case 0:
+          // no repetition
+          totalTermFreq = docFreq;
+          break;
+        case 1:
+          // maximum repetition
+          totalTermFreq = upperBound;
+          break;
+        default:
+          // random repetition
          totalTermFreq = TestUtil.nextLong(random, docFreq, upperBound);
+          break;
+      }
    }
    return new TermStatistics(TERM, docFreq, totalTermFreq);
  }
@ -315,9 +369,34 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
              // there is at least one other document, and those must have at least 1 instance each.
              int upperBound = Math.toIntExact(Math.min(term.totalTermFreq() - term.docFreq() + 1, Integer.MAX_VALUE));
              if (random.nextBoolean()) {
+                // integer freq
+                switch (random.nextInt(3)) {
+                  case 0:
+                    // smallest freq
+                    freq = 1;
+                    break;
+                  case 1:
+                    // largest freq
+                    freq = upperBound;
+                    break;
+                  default:
+                    // random freq
                    freq = TestUtil.nextInt(random, 1, upperBound);
+                    break;
+                }
              } else {
-                float freqCandidate = upperBound * random.nextFloat();
+                // float freq
+                float freqCandidate;
+                switch (random.nextInt(2)) {
+                  case 0:
+                    // smallest freq: equals MIN_VALUE, so the <= MIN_VALUE check below always applies
+                    freqCandidate = Float.MIN_VALUE;
+                    break;
+                  default:
+                    // random freq
+                    freqCandidate = upperBound * random.nextFloat();
+                    break;
+                }
                // we need to be 2nd float value at a minimum, the pairwise test will check MIN_VALUE in this case.
                // this avoids testing frequencies of 0 which seem wrong to allow (we should enforce computeSlopFactor etc)
                if (freqCandidate <= Float.MIN_VALUE) {
--- a/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java
@ -58,13 +58,11 @@ public class RandomSimilarity extends PerFieldSimilarityWrapper {
  // all the similarities that we rotate through
  /** The DFR basic models to test. */
  static BasicModel[] BASIC_MODELS = {
-    /* TODO: enable new BasicModelBE(), */ /* TODO: enable new BasicModelD(), */ new BasicModelG(),
-    new BasicModelIF(), new BasicModelIn(), new BasicModelIne(),
-    /* TODO: enable new BasicModelP() */
+    new BasicModelG(), new BasicModelIF(), new BasicModelIn(), new BasicModelIne(),
  };
  /** The DFR aftereffects to test. */
  static AfterEffect[] AFTER_EFFECTS = {
-    new AfterEffectB(), new AfterEffectL(), new AfterEffect.NoAfterEffect()
+    new AfterEffectB(), new AfterEffectL()
  };
  /** The DFR normalizations to test. */
  static Normalization[] NORMALIZATIONS = {
--- a/solr/core/src/java/org/apache/solr/search/similarities/DFRSimilarityFactory.java
+++ b/solr/core/src/java/org/apache/solr/search/similarities/DFRSimilarityFactory.java
@ -17,17 +17,13 @@
 package org.apache.solr.search.similarities;

 import org.apache.lucene.search.similarities.AfterEffect;
-import org.apache.lucene.search.similarities.AfterEffect.NoAfterEffect; // javadoc
 import org.apache.lucene.search.similarities.AfterEffectB;
 import org.apache.lucene.search.similarities.AfterEffectL;
 import org.apache.lucene.search.similarities.BasicModel;
-import org.apache.lucene.search.similarities.BasicModelBE;
-import org.apache.lucene.search.similarities.BasicModelD;
 import org.apache.lucene.search.similarities.BasicModelG;
 import org.apache.lucene.search.similarities.BasicModelIF;
 import org.apache.lucene.search.similarities.BasicModelIn;
 import org.apache.lucene.search.similarities.BasicModelIne;
-import org.apache.lucene.search.similarities.BasicModelP;
 import org.apache.lucene.search.similarities.DFRSimilarity;
 import org.apache.lucene.search.similarities.Normalization;
 import org.apache.lucene.search.similarities.Normalization.NoNormalization; // javadoc
@ -48,10 +44,7 @@ import org.apache.solr.schema.SimilarityFactory;
 * <ol>
 *    <li>{@link BasicModel basicModel}: Basic model of information content:
 *        <ul>
- *           <li>{@link BasicModelBE Be}: Limiting form of Bose-Einstein
 *           <li>{@link BasicModelG G}: Geometric approximation of Bose-Einstein
- *           <li>{@link BasicModelP P}: Poisson approximation of the Binomial
- *           <li>{@link BasicModelD D}: Divergence approximation of the Binomial 
 *           <li>{@link BasicModelIn I(n)}: Inverse document frequency
 *           <li>{@link BasicModelIne I(ne)}: Inverse expected document
 *               frequency [mixture of Poisson and IDF]
@ -63,7 +56,6 @@ import org.apache.solr.schema.SimilarityFactory;
 *        <ul>
 *           <li>{@link AfterEffectL L}: Laplace's law of succession
 *           <li>{@link AfterEffectB B}: Ratio of two Bernoulli processes
- *           <li>{@link NoAfterEffect none}: no first normalization
 *        </ul>
 *    <li>{@link Normalization normalization}: Second (length) normalization:
 *        <ul>
@ -122,11 +114,7 @@ public class DFRSimilarityFactory extends SimilarityFactory {
  }
  
  private BasicModel parseBasicModel(String expr) {
-    if ("Be".equals(expr)) {
-      return new BasicModelBE();
-    } else if ("D".equals(expr)) {
-      return new BasicModelD();
-    } else if ("G".equals(expr)) {
+    if ("G".equals(expr)) {
      return new BasicModelG();
    } else if ("I(F)".equals(expr)) {
      return new BasicModelIF();
@ -134,8 +122,6 @@ public class DFRSimilarityFactory extends SimilarityFactory {
      return new BasicModelIn();
    } else if ("I(ne)".equals(expr)) {
      return new BasicModelIne();
-    } else if ("P".equals(expr)) {
-      return new BasicModelP();
    } else {
      throw new RuntimeException("Invalid basicModel: " + expr);
    }
@ -146,8 +132,6 @@ public class DFRSimilarityFactory extends SimilarityFactory {
      return new AfterEffectB();
    } else if ("L".equals(expr)) {
      return new AfterEffectL();
-    } else if ("none".equals(expr)) {
-      return new AfterEffect.NoAfterEffect();
    } else {
      throw new RuntimeException("Invalid afterEffect: " + expr);
    }
--- a/solr/core/src/test-files/solr/collection1/conf/schema-dfr.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-dfr.xml
@ -46,7 +46,7 @@
  <fieldType name="text_paramc" class="solr.TextField">
    <analyzer class="org.apache.lucene.analysis.standard.StandardAnalyzer"/>
    <similarity class="solr.DFRSimilarityFactory">
-      <str name="basicModel">P</str>
+      <str name="basicModel">G</str>
      <str name="afterEffect">L</str>
      <str name="normalization">H2</str>
      <float name="c">7</float>
--- a/solr/core/src/test/org/apache/solr/search/similarities/TestDFRSimilarityFactory.java
+++ b/solr/core/src/test/org/apache/solr/search/similarities/TestDFRSimilarityFactory.java
@ -18,8 +18,8 @@ package org.apache.solr.search.similarities;

 import org.apache.lucene.search.similarities.AfterEffectB;
 import org.apache.lucene.search.similarities.AfterEffectL;
+import org.apache.lucene.search.similarities.BasicModelG;
 import org.apache.lucene.search.similarities.BasicModelIF;
-import org.apache.lucene.search.similarities.BasicModelP;
 import org.apache.lucene.search.similarities.DFRSimilarity;
 import org.apache.lucene.search.similarities.NormalizationH2;
 import org.apache.lucene.search.similarities.NormalizationH3;
@ -62,7 +62,7 @@ public class TestDFRSimilarityFactory extends BaseSimilarityTestCase {
    Similarity sim = getSimilarity("text_paramc");
    assertEquals(DFRSimilarity.class, sim.getClass());
    DFRSimilarity dfr = (DFRSimilarity) sim;
-    assertEquals(BasicModelP.class, dfr.getBasicModel().getClass());
+    assertEquals(BasicModelG.class, dfr.getBasicModel().getClass());
    assertEquals(AfterEffectL.class, dfr.getAfterEffect().getClass());
    assertEquals(NormalizationH2.class, dfr.getNormalization().getClass());
    NormalizationH2 norm = (NormalizationH2) dfr.getNormalization();