From 63b63c573487fe6b054afb6073c057a88a15288f Mon Sep 17 00:00:00 2001
From: Adrien Grand
- * WARNING: for terms that do not meet the expected random distribution - * (e.g. stopwords), this model may give poor performance, such as - * abnormally high or NaN scores for low tf values. - * @lucene.experimental - */ -public class BasicModelD extends BasicModel { - - /** Sole constructor: parameter-free */ - public BasicModelD() {} - - @Override - public final double score(BasicStats stats, double tfn) { - // we have to ensure phi is always < 1 for tiny TTF values, otherwise nphi can go negative, - // resulting in NaN. cleanest way is to unconditionally always add tfn to totalTermFreq - // to create a 'normalized' F. - double F = stats.getTotalTermFreq() + 1 + tfn; - double phi = tfn / F; - double nphi = 1 - phi; - double p = 1.0 / (stats.getNumberOfDocuments() + 1); - double D = phi * log2(phi / p) + nphi * log2(nphi / (1 - p)); - return D * F + 0.5 * log2(1 + 2 * Math.PI * tfn * nphi); - } - - @Override - public String toString() { - return "D"; - } -} diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelG.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelG.java index 2f8cb4368e0..ce871967f9b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelG.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelG.java @@ -31,13 +31,21 @@ public class BasicModelG extends BasicModel { public BasicModelG() {} @Override - public final double score(BasicStats stats, double tfn) { + public final double score(BasicStats stats, double tfn, double aeTimes1pTfn) { // just like in BE, approximation only holds true when F << N, so we use lambda = F / (N + F) double F = stats.getTotalTermFreq() + 1; double N = stats.getNumberOfDocuments(); double lambda = F / (N + F); // -log(1 / (lambda + 1)) -> log(lambda + 1) - return log2(lambda + 1) + tfn * log2((1 + lambda) / lambda); + double A = log2(lambda + 1); + double B = log2((1 + lambda) / lambda); + + // basic model G should 
return (A + B * tfn) + // which we rewrite to B * (1 + tfn) - (B - A) + // so that it can be combined with the after effect while still guaranteeing + // that the result is non-decreasing with tfn since B >= A + + return (B - (B - A) / (1 + tfn)) * aeTimes1pTfn; } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIF.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIF.java index 5b7350bbd14..16781cdf496 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIF.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIF.java @@ -29,10 +29,17 @@ public class BasicModelIF extends BasicModel { public BasicModelIF() {} @Override - public final double score(BasicStats stats, double tfn) { + public final double score(BasicStats stats, double tfn, double aeTimes1pTfn) { long N = stats.getNumberOfDocuments(); long F = stats.getTotalTermFreq(); - return tfn * log2(1 + (N + 1) / (F + 0.5)); + double A = log2(1 + (N + 1) / (F + 0.5)); + + // basic model IF should return A * tfn + // which we rewrite to A * (1 + tfn) - A + // so that it can be combined with the after effect while still guaranteeing + // that the result is non-decreasing with tfn + + return A * aeTimes1pTfn * (1 - 1 / (1 + tfn)); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIn.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIn.java index a09eedb0d8f..5f1e1814c81 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIn.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIn.java @@ -30,16 +30,23 @@ public class BasicModelIn extends BasicModel { public BasicModelIn() {} @Override - public final double score(BasicStats stats, double tfn) { + public final double score(BasicStats stats, double tfn, double aeTimes1pTfn) { long N = stats.getNumberOfDocuments(); long n = 
stats.getDocFreq(); - return tfn * log2((N + 1) / (n + 0.5)); + double A = log2((N + 1) / (n + 0.5)); + + // basic model I(n) should return A * tfn + // which we rewrite to A * (1 + tfn) - A + // so that it can be combined with the after effect while still guaranteeing + // that the result is non-decreasing with tfn + + return A * aeTimes1pTfn * (1 - 1 / (1 + tfn)); } @Override - public final Explanation explain(BasicStats stats, double tfn) { + public final Explanation explain(BasicStats stats, double tfn, double aeTimes1pTfn) { return Explanation.match( - (float) score(stats, tfn), + (float) (score(stats, tfn, aeTimes1pTfn) * (1 + tfn) / aeTimes1pTfn), getClass().getSimpleName() + ", computed from: ", Explanation.match(stats.getNumberOfDocuments(), "numberOfDocuments"), Explanation.match(stats.getDocFreq(), "docFreq")); diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIne.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIne.java index b4e830d166f..fb755fa50dd 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIne.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIne.java @@ -30,11 +30,18 @@ public class BasicModelIne extends BasicModel { public BasicModelIne() {} @Override - public final double score(BasicStats stats, double tfn) { + public final double score(BasicStats stats, double tfn, double aeTimes1pTfn) { long N = stats.getNumberOfDocuments(); long F = stats.getTotalTermFreq(); double ne = N * (1 - Math.pow((N - 1) / (double)N, F)); - return tfn * log2((N + 1) / (ne + 0.5)); + double A = log2((N + 1) / (ne + 0.5)); + + // basic model I(ne) should return A * tfn + // which we rewrite to A * (1 + tfn) - A + // so that it can be combined with the after effect while still guaranteeing + // that the result is non-decreasing with tfn + + return A * aeTimes1pTfn * (1 - 1 / (1 + tfn)); } @Override diff --git 
a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelP.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelP.java deleted file mode 100644 index f66e3d004bd..00000000000 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelP.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.search.similarities; - - -import static org.apache.lucene.search.similarities.SimilarityBase.log2; - -/** - * Implements the Poisson approximation for the binomial model for DFR. - * @lucene.experimental - *
- * WARNING: for terms that do not meet the expected random distribution - * (e.g. stopwords), this model may give poor performance, such as - * abnormally high scores for low tf values. - */ -public class BasicModelP extends BasicModel { - /** {@code log2(Math.E)}, precomputed. */ - protected static double LOG2_E = log2(Math.E); - - /** Sole constructor: parameter-free */ - public BasicModelP() {} - - @Override - public final double score(BasicStats stats, double tfn) { - double lambda = (stats.getTotalTermFreq()+1) / (double) (stats.getNumberOfDocuments()+1); - return tfn * log2(tfn / lambda) - + (lambda + 1 / (12 * tfn) - tfn) * LOG2_E - + 0.5 * log2(2 * Math.PI * tfn); - } - - @Override - public String toString() { - return "P"; - } -} diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java index aacd2460d7a..d793d947f3a 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java @@ -20,7 +20,6 @@ package org.apache.lucene.search.similarities; import java.util.List; import org.apache.lucene.search.Explanation; -import org.apache.lucene.search.similarities.AfterEffect.NoAfterEffect; import org.apache.lucene.search.similarities.Normalization.NoNormalization; /** @@ -40,10 +39,7 @@ import org.apache.lucene.search.similarities.Normalization.NoNormalization; *
Note that qtf, the multiplicity of term-occurrence in the query, * is not handled by this implementation.
+ *Note that basic models BE (Limiting form of Bose-Einstein), P (Poisson + * approximation of the Binomial) and D (Divergence approximation of the + * Binomial) are not implemented because their formulas couldn't be written in + * a way that makes scores non-decreasing with the normalized term frequency. * @see BasicModel * @see AfterEffect * @see Normalization @@ -89,8 +88,8 @@ public class DFRSimilarity extends SimilarityBase { * Creates DFRSimilarity from the three components. *
* Note that null
values are not allowed:
- * if you want no normalization or after-effect, instead pass
- * {@link NoNormalization} or {@link NoAfterEffect} respectively.
+ * if you want no normalization, instead pass
+ * {@link NoNormalization}.
* @param basicModel Basic model of information content
* @param afterEffect First normalization of information gain
* @param normalization Second (length) normalization
@@ -109,8 +108,8 @@ public class DFRSimilarity extends SimilarityBase {
@Override
protected double score(BasicStats stats, double freq, double docLen) {
double tfn = normalization.tfn(stats, freq, docLen);
- return stats.getBoost() *
- basicModel.score(stats, tfn) * afterEffect.score(stats, tfn);
+ double aeTimes1pTfn = afterEffect.scoreTimes1pTfn(stats);
+ return stats.getBoost() * basicModel.score(stats, tfn, aeTimes1pTfn);
}
@Override
@@ -121,9 +120,10 @@ public class DFRSimilarity extends SimilarityBase {
}
Explanation normExpl = normalization.explain(stats, freq, docLen);
- float tfn = normExpl.getValue();
+ double tfn = normalization.tfn(stats, freq, docLen);
+ double aeTimes1pTfn = afterEffect.scoreTimes1pTfn(stats);
subs.add(normExpl);
- subs.add(basicModel.explain(stats, tfn));
+ subs.add(basicModel.explain(stats, tfn, aeTimes1pTfn));
subs.add(afterEffect.explain(stats, tfn));
}
diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/BasicModelTestCase.java b/lucene/core/src/test/org/apache/lucene/search/similarities/BasicModelTestCase.java
index 66236669704..3cb83e2da94 100644
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/BasicModelTestCase.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/BasicModelTestCase.java
@@ -23,11 +23,8 @@ public abstract class BasicModelTestCase extends BaseSimilarityTestCase {
@Override
protected final Similarity getSimilarity(Random random) {
final AfterEffect afterEffect;
- switch(random.nextInt(3)) {
+ switch(random.nextInt(2)) {
case 0:
- afterEffect = new AfterEffect.NoAfterEffect();
- break;
- case 1:
afterEffect = new AfterEffectL();
break;
default:
diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelBE.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelBE.java
deleted file mode 100644
index 2dc956f7da0..00000000000
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelBE.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.search.similarities;
-
-import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
-
-// returns negative scores at least, but it warns it has problems
-@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
-public class TestBasicModelBE extends BasicModelTestCase {
-
- @Override
- protected BasicModel getBasicModel() {
- return new BasicModelBE();
- }
-
-}
diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelD.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelD.java
deleted file mode 100644
index 7eee359b3d4..00000000000
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelD.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.search.similarities;
-
-import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
-
-// scores go backwards with respect to TF, but it warns it has problems
-@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
-public class TestBasicModelD extends BasicModelTestCase {
-
- @Override
- protected BasicModel getBasicModel() {
- return new BasicModelD();
- }
-
-}
diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelP.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelP.java
deleted file mode 100644
index 2788ff8edb8..00000000000
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelP.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.search.similarities;
-
-import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
-
-//scores go backwards with respect to TF, but it warns it has problems
-@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
-public class TestBasicModelP extends BasicModelTestCase {
-
- @Override
- protected BasicModel getBasicModel() {
- return new BasicModelP();
- }
-
-}
diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
index 8a6227c9552..be85801a471 100644
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
@@ -76,13 +76,12 @@ public class TestSimilarityBase extends LuceneTestCase {
private static float FLOAT_EPSILON = 1e-5f;
/** The DFR basic models to test. */
static BasicModel[] BASIC_MODELS = {
- new BasicModelBE(), new BasicModelD(), new BasicModelG(),
- new BasicModelIF(), new BasicModelIn(), new BasicModelIne(),
- new BasicModelP()
+ new BasicModelG(), new BasicModelIF(), new BasicModelIn(),
+ new BasicModelIne()
};
/** The DFR aftereffects to test. */
static AfterEffect[] AFTER_EFFECTS = {
- new AfterEffectB(), new AfterEffectL(), new AfterEffect.NoAfterEffect()
+ new AfterEffectB(), new AfterEffectL()
};
/** The DFR normalizations to test. */
static Normalization[] NORMALIZATIONS = {
@@ -445,21 +444,6 @@ public class TestSimilarityBase extends LuceneTestCase {
new IBSimilarity(new DistributionSPL(), new LambdaTTF(), new Normalization.NoNormalization());
correctnessTestCore(sim, 2.2387237548828125f);
}
-
- /** Correctness test for the PL2 DFR model. */
- public void testPL2() throws IOException {
- SimilarityBase sim = new DFRSimilarity(
- new BasicModelP(), new AfterEffectL(), new NormalizationH2());
- float tfn = (float)(FREQ * SimilarityBase.log2(
- 1 + AVG_FIELD_LENGTH / DOC_LEN)); // 8.1894750101
- float l = 1.0f / (tfn + 1.0f); // 0.108820144666
- float lambda = (1.0f + TOTAL_TERM_FREQ) / (1f + NUMBER_OF_DOCUMENTS); // 0.7029703
- float p = (float)(tfn * SimilarityBase.log2(tfn / lambda) +
- (lambda + 1 / (12 * tfn) - tfn) * SimilarityBase.log2(Math.E) +
- 0.5 * SimilarityBase.log2(2 * Math.PI * tfn)); // 21.065619
- float gold = l * p; // 2.2923636
- correctnessTestCore(sim, gold);
- }
/** Correctness test for the IneB2 DFR model. */
public void testIneB2() throws IOException {
@@ -475,50 +459,14 @@ public class TestSimilarityBase extends LuceneTestCase {
correctnessTestCore(sim, 1.6390540599822998f);
}
- /** Correctness test for the BEB1 DFR model. */
- public void testBEB1() throws IOException {
- SimilarityBase sim = new DFRSimilarity(
- new BasicModelBE(), new AfterEffectB(), new NormalizationH1());
- float tfn = FREQ * AVG_FIELD_LENGTH / DOC_LEN; // 8.75
- float b = (TOTAL_TERM_FREQ + 1 + 1) / ((DOC_FREQ + 1) * (tfn + 1)); // 0.67132866
- double f = TOTAL_TERM_FREQ + 1 + tfn;
- double n = f + NUMBER_OF_DOCUMENTS;
- double n1 = n + f - 1; // 258.5
- double m1 = n + f - tfn - 2; // 248.75
- double n2 = f; // 79.75
- double m2 = f - tfn; // 71.0
- float be = (float)(-SimilarityBase.log2(n - 1) -
- SimilarityBase.log2(Math.E) + // -8.924494472554715
- ((m1 + 0.5f) * SimilarityBase.log2(n1 / m1) +
- (n1 - m1) * SimilarityBase.log2(n1)) - // 91.9620374903885
- ((m2 + 0.5f) * SimilarityBase.log2(n2 / m2) +
- (n2 - m2) * SimilarityBase.log2(n2))); // 67.26544321004599
- // 15.7720995
- float gold = b * be; // 10.588263
- correctnessTestCore(sim, gold);
- }
-
- /** Correctness test for the D DFR model (basic model only). */
- public void testD() throws IOException {
- SimilarityBase sim = new DFRSimilarity(new BasicModelD(), new AfterEffect.NoAfterEffect(), new Normalization.NoNormalization());
- double totalTermFreqNorm = TOTAL_TERM_FREQ + FREQ + 1;
- double p = 1.0 / (NUMBER_OF_DOCUMENTS + 1); // 0.009900990099009901
- double phi = FREQ / totalTermFreqNorm; // 0.08974358974358974
- double D = phi * SimilarityBase.log2(phi / p) + // 0.17498542370019005
- (1 - phi) * SimilarityBase.log2((1 - phi) / (1 - p));
- float gold = (float)(totalTermFreqNorm * D + 0.5 * SimilarityBase.log2(
- 1 + 2 * Math.PI * FREQ * (1 - phi))); // 16.328257
- correctnessTestCore(sim, gold);
- }
-
/** Correctness test for the In2 DFR model with no aftereffect. */
public void testIn2() throws IOException {
SimilarityBase sim = new DFRSimilarity(
- new BasicModelIn(), new AfterEffect.NoAfterEffect(), new NormalizationH2());
+ new BasicModelIn(), new AfterEffectL(), new NormalizationH2());
float tfn = (float)(FREQ * SimilarityBase.log2( // 8.1894750101
1 + AVG_FIELD_LENGTH / DOC_LEN));
float gold = (float)(tfn * SimilarityBase.log2( // 26.7459577898
- (NUMBER_OF_DOCUMENTS + 1) / (DOC_FREQ + 0.5)));
+ (NUMBER_OF_DOCUMENTS + 1) / (DOC_FREQ + 0.5)) / (1 + tfn));
correctnessTestCore(sim, gold);
}
diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/similarities/BaseSimilarityTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/BaseSimilarityTestCase.java
index a0f2ece9a43..85a3d6c2c34 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/search/similarities/BaseSimilarityTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/BaseSimilarityTestCase.java
@@ -193,20 +193,46 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
lowerBound = SmallFloat.byte4ToInt((byte) norm);
}
final long maxDoc;
- if (random.nextBoolean()) {
- // small collection
- maxDoc = TestUtil.nextLong(random, 1, 100000);
- } else {
- // yuge collection
- maxDoc = TestUtil.nextLong(random, 1, MAXDOC_FORTESTING);
+ switch (random.nextInt(6)) {
+ case 0:
+ // 1 doc collection
+ maxDoc = 1;
+ break;
+ case 1:
+ // 2 doc collection
+ maxDoc = 2;
+ break;
+ case 2:
+ // tiny collection
+ maxDoc = TestUtil.nextLong(random, 3, 16);
+ break;
+ case 3:
+ // small collection
+ maxDoc = TestUtil.nextLong(random, 16, 100000);
+ break;
+ case 4:
+ // big collection
+ maxDoc = TestUtil.nextLong(random, 100000, MAXDOC_FORTESTING);
+ break;
+ default:
+ // largest possible collection
+ maxDoc = MAXDOC_FORTESTING;
+ break;
}
final long docCount;
- if (random.nextBoolean()) {
- // sparse field
- docCount = TestUtil.nextLong(random, 1, maxDoc);
- } else {
- // fully populated
- docCount = maxDoc;
+ switch (random.nextInt(3)) {
+ case 0:
+ // sparsest field
+ docCount = 1;
+ break;
+ case 1:
+ // sparse field
+ docCount = TestUtil.nextLong(random, 1, maxDoc);
+ break;
+ default:
+ // fully populated
+ docCount = maxDoc;
+ break;
}
// random docsize: but can't require docs to have > 2B tokens
long upperBound;
@@ -216,15 +242,22 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
upperBound = MAXTOKENS_FORTESTING;
}
final long sumDocFreq;
- if (random.nextBoolean()) {
- // shortest possible docs
- sumDocFreq = docCount;
- } else {
- // random docsize
- sumDocFreq = TestUtil.nextLong(random, docCount, upperBound + 1 - lowerBound);
+ switch (random.nextInt(3)) {
+ case 0:
+ // shortest possible docs
+ sumDocFreq = docCount;
+ break;
+ case 1:
+ // biggest possible docs
+ sumDocFreq = upperBound + 1 - lowerBound;
+ break;
+ default:
+ // random docsize
+ sumDocFreq = TestUtil.nextLong(random, docCount, upperBound + 1 - lowerBound);
+ break;
}
final long sumTotalTermFreq;
- switch (random.nextInt(3)) {
+ switch (random.nextInt(4)) {
case 0:
// term frequencies were omitted
sumTotalTermFreq = sumDocFreq;
@@ -233,6 +266,10 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
// no repetition of terms (except to satisfy this norm)
sumTotalTermFreq = sumDocFreq - 1 + lowerBound;
break;
+ case 2:
+ // maximum repetition of terms
+ sumTotalTermFreq = upperBound;
+ break;
default:
// random repetition
assert sumDocFreq - 1 + lowerBound <= upperBound;
@@ -249,29 +286,46 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
*/
static TermStatistics newTerm(Random random, CollectionStatistics corpus) {
final long docFreq;
- if (random.nextBoolean()) {
- // rare term
- docFreq = 1;
- } else {
- // random specificity
- docFreq = TestUtil.nextLong(random, 1, corpus.docCount());
+ switch (random.nextInt(3)) {
+ case 0:
+ // rare term
+ docFreq = 1;
+ break;
+ case 1:
+ // common term
+ docFreq = corpus.docCount();
+ break;
+ default:
+ // random specificity
+ docFreq = TestUtil.nextLong(random, 1, corpus.docCount());
+ break;
}
final long totalTermFreq;
+ // can't require docs to have > 2B tokens
+ long upperBound;
+ try {
+ upperBound = Math.min(corpus.sumTotalTermFreq(), Math.multiplyExact(docFreq, Integer.MAX_VALUE));
+ } catch (ArithmeticException overflow) {
+ upperBound = corpus.sumTotalTermFreq();
+ }
if (corpus.sumTotalTermFreq() == corpus.sumDocFreq()) {
// omitTF
totalTermFreq = docFreq;
- } else if (random.nextBoolean()) {
- // no repetition
- totalTermFreq = docFreq;
} else {
- // random repetition: but can't require docs to have > 2B tokens
- long upperBound;
- try {
- upperBound = Math.min(corpus.sumTotalTermFreq(), Math.multiplyExact(docFreq, Integer.MAX_VALUE));
- } catch (ArithmeticException overflow) {
- upperBound = corpus.sumTotalTermFreq();
+ switch (random.nextInt(3)) {
+ case 0:
+ // no repetition
+ totalTermFreq = docFreq;
+ break;
+ case 1:
+ // maximum repetition
+ totalTermFreq = upperBound;
+ break;
+ default:
+ // random repetition
+ totalTermFreq = TestUtil.nextLong(random, docFreq, upperBound);
+ break;
}
- totalTermFreq = TestUtil.nextLong(random, docFreq, upperBound);
}
return new TermStatistics(TERM, docFreq, totalTermFreq);
}
@@ -315,9 +369,34 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
// there is at least one other document, and those must have at least 1 instance each.
int upperBound = Math.toIntExact(Math.min(term.totalTermFreq() - term.docFreq() + 1, Integer.MAX_VALUE));
if (random.nextBoolean()) {
- freq = TestUtil.nextInt(random, 1, upperBound);
+ // integer freq
+ switch (random.nextInt(3)) {
+ case 0:
+ // smallest freq
+ freq = 1;
+ break;
+ case 1:
+ // largest freq
+ freq = upperBound;
+ break;
+ default:
+ // random freq
+ freq = TestUtil.nextInt(random, 1, upperBound);
+ break;
+ }
} else {
- float freqCandidate = upperBound * random.nextFloat();
+ // float freq
+ float freqCandidate;
+ switch (random.nextInt(2)) {
+ case 0:
+ // smallest freq
+ freqCandidate = Float.MIN_VALUE;
+ break;
+ default:
+ // random freq
+ freqCandidate = upperBound * random.nextFloat();
+ break;
+ }
// we need to be 2nd float value at a minimum, the pairwise test will check MIN_VALUE in this case.
// this avoids testing frequencies of 0 which seem wrong to allow (we should enforce computeSlopFactor etc)
if (freqCandidate <= Float.MIN_VALUE) {
diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java
index f880935339e..444e8ef0e45 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java
@@ -58,13 +58,11 @@ public class RandomSimilarity extends PerFieldSimilarityWrapper {
// all the similarities that we rotate through
/** The DFR basic models to test. */
static BasicModel[] BASIC_MODELS = {
- /* TODO: enable new BasicModelBE(), */ /* TODO: enable new BasicModelD(), */ new BasicModelG(),
- new BasicModelIF(), new BasicModelIn(), new BasicModelIne(),
- /* TODO: enable new BasicModelP() */
+ new BasicModelG(), new BasicModelIF(), new BasicModelIn(), new BasicModelIne(),
};
/** The DFR aftereffects to test. */
static AfterEffect[] AFTER_EFFECTS = {
- new AfterEffectB(), new AfterEffectL(), new AfterEffect.NoAfterEffect()
+ new AfterEffectB(), new AfterEffectL()
};
/** The DFR normalizations to test. */
static Normalization[] NORMALIZATIONS = {
diff --git a/solr/core/src/java/org/apache/solr/search/similarities/DFRSimilarityFactory.java b/solr/core/src/java/org/apache/solr/search/similarities/DFRSimilarityFactory.java
index 18fde0dfc88..572b32da053 100644
--- a/solr/core/src/java/org/apache/solr/search/similarities/DFRSimilarityFactory.java
+++ b/solr/core/src/java/org/apache/solr/search/similarities/DFRSimilarityFactory.java
@@ -17,17 +17,13 @@
package org.apache.solr.search.similarities;
import org.apache.lucene.search.similarities.AfterEffect;
-import org.apache.lucene.search.similarities.AfterEffect.NoAfterEffect; // javadoc
import org.apache.lucene.search.similarities.AfterEffectB;
import org.apache.lucene.search.similarities.AfterEffectL;
import org.apache.lucene.search.similarities.BasicModel;
-import org.apache.lucene.search.similarities.BasicModelBE;
-import org.apache.lucene.search.similarities.BasicModelD;
import org.apache.lucene.search.similarities.BasicModelG;
import org.apache.lucene.search.similarities.BasicModelIF;
import org.apache.lucene.search.similarities.BasicModelIn;
import org.apache.lucene.search.similarities.BasicModelIne;
-import org.apache.lucene.search.similarities.BasicModelP;
import org.apache.lucene.search.similarities.DFRSimilarity;
import org.apache.lucene.search.similarities.Normalization;
import org.apache.lucene.search.similarities.Normalization.NoNormalization; // javadoc
@@ -48,10 +44,7 @@ import org.apache.solr.schema.SimilarityFactory;
*