LUCENE-8015: Fixed DFR similarities' scores to not decrease when tfn increases.

This commit is contained in:
Adrien Grand 2017-12-06 18:19:57 +01:00
parent 70b36666d4
commit 63b63c5734
22 changed files with 193 additions and 427 deletions

View File

@ -37,34 +37,13 @@ public abstract class AfterEffect {
*/
public AfterEffect() {}
/** Returns the aftereffect score. */
public abstract double score(BasicStats stats, double tfn);
/** Returns the product of the after effect with {@code 1+tfn}.
* The result must not depend on the value of {@code tfn}. */
public abstract double scoreTimes1pTfn(BasicStats stats);
/** Returns an explanation for the score. */
public abstract Explanation explain(BasicStats stats, double tfn);
/**
* Implementation used when there is no aftereffect: the score is the constant
* {@code 1.0}, whatever statistics or {@code tfn} are passed in.
*/
public static final class NoAfterEffect extends AfterEffect {
/** Sole constructor: parameter-free */
public NoAfterEffect() {}
/** Returns {@code 1.0}; both arguments are ignored. */
@Override
public double score(BasicStats stats, double tfn) {
return 1.0;
}
/** Returns a matching explanation with value 1 described as "no aftereffect". */
@Override
public Explanation explain(BasicStats stats, double tfn) {
return Explanation.match(1, "no aftereffect");
}
/** Returns the empty string. */
@Override
public String toString() {
return "";
}
}
/**
* Subclasses must override this method to return the code of the
* after effect formula. Refer to the original paper for the list.

View File

@ -29,16 +29,16 @@ public class AfterEffectB extends AfterEffect {
public AfterEffectB() {}
@Override
public final double score(BasicStats stats, double tfn) {
public final double scoreTimes1pTfn(BasicStats stats) {
long F = stats.getTotalTermFreq()+1;
long n = stats.getDocFreq()+1;
return (F + 1) / (n * (tfn + 1));
return (F + 1.0) / n;
}
@Override
public final Explanation explain(BasicStats stats, double tfn) {
return Explanation.match(
(float) score(stats, tfn),
(float) (scoreTimes1pTfn(stats) / (1 + tfn)),
getClass().getSimpleName() + ", computed from: ",
Explanation.match((float) tfn, "tfn"),
Explanation.match(stats.getTotalTermFreq(), "totalTermFreq"),

View File

@ -29,14 +29,14 @@ public class AfterEffectL extends AfterEffect {
public AfterEffectL() {}
@Override
public final double score(BasicStats stats, double tfn) {
return 1 / (tfn + 1);
public final double scoreTimes1pTfn(BasicStats stats) {
return 1.0;
}
@Override
public final Explanation explain(BasicStats stats, double tfn) {
return Explanation.match(
(float) score(stats, tfn),
(float) (scoreTimes1pTfn(stats) / (1 + tfn)),
getClass().getSimpleName() + ", computed from: ",
Explanation.match((float) tfn, "tfn"));
}

View File

@ -36,8 +36,10 @@ public abstract class BasicModel {
*/
public BasicModel() {}
/** Returns the informative content score. */
public abstract double score(BasicStats stats, double tfn);
/** Returns the informative content score combined with the after effect, more specifically
* {@code informationContentScore * aeTimes1pTfn / (1 + tfn)}. This function must be
* non-decreasing with {@code tfn}. */
public abstract double score(BasicStats stats, double tfn, double aeTimes1pTfn);
/**
* Returns an explanation for the score.
@ -46,9 +48,9 @@ public abstract class BasicModel {
* explanation for such models. Subclasses that use other statistics must
* override this method.</p>
*/
public Explanation explain(BasicStats stats, double tfn) {
public Explanation explain(BasicStats stats, double tfn, double aeTimes1pTfn) {
return Explanation.match(
(float) score(stats, tfn),
(float) (score(stats, tfn, aeTimes1pTfn) * (1 + tfn) / aeTimes1pTfn),
getClass().getSimpleName() + ", computed from: ",
Explanation.match(stats.getNumberOfDocuments(), "numberOfDocuments"),
Explanation.match(stats.getTotalTermFreq(), "totalTermFreq"));

View File

@ -1,55 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
/**
 * Limiting form of the Bose-Einstein model. The formula used in Lucene differs
 * slightly from the one in the original paper: {@code F} is increased by {@code tfn+1}
 * and {@code N} is increased by {@code F}.
 * <p>
 * NOTE: in some corner cases this model may give poor performance or infinite scores with
 * Normalizations that return large or small values for {@code tfn} such as NormalizationH3.
 * Consider using the geometric approximation ({@link BasicModelG}) instead, which provides
 * the same relevance but with less practical problems.
 * @lucene.experimental
 */
public class BasicModelBE extends BasicModel {

  /** Sole constructor: parameter-free */
  public BasicModelBE() {}

  /**
   * Computes the model value from the total term frequency and the number of
   * documents in {@code stats} together with the normalized term frequency.
   *
   * @param stats collection and term statistics
   * @param tfn normalized term frequency
   * @return {@code -log2((N - 1) * e) + f(N + F - 1, N + F - tfn - 2) - f(F, F - tfn)}
   *         with the adjusted {@code F} and {@code N} described on the class
   */
  @Override
  public final double score(BasicStats stats, double tfn) {
    final double adjustedTermFreq = stats.getTotalTermFreq() + 1 + tfn;
    // the approximation only holds when F << N, hence N is enlarged by F
    final double adjustedDocCount = adjustedTermFreq + stats.getNumberOfDocuments();

    final double leading = -log2((adjustedDocCount - 1) * Math.E);
    final double gained = f(
        adjustedDocCount + adjustedTermFreq - 1,
        adjustedDocCount + adjustedTermFreq - tfn - 2);
    final double removed = f(adjustedTermFreq, adjustedTermFreq - tfn);
    return leading + gained - removed;
  }

  /** The <em>f</em> helper function defined for <em>B<sub>E</sub></em>. */
  private final double f(double n, double m) {
    final double ratioTerm = (m + 0.5) * log2(n / m);
    final double differenceTerm = (n - m) * log2(n);
    return ratioTerm + differenceTerm;
  }

  /** Returns the short code of this model, {@code "Be"}. */
  @Override
  public String toString() {
    return "Be";
  }
}

View File

@ -1,56 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
/**
 * Implements the approximation of the binomial model with the divergence
 * for DFR. The formula used in Lucene differs slightly from the one in the
 * original paper: to avoid underflow for small values of {@code N} and
 * {@code F}, {@code N} is increased by {@code 1} and
 * {@code F} is always increased by {@code tfn+1}.
 * <p>
 * WARNING: for terms that do not meet the expected random distribution
 * (e.g. stopwords), this model may give poor performance, such as
 * abnormally high or NaN scores for low tf values.
 * @lucene.experimental
 */
public class BasicModelD extends BasicModel {

  /** Sole constructor: parameter-free */
  public BasicModelD() {}

  /**
   * Computes the divergence-based model value.
   *
   * @param stats collection and term statistics
   * @param tfn normalized term frequency
   * @return {@code D * F + 0.5 * log2(1 + 2 * pi * tfn * (1 - phi))} where
   *         {@code phi = tfn / F} and {@code D} is the divergence of {@code phi}
   *         from {@code p = 1 / (numberOfDocuments + 1)}
   */
  @Override
  public final double score(BasicStats stats, double tfn) {
    // phi has to stay below 1 even for tiny total term frequencies, otherwise
    // (1 - phi) can turn negative and the logarithm yields NaN. Adding tfn to
    // totalTermFreq unconditionally gives a 'normalized' F that ensures this.
    final double normalizedTotalFreq = stats.getTotalTermFreq() + 1 + tfn;
    final double inDocShare = tfn / normalizedTotalFreq;
    final double restShare = 1 - inDocShare;
    final double prior = 1.0 / (stats.getNumberOfDocuments() + 1);

    final double divergence =
        inDocShare * log2(inDocShare / prior) + restShare * log2(restShare / (1 - prior));
    final double correction = 0.5 * log2(1 + 2 * Math.PI * tfn * restShare);
    return divergence * normalizedTotalFreq + correction;
  }

  /** Returns the short code of this model, {@code "D"}. */
  @Override
  public String toString() {
    return "D";
  }
}

View File

@ -31,13 +31,21 @@ public class BasicModelG extends BasicModel {
public BasicModelG() {}
@Override
public final double score(BasicStats stats, double tfn) {
public final double score(BasicStats stats, double tfn, double aeTimes1pTfn) {
// just like in BE, approximation only holds true when F << N, so we use lambda = F / (N + F)
double F = stats.getTotalTermFreq() + 1;
double N = stats.getNumberOfDocuments();
double lambda = F / (N + F);
// -log(1 / (lambda + 1)) -> log(lambda + 1)
return log2(lambda + 1) + tfn * log2((1 + lambda) / lambda);
double A = log2(lambda + 1);
double B = log2((1 + lambda) / lambda);
// basic model G should return (A + B * tfn)
// which we rewrite to B * (1 + tfn) - (B - A)
// so that it can be combined with the after effect while still guaranteeing
// that the result is non-decreasing with tfn since B >= A
return (B - (B - A) / (1 + tfn)) * aeTimes1pTfn;
}
@Override

View File

@ -29,10 +29,17 @@ public class BasicModelIF extends BasicModel {
public BasicModelIF() {}
@Override
public final double score(BasicStats stats, double tfn) {
public final double score(BasicStats stats, double tfn, double aeTimes1pTfn) {
long N = stats.getNumberOfDocuments();
long F = stats.getTotalTermFreq();
return tfn * log2(1 + (N + 1) / (F + 0.5));
double A = log2(1 + (N + 1) / (F + 0.5));
// basic model IF should return A * tfn
// which we rewrite to A * (1 + tfn) - A
// so that it can be combined with the after effect while still guaranteeing
// that the result is non-decreasing with tfn
return A * aeTimes1pTfn * (1 - 1 / (1 + tfn));
}
@Override

View File

@ -30,16 +30,23 @@ public class BasicModelIn extends BasicModel {
public BasicModelIn() {}
@Override
public final double score(BasicStats stats, double tfn) {
public final double score(BasicStats stats, double tfn, double aeTimes1pTfn) {
long N = stats.getNumberOfDocuments();
long n = stats.getDocFreq();
return tfn * log2((N + 1) / (n + 0.5));
double A = log2((N + 1) / (n + 0.5));
// basic model I(n) should return A * tfn
// which we rewrite to A * (1 + tfn) - A
// so that it can be combined with the after effect while still guaranteeing
// that the result is non-decreasing with tfn
return A * aeTimes1pTfn * (1 - 1 / (1 + tfn));
}
@Override
public final Explanation explain(BasicStats stats, double tfn) {
public final Explanation explain(BasicStats stats, double tfn, double aeTimes1pTfn) {
return Explanation.match(
(float) score(stats, tfn),
(float) (score(stats, tfn, aeTimes1pTfn) * (1 + tfn) / aeTimes1pTfn),
getClass().getSimpleName() + ", computed from: ",
Explanation.match(stats.getNumberOfDocuments(), "numberOfDocuments"),
Explanation.match(stats.getDocFreq(), "docFreq"));

View File

@ -30,11 +30,18 @@ public class BasicModelIne extends BasicModel {
public BasicModelIne() {}
@Override
public final double score(BasicStats stats, double tfn) {
public final double score(BasicStats stats, double tfn, double aeTimes1pTfn) {
long N = stats.getNumberOfDocuments();
long F = stats.getTotalTermFreq();
double ne = N * (1 - Math.pow((N - 1) / (double)N, F));
return tfn * log2((N + 1) / (ne + 0.5));
double A = log2((N + 1) / (ne + 0.5));
// basic model I(ne) should return A * tfn
// which we rewrite to A * (1 + tfn) - A
// so that it can be combined with the after effect while still guaranteeing
// that the result is non-decreasing with tfn
return A * aeTimes1pTfn * (1 - 1 / (1 + tfn));
}
@Override

View File

@ -1,49 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
/**
 * Implements the Poisson approximation for the binomial model for DFR.
 * <p>
 * WARNING: for terms that do not meet the expected random distribution
 * (e.g. stopwords), this model may give poor performance, such as
 * abnormally high scores for low tf values.
 * @lucene.experimental
 */
public class BasicModelP extends BasicModel {
  /**
   * {@code log2(Math.E)}, precomputed. Declared {@code final} so that the
   * constant cannot be reassigned after class initialization.
   */
  protected static final double LOG2_E = log2(Math.E);

  /** Sole constructor: parameter-free */
  public BasicModelP() {}

  /**
   * Computes the Poisson-approximation model value.
   *
   * @param stats collection and term statistics
   * @param tfn normalized term frequency
   * @return {@code tfn * log2(tfn / lambda) + (lambda + 1 / (12 * tfn) - tfn) * log2(e)
   *         + 0.5 * log2(2 * pi * tfn)} with
   *         {@code lambda = (totalTermFreq + 1) / (numberOfDocuments + 1)}
   */
  @Override
  public final double score(BasicStats stats, double tfn) {
    // both counts are incremented by one before taking the ratio
    double lambda = (stats.getTotalTermFreq()+1) / (double) (stats.getNumberOfDocuments()+1);
    // NOTE(review): 1 / (12 * tfn) is infinite for tfn == 0, so callers are
    // presumably expected to pass a strictly positive tfn; confirm.
    return tfn * log2(tfn / lambda)
        + (lambda + 1 / (12 * tfn) - tfn) * LOG2_E
        + 0.5 * log2(2 * Math.PI * tfn);
  }

  /** Returns the short code of this model, {@code "P"}. */
  @Override
  public String toString() {
    return "P";
  }
}

View File

@ -20,7 +20,6 @@ package org.apache.lucene.search.similarities;
import java.util.List;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.similarities.AfterEffect.NoAfterEffect;
import org.apache.lucene.search.similarities.Normalization.NoNormalization;
/**
@ -40,10 +39,7 @@ import org.apache.lucene.search.similarities.Normalization.NoNormalization;
* <ol>
* <li>{@link BasicModel}: Basic model of information content:
* <ul>
* <li>{@link BasicModelBE}: Limiting form of Bose-Einstein
* <li>{@link BasicModelG}: Geometric approximation of Bose-Einstein
* <li>{@link BasicModelP}: Poisson approximation of the Binomial
* <li>{@link BasicModelD}: Divergence approximation of the Binomial
* <li>{@link BasicModelIn}: Inverse document frequency
* <li>{@link BasicModelIne}: Inverse expected document
* frequency [mixture of Poisson and IDF]
@ -55,7 +51,6 @@ import org.apache.lucene.search.similarities.Normalization.NoNormalization;
* <ul>
* <li>{@link AfterEffectL}: Laplace's law of succession
* <li>{@link AfterEffectB}: Ratio of two Bernoulli processes
* <li>{@link NoAfterEffect}: no first normalization
* </ul>
* <li>{@link Normalization}: Second (length) normalization:
* <ul>
@ -72,6 +67,10 @@ import org.apache.lucene.search.similarities.Normalization.NoNormalization;
* </ol>
* <p>Note that <em>qtf</em>, the multiplicity of term-occurrence in the query,
* is not handled by this implementation.</p>
* <p>Note that basic models BE (Limiting form of Bose-Einstein), P (Poisson
* approximation of the Binomial) and D (Divergence approximation of the
* Binomial) are not implemented because their formulas could not be rewritten
* in a way that makes scores non-decreasing with the normalized term frequency.</p>
* @see BasicModel
* @see AfterEffect
* @see Normalization
@ -89,8 +88,8 @@ public class DFRSimilarity extends SimilarityBase {
* Creates DFRSimilarity from the three components.
* <p>
* Note that <code>null</code> values are not allowed:
* if you want no normalization or after-effect, instead pass
* {@link NoNormalization} or {@link NoAfterEffect} respectively.
* if you want no normalization, instead pass
* {@link NoNormalization}.
* @param basicModel Basic model of information content
* @param afterEffect First normalization of information gain
* @param normalization Second (length) normalization
@ -109,8 +108,8 @@ public class DFRSimilarity extends SimilarityBase {
@Override
protected double score(BasicStats stats, double freq, double docLen) {
double tfn = normalization.tfn(stats, freq, docLen);
return stats.getBoost() *
basicModel.score(stats, tfn) * afterEffect.score(stats, tfn);
double aeTimes1pTfn = afterEffect.scoreTimes1pTfn(stats);
return stats.getBoost() * basicModel.score(stats, tfn, aeTimes1pTfn);
}
@Override
@ -121,9 +120,10 @@ public class DFRSimilarity extends SimilarityBase {
}
Explanation normExpl = normalization.explain(stats, freq, docLen);
float tfn = normExpl.getValue();
double tfn = normalization.tfn(stats, freq, docLen);
double aeTimes1pTfn = afterEffect.scoreTimes1pTfn(stats);
subs.add(normExpl);
subs.add(basicModel.explain(stats, tfn));
subs.add(basicModel.explain(stats, tfn, aeTimes1pTfn));
subs.add(afterEffect.explain(stats, tfn));
}

View File

@ -23,11 +23,8 @@ public abstract class BasicModelTestCase extends BaseSimilarityTestCase {
@Override
protected final Similarity getSimilarity(Random random) {
final AfterEffect afterEffect;
switch(random.nextInt(3)) {
switch(random.nextInt(2)) {
case 0:
afterEffect = new AfterEffect.NoAfterEffect();
break;
case 1:
afterEffect = new AfterEffectL();
break;
default:

View File

@ -1,30 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
/**
* Plugs {@link BasicModelBE} into {@link BasicModelTestCase}.
* Disabled through {@code @AwaitsFix}: the model returns negative scores at
* least, although its own documentation warns that it has problems.
*/
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
public class TestBasicModelBE extends BasicModelTestCase {
/** Supplies the basic model under test. */
@Override
protected BasicModel getBasicModel() {
return new BasicModelBE();
}
}

View File

@ -1,30 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
/**
* Plugs {@link BasicModelD} into {@link BasicModelTestCase}.
* Disabled through {@code @AwaitsFix}: scores go backwards with respect to TF,
* although the model's own documentation warns that it has problems.
*/
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
public class TestBasicModelD extends BasicModelTestCase {
/** Supplies the basic model under test. */
@Override
protected BasicModel getBasicModel() {
return new BasicModelD();
}
}

View File

@ -1,30 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
/**
* Plugs {@link BasicModelP} into {@link BasicModelTestCase}.
* Disabled through {@code @AwaitsFix}: scores go backwards with respect to TF,
* although the model's own documentation warns that it has problems.
*/
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
public class TestBasicModelP extends BasicModelTestCase {
/** Supplies the basic model under test. */
@Override
protected BasicModel getBasicModel() {
return new BasicModelP();
}
}

View File

@ -76,13 +76,12 @@ public class TestSimilarityBase extends LuceneTestCase {
private static float FLOAT_EPSILON = 1e-5f;
/** The DFR basic models to test. */
static BasicModel[] BASIC_MODELS = {
new BasicModelBE(), new BasicModelD(), new BasicModelG(),
new BasicModelIF(), new BasicModelIn(), new BasicModelIne(),
new BasicModelP()
new BasicModelG(), new BasicModelIF(), new BasicModelIn(),
new BasicModelIne()
};
/** The DFR aftereffects to test. */
static AfterEffect[] AFTER_EFFECTS = {
new AfterEffectB(), new AfterEffectL(), new AfterEffect.NoAfterEffect()
new AfterEffectB(), new AfterEffectL()
};
/** The DFR normalizations to test. */
static Normalization[] NORMALIZATIONS = {
@ -446,21 +445,6 @@ public class TestSimilarityBase extends LuceneTestCase {
correctnessTestCore(sim, 2.2387237548828125f);
}
/** Correctness test for the PL2 DFR model. */
public void testPL2() throws IOException {
SimilarityBase sim = new DFRSimilarity(
new BasicModelP(), new AfterEffectL(), new NormalizationH2());
float tfn = (float)(FREQ * SimilarityBase.log2(
1 + AVG_FIELD_LENGTH / DOC_LEN)); // 8.1894750101
float l = 1.0f / (tfn + 1.0f); // 0.108820144666
float lambda = (1.0f + TOTAL_TERM_FREQ) / (1f + NUMBER_OF_DOCUMENTS); // 0.7029703
float p = (float)(tfn * SimilarityBase.log2(tfn / lambda) +
(lambda + 1 / (12 * tfn) - tfn) * SimilarityBase.log2(Math.E) +
0.5 * SimilarityBase.log2(2 * Math.PI * tfn)); // 21.065619
float gold = l * p; // 2.2923636
correctnessTestCore(sim, gold);
}
/** Correctness test for the IneB2 DFR model. */
public void testIneB2() throws IOException {
SimilarityBase sim = new DFRSimilarity(
@ -475,50 +459,14 @@ public class TestSimilarityBase extends LuceneTestCase {
correctnessTestCore(sim, 1.6390540599822998f);
}
/** Correctness test for the BEB1 DFR model. */
public void testBEB1() throws IOException {
SimilarityBase sim = new DFRSimilarity(
new BasicModelBE(), new AfterEffectB(), new NormalizationH1());
float tfn = FREQ * AVG_FIELD_LENGTH / DOC_LEN; // 8.75
float b = (TOTAL_TERM_FREQ + 1 + 1) / ((DOC_FREQ + 1) * (tfn + 1)); // 0.67132866
double f = TOTAL_TERM_FREQ + 1 + tfn;
double n = f + NUMBER_OF_DOCUMENTS;
double n1 = n + f - 1; // 258.5
double m1 = n + f - tfn - 2; // 248.75
double n2 = f; // 79.75
double m2 = f - tfn; // 71.0
float be = (float)(-SimilarityBase.log2(n - 1) -
SimilarityBase.log2(Math.E) + // -8.924494472554715
((m1 + 0.5f) * SimilarityBase.log2(n1 / m1) +
(n1 - m1) * SimilarityBase.log2(n1)) - // 91.9620374903885
((m2 + 0.5f) * SimilarityBase.log2(n2 / m2) +
(n2 - m2) * SimilarityBase.log2(n2))); // 67.26544321004599
// 15.7720995
float gold = b * be; // 10.588263
correctnessTestCore(sim, gold);
}
/** Correctness test for the D DFR model (basic model only). */
public void testD() throws IOException {
SimilarityBase sim = new DFRSimilarity(new BasicModelD(), new AfterEffect.NoAfterEffect(), new Normalization.NoNormalization());
double totalTermFreqNorm = TOTAL_TERM_FREQ + FREQ + 1;
double p = 1.0 / (NUMBER_OF_DOCUMENTS + 1); // 0.009900990099009901
double phi = FREQ / totalTermFreqNorm; // 0.08974358974358974
double D = phi * SimilarityBase.log2(phi / p) + // 0.17498542370019005
(1 - phi) * SimilarityBase.log2((1 - phi) / (1 - p));
float gold = (float)(totalTermFreqNorm * D + 0.5 * SimilarityBase.log2(
1 + 2 * Math.PI * FREQ * (1 - phi))); // 16.328257
correctnessTestCore(sim, gold);
}
/** Correctness test for the In2 DFR model combined with the Laplace (L) aftereffect. */
public void testIn2() throws IOException {
SimilarityBase sim = new DFRSimilarity(
new BasicModelIn(), new AfterEffect.NoAfterEffect(), new NormalizationH2());
new BasicModelIn(), new AfterEffectL(), new NormalizationH2());
float tfn = (float)(FREQ * SimilarityBase.log2( // 8.1894750101
1 + AVG_FIELD_LENGTH / DOC_LEN));
float gold = (float)(tfn * SimilarityBase.log2( // 26.7459577898
(NUMBER_OF_DOCUMENTS + 1) / (DOC_FREQ + 0.5)));
(NUMBER_OF_DOCUMENTS + 1) / (DOC_FREQ + 0.5)) / (1 + tfn));
correctnessTestCore(sim, gold);
}

View File

@ -193,20 +193,46 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
lowerBound = SmallFloat.byte4ToInt((byte) norm);
}
final long maxDoc;
if (random.nextBoolean()) {
switch (random.nextInt(6)) {
case 0:
// 1 doc collection
maxDoc = 1;
break;
case 1:
// 2 doc collection
maxDoc = 2;
break;
case 2:
// tiny collection
maxDoc = TestUtil.nextLong(random, 3, 16);
break;
case 3:
// small collection
maxDoc = TestUtil.nextLong(random, 1, 100000);
} else {
maxDoc = TestUtil.nextLong(random, 16, 100000);
break;
case 4:
// big collection
maxDoc = TestUtil.nextLong(random, 100000, MAXDOC_FORTESTING);
break;
default:
// yuge collection
maxDoc = TestUtil.nextLong(random, 1, MAXDOC_FORTESTING);
maxDoc = MAXDOC_FORTESTING;
break;
}
final long docCount;
if (random.nextBoolean()) {
switch (random.nextInt(3)) {
case 0:
// sparsest field
docCount = 1;
break;
case 1:
// sparse field
docCount = TestUtil.nextLong(random, 1, maxDoc);
} else {
break;
default:
// fully populated
docCount = maxDoc;
break;
}
// random docsize: but can't require docs to have > 2B tokens
long upperBound;
@ -216,15 +242,22 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
upperBound = MAXTOKENS_FORTESTING;
}
final long sumDocFreq;
if (random.nextBoolean()) {
switch (random.nextInt(3)) {
case 0:
// shortest possible docs
sumDocFreq = docCount;
} else {
break;
case 1:
// biggest possible docs
sumDocFreq = upperBound + 1 - lowerBound;
break;
default:
// random docsize
sumDocFreq = TestUtil.nextLong(random, docCount, upperBound + 1 - lowerBound);
break;
}
final long sumTotalTermFreq;
switch (random.nextInt(3)) {
switch (random.nextInt(4)) {
case 0:
// term frequencies were omitted
sumTotalTermFreq = sumDocFreq;
@ -233,6 +266,10 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
// no repetition of terms (except to satisfy this norm)
sumTotalTermFreq = sumDocFreq - 1 + lowerBound;
break;
case 2:
// maximum repetition of terms
sumTotalTermFreq = upperBound;
break;
default:
// random repetition
assert sumDocFreq - 1 + lowerBound <= upperBound;
@ -249,29 +286,46 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
*/
static TermStatistics newTerm(Random random, CollectionStatistics corpus) {
final long docFreq;
if (random.nextBoolean()) {
switch (random.nextInt(3)) {
case 0:
// rare term
docFreq = 1;
} else {
break;
case 1:
// common term
docFreq = corpus.docCount();
break;
default:
// random specificity
docFreq = TestUtil.nextLong(random, 1, corpus.docCount());
break;
}
final long totalTermFreq;
if (corpus.sumTotalTermFreq() == corpus.sumDocFreq()) {
// omitTF
totalTermFreq = docFreq;
} else if (random.nextBoolean()) {
// no repetition
totalTermFreq = docFreq;
} else {
// random repetition: but can't require docs to have > 2B tokens
// can't require docs to have > 2B tokens
long upperBound;
try {
upperBound = Math.min(corpus.sumTotalTermFreq(), Math.multiplyExact(docFreq, Integer.MAX_VALUE));
} catch (ArithmeticException overflow) {
upperBound = corpus.sumTotalTermFreq();
}
if (corpus.sumTotalTermFreq() == corpus.sumDocFreq()) {
// omitTF
totalTermFreq = docFreq;
} else {
switch (random.nextInt(3)) {
case 0:
// no repetition
totalTermFreq = docFreq;
break;
case 1:
// maximum repetition
totalTermFreq = upperBound;
break;
default:
// random repetition
totalTermFreq = TestUtil.nextLong(random, docFreq, upperBound);
break;
}
}
return new TermStatistics(TERM, docFreq, totalTermFreq);
}
@ -315,9 +369,34 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
// there is at least one other document, and those must have at least 1 instance each.
int upperBound = Math.toIntExact(Math.min(term.totalTermFreq() - term.docFreq() + 1, Integer.MAX_VALUE));
if (random.nextBoolean()) {
// integer freq
switch (random.nextInt(3)) {
case 0:
// smallest freq
freq = 1;
break;
case 1:
// largest freq
freq = upperBound;
break;
default:
// random freq
freq = TestUtil.nextInt(random, 1, upperBound);
break;
}
} else {
float freqCandidate = upperBound * random.nextFloat();
// float freq
float freqCandidate;
switch (random.nextInt(2)) {
case 0:
// smallest freq
freqCandidate = Float.MIN_VALUE;
break;
default:
// random freq
freqCandidate = upperBound * random.nextFloat();
break;
}
// we need to be 2nd float value at a minimum, the pairwise test will check MIN_VALUE in this case.
// this avoids testing frequencies of 0 which seem wrong to allow (we should enforce computeSlopFactor etc)
if (freqCandidate <= Float.MIN_VALUE) {

View File

@ -58,13 +58,11 @@ public class RandomSimilarity extends PerFieldSimilarityWrapper {
// all the similarities that we rotate through
/** The DFR basic models to test. */
static BasicModel[] BASIC_MODELS = {
/* TODO: enable new BasicModelBE(), */ /* TODO: enable new BasicModelD(), */ new BasicModelG(),
new BasicModelIF(), new BasicModelIn(), new BasicModelIne(),
/* TODO: enable new BasicModelP() */
new BasicModelG(), new BasicModelIF(), new BasicModelIn(), new BasicModelIne(),
};
/** The DFR aftereffects to test. */
static AfterEffect[] AFTER_EFFECTS = {
new AfterEffectB(), new AfterEffectL(), new AfterEffect.NoAfterEffect()
new AfterEffectB(), new AfterEffectL()
};
/** The DFR normalizations to test. */
static Normalization[] NORMALIZATIONS = {

View File

@ -17,17 +17,13 @@
package org.apache.solr.search.similarities;
import org.apache.lucene.search.similarities.AfterEffect;
import org.apache.lucene.search.similarities.AfterEffect.NoAfterEffect; // javadoc
import org.apache.lucene.search.similarities.AfterEffectB;
import org.apache.lucene.search.similarities.AfterEffectL;
import org.apache.lucene.search.similarities.BasicModel;
import org.apache.lucene.search.similarities.BasicModelBE;
import org.apache.lucene.search.similarities.BasicModelD;
import org.apache.lucene.search.similarities.BasicModelG;
import org.apache.lucene.search.similarities.BasicModelIF;
import org.apache.lucene.search.similarities.BasicModelIn;
import org.apache.lucene.search.similarities.BasicModelIne;
import org.apache.lucene.search.similarities.BasicModelP;
import org.apache.lucene.search.similarities.DFRSimilarity;
import org.apache.lucene.search.similarities.Normalization;
import org.apache.lucene.search.similarities.Normalization.NoNormalization; // javadoc
@ -48,10 +44,7 @@ import org.apache.solr.schema.SimilarityFactory;
* <ol>
* <li>{@link BasicModel basicModel}: Basic model of information content:
* <ul>
* <li>{@link BasicModelBE Be}: Limiting form of Bose-Einstein
* <li>{@link BasicModelG G}: Geometric approximation of Bose-Einstein
* <li>{@link BasicModelP P}: Poisson approximation of the Binomial
* <li>{@link BasicModelD D}: Divergence approximation of the Binomial
* <li>{@link BasicModelIn I(n)}: Inverse document frequency
* <li>{@link BasicModelIne I(ne)}: Inverse expected document
* frequency [mixture of Poisson and IDF]
@ -63,7 +56,6 @@ import org.apache.solr.schema.SimilarityFactory;
* <ul>
* <li>{@link AfterEffectL L}: Laplace's law of succession
* <li>{@link AfterEffectB B}: Ratio of two Bernoulli processes
* <li>{@link NoAfterEffect none}: no first normalization
* </ul>
* <li>{@link Normalization normalization}: Second (length) normalization:
* <ul>
@ -122,11 +114,7 @@ public class DFRSimilarityFactory extends SimilarityFactory {
}
private BasicModel parseBasicModel(String expr) {
if ("Be".equals(expr)) {
return new BasicModelBE();
} else if ("D".equals(expr)) {
return new BasicModelD();
} else if ("G".equals(expr)) {
if ("G".equals(expr)) {
return new BasicModelG();
} else if ("I(F)".equals(expr)) {
return new BasicModelIF();
@ -134,8 +122,6 @@ public class DFRSimilarityFactory extends SimilarityFactory {
return new BasicModelIn();
} else if ("I(ne)".equals(expr)) {
return new BasicModelIne();
} else if ("P".equals(expr)) {
return new BasicModelP();
} else {
throw new RuntimeException("Invalid basicModel: " + expr);
}
@ -146,8 +132,6 @@ public class DFRSimilarityFactory extends SimilarityFactory {
return new AfterEffectB();
} else if ("L".equals(expr)) {
return new AfterEffectL();
} else if ("none".equals(expr)) {
return new AfterEffect.NoAfterEffect();
} else {
throw new RuntimeException("Invalid afterEffect: " + expr);
}

View File

@ -46,7 +46,7 @@
<fieldType name="text_paramc" class="solr.TextField">
<analyzer class="org.apache.lucene.analysis.standard.StandardAnalyzer"/>
<similarity class="solr.DFRSimilarityFactory">
<str name="basicModel">P</str>
<str name="basicModel">G</str>
<str name="afterEffect">L</str>
<str name="normalization">H2</str>
<float name="c">7</float>

View File

@ -18,8 +18,8 @@ package org.apache.solr.search.similarities;
import org.apache.lucene.search.similarities.AfterEffectB;
import org.apache.lucene.search.similarities.AfterEffectL;
import org.apache.lucene.search.similarities.BasicModelG;
import org.apache.lucene.search.similarities.BasicModelIF;
import org.apache.lucene.search.similarities.BasicModelP;
import org.apache.lucene.search.similarities.DFRSimilarity;
import org.apache.lucene.search.similarities.NormalizationH2;
import org.apache.lucene.search.similarities.NormalizationH3;
@ -62,7 +62,7 @@ public class TestDFRSimilarityFactory extends BaseSimilarityTestCase {
Similarity sim = getSimilarity("text_paramc");
assertEquals(DFRSimilarity.class, sim.getClass());
DFRSimilarity dfr = (DFRSimilarity) sim;
assertEquals(BasicModelP.class, dfr.getBasicModel().getClass());
assertEquals(BasicModelG.class, dfr.getBasicModel().getClass());
assertEquals(AfterEffectL.class, dfr.getAfterEffect().getClass());
assertEquals(NormalizationH2.class, dfr.getNormalization().getClass());
NormalizationH2 norm = (NormalizationH2) dfr.getNormalization();