mirror of https://github.com/apache/lucene.git
LUCENE-8015: Fixed DFR similarities' scores to not decrease when tfn increases.
This commit is contained in:
parent
70b36666d4
commit
63b63c5734
|
@ -37,34 +37,13 @@ public abstract class AfterEffect {
|
|||
*/
|
||||
public AfterEffect() {}
|
||||
|
||||
/** Returns the aftereffect score. */
|
||||
public abstract double score(BasicStats stats, double tfn);
|
||||
/** Returns the product of the after effect with {@code 1+tfn}.
|
||||
* The returned value must not depend on {@code tfn}. */
|
||||
public abstract double scoreTimes1pTfn(BasicStats stats);
|
||||
|
||||
/** Returns an explanation for the score. */
|
||||
public abstract Explanation explain(BasicStats stats, double tfn);
|
||||
|
||||
/** Implementation used when there is no aftereffect. */
|
||||
public static final class NoAfterEffect extends AfterEffect {
|
||||
|
||||
/** Sole constructor: parameter-free */
|
||||
public NoAfterEffect() {}
|
||||
|
||||
@Override
|
||||
public double score(BasicStats stats, double tfn) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(BasicStats stats, double tfn) {
|
||||
return Explanation.match(1, "no aftereffect");
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Subclasses must override this method to return the code of the
|
||||
* after effect formula. Refer to the original paper for the list.
|
||||
|
|
|
@ -29,16 +29,16 @@ public class AfterEffectB extends AfterEffect {
|
|||
public AfterEffectB() {}
|
||||
|
||||
@Override
|
||||
public final double score(BasicStats stats, double tfn) {
|
||||
public final double scoreTimes1pTfn(BasicStats stats) {
|
||||
long F = stats.getTotalTermFreq()+1;
|
||||
long n = stats.getDocFreq()+1;
|
||||
return (F + 1) / (n * (tfn + 1));
|
||||
return (F + 1.0) / n;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final Explanation explain(BasicStats stats, double tfn) {
|
||||
return Explanation.match(
|
||||
(float) score(stats, tfn),
|
||||
(float) (scoreTimes1pTfn(stats) / (1 + tfn)),
|
||||
getClass().getSimpleName() + ", computed from: ",
|
||||
Explanation.match((float) tfn, "tfn"),
|
||||
Explanation.match(stats.getTotalTermFreq(), "totalTermFreq"),
|
||||
|
|
|
@ -29,14 +29,14 @@ public class AfterEffectL extends AfterEffect {
|
|||
public AfterEffectL() {}
|
||||
|
||||
@Override
|
||||
public final double score(BasicStats stats, double tfn) {
|
||||
return 1 / (tfn + 1);
|
||||
public final double scoreTimes1pTfn(BasicStats stats) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final Explanation explain(BasicStats stats, double tfn) {
|
||||
return Explanation.match(
|
||||
(float) score(stats, tfn),
|
||||
(float) (scoreTimes1pTfn(stats) / (1 + tfn)),
|
||||
getClass().getSimpleName() + ", computed from: ",
|
||||
Explanation.match((float) tfn, "tfn"));
|
||||
}
|
||||
|
|
|
@ -36,8 +36,10 @@ public abstract class BasicModel {
|
|||
*/
|
||||
public BasicModel() {}
|
||||
|
||||
/** Returns the informative content score. */
|
||||
public abstract double score(BasicStats stats, double tfn);
|
||||
/** Returns the informative content score combined with the after effect, more specifically
|
||||
* {@code informationContentScore * aeTimes1pTfn / (1 + tfn)}. This function must be
|
||||
* non-decreasing with {@code tfn}. */
|
||||
public abstract double score(BasicStats stats, double tfn, double aeTimes1pTfn);
|
||||
|
||||
/**
|
||||
* Returns an explanation for the score.
|
||||
|
@ -46,9 +48,9 @@ public abstract class BasicModel {
|
|||
* explanation for such models. Subclasses that use other statistics must
|
||||
* override this method.</p>
|
||||
*/
|
||||
public Explanation explain(BasicStats stats, double tfn) {
|
||||
public Explanation explain(BasicStats stats, double tfn, double aeTimes1pTfn) {
|
||||
return Explanation.match(
|
||||
(float) score(stats, tfn),
|
||||
(float) (score(stats, tfn, aeTimes1pTfn) * (1 + tfn) / aeTimes1pTfn),
|
||||
getClass().getSimpleName() + ", computed from: ",
|
||||
Explanation.match(stats.getNumberOfDocuments(), "numberOfDocuments"),
|
||||
Explanation.match(stats.getTotalTermFreq(), "totalTermFreq"));
|
||||
|
|
|
@ -1,55 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
|
||||
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
|
||||
|
||||
/**
|
||||
* Limiting form of the Bose-Einstein model. The formula used in Lucene differs
|
||||
* slightly from the one in the original paper: {@code F} is increased by {@code tfn+1}
|
||||
* and {@code N} is increased by {@code F}
|
||||
* @lucene.experimental
|
||||
* NOTE: in some corner cases this model may give poor performance or infinite scores with
|
||||
* Normalizations that return large or small values for {@code tfn} such as NormalizationH3.
|
||||
* Consider using the geometric approximation ({@link BasicModelG}) instead, which provides
|
||||
* the same relevance but with less practical problems.
|
||||
*/
|
||||
public class BasicModelBE extends BasicModel {
|
||||
|
||||
/** Sole constructor: parameter-free */
|
||||
public BasicModelBE() {}
|
||||
|
||||
@Override
|
||||
public final double score(BasicStats stats, double tfn) {
|
||||
double F = stats.getTotalTermFreq() + 1 + tfn;
|
||||
// approximation only holds true when F << N, so we use N += F
|
||||
double N = F + stats.getNumberOfDocuments();
|
||||
return (-log2((N - 1) * Math.E)
|
||||
+ f(N + F - 1, N + F - tfn - 2) - f(F, F - tfn));
|
||||
}
|
||||
|
||||
/** The <em>f</em> helper function defined for <em>B<sub>E</sub></em>. */
|
||||
private final double f(double n, double m) {
|
||||
return (m + 0.5) * log2(n / m) + (n - m) * log2(n);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Be";
|
||||
}
|
||||
}
|
|
@ -1,56 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
|
||||
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
|
||||
|
||||
/**
|
||||
* Implements the approximation of the binomial model with the divergence
|
||||
* for DFR. The formula used in Lucene differs slightly from the one in the
|
||||
* original paper: to avoid underflow for small values of {@code N} and
|
||||
* {@code F}, {@code N} is increased by {@code 1} and
|
||||
* {@code F} is always increased by {@code tfn+1}.
|
||||
* <p>
|
||||
* WARNING: for terms that do not meet the expected random distribution
|
||||
* (e.g. stopwords), this model may give poor performance, such as
|
||||
* abnormally high or NaN scores for low tf values.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class BasicModelD extends BasicModel {
|
||||
|
||||
/** Sole constructor: parameter-free */
|
||||
public BasicModelD() {}
|
||||
|
||||
@Override
|
||||
public final double score(BasicStats stats, double tfn) {
|
||||
// we have to ensure phi is always < 1 for tiny TTF values, otherwise nphi can go negative,
|
||||
// resulting in NaN. cleanest way is to unconditionally always add tfn to totalTermFreq
|
||||
// to create a 'normalized' F.
|
||||
double F = stats.getTotalTermFreq() + 1 + tfn;
|
||||
double phi = tfn / F;
|
||||
double nphi = 1 - phi;
|
||||
double p = 1.0 / (stats.getNumberOfDocuments() + 1);
|
||||
double D = phi * log2(phi / p) + nphi * log2(nphi / (1 - p));
|
||||
return D * F + 0.5 * log2(1 + 2 * Math.PI * tfn * nphi);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "D";
|
||||
}
|
||||
}
|
|
@ -31,13 +31,21 @@ public class BasicModelG extends BasicModel {
|
|||
public BasicModelG() {}
|
||||
|
||||
@Override
|
||||
public final double score(BasicStats stats, double tfn) {
|
||||
public final double score(BasicStats stats, double tfn, double aeTimes1pTfn) {
|
||||
// just like in BE, approximation only holds true when F << N, so we use lambda = F / (N + F)
|
||||
double F = stats.getTotalTermFreq() + 1;
|
||||
double N = stats.getNumberOfDocuments();
|
||||
double lambda = F / (N + F);
|
||||
// -log(1 / (lambda + 1)) -> log(lambda + 1)
|
||||
return log2(lambda + 1) + tfn * log2((1 + lambda) / lambda);
|
||||
double A = log2(lambda + 1);
|
||||
double B = log2((1 + lambda) / lambda);
|
||||
|
||||
// basic model G should return (A + B * tfn)
|
||||
// which we rewrite to B * (1 + tfn) - (B - A)
|
||||
// so that it can be combined with the after effect while still guaranteeing
|
||||
// that the result is non-decreasing with tfn since B >= A
|
||||
|
||||
return (B - (B - A) / (1 + tfn)) * aeTimes1pTfn;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -29,10 +29,17 @@ public class BasicModelIF extends BasicModel {
|
|||
public BasicModelIF() {}
|
||||
|
||||
@Override
|
||||
public final double score(BasicStats stats, double tfn) {
|
||||
public final double score(BasicStats stats, double tfn, double aeTimes1pTfn) {
|
||||
long N = stats.getNumberOfDocuments();
|
||||
long F = stats.getTotalTermFreq();
|
||||
return tfn * log2(1 + (N + 1) / (F + 0.5));
|
||||
double A = log2(1 + (N + 1) / (F + 0.5));
|
||||
|
||||
// basic model IF should return A * tfn
|
||||
// which we rewrite to A * (1 + tfn) - A
|
||||
// so that it can be combined with the after effect while still guaranteeing
|
||||
// that the result is non-decreasing with tfn
|
||||
|
||||
return A * aeTimes1pTfn * (1 - 1 / (1 + tfn));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -30,16 +30,23 @@ public class BasicModelIn extends BasicModel {
|
|||
public BasicModelIn() {}
|
||||
|
||||
@Override
|
||||
public final double score(BasicStats stats, double tfn) {
|
||||
public final double score(BasicStats stats, double tfn, double aeTimes1pTfn) {
|
||||
long N = stats.getNumberOfDocuments();
|
||||
long n = stats.getDocFreq();
|
||||
return tfn * log2((N + 1) / (n + 0.5));
|
||||
double A = log2((N + 1) / (n + 0.5));
|
||||
|
||||
// basic model I(n) should return A * tfn
|
||||
// which we rewrite to A * (1 + tfn) - A
|
||||
// so that it can be combined with the after effect while still guaranteeing
|
||||
// that the result is non-decreasing with tfn
|
||||
|
||||
return A * aeTimes1pTfn * (1 - 1 / (1 + tfn));
|
||||
}
|
||||
|
||||
@Override
|
||||
public final Explanation explain(BasicStats stats, double tfn) {
|
||||
public final Explanation explain(BasicStats stats, double tfn, double aeTimes1pTfn) {
|
||||
return Explanation.match(
|
||||
(float) score(stats, tfn),
|
||||
(float) (score(stats, tfn, aeTimes1pTfn) * (1 + tfn) / aeTimes1pTfn),
|
||||
getClass().getSimpleName() + ", computed from: ",
|
||||
Explanation.match(stats.getNumberOfDocuments(), "numberOfDocuments"),
|
||||
Explanation.match(stats.getDocFreq(), "docFreq"));
|
||||
|
|
|
@ -30,11 +30,18 @@ public class BasicModelIne extends BasicModel {
|
|||
public BasicModelIne() {}
|
||||
|
||||
@Override
|
||||
public final double score(BasicStats stats, double tfn) {
|
||||
public final double score(BasicStats stats, double tfn, double aeTimes1pTfn) {
|
||||
long N = stats.getNumberOfDocuments();
|
||||
long F = stats.getTotalTermFreq();
|
||||
double ne = N * (1 - Math.pow((N - 1) / (double)N, F));
|
||||
return tfn * log2((N + 1) / (ne + 0.5));
|
||||
double A = log2((N + 1) / (ne + 0.5));
|
||||
|
||||
// basic model I(ne) should return A * tfn
|
||||
// which we rewrite to A * (1 + tfn) - A
|
||||
// so that it can be combined with the after effect while still guaranteeing
|
||||
// that the result is non-decreasing with tfn
|
||||
|
||||
return A * aeTimes1pTfn * (1 - 1 / (1 + tfn));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -1,49 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
|
||||
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
|
||||
|
||||
/**
|
||||
* Implements the Poisson approximation for the binomial model for DFR.
|
||||
* @lucene.experimental
|
||||
* <p>
|
||||
* WARNING: for terms that do not meet the expected random distribution
|
||||
* (e.g. stopwords), this model may give poor performance, such as
|
||||
* abnormally high scores for low tf values.
|
||||
*/
|
||||
public class BasicModelP extends BasicModel {
|
||||
/** {@code log2(Math.E)}, precomputed. */
|
||||
protected static double LOG2_E = log2(Math.E);
|
||||
|
||||
/** Sole constructor: parameter-free */
|
||||
public BasicModelP() {}
|
||||
|
||||
@Override
|
||||
public final double score(BasicStats stats, double tfn) {
|
||||
double lambda = (stats.getTotalTermFreq()+1) / (double) (stats.getNumberOfDocuments()+1);
|
||||
return tfn * log2(tfn / lambda)
|
||||
+ (lambda + 1 / (12 * tfn) - tfn) * LOG2_E
|
||||
+ 0.5 * log2(2 * Math.PI * tfn);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "P";
|
||||
}
|
||||
}
|
|
@ -20,7 +20,6 @@ package org.apache.lucene.search.similarities;
|
|||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.similarities.AfterEffect.NoAfterEffect;
|
||||
import org.apache.lucene.search.similarities.Normalization.NoNormalization;
|
||||
|
||||
/**
|
||||
|
@ -40,10 +39,7 @@ import org.apache.lucene.search.similarities.Normalization.NoNormalization;
|
|||
* <ol>
|
||||
* <li>{@link BasicModel}: Basic model of information content:
|
||||
* <ul>
|
||||
* <li>{@link BasicModelBE}: Limiting form of Bose-Einstein
|
||||
* <li>{@link BasicModelG}: Geometric approximation of Bose-Einstein
|
||||
* <li>{@link BasicModelP}: Poisson approximation of the Binomial
|
||||
* <li>{@link BasicModelD}: Divergence approximation of the Binomial
|
||||
* <li>{@link BasicModelIn}: Inverse document frequency
|
||||
* <li>{@link BasicModelIne}: Inverse expected document
|
||||
* frequency [mixture of Poisson and IDF]
|
||||
|
@ -55,7 +51,6 @@ import org.apache.lucene.search.similarities.Normalization.NoNormalization;
|
|||
* <ul>
|
||||
* <li>{@link AfterEffectL}: Laplace's law of succession
|
||||
* <li>{@link AfterEffectB}: Ratio of two Bernoulli processes
|
||||
* <li>{@link NoAfterEffect}: no first normalization
|
||||
* </ul>
|
||||
* <li>{@link Normalization}: Second (length) normalization:
|
||||
* <ul>
|
||||
|
@ -72,6 +67,10 @@ import org.apache.lucene.search.similarities.Normalization.NoNormalization;
|
|||
* </ol>
|
||||
* <p>Note that <em>qtf</em>, the multiplicity of term-occurrence in the query,
|
||||
* is not handled by this implementation.</p>
|
||||
* <p>Note that basic models BE (Limiting form of Bose-Einstein), P (Poisson
|
||||
* approximation of the Binomial) and D (Divergence approximation of the
|
||||
* Binomial) are not implemented because their formulas could not be written in
|
||||
* a way that makes scores non-decreasing with the normalized term frequency.</p>
|
||||
* @see BasicModel
|
||||
* @see AfterEffect
|
||||
* @see Normalization
|
||||
|
@ -89,8 +88,8 @@ public class DFRSimilarity extends SimilarityBase {
|
|||
* Creates DFRSimilarity from the three components.
|
||||
* <p>
|
||||
* Note that <code>null</code> values are not allowed:
|
||||
* if you want no normalization or after-effect, instead pass
|
||||
* {@link NoNormalization} or {@link NoAfterEffect} respectively.
|
||||
* if you want no normalization, instead pass
|
||||
* {@link NoNormalization}.
|
||||
* @param basicModel Basic model of information content
|
||||
* @param afterEffect First normalization of information gain
|
||||
* @param normalization Second (length) normalization
|
||||
|
@ -109,8 +108,8 @@ public class DFRSimilarity extends SimilarityBase {
|
|||
@Override
|
||||
protected double score(BasicStats stats, double freq, double docLen) {
|
||||
double tfn = normalization.tfn(stats, freq, docLen);
|
||||
return stats.getBoost() *
|
||||
basicModel.score(stats, tfn) * afterEffect.score(stats, tfn);
|
||||
double aeTimes1pTfn = afterEffect.scoreTimes1pTfn(stats);
|
||||
return stats.getBoost() * basicModel.score(stats, tfn, aeTimes1pTfn);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -121,9 +120,10 @@ public class DFRSimilarity extends SimilarityBase {
|
|||
}
|
||||
|
||||
Explanation normExpl = normalization.explain(stats, freq, docLen);
|
||||
float tfn = normExpl.getValue();
|
||||
double tfn = normalization.tfn(stats, freq, docLen);
|
||||
double aeTimes1pTfn = afterEffect.scoreTimes1pTfn(stats);
|
||||
subs.add(normExpl);
|
||||
subs.add(basicModel.explain(stats, tfn));
|
||||
subs.add(basicModel.explain(stats, tfn, aeTimes1pTfn));
|
||||
subs.add(afterEffect.explain(stats, tfn));
|
||||
}
|
||||
|
||||
|
|
|
@ -23,11 +23,8 @@ public abstract class BasicModelTestCase extends BaseSimilarityTestCase {
|
|||
@Override
|
||||
protected final Similarity getSimilarity(Random random) {
|
||||
final AfterEffect afterEffect;
|
||||
switch(random.nextInt(3)) {
|
||||
switch(random.nextInt(2)) {
|
||||
case 0:
|
||||
afterEffect = new AfterEffect.NoAfterEffect();
|
||||
break;
|
||||
case 1:
|
||||
afterEffect = new AfterEffectL();
|
||||
break;
|
||||
default:
|
||||
|
|
|
@ -1,30 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
|
||||
|
||||
// returns negative scores at least, but it warns it has problems
|
||||
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
|
||||
public class TestBasicModelBE extends BasicModelTestCase {
|
||||
|
||||
@Override
|
||||
protected BasicModel getBasicModel() {
|
||||
return new BasicModelBE();
|
||||
}
|
||||
|
||||
}
|
|
@ -1,30 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
|
||||
|
||||
// scores go backwards with respect to TF, but it warns it has problems
|
||||
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
|
||||
public class TestBasicModelD extends BasicModelTestCase {
|
||||
|
||||
@Override
|
||||
protected BasicModel getBasicModel() {
|
||||
return new BasicModelD();
|
||||
}
|
||||
|
||||
}
|
|
@ -1,30 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
|
||||
|
||||
//scores go backwards with respect to TF, but it warns it has problems
|
||||
@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
|
||||
public class TestBasicModelP extends BasicModelTestCase {
|
||||
|
||||
@Override
|
||||
protected BasicModel getBasicModel() {
|
||||
return new BasicModelP();
|
||||
}
|
||||
|
||||
}
|
|
@ -76,13 +76,12 @@ public class TestSimilarityBase extends LuceneTestCase {
|
|||
private static float FLOAT_EPSILON = 1e-5f;
|
||||
/** The DFR basic models to test. */
|
||||
static BasicModel[] BASIC_MODELS = {
|
||||
new BasicModelBE(), new BasicModelD(), new BasicModelG(),
|
||||
new BasicModelIF(), new BasicModelIn(), new BasicModelIne(),
|
||||
new BasicModelP()
|
||||
new BasicModelG(), new BasicModelIF(), new BasicModelIn(),
|
||||
new BasicModelIne()
|
||||
};
|
||||
/** The DFR aftereffects to test. */
|
||||
static AfterEffect[] AFTER_EFFECTS = {
|
||||
new AfterEffectB(), new AfterEffectL(), new AfterEffect.NoAfterEffect()
|
||||
new AfterEffectB(), new AfterEffectL()
|
||||
};
|
||||
/** The DFR normalizations to test. */
|
||||
static Normalization[] NORMALIZATIONS = {
|
||||
|
@ -446,21 +445,6 @@ public class TestSimilarityBase extends LuceneTestCase {
|
|||
correctnessTestCore(sim, 2.2387237548828125f);
|
||||
}
|
||||
|
||||
/** Correctness test for the PL2 DFR model. */
|
||||
public void testPL2() throws IOException {
|
||||
SimilarityBase sim = new DFRSimilarity(
|
||||
new BasicModelP(), new AfterEffectL(), new NormalizationH2());
|
||||
float tfn = (float)(FREQ * SimilarityBase.log2(
|
||||
1 + AVG_FIELD_LENGTH / DOC_LEN)); // 8.1894750101
|
||||
float l = 1.0f / (tfn + 1.0f); // 0.108820144666
|
||||
float lambda = (1.0f + TOTAL_TERM_FREQ) / (1f + NUMBER_OF_DOCUMENTS); // 0.7029703
|
||||
float p = (float)(tfn * SimilarityBase.log2(tfn / lambda) +
|
||||
(lambda + 1 / (12 * tfn) - tfn) * SimilarityBase.log2(Math.E) +
|
||||
0.5 * SimilarityBase.log2(2 * Math.PI * tfn)); // 21.065619
|
||||
float gold = l * p; // 2.2923636
|
||||
correctnessTestCore(sim, gold);
|
||||
}
|
||||
|
||||
/** Correctness test for the IneB2 DFR model. */
|
||||
public void testIneB2() throws IOException {
|
||||
SimilarityBase sim = new DFRSimilarity(
|
||||
|
@ -475,50 +459,14 @@ public class TestSimilarityBase extends LuceneTestCase {
|
|||
correctnessTestCore(sim, 1.6390540599822998f);
|
||||
}
|
||||
|
||||
/** Correctness test for the BEB1 DFR model. */
|
||||
public void testBEB1() throws IOException {
|
||||
SimilarityBase sim = new DFRSimilarity(
|
||||
new BasicModelBE(), new AfterEffectB(), new NormalizationH1());
|
||||
float tfn = FREQ * AVG_FIELD_LENGTH / DOC_LEN; // 8.75
|
||||
float b = (TOTAL_TERM_FREQ + 1 + 1) / ((DOC_FREQ + 1) * (tfn + 1)); // 0.67132866
|
||||
double f = TOTAL_TERM_FREQ + 1 + tfn;
|
||||
double n = f + NUMBER_OF_DOCUMENTS;
|
||||
double n1 = n + f - 1; // 258.5
|
||||
double m1 = n + f - tfn - 2; // 248.75
|
||||
double n2 = f; // 79.75
|
||||
double m2 = f - tfn; // 71.0
|
||||
float be = (float)(-SimilarityBase.log2(n - 1) -
|
||||
SimilarityBase.log2(Math.E) + // -8.924494472554715
|
||||
((m1 + 0.5f) * SimilarityBase.log2(n1 / m1) +
|
||||
(n1 - m1) * SimilarityBase.log2(n1)) - // 91.9620374903885
|
||||
((m2 + 0.5f) * SimilarityBase.log2(n2 / m2) +
|
||||
(n2 - m2) * SimilarityBase.log2(n2))); // 67.26544321004599
|
||||
// 15.7720995
|
||||
float gold = b * be; // 10.588263
|
||||
correctnessTestCore(sim, gold);
|
||||
}
|
||||
|
||||
/** Correctness test for the D DFR model (basic model only). */
|
||||
public void testD() throws IOException {
|
||||
SimilarityBase sim = new DFRSimilarity(new BasicModelD(), new AfterEffect.NoAfterEffect(), new Normalization.NoNormalization());
|
||||
double totalTermFreqNorm = TOTAL_TERM_FREQ + FREQ + 1;
|
||||
double p = 1.0 / (NUMBER_OF_DOCUMENTS + 1); // 0.009900990099009901
|
||||
double phi = FREQ / totalTermFreqNorm; // 0.08974358974358974
|
||||
double D = phi * SimilarityBase.log2(phi / p) + // 0.17498542370019005
|
||||
(1 - phi) * SimilarityBase.log2((1 - phi) / (1 - p));
|
||||
float gold = (float)(totalTermFreqNorm * D + 0.5 * SimilarityBase.log2(
|
||||
1 + 2 * Math.PI * FREQ * (1 - phi))); // 16.328257
|
||||
correctnessTestCore(sim, gold);
|
||||
}
|
||||
|
||||
/** Correctness test for the In2 DFR model with no aftereffect. */
|
||||
public void testIn2() throws IOException {
|
||||
SimilarityBase sim = new DFRSimilarity(
|
||||
new BasicModelIn(), new AfterEffect.NoAfterEffect(), new NormalizationH2());
|
||||
new BasicModelIn(), new AfterEffectL(), new NormalizationH2());
|
||||
float tfn = (float)(FREQ * SimilarityBase.log2( // 8.1894750101
|
||||
1 + AVG_FIELD_LENGTH / DOC_LEN));
|
||||
float gold = (float)(tfn * SimilarityBase.log2( // 26.7459577898
|
||||
(NUMBER_OF_DOCUMENTS + 1) / (DOC_FREQ + 0.5)));
|
||||
(NUMBER_OF_DOCUMENTS + 1) / (DOC_FREQ + 0.5)) / (1 + tfn));
|
||||
correctnessTestCore(sim, gold);
|
||||
}
|
||||
|
||||
|
|
|
@ -193,20 +193,46 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
|
|||
lowerBound = SmallFloat.byte4ToInt((byte) norm);
|
||||
}
|
||||
final long maxDoc;
|
||||
if (random.nextBoolean()) {
|
||||
switch (random.nextInt(6)) {
|
||||
case 0:
|
||||
// 1 doc collection
|
||||
maxDoc = 1;
|
||||
break;
|
||||
case 1:
|
||||
// 2 doc collection
|
||||
maxDoc = 2;
|
||||
break;
|
||||
case 2:
|
||||
// tiny collection
|
||||
maxDoc = TestUtil.nextLong(random, 3, 16);
|
||||
break;
|
||||
case 3:
|
||||
// small collection
|
||||
maxDoc = TestUtil.nextLong(random, 1, 100000);
|
||||
} else {
|
||||
maxDoc = TestUtil.nextLong(random, 16, 100000);
|
||||
break;
|
||||
case 4:
|
||||
// big collection
|
||||
maxDoc = TestUtil.nextLong(random, 100000, MAXDOC_FORTESTING);
|
||||
break;
|
||||
default:
|
||||
// huge collection
|
||||
maxDoc = TestUtil.nextLong(random, 1, MAXDOC_FORTESTING);
|
||||
maxDoc = MAXDOC_FORTESTING;
|
||||
break;
|
||||
}
|
||||
final long docCount;
|
||||
if (random.nextBoolean()) {
|
||||
switch (random.nextInt(3)) {
|
||||
case 0:
|
||||
// sparsest field
|
||||
docCount = 1;
|
||||
break;
|
||||
case 1:
|
||||
// sparse field
|
||||
docCount = TestUtil.nextLong(random, 1, maxDoc);
|
||||
} else {
|
||||
break;
|
||||
default:
|
||||
// fully populated
|
||||
docCount = maxDoc;
|
||||
break;
|
||||
}
|
||||
// random docsize: but can't require docs to have > 2B tokens
|
||||
long upperBound;
|
||||
|
@ -216,15 +242,22 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
|
|||
upperBound = MAXTOKENS_FORTESTING;
|
||||
}
|
||||
final long sumDocFreq;
|
||||
if (random.nextBoolean()) {
|
||||
switch (random.nextInt(3)) {
|
||||
case 0:
|
||||
// shortest possible docs
|
||||
sumDocFreq = docCount;
|
||||
} else {
|
||||
break;
|
||||
case 1:
|
||||
// biggest possible docs
|
||||
sumDocFreq = upperBound + 1 - lowerBound;
|
||||
break;
|
||||
default:
|
||||
// random docsize
|
||||
sumDocFreq = TestUtil.nextLong(random, docCount, upperBound + 1 - lowerBound);
|
||||
break;
|
||||
}
|
||||
final long sumTotalTermFreq;
|
||||
switch (random.nextInt(3)) {
|
||||
switch (random.nextInt(4)) {
|
||||
case 0:
|
||||
// term frequencies were omitted
|
||||
sumTotalTermFreq = sumDocFreq;
|
||||
|
@ -233,6 +266,10 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
|
|||
// no repetition of terms (except to satisfy this norm)
|
||||
sumTotalTermFreq = sumDocFreq - 1 + lowerBound;
|
||||
break;
|
||||
case 2:
|
||||
// maximum repetition of terms
|
||||
sumTotalTermFreq = upperBound;
|
||||
break;
|
||||
default:
|
||||
// random repetition
|
||||
assert sumDocFreq - 1 + lowerBound <= upperBound;
|
||||
|
@ -249,29 +286,46 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
|
|||
*/
|
||||
static TermStatistics newTerm(Random random, CollectionStatistics corpus) {
|
||||
final long docFreq;
|
||||
if (random.nextBoolean()) {
|
||||
switch (random.nextInt(3)) {
|
||||
case 0:
|
||||
// rare term
|
||||
docFreq = 1;
|
||||
} else {
|
||||
break;
|
||||
case 1:
|
||||
// common term
|
||||
docFreq = corpus.docCount();
|
||||
break;
|
||||
default:
|
||||
// random specificity
|
||||
docFreq = TestUtil.nextLong(random, 1, corpus.docCount());
|
||||
break;
|
||||
}
|
||||
final long totalTermFreq;
|
||||
if (corpus.sumTotalTermFreq() == corpus.sumDocFreq()) {
|
||||
// omitTF
|
||||
totalTermFreq = docFreq;
|
||||
} else if (random.nextBoolean()) {
|
||||
// no repetition
|
||||
totalTermFreq = docFreq;
|
||||
} else {
|
||||
// random repetition: but can't require docs to have > 2B tokens
|
||||
// can't require docs to have > 2B tokens
|
||||
long upperBound;
|
||||
try {
|
||||
upperBound = Math.min(corpus.sumTotalTermFreq(), Math.multiplyExact(docFreq, Integer.MAX_VALUE));
|
||||
} catch (ArithmeticException overflow) {
|
||||
upperBound = corpus.sumTotalTermFreq();
|
||||
}
|
||||
if (corpus.sumTotalTermFreq() == corpus.sumDocFreq()) {
|
||||
// omitTF
|
||||
totalTermFreq = docFreq;
|
||||
} else {
|
||||
switch (random.nextInt(3)) {
|
||||
case 0:
|
||||
// no repetition
|
||||
totalTermFreq = docFreq;
|
||||
break;
|
||||
case 1:
|
||||
// maximum repetition
|
||||
totalTermFreq = upperBound;
|
||||
break;
|
||||
default:
|
||||
// random repetition
|
||||
totalTermFreq = TestUtil.nextLong(random, docFreq, upperBound);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return new TermStatistics(TERM, docFreq, totalTermFreq);
|
||||
}
|
||||
|
@ -315,9 +369,34 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
|
|||
// there is at least one other document, and those must have at least 1 instance each.
|
||||
int upperBound = Math.toIntExact(Math.min(term.totalTermFreq() - term.docFreq() + 1, Integer.MAX_VALUE));
|
||||
if (random.nextBoolean()) {
|
||||
// integer freq
|
||||
switch (random.nextInt(3)) {
|
||||
case 0:
|
||||
// smallest freq
|
||||
freq = 1;
|
||||
break;
|
||||
case 1:
|
||||
// largest freq
|
||||
freq = upperBound;
|
||||
break;
|
||||
default:
|
||||
// random freq
|
||||
freq = TestUtil.nextInt(random, 1, upperBound);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
float freqCandidate = upperBound * random.nextFloat();
|
||||
// float freq
|
||||
float freqCandidate;
|
||||
switch (random.nextInt(2)) {
|
||||
case 0:
|
||||
// smallest freq
|
||||
freqCandidate = Float.MIN_VALUE;
|
||||
break;
|
||||
default:
|
||||
// random freq
|
||||
freqCandidate = upperBound * random.nextFloat();
|
||||
break;
|
||||
}
|
||||
// we need to be at least the 2nd smallest float value; the pairwise test will check MIN_VALUE in this case.
|
||||
// this avoids testing frequencies of 0 which seem wrong to allow (we should enforce computeSlopFactor etc)
|
||||
if (freqCandidate <= Float.MIN_VALUE) {
|
||||
|
|
|
@ -58,13 +58,11 @@ public class RandomSimilarity extends PerFieldSimilarityWrapper {
|
|||
// all the similarities that we rotate through
|
||||
/** The DFR basic models to test. */
|
||||
static BasicModel[] BASIC_MODELS = {
|
||||
/* TODO: enable new BasicModelBE(), */ /* TODO: enable new BasicModelD(), */ new BasicModelG(),
|
||||
new BasicModelIF(), new BasicModelIn(), new BasicModelIne(),
|
||||
/* TODO: enable new BasicModelP() */
|
||||
new BasicModelG(), new BasicModelIF(), new BasicModelIn(), new BasicModelIne(),
|
||||
};
|
||||
/** The DFR aftereffects to test. */
|
||||
static AfterEffect[] AFTER_EFFECTS = {
|
||||
new AfterEffectB(), new AfterEffectL(), new AfterEffect.NoAfterEffect()
|
||||
new AfterEffectB(), new AfterEffectL()
|
||||
};
|
||||
/** The DFR normalizations to test. */
|
||||
static Normalization[] NORMALIZATIONS = {
|
||||
|
|
|
@ -17,17 +17,13 @@
|
|||
package org.apache.solr.search.similarities;
|
||||
|
||||
import org.apache.lucene.search.similarities.AfterEffect;
|
||||
import org.apache.lucene.search.similarities.AfterEffect.NoAfterEffect; // javadoc
|
||||
import org.apache.lucene.search.similarities.AfterEffectB;
|
||||
import org.apache.lucene.search.similarities.AfterEffectL;
|
||||
import org.apache.lucene.search.similarities.BasicModel;
|
||||
import org.apache.lucene.search.similarities.BasicModelBE;
|
||||
import org.apache.lucene.search.similarities.BasicModelD;
|
||||
import org.apache.lucene.search.similarities.BasicModelG;
|
||||
import org.apache.lucene.search.similarities.BasicModelIF;
|
||||
import org.apache.lucene.search.similarities.BasicModelIn;
|
||||
import org.apache.lucene.search.similarities.BasicModelIne;
|
||||
import org.apache.lucene.search.similarities.BasicModelP;
|
||||
import org.apache.lucene.search.similarities.DFRSimilarity;
|
||||
import org.apache.lucene.search.similarities.Normalization;
|
||||
import org.apache.lucene.search.similarities.Normalization.NoNormalization; // javadoc
|
||||
|
@ -48,10 +44,7 @@ import org.apache.solr.schema.SimilarityFactory;
|
|||
* <ol>
|
||||
* <li>{@link BasicModel basicModel}: Basic model of information content:
|
||||
* <ul>
|
||||
* <li>{@link BasicModelBE Be}: Limiting form of Bose-Einstein
|
||||
* <li>{@link BasicModelG G}: Geometric approximation of Bose-Einstein
|
||||
* <li>{@link BasicModelP P}: Poisson approximation of the Binomial
|
||||
* <li>{@link BasicModelD D}: Divergence approximation of the Binomial
|
||||
* <li>{@link BasicModelIn I(n)}: Inverse document frequency
|
||||
* <li>{@link BasicModelIne I(ne)}: Inverse expected document
|
||||
* frequency [mixture of Poisson and IDF]
|
||||
|
@ -63,7 +56,6 @@ import org.apache.solr.schema.SimilarityFactory;
|
|||
* <ul>
|
||||
* <li>{@link AfterEffectL L}: Laplace's law of succession
|
||||
* <li>{@link AfterEffectB B}: Ratio of two Bernoulli processes
|
||||
* <li>{@link NoAfterEffect none}: no first normalization
|
||||
* </ul>
|
||||
* <li>{@link Normalization normalization}: Second (length) normalization:
|
||||
* <ul>
|
||||
|
@ -122,11 +114,7 @@ public class DFRSimilarityFactory extends SimilarityFactory {
|
|||
}
|
||||
|
||||
private BasicModel parseBasicModel(String expr) {
|
||||
if ("Be".equals(expr)) {
|
||||
return new BasicModelBE();
|
||||
} else if ("D".equals(expr)) {
|
||||
return new BasicModelD();
|
||||
} else if ("G".equals(expr)) {
|
||||
if ("G".equals(expr)) {
|
||||
return new BasicModelG();
|
||||
} else if ("I(F)".equals(expr)) {
|
||||
return new BasicModelIF();
|
||||
|
@ -134,8 +122,6 @@ public class DFRSimilarityFactory extends SimilarityFactory {
|
|||
return new BasicModelIn();
|
||||
} else if ("I(ne)".equals(expr)) {
|
||||
return new BasicModelIne();
|
||||
} else if ("P".equals(expr)) {
|
||||
return new BasicModelP();
|
||||
} else {
|
||||
throw new RuntimeException("Invalid basicModel: " + expr);
|
||||
}
|
||||
|
@ -146,8 +132,6 @@ public class DFRSimilarityFactory extends SimilarityFactory {
|
|||
return new AfterEffectB();
|
||||
} else if ("L".equals(expr)) {
|
||||
return new AfterEffectL();
|
||||
} else if ("none".equals(expr)) {
|
||||
return new AfterEffect.NoAfterEffect();
|
||||
} else {
|
||||
throw new RuntimeException("Invalid afterEffect: " + expr);
|
||||
}
|
||||
|
|
|
@ -46,7 +46,7 @@
|
|||
<fieldType name="text_paramc" class="solr.TextField">
|
||||
<analyzer class="org.apache.lucene.analysis.standard.StandardAnalyzer"/>
|
||||
<similarity class="solr.DFRSimilarityFactory">
|
||||
<str name="basicModel">P</str>
|
||||
<str name="basicModel">G</str>
|
||||
<str name="afterEffect">L</str>
|
||||
<str name="normalization">H2</str>
|
||||
<float name="c">7</float>
|
||||
|
|
|
@ -18,8 +18,8 @@ package org.apache.solr.search.similarities;
|
|||
|
||||
import org.apache.lucene.search.similarities.AfterEffectB;
|
||||
import org.apache.lucene.search.similarities.AfterEffectL;
|
||||
import org.apache.lucene.search.similarities.BasicModelG;
|
||||
import org.apache.lucene.search.similarities.BasicModelIF;
|
||||
import org.apache.lucene.search.similarities.BasicModelP;
|
||||
import org.apache.lucene.search.similarities.DFRSimilarity;
|
||||
import org.apache.lucene.search.similarities.NormalizationH2;
|
||||
import org.apache.lucene.search.similarities.NormalizationH3;
|
||||
|
@ -62,7 +62,7 @@ public class TestDFRSimilarityFactory extends BaseSimilarityTestCase {
|
|||
Similarity sim = getSimilarity("text_paramc");
|
||||
assertEquals(DFRSimilarity.class, sim.getClass());
|
||||
DFRSimilarity dfr = (DFRSimilarity) sim;
|
||||
assertEquals(BasicModelP.class, dfr.getBasicModel().getClass());
|
||||
assertEquals(BasicModelG.class, dfr.getBasicModel().getClass());
|
||||
assertEquals(AfterEffectL.class, dfr.getAfterEffect().getClass());
|
||||
assertEquals(NormalizationH2.class, dfr.getNormalization().getClass());
|
||||
NormalizationH2 norm = (NormalizationH2) dfr.getNormalization();
|
||||
|
|
Loading…
Reference in New Issue