From 42717d5f4bbed46009f11a86f307541a19fd7fb5 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 24 Oct 2017 22:48:04 -0400 Subject: [PATCH] LUCENE-7997: More sanity testing of similarities --- lucene/CHANGES.txt | 5 + .../lucene/search/CollectionStatistics.java | 18 + .../org/apache/lucene/search/TermQuery.java | 2 +- .../apache/lucene/search/TermStatistics.java | 15 + .../search/similarities/AfterEffect.java | 10 +- .../search/similarities/AfterEffectB.java | 8 +- .../search/similarities/AfterEffectL.java | 8 +- .../lucene/search/similarities/Axiomatic.java | 28 +- .../search/similarities/AxiomaticF1EXP.java | 20 +- .../search/similarities/AxiomaticF1LOG.java | 20 +- .../search/similarities/AxiomaticF2EXP.java | 18 +- .../search/similarities/AxiomaticF2LOG.java | 18 +- .../search/similarities/AxiomaticF3EXP.java | 24 +- .../search/similarities/AxiomaticF3LOG.java | 24 +- .../search/similarities/BM25Similarity.java | 120 +++-- .../search/similarities/BasicModel.java | 6 +- .../search/similarities/BasicModelBE.java | 4 +- .../search/similarities/BasicModelD.java | 6 +- .../search/similarities/BasicModelG.java | 4 +- .../search/similarities/BasicModelIF.java | 4 +- .../search/similarities/BasicModelIn.java | 8 +- .../search/similarities/BasicModelIne.java | 4 +- .../search/similarities/BasicModelP.java | 8 +- .../search/similarities/BasicStats.java | 12 +- .../similarities/BooleanSimilarity.java | 2 +- .../similarities/ClassicSimilarity.java | 4 +- .../search/similarities/DFISimilarity.java | 8 +- .../search/similarities/DFRSimilarity.java | 10 +- .../search/similarities/Distribution.java | 8 +- .../search/similarities/DistributionLL.java | 4 +- .../search/similarities/DistributionSPL.java | 8 +- .../search/similarities/IBSimilarity.java | 8 +- .../search/similarities/Independence.java | 2 +- .../similarities/IndependenceChiSquared.java | 2 +- .../similarities/IndependenceSaturated.java | 2 +- .../IndependenceStandardized.java | 4 +- 
.../similarities/LMDirichletSimilarity.java | 18 +- .../LMJelinekMercerSimilarity.java | 20 +- .../search/similarities/LMSimilarity.java | 20 +- .../search/similarities/Normalization.java | 16 +- .../search/similarities/NormalizationH1.java | 8 +- .../search/similarities/NormalizationH2.java | 8 +- .../search/similarities/NormalizationH3.java | 5 +- .../search/similarities/NormalizationZ.java | 9 +- .../search/similarities/SimilarityBase.java | 28 +- .../search/similarities/TFIDFSimilarity.java | 37 +- .../search/similarities/package-info.java | 2 +- .../similarities/AxiomaticTestCase.java | 90 ++++ .../similarities/BasicModelTestCase.java | 124 +++++ .../similarities/DistributionTestCase.java | 119 +++++ .../similarities/TestAxiomaticF1EXP.java | 30 ++ .../similarities/TestAxiomaticF1LOG.java | 30 ++ .../similarities/TestAxiomaticF2EXP.java | 26 + .../similarities/TestAxiomaticF2LOG.java | 26 + .../similarities/TestAxiomaticF3EXP.java | 31 ++ .../similarities/TestAxiomaticF3LOG.java | 31 ++ .../similarities/TestBM25Similarity.java | 61 ++- .../search/similarities/TestBasicModelBE.java | 30 ++ .../search/similarities/TestBasicModelD.java | 30 ++ .../search/similarities/TestBasicModelG.java | 26 + .../search/similarities/TestBasicModelIF.java | 26 + .../search/similarities/TestBasicModelIn.java | 26 + .../similarities/TestBasicModelIne.java | 26 + .../search/similarities/TestBasicModelP.java | 30 ++ .../similarities/TestBooleanSimilarity.java | 9 +- .../similarities/TestClassicSimilarity.java | 9 +- .../similarities/TestDistributionLL.java | 26 + .../similarities/TestDistributionSPL.java | 30 ++ .../TestIndependenceChiSquared.java | 28 ++ .../TestIndependenceSaturated.java | 28 ++ .../TestIndependenceStandardized.java | 28 ++ .../TestLMDirichletSimilarity.java | 49 ++ .../TestLMJelinekMercerSimilarity.java | 44 ++ .../similarities/TestSimilarityBase.java | 14 +- .../lucene/search/join/TestBlockJoin.java | 2 +- .../org/apache/lucene/search/CheckHits.java | 35 +- 
.../similarities/BaseSimilarityTestCase.java | 473 ++++++++++++++++++ .../java/org/apache/lucene/util/TestUtil.java | 2 +- 78 files changed, 1862 insertions(+), 304 deletions(-) create mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/AxiomaticTestCase.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/BasicModelTestCase.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/DistributionTestCase.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF1EXP.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF1LOG.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF2EXP.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF2LOG.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF3EXP.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF3LOG.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelBE.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelD.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelG.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelIF.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelIn.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelIne.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelP.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/TestDistributionLL.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/TestDistributionSPL.java 
create mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/TestIndependenceChiSquared.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/TestIndependenceSaturated.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/TestIndependenceStandardized.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/TestLMDirichletSimilarity.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/TestLMJelinekMercerSimilarity.java create mode 100644 lucene/test-framework/src/java/org/apache/lucene/search/similarities/BaseSimilarityTestCase.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index c23b5bbe596..8d858e468e4 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -11,7 +11,12 @@ Changes in Runtime Behavior will now fail to open even if they have been merged with the previous major version. (Adrien Grand) +Improvements +* LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities. + SimilarityBase switches to 64-bit doubles internally to help avoid common numeric issues. + Add missing range checks for similarity parameters. + Improve BM25 and ClassicSimilarity's explanations. 
(Robert Muir) ======================= Lucene 7.2.0 ======================= diff --git a/lucene/core/src/java/org/apache/lucene/search/CollectionStatistics.java b/lucene/core/src/java/org/apache/lucene/search/CollectionStatistics.java index e0aafa84ab2..ef19abde6c9 100644 --- a/lucene/core/src/java/org/apache/lucene/search/CollectionStatistics.java +++ b/lucene/core/src/java/org/apache/lucene/search/CollectionStatistics.java @@ -73,4 +73,22 @@ public class CollectionStatistics { public final long sumDocFreq() { return sumDocFreq; } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("field="); + sb.append('"'); + sb.append(field()); + sb.append('"'); + sb.append(",maxDoc="); + sb.append(maxDoc()); + sb.append(",docCount="); + sb.append(docCount()); + sb.append(",sumTotalTermFreq="); + sb.append(sumTotalTermFreq()); + sb.append(",sumDocFreq="); + sb.append(sumDocFreq); + return sb.toString(); + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java index e3e299f6bb5..48c61faf78b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java @@ -142,7 +142,7 @@ public class TermQuery extends Query { if (newDoc == doc) { float freq = scorer.freq(); SimScorer docScorer = similarity.simScorer(stats, context); - Explanation freqExplanation = Explanation.match(freq, "termFreq=" + freq); + Explanation freqExplanation = Explanation.match(freq, "freq, occurrences of term within document"); Explanation scoreExplanation = docScorer.explain(doc, freqExplanation); return Explanation.match( scoreExplanation.getValue(), diff --git a/lucene/core/src/java/org/apache/lucene/search/TermStatistics.java b/lucene/core/src/java/org/apache/lucene/search/TermStatistics.java index c00774ba9c7..a8e2e069a57 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermStatistics.java +++ 
b/lucene/core/src/java/org/apache/lucene/search/TermStatistics.java @@ -17,6 +17,7 @@ package org.apache.lucene.search; +import org.apache.lucene.index.Term; import org.apache.lucene.index.TermsEnum; // javadocs import org.apache.lucene.util.BytesRef; /** @@ -52,4 +53,18 @@ public class TermStatistics { public final long totalTermFreq() { return totalTermFreq; } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("term="); + sb.append('"'); + sb.append(Term.toString(term())); + sb.append('"'); + sb.append(",docFreq="); + sb.append(docFreq()); + sb.append(",totalTermFreq="); + sb.append(totalTermFreq()); + return sb.toString(); + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffect.java b/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffect.java index df6f3d27491..e62513e53ea 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffect.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffect.java @@ -38,10 +38,10 @@ public abstract class AfterEffect { public AfterEffect() {} /** Returns the aftereffect score. */ - public abstract float score(BasicStats stats, float tfn); + public abstract double score(BasicStats stats, double tfn); /** Returns an explanation for the score. */ - public abstract Explanation explain(BasicStats stats, float tfn); + public abstract Explanation explain(BasicStats stats, double tfn); /** Implementation used when there is no aftereffect. 
*/ public static final class NoAfterEffect extends AfterEffect { @@ -50,12 +50,12 @@ public abstract class AfterEffect { public NoAfterEffect() {} @Override - public final float score(BasicStats stats, float tfn) { - return 1f; + public double score(BasicStats stats, double tfn) { + return 1.0; } @Override - public final Explanation explain(BasicStats stats, float tfn) { + public Explanation explain(BasicStats stats, double tfn) { return Explanation.match(1, "no aftereffect"); } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffectB.java b/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffectB.java index 32d17486964..b1bff96c5a5 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffectB.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffectB.java @@ -29,18 +29,18 @@ public class AfterEffectB extends AfterEffect { public AfterEffectB() {} @Override - public final float score(BasicStats stats, float tfn) { + public final double score(BasicStats stats, double tfn) { long F = stats.getTotalTermFreq()+1; long n = stats.getDocFreq()+1; return (F + 1) / (n * (tfn + 1)); } @Override - public final Explanation explain(BasicStats stats, float tfn) { + public final Explanation explain(BasicStats stats, double tfn) { return Explanation.match( - score(stats, tfn), + (float) score(stats, tfn), getClass().getSimpleName() + ", computed from: ", - Explanation.match(tfn, "tfn"), + Explanation.match((float) tfn, "tfn"), Explanation.match(stats.getTotalTermFreq(), "totalTermFreq"), Explanation.match(stats.getDocFreq(), "docFreq")); } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffectL.java b/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffectL.java index f6ba31283bb..a8ee53d79e4 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffectL.java +++ 
b/lucene/core/src/java/org/apache/lucene/search/similarities/AfterEffectL.java @@ -29,16 +29,16 @@ public class AfterEffectL extends AfterEffect { public AfterEffectL() {} @Override - public final float score(BasicStats stats, float tfn) { + public final double score(BasicStats stats, double tfn) { return 1 / (tfn + 1); } @Override - public final Explanation explain(BasicStats stats, float tfn) { + public final Explanation explain(BasicStats stats, double tfn) { return Explanation.match( - score(stats, tfn), + (float) score(stats, tfn), getClass().getSimpleName() + ", computed from: ", - Explanation.match(tfn, "tfn")); + Explanation.match((float) tfn, "tfn")); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java b/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java index 9c2854c3bbd..9183cb65a53 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java @@ -100,7 +100,7 @@ public abstract class Axiomatic extends SimilarityBase { } @Override - public float score(BasicStats stats, float freq, float docLen) { + public double score(BasicStats stats, double freq, double docLen) { return tf(stats, freq, docLen) * ln(stats, freq, docLen) * tfln(stats, freq, docLen) @@ -110,19 +110,19 @@ public abstract class Axiomatic extends SimilarityBase { @Override protected void explain(List subs, BasicStats stats, int doc, - float freq, float docLen) { - if (stats.getBoost() != 1.0f) { - subs.add(Explanation.match(stats.getBoost(), "boost")); + double freq, double docLen) { + if (stats.getBoost() != 1.0d) { + subs.add(Explanation.match((float) stats.getBoost(), "boost")); } subs.add(Explanation.match(this.k, "k")); subs.add(Explanation.match(this.s, "s")); subs.add(Explanation.match(this.queryLen, "queryLen")); - subs.add(Explanation.match(tf(stats, freq, docLen), "tf")); - subs.add(Explanation.match(ln(stats, 
freq, docLen), "ln")); - subs.add(Explanation.match(tfln(stats, freq, docLen), "tfln")); - subs.add(Explanation.match(idf(stats, freq, docLen), "idf")); - subs.add(Explanation.match(gamma(stats, freq, docLen), "gamma")); + subs.add(Explanation.match((float) tf(stats, freq, docLen), "tf")); + subs.add(Explanation.match((float) ln(stats, freq, docLen), "ln")); + subs.add(Explanation.match((float) tfln(stats, freq, docLen), "tfln")); + subs.add(Explanation.match((float) idf(stats, freq, docLen), "idf")); + subs.add(Explanation.match((float) gamma(stats, freq, docLen), "gamma")); super.explain(subs, stats, doc, freq, docLen); } @@ -135,25 +135,25 @@ public abstract class Axiomatic extends SimilarityBase { /** * compute the term frequency component */ - protected abstract float tf(BasicStats stats, float freq, float docLen); + protected abstract double tf(BasicStats stats, double freq, double docLen); /** * compute the document length component */ - protected abstract float ln(BasicStats stats, float freq, float docLen); + protected abstract double ln(BasicStats stats, double freq, double docLen); /** * compute the mixed term frequency and document length component */ - protected abstract float tfln(BasicStats stats, float freq, float docLen); + protected abstract double tfln(BasicStats stats, double freq, double docLen); /** * compute the inverted document frequency component */ - protected abstract float idf(BasicStats stats, float freq, float docLen); + protected abstract double idf(BasicStats stats, double freq, double docLen); /** * compute the gamma component (only for F3EXp and F3LOG) */ - protected abstract float gamma(BasicStats stats, float freq, float docLen); + protected abstract double gamma(BasicStats stats, double freq, double docLen); } \ No newline at end of file diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF1EXP.java b/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF1EXP.java index 
62317fdf73f..0619b4e96f1 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF1EXP.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF1EXP.java @@ -56,16 +56,16 @@ public class AxiomaticF1EXP extends Axiomatic { * compute the term frequency component */ @Override - protected float tf(BasicStats stats, float freq, float docLen) { - if (freq <= 0.0) return 0f; - return (float) (1 + Math.log(1 + Math.log(freq))); + protected double tf(BasicStats stats, double freq, double docLen) { + if (freq <= 0.0) return 0.0; + return 1 + Math.log(1 + Math.log(freq)); } /** * compute the document length component */ @Override - protected float ln(BasicStats stats, float freq, float docLen) { + protected double ln(BasicStats stats, double freq, double docLen) { return (stats.getAvgFieldLength() + this.s) / (stats.getAvgFieldLength() + docLen * this.s); } @@ -73,23 +73,23 @@ public class AxiomaticF1EXP extends Axiomatic { * compute the mixed term frequency and document length component */ @Override - protected float tfln(BasicStats stats, float freq, float docLen) { - return 1f; + protected double tfln(BasicStats stats, double freq, double docLen) { + return 1.0; } /** * compute the inverted document frequency component */ @Override - protected float idf(BasicStats stats, float freq, float docLen) { - return (float) Math.pow((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq(), this.k); + protected double idf(BasicStats stats, double freq, double docLen) { + return Math.pow((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq(), this.k); } /** * compute the gamma component */ @Override - protected float gamma(BasicStats stats, float freq, float docLen) { - return 0f; + protected double gamma(BasicStats stats, double freq, double docLen) { + return 0.0; } } \ No newline at end of file diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF1LOG.java 
b/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF1LOG.java index 7cce2be4e95..f7a02da8c99 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF1LOG.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF1LOG.java @@ -49,16 +49,16 @@ public class AxiomaticF1LOG extends Axiomatic { * compute the term frequency component */ @Override - protected float tf(BasicStats stats, float freq, float docLen) { - if (freq <= 0.0) return 0f; - return (float) (1 + Math.log(1 + Math.log(freq))); + protected double tf(BasicStats stats, double freq, double docLen) { + if (freq <= 0.0) return 0.0; + return 1 + Math.log(1 + Math.log(freq)); } /** * compute the document length component */ @Override - protected float ln(BasicStats stats, float freq, float docLen) { + protected double ln(BasicStats stats, double freq, double docLen) { return (stats.getAvgFieldLength() + this.s) / (stats.getAvgFieldLength() + docLen * this.s); } @@ -66,23 +66,23 @@ public class AxiomaticF1LOG extends Axiomatic { * compute the mixed term frequency and document length component */ @Override - protected float tfln(BasicStats stats, float freq, float docLen) { - return 1f; + protected double tfln(BasicStats stats, double freq, double docLen) { + return 1.0; } /** * compute the inverted document frequency component */ @Override - protected float idf(BasicStats stats, float freq, float docLen) { - return (float) Math.log((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq()); + protected double idf(BasicStats stats, double freq, double docLen) { + return Math.log((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq()); } /** * compute the gamma component */ @Override - protected float gamma(BasicStats stats, float freq, float docLen) { - return 0f; + protected double gamma(BasicStats stats, double freq, double docLen) { + return 0.0; } } \ No newline at end of file diff --git 
a/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF2EXP.java b/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF2EXP.java index f9bc98a4d27..0a3e4ad7cc5 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF2EXP.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF2EXP.java @@ -56,23 +56,23 @@ public class AxiomaticF2EXP extends Axiomatic { * compute the term frequency component */ @Override - protected float tf(BasicStats stats, float freq, float docLen) { - return 1f; + protected double tf(BasicStats stats, double freq, double docLen) { + return 1.0; } /** * compute the document length component */ @Override - protected float ln(BasicStats stats, float freq, float docLen) { - return 1f; + protected double ln(BasicStats stats, double freq, double docLen) { + return 1.0; } /** * compute the mixed term frequency and document length component */ @Override - protected float tfln(BasicStats stats, float freq, float docLen) { + protected double tfln(BasicStats stats, double freq, double docLen) { return freq / (freq + this.s + this.s * docLen / stats.getAvgFieldLength()); } @@ -80,15 +80,15 @@ public class AxiomaticF2EXP extends Axiomatic { * compute the inverted document frequency component */ @Override - protected float idf(BasicStats stats, float freq, float docLen) { - return (float) Math.pow((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq(), this.k); + protected double idf(BasicStats stats, double freq, double docLen) { + return Math.pow((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq(), this.k); } /** * compute the gamma component */ @Override - protected float gamma(BasicStats stats, float freq, float docLen) { - return 0f; + protected double gamma(BasicStats stats, double freq, double docLen) { + return 0.0; } } \ No newline at end of file diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF2LOG.java 
b/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF2LOG.java index fee2000bec3..2fc5e11a9c7 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF2LOG.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF2LOG.java @@ -48,23 +48,23 @@ public class AxiomaticF2LOG extends Axiomatic { * compute the term frequency component */ @Override - protected float tf(BasicStats stats, float freq, float docLen) { - return 1f; + protected double tf(BasicStats stats, double freq, double docLen) { + return 1.0; } /** * compute the document length component */ @Override - protected float ln(BasicStats stats, float freq, float docLen) { - return 1f; + protected double ln(BasicStats stats, double freq, double docLen) { + return 1.0; } /** * compute the mixed term frequency and document length component */ @Override - protected float tfln(BasicStats stats, float freq, float docLen) { + protected double tfln(BasicStats stats, double freq, double docLen) { return freq / (freq + this.s + this.s * docLen / stats.getAvgFieldLength()); } @@ -72,15 +72,15 @@ public class AxiomaticF2LOG extends Axiomatic { * compute the inverted document frequency component */ @Override - protected float idf(BasicStats stats, float freq, float docLen) { - return (float) Math.log((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq()); + protected double idf(BasicStats stats, double freq, double docLen) { + return Math.log((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq()); } /** * compute the gamma component */ @Override - protected float gamma(BasicStats stats, float freq, float docLen) { - return 0f; + protected double gamma(BasicStats stats, double freq, double docLen) { + return 0.0; } } \ No newline at end of file diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF3EXP.java b/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF3EXP.java index c20194ac28a..5c763178204 100644 
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF3EXP.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF3EXP.java @@ -17,10 +17,10 @@ package org.apache.lucene.search.similarities; /** - * F2EXP is defined as Sum(tf(term_doc_freq)*IDF(term)-gamma(docLen, queryLen)) + * F3EXP is defined as Sum(tf(term_doc_freq)*IDF(term)-gamma(docLen, queryLen)) * where IDF(t) = pow((N+1)/df(t), k) N=total num of docs, df=doc freq * gamma(docLen, queryLen) = (docLen-queryLen)*queryLen*s/avdl - * + * NOTE: the gamma function of this similarity creates negative scores * @lucene.experimental */ public class AxiomaticF3EXP extends Axiomatic { @@ -55,40 +55,40 @@ public class AxiomaticF3EXP extends Axiomatic { * compute the term frequency component */ @Override - protected float tf(BasicStats stats, float freq, float docLen) { - if (freq <= 0.0) return 0f; - return (float) (1 + Math.log(1 + Math.log(freq))); + protected double tf(BasicStats stats, double freq, double docLen) { + if (freq <= 0.0) return 0.0; + return 1 + Math.log(1 + Math.log(freq)); } /** * compute the document length component */ @Override - protected float ln(BasicStats stats, float freq, float docLen) { - return 1f; + protected double ln(BasicStats stats, double freq, double docLen) { + return 1.0; } /** * compute the mixed term frequency and document length component */ @Override - protected float tfln(BasicStats stats, float freq, float docLen) { - return 1f; + protected double tfln(BasicStats stats, double freq, double docLen) { + return 1.0; } /** * compute the inverted document frequency component */ @Override - protected float idf(BasicStats stats, float freq, float docLen) { - return (float) Math.pow((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq(), this.k); + protected double idf(BasicStats stats, double freq, double docLen) { + return Math.pow((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq(), this.k); } /** * compute the gamma component 
*/ @Override - protected float gamma(BasicStats stats, float freq, float docLen) { + protected double gamma(BasicStats stats, double freq, double docLen) { return (docLen - this.queryLen) * this.s * this.queryLen / stats.getAvgFieldLength(); } } \ No newline at end of file diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF3LOG.java b/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF3LOG.java index a9d82aded39..22a50b1c78f 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF3LOG.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF3LOG.java @@ -17,10 +17,10 @@ package org.apache.lucene.search.similarities; /** - * F2EXP is defined as Sum(tf(term_doc_freq)*IDF(term)-gamma(docLen, queryLen)) + * F3LOG is defined as Sum(tf(term_doc_freq)*IDF(term)-gamma(docLen, queryLen)) * where IDF(t) = ln((N+1)/df(t)) N=total num of docs, df=doc freq * gamma(docLen, queryLen) = (docLen-queryLen)*queryLen*s/avdl - * + * NOTE: the gamma function of this similarity creates negative scores * @lucene.experimental */ public class AxiomaticF3LOG extends Axiomatic { @@ -44,40 +44,40 @@ public class AxiomaticF3LOG extends Axiomatic { * compute the term frequency component */ @Override - protected float tf(BasicStats stats, float freq, float docLen) { - if (freq <= 0.0) return 0f; - return (float) (1 + Math.log(1 + Math.log(freq))); + protected double tf(BasicStats stats, double freq, double docLen) { + if (freq <= 0.0) return 0.0; + return 1 + Math.log(1 + Math.log(freq)); } /** * compute the document length component */ @Override - protected float ln(BasicStats stats, float freq, float docLen) { - return 1f; + protected double ln(BasicStats stats, double freq, double docLen) { + return 1.0; } /** * compute the mixed term frequency and document length component */ @Override - protected float tfln(BasicStats stats, float freq, float docLen) { - return 1f; + protected double
tfln(BasicStats stats, double freq, double docLen) { + return 1.0; } /** * compute the inverted document frequency component */ @Override - protected float idf(BasicStats stats, float freq, float docLen) { - return (float) Math.log((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq()); + protected double idf(BasicStats stats, double freq, double docLen) { + return Math.log((stats.getNumberOfDocuments() + 1.0) / stats.getDocFreq()); } /** * compute the gamma component */ @Override - protected float gamma(BasicStats stats, float freq, float docLen) { + protected double gamma(BasicStats stats, double freq, double docLen) { return (docLen - this.queryLen) * this.s * this.queryLen / stats.getAvgFieldLength(); } } \ No newline at end of file diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java index 35554e2b20e..1485101f657 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java @@ -159,9 +159,9 @@ public class BM25Similarity extends Similarity { final long df = termStats.docFreq(); final long docCount = collectionStats.docCount() == -1 ? 
collectionStats.maxDoc() : collectionStats.docCount(); final float idf = idf(df, docCount); - return Explanation.match(idf, "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", - Explanation.match(df, "docFreq"), - Explanation.match(docCount, "docCount")); + return Explanation.match(idf, "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:", + Explanation.match(df, "n, number of documents containing term"), + Explanation.match(docCount, "N, total number of documents with field")); } /** @@ -185,7 +185,7 @@ public class BM25Similarity extends Similarity { details.add(idfExplain); idf += idfExplain.getValue(); } - return Explanation.match((float) idf, "idf(), sum of:", details); + return Explanation.match((float) idf, "idf, sum of:", details); } @Override @@ -197,7 +197,7 @@ public class BM25Similarity extends Similarity { for (int i = 0; i < cache.length; i++) { cache[i] = k1 * ((1 - b) + b * LENGTH_TABLE[i] / avgdl); } - return new BM25Stats(collectionStats.field(), boost, idf, avgdl, cache); + return new BM25Stats(collectionStats.field(), boost, k1, idf, avgdl, cache); } @Override @@ -217,7 +217,7 @@ public class BM25Similarity extends Similarity { BM25DocScorer(BM25Stats stats, NumericDocValues norms) throws IOException { this.stats = stats; - this.weightValue = stats.weight * (k1 + 1); + this.weightValue = stats.weight; this.norms = norms; lengthCache = LENGTH_TABLE; cache = stats.cache; @@ -226,7 +226,7 @@ public class BM25Similarity extends Similarity { @Override public float score(int doc, float freq) throws IOException { // if there are no norms, we act as if b=0 - float norm; + double norm; if (norms == null) { norm = k1; } else { @@ -236,12 +236,48 @@ public class BM25Similarity extends Similarity { norm = cache[0]; } } - return weightValue * freq / (freq + norm); + return weightValue * (float) (freq / (freq + norm)); } @Override public Explanation explain(int doc, Explanation freq) throws IOException { - return 
explainScore(doc, freq, stats, norms, lengthCache); + List subs = new ArrayList<>(); + subs.addAll(stats.explain()); + Explanation tfExpl = explainTF(doc, freq); + subs.add(tfExpl); + return Explanation.match(stats.weight * tfExpl.getValue(), + "score(doc="+doc+",freq="+freq.getValue()+"), product of:", subs); + } + + private Explanation explainTF(int doc, Explanation freq) throws IOException { + List subs = new ArrayList<>(); + subs.add(freq); + subs.add(Explanation.match(k1, "k1, term saturation parameter")); + if (norms == null) { + subs.add(Explanation.match(0, "b, field omits length norms")); + return Explanation.match( + (float) (freq.getValue() / (freq.getValue() + (double) k1)), + "tf, computed as freq / (freq + k1) from:", subs); + } else { + byte norm; + if (norms.advanceExact(doc)) { + norm = (byte) norms.longValue(); + } else { + norm = 0; + } + float doclen = lengthCache[norm & 0xff]; + subs.add(Explanation.match(b, "b, length normalization parameter")); + if ((norm & 0xFF) > 39) { + subs.add(Explanation.match(doclen, "dl, length of field (approximate)")); + } else { + subs.add(Explanation.match(doclen, "dl, length of field")); + } + subs.add(Explanation.match(stats.avgdl, "avgdl, average length of field")); + float normValue = k1 * ((1 - b) + b * doclen / stats.avgdl); + return Explanation.match( + (float) (freq.getValue() / (freq.getValue() + (double) normValue)), + "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:", subs); + } } @Override @@ -257,69 +293,45 @@ public class BM25Similarity extends Similarity { /** Collection statistics for the BM25 model. */ private static class BM25Stats extends SimWeight { + /** field name, for pulling norms */ + private final String field; + /** query boost */ + private final float boost; + /** k1 value for scale factor */ + private final float k1; /** BM25's idf */ private final Explanation idf; /** The average document length. 
*/ private final float avgdl; - /** query boost */ - private final float boost; + /** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */ + private final float[] cache; /** weight (idf * boost) */ private final float weight; - /** field name, for pulling norms */ - private final String field; - /** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) - * for LENGTH_TABLE */ - private final float[] cache; - BM25Stats(String field, float boost, Explanation idf, float avgdl, float[] cache) { + BM25Stats(String field, float boost, float k1, Explanation idf, float avgdl, float[] cache) { this.field = field; this.boost = boost; this.idf = idf; this.avgdl = avgdl; - this.weight = idf.getValue() * boost; + this.k1 = k1; this.cache = cache; + this.weight = (k1 + 1) * boost * idf.getValue(); } - } - - private Explanation explainTFNorm(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms, float[] lengthCache) throws IOException { - List subs = new ArrayList<>(); - subs.add(freq); - subs.add(Explanation.match(k1, "parameter k1")); - if (norms == null) { - subs.add(Explanation.match(0, "parameter b (norms omitted for field)")); - return Explanation.match( - (freq.getValue() * (k1 + 1)) / (freq.getValue() + k1), - "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1) from:", subs); - } else { - byte norm; - if (norms.advanceExact(doc)) { - norm = (byte) norms.longValue(); - } else { - norm = 0; + private List explain() { + List subs = new ArrayList<>(); + // scale factor + subs.add(Explanation.match(k1 + 1, "scaling factor, k1 + 1")); + // query boost + if (boost != 1.0f) { + subs.add(Explanation.match(boost, "boost")); } - float doclen = lengthCache[norm & 0xff]; - subs.add(Explanation.match(b, "parameter b")); - subs.add(Explanation.match(stats.avgdl, "avgFieldLength")); - subs.add(Explanation.match(doclen, "fieldLength")); - return Explanation.match( - (freq.getValue() * (k1 + 1)) / (freq.getValue() + k1 * (1 - b + b * doclen/stats.avgdl)), - 
"tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", subs); + // idf + subs.add(idf); + return subs; } } - private Explanation explainScore(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms, float[] lengthCache) throws IOException { - Explanation boostExpl = Explanation.match(stats.boost, "boost"); - List subs = new ArrayList<>(); - if (boostExpl.getValue() != 1.0f) - subs.add(boostExpl); - subs.add(stats.idf); - Explanation tfNormExpl = explainTFNorm(doc, freq, stats, norms, lengthCache); - subs.add(tfNormExpl); - return Explanation.match( - boostExpl.getValue() * stats.idf.getValue() * tfNormExpl.getValue(), - "score(doc="+doc+",freq="+freq+"), product of:", subs); - } @Override public String toString() { diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModel.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModel.java index 9293422ea80..20dee40c6b9 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModel.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModel.java @@ -37,7 +37,7 @@ public abstract class BasicModel { public BasicModel() {} /** Returns the informative content score. */ - public abstract float score(BasicStats stats, float tfn); + public abstract double score(BasicStats stats, double tfn); /** * Returns an explanation for the score. @@ -46,9 +46,9 @@ public abstract class BasicModel { * explanation for such models. Subclasses that use other statistics must * override this method.

*/ - public Explanation explain(BasicStats stats, float tfn) { + public Explanation explain(BasicStats stats, double tfn) { return Explanation.match( - score(stats, tfn), + (float) score(stats, tfn), getClass().getSimpleName() + ", computed from: ", Explanation.match(stats.getNumberOfDocuments(), "numberOfDocuments"), Explanation.match(stats.getTotalTermFreq(), "totalTermFreq")); diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelBE.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelBE.java index 12be56cbbae..0ba5686fc5f 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelBE.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelBE.java @@ -35,11 +35,11 @@ public class BasicModelBE extends BasicModel { public BasicModelBE() {} @Override - public final float score(BasicStats stats, float tfn) { + public final double score(BasicStats stats, double tfn) { double F = stats.getTotalTermFreq() + 1 + tfn; // approximation only holds true when F << N, so we use N += F double N = F + stats.getNumberOfDocuments(); - return (float)(-log2((N - 1) * Math.E) + return (-log2((N - 1) * Math.E) + f(N + F - 1, N + F - tfn - 2) - f(F, F - tfn)); } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelD.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelD.java index 28cd877ec26..70b004b29a5 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelD.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelD.java @@ -37,16 +37,16 @@ public class BasicModelD extends BasicModel { public BasicModelD() {} @Override - public final float score(BasicStats stats, float tfn) { + public final double score(BasicStats stats, double tfn) { // we have to ensure phi is always < 1 for tiny TTF values, otherwise nphi can go negative, // resulting in NaN. 
cleanest way is to unconditionally always add tfn to totalTermFreq // to create a 'normalized' F. double F = stats.getTotalTermFreq() + 1 + tfn; - double phi = (double)tfn / F; + double phi = tfn / F; double nphi = 1 - phi; double p = 1.0 / (stats.getNumberOfDocuments() + 1); double D = phi * log2(phi / p) + nphi * log2(nphi / (1 - p)); - return (float)(D * F + 0.5 * log2(1 + 2 * Math.PI * tfn * nphi)); + return D * F + 0.5 * log2(1 + 2 * Math.PI * tfn * nphi); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelG.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelG.java index d2d2f0d9815..2f8cb4368e0 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelG.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelG.java @@ -31,13 +31,13 @@ public class BasicModelG extends BasicModel { public BasicModelG() {} @Override - public final float score(BasicStats stats, float tfn) { + public final double score(BasicStats stats, double tfn) { // just like in BE, approximation only holds true when F << N, so we use lambda = F / (N + F) double F = stats.getTotalTermFreq() + 1; double N = stats.getNumberOfDocuments(); double lambda = F / (N + F); // -log(1 / (lambda + 1)) -> log(lambda + 1) - return (float)(log2(lambda + 1) + tfn * log2((1 + lambda) / lambda)); + return log2(lambda + 1) + tfn * log2((1 + lambda) / lambda); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIF.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIF.java index fe54122bd6c..5b7350bbd14 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIF.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIF.java @@ -29,10 +29,10 @@ public class BasicModelIF extends BasicModel { public BasicModelIF() {} @Override - public final float score(BasicStats stats, float tfn) { + 
public final double score(BasicStats stats, double tfn) { long N = stats.getNumberOfDocuments(); long F = stats.getTotalTermFreq(); - return tfn * (float)(log2(1 + (N + 1) / (F + 0.5))); + return tfn * log2(1 + (N + 1) / (F + 0.5)); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIn.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIn.java index 5865177ea45..a09eedb0d8f 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIn.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIn.java @@ -30,16 +30,16 @@ public class BasicModelIn extends BasicModel { public BasicModelIn() {} @Override - public final float score(BasicStats stats, float tfn) { + public final double score(BasicStats stats, double tfn) { long N = stats.getNumberOfDocuments(); long n = stats.getDocFreq(); - return tfn * (float)(log2((N + 1) / (n + 0.5))); + return tfn * log2((N + 1) / (n + 0.5)); } @Override - public final Explanation explain(BasicStats stats, float tfn) { + public final Explanation explain(BasicStats stats, double tfn) { return Explanation.match( - score(stats, tfn), + (float) score(stats, tfn), getClass().getSimpleName() + ", computed from: ", Explanation.match(stats.getNumberOfDocuments(), "numberOfDocuments"), Explanation.match(stats.getDocFreq(), "docFreq")); diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIne.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIne.java index 28fa57e6ee9..b4e830d166f 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIne.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelIne.java @@ -30,11 +30,11 @@ public class BasicModelIne extends BasicModel { public BasicModelIne() {} @Override - public final float score(BasicStats stats, float tfn) { + public final double score(BasicStats stats, double tfn) { long N = 
stats.getNumberOfDocuments(); long F = stats.getTotalTermFreq(); double ne = N * (1 - Math.pow((N - 1) / (double)N, F)); - return tfn * (float)(log2((N + 1) / (ne + 0.5))); + return tfn * log2((N + 1) / (ne + 0.5)); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelP.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelP.java index d032a2cc339..f66e3d004bd 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelP.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelP.java @@ -35,11 +35,11 @@ public class BasicModelP extends BasicModel { public BasicModelP() {} @Override - public final float score(BasicStats stats, float tfn) { - float lambda = (float)(stats.getTotalTermFreq()+1) / (stats.getNumberOfDocuments()+1); - return (float)(tfn * log2(tfn / lambda) + public final double score(BasicStats stats, double tfn) { + double lambda = (stats.getTotalTermFreq()+1) / (double) (stats.getNumberOfDocuments()+1); + return tfn * log2(tfn / lambda) + (lambda + 1 / (12 * tfn) - tfn) * LOG2_E - + 0.5 * log2(2 * Math.PI * tfn)); + + 0.5 * log2(2 * Math.PI * tfn); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicStats.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicStats.java index a08fe2fdbc1..cc3cab452fb 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicStats.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicStats.java @@ -30,7 +30,7 @@ public class BasicStats extends Similarity.SimWeight { /** The total number of tokens in the field. */ protected long numberOfFieldTokens; /** The average field length. */ - protected float avgFieldLength; + protected double avgFieldLength; /** The document frequency. */ protected long docFreq; /** The total number of occurrences of this term across all documents. 
*/ @@ -39,10 +39,10 @@ public class BasicStats extends Similarity.SimWeight { // -------------------------- Boost-related stuff -------------------------- /** A query boost. Should be applied as a multiplicative factor to the score. */ - protected final float boost; + protected final double boost; /** Constructor. */ - public BasicStats(String field, float boost) { + public BasicStats(String field, double boost) { this.field = field; this.boost = boost; } @@ -76,12 +76,12 @@ public class BasicStats extends Similarity.SimWeight { } /** Returns the average field length. */ - public float getAvgFieldLength() { + public double getAvgFieldLength() { return avgFieldLength; } /** Sets the average field length. */ - public void setAvgFieldLength(float avgFieldLength) { + public void setAvgFieldLength(double avgFieldLength) { this.avgFieldLength = avgFieldLength; } @@ -106,7 +106,7 @@ public class BasicStats extends Similarity.SimWeight { } /** Returns the total boost. */ - public float getBoost() { + public double getBoost() { return boost; } } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java index a7b7614cf30..30de698bfcf 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java @@ -73,7 +73,7 @@ public class BooleanSimilarity extends Similarity { @Override public Explanation explain(int doc, Explanation freq) throws IOException { - Explanation queryBoostExpl = Explanation.match(boost, "query boost"); + Explanation queryBoostExpl = Explanation.match(boost, "boost"); return Explanation.match( queryBoostExpl.getValue(), "score(" + getClass().getSimpleName() + ", doc=" + doc + "), computed from:", diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java 
b/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java index c3d36c3a922..f33abdbf774 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java @@ -65,8 +65,8 @@ public class ClassicSimilarity extends TFIDFSimilarity { final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); final float idf = idf(df, docCount); return Explanation.match(idf, "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:", - Explanation.match(df, "docFreq"), - Explanation.match(docCount, "docCount")); + Explanation.match(df, "docFreq, number of documents containing term"), + Explanation.match(docCount, "docCount, total number of documents with field")); } /** Implemented as log((docCount+1)/(docFreq+1)) + 1. */ diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java index 5dea8a4f26e..8b7e43a30c3 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java @@ -50,16 +50,16 @@ public class DFISimilarity extends SimilarityBase { } @Override - protected float score(BasicStats stats, float freq, float docLen) { + protected double score(BasicStats stats, double freq, double docLen) { - final float expected = (stats.getTotalTermFreq() + 1) * docLen / (stats.getNumberOfFieldTokens() + 1); + final double expected = (stats.getTotalTermFreq() + 1) * docLen / (stats.getNumberOfFieldTokens() + 1); // if the observed frequency is less than or equal to the expected value, then return zero. 
if (freq <= expected) return 0; - final float measure = independence.score(freq, expected); + final double measure = independence.score(freq, expected); - return stats.getBoost() * (float) log2(measure + 1); + return stats.getBoost() * log2(measure + 1); } /** diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java index b77fc5358e4..aacd2460d7a 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java @@ -107,17 +107,17 @@ public class DFRSimilarity extends SimilarityBase { } @Override - protected float score(BasicStats stats, float freq, float docLen) { - float tfn = normalization.tfn(stats, freq, docLen); + protected double score(BasicStats stats, double freq, double docLen) { + double tfn = normalization.tfn(stats, freq, docLen); return stats.getBoost() * basicModel.score(stats, tfn) * afterEffect.score(stats, tfn); } @Override protected void explain(List subs, - BasicStats stats, int doc, float freq, float docLen) { - if (stats.getBoost() != 1.0f) { - subs.add(Explanation.match(stats.getBoost(), "boost")); + BasicStats stats, int doc, double freq, double docLen) { + if (stats.getBoost() != 1.0d) { + subs.add(Explanation.match( (float)stats.getBoost(), "boost")); } Explanation normExpl = normalization.explain(stats, freq, docLen); diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/Distribution.java b/lucene/core/src/java/org/apache/lucene/search/similarities/Distribution.java index 8e670b0b8bc..c77d423cb02 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/Distribution.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/Distribution.java @@ -34,13 +34,13 @@ public abstract class Distribution { public Distribution() {} /** Computes the score. 
*/ - public abstract float score(BasicStats stats, float tfn, float lambda); + public abstract double score(BasicStats stats, double tfn, double lambda); /** Explains the score. Returns the name of the model only, since * both {@code tfn} and {@code lambda} are explained elsewhere. */ - public Explanation explain(BasicStats stats, float tfn, float lambda) { - return Explanation.match( - score(stats, tfn, lambda), getClass().getSimpleName()); + public Explanation explain(BasicStats stats, double tfn, double lambda) { + return Explanation.match((float)score(stats, tfn, lambda), + getClass().getSimpleName()); } /** diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/DistributionLL.java b/lucene/core/src/java/org/apache/lucene/search/similarities/DistributionLL.java index e34085a327e..7b2b97e2649 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/DistributionLL.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DistributionLL.java @@ -30,8 +30,8 @@ public class DistributionLL extends Distribution { public DistributionLL() {} @Override - public final float score(BasicStats stats, float tfn, float lambda) { - return (float)-Math.log(lambda / (tfn + lambda)); + public final double score(BasicStats stats, double tfn, double lambda) { + return -Math.log(lambda / (tfn + lambda)); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/DistributionSPL.java b/lucene/core/src/java/org/apache/lucene/search/similarities/DistributionSPL.java index 1a9299f317c..fc05d72dc42 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/DistributionSPL.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DistributionSPL.java @@ -33,11 +33,11 @@ public class DistributionSPL extends Distribution { public DistributionSPL() {} @Override - public final float score(BasicStats stats, float tfn, float lambda) { - if (lambda == 1f) { - lambda = 0.99f; + public final double 
score(BasicStats stats, double tfn, double lambda) { + if (lambda == 1d) { + lambda = 0.99d; } - return (float)-Math.log( + return -Math.log( (Math.pow(lambda, (tfn / (tfn + 1))) - lambda) / (1 - lambda)); } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/IBSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/IBSimilarity.java index 0dff0f41830..875cbe4b02f 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/IBSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/IBSimilarity.java @@ -95,7 +95,7 @@ public class IBSimilarity extends SimilarityBase { } @Override - protected float score(BasicStats stats, float freq, float docLen) { + protected double score(BasicStats stats, double freq, double docLen) { return stats.getBoost() * distribution.score( stats, @@ -105,9 +105,9 @@ public class IBSimilarity extends SimilarityBase { @Override protected void explain( - List subs, BasicStats stats, int doc, float freq, float docLen) { - if (stats.getBoost() != 1.0f) { - subs.add(Explanation.match(stats.getBoost(), "boost")); + List subs, BasicStats stats, int doc, double freq, double docLen) { + if (stats.getBoost() != 1.0d) { + subs.add(Explanation.match((float)stats.getBoost(), "boost")); } Explanation normExpl = normalization.explain(stats, freq, docLen); Explanation lambdaExpl = lambda.explain(stats); diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/Independence.java b/lucene/core/src/java/org/apache/lucene/search/similarities/Independence.java index 9293f0b2388..05bd0f7af8b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/Independence.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/Independence.java @@ -38,7 +38,7 @@ public abstract class Independence { * @param freq actual term frequency * @param expected expected term frequency */ - public abstract float score(float freq, float expected); + public abstract 
double score(double freq, double expected); // subclasses must provide a name @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceChiSquared.java b/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceChiSquared.java index e209063d6f0..3352e105b66 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceChiSquared.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceChiSquared.java @@ -33,7 +33,7 @@ public class IndependenceChiSquared extends Independence { public IndependenceChiSquared() {} @Override - public float score(float freq, float expected) { + public double score(double freq, double expected) { return (freq - expected) * (freq - expected) / expected; } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceSaturated.java b/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceSaturated.java index af4bf638b24..6d30c1768c6 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceSaturated.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceSaturated.java @@ -32,7 +32,7 @@ public class IndependenceSaturated extends Independence { public IndependenceSaturated() {} @Override - public float score(float freq, float expected) { + public double score(double freq, double expected) { return (freq - expected) / expected; } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceStandardized.java b/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceStandardized.java index 1f7cda0ced5..047a7455457 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceStandardized.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/IndependenceStandardized.java @@ -34,8 +34,8 @@ public class IndependenceStandardized extends Independence { public IndependenceStandardized() {} 
@Override - public float score(float freq, float expected) { - return (freq - expected) / (float) Math.sqrt(expected); + public double score(double freq, double expected) { + return (freq - expected) / Math.sqrt(expected); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java index de59eb9cda8..a901bad800b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java @@ -44,11 +44,17 @@ public class LMDirichletSimilarity extends LMSimilarity { /** Instantiates the similarity with the provided μ parameter. */ public LMDirichletSimilarity(CollectionModel collectionModel, float mu) { super(collectionModel); + if (Float.isFinite(mu) == false || mu < 0) { + throw new IllegalArgumentException("illegal mu value: " + mu + ", must be a non-negative finite value"); + } this.mu = mu; } /** Instantiates the similarity with the provided μ parameter. */ public LMDirichletSimilarity(float mu) { + if (Float.isFinite(mu) == false || mu < 0) { + throw new IllegalArgumentException("illegal mu value: " + mu + ", must be a non-negative finite value"); + } this.mu = mu; } @@ -63,18 +69,18 @@ public class LMDirichletSimilarity extends LMSimilarity { } @Override - protected float score(BasicStats stats, float freq, float docLen) { - float score = stats.getBoost() * (float)(Math.log(1 + freq / + protected double score(BasicStats stats, double freq, double docLen) { + double score = stats.getBoost() * (Math.log(1 + freq / (mu * ((LMStats)stats).getCollectionProbability())) + Math.log(mu / (docLen + mu))); - return score > 0.0f ? score : 0.0f; + return score > 0.0d ? 
score : 0.0d; } @Override protected void explain(List subs, BasicStats stats, int doc, - float freq, float docLen) { - if (stats.getBoost() != 1.0f) { - subs.add(Explanation.match(stats.getBoost(), "boost")); + double freq, double docLen) { + if (stats.getBoost() != 1.0d) { + subs.add(Explanation.match((float) stats.getBoost(), "boost")); } subs.add(Explanation.match(mu, "mu")); diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java index 3788b5ca6b9..2799e3a0849 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java @@ -31,7 +31,9 @@ import org.apache.lucene.search.Explanation; *

The model has a single parameter, λ. According to said paper, the * optimal value depends on both the collection and the query. The optimal value * is around {@code 0.1} for title queries and {@code 0.7} for long queries.

- * + *

Values should be between 0 (exclusive) and 1 (inclusive). Values near zero score more + * like a conjunction (coordinate level matching), whereas values near 1 behave + * the opposite (more like pure disjunction). * @lucene.experimental */ public class LMJelinekMercerSimilarity extends LMSimilarity { @@ -42,27 +44,33 @@ public class LMJelinekMercerSimilarity extends LMSimilarity { public LMJelinekMercerSimilarity( CollectionModel collectionModel, float lambda) { super(collectionModel); + if (Float.isNaN(lambda) || lambda <= 0 || lambda > 1) { + throw new IllegalArgumentException("lambda must be in the range (0 .. 1]"); + } this.lambda = lambda; } /** Instantiates with the specified λ parameter. */ public LMJelinekMercerSimilarity(float lambda) { + if (Float.isNaN(lambda) || lambda <= 0 || lambda > 1) { + throw new IllegalArgumentException("lambda must be in the range (0 .. 1]"); + } this.lambda = lambda; } @Override - protected float score(BasicStats stats, float freq, float docLen) { + protected double score(BasicStats stats, double freq, double docLen) { return stats.getBoost() * - (float)Math.log(1 + + Math.log(1 + ((1 - lambda) * freq / docLen) / (lambda * ((LMStats)stats).getCollectionProbability())); } @Override protected void explain(List subs, BasicStats stats, int doc, - float freq, float docLen) { - if (stats.getBoost() != 1.0f) { - subs.add(Explanation.match(stats.getBoost(), "boost")); + double freq, double docLen) { + if (stats.getBoost() != 1.0d) { + subs.add(Explanation.match((float) stats.getBoost(), "boost")); } subs.add(Explanation.match(lambda, "lambda")); super.explain(subs, stats, doc, freq, docLen); diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/LMSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/LMSimilarity.java index 2e484eb641b..81548061e5c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/LMSimilarity.java +++ 
b/lucene/core/src/java/org/apache/lucene/search/similarities/LMSimilarity.java @@ -54,7 +54,7 @@ public abstract class LMSimilarity extends SimilarityBase { } @Override - protected BasicStats newStats(String field, float boost) { + protected BasicStats newStats(String field, double boost) { return new LMStats(field, boost); } @@ -71,8 +71,8 @@ public abstract class LMSimilarity extends SimilarityBase { @Override protected void explain(List subExpls, BasicStats stats, int doc, - float freq, float docLen) { - subExpls.add(Explanation.match(collectionModel.computeProbability(stats), + double freq, double docLen) { + subExpls.add(Explanation.match((float) collectionModel.computeProbability(stats), "collection probability")); } @@ -103,12 +103,12 @@ public abstract class LMSimilarity extends SimilarityBase { /** Stores the collection distribution of the current term. */ public static class LMStats extends BasicStats { /** The probability that the current term is generated by the collection. */ - private float collectionProbability; + private double collectionProbability; /** * Creates LMStats for the provided field and query-time boost */ - public LMStats(String field, float boost) { + public LMStats(String field, double boost) { super(field, boost); } @@ -116,7 +116,7 @@ public abstract class LMSimilarity extends SimilarityBase { * Returns the probability that the current term is generated by the * collection. */ - public final float getCollectionProbability() { + public final double getCollectionProbability() { return collectionProbability; } @@ -124,7 +124,7 @@ public abstract class LMSimilarity extends SimilarityBase { * Sets the probability that the current term is generated by the * collection. 
*/ - public final void setCollectionProbability(float collectionProbability) { + public final void setCollectionProbability(double collectionProbability) { this.collectionProbability = collectionProbability; } } @@ -135,7 +135,7 @@ public abstract class LMSimilarity extends SimilarityBase { * Computes the probability {@code p(w|C)} according to the language model * strategy for the current term. */ - public float computeProbability(BasicStats stats); + public double computeProbability(BasicStats stats); /** The name of the collection model strategy. */ public String getName(); @@ -151,8 +151,8 @@ public abstract class LMSimilarity extends SimilarityBase { public DefaultCollectionModel() {} @Override - public float computeProbability(BasicStats stats) { - return (stats.getTotalTermFreq()+1F) / (stats.getNumberOfFieldTokens()+1F); + public double computeProbability(BasicStats stats) { + return (stats.getTotalTermFreq()+1D) / (stats.getNumberOfFieldTokens()+1D); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/Normalization.java b/lucene/core/src/java/org/apache/lucene/search/similarities/Normalization.java index 0ab70f6b000..e20ca020da0 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/Normalization.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/Normalization.java @@ -36,7 +36,7 @@ public abstract class Normalization { /** Returns the normalized term frequency. * @param len the field length. */ - public abstract float tfn(BasicStats stats, float tf, float len); + public abstract double tfn(BasicStats stats, double tf, double len); /** Returns an explanation for the normalized term frequency. *

The default normalization methods use the field length of the document @@ -44,13 +44,13 @@ public abstract class Normalization { * This method provides a generic explanation for such methods. * Subclasses that use other statistics must override this method.

*/ - public Explanation explain(BasicStats stats, float tf, float len) { + public Explanation explain(BasicStats stats, double tf, double len) { return Explanation.match( - tfn(stats, tf, len), + (float) tfn(stats, tf, len), getClass().getSimpleName() + ", computed from: ", - Explanation.match(tf, "tf"), - Explanation.match(stats.getAvgFieldLength(), "avgFieldLength"), - Explanation.match(len, "len")); + Explanation.match((float) tf, "tf"), + Explanation.match((float) stats.getAvgFieldLength(), "avgFieldLength"), + Explanation.match((float) len, "len")); } /** Implementation used when there is no normalization. */ @@ -60,12 +60,12 @@ public abstract class Normalization { public NoNormalization() {} @Override - public final float tfn(BasicStats stats, float tf, float len) { + public double tfn(BasicStats stats, double tf, double len) { return tf; } @Override - public final Explanation explain(BasicStats stats, float tf, float len) { + public Explanation explain(BasicStats stats, double tf, double len) { return Explanation.match(1, "no normalization"); } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/NormalizationH1.java b/lucene/core/src/java/org/apache/lucene/search/similarities/NormalizationH1.java index e7f47cafd3e..8e5a28fcaf9 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/NormalizationH1.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/NormalizationH1.java @@ -36,6 +36,10 @@ public class NormalizationH1 extends Normalization { * normalization with respect to the document length. 
*/ public NormalizationH1(float c) { + // unbounded but typical range 0..10 or so + if (Float.isFinite(c) == false || c < 0) { + throw new IllegalArgumentException("illegal c value: " + c + ", must be a non-negative finite value"); + } this.c = c; } @@ -47,8 +51,8 @@ public class NormalizationH1 extends Normalization { } @Override - public final float tfn(BasicStats stats, float tf, float len) { - return tf * c * stats.getAvgFieldLength() / len; + public final double tfn(BasicStats stats, double tf, double len) { + return tf * c * (stats.getAvgFieldLength() / len); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/NormalizationH2.java b/lucene/core/src/java/org/apache/lucene/search/similarities/NormalizationH2.java index 4bc50045a09..24fb74ea2aa 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/NormalizationH2.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/NormalizationH2.java @@ -38,6 +38,10 @@ public class NormalizationH2 extends Normalization { * normalization with respect to the document length. 
*/ public NormalizationH2(float c) { + // unbounded but typical range 0..10 or so + if (Float.isFinite(c) == false || c < 0) { + throw new IllegalArgumentException("illegal c value: " + c + ", must be a non-negative finite value"); + } this.c = c; } @@ -49,8 +53,8 @@ public class NormalizationH2 extends Normalization { } @Override - public final float tfn(BasicStats stats, float tf, float len) { - return (float)(tf * log2(1 + c * stats.getAvgFieldLength() / len)); + public final double tfn(BasicStats stats, double tf, double len) { + return tf * log2(1 + c * stats.getAvgFieldLength() / len); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/NormalizationH3.java b/lucene/core/src/java/org/apache/lucene/search/similarities/NormalizationH3.java index 579cdb5094a..0bbea496b70 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/NormalizationH3.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/NormalizationH3.java @@ -36,11 +36,14 @@ public class NormalizationH3 extends Normalization { * @param mu smoothing parameter μ */ public NormalizationH3(float mu) { + if (Float.isFinite(mu) == false || mu < 0) { + throw new IllegalArgumentException("illegal mu value: " + mu + ", must be a non-negative finite value"); + } this.mu = mu; } @Override - public float tfn(BasicStats stats, float tf, float len) { + public double tfn(BasicStats stats, double tf, double len) { return (tf + mu * ((stats.getTotalTermFreq()+1F) / (stats.getNumberOfFieldTokens()+1F))) / (len + mu) * mu; } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/NormalizationZ.java b/lucene/core/src/java/org/apache/lucene/search/similarities/NormalizationZ.java index 97b92a213b5..dabf9c906bc 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/NormalizationZ.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/NormalizationZ.java @@ -34,15 +34,18 @@ public class NormalizationZ extends 
Normalization { /** * Creates NormalizationZ with the supplied parameter z. * @param z represents A/(A+1) where A - * measures the specificity of the language. + * measures the specificity of the language. It ranges from (0 .. 0.5) */ public NormalizationZ(float z) { + if (Float.isNaN(z) || z <= 0f || z >= 0.5f) { + throw new IllegalArgumentException("illegal z value: " + z + ", must be in the range (0 .. 0.5)"); + } this.z = z; } @Override - public float tfn(BasicStats stats, float tf, float len) { - return (float)(tf * Math.pow(stats.avgFieldLength / len, z)); + public double tfn(BasicStats stats, double tf, double len) { + return tf * Math.pow(stats.avgFieldLength / len, z); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java b/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java index 46899a3378f..d8ec244a6a7 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java @@ -34,7 +34,7 @@ import org.apache.lucene.util.SmallFloat; * A subclass of {@code Similarity} that provides a simplified API for its * descendants. Subclasses are only required to implement the {@link #score} * and {@link #toString()} methods. Implementing - * {@link #explain(List, BasicStats, int, float, float)} is optional, + * {@link #explain(List, BasicStats, int, double, double)} is optional, * inasmuch as SimilarityBase already provides a basic explanation of the score * and the term frequency. However, implementers of a subclass are encouraged to * include as much detail about the scoring method as possible. 
@@ -93,7 +93,7 @@ public abstract class SimilarityBase extends Similarity { } /** Factory method to return a custom stats object */ - protected BasicStats newStats(String field, float boost) { + protected BasicStats newStats(String field, double boost) { return new BasicStats(field, boost); } @@ -113,7 +113,7 @@ public abstract class SimilarityBase extends Similarity { } final long numberOfFieldTokens; - final float avgFieldLength; + final double avgFieldLength; long sumTotalTermFreq = collectionStats.sumTotalTermFreq(); @@ -145,7 +145,7 @@ public abstract class SimilarityBase extends Similarity { * @param docLen the document length. * @return the score. */ - protected abstract float score(BasicStats stats, float freq, float docLen); + protected abstract double score(BasicStats stats, double freq, double docLen); /** * Subclasses should implement this method to explain the score. {@code expl} @@ -161,16 +161,16 @@ public abstract class SimilarityBase extends Similarity { * @param docLen the document length. */ protected void explain( - List subExpls, BasicStats stats, int doc, float freq, float docLen) {} + List subExpls, BasicStats stats, int doc, double freq, double docLen) {} /** * Explains the score. The implementation here provides a basic explanation * in the format score(name-of-similarity, doc=doc-id, * freq=term-frequency), computed from:, and - * attaches the score (computed via the {@link #score(BasicStats, float, float)} + * attaches the score (computed via the {@link #score(BasicStats, double, double)} * method) and the explanation for the term frequency. Subclasses content with * this format may add additional details in - * {@link #explain(List, BasicStats, int, float, float)}. + * {@link #explain(List, BasicStats, int, double, double)}. * * @param stats the corpus level statistics. * @param doc the document id. @@ -179,12 +179,12 @@ public abstract class SimilarityBase extends Similarity { * @return the explanation. 
*/ protected Explanation explain( - BasicStats stats, int doc, Explanation freq, float docLen) { + BasicStats stats, int doc, Explanation freq, double docLen) { List subs = new ArrayList<>(); explain(subs, stats, doc, freq.getValue(), docLen); return Explanation.match( - score(stats, freq.getValue(), docLen), + (float) score(stats, freq.getValue(), docLen), "score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" + freq.getValue() +"), computed from:", subs); } @@ -248,8 +248,8 @@ public abstract class SimilarityBase extends Similarity { /** Delegates the {@link #score(int, float)} and * {@link #explain(int, Explanation)} methods to - * {@link SimilarityBase#score(BasicStats, float, float)} and - * {@link SimilarityBase#explain(BasicStats, int, Explanation, float)}, + * {@link SimilarityBase#score(BasicStats, double, double)} and + * {@link SimilarityBase#explain(BasicStats, int, Explanation, double)}, * respectively. */ final class BasicSimScorer extends SimScorer { @@ -261,9 +261,9 @@ public abstract class SimilarityBase extends Similarity { this.norms = norms; } - float getLengthValue(int doc) throws IOException { + double getLengthValue(int doc) throws IOException { if (norms == null) { - return 1F; + return 1D; } if (norms.advanceExact(doc)) { return LENGTH_TABLE[Byte.toUnsignedInt((byte) norms.longValue())]; @@ -275,7 +275,7 @@ public abstract class SimilarityBase extends Similarity { @Override public float score(int doc, float freq) throws IOException { // We have to supply something in case norms are omitted - return SimilarityBase.this.score(stats, freq, getLengthValue(doc)); + return (float) SimilarityBase.this.score(stats, freq, getLengthValue(doc)); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java index dba1c61b090..97e522697dd 100644 --- 
a/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java @@ -450,7 +450,9 @@ public abstract class TFIDFSimilarity extends Similarity { final long df = termStats.docFreq(); final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); final float idf = idf(df, docCount); - return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"); + return Explanation.match(idf, "idf(docFreq, docCount)", + Explanation.match(df, "docFreq, number of documents containing term"), + Explanation.match(docCount, "docCount, total number of documents with field")); } /** @@ -643,20 +645,37 @@ public abstract class TFIDFSimilarity extends Similarity { "fieldNorm(doc=" + doc + ")"); return Explanation.match( - tfExplanation.getValue() * stats.idf.getValue() * fieldNormExpl.getValue(), + tfExplanation.getValue() * fieldNormExpl.getValue(), "fieldWeight in " + doc + ", product of:", - tfExplanation, stats.idf, fieldNormExpl); + tfExplanation, fieldNormExpl); } private Explanation explainScore(int doc, Explanation freq, IDFStats stats, NumericDocValues norms, float[] normTable) throws IOException { - Explanation queryExpl = Explanation.match(stats.boost, "boost"); - Explanation fieldExpl = explainField(doc, freq, stats, norms, normTable); - if (stats.boost == 1f) { - return fieldExpl; + List subs = new ArrayList(); + if (stats.boost != 1F) { + subs.add(Explanation.match(stats.boost, "boost")); } + subs.add(stats.idf); + Explanation tf = Explanation.match(tf(freq.getValue()), "tf(freq="+freq.getValue()+"), with freq of:", freq); + subs.add(tf); + + float norm; + if (norms == null) { + norm = 1f; + } else if (norms.advanceExact(doc) == false) { + norm = 0f; + } else { + norm = normTable[(int) (norms.longValue() & 0xFF)]; + } + + Explanation fieldNorm = Explanation.match( + norm, + "fieldNorm(doc=" + doc + ")"); + 
subs.add(fieldNorm); + return Explanation.match( - queryExpl.getValue() * fieldExpl.getValue(), + stats.queryWeight * tf.getValue() * norm, "score(doc="+doc+",freq="+freq.getValue()+"), product of:", - queryExpl, fieldExpl); + subs); } } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/package-info.java b/lucene/core/src/java/org/apache/lucene/search/similarities/package-info.java index a3544d71442..1ed9669147c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/package-info.java @@ -97,7 +97,7 @@ * The easiest way to quickly implement a new ranking method is to extend * {@link org.apache.lucene.search.similarities.SimilarityBase}, which provides * basic implementations for the low level . Subclasses are only required to - * implement the {@link org.apache.lucene.search.similarities.SimilarityBase#score(BasicStats, float, float)} + * implement the {@link org.apache.lucene.search.similarities.SimilarityBase#score(BasicStats, double, double)} * and {@link org.apache.lucene.search.similarities.SimilarityBase#toString()} * methods. * diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/AxiomaticTestCase.java b/lucene/core/src/test/org/apache/lucene/search/similarities/AxiomaticTestCase.java new file mode 100644 index 00000000000..c2f614c8a0d --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/AxiomaticTestCase.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.similarities; + +import java.util.Random; + +public abstract class AxiomaticTestCase extends BaseSimilarityTestCase { + + @Override + protected final Similarity getSimilarity(Random random) { + // axiomatic parameter s + final float s; + switch (random.nextInt(4)) { + case 0: + // minimum value + s = 0; + break; + case 1: + // tiny value + s = Float.MIN_VALUE; + break; + case 2: + // maximum value + s = 1; + break; + default: + // random value + s = random.nextFloat(); + break; + } + // axiomatic query length + final int queryLen; + switch (random.nextInt(4)) { + case 0: + // minimum value + queryLen = 0; + break; + case 1: + // tiny value + queryLen = 1; + break; + case 2: + // maximum value + queryLen = Integer.MAX_VALUE; + break; + default: + // random value + queryLen = random.nextInt(Integer.MAX_VALUE); + break; + } + // axiomatic parameter k + final float k; + switch (random.nextInt(4)) { + case 0: + // minimum value + k = 0; + break; + case 1: + // tiny value + k = Float.MIN_VALUE; + break; + case 2: + // maximum value + k = 1; + break; + default: + // random value + k = random.nextFloat(); + break; + } + + return getAxiomaticModel(s, queryLen, k); + } + + protected abstract Similarity getAxiomaticModel(float s, int queryLen, float k); +} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/BasicModelTestCase.java b/lucene/core/src/test/org/apache/lucene/search/similarities/BasicModelTestCase.java new file mode 100644 index 00000000000..66236669704 --- /dev/null +++ 
b/lucene/core/src/test/org/apache/lucene/search/similarities/BasicModelTestCase.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.similarities; + +import java.util.Random; + +public abstract class BasicModelTestCase extends BaseSimilarityTestCase { + + @Override + protected final Similarity getSimilarity(Random random) { + final AfterEffect afterEffect; + switch(random.nextInt(3)) { + case 0: + afterEffect = new AfterEffect.NoAfterEffect(); + break; + case 1: + afterEffect = new AfterEffectL(); + break; + default: + afterEffect = new AfterEffectB(); + break; + } + // normalization hyper-parameter c + final float c; + switch (random.nextInt(4)) { + case 0: + // minimum value + c = 0; + break; + case 1: + // tiny value + c = Float.MIN_VALUE; + break; + case 2: + // maximum value + // we just limit the test to "reasonable" c values but don't enforce this anywhere. 
+ c = Integer.MAX_VALUE; + break; + default: + // random value + c = Integer.MAX_VALUE * random.nextFloat(); + break; + } + // normalization hyper-parameter z + final float z; + switch (random.nextInt(3)) { + case 0: + // minimum value + z = Float.MIN_VALUE; + break; + case 1: + // maximum value + z = Math.nextDown(0.5f); + break; + default: + // random value + float zcand = random.nextFloat() / 2; + if (zcand == 0f) { + // nextFloat returns 0 inclusive, we have to avoid it. + z = Math.nextUp(zcand); + } else { + z = zcand; + } + } + // dirichlet parameter mu + final float mu; + switch (random.nextInt(4)) { + case 0: + // minimum value + mu = 0; + break; + case 1: + // tiny value + mu = Float.MIN_VALUE; + break; + case 2: + // maximum value + // we just limit the test to "reasonable" mu values but don't enforce this anywhere. + mu = Integer.MAX_VALUE; + break; + default: + // random value + mu = Integer.MAX_VALUE * random.nextFloat(); + break; + } + final Normalization normalization; + switch(random.nextInt(5)) { + case 0: + normalization = new Normalization.NoNormalization(); + break; + case 1: + normalization = new NormalizationH1(c); + break; + case 2: + normalization = new NormalizationH2(c); + break; + case 3: + normalization = new NormalizationH3(mu); + break; + default: + normalization = new NormalizationZ(z); + break; + } + return new DFRSimilarity(getBasicModel(), afterEffect, normalization); + } + + /** return BasicModel under test */ + protected abstract BasicModel getBasicModel(); + +} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/DistributionTestCase.java b/lucene/core/src/test/org/apache/lucene/search/similarities/DistributionTestCase.java new file mode 100644 index 00000000000..6d425d29c0c --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/DistributionTestCase.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.similarities; + +import java.util.Random; + +public abstract class DistributionTestCase extends BaseSimilarityTestCase { + + @Override + protected final Similarity getSimilarity(Random random) { + final Lambda lambda; + if (random.nextBoolean()) { + lambda = new LambdaDF(); + } else { + lambda = new LambdaTTF(); + } + + // normalization hyper-parameter c + final float c; + switch (random.nextInt(4)) { + case 0: + // minimum value + c = 0; + break; + case 1: + // tiny value + c = Float.MIN_VALUE; + break; + case 2: + // maximum value + // we just limit the test to "reasonable" c values but don't enforce this anywhere. + c = Integer.MAX_VALUE; + break; + default: + // random value + c = Integer.MAX_VALUE * random.nextFloat(); + break; + } + // normalization hyper-parameter z + final float z; + switch (random.nextInt(3)) { + case 0: + // minimum value + z = Float.MIN_VALUE; + break; + case 1: + // maximum value + z = Math.nextDown(0.5f); + break; + default: + // random value + float zcand = random.nextFloat() / 2; + if (zcand == 0f) { + // nextFloat returns 0 inclusive, we have to avoid it. 
+ z = Math.nextUp(zcand); + } else { + z = zcand; + } + } + // dirichlet parameter mu + final float mu; + switch (random.nextInt(4)) { + case 0: + // minimum value + mu = 0; + break; + case 1: + // tiny value + mu = Float.MIN_VALUE; + break; + case 2: + // maximum value + // we just limit the test to "reasonable" mu values but don't enforce this anywhere. + mu = Integer.MAX_VALUE; + break; + default: + // random value + mu = Integer.MAX_VALUE * random.nextFloat(); + break; + } + final Normalization normalization; + switch(random.nextInt(5)) { + case 0: + normalization = new Normalization.NoNormalization(); + break; + case 1: + normalization = new NormalizationH1(c); + break; + case 2: + normalization = new NormalizationH2(c); + break; + case 3: + normalization = new NormalizationH3(mu); + break; + default: + normalization = new NormalizationZ(z); + break; + } + return new IBSimilarity(getDistribution(), lambda, normalization); + } + + /** return Distribution under test */ + protected abstract Distribution getDistribution(); + +} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF1EXP.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF1EXP.java new file mode 100644 index 00000000000..16da903e1dc --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF1EXP.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.similarities; + +import org.apache.lucene.util.LuceneTestCase.AwaitsFix; + +// returns NaN scores for sloppy freqs < 1 (due to log without floor) +@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010") +public class TestAxiomaticF1EXP extends AxiomaticTestCase { + + @Override + protected final Similarity getAxiomaticModel(float s, int queryLen, float k) { + return new AxiomaticF1EXP(s, k); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF1LOG.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF1LOG.java new file mode 100644 index 00000000000..88ad18ee2ef --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF1LOG.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.similarities; + +import org.apache.lucene.util.LuceneTestCase.AwaitsFix; + +// returns NaN scores for sloppy freqs < 1 (due to log without floor) +@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010") +public class TestAxiomaticF1LOG extends AxiomaticTestCase { + + @Override + protected final Similarity getAxiomaticModel(float s, int queryLen, float k) { + return new AxiomaticF1LOG(s); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF2EXP.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF2EXP.java new file mode 100644 index 00000000000..e9ab9b6ff60 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF2EXP.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search.similarities; + +public class TestAxiomaticF2EXP extends AxiomaticTestCase { + + @Override + protected final Similarity getAxiomaticModel(float s, int queryLen, float k) { + return new AxiomaticF2EXP(s, k); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF2LOG.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF2LOG.java new file mode 100644 index 00000000000..f9c9420cc72 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF2LOG.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search.similarities; + +public class TestAxiomaticF2LOG extends AxiomaticTestCase { + + @Override + protected final Similarity getAxiomaticModel(float s, int queryLen, float k) { + return new AxiomaticF2LOG(s); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF3EXP.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF3EXP.java new file mode 100644 index 00000000000..69ab7193e56 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF3EXP.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search.similarities; + +import org.apache.lucene.util.LuceneTestCase.AwaitsFix; + +// returns negative scores at least, but it (now) warns it has problems +@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010") +public class TestAxiomaticF3EXP extends AxiomaticTestCase { + + @Override + protected final Similarity getAxiomaticModel(float s, int queryLen, float k) { + // TODO: use the randomized parameters and not these hardcoded ones + return new AxiomaticF3EXP(0.25f, 1); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF3LOG.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF3LOG.java new file mode 100644 index 00000000000..686327731f0 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF3LOG.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search.similarities; + +import org.apache.lucene.util.LuceneTestCase.AwaitsFix; + +// returns negative scores at least, but it (now) warns it has problems +@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010") +public class TestAxiomaticF3LOG extends AxiomaticTestCase { + + @Override + protected final Similarity getAxiomaticModel(float s, int queryLen, float k) { + // TODO: use the randomized parameters and not these hardcoded ones + return new AxiomaticF3LOG(0.25f, 1); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBM25Similarity.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBM25Similarity.java index 4c6382baf62..9dcf7e64889 100644 --- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBM25Similarity.java +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBM25Similarity.java @@ -17,10 +17,9 @@ package org.apache.lucene.search.similarities; -import org.apache.lucene.search.Explanation; -import org.apache.lucene.util.LuceneTestCase; +import java.util.Random; -public class TestBM25Similarity extends LuceneTestCase { +public class TestBM25Similarity extends BaseSimilarityTestCase { public void testIllegalK1() { IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> { @@ -61,17 +60,51 @@ public class TestBM25Similarity extends LuceneTestCase { assertTrue(expected.getMessage().contains("illegal b value")); } - private static Explanation findExplanation(Explanation expl, String text) { - if (expl.getDescription().equals(text)) { - return expl; - } else { - for (Explanation sub : expl.getDetails()) { - Explanation match = findExplanation(sub, text); - if (match != null) { - return match; - } - } + @Override + protected Similarity getSimilarity(Random random) { + // term frequency normalization parameter k1 + final float k1; + switch (random.nextInt(4)) { + case 0: + // minimum value + k1 = 0; + 
break; + case 1: + // tiny value + k1 = Float.MIN_VALUE; + break; + case 2: + // maximum value + // upper bounds on individual term's score is 43.262806 * (k1 + 1) * boost + // we just limit the test to "reasonable" k1 values but don't enforce this anywhere. + k1 = Integer.MAX_VALUE; + break; + default: + // random value + k1 = Integer.MAX_VALUE * random.nextFloat(); + break; } - return null; + + // length normalization parameter b [0 .. 1] + final float b; + switch (random.nextInt(4)) { + case 0: + // minimum value + b = 0; + break; + case 1: + // tiny value + b = Float.MIN_VALUE; + break; + case 2: + // maximum value + b = 1; + break; + default: + // random value + b = random.nextFloat(); + break; + } + return new BM25Similarity(k1, b); } } diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelBE.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelBE.java new file mode 100644 index 00000000000..2dc956f7da0 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelBE.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search.similarities; + +import org.apache.lucene.util.LuceneTestCase.AwaitsFix; + +// returns negative scores at least, but it warns it has problems +@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010") +public class TestBasicModelBE extends BasicModelTestCase { + + @Override + protected BasicModel getBasicModel() { + return new BasicModelBE(); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelD.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelD.java new file mode 100644 index 00000000000..7eee359b3d4 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelD.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search.similarities; + +import org.apache.lucene.util.LuceneTestCase.AwaitsFix; + +// scores go backwards with respect to TF, but it warns it has problems +@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010") +public class TestBasicModelD extends BasicModelTestCase { + + @Override + protected BasicModel getBasicModel() { + return new BasicModelD(); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelG.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelG.java new file mode 100644 index 00000000000..280affb89a0 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelG.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search.similarities; + +public class TestBasicModelG extends BasicModelTestCase { + + @Override + protected BasicModel getBasicModel() { + return new BasicModelG(); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelIF.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelIF.java new file mode 100644 index 00000000000..0b7c9fc1e3f --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelIF.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search.similarities; + +public class TestBasicModelIF extends BasicModelTestCase { + + @Override + protected BasicModel getBasicModel() { + return new BasicModelIF(); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelIn.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelIn.java new file mode 100644 index 00000000000..c474982d0dd --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelIn.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search.similarities; + +public class TestBasicModelIn extends BasicModelTestCase { + + @Override + protected BasicModel getBasicModel() { + return new BasicModelIn(); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelIne.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelIne.java new file mode 100644 index 00000000000..c9a8a5f7102 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelIne.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search.similarities; + +public class TestBasicModelIne extends BasicModelTestCase { + + @Override + protected BasicModel getBasicModel() { + return new BasicModelIne(); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelP.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelP.java new file mode 100644 index 00000000000..2788ff8edb8 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBasicModelP.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search.similarities; + +import org.apache.lucene.util.LuceneTestCase.AwaitsFix; + +//scores go backwards with respect to TF, but it warns it has problems +@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010") +public class TestBasicModelP extends BasicModelTestCase { + + @Override + protected BasicModel getBasicModel() { + return new BasicModelP(); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java index c3885143547..c4dec7c13f5 100644 --- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java @@ -17,6 +17,7 @@ package org.apache.lucene.search.similarities; import java.io.IOException; +import java.util.Random; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field.Store; @@ -32,11 +33,10 @@ import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; -import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; import org.apache.lucene.util.Version; -public class TestBooleanSimilarity extends LuceneTestCase { +public class TestBooleanSimilarity extends BaseSimilarityTestCase { public void testTermScoreIsEqualToBoost() throws IOException { Directory dir = newDirectory(); @@ -114,4 +114,9 @@ public class TestBooleanSimilarity extends LuceneTestCase { 0f); } } + + @Override + protected Similarity getSimilarity(Random random) { + return new BooleanSimilarity(); + } } diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java index e7a56067c55..4a5a10fcbaf 100644 --- 
a/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java @@ -19,6 +19,7 @@ package org.apache.lucene.search.similarities; import java.io.IOException; import java.util.Arrays; +import java.util.Random; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field.Store; @@ -39,11 +40,10 @@ import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.similarities.TFIDFSimilarity.IDFStats; import org.apache.lucene.store.Directory; import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; import org.apache.lucene.util.Version; -public class TestClassicSimilarity extends LuceneTestCase { +public class TestClassicSimilarity extends BaseSimilarityTestCase { private Directory directory; private IndexReader indexReader; private IndexSearcher indexSearcher; @@ -185,4 +185,9 @@ public class TestClassicSimilarity extends LuceneTestCase { 0f); } } + + @Override + protected Similarity getSimilarity(Random random) { + return new ClassicSimilarity(); + } } diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestDistributionLL.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestDistributionLL.java new file mode 100644 index 00000000000..de28d6f1b97 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestDistributionLL.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.similarities; + +public class TestDistributionLL extends DistributionTestCase { + + @Override + protected Distribution getDistribution() { + return new DistributionLL(); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestDistributionSPL.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestDistributionSPL.java new file mode 100644 index 00000000000..984915a23da --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestDistributionSPL.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search.similarities; + +import org.apache.lucene.util.LuceneTestCase.AwaitsFix; + +// scores go infinite, but it warns it has problems +@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010") +public class TestDistributionSPL extends DistributionTestCase { + + @Override + protected Distribution getDistribution() { + return new DistributionSPL(); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestIndependenceChiSquared.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestIndependenceChiSquared.java new file mode 100644 index 00000000000..c2fa06c4f6f --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestIndependenceChiSquared.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search.similarities; + +import java.util.Random; + +public class TestIndependenceChiSquared extends BaseSimilarityTestCase { + + @Override + protected final Similarity getSimilarity(Random random) { + return new DFISimilarity(new IndependenceChiSquared()); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestIndependenceSaturated.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestIndependenceSaturated.java new file mode 100644 index 00000000000..38be8b699df --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestIndependenceSaturated.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search.similarities; + +import java.util.Random; + +public class TestIndependenceSaturated extends BaseSimilarityTestCase { + + @Override + protected final Similarity getSimilarity(Random random) { + return new DFISimilarity(new IndependenceSaturated()); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestIndependenceStandardized.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestIndependenceStandardized.java new file mode 100644 index 00000000000..959912a2553 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestIndependenceStandardized.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search.similarities; + +import java.util.Random; + +public class TestIndependenceStandardized extends BaseSimilarityTestCase { + + @Override + protected final Similarity getSimilarity(Random random) { + return new DFISimilarity(new IndependenceStandardized()); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestLMDirichletSimilarity.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestLMDirichletSimilarity.java new file mode 100644 index 00000000000..d6043e5cfec --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestLMDirichletSimilarity.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search.similarities; + +import java.util.Random; + +public class TestLMDirichletSimilarity extends BaseSimilarityTestCase { + + @Override + protected final Similarity getSimilarity(Random random) { + // smoothing parameter mu, unbounded + final float mu; + switch (random.nextInt(4)) { + case 0: + // minimum value + mu = 0; + break; + case 1: + // tiny value + mu = Float.MIN_VALUE; + break; + case 2: + // maximum value + // we just limit the test to "reasonable" mu values but don't enforce this anywhere. + mu = Integer.MAX_VALUE; + break; + default: + // random value + mu = Integer.MAX_VALUE * random.nextFloat(); + break; + } + return new LMDirichletSimilarity(mu); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestLMJelinekMercerSimilarity.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestLMJelinekMercerSimilarity.java new file mode 100644 index 00000000000..0fa8db8c058 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestLMJelinekMercerSimilarity.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search.similarities; + +import java.util.Random; + +public class TestLMJelinekMercerSimilarity extends BaseSimilarityTestCase { + + @Override + protected final Similarity getSimilarity(Random random) { + // smoothing parameter lambda: (0..1] + final float lambda; + switch (random.nextInt(3)) { + case 0: + // tiny value + lambda = Float.MIN_VALUE; + break; + case 1: + // maximum value + lambda = 1; + break; + default: + // random value + lambda = random.nextFloat(); + break; + } + return new LMJelinekMercerSimilarity(lambda); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java index 8fbd69d9190..e52c9742f65 100644 --- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java @@ -51,7 +51,7 @@ import org.apache.lucene.util.Version; * items in the list. If a test case fails, the name of the Similarity that * caused the failure is returned as part of the assertion error message.

</p> * <p>

Unit testing is performed by constructing statistics manually and calling - * the {@link SimilarityBase#score(BasicStats, float, float)} method of the + * the {@link SimilarityBase#score(BasicStats, double, double)} method of the * Similarities. The statistics represent corner cases of corpus distributions. *

</p> * <p>

For the integration tests, a small (8-document) collection is indexed. The @@ -191,17 +191,17 @@ public class TestSimilarityBase extends LuceneTestCase { } /** * The generic test core called by all unit test methods. It calls the - * {@link SimilarityBase#score(BasicStats, float, float)} method of all + * {@link SimilarityBase#score(BasicStats, double, double)} method of all * Similarities in {@link #sims} and checks if the score is valid; i.e. it * is a finite positive real number. */ private void unitTestCore(BasicStats stats, float freq, int docLen) { for (SimilarityBase sim : sims) { BasicStats realStats = (BasicStats) sim.computeWeight( - stats.getBoost(), + (float)stats.getBoost(), toCollectionStats(stats), toTermStats(stats)); - float score = sim.score(realStats, freq, docLen); + float score = (float)sim.score(realStats, freq, docLen); float explScore = sim.explain( realStats, 1, Explanation.match(freq, "freq"), docLen).getValue(); assertFalse("Score infinite: " + sim.toString(), Float.isInfinite(score)); @@ -524,17 +524,17 @@ public class TestSimilarityBase extends LuceneTestCase { /** * The generic test core called by all correctness test methods. It calls the - * {@link SimilarityBase#score(BasicStats, float, float)} method of all + * {@link SimilarityBase#score(BasicStats, double, double)} method of all * Similarities in {@link #sims} and compares the score against the manually * computed {@code gold}. 
*/ private void correctnessTestCore(SimilarityBase sim, float gold) { BasicStats stats = createStats(); BasicStats realStats = (BasicStats) sim.computeWeight( - stats.getBoost(), + (float)stats.getBoost(), toCollectionStats(stats), toTermStats(stats)); - float score = sim.score(realStats, FREQ, DOC_LEN); + float score = (float) sim.score(realStats, FREQ, DOC_LEN); assertEquals( sim.toString() + " score not correct.", gold, score, FLOAT_EPSILON); } diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java index c87fdbb6ce5..7830648368b 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java +++ b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java @@ -1484,7 +1484,7 @@ public class TestBlockJoin extends LuceneTestCase { } @Override - protected float score(BasicStats stats, float freq, float docLen) { + protected double score(BasicStats stats, double freq, double docLen) { return freq; } }; diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/CheckHits.java b/lucene/test-framework/src/java/org/apache/lucene/search/CheckHits.java index dee7d8405c0..7696a63f83e 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/search/CheckHits.java +++ b/lucene/test-framework/src/java/org/apache/lucene/search/CheckHits.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.Locale; import java.util.Set; import java.util.TreeSet; +import java.util.regex.Pattern; import java.util.Random; import junit.framework.Assert; @@ -318,6 +319,8 @@ public class CheckHits { public static float explainToleranceDelta(float f1, float f2) { return Math.max(EXPLAIN_SCORE_TOLERANCE_MINIMUM, Math.max(Math.abs(f1), Math.abs(f2)) * EXPLAIN_SCORE_TOLERANCE_DELTA); } + + private static final Pattern COMPUTED_FROM_PATTERN = Pattern.compile(".*, computed as .* from:"); /** * Assert that an explanation has the expected score, and 
optionally that its @@ -335,9 +338,12 @@ public class CheckHits { boolean deep, Explanation expl) { float value = expl.getValue(); - Assert.assertEquals(q+": score(doc="+doc+")="+score+ - " != explanationScore="+value+" Explanation: "+expl, - score,value,explainToleranceDelta(score, value)); + // assertEquals fails by throwing AssertionError (an Error, not an Exception); catch that so the costly message is only built on failure. TODO: clean this up if we use junit 5 + try { + Assert.assertEquals(score, value, explainToleranceDelta(score, value)); + } catch (AssertionError e) { + Assert.fail(q+": score(doc="+doc+")="+score+" != explanationScore="+value+" Explanation: "+expl); + } if (!deep) return; @@ -368,7 +374,7 @@ public class CheckHits { boolean productOf = descr.endsWith("product of:"); boolean sumOf = descr.endsWith("sum of:"); boolean maxOf = descr.endsWith("max of:"); - boolean computedOf = descr.matches(".*, computed as .* from:"); + boolean computedOf = descr.indexOf("computed as") > 0 && COMPUTED_FROM_PATTERN.matcher(descr).matches(); boolean maxTimesOthers = false; if (!(productOf || sumOf || maxOf || computedOf)) { // maybe 'max plus x times others' @@ -386,11 +392,12 @@ public class CheckHits { } } // TODO: this is a TERRIBLE assertion!!!!
- Assert.assertTrue( - q+": multi valued explanation description=\""+descr - +"\" must be 'max of plus x times others', 'computed as x from:' or end with 'product of'" - +" or 'sum of:' or 'max of:' - "+expl, - productOf || sumOf || maxOf || computedOf || maxTimesOthers); + if (false == (productOf || sumOf || maxOf || computedOf || maxTimesOthers)) { + Assert.fail( + q+": multi valued explanation description=\""+descr + +"\" must be 'max of plus x times others', 'computed as x from:' or end with 'product of'" + +" or 'sum of:' or 'max of:' - "+expl); + } float sum = 0; float product = 1; float max = 0; @@ -414,9 +421,13 @@ public class CheckHits { Assert.assertTrue("should never get here!", computedOf); combined = value; } - Assert.assertEquals(q+": actual subDetails combined=="+combined+ - " != value="+value+" Explanation: "+expl, - combined,value,explainToleranceDelta(combined, value)); + // assertEquals fails by throwing AssertionError (an Error, not an Exception); catch that so the costly message is only built on failure. TODO: clean this up if we use junit 5 + try { + Assert.assertEquals(combined, value, explainToleranceDelta(combined, value)); + } catch (AssertionError e) { + Assert.fail(q+": actual subDetails combined=="+combined+ + " != value="+value+" Explanation: "+expl); + } } } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/similarities/BaseSimilarityTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/BaseSimilarityTestCase.java new file mode 100644 index 00000000000..d93594d884c --- /dev/null +++ b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/BaseSimilarityTestCase.java @@ -0,0 +1,473 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.similarities; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.FilterLeafReader; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.search.CheckHits; +import org.apache.lucene.search.CollectionStatistics; +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.TermStatistics; +import org.apache.lucene.search.similarities.Similarity.SimScorer; +import org.apache.lucene.search.similarities.Similarity.SimWeight; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.SmallFloat; +import org.apache.lucene.util.TestUtil; +import org.junit.AfterClass; +import org.junit.BeforeClass; + +/** + * Abstract class to do basic tests for a similarity. + * NOTE: This test focuses on the similarity impl, nothing else. + * The [stretch] goal is for this test to be + * so thorough in testing a new Similarity that if this + * test passes, then all Lucene/Solr tests should also pass. 
Ie, + * if there is some bug in a given Similarity that this + * test fails to catch then this test needs to be improved! */ +public abstract class BaseSimilarityTestCase extends LuceneTestCase { + + static LeafReader WITHOUT_NORM; + static Directory WITHOUT_NORM_DIR; + + static LeafReader WITH_NORM_BASE; + static Directory WITH_NORM_DIR; + static List<LeafReader> NORM_VALUES; + + @BeforeClass + public static void beforeClass() throws Exception { + // without norms (the field type omits them) + WITHOUT_NORM_DIR = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), WITHOUT_NORM_DIR); + Document doc = new Document(); + FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED); + fieldType.setOmitNorms(true); + doc.add(newField("field", "value", fieldType)); + writer.addDocument(doc); + WITHOUT_NORM = getOnlyLeafReader(writer.getReader()); + writer.close(); + + // with norms (the indexed value is replaced per reader by a CannedNorm below) + WITH_NORM_DIR = newDirectory(); + writer = new RandomIndexWriter(random(), WITH_NORM_DIR); + doc = new Document(); + doc.add(newTextField("field", "value", Field.Store.NO)); + writer.addDocument(doc); + WITH_NORM_BASE = getOnlyLeafReader(writer.getReader()); + writer.close(); + + // all possible norm values for the doc + NORM_VALUES = new ArrayList<>(); + NORM_VALUES.add(WITHOUT_NORM); + for (int i = 1; i < 256; i++) { + final long value = i; + NORM_VALUES.add(new FilterLeafReader(WITH_NORM_BASE) { + @Override + public CacheHelper getCoreCacheHelper() { + return null; + } + + @Override + public CacheHelper getReaderCacheHelper() { + return null; + } + + @Override + public NumericDocValues getNormValues(String field) throws IOException { + if (field.equals("field")) { + return new CannedNorm(value); + } else { + return super.getNormValues(field); + } + } + }); + } + } + + @AfterClass + public static void afterClass() throws Exception { + IOUtils.close(WITH_NORM_BASE, WITH_NORM_DIR, 
WITHOUT_NORM, WITHOUT_NORM_DIR); + WITH_NORM_BASE = WITHOUT_NORM = null; + WITH_NORM_DIR = WITHOUT_NORM_DIR = null; + 
NORM_VALUES = null; + } + + /** 1-document norms impl of the given value */ + static class CannedNorm extends NumericDocValues { + int docID = -1; + final long value; + + CannedNorm(long value) { + this.value = value; + } + + @Override + public long longValue() throws IOException { + return value; + } + + @Override + public boolean advanceExact(int target) throws IOException { + assert target == 0; + docID = target; + return true; + } + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() throws IOException { + if (docID == -1) { + return docID = 0; + } else { + return docID = NO_MORE_DOCS; + } + } + + @Override + public int advance(int target) throws IOException { + if (target == 0) { + return docID = 0; + } else { + return docID = NO_MORE_DOCS; + } + } + + @Override + public long cost() { + return 0; + } + } + + /** + * Return a new similarity with all parameters randomized within valid ranges. + */ + protected abstract Similarity getSimilarity(Random random); + + static final long MAXDOC_FORTESTING = 1L << 48; + // must be at least MAXDOC_FORTESTING + Integer.MAX_VALUE + static final long MAXTOKENS_FORTESTING = 1L << 49; + + /** + * returns a random corpus that is at least possible given + * the norm value for a single document. 
+ */ + static CollectionStatistics newCorpus(Random random, int norm) { + // lower bound of tokens in the collection (you produced this norm somehow) + final int lowerBound; + if (norm == 0) { + // norms are omitted, but there must have been at least one token to produce that norm + lowerBound = 1; + } else { + // minimum value that would decode to such a norm + lowerBound = SmallFloat.byte4ToInt((byte) norm); + } + final long maxDoc; + if (random.nextBoolean()) { + // small collection + maxDoc = TestUtil.nextLong(random, 1, 100000); + } else { + // yuge collection + maxDoc = TestUtil.nextLong(random, 1, MAXDOC_FORTESTING); + } + // TODO: make this a mandatory statistic, or test it with -1 + final long docCount; + if (random.nextBoolean()) { + // sparse field + docCount = TestUtil.nextLong(random, 1, maxDoc); + } else { + // fully populated + docCount = maxDoc; + } + // random docsize: but can't require docs to have > 2B tokens + long upperBound; + try { + upperBound = Math.min(MAXTOKENS_FORTESTING, Math.multiplyExact(docCount, Integer.MAX_VALUE)); + } catch (ArithmeticException overflow) { + upperBound = MAXTOKENS_FORTESTING; + } + // TODO: make this a mandatory statistic, or test it with -1 + final long sumDocFreq; + if (random.nextBoolean()) { + // shortest possible docs + sumDocFreq = docCount; + } else { + // random docsize + sumDocFreq = TestUtil.nextLong(random, docCount, upperBound + 1 - lowerBound); + } + final long sumTotalTermFreq; + switch (random.nextInt(3)) { + case 0: + // unsupported (e.g. 
omitTF) + sumTotalTermFreq = -1; + break; + case 1: + // no repetition of terms (except to satisfy this norm) + sumTotalTermFreq = sumDocFreq - 1 + lowerBound; + break; + default: + // random repetition + assert sumDocFreq - 1 + lowerBound <= upperBound; + sumTotalTermFreq = TestUtil.nextLong(random, sumDocFreq - 1 + lowerBound, upperBound); + break; + } + return new CollectionStatistics("field", maxDoc, docCount, sumTotalTermFreq, sumDocFreq); + } + + private static final BytesRef TERM = new BytesRef("term"); + + /** + * returns new random term, that fits within the bounds of the corpus + */ + static TermStatistics newTerm(Random random, CollectionStatistics corpus) { + final long docFreq; + if (random.nextBoolean()) { + // rare term + docFreq = 1; + } else { + // random specificity + docFreq = TestUtil.nextLong(random, 1, corpus.docCount()); + } + final long totalTermFreq; + if (corpus.sumTotalTermFreq() == -1) { + // omitTF + totalTermFreq = -1; + } else if (random.nextBoolean()) { + // no repetition + totalTermFreq = docFreq; + } else { + // random repetition: but can't require docs to have > 2B tokens + long upperBound; + try { + upperBound = Math.min(corpus.sumTotalTermFreq(), Math.multiplyExact(docFreq, Integer.MAX_VALUE)); + } catch (ArithmeticException overflow) { + upperBound = corpus.sumTotalTermFreq(); + } + totalTermFreq = TestUtil.nextLong(random, docFreq, upperBound); + } + return new TermStatistics(TERM, docFreq, totalTermFreq); + } + + /** + * Tests scoring across a bunch of random terms/corpora/frequencies for each possible document length. + * It does the following checks: + *

<ul> + *
  <li>scores are non-negative and finite. + *
  <li>score matches the explanation exactly. + *
  <li>internal explanations calculations are sane (e.g. sum of: and so on actually compute sums) + *
  <li>scores don't decrease as term frequencies increase: e.g. score(freq=N + 1) &gt;= score(freq=N) + *
  <li>scores don't decrease as documents get shorter, e.g. score(len=M) &gt;= score(len=M+1) + *
  <li>scores don't decrease as terms get rarer, e.g. score(term=N) &gt;= score(term=N+1) + *
  <li>scoring works for floating point frequencies (e.g. sloppy phrase and span queries will work) + *
  <li>scoring works for reasonably large 64-bit statistic values (e.g. distributed search will work) + *
  <li>scoring works for reasonably large boost values (0 .. Integer.MAX_VALUE, e.g. query boosts will work) + *
  <li>scoring works for parameters randomized within valid ranges (see {@link #getSimilarity(Random)}) + * </ul>
+ */ + public void testRandomScoring() throws Exception { + Random random = random(); + final int iterations = atLeast(10); + for (int i = 0; i < iterations; i++) { + // pull a new similarity to switch up parameters + Similarity similarity = getSimilarity(random); + for (int j = 0; j < 10; j++) { + // for each norm value... + for (int k = 0; k < NORM_VALUES.size(); k++) { + CollectionStatistics corpus = newCorpus(random, k); + for (int l = 0; l < 10; l++) { + TermStatistics term = newTerm(random, corpus); + final float freq; + if (term.totalTermFreq() == -1) { + // omit TF + freq = 1; + } else if (term.docFreq() == 1) { + // only one document, all the instances must be here. + freq = Math.toIntExact(term.totalTermFreq()); + } else { + // there is at least one other document, and those must have at least 1 instance each. + int upperBound = Math.toIntExact(Math.min(term.totalTermFreq() - term.docFreq() + 1, Integer.MAX_VALUE)); + if (random.nextBoolean()) { + freq = TestUtil.nextInt(random, 1, upperBound); + } else { + float freqCandidate = upperBound * random.nextFloat(); + // we need to be 2nd float value at a minimum, the pairwise test will check MIN_VALUE in this case. + // this avoids testing frequencies of 0 which seem wrong to allow (we should enforce computeSlopFactor etc) + if (freqCandidate <= Float.MIN_VALUE) { + freqCandidate = Math.nextUp(Float.MIN_VALUE); + } + freq = freqCandidate; + } + } + // we just limit the test to "reasonable" boost values but don't enforce this anywhere. + // too big, and you are asking for overflow. that's hard for a sim to enforce (but definitely possible) + // for now, we just want to detect overflow where its a real bug/hazard in the computation with reasonable inputs. 
+ final float boost; + switch (random.nextInt(5)) { + case 0: + // minimum value (not enforced) + boost = 0F; + break; + case 1: + // tiny value + boost = Float.MIN_VALUE; + break; + case 2: + // no-op value (sometimes treated special in explanations) + boost = 1F; + break; + case 3: + // maximum value (not enforceD) + boost = Integer.MAX_VALUE; + break; + default: + // random value + boost = random.nextFloat() * Integer.MAX_VALUE; + break; + } + doTestScoring(similarity, corpus, term, boost, freq, k); + } + } + } + } + } + + /** runs for a single test case, so that if you hit a test failure you can write a reproducer just for that scenario */ + private static void doTestScoring(Similarity similarity, CollectionStatistics corpus, TermStatistics term, float boost, float freq, int norm) throws IOException { + boolean success = false; + SimWeight weight = similarity.computeWeight(boost, corpus, term); + SimScorer scorer = similarity.simScorer(weight, NORM_VALUES.get(norm).getContext()); + try { + float score = scorer.score(0, freq); + // check that score isn't infinite or negative + assertTrue("infinite/NaN score: " + score, Float.isFinite(score)); + assertTrue("negative score: " + score, score >= 0); + // check explanation matches + Explanation explanation = scorer.explain(0, Explanation.match(freq, "freq, occurrences of term within document")); + if (score != explanation.getValue()) { + fail("expected: " + score + ", got: " + explanation); + } + CheckHits.verifyExplanation("", 0, score, true, explanation); + + // check score(freq-1), given the same norm it should be <= score(freq) [scores non-decreasing for more term occurrences] + final float prevFreq; + if (random().nextBoolean() && freq == (int)freq && freq > 1 && term.docFreq() > 1) { + // previous in integer space + prevFreq = freq - 1; + } else { + // previous in float space (e.g. 
for sloppyPhrase) + prevFreq = Math.nextDown(freq); + } + + float prevScore = scorer.score(0, prevFreq); + // check that score isn't infinite or negative + assertTrue(Float.isFinite(prevScore)); + assertTrue(prevScore >= 0); + // check explanation matches + Explanation prevExplanation = scorer.explain(0, Explanation.match(prevFreq, "freq, occurrences of term within document")); + if (prevScore != prevExplanation.getValue()) { + fail("expected: " + prevScore + ", got: " + prevExplanation); + } + CheckHits.verifyExplanation("test query (prevFreq)", 0, prevScore, true, prevExplanation); + + if (prevScore > score) { + System.out.println(prevExplanation); + System.out.println(explanation); + fail("score(" + prevFreq + ")=" + prevScore + " > score(" + freq + ")=" + score); + } + + // check score(norm-1), given the same freq it should be >= score(norm) [scores non-decreasing as docs get shorter] + if (norm > 1) { + SimScorer prevNormScorer = similarity.simScorer(weight, NORM_VALUES.get(norm - 1).getContext()); + float prevNormScore = prevNormScorer.score(0, freq); + // check that score isn't infinite or negative + assertTrue(Float.isFinite(prevNormScore)); + assertTrue(prevNormScore >= 0); + // check explanation matches + Explanation prevNormExplanation = prevNormScorer.explain(0, Explanation.match(freq, "freq, occurrences of term within document")); + if (prevNormScore != prevNormExplanation.getValue()) { + fail("expected: " + prevNormScore + ", got: " + prevNormExplanation); + } + CheckHits.verifyExplanation("test query (prevNorm)", 0, prevNormScore, true, prevNormExplanation); + + if (prevNormScore < score) { + System.out.println(prevNormExplanation); + System.out.println(explanation); + fail("score(" + freq + "," + (norm-1) + ")=" + prevNormScore + " < score(" + freq + "," + norm + ")=" + score); + } + } + + // check score(term-1), given the same freq/norm it should be >= score(term) [scores non-decreasing as terms get rarer] + if (term.docFreq() > 1 && 
(term.totalTermFreq() == -1 || freq < term.totalTermFreq())) { + final long prevTotalTermFreq; + if (term.totalTermFreq() == -1) { + prevTotalTermFreq = -1; + } else { + prevTotalTermFreq = term.totalTermFreq() - 1; + } + TermStatistics prevTerm = new TermStatistics(term.term(), term.docFreq() - 1, prevTotalTermFreq); + SimWeight prevWeight = similarity.computeWeight(boost, corpus, prevTerm); // weigh the rarer term, otherwise this check compares the score with itself + SimScorer prevTermScorer = similarity.simScorer(prevWeight, NORM_VALUES.get(norm).getContext()); + float prevTermScore = prevTermScorer.score(0, freq); + // check that score isn't infinite or negative + assertTrue(Float.isFinite(prevTermScore)); + assertTrue(prevTermScore >= 0); + // check explanation matches + Explanation prevTermExplanation = prevTermScorer.explain(0, Explanation.match(freq, "freq, occurrences of term within document")); + if (prevTermScore != prevTermExplanation.getValue()) { + fail("expected: " + prevTermScore + ", got: " + prevTermExplanation); + } + CheckHits.verifyExplanation("test query (prevTerm)", 0, prevTermScore, true, prevTermExplanation); + + if (prevTermScore < score) { + System.out.println(prevTermExplanation); + System.out.println(explanation); + fail("score(" + freq + "," + (prevTerm) + ")=" + prevTermScore + " < score(" + freq + "," + term + ")=" + score); + } + } + + success = true; + } finally { + if (!success) { + System.out.println(similarity); + System.out.println(corpus); + System.out.println(term); + if (norm == 0) { + System.out.println("norms=omitted"); + } else { + System.out.println("norm=" + norm + " (doc length ~ " + SmallFloat.byte4ToInt((byte) norm) + ")"); + } + System.out.println("freq=" + freq); + } + } + } +} diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java index d3351ab9f63..7f530177b9d 100644 --- 
a/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java +++ 
b/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java @@ -434,7 +434,7 @@ public final class TestUtil { /** start and end are BOTH inclusive */ public static long nextLong(Random r, long start, long end) { - assert end >= start; + assert end >= start : "start=" + start + ",end=" + end; final BigInteger range = BigInteger.valueOf(end).add(BigInteger.valueOf(1)).subtract(BigInteger.valueOf(start)); if (range.compareTo(BigInteger.valueOf(Integer.MAX_VALUE)) <= 0) { return start + r.nextInt(range.intValue());